2 changes: 1 addition & 1 deletion gwas/aou/phenotype_preprocessing/README.md
@@ -29,7 +29,7 @@ Required arguments:
* `--range <INT,INT>`: Minimum and maximum accepted values for the phenotype

Optional arguments:
- * `--samples <FILE>`: CSV file with the list of samples to restrict to. Needs columns "person_id" and "sex_at_birth_Male". Typically this would be a list of samples that passed upstream sample-level QC. Defaults to `${WORKSPACE_BUCKET}/samples/passing_samples_v7.csv`.
+ * `--samples <FILE>`: CSV file with the list of samples to restrict to. Needs columns "person_id" and "sex_at_birth_Male". Typically this would be a list of samples that passed upstream sample-level QC. Defaults to `${WORKSPACE_BUCKET}/samples/passing_samples_v8.0.csv`.
* `--drugexposure-covariate-concept-ids <STR>`: List of conceptid:conceptname to use as drug exposure covariates.
* `--ppi`: Indicate if the phenotype concept id is from the PPI or LOINC (default) tables. PPI concepts are general physical measurements (e.g. height) and include very limited concept ids.
* `--outlier-sd`: Filter samples whose phenotype values lie below or above this number of standard deviations from the mean phenotype value.
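The `--outlier-sd` filter can be sketched as follows (a minimal illustration, not the pipeline's exact implementation; the column names and toy values here are assumptions):

```python
import pandas as pd

# Hedged sketch of the --outlier-sd option: drop samples whose phenotype
# value lies more than `outlier_sd` standard deviations from the mean.
# The "person_id"/"phenotype" column names and values are illustrative.
def filter_outliers(df, col="phenotype", outlier_sd=3):
    mean, sd = df[col].mean(), df[col].std()
    return df[df[col].between(mean - outlier_sd * sd, mean + outlier_sd * sd)]

pheno = pd.DataFrame({"person_id": [1, 2, 3, 4, 5],
                      "phenotype": [170.0, 168.0, 172.0, 169.0, 400.0]})
# A tight threshold is used here only so the tiny example drops a value.
kept = filter_outliers(pheno, outlier_sd=1)
```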
@@ -17,7 +17,7 @@

bucket = os.environ["WORKSPACE_BUCKET"]
SAMPLEFILE = os.path.join(os.environ["WORKSPACE_BUCKET"], "samples", \
-    "passing_samples_v7.csv")
+    "passing_samples_v8.0.csv")

def MSG(msg_str):
"""
@@ -19,7 +19,7 @@
import sys

SAMPLEFILE = os.path.join(os.environ["WORKSPACE_BUCKET"], "samples", \
-    "passing_samples_v7.csv")
+    "passing_samples_v8.0.csv")

def MSG(msg_str):
"""
2 changes: 1 addition & 1 deletion gwas/aou/phenotype_preprocessing/process_phenotypes.py
@@ -67,7 +67,7 @@ def UploadGCS(outfile, gcsfile):
outfile = row["phenotype"]+"_phenocovar.csv"
gcsfile = os.path.join(os.environ["WORKSPACE_BUCKET"], "phenotypes", outfile)
RunCmd(cmd)
- UploadGCS(outfile, gcsfile)
+ # UploadGCS(outfile, gcsfile)
numsamples = len(open(outfile, "r").readlines())-1
outitems = [row["phenotype"], row["concept_id"], \
row["units"], row["outlier_sd"], row["min"], row["max"], row["drugcovars"], \
27 changes: 8 additions & 19 deletions gwas/aou/sample_preprocessing/README.md
@@ -1,32 +1,21 @@
# Sample preprocessing for AoU

- This directory contains scripts for performing sample qc. The list of samples that pass qc checks can be found at `${WORKSPACE_BUCKET}/samples/passing_samples_v7.1.csv`.
+ This directory contains scripts for performing sample qc. The list of samples that pass qc checks can be found at `${WORKSPACE_BUCKET}/samples/passing_samples_v8.0.csv`.

The sample qc scripts in this directory should be executed in the following order:
- 1. `compute_ratios.py` : obtains #singletons insertions/deletions ratio, heterozygous/homozygous ratio, transition/transversion ratio per sample
- 2. `merge_batches.py`: the previous script runs in batches, this script combines those for further processing
- 3. `finish_sampleqc.py`: this uses the values computed in the previous script to perform sample qc and outputs a two-column list (`person_id` and `sex_at_birth_Male`) of samples that pass qc checks
+ 1. `compute_ratios.bash`: obtains the singleton count and the insertion/deletion, heterozygous/homozygous, and transition/transversion ratios per sample
+ 2. `finish_sampleqc.py`: uses the values computed in the previous step to perform sample qc and outputs a two-column list (`person_id` and `sex_at_birth_Male`) of samples that pass qc checks
+ 3. `construct_cohorts.py`: uses the final output of sample qc above to build ancestry cohorts

- ### Running `compute_ratios.py`
+ ### Running `compute_ratios.bash`

- This script must be run on the AoU workbench and requires a dataproc cluster for hail/spark. Use the command `nohup python3 compute_ratios.py > output.log 2>&1 &`.
+ This script must be run on the AoU workbench. Use the command `bash compute_ratios.bash > wgs_ratios_all.csv`.

- The image below provides the parameters used to set up the cluster.
-
- ![unnamed](https://github.com/CAST-genomics/cast-workflows/assets/16807372/52801a8d-d28b-4123-8011-5161d2a2a950)
-
- With the parameters above, batches of 50k samples are recommended. As there are currently 250k samples available in AoU, the default number of batches is 5.
- The `compute_ratios.py` script should, but does not, run through all the batches. The cluster runs out of memory after each batch. The solution, though not ideal, is
- to do the following:
- - Run one batch. Soon after the next batch starts, it will likely run out of memory.
- - Delete and recreate the compute environment.
- - In `compute_ratios.py`, change the `for` loop from `for i in range(num_batches):` to `for i in range(X, num_batches):` where `X` is the batch you wish to run. For example, if you have finished two batches and want to run the third batch, `X` would be 2 (following 0 indexing)
-
- Better solutions may be implemented in the future, but as we expect sample preprocessing to be an infrequent step (only performed when new samples are available), we defer this to later.
+ You can just use the least expensive possible environment configuration for this one.

### Sample qc

- After `compute_ratios.py` has finished all batches, run `merge_batches.py` to combine the batches into one file. Then run `finish_sampleqc.py` to perform sample qc and obtain a list of samples that pass qc checks.
+ After `compute_ratios.bash` has finished, run `finish_sampleqc.py` to perform sample qc and obtain a list of samples that pass qc checks.

The sample preprocessing is performed in `finish_sampleqc.py`. This script uses the ratios to filter samples.
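The per-sample ratios feed a filter of roughly this shape (a hedged sketch; the actual thresholds and which ratios `finish_sampleqc.py` filters on are defined in that script, not here):

```python
import pandas as pd

# Hedged sketch of ratio-based sample qc: keep samples whose ratio lies
# within k standard deviations of the cohort mean. The column name matches
# the ratios CSV; k=1 and the values are only for this toy example.
def qc_pass(ratios, col="ti_tv_ratio", k=1):
    mean, sd = ratios[col].mean(), ratios[col].std()
    return ratios[ratios[col].between(mean - k * sd, mean + k * sd)]

ratios = pd.DataFrame({"s": ["A", "B", "C", "D"],
                       "ti_tv_ratio": [2.0, 2.1, 1.9, 5.0]})
passed = qc_pass(ratios)
```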

18 changes: 18 additions & 0 deletions gwas/aou/sample_preprocessing/compute_ratios.bash
@@ -0,0 +1,18 @@
+ #!/usr/bin/env bash
+
+ # Create a CSV containing the necessary data
+ # Allocate 4 CPUs, 3.6 GB memory, and 120 GB of standard storage (the least-expensive possible configuration)
+
+ V8_BUCKET="$1" # ex: gs://fc-secure-MYBUCKET
+ V8_CDR_DIR="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux"
+
+ # columns of all_samples.tsv file:
+ # 1:s
+ # 7:ins_del_ratio
+ # 10:snp_het_homvar_ratio
+ # 11:indel_het_homvar_ratio
+ # 12:ti_tv_ratio
+ # 13:singleton
+
+ gsutil -u $GOOGLE_PROJECT cat "$V8_CDR_DIR/qc/all_samples.tsv" | \
+     cut -f1,7,10,11,12,13 | tr $'\t' ,
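The cut/tr pipeline above can be mirrored in pandas if that is more convenient (a sketch; the tiny in-memory table below is fabricated, and the real `all_samples.tsv` has additional columns):

```python
import io
import pandas as pd

# Select the same columns the bash script extracts and convert TSV to CSV.
cols = ["s", "ins_del_ratio", "snp_het_homvar_ratio",
        "indel_het_homvar_ratio", "ti_tv_ratio", "singleton"]
tsv = io.StringIO(
    "s\tother\tins_del_ratio\tsnp_het_homvar_ratio\t"
    "indel_het_homvar_ratio\tti_tv_ratio\tsingleton\n"
    "A1\t9\t1.9\t1.6\t1.8\t2.1\t40\n"
)
ratios = pd.read_csv(tsv, sep="\t", usecols=cols)
ratios.to_csv("wgs_ratios_all.csv", index=False)
```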
51 changes: 0 additions & 51 deletions gwas/aou/sample_preprocessing/compute_ratios.py

This file was deleted.

6 changes: 3 additions & 3 deletions gwas/aou/sample_preprocessing/construct_cohorts.py
@@ -4,12 +4,12 @@

bucket = os.getenv('WORKSPACE_BUCKET')

- subprocess.run("gsutil cp "+bucket+"/samples/passing_samples_v7.1.csv ./", shell=True, check=True)
+ subprocess.run("gsutil cp "+bucket+"/samples/passing_samples_v8.0.csv ./", shell=True, check=True)

- passing_samples = pd.read_csv("passing_samples_v7.1.csv")
+ passing_samples = pd.read_csv("passing_samples_v8.0.csv")
#columns = person_id, sex_at_birth_Male

- ancestry_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv"
+ ancestry_path = "gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv"
subprocess.run("gsutil -u $GOOGLE_PROJECT -m cp "+ancestry_path+" ./", shell=True, check=True)

ancestry = pd.read_csv("ancestry_preds.tsv", sep="\t")
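The cohort construction then amounts to an inner join of QC-passing samples with the ancestry predictions (a hedged sketch with fabricated rows; the `research_id` and `ancestry_pred` column names are assumptions about `ancestry_preds.tsv`):

```python
import pandas as pd

# Join passing samples to predicted ancestries and write one cohort per label.
passing = pd.DataFrame({"person_id": [1, 2, 3],
                        "sex_at_birth_Male": [1, 0, 1]})
ancestry = pd.DataFrame({"research_id": [1, 2, 3],
                         "ancestry_pred": ["eur", "afr", "eur"]})
merged = passing.merge(ancestry, left_on="person_id",
                       right_on="research_id", how="inner")
for anc, group in merged.groupby("ancestry_pred"):
    group[["person_id", "sex_at_birth_Male"]].to_csv(
        "passing_samples_" + anc + ".csv", index=False)
```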
15 changes: 6 additions & 9 deletions gwas/aou/sample_preprocessing/finish_sampleqc.py
@@ -3,9 +3,6 @@
import subprocess
import sys

- bucket = os.getenv('WORKSPACE_BUCKET')
-
- subprocess.run("gsutil cp "+bucket+"/tmirmira/data/qc/wgs_ratios_all.csv ./", shell=True, check=True)
all_ratios = pd.read_csv("wgs_ratios_all.csv")

#ratio filtering here
@@ -22,19 +19,19 @@


#relatedness filtering
- related_samples_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/relatedness/relatedness_flagged_samples.tsv"
+ related_samples_path = "gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/relatedness/relatedness_flagged_samples.tsv"
subprocess.run("gsutil -u $GOOGLE_PROJECT -m cp "+related_samples_path+" ./", shell=True, check=True)
related = pd.read_csv("relatedness_flagged_samples.tsv", sep="\t")
all_ratios = all_ratios[~all_ratios['s'].isin(related['sample_id'].values)]

- #v7.1 update as per https://support.researchallofus.org/hc/en-us/articles/25646444720404-Incremental-Data-Release-for-v7-Genotyping-Array-and-Short-Read-Genomic-Data
- wgs_to_filter_path = "gs://fc-aou-datasets-controlled/v7/known_issues/research_id_v7_wgs_known_issue_15.tsv"
+ # remove any samples with known issues
+ wgs_to_filter_path = "gs://fc-aou-datasets-controlled/v8/known_issues/wgs_v8_known_issue_1.txt"
subprocess.run("gsutil -u $GOOGLE_PROJECT cp "+wgs_to_filter_path+" ./", shell=True, check=True)
- wgs_to_filter = pd.read_csv("research_id_v7_wgs_known_issue_15.tsv", sep="\t")
+ wgs_to_filter = pd.read_csv("wgs_v8_known_issue_1.txt", sep="\t")
all_ratios = all_ratios[~all_ratios['s'].isin(wgs_to_filter['research_id'].values)]

#sex filtering/processing
- # This query represents dataset "All_population" for domain "person" and was generated for All of Us Controlled Tier Dataset v7
+ # This query represents dataset "All_population" for domain "person" and was generated for All of Us Controlled Tier Dataset v8
dataset_52206452_person_sql = """
SELECT
person.person_id,
@@ -89,4 +86,4 @@
all_ratios = all_ratios.merge(demog, left_on='s', right_on='person_id', how='inner')
all_ratios = pd.get_dummies(all_ratios, columns=['sex_at_birth'], dtype='int')
all_ratios[['person_id','sex_at_birth_Male']].to_csv("postqc_samples.csv", index=False)
- subprocess.run("gsutil cp postqc_samples.csv "+bucket+"/samples/passing_samples_v7.1.csv", shell=True, check=True)
+ #subprocess.run("gsutil cp postqc_samples.csv "+bucket+"/samples/passing_samples_v8.0.csv", shell=True, check=True)
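The final encoding step above relies on `pd.get_dummies`, which expands the categorical `sex_at_birth` column into 0/1 indicator columns; a minimal sketch with fabricated rows:

```python
import pandas as pd

# get_dummies turns each category into its own indicator column; the
# pipeline keeps only person_id and sex_at_birth_Male in its output.
demog = pd.DataFrame({"person_id": [1, 2],
                      "sex_at_birth": ["Male", "Female"]})
encoded = pd.get_dummies(demog, columns=["sex_at_birth"], dtype="int")
out = encoded[["person_id", "sex_at_birth_Male"]]
```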
27 changes: 0 additions & 27 deletions gwas/aou/sample_preprocessing/merge_batches.py

This file was deleted.