From 091e11739be20a4656aa609f58c9deeffe13073a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:28:32 -0800 Subject: [PATCH 01/12] create sample preprocessing script for v8 data --- gwas/aou/sample_preprocessing.v8/README.md | 32 +++++++ .../compute_ratios.bash | 18 ++++ .../construct_cohorts.py | 88 ++++++++++++++++++ .../finish_sampleqc.py | 92 +++++++++++++++++++ 4 files changed, 230 insertions(+) create mode 100644 gwas/aou/sample_preprocessing.v8/README.md create mode 100644 gwas/aou/sample_preprocessing.v8/compute_ratios.bash create mode 100644 gwas/aou/sample_preprocessing.v8/construct_cohorts.py create mode 100644 gwas/aou/sample_preprocessing.v8/finish_sampleqc.py diff --git a/gwas/aou/sample_preprocessing.v8/README.md b/gwas/aou/sample_preprocessing.v8/README.md new file mode 100644 index 00000000..1b3c4d03 --- /dev/null +++ b/gwas/aou/sample_preprocessing.v8/README.md @@ -0,0 +1,32 @@ +# Sample preprocessing for AoU + +This directory contains scripts for performing sample qc. The list of samples that pass qc checks can be found at `${WORKSPACE_BUCKET}/samples/passing_samples_v7.1.csv`. + +The sample qc scripts in this directory should be executed in the following order: +1. `compute_ratios.bash` : obtains #singletons insertions/deletions ratio, heterozygous/homozygous ratio, transition/transversion ratio per sample +2. `finish_sampleqc.py`: this uses the values computed in the previous script to perform sample qc and outputs a two-column list (`person_id` and `sex_at_birth_Male`) of samples that pass qc checks +3. `construct_cohorts.py`: uses the final output of sample qc above to build ancestry cohorts + +### Running `compute_ratios.bash` + +This script must be run on the AoU workbench. Use the command `compute_ratios.bash > wgs_ratios_all.csv 2>&1`. + +You can just use the least expensive possible environment configuration for this one. 
+ +### Sample qc + +After `compute_ratios.bash` has finished, run `finish_sampleqc.py` to perform sample qc and obtain a list of samples that pass qc checks. + +The sample preprocessing is performed in `finish_sampleqc.py`. This script uses the ratios to filter samples. + +Related samples are removed as described in https://support.researchallofus.org/hc/en-us/articles/4614687617556-How-the-All-of-Us-Genomic-data-are-organized + +Lastly, this script performs sex-related preprocessing. + +# Constructing cohorts + +The `construct_cohorts.py` script uses the final output of sample qc above to build ancestry cohorts. For now, it constructs only two cohorts: +1. A European cohort consisting of samples where predicted ancestry is EUR and self-reported race is white +2. An African cohort consiting of samples where predicted ancestry is AFR and self-reported race is black and African American + +Each cohort file consists of the same two columns as before: `person_id` and `sex_at_birth_Male`. The cohort files are saved to `${WORKSPACE_BUCKET}/samples/`. Additional cohorts may be added in the future. 
diff --git a/gwas/aou/sample_preprocessing.v8/compute_ratios.bash b/gwas/aou/sample_preprocessing.v8/compute_ratios.bash new file mode 100644 index 00000000..089fc1aa --- /dev/null +++ b/gwas/aou/sample_preprocessing.v8/compute_ratios.bash @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +# Create a CSV containing the necessary data +# Allocate 4 CPUs, 3.6 GB memory, and 120 GB of standard storage (the least-expensive possible configuration) + +V8_BUCKET="$1" # ex: gs://fc-secure-MYBUCKET +V8_CDR_DIR="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/" + +# columns of all_samples.tsv file: +# 1:s +# 7:ins_del_ratio +# 10:snp_het_homvar_ratio +# 11:indel_het_homvar_ratio +# 12:ti_tv_ratio +# 13:singleton + +gsutil -u $GOOGLE_PROJECT cat "$V8_CDR_DIR/qc/all_samples.tsv" | \ +cut -f1,7,10,11,12,13 --output-delimeter, diff --git a/gwas/aou/sample_preprocessing.v8/construct_cohorts.py b/gwas/aou/sample_preprocessing.v8/construct_cohorts.py new file mode 100644 index 00000000..15519dce --- /dev/null +++ b/gwas/aou/sample_preprocessing.v8/construct_cohorts.py @@ -0,0 +1,88 @@ +import pandas as pd +import os +import subprocess + +bucket = os.getenv('WORKSPACE_BUCKET') + +subprocess.run("gsutil cp "+bucket+"/samples/passing_samples_v7.1.csv ./", shell=True, check=True) + +passing_samples = pd.read_csv("passing_samples_v7.1.csv") +#columns = person_id, sex_at_birth_Male + +ancestry_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv" +subprocess.run("gsutil -u $GOOGLE_PROJECT -m cp "+ancestry_path+" ./", shell=True, check=True) + +ancestry = pd.read_csv("ancestry_preds.tsv", sep="\t") +ancestry = ancestry[['research_id', 'ancestry_pred', 'ancestry_pred_other']] +ancestry.rename(columns={'research_id':'person_id'}, inplace=True) + +passing_samples = passing_samples.merge(ancestry, on='person_id', how='inner') + +dataset_37150131_person_sql = """ + SELECT + person.person_id, + person.gender_concept_id, + 
p_gender_concept.concept_name as gender, + person.birth_datetime as date_of_birth, + person.race_concept_id, + p_race_concept.concept_name as race, + person.ethnicity_concept_id, + p_ethnicity_concept.concept_name as ethnicity, + person.sex_at_birth_concept_id, + p_sex_at_birth_concept.concept_name as sex_at_birth + FROM + `""" + os.environ["WORKSPACE_CDR"] + """.person` person + LEFT JOIN + `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_gender_concept + ON person.gender_concept_id = p_gender_concept.concept_id + LEFT JOIN + `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_race_concept + ON person.race_concept_id = p_race_concept.concept_id + LEFT JOIN + `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_ethnicity_concept + ON person.ethnicity_concept_id = p_ethnicity_concept.concept_id + LEFT JOIN + `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_sex_at_birth_concept + ON person.sex_at_birth_concept_id = p_sex_at_birth_concept.concept_id + WHERE + person.PERSON_ID IN ( + SELECT + distinct person_id + FROM + `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person + WHERE + cb_search_person.person_id IN ( + SELECT + person_id + FROM + `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` p + WHERE + has_whole_genome_variant = 1 + ) + )""" + +demog = pd.read_gbq( + dataset_37150131_person_sql, + dialect="standard", + use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ), + progress_bar_type="tqdm_notebook") + +demog = demog[['person_id', 'race']] +passing_samples = passing_samples.merge(demog, on='person_id', how='inner') + +eur_white = passing_samples[(passing_samples['ancestry_pred']=='eur') & (passing_samples['race']=='White')] +afr_black = passing_samples[(passing_samples['ancestry_pred']=='afr') & (passing_samples['race']=='Black or African American')] +not_afr_black = passing_samples[~((passing_samples['ancestry_pred']=='afr') | (passing_samples['race']=='Black or African American'))] + + +eur_white[['person_id', 
'sex_at_birth_Male']].to_csv("EUR_WHITE.csv", index=False) +afr_black[['person_id', 'sex_at_birth_Male']].to_csv("AFR_BLACK.csv", index=False) +not_afr_black[['person_id', 'sex_at_birth_Male']].to_csv("NOT_AFR_BLACK.csv", index=False) + + +subprocess.run("gsutil cp EUR_WHITE.csv "+bucket+"/samples/EUR_WHITE.csv", shell=True, check=True) +subprocess.run("gsutil cp AFR_BLACK.csv "+bucket+"/samples/AFR_BLACK.csv", shell=True, check=True) +subprocess.run("gsutil cp NOT_AFR_BLACK.csv "+bucket+"/samples/NOT_AFR_BLACK.csv", shell=True, check=True) + + + diff --git a/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py b/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py new file mode 100644 index 00000000..851d49e8 --- /dev/null +++ b/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py @@ -0,0 +1,92 @@ +import pandas as pd +import os +import subprocess +import sys + +bucket = os.getenv('WORKSPACE_BUCKET') + +subprocess.run("gsutil cp "+bucket+"/tmirmira/data/qc/wgs_ratios_all.csv ./", shell=True, check=True) +all_ratios = pd.read_csv("wgs_ratios_all.csv") + +#ratio filtering here +means = all_ratios.mean() +stds = all_ratios.std() +for c in all_ratios.columns: + if c == 'person_id': + continue + avg = means[c] + sd = stds[c] + minval = avg - 8*sd + maxval = avg + 8*sd + all_ratios = all_ratios[(all_ratios[c] >= minval) & (all_ratios[c] <= maxval)] + + +#relatedness filtering +related_samples_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/relatedness/relatedness_flagged_samples.tsv" +subprocess.run("gsutil -u $GOOGLE_PROJECT -m cp "+related_samples_path+" ./", shell=True, check=True) +related = pd.read_csv("relatedness_flagged_samples.tsv", sep="\t") +all_ratios = all_ratios[~all_ratios['s'].isin(related['sample_id'].values)] + +#v7.1 update as per https://support.researchallofus.org/hc/en-us/articles/25646444720404-Incremental-Data-Release-for-v7-Genotyping-Array-and-Short-Read-Genomic-Data +wgs_to_filter_path = 
"gs://fc-aou-datasets-controlled/v7/known_issues/research_id_v7_wgs_known_issue_15.tsv" +subprocess.run("gsutil -u $GOOGLE_PROJECT cp "+wgs_to_filter_path+" ./", shell=True, check=True) +wgs_to_filter = pd.read_csv("research_id_v7_wgs_known_issue_15.tsv", sep="\t") +all_ratios = all_ratios[~all_ratios['s'].isin(wgs_to_filter['research_id'].values)] + +#sex filtering/processing +# This query represents dataset "All_population" for domain "person" and was generated for All of Us Controlled Tier Dataset v7 +dataset_52206452_person_sql = """ + SELECT + person.person_id, + person.gender_concept_id, + p_gender_concept.concept_name as gender, + person.birth_datetime as date_of_birth, + person.race_concept_id, + p_race_concept.concept_name as race, + person.ethnicity_concept_id, + p_ethnicity_concept.concept_name as ethnicity, + person.sex_at_birth_concept_id, + p_sex_at_birth_concept.concept_name as sex_at_birth + FROM + `""" + os.environ["WORKSPACE_CDR"] + """.person` person + LEFT JOIN + `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_gender_concept + ON person.gender_concept_id = p_gender_concept.concept_id + LEFT JOIN + `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_race_concept + ON person.race_concept_id = p_race_concept.concept_id + LEFT JOIN + `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_ethnicity_concept + ON person.ethnicity_concept_id = p_ethnicity_concept.concept_id + LEFT JOIN + `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_sex_at_birth_concept + ON person.sex_at_birth_concept_id = p_sex_at_birth_concept.concept_id + WHERE + person.PERSON_ID IN ( + SELECT + distinct person_id + FROM + `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person + WHERE + cb_search_person.person_id IN ( + SELECT + person_id + FROM + `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` p + WHERE + has_whole_genome_variant = 1 + ) + )""" + +demog = pd.read_gbq( + dataset_52206452_person_sql, + dialect="standard", + 
use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ), + progress_bar_type="tqdm_notebook") + +demog = demog[(demog['sex_at_birth']=='Male') | (demog['sex_at_birth']=='Female')] + +all_ratios = all_ratios.merge(demog, left_on='s', right_on='person_id', how='inner') +all_ratios = pd.get_dummies(all_ratios, columns=['sex_at_birth'], dtype='int') +all_ratios[['person_id','sex_at_birth_Male']].to_csv("postqc_samples.csv", index=False) +subprocess.run("gsutil cp postqc_samples.csv "+bucket+"/samples/passing_samples_v7.1.csv", shell=True, check=True) From 85cd5e4dc13d71a739db03ea4f0f51f0e97129be Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:31:13 -0800 Subject: [PATCH 02/12] fix spelling of output delimiter --- gwas/aou/sample_preprocessing.v8/README.md | 2 +- gwas/aou/sample_preprocessing.v8/compute_ratios.bash | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gwas/aou/sample_preprocessing.v8/README.md b/gwas/aou/sample_preprocessing.v8/README.md index 1b3c4d03..0d1bafe1 100644 --- a/gwas/aou/sample_preprocessing.v8/README.md +++ b/gwas/aou/sample_preprocessing.v8/README.md @@ -9,7 +9,7 @@ The sample qc scripts in this directory should be executed in the following orde ### Running `compute_ratios.bash` -This script must be run on the AoU workbench. Use the command `compute_ratios.bash > wgs_ratios_all.csv 2>&1`. +This script must be run on the AoU workbench. Use the command `bash compute_ratios.bash > wgs_ratios_all.csv 2>&1`. You can just use the least expensive possible environment configuration for this one. 
diff --git a/gwas/aou/sample_preprocessing.v8/compute_ratios.bash b/gwas/aou/sample_preprocessing.v8/compute_ratios.bash index 089fc1aa..f46a490b 100644 --- a/gwas/aou/sample_preprocessing.v8/compute_ratios.bash +++ b/gwas/aou/sample_preprocessing.v8/compute_ratios.bash @@ -15,4 +15,4 @@ V8_CDR_DIR="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/" # 13:singleton gsutil -u $GOOGLE_PROJECT cat "$V8_CDR_DIR/qc/all_samples.tsv" | \ -cut -f1,7,10,11,12,13 --output-delimeter, +cut -f1,7,10,11,12,13 --output-delimiter, From 765d8e80558baac4a00c56dc28816108451079a8 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:32:09 -0800 Subject: [PATCH 03/12] use tr instead of cut --- gwas/aou/sample_preprocessing.v8/compute_ratios.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gwas/aou/sample_preprocessing.v8/compute_ratios.bash b/gwas/aou/sample_preprocessing.v8/compute_ratios.bash index f46a490b..0f4346ee 100644 --- a/gwas/aou/sample_preprocessing.v8/compute_ratios.bash +++ b/gwas/aou/sample_preprocessing.v8/compute_ratios.bash @@ -15,4 +15,4 @@ V8_CDR_DIR="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/" # 13:singleton gsutil -u $GOOGLE_PROJECT cat "$V8_CDR_DIR/qc/all_samples.tsv" | \ -cut -f1,7,10,11,12,13 --output-delimiter, +cut -f1,7,10,11,12,13 | tr $'\t' , From 1a13584e8835ebe3be703883254b67d847bf4ece Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:32:42 -0800 Subject: [PATCH 04/12] oops fix file paths --- gwas/aou/sample_preprocessing.v8/compute_ratios.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gwas/aou/sample_preprocessing.v8/compute_ratios.bash b/gwas/aou/sample_preprocessing.v8/compute_ratios.bash index 0f4346ee..fa00463d 100644 --- a/gwas/aou/sample_preprocessing.v8/compute_ratios.bash +++ b/gwas/aou/sample_preprocessing.v8/compute_ratios.bash @@ -4,7 +4,7 @@ # 
Allocate 4 CPUs, 3.6 GB memory, and 120 GB of standard storage (the least-expensive possible configuration) V8_BUCKET="$1" # ex: gs://fc-secure-MYBUCKET -V8_CDR_DIR="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/" +V8_CDR_DIR="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux" # columns of all_samples.tsv file: # 1:s From becb4eb8cfda1f4fffe59dc8f99f31b15321ef0a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 2 Feb 2026 16:00:33 -0800 Subject: [PATCH 05/12] create script for finish sampleqc --- .../sample_preprocessing.v8/finish_sampleqc.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py b/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py index 851d49e8..136b0b93 100644 --- a/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py +++ b/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py @@ -3,9 +3,6 @@ import subprocess import sys -bucket = os.getenv('WORKSPACE_BUCKET') - -subprocess.run("gsutil cp "+bucket+"/tmirmira/data/qc/wgs_ratios_all.csv ./", shell=True, check=True) all_ratios = pd.read_csv("wgs_ratios_all.csv") #ratio filtering here @@ -22,19 +19,19 @@ #relatedness filtering -related_samples_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/relatedness/relatedness_flagged_samples.tsv" +related_samples_path = "gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/relatedness/relatedness_flagged_samples.tsv" subprocess.run("gsutil -u $GOOGLE_PROJECT -m cp "+related_samples_path+" ./", shell=True, check=True) related = pd.read_csv("relatedness_flagged_samples.tsv", sep="\t") all_ratios = all_ratios[~all_ratios['s'].isin(related['sample_id'].values)] -#v7.1 update as per https://support.researchallofus.org/hc/en-us/articles/25646444720404-Incremental-Data-Release-for-v7-Genotyping-Array-and-Short-Read-Genomic-Data -wgs_to_filter_path = 
"gs://fc-aou-datasets-controlled/v7/known_issues/research_id_v7_wgs_known_issue_15.tsv" +# remove any samples with known issues +wgs_to_filter_path = "gs://fc-aou-datasets-controlled/v8/known_issues/research_id_v8_wgs_known_issue_1.tsv" subprocess.run("gsutil -u $GOOGLE_PROJECT cp "+wgs_to_filter_path+" ./", shell=True, check=True) -wgs_to_filter = pd.read_csv("research_id_v7_wgs_known_issue_15.tsv", sep="\t") +wgs_to_filter = pd.read_csv("research_id_v8_wgs_known_issue_1.tsv", sep="\t") all_ratios = all_ratios[~all_ratios['s'].isin(wgs_to_filter['research_id'].values)] #sex filtering/processing -# This query represents dataset "All_population" for domain "person" and was generated for All of Us Controlled Tier Dataset v7 +# This query represents dataset "All_population" for domain "person" and was generated for All of Us Controlled Tier Dataset v8 dataset_52206452_person_sql = """ SELECT person.person_id, @@ -89,4 +86,4 @@ all_ratios = all_ratios.merge(demog, left_on='s', right_on='person_id', how='inner') all_ratios = pd.get_dummies(all_ratios, columns=['sex_at_birth'], dtype='int') all_ratios[['person_id','sex_at_birth_Male']].to_csv("postqc_samples.csv", index=False) -subprocess.run("gsutil cp postqc_samples.csv "+bucket+"/samples/passing_samples_v7.1.csv", shell=True, check=True) +#subprocess.run("gsutil cp postqc_samples.csv "+bucket+"/data/samples/passing_samples_v8.csv", shell=True, check=True) From ddec139bdbcfb2bd71c3729e70b216b9c32e1bb9 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 2 Feb 2026 16:08:26 -0800 Subject: [PATCH 06/12] adjust sample qc file ext --- gwas/aou/sample_preprocessing.v8/README.md | 2 +- gwas/aou/sample_preprocessing.v8/finish_sampleqc.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gwas/aou/sample_preprocessing.v8/README.md b/gwas/aou/sample_preprocessing.v8/README.md index 0d1bafe1..3558cf5c 100644 --- a/gwas/aou/sample_preprocessing.v8/README.md +++ 
b/gwas/aou/sample_preprocessing.v8/README.md @@ -9,7 +9,7 @@ The sample qc scripts in this directory should be executed in the following orde ### Running `compute_ratios.bash` -This script must be run on the AoU workbench. Use the command `bash compute_ratios.bash > wgs_ratios_all.csv 2>&1`. +This script must be run on the AoU workbench. Use the command `bash compute_ratios.bash > wgs_ratios_all.csv`. You can just use the least expensive possible environment configuration for this one. diff --git a/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py b/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py index 136b0b93..7a00efb1 100644 --- a/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py +++ b/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py @@ -25,9 +25,9 @@ all_ratios = all_ratios[~all_ratios['s'].isin(related['sample_id'].values)] # remove any samples with known issues -wgs_to_filter_path = "gs://fc-aou-datasets-controlled/v8/known_issues/research_id_v8_wgs_known_issue_1.tsv" +wgs_to_filter_path = "gs://fc-aou-datasets-controlled/v8/known_issues/research_id_v8_wgs_known_issue_1.txt" subprocess.run("gsutil -u $GOOGLE_PROJECT cp "+wgs_to_filter_path+" ./", shell=True, check=True) -wgs_to_filter = pd.read_csv("research_id_v8_wgs_known_issue_1.tsv", sep="\t") +wgs_to_filter = pd.read_csv("research_id_v8_wgs_known_issue_1.txt", sep="\t") all_ratios = all_ratios[~all_ratios['s'].isin(wgs_to_filter['research_id'].values)] #sex filtering/processing From b95682a3e11aa5dfc86113aeeb2cdefd1e1f0b26 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 2 Feb 2026 16:09:48 -0800 Subject: [PATCH 07/12] try again with fixed path --- gwas/aou/sample_preprocessing.v8/finish_sampleqc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py b/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py index 7a00efb1..1e03c36c 100644 --- 
a/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py +++ b/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py @@ -25,9 +25,9 @@ all_ratios = all_ratios[~all_ratios['s'].isin(related['sample_id'].values)] # remove any samples with known issues -wgs_to_filter_path = "gs://fc-aou-datasets-controlled/v8/known_issues/research_id_v8_wgs_known_issue_1.txt" +wgs_to_filter_path = "gs://fc-aou-datasets-controlled/v8/known_issues/v8_wgs_known_issue_1.txt" subprocess.run("gsutil -u $GOOGLE_PROJECT cp "+wgs_to_filter_path+" ./", shell=True, check=True) -wgs_to_filter = pd.read_csv("research_id_v8_wgs_known_issue_1.txt", sep="\t") +wgs_to_filter = pd.read_csv("v8_wgs_known_issue_1.txt", sep="\t") all_ratios = all_ratios[~all_ratios['s'].isin(wgs_to_filter['research_id'].values)] #sex filtering/processing From 35529af3402172b249ce42c3d2352d2dd1c7c04f Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 2 Feb 2026 16:10:55 -0800 Subject: [PATCH 08/12] oops try again again --- gwas/aou/sample_preprocessing.v8/finish_sampleqc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py b/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py index 1e03c36c..fab70c63 100644 --- a/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py +++ b/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py @@ -25,9 +25,9 @@ all_ratios = all_ratios[~all_ratios['s'].isin(related['sample_id'].values)] # remove any samples with known issues -wgs_to_filter_path = "gs://fc-aou-datasets-controlled/v8/known_issues/v8_wgs_known_issue_1.txt" +wgs_to_filter_path = "gs://fc-aou-datasets-controlled/v8/known_issues/wgs_v8_known_issue_1.txt" subprocess.run("gsutil -u $GOOGLE_PROJECT cp "+wgs_to_filter_path+" ./", shell=True, check=True) -wgs_to_filter = pd.read_csv("v8_wgs_known_issue_1.txt", sep="\t") +wgs_to_filter = pd.read_csv("wgs_v8_known_issue_1.txt", sep="\t") all_ratios = 
all_ratios[~all_ratios['s'].isin(wgs_to_filter['research_id'].values)] #sex filtering/processing From 4ad6990069438f6c58bf2d164d4efbab41544158 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 2 Feb 2026 16:18:14 -0800 Subject: [PATCH 09/12] migrate construct_cohorts script as well --- gwas/aou/sample_preprocessing.v8/construct_cohorts.py | 6 +++--- gwas/aou/sample_preprocessing.v8/finish_sampleqc.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gwas/aou/sample_preprocessing.v8/construct_cohorts.py b/gwas/aou/sample_preprocessing.v8/construct_cohorts.py index 15519dce..b107eb4f 100644 --- a/gwas/aou/sample_preprocessing.v8/construct_cohorts.py +++ b/gwas/aou/sample_preprocessing.v8/construct_cohorts.py @@ -4,12 +4,12 @@ bucket = os.getenv('WORKSPACE_BUCKET') -subprocess.run("gsutil cp "+bucket+"/samples/passing_samples_v7.1.csv ./", shell=True, check=True) +subprocess.run("gsutil cp "+bucket+"/samples/passing_samples_v8.0.csv ./", shell=True, check=True) -passing_samples = pd.read_csv("passing_samples_v7.1.csv") +passing_samples = pd.read_csv("passing_samples_v8.0.csv") #columns = person_id, sex_at_birth_Male -ancestry_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv" +ancestry_path = "gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv" subprocess.run("gsutil -u $GOOGLE_PROJECT -m cp "+ancestry_path+" ./", shell=True, check=True) ancestry = pd.read_csv("ancestry_preds.tsv", sep="\t") diff --git a/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py b/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py index fab70c63..97657e93 100644 --- a/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py +++ b/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py @@ -86,4 +86,4 @@ all_ratios = all_ratios.merge(demog, left_on='s', right_on='person_id', how='inner') all_ratios = pd.get_dummies(all_ratios, 
columns=['sex_at_birth'], dtype='int') all_ratios[['person_id','sex_at_birth_Male']].to_csv("postqc_samples.csv", index=False) -#subprocess.run("gsutil cp postqc_samples.csv "+bucket+"/data/samples/passing_samples_v8.csv", shell=True, check=True) +#subprocess.run("gsutil cp postqc_samples.csv "+bucket+"/samples/passing_samples_v8.0.csv", shell=True, check=True) From 5cd58f599ff9ade1aa60fae74c6ecb43041c0a05 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 2 Feb 2026 17:06:40 -0800 Subject: [PATCH 10/12] rename sample preprocessing script in readme --- gwas/aou/sample_preprocessing.v8/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gwas/aou/sample_preprocessing.v8/README.md b/gwas/aou/sample_preprocessing.v8/README.md index 3558cf5c..20a6b33b 100644 --- a/gwas/aou/sample_preprocessing.v8/README.md +++ b/gwas/aou/sample_preprocessing.v8/README.md @@ -1,6 +1,6 @@ # Sample preprocessing for AoU -This directory contains scripts for performing sample qc. The list of samples that pass qc checks can be found at `${WORKSPACE_BUCKET}/samples/passing_samples_v7.1.csv`. +This directory contains scripts for performing sample qc. The list of samples that pass qc checks can be found at `${WORKSPACE_BUCKET}/samples/passing_samples_v8.0.csv`. The sample qc scripts in this directory should be executed in the following order: 1. 
`compute_ratios.bash` : obtains #singletons insertions/deletions ratio, heterozygous/homozygous ratio, transition/transversion ratio per sample From f91c5bd0488db2b5eabc4ef844c4dfa703df5978 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 13 Feb 2026 14:44:51 -0800 Subject: [PATCH 11/12] switch to v8 for phenotype preprocessing --- gwas/aou/phenotype_preprocessing/README.md | 2 +- .../aou_case_control_phenotype_preprocessing.py | 2 +- gwas/aou/phenotype_preprocessing/aou_phenotype_preprocessing.py | 2 +- gwas/aou/phenotype_preprocessing/process_phenotypes.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gwas/aou/phenotype_preprocessing/README.md b/gwas/aou/phenotype_preprocessing/README.md index c6c61c8e..9d5f76fa 100644 --- a/gwas/aou/phenotype_preprocessing/README.md +++ b/gwas/aou/phenotype_preprocessing/README.md @@ -29,7 +29,7 @@ Required arguments: * `--range `: Minimum and maximum accepted values for the phenotype Optional arguments: -* `--samples `: csv file with list of samples to restrict to. Needs columns "person_id" and "sex_at_birth_Male". Typically this would a list of samples that passed upstream sample-level QC info. Defaults to `${WORKSPACE_BUCKET}/samples/passing_samples_v7.csv`. +* `--samples `: csv file with list of samples to restrict to. Needs columns "person_id" and "sex_at_birth_Male". Typically this would be a list of samples that passed upstream sample-level QC info. Defaults to `${WORKSPACE_BUCKET}/samples/passing_samples_v8.0.csv`. * `--drugexposure-covariate-concept-ids `: List of conceptid:conceptname to use as drug exposure covariates. * `--ppi`: Indicate if the phenotype concept id is from the PPI or LOINC (default) tables. PPI are general physical measurements (e.g. hight) and include very limited concept ids. * `--outlier-sd`: Filter samples whose phenotype values lie below or above this number of standard deviations from the mean phenotype value. 
diff --git a/gwas/aou/phenotype_preprocessing/aou_case_control_phenotype_preprocessing.py b/gwas/aou/phenotype_preprocessing/aou_case_control_phenotype_preprocessing.py index 165c011a..c7f00231 100755 --- a/gwas/aou/phenotype_preprocessing/aou_case_control_phenotype_preprocessing.py +++ b/gwas/aou/phenotype_preprocessing/aou_case_control_phenotype_preprocessing.py @@ -17,7 +17,7 @@ bucket = os.environ["WORKSPACE_BUCKET"] SAMPLEFILE = os.path.join(os.environ["WORKSPACE_BUCKET"], "samples", \ - "passing_samples_v7.csv") + "passing_samples_v8.0.csv") def MSG(msg_str): """ diff --git a/gwas/aou/phenotype_preprocessing/aou_phenotype_preprocessing.py b/gwas/aou/phenotype_preprocessing/aou_phenotype_preprocessing.py index d6e40693..3a80094e 100755 --- a/gwas/aou/phenotype_preprocessing/aou_phenotype_preprocessing.py +++ b/gwas/aou/phenotype_preprocessing/aou_phenotype_preprocessing.py @@ -19,7 +19,7 @@ import sys SAMPLEFILE = os.path.join(os.environ["WORKSPACE_BUCKET"], "samples", \ - "passing_samples_v7.csv") + "passing_samples_v8.0.csv") def MSG(msg_str): """ diff --git a/gwas/aou/phenotype_preprocessing/process_phenotypes.py b/gwas/aou/phenotype_preprocessing/process_phenotypes.py index ab8dcbc1..c0fc7512 100755 --- a/gwas/aou/phenotype_preprocessing/process_phenotypes.py +++ b/gwas/aou/phenotype_preprocessing/process_phenotypes.py @@ -67,7 +67,7 @@ def UploadGCS(outfile, gcsfile): outfile = row["phenotype"]+"_phenocovar.csv" gcsfile = os.path.join(os.environ["WORKSPACE_BUCKET"], "phenotypes", outfile) RunCmd(cmd) - UploadGCS(outfile, gcsfile) + # UploadGCS(outfile, gcsfile) numsamples = len(open(outfile, "r").readlines())-1 outitems = [row["phenotype"], row["concept_id"], \ row["units"], row["outlier_sd"], row["min"], row["max"], row["drugcovars"], \ From 795c424aeffe24321648195e09f4acdbd3fb4b34 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 13 Feb 2026 14:45:10 -0800 Subject: [PATCH 12/12] switch sample 
preprocessing scripts to v8 too --- gwas/aou/sample_preprocessing.v8/README.md | 32 ------- .../construct_cohorts.py | 88 ------------------ .../finish_sampleqc.py | 89 ------------------- gwas/aou/sample_preprocessing/README.md | 27 ++---- .../compute_ratios.bash | 0 .../sample_preprocessing/compute_ratios.py | 51 ----------- .../sample_preprocessing/construct_cohorts.py | 6 +- .../sample_preprocessing/finish_sampleqc.py | 15 ++-- .../aou/sample_preprocessing/merge_batches.py | 27 ------ 9 files changed, 17 insertions(+), 318 deletions(-) delete mode 100644 gwas/aou/sample_preprocessing.v8/README.md delete mode 100644 gwas/aou/sample_preprocessing.v8/construct_cohorts.py delete mode 100644 gwas/aou/sample_preprocessing.v8/finish_sampleqc.py rename gwas/aou/{sample_preprocessing.v8 => sample_preprocessing}/compute_ratios.bash (100%) delete mode 100644 gwas/aou/sample_preprocessing/compute_ratios.py delete mode 100644 gwas/aou/sample_preprocessing/merge_batches.py diff --git a/gwas/aou/sample_preprocessing.v8/README.md b/gwas/aou/sample_preprocessing.v8/README.md deleted file mode 100644 index 20a6b33b..00000000 --- a/gwas/aou/sample_preprocessing.v8/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Sample preprocessing for AoU - -This directory contains scripts for performing sample qc. The list of samples that pass qc checks can be found at `${WORKSPACE_BUCKET}/samples/passing_samples_v8.0.csv`. - -The sample qc scripts in this directory should be executed in the following order: -1. `compute_ratios.bash` : obtains #singletons insertions/deletions ratio, heterozygous/homozygous ratio, transition/transversion ratio per sample -2. `finish_sampleqc.py`: this uses the values computed in the previous script to perform sample qc and outputs a two-column list (`person_id` and `sex_at_birth_Male`) of samples that pass qc checks -3. 
`construct_cohorts.py`: uses the final output of sample qc above to build ancestry cohorts - -### Running `compute_ratios.bash` - -This script must be run on the AoU workbench. Use the command `bash compute_ratios.bash > wgs_ratios_all.csv`. - -You can just use the least expensive possible environment configuration for this one. - -### Sample qc - -After `compute_ratios.bash` has finished, run `finish_sampleqc.py` to perform sample qc and obtain a list of samples that pass qc checks. - -The sample preprocessing is performed in `finish_sampleqc.py`. This script uses the ratios to filter samples. - -Related samples are removed as described in https://support.researchallofus.org/hc/en-us/articles/4614687617556-How-the-All-of-Us-Genomic-data-are-organized - -Lastly, this script performs sex-related preprocessing. - -# Constructing cohorts - -The `construct_cohorts.py` script uses the final output of sample qc above to build ancestry cohorts. For now, it constructs only two cohorts: -1. A European cohort consisting of samples where predicted ancestry is EUR and self-reported race is white -2. An African cohort consiting of samples where predicted ancestry is AFR and self-reported race is black and African American - -Each cohort file consists of the same two columns as before: `person_id` and `sex_at_birth_Male`. The cohort files are saved to `${WORKSPACE_BUCKET}/samples/`. Additional cohorts may be added in the future. 
diff --git a/gwas/aou/sample_preprocessing.v8/construct_cohorts.py b/gwas/aou/sample_preprocessing.v8/construct_cohorts.py deleted file mode 100644 index b107eb4f..00000000 --- a/gwas/aou/sample_preprocessing.v8/construct_cohorts.py +++ /dev/null @@ -1,88 +0,0 @@ -import pandas as pd -import os -import subprocess - -bucket = os.getenv('WORKSPACE_BUCKET') - -subprocess.run("gsutil cp "+bucket+"/samples/passing_samples_v8.0.csv ./", shell=True, check=True) - -passing_samples = pd.read_csv("passing_samples_v8.0.csv") -#columns = person_id, sex_at_birth_Male - -ancestry_path = "gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv" -subprocess.run("gsutil -u $GOOGLE_PROJECT -m cp "+ancestry_path+" ./", shell=True, check=True) - -ancestry = pd.read_csv("ancestry_preds.tsv", sep="\t") -ancestry = ancestry[['research_id', 'ancestry_pred', 'ancestry_pred_other']] -ancestry.rename(columns={'research_id':'person_id'}, inplace=True) - -passing_samples = passing_samples.merge(ancestry, on='person_id', how='inner') - -dataset_37150131_person_sql = """ - SELECT - person.person_id, - person.gender_concept_id, - p_gender_concept.concept_name as gender, - person.birth_datetime as date_of_birth, - person.race_concept_id, - p_race_concept.concept_name as race, - person.ethnicity_concept_id, - p_ethnicity_concept.concept_name as ethnicity, - person.sex_at_birth_concept_id, - p_sex_at_birth_concept.concept_name as sex_at_birth - FROM - `""" + os.environ["WORKSPACE_CDR"] + """.person` person - LEFT JOIN - `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_gender_concept - ON person.gender_concept_id = p_gender_concept.concept_id - LEFT JOIN - `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_race_concept - ON person.race_concept_id = p_race_concept.concept_id - LEFT JOIN - `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_ethnicity_concept - ON person.ethnicity_concept_id = p_ethnicity_concept.concept_id - LEFT JOIN - `""" + 
os.environ["WORKSPACE_CDR"] + """.concept` p_sex_at_birth_concept - ON person.sex_at_birth_concept_id = p_sex_at_birth_concept.concept_id - WHERE - person.PERSON_ID IN ( - SELECT - distinct person_id - FROM - `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person - WHERE - cb_search_person.person_id IN ( - SELECT - person_id - FROM - `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` p - WHERE - has_whole_genome_variant = 1 - ) - )""" - -demog = pd.read_gbq( - dataset_37150131_person_sql, - dialect="standard", - use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ), - progress_bar_type="tqdm_notebook") - -demog = demog[['person_id', 'race']] -passing_samples = passing_samples.merge(demog, on='person_id', how='inner') - -eur_white = passing_samples[(passing_samples['ancestry_pred']=='eur') & (passing_samples['race']=='White')] -afr_black = passing_samples[(passing_samples['ancestry_pred']=='afr') & (passing_samples['race']=='Black or African American')] -not_afr_black = passing_samples[~((passing_samples['ancestry_pred']=='afr') | (passing_samples['race']=='Black or African American'))] - - -eur_white[['person_id', 'sex_at_birth_Male']].to_csv("EUR_WHITE.csv", index=False) -afr_black[['person_id', 'sex_at_birth_Male']].to_csv("AFR_BLACK.csv", index=False) -not_afr_black[['person_id', 'sex_at_birth_Male']].to_csv("NOT_AFR_BLACK.csv", index=False) - - -subprocess.run("gsutil cp EUR_WHITE.csv "+bucket+"/samples/EUR_WHITE.csv", shell=True, check=True) -subprocess.run("gsutil cp AFR_BLACK.csv "+bucket+"/samples/AFR_BLACK.csv", shell=True, check=True) -subprocess.run("gsutil cp NOT_AFR_BLACK.csv "+bucket+"/samples/NOT_AFR_BLACK.csv", shell=True, check=True) - - - diff --git a/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py b/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py deleted file mode 100644 index 97657e93..00000000 --- a/gwas/aou/sample_preprocessing.v8/finish_sampleqc.py +++ /dev/null @@ -1,89 +0,0 @@ -import pandas 
as pd -import os -import subprocess -import sys - -all_ratios = pd.read_csv("wgs_ratios_all.csv") - -#ratio filtering here -means = all_ratios.mean() -stds = all_ratios.std() -for c in all_ratios.columns: - if c == 'person_id': - continue - avg = means[c] - sd = stds[c] - minval = avg - 8*sd - maxval = avg + 8*sd - all_ratios = all_ratios[(all_ratios[c] >= minval) & (all_ratios[c] <= maxval)] - - -#relatedness filtering -related_samples_path = "gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/relatedness/relatedness_flagged_samples.tsv" -subprocess.run("gsutil -u $GOOGLE_PROJECT -m cp "+related_samples_path+" ./", shell=True, check=True) -related = pd.read_csv("relatedness_flagged_samples.tsv", sep="\t") -all_ratios = all_ratios[~all_ratios['s'].isin(related['sample_id'].values)] - -# remove any samples with known issues -wgs_to_filter_path = "gs://fc-aou-datasets-controlled/v8/known_issues/wgs_v8_known_issue_1.txt" -subprocess.run("gsutil -u $GOOGLE_PROJECT cp "+wgs_to_filter_path+" ./", shell=True, check=True) -wgs_to_filter = pd.read_csv("wgs_v8_known_issue_1.txt", sep="\t") -all_ratios = all_ratios[~all_ratios['s'].isin(wgs_to_filter['research_id'].values)] - -#sex filtering/processing -# This query represents dataset "All_population" for domain "person" and was generated for All of Us Controlled Tier Dataset v8 -dataset_52206452_person_sql = """ - SELECT - person.person_id, - person.gender_concept_id, - p_gender_concept.concept_name as gender, - person.birth_datetime as date_of_birth, - person.race_concept_id, - p_race_concept.concept_name as race, - person.ethnicity_concept_id, - p_ethnicity_concept.concept_name as ethnicity, - person.sex_at_birth_concept_id, - p_sex_at_birth_concept.concept_name as sex_at_birth - FROM - `""" + os.environ["WORKSPACE_CDR"] + """.person` person - LEFT JOIN - `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_gender_concept - ON person.gender_concept_id = p_gender_concept.concept_id - LEFT JOIN - `""" + 
os.environ["WORKSPACE_CDR"] + """.concept` p_race_concept - ON person.race_concept_id = p_race_concept.concept_id - LEFT JOIN - `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_ethnicity_concept - ON person.ethnicity_concept_id = p_ethnicity_concept.concept_id - LEFT JOIN - `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_sex_at_birth_concept - ON person.sex_at_birth_concept_id = p_sex_at_birth_concept.concept_id - WHERE - person.PERSON_ID IN ( - SELECT - distinct person_id - FROM - `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person - WHERE - cb_search_person.person_id IN ( - SELECT - person_id - FROM - `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` p - WHERE - has_whole_genome_variant = 1 - ) - )""" - -demog = pd.read_gbq( - dataset_52206452_person_sql, - dialect="standard", - use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ), - progress_bar_type="tqdm_notebook") - -demog = demog[(demog['sex_at_birth']=='Male') | (demog['sex_at_birth']=='Female')] - -all_ratios = all_ratios.merge(demog, left_on='s', right_on='person_id', how='inner') -all_ratios = pd.get_dummies(all_ratios, columns=['sex_at_birth'], dtype='int') -all_ratios[['person_id','sex_at_birth_Male']].to_csv("postqc_samples.csv", index=False) -#subprocess.run("gsutil cp postqc_samples.csv "+bucket+"/samples/passing_samples_v8.0.csv", shell=True, check=True) diff --git a/gwas/aou/sample_preprocessing/README.md b/gwas/aou/sample_preprocessing/README.md index 09400f88..20a6b33b 100644 --- a/gwas/aou/sample_preprocessing/README.md +++ b/gwas/aou/sample_preprocessing/README.md @@ -1,32 +1,21 @@ # Sample preprocessing for AoU -This directory contains scripts for performing sample qc. The list of samples that pass qc checks can be found at `${WORKSPACE_BUCKET}/samples/passing_samples_v7.1.csv`. +This directory contains scripts for performing sample qc. 
The list of samples that pass qc checks can be found at `${WORKSPACE_BUCKET}/samples/passing_samples_v8.0.csv`. The sample qc scripts in this directory should be executed in the following order: -1. `compute_ratios.py` : obtains #singletons insertions/deletions ratio, heterozygous/homozygous ratio, transition/transversion ratio per sample -2. `merge_batches.py`: the previous script runs in batches, this script combines those for further processing -3. `finish_sampleqc.py`: this uses the values computed in the previous script to perform sample qc and outputs a two-column list (`person_id` and `sex_at_birth_Male`) of samples that pass qc checks +1. `compute_ratios.bash` : obtains #singletons insertions/deletions ratio, heterozygous/homozygous ratio, transition/transversion ratio per sample +2. `finish_sampleqc.py`: this uses the values computed in the previous script to perform sample qc and outputs a two-column list (`person_id` and `sex_at_birth_Male`) of samples that pass qc checks +3. `construct_cohorts.py`: uses the final output of sample qc above to build ancestry cohorts -### Running `compute_ratios.py` +### Running `compute_ratios.bash` -This script must be run on the AoU workbench and requires a dataproc cluster for hail/spark. Use the command `nohup python3 compute_ratios.py > output.log 2>&1 &`. +This script must be run on the AoU workbench. Use the command `bash compute_ratios.bash > wgs_ratios_all.csv`. -The image belows provides the parameters used to set up the cluster. - -![unnamed](https://github.com/CAST-genomics/cast-workflows/assets/16807372/52801a8d-d28b-4123-8011-5161d2a2a950) - -With the parameters above, batches of 50k samples are recommended. As there are currently 250k samples available in AoU, the default number of batches is 5. -The `compute_ratios.py` script should, but does not, run through all the batches. The cluster runs out of memory after each batch. The solution, though not ideal, is -to do the following: -- Run one batch. 
Soon after the next batch starts, it will likely run out of memory. -- Delete and recreate the compute environment. -- In `compute_ratios.py`, change the `for` loop from `for i in range(num_batches):` to `for i in range(X, num_batches):` where `X` is the batch you wish to run. For example, if you have finished two batches and want to run the third batch, `X` would be 2 (following 0 indexing) - -Better solutions may be implemented in the future, but as we expect sample preprocessing to be an infrequent step (only performed when new samples are available), we defer this to later. +You can just use the least expensive possible environment configuration for this one. ### Sample qc -After `compute_ratios.py` has finished all batches, run `merge_batches.py` to combine the batches into one file. Then run `finish_sampleqc.py` to perform sample qc and obtain a list of samples that pass qc checks. +After `compute_ratios.bash` has finished, run `finish_sampleqc.py` to perform sample qc and obtain a list of samples that pass qc checks. The sample preprocessing is performed in `finish_sampleqc.py`. This script uses the ratios to filter samples. 
diff --git a/gwas/aou/sample_preprocessing.v8/compute_ratios.bash b/gwas/aou/sample_preprocessing/compute_ratios.bash similarity index 100% rename from gwas/aou/sample_preprocessing.v8/compute_ratios.bash rename to gwas/aou/sample_preprocessing/compute_ratios.bash diff --git a/gwas/aou/sample_preprocessing/compute_ratios.py b/gwas/aou/sample_preprocessing/compute_ratios.py deleted file mode 100644 index 2ad0a6b9..00000000 --- a/gwas/aou/sample_preprocessing/compute_ratios.py +++ /dev/null @@ -1,51 +0,0 @@ -import pandas as pd -import numpy as np -import os -import hail as hl -import subprocess -import sys - -#default params -num_batches = 5 #recommend batches of 50k samples -batch_file_name = "wgs_ratios_batch" - -if len(sys.argv) == 2: - num_batches = sys.argv[1] -if len(sys.argv) == 3: - batch_file_name = sys.argv[2] -#TODO add argument for output location - -bucket = os.getenv('WORKSPACE_BUCKET') - -hl.init(default_reference = "GRCh38") - -mt_wgs_path = os.getenv("WGS_ACAF_THRESHOLD_MULTI_HAIL_PATH") -mt = hl.read_matrix_table(mt_wgs_path) -sampleIDs = mt.cols().to_pandas() -mt = mt.annotate_cols(sint = hl.int(mt.s)) - - -qs = np.linspace(0,1, num_batches+1) -quantiles = sampleIDs.astype('int').quantile(qs) -quantiles.reset_index(inplace=True) - -for i in range(num_batches): - start = quantiles['s'][i] - end = quantiles['s'][i+1] - if i+1 == num_batches: #last round - end += 1 #for last round, want to end to include last sample - #filtered mt - fmt = mt.filter_cols(((mt.sint >= start) & (mt.sint < end))) - fmt = hl.sample_qc(fmt) - fmt = fmt.filter_cols(fmt.sample_qc.call_rate >= .90, keep = True) - - ratios = fmt.select_cols(singleton = fmt.sample_qc.n_singleton, - ins_del = fmt.sample_qc.r_insertion_deletion, - ti_tv = fmt.sample_qc.r_ti_tv, - het_hom = fmt.sample_qc.r_het_hom_var) - tbl = ratios.cols() - pddf = tbl.to_pandas() - f_towrite = batch_file_name+str(i)+".csv" - pddf.to_csv(f_towrite, sep=",", index=False) - subprocess.run("gsutil cp "+f_towrite+" 
"+bucket+"/tmirmira/data/qc/"+f_towrite, shell=True, check=True) - diff --git a/gwas/aou/sample_preprocessing/construct_cohorts.py b/gwas/aou/sample_preprocessing/construct_cohorts.py index 15519dce..b107eb4f 100644 --- a/gwas/aou/sample_preprocessing/construct_cohorts.py +++ b/gwas/aou/sample_preprocessing/construct_cohorts.py @@ -4,12 +4,12 @@ bucket = os.getenv('WORKSPACE_BUCKET') -subprocess.run("gsutil cp "+bucket+"/samples/passing_samples_v7.1.csv ./", shell=True, check=True) +subprocess.run("gsutil cp "+bucket+"/samples/passing_samples_v8.0.csv ./", shell=True, check=True) -passing_samples = pd.read_csv("passing_samples_v7.1.csv") +passing_samples = pd.read_csv("passing_samples_v8.0.csv") #columns = person_id, sex_at_birth_Male -ancestry_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv" +ancestry_path = "gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv" subprocess.run("gsutil -u $GOOGLE_PROJECT -m cp "+ancestry_path+" ./", shell=True, check=True) ancestry = pd.read_csv("ancestry_preds.tsv", sep="\t") diff --git a/gwas/aou/sample_preprocessing/finish_sampleqc.py b/gwas/aou/sample_preprocessing/finish_sampleqc.py index 851d49e8..97657e93 100644 --- a/gwas/aou/sample_preprocessing/finish_sampleqc.py +++ b/gwas/aou/sample_preprocessing/finish_sampleqc.py @@ -3,9 +3,6 @@ import subprocess import sys -bucket = os.getenv('WORKSPACE_BUCKET') - -subprocess.run("gsutil cp "+bucket+"/tmirmira/data/qc/wgs_ratios_all.csv ./", shell=True, check=True) all_ratios = pd.read_csv("wgs_ratios_all.csv") #ratio filtering here @@ -22,19 +19,19 @@ #relatedness filtering -related_samples_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/relatedness/relatedness_flagged_samples.tsv" +related_samples_path = "gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/relatedness/relatedness_flagged_samples.tsv" subprocess.run("gsutil -u $GOOGLE_PROJECT -m cp 
"+related_samples_path+" ./", shell=True, check=True) related = pd.read_csv("relatedness_flagged_samples.tsv", sep="\t") all_ratios = all_ratios[~all_ratios['s'].isin(related['sample_id'].values)] -#v7.1 update as per https://support.researchallofus.org/hc/en-us/articles/25646444720404-Incremental-Data-Release-for-v7-Genotyping-Array-and-Short-Read-Genomic-Data -wgs_to_filter_path = "gs://fc-aou-datasets-controlled/v7/known_issues/research_id_v7_wgs_known_issue_15.tsv" +# remove any samples with known issues +wgs_to_filter_path = "gs://fc-aou-datasets-controlled/v8/known_issues/wgs_v8_known_issue_1.txt" subprocess.run("gsutil -u $GOOGLE_PROJECT cp "+wgs_to_filter_path+" ./", shell=True, check=True) -wgs_to_filter = pd.read_csv("research_id_v7_wgs_known_issue_15.tsv", sep="\t") +wgs_to_filter = pd.read_csv("wgs_v8_known_issue_1.txt", sep="\t") all_ratios = all_ratios[~all_ratios['s'].isin(wgs_to_filter['research_id'].values)] #sex filtering/processing -# This query represents dataset "All_population" for domain "person" and was generated for All of Us Controlled Tier Dataset v7 +# This query represents dataset "All_population" for domain "person" and was generated for All of Us Controlled Tier Dataset v8 dataset_52206452_person_sql = """ SELECT person.person_id, @@ -89,4 +86,4 @@ all_ratios = all_ratios.merge(demog, left_on='s', right_on='person_id', how='inner') all_ratios = pd.get_dummies(all_ratios, columns=['sex_at_birth'], dtype='int') all_ratios[['person_id','sex_at_birth_Male']].to_csv("postqc_samples.csv", index=False) -subprocess.run("gsutil cp postqc_samples.csv "+bucket+"/samples/passing_samples_v7.1.csv", shell=True, check=True) +#subprocess.run("gsutil cp postqc_samples.csv "+bucket+"/samples/passing_samples_v8.0.csv", shell=True, check=True) diff --git a/gwas/aou/sample_preprocessing/merge_batches.py b/gwas/aou/sample_preprocessing/merge_batches.py deleted file mode 100644 index b74e1a50..00000000 --- a/gwas/aou/sample_preprocessing/merge_batches.py 
+++ /dev/null @@ -1,27 +0,0 @@ -import os -import pandas as pd -import subprocess -import sys - -#default params -num_batches = 5 -batch_file_name = "wgs_ratios_batch" - -if len(sys.argv) == 2: - num_batches = sys.argv[1] -if len(sys.argv) == 3: - batch_file_name = sys.argv[2] -#TODO add argument for input location - -bucket = os.getenv('WORKSPACE_BUCKET') - -dfs = [] -for i in range(num_batches): - f_toread = batch_file_name+str(i)+".csv" - subprocess.run("gsutil cp "+bucket+"/tmirmira/data/qc/"+f_toread+" ./", shell=True, check=True) - df = pd.read_csv(f_toread) - dfs.append(df) - -all_ratios = pd.concat(dfs) -all_ratios.to_csv("wgs_ratios_all.csv", index=False) -subprocess.run("gsutil cp wgs_ratios_all.csv "+bucket+"/tmirmira/data/qc", shell=True, check=True)