diff --git a/gwas/aou/phenotype_preprocessing/README.md b/gwas/aou/phenotype_preprocessing/README.md
index c6c61c8e..9d5f76fa 100644
--- a/gwas/aou/phenotype_preprocessing/README.md
+++ b/gwas/aou/phenotype_preprocessing/README.md
@@ -29,7 +29,7 @@ Required arguments:
 * `--range `: Minimum and maximum accepted values for the phenotype
 
 Optional arguments:
-* `--samples `: csv file with list of samples to restrict to. Needs columns "person_id" and "sex_at_birth_Male". Typically this would a list of samples that passed upstream sample-level QC info. Defaults to `${WORKSPACE_BUCKET}/samples/passing_samples_v7.csv`.
+* `--samples `: csv file with list of samples to restrict to. Needs columns "person_id" and "sex_at_birth_Male". Typically this would be a list of samples that passed upstream sample-level QC. Defaults to `${WORKSPACE_BUCKET}/samples/passing_samples_v8.0.csv`.
 * `--drugexposure-covariate-concept-ids `: List of conceptid:conceptname to use as drug exposure covariates.
 * `--ppi`: Indicate if the phenotype concept id is from the PPI or LOINC (default) tables. PPI are general physical measurements (e.g. hight) and include very limited concept ids.
 * `--outlier-sd`: Filter samples whose phenotype values lie below or above this number of standard deviations from the mean phenotype value.
diff --git a/gwas/aou/phenotype_preprocessing/aou_case_control_phenotype_preprocessing.py b/gwas/aou/phenotype_preprocessing/aou_case_control_phenotype_preprocessing.py
index 165c011a..c7f00231 100755
--- a/gwas/aou/phenotype_preprocessing/aou_case_control_phenotype_preprocessing.py
+++ b/gwas/aou/phenotype_preprocessing/aou_case_control_phenotype_preprocessing.py
@@ -17,7 +17,7 @@
 bucket = os.environ["WORKSPACE_BUCKET"]
 
 SAMPLEFILE = os.path.join(os.environ["WORKSPACE_BUCKET"], "samples", \
-                          "passing_samples_v7.csv")
+                          "passing_samples_v8.0.csv")
 
 def MSG(msg_str):
     """
diff --git a/gwas/aou/phenotype_preprocessing/aou_phenotype_preprocessing.py b/gwas/aou/phenotype_preprocessing/aou_phenotype_preprocessing.py
index d6e40693..3a80094e 100755
--- a/gwas/aou/phenotype_preprocessing/aou_phenotype_preprocessing.py
+++ b/gwas/aou/phenotype_preprocessing/aou_phenotype_preprocessing.py
@@ -19,7 +19,7 @@
 import sys
 
 SAMPLEFILE = os.path.join(os.environ["WORKSPACE_BUCKET"], "samples", \
-                          "passing_samples_v7.csv")
+                          "passing_samples_v8.0.csv")
 
 def MSG(msg_str):
     """
diff --git a/gwas/aou/phenotype_preprocessing/process_phenotypes.py b/gwas/aou/phenotype_preprocessing/process_phenotypes.py
index ab8dcbc1..c0fc7512 100755
--- a/gwas/aou/phenotype_preprocessing/process_phenotypes.py
+++ b/gwas/aou/phenotype_preprocessing/process_phenotypes.py
@@ -67,7 +67,7 @@ def UploadGCS(outfile, gcsfile):
     outfile = row["phenotype"]+"_phenocovar.csv"
     gcsfile = os.path.join(os.environ["WORKSPACE_BUCKET"], "phenotypes", outfile)
     RunCmd(cmd)
-    UploadGCS(outfile, gcsfile)
+    # UploadGCS(outfile, gcsfile)
     numsamples = len(open(outfile, "r").readlines())-1
     outitems = [row["phenotype"], row["concept_id"], \
         row["units"], row["outlier_sd"], row["min"], row["max"], row["drugcovars"], \
diff --git a/gwas/aou/sample_preprocessing/README.md b/gwas/aou/sample_preprocessing/README.md
index 09400f88..20a6b33b 100644
--- a/gwas/aou/sample_preprocessing/README.md
+++ b/gwas/aou/sample_preprocessing/README.md
@@ -1,32 +1,21 @@
 # Sample preprocessing for AoU
 
-This directory contains scripts for performing sample qc. The list of samples that pass qc checks can be found at `${WORKSPACE_BUCKET}/samples/passing_samples_v7.1.csv`.
+This directory contains scripts for performing sample qc. The list of samples that pass qc checks can be found at `${WORKSPACE_BUCKET}/samples/passing_samples_v8.0.csv`.
 
 The sample qc scripts in this directory should be executed in the following order:
-1. `compute_ratios.py` : obtains #singletons insertions/deletions ratio, heterozygous/homozygous ratio, transition/transversion ratio per sample
-2. `merge_batches.py`: the previous script runs in batches, this script combines those for further processing
-3. `finish_sampleqc.py`: this uses the values computed in the previous script to perform sample qc and outputs a two-column list (`person_id` and `sex_at_birth_Male`) of samples that pass qc checks
+1. `compute_ratios.bash`: obtains the singleton count and the insertion/deletion, heterozygous/homozygous, and transition/transversion ratios per sample
+2. `finish_sampleqc.py`: uses the values computed by the previous script to perform sample qc and outputs a two-column list (`person_id` and `sex_at_birth_Male`) of samples that pass qc checks
+3. `construct_cohorts.py`: uses the final output of sample qc above to build ancestry cohorts
 
-### Running `compute_ratios.py`
+### Running `compute_ratios.bash`
 
-This script must be run on the AoU workbench and requires a dataproc cluster for hail/spark. Use the command `nohup python3 compute_ratios.py > output.log 2>&1 &`.
+This script must be run on the AoU workbench. Use the command `bash compute_ratios.bash > wgs_ratios_all.csv`.
 
-The image belows provides the parameters used to set up the cluster.
-
-![unnamed](https://github.com/CAST-genomics/cast-workflows/assets/16807372/52801a8d-d28b-4123-8011-5161d2a2a950)
-
-With the parameters above, batches of 50k samples are recommended. As there are currently 250k samples available in AoU, the default number of batches is 5.
-The `compute_ratios.py` script should, but does not, run through all the batches. The cluster runs out of memory after each batch. The solution, though not ideal, is
-to do the following:
-- Run one batch. Soon after the next batch starts, it will likely run out of memory.
-- Delete and recreate the compute environment.
-- In `compute_ratios.py`, change the `for` loop from `for i in range(num_batches):` to `for i in range(X, num_batches):` where `X` is the batch you wish to run. For example, if you have finished two batches and want to run the third batch, `X` would be 2 (following 0 indexing)
-
-Better solutions may be implemented in the future, but as we expect sample preprocessing to be an infrequent step (only performed when new samples are available), we defer this to later.
+The least expensive environment configuration is sufficient for this step.
 
 ### Sample qc
 
-After `compute_ratios.py` has finished all batches, run `merge_batches.py` to combine the batches into one file. Then run `finish_sampleqc.py` to perform sample qc and obtain a list of samples that pass qc checks.
+After `compute_ratios.bash` has finished, run `finish_sampleqc.py` to perform sample qc and obtain a list of samples that pass qc checks.
 
 The sample preprocessing is performed in `finish_sampleqc.py`. This script uses the ratios to filter samples.
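Note: with the final upload in `finish_sampleqc.py` now commented out (see below), nothing in this changeset writes `passing_samples_v8.0.csv` back to the bucket, yet `construct_cohorts.py` and the phenotype-preprocessing defaults still read it from there. A manual upload step along these lines appears to be assumed; the command below simply mirrors the gsutil call commented out in `finish_sampleqc.py` and is a sketch, not part of the patch:

```bash
# Assumed manual step once finish_sampleqc.py has written postqc_samples.csv;
# mirrors the gsutil call commented out at the end of finish_sampleqc.py.
gsutil cp postqc_samples.csv "${WORKSPACE_BUCKET}/samples/passing_samples_v8.0.csv"
```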
diff --git a/gwas/aou/sample_preprocessing/compute_ratios.bash b/gwas/aou/sample_preprocessing/compute_ratios.bash
new file mode 100644
index 00000000..fa00463d
--- /dev/null
+++ b/gwas/aou/sample_preprocessing/compute_ratios.bash
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+# Create a CSV containing the necessary data
+# Allocate 4 CPUs, 3.6 GB memory, and 120 GB of standard storage (the least-expensive possible configuration)
+
+V8_BUCKET="$1" # ex: gs://fc-secure-MYBUCKET (currently unused)
+V8_CDR_DIR="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux"
+
+# columns of all_samples.tsv file:
+# 1:s
+# 7:ins_del_ratio
+# 10:snp_het_homvar_ratio
+# 11:indel_het_homvar_ratio
+# 12:ti_tv_ratio
+# 13:singleton
+
+gsutil -u "$GOOGLE_PROJECT" cat "$V8_CDR_DIR/qc/all_samples.tsv" | \
+cut -f1,7,10,11,12,13 | tr '\t' ','
diff --git a/gwas/aou/sample_preprocessing/compute_ratios.py b/gwas/aou/sample_preprocessing/compute_ratios.py
deleted file mode 100644
index 2ad0a6b9..00000000
--- a/gwas/aou/sample_preprocessing/compute_ratios.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import pandas as pd
-import numpy as np
-import os
-import hail as hl
-import subprocess
-import sys
-
-#default params
-num_batches = 5 #recommend batches of 50k samples
-batch_file_name = "wgs_ratios_batch"
-
-if len(sys.argv) == 2:
-    num_batches = sys.argv[1]
-if len(sys.argv) == 3:
-    batch_file_name = sys.argv[2]
-#TODO add argument for output location
-
-bucket = os.getenv('WORKSPACE_BUCKET')
-
-hl.init(default_reference = "GRCh38")
-
-mt_wgs_path = os.getenv("WGS_ACAF_THRESHOLD_MULTI_HAIL_PATH")
-mt = hl.read_matrix_table(mt_wgs_path)
-sampleIDs = mt.cols().to_pandas()
-mt = mt.annotate_cols(sint = hl.int(mt.s))
-
-
-qs = np.linspace(0,1, num_batches+1)
-quantiles = sampleIDs.astype('int').quantile(qs)
-quantiles.reset_index(inplace=True)
-
-for i in range(num_batches):
-    start = quantiles['s'][i]
-    end = quantiles['s'][i+1]
-    if i+1 == num_batches: #last round
-        end += 1 #for last round, want to end to include last sample
-    #filtered mt
-    fmt = mt.filter_cols(((mt.sint >= start) & (mt.sint < end)))
-    fmt = hl.sample_qc(fmt)
-    fmt = fmt.filter_cols(fmt.sample_qc.call_rate >= .90, keep = True)
-
-    ratios = fmt.select_cols(singleton = fmt.sample_qc.n_singleton,
-                             ins_del = fmt.sample_qc.r_insertion_deletion,
-                             ti_tv = fmt.sample_qc.r_ti_tv,
-                             het_hom = fmt.sample_qc.r_het_hom_var)
-    tbl = ratios.cols()
-    pddf = tbl.to_pandas()
-    f_towrite = batch_file_name+str(i)+".csv"
-    pddf.to_csv(f_towrite, sep=",", index=False)
-    subprocess.run("gsutil cp "+f_towrite+" "+bucket+"/tmirmira/data/qc/"+f_towrite, shell=True, check=True)
-
diff --git a/gwas/aou/sample_preprocessing/construct_cohorts.py b/gwas/aou/sample_preprocessing/construct_cohorts.py
index 15519dce..b107eb4f 100644
--- a/gwas/aou/sample_preprocessing/construct_cohorts.py
+++ b/gwas/aou/sample_preprocessing/construct_cohorts.py
@@ -4,12 +4,12 @@
 
 bucket = os.getenv('WORKSPACE_BUCKET')
 
-subprocess.run("gsutil cp "+bucket+"/samples/passing_samples_v7.1.csv ./", shell=True, check=True)
+subprocess.run("gsutil cp "+bucket+"/samples/passing_samples_v8.0.csv ./", shell=True, check=True)
 
-passing_samples = pd.read_csv("passing_samples_v7.1.csv")
+passing_samples = pd.read_csv("passing_samples_v8.0.csv")
 #columns = person_id, sex_at_birth_Male
 
-ancestry_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv"
+ancestry_path = "gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv"
 subprocess.run("gsutil -u $GOOGLE_PROJECT -m cp "+ancestry_path+" ./", shell=True, check=True)
 
 ancestry = pd.read_csv("ancestry_preds.tsv", sep="\t")
diff --git a/gwas/aou/sample_preprocessing/finish_sampleqc.py b/gwas/aou/sample_preprocessing/finish_sampleqc.py
index 851d49e8..97657e93 100644
--- a/gwas/aou/sample_preprocessing/finish_sampleqc.py
+++ b/gwas/aou/sample_preprocessing/finish_sampleqc.py
@@ -3,9 +3,6 @@
 import subprocess
 import sys
 
-bucket = os.getenv('WORKSPACE_BUCKET')
-
-subprocess.run("gsutil cp "+bucket+"/tmirmira/data/qc/wgs_ratios_all.csv ./", shell=True, check=True)
 all_ratios = pd.read_csv("wgs_ratios_all.csv")
 
 #ratio filtering here
@@ -22,19 +19,19 @@
 
 #relatedness filtering
-related_samples_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/relatedness/relatedness_flagged_samples.tsv"
+related_samples_path = "gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/relatedness/relatedness_flagged_samples.tsv"
 subprocess.run("gsutil -u $GOOGLE_PROJECT -m cp "+related_samples_path+" ./", shell=True, check=True)
 related = pd.read_csv("relatedness_flagged_samples.tsv", sep="\t")
 all_ratios = all_ratios[~all_ratios['s'].isin(related['sample_id'].values)]
 
-#v7.1 update as per https://support.researchallofus.org/hc/en-us/articles/25646444720404-Incremental-Data-Release-for-v7-Genotyping-Array-and-Short-Read-Genomic-Data
-wgs_to_filter_path = "gs://fc-aou-datasets-controlled/v7/known_issues/research_id_v7_wgs_known_issue_15.tsv"
+# remove any samples with known issues
+wgs_to_filter_path = "gs://fc-aou-datasets-controlled/v8/known_issues/wgs_v8_known_issue_1.txt"
 subprocess.run("gsutil -u $GOOGLE_PROJECT cp "+wgs_to_filter_path+" ./", shell=True, check=True)
-wgs_to_filter = pd.read_csv("research_id_v7_wgs_known_issue_15.tsv", sep="\t")
+wgs_to_filter = pd.read_csv("wgs_v8_known_issue_1.txt", sep="\t")
 all_ratios = all_ratios[~all_ratios['s'].isin(wgs_to_filter['research_id'].values)]
 
 #sex filtering/processing
-# This query represents dataset "All_population" for domain "person" and was generated for All of Us Controlled Tier Dataset v7
+# This query represents dataset "All_population" for domain "person" and was generated for All of Us Controlled Tier Dataset v8
 dataset_52206452_person_sql = """
 SELECT
     person.person_id,
@@ -89,4 +86,4 @@
 all_ratios = all_ratios.merge(demog, left_on='s', right_on='person_id', how='inner')
 all_ratios = pd.get_dummies(all_ratios, columns=['sex_at_birth'], dtype='int')
 all_ratios[['person_id','sex_at_birth_Male']].to_csv("postqc_samples.csv", index=False)
-subprocess.run("gsutil cp postqc_samples.csv "+bucket+"/samples/passing_samples_v7.1.csv", shell=True, check=True)
+#subprocess.run("gsutil cp postqc_samples.csv "+bucket+"/samples/passing_samples_v8.0.csv", shell=True, check=True)
diff --git a/gwas/aou/sample_preprocessing/merge_batches.py b/gwas/aou/sample_preprocessing/merge_batches.py
deleted file mode 100644
index b74e1a50..00000000
--- a/gwas/aou/sample_preprocessing/merge_batches.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import os
-import pandas as pd
-import subprocess
-import sys
-
-#default params
-num_batches = 5
-batch_file_name = "wgs_ratios_batch"
-
-if len(sys.argv) == 2:
-    num_batches = sys.argv[1]
-if len(sys.argv) == 3:
-    batch_file_name = sys.argv[2]
-#TODO add argument for input location
-
-bucket = os.getenv('WORKSPACE_BUCKET')
-
-dfs = []
-for i in range(num_batches):
-    f_toread = batch_file_name+str(i)+".csv"
-    subprocess.run("gsutil cp "+bucket+"/tmirmira/data/qc/"+f_toread+" ./", shell=True, check=True)
-    df = pd.read_csv(f_toread)
-    dfs.append(df)
-
-all_ratios = pd.concat(dfs)
-all_ratios.to_csv("wgs_ratios_all.csv", index=False)
-subprocess.run("gsutil cp wgs_ratios_all.csv "+bucket+"/tmirmira/data/qc", shell=True, check=True)
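A quick way to sanity-check the CSV produced by `compute_ratios.bash`: the expected header follows from the `all_samples.tsv` columns selected in that script. The commands below are illustrative checks, not part of the pipeline:

```bash
# The header line should contain the six selected columns, comma-separated.
head -1 wgs_ratios_all.csv
# expected: s,ins_del_ratio,snp_het_homvar_ratio,indel_het_homvar_ratio,ti_tv_ratio,singleton

# One data row per sample, plus the header line.
wc -l wgs_ratios_all.csv
```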