diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..1284efa --- /dev/null +++ b/Dockerfile @@ -0,0 +1,14 @@ +FROM snakemake/snakemake:v5.24.2 + +# work in the home directory +WORKDIR /home + +# copy all the files to the container +COPY . . + +# install the pipeline's dependencies +RUN snakemake --use-conda --conda-create-envs-only -j + +# run varCA +# specify as a CMD, so it can be overridden by the user +CMD ["./run.bash"] diff --git a/README.md b/README.md index 8e06620..b23d674 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![Snakemake](https://img.shields.io/badge/snakemake-5.18.0-brightgreen.svg?style=flat-square)](https://snakemake.readthedocs.io/) +[![Snakemake](https://img.shields.io/badge/snakemake-5.24.2-brightgreen.svg?style=flat-square)](https://snakemake.readthedocs.io/) [![License](https://img.shields.io/apm/l/vim-mode.svg)](LICENSE) # varCA @@ -21,11 +21,12 @@ wget -O- -q https://github.com/aryarm/varCA/releases/latest/download/data.tar.gz ``` # setup -The pipeline is written as a Snakefile which can be executed via [Snakemake](https://snakemake.readthedocs.io). We recommend installing version 5.18.0: +The pipeline is written as a Snakefile which can be executed via [Snakemake](https://snakemake.readthedocs.io). For most users, we recommend installing Snakemake via mamba as [described in their documentation](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html#installation-via-conda-mamba). + +However, if your aim is complete reproducibility, we recommend installing version 5.24.2 via this command: ``` -conda create -n snakemake -c bioconda -c conda-forge --no-channel-priority 'snakemake==5.18.0' +conda create -n snakemake -c conda-forge --no-channel-priority 'bioconda::snakemake==5.24.2' ``` -We highly recommend you install [Snakemake via conda](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html#installation-via-conda) like this so that you can use the `--use-conda` flag when calling `snakemake` to let it [automatically handle all dependencies](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#integrated-package-management) of the pipeline. Otherwise, you must manually install the dependencies listed in the [env files](envs). # execution 1. Activate snakemake via `conda`: @@ -52,7 +53,7 @@ You must modify [the config.yaml file](configs#configyaml) to specify paths to y The pipeline is made up of [two subworkflows](rules). These are usually executed together automatically by the master pipeline, but they can also be executed on their own for more advanced usage. See the [rules README](rules/README.md) for execution instructions and a description of the outputs. You will need to execute the subworkflows separately [if you ever want to create your own trained models](rules#training-and-testing-varca). #### Reproducing our results -We provide the example data so that you may quickly (in ~1 hr, excluding dependency installation) verify that the pipeline can be executed on your machine. This process does not reproduce our results. Those with more time can follow [these steps](rules#testing-your-model--reproducing-our-results) to create all of the plots and tables in our paper. +We provide the example data so that you may quickly (in ~15 mins, excluding dependency installation) verify that the pipeline can be executed on your machine. This process does not reproduce our results.
Those with more time can follow [these steps](rules#testing-your-model--reproducing-our-results) to create all of the plots and tables in our paper. ### If this is your first time using Snakemake We recommend that you run `snakemake --help` to learn about Snakemake's options. For example, to check that the pipeline will be executed correctly before you run it, you can call Snakemake with the `-n -p -r` flags. This is also a good way to familiarize yourself with the steps of the pipeline and their inputs and outputs (the latter of which are inputs to the first rule in each workflow -- ie the `all` rule). diff --git a/Snakefile b/Snakefile index 0844018..50e3af7 100644 --- a/Snakefile +++ b/Snakefile @@ -2,7 +2,7 @@ import warnings from snakemake.utils import min_version ##### set minimum snakemake version ##### -min_version("5.18.0") +min_version("5.24.2") configfile: "configs/config.yaml" configfile: "configs/callers.yaml" diff --git a/callers/README.md b/callers/README.md index 179465a..c312831 100644 --- a/callers/README.md +++ b/callers/README.md @@ -40,3 +40,9 @@ By providing a dash character `-` in the caller identifier, the caller script (e The pipeline will run this separate script first but with the same parameters as the caller script, and the directory containing its output will be passed as an argument to the original caller script. (For example, the directory containing the output of the `gatk` special script will be passed as a parameter to the `gatk-snp` caller script.) By providing multiple dashes in your caller identifiers using this scheme, you may design complicated caller script heirarchies involving multiple levels of nesting. In a sense, you can create small pipelines within the pipeline. + +### Caller script dependencies +If your caller script requires specific dependencies, just add them to the [prepare.yml](/envs/prepare.yml) [conda environment file](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#create-env-file-manually). +Any packages listed in this file will be available to your caller script when it is executed. + +If you would like to use a newer version of an existing variant caller, just change the version specified in the [prepare.yml](/envs/prepare.yml) file. Snakemake will automatically update the package upon your next execution of the pipeline.
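For illustration, here is a minimal sketch of that workflow, assuming a hypothetical custom caller script that needs `freebayes` (not a dependency of this pipeline); the new entry follows the same `channel::package==version` pinning used by the existing entries in the [env files](/envs):

```yaml
# envs/prepare.yml (sketch) -- only the last entry is new and hypothetical
channels:
  - conda-forge
  - nodefaults
dependencies:
  # ... keep the existing pinned dependencies unchanged ...
  - bioconda::freebayes==1.3.2  # hypothetical tool required by a custom caller script
```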
diff --git a/callers/pindel b/callers/pindel index 17fc32e..060becc 100755 --- a/callers/pindel +++ b/callers/pindel @@ -14,18 +14,13 @@ samp="$5" # we sleep between each command to avoid potential problems with filesystem latency -pindel -f "$genome" -i <(echo "$bam" 300 "$samp") -j "$peaks" -o "$output_dir/" -L "$output_dir/log" && \ +pindel -f "$genome" -i <(echo "$bam" 300 "$samp") -j "$peaks" -o "$output_dir/" -L "$output_dir/log" >"$output_dir/stdout" && \ sleep 10 && \ -for file in _D _INV _SI _TD; do - pindel2vcf -p "$output_dir/$file" -r "$genome" -R x -d 00000000 -v "$output_dir/$file.raw.vcf" && \ - sleep 10 && \ - bgzip -f "$output_dir/$file.raw.vcf" && \ - sleep 10 && \ - tabix -p vcf -f "$output_dir/$file.raw.vcf.gz" && \ - sleep 10 && \ - bcftools norm -d all "$output_dir/$file.raw.vcf.gz" > "$output_dir/$file.vcf" && \ - sleep 10 && \ - bgzip -f "$output_dir/$file.vcf" && tabix -p vcf -f "$output_dir/$file.vcf.gz" -done && \ +pindel2vcf -P "$output_dir/" -r "$genome" -R x -d 00000000 -v "$output_dir/pindel.raw.vcf" && \ sleep 10 && \ -bcftools concat -a -d all "$output_dir/"{_D,_INV,_SI,_TD}.vcf.gz > "$output_dir/pindel.vcf" +bgzip -f "$output_dir/pindel.raw.vcf" && \ +sleep 10 && \ +tabix -p vcf -f "$output_dir/pindel.raw.vcf.gz" && \ +sleep 10 && \ +bcftools norm -d all "$output_dir/pindel.raw.vcf.gz" > "$output_dir/pindel.vcf" + diff --git a/callers/varscan b/callers/varscan index 2b28adc..78a9a04 100755 --- a/callers/varscan +++ b/callers/varscan @@ -13,4 +13,5 @@ output_dir="$4" samtools mpileup -l "$peaks" -f "$genome" -B "$bam" | \ varscan mpileup2cns --p-value 1 --strand-filter 0 --output-vcf > "$output_dir/varscan.vcf" && \ -bgzip -f "$output_dir/varscan.vcf" && tabix -p vcf -f "$output_dir/varscan.vcf.gz" +awk -F $"\t" -v 'OFS=\t' '/^#/ || $4 !~ /^(R|Y|M|W|S|K|V|H|D|B|N)$/' "$output_dir/varscan.vcf" | \ +bgzip > "$output_dir/varscan.vcf.gz" && tabix -p vcf -f "$output_dir/varscan.vcf.gz" diff --git a/configs/classify.yaml b/configs/classify.yaml index a8aa9e1..1f8f63e 100644 --- a/configs/classify.yaml +++ b/configs/classify.yaml @@ -43,13 +43,13 @@ data: - 'gatk-indel~DP>10' # we only train on sites with a read depth above 10 # here are some example test samples, but you can replace these with whichever # samples for which you'd like to predict variants - molt4_chr1: - path: out/merged_indel/molt4_chr1/final.tsv.gz - merged: out/merged_indel/molt4_chr1/merge.tsv.gz + molt4_mini: + path: out/merged_indel/molt4_mini/final.tsv.gz + merged: out/merged_indel/molt4_mini/merge.tsv.gz filter: *filter # use the same filtering expression as the SRR891269 data - jurkat_chr1: - path: out/merged_indel/jurkat_chr1/final.tsv.gz - merged: out/merged_indel/jurkat_chr1/merge.tsv.gz + jurkat_mini: + path: out/merged_indel/jurkat_mini/final.tsv.gz + merged: out/merged_indel/jurkat_mini/merge.tsv.gz filter: *filter # use the same filtering expression as the SRR891269 data # # uncomment the lines below if you want to reproduce our results # SRR891269_evens: @@ -87,8 +87,8 @@ tune: false # You may optionally specify a truth set for each dataset if you want it to # serve as test data (see "truth" above). # This config entry is required! -predict: [molt4_chr1, jurkat_chr1] -# predict: [molt4_chr1, jurkat_chr1, SRR891269_evens] # uncomment this line if you want to reproduce our results +predict: [molt4_mini, jurkat_mini] +# predict: [SRR891269_evens] # uncomment this line if you want to reproduce our results # Callers to use when creating a VCF from the merged .tsv.gz file. 
# Supply a list of caller names in the order in which they should be considered diff --git a/configs/config.yaml b/configs/config.yaml index 1e386f5..a20701f 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -8,6 +8,9 @@ # Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/) +#---------------------------------------- REQUIRED INPUT -----------------------------------------------# +# If you are executing the pipeline on your own data, you MUST edit these config values + # The path to a text file specifying where to find read information for each sample # Each row in the sample file should represent a different sample. # The sample file should have 3 columns (each separated by a single tab): @@ -26,14 +29,27 @@ # this is a required input! sample_file: data/samples.tsv -# Which samples from the samples_file should we execute the pipeline on? +# Which samples from the sample_file should we execute the pipeline on? # Comment out this line or set it to a falsey value if you want to run all samples in the sample file -SAMP_NAMES: [molt4_chr1, jurkat_chr1] +SAMP_NAMES: [molt4_mini, jurkat_mini] # The path to a reference genome -# The BWA index files, samtools index file (.fa.fai), and samtools dictionary file (.dict) must be stored in the same directory +# The BWA index files, samtools index file (.fai), and GATK dictionary file (.dict) must be stored in the same directory +# Note: your fasta file cannot be gzipped! +# required! +genome: data/hg38.mini.fa + +# The path to trained, ensemble classifiers, which can be used for calling SNVs and +# indels +# Unless you have created your own models, you should just use the ones packaged in the +# example dataset and leave these paths unchanged. # required! -genome: data/hg38.chr1.fa +snp_model: data/snp.rda +indel_model: data/indel.rda + + +#------------------------------------------- OPTIONS ---------------------------------------------------# +# If you are executing the pipeline on your own data, you can usually just leave these unchanged. # The path to the directory in which to place all of the output files # Defined relative to whatever directory you execute the snakemake command in @@ -74,10 +90,3 @@ indel_callers: [gatk-indel, varscan-indel, vardict-indel, pindel, illumina-strel snp_filter: ['gatk-snp~DP>10'] indel_filter: ['gatk-indel~DP>10'] -# The path to a trained, ensemble classifier, which can be used for calling SNVs -# required! -snp_model: data/snp.rda - -# The path to a trained, ensemble classifier, which can be used for calling indels -# required! -indel_model: data/indel.rda diff --git a/configs/prepare.yaml b/configs/prepare.yaml index 057d6dd..4fe0024 100644 --- a/configs/prepare.yaml +++ b/configs/prepare.yaml @@ -30,7 +30,8 @@ sample_file: data/samples.tsv SAMP_NAMES: [SRR891269] # The path to a reference genome -# The BWA index files, samtools index file (.fa.fai), and samtools dictionary file (.dict) must be stored in the same directory +# The BWA index files, samtools index file (.fai), and GATK dictionary file (.dict) must be stored in the same directory +# Note: your fasta file cannot be gzipped! # required! genome: /iblm/netapp/data1/external/GRC37/combined/bwa_index_assembly19/Homo_sapiens_assembly19.fasta # replace this with the path to your own reference genome @@ -62,20 +63,6 @@ indel_callers: [gatk-indel, varscan-indel, vardict-indel, pindel, illumina-strel # If this value is not provided, it will default to "." label: "." 
-# Whether to "normalize" the VCF output of every caller script -# This option is not recommended if you plan to feed the data to a random forest -# Normalization usually involves left-alignment and trimming of variants -# See https://genome.sph.umich.edu/wiki/Variant_Normalization for more info -# In our pipeline, the normalization step also reduces counts of variants that appear at the same position to 1 -# If this value is not provided, it will default to false -no_normalize: false - -# Whether to replace NA values in the dataset before outputting the final TSV -# This option is not recommended if you plan to feed the data to a random forest -# Unless otherwise specified in the Caller Specific Parameters below, NA values will be replaced with 0 -# If this value is not provided, it will default to false -keep_na: false - # A list of filtering expressions for filtering the sites before outputting the final TSV # A filtering expression consists of the following concatenated together: # - the caller id @@ -87,9 +74,3 @@ keep_na: false snp_filter: ['gatk-snp~DP>10'] indel_filter: ['gatk-indel~DP>10'] -# Whether to normalize weird numerical columns before outputting the final TSV -# (ex: scientific notation or numbers followed by a percent sign) -# We recommended setting this to false in order to allow normalization if you -# plan to feed the data to a random forest -# If this option is commented out, it will default to false -pure_numerics: false diff --git a/envs/classify.yml b/envs/classify.yml index 5ddb80e..7509e17 100644 --- a/envs/classify.yml +++ b/envs/classify.yml @@ -1,15 +1,20 @@ channels: - - defaults + - conda-forge + - nodefaults dependencies: - r-dplyr==0.8.5 - r-r.devices==2.16.1 - r-ranger==0.11.2 - r-earth==5.1.2 - - conda-forge::coreutils==8.31 + - coreutils==8.31 - r-mlr==2.17.1 - r-parallelmap==1.5.0 - r==3.5.1 - r-readr==1.1.1 - r-data.table==1.12.8 - r-plyr==1.8.4 - - conda-forge::gawk==5.1.0 + - gawk==5.1.0 + - conda-forge::coreutils==8.31 + - conda-forge::sed==4.8 + - conda-forge::bash==5.0.018 + - conda-forge::gzip==1.10 diff --git a/envs/prc.yml b/envs/prc.yml index f695875..091e6ea 100644 --- a/envs/prc.yml +++ b/envs/prc.yml @@ -1,11 +1,16 @@ channels: - conda-forge + - nodefaults dependencies: - - numpy==1.18.5 - - pandas==1.0.4 - - scikit-learn==0.23.1 - - matplotlib==3.1.2 - - coreutils==8.31 + - conda-forge::numpy==1.18.5 + - conda-forge::pandas==1.0.4 + - conda-forge::scikit-learn==0.23.1 + - conda-forge::matplotlib==3.1.2 + - conda-forge::coreutils==8.31 - bioconda::pysam==0.15.3 - bioconda::bcftools==1.10.2 - - gawk==5.1.0 + - conda-forge::gawk==5.1.0 + - conda-forge::coreutils==8.31 + - conda-forge::sed==4.8 + - conda-forge::bash==5.0.018 + - conda-forge::gzip==1.10 diff --git a/envs/prepare.yml b/envs/prepare.yml index 9b92631..53ea7cb 100644 --- a/envs/prepare.yml +++ b/envs/prepare.yml @@ -1,34 +1,38 @@ channels: - - defaults + - conda-forge + - nodefaults dependencies: - - r-pbapply==1.4_2 - - r-stringr==1.4.0 - - r-readr==1.1.1 + - conda-forge::r-pbapply==1.4_2 + - conda-forge::r-stringr==1.4.0 + - conda-forge::r-readr==1.1.1 - bioconda::bioconductor-rsamtools==1.34.0 - - gawk==5.1.0 - - tabix==0.2.6 - - bcftools==1.9=ha228f0b_4 + - conda-forge::gawk==5.1.0 + - bioconda::tabix==0.2.6 + - bioconda::bcftools==1.9=ha228f0b_4 - bioconda::delly==0.8.3 - bioconda::bioconductor-bsgenome.hsapiens.ucsc.hg19==1.4.0 - bioconda::bioconductor-biovizbase==1.30.1 - - vcftools==0.1.16 + - bioconda::vcftools==0.1.16 - conda-forge::r-rmutil==1.1.4 - - gatk4==4.1.7.0 - 
- r-lattice==0.20_41 - - r-reshape==0.8.8 - - vardict-java==1.7.0 - - manta==1.6.0 - - bwa==0.7.17 - - r-data.table==1.12.8 - - r-plyr==1.8.4 - - samtools==1.9=h8571acd_11 - - strelka==2.9.10 + - bioconda::gatk4==4.1.7.0 + - conda-forge::r-lattice==0.20_41 + - conda-forge::r-reshape==0.8.8 + - bioconda::vardict-java==1.7.0 + - bioconda::manta==1.6.0 + - bioconda::bwa==0.7.17 + - conda-forge::r-data.table==1.12.8 + - conda-forge::r-plyr==1.8.4 + - bioconda::samtools==1.9=h8571acd_11 + - bioconda::strelka==2.9.10 - bioconda::bioconductor-variantannotation==1.28.3 - - pindel==0.2.5b9 - - macs2==2.1.2 - - r-dplyr==0.8.5 + - bioconda::pindel==0.2.5b9 + - bioconda::macs2==2.1.2 + - conda-forge::r-dplyr==0.8.5 - bioconda::bioconductor-genomicalignments==1.18.1 - - bedtools==2.29.2 - - varscan==2.4.4 + - bioconda::bedtools==2.29.2 + - bioconda::varscan==2.4.4 - bioconda::bioconductor-rtracklayer==1.42.1 - conda-forge::coreutils==8.31 + - conda-forge::sed==4.8 + - conda-forge::bash==5.0.018 + - conda-forge::gzip==1.10 diff --git a/rules/README.md b/rules/README.md index 5106880..1ef2ce9 100644 --- a/rules/README.md +++ b/rules/README.md @@ -103,7 +103,7 @@ First, split the truth dataset by chromosome parity using `awk` commands like th Then, specify in `classify.yaml` the path to the `indel-odds.tsv.gz` file as your training set and the path to the `indel-evens.tsv.gz` file as your test set. Make sure that both your training and test sets have a [truth caller ID](/configs/classify.yaml#L26) and that you've specified the test set ID in the list of `predict` datasets. Once you've finished, execute the `classify` subworkflow on its own (see [above](#execution-1)). -To truly reproduce our results, we also recommend predicting variants for the ATAC-seq cell lines we tested (Jurkat, Molt-4, K-562, etc) by downloading them from GEO accession GSE129086. Although the example data includes the Jurkat and Molt-4 samples, it only includes chromosome 1 from those cell lines, thereby artificially lowering the VarCA scores of the variants in those samples. +To truly reproduce our results, we also recommend predicting variants for the ATAC-seq cell lines we tested (Jurkat, Molt-4, K-562, etc) by downloading them from GEO accession GSE129086. Although the example data includes the Jurkat and Molt-4 samples, it only includes a small portion of chromosome 1 from those cell lines, thereby artificially lowering the VarCA scores of the variants in those samples. The `classify` subworkflow can only work with one trained model at a time, so you will need to repeat these steps for SNVs. Just replace every mention of "indel" in `classify.yaml` with "snp". Also remember to use only the SNV callers (ie GATK, VarScan 2, and VarDict). 
diff --git a/rules/classify.smk b/rules/classify.smk index 49f542d..596ab40 100644 --- a/rules/classify.smk +++ b/rules/classify.smk @@ -1,7 +1,7 @@ from snakemake.utils import min_version ##### set minimum snakemake version ##### -min_version("5.18.0") +min_version("5.24.2") if 'imported' not in config: configfile: "configs/classify.yaml" @@ -242,7 +242,7 @@ rule tsv2vcf: output: temp(config['out']+"/{sample}/results.vcf.gz") conda: "../envs/prc.yml" shell: - "zcat {input.merge} | scripts/cgrep.bash - -E '^(CHROM|POS|REF)$|.*~(REF|ALT)$' | scripts/2vcf.py -o {output} {params.callers} {input.results}" + "scripts/2vcf.py -o {output} {params.callers} {input.results} {input.merge}" rule fix_vcf_header: """ add contigs to the header of the vcf """ diff --git a/rules/prepare.smk b/rules/prepare.smk index ce5f93d..99d4d2e 100644 --- a/rules/prepare.smk +++ b/rules/prepare.smk @@ -5,7 +5,7 @@ import warnings from snakemake.utils import min_version ##### set minimum snakemake version ##### -min_version("5.18.0") +min_version("5.24.2") if 'imported' not in config: configfile: "configs/prepare.yaml" @@ -144,7 +144,7 @@ rule bed_peaks: # to convert to BED, we must extract the first three columns (chr, start, stop) "cut -f -3 \"{input.peaks}\" | " "bedtools getfasta -fi {input.ref} -bedOut -bed stdin | " - "sort -t $'\t' -k1,1V -k2,2n > \"{output}\" && " + "sort -t $'\\t' -k1,1V -k2,2n > \"{output}\" && " "test -s \"{output}\"" @@ -359,7 +359,7 @@ rule join_all_sites: tsv = caller_tsv, prepared_tsv = rules.prepare_merge.output output: - pipe(config['out'] + "/merged_{type}/{sample}/{caller}.tsv") + temp(config['out'] + "/merged_{type}/{sample}/{caller}.tsv") conda: "../envs/prepare.yml" shell: "LC_ALL=C join -t $'\\t' -e. -a1 -j1 -o auto --nocheck-order " @@ -385,16 +385,6 @@ rule merge_callers: "{input.all_sites}) {input.caller_output} | " "(read -r head; echo \"$head\"; sort -t $'\\t' -k1,1V -k2,2n) | gzip > {output}" -rule binarize: - """convert REF/ALT columns to binary labels""" - input: rules.merge_callers.output - params: - label = config['label'] if 'label' in config else "." - output: config['out']+"/merged_{type}/{sample}/binary.tsv.gz" - conda: "../envs/prepare.yml" - shell: - "paste <(zcat {input} | scripts/cgrep.bash - -Ev '~(REF|ALT)$') <(scripts/classify.bash {input} '{params.label:q}') | gzip >{output}" - def na_vals(wildcards): """ retrieve all of the NA parameters """ @@ -412,27 +402,29 @@ def na_vals(wildcards): rule fillna: """ prepare the caller for use by the classifier by - 1) extracting the columns desired by the user - 2) filling NA values with the defaults provided + 1) converting REF/ALT columns to binary labels + 2) extracting the columns desired by the user + 3) filling NA values with the defaults provided """ - input: rules.binarize.output + input: rules.merge_callers.output params: na_vals = na_vals, - output: config['out']+"/merged_{type}/{sample}/fillna.tsv.gz" + label = config['label'] if 'label' in config else "." 
+ output: temp(config['out']+"/merged_{type}/{sample}/fillna.tsv.gz") conda: "../envs/prepare.yml" shell: - "scripts/fillna.bash {input} {params.na_vals:q} | " + "paste <(zcat {input} | scripts/cgrep.bash - -Ev '~(REF|ALT)$') <(scripts/classify.bash {input} '{params.label:q}') | " + "scripts/fillna.bash {params.na_vals:q} | " "awk -F $'\\t' -v 'OFS=\\t' '{{for (i=1;i<=NF;i++) if ($i==\".\") $i=0}}1' | " "gzip >{output}" rule apply_filters: """ apply filtering on the data according to the filtering expressions """ - input: rules.fillna.input if check_config('keep_na') \ - else rules.fillna.output + input: rules.fillna.output params: expr = lambda wildcards: config[wildcards.type+"_filter"] - output: config['out']+"/merged_{type}/{sample}/filter.tsv.gz" + output: temp(config['out']+"/merged_{type}/{sample}/filter.tsv.gz") conda: "../envs/prepare.yml" shell: "zcat {input} | scripts/filter.bash {params.expr:q} | gzip > {output}" @@ -452,7 +444,7 @@ rule norm_nums: rule result: """ symlink the result so the user knows what the final output is """ - input: rules.norm_nums.input if check_config('pure_numerics') else rules.norm_nums.output + input: rules.norm_nums.output output: config['out']+"/merged_{type}/{sample}/final.tsv.gz" conda: "../envs/prepare.yml" shell: diff --git a/scripts/2vcf.py b/scripts/2vcf.py index d7f19ad..64afddb 100755 --- a/scripts/2vcf.py +++ b/scripts/2vcf.py @@ -14,8 +14,12 @@ def phred(vals): """ apply the phred scale to the vals provided """ - return -10*np.log10(1-vals) - return -10*np.ma.log10(1-vals).filled(-3) # fill all infinite values with a phred scale of 30 + with np.errstate(divide='raise'): + try: + return -10*np.log10(1-vals) + except FloatingPointError: + return np.float64(30) + return -10*np.ma.log10(1-vals).filled(-3) # fill all infinite values with a phred scale of 30 def plot_line(lst, show_discards=False): plt.clf() @@ -214,7 +218,8 @@ def main(prepared, classified, callers=None, cs=1000, all_sites=False, pretty=Fa prepared = get_calls( pd.read_csv( prepared, sep="\t", header=0, index_col=["CHROM", "POS"], - dtype=str, chunksize=cs, na_values="." + dtype=str, chunksize=cs, na_values=".", + usecols=lambda col: col in ['CHROM', 'POS', 'REF'] or col.endswith('~REF') or col.endswith('~ALT') ), callers, pretty ) # flush out the first item in the generator: the vartype @@ -334,6 +339,8 @@ def write_vcf(out, records): callers = args.callers.split(",") if not args.internal: + import matplotlib + matplotlib.use('Agg') vcf = write_vcf(args.out, main(args.prepared, args.classified, callers, args.chunksize, args.all, args.pretty, args.type)) else: if not sys.flags.interactive: diff --git a/scripts/README.md b/scripts/README.md index 5547750..6e0c264 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -7,6 +7,9 @@ All python scripts implement the `--help` argument. For bash, R, and awk scripts ### [2vcf.py](2vcf.py) A python script that uses files from the `prepare` and `classify` pipelines to create a VCF with the final, predicted variants. This script also has a special internal mode, which can be used for recalibrating the QUAL scores output in the VCF. +### [allele_conflicts.bash](allele_conflicts.bash) +A bash script for identifying sites at which the variant callers in our ensemble outputted conflicting alleles. + ### [cgrep.bash](cgrep.bash) A bash script for extracting columns from TSVs via `grep`. Every argument besides the first is passed directly to `grep`. 
@@ -16,6 +19,9 @@ A fast awk script for classifying each site in a VCF as DEL, INS, SNP, etc. It a ### [classify.bash](classify.bash) A bash script for converting all REF/ALT columns in a TSV to binary positive/negative labels using `classify.awk`. +### [create_test_data.bash](create_test_data.bash) +A bash script for creating a small test dataset for the pipeline from existing data. + ### [fillna.bash](fillna.bash) A bash script for replacing NA values in a large TSV. @@ -47,7 +53,7 @@ A python script for creating ROC plots. It takes as input the output of `statist A python script for generating points to use in a precision-recall or ROC curve. It takes as input a two column TSV: true labels and prediction p-values. ### [train_RF.R](train_RF.R) -An R script for creating a trained classifier. We recommend using the `Snakefile-classify` pipeline to run this script. +An R script for creating a trained classifier. We recommend using the _classify_ subworkflow to run this script. ### [tune_plot.R](tune_plot.R) An R script for visualizing the results of hyperparameter tuning from the `train_RF.R` script. diff --git a/scripts/allele_conflicts.bash b/scripts/allele_conflicts.bash new file mode 100755 index 0000000..a794998 --- /dev/null +++ b/scripts/allele_conflicts.bash @@ -0,0 +1,51 @@ +#!/usr/bin/env bash + +# List all positions (to stdout) at which VarCA called a variant but there were conflicts for the alternative allele. +# Note that this script will only work for the chosen subset of variant callers. + +# arg1: merge.tsv.gz file +# arg2: results.tsv.gz file +# arg3: 'snp' or 'indel' + +# Ex: scripts/allele_conflicts.bash out/merged_indel/SRR891269/merge.tsv.gz out/new-classify/classify-indel/SRR891269_even_test/results.tsv.gz indel + +if [ "$3" = 'indel' ]; then +# echo -e "CHROM\tPOS\tgatk\tvarscan\tvardict\tpindel\tstrelka\tpg\tmajority\tmajority_idx\tgatk\tvarscan\tvardict\tpindel\tstrelka\tpg\tpg\tprob0\tprob1\tvarca" + zcat "$1" | \ + scripts/cgrep.bash - -E '^(CHROM|POS)$|(gatk|varscan|vardict|pindel|illumina-strelka|pg-indel).*~(ALT)$' | \ + awk -F $'\t' -v 'OFS=\t' '$3 != $4 || $4 != $5 || $5 != $6 || $6 != $7 || $7 != $3' | \ + tail -n+2 | \ + awk -F $'\t' -v 'OFS=\t' '{ { for(i=3; i<=NF-1; i++) count[$i]++; } PROCINFO["sorted_in"] = "@val_num_desc"; { for (val in count) { print $0, val, count[val]; break; } } delete count; }' | \ + sed 's/\t/,/' | \ + LC_ALL=C sort -t $'\t' -k1,1 | \ + LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order - <( + zcat "$2" | \ + awk -F $"\t" -v 'OFS=\t' 'NR == 1 || $NF == 1' | \ + sed 's/\t/,/' | \ + LC_ALL=C sort -t $'\t' -k1,1 + ) +else +# echo -e "CHROM\tPOS\tgatk\tvarscan\tvardict\tpg\tmajority\tmajority_idx\tgatk\tvarscan\tvardict\tpg\tpg\tprob0\tprob1\tvarca" + zcat "$1" | \ + scripts/cgrep.bash - -E '^(CHROM|POS)$|(gatk|varscan|vardict|pg-snp).*~(ALT)$' | \ + awk -F $'\t' -v 'OFS=\t' '$3 != $4 || $4 != $5 || $5 != $3' | \ + tail -n+2 | \ + awk -F $'\t' -v 'OFS=\t' '{ { for(i=3; i<=NF-1; i++) count[$i]++; } PROCINFO["sorted_in"] = "@val_num_desc"; { for (val in count) { print $0, val, count[val]; break; } } delete count; }' | \ + sed 's/\t/,/' | \ + LC_ALL=C sort -t $'\t' -k1,1 | \ + LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order - <( + zcat "$2" | \ + awk -F $"\t" -v 'OFS=\t' 'NR == 1 || $NF == 1' | \ + sed 's/\t/,/' | \ + LC_ALL=C sort -t $'\t' -k1,1 + ) +fi | \ +sed 's/,/\t/' | \ +LC_ALL=C sort -t $'\t' -k1,1V -k2,2n + +# To make a table containing alternative alleles for the following rules (as columns) +# 1) caller 
priority rule +# 2) majority rule +# 3) platinum genomes +# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out-original/new-classify/classify-indel/SRR891269_even_test/final.vcf.gz | sed 's/gatk-indel/1/; s/varscan-indel/2/; s/vardict-indel/3/; s/pindel/4/; s/illumina-strelka/5/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out-original/new-classify/classify-indel/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $7, $6;}' | less +# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out-original/new-classify/classify-snp/SRR891269_even_test/final.vcf.gz | sed 's/gatk-snp/1/; s/varscan-snp/2/; s/vardict-snp/3/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out-original/new-classify/classify-snp/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $5, $4;}' | less diff --git a/scripts/create_test_data.bash b/scripts/create_test_data.bash new file mode 100755 index 0000000..2c44145 --- /dev/null +++ b/scripts/create_test_data.bash @@ -0,0 +1,62 @@ +#!/usr/bin/env bash + +# Create a small test dataset from a larger example dataset composed of chr1 from the +# Jurkat and MOLT4 cell lines + + +create_seq() { + # arg1 (string): either "jurkat" or "molt4" + + comm -12 <( + bedtools intersect -c -bed -a out/peaks/"$1"_chr1/"$1"_chr1_peaks.narrowPeak -b <( + zcat out/merged_indel/"$1"_chr1/merge.tsv.gz | \ + scripts/cgrep.bash - -E '^(CHROM|POS)$|(gatk|varscan|vardict|pindel|illumina-strelka|pg-indel).*~(ALT)$' | \ + awk -F $'\t' -v 'OFS=\t' '$3 == $4 && $4 == $5 && $5 == $6 && $6 == $7 && $7 == $3 && $3 != "."' | \ + tail -n+2 | \ + awk -F $'\t' -v 'OFS=\t' '{print $1, $2, $2+length($3);}' + ) | \ + grep -vE '0$' | cut -f-3 | sort + ) <( + bedtools intersect -c -bed -a out/peaks/"$1"_chr1/"$1"_chr1_peaks.narrowPeak -b <( + zcat out/merged_snp/"$1"_chr1/merge.tsv.gz | \ + scripts/cgrep.bash - -E '^(CHROM|POS)$|(gatk|varscan|vardict|pg-snp).*~(ALT)$' | \ + awk -F $'\t' -v 'OFS=\t' '$3 == $4 && $4 == $5 && $3 != "."' | \ + tail -n+2 | \ + awk -F $'\t' -v 'OFS=\t' '{print $1, $2, $2+length($3);}' + ) | \ + grep -vE '0$' | cut -f-3 | sort + ) | sort -t $'\t' -k1,1 -k2,2n | head -3 > data/"$1".mini.bed + + samtools sort -n out/align/"$1"_chr1/rmdup.bam | \ + bedtools pairtobed -abam - -b data/"$1".mini.bed 2>/dev/null | \ + samtools view -h | sed 's/'"$1"'_chr1/'"$1"'_mini/g' | grep -Ev '^\@PG' | \ + samtools sort -nO bam > data/"$1"_mini.bam + + if [ "$1" == "molt4" ]; then + samtools fastq -1 data/"$1".mini.1.fq.gz -2 data/"$1".mini.2.fq.gz -0 /dev/null -s /dev/null -n data/"$1"_mini.bam + rm data/molt4_mini.bam + fi +} + +create_seq jurkat +create_seq molt4 + +rm data/hg38.mini.* + +cat data/*.bed | sort -k1,1 -k2,2n | sed -e 1b -e '$!d' | tail -n+2 | \ +awk -F $'\t' -v 'OFS=\t' '{print $1, 0, $3;}' | \ +bedtools slop -i - -g <(cut -f-2 out/data-full-chr1/hg38.chr1.fa.fai) -b 500 | \ +bedtools getfasta -fi out/data-full-chr1/hg38.chr1.fa -bed - | \ +sed 's/>chr1:0-.*$/>chr1/' > data/hg38.mini.fa + +samtools faidx data/hg38.mini.fa +gatk CreateSequenceDictionary -R data/hg38.mini.fa +bwa index data/hg38.mini.fa &>/dev/null + +samtools view -h data/jurkat_mini.bam | sed 's/LN:.*$/LN:'"$(cut -f2 data/hg38.mini.fa.fai)"'/' | samtools sort -O bam > data/jurkat.mini.bam +samtools index 
data/jurkat.mini.bam + +rm data/molt4.mini.bed data/jurkat_mini.bam + +gatk ValidateSamFile -R data/hg38.mini.fa -I data/jurkat.mini.bam --MODE SUMMARY + diff --git a/scripts/fillna.bash b/scripts/fillna.bash index 917b050..9925e9e 100755 --- a/scripts/fillna.bash +++ b/scripts/fillna.bash @@ -2,22 +2,22 @@ # this script replaces NA values in a large TSV with the provided replacements -# param1: the .tsv.gz file containing the large table for which we want to fill NA values -# param2+: the name of a column followed by the NA value replacement for that col (as two adjacent arguments) +# stdin: the .tsv file containing the large table for which we want to fill NA values +# param1+: the name of a column followed by the NA value replacement for that col (as two adjacent arguments) # you can specify multiple column-value pairs # return (to stdout): the same table except where all specified NA values have been replaced -script_dir="$(dirname "$BASH_SOURCE")"; +read -r head pfiles=() -for (( i=2; i<$#; i+=2 )); do +for (( i=1; i<$#; i+=2 )); do j=$((i+1)) reg="${!i}" val="${!j}" # retrieve the idx of the column among the columns - reg="$(zcat "$1" | head -n1 | tr '\t' '\n' | grep -Fn "$reg" | cut -f1 -d: | head -n1)" + reg="$(echo "$head" | tr '\t' '\n' | grep -Fn "$reg" | cut -f1 -d: | head -n1)" pfiles+=("{if (\$$reg==\".\") \$$reg=\"$val\"}") done -zcat "$1" | awk -F $'\t' -v 'OFS=\t' "${pfiles[*]} 1" +{ echo "$head"; cat; } | awk -F $'\t' -v 'OFS=\t' "${pfiles[*]} 1"
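As a usage sketch of the revised `fillna.bash` interface described in its header comments, the table is now piped in on stdin and column/replacement pairs are passed as arguments; the column names and replacement values below are hypothetical:

```bash
# replace NA values ('.') in two hypothetical columns; all other columns pass through unchanged
zcat out/merged_indel/SRR891269/merge.tsv.gz | \
scripts/fillna.bash 'gatk-indel~DP' 0 'vardict-indel~QUAL' 0 | \
gzip > filled.tsv.gz
```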