From fade998da10a319159f5b471135c0a03d29e384b Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Thu, 19 Sep 2024 19:09:10 -0400 Subject: [PATCH 01/12] WIP add workflow to distribute long-running steps, config --- .gitignore | 1 + config/config.yaml | 8 ++ workflow/Snakefile | 180 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 189 insertions(+) create mode 100644 config/config.yaml create mode 100644 workflow/Snakefile diff --git a/.gitignore b/.gitignore index 5266ae40..47dcf31f 100755 --- a/.gitignore +++ b/.gitignore @@ -110,3 +110,4 @@ agora*/ representation.py *.pickle +*~ diff --git a/config/config.yaml b/config/config.yaml new file mode 100644 index 00000000..5189ddb9 --- /dev/null +++ b/config/config.yaml @@ -0,0 +1,8 @@ +metadata: /path/to/sample_metadata.csv +abundances: /path/to/long_otu_table.csv +taxonomy_data: None +agora: agora103_genus.qza +medium: western_diet_gut_agora.qza +cutoff: 0.01 +tradeoff: None +container: "docker://ghcr.io/vdblab/micom:0.37.0" diff --git a/workflow/Snakefile b/workflow/Snakefile new file mode 100644 index 00000000..228cfcf7 --- /dev/null +++ b/workflow/Snakefile @@ -0,0 +1,180 @@ +import os +import sys + + +from micom.workflows import build +from micom import Community +import micom.qiime_formats as qf +from micom.qiime_formats import load_qiime_medium +from micom.workflows import build +from micom import Community +from micom.workflows import tradeoff +import seaborn as sns +import pandas as pd + + +configfile: os.path.join(workflow.basedir, "../config/config.yaml") + +data = pd.read_csv(config["abundances"]) +for col in ["sample_id", "id", "genus", "abundance"]: + assert col in data.columns, f"{col} must be a column name" + +metadata = pd.read_csv(config["metadata"]) +assert "sample_id" in metadata.columns, "sample_id must be a column name" +if "testn" in config: + metadata = metadata.head(config["testn"]) + + +agora_taxa = qf.load_qiime_manifest(config["agora"]) +agora_genera = agora_taxa.genus +#print(agora_genera) +ambig_genera_df = data[~data.genus.isin(agora_genera)].groupby("id")["abundance"].agg(["median", "mean", "max"]).sort_values("mean", ascending=False) + +ambig_genera_df.to_csv("unmatchable_genera.csv") + +# TODO: take these from an external file: +# here I'm picking the best match based on GTDB +# https://raw.githubusercontent.com/biobakery/MetaPhlAn/master/metaphlan/utils/mpa_vJun23_CHOCOPhlAnSGB_202403_SGB2GTDB.tsv +replacements = { + 'Clostridia_unclassified': 'Clostridium', + 'Candidatus_Cibiobacter': 'Faecalibacterium', # no good solution, its either Ruminococcicae or Faecalibacterium https://www.sciencedirect.com/science/article/pii/S0092867419300017 + 'Lachnospiraceae_unclassified':'unclassified Lachnospiraceae genus', + 'Fusicatenibacter':'unclassified Lachnospiraceae genus', + 'Ruminococcaceae_unclassified': 'unclassified Ruminococcaceae genus', + 'Gemmiger': 'unclassified Ruminococcaceae genus', # bummer + 'GGB9633': 'unclassified Ruminococcaceae genus', +'GGB3293': 'Clostridium', + 'Ruthenibacterium': 'unclassified Ruminococcaceae genus', +'Clostridiaceae_unclassified': 'Clostridium', + 'GGB3256': 'unclassified Ruminococcaceae genus', # Acutalibacteraceae + # no good match for Evtepia + "Faecalimonas":'unclassified Lachnospiraceae genus', + # No good match for f__Ruminococcaceae|g__GGB9699. | o__Oscillospirales + # no good match for "|g__Eggerthellaceae_unclassified" + # no good match for Eisenbergiella +} +for k,v in replacements.items(): + tochange = data["genus"] == k + data.loc[tochange, 'genus'] = v + +assert config["stage"] in ["tradeoff", "grow"], "stage must either be tradeoff or grow" + +if config["stage"] == "tradeoff": + targets = "tradeoff.csv" +else: + targets = "growth.zip" + +rule all: + input: + expand("models/{sample}.pickle", sample = metadata.sample_id), + targets + + + +rule build: + input: + agora_file=config["agora"], + output: + outf="models/{sample}.pickle", + out_manifest = "manifests/{sample}.csv" + threads: 2 + params: + cutoff=config["cutoff"] + run: + thisdata = data[data.sample_id == wildcards.sample] + print(thisdata.shape) + manifest_osqp = build( + thisdata, + input.agora_file, + os.path.dirname(output.outf), + solver="osqp", + cutoff=params.cutoff, + threads=2) + manifest_osqp.to_csv(output.out_manifest, index=False) + + + +rule merge_minifests: + input: + infiles = expand("manifests/{sample}.csv", sample = metadata.sample_id) + output: + outf = "manifest.csv" + run: + manifest = pd.concat((pd.read_csv(f) for f in input.infiles), ignore_index=True) + manifest.to_csv(output.outf, index=False) + +rule get_tradeoff: + input: + manifest = "manifests/{sample}.csv", + model = "models/{sample}.pickle", + medium = config["medium"] + output: + outf = "tradeoffs/{sample}.csv" + threads: 2 + resources: + runtime=4 * 60, + run: + medium = load_qiime_medium(input.medium) + manifest = pd.read_csv(input.manifest) + tradeoff_results = tradeoff( + manifest, + os.path.dirname(input.model), + medium, + threads=2 + ) + tradeoff_results.to_csv(output.outf, index=False) + +use rule merge_minifests as merge_tradeoffs with: + input: + infiles = expand("tradeoffs/{sample}.csv", sample = metadata.sample_id) + output: + outf = "tradeoff.csv" + +rule grow_sample: + input: + manifest = "manifests/{sample}.csv", + model = "models/{sample}.pickle", + medium = config["medium"] + output: + growth="growth/{sample}.zip" + params: + tradeoff = config["tradeoff"] + threads: 4 + run: + growth = grow( + data[data.sample_id == wildcards.sample], + os.path.dirname(input.model), + tradeoff=params.tradeoff, + threads=threads, + presolve=True) + save_results(growth, output.growth) + +def merge_gr(gra, grb): + from copy import deepcopy + new = deepcopy(gra) + _exchanges = pd.concat(gra.exchanges, grb.exchanges, ignore_index=True) + _growth_rates = pd.concat(gra.exchanges, grb.exchanges, ignore_index=True) + new.exchanges = _exchanges + new.growth_rates = _growth_rates + return(new) + + +rule merge_growth: + input: + infiles = expand("growths/{sample}.zip", sample= data.sample_id.unique()) + output: + growth = "growth.zip" + run: + growth = load_results(infiles[0]) + for i in infiles[2: len(infiles)]: + print(i) + growthb = load_results(infiles[i]) + merge_gr(growth, growthb) + save_results(growth, output.growth) + +# rule plot_found_abundance_fraction: +# input: +# manifest + +# sns.boxplot(data=manifest_osqp, x="found_abundance_fraction", y="cancer") +# sns.swarmplot(data=manifest_osqp, x="found_abundance_fraction", y="cancer") From f86293cc8ea37e16ff2b2df23e72e50134cf647b Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Thu, 19 Sep 2024 22:11:04 -0400 Subject: [PATCH 02/12] working tradeoff and grow --- config/config.yaml | 2 +- workflow/Snakefile | 37 ++++++++++++++++++++++++++----------- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 5189ddb9..a4b61833 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -4,5 +4,5 @@ taxonomy_data: None agora: agora103_genus.qza medium: western_diet_gut_agora.qza cutoff: 0.01 -tradeoff: None +tradeoff: null container: "docker://ghcr.io/vdblab/micom:0.37.0" diff --git a/workflow/Snakefile b/workflow/Snakefile index 228cfcf7..bd22f0a6 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -2,13 +2,11 @@ import os import sys -from micom.workflows import build from micom import Community import micom.qiime_formats as qf from micom.qiime_formats import load_qiime_medium -from micom.workflows import build +from micom.workflows import grow, save_results, build, tradeoff, load_results from micom import Community -from micom.workflows import tradeoff import seaborn as sns import pandas as pd @@ -60,8 +58,12 @@ for k,v in replacements.items(): assert config["stage"] in ["tradeoff", "grow"], "stage must either be tradeoff or grow" if config["stage"] == "tradeoff": - targets = "tradeoff.csv" + targets = [ + "tradeoff.csv", + "tradeoff.html", + ] else: + assert config["tradeoff"] is not None, "Must set tradeoff config value" targets = "growth.zip" rule all: @@ -130,6 +132,16 @@ use rule merge_minifests as merge_tradeoffs with: output: outf = "tradeoff.csv" +rule plot_tradeoff: + input: + inf = "tradeoff.csv" + output: + html = "tradeoff.html" + run: + from micom.viz import plot_tradeoff + tradeoff = pd.read_csv(input.inf) + plot_tradeoff(tradeoff, filename=output.html, tolerance=1e-06) + rule grow_sample: input: manifest = "manifests/{sample}.csv", @@ -141,9 +153,12 @@ rule grow_sample: tradeoff = config["tradeoff"] threads: 4 run: + medium = load_qiime_medium(input.medium) + manifest = pd.read_csv(input.manifest) growth = grow( - data[data.sample_id == wildcards.sample], + manifest, os.path.dirname(input.model), + medium, tradeoff=params.tradeoff, threads=threads, presolve=True) @@ -152,8 +167,8 @@ rule grow_sample: def merge_gr(gra, grb): from copy import deepcopy new = deepcopy(gra) - _exchanges = pd.concat(gra.exchanges, grb.exchanges, ignore_index=True) - _growth_rates = pd.concat(gra.exchanges, grb.exchanges, ignore_index=True) + _exchanges = pd.concat([gra.exchanges, grb.exchanges], ignore_index=True) + _growth_rates = pd.concat([gra.exchanges, grb.exchanges], ignore_index=True) new.exchanges = _exchanges new.growth_rates = _growth_rates return(new) @@ -161,14 +176,14 @@ def merge_gr(gra, grb): rule merge_growth: input: - infiles = expand("growths/{sample}.zip", sample= data.sample_id.unique()) + infiles = expand("growth/{sample}.zip", sample= metadata.sample_id) output: growth = "growth.zip" run: - growth = load_results(infiles[0]) - for i in infiles[2: len(infiles)]: + growth = load_results(input.infiles[0]) + for i in range(2, len(input.infiles)): print(i) - growthb = load_results(infiles[i]) + growthb = load_results(input.infiles[i]) merge_gr(growth, growthb) save_results(growth, output.growth) From ad67d95583e2e89d7ff1797eaca2f8088ad4c3f3 Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Fri, 20 Sep 2024 09:05:04 -0400 Subject: [PATCH 03/12] bump tradeoff runtime --- workflow/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index bd22f0a6..6aa8e40d 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -114,7 +114,7 @@ rule get_tradeoff: outf = "tradeoffs/{sample}.csv" threads: 2 resources: - runtime=4 * 60, + runtime=8 * 60, run: medium = load_qiime_medium(input.medium) manifest = pd.read_csv(input.manifest) From fd1515512f9e23879c0e1d72052cf5fdc3a79739 Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Mon, 23 Sep 2024 15:31:33 -0400 Subject: [PATCH 04/12] add readme, simplify workflow --- workflow/Snakefile | 44 ++++++++++++-------------------------- workflow_README.md | 53 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 30 deletions(-) create mode 100644 workflow_README.md diff --git a/workflow/Snakefile b/workflow/Snakefile index 6aa8e40d..d1f170ac 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -17,43 +17,25 @@ data = pd.read_csv(config["abundances"]) for col in ["sample_id", "id", "genus", "abundance"]: assert col in data.columns, f"{col} must be a column name" -metadata = pd.read_csv(config["metadata"]) -assert "sample_id" in metadata.columns, "sample_id must be a column name" + +metadata = pd.DataFrame(data.sample_id.unique(), columns=["sample_id"]) if "testn" in config: metadata = metadata.head(config["testn"]) agora_taxa = qf.load_qiime_manifest(config["agora"]) agora_genera = agora_taxa.genus -#print(agora_genera) -ambig_genera_df = data[~data.genus.isin(agora_genera)].groupby("id")["abundance"].agg(["median", "mean", "max"]).sort_values("mean", ascending=False) +ambig_genera_df = data[~data.genus.isin(agora_genera)].groupby("genus")["abundance"].agg(["median", "mean", "max"]).sort_values("mean", ascending=False) ambig_genera_df.to_csv("unmatchable_genera.csv") -# TODO: take these from an external file: -# here I'm picking the best match based on GTDB -# https://raw.githubusercontent.com/biobakery/MetaPhlAn/master/metaphlan/utils/mpa_vJun23_CHOCOPhlAnSGB_202403_SGB2GTDB.tsv -replacements = { - 'Clostridia_unclassified': 'Clostridium', - 'Candidatus_Cibiobacter': 'Faecalibacterium', # no good solution, its either Ruminococcicae or Faecalibacterium https://www.sciencedirect.com/science/article/pii/S0092867419300017 - 'Lachnospiraceae_unclassified':'unclassified Lachnospiraceae genus', - 'Fusicatenibacter':'unclassified Lachnospiraceae genus', - 'Ruminococcaceae_unclassified': 'unclassified Ruminococcaceae genus', - 'Gemmiger': 'unclassified Ruminococcaceae genus', # bummer - 'GGB9633': 'unclassified Ruminococcaceae genus', -'GGB3293': 'Clostridium', - 'Ruthenibacterium': 'unclassified Ruminococcaceae genus', -'Clostridiaceae_unclassified': 'Clostridium', - 'GGB3256': 'unclassified Ruminococcaceae genus', # Acutalibacteraceae - # no good match for Evtepia - "Faecalimonas":'unclassified Lachnospiraceae genus', - # No good match for f__Ruminococcaceae|g__GGB9699. | o__Oscillospirales - # no good match for "|g__Eggerthellaceae_unclassified" - # no good match for Eisenbergiella -} -for k,v in replacements.items(): - tochange = data["genus"] == k - data.loc[tochange, 'genus'] = v +if "taxakey" in config: + replacementsdf = pd.read_csv(config["taxakey"], usecols=['A', 'B']) + replacements = dict(zip(replacementsdf.A, replacementsdf.B)) + print(replacements) + for k,v in replacements.items(): + tochange = data["genus"] == k + data.loc[tochange, 'genus'] = v assert config["stage"] in ["tradeoff", "grow"], "stage must either be tradeoff or grow" @@ -114,7 +96,7 @@ rule get_tradeoff: outf = "tradeoffs/{sample}.csv" threads: 2 resources: - runtime=8 * 60, + runtime=lambda wc, attempt: 24 * 60 * attempt, run: medium = load_qiime_medium(input.medium) manifest = pd.read_csv(input.manifest) @@ -122,7 +104,7 @@ rule get_tradeoff: manifest, os.path.dirname(input.model), medium, - threads=2 + threads=threads ) tradeoff_results.to_csv(output.outf, index=False) @@ -152,6 +134,8 @@ rule grow_sample: params: tradeoff = config["tradeoff"] threads: 4 + resources: + runtime=lambda wc, attempt: 4 * 60 * attempt, run: medium = load_qiime_medium(input.medium) manifest = pd.read_csv(input.manifest) diff --git a/workflow_README.md b/workflow_README.md new file mode 100644 index 00000000..2ac79dfd --- /dev/null +++ b/workflow_README.md @@ -0,0 +1,53 @@ +# Running MICOM with Snakemake + +Running the cooperative tradeoff analysis can be a long-running task. Here we describe how to use the included Snakemake workflow to distribute those tasks to the cloud or HPC + +## Prerequisites + +- an environment containing MICOM and all its dependencies, as well as Snakemake + -- eg, `mamba create -n micom snakemake python -y ; mamba activate micom ; pip install numpy==2.1.1 Cython biom-format==2.1.16 micom seaborn` +- a configured Snakemake Profile + +## Input Data + +This workflow requires two files: + +- a long-format sample counts table with columns as follows: + - `sample_id` + - `genus` + - `abundance` +- (optional) file containing genus taxonomy translations between the tool generating the counts and the underlying taxa models used by MICOM. Column `A` shoudl have the genus as provided by your tool, and column `B` should have the AGORA genus. + + +## Step 0: Checking taxonomic coverage + +First, run the workflow with the `--dry-run` flag to prevent snakemake from submitting any jobs. This will run the logic to match up the genera you provided with those present in the pre-formated AGORA model. Genera not able to matched will be written to `unmatchable_genera.csv` in your output directory. In cases of naming mismatches, create a csv file containing your and AGORA's genus names as columns `A` and `B`, and supply in the next step. + +``` +snakemake --config stage=tradeoff abundances=$PWD/data.csv agora=$PWD/resources/agora103_genus.qza medium=$PWD/resources/western_diet_gut_agora.qza --directory $PWD/results/ --dry-run +``` + +## Step 1: Identifying an optimal tradeoff + +We will utilize Snakemake config arguments to first submit jobs calculating the optimal tradeoff parameter for each sample. + +``` +snakemake --config stage=tradeoff abundances=$PWD/data.csv agora=$PWD/resources/agora103_genus.qza medium=$PWD/resources/western_diet_gut_agora.qza --directory $PWD/results/ +``` + + +## Step 2: Grow +Having decided on the tradeoff value based on the visualizations in `results/tradeoff.html`, you can now submit the growth jobs. This is done by changing the `stage` config argument, setting the `tradeoff` argument, and re-using the same output `--directory`. + +``` +snakemake --config stage=grow tradeoff=0.6 abundances=$PWD/data.csv agora=$PWD/resources/agora103_genus.qza medium=$PWD/resources/western_diet_gut_agora.qza --directory $PWD/results/ +``` + +## Step 3: Analysis + +From here, you can follow along the tutorial! Run the following in place of th `growth = grow(com, "models", medium, tradeoff=0.8, threads=2)` line: + +```python +from micom.workflows import load_results +growth = load_results("results/growth.zip") +``` From 9c98879f1f1c09d81c22184bec6f8b887de58042 Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Tue, 24 Sep 2024 10:12:09 -0400 Subject: [PATCH 05/12] tidy for a PR --- config/config.yaml | 3 --- workflow_README.md | 16 +++++++++------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index a4b61833..8ef11a55 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,8 +1,5 @@ -metadata: /path/to/sample_metadata.csv abundances: /path/to/long_otu_table.csv -taxonomy_data: None agora: agora103_genus.qza medium: western_diet_gut_agora.qza cutoff: 0.01 tradeoff: null -container: "docker://ghcr.io/vdblab/micom:0.37.0" diff --git a/workflow_README.md b/workflow_README.md index 2ac79dfd..a3a16bae 100644 --- a/workflow_README.md +++ b/workflow_README.md @@ -1,6 +1,6 @@ # Running MICOM with Snakemake -Running the cooperative tradeoff analysis can be a long-running task. Here we describe how to use the included Snakemake workflow to distribute those tasks to the cloud or HPC +Several steps of MICOM can be quite time consuming. Luckily, they are also parallelizable! Here we describe how to use the included Snakemake workflow to distribute those tasks to the cloud or HPC, and hand the results back to the tutorial. ## Prerequisites @@ -10,12 +10,14 @@ Running the cooperative tradeoff analysis can be a long-running task. Here we d ## Input Data -This workflow requires two files: +This workflow requires three files: - a long-format sample counts table with columns as follows: - `sample_id` - `genus` - `abundance` +- a MICOM-formatted AGORA model bundle: `wget -O agora103_genus.qza https://zenodo.org/record/3755182/files/agora103_genus.qza?download=1` +- a MICOM-formatted diet/medium: `wget -O western_diet_gut.qza https://zenodo.org/record/3755182/files/western_diet_gut.qza?download=1` - (optional) file containing genus taxonomy translations between the tool generating the counts and the underlying taxa models used by MICOM. Column `A` shoudl have the genus as provided by your tool, and column `B` should have the AGORA genus. @@ -29,23 +31,23 @@ snakemake --config stage=tradeoff abundances=$PWD/data.csv agora=$PWD/resources ## Step 1: Identifying an optimal tradeoff -We will utilize Snakemake config arguments to first submit jobs calculating the optimal tradeoff parameter for each sample. +We will utilize Snakemake config arguments to first submit jobs calculating the optimal tradeoff parameter for each sample. The config can be modified to set the appropriate taxa abundance cutoff. ``` -snakemake --config stage=tradeoff abundances=$PWD/data.csv agora=$PWD/resources/agora103_genus.qza medium=$PWD/resources/western_diet_gut_agora.qza --directory $PWD/results/ +snakemake --config stage=tradeoff cutoff=0.05 abundances=$PWD/data.csv agora=$PWD/resources/agora103_genus.qza medium=$PWD/resources/western_diet_gut_agora.qza --directory $PWD/results/ ``` ## Step 2: Grow -Having decided on the tradeoff value based on the visualizations in `results/tradeoff.html`, you can now submit the growth jobs. This is done by changing the `stage` config argument, setting the `tradeoff` argument, and re-using the same output `--directory`. +Having decided on the tradeoff value based on the visualizations in `results/tradeoff.html`, you can now submit the growth jobs! This is done by changing the `stage` config argument, setting the `tradeoff` argument, and re-using the same output `--directory`. ``` -snakemake --config stage=grow tradeoff=0.6 abundances=$PWD/data.csv agora=$PWD/resources/agora103_genus.qza medium=$PWD/resources/western_diet_gut_agora.qza --directory $PWD/results/ +snakemake --config stage=grow tradeoff=0.6 cutoff=0.05 abundances=$PWD/data.csv agora=$PWD/resources/agora103_genus.qza medium=$PWD/resources/western_diet_gut_agora.qza --directory $PWD/results/ ``` ## Step 3: Analysis -From here, you can follow along the tutorial! Run the following in place of th `growth = grow(com, "models", medium, tradeoff=0.8, threads=2)` line: +From here, you can follow along the tutorial! Run the following in place of the `growth = grow(com, "models", medium, tradeoff=0.8, threads=2)` line: ```python from micom.workflows import load_results From e31e4f2cc95290878580c777236ae43c62bb1e78 Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Tue, 24 Sep 2024 10:14:06 -0400 Subject: [PATCH 06/12] remove unneeded code --- workflow/Snakefile | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index d1f170ac..adbc7906 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -83,6 +83,7 @@ rule merge_minifests: infiles = expand("manifests/{sample}.csv", sample = metadata.sample_id) output: outf = "manifest.csv" + run: manifest = pd.concat((pd.read_csv(f) for f in input.infiles), ignore_index=True) manifest.to_csv(output.outf, index=False) @@ -170,10 +171,3 @@ rule merge_growth: growthb = load_results(input.infiles[i]) merge_gr(growth, growthb) save_results(growth, output.growth) - -# rule plot_found_abundance_fraction: -# input: -# manifest - -# sns.boxplot(data=manifest_osqp, x="found_abundance_fraction", y="cancer") -# sns.swarmplot(data=manifest_osqp, x="found_abundance_fraction", y="cancer") From 567d9fd3ca9bc48162e068ba19a98cc6d3cc5694 Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Tue, 24 Sep 2024 10:50:45 -0400 Subject: [PATCH 07/12] add missing file for rule all --- workflow/Snakefile | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/Snakefile b/workflow/Snakefile index adbc7906..86595f5d 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -50,6 +50,7 @@ else: rule all: input: + "manifest.csv", expand("models/{sample}.pickle", sample = metadata.sample_id), targets From 4e99a2898b07f6e8eb163a65265c3bda1e042a9e Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Tue, 24 Sep 2024 13:48:45 -0400 Subject: [PATCH 08/12] clean up growth merge --- workflow/Snakefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 86595f5d..7213833b 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -154,7 +154,7 @@ def merge_gr(gra, grb): from copy import deepcopy new = deepcopy(gra) _exchanges = pd.concat([gra.exchanges, grb.exchanges], ignore_index=True) - _growth_rates = pd.concat([gra.exchanges, grb.exchanges], ignore_index=True) + _growth_rates = pd.concat([gra.growth_rates, grb.growth_rates], ignore_index=True) new.exchanges = _exchanges new.growth_rates = _growth_rates return(new) @@ -170,5 +170,6 @@ rule merge_growth: for i in range(2, len(input.infiles)): print(i) growthb = load_results(input.infiles[i]) - merge_gr(growth, growthb) + growth = merge_gr(growth, growthb) + print(growth.growth_rates.shape) save_results(growth, output.growth) From c212a386aa32e8034bf218055fcbe21e34aed777 Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Fri, 4 Oct 2024 10:54:53 -0400 Subject: [PATCH 09/12] fix readme paths, reaggregate after taxa replacements --- workflow/Snakefile | 21 ++++++++++++++++++--- workflow_README.md | 6 +++--- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 7213833b..2947a8fb 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -14,8 +14,9 @@ import pandas as pd configfile: os.path.join(workflow.basedir, "../config/config.yaml") data = pd.read_csv(config["abundances"]) -for col in ["sample_id", "id", "genus", "abundance"]: +for col in ["sample_id", "genus", "abundance"]: assert col in data.columns, f"{col} must be a column name" +data = thisdata[data.abundance > 0] metadata = pd.DataFrame(data.sample_id.unique(), columns=["sample_id"]) @@ -28,6 +29,7 @@ agora_genera = agora_taxa.genus ambig_genera_df = data[~data.genus.isin(agora_genera)].groupby("genus")["abundance"].agg(["median", "mean", "max"]).sort_values("mean", ascending=False) ambig_genera_df.to_csv("unmatchable_genera.csv") +agora_taxa.genus.to_csv("available_genera.csv") if "taxakey" in config: replacementsdf = pd.read_csv(config["taxakey"], usecols=['A', 'B']) @@ -36,6 +38,18 @@ if "taxakey" in config: for k,v in replacements.items(): tochange = data["genus"] == k data.loc[tochange, 'genus'] = v + # after replacing, we need to resum by genus to avoid duplicates + data = data.groupby(['sample_id', 'genus'])['abundance'].sum().reset_index() + + +# check for samples with no evaluable taxa +for sample in data.sample_id.unique(): + _data = data[data.sample_id == sample] + _data = _data[_data.genus.isin(agora_genera)] + if _data.shape[0] == 0: + raise ValueError(f"sample {sample} has no evaluable genera") +# remove non-agora taxa +data = data[data.genus.isin(agora_genera)] assert config["stage"] in ["tradeoff", "grow"], "stage must either be tradeoff or grow" @@ -66,15 +80,16 @@ rule build: params: cutoff=config["cutoff"] run: + from micom.logger import logger + logger.setLevel("INFO") thisdata = data[data.sample_id == wildcards.sample] - print(thisdata.shape) manifest_osqp = build( thisdata, input.agora_file, os.path.dirname(output.outf), solver="osqp", cutoff=params.cutoff, - threads=2) + threads=1) manifest_osqp.to_csv(output.out_manifest, index=False) diff --git a/workflow_README.md b/workflow_README.md index a3a16bae..acab2c61 100644 --- a/workflow_README.md +++ b/workflow_README.md @@ -26,7 +26,7 @@ This workflow requires three files: First, run the workflow with the `--dry-run` flag to prevent snakemake from submitting any jobs. This will run the logic to match up the genera you provided with those present in the pre-formated AGORA model. Genera not able to matched will be written to `unmatchable_genera.csv` in your output directory. In cases of naming mismatches, create a csv file containing your and AGORA's genus names as columns `A` and `B`, and supply in the next step. ``` -snakemake --config stage=tradeoff abundances=$PWD/data.csv agora=$PWD/resources/agora103_genus.qza medium=$PWD/resources/western_diet_gut_agora.qza --directory $PWD/results/ --dry-run +snakemake --config stage=tradeoff abundances=$PWD/data.csv agora=$PWD/agora103_genus.qza medium=$PWD/western_diet_gut.qza --directory $PWD/results/ --dry-run ``` ## Step 1: Identifying an optimal tradeoff @@ -34,7 +34,7 @@ snakemake --config stage=tradeoff abundances=$PWD/data.csv agora=$PWD/resources We will utilize Snakemake config arguments to first submit jobs calculating the optimal tradeoff parameter for each sample. The config can be modified to set the appropriate taxa abundance cutoff. ``` -snakemake --config stage=tradeoff cutoff=0.05 abundances=$PWD/data.csv agora=$PWD/resources/agora103_genus.qza medium=$PWD/resources/western_diet_gut_agora.qza --directory $PWD/results/ +snakemake --config stage=tradeoff cutoff=0.05 abundances=$PWD/data.csv agora=$PWD/agora103_genus.qza medium=$PWD/western_diet_gut.qza --directory $PWD/results/ ``` @@ -42,7 +42,7 @@ snakemake --config stage=tradeoff cutoff=0.05 abundances=$PWD/data.csv agora=$P Having decided on the tradeoff value based on the visualizations in `results/tradeoff.html`, you can now submit the growth jobs! This is done by changing the `stage` config argument, setting the `tradeoff` argument, and re-using the same output `--directory`. ``` -snakemake --config stage=grow tradeoff=0.6 cutoff=0.05 abundances=$PWD/data.csv agora=$PWD/resources/agora103_genus.qza medium=$PWD/resources/western_diet_gut_agora.qza --directory $PWD/results/ +snakemake --config stage=grow tradeoff=0.6 cutoff=0.05 abundances=$PWD/data.csv agora=$PWD/agora103_genus.qza medium=$PWD/western_diet_gut.qza --directory $PWD/results/ ``` ## Step 3: Analysis From b53915ac2093cfd1a7e45d0eef42a348f1feafd3 Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Fri, 4 Oct 2024 10:58:33 -0400 Subject: [PATCH 10/12] fix syntax --- workflow/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 2947a8fb..706e4ba9 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -16,7 +16,7 @@ configfile: os.path.join(workflow.basedir, "../config/config.yaml") data = pd.read_csv(config["abundances"]) for col in ["sample_id", "genus", "abundance"]: assert col in data.columns, f"{col} must be a column name" -data = thisdata[data.abundance > 0] +data = data[data.abundance > 0] metadata = pd.DataFrame(data.sample_id.unique(), columns=["sample_id"]) From 9dafdcaf2591203114736777c067a7554407aa1f Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Tue, 8 Oct 2024 21:14:45 -0400 Subject: [PATCH 11/12] add ignore param for unconverging samples --- workflow/Snakefile | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/workflow/Snakefile b/workflow/Snakefile index 706e4ba9..8754bebe 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -20,6 +20,13 @@ data = data[data.abundance > 0] metadata = pd.DataFrame(data.sample_id.unique(), columns=["sample_id"]) +if "ignore_samples" in config: + if config["stage"] == "grow": + print("Are you sure you want to exclude samples from the grow stage?") + for sample in config["ignore_samples"]: + print("dropping sample") + metadata = metadata[metadata.sample_id != sample] + if "testn" in config: metadata = metadata.head(config["testn"]) From b8be05a964a87d83242eabbe0ce71a1770a29a0a Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Wed, 9 Oct 2024 12:40:11 -0400 Subject: [PATCH 12/12] fix dropped annotations during merge --- workflow/Snakefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflow/Snakefile b/workflow/Snakefile index 8754bebe..20524594 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -177,8 +177,10 @@ def merge_gr(gra, grb): new = deepcopy(gra) _exchanges = pd.concat([gra.exchanges, grb.exchanges], ignore_index=True) _growth_rates = pd.concat([gra.growth_rates, grb.growth_rates], ignore_index=True) + _annotations = pd.concat([gra.annotations, grb.annotations], ignore_index=True).drop_duplicates().reset_index(drop=True) new.exchanges = _exchanges new.growth_rates = _growth_rates + new.annotations = _annotations return(new)