diff --git a/.gitignore b/.gitignore index 5266ae40..47dcf31f 100755 --- a/.gitignore +++ b/.gitignore @@ -110,3 +110,4 @@ agora*/ representation.py *.pickle +*~ diff --git a/config/config.yaml b/config/config.yaml new file mode 100644 index 00000000..8ef11a55 --- /dev/null +++ b/config/config.yaml @@ -0,0 +1,5 @@ +abundances: /path/to/long_otu_table.csv +agora: agora103_genus.qza +medium: western_diet_gut_agora.qza +cutoff: 0.01 +tradeoff: null diff --git a/workflow/Snakefile b/workflow/Snakefile new file mode 100644 index 00000000..20524594 --- /dev/null +++ b/workflow/Snakefile @@ -0,0 +1,199 @@ +import os +import sys + + +from micom import Community +import micom.qiime_formats as qf +from micom.qiime_formats import load_qiime_medium +from micom.workflows import grow, save_results, build, tradeoff, load_results +from micom import Community +import seaborn as sns +import pandas as pd + + +configfile: os.path.join(workflow.basedir, "../config/config.yaml") + +data = pd.read_csv(config["abundances"]) +for col in ["sample_id", "genus", "abundance"]: + assert col in data.columns, f"{col} must be a column name" +data = data[data.abundance > 0] + + +metadata = pd.DataFrame(data.sample_id.unique(), columns=["sample_id"]) +if "ignore_samples" in config: + if config["stage"] == "grow": + print("Are you sure you want to exclude samples from the grow stage?") + for sample in config["ignore_samples"]: + print("dropping sample") + metadata = metadata[metadata.sample_id != sample] + +if "testn" in config: + metadata = metadata.head(config["testn"]) + + +agora_taxa = qf.load_qiime_manifest(config["agora"]) +agora_genera = agora_taxa.genus +ambig_genera_df = data[~data.genus.isin(agora_genera)].groupby("genus")["abundance"].agg(["median", "mean", "max"]).sort_values("mean", ascending=False) + +ambig_genera_df.to_csv("unmatchable_genera.csv") +agora_taxa.genus.to_csv("available_genera.csv") + +if "taxakey" in config: + replacementsdf = pd.read_csv(config["taxakey"], usecols=['A', 
'B']) + replacements = dict(zip(replacementsdf.A, replacementsdf.B)) + print(replacements) + for k,v in replacements.items(): + tochange = data["genus"] == k + data.loc[tochange, 'genus'] = v + # after replacing, we need to resum by genus to avoid duplicates + data = data.groupby(['sample_id', 'genus'])['abundance'].sum().reset_index() + + +# check for samples with no evaluable taxa +for sample in data.sample_id.unique(): + _data = data[data.sample_id == sample] + _data = _data[_data.genus.isin(agora_genera)] + if _data.shape[0] == 0: + raise ValueError(f"sample {sample} has no evaluable genera") +# remove non-agora taxa +data = data[data.genus.isin(agora_genera)] + +assert config["stage"] in ["tradeoff", "grow"], "stage must either be tradeoff or grow" + +if config["stage"] == "tradeoff": + targets = [ + "tradeoff.csv", + "tradeoff.html", + ] +else: + assert config["tradeoff"] is not None, "Must set tradeoff config value" + targets = "growth.zip" + +rule all: + input: + "manifest.csv", + expand("models/{sample}.pickle", sample = metadata.sample_id), + targets + + + +rule build: + input: + agora_file=config["agora"], + output: + outf="models/{sample}.pickle", + out_manifest = "manifests/{sample}.csv" + threads: 2 + params: + cutoff=config["cutoff"] + run: + from micom.logger import logger + logger.setLevel("INFO") + thisdata = data[data.sample_id == wildcards.sample] + manifest_osqp = build( + thisdata, + input.agora_file, + os.path.dirname(output.outf), + solver="osqp", + cutoff=params.cutoff, + threads=1) + manifest_osqp.to_csv(output.out_manifest, index=False) + + + +rule merge_minifests: + input: + infiles = expand("manifests/{sample}.csv", sample = metadata.sample_id) + output: + outf = "manifest.csv" + + run: + manifest = pd.concat((pd.read_csv(f) for f in input.infiles), ignore_index=True) + manifest.to_csv(output.outf, index=False) + +rule get_tradeoff: + input: + manifest = "manifests/{sample}.csv", + model = "models/{sample}.pickle", + medium = 
config["medium"] + output: + outf = "tradeoffs/{sample}.csv" + threads: 2 + resources: + runtime=lambda wc, attempt: 24 * 60 * attempt, + run: + medium = load_qiime_medium(input.medium) + manifest = pd.read_csv(input.manifest) + tradeoff_results = tradeoff( + manifest, + os.path.dirname(input.model), + medium, + threads=threads + ) + tradeoff_results.to_csv(output.outf, index=False) + +use rule merge_minifests as merge_tradeoffs with: + input: + infiles = expand("tradeoffs/{sample}.csv", sample = metadata.sample_id) + output: + outf = "tradeoff.csv" + +rule plot_tradeoff: + input: + inf = "tradeoff.csv" + output: + html = "tradeoff.html" + run: + from micom.viz import plot_tradeoff + tradeoff = pd.read_csv(input.inf) + plot_tradeoff(tradeoff, filename=output.html, tolerance=1e-06) + +rule grow_sample: + input: + manifest = "manifests/{sample}.csv", + model = "models/{sample}.pickle", + medium = config["medium"] + output: + growth="growth/{sample}.zip" + params: + tradeoff = config["tradeoff"] + threads: 4 + resources: + runtime=lambda wc, attempt: 4 * 60 * attempt, + run: + medium = load_qiime_medium(input.medium) + manifest = pd.read_csv(input.manifest) + growth = grow( + manifest, + os.path.dirname(input.model), + medium, + tradeoff=params.tradeoff, + threads=threads, + presolve=True) + save_results(growth, output.growth) + +def merge_gr(gra, grb): + from copy import deepcopy + new = deepcopy(gra) + _exchanges = pd.concat([gra.exchanges, grb.exchanges], ignore_index=True) + _growth_rates = pd.concat([gra.growth_rates, grb.growth_rates], ignore_index=True) + _annotations = pd.concat([gra.annotations, grb.annotations], ignore_index=True).drop_duplicates().reset_index(drop=True) + new.exchanges = _exchanges + new.growth_rates = _growth_rates + new.annotations = _annotations + return(new) + + +rule merge_growth: + input: + infiles = expand("growth/{sample}.zip", sample= metadata.sample_id) + output: + growth = "growth.zip" + run: + growth = 
load_results(input.infiles[0]) + for i in range(2, len(input.infiles)): + print(i) + growthb = load_results(input.infiles[i]) + growth = merge_gr(growth, growthb) + print(growth.growth_rates.shape) + save_results(growth, output.growth) diff --git a/workflow_README.md b/workflow_README.md new file mode 100644 index 00000000..acab2c61 --- /dev/null +++ b/workflow_README.md @@ -0,0 +1,55 @@ +# Running MICOM with Snakemake + +Several steps of MICOM can be quite time consuming. Luckily, they are also parallelizable! Here we describe how to use the included Snakemake workflow to distribute those tasks to the cloud or HPC, and hand the results back to the tutorial. + +## Prerequisites + +- an environment containing MICOM and all its dependencies, as well as Snakemake + -- eg, `mamba create -n micom snakemake python -y ; mamba activate micom ; pip install numpy==2.1.1 Cython biom-format==2.1.16 micom seaborn` +- a configured Snakemake Profile + +## Input Data + +This workflow requires three files: + +- a long-format sample counts table with columns as follows: + - `sample_id` + - `genus` + - `abundance` +- a MICOM-formatted AGORA model bundle: `wget -O agora103_genus.qza https://zenodo.org/record/3755182/files/agora103_genus.qza?download=1` +- a MICOM-formatted diet/medium: `wget -O western_diet_gut.qza https://zenodo.org/record/3755182/files/western_diet_gut.qza?download=1` +- (optional) file containing genus taxonomy translations between the tool generating the counts and the underlying taxa models used by MICOM. Column `A` shoudl have the genus as provided by your tool, and column `B` should have the AGORA genus. + + +## Step 0: Checking taxonomic coverage + +First, run the workflow with the `--dry-run` flag to prevent snakemake from submitting any jobs. This will run the logic to match up the genera you provided with those present in the pre-formated AGORA model. Genera not able to matched will be written to `unmatchable_genera.csv` in your output directory. 
In cases of naming mismatches, create a CSV file containing your and AGORA's genus names as columns `A` and `B`, and supply it in the next step.