Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,4 @@ agora*/
representation.py
*.pickle

*~
5 changes: 5 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
abundances: /path/to/long_otu_table.csv
agora: agora103_genus.qza
medium: western_diet_gut_agora.qza
cutoff: 0.01
tradeoff: null
199 changes: 199 additions & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
import os
import sys


from micom import Community
import micom.qiime_formats as qf
from micom.qiime_formats import load_qiime_medium
from micom.workflows import grow, save_results, build, tradeoff, load_results
from micom import Community
import seaborn as sns
import pandas as pd


# Resolve the default configuration relative to the workflow directory.
configfile: os.path.join(workflow.basedir, "../config/config.yaml")

# Long-format abundance table; must provide these three columns.
data = pd.read_csv(config["abundances"])
for col in ("sample_id", "genus", "abundance"):
    assert col in data.columns, f"{col} must be a column name"
# Zero-abundance rows carry no information -- drop them up front.
data = data[data.abundance > 0]


# One row per unique sample id; drives the per-sample rule expansion below.
metadata = pd.DataFrame(data.sample_id.unique(), columns=["sample_id"])
if "ignore_samples" in config:
    # config.get avoids a confusing KeyError here; "stage" is validated
    # explicitly further down.
    if config.get("stage") == "grow":
        print("Are you sure you want to exclude samples from the grow stage?")
    for sample in config["ignore_samples"]:
        # Report which sample is being dropped, not just that one was.
        print(f"dropping sample {sample}")
        metadata = metadata[metadata.sample_id != sample]

# Optionally restrict to the first N samples for a quick test run.
if "testn" in config:
    metadata = metadata.head(config["testn"])


# Load the AGORA manifest and report which input genera cannot be matched
# to a model, summarized by abundance so users can judge the impact.
agora_taxa = qf.load_qiime_manifest(config["agora"])
agora_genera = agora_taxa.genus
unmatched = data[~data.genus.isin(agora_genera)]
ambig_genera_df = (
    unmatched.groupby("genus")["abundance"]
    .agg(["median", "mean", "max"])
    .sort_values("mean", ascending=False)
)

ambig_genera_df.to_csv("unmatchable_genera.csv")
agora_taxa.genus.to_csv("available_genera.csv")

if "taxakey" in config:
    # Translate genus names from the caller's taxonomy to AGORA's using a
    # two-column CSV key (A = input genus, B = AGORA genus).
    replacementsdf = pd.read_csv(config["taxakey"], usecols=['A', 'B'])
    replacements = dict(zip(replacementsdf.A, replacementsdf.B))
    print(replacements)
    # Apply sequentially, one mapping at a time (order of the key matters
    # if a target name is itself a key).
    for old_name, new_name in replacements.items():
        data.loc[data["genus"] == old_name, 'genus'] = new_name
    # after replacing, we need to resum by genus to avoid duplicates
    data = data.groupby(['sample_id', 'genus'])['abundance'].sum().reset_index()


# Fail early on any sample whose genera are entirely absent from AGORA --
# such a sample would produce an empty community model downstream.
for sample in data.sample_id.unique():
    sample_rows = data[data.sample_id == sample]
    if not sample_rows.genus.isin(agora_genera).any():
        raise ValueError(f"sample {sample} has no evaluable genera")
# remove non-agora taxa
data = data[data.genus.isin(agora_genera)]

# Validate the requested stage and pick the final targets accordingly.
assert config["stage"] in ["tradeoff", "grow"], "stage must either be tradeoff or grow"

if config["stage"] == "tradeoff":
    targets = ["tradeoff.csv", "tradeoff.html"]
else:
    # Growing requires an explicit tradeoff chosen from the first stage.
    assert config["tradeoff"] is not None, "Must set tradeoff config value"
    targets = "growth.zip"

# Top-level targets: the merged manifest, one built model per sample, and the
# stage-specific outputs (tradeoff CSV/HTML or the merged growth archive).
rule all:
    input:
        "manifest.csv",
        expand("models/{sample}.pickle", sample = metadata.sample_id),
        targets



# Build a per-sample community model from the AGORA genus bundle and write
# the per-sample build manifest.
rule build:
    input:
        agora_file=config["agora"],
    output:
        outf="models/{sample}.pickle",
        out_manifest = "manifests/{sample}.csv"
    threads: 2
    params:
        cutoff=config["cutoff"]
    run:
        from micom.logger import logger
        logger.setLevel("INFO")
        # Restrict the long table to this sample's rows only.
        thisdata = data[data.sample_id == wildcards.sample]
        # NOTE(review): build is called with threads=1 although the rule
        # declares `threads: 2` -- confirm whether this is intentional.
        manifest_osqp = build(
            thisdata,
            input.agora_file,
            os.path.dirname(output.outf),
            solver="osqp",
            cutoff=params.cutoff,
            threads=1)
        manifest_osqp.to_csv(output.out_manifest, index=False)



# Concatenate all per-sample manifests into a single manifest.csv.
# NOTE(review): rule name contains a typo ("minifests"); it is referenced by a
# `use rule` statement elsewhere, so renaming must change both places at once.
rule merge_minifests:
    input:
        infiles = expand("manifests/{sample}.csv", sample = metadata.sample_id)
    output:
        outf = "manifest.csv"

    run:
        manifest = pd.concat((pd.read_csv(f) for f in input.infiles), ignore_index=True)
        manifest.to_csv(output.outf, index=False)

# Run micom's tradeoff analysis for one sample and write its results CSV.
rule get_tradeoff:
    input:
        manifest = "manifests/{sample}.csv",
        model = "models/{sample}.pickle",
        medium = config["medium"]
    output:
        outf = "tradeoffs/{sample}.csv"
    threads: 2
    resources:
        # Up to 24 h, doubling on each retry attempt.
        runtime=lambda wc, attempt: 24 * 60 * attempt,
    run:
        medium = load_qiime_medium(input.medium)
        manifest = pd.read_csv(input.manifest)
        tradeoff_results = tradeoff(
            manifest,
            os.path.dirname(input.model),
            medium,
            threads=threads
        )
        tradeoff_results.to_csv(output.outf, index=False)

# Reuse the manifest-merging logic to concatenate per-sample tradeoff CSVs.
use rule merge_minifests as merge_tradeoffs with:
    input:
        infiles = expand("tradeoffs/{sample}.csv", sample = metadata.sample_id)
    output:
        outf = "tradeoff.csv"

# Render the interactive tradeoff visualization from the merged CSV.
rule plot_tradeoff:
    input:
        inf = "tradeoff.csv"
    output:
        html = "tradeoff.html"
    run:
        from micom.viz import plot_tradeoff
        tradeoff = pd.read_csv(input.inf)
        plot_tradeoff(tradeoff, filename=output.html, tolerance=1e-06)

# Simulate growth for one sample at the configured tradeoff value and save
# the per-sample results archive.
rule grow_sample:
    input:
        manifest = "manifests/{sample}.csv",
        model = "models/{sample}.pickle",
        medium = config["medium"]
    output:
        growth="growth/{sample}.zip"
    params:
        tradeoff = config["tradeoff"]
    threads: 4
    resources:
        # Up to 4 h, doubling on each retry attempt.
        runtime=lambda wc, attempt: 4 * 60 * attempt,
    run:
        medium = load_qiime_medium(input.medium)
        manifest = pd.read_csv(input.manifest)
        growth = grow(
            manifest,
            os.path.dirname(input.model),
            medium,
            tradeoff=params.tradeoff,
            threads=threads,
            presolve=True)
        save_results(growth, output.growth)

def merge_gr(gra, grb):
    """Combine two micom growth-result objects into one.

    Exchanges and growth rates are stacked row-wise; annotations are
    stacked and de-duplicated. Every other attribute is carried over
    from ``gra`` via a deep copy; neither input is modified.
    """
    from copy import deepcopy
    merged = deepcopy(gra)
    merged.exchanges = pd.concat(
        [gra.exchanges, grb.exchanges], ignore_index=True
    )
    merged.growth_rates = pd.concat(
        [gra.growth_rates, grb.growth_rates], ignore_index=True
    )
    merged.annotations = (
        pd.concat([gra.annotations, grb.annotations], ignore_index=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )
    return merged


rule merge_growth:
input:
infiles = expand("growth/{sample}.zip", sample= metadata.sample_id)
output:
growth = "growth.zip"
run:
growth = load_results(input.infiles[0])
for i in range(2, len(input.infiles)):
print(i)
growthb = load_results(input.infiles[i])
growth = merge_gr(growth, growthb)
print(growth.growth_rates.shape)
save_results(growth, output.growth)
55 changes: 55 additions & 0 deletions workflow_README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Running MICOM with Snakemake

Several steps of MICOM can be quite time consuming. Luckily, they are also parallelizable! Here we describe how to use the included Snakemake workflow to distribute those tasks to the cloud or HPC, and hand the results back to the tutorial.

## Prerequisites

- an environment containing MICOM and all its dependencies, as well as Snakemake
  - e.g., `mamba create -n micom snakemake python -y ; mamba activate micom ; pip install numpy==2.1.1 Cython biom-format==2.1.16 micom seaborn`
- a configured Snakemake Profile

## Input Data

This workflow requires three files:

- a long-format sample counts table with columns as follows:
- `sample_id`
- `genus`
- `abundance`
- a MICOM-formatted AGORA model bundle: `wget -O agora103_genus.qza https://zenodo.org/record/3755182/files/agora103_genus.qza?download=1`
- a MICOM-formatted diet/medium: `wget -O western_diet_gut.qza https://zenodo.org/record/3755182/files/western_diet_gut.qza?download=1`
- (optional) file containing genus taxonomy translations between the tool generating the counts and the underlying taxa models used by MICOM. Column `A` should have the genus as provided by your tool, and column `B` should have the AGORA genus.


## Step 0: Checking taxonomic coverage

First, run the workflow with the `--dry-run` flag to prevent snakemake from submitting any jobs. This will run the logic to match up the genera you provided with those present in the pre-formatted AGORA model. Genera that cannot be matched will be written to `unmatchable_genera.csv` in your output directory. In cases of naming mismatches, create a csv file containing your and AGORA's genus names as columns `A` and `B`, and supply it in the next step.

```
snakemake --config stage=tradeoff abundances=$PWD/data.csv agora=$PWD/agora103_genus.qza medium=$PWD/western_diet_gut.qza --directory $PWD/results/ --dry-run
```

## Step 1: Identifying an optimal tradeoff

We will utilize Snakemake config arguments to first submit jobs calculating the optimal tradeoff parameter for each sample. The config can be modified to set the appropriate taxa abundance cutoff.

```
snakemake --config stage=tradeoff cutoff=0.05 abundances=$PWD/data.csv agora=$PWD/agora103_genus.qza medium=$PWD/western_diet_gut.qza --directory $PWD/results/
```


## Step 2: Grow
Having decided on the tradeoff value based on the visualizations in `results/tradeoff.html`, you can now submit the growth jobs! This is done by changing the `stage` config argument, setting the `tradeoff` argument, and re-using the same output `--directory`.

```
snakemake --config stage=grow tradeoff=0.6 cutoff=0.05 abundances=$PWD/data.csv agora=$PWD/agora103_genus.qza medium=$PWD/western_diet_gut.qza --directory $PWD/results/
```

## Step 3: Analysis

From here, you can follow along the tutorial! Run the following in place of the `growth = grow(com, "models", medium, tradeoff=0.8, threads=2)` line:

```python
from micom.workflows import load_results
growth = load_results("results/growth.zip")
```