From 7f7da8cc03f13e6654f805a4680b0f85ce5c2344 Mon Sep 17 00:00:00 2001 From: Tyler Joseph Date: Sat, 13 Feb 2021 08:51:05 -0500 Subject: [PATCH 01/48] updated documentation --- docs/tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 97afc88..e21f738 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -239,7 +239,7 @@ extracts a genome id from a sequence id in a fasta file. It is used to group contigs together. The default argument ``[^\|]+`` matches all characters up to the first ``|``, and uses them as a genome id. -You can check your regular expression using the ``--check-regex-flag``, which +You can check your regular expression using the ``--check-regex`` flag, which skips the extract step and instead outputs a list of all genome ids. From ba9a956b66f16ef02a2c266b0ecf18263e6ed299 Mon Sep 17 00:00:00 2001 From: Tyler Joseph Date: Mon, 15 Feb 2021 10:00:55 -0500 Subject: [PATCH 02/48] read positions must be integer type --- test/test_coptr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_coptr.py b/test/test_coptr.py index 8c254b6..b9df749 100644 --- a/test/test_coptr.py +++ b/test/test_coptr.py @@ -26,6 +26,7 @@ def test_coptr(self): for i in range(20): print("\tsimulation", i+1, "of", 20) read_positions, ptr, ori_pos, ter_pos = simulator.simulate_reads(20000, ori_pos=ori_pos) + read_positions = read_positions.astype(int) sim_log2_ptrs.append(np.log2(ptr)) est_log2_ptr, ori, ter, best_f, qc_result = coptr_ref.estimate_ptr(read_positions, simulator.get_genome_length()) From f399de371b506248c629c34a2d27f49443eef99b Mon Sep 17 00:00:00 2001 From: Tyler Joseph Date: Thu, 25 Feb 2021 09:14:16 -0500 Subject: [PATCH 03/48] separate relative abundance from read cound calculation --- coptr.py | 72 ++++++++++++++++++++++++-- src/compute_read_counts.py | 25 ++++----- src/compute_rel_abun.py | 102 +++++++++++++++++++++++++++++++++++++ 3 files changed, 179 insertions(+), 20 deletions(-) create mode 100644 src/compute_rel_abun.py diff --git a/coptr.py b/coptr.py index 2d1161e..4f2e3e5 100644 --- a/coptr.py +++ b/coptr.py @@ -26,11 +26,12 @@ from src.coptr_contig import estimate_ptrs_coptr_contig from src.coptr_ref import estimate_ptrs_coptr_ref from src.compute_read_counts import compute_read_counts +from src.compute_rel_abun import compute_rel_abun from src.print import print_error, print_info from src.read_mapper import ReadMapper from src.util import get_fastq_name -VERSION="1.0.0" +VERSION="1.1.0" class ProgramOptions: @@ -45,6 +46,7 @@ def __init__(self): extract compute coverage maps from bam files estimate estimate PTRs from coverage maps count compute read counts for each genome after filtering + rabun estimate relative abundances for each genomes after filtering """ ) self.default_bt2_k = 10 @@ -188,7 +190,7 @@ def extract(self): def estimate(self): parser = argparse.ArgumentParser(usage= - """usage: coptr.py estimate [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--threads THREADS] [--plot OUTFOLDER] [--batch-size INT] [--restart] coverage-map-folder out-file + """usage: coptr.py estimate [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] [--threads THREADS] [--plot OUTFOLDER] [--restart] coverage-map-folder out-file """ ) parser.add_argument("coverage_map_folder", help="Folder with coverage maps computed from 'extract'.") @@ -340,11 +342,17 @@ def estimate(self): def count(self): parser = argparse.ArgumentParser(usage= - """usage: coptr.py count [-h] coverage-map-folder 
out-file + """usage: coptr.py count [-h] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] coverage-map-folder out-file """ ) parser.add_argument("coverage_map_folder", help="Folder with coverage maps computed from 'extract'.") parser.add_argument("out_file", help="Filename to store PTR table.") + parser.add_argument("--min-cov", type=float, + help="Fraction of nonzero bins required to compute a PTR (default 0.75).", default=0.75 + ) + parser.add_argument("--min-samples", type=float, + help="CoPTRContig only. Minimum number of samples required to reorder bins (default 5).", default=5 + ) if len(sys.argv[2:]) < 1: parser.print_help() @@ -352,9 +360,9 @@ def count(self): args = parser.parse_args(sys.argv[2:]) - print_info("CoPTR", "estimating relative abundances") + print_info("CoPTR", "computing read counts") - counts, genome_ids = compute_read_counts(args.coverage_map_folder) + counts, genome_ids = compute_read_counts(args.coverage_map_folder, args.min_cov, args.min_samples) out_file = args.out_file _, ext = os.path.splitext(out_file) @@ -383,5 +391,59 @@ def count(self): print_info("CoPTR", "done!") + def rabun(self): + parser = argparse.ArgumentParser(usage= + """usage: coptr.py rabun [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] coverage-map-folder out-file + """ + ) + parser.add_argument("coverage_map_folder", help="Folder with coverage maps computed from 'extract'.") + parser.add_argument("out_file", help="Filename to store PTR table.") + parser.add_argument("--min-reads", type=float, + help="Minimum number of reads required to compute a PTR (default 5000).", default=5000 + ) + parser.add_argument("--min-cov", type=float, + help="Fraction of nonzero bins required to compute a PTR (default 0.75).", default=0.75 + ) + parser.add_argument("--min-samples", type=float, + help="CoPTRContig only. 
Minimum number of samples required to reorder bins (default 5).", default=5 + ) + + if len(sys.argv[2:]) < 1: + parser.print_help() + exit(1) + + args = parser.parse_args(sys.argv[2:]) + + print_info("CoPTR", "computing relative abundances") + + rel_abun, genome_ids = compute_rel_abun(args.coverage_map_folder, args.min_reads, args.min_cov, args.min_samples) + + out_file = args.out_file + _, ext = os.path.splitext(out_file) + if ext != ".csv": + out_file += ".csv" + + print_info("CoPTR", "writing {}".format(out_file)) + + with open(out_file, "w") as f: + # write the header + f.write("rel_abun:genome_id/sample_id") + for sample_id in rel_abun: + f.write(",{}".format(sample_id)) + f.write("\n") + + for genome_id in sorted(genome_ids): + row = [genome_id] + for sample_id in rel_abun: + if genome_id in rel_abun[sample_id]: + row.append(str(rel_abun[sample_id][genome_id])) + else: + row.append(str(0)) + row = ",".join(row) + "\n" + f.write(row) + + print_info("CoPTR", "done!") + + if __name__ == "__main__": ProgramOptions() \ No newline at end of file diff --git a/src/compute_read_counts.py b/src/compute_read_counts.py index c8d568c..8f000e3 100644 --- a/src/compute_read_counts.py +++ b/src/compute_read_counts.py @@ -30,11 +30,11 @@ from src.print import print_info, print_warning -def compute_read_counts_from_coverage_maps(coverage_maps): +def compute_read_counts_from_coverage_maps(coverage_maps, min_cov, min_samples): # instantiate classes for filtering methods - coptr_contig = CoPTRContig(5000, 5) - rf_ref = ReadFilterRef(5000, 0.75) + coptr_contig = CoPTRContig(1, min_samples) + rf_ref = ReadFilterRef(1, min_cov) total_passing_reads = 0 read_counts = {} sample_id = None @@ -59,21 +59,16 @@ def compute_read_counts_from_coverage_maps(coverage_maps): count = filtered_reads.size read_counts[cm.genome_id] = count total_passing_reads += count + genome_ids.add(cm.genome_id) - genome_ids.add(cm.genome_id) - - rel_abun = {} - for genome_id in read_counts: - rel_abun[genome_id] = read_counts[genome_id] / total_passing_reads + return sample_id, read_counts, genome_ids - return sample_id, rel_abun, genome_ids +def compute_read_counts(coverage_map_folder, min_cov, min_samples): -def compute_read_counts(coverage_map_folder): - - rel_abundances = {} + read_counts = {} genome_ids = set() for f in sorted(os.listdir(coverage_map_folder)): fname, ext = os.path.splitext(f) @@ -85,10 +80,10 @@ def compute_read_counts(coverage_map_folder): with open(fpath, "rb") as file: coverage_maps = pkl.load(file) - sample_id, sample_rel_abun, sample_genome_ids = compute_read_counts_from_coverage_maps(coverage_maps) + sample_id, sample_read_counts, sample_genome_ids = compute_read_counts_from_coverage_maps(coverage_maps, min_cov, min_samples) if sample_id is not None: - rel_abundances[sample_id] = sample_rel_abun + read_counts[sample_id] = sample_read_counts genome_ids.update(sample_genome_ids) - return rel_abundances, genome_ids \ No newline at end of file + return read_counts, genome_ids \ No newline at end of file diff --git a/src/compute_rel_abun.py b/src/compute_rel_abun.py new file mode 100644 index 0000000..a19aa12 --- /dev/null +++ b/src/compute_rel_abun.py @@ -0,0 +1,102 @@ +""" +compute_rel_abun.py +====================== +Estimate relative abundances. +""" + +""" +This file is part of CoPTR. 
+ +CoPTR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +CoPTR is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with CoPTR. If not, see . +""" + +import numpy as np +import os +import pickle as pkl + +from src.coptr_contig import CoPTRContig +from src.coptr_ref import ReadFilterRef +from src.print import print_info, print_warning + + +def compute_rel_abun_from_coverage_maps(coverage_maps, min_reads, min_cov, min_samples): + # instantiate classes for filtering methods + + coptr_contig = CoPTRContig(min_reads, min_samples) + rf_ref = ReadFilterRef(min_reads, min_cov) + total_passing_reads = 0 + read_counts = {} + genome_lengths = {} + sample_id = None + genome_ids = set() + for genome_id in coverage_maps: + cm = coverage_maps[genome_id] + sample_id = cm.sample_id + + if cm.is_assembly and cm.passed_qc(): + binned_reads = coptr_contig.construct_coverage_matrix([cm]) + + lower_bound, upper_bound = coptr_contig.compute_genomewide_bounds(binned_reads) + count = binned_reads[np.logical_and(binned_reads >= lower_bound, binned_reads <= upper_bound)].sum() + read_counts[cm.genome_id] = count + genome_lengths[cm.genome_id] = binned_reads.shape[0]*cm.compute_bin_size() + total_passing_reads += count + genome_ids.add(cm.genome_id) + + elif not cm.is_assembly: + filtered_reads, filtered_length, qc_result = rf_ref.filter_reads(cm.read_positions, cm.length) + + if qc_result.passed_qc: + count = filtered_reads.size + read_counts[cm.genome_id] = count + genome_lengths[cm.genome_id] = filtered_length + total_passing_reads += count + genome_ids.add(cm.genome_id) + + rel_abun = {} + normalizing_constant = 0 + for genome_id in read_counts: + unnormalized_rel_abun = (read_counts[genome_id] / total_passing_reads) / genome_lengths[genome_id] + rel_abun[genome_id] = unnormalized_rel_abun + normalizing_constant += unnormalized_rel_abun + + for genome_id in read_counts: + rel_abun[genome_id] = rel_abun[genome_id] / normalizing_constant + + return sample_id, rel_abun, genome_ids + + + + +def compute_rel_abun(coverage_map_folder, min_reads, min_cov, min_samples): + + rel_abun = {} + genome_ids = set() + for f in sorted(os.listdir(coverage_map_folder)): + fname, ext = os.path.splitext(f) + if ext != ".pkl": continue + fpath = os.path.join(coverage_map_folder, f) + + print_info("RAbun", "\tprocessing {}".format(f)) + + with open(fpath, "rb") as file: + coverage_maps = pkl.load(file) + + sample_id, sample_rel_abun, sample_genome_ids = compute_rel_abun_from_coverage_maps(coverage_maps, min_reads, min_cov, min_samples) + + if sample_id is not None: + rel_abun[sample_id] = sample_rel_abun + genome_ids.update(sample_genome_ids) + + return rel_abun, genome_ids \ No newline at end of file From 2bebed44223f51bd48d694a872cd97c2965e9c21 Mon Sep 17 00:00:00 2001 From: Tyler Joseph Date: Thu, 4 Mar 2021 10:28:36 -0500 Subject: [PATCH 04/48] throw error on possible mislabeled contigs --- coptr.py | 2 +- src/coptr_ref.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/coptr.py b/coptr.py index 4f2e3e5..f65da28 100644 --- a/coptr.py +++ b/coptr.py @@ -31,7 +31,7 @@ from 
src.read_mapper import ReadMapper from src.util import get_fastq_name -VERSION="1.1.0" +VERSION="1.1.1" class ProgramOptions: diff --git a/src/coptr_ref.py b/src/coptr_ref.py index 798e6a6..109de92 100644 --- a/src/coptr_ref.py +++ b/src/coptr_ref.py @@ -30,7 +30,7 @@ import scipy.stats import sys -from src.print import print_info, print_warning +from src.print import print_error, print_info, print_warning class QCResult: @@ -136,6 +136,14 @@ def compute_bin_size(self, genome_length): # want a number divisible by 100 for downstream steps bin_size = bin_size - (bin_size % 100) + if bin_size < 1: + print_error("CoPTR-Ref", + "{}\n{}\n{}".format("found complete reference genome with <500bp", + "this is probably a mislabeled contig", + "please check your .genomes file", + ) + ) + return bin_size From 881e67e481babd31fd132edcffbe04a065bdc001 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 22 Jun 2021 16:02:10 +0200 Subject: [PATCH 05/48] refactor: create package in src directory --- src/{ => coptr}/__init__.py | 0 src/{ => coptr}/bam_processor.py | 0 src/{ => coptr}/compute_read_counts.py | 0 src/{ => coptr}/compute_rel_abun.py | 0 src/{ => coptr}/coptr_contig.py | 0 src/{ => coptr}/coptr_ref.py | 0 src/{ => coptr}/poisson_pca.py | 0 src/{ => coptr}/print.py | 0 src/{ => coptr}/read_assigner.py | 0 src/{ => coptr}/read_mapper.py | 0 src/{ => coptr}/util.py | 0 11 files changed, 0 insertions(+), 0 deletions(-) rename src/{ => coptr}/__init__.py (100%) rename src/{ => coptr}/bam_processor.py (100%) rename src/{ => coptr}/compute_read_counts.py (100%) rename src/{ => coptr}/compute_rel_abun.py (100%) rename src/{ => coptr}/coptr_contig.py (100%) rename src/{ => coptr}/coptr_ref.py (100%) rename src/{ => coptr}/poisson_pca.py (100%) rename src/{ => coptr}/print.py (100%) rename src/{ => coptr}/read_assigner.py (100%) rename src/{ => coptr}/read_mapper.py (100%) rename src/{ => coptr}/util.py (100%) diff --git a/src/__init__.py b/src/coptr/__init__.py similarity index 100% rename from src/__init__.py rename to src/coptr/__init__.py diff --git a/src/bam_processor.py b/src/coptr/bam_processor.py similarity index 100% rename from src/bam_processor.py rename to src/coptr/bam_processor.py diff --git a/src/compute_read_counts.py b/src/coptr/compute_read_counts.py similarity index 100% rename from src/compute_read_counts.py rename to src/coptr/compute_read_counts.py diff --git a/src/compute_rel_abun.py b/src/coptr/compute_rel_abun.py similarity index 100% rename from src/compute_rel_abun.py rename to src/coptr/compute_rel_abun.py diff --git a/src/coptr_contig.py b/src/coptr/coptr_contig.py similarity index 100% rename from src/coptr_contig.py rename to src/coptr/coptr_contig.py diff --git a/src/coptr_ref.py b/src/coptr/coptr_ref.py similarity index 100% rename from src/coptr_ref.py rename to src/coptr/coptr_ref.py diff --git a/src/poisson_pca.py b/src/coptr/poisson_pca.py similarity index 100% rename from src/poisson_pca.py rename to src/coptr/poisson_pca.py diff --git a/src/print.py b/src/coptr/print.py similarity index 100% rename from src/print.py rename to src/coptr/print.py diff --git a/src/read_assigner.py b/src/coptr/read_assigner.py similarity index 100% rename from src/read_assigner.py rename to src/coptr/read_assigner.py diff --git a/src/read_mapper.py b/src/coptr/read_mapper.py similarity index 100% rename from src/read_mapper.py rename to src/coptr/read_mapper.py diff --git a/src/util.py b/src/coptr/util.py similarity index 100% rename from src/util.py rename to src/coptr/util.py 
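After this move the modules live in a proper "coptr" package under src/ instead of
being imported through a bare "src" directory. Once the imports are made relative in
a follow-up patch (PATCH 10/48) and the packaging metadata lands (PATCH 08/48),
downstream code can import the installed package directly. A minimal sketch of what
that looks like — the editable install and the usage shown are assumptions for
illustration, not part of this patch:

    # Hypothetical usage after the src-layout refactor; assumes the package
    # has been installed, e.g. `pip install -e .` once setup.cfg exists.
    from coptr.bam_processor import BamProcessor
    from coptr.coptr_ref import estimate_ptrs_coptr_ref

    # The default regex "[^\|]+" matches everything up to the first "|" in a
    # reference sequence header and uses it as the genome id, so reference
    # sequences (contigs) are grouped by genome.
    bam_processor = BamProcessor(regex="[^\|]+")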
From 3379d23bf203f8c1197e44c5750115fc1bd60027 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 22 Jun 2021 16:03:30 +0200 Subject: [PATCH 06/48] refactor: move CLI into package --- coptr.py => src/coptr/cli.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename coptr.py => src/coptr/cli.py (100%) diff --git a/coptr.py b/src/coptr/cli.py similarity index 100% rename from coptr.py rename to src/coptr/cli.py From 53de2ec238b1e458d54ac55c7154dffd9c6d6f94 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 22 Jun 2021 16:10:05 +0200 Subject: [PATCH 07/48] chore: add editor configuration --- .editorconfig | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..711388e --- /dev/null +++ b/.editorconfig @@ -0,0 +1,20 @@ +# Editor Configuration (http://editorconfig.org) +root = true + +[*] +charset = utf-8 +indent_style = space +indent_size = 4 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +max_line_length = 88 + +[*.{json,yml}] +indent_size = 2 + +[*.{md,rst}] +trim_trailing_whitespace = false + +[Makefile] +indent_style = tab From 17b6e3c0d92065cfef84cc63d70f0c8eb5a39a3d Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 22 Jun 2021 16:14:08 +0200 Subject: [PATCH 08/48] chore: add configuration and setup --- pyproject.toml | 18 +++++++++++++++++ setup.cfg | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 27 +++++++++++++++++++++++++ 3 files changed, 100 insertions(+) create mode 100644 pyproject.toml create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..a01b28a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,18 @@ +[build-system] +build-backend = 'setuptools.build_meta' +requires = [ + 'setuptools>=40.6.0', + 'wheel' +] + +[tool.black] +line-length = 88 +python-version = ['py37'] + +[tool.isort] +profile = 'black' +src_paths = ['src', 'test', 'setup.py'] +skip = ['__init__.py'] +lines_after_imports = 2 +known_first_party = 'coptr' +known_third_party = ['numpy', 'pysam', 'scipy'] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..2950393 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,55 @@ +[metadata] +name = coptr +url = https://github.com/tyjo/coptr +download_url = https://pypi.org/project/coptr/ +project_urls = + Source Code = https://github.com/tyjo/coptr + Documentation = https://coptr.readthedocs.io + Bug Tracker = https://github.com/tyjo/coptr/issues +author = Tyler +author_email = ? +# Please consult https://pypi.org/classifiers/ for a full list. +classifiers = + Development Status :: 3 - Alpha + Environment :: Console + Intended Audience :: Science/Research + License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+) + Natural Language :: English + Operating System :: OS Independent + Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3 :: Only + Topic :: Scientific/Engineering :: Bio-Informatics +license = GPL-3.0-or-later +description = ? 
+long_description = file: README.md +long_description_content_type = text/markdown +keywords = + growth rate + peak-to-trough ratio + +[options] +zip_safe = True +install_requires = + numpy + pysam + scipy +python_requires = >=3.7 +tests_require = + tox +packages = find: +package_dir = + = src + +[options.entry_points] +console_scripts = + coptr = coptr.cli:ProgramOptions + +[options.packages.find] +where = src + +[options.extras_require] +development = + black + isort diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4f876ca --- /dev/null +++ b/setup.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python + + +# This file is part of CoPTR. +# +# CoPTR is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# CoPTR is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with CoPTR. If not, see . + + +"""Set up the CoPTR for Python package.""" + + +from setuptools import setup + + +# All other arguments are defined in `setup.cfg`. +setup() From b06a89d41a8a715172f9339c8b14afe45979c5f7 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 22 Jun 2021 16:15:28 +0200 Subject: [PATCH 09/48] chore: add gitignore config --- .gitignore | 209 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..83f1817 --- /dev/null +++ b/.gitignore @@ -0,0 +1,209 @@ +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +### macOS template +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### Linux template +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### Windows template +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + From df3415cd0bcab21253f49144f1e72d7610c472b5 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 22 Jun 2021 16:18:49 +0200 Subject: [PATCH 10/48] fix: make package imports relative, optimize all --- docs/conf.py | 2 ++ src/coptr/bam_processor.py | 36 +++++++++++++++----------------- src/coptr/cli.py | 30 +++++++++++++------------- src/coptr/compute_read_counts.py | 11 +++++----- src/coptr/compute_rel_abun.py | 11 +++++----- src/coptr/coptr_contig.py | 16 +++++++------- src/coptr/coptr_ref.py | 26 +++++++++++------------ src/coptr/poisson_pca.py | 4 +++- src/coptr/print.py | 1 + src/coptr/read_assigner.py | 7 ++----- src/coptr/read_mapper.py | 10 ++++----- src/coptr/util.py | 1 + test/test_bam_processor.py | 14 +++++++------ test/test_coptr.py | 16 +++++++------- 14 files changed, 96 insertions(+), 89 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 2502cab..451768a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -16,6 +16,8 @@ import os import sys + + sys.path.insert(0, os.path.abspath('..')) diff --git a/src/coptr/bam_processor.py b/src/coptr/bam_processor.py index 16f9693..53bdc0a 100644 --- a/src/coptr/bam_processor.py +++ b/src/coptr/bam_processor.py @@ -1,7 +1,7 @@ """ bam_processor.py ====================== -Extract coordinates along each reference sequence and group reference +Extract coordinates along each reference sequence and group reference sequences by genome. 
""" @@ -25,17 +25,15 @@ import array import bisect import math -import numpy as np import os.path -import pysam import re -import sys -from scipy.sparse import lil_matrix, csr_matrix - -from src.print import print_info -from src.read_assigner import ReadAssigner +import numpy as np +import pysam +from scipy.sparse import csr_matrix +from .print import print_info +from .read_assigner import ReadAssigner class ReadContainer: @@ -57,7 +55,7 @@ def __init__(self): def check_add_mapping(self, query_id, ref_name, ref_position, score): - """Stores a mapping if it has an alignment score greater than or equal + """Stores a mapping if it has an alignment score greater than or equal to the mappings seen so far. Parameters @@ -103,7 +101,7 @@ def check_add_mapping(self, query_id, ref_name, ref_position, score): def get_mappings(self, query_id): """Return the highest scoring mappings for a read. - + Parameters ---------- query_id : str @@ -240,7 +238,7 @@ def extract_reads(self, bam_file): and the value is the length of that reference sequence """ lengths = self.get_ref_seq_lengths(bam_file) - + read_container = ReadContainer() alignment_file = pysam.AlignmentFile(bam_file, self.read_mode) @@ -405,7 +403,7 @@ def assign_multimapped_reads(self, read_container, lengths, max_alignments): for read_id in sorted(read_container.reads): ref_names, ref_positions = read_container.get_mappings(read_id) ref_genomes = [ref_seq_genome_id[r] for r in ref_names] - + # the read maps twice to the same genome if len(ref_genomes) != np.unique(ref_genomes).size: discarded_reads.add(read_id) @@ -464,7 +462,7 @@ def assign_multimapped_reads(self, read_container, lengths, max_alignments): ref_seq_genome_id[ref_name] = genome_id genome_id = ref_seq_genome_id[ref_name] mapping_genome_ids.append(genome_id) - + assignment_id = assignments[current_row] assigned_genome = genome_ids[assignment_id] sequence_id = mapping_genome_ids.index(assigned_genome) @@ -599,7 +597,7 @@ def merge(self, bam_files, out_bam): bf.close() # combined header - header = { + header = { "HD" : {"VN": "1.0", "SO": "unsorted"}, "SQ" : [{"SN": sq, "LN" : seq_len[sq]} for sq in sorted(seq_len)] } @@ -712,7 +710,7 @@ def __repr__(self): def get_length(self): """Get the lenth of the genome. - + Returns ------- length : int @@ -859,7 +857,7 @@ def compute_bin_size(self): def get_length(self, contig_id): """Get the length of a contig. - + Parameters ---------- contig_id : str @@ -876,12 +874,12 @@ def get_length(self, contig_id): def get_reads(self, contig_id): """Get the coordinates of all reads for a contig. 
- + Parameters ---------- contig_id : str Reference sequence id of the contig - + Returns ------- reads : numpy.array @@ -968,4 +966,4 @@ def passed_qc(self): self.frac_nonzero = (total_bins - zero_bins) / total_bins if total_bins > 0 else 0 self.reads = total_reads self.total_bins = total_bins - return self.passed_qc_flag \ No newline at end of file + return self.passed_qc_flag diff --git a/src/coptr/cli.py b/src/coptr/cli.py index f65da28..f66ebbe 100644 --- a/src/coptr/cli.py +++ b/src/coptr/cli.py @@ -16,20 +16,22 @@ """ import argparse -import numpy as np import os import os.path import pickle as pkl import sys -from src.bam_processor import BamProcessor, CoverageMapRef, CoverageMapContig -from src.coptr_contig import estimate_ptrs_coptr_contig -from src.coptr_ref import estimate_ptrs_coptr_ref -from src.compute_read_counts import compute_read_counts -from src.compute_rel_abun import compute_rel_abun -from src.print import print_error, print_info -from src.read_mapper import ReadMapper -from src.util import get_fastq_name +import numpy as np + +from .bam_processor import BamProcessor +from .compute_read_counts import compute_read_counts +from .compute_rel_abun import compute_rel_abun +from .coptr_contig import estimate_ptrs_coptr_contig +from .coptr_ref import estimate_ptrs_coptr_ref +from .print import print_error, print_info +from .read_mapper import ReadMapper +from .util import get_fastq_name + VERSION="1.1.1" @@ -100,7 +102,7 @@ def map(self): ) parser.add_argument("--paired", action="store_true", help="Set for paired end reads. Assumes fastq files end in _1.* and _2.*") - parser.add_argument("--threads", type=int, default=1, + parser.add_argument("--threads", type=int, default=1, help="Number of threads for bowtie2 mapping." ) parser.add_argument("--bt2-k", type=int, default=self.default_bt2_k, @@ -172,7 +174,7 @@ def extract(self): print_info("BamProcessor", "output for {} already found, skipping".format(fname)) continue - # don't process the rest of the bam file if we just want to + # don't process the rest of the bam file if we just want to # sanity check the regular expression if args.check_regex: continue @@ -283,11 +285,11 @@ def estimate(self): sample_ids = sorted(list(sample_ids)) results_ref = estimate_ptrs_coptr_ref( - ref_genome_ids, grouped_coverage_map_folder, args.min_reads, args.min_cov, + ref_genome_ids, grouped_coverage_map_folder, args.min_reads, args.min_cov, threads=args.threads, plot_folder=args.plot ) results_contig = estimate_ptrs_coptr_contig( - assembly_genome_ids, grouped_coverage_map_folder, args.min_reads, args.min_samples, + assembly_genome_ids, grouped_coverage_map_folder, args.min_reads, args.min_samples, threads=args.threads, plot_folder=args.plot ) @@ -446,4 +448,4 @@ def rabun(self): if __name__ == "__main__": - ProgramOptions() \ No newline at end of file + ProgramOptions() diff --git a/src/coptr/compute_read_counts.py b/src/coptr/compute_read_counts.py index 8f000e3..c197747 100644 --- a/src/coptr/compute_read_counts.py +++ b/src/coptr/compute_read_counts.py @@ -21,13 +21,14 @@ along with CoPTR. If not, see . 
""" -import numpy as np import os import pickle as pkl -from src.coptr_contig import CoPTRContig -from src.coptr_ref import ReadFilterRef -from src.print import print_info, print_warning +import numpy as np + +from .coptr_contig import CoPTRContig +from .coptr_ref import ReadFilterRef +from .print import print_info def compute_read_counts_from_coverage_maps(coverage_maps, min_cov, min_samples): @@ -86,4 +87,4 @@ def compute_read_counts(coverage_map_folder, min_cov, min_samples): read_counts[sample_id] = sample_read_counts genome_ids.update(sample_genome_ids) - return read_counts, genome_ids \ No newline at end of file + return read_counts, genome_ids diff --git a/src/coptr/compute_rel_abun.py b/src/coptr/compute_rel_abun.py index a19aa12..b5e25cf 100644 --- a/src/coptr/compute_rel_abun.py +++ b/src/coptr/compute_rel_abun.py @@ -21,13 +21,14 @@ along with CoPTR. If not, see . """ -import numpy as np import os import pickle as pkl -from src.coptr_contig import CoPTRContig -from src.coptr_ref import ReadFilterRef -from src.print import print_info, print_warning +import numpy as np + +from .coptr_contig import CoPTRContig +from .coptr_ref import ReadFilterRef +from .print import print_info def compute_rel_abun_from_coverage_maps(coverage_maps, min_reads, min_cov, min_samples): @@ -99,4 +100,4 @@ def compute_rel_abun(coverage_map_folder, min_reads, min_cov, min_samples): rel_abun[sample_id] = sample_rel_abun genome_ids.update(sample_genome_ids) - return rel_abun, genome_ids \ No newline at end of file + return rel_abun, genome_ids diff --git a/src/coptr/coptr_contig.py b/src/coptr/coptr_contig.py index 11c4ded..1189611 100644 --- a/src/coptr/coptr_contig.py +++ b/src/coptr/coptr_contig.py @@ -21,18 +21,16 @@ along with CoPTR. If not, see . """ -import math import multiprocessing as mp -import numpy as np import os.path import pickle as pkl + +import numpy as np import scipy.special import scipy.stats -import sys - -from src.poisson_pca import PoissonPCA -from src.print import print_info, print_warning, print_error +from .poisson_pca import PoissonPCA +from .print import print_error, print_info class CoPTRContigEstimate: @@ -76,7 +74,7 @@ def __repr__(self): class CoPTRContig: """Estimate PTRs from draft assemblies. - + Parameters ---------- min_reads : float @@ -274,7 +272,7 @@ def estimate_ptr_maximum_likelihood(self, binned_counts): def estimate_ptrs(self, coverage_maps, return_bins=False): """Estimate PTRs across multiple samples of the same reference genome. 
- + Parameters ---------- coverage_maps : list[CoverageMapContig] @@ -597,4 +595,4 @@ def estimate_ptrs_coptr_contig(assembly_genome_ids, grouped_coverage_map_folder, coverage_maps = {} - return coptr_contig_estimates \ No newline at end of file + return coptr_contig_estimates diff --git a/src/coptr/coptr_ref.py b/src/coptr/coptr_ref.py index 109de92..4183989 100644 --- a/src/coptr/coptr_ref.py +++ b/src/coptr/coptr_ref.py @@ -23,14 +23,14 @@ import math import multiprocessing as mp -import numpy as np import os.path import pickle as pkl + +import numpy as np import scipy.optimize import scipy.stats -import sys -from src.print import print_error, print_info, print_warning +from .print import print_error, print_info, print_warning class QCResult: @@ -137,7 +137,7 @@ def compute_bin_size(self, genome_length): bin_size = bin_size - (bin_size % 100) if bin_size < 1: - print_error("CoPTR-Ref", + print_error("CoPTR-Ref", "{}\n{}\n{}".format("found complete reference genome with <500bp", "this is probably a mislabeled contig", "please check your .genomes file", @@ -257,7 +257,7 @@ def compute_rolling_sum(self, read_positions, genome_length, bin_size): rolling_counts : np.array of float The read count in each rolling bin endpoints : np.array of tuple - Each tuple gives the left (inclusive) and right (exclusive) + Each tuple gives the left (inclusive) and right (exclusive) end point of each bin. """ step = math.ceil(bin_size/100) @@ -277,7 +277,7 @@ def compute_rolling_sum(self, read_positions, genome_length, bin_size): def remove_reads_by_region(self, read_positions, genome_length, regions): """Remove reads that overlap a region in regions. - + Parameters ---------- read_positions : np.array of int @@ -358,7 +358,7 @@ def filter_reads_phase1(self, read_positions, genome_length, bin_size): remove_regions.append( (remove_start, remove_end) ) read_positions, new_genome_length = self.remove_reads_by_region(read_positions, genome_length, remove_regions) - + binned_reads = self.bin_reads(read_positions, genome_length, bin_size) return read_positions, new_genome_length @@ -437,7 +437,7 @@ def filter_reads_phase2(self, read_positions, genome_length, bin_size): def bin_reads(self, read_positions, genome_length, bin_size=10000): """Aggregate reads into bin_size windows and compute the read count in each window. - + Parameters ---------- read_positions : np.array of int @@ -647,7 +647,7 @@ def estimate_ptr(self, read_positions, ref_genome_len, filter_reads=True, estima for ori in oris: xs.append([ori, log_ptr]) xs = np.array(xs) - + # initial ori estimate, initial ptr estimate best_x = 0 best_f = np.inf @@ -719,7 +719,7 @@ def update_ori_ter(self, log2_ptrs, read_locs_list): def update_ptrs(self, ori, ter, prv_log2_ptrs, read_locs_list): """Compute maximum likelihood PTR estimates across samples given an estimated replication origin and terminus. 
- + Parameters ---------- ori : float @@ -842,7 +842,7 @@ def plot_fit(coptr_ref_est, read_positions, genome_length, min_reads, min_cov, p min_y = np.min(binned_counts[binned_counts != 0]) min_y -= 0.5*min_y max_y = np.max(binned_counts[binned_counts != 0]) + 0.008*binned_counts.sum() - + # plot bins ax[0].scatter(x, binned_counts) ax[0].set_yscale("log", base=2) @@ -871,7 +871,7 @@ def plot_fit(coptr_ref_est, read_positions, genome_length, min_reads, min_cov, p ax[1].set_ylim((min_y, max_y)) # plot fit - y = [ coptr.compute_log_likelihood(coptr_ref_est.ori_estimate, coptr_ref_est.ter_estimate, coptr_ref_est.estimate, xi) + y = [ coptr.compute_log_likelihood(coptr_ref_est.ori_estimate, coptr_ref_est.ter_estimate, coptr_ref_est.estimate, xi) for xi in x ] y = np.power(2, y)*(x[1] - x[0]) ax[1].plot(x, y, c="black", linewidth=3) @@ -963,4 +963,4 @@ def estimate_ptrs_coptr_ref(ref_genome_ids, grouped_coverage_map_folder, min_rea if plot_folder is not None and not np.isnan(est.estimate): plot_fit(est, sample.read_positions, sample.length, min_reads, min_cov, plot_folder) - return coptr_ref_estimates \ No newline at end of file + return coptr_ref_estimates diff --git a/src/coptr/poisson_pca.py b/src/coptr/poisson_pca.py index f45b1f5..bb353a7 100644 --- a/src/coptr/poisson_pca.py +++ b/src/coptr/poisson_pca.py @@ -21,9 +21,11 @@ along with CoPTR. If not, see . """ +import sys + import numpy as np import scipy.optimize -import sys + class PoissonPCA: """Principal Component Analysis with Poisson observations. diff --git a/src/coptr/print.py b/src/coptr/print.py index dbc1975..9d179b8 100644 --- a/src/coptr/print.py +++ b/src/coptr/print.py @@ -24,6 +24,7 @@ import sys import time + def print_error(module, msg, quit=True): """Print an error message to the standard error. diff --git a/src/coptr/read_assigner.py b/src/coptr/read_assigner.py index 19f71d6..f960b3b 100644 --- a/src/coptr/read_assigner.py +++ b/src/coptr/read_assigner.py @@ -23,14 +23,11 @@ """ import numpy as np +import scipy.misc import scipy.special import scipy.stats -import scipy.misc - -from scipy.special import digamma -from scipy.special import loggamma - from scipy.sparse import csr_matrix +from scipy.special import digamma class ReadAssigner(object): diff --git a/src/coptr/read_mapper.py b/src/coptr/read_mapper.py index 3ea51fb..2865886 100644 --- a/src/coptr/read_mapper.py +++ b/src/coptr/read_mapper.py @@ -22,16 +22,16 @@ along with CoPTR. If not, see . """ -import numpy as np import os import os.path -import pysam import subprocess as sub import sys import time -from src.print import print_error, print_info -from src.util import get_fastq_name +import pysam + +from .print import print_error, print_info +from .util import get_fastq_name class ReadMapper: @@ -215,7 +215,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): print_info("ReadMapper", "cleaning up {}".format(out_sam)) call = ["rm", out_sam] sub.check_call(call) - + # paired end sequencing elif os.path.isdir(inputf) and paired: valid_ext = [".fastq", "fq", ".gz"] diff --git a/src/coptr/util.py b/src/coptr/util.py index a3ec115..30253da 100644 --- a/src/coptr/util.py +++ b/src/coptr/util.py @@ -23,6 +23,7 @@ import os.path + def get_fastq_name(fpath): """Remove the path and extension from a fastq file. 
diff --git a/test/test_bam_processor.py b/test/test_bam_processor.py index c4c0dc7..ccf2d56 100644 --- a/test/test_bam_processor.py +++ b/test/test_bam_processor.py @@ -1,13 +1,15 @@ import os -import pysam import unittest -from src.bam_processor import BamProcessor +import pysam + +from coptr.bam_processor import BamProcessor + def read_to_dict(read): read = read.strip("\n").split() read_dict = { - "name" : read[0], + "name" : read[0], "flag" : read[1], "ref_name" : read[2], "ref_pos" : read[3], @@ -26,9 +28,9 @@ def read_to_dict(read): class TestBamProcessor(unittest.TestCase): def setUp(self): - header = { + header = { "HD" : {"VN": "1.0", "SO": "unsorted"}, - "SQ" : + "SQ" : [ {"SN": "ref1|seq1", "LN" : 1000000}, {"SN": "ref1|seq2", "LN" : 1000000}, @@ -112,4 +114,4 @@ def test_merge_bam(self): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/test/test_coptr.py b/test/test_coptr.py index b9df749..a80c6b2 100644 --- a/test/test_coptr.py +++ b/test/test_coptr.py @@ -1,11 +1,13 @@ -import numpy as np import pickle as pkl -import scipy.stats import unittest -from src.bam_processor import CoverageMapContig -from src.coptr_ref import CoPTRRef -from src.coptr_contig import CoPTRContig +import numpy as np +import scipy.stats + +from coptr.bam_processor import CoverageMapContig +from coptr.coptr_contig import CoPTRContig +from coptr.coptr_ref import CoPTRRef + class TestCoPTR(unittest.TestCase): @@ -119,7 +121,7 @@ def simulate_reads(self, nreads, ptr = None, ori_pos = None): # convert counts to coordinates read_positions = [] for i,c in enumerate(binned_counts): - size = binned_counts.size + size = binned_counts.size m = (0.5*(2*i + 1)/size) read_positions += [m for i in range(c)] read_positions = self.get_genome_length()*np.array(read_positions) @@ -188,4 +190,4 @@ def adjust_read_probs(self, ptr, ori_pos, ter_pos): new_probs[self.probs != 0] -= scipy.special.logsumexp(new_probs[self.probs != 0]) new_probs[self.probs != 0] = np.exp(new_probs[self.probs != 0]) - return new_probs \ No newline at end of file + return new_probs From 048b8eaf9f4b39f54cfe5d071f6f725ad5c4ec00 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 22 Jun 2021 16:25:07 +0200 Subject: [PATCH 11/48] chore: remove useless src-paths --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a01b28a..2fbbd37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,6 @@ python-version = ['py37'] [tool.isort] profile = 'black' -src_paths = ['src', 'test', 'setup.py'] skip = ['__init__.py'] lines_after_imports = 2 known_first_party = 'coptr' From 55dfb6541d85d977953d8f1ae74bf2b642a46f3e Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Tue, 22 Jun 2021 16:25:34 +0200 Subject: [PATCH 12/48] style: apply black formatting --- src/coptr/bam_processor.py | 232 ++++++++++++------- src/coptr/cli.py | 321 ++++++++++++++++++-------- src/coptr/compute_read_counts.py | 25 +- src/coptr/compute_rel_abun.py | 31 ++- src/coptr/coptr_contig.py | 198 ++++++++++------ src/coptr/coptr_ref.py | 379 +++++++++++++++++++++---------- src/coptr/poisson_pca.py | 54 +++-- src/coptr/print.py | 22 +- src/coptr/read_assigner.py | 64 +++--- src/coptr/read_mapper.py | 86 +++++-- src/coptr/util.py | 6 +- test/test_bam_processor.py | 63 ++--- test/test_coptr.py | 81 ++++--- 13 files changed, 1036 insertions(+), 526 deletions(-) diff --git a/src/coptr/bam_processor.py b/src/coptr/bam_processor.py index 53bdc0a..4e621e5 100644 --- a/src/coptr/bam_processor.py +++ b/src/coptr/bam_processor.py @@ -37,8 +37,7 @@ class ReadContainer: - """Container to store read positions and metadata. - """ + """Container to store read positions and metadata.""" def __init__(self): # read query_id -> [ids in read_data] @@ -53,7 +52,6 @@ def __init__(self): self.index_ref_name = {} self.next_ref_idx = 0 - def check_add_mapping(self, query_id, ref_name, ref_position, score): """Stores a mapping if it has an alignment score greater than or equal to the mappings seen so far. @@ -74,12 +72,12 @@ def check_add_mapping(self, query_id, ref_name, ref_position, score): best_score = -np.inf else: row_ids = list(self.reads[query_id]) - best_score = np.max(self.read_data[row_ids,2]) + best_score = np.max(self.read_data[row_ids, 2]) # we've run out of room in the read data matrix, if self.next_row_idx == self.read_data.shape[0]: - new_read_data = np.zeros((2*self.next_row_idx, 3)) - new_read_data[:self.next_row_idx, :] = self.read_data + new_read_data = np.zeros((2 * self.next_row_idx, 3)) + new_read_data[: self.next_row_idx, :] = self.read_data self.read_data = new_read_data if ref_name not in self.ref_names_index: @@ -98,7 +96,6 @@ def check_add_mapping(self, query_id, ref_name, ref_position, score): self.reads[query_id].append(self.next_row_idx) self.next_row_idx += 1 - def get_mappings(self, query_id): """Return the highest scoring mappings for a read. @@ -115,10 +112,10 @@ def get_mappings(self, query_id): A list of positions in each reference sequences. """ row_ids = list(self.reads[query_id]) - best_score = np.max(self.read_data[row_ids,2]) + best_score = np.max(self.read_data[row_ids, 2]) ref_names = [] ref_positions = [] - for row in self.read_data[row_ids,:]: + for row in self.read_data[row_ids, :]: ref_name_idx = int(row[0]) ref_pos = int(row[1]) score = int(row[2]) @@ -180,7 +177,6 @@ def __init__(self, regex="[^\|]+", is_bam=True): self.read_mode = "r" self.write_mode = "w" - def get_ref_names(self, bam_file): """Get the names of reference sequences and reference genomes. @@ -218,7 +214,6 @@ def get_ref_names(self, bam_file): return ref_seqs, ref_genomes - def extract_reads(self, bam_file): """Get the read positions from sam_file for each reference genome. 
@@ -260,7 +255,7 @@ def extract_reads(self, bam_file): # This minimum scores means that reads with perfect # quality scores that fall below min_identity will # be discarded - min_score = -6*(1-self.min_identity)*aln.query_alignment_length + min_score = -6 * (1 - self.min_identity) * aln.query_alignment_length alignment_score = aln.get_tag("AS") if alignment_score < min_score: @@ -273,7 +268,6 @@ def extract_reads(self, bam_file): alignment_file.close() return read_container, lengths - def get_ref_seq_lengths(self, bam_file): """Extract the lengths of each reference sequence from a bam file. @@ -299,8 +293,9 @@ def get_ref_seq_lengths(self, bam_file): alignment_file.close() return ref_seq_lengths - - def compute_bin_coverage(self, genome_ids, read_container, lengths, ref_seq_genome_id): + def compute_bin_coverage( + self, genome_ids, read_container, lengths, ref_seq_genome_id + ): """Compute fraction of bins with reads for each reference genome. Parameters @@ -326,7 +321,8 @@ def compute_bin_coverage(self, genome_ids, read_container, lengths, ref_seq_geno for read_id in sorted(read_container.reads): ref_names, ref_positions = read_container.get_mappings(read_id) - if len(ref_names) > 1: continue + if len(ref_names) > 1: + continue bin_idx = math.floor(ref_positions[0] / 5000) binned_counts[ref_names[0]][bin_idx] += 1 @@ -335,8 +331,12 @@ def compute_bin_coverage(self, genome_ids, read_container, lengths, ref_seq_geno for ref_seq in binned_counts: genome_id = ref_seq_genome_id[ref_seq] - covered_bins[genome_id] = covered_bins.get(genome_id, 0) + (binned_counts[ref_seq] != 0).sum() - total_bins[genome_id] = total_bins.get(genome_id, 0) + binned_counts[ref_seq].size + covered_bins[genome_id] = ( + covered_bins.get(genome_id, 0) + (binned_counts[ref_seq] != 0).sum() + ) + total_bins[genome_id] = ( + total_bins.get(genome_id, 0) + binned_counts[ref_seq].size + ) # now add contigs without reads for ref_seq in ref_seq_genome_id: @@ -348,13 +348,13 @@ def compute_bin_coverage(self, genome_ids, read_container, lengths, ref_seq_geno coverage_frac = {} for genome_id in genome_ids: if genome_id in covered_bins: - coverage_frac[genome_id] = covered_bins[genome_id] / total_bins[genome_id] + coverage_frac[genome_id] = ( + covered_bins[genome_id] / total_bins[genome_id] + ) else: coverage_frac[genome_id] = 0 return coverage_frac - - def assign_multimapped_reads(self, read_container, lengths, max_alignments): """Assign multiple mapped reads to a single genome. @@ -385,7 +385,9 @@ def assign_multimapped_reads(self, read_container, lengths, max_alignments): ref_seq_genome_id[ref_name] = genome_id genome_ids.add(genome_id) - genome_coverage = self.compute_bin_coverage(genome_ids, read_container, lengths, ref_seq_genome_id) + genome_coverage = self.compute_bin_coverage( + genome_ids, read_container, lengths, ref_seq_genome_id + ) print_info("BamProcessor", "collecting multi-mapped reads") genome_ids = sorted(list(genome_ids)) @@ -428,7 +430,6 @@ def assign_multimapped_reads(self, read_container, lengths, max_alignments): nreads += 1 indptr.append(len(indices)) - if len(data) > 0: X = csr_matrix((data, indices, indptr), shape=(nreads, len(genome_ids))) @@ -480,8 +481,6 @@ def assign_multimapped_reads(self, read_container, lengths, max_alignments): return read_positions, ref_seq_genome_id - - def process_bam(self, bam_file, max_alignments=10): """Extract read coordinates along each reference sequence. 
@@ -509,7 +508,9 @@ def process_bam(self, bam_file, max_alignments=10): # assign multiply mapped reads to reference sequences # 1. dict[ref_seq_id] -> int (read position) # 2. dict[ref_seq_id] -> genome_id - read_positions, ref_seq_genome_id = self.assign_multimapped_reads(reads, lengths, max_alignments) + read_positions, ref_seq_genome_id = self.assign_multimapped_reads( + reads, lengths, max_alignments + ) # fill in read_positions with remaining sequence ids for seq_id in lengths: @@ -543,23 +544,29 @@ def process_bam(self, bam_file, max_alignments=10): contig_read_positions[genome_id][ref] = read_positions[ref] contig_lengths[genome_id][ref] = lengths[ref] - # now, store coverage maps for each refence genome coverage_maps = {} for genome_id in contig_read_positions: is_assembly = len(contig_read_positions[genome_id]) > 1 if is_assembly: - coverage_maps[genome_id] = \ - CoverageMapContig(bam_file, genome_id, contig_read_positions[genome_id], contig_lengths[genome_id]) + coverage_maps[genome_id] = CoverageMapContig( + bam_file, + genome_id, + contig_read_positions[genome_id], + contig_lengths[genome_id], + ) else: ref_id = list(contig_read_positions[genome_id].keys())[0] - coverage_maps[genome_id] = \ - CoverageMapRef(bam_file, genome_id, contig_read_positions[genome_id][ref_id], contig_lengths[genome_id][ref_id]) + coverage_maps[genome_id] = CoverageMapRef( + bam_file, + genome_id, + contig_read_positions[genome_id][ref_id], + contig_lengths[genome_id][ref_id], + ) return coverage_maps - def merge(self, bam_files, out_bam): """Merge many bam files from different indexes into one, taking the reads with the highest mapping quality from each bam. @@ -598,13 +605,12 @@ def merge(self, bam_files, out_bam): # combined header header = { - "HD" : {"VN": "1.0", "SO": "unsorted"}, - "SQ" : [{"SN": sq, "LN" : seq_len[sq]} for sq in sorted(seq_len)] + "HD": {"VN": "1.0", "SO": "unsorted"}, + "SQ": [{"SN": sq, "LN": seq_len[sq]} for sq in sorted(seq_len)], } seq_names = sorted(seq_len.keys()) - print_info("BamProcessor", "writing merged file {}".format(out_bam)) out = pysam.AlignmentFile(out_bam, self.write_mode, header=header) for bam_file in bam_files: @@ -614,13 +620,19 @@ def merge(self, bam_files, out_bam): continue alignment_score = in_aln.get_tag("AS") - if in_aln.query_name in score and alignment_score == score[in_aln.query_name] and not in_aln.is_read2: + if ( + in_aln.query_name in score + and alignment_score == score[in_aln.query_name] + and not in_aln.is_read2 + ): # reference name will be wrong if it is not expicility set out_aln = pysam.AlignedSegment() out_aln.query_name = in_aln.query_name out_aln.query_sequence = in_aln.query_sequence out_aln.flag = in_aln.flag - out_aln.reference_id = bisect.bisect_left(seq_names, in_aln.reference_name) + out_aln.reference_id = bisect.bisect_left( + seq_names, in_aln.reference_name + ) out_aln.reference_start = in_aln.reference_start out_aln.cigar = in_aln.cigar out_aln.template_length = in_aln.template_length @@ -628,7 +640,9 @@ def merge(self, bam_files, out_bam): out_aln.tags = in_aln.tags # check that reference sequence is set correctly - assert seq_names[out_aln.reference_id] == in_aln.reference_name, "missing reference sequence from {}".format(bam_file) + assert ( + seq_names[out_aln.reference_id] == in_aln.reference_name + ), "missing reference sequence from {}".format(bam_file) out.write(out_aln) inf.close() @@ -636,8 +650,6 @@ def merge(self, bam_files, out_bam): print_info("BamProcessor", "finished writing {}".format(out_bam)) - - class 
CoverageMap: """Data structure to store read positions along reference genomes. @@ -662,7 +674,6 @@ def __init__(self, bam_file, genome_id, is_assembly): self.is_assembly = is_assembly - class CoverageMapRef(CoverageMap): """Data structure to store read positions from complete reference genomes. @@ -676,38 +687,54 @@ class CoverageMapRef(CoverageMap): A list of coordinates, one per read. """ - __slots__ = ("bam_file", "genome_id", "sample_id", "is_assembly", "read_positions", "length") - + __slots__ = ( + "bam_file", + "genome_id", + "sample_id", + "is_assembly", + "read_positions", + "length", + ) def __getstate__(self): - return self.bam_file, self.genome_id, self.sample_id, self.is_assembly, \ - self.read_positions, self.length - + return ( + self.bam_file, + self.genome_id, + self.sample_id, + self.is_assembly, + self.read_positions, + self.length, + ) def __setstate__(self, state): if type(state) == list or type(state) == tuple: - self.bam_file, self.genome_id, self.sample_id, self.is_assembly, \ - self.read_positions, self.length = state + ( + self.bam_file, + self.genome_id, + self.sample_id, + self.is_assembly, + self.read_positions, + self.length, + ) = state else: for key in state: setattr(self, key, state[key]) - def __init__(self, bam_file, genome_id, read_positions, length): super().__init__(bam_file, genome_id, is_assembly=False) self.read_positions = np.array(read_positions) self.length = length - def __str__(self): - return "CoverageMapRef(bam_file={}, genome_id={}, sample_id={}, reads={})".format( - self.bam_file, self.genome_id, self.sample_id, len(self.read_positions) + return ( + "CoverageMapRef(bam_file={}, genome_id={}, sample_id={}, reads={})".format( + self.bam_file, self.genome_id, self.sample_id, len(self.read_positions) + ) ) def __repr__(self): return self.__str__() - def get_length(self): """Get the lenth of the genome. @@ -718,7 +745,6 @@ def get_length(self): """ return self.length - def get_reads(self): """Get the read coordinates along the reference genome. @@ -730,7 +756,6 @@ def get_reads(self): reads = np.copy(self.read_positions) return reads - def count_reads(self): """Return number of mapped reads. @@ -742,7 +767,6 @@ def count_reads(self): return np.array(self.read_positions).size - class CoverageMapContig(CoverageMap): """Data structure to store read positions from assemblies. @@ -761,30 +785,65 @@ class CoverageMapContig(CoverageMap): and value is the length of that contig. 
""" - __slots__ = ("bam_file", "genome_id", "sample_id", "is_assembly", - "contig_ids", "contig_read_positions", "contig_lengths", "bin_size", - "binned_reads", "total_bins", "frac_nonzero", "reads", "passed_qc_flag") - + __slots__ = ( + "bam_file", + "genome_id", + "sample_id", + "is_assembly", + "contig_ids", + "contig_read_positions", + "contig_lengths", + "bin_size", + "binned_reads", + "total_bins", + "frac_nonzero", + "reads", + "passed_qc_flag", + ) def __getstate__(self): - return self.bam_file, self.genome_id, self.sample_id, self.is_assembly, \ - self.contig_ids, self.contig_read_positions, self.contig_lengths, self.bin_size, \ - self.binned_reads, self.total_bins, self.frac_nonzero, self.reads, self.passed_qc_flag - + return ( + self.bam_file, + self.genome_id, + self.sample_id, + self.is_assembly, + self.contig_ids, + self.contig_read_positions, + self.contig_lengths, + self.bin_size, + self.binned_reads, + self.total_bins, + self.frac_nonzero, + self.reads, + self.passed_qc_flag, + ) def __setstate__(self, state): if type(state) == list or type(state) == tuple: - self.bam_file, self.genome_id, self.sample_id, self.is_assembly, \ - self.contig_ids, self.contig_read_positions, self.contig_lengths, self.bin_size, \ - self.binned_reads, self.total_bins, self.frac_nonzero, self.reads, self.passed_qc_flag = state + ( + self.bam_file, + self.genome_id, + self.sample_id, + self.is_assembly, + self.contig_ids, + self.contig_read_positions, + self.contig_lengths, + self.bin_size, + self.binned_reads, + self.total_bins, + self.frac_nonzero, + self.reads, + self.passed_qc_flag, + ) = state else: for key in state: setattr(self, key, state[key]) # load old pickle files for contig in self.contig_read_positions: - self.contig_read_positions[contig] = array.array("I", self.contig_read_positions[contig]) - + self.contig_read_positions[contig] = array.array( + "I", self.contig_read_positions[contig] + ) def __init__(self, bam_file, genome_id, contig_read_positions, contig_lengths): super().__init__(bam_file, genome_id, is_assembly=True) @@ -793,7 +852,9 @@ def __init__(self, bam_file, genome_id, contig_read_positions, contig_lengths): self.contig_read_positions = {} for contig in contig_read_positions: - self.contig_read_positions[contig] = array.array("I", contig_read_positions[contig]) + self.contig_read_positions[contig] = array.array( + "I", contig_read_positions[contig] + ) self.bin_size = None self.binned_reads = {} @@ -806,17 +867,19 @@ def __init__(self, bam_file, genome_id, contig_read_positions, contig_lengths): # flag for qc check self.passed_qc_flag = None - def __str__(self): return "CoverageMapContig(bam_file={}, genome_id={}, ncontigs={}, nreads={}, cov_frac={}, passed_qc={})".format( - self.bam_file, self.genome_id, len(self.contig_ids), self.reads, self.frac_nonzero, self.passed_qc_flag + self.bam_file, + self.genome_id, + len(self.contig_ids), + self.reads, + self.frac_nonzero, + self.passed_qc_flag, ) - def __repr__(self): return self.__str__() - def reset(self): self.bin_size = None self.binned_reads = {} @@ -825,7 +888,6 @@ def reset(self): self.reads = None self.passed_qc_flag = None - def compute_bin_size(self): """Compute bin size for read counts. 
@@ -843,7 +905,8 @@ def compute_bin_size(self):
         total_length = 0
         for contig_id in self.contig_lengths:
             length = self.contig_lengths[contig_id]
-            if length >= 11000: total_length += length
+            if length >= 11000:
+                total_length += length
 
         # bound bin size below by 1000
         bin_size = np.max((1000, total_length / target_bins))
@@ -854,7 +917,6 @@ def compute_bin_size(self):
 
         return bin_size
 
-
     def get_length(self, contig_id):
         """Get the length of a contig.
 
@@ -871,7 +933,6 @@ def get_length(self, contig_id):
         length = self.contig_lengths[contig_id]
         return length
 
-
     def get_reads(self, contig_id):
         """Get the coordinates of all reads for a contig.
 
@@ -888,7 +949,6 @@ def get_reads(self, contig_id):
         reads = np.copy(self.contig_read_positions[contig_id])
         return reads
 
-
     def bin_reads(self, contig_id):
         """Count reads in bins.
 
@@ -927,7 +987,6 @@ def bin_reads(self, contig_id):
 
         return self.binned_reads[contig_id]
 
-
     def count_reads(self):
         """Return number of mapped reads.
 
@@ -941,10 +1000,8 @@ def count_reads(self):
             nreads += np.array(self.contig_read_positions[contig_id]).size
         return nreads
 
-
     def passed_qc(self):
-        """Run basic quality control checks. Sets the passed_gc_flag attribute.
-        """
+        """Run basic quality control checks. Sets the passed_qc_flag attribute."""
         if self.passed_qc_flag is not None:
             return self.passed_qc_flag
         total_bins = 0
@@ -952,7 +1009,8 @@ def passed_qc(self):
         zero_bins = 0
         for cid in self.contig_ids:
             length = self.contig_lengths[cid]
-            if length < 11000: continue
+            if length < 11000:
+                continue
             bins = self.bin_reads(cid)
             total_bins += bins.size
             total_reads += bins.sum()
@@ -963,7 +1021,9 @@ def passed_qc(self):
         else:
             self.passed_qc_flag = True
 
-        self.frac_nonzero = (total_bins - zero_bins) / total_bins if total_bins > 0 else 0
-        self.reads = total_reads
+        self.frac_nonzero = (
+            (total_bins - zero_bins) / total_bins if total_bins > 0 else 0
+        )
+        self.reads = total_reads
         self.total_bins = total_bins
 
         return self.passed_qc_flag
diff --git a/src/coptr/cli.py b/src/coptr/cli.py
index f66ebbe..32fb50a 100644
--- a/src/coptr/cli.py
+++ b/src/coptr/cli.py
@@ -33,13 +33,15 @@
 from .util import get_fastq_name
 
 
-VERSION="1.1.1"
+VERSION = "1.1.1"
 
 
-class ProgramOptions:
 
+class ProgramOptions:
     def __init__(self):
         parser = argparse.ArgumentParser(
-            description="CoPTR (v{}): Compute PTRs from complete reference genomes and assemblies.".format(VERSION),
+            description="CoPTR (v{}): Compute PTRs from complete reference genomes and assemblies.".format(
+                VERSION
+            ),
             usage="""coptr.py <command> [options]
 
 command:
     index      create a bowtie2 index for a reference database
     map        map reads against a reference database
     merge      merge BAM files from reads mapped to multiple indexes
     extract    compute coverage maps from bam files
     estimate   estimate PTRs from coverage maps
     count      compute read counts for each genome after filtering
     rabun      estimate relative abundances for each genomes after filtering
-"""
+""",
         )
 
         self.default_bt2_k = 10
@@ -66,19 +68,37 @@ def __init__(self):
             exit(1)
         getattr(self, args.command)()
 
-
     def index(self):
-        parser = argparse.ArgumentParser(usage="coptr.py index [-h] [--bt2-bmax BT2_BMAX] [--bt2-dcv BT2_DCV] [--bt2-threads BT2_THREADS] [--bt2-packed] ref-fasta index-out")
-        parser.add_argument("ref_fasta", help=
+        parser = argparse.ArgumentParser(
+            usage="coptr.py index [-h] [--bt2-bmax BT2_BMAX] [--bt2-dcv BT2_DCV] [--bt2-threads BT2_THREADS] [--bt2-packed] ref-fasta index-out"
+        )
+        parser.add_argument(
+            "ref_fasta",
+            help="""File or folder containing fasta to index.
If a folder, the extension for each fasta must be one of [.fasta, .fna, .fa] - """ + """, ) parser.add_argument("index_out", help="Filepath to store index.") - parser.add_argument("--bt2-bmax", default=None, help="Set the --bmax arguement for bowtie2-build. Used to control memory useage.") - parser.add_argument("--bt2-dcv", default=None, help="Set the --dcv argument for bowtie2-build. Used to control memory usage.") - parser.add_argument("--bt2-threads", default="1", help="Number of threads to pass to bowtie2-build.") - parser.add_argument("--bt2-packed", action="store_true", help="Set the --packed flag for bowtie2-build. Used to control memory usage.") + parser.add_argument( + "--bt2-bmax", + default=None, + help="Set the --bmax arguement for bowtie2-build. Used to control memory useage.", + ) + parser.add_argument( + "--bt2-dcv", + default=None, + help="Set the --dcv argument for bowtie2-build. Used to control memory usage.", + ) + parser.add_argument( + "--bt2-threads", + default="1", + help="Number of threads to pass to bowtie2-build.", + ) + parser.add_argument( + "--bt2-packed", + action="store_true", + help="Set the --packed flag for bowtie2-build. Used to control memory usage.", + ) if len(sys.argv[2:]) < 1: parser.print_help() @@ -86,45 +106,72 @@ def index(self): args = parser.parse_args(sys.argv[2:]) read_mapper = ReadMapper() - read_mapper.index(args.ref_fasta, args.index_out, args.bt2_bmax, args.bt2_dcv, args.bt2_threads, args.bt2_packed) - + read_mapper.index( + args.ref_fasta, + args.index_out, + args.bt2_bmax, + args.bt2_dcv, + args.bt2_threads, + args.bt2_packed, + ) def map(self): - parser = argparse.ArgumentParser(usage="coptr.py map [-h] [--threads INT] [--bt2-k INT] [--paired] index input out-folder") + parser = argparse.ArgumentParser( + usage="coptr.py map [-h] [--threads INT] [--bt2-k INT] [--paired] index input out-folder" + ) parser.add_argument("index", help="Name of database index.") - parser.add_argument("input", help= - """File or folder containing fastq reads to map. If a folder, the extension for + parser.add_argument( + "input", + help="""File or folder containing fastq reads to map. If a folder, the extension for each fastq must be one of [.fastq, .fq, .fastq.gz, fq.gz] - """ + """, + ) + parser.add_argument( + "out_folder", help="Folder to save mapped reads. BAM files are output here." ) - parser.add_argument("out_folder", - help="Folder to save mapped reads. BAM files are output here." + parser.add_argument( + "--paired", + action="store_true", + help="Set for paired end reads. Assumes fastq files end in _1.* and _2.*", ) - parser.add_argument("--paired", action="store_true", - help="Set for paired end reads. Assumes fastq files end in _1.* and _2.*") - parser.add_argument("--threads", type=int, default=1, - help="Number of threads for bowtie2 mapping." + parser.add_argument( + "--threads", + type=int, + default=1, + help="Number of threads for bowtie2 mapping.", ) - parser.add_argument("--bt2-k", type=int, default=self.default_bt2_k, + parser.add_argument( + "--bt2-k", + type=int, + default=self.default_bt2_k, help="(Default 10). Number of alignments to report. 
Passed to -k flag of bowtie2.", - ) if len(sys.argv[2:]) < 1: parser.print_help() exit(1) - args = parser.parse_args(sys.argv[2:]) read_mapper = ReadMapper() - read_mapper.map(args.index, args.input, args.out_folder, args.paired, args.threads, args.bt2_k) - + read_mapper.map( + args.index, + args.input, + args.out_folder, + args.paired, + args.threads, + args.bt2_k, + ) def merge(self): - parser = argparse.ArgumentParser(usage="coptr.py merge [-h] in-bam1 in-bam2 ... in-bamN out-bam") - parser.add_argument("in-bams", nargs="+", - help="A space separated list of BAM files to merge. Assumes same reads were mapped against different indexes. " + - "Only keeps read 1 of paired end sequencing, since this is used downstream.") + parser = argparse.ArgumentParser( + usage="coptr.py merge [-h] in-bam1 in-bam2 ... in-bamN out-bam" + ) + parser.add_argument( + "in-bams", + nargs="+", + help="A space separated list of BAM files to merge. Assumes same reads were mapped against different indexes. " + + "Only keeps read 1 of paired end sequencing, since this is used downstream.", + ) parser.add_argument("out-bam", help="Path to merged BAM.") if len(sys.argv[2:]) < 2: @@ -137,20 +184,28 @@ def merge(self): bam_processor = BamProcessor() bam_processor.merge(in_bams, out_bam) - def extract(self): - parser = argparse.ArgumentParser(usage="coptr.py extract [-h] [--ref-genome-regex REF_GENOME_REGEX] [--check-regex] in-folder out-folder") + parser = argparse.ArgumentParser( + usage="coptr.py extract [-h] [--ref-genome-regex REF_GENOME_REGEX] [--check-regex] in-folder out-folder" + ) parser.add_argument("in_folder", help="Folder with BAM files.") parser.add_argument("out_folder", help="Folder to store coverage maps.") - parser.add_argument("--ref-genome-regex", default="[^\|]+", + parser.add_argument( + "--ref-genome-regex", + default="[^\|]+", help="Regular expression extracting a reference genome id from the sequence id in a bam file.", ) - parser.add_argument("--check-regex", action="store_true", default=False, - help="Check the regular expression by counting reference genomes without processing." 
+ parser.add_argument( + "--check-regex", + action="store_true", + default=False, + help="Check the regular expression by counting reference genomes without processing.", ) - parser.add_argument("--bt2-k", type=int, default=self.default_bt2_k, + parser.add_argument( + "--bt2-k", + type=int, + default=self.default_bt2_k, help="Maximum number of alignments.", - ) if len(sys.argv[2:]) < 1: @@ -170,8 +225,13 @@ def extract(self): ref_sequences.update(seq) ref_genomes.update(gen) - if os.path.isfile(os.path.join(args.out_folder, get_fastq_name(f) + ".cm.pkl")): - print_info("BamProcessor", "output for {} already found, skipping".format(fname)) + if os.path.isfile( + os.path.join(args.out_folder, get_fastq_name(f) + ".cm.pkl") + ): + print_info( + "BamProcessor", + "output for {} already found, skipping".format(fname), + ) continue # don't process the rest of the bam file if we just want to @@ -180,36 +240,64 @@ def extract(self): continue coverage_maps = bam_processor.process_bam(fpath, args.bt2_k) - with open(os.path.join(args.out_folder, get_fastq_name(f) + ".cm.pkl"), "wb") as f: + with open( + os.path.join(args.out_folder, get_fastq_name(f) + ".cm.pkl"), "wb" + ) as f: pkl.dump(coverage_maps, f) - print_info("BamProcessor", "found {} reference sequences corresponding to {} genomes".format(len(ref_sequences), len(ref_genomes))) + print_info( + "BamProcessor", + "found {} reference sequences corresponding to {} genomes".format( + len(ref_sequences), len(ref_genomes) + ), + ) if args.check_regex: print_info("BamProcessor", "reference genome ids:") for ref in sorted(ref_genomes): print("\t", ref, file=sys.stderr) - def estimate(self): - parser = argparse.ArgumentParser(usage= - """usage: coptr.py estimate [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] [--threads THREADS] [--plot OUTFOLDER] [--restart] coverage-map-folder out-file + parser = argparse.ArgumentParser( + usage="""usage: coptr.py estimate [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] [--threads THREADS] [--plot OUTFOLDER] [--restart] coverage-map-folder out-file """ ) - parser.add_argument("coverage_map_folder", help="Folder with coverage maps computed from 'extract'.") + parser.add_argument( + "coverage_map_folder", + help="Folder with coverage maps computed from 'extract'.", + ) parser.add_argument("out_file", help="Filename to store PTR table.") - parser.add_argument("--min-reads", type=float, - help="Minimum number of reads required to compute a PTR (default 5000).", default=5000 + parser.add_argument( + "--min-reads", + type=float, + help="Minimum number of reads required to compute a PTR (default 5000).", + default=5000, + ) + parser.add_argument( + "--min-cov", + type=float, + help="Fraction of nonzero bins required to compute a PTR (default 0.75).", + default=0.75, ) - parser.add_argument("--min-cov", type=float, - help="Fraction of nonzero bins required to compute a PTR (default 0.75).", default=0.75 + parser.add_argument( + "--min-samples", + type=float, + help="CoPTRContig only. Minimum number of samples required to reorder bins (default 5).", + default=5, ) - parser.add_argument("--min-samples", type=float, - help="CoPTRContig only. 
Minimum number of samples required to reorder bins (default 5).", default=5 + parser.add_argument( + "--threads", + type=int, + help="Number of threads to use (default 1).", + default=1, ) - parser.add_argument("--threads", type=int, help="Number of threads to use (default 1).", default=1) - parser.add_argument("--plot", default=None, help="Plot model fit and save the results.") - parser.add_argument("--restart", default=False, action="store_true", - help="Restarts the estimation step using the genomes in the coverage-maps-genome folder." + parser.add_argument( + "--plot", default=None, help="Plot model fit and save the results." + ) + parser.add_argument( + "--restart", + default=False, + action="store_true", + help="Restarts the estimation step using the genomes in the coverage-maps-genome folder.", ) if len(sys.argv[2:]) < 1: @@ -221,19 +309,25 @@ def estimate(self): ref_genome_ids = set() assembly_genome_ids = set() - - grouped_coverage_map_folder = os.path.join(args.coverage_map_folder, "coverage-maps-genome") + grouped_coverage_map_folder = os.path.join( + args.coverage_map_folder, "coverage-maps-genome" + ) if not args.restart and not os.path.isdir(grouped_coverage_map_folder): os.mkdir(grouped_coverage_map_folder) if args.restart: - print_info("CoPTR", "restarting from files in " + grouped_coverage_map_folder) + print_info( + "CoPTR", "restarting from files in " + grouped_coverage_map_folder + ) for cm_file in sorted(os.listdir(grouped_coverage_map_folder)): _, ext = os.path.splitext(cm_file) - if ext != ".pkl": continue + if ext != ".pkl": + continue print_info("CoPTR", "checking " + cm_file) - with open(os.path.join(grouped_coverage_map_folder, cm_file), "rb") as f: + with open( + os.path.join(grouped_coverage_map_folder, cm_file), "rb" + ) as f: try: while True: cm = pkl.load(f) @@ -253,7 +347,8 @@ def estimate(self): # first construct a list of genome_ids for ptr estimates for f in sorted(os.listdir(args.coverage_map_folder)): fname, ext = os.path.splitext(f) - if ext != ".pkl": continue + if ext != ".pkl": + continue fpath = os.path.join(args.coverage_map_folder, f) print_info("CoPTR", "\tprocessing {}".format(f)) @@ -268,10 +363,19 @@ def estimate(self): if coverage_maps[ref_id].count_reads() < args.min_reads: continue - write_mode = "ab+" if ref_id in ref_genome_ids or ref_id in assembly_genome_ids else "wb" + write_mode = ( + "ab+" + if ref_id in ref_genome_ids or ref_id in assembly_genome_ids + else "wb" + ) # append to genome file - with open(os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), write_mode) as tofile: + with open( + os.path.join( + grouped_coverage_map_folder, ref_id + ".cm.pkl" + ), + write_mode, + ) as tofile: pkl.dump(coverage_maps[ref_id], tofile) if coverage_maps[ref_id].is_assembly: @@ -285,12 +389,20 @@ def estimate(self): sample_ids = sorted(list(sample_ids)) results_ref = estimate_ptrs_coptr_ref( - ref_genome_ids, grouped_coverage_map_folder, args.min_reads, args.min_cov, - threads=args.threads, plot_folder=args.plot + ref_genome_ids, + grouped_coverage_map_folder, + args.min_reads, + args.min_cov, + threads=args.threads, + plot_folder=args.plot, ) results_contig = estimate_ptrs_coptr_contig( - assembly_genome_ids, grouped_coverage_map_folder, args.min_reads, args.min_samples, - threads=args.threads, plot_folder=args.plot + assembly_genome_ids, + grouped_coverage_map_folder, + args.min_reads, + args.min_samples, + threads=args.threads, + plot_folder=args.plot, ) out_file = args.out_file @@ -334,26 +446,36 @@ def estimate(self): 
f.write(",".join(row) + "\n") print_info("CoPTR", "done!") - print_info("CoPTR", "you may remove the folder {}".format(grouped_coverage_map_folder)) + print_info( + "CoPTR", "you may remove the folder {}".format(grouped_coverage_map_folder) + ) # print_info("CoPTR", "cleaning up {}".format(grouped_coverage_map_folder)) # for ref_id in ref_genome_ids.union(assembly_genome_ids): # filename = os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl") # os.remove(filename) # os.rmdir(grouped_coverage_map_folder) - def count(self): - parser = argparse.ArgumentParser(usage= - """usage: coptr.py count [-h] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] coverage-map-folder out-file + parser = argparse.ArgumentParser( + usage="""usage: coptr.py count [-h] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] coverage-map-folder out-file """ ) - parser.add_argument("coverage_map_folder", help="Folder with coverage maps computed from 'extract'.") + parser.add_argument( + "coverage_map_folder", + help="Folder with coverage maps computed from 'extract'.", + ) parser.add_argument("out_file", help="Filename to store PTR table.") - parser.add_argument("--min-cov", type=float, - help="Fraction of nonzero bins required to compute a PTR (default 0.75).", default=0.75 + parser.add_argument( + "--min-cov", + type=float, + help="Fraction of nonzero bins required to compute a PTR (default 0.75).", + default=0.75, ) - parser.add_argument("--min-samples", type=float, - help="CoPTRContig only. Minimum number of samples required to reorder bins (default 5).", default=5 + parser.add_argument( + "--min-samples", + type=float, + help="CoPTRContig only. Minimum number of samples required to reorder bins (default 5).", + default=5, ) if len(sys.argv[2:]) < 1: @@ -364,7 +486,9 @@ def count(self): print_info("CoPTR", "computing read counts") - counts, genome_ids = compute_read_counts(args.coverage_map_folder, args.min_cov, args.min_samples) + counts, genome_ids = compute_read_counts( + args.coverage_map_folder, args.min_cov, args.min_samples + ) out_file = args.out_file _, ext = os.path.splitext(out_file) @@ -392,22 +516,33 @@ def count(self): print_info("CoPTR", "done!") - def rabun(self): - parser = argparse.ArgumentParser(usage= - """usage: coptr.py rabun [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] coverage-map-folder out-file + parser = argparse.ArgumentParser( + usage="""usage: coptr.py rabun [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] coverage-map-folder out-file """ ) - parser.add_argument("coverage_map_folder", help="Folder with coverage maps computed from 'extract'.") + parser.add_argument( + "coverage_map_folder", + help="Folder with coverage maps computed from 'extract'.", + ) parser.add_argument("out_file", help="Filename to store PTR table.") - parser.add_argument("--min-reads", type=float, - help="Minimum number of reads required to compute a PTR (default 5000).", default=5000 + parser.add_argument( + "--min-reads", + type=float, + help="Minimum number of reads required to compute a PTR (default 5000).", + default=5000, ) - parser.add_argument("--min-cov", type=float, - help="Fraction of nonzero bins required to compute a PTR (default 0.75).", default=0.75 + parser.add_argument( + "--min-cov", + type=float, + help="Fraction of nonzero bins required to compute a PTR (default 0.75).", + default=0.75, ) - parser.add_argument("--min-samples", type=float, - help="CoPTRContig only. 
Minimum number of samples required to reorder bins (default 5).", default=5 + parser.add_argument( + "--min-samples", + type=float, + help="CoPTRContig only. Minimum number of samples required to reorder bins (default 5).", + default=5, ) if len(sys.argv[2:]) < 1: @@ -418,7 +553,9 @@ def rabun(self): print_info("CoPTR", "computing relative abundances") - rel_abun, genome_ids = compute_rel_abun(args.coverage_map_folder, args.min_reads, args.min_cov, args.min_samples) + rel_abun, genome_ids = compute_rel_abun( + args.coverage_map_folder, args.min_reads, args.min_cov, args.min_samples + ) out_file = args.out_file _, ext = os.path.splitext(out_file) diff --git a/src/coptr/compute_read_counts.py b/src/coptr/compute_read_counts.py index c197747..10bb635 100644 --- a/src/coptr/compute_read_counts.py +++ b/src/coptr/compute_read_counts.py @@ -47,14 +47,20 @@ def compute_read_counts_from_coverage_maps(coverage_maps, min_cov, min_samples): if cm.is_assembly and cm.passed_qc(): binned_reads = coptr_contig.construct_coverage_matrix([cm]) - lower_bound, upper_bound = coptr_contig.compute_genomewide_bounds(binned_reads) - count = binned_reads[np.logical_and(binned_reads >= lower_bound, binned_reads <= upper_bound)].sum() + lower_bound, upper_bound = coptr_contig.compute_genomewide_bounds( + binned_reads + ) + count = binned_reads[ + np.logical_and(binned_reads >= lower_bound, binned_reads <= upper_bound) + ].sum() read_counts[cm.genome_id] = count total_passing_reads += count genome_ids.add(cm.genome_id) elif not cm.is_assembly: - filtered_reads, filtered_length, qc_result = rf_ref.filter_reads(cm.read_positions, cm.length) + filtered_reads, filtered_length, qc_result = rf_ref.filter_reads( + cm.read_positions, cm.length + ) if qc_result.passed_qc: count = filtered_reads.size @@ -65,15 +71,14 @@ def compute_read_counts_from_coverage_maps(coverage_maps, min_cov, min_samples): return sample_id, read_counts, genome_ids - - def compute_read_counts(coverage_map_folder, min_cov, min_samples): read_counts = {} genome_ids = set() for f in sorted(os.listdir(coverage_map_folder)): fname, ext = os.path.splitext(f) - if ext != ".pkl": continue + if ext != ".pkl": + continue fpath = os.path.join(coverage_map_folder, f) print_info("Count", "\tprocessing {}".format(f)) @@ -81,7 +86,13 @@ def compute_read_counts(coverage_map_folder, min_cov, min_samples): with open(fpath, "rb") as file: coverage_maps = pkl.load(file) - sample_id, sample_read_counts, sample_genome_ids = compute_read_counts_from_coverage_maps(coverage_maps, min_cov, min_samples) + ( + sample_id, + sample_read_counts, + sample_genome_ids, + ) = compute_read_counts_from_coverage_maps( + coverage_maps, min_cov, min_samples + ) if sample_id is not None: read_counts[sample_id] = sample_read_counts diff --git a/src/coptr/compute_rel_abun.py b/src/coptr/compute_rel_abun.py index b5e25cf..95616b6 100644 --- a/src/coptr/compute_rel_abun.py +++ b/src/coptr/compute_rel_abun.py @@ -48,15 +48,21 @@ def compute_rel_abun_from_coverage_maps(coverage_maps, min_reads, min_cov, min_s if cm.is_assembly and cm.passed_qc(): binned_reads = coptr_contig.construct_coverage_matrix([cm]) - lower_bound, upper_bound = coptr_contig.compute_genomewide_bounds(binned_reads) - count = binned_reads[np.logical_and(binned_reads >= lower_bound, binned_reads <= upper_bound)].sum() + lower_bound, upper_bound = coptr_contig.compute_genomewide_bounds( + binned_reads + ) + count = binned_reads[ + np.logical_and(binned_reads >= lower_bound, binned_reads <= upper_bound) + ].sum() 
read_counts[cm.genome_id] = count - genome_lengths[cm.genome_id] = binned_reads.shape[0]*cm.compute_bin_size() + genome_lengths[cm.genome_id] = binned_reads.shape[0] * cm.compute_bin_size() total_passing_reads += count genome_ids.add(cm.genome_id) elif not cm.is_assembly: - filtered_reads, filtered_length, qc_result = rf_ref.filter_reads(cm.read_positions, cm.length) + filtered_reads, filtered_length, qc_result = rf_ref.filter_reads( + cm.read_positions, cm.length + ) if qc_result.passed_qc: count = filtered_reads.size @@ -68,7 +74,9 @@ def compute_rel_abun_from_coverage_maps(coverage_maps, min_reads, min_cov, min_s rel_abun = {} normalizing_constant = 0 for genome_id in read_counts: - unnormalized_rel_abun = (read_counts[genome_id] / total_passing_reads) / genome_lengths[genome_id] + unnormalized_rel_abun = ( + read_counts[genome_id] / total_passing_reads + ) / genome_lengths[genome_id] rel_abun[genome_id] = unnormalized_rel_abun normalizing_constant += unnormalized_rel_abun @@ -78,15 +86,14 @@ def compute_rel_abun_from_coverage_maps(coverage_maps, min_reads, min_cov, min_s return sample_id, rel_abun, genome_ids - - def compute_rel_abun(coverage_map_folder, min_reads, min_cov, min_samples): rel_abun = {} genome_ids = set() for f in sorted(os.listdir(coverage_map_folder)): fname, ext = os.path.splitext(f) - if ext != ".pkl": continue + if ext != ".pkl": + continue fpath = os.path.join(coverage_map_folder, f) print_info("RAbun", "\tprocessing {}".format(f)) @@ -94,7 +101,13 @@ def compute_rel_abun(coverage_map_folder, min_reads, min_cov, min_samples): with open(fpath, "rb") as file: coverage_maps = pkl.load(file) - sample_id, sample_rel_abun, sample_genome_ids = compute_rel_abun_from_coverage_maps(coverage_maps, min_reads, min_cov, min_samples) + ( + sample_id, + sample_rel_abun, + sample_genome_ids, + ) = compute_rel_abun_from_coverage_maps( + coverage_maps, min_reads, min_cov, min_samples + ) if sample_id is not None: rel_abun[sample_id] = sample_rel_abun diff --git a/src/coptr/coptr_contig.py b/src/coptr/coptr_contig.py index 1189611..33732be 100644 --- a/src/coptr/coptr_contig.py +++ b/src/coptr/coptr_contig.py @@ -65,8 +65,13 @@ def __init__(self, bam_file, genome_id, sample_id, estimate, nreads, cov_frac): def __str__(self): return "CoPTRContigEstimate(bam_file={}, genome_id={}, sample_id={}, estimate={:.3f}, nreads={}, cov_frac={})".format( - self.bam_file, self.genome_id, self.sample_id, self.estimate, self.nreads, self.cov_frac - ) + self.bam_file, + self.genome_id, + self.sample_id, + self.estimate, + self.nreads, + self.cov_frac, + ) def __repr__(self): return self.__str__() @@ -87,7 +92,6 @@ def __init__(self, min_reads, min_samples): self.min_reads = min_reads self.min_samples = min_samples - def compute_genomewide_bounds(self, binned_counts, crit_region=0.05): """Compute bounds on read counts in bins. @@ -111,14 +115,13 @@ def compute_genomewide_bounds(self, binned_counts, crit_region=0.05): std = np.max((1, std)) norm = scipy.stats.norm() - m = norm.ppf(1-crit_region) + m = norm.ppf(1 - crit_region) - lower_bound = np.power(2, median - m*std) - upper_bound = np.power(2, median + m*std) + lower_bound = np.power(2, median - m * std) + upper_bound = np.power(2, median + m * std) return lower_bound, upper_bound - def construct_coverage_matrix(self, coverage_maps): """Aggregate coverage maps into a bin by sample matrix. 
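
The compute_genomewide_bounds logic above works in log2 space: it takes the median and spread of the nonzero bin counts and flags bins outside 2^(median ± z·std), where z comes from the normal quantile for the chosen critical region. A rough standalone sketch of that filter (the helper name and the use of np.std for the spread are assumptions for illustration, not the project's exact code):

import numpy as np
import scipy.stats

def genomewide_bounds(binned_counts, crit_region=0.05):
    # work on log2 counts of nonzero bins, as in the method above
    log2_counts = np.log2(binned_counts[binned_counts > 0])
    median = np.median(log2_counts)
    std = np.max((1, np.std(log2_counts)))  # floor the spread at 1
    z = scipy.stats.norm().ppf(1 - crit_region)
    return np.power(2, median - z * std), np.power(2, median + z * std)

counts = np.random.poisson(40, size=500).astype(float)
counts[:5] = 4000.0  # a few repeat-like, ultra-high-coverage bins
lb, ub = genomewide_bounds(counts)
kept = counts[(counts >= lb) & (counts <= ub)]  # outlier bins dropped
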
@@ -145,7 +148,13 @@ def construct_coverage_matrix(self, coverage_maps):
 
             for contig_id in sorted(contig_ids):
                 if contig_id not in cm.contig_ids:
-                    print_error("CoPTRContig", "missing contig {} from {} in {}".format(contig_id, ref_genome, cm.sample_id), quit=True)
+                    print_error(
+                        "CoPTRContig",
+                        "missing contig {} from {} in {}".format(
+                            contig_id, ref_genome, cm.sample_id
+                        ),
+                        quit=True,
+                    )
 
                 length = cm.get_length(contig_id)
                 if length < 11000:
@@ -160,7 +169,6 @@ def construct_coverage_matrix(self, coverage_maps):
         A = np.array(A).T
         return A
 
-
     def estimate_slope_intercept(self, bin_log_probs):
         """Estimate the slope along reordering bins using least squares.
 
@@ -179,16 +187,15 @@ def estimate_slope_intercept(self, bin_log_probs):
         rng = np.linspace(0, 1, bin_log_probs.size)
         y = bin_log_probs
         x = np.ones((y.size, 2))
-        x[:,0] = rng
+        x[:, 0] = rng
 
         sol, residuals, rank, s = np.linalg.lstsq(x, y, rcond=None)
-        m,b = sol
+        m, b = sol
 
-        pred = m*rng + b
+        pred = m * rng + b
         residuals = y - pred
 
-        return m,b,residuals
-
+        return m, b, residuals
 
     def compute_log_likelihood(self, log2_ptr, binned_counts):
         """Compute the log likelihood of reordered bin probabilities given a log2(PTR).
 
@@ -212,7 +219,7 @@ def compute_log_likelihood(self, log2_ptr, binned_counts):
         x = np.linspace(0, 1, binned_counts.size)
         # change to base e
         loge_ptr = log2_ptr / np.log2(np.e)
-        log_probs = np.array([loge_ptr*xi for xi in x])
+        log_probs = np.array([loge_ptr * xi for xi in x])
 
         log_norm = scipy.special.logsumexp(log_probs)
         log_probs -= log_norm
@@ -222,11 +229,10 @@ def compute_log_likelihood(self, log2_ptr, binned_counts):
         # log_norm = np.log2(np.power(2, log_probs).sum())
         # log_probs -= log_norm
 
-        log_lk = (binned_counts*log_probs).sum()
+        log_lk = (binned_counts * log_probs).sum()
 
         return log_lk
 
-
     def estimate_ptr_maximum_likelihood(self, binned_counts):
         """Compute maximum likelihood estimates of log2(PTR) given observed reads.
 
@@ -253,7 +259,7 @@ def estimate_ptr_maximum_likelihood(self, binned_counts):
         x = np.linspace(0, 1, binned_counts.size)
         # compute normalized probabilities
         loge_ptr = log2_ptr / np.log2(np.e)
-        log_probs = np.array([loge_ptr*xi for xi in x])
+        log_probs = np.array([loge_ptr * xi for xi in x])
         log_norm = scipy.special.logsumexp(log_probs)
         log_probs -= log_norm
         probs = np.exp(log_probs)
@@ -269,7 +275,6 @@ def estimate_ptr_maximum_likelihood(self, binned_counts):
 
         return log2_ptr, intercept, log_lk, flip
 
-
     def estimate_ptrs(self, coverage_maps, return_bins=False):
         """Estimate PTRs across multiple samples of the same reference genome.
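
The likelihood above treats reordered bin counts as multinomial draws from a log-linear density along the genome: bin probabilities rise exponentially from terminus to origin, the base-e slope being ln(PTR), and logsumexp normalizes them. A small sketch that recovers a known log2(PTR) from simulated bins by grid search (the patch itself uses a numerical optimizer rather than a grid; the names here are illustrative):

import numpy as np
from scipy.special import logsumexp

def reordered_bin_log_lk(log2_ptr, binned_counts):
    x = np.linspace(0, 1, binned_counts.size)
    loge_ptr = log2_ptr / np.log2(np.e)  # convert the log2 slope to base e
    log_probs = loge_ptr * x
    log_probs -= logsumexp(log_probs)    # normalize so probabilities sum to 1
    return (binned_counts * log_probs).sum()

rng = np.random.default_rng(0)
true_log2_ptr = 1.0
probs = np.power(2, true_log2_ptr * np.linspace(0, 1, 200))
probs /= probs.sum()
counts = rng.multinomial(20000, probs).astype(float)

grid = np.linspace(0.05, 4, 400)
est = grid[np.argmax([reordered_bin_log_lk(g, counts) for g in grid])]
# est should land close to true_log2_ptr = 1.0
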
@@ -311,7 +316,14 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): else: bam_file = cm.bam_file genome_id = cm.genome_id - estimate = CoPTRContigEstimate(cm.bam_file, cm.genome_id, cm.sample_id, np.nan, cm.reads, cm.frac_nonzero) + estimate = CoPTRContigEstimate( + cm.bam_file, + cm.genome_id, + cm.sample_id, + np.nan, + cm.reads, + cm.frac_nonzero, + ) estimates.append(estimate) parameters.append((np.nan, np.nan)) binned_counts.append([np.nan]) @@ -330,7 +342,6 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): A = self.construct_coverage_matrix(passing_coverage_maps) nbins = A.shape[0] - # filter out low coverage and high coverage bins, # remove samples with too few reads A_filtered = [] @@ -340,9 +351,11 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): tmp = [] np.set_printoptions(suppress=True) - for i,col in enumerate(A.T): + for i, col in enumerate(A.T): - lower_bound, upper_bound = self.compute_genomewide_bounds(col, crit_region=0.05) + lower_bound, upper_bound = self.compute_genomewide_bounds( + col, crit_region=0.05 + ) col[np.logical_or(col > upper_bound, col < lower_bound)] = np.nan frac_filtered = np.isnan(col).sum() / col.size @@ -359,17 +372,25 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): sample_id = passing_coverage_maps[i].sample_id frac_nonzero = passing_coverage_maps[i].frac_nonzero m = np.nan - estimate = CoPTRContigEstimate(bam_file, genome_id, sample_id, np.nan, reads, frac_nonzero) + estimate = CoPTRContigEstimate( + bam_file, genome_id, sample_id, np.nan, reads, frac_nonzero + ) estimates.append(estimate) parameters.append((np.nan, np.nan)) binned_counts.append([np.nan]) - if len(A_filtered) < self.min_samples: for cm in tmp: bam_file = cm.bam_file genome_id = cm.genome_id - estimate = CoPTRContigEstimate(cm.bam_file, cm.genome_id, cm.sample_id, np.nan, cm.reads, cm.frac_nonzero) + estimate = CoPTRContigEstimate( + cm.bam_file, + cm.genome_id, + cm.sample_id, + np.nan, + cm.reads, + cm.frac_nonzero, + ) estimates.append(estimate) parameters.append((np.nan, np.nan)) binned_counts.append([np.nan]) @@ -385,14 +406,23 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): A = np.array(A_filtered).T # filter out rows where too many bins are missing - keep_rows = np.sum(np.isnan(A), axis=1) < 0.025*A.shape[1] - A = A[keep_rows,:] + keep_rows = np.sum(np.isnan(A), axis=1) < 0.025 * A.shape[1] + A = A[keep_rows, :] - if keep_rows.sum() < 0.5*keep_rows.size: - for j,col in enumerate(A.T): + if keep_rows.sum() < 0.5 * keep_rows.size: + for j, col in enumerate(A.T): reads = col[np.isfinite(col)].sum() - frac_nonzero = (col[np.isfinite(col)] > 0).sum() / col[np.isfinite(col)].size - estimate = CoPTRContigEstimate(bam_files[j], genome_ids[j], sample_ids[j], np.nan, reads, frac_nonzero) + frac_nonzero = (col[np.isfinite(col)] > 0).sum() / col[ + np.isfinite(col) + ].size + estimate = CoPTRContigEstimate( + bam_files[j], + genome_ids[j], + sample_ids[j], + np.nan, + reads, + frac_nonzero, + ) estimates.append(estimate) parameters.append((np.nan, np.nan)) binned_counts.append([np.nan]) @@ -403,18 +433,18 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): return estimates poisson_pca = PoissonPCA() - W,V = poisson_pca.pca(A,k=1) + W, V = poisson_pca.pca(A, k=1) sorted_idx = np.argsort(W.flatten()) - A = A[sorted_idx,:] + A = A[sorted_idx, :] - for j,col in enumerate(A.T): + for j, col in enumerate(A.T): col = col[np.isfinite(col)] reads = col.sum() - lb = int(0.05*col.size) - ub = int(0.95*col.size) + lb = 
int(0.05 * col.size)
+            ub = int(0.95 * col.size)
             col = col[lb:ub]
 
-            m,b,log_lk,flip = self.estimate_ptr_maximum_likelihood(col)
+            m, b, log_lk, flip = self.estimate_ptr_maximum_likelihood(col)
 
             # m,b,res = self.estimate_slope_intercept(col)
             # flip = m < 0
@@ -422,7 +452,9 @@ def estimate_ptrs(self, coverage_maps, return_bins=False):
             # so let's look at positive slopes only
             m = np.abs(m)
             frac_nonzero = (col > 0).sum() / col.size
-            estimate = CoPTRContigEstimate(bam_files[j], genome_ids[j], sample_ids[j], m, reads, frac_nonzero)
+            estimate = CoPTRContigEstimate(
+                bam_files[j], genome_ids[j], sample_ids[j], m, reads, frac_nonzero
+            )
             estimates.append(estimate)
 
             parameters.append((m, b))
@@ -432,7 +464,6 @@ def estimate_ptrs(self, coverage_maps, return_bins=False):
                 bc = np.flip(bc)
             binned_counts.append(bc)
 
-
         print_info("CoPTRContig", "finished {}".format(genome_id))
 
         if return_bins:
@@ -446,7 +477,9 @@ def _parallel_helper(self, x):
         plot_folder = x[2]
 
         if plot_folder is not None:
-            estimates, parameters, reordered_bins = self.estimate_ptrs(coverage_maps, return_bins=True)
+            estimates, parameters, reordered_bins = self.estimate_ptrs(
+                coverage_maps, return_bins=True
+            )
             plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder)
         else:
             estimates = self.estimate_ptrs(coverage_maps)
@@ -454,23 +487,23 @@ def _parallel_helper(self, x):
 
         return (ref_genome, estimates)
 
-
 def plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder):
     import matplotlib.pyplot as plt
 
     coptr_contig = CoPTRContig(min_reads=5000, min_samples=5)
 
     # make sure estimates and coverage maps are in the same order
-    order = dict([ (est.sample_id, i) for i,est in enumerate(estimates)])
+    order = dict([(est.sample_id, i) for i, est in enumerate(estimates)])
     tmp = [[] for cm in coverage_maps]
     for cm in coverage_maps:
         idx = order[cm.sample_id]
         tmp[idx] = cm
     coverage_maps = tmp
 
-
     nplots = 0
-    for estimate,p,cm,rbins in zip(estimates, parameters, coverage_maps, reordered_bins):
+    for estimate, p, cm, rbins in zip(
+        estimates, parameters, coverage_maps, reordered_bins
+    ):
         if np.isnan(estimate.estimate):
             continue
 
@@ -479,7 +512,8 @@ def plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder):
         binned_counts = []
         for cid in contig_ids:
             length = cm.get_length(cid)
-            if length < 11000: continue
+            if length < 11000:
+                continue
 
             binned_counts += cm.bin_reads(cid).tolist()
 
@@ -488,25 +522,37 @@ def plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder):
 
         median = np.median(binned_counts[binned_counts != 0])
 
-        fig,ax = plt.subplots(nrows=2,ncols=1, figsize=(9,6))
+        fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(9, 6))
 
         # plot unordered bins
         x1 = np.linspace(0, 1, binned_counts.size)
         ax[0].scatter(x1, binned_counts)
         ax[0].set_yscale("log", base=2)
-        ax[0].plot(x1, np.ones(binned_counts.size)*median, c="red", linewidth=3)
-        ax[0].plot(x1, np.ones(binned_counts.size)*upper_bound, c="red", linestyle=":", linewidth=3)
-        ax[0].plot(x1, np.ones(binned_counts.size)*lower_bound, c="red", linestyle=":", linewidth=3)
+        ax[0].plot(x1, np.ones(binned_counts.size) * median, c="red", linewidth=3)
+        ax[0].plot(
+            x1,
+            np.ones(binned_counts.size) * upper_bound,
+            c="red",
+            linestyle=":",
+            linewidth=3,
+        )
+        ax[0].plot(
+            x1,
+            np.ones(binned_counts.size) * lower_bound,
+            c="red",
+            linestyle=":",
+            linewidth=3,
+        )
         y_lim = ax[0].get_ylim()
         ax[0].set_ylabel("Read Count")
         ax[0].set_xlabel("Unordered Bins")
 
         # plot model fit
         emp_probs = rbins / rbins.sum()
-        m,b = p # slope intercept in log
space + m, b = p # slope intercept in log space x2 = np.linspace(0, 1, emp_probs.size) - y = np.array([m*xi + b for xi in x2]) + y = np.array([m * xi + b for xi in x2]) y = np.power(2, y) y /= y.sum() ax[1].scatter(x2, emp_probs, c="C1") @@ -514,15 +560,20 @@ def plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder): ax[1].set_yscale("log", base=2) ax[1].set_ylabel("Density") ax[1].set_xlabel("Reordered Bins") - ax[1].set_title("\nlog2(PTR)={:.3f} (Reads = {})".format(m, int(estimate.nreads))) + ax[1].set_title( + "\nlog2(PTR)={:.3f} (Reads = {})".format(m, int(estimate.nreads)) + ) plt.tight_layout() - plt.savefig(os.path.join(plot_folder, estimate.sample_id + "-" + estimate.genome_id + ".pdf")) + plt.savefig( + os.path.join( + plot_folder, estimate.sample_id + "-" + estimate.genome_id + ".pdf" + ) + ) plt.close() nplots += 1 - def load_coverage_maps(file_handle, ref_id): reached_end = False coverage_maps = [] @@ -537,8 +588,14 @@ def load_coverage_maps(file_handle, ref_id): return coverage_maps - -def estimate_ptrs_coptr_contig(assembly_genome_ids, grouped_coverage_map_folder, min_reads, min_samples, threads, plot_folder=None): +def estimate_ptrs_coptr_contig( + assembly_genome_ids, + grouped_coverage_map_folder, + min_reads, + min_samples, + threads, + plot_folder=None, +): """Estimate Peak-to-Trough ratios across samples. Parameters @@ -568,12 +625,18 @@ def estimate_ptrs_coptr_contig(assembly_genome_ids, grouped_coverage_map_folder, if threads == 1: for ref_id in sorted(assembly_genome_ids): - with open(os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb") as f: + with open( + os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb" + ) as f: coverage_maps = load_coverage_maps(f, ref_id) if plot_folder is not None: - estimates, parameters, reordered_bins = coptr_contig.estimate_ptrs(coverage_maps, return_bins=True) - plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder) + estimates, parameters, reordered_bins = coptr_contig.estimate_ptrs( + coverage_maps, return_bins=True + ) + plot_fit( + estimates, parameters, coverage_maps, reordered_bins, plot_folder + ) else: estimates = coptr_contig.estimate_ptrs(coverage_maps) @@ -584,13 +647,20 @@ def estimate_ptrs_coptr_contig(assembly_genome_ids, grouped_coverage_map_folder, pool = mp.Pool(threads) coverage_maps = {} - for i,ref_id in enumerate(sorted(assembly_genome_ids)): - with open(os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb") as f: + for i, ref_id in enumerate(sorted(assembly_genome_ids)): + with open( + os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb" + ) as f: coverage_maps[ref_id] = load_coverage_maps(f, ref_id) if (i % threads == 0 and i > 0) or i == len(assembly_genome_ids) - 1: - flat_coverage_maps = [(ref_id, coverage_maps[ref_id], plot_folder) for ref_id in coverage_maps] - flat_results = pool.map(coptr_contig._parallel_helper, flat_coverage_maps) + flat_coverage_maps = [ + (ref_id, coverage_maps[ref_id], plot_folder) + for ref_id in coverage_maps + ] + flat_results = pool.map( + coptr_contig._parallel_helper, flat_coverage_maps + ) coptr_contig_estimates.update(dict(flat_results)) coverage_maps = {} diff --git a/src/coptr/coptr_ref.py b/src/coptr/coptr_ref.py index 4183989..5b7e1e3 100644 --- a/src/coptr/coptr_ref.py +++ b/src/coptr/coptr_ref.py @@ -62,6 +62,7 @@ def __str__(self): def __repr__(self): return self.__str__() + class ReadFilterRef: """Read filtering steps for CoPTR Ref. 
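
The estimate_ptrs_coptr_contig driver above loads at most one batch of per-genome coverage maps before flushing it through a multiprocessing.Pool, which bounds peak memory when there are many genomes. A simplified sketch of that batch-then-flush pattern (the function names and toy workload are illustrative, and the batching condition is a cleaned-up equivalent of the one in the patch):

import multiprocessing as mp

def _work(item):
    # stand-in for CoPTRContig._parallel_helper: one genome's maps in,
    # (genome_id, estimates) out
    genome_id, coverage_maps = item
    return genome_id, len(coverage_maps)

def batched_pool_map(items, threads):
    results = {}
    batch = []
    with mp.Pool(threads) as pool:
        for i, item in enumerate(items):
            batch.append(item)
            if len(batch) == threads or i == len(items) - 1:
                # flush the batch, then drop it so at most `threads`
                # genomes' coverage maps are resident at once
                results.update(dict(pool.map(_work, batch)))
                batch = []
    return results

if __name__ == "__main__":
    items = [("g1", [0, 1]), ("g2", [2]), ("g3", [3, 4, 5])]
    print(batched_pool_map(items, threads=2))
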
@@ -77,7 +78,6 @@ def __init__(self, min_reads, min_cov): self.min_reads = min_reads self.min_cov = min_cov - def filter_reads(self, read_positions, genome_length): """Filter out reads in regions of the genome where the coverage is either too high or too low. @@ -101,21 +101,39 @@ def filter_reads(self, read_positions, genome_length): # don't filter if there are too few reads # just return that the sample failed QC if len(read_positions) < self.min_reads: - return np.array([]), genome_length, QCResult(-1, len(read_positions), -1, False) + return ( + np.array([]), + genome_length, + QCResult(-1, len(read_positions), -1, False), + ) bin_size = self.compute_bin_size(genome_length) read_positions = np.copy(read_positions) - filtered_read_positions, filtered_genome_length = self.filter_reads_phase1(read_positions, genome_length, bin_size) + filtered_read_positions, filtered_genome_length = self.filter_reads_phase1( + read_positions, genome_length, bin_size + ) # more than 25% of the genome is removed frac_removed = 1 - filtered_genome_length / genome_length - if frac_removed > 0.25: - return np.array(filtered_read_positions), filtered_genome_length, QCResult(-1, filtered_read_positions.size, frac_removed, False) - - filtered_read_positions, filtered_genome_length = self.filter_reads_phase2(filtered_read_positions, filtered_genome_length, bin_size) - qc_result = self.quality_check(filtered_read_positions, filtered_genome_length, genome_length, bin_size, self.min_reads, self.min_cov) + if frac_removed > 0.25: + return ( + np.array(filtered_read_positions), + filtered_genome_length, + QCResult(-1, filtered_read_positions.size, frac_removed, False), + ) + + filtered_read_positions, filtered_genome_length = self.filter_reads_phase2( + filtered_read_positions, filtered_genome_length, bin_size + ) + qc_result = self.quality_check( + filtered_read_positions, + filtered_genome_length, + genome_length, + bin_size, + self.min_reads, + self.min_cov, + ) return filtered_read_positions, filtered_genome_length, qc_result - def compute_bin_size(self, genome_length): """Compute bin size for read counts. @@ -137,17 +155,26 @@ def compute_bin_size(self, genome_length): bin_size = bin_size - (bin_size % 100) if bin_size < 1: - print_error("CoPTR-Ref", - "{}\n{}\n{}".format("found complete reference genome with <500bp", - "this is probably a mislabeled contig", - "please check your .genomes file", - ) - ) + print_error( + "CoPTR-Ref", + "{}\n{}\n{}".format( + "found complete reference genome with <500bp", + "this is probably a mislabeled contig", + "please check your .genomes file", + ), + ) return bin_size - - def quality_check(self, filtered_read_positions, filtered_genome_length, original_genome_length, bin_size, min_reads, frac_nonzero): + def quality_check( + self, + filtered_read_positions, + filtered_genome_length, + original_genome_length, + bin_size, + min_reads, + frac_nonzero, + ): """A basic quality check for required coverage of a genome in a sample. 
Parameters @@ -169,11 +196,15 @@ def quality_check(self, filtered_read_positions, filtered_genome_length, origina qc_result : QCResult Object that stores the result of the quality check """ - binned_counts = self.bin_reads(filtered_read_positions, filtered_genome_length, bin_size) + binned_counts = self.bin_reads( + filtered_read_positions, filtered_genome_length, bin_size + ) nonzero_bins = (binned_counts > 0).sum() / np.max((1, binned_counts.size)) nreads = binned_counts.sum() - passed_qc = True if nonzero_bins >= frac_nonzero and nreads >= min_reads else False + passed_qc = ( + True if nonzero_bins >= frac_nonzero and nreads >= min_reads else False + ) frac_removed = 1 - filtered_genome_length / original_genome_length if frac_removed > 0.25: @@ -181,7 +212,6 @@ def quality_check(self, filtered_read_positions, filtered_genome_length, origina return QCResult(nonzero_bins, nreads, frac_removed, passed_qc) - def compute_genomewide_bounds(self, binned_counts): """Compute bounds on read counts in bins. @@ -206,12 +236,11 @@ def compute_genomewide_bounds(self, binned_counts): norm = scipy.stats.norm() m = norm.ppf(0.95) - lower_bound = np.power(2, median - m*std) - upper_bound = np.power(2, median + m*std) + lower_bound = np.power(2, median - m * std) + upper_bound = np.power(2, median + m * std) return lower_bound, upper_bound - def compute_bin_bounds(self, binned_counts): """Compute bounds on read counts in a smaller window along the genome. @@ -234,12 +263,11 @@ def compute_bin_bounds(self, binned_counts): norm = scipy.stats.norm() m = norm.ppf(0.975) - lower_bound = np.power(2, median - m*std) - upper_bound = np.power(2, median + m*std) + lower_bound = np.power(2, median - m * std) + upper_bound = np.power(2, median + m * std) return lower_bound, upper_bound - def compute_rolling_sum(self, read_positions, genome_length, bin_size): """Compute a rolling sum of read counts in 10Kb bins, sliding 1Kb at a time. @@ -260,11 +288,11 @@ def compute_rolling_sum(self, read_positions, genome_length, bin_size): Each tuple gives the left (inclusive) and right (exclusive) end point of each bin. """ - step = math.ceil(bin_size/100) + step = math.ceil(bin_size / 100) s = read_positions.size sorted_reads = np.sort(read_positions) - endpoints = np.array([(i, i+bin_size) for i in range(0, genome_length, step)]) + endpoints = np.array([(i, i + bin_size) for i in range(0, genome_length, step)]) left_endpoints = [e[0] for e in endpoints] right_endpoints = [e[1] for e in endpoints] @@ -274,7 +302,6 @@ def compute_rolling_sum(self, read_positions, genome_length, bin_size): return rolling_counts, endpoints - def remove_reads_by_region(self, read_positions, genome_length, regions): """Remove reads that overlap a region in regions. 
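
compute_rolling_sum above counts reads in overlapping windows without scanning the genome: with positions sorted, np.searchsorted gives the number of reads below each window edge, so the count in [left, right) is the difference of two binary searches. A compact sketch of the same idea (the function name is illustrative):

import math
import numpy as np

def rolling_window_counts(read_positions, genome_length, bin_size=10000):
    step = math.ceil(bin_size / 100)  # 10Kb windows sliding 1Kb by default
    sorted_reads = np.sort(read_positions)
    left = np.arange(0, genome_length, step)
    right = left + bin_size
    # reads in [left, right) = (# reads < right) - (# reads < left)
    counts = np.searchsorted(sorted_reads, right) - np.searchsorted(
        sorted_reads, left
    )
    return counts, np.column_stack((left, right))

reads = np.random.randint(0, 1_000_000, size=50_000)
counts, endpoints = rolling_window_counts(reads, genome_length=1_000_000)
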
@@ -299,19 +326,21 @@ def remove_reads_by_region(self, read_positions, genome_length, regions): new_genome_length = genome_length filter_count = 0 adjustment = 0 - for left,right in regions: + for left, right in regions: remove_size = int(right - left) new_genome_length -= remove_size - keep = np.logical_or(read_positions < (left - adjustment), read_positions >= (right - adjustment)) + keep = np.logical_or( + read_positions < (left - adjustment), + read_positions >= (right - adjustment), + ) adjustment += remove_size - filter_count += (read_positions.size - keep.sum()) + filter_count += read_positions.size - keep.sum() read_positions = read_positions[keep] read_positions[read_positions > (right - adjustment)] -= remove_size return read_positions, new_genome_length - def filter_reads_phase1(self, read_positions, genome_length, bin_size): """A coarse-grained genomewide filter that removes reads in ultra-high or ultra-low coverage regions. @@ -334,35 +363,38 @@ def filter_reads_phase1(self, read_positions, genome_length, bin_size): binned_reads = self.bin_reads(read_positions, genome_length, bin_size) lower_bound, upper_bound = self.compute_genomewide_bounds(binned_reads) - rolling_counts, endpoints = self.compute_rolling_sum(read_positions, genome_length, bin_size) + rolling_counts, endpoints = self.compute_rolling_sum( + read_positions, genome_length, bin_size + ) # find regions to remove remove_start = None remove_end = None remove_regions = [] - for i,c in enumerate(rolling_counts): + for i, c in enumerate(rolling_counts): left, right = endpoints[i] if (c > upper_bound or c < lower_bound) and remove_start is None: remove_start = left elif c < upper_bound and c > lower_bound and remove_start is not None: - remove_end = left # want endpoint of previous bin - remove_regions.append( (remove_start, remove_end) ) + remove_end = left # want endpoint of previous bin + remove_regions.append((remove_start, remove_end)) remove_start = None remove_end = None # get endpoint if remove_start is not None: remove_end = genome_length - remove_regions.append( (remove_start, remove_end) ) + remove_regions.append((remove_start, remove_end)) - read_positions, new_genome_length = self.remove_reads_by_region(read_positions, genome_length, remove_regions) + read_positions, new_genome_length = self.remove_reads_by_region( + read_positions, genome_length, remove_regions + ) binned_reads = self.bin_reads(read_positions, genome_length, bin_size) return read_positions, new_genome_length - def filter_reads_phase2(self, read_positions, genome_length, bin_size): """A fine-grained filter that removes reads in localized regions with too-high or too-low coverage. 
For each 10Kb, looks 6.25% of @@ -385,11 +417,13 @@ def filter_reads_phase2(self, read_positions, genome_length, bin_size): """ # rolling counts in 10Kb bins, shifting by 1000bp - rolling_counts, endpoints = self.compute_rolling_sum(read_positions, genome_length, bin_size) + rolling_counts, endpoints = self.compute_rolling_sum( + read_positions, genome_length, bin_size + ) nbins = rolling_counts.size # how many bins to use - window = np.max((4,math.ceil(0.0625*genome_length / bin_size))) + window = np.max((4, math.ceil(0.0625 * genome_length / bin_size))) window = int(window) # find regions to remove @@ -399,14 +433,16 @@ def filter_reads_phase2(self, read_positions, genome_length, bin_size): bounds = {} - for i,c in enumerate(rolling_counts): + for i, c in enumerate(rolling_counts): # endpoints left, right = endpoints[i] # take bins in window to the left and right - left_bins = [(i - 100*j) for j in range(window)] - right_bins = [(i + 100*j) % nbins for j in range(window)] - bins = np.concatenate((rolling_counts[left_bins], rolling_counts[right_bins])) + left_bins = [(i - 100 * j) for j in range(window)] + right_bins = [(i + 100 * j) % nbins for j in range(window)] + bins = np.concatenate( + (rolling_counts[left_bins], rolling_counts[right_bins]) + ) median = np.median(bins) # computing these bounds is expensive, so @@ -421,8 +457,8 @@ def filter_reads_phase2(self, read_positions, genome_length, bin_size): remove_start = left elif (c < upper_bound and c > lower_bound) and remove_start is not None: - remove_end = left # want endpoint of previous bin - remove_regions.append( (remove_start, remove_end) ) + remove_end = left # want endpoint of previous bin + remove_regions.append((remove_start, remove_end)) remove_start = None remove_end = None @@ -431,10 +467,11 @@ def filter_reads_phase2(self, read_positions, genome_length, bin_size): remove_end = genome_length remove_regions.append((remove_start, remove_end)) - read_positions, new_genome_length = self.remove_reads_by_region(read_positions, genome_length, remove_regions) + read_positions, new_genome_length = self.remove_reads_by_region( + read_positions, genome_length, remove_regions + ) return read_positions, new_genome_length - def bin_reads(self, read_positions, genome_length, bin_size=10000): """Aggregate reads into bin_size windows and compute the read count in each window. @@ -460,7 +497,6 @@ def bin_reads(self, read_positions, genome_length, bin_size=10000): return bin_counts - class CoPTRRefEstimate: """Data structure to store CoPTRRef estimates. @@ -485,7 +521,17 @@ class CoPTRRefEstimate: The fraction of nonzero bins in 10Kb windows. 
""" - def __init__(self, bam_file, genome_id, sample_id, estimate, ori_estimate, ter_estimate, nreads, cov_frac): + def __init__( + self, + bam_file, + genome_id, + sample_id, + estimate, + ori_estimate, + ter_estimate, + nreads, + cov_frac, + ): self.bam_file = bam_file self.genome_id = genome_id self.sample_id = sample_id @@ -495,17 +541,20 @@ def __init__(self, bam_file, genome_id, sample_id, estimate, ori_estimate, ter_e self.nreads = nreads self.cov_frac = cov_frac - def __str__(self): return "CoPTRRefEstimate(bam_file={}, genome_id={}, sample_id={}, estimate={:.3f}, nreads={}, cov_frac={})".format( - self.bam_file, self.genome_id, self.sample_id, self.estimate, self.nreads, self.cov_frac - ) + self.bam_file, + self.genome_id, + self.sample_id, + self.estimate, + self.nreads, + self.cov_frac, + ) def __repr__(self): return self.__str__() - class CoPTRRef: """CoPTR estimator for complete reference genomes. @@ -521,7 +570,6 @@ def __init__(self, min_reads, min_cov): self.min_reads = min_reads self.min_cov = min_cov - def compute_log_likelihood(self, ori_loc, ter_loc, log2_ptr, read_locs): """Model log-likelihood for one sample. @@ -536,27 +584,52 @@ def compute_log_likelihood(self, ori_loc, ter_loc, log2_ptr, read_locs): read_locs : np.array of float A 1D numpy array giving read positions in [0, 1] """ - if ori_loc > 1 or ori_loc < 0 or ter_loc > 1 or ter_loc < 0 or log2_ptr > np.log2(16) or log2_ptr <= np.log2(1): + if ( + ori_loc > 1 + or ori_loc < 0 + or ter_loc > 1 + or ter_loc < 0 + or log2_ptr > np.log2(16) + or log2_ptr <= np.log2(1) + ): return -np.inf - assert read_locs.size == 1 or np.all(read_locs[:-1] <= read_locs[1:]), "reads must be sorted" + assert read_locs.size == 1 or np.all( + read_locs[:-1] <= read_locs[1:] + ), "reads must be sorted" x1 = np.min([ori_loc, ter_loc]) x2 = np.max([ori_loc, ter_loc]) - dist = np.abs(x2-x1) + dist = np.abs(x2 - x1) if dist < 0.45 or dist > 0.55: return -np.inf # compute log2_p(x_i), log2_p(x_t) so that the density normalizes alpha = log2_ptr / (ori_loc - ter_loc) if ori_loc < ter_loc: - c1 = np.log2(np.log(2)) - \ - np.log2((1/alpha)*(2**(alpha*x1) + 2**(alpha*(x2 - x1)) - 2 - 2**(-alpha*(1-x2) - log2_ptr) + 2**(-log2_ptr))) + c1 = np.log2(np.log(2)) - np.log2( + (1 / alpha) + * ( + 2 ** (alpha * x1) + + 2 ** (alpha * (x2 - x1)) + - 2 + - 2 ** (-alpha * (1 - x2) - log2_ptr) + + 2 ** (-log2_ptr) + ) + ) c2 = c1 - log2_ptr else: - c1 = np.log2(np.log(2)) - \ - np.log2((1/alpha)*(2**(alpha*x1) + 2**(alpha*(x2 - x1)) - 2 - 2**(-alpha*(1-x2) + log2_ptr) + 2**(log2_ptr))) + c1 = np.log2(np.log(2)) - np.log2( + (1 / alpha) + * ( + 2 ** (alpha * x1) + + 2 ** (alpha * (x2 - x1)) + - 2 + - 2 ** (-alpha * (1 - x2) + log2_ptr) + + 2 ** (log2_ptr) + ) + ) c2 = c1 + log2_ptr log_prob = 0 @@ -570,21 +643,23 @@ def compute_log_likelihood(self, ori_loc, ter_loc, log2_ptr, read_locs): left_part = read_locs[:left_x] middle_part = read_locs[left_x:right_x] right_part = read_locs[right_x:] - log_prob += (-alpha*(left_part - x1) + c1).sum() - log_prob += ( alpha*(middle_part - x1) + c1).sum() - log_prob += (-alpha*(right_part - x2) + c2).sum() + log_prob += (-alpha * (left_part - x1) + c1).sum() + log_prob += (alpha * (middle_part - x1) + c1).sum() + log_prob += (-alpha * (right_part - x2) + c2).sum() return log_prob - - def compute_multisample_log_likelihood(self, ori_loc, ter_loc, log2_ptrs, read_locs_list): + def compute_multisample_log_likelihood( + self, ori_loc, ter_loc, log2_ptrs, read_locs_list + ): log_lk = 0 for log2_ptr, read_locs in zip(log2_ptrs, 
read_locs_list): log_lk += self.compute_log_likelihood(ori_loc, ter_loc, log2_ptr, read_locs) return log_lk - - def estimate_ptr(self, read_positions, ref_genome_len, filter_reads=True, estimate_terminus=False): + def estimate_ptr( + self, read_positions, ref_genome_len, filter_reads=True, estimate_terminus=False + ): """Estimate the PTR for a single sample. Parameters @@ -612,7 +687,9 @@ def estimate_ptr(self, read_positions, ref_genome_len, filter_reads=True, estima """ if filter_reads: rf = ReadFilterRef(self.min_reads, self.min_cov) - read_positions, ref_genome_len, qc_result = rf.filter_reads(read_positions, ref_genome_len) + read_positions, ref_genome_len, qc_result = rf.filter_reads( + read_positions, ref_genome_len + ) if not qc_result.passed_qc: return np.nan, np.nan, np.nan, np.nan, qc_result @@ -624,8 +701,8 @@ def estimate_ptr(self, read_positions, ref_genome_len, filter_reads=True, estima if estimate_terminus: f = lambda x: -self.compute_log_likelihood(x[0], x[1], x[2], read_locs) - log_ptrs = [0.2*(i+1) for i in range(5)] - oris = [0.2*i for i in range(5)] + log_ptrs = [0.2 * (i + 1) for i in range(5)] + oris = [0.2 * i for i in range(5)] np.random.shuffle(log_ptrs) np.random.shuffle(oris) xs = [] @@ -636,10 +713,12 @@ def estimate_ptr(self, read_positions, ref_genome_len, filter_reads=True, estima xs = np.array(xs) else: - f = lambda x: -self.compute_log_likelihood(x[0], (x[0] + 0.5) % 1, x[1], read_locs) + f = lambda x: -self.compute_log_likelihood( + x[0], (x[0] + 0.5) % 1, x[1], read_locs + ) - log_ptrs = [0.2*(i+1) for i in range(5)] - oris = [0.2*i for i in range(5)] + log_ptrs = [0.2 * (i + 1) for i in range(5)] + oris = [0.2 * i for i in range(5)] np.random.shuffle(log_ptrs) np.random.shuffle(oris) xs = [] @@ -669,7 +748,6 @@ def estimate_ptr(self, read_positions, ref_genome_len, filter_reads=True, estima ter = (ori + 0.5) % 1 log2_ptr = np.clip(best_x[1], 0, np.inf) - np.seterr(invalid="warn") if best_f == np.inf: print_warning("CoPTRRef", "PTR optimization failed") @@ -679,7 +757,6 @@ def estimate_ptr(self, read_positions, ref_genome_len, filter_reads=True, estima else: return log2_ptr, ori, ter, -best_f, qc_result - def update_ori_ter(self, log2_ptrs, read_locs_list): """Compute maximum likelihood ori estimates across samples given estimated log2(PTR)s across samples. 
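
estimate_ptr above guards against local optima by seeding the optimizer from a shuffled grid of origin positions and log2(PTR)s and keeping the best of the resulting local solutions, with the terminus tied to the origin via ter = (ori + 0.5) mod 1. A minimal sketch of that multi-start pattern on a stand-in objective (the quadratic objective and the SLSQP choice here are illustrative, not the model's actual likelihood):

import numpy as np
import scipy.optimize

def neg_log_lk(x):
    # hypothetical stand-in for the model's negative log-likelihood
    ori, log2_ptr = x
    return (ori - 0.3) ** 2 + (log2_ptr - 1.0) ** 2

oris = [0.2 * i for i in range(5)]
log_ptrs = [0.2 * (i + 1) for i in range(5)]
np.random.shuffle(oris)
np.random.shuffle(log_ptrs)

best_x, best_f = None, np.inf
for x0 in zip(oris, log_ptrs):
    res = scipy.optimize.minimize(neg_log_lk, np.array(x0), method="SLSQP")
    if res.fun < best_f:
        best_x, best_f = res.x, res.fun
# best_x should land near (0.3, 1.0) regardless of which start found it
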
@@ -700,8 +777,10 @@ def update_ori_ter(self, log2_ptrs, read_locs_list): """ np.seterr(invalid="ignore") - xs = [0.1*i for i in range(10)] - f = lambda x: -self.compute_multisample_log_likelihood(x[0], (x[0] + 0.5 ) % 1, log2_ptrs, read_locs_list) + xs = [0.1 * i for i in range(10)] + f = lambda x: -self.compute_multisample_log_likelihood( + x[0], (x[0] + 0.5) % 1, log2_ptrs, read_locs_list + ) best_x = None best_f = np.inf @@ -713,8 +792,7 @@ def update_ori_ter(self, log2_ptrs, read_locs_list): best_f = res.fun np.seterr(invalid="warn") - return best_x, (best_x+0.5)%1 - + return best_x, (best_x + 0.5) % 1 def update_ptrs(self, ori, ter, prv_log2_ptrs, read_locs_list): """Compute maximum likelihood PTR estimates across samples @@ -739,14 +817,13 @@ def update_ptrs(self, ori, ter, prv_log2_ptrs, read_locs_list): np.seterr(invalid="ignore") log2_ptrs = [] - for i,read_locs in enumerate(read_locs_list): + for i, read_locs in enumerate(read_locs_list): f = lambda x: -self.compute_log_likelihood(ori, ter, x, read_locs) res = scipy.optimize.minimize(f, prv_log2_ptrs[i], method="SLSQP") log2_ptrs.append(res.x[0]) np.seterr(invalid="warn") return log2_ptrs - def estimate_ptrs(self, coverage_maps): """Compute maximum likelihood PTR estimates across samples. @@ -768,13 +845,26 @@ def estimate_ptrs(self, coverage_maps): lengths = [] genome_id = coverage_maps[0].genome_id for cm in coverage_maps: - read_positions, ref_genome_len, qc_result = rf.filter_reads(cm.read_positions, cm.length) + read_positions, ref_genome_len, qc_result = rf.filter_reads( + cm.read_positions, cm.length + ) if qc_result.passed_qc: read_positions_list.append(np.array(read_positions)) lengths.append(ref_genome_len) sample_ids.append(cm.sample_id) - estimates.append(CoPTRRefEstimate(cm.bam_file, cm.genome_id, cm.sample_id, np.nan, np.nan, np.nan, qc_result.nreads, qc_result.frac_nonzero)) + estimates.append( + CoPTRRefEstimate( + cm.bam_file, + cm.genome_id, + cm.sample_id, + np.nan, + np.nan, + np.nan, + qc_result.nreads, + qc_result.frac_nonzero, + ) + ) if len(sample_ids) == 0: return estimates @@ -784,8 +874,10 @@ def estimate_ptrs(self, coverage_maps): # first, compute individiual log2_ptr, ori, ter estimates log2_ptrs = [] read_locs_list = [] - for read_positions,length in zip(read_positions_list, lengths): - log2_ptr, ori, ter, f = self.estimate_ptr(read_positions, length, filter_reads=False) + for read_positions, length in zip(read_positions_list, lengths): + log2_ptr, ori, ter, f = self.estimate_ptr( + read_positions, length, filter_reads=False + ) log2_ptrs.append(log2_ptr) read_locs_list.append(np.sort(read_positions / length)) @@ -796,7 +888,7 @@ def estimate_ptrs(self, coverage_maps): log2_ptrs = self.update_ptrs(ori, ter, log2_ptrs, read_locs_list) n = 0 - for i,cm in enumerate(coverage_maps): + for i, cm in enumerate(coverage_maps): if cm.sample_id in sample_ids: # sanity check that the ptr estimate is good if np.abs(log2_ptrs[n] - 4) < 1e-4: @@ -813,7 +905,6 @@ def estimate_ptrs(self, coverage_maps): return estimates - def _parallel_helper(self, x): ref_genome = x[0] coverage_maps = x[1] @@ -821,7 +912,9 @@ def _parallel_helper(self, x): return (ref_genome, self.estimate_ptrs(coverage_maps)) -def plot_fit(coptr_ref_est, read_positions, genome_length, min_reads, min_cov, plot_folder): +def plot_fit( + coptr_ref_est, read_positions, genome_length, min_reads, min_cov, plot_folder +): import matplotlib.pyplot as plt coptr = CoPTRRef(min_reads, min_cov) @@ -830,7 +923,7 @@ def plot_fit(coptr_ref_est, read_positions, 
genome_length, min_reads, min_cov, p rp = np.array(read_positions) length = genome_length - fig,ax = plt.subplots(nrows=2,ncols=1, figsize=(9,6)) + fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(9, 6)) # first plot unfiltered bins bin_size = rf.compute_bin_size(genome_length) @@ -840,8 +933,8 @@ def plot_fit(coptr_ref_est, read_positions, genome_length, min_reads, min_cov, p lower_bound, upper_bound = rf.compute_genomewide_bounds(binned_counts) min_y = np.min(binned_counts[binned_counts != 0]) - min_y -= 0.5*min_y - max_y = np.max(binned_counts[binned_counts != 0]) + 0.008*binned_counts.sum() + min_y -= 0.5 * min_y + max_y = np.max(binned_counts[binned_counts != 0]) + 0.008 * binned_counts.sum() # plot bins ax[0].scatter(x, binned_counts) @@ -850,9 +943,13 @@ def plot_fit(coptr_ref_est, read_positions, genome_length, min_reads, min_cov, p y_lim = ax[0].get_ylim() # plot upper and lower bounds - ax[0].plot(x, median*np.ones(binned_counts.size), c="red", linewidth=3) - ax[0].plot(x, lower_bound*np.ones(binned_counts.size), c="red", ls=":", linewidth=3) - ax[0].plot(x, upper_bound*np.ones(binned_counts.size), c="red", ls=":", linewidth=3) + ax[0].plot(x, median * np.ones(binned_counts.size), c="red", linewidth=3) + ax[0].plot( + x, lower_bound * np.ones(binned_counts.size), c="red", ls=":", linewidth=3 + ) + ax[0].plot( + x, upper_bound * np.ones(binned_counts.size), c="red", ls=":", linewidth=3 + ) ax[0].set_ylabel("Read Count") ax[0].set_xlabel("Position Along Genome") @@ -866,25 +963,40 @@ def plot_fit(coptr_ref_est, read_positions, genome_length, min_reads, min_cov, p ax[1].scatter(x, probs, c="C1") ax[1].set_yscale("log", base=2) min_y = np.min(probs[probs != 0]) - min_y -= 0.5*min_y + min_y -= 0.5 * min_y max_y = np.max(probs) + 0.008 ax[1].set_ylim((min_y, max_y)) # plot fit - y = [ coptr.compute_log_likelihood(coptr_ref_est.ori_estimate, coptr_ref_est.ter_estimate, coptr_ref_est.estimate, xi) - for xi in x ] - y = np.power(2, y)*(x[1] - x[0]) + y = [ + coptr.compute_log_likelihood( + coptr_ref_est.ori_estimate, + coptr_ref_est.ter_estimate, + coptr_ref_est.estimate, + xi, + ) + for xi in x + ] + y = np.power(2, y) * (x[1] - x[0]) ax[1].plot(x, y, c="black", linewidth=3) ax[1].set_ylabel("Density") ax[1].set_xlabel("Position Along Genome") - ax[1].set_title("\nlog2(PTR)={:.3f} (Reads = {})".format(coptr_ref_est.estimate, int(binned_counts.sum()))) + ax[1].set_title( + "\nlog2(PTR)={:.3f} (Reads = {})".format( + coptr_ref_est.estimate, int(binned_counts.sum()) + ) + ) plt.tight_layout() - plt.savefig(os.path.join(plot_folder, coptr_ref_est.sample_id + "-" + coptr_ref_est.genome_id + ".pdf")) + plt.savefig( + os.path.join( + plot_folder, + coptr_ref_est.sample_id + "-" + coptr_ref_est.genome_id + ".pdf", + ) + ) plt.close() - def load_coverage_maps(file_handle, ref_id): coverage_maps = [] try: @@ -896,8 +1008,15 @@ def load_coverage_maps(file_handle, ref_id): return coverage_maps - -def estimate_ptrs_coptr_ref(ref_genome_ids, grouped_coverage_map_folder, min_reads, min_cov, threads, plot_folder=None, mem_limit=None): +def estimate_ptrs_coptr_ref( + ref_genome_ids, + grouped_coverage_map_folder, + min_reads, + min_cov, + threads, + plot_folder=None, + mem_limit=None, +): """Estimate Peak-to-Trough ratios across samples. 
Parameters @@ -934,33 +1053,55 @@ def estimate_ptrs_coptr_ref(ref_genome_ids, grouped_coverage_map_folder, min_rea pool = mp.Pool(threads) coverage_maps = {} - for i,ref_id in enumerate(sorted(ref_genome_ids)): - with open(os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb") as f: + for i, ref_id in enumerate(sorted(ref_genome_ids)): + with open( + os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb" + ) as f: coverage_maps[ref_id] = load_coverage_maps(f, ref_id) if (i % threads == 0 and i > 0) or i == len(ref_genome_ids) - 1: - flat_coverage_maps = [(ref_id, coverage_maps[ref_id]) for ref_id in coverage_maps] + flat_coverage_maps = [ + (ref_id, coverage_maps[ref_id]) for ref_id in coverage_maps + ] flat_results = pool.map(coptr_ref._parallel_helper, flat_coverage_maps) coptr_ref_estimates.update(dict(flat_results)) for ref_id in sorted(coverage_maps): - for sample,est in zip(coverage_maps[ref_id], coptr_ref_estimates[ref_id]): - # only plot samples with a PTR estimate - if plot_folder is not None and not np.isnan(est.estimate): - plot_fit(est, sample.read_positions, sample.length, min_reads, min_cov, plot_folder) + for sample, est in zip( + coverage_maps[ref_id], coptr_ref_estimates[ref_id] + ): + # only plot samples with a PTR estimate + if plot_folder is not None and not np.isnan(est.estimate): + plot_fit( + est, + sample.read_positions, + sample.length, + min_reads, + min_cov, + plot_folder, + ) coverage_maps = {} else: for ref_id in sorted(ref_genome_ids): - with open(os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb") as f: + with open( + os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb" + ) as f: coverage_maps = load_coverage_maps(f, ref_id) coptr_ref_estimates[ref_id] = coptr_ref.estimate_ptrs(coverage_maps) - for sample,est in zip(coverage_maps, coptr_ref_estimates[ref_id]): - # only plot samples with a PTR estimate - if plot_folder is not None and not np.isnan(est.estimate): - plot_fit(est, sample.read_positions, sample.length, min_reads, min_cov, plot_folder) + for sample, est in zip(coverage_maps, coptr_ref_estimates[ref_id]): + # only plot samples with a PTR estimate + if plot_folder is not None and not np.isnan(est.estimate): + plot_fit( + est, + sample.read_positions, + sample.length, + min_reads, + min_cov, + plot_folder, + ) return coptr_ref_estimates diff --git a/src/coptr/poisson_pca.py b/src/coptr/poisson_pca.py index bb353a7..bf35010 100644 --- a/src/coptr/poisson_pca.py +++ b/src/coptr/poisson_pca.py @@ -28,10 +28,9 @@ class PoissonPCA: - """Principal Component Analysis with Poisson observations. - """ + """Principal Component Analysis with Poisson observations.""" - def pca(self, X, k, tol=1E-3, verbose=False): + def pca(self, X, k, tol=1e-3, verbose=False): """Run the PCA. 
For a data matrix X (sample by observations), finds two matrices W and V such that: @@ -48,21 +47,23 @@ def pca(self, X, k, tol=1E-3, verbose=False): """ assert k <= X.shape[1], "k must be smaller than the number of columns of X" assert k > 0 and k == int(k), "k must be a positive integer" - assert np.all(X[np.isfinite(X)] >= 0), "X must have nonnegative entries or be missing" + assert np.all( + X[np.isfinite(X)] >= 0 + ), "X must have nonnegative entries or be missing" X = X.astype(float) # use PCA to initialize matrices W,V # initializing missing values to mean Xt = np.copy(X) Xt[Xt == 0] = 0.1 - for i,row in enumerate(Xt): + for i, row in enumerate(Xt): if np.isnan(row).sum() > 0: - Xt[i,np.isnan(row)] = np.mean(row[np.isfinite(row)]) - u,s,vh = np.linalg.svd(np.log2(Xt)) - u = u[:,:k] + Xt[i, np.isnan(row)] = np.mean(row[np.isfinite(row)]) + u, s, vh = np.linalg.svd(np.log2(Xt)) + u = u[:, :k] s = s[:k] - vh = vh[:k,:] - W = u*s + vh = vh[:k, :] + W = u * s V = vh # W = np.random.normal(loc=0, scale=0.1, size=(X.shape[0], k)) # V = np.random.normal(loc=0, scale=0.1, size=(k, X.shape[1])) @@ -84,9 +85,10 @@ def pca(self, X, k, tol=1E-3, verbose=False): it += 1 - assert prv_log_lk <= log_lk or np.abs(prv_log_lk - log_lk) < 1E-6, "log likelihood must be increasing ({} > {})".format(prv_log_lk, log_lk) - return W,V - + assert ( + prv_log_lk <= log_lk or np.abs(prv_log_lk - log_lk) < 1e-6 + ), "log likelihood must be increasing ({} > {})".format(prv_log_lk, log_lk) + return W, V def update_W(self, X, W, V): """Optimize the log-likelihood with respect to W. @@ -100,15 +102,20 @@ def update_W(self, X, W, V): V : np.array The current estimate of V """ - loss_fn = lambda Wx : -self.log_likelihood(X, Wx.reshape(X.shape[0], V.shape[0]), V) - grad_fn = lambda Wx : -self.grad_W(X, Wx.reshape(X.shape[0], V.shape[0]), V).flatten() + loss_fn = lambda Wx: -self.log_likelihood( + X, Wx.reshape(X.shape[0], V.shape[0]), V + ) + grad_fn = lambda Wx: -self.grad_W( + X, Wx.reshape(X.shape[0], V.shape[0]), V + ).flatten() # these are handled by the minimizer np.seterr(over="ignore") - res = scipy.optimize.minimize(loss_fn, np.copy(W).flatten(), jac=grad_fn, method="CG") + res = scipy.optimize.minimize( + loss_fn, np.copy(W).flatten(), jac=grad_fn, method="CG" + ) np.seterr(over="warn") return res.x.reshape(X.shape[0], V.shape[0]) - def update_V(self, X, W, V): """Optimize the log-likelihood with respect to V. @@ -123,7 +130,6 @@ def update_V(self, X, W, V): """ return self.update_W(X.T, V.T, W.T).T - def log_likelihood(self, X, W, V): """Compute the log likelihood (minus a constant). @@ -137,8 +143,7 @@ def log_likelihood(self, X, W, V): A k x m matrix """ param = np.exp(W.dot(V)) - return (X*W.dot(V) - param)[np.isfinite(X)].sum() - + return (X * W.dot(V) - param)[np.isfinite(X)].sum() def grad_W(self, X, W, V): """Compute the gradient of the log likelihood wrt W. 
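
A minimal usage sketch of the model optimized in this module (X is approximated
by a rank-k log-linear fit, X ~ Poisson(exp(WV))) may help orient the hunks
above and the test below; the import path and the simulated sizes here are
assumptions, not part of this patch:

    import numpy as np

    from coptr.poisson_pca import PoissonPCA  # assumed import path

    np.random.seed(0)
    n, m, k = 40, 10, 2

    # Simulate counts whose log-rate matrix has rank k: X ~ Poisson(exp(WV)).
    W_true = np.random.normal(loc=0, scale=0.5, size=(n, k))
    V_true = np.random.normal(loc=0, scale=0.5, size=(k, m))
    X = np.random.poisson(np.exp(W_true.dot(V_true))).astype(float)

    # Alternating maximization of the Poisson log likelihood, initialized by SVD.
    W_est, V_est = PoissonPCA().pca(X, k)
    print(W_est.shape, V_est.shape)  # (40, 2) (2, 10)
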
@@ -166,11 +171,11 @@ def grad_W(self, X, W, V): # generate 100 random test sets for i in range(100): - print("running test set", i+1, "of", 100) + print("running test set", i + 1, "of", 100) n = np.random.randint(10, 100) m = np.random.randint(5, n) k = np.random.randint(1, np.min((m, 10))) - print("\tn {} m {} k {}".format(n,m,k)) + print("\tn {} m {} k {}".format(n, m, k)) # generate test data W = np.random.normal(loc=0, scale=1, size=(n, k)) @@ -178,13 +183,12 @@ def grad_W(self, X, W, V): X = np.random.poisson(np.exp(W.dot(V))).astype(float) - coords = [(i,j) for i in range(n) for j in range(m)] - nmissing = int(0.25*X.size) + coords = [(i, j) for i in range(n) for j in range(m)] + nmissing = int(0.25 * X.size) for l in range(nmissing): coord = coords[np.random.randint(len(coords))] X[coord] = np.nan - X_finite = X[np.isfinite(X)] print("\tmin {}".format(X_finite.min())) print("\tmax {}".format(X_finite.max())) diff --git a/src/coptr/print.py b/src/coptr/print.py index 9d179b8..1d8a7d2 100644 --- a/src/coptr/print.py +++ b/src/coptr/print.py @@ -37,14 +37,18 @@ def print_error(module, msg, quit=True): exit : bool If TRUE, the program terminates after printing the error """ - print("[ERROR] ({}) {}: {}".format(time.strftime("%b %d, %Y %X"), module, msg), file=sys.stderr, flush=True) + print( + "[ERROR] ({}) {}: {}".format(time.strftime("%b %d, %Y %X"), module, msg), + file=sys.stderr, + flush=True, + ) if quit: exit(1) def print_warning(module, msg): """Print a warning message to the standard error. - + Parameters ---------- module : str @@ -53,12 +57,16 @@ def print_warning(module, msg): The message itself """ - print("[WARNING] ({}) {}: {}".format(time.strftime("%b %d, %Y %X"), module, msg), file=sys.stderr, flush=True) + print( + "[WARNING] ({}) {}: {}".format(time.strftime("%b %d, %Y %X"), module, msg), + file=sys.stderr, + flush=True, + ) def print_info(module, msg): """Print an info message to the standard error. - + Parameters ---------- module : str @@ -67,4 +75,8 @@ def print_info(module, msg): The message itself """ - print("[INFO] ({}) {}: {}".format(time.strftime("%b %d, %Y %X"), module, msg), file=sys.stderr, flush=True) \ No newline at end of file + print( + "[INFO] ({}) {}: {}".format(time.strftime("%b %d, %Y %X"), module, msg), + file=sys.stderr, + flush=True, + ) diff --git a/src/coptr/read_assigner.py b/src/coptr/read_assigner.py index f960b3b..4b229d4 100644 --- a/src/coptr/read_assigner.py +++ b/src/coptr/read_assigner.py @@ -44,7 +44,9 @@ def __init__(self, X, prior_counts): self.X = csr_matrix(X) self.nreads = X.shape[0] self.ngenomes = X.shape[1] - assert np.all(self.X.sum(axis=1) > 0), "each read must have at least one valid mapping" + assert np.all( + self.X.sum(axis=1) > 0 + ), "each read must have at least one valid mapping" # set alpha based on unambiguously assigned reads self.alpha = np.ones(self.ngenomes) + prior_counts @@ -53,14 +55,12 @@ def __init__(self, X, prior_counts): self.phi = self.X.multiply(1 / self.X.sum(axis=1)) self.eta = np.copy(self.alpha) - def compute_elbo(self): - """Compute the variational objective function. 
- """ + """Compute the variational objective function.""" elbo = 0 # E_q[log p(pi)] - elbo += ((self.alpha - 1)*(digamma(self.eta) - digamma(self.eta.sum()))).sum() + elbo += ((self.alpha - 1) * (digamma(self.eta) - digamma(self.eta.sum()))).sum() # E_q[log p(X | Z)] elbo += (self.phi.multiply(self.X)).sum() @@ -72,29 +72,25 @@ def compute_elbo(self): elbo += scipy.stats.dirichlet(self.eta).entropy() # entropy of categorical - tmp = (self.phi.data*np.log(self.phi.data)) + tmp = self.phi.data * np.log(self.phi.data) elbo += -tmp.sum() return elbo - def update_phi(self): - """Optimize phi with fixed eta. - """ + """Optimize phi with fixed eta.""" log_phi = self.X.multiply(digamma(self.eta)) - phi = csr_matrix((np.exp(log_phi.data), (log_phi.row, log_phi.col)), shape=log_phi.shape) - self.phi = phi.multiply(1/phi.sum(axis=1)) - + phi = csr_matrix( + (np.exp(log_phi.data), (log_phi.row, log_phi.col)), shape=log_phi.shape + ) + self.phi = phi.multiply(1 / phi.sum(axis=1)) def update_eta(self): - """Optimize eta with fixed phi. - """ + """Optimize eta with fixed phi.""" self.eta = np.array(self.alpha + self.phi.sum(axis=0))[0] - - def run_vi(self, tol=1E-3, verbose=False): - """Optimize variational parameters with respect to elbo. - """ + def run_vi(self, tol=1e-3, verbose=False): + """Optimize variational parameters with respect to elbo.""" prv_elbo = -np.inf elbo = self.compute_elbo() @@ -108,12 +104,13 @@ def run_vi(self, tol=1E-3, verbose=False): elbo = self.compute_elbo() it += 1 - assert elbo >= prv_elbo - 1E-3, "elbo must be strictly increasing ({} > {})".format(elbo, prv_elbo) + assert ( + elbo >= prv_elbo - 1e-3 + ), "elbo must be strictly increasing ({} > {})".format(elbo, prv_elbo) if verbose: print("it:", it, "elbo:", elbo) - def assign_reads(self, verbose=False): """Compute read assignments. 
@@ -137,7 +134,7 @@ def assign_reads(self, verbose=False): # test a large number of cases for i in range(100): - print("running test set", i+1) + print("running test set", i + 1) nreads = np.random.randint(10000, 100000) ngenomes = np.random.randint(10, 50) Z = np.zeros((nreads, ngenomes)) @@ -147,25 +144,30 @@ def assign_reads(self, verbose=False): for n in range(nreads): true_assignment = np.random.randint(5) - Z[n,true_assignment] = 1 + Z[n, true_assignment] = 1 X = np.copy(Z) - multimapped_reads = int(0.1*nreads) + multimapped_reads = int(0.1 * nreads) for n in range(multimapped_reads): true_assignment = np.argwhere(Z[n] == 1).flatten()[0] - additional_mappings = np.max((1,np.random.poisson(1))) + additional_mappings = np.max((1, np.random.poisson(1))) for i in range(additional_mappings): mapping_id = np.random.randint(ngenomes) - X[n,mapping_id] = 1 - X = X[:multimapped_reads,:] + X[n, mapping_id] = 1 + X = X[:multimapped_reads, :] - prior = Z[multimapped_reads:,].sum(axis=0) + prior = Z[ + multimapped_reads:, + ].sum(axis=0) read_assigner = ReadAssigner(X[:multimapped_reads], prior) assignments = read_assigner.assign_reads() correct_assignments = 0 - for i,assignment in enumerate(assignments): - if Z[i,assignment] == 1: + for i, assignment in enumerate(assignments): + if Z[i, assignment] == 1: correct_assignments += 1 - print("\t{} of reads correctly mapped".format(correct_assignments / len(assignments))) + print( + "\t{} of reads correctly mapped".format( + correct_assignments / len(assignments) + ) + ) print("done!") - diff --git a/src/coptr/read_mapper.py b/src/coptr/read_mapper.py index 2865886..3c6883f 100644 --- a/src/coptr/read_mapper.py +++ b/src/coptr/read_mapper.py @@ -35,8 +35,7 @@ class ReadMapper: - """Wrapper around bowtie2. - """ + """Wrapper around bowtie2.""" def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed): """Build a bowtie2 index from ref_fasta. 
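
The id convention this method implements, where each fasta's filename becomes
the genome id and is prepended to every sequence id, reduces to the following
sketch (file and sequence names are hypothetical):

    from pathlib import Path

    def prepend_genome_id(fasta_path):
        """Yield fasta lines with "<filename>|" prepended to sequence ids."""
        genome_id = Path(fasta_path).stem
        with open(fasta_path) as handle:
            for line in handle:
                if line.startswith(">"):
                    line = f">{genome_id}|{line[1:]}"
                yield line

    # ">NC_000913.3 ..." in e-coli-mg1655.fna becomes ">e-coli-mg1655|NC_000913.3 ..."
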
@@ -69,8 +68,8 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed files_found = 0 total_size = 0 for f in os.listdir(ref_fasta): - fname,ext = os.path.splitext(f) - fpath = os.path.join(ref_fasta,f) + fname, ext = os.path.splitext(f) + fpath = os.path.join(ref_fasta, f) if os.path.isfile(fpath) and ext in valid_ext: ref_files.append(fpath) @@ -80,14 +79,23 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed else: print_error("ReadMapper", "index must either be file or folder.") - - print_info("ReadMapper", "found {} files totaling {:.3f} Gb".format(len(ref_files), total_size / (1024**3))) + print_info( + "ReadMapper", + "found {} files totaling {:.3f} Gb".format( + len(ref_files), total_size / (1024 ** 3) + ), + ) fna_out = open("coptr-fna-{}.fna".format(time.time()), "w") genomes_out = open("{}.genomes".format(index_out), "w") n_genomes = 0 - print_info("ReadMapper", "copying to fasta files to {} with prepended genome ids (filenames)".format(fna_out.name)) + print_info( + "ReadMapper", + "copying to fasta files to {} with prepended genome ids (filenames)".format( + fna_out.name + ), + ) # assume 1 genome per fasta # let's set the filename as the genome identifier @@ -106,7 +114,10 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed fna_out.close() genomes_out.close() - print_info("ReadMapper", "writing {} reference genome ids to {}".format(n_genomes, genomes_out.name)) + print_info( + "ReadMapper", + "writing {} reference genome ids to {}".format(n_genomes, genomes_out.name), + ) call = ["bowtie2-build", fna_out.name, index_out, "--threads", bt2_threads] if bt2_bmax is not None: call += ["--bmax", bt2_bmax] @@ -118,14 +129,16 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed try: print_info("ReadMapper", " ".join(call)) sub.check_call(call) - print_info("ReadMapper", "indexed {} fasta files for reference database.".format(files_found)) + print_info( + "ReadMapper", + "indexed {} fasta files for reference database.".format(files_found), + ) except Exception as e: print(e, file=sys.stderr) finally: print_info("ReadMapper", "cleaning up {}".format(fna_out.name)) sub.check_call(["rm", fna_out.name]) - def map(self, index, inputf, outfolder, paired, threads, bt2_k): """Map reads from infile against reference database using bowtie2, then convert to a bam file. 
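
The sam-to-bam conversion repeated in each branch of this method boils down to
a standard pysam pattern (a sketch; the file names are placeholders):

    import pysam

    with pysam.AlignmentFile("sample.sam", "r") as infile:
        with pysam.AlignmentFile("sample.bam", "wb", template=infile) as outfile:
            for alignment in infile:
                outfile.write(alignment)
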
@@ -152,13 +165,23 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): if os.path.isfile(inputf): bn = os.path.basename(inputf) - bn,ext = os.path.splitext(bn) + bn, ext = os.path.splitext(bn) out_sam = os.path.join(outfolder, bn + ".sam") out_bam = os.path.join(outfolder, bn + ".bam") # first map to a sam file with bowtie2 print_info("ReadMapper", "mapping {} to {}".format(inputf, out_sam)) - call = ["bowtie2", "-x", index, inputf, "--no-unal", "-p", str(threads), "-k", bt2_k] + call = [ + "bowtie2", + "-x", + index, + inputf, + "--no-unal", + "-p", + str(threads), + "-k", + bt2_k, + ] print_info("ReadMapper", " ".join(call)) with open(out_sam, "w") as out: sub.check_call(call, stdout=out) @@ -182,28 +205,40 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): valid_ext = [".fastq", ".fq", ".gz"] files_found = 0 for f in sorted(os.listdir(inputf)): - fname,ext1 = os.path.splitext(f) + fname, ext1 = os.path.splitext(f) if ext1 == ".gz": ext2 = fname.split(".")[1] else: ext2 = "" - fpath = os.path.join(inputf,f) + fpath = os.path.join(inputf, f) if ext1 in valid_ext or ext2 in valid_ext: bn = os.path.basename(f) - bn,ext = os.path.splitext(bn) + bn, ext = os.path.splitext(bn) out_sam = os.path.join(outfolder, get_fastq_name(bn) + ".sam") out_bam = os.path.join(outfolder, get_fastq_name(bn) + ".bam") # map reads with bowtie2 print_info("ReadMapper", "mapping {} to {}".format(fpath, out_sam)) - call = ["bowtie2", "-x", index, fpath, "--no-unal", "-p", str(threads), "-k", bt2_k] + call = [ + "bowtie2", + "-x", + index, + fpath, + "--no-unal", + "-p", + str(threads), + "-k", + bt2_k, + ] print_info("ReadMapper", " ".join(call)) with open(out_sam, "w") as out: sub.check_call(call, stdout=out) # then convert to a bam file - print_info("ReadMapper", "converting {} to {}".format(out_sam, out_bam)) + print_info( + "ReadMapper", "converting {} to {}".format(out_sam, out_bam) + ) infile = pysam.AlignmentFile(out_sam, "r") outfile = pysam.AlignmentFile(out_bam, "wb", template=infile) for s in infile: @@ -223,7 +258,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): # file prefix -> [pair_1, pair_2] read_pairs = {} for f in sorted(os.listdir(inputf)): - fname,ext1 = os.path.splitext(f) + fname, ext1 = os.path.splitext(f) if ext1 == ".gz": ext2 = fname.split(".")[1] else: @@ -253,7 +288,20 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): # map reads with bowtie2 print_info("ReadMapper", "mapping {},{} to {}".format(f1, f2, out_sam)) - call = ["bowtie2", "-x", index, "-1", f1, "-2", f2, "--no-unal", "-p", str(threads), "-k", bt2_k] + call = [ + "bowtie2", + "-x", + index, + "-1", + f1, + "-2", + f2, + "--no-unal", + "-p", + str(threads), + "-k", + bt2_k, + ] print_info("ReadMapper", " ".join(call)) with open(out_sam, "w") as out: sub.check_call(call, stdout=out) diff --git a/src/coptr/util.py b/src/coptr/util.py index 30253da..9a2a5e9 100644 --- a/src/coptr/util.py +++ b/src/coptr/util.py @@ -36,7 +36,7 @@ def get_fastq_name(fpath): fname : str the name of the file with its path and extension removeed """ - bn,ex = os.path.splitext(fpath) + bn, ex = os.path.splitext(fpath) if ex == ".gz": - bn,ex = os.path.splitext(bn) - return bn \ No newline at end of file + bn, ex = os.path.splitext(bn) + return bn diff --git a/test/test_bam_processor.py b/test/test_bam_processor.py index ccf2d56..cf71312 100644 --- a/test/test_bam_processor.py +++ b/test/test_bam_processor.py @@ -9,40 +9,38 @@ def read_to_dict(read): read = read.strip("\n").split() 
read_dict = { - "name" : read[0], - "flag" : read[1], - "ref_name" : read[2], - "ref_pos" : read[3], - "map_quality" : read[4], - "cigar" : read[5], - "next_ref_name" : read[6], - "next_ref_pos" : read[7], - "length" : read[8], - "seq" : read[9], - "qual" : read[10], - "tags" : read[11:] + "name": read[0], + "flag": read[1], + "ref_name": read[2], + "ref_pos": read[3], + "map_quality": read[4], + "cigar": read[5], + "next_ref_name": read[6], + "next_ref_pos": read[7], + "length": read[8], + "seq": read[9], + "qual": read[10], + "tags": read[11:], } return read_dict class TestBamProcessor(unittest.TestCase): - def setUp(self): header = { - "HD" : {"VN": "1.0", "SO": "unsorted"}, - "SQ" : - [ - {"SN": "ref1|seq1", "LN" : 1000000}, - {"SN": "ref1|seq2", "LN" : 1000000}, - {"SN": "ref2|seq1", "LN" : 1000000} - ] + "HD": {"VN": "1.0", "SO": "unsorted"}, + "SQ": [ + {"SN": "ref1|seq1", "LN": 1000000}, + {"SN": "ref1|seq2", "LN": 1000000}, + {"SN": "ref2|seq1", "LN": 1000000}, + ], } bam1_reads = [ "read1 0 ref1|seq1 24975 42 80M * 0 0 TGGGCCAGAAAAAATGACTTCTCCATCTCGCTGCCGGTAGACCGACTCTCTTTTCTGCTGGCGGTTGCCACGCTGAGCGG AAAAAF.A.FFAFFFFFAFFFFFFFFFFFFFF 0.9) - - class Simulator: - """Simulate sequencing reads using the density of reads along an E. coli genome. - """ + """Simulate sequencing reads using the density of reads along an E. coli genome.""" def __init__(self): f = open("test/e-coli-NZ_CP011495.1.pkl", "rb") read_density_map, ptr = pkl.load(f) f.close() - self.bin_edges = read_density_map[:,0] - self.probs = read_density_map[:,1] # left bin edges + self.bin_edges = read_density_map[:, 0] + self.probs = read_density_map[:, 1] # left bin edges self.min_log2_ptr = 0.05 self.max_log2_ptr = 1.25 - def get_genome_length(self): - """Return the number of bases in the genome. - """ - return 100*self.probs.size - + """Return the number of bases in the genome.""" + return 100 * self.probs.size - def simulate_reads(self, nreads, ptr = None, ori_pos = None): + def simulate_reads(self, nreads, ptr=None, ori_pos=None): """Simulate nreads along 100bp bins with specified ptr and origin position. If ptr or ori_pos are None, they are chosen randomly. @@ -101,12 +106,14 @@ def simulate_reads(self, nreads, ptr = None, ori_pos = None): The simulated replication terminus position as a fraction along the genome """ if ptr is None: - ptr = np.random.random()*(self.max_log2_ptr - self.min_log2_ptr) + self.min_log2_ptr - ptr = 2**ptr + ptr = ( + np.random.random() * (self.max_log2_ptr - self.min_log2_ptr) + + self.min_log2_ptr + ) + ptr = 2 ** ptr else: assert np.log2(ptr) > self.min_log2_ptr, "ptr < 2^0.5 may be unreliable" - if ori_pos is None: ori_pos = np.random.random() else: @@ -120,14 +127,13 @@ def simulate_reads(self, nreads, ptr = None, ori_pos = None): # convert counts to coordinates read_positions = [] - for i,c in enumerate(binned_counts): + for i, c in enumerate(binned_counts): size = binned_counts.size - m = (0.5*(2*i + 1)/size) + m = 0.5 * (2 * i + 1) / size read_positions += [m for i in range(c)] - read_positions = self.get_genome_length()*np.array(read_positions) - - return read_positions, ptr, ori_pos, ter_pos # 100bp bins + read_positions = self.get_genome_length() * np.array(read_positions) + return read_positions, ptr, ori_pos, ter_pos # 100bp bins def adjust_read_probs(self, ptr, ori_pos, ter_pos): """Scale bin probabilities based on the PTR. 
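
The adjustment below treats log2 read density as piecewise linear between the
two replication endpoints on a circular genome. A self-contained sketch of one
consistent parameterization (positions are fractions of the genome with
ter = ori + 0.5; which endpoint carries the peak, and the offsets c1/c2, are
chosen in the module and are absorbed here because bins are renormalized):

    import numpy as np

    def log2_density(m, x1, x2, ptr):
        """Unnormalized log2 density at position m, where x1 < x2 are the two
        replication endpoints and x2 - x1 == 0.5 keeps the curve continuous
        when the genome wraps around."""
        alpha = np.log2(ptr) / (x2 - x1)
        if m <= x1:
            return -alpha * (m - x1)
        elif m < x2:
            return alpha * (m - x1)
        return -alpha * (m - x2) + np.log2(ptr)
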
@@ -150,16 +156,15 @@ def adjust_read_probs(self, ptr, ori_pos, ter_pos): # (bin_edges[i], bin_edges[i+1]), so let's # take the midpoint of each bin if i == self.bin_edges.size - 1: - m = 0.5*(self.bin_edges[i] + 1) - length = (1 - self.bin_edges[i]) + m = 0.5 * (self.bin_edges[i] + 1) + length = 1 - self.bin_edges[i] else: - m = 0.5*(self.bin_edges[i] + self.bin_edges[i+1]) - length = (self.bin_edges[i+1] - self.bin_edges[i]) + m = 0.5 * (self.bin_edges[i] + self.bin_edges[i + 1]) + length = self.bin_edges[i + 1] - self.bin_edges[i] x1 = np.min([ori_pos, ter_pos]) x2 = np.max([ori_pos, ter_pos]) - # since we normalize bins at the end, # we can compute c1 and c2 + a constant # in this case we can use c - log p(x_t) @@ -171,11 +176,11 @@ def adjust_read_probs(self, ptr, ori_pos, ter_pos): c2 = np.log2(ptr) if m <= x1: - adj_probs[i] = -alpha*(m - x1) + c1 + adj_probs[i] = -alpha * (m - x1) + c1 elif x1 < m and m < x2: - adj_probs[i] = alpha*(m - x1) + c1 + adj_probs[i] = alpha * (m - x1) + c1 else: - adj_probs[i] = -alpha*(m - x2) + c2 + adj_probs[i] = -alpha * (m - x2) + c2 adj_probs[i] += np.log2(length) # normalize @@ -186,8 +191,12 @@ def adjust_read_probs(self, ptr, ori_pos, ter_pos): # reweight and normalize new_probs = np.zeros(self.probs.size) - new_probs[self.probs != 0] = np.log(self.probs[self.probs != 0]) + adj_probs[self.probs != 0] - new_probs[self.probs != 0] -= scipy.special.logsumexp(new_probs[self.probs != 0]) + new_probs[self.probs != 0] = ( + np.log(self.probs[self.probs != 0]) + adj_probs[self.probs != 0] + ) + new_probs[self.probs != 0] -= scipy.special.logsumexp( + new_probs[self.probs != 0] + ) new_probs[self.probs != 0] = np.exp(new_probs[self.probs != 0]) return new_probs From c77b6e9c970fdae70f94818d07b1c9fd1f809c19 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 22 Jun 2021 16:27:19 +0200 Subject: [PATCH 13/48] docs: add short description --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 2950393..3bff840 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,7 +22,7 @@ classifiers = Programming Language :: Python :: 3 :: Only Topic :: Scientific/Engineering :: Bio-Informatics license = GPL-3.0-or-later -description = ? +description = Accurate and robust inference of microbial growth dynamics from metagenomic sequencing reads. long_description = file: README.md long_description_content_type = text/markdown keywords = From e137badd7a1a605211b3ee9073bede25681a4828 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 23 Jun 2021 10:32:53 +0200 Subject: [PATCH 14/48] chore: enable Python 3.6 --- setup.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 3bff840..1dbd893 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,6 +16,7 @@ classifiers = License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+) Natural Language :: English Operating System :: OS Independent + Programming Language :: Python :: 3.6 Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 @@ -35,7 +36,7 @@ install_requires = numpy pysam scipy -python_requires = >=3.7 +python_requires = >=3.6 tests_require = tox packages = find: From aa5df4b985d83d8e3c68e3ed4919b9b87ac4f396 Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber"
Date: Wed, 23 Jun 2021 17:29:25 +0200
Subject: [PATCH 15/48] fix: use a function as entry point

Using the class directly as the entry point would return the instance and
thus create a non-zero exit code.

---
 setup.cfg        | 2 +-
 src/coptr/cli.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 1dbd893..91418ac 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -45,7 +45,7 @@

 [options.entry_points]
 console_scripts =
-    coptr = coptr.cli:ProgramOptions
+    coptr = coptr.cli:cli

 [options.packages.find]
 where = src

diff --git a/src/coptr/cli.py b/src/coptr/cli.py
index 32fb50a..365ba7a 100644
--- a/src/coptr/cli.py
+++ b/src/coptr/cli.py
@@ -584,5 +584,6 @@ def rabun(self):
     print_info("CoPTR", "done!")


-if __name__ == "__main__":
+def cli():
+    """Serve as an entry point for command line calls."""
     ProgramOptions()

From f0cc668e18345b97eb9f3e4dda920aafeceb2e98 Mon Sep 17 00:00:00 2001
From: Tyler Joseph
Date: Thu, 24 Jun 2021 09:14:33 -0400
Subject: [PATCH 16/48] update import path for test files

---
 test/test_bam_processor.py | 2 +-
 test/test_coptr.py         | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_bam_processor.py b/test/test_bam_processor.py
index cf71312..76f60b9 100644
--- a/test/test_bam_processor.py
+++ b/test/test_bam_processor.py
@@ -3,7 +3,7 @@

 import pysam

-from coptr.bam_processor import BamProcessor
+from src.coptr.bam_processor import BamProcessor


 def read_to_dict(read):
diff --git a/test/test_coptr.py b/test/test_coptr.py
index 0b0d4d3..c850178 100644
--- a/test/test_coptr.py
+++ b/test/test_coptr.py
@@ -4,9 +4,9 @@
 import numpy as np
 import scipy.stats

-from coptr.bam_processor import CoverageMapContig
-from coptr.coptr_contig import CoPTRContig
-from coptr.coptr_ref import CoPTRRef
+from src.coptr.bam_processor import CoverageMapContig
+from src.coptr.coptr_contig import CoPTRContig
+from src.coptr.coptr_ref import CoPTRRef


 class TestCoPTR(unittest.TestCase):

From da61223f50bf4fc64838229cd5579f5913c75893 Mon Sep 17 00:00:00 2001
From: Tyler Joseph
Date: Thu, 24 Jun 2021 09:29:46 -0400
Subject: [PATCH 17/48] environment yml file name should match name of
 environment created

---
 coptr-env.yml => coptr.yml | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename coptr-env.yml => coptr.yml (100%)

diff --git a/coptr-env.yml b/coptr.yml
similarity index 100%
rename from coptr-env.yml
rename to coptr.yml

From d69357d73317830ac1296e943c6864d7ba60edf9 Mon Sep 17 00:00:00 2001
From: Tyler Joseph
Date: Thu, 24 Jun 2021 14:19:59 -0400
Subject: [PATCH 18/48] updated docs to reflect new installation, updated path
 for autodoc

---
 docs/installation.rst |  9 +++++----
 docs/modules.rst      | 16 ++++++++--------
 docs/quick-start.rst  | 13 ++++++++-----
 docs/tutorial.rst     | 28 ++++++++++++++--------------
 4 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/docs/installation.rst b/docs/installation.rst
index 34cd198..822cbb1 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -18,17 +18,18 @@ You can then create a new environment to run coPTR:
 .. code-block::

     # creates an environment called coptr
-    $ conda env create -f coptr-env.yml
+    $ conda env create -f coptr.yml

     # you only need to create this once
     # to activate the environment:
     $ conda activate coptr
+    $ pip install .

-To check if you can run coPTR:
+To check if you can run CoPTR:

 .. 
code-block::

-    $ python coptr.py
+    $ coptr

     usage: coptr.py [options]

@@ -50,7 +51,7 @@ To check if you can run coPTR:
 Dependencies
 ------------

-Installation using conda is install all required dependencies. However,
+Installation using conda will install all required dependencies. However,
 CoPTR's dependencies are listed below if you wish to install them manually.

 * bowtie2 (>=2.4.1): CoPTR assumes bowtie2 is installed and accessible from your PATH variable
diff --git a/docs/modules.rst b/docs/modules.rst
index 675f313..738633f 100644
--- a/docs/modules.rst
+++ b/docs/modules.rst
@@ -1,26 +1,26 @@
 Modules
 =======

-.. automodule:: src.bam_processor
+.. automodule:: src.coptr.bam_processor
     :members:

-.. automodule:: src.coptr_contig
+.. automodule:: src.coptr.coptr_contig
     :members:

-.. automodule:: src.coptr_ref
+.. automodule:: src.coptr.coptr_ref
     :members:

-.. automodule:: src.poisson_pca
+.. automodule:: src.coptr.poisson_pca
     :members:

-.. automodule:: src.print
+.. automodule:: src.coptr.print
     :members:

-.. automodule:: src.read_mapper
+.. automodule:: src.coptr.read_mapper
     :members:

-.. automodule:: src.read_assigner
+.. automodule:: src.coptr.read_assigner
     :members:

-.. automodule:: src.util
+.. automodule:: src.coptr.util
     :members:
\ No newline at end of file
diff --git a/docs/quick-start.rst b/docs/quick-start.rst
index 9e7df36..233d129 100644
--- a/docs/quick-start.rst
+++ b/docs/quick-start.rst
@@ -18,21 +18,24 @@ Detailed instructions: :doc:`Tutorial <tutorial>`.
     tar -xzvf example-data.tar.gz

     # Set up the environment:
-    conda env create -f coptr-env.yml
+    conda env create -f coptr.yml
     conda activate coptr
+
+    # Install coptr
+    pip install .

     # Index the reference database:
-    python coptr.py index example-data/ref-db example-data/ref-db/example-index
+    coptr index example-data/ref-db example-data/ref-db/example-index

     # Map reads against the reference:
-    python coptr.py map example-data/ref-db/example-index example-data/fastq example-data/bam
+    coptr map example-data/ref-db/example-index example-data/fastq example-data/bam

     # Extract read positions:
-    python coptr.py extract example-data/bam example-data/coverage-maps
+    coptr extract example-data/bam example-data/coverage-maps

     # Estimate ptrs:
     # Note the min-reads flag is optional. We recommend the default setting (5000 reads).
-    python coptr.py estimate example-data/coverage-maps out.csv --min-reads 2500
+    coptr estimate example-data/coverage-maps out.csv --min-reads 2500

     # View the output:
     cat out.csv
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index e21f738..ccd4c56 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -58,7 +58,7 @@ Indexing the reference database

 .. code-block:: text

-    $ python coptr.py index example-data/ref-db example-data/ref-db/example-index
+    $ coptr index example-data/ref-db example-data/ref-db/example-index
     [INFO] (Sep 10, 2020 18:06:41) ReadMapper: found 2 files totaling 0.006 Gb
     [INFO] (Sep 10, 2020 18:06:41) ReadMapper: copying to fasta files to coptr-fna-1599775601.629647.fna with prepended genome ids (filenames)
     [INFO] (Sep 10, 2020 18:06:41) ReadMapper: writing 2 reference genome ids to example-data/ref-db/example-index.genomes

@@ -76,11 +76,11 @@ file, with the filename prepended to each sequencing id. This is how
 CoPTR keeps track of contigs from the same assembly.
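
 For example (hypothetical names), a header ``>NC_000913.3`` inside a file
 ``e-coli-mg1655.fasta`` is written to the combined fasta as
 ``>e-coli-mg1655|NC_000913.3``, so everything before the first ``|`` later
 recovers the genome id:

 .. code-block:: python

     sequence_id = "e-coli-mg1655|NC_000913.3"  # "<filename>|<original id>"
     genome_id = sequence_id.split("|", 1)[0]   # -> "e-coli-mg1655"

 Indexing a database of fasta files can be accomplished by calling
-``coptr.py index``:
+``coptr index``:

 .. 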
code-block:: text - usage: coptr.py index [-h] [--bt2-bmax BT2_BMAX] [--bt2-dcv BT2_DCV] [--bt2-threads BT2_THREADS] [--bt2-packed] ref-fasta index-out + usage: coptr index [-h] [--bt2-bmax BT2_BMAX] [--bt2-dcv BT2_DCV] [--bt2-threads BT2_THREADS] [--bt2-packed] ref-fasta index-out positional arguments: ref_fasta File or folder containing fasta to index. If a folder, @@ -113,7 +113,7 @@ Notes on memory usage --------------------- Sometimes the database is too large for a single index. You can split the reference database into multiple parts, and index each separately. After -read mapping, the resulting BAM files can be merged with ```coptr.py merge```. +read mapping, the resulting BAM files can be merged with ```coptr merge```. Mapping reads ============= @@ -122,7 +122,7 @@ Mapping reads .. code-block:: text - $ python coptr.py map example-data/ref-db/example-index example-data/fastq example-data/bam + $ coptr map example-data/ref-db/example-index example-data/fastq example-data/bam [INFO] (Aug 31, 2020 12:12:10) ReadMapper: mapping example-data/fastq/ERR969281.fastq.gz to example-data/bam/ERR969281.sam [INFO] (Aug 31, 2020 12:12:10) ReadMapper: bowtie2 -x example-data/ref-db/example-index example-data/fastq/ERR969281.fastq.gz --no-unal -p 1 10818 reads; of these: @@ -144,7 +144,7 @@ convenient: .. code-block:: text - usage: coptr.py map [-h] [--threads INT] [--paired] index input out-folder + usage: coptr map [-h] [--threads INT] [--paired] index input out-folder positional arguments: index Name of database index. @@ -161,7 +161,7 @@ convenient: --bt2-k BT2_K (Default 10). Number of alignments to report. Passed to -k flag of bowtie2. -The name of the database index corresponds to the name used from ``coptr.py index``. +The name of the database index corresponds to the name used from ``coptr index``. The input can either be a single fastq file, or a folder of fastq files to map. It also takes an optional ``--threads`` argument that allows bowtie2 to use multiple threads. Reads are output as ``bam`` files to save space. @@ -174,11 +174,11 @@ Merging mapped reads from multiple indexes For large reference databases, it is sometimes necessary to create several indexes for subsets of the data and map reads against each index. Results from each index need to be merged to select reads with the best MAPQ across -indexes. You can use ```coptr.py merge``` to merge multiple bam files. +indexes. You can use ```coptr merge``` to merge multiple bam files. .. code-block:: text - usage: coptr.py merge [-h] in-bam1 in-bam2 ... in-bamN out-bam + usage: coptr merge [-h] in-bam1 in-bam2 ... in-bamN out-bam positional arguments: in-bams A space separateed list of BAM files to merge. Assumes same @@ -195,7 +195,7 @@ Computing coverage maps .. code-block:: text - $ python coptr.py extract example-data/bam example-data/coverage-maps + $ coptr extract example-data/bam example-data/coverage-maps [INFO] (Jan 18, 2021 10:31:43) BamProcessor: processing example-data/bam/ERR969281.bam [INFO] (Jan 18, 2021 10:31:43) BamProcessor: determining reference genomes [INFO] (Jan 18, 2021 10:31:43) BamProcessor: collecting multi-mapped reads @@ -210,7 +210,7 @@ assembly are collected. .. code-block:: text - usage: usage: coptr.py extract [-h] [--ref-genome-regex REF_GENOME_REGEX] [--check-regex] + usage: usage: coptr extract [-h] [--ref-genome-regex REF_GENOME_REGEX] [--check-regex] in-folder out-folder positional arguments: @@ -227,7 +227,7 @@ assembly are collected. The important argument here is the ``--ref-genome-regex``. 
This is a regular expression that extracts the reference genome id from a sequence id. The default -argument will work with the index created by ```coptr.py index```, and works by +argument will work with the index created by ```coptr index```, and works by prepending the name of the fasta file, and special character ```|``` to each sequence id. @@ -250,7 +250,7 @@ Estimating PTRs .. code-block:: text - # python coptr.py estimate example-data/coverage-maps out --min-reads 2500 + # coptr estimate example-data/coverage-maps out --min-reads 2500 [INFO] (Jan 18, 2021 10:36:05) CoPTR: grouping reads by reference genome [INFO] (Jan 18, 2021 10:36:05) CoPTR: saving to example-data/coverage-maps/coverage-maps-genome [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969281.cm.pkl @@ -279,7 +279,7 @@ on all samples at once.** .. code-block:: text - usage: coptr.py estimate [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--threads THREADS] coverage-map-folder out-file + usage: coptr estimate [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--threads THREADS] coverage-map-folder out-file positional arguments: coverage_map_folder Folder with coverage maps computed from 'extract'. From 5c5a71dcc6399425bfd764650f4cfaaaea6fb3eb Mon Sep 17 00:00:00 2001 From: Tyler Joseph Date: Thu, 24 Jun 2021 14:20:32 -0400 Subject: [PATCH 19/48] coptr.py -> coptr --- src/coptr/cli.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/coptr/cli.py b/src/coptr/cli.py index 365ba7a..12d12fe 100644 --- a/src/coptr/cli.py +++ b/src/coptr/cli.py @@ -32,17 +32,16 @@ from .read_mapper import ReadMapper from .util import get_fastq_name - -VERSION = "1.1.1" +from .__init__ import __version__ class ProgramOptions: def __init__(self): parser = argparse.ArgumentParser( description="CoPTR (v{}): Compute PTRs from complete reference genomes and assemblies.".format( - VERSION + __version__ ), - usage="""coptr.py [options] + usage="""coptr [options] command: index create a bowtie2 index for a reference database map map reads against a reference database @@ -70,7 +69,7 @@ def __init__(self): def index(self): parser = argparse.ArgumentParser( - usage="coptr.py index [-h] [--bt2-bmax BT2_BMAX] [--bt2-dcv BT2_DCV] [--bt2-threads BT2_THREADS] [--bt2-packed] ref-fasta index-out" + usage="coptr index [-h] [--bt2-bmax BT2_BMAX] [--bt2-dcv BT2_DCV] [--bt2-threads BT2_THREADS] [--bt2-packed] ref-fasta index-out" ) parser.add_argument( "ref_fasta", @@ -117,7 +116,7 @@ def index(self): def map(self): parser = argparse.ArgumentParser( - usage="coptr.py map [-h] [--threads INT] [--bt2-k INT] [--paired] index input out-folder" + usage="coptr map [-h] [--threads INT] [--bt2-k INT] [--paired] index input out-folder" ) parser.add_argument("index", help="Name of database index.") parser.add_argument( @@ -164,7 +163,7 @@ def map(self): def merge(self): parser = argparse.ArgumentParser( - usage="coptr.py merge [-h] in-bam1 in-bam2 ... in-bamN out-bam" + usage="coptr merge [-h] in-bam1 in-bam2 ... 
in-bamN out-bam" ) parser.add_argument( "in-bams", @@ -186,7 +185,7 @@ def merge(self): def extract(self): parser = argparse.ArgumentParser( - usage="coptr.py extract [-h] [--ref-genome-regex REF_GENOME_REGEX] [--check-regex] in-folder out-folder" + usage="coptr extract [-h] [--ref-genome-regex REF_GENOME_REGEX] [--check-regex] in-folder out-folder" ) parser.add_argument("in_folder", help="Folder with BAM files.") parser.add_argument("out_folder", help="Folder to store coverage maps.") @@ -258,7 +257,7 @@ def extract(self): def estimate(self): parser = argparse.ArgumentParser( - usage="""usage: coptr.py estimate [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] [--threads THREADS] [--plot OUTFOLDER] [--restart] coverage-map-folder out-file + usage="""usage: coptr estimate [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] [--threads THREADS] [--plot OUTFOLDER] [--restart] coverage-map-folder out-file """ ) parser.add_argument( @@ -457,7 +456,7 @@ def estimate(self): def count(self): parser = argparse.ArgumentParser( - usage="""usage: coptr.py count [-h] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] coverage-map-folder out-file + usage="""usage: coptr count [-h] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] coverage-map-folder out-file """ ) parser.add_argument( @@ -518,7 +517,7 @@ def count(self): def rabun(self): parser = argparse.ArgumentParser( - usage="""usage: coptr.py rabun [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] coverage-map-folder out-file + usage="""usage: coptr rabun [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] coverage-map-folder out-file """ ) parser.add_argument( From 735828f14ddca6ef648b1f39527f665446a36638 Mon Sep 17 00:00:00 2001 From: Tyler Joseph Date: Thu, 24 Jun 2021 14:20:41 -0400 Subject: [PATCH 20/48] moved version number --- src/coptr/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coptr/__init__.py b/src/coptr/__init__.py index e69de29..545d07d 100644 --- a/src/coptr/__init__.py +++ b/src/coptr/__init__.py @@ -0,0 +1 @@ +__version__ = "1.1.1" \ No newline at end of file From 40b072696998c7d732650011584d382d3cb8661b Mon Sep 17 00:00:00 2001 From: Tyler Joseph Date: Thu, 24 Jun 2021 14:27:55 -0400 Subject: [PATCH 21/48] added version numbers to packages; added version number for coptr; set author and author_email --- setup.cfg | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/setup.cfg b/setup.cfg index 91418ac..75d0e7e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,13 +1,14 @@ [metadata] name = coptr +version = attr: coptr.__version__ url = https://github.com/tyjo/coptr download_url = https://pypi.org/project/coptr/ project_urls = Source Code = https://github.com/tyjo/coptr Documentation = https://coptr.readthedocs.io Bug Tracker = https://github.com/tyjo/coptr/issues -author = Tyler -author_email = ? +author = Tyler Joseph +author_email = tj2357@columbia.edu # Please consult https://pypi.org/classifiers/ for a full list. 
classifiers =
     Development Status :: 3 - Alpha
@@ -33,9 +34,10 @@ keywords =
 [options]
 zip_safe = True
 install_requires =
-    numpy
-    pysam
-    scipy
+    numpy >= 1.19.1
+    pysam >= 0.16.0.1
+    scipy >= 1.5.2
+    matplotlib >= 3.3.2
 python_requires = >=3.6
 tests_require =
     tox

From b40b357e442fbbb9b1769c2c81a2c66d53286aa1 Mon Sep 17 00:00:00 2001
From: Tyler Joseph
Date: Fri, 25 Jun 2021 10:39:10 -0400
Subject: [PATCH 22/48] updated docs

---
 docs/tutorial.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index ccd4c56..5fb5ead 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -166,7 +166,8 @@ The input can either be a single fastq file, or a folder of fastq files to map.
 It also takes an optional ``--threads`` argument that allows bowtie2 to use
 multiple threads. Reads are output as ``bam`` files to save space.

-For paired end sequencing, it is recommend to only map reads from a single mate-pair.
+For paired-end sequencing, we recommend mapping reads from one end only (e.g. the files
+ending in either _1.* or _2.*).


 Merging mapped reads from multiple indexes

From eb2fd303409be5c6ea23a73c85e076690fb1f878 Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber"
Date: Sun, 27 Jun 2021 23:18:20 +0200
Subject: [PATCH 23/48] style: apply isort & black

---
 src/coptr/__init__.py | 2 +-
 src/coptr/cli.py      | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/coptr/__init__.py b/src/coptr/__init__.py
index 545d07d..a82b376 100644
--- a/src/coptr/__init__.py
+++ b/src/coptr/__init__.py
@@ -1 +1 @@
-__version__ = "1.1.1"
\ No newline at end of file
+__version__ = "1.1.1"
diff --git a/src/coptr/cli.py b/src/coptr/cli.py
index 12d12fe..c299b10 100644
--- a/src/coptr/cli.py
+++ b/src/coptr/cli.py
@@ -23,6 +23,7 @@

 import numpy as np

+from . import __version__
 from .bam_processor import BamProcessor
 from .compute_read_counts import compute_read_counts
 from .compute_rel_abun import compute_rel_abun
@@ -32,8 +33,6 @@
 from .read_mapper import ReadMapper
 from .util import get_fastq_name

-from .__init__ import __version__
-

 class ProgramOptions:
     def __init__(self):

From 00f9df5a219c2a36e82e14c6c39a0d0c344ebd6c Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber"
Date: Sun, 27 Jun 2021 23:18:40 +0200
Subject: [PATCH 24/48] refactor: replace `rm` subprocess calls

* use `pathlib.Path` in critical places
* open files in a context where possible
* use `datetime` instead of `time`

---
 src/coptr/read_mapper.py | 157 ++++++++++++++++++++++-----------------
 1 file changed, 90 insertions(+), 67 deletions(-)

diff --git a/src/coptr/read_mapper.py b/src/coptr/read_mapper.py
index 3c6883f..d6c333d 100644
--- a/src/coptr/read_mapper.py
+++ b/src/coptr/read_mapper.py
@@ -26,7 +26,8 @@
 import os.path
 import subprocess as sub
 import sys
-import time
+from datetime import datetime, timezone
+from pathlib import Path

 import pysam

@@ -56,20 +57,20 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed
         bt2_packed : str
             Parameter to pass to bowtie2-build --packed argument.
""" + files_found = 0 + total_size = 0 + ref_files = [] if os.path.isfile(ref_fasta): - ref_files = [ref_fasta] - files_found = 1 - total_size = os.stat(ref_fasta).st_size + ref_files.append(ref_fasta) + files_found += 1 + total_size += os.stat(ref_fasta).st_size elif os.path.isdir(ref_fasta): valid_ext = [".fasta", ".fa", ".fna"] # for multiple fasta files, bowtie2 takes a comma # separated list - ref_files = [] - files_found = 0 - total_size = 0 - for f in os.listdir(ref_fasta): - fname, ext = os.path.splitext(f) - fpath = os.path.join(ref_fasta, f) + for in_handle in os.listdir(ref_fasta): + fname, ext = os.path.splitext(in_handle) + fpath = os.path.join(ref_fasta, in_handle) if os.path.isfile(fpath) and ext in valid_ext: ref_files.append(fpath) @@ -86,39 +87,47 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed ), ) - fna_out = open("coptr-fna-{}.fna".format(time.time()), "w") - genomes_out = open("{}.genomes".format(index_out), "w") - n_genomes = 0 + sequence_collection = Path( + f"coptr-fna-{datetime.now(timezone.utc).isoformat(timespec='seconds')}.fna" + ) + genomes = Path(f"{index_out}.genomes") print_info( "ReadMapper", "copying to fasta files to {} with prepended genome ids (filenames)".format( - fna_out.name + str(sequence_collection) ), ) # assume 1 genome per fasta # let's set the filename as the genome identifier - for fpath in ref_files: - fname = os.path.basename(os.path.splitext(fpath)[0]) - genomes_out.write(os.path.basename(fname) + "\n") - n_genomes += 1 - - with open(fpath, "r") as f: - for line in f: - if ">" == line[0]: - # prepend the filename as the identifier for the genome - line = line[0] + fname + "|" + line[1:] - fna_out.write(line) - - fna_out.close() - genomes_out.close() + n_genomes = 0 + with sequence_collection.open("w") as out_handle, genomes.open( + "w" + ) as genomes_handle: + for fpath in ref_files: + fname = os.path.basename(os.path.splitext(fpath)[0]) + genomes_handle.write(os.path.basename(fname) + "\n") + n_genomes += 1 + + with open(fpath, "r") as in_handle: + for line in in_handle: + if line.startswith(">"): + # prepend the filename as the identifier for the genome + line = f">{fname}|{line[1:]}" + out_handle.write(line) print_info( "ReadMapper", - "writing {} reference genome ids to {}".format(n_genomes, genomes_out.name), + "writing {} reference genome ids to {}".format(n_genomes, str(genomes)), ) - call = ["bowtie2-build", fna_out.name, index_out, "--threads", bt2_threads] + call = [ + "bowtie2-build", + str(sequence_collection), + index_out, + "--threads", + bt2_threads, + ] if bt2_bmax is not None: call += ["--bmax", bt2_bmax] if bt2_dcv is not None: @@ -136,8 +145,8 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed except Exception as e: print(e, file=sys.stderr) finally: - print_info("ReadMapper", "cleaning up {}".format(fna_out.name)) - sub.check_call(["rm", fna_out.name]) + print_info("ReadMapper", "cleaning up {}".format(str(sequence_collection))) + sequence_collection.unlink() def map(self, index, inputf, outfolder, paired, threads, bt2_k): """Map reads from infile against reference database using bowtie2, then @@ -160,17 +169,18 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): """ bt2_k = str(bt2_k) - if not os.path.isdir(outfolder): + outfolder = Path(outfolder) + if not outfolder.is_dir(): print_error("ReadMapper", "output folder does not exist.") if os.path.isfile(inputf): bn = os.path.basename(inputf) bn, ext = os.path.splitext(bn) - out_sam = 
os.path.join(outfolder, bn + ".sam") - out_bam = os.path.join(outfolder, bn + ".bam") + out_sam = outfolder / f"{bn}.sam" + out_bam = outfolder / f"{bn}.bam" # first map to a sam file with bowtie2 - print_info("ReadMapper", "mapping {} to {}".format(inputf, out_sam)) + print_info("ReadMapper", "mapping {} to {}".format(inputf, str(out_sam))) call = [ "bowtie2", "-x", @@ -183,22 +193,25 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): bt2_k, ] print_info("ReadMapper", " ".join(call)) - with open(out_sam, "w") as out: + with out_sam.open("w") as out: sub.check_call(call, stdout=out) # then convert to a bam file - print_info("ReadMapper", "converting {} to {}".format(out_sam, out_bam)) + print_info( + "ReadMapper", "converting {} to {}".format(str(out_sam), str(out_bam)) + ) infile = pysam.AlignmentFile(out_sam, "r") outfile = pysam.AlignmentFile(out_bam, "wb", template=infile) - for s in infile: - outfile.write(s) - infile.close() - outfile.close() + try: + for s in infile: + outfile.write(s) + finally: + infile.close() + outfile.close() # now remove sam file - print_info("ReadMapper", "cleaning up {}".format(out_sam)) - call = ["rm", out_sam] - sub.check_call(call) + print_info("ReadMapper", "cleaning up {}".format(str(out_sam))) + out_sam.unlink() # single end sequencing elif os.path.isdir(inputf) and not paired: @@ -215,11 +228,13 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): if ext1 in valid_ext or ext2 in valid_ext: bn = os.path.basename(f) bn, ext = os.path.splitext(bn) - out_sam = os.path.join(outfolder, get_fastq_name(bn) + ".sam") - out_bam = os.path.join(outfolder, get_fastq_name(bn) + ".bam") + out_sam = outfolder / f"{get_fastq_name(bn)}.sam" + out_bam = outfolder / f"{get_fastq_name(bn)}.bam" # map reads with bowtie2 - print_info("ReadMapper", "mapping {} to {}".format(fpath, out_sam)) + print_info( + "ReadMapper", "mapping {},{} to {}".format(f1, f2, str(out_sam)) + ) call = [ "bowtie2", "-x", @@ -232,24 +247,26 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): bt2_k, ] print_info("ReadMapper", " ".join(call)) - with open(out_sam, "w") as out: + with out_sam.open("w") as out: sub.check_call(call, stdout=out) # then convert to a bam file print_info( - "ReadMapper", "converting {} to {}".format(out_sam, out_bam) + "ReadMapper", + "converting {} to {}".format(str(out_sam), str(out_bam)), ) infile = pysam.AlignmentFile(out_sam, "r") outfile = pysam.AlignmentFile(out_bam, "wb", template=infile) - for s in infile: - outfile.write(s) - infile.close() - outfile.close() + try: + for s in infile: + outfile.write(s) + finally: + infile.close() + outfile.close() # now remove sam file print_info("ReadMapper", "cleaning up {}".format(out_sam)) - call = ["rm", out_sam] - sub.check_call(call) + out_sam.unlink() # paired end sequencing elif os.path.isdir(inputf) and paired: @@ -283,11 +300,13 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): f2 = os.path.join(inputf, read_pairs[pair_name][1]) bn = os.path.basename(f1) bn = bn.split("_")[0] - out_sam = os.path.join(outfolder, get_fastq_name(bn) + ".sam") - out_bam = os.path.join(outfolder, get_fastq_name(bn) + ".bam") + out_sam = outfolder / f"{get_fastq_name(bn)}.sam" + out_bam = outfolder / f"{get_fastq_name(bn)}.bam" # map reads with bowtie2 - print_info("ReadMapper", "mapping {},{} to {}".format(f1, f2, out_sam)) + print_info( + "ReadMapper", "mapping {},{} to {}".format(f1, f2, str(out_sam)) + ) call = [ "bowtie2", "-x", @@ -303,22 +322,26 @@ def map(self, index, 
inputf, outfolder, paired, threads, bt2_k): bt2_k, ] print_info("ReadMapper", " ".join(call)) - with open(out_sam, "w") as out: + with out_sam.open("w") as out: sub.check_call(call, stdout=out) # then convert to a bam file - print_info("ReadMapper", "converting {} to {}".format(out_sam, out_bam)) + print_info( + "ReadMapper", + "converting {} to {}".format(str(out_sam), str(out_bam)), + ) infile = pysam.AlignmentFile(out_sam, "r") outfile = pysam.AlignmentFile(out_bam, "wb", template=infile) - for s in infile: - outfile.write(s) - infile.close() - outfile.close() + try: + for s in infile: + outfile.write(s) + finally: + infile.close() + outfile.close() # now remove sam file print_info("ReadMapper", "cleaning up {}".format(out_sam)) - call = ["rm", out_sam] - sub.check_call(call) + out_sam.unlink() else: print_error("ReadMapper", "input must either be a file or folder.") From 7b00bdffb3851412d7c4c7a0f9956b92ee00aac9 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 23 Jun 2021 11:10:33 +0200 Subject: [PATCH 25/48] refactor: introduce logging to the read mapper --- src/coptr/read_mapper.py | 83 ++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 49 deletions(-) diff --git a/src/coptr/read_mapper.py b/src/coptr/read_mapper.py index d6c333d..2faaf32 100644 --- a/src/coptr/read_mapper.py +++ b/src/coptr/read_mapper.py @@ -22,19 +22,21 @@ along with CoPTR. If not, see . """ +import logging import os import os.path import subprocess as sub -import sys from datetime import datetime, timezone from pathlib import Path import pysam -from .print import print_error, print_info from .util import get_fastq_name +logger = logging.getLogger(__name__) + + class ReadMapper: """Wrapper around bowtie2.""" @@ -75,16 +77,12 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed if os.path.isfile(fpath) and ext in valid_ext: ref_files.append(fpath) total_size += os.stat(fpath).st_size - files_found += 1 else: - print_error("ReadMapper", "index must either be file or folder.") + logger.error("The index must be either a file or directory.") - print_info( - "ReadMapper", - "found {} files totaling {:.3f} Gb".format( - len(ref_files), total_size / (1024 ** 3) - ), + logger.info( + "Found %d files totaling %.3g GB.", len(ref_files), total_size / (1024 ** 3) ) sequence_collection = Path( @@ -92,11 +90,9 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed ) genomes = Path(f"{index_out}.genomes") - print_info( - "ReadMapper", - "copying to fasta files to {} with prepended genome ids (filenames)".format( - str(sequence_collection) - ), + logger.info( + "Copying FASTA files to '%s' with prepended genome ids (filenames).", + str(sequence_collection), ) # assume 1 genome per fasta @@ -117,10 +113,7 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed line = f">{fname}|{line[1:]}" out_handle.write(line) - print_info( - "ReadMapper", - "writing {} reference genome ids to {}".format(n_genomes, str(genomes)), - ) + logger.info("writing %d reference genome ids to '%s'.", n_genomes, str(genomes)) call = [ "bowtie2-build", str(sequence_collection), @@ -136,16 +129,16 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed call += ["--packed"] try: - print_info("ReadMapper", " ".join(call)) + logger.info("%s", " ".join(call)) sub.check_call(call) - print_info( - "ReadMapper", - "indexed {} fasta files for reference database.".format(files_found), + logger.info( + "Indexed %d FASTA 
files for the reference database.", len(ref_files) ) - except Exception as e: - print(e, file=sys.stderr) + except Exception as error: + logger.error("An error occurred while indexing with bowtie2.") + logger.info("Detailed information:", exc_info=error) finally: - print_info("ReadMapper", "cleaning up {}".format(str(sequence_collection))) + logger.info("Cleaning up '%s'.", str(sequence_collection)) sequence_collection.unlink() def map(self, index, inputf, outfolder, paired, threads, bt2_k): @@ -171,7 +164,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): outfolder = Path(outfolder) if not outfolder.is_dir(): - print_error("ReadMapper", "output folder does not exist.") + logger.error("The output directory '%s' does not exist.", str(outfolder)) if os.path.isfile(inputf): bn = os.path.basename(inputf) @@ -180,7 +173,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): out_bam = outfolder / f"{bn}.bam" # first map to a sam file with bowtie2 - print_info("ReadMapper", "mapping {} to {}".format(inputf, str(out_sam))) + logger.info("Mapping '%s' to '%s'".format(inputf, str(out_sam))) call = [ "bowtie2", "-x", @@ -192,14 +185,12 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): "-k", bt2_k, ] - print_info("ReadMapper", " ".join(call)) + logger.debug(" ".join(call)) with out_sam.open("w") as out: sub.check_call(call, stdout=out) # then convert to a bam file - print_info( - "ReadMapper", "converting {} to {}".format(str(out_sam), str(out_bam)) - ) + logger.info("Converting '%s' to '%s'.".format(str(out_sam), str(out_bam))) infile = pysam.AlignmentFile(out_sam, "r") outfile = pysam.AlignmentFile(out_bam, "wb", template=infile) try: @@ -210,7 +201,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): outfile.close() # now remove sam file - print_info("ReadMapper", "cleaning up {}".format(str(out_sam))) + logger.info("Cleaning up '%s'.", str(out_sam)) out_sam.unlink() # single end sequencing @@ -232,9 +223,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): out_bam = outfolder / f"{get_fastq_name(bn)}.bam" # map reads with bowtie2 - print_info( - "ReadMapper", "mapping {},{} to {}".format(f1, f2, str(out_sam)) - ) + logger.info("Mapping '%s' to '%s'".format(inputf, str(out_sam))) call = [ "bowtie2", "-x", @@ -246,14 +235,13 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): "-k", bt2_k, ] - print_info("ReadMapper", " ".join(call)) + logger.debug(" ".join(call)) with out_sam.open("w") as out: sub.check_call(call, stdout=out) # then convert to a bam file - print_info( - "ReadMapper", - "converting {} to {}".format(str(out_sam), str(out_bam)), + logger.info( + "Converting '%s' to '%s'.".format(str(out_sam), str(out_bam)) ) infile = pysam.AlignmentFile(out_sam, "r") outfile = pysam.AlignmentFile(out_bam, "wb", template=infile) @@ -265,7 +253,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): outfile.close() # now remove sam file - print_info("ReadMapper", "cleaning up {}".format(out_sam)) + logger.info("Cleaning up '%s'.", str(out_sam)) out_sam.unlink() # paired end sequencing @@ -304,9 +292,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): out_bam = outfolder / f"{get_fastq_name(bn)}.bam" # map reads with bowtie2 - print_info( - "ReadMapper", "mapping {},{} to {}".format(f1, f2, str(out_sam)) - ) + logger.info("Mapping '%s' to '%s'".format(inputf, str(out_sam))) call = [ "bowtie2", "-x", @@ -321,14 +307,13 @@ def map(self, index, inputf, outfolder, paired, 
threads, bt2_k): "-k", bt2_k, ] - print_info("ReadMapper", " ".join(call)) + logger.debug(" ".join(call)) with out_sam.open("w") as out: sub.check_call(call, stdout=out) # then convert to a bam file - print_info( - "ReadMapper", - "converting {} to {}".format(str(out_sam), str(out_bam)), + logger.info( + "Converting '%s' to '%s'.".format(str(out_sam), str(out_bam)) ) infile = pysam.AlignmentFile(out_sam, "r") outfile = pysam.AlignmentFile(out_bam, "wb", template=infile) @@ -340,8 +325,8 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): outfile.close() # now remove sam file - print_info("ReadMapper", "cleaning up {}".format(out_sam)) + logger.info("Cleaning up '%s'.", str(out_sam)) out_sam.unlink() else: - print_error("ReadMapper", "input must either be a file or folder.") + logger.error("Input must be either a file or a folder.") From 01f0ba49610b367accbc919b660b737edf93e785 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sun, 27 Jun 2021 23:55:28 +0200 Subject: [PATCH 26/48] refactor: add logging to the cli module --- src/coptr/cli.py | 82 +++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 46 deletions(-) diff --git a/src/coptr/cli.py b/src/coptr/cli.py index c299b10..1481162 100644 --- a/src/coptr/cli.py +++ b/src/coptr/cli.py @@ -16,6 +16,7 @@ """ import argparse +import logging import os import os.path import pickle as pkl @@ -29,11 +30,13 @@ from .compute_rel_abun import compute_rel_abun from .coptr_contig import estimate_ptrs_coptr_contig from .coptr_ref import estimate_ptrs_coptr_ref -from .print import print_error, print_info from .read_mapper import ReadMapper from .util import get_fastq_name +logger = logging.getLogger(__name__) + + class ProgramOptions: def __init__(self): parser = argparse.ArgumentParser( @@ -55,15 +58,15 @@ def __init__(self): if len(sys.argv[1:]) < 1: parser.print_help() - exit(1) + sys.exit(2) parser.add_argument("command", type=str, help="Command to run.") args = parser.parse_args(sys.argv[1:2]) if not hasattr(self, args.command): - print_error("Main", "Unrecognized command.", quit=False) + logger.critical("Unrecognized command.") parser.print_help() - exit(1) + sys.exit(2) getattr(self, args.command)() def index(self): @@ -100,7 +103,7 @@ def index(self): if len(sys.argv[2:]) < 1: parser.print_help() - exit(1) + sys.exit(2) args = parser.parse_args(sys.argv[2:]) read_mapper = ReadMapper() @@ -147,7 +150,7 @@ def map(self): if len(sys.argv[2:]) < 1: parser.print_help() - exit(1) + sys.exit(2) args = parser.parse_args(sys.argv[2:]) read_mapper = ReadMapper() @@ -174,7 +177,7 @@ def merge(self): if len(sys.argv[2:]) < 2: parser.print_help() - exit(1) + sys.exit(2) args = vars(parser.parse_args(sys.argv[2:])) in_bams = args["in-bams"] @@ -208,7 +211,7 @@ def extract(self): if len(sys.argv[2:]) < 1: parser.print_help() - exit(1) + sys.exit(2) args = parser.parse_args(sys.argv[2:]) @@ -226,10 +229,7 @@ def extract(self): if os.path.isfile( os.path.join(args.out_folder, get_fastq_name(f) + ".cm.pkl") ): - print_info( - "BamProcessor", - "output for {} already found, skipping".format(fname), - ) + logger.info("Output for '%s' already found, skipping.", fname) continue # don't process the rest of the bam file if we just want to @@ -243,16 +243,15 @@ def extract(self): ) as f: pkl.dump(coverage_maps, f) - print_info( - "BamProcessor", - "found {} reference sequences corresponding to {} genomes".format( - len(ref_sequences), len(ref_genomes) - ), + logger.info( + "Found %d reference sequences corresponding to 
%d genomes.", + len(ref_sequences), + len(ref_genomes), ) if args.check_regex: - print_info("BamProcessor", "reference genome ids:") + logger.info("Reference genome ids:") for ref in sorted(ref_genomes): - print("\t", ref, file=sys.stderr) + logger.info("\t%s", ref) def estimate(self): parser = argparse.ArgumentParser( @@ -300,7 +299,7 @@ def estimate(self): if len(sys.argv[2:]) < 1: parser.print_help() - exit(1) + sys.exit(2) args = parser.parse_args(sys.argv[2:]) sample_ids = set() @@ -314,15 +313,13 @@ def estimate(self): os.mkdir(grouped_coverage_map_folder) if args.restart: - print_info( - "CoPTR", "restarting from files in " + grouped_coverage_map_folder - ) + logger.info("Restarting from files in '%s'.", grouped_coverage_map_folder) for cm_file in sorted(os.listdir(grouped_coverage_map_folder)): _, ext = os.path.splitext(cm_file) if ext != ".pkl": continue - print_info("CoPTR", "checking " + cm_file) + logger.info("Checking '%s'.", cm_file) with open( os.path.join(grouped_coverage_map_folder, cm_file), "rb" ) as f: @@ -339,8 +336,8 @@ def estimate(self): pass else: - print_info("CoPTR", "grouping reads by reference genome") - print_info("CoPTR", "saving to " + grouped_coverage_map_folder) + logger.info("Grouping reads by reference genome.") + logger.info("Saving to '%s':", grouped_coverage_map_folder) # first construct a list of genome_ids for ptr estimates for f in sorted(os.listdir(args.coverage_map_folder)): @@ -349,8 +346,7 @@ def estimate(self): continue fpath = os.path.join(args.coverage_map_folder, f) - print_info("CoPTR", "\tprocessing {}".format(f)) - + logger.info("\t%s", f) with open(fpath, "rb") as file: coverage_maps = pkl.load(file) @@ -382,8 +378,8 @@ def estimate(self): ref_genome_ids.add(ref_id) del coverage_maps - print_info("CoPTR", "done grouping by reference genome") - print_info("CoPTR", "the --restart flag can be used to start from here") + logger.info("Grouping by reference genome: Complete.") + logger.info("The --restart flag can be used to start from here.") sample_ids = sorted(list(sample_ids)) results_ref = estimate_ptrs_coptr_ref( @@ -408,7 +404,7 @@ def estimate(self): if ext != ".csv": out_file += ".csv" - print_info("CoPTR", "writing {}".format(out_file)) + logger.info("Writing '%s'.", out_file) with open(out_file, "w") as f: # write the header @@ -443,15 +439,8 @@ def estimate(self): row[sample_ids.index(result.sample_id)] = str(result.estimate) f.write(",".join(row) + "\n") - print_info("CoPTR", "done!") - print_info( - "CoPTR", "you may remove the folder {}".format(grouped_coverage_map_folder) - ) - # print_info("CoPTR", "cleaning up {}".format(grouped_coverage_map_folder)) - # for ref_id in ref_genome_ids.union(assembly_genome_ids): - # filename = os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl") - # os.remove(filename) - # os.rmdir(grouped_coverage_map_folder) + logger.info("Done.") + logger.info("You may now remove the folder '%s'.", grouped_coverage_map_folder) def count(self): parser = argparse.ArgumentParser( @@ -482,7 +471,7 @@ def count(self): args = parser.parse_args(sys.argv[2:]) - print_info("CoPTR", "computing read counts") + logger.info("Computing read counts.") counts, genome_ids = compute_read_counts( args.coverage_map_folder, args.min_cov, args.min_samples @@ -493,7 +482,7 @@ def count(self): if ext != ".csv": out_file += ".csv" - print_info("CoPTR", "writing {}".format(out_file)) + logger.info("Writing '%s'.", out_file) with open(out_file, "w") as f: # write the header @@ -512,7 +501,7 @@ def count(self): row = 
",".join(row) + "\n" f.write(row) - print_info("CoPTR", "done!") + logger.info("Done.") def rabun(self): parser = argparse.ArgumentParser( @@ -549,7 +538,7 @@ def rabun(self): args = parser.parse_args(sys.argv[2:]) - print_info("CoPTR", "computing relative abundances") + logger.info("Computing relative abundances.") rel_abun, genome_ids = compute_rel_abun( args.coverage_map_folder, args.min_reads, args.min_cov, args.min_samples @@ -560,7 +549,7 @@ def rabun(self): if ext != ".csv": out_file += ".csv" - print_info("CoPTR", "writing {}".format(out_file)) + logger.info("Writing '%s'.", out_file) with open(out_file, "w") as f: # write the header @@ -579,9 +568,10 @@ def rabun(self): row = ",".join(row) + "\n" f.write(row) - print_info("CoPTR", "done!") + logger.info("Done.") def cli(): """Serve as an entry point for command line calls.""" + logging.basicConfig(level="INFO", format="[%(levelname)s] [%(name)s] %(message)s") ProgramOptions() From 8e83f763bf4c1136ecbb953ef4ce7147fc6af814 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Mon, 28 Jun 2021 00:01:15 +0200 Subject: [PATCH 27/48] refactor: add logging to bam_processor module --- src/coptr/bam_processor.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/coptr/bam_processor.py b/src/coptr/bam_processor.py index 4e621e5..3356cfc 100644 --- a/src/coptr/bam_processor.py +++ b/src/coptr/bam_processor.py @@ -24,6 +24,7 @@ import array import bisect +import logging import math import os.path import re @@ -32,10 +33,12 @@ import pysam from scipy.sparse import csr_matrix -from .print import print_info from .read_assigner import ReadAssigner +logger = logging.getLogger(__name__) + + class ReadContainer: """Container to store read positions and metadata.""" @@ -370,7 +373,7 @@ def assign_multimapped_reads(self, read_container, lengths, max_alignments): A dictionary whose key is a sequence id, and value are the read positions along that sequence. """ - print_info("BamProcessor", "determining reference genomes") + logger.info("Determining reference genomes.") genome_ids = set() # sequence_id -> genome_id @@ -389,7 +392,7 @@ def assign_multimapped_reads(self, read_container, lengths, max_alignments): genome_ids, read_container, lengths, ref_seq_genome_id ) - print_info("BamProcessor", "collecting multi-mapped reads") + logger.info("Collecting multi-mapped reads.") genome_ids = sorted(list(genome_ids)) # reads that fail filtering criteria discarded_reads = set() @@ -433,7 +436,7 @@ def assign_multimapped_reads(self, read_container, lengths, max_alignments): if len(data) > 0: X = csr_matrix((data, indices, indptr), shape=(nreads, len(genome_ids))) - print_info("BamProcessor", "assigning multi-mapped reads") + logger.info("Assigning multi-mapped reads.") read_assigner = ReadAssigner(X, prior_counts) assignments = read_assigner.assign_reads() @@ -498,7 +501,7 @@ def process_bam(self, bam_file, max_alignments=10): A dictionary whose key is a reference genome id, and value is a CoverageMap. 
""" - print_info("BamProcessor", "processing {}".format(bam_file)) + logger.info("Processing '%s'.", bam_file) # extract read positions # reads : dict[query_seq_id] -> Read @@ -523,7 +526,7 @@ def process_bam(self, bam_file, max_alignments=10): contig_read_positions = {} contig_lengths = {} - print_info("BamProcessor", "grouping reads by reference genome") + logger.info("Grouping reads by reference genome.") # group by reference genome # for contigs, this will group all contigs together by species for ref in sorted(lengths): @@ -578,8 +581,8 @@ def merge(self, bam_files, out_bam): out_bam : str Location to store the merged bam file. """ - print_info("BamProcessor", "merging bam_files {}".format(bam_files)) - print_info("BamProcessor", "keeping reads with highest alignment score") + logger.info("Merging BAM files %s.", ", ".join(bam_files)) + logger.info("Keeping reads with highest alignment score.") # bam header: SN => LN seq_len = {} # read_id => best_score @@ -611,7 +614,7 @@ def merge(self, bam_files, out_bam): seq_names = sorted(seq_len.keys()) - print_info("BamProcessor", "writing merged file {}".format(out_bam)) + logger.info("Writing merged file '%s'.", out_bam) out = pysam.AlignmentFile(out_bam, self.write_mode, header=header) for bam_file in bam_files: inf = pysam.AlignmentFile(bam_file, self.read_mode) @@ -647,7 +650,7 @@ def merge(self, bam_files, out_bam): out.write(out_aln) inf.close() out.close() - print_info("BamProcessor", "finished writing {}".format(out_bam)) + logger.info("Finished writing '%s'.", out_bam) class CoverageMap: From 038c89d0a8edac74198e38a5c0ec2222178a84f2 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Mon, 28 Jun 2021 00:02:43 +0200 Subject: [PATCH 28/48] refactor: add logging to compute_read_counts module --- src/coptr/compute_read_counts.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/coptr/compute_read_counts.py b/src/coptr/compute_read_counts.py index 10bb635..b846f0e 100644 --- a/src/coptr/compute_read_counts.py +++ b/src/coptr/compute_read_counts.py @@ -22,13 +22,16 @@ """ import os +import logging import pickle as pkl import numpy as np from .coptr_contig import CoPTRContig from .coptr_ref import ReadFilterRef -from .print import print_info + + +logger = logging.getLogger(__name__) def compute_read_counts_from_coverage_maps(coverage_maps, min_cov, min_samples): @@ -81,7 +84,7 @@ def compute_read_counts(coverage_map_folder, min_cov, min_samples): continue fpath = os.path.join(coverage_map_folder, f) - print_info("Count", "\tprocessing {}".format(f)) + logger.info("\t%s", f) with open(fpath, "rb") as file: coverage_maps = pkl.load(file) From b1a9e50446a768b835e7f23c3915ad98c428c95a Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Mon, 28 Jun 2021 00:03:46 +0200 Subject: [PATCH 29/48] refactor: add logging to compute_rel_abun module --- src/coptr/compute_rel_abun.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/coptr/compute_rel_abun.py b/src/coptr/compute_rel_abun.py index 95616b6..57d0825 100644 --- a/src/coptr/compute_rel_abun.py +++ b/src/coptr/compute_rel_abun.py @@ -22,13 +22,16 @@ """ import os +import logging import pickle as pkl import numpy as np from .coptr_contig import CoPTRContig from .coptr_ref import ReadFilterRef -from .print import print_info + + +logger = logging.getLogger(__name__) def compute_rel_abun_from_coverage_maps(coverage_maps, min_reads, min_cov, min_samples): @@ -96,7 +99,7 @@ def compute_rel_abun(coverage_map_folder, min_reads, min_cov, min_samples): continue fpath = os.path.join(coverage_map_folder, f) - print_info("RAbun", "\tprocessing {}".format(f)) + logger.info("\t%s", f) with open(fpath, "rb") as file: coverage_maps = pkl.load(file) From 59b0d6f182e7efae1e039523094b7aa43f1044e1 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Mon, 28 Jun 2021 00:07:18 +0200 Subject: [PATCH 30/48] refactor: add logging to coptr_contig module --- src/coptr/coptr_contig.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/coptr/coptr_contig.py b/src/coptr/coptr_contig.py index 33732be..f4550a2 100644 --- a/src/coptr/coptr_contig.py +++ b/src/coptr/coptr_contig.py @@ -3,6 +3,7 @@ ====================== Estimate peak-to-trough ratios from assemblies. """ +import sys """ This file is part of CoPTR. @@ -21,6 +22,7 @@ along with CoPTR. If not, see . """ +import logging import multiprocessing as mp import os.path import pickle as pkl @@ -30,7 +32,9 @@ import scipy.stats from .poisson_pca import PoissonPCA -from .print import print_error, print_info + + +logger = logging.getLogger(__name__) class CoPTRContigEstimate: @@ -138,7 +142,6 @@ def construct_coverage_matrix(self, coverage_maps): A = [] # find all common contig ids - contig_id_list = [cm.contig_ids for cm in coverage_maps] contig_ids = coverage_maps[0].contig_ids ref_genome = coverage_maps[0].genome_id @@ -148,13 +151,13 @@ def construct_coverage_matrix(self, coverage_maps): for contig_id in sorted(contig_ids): if contig_id not in cm.contig_ids: - print_error( - "CoPTRContig", - "missing contig {} from {} in {}".format( - contig_id, ref_genome, cm.sample_id - ), - quit=True, + logger.critical( + "Missing contig %s from %s in %s", + contig_id, + ref_genome, + cm.sample_id, ) + sys.exit(1) length = cm.get_length(contig_id) if length < 11000: @@ -401,7 +404,7 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): else: tmp = None - print_info("CoPTRContig", "running {}".format(genome_id)) + logger.info("Running %s.", genome_id) A = np.array(A_filtered).T @@ -464,7 +467,7 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): bc = np.flip(bc) binned_counts.append(bc) - print_info("CoPTRContig", "finished {}".format(genome_id)) + logger.info("Finished %s.", genome_id) if return_bins: return estimates, parameters, binned_counts @@ -619,7 +622,7 @@ def estimate_ptrs_coptr_contig( A dictionary with key reference genome id and value a list of CoPTRContigEstimate for that reference genome. 
""" - print_info("CoPTRContig", "checking reference genomes") + logger.info("Checking reference genomes.") coptr_contig_estimates = {} coptr_contig = CoPTRContig(min_reads, min_samples) From 7c5426ace0c2dfc285c578d0eaa02edebc1c0301 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Mon, 28 Jun 2021 00:17:24 +0200 Subject: [PATCH 31/48] refactor: add logging to coptr_ref module --- src/coptr/coptr_ref.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/coptr/coptr_ref.py b/src/coptr/coptr_ref.py index 5b7e1e3..46eca3f 100644 --- a/src/coptr/coptr_ref.py +++ b/src/coptr/coptr_ref.py @@ -3,6 +3,7 @@ ====================== Estimate peak-to-trough ratios using complete reference genomes. """ +import sys """ This file is part of CoPTR. @@ -21,6 +22,7 @@ along with CoPTR. If not, see . """ +import logging import math import multiprocessing as mp import os.path @@ -30,7 +32,8 @@ import scipy.optimize import scipy.stats -from .print import print_error, print_info, print_warning + +logger = logging.getLogger(__name__) class QCResult: @@ -155,13 +158,10 @@ def compute_bin_size(self, genome_length): bin_size = bin_size - (bin_size % 100) if bin_size < 1: - print_error( - "CoPTR-Ref", - "{}\n{}\n{}".format( - "found complete reference genome with <500bp", - "this is probably a mislabeled contig", - "please check your .genomes file", - ), + logger.error( + "Found complete reference genome with <500bp. " + "This is probably due to a mislabeled contig. " + "Please check your .genomes file." ) return bin_size @@ -750,7 +750,7 @@ def estimate_ptr( np.seterr(invalid="warn") if best_f == np.inf: - print_warning("CoPTRRef", "PTR optimization failed") + logger.error("PTR optimization failed.") return np.nan, np.nan, np.nan, np.nan elif not filter_reads: return log2_ptr, ori, ter, -best_f @@ -869,7 +869,7 @@ def estimate_ptrs(self, coverage_maps): if len(sample_ids) == 0: return estimates - print_info("CoPTRRef", "running {}".format(genome_id)) + logger.info("Running %s.", genome_id) # first, compute individiual log2_ptr, ori, ter estimates log2_ptrs = [] @@ -901,7 +901,7 @@ def estimate_ptrs(self, coverage_maps): estimates[i].estimate = log2_ptrs[n] n += 1 - print_info("CoPTRRef", "finished {}".format(genome_id)) + logger.info("Finished %s.", genome_id) return estimates @@ -915,7 +915,11 @@ def _parallel_helper(self, x): def plot_fit( coptr_ref_est, read_positions, genome_length, min_reads, min_cov, plot_folder ): - import matplotlib.pyplot as plt + try: + import matplotlib.pyplot as plt + except ModuleNotFoundError: + logger.critical("Please install matplotlib to enable plotting.") + raise coptr = CoPTRRef(min_reads, min_cov) rf = ReadFilterRef(min_reads, min_cov) @@ -1044,7 +1048,7 @@ def estimate_ptrs_coptr_ref( A dictionary with key reference genome id and value a list of CoPTRRefEstimate for that reference genome. """ - print_info("CoPTRRef", "checking reference genomes") + logger.info("Checking reference genomes.") coptr_ref_estimates = {} coptr_ref = CoPTRRef(min_reads, min_cov) From cd6d496278707a8a3967e3391a1a0fac32fdc312 Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Mon, 28 Jun 2021 00:20:29 +0200 Subject: [PATCH 32/48] refactor: add logging to poisson_pca module --- src/coptr/poisson_pca.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/coptr/poisson_pca.py b/src/coptr/poisson_pca.py index bf35010..c5add7a 100644 --- a/src/coptr/poisson_pca.py +++ b/src/coptr/poisson_pca.py @@ -21,16 +21,19 @@ along with CoPTR. If not, see . """ -import sys +import logging import numpy as np import scipy.optimize +logger = logging.getLogger(__name__) + + class PoissonPCA: """Principal Component Analysis with Poisson observations.""" - def pca(self, X, k, tol=1e-3, verbose=False): + def pca(self, X, k, tol=1e-3): """Run the PCA. For a data matrix X (sample by observations), finds two matrices W and V such that: @@ -65,18 +68,13 @@ def pca(self, X, k, tol=1e-3, verbose=False): vh = vh[:k, :] W = u * s V = vh - # W = np.random.normal(loc=0, scale=0.1, size=(X.shape[0], k)) - # V = np.random.normal(loc=0, scale=0.1, size=(k, X.shape[1])) - # W = np.zeros((X.shape[0], k)) - # V = np.zeros((k, X.shape[1])) it = 0 prv_log_lk = -np.inf log_lk = self.log_likelihood(X, W, V) while np.abs(prv_log_lk - log_lk) > tol: - if verbose: - print("iteration:", it, "log likelihood:", log_lk, file=sys.stderr) + logger.debug("Iteration: %d; Log likelihood: %.3G", it, log_lk) prv_log_lk = log_lk W = self.update_W(X, W, V) From 5b31ebf231ede2b7f3d4db6dd53144b96907b992 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Mon, 28 Jun 2021 00:23:33 +0200 Subject: [PATCH 33/48] refactor: add logging to read_assigner module --- src/coptr/read_assigner.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/coptr/read_assigner.py b/src/coptr/read_assigner.py index 4b229d4..3a3d58c 100644 --- a/src/coptr/read_assigner.py +++ b/src/coptr/read_assigner.py @@ -22,6 +22,8 @@ along with CoPTR. If not, see . """ +import logging + import numpy as np import scipy.misc import scipy.special @@ -30,6 +32,9 @@ from scipy.special import digamma +logger = logging.getLogger(__name__) + + class ReadAssigner(object): """Assign multi-mapped reads using a mixture model. @@ -89,15 +94,14 @@ def update_eta(self): """Optimize eta with fixed phi.""" self.eta = np.array(self.alpha + self.phi.sum(axis=0))[0] - def run_vi(self, tol=1e-3, verbose=False): + def run_vi(self, tol=1e-3): """Optimize variational parameters with respect to elbo.""" prv_elbo = -np.inf elbo = self.compute_elbo() it = 0 while np.abs(prv_elbo - elbo) > tol: - if verbose: - print("it:", it, "elbo:", elbo) + logger.debug("Iteration: %d; elbo: %.3G", it, elbo) self.update_eta() self.update_phi() prv_elbo = elbo @@ -108,8 +112,7 @@ def run_vi(self, tol=1e-3, verbose=False): elbo >= prv_elbo - 1e-3 ), "elbo must be strictly increasing ({} > {})".format(elbo, prv_elbo) - if verbose: - print("it:", it, "elbo:", elbo) + logger.debug("Iteration: %d; elbo: %.3G", it, elbo) def assign_reads(self, verbose=False): """Compute read assignments. From 0783556c4550932c1e48616227777a3776e79452 Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Mon, 28 Jun 2021 00:24:34 +0200 Subject: [PATCH 34/48] refactor: remove superfluous print module --- src/coptr/print.py | 82 ---------------------------------------------- 1 file changed, 82 deletions(-) delete mode 100644 src/coptr/print.py diff --git a/src/coptr/print.py b/src/coptr/print.py deleted file mode 100644 index 1d8a7d2..0000000 --- a/src/coptr/print.py +++ /dev/null @@ -1,82 +0,0 @@ -""" -print.py -====================== -Print error, warning, and info messages. -""" - -""" -This file is part of CoPTR. - -CoPTR is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -CoPTR is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with CoPTR. If not, see . -""" - -import sys -import time - - -def print_error(module, msg, quit=True): - """Print an error message to the standard error. - - Parameters - ---------- - module : str - Which module generated the error message - msg : str - The error message itself - exit : bool - If TRUE, the program terminates after printing the error - """ - print( - "[ERROR] ({}) {}: {}".format(time.strftime("%b %d, %Y %X"), module, msg), - file=sys.stderr, - flush=True, - ) - if quit: - exit(1) - - -def print_warning(module, msg): - """Print a warning message to the standard error. - - Parameters - ---------- - module : str - Which module generated the message - msg : str - The message itself - """ - - print( - "[WARNING] ({}) {}: {}".format(time.strftime("%b %d, %Y %X"), module, msg), - file=sys.stderr, - flush=True, - ) - - -def print_info(module, msg): - """Print an info message to the standard error. - - Parameters - ---------- - module : str - Which module generated the message - msg : str - The message itself - """ - - print( - "[INFO] ({}) {}: {}".format(time.strftime("%b %d, %Y %X"), module, msg), - file=sys.stderr, - flush=True, - ) From f9b65d85011fb279e0a556dee81849f8d3a4ed1d Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Mon, 28 Jun 2021 00:24:56 +0200 Subject: [PATCH 35/48] style: isort imports --- src/coptr/compute_read_counts.py | 2 +- src/coptr/compute_rel_abun.py | 2 +- src/coptr/coptr_contig.py | 1 + src/coptr/coptr_ref.py | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/coptr/compute_read_counts.py b/src/coptr/compute_read_counts.py index b846f0e..950f6be 100644 --- a/src/coptr/compute_read_counts.py +++ b/src/coptr/compute_read_counts.py @@ -21,8 +21,8 @@ along with CoPTR. If not, see . """ -import os import logging +import os import pickle as pkl import numpy as np diff --git a/src/coptr/compute_rel_abun.py b/src/coptr/compute_rel_abun.py index 57d0825..ac56fcb 100644 --- a/src/coptr/compute_rel_abun.py +++ b/src/coptr/compute_rel_abun.py @@ -21,8 +21,8 @@ along with CoPTR. If not, see . """ -import os import logging +import os import pickle as pkl import numpy as np diff --git a/src/coptr/coptr_contig.py b/src/coptr/coptr_contig.py index f4550a2..007f130 100644 --- a/src/coptr/coptr_contig.py +++ b/src/coptr/coptr_contig.py @@ -5,6 +5,7 @@ """ import sys + """ This file is part of CoPTR. 
diff --git a/src/coptr/coptr_ref.py b/src/coptr/coptr_ref.py index 46eca3f..8e5efce 100644 --- a/src/coptr/coptr_ref.py +++ b/src/coptr/coptr_ref.py @@ -5,6 +5,7 @@ """ import sys + """ This file is part of CoPTR. From b598c048f913997b070131afe56db67a619c2007 Mon Sep 17 00:00:00 2001 From: Tyler Joseph Date: Mon, 28 Jun 2021 15:54:17 -0400 Subject: [PATCH 36/48] refactor: '%s' -> %s, add time to logging, calls to format require brackets --- src/coptr/bam_processor.py | 6 +++--- src/coptr/cli.py | 22 +++++++++++++--------- src/coptr/read_mapper.py | 32 ++++++++++++++++---------------- 3 files changed, 32 insertions(+), 28 deletions(-) diff --git a/src/coptr/bam_processor.py b/src/coptr/bam_processor.py index 3356cfc..1dd377f 100644 --- a/src/coptr/bam_processor.py +++ b/src/coptr/bam_processor.py @@ -501,7 +501,7 @@ def process_bam(self, bam_file, max_alignments=10): A dictionary whose key is a reference genome id, and value is a CoverageMap. """ - logger.info("Processing '%s'.", bam_file) + logger.info("Processing %s.", bam_file) # extract read positions # reads : dict[query_seq_id] -> Read @@ -614,7 +614,7 @@ def merge(self, bam_files, out_bam): seq_names = sorted(seq_len.keys()) - logger.info("Writing merged file '%s'.", out_bam) + logger.info("Writing merged file %s.", out_bam) out = pysam.AlignmentFile(out_bam, self.write_mode, header=header) for bam_file in bam_files: inf = pysam.AlignmentFile(bam_file, self.read_mode) @@ -650,7 +650,7 @@ def merge(self, bam_files, out_bam): out.write(out_aln) inf.close() out.close() - logger.info("Finished writing '%s'.", out_bam) + logger.info("Finished writing %s.", out_bam) class CoverageMap: diff --git a/src/coptr/cli.py b/src/coptr/cli.py index 1481162..5fc6734 100644 --- a/src/coptr/cli.py +++ b/src/coptr/cli.py @@ -229,7 +229,7 @@ def extract(self): if os.path.isfile( os.path.join(args.out_folder, get_fastq_name(f) + ".cm.pkl") ): - logger.info("Output for '%s' already found, skipping.", fname) + logger.info("Output for %s already found, skipping.", fname) continue # don't process the rest of the bam file if we just want to @@ -313,13 +313,13 @@ def estimate(self): os.mkdir(grouped_coverage_map_folder) if args.restart: - logger.info("Restarting from files in '%s'.", grouped_coverage_map_folder) + logger.info("Restarting from files in %s.", grouped_coverage_map_folder) for cm_file in sorted(os.listdir(grouped_coverage_map_folder)): _, ext = os.path.splitext(cm_file) if ext != ".pkl": continue - logger.info("Checking '%s'.", cm_file) + logger.info("Checking %s.", cm_file) with open( os.path.join(grouped_coverage_map_folder, cm_file), "rb" ) as f: @@ -337,7 +337,7 @@ def estimate(self): else: logger.info("Grouping reads by reference genome.") - logger.info("Saving to '%s':", grouped_coverage_map_folder) + logger.info("Saving to %s:", grouped_coverage_map_folder) # first construct a list of genome_ids for ptr estimates for f in sorted(os.listdir(args.coverage_map_folder)): @@ -404,7 +404,7 @@ def estimate(self): if ext != ".csv": out_file += ".csv" - logger.info("Writing '%s'.", out_file) + logger.info("Writing %s.", out_file) with open(out_file, "w") as f: # write the header @@ -440,7 +440,7 @@ def estimate(self): f.write(",".join(row) + "\n") logger.info("Done.") - logger.info("You may now remove the folder '%s'.", grouped_coverage_map_folder) + logger.info("You may now remove the folder %s.", grouped_coverage_map_folder) def count(self): parser = argparse.ArgumentParser( @@ -482,7 +482,7 @@ def count(self): if ext != ".csv": out_file += ".csv" 
- logger.info("Writing '%s'.", out_file) + logger.info("Writing %s.", out_file) with open(out_file, "w") as f: # write the header @@ -549,7 +549,7 @@ def rabun(self): if ext != ".csv": out_file += ".csv" - logger.info("Writing '%s'.", out_file) + logger.info("Writing %s.", out_file) with open(out_file, "w") as f: # write the header @@ -573,5 +573,9 @@ def rabun(self): def cli(): """Serve as an entry point for command line calls.""" - logging.basicConfig(level="INFO", format="[%(levelname)s] [%(name)s] %(message)s") + logging.basicConfig( + level="INFO", + format="[%(levelname)s] [%(asctime)s] [%(name)s] %(message)s", + datefmt="%b %d, %Y %H:%M:%S", + ) ProgramOptions() diff --git a/src/coptr/read_mapper.py b/src/coptr/read_mapper.py index 2faaf32..df5a645 100644 --- a/src/coptr/read_mapper.py +++ b/src/coptr/read_mapper.py @@ -91,7 +91,7 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed genomes = Path(f"{index_out}.genomes") logger.info( - "Copying FASTA files to '%s' with prepended genome ids (filenames).", + "Copying FASTA files to %s with prepended genome ids (filenames).", str(sequence_collection), ) @@ -113,7 +113,7 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed line = f">{fname}|{line[1:]}" out_handle.write(line) - logger.info("writing %d reference genome ids to '%s'.", n_genomes, str(genomes)) + logger.info("Writing %d reference genome ids to %s.", n_genomes, str(genomes)) call = [ "bowtie2-build", str(sequence_collection), @@ -138,7 +138,7 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed logger.error("An error occurred while indexing with bowtie2.") logger.info("Detailed information:", exc_info=error) finally: - logger.info("Cleaning up '%s'.", str(sequence_collection)) + logger.info("Cleaning up %s.", str(sequence_collection)) sequence_collection.unlink() def map(self, index, inputf, outfolder, paired, threads, bt2_k): @@ -164,7 +164,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): outfolder = Path(outfolder) if not outfolder.is_dir(): - logger.error("The output directory '%s' does not exist.", str(outfolder)) + logger.error("The output directory %s does not exist.", str(outfolder)) if os.path.isfile(inputf): bn = os.path.basename(inputf) @@ -173,7 +173,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): out_bam = outfolder / f"{bn}.bam" # first map to a sam file with bowtie2 - logger.info("Mapping '%s' to '%s'".format(inputf, str(out_sam))) + logger.info("Mapping {} to {}".format(inputf, str(out_sam))) call = [ "bowtie2", "-x", @@ -185,12 +185,12 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): "-k", bt2_k, ] - logger.debug(" ".join(call)) + logger.info(" ".join(call)) with out_sam.open("w") as out: sub.check_call(call, stdout=out) # then convert to a bam file - logger.info("Converting '%s' to '%s'.".format(str(out_sam), str(out_bam))) + logger.info("Converting {} to {}.".format(str(out_sam), str(out_bam))) infile = pysam.AlignmentFile(out_sam, "r") outfile = pysam.AlignmentFile(out_bam, "wb", template=infile) try: @@ -201,7 +201,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): outfile.close() # now remove sam file - logger.info("Cleaning up '%s'.", str(out_sam)) + logger.info("Cleaning up {}.", str(out_sam)) out_sam.unlink() # single end sequencing @@ -223,7 +223,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): out_bam = outfolder / f"{get_fastq_name(bn)}.bam" # map reads with 
bowtie2 - logger.info("Mapping '%s' to '%s'".format(inputf, str(out_sam))) + logger.info("Mapping {} to {}".format(inputf, str(out_sam))) call = [ "bowtie2", "-x", @@ -235,13 +235,13 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): "-k", bt2_k, ] - logger.debug(" ".join(call)) + logger.info(" ".join(call)) with out_sam.open("w") as out: sub.check_call(call, stdout=out) # then convert to a bam file logger.info( - "Converting '%s' to '%s'.".format(str(out_sam), str(out_bam)) + "Converting {} to {}.".format(str(out_sam), str(out_bam)) ) infile = pysam.AlignmentFile(out_sam, "r") outfile = pysam.AlignmentFile(out_bam, "wb", template=infile) @@ -253,7 +253,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): outfile.close() # now remove sam file - logger.info("Cleaning up '%s'.", str(out_sam)) + logger.info("Cleaning up %s.", str(out_sam)) out_sam.unlink() # paired end sequencing @@ -292,7 +292,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): out_bam = outfolder / f"{get_fastq_name(bn)}.bam" # map reads with bowtie2 - logger.info("Mapping '%s' to '%s'".format(inputf, str(out_sam))) + logger.info("Mapping {} to {}".format(inputf, str(out_sam))) call = [ "bowtie2", "-x", @@ -307,13 +307,13 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): "-k", bt2_k, ] - logger.debug(" ".join(call)) + logger.info(" ".join(call)) with out_sam.open("w") as out: sub.check_call(call, stdout=out) # then convert to a bam file logger.info( - "Converting '%s' to '%s'.".format(str(out_sam), str(out_bam)) + "Converting {} to {}.".format(str(out_sam), str(out_bam)) ) infile = pysam.AlignmentFile(out_sam, "r") outfile = pysam.AlignmentFile(out_bam, "wb", template=infile) @@ -325,7 +325,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): outfile.close() # now remove sam file - logger.info("Cleaning up '%s'.", str(out_sam)) + logger.info("Cleaning up %s.", str(out_sam)) out_sam.unlink() else: From f6fee149e593cd3407b422b1c9e8be62ca950178 Mon Sep 17 00:00:00 2001 From: Tyler Joseph Date: Mon, 28 Jun 2021 15:54:48 -0400 Subject: [PATCH 37/48] docs: updated tutorial with new logging calls --- docs/tutorial.rst | 84 +++++++++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 5fb5ead..a518228 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -59,13 +59,13 @@ Indexing the reference database .. code-block:: text $ coptr index example-data/ref-db example-data/ref-db/example-index - [INFO] (Sep 10, 2020 18:06:41) ReadMapper: found 2 files totaling 0.006 Gb - [INFO] (Sep 10, 2020 18:06:41) ReadMapper: copying to fasta files to coptr-fna-1599775601.629647.fna with prepended genome ids (filenames) - [INFO] (Sep 10, 2020 18:06:41) ReadMapper: writing 2 reference genome ids to example-data/ref-db/example-index.genomes - [INFO] (Sep 10, 2020 18:06:41) ReadMapper: bowtie2-build coptr-fna-1599775601.629647.fna example-data/ref-db/example-index --noref --threads 1 + [INFO] [Jun 28, 2021 15:46:57] [coptr.read_mapper] Found 2 files totaling 0.00611 GB. + [INFO] [Jun 28, 2021 15:46:57] [coptr.read_mapper] Copying FASTA files to coptr-fna-2021-06-28T19:46:57+00:00.fna with prepended genome ids (filenames). + [INFO] [Jun 28, 2021 15:46:57] [coptr.read_mapper] Writing 2 reference genome ids to example-data/ref-db/example-index.genomes. 
+ [INFO] [Jun 28, 2021 15:46:57] [coptr.read_mapper] bowtie2-build coptr-fna-2021-06-28T19:46:57+00:00.fna example-data/ref-db/example-index --threads 1 ...bowtie2 output... - [INFO] (Sep 10, 2020 18:06:44) ReadMapper: indexed 2 fasta files for reference database. - [INFO] (Sep 10, 2020 18:06:44) ReadMapper: cleaning up coptr-fna-1599775601.629647.fna + [INFO] [Jun 28, 2021 15:47:00] [coptr.read_mapper] Indexed 2 FASTA files for the reference database. + [INFO] [Jun 28, 2021 15:47:00] [coptr.read_mapper] Cleaning up coptr-fna-2021-06-28T19:46:57+00:00.fna. @@ -100,13 +100,13 @@ Indexing a database of fasta files can be accomplished by calling control memory usage. It takes two arguments. The first ``ref-fasta`` is either a folder containing -fasta for the database. If it is a folder, coPTR will scan the folder for +fasta for the database. If it is a folder, CoPTR will scan the folder for all files ending in ``.fasta``, ``.fna``, or ``.fa``. CoPTR will combine these into a single fasta file, prepending the filename to each sequence id in order -to track contigs from the same reference genome. It then calls the ```bowtie2-build``` +to track contigs from the same reference genome. It then calls the ``bowtie2-build`` to index this file. -coPTR additionally outputs an ```index_name.genomes``` file with a list of ids for each +coPTR additionally outputs an ``index_name.genomes`` file with a list of ids for each reference genome. Notes on memory usage @@ -123,19 +123,19 @@ Mapping reads .. code-block:: text $ coptr map example-data/ref-db/example-index example-data/fastq example-data/bam - [INFO] (Aug 31, 2020 12:12:10) ReadMapper: mapping example-data/fastq/ERR969281.fastq.gz to example-data/bam/ERR969281.sam - [INFO] (Aug 31, 2020 12:12:10) ReadMapper: bowtie2 -x example-data/ref-db/example-index example-data/fastq/ERR969281.fastq.gz --no-unal -p 1 + [INFO] [Jun 28, 2021 15:48:26] [coptr.read_mapper] Mapping example-data/fastq to example-data/bam/ERR969281.sam + [INFO] [Jun 28, 2021 15:48:26] [coptr.read_mapper] bowtie2 -x example-data/ref-db/example-index example-data/fastq/ERR969281.fastq.gz --no-unal -p 1 -k 10 10818 reads; of these: 10818 (100.00%) were unpaired; of these: 4071 (37.63%) aligned 0 times 6709 (62.02%) aligned exactly 1 time 38 (0.35%) aligned >1 times 62.37% overall alignment rate - [INFO] (Aug 31, 2020 12:12:11) ReadMapper: converting example-data/bam/ERR969281.sam to example-data/bam/ERR969281.bam - [INFO] (Aug 31, 2020 12:12:11) ReadMapper: cleaning up example-data/bam/ERR969281.sam + [INFO] [Jun 28, 2021 15:48:27] [coptr.read_mapper] Converting example-data/bam/ERR969281.sam to example-data/bam/ERR969281.bam. + [INFO] [Jun 28, 2021 15:48:27] [coptr.read_mapper] Cleaning up example-data/bam/ERR969281.sam. .... - [INFO] (Aug 31, 2020 12:12:24) ReadMapper: converting example-data/bam/ERR969285.sam to example-data/bam/ERR969285.bam - [INFO] (Aug 31, 2020 12:12:24) ReadMapper: cleaning up example-data/bam/ERR969285.sam + [INFO] [Jun 28, 2021 15:48:40] [coptr.read_mapper] Converting example-data/bam/ERR969430.sam to example-data/bam/ERR969430.bam. + [INFO] [Jun 28, 2021 15:48:40] [coptr.read_mapper] Cleaning up example-data/bam/ERR969430.sam. Once you have indexed a reference database. You can then map reads against @@ -197,12 +197,12 @@ Computing coverage maps .. 
code-block:: text $ coptr extract example-data/bam example-data/coverage-maps - [INFO] (Jan 18, 2021 10:31:43) BamProcessor: processing example-data/bam/ERR969281.bam - [INFO] (Jan 18, 2021 10:31:43) BamProcessor: determining reference genomes - [INFO] (Jan 18, 2021 10:31:43) BamProcessor: collecting multi-mapped reads - [INFO] (Jan 18, 2021 10:31:43) BamProcessor: grouping reads by reference genome + [INFO] [Jun 28, 2021 15:49:48] [coptr.bam_processor] Processing example-data/bam/ERR969281.bam. + [INFO] [Jun 28, 2021 15:49:48] [coptr.bam_processor] Determining reference genomes. + [INFO] [Jun 28, 2021 15:49:48] [coptr.bam_processor] Collecting multi-mapped reads. + [INFO] [Jun 28, 2021 15:49:48] [coptr.bam_processor] Grouping reads by reference genome. ... - [INFO] (Aug 31, 2020 12:13:56) BamProcessor: found 190 reference sequences corresponding to 2 genomes + [INFO] [Jun 28, 2021 15:50:00] [coptr.cli] Found 190 reference sequences corresponding to 2 genomes. Once reads have been mapped, the next step is to compute the coverage along each reference genome. In this step, starting positions of each read are @@ -251,28 +251,28 @@ Estimating PTRs .. code-block:: text - # coptr estimate example-data/coverage-maps out --min-reads 2500 - [INFO] (Jan 18, 2021 10:36:05) CoPTR: grouping reads by reference genome - [INFO] (Jan 18, 2021 10:36:05) CoPTR: saving to example-data/coverage-maps/coverage-maps-genome - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969281.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969282.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969283.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969285.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969286.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969428.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969429.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969430.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: done grouping by reference genome - [INFO] (Jan 18, 2021 10:36:05) CoPTR: the --restart flag can be used to start from here - [INFO] (Jan 18, 2021 10:36:05) CoPTRRef: checking reference genomes - [INFO] (Jan 18, 2021 10:36:15) CoPTRRef: running l-gasseri-ref - [INFO] (Jan 18, 2021 10:36:16) CoPTRRef: finished l-gasseri-ref - [INFO] (Jan 18, 2021 10:36:16) CoPTRContig: checking reference genomes - [INFO] (Jan 18, 2021 10:36:16) CoPTRContig: running e-coli-mag - [INFO] (Jan 18, 2021 10:36:16) CoPTRContig: finished e-coli-mag - [INFO] (Jan 18, 2021 10:36:16) CoPTR: writing out.csv - [INFO] (Jan 18, 2021 10:36:16) CoPTR: done! - [INFO] (Jan 18, 2021 10:36:16) CoPTR: you may remove the folder example-data/coverage-maps/coverage-maps-genome + $ coptr estimate example-data/coverage-maps out --min-reads 2500 + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] Grouping reads by reference genome. 
+ [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] Saving to example-data/coverage-maps/coverage-maps-genome: + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969281.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969282.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969283.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969285.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969286.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969428.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969429.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969430.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] Grouping by reference genome: Complete. + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] The --restart flag can be used to start from here. + [INFO] [Jun 28, 2021 15:50:15] [coptr.coptr_ref] Checking reference genomes. + [INFO] [Jun 28, 2021 15:50:24] [coptr.coptr_ref] Running l-gasseri-ref. + [INFO] [Jun 28, 2021 15:50:25] [coptr.coptr_ref] Finished l-gasseri-ref. + [INFO] [Jun 28, 2021 15:50:25] [coptr.coptr_contig] Checking reference genomes. + [INFO] [Jun 28, 2021 15:50:25] [coptr.coptr_contig] Running e-coli-mag. + [INFO] [Jun 28, 2021 15:50:25] [coptr.coptr_contig] Finished e-coli-mag. + [INFO] [Jun 28, 2021 15:50:25] [coptr.cli] Writing out.csv. + [INFO] [Jun 28, 2021 15:50:25] [coptr.cli] Done. + [INFO] [Jun 28, 2021 15:50:25] [coptr.cli] You may now remove the folder example-data/coverage-maps/coverage-maps-genome. The final stage is to estimate PTR ratios from coverage maps. This is accomplished with the ``estimate`` command. **It is strongly recommended that you perform this step From 36eb7a84e228297f3aef79d44c0b8f75ab5709e2 Mon Sep 17 00:00:00 2001 From: Tyler Joseph Date: Thu, 1 Jul 2021 09:23:35 -0400 Subject: [PATCH 38/48] updated README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f28093d..fcb875f 100644 --- a/README.md +++ b/README.md @@ -1 +1,3 @@ -Our documentation is on Read The Docs: \ No newline at end of file +CoPTR is a tool for estimating peak-to-trough ratios from metagenomic sequencing. + +You can find our documentation on Read The Docs: \ No newline at end of file From 1b9d82943b56f711902387ed1e99a0f352162bf2 Mon Sep 17 00:00:00 2001 From: Tyler Joseph Date: Fri, 23 Jul 2021 08:22:22 -0400 Subject: [PATCH 39/48] bug fix: updated call to run_vi --- src/coptr/__init__.py | 2 +- src/coptr/read_assigner.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coptr/__init__.py b/src/coptr/__init__.py index a82b376..72f26f5 100644 --- a/src/coptr/__init__.py +++ b/src/coptr/__init__.py @@ -1 +1 @@ -__version__ = "1.1.1" +__version__ = "1.1.2" diff --git a/src/coptr/read_assigner.py b/src/coptr/read_assigner.py index 3a3d58c..dd2e41a 100644 --- a/src/coptr/read_assigner.py +++ b/src/coptr/read_assigner.py @@ -114,7 +114,7 @@ def run_vi(self, tol=1e-3): logger.debug("Iteration: %d; elbo: %.3G", it, elbo) - def assign_reads(self, verbose=False): + def assign_reads(self): """Compute read assignments. Return @@ -123,7 +123,7 @@ def assign_reads(self, verbose=False): A 1D array. Each entry gives a genome id that is the genome assignment for each read. 
""" - self.run_vi(verbose=verbose) + self.run_vi() self.phi = csr_matrix(self.phi) assignments = [] for i in range(self.nreads): From f0bcddcc16b669baeaa1878508898e37d76e9268 Mon Sep 17 00:00:00 2001 From: Tyler Joseph Date: Tue, 17 Aug 2021 15:14:16 -0400 Subject: [PATCH 40/48] catch exception and throw warning when poisson pca fails --- src/coptr/__init__.py | 2 +- src/coptr/coptr_contig.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/coptr/__init__.py b/src/coptr/__init__.py index 72f26f5..0b2f79d 100644 --- a/src/coptr/__init__.py +++ b/src/coptr/__init__.py @@ -1 +1 @@ -__version__ = "1.1.2" +__version__ = "1.1.3" diff --git a/src/coptr/coptr_contig.py b/src/coptr/coptr_contig.py index 007f130..ecb3334 100644 --- a/src/coptr/coptr_contig.py +++ b/src/coptr/coptr_contig.py @@ -413,7 +413,16 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): keep_rows = np.sum(np.isnan(A), axis=1) < 0.025 * A.shape[1] A = A[keep_rows, :] - if keep_rows.sum() < 0.5 * keep_rows.size: + ppca_failed = False + try: + if keep_rows.sum() >= 0.5 * keep_rows.size: + poisson_pca = PoissonPCA() + W, V = poisson_pca.pca(A, k=1) + except: + ppca_failed = True + logger.warn("PoissonPCA failed for assembly " + genome_ids[0]) + + if keep_rows.sum() < 0.5 * keep_rows.size or ppca_failed: for j, col in enumerate(A.T): reads = col[np.isfinite(col)].sum() frac_nonzero = (col[np.isfinite(col)] > 0).sum() / col[ @@ -436,8 +445,6 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): else: return estimates - poisson_pca = PoissonPCA() - W, V = poisson_pca.pca(A, k=1) sorted_idx = np.argsort(W.flatten()) A = A[sorted_idx, :] From f06bb583966c9c749c7bcab430c411d8d7690c69 Mon Sep 17 00:00:00 2001 From: Tyler Joseph Date: Fri, 20 Aug 2021 09:54:22 -0400 Subject: [PATCH 41/48] remove multiprocessing option: memory limits on mp outweigh performance gains from running in parallel --- docs/tutorial.rst | 1 - src/coptr/__init__.py | 2 +- src/coptr/cli.py | 8 ---- src/coptr/coptr_contig.py | 59 +++++++++-------------------- src/coptr/coptr_ref.py | 79 +++++++++++---------------------------- 5 files changed, 39 insertions(+), 110 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index a518228..6678006 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -296,7 +296,6 @@ on all samples at once.** --min-samples MIN_SAMPLES CoPTRContig only. Minimum number of samples required to reorder bins (default 5). - --threads THREADS Number of threads to use (default 1). This combines all coverage maps by species, then estimates PTRs for each species. We have tried to set sensible default parameters for PTR estimatation. We set diff --git a/src/coptr/__init__.py b/src/coptr/__init__.py index 0b2f79d..c72e379 100644 --- a/src/coptr/__init__.py +++ b/src/coptr/__init__.py @@ -1 +1 @@ -__version__ = "1.1.3" +__version__ = "1.1.4" diff --git a/src/coptr/cli.py b/src/coptr/cli.py index 5fc6734..c6c9305 100644 --- a/src/coptr/cli.py +++ b/src/coptr/cli.py @@ -281,12 +281,6 @@ def estimate(self): help="CoPTRContig only. Minimum number of samples required to reorder bins (default 5).", default=5, ) - parser.add_argument( - "--threads", - type=int, - help="Number of threads to use (default 1).", - default=1, - ) parser.add_argument( "--plot", default=None, help="Plot model fit and save the results." 
) @@ -387,7 +381,6 @@ def estimate(self): grouped_coverage_map_folder, args.min_reads, args.min_cov, - threads=args.threads, plot_folder=args.plot, ) results_contig = estimate_ptrs_coptr_contig( @@ -395,7 +388,6 @@ def estimate(self): grouped_coverage_map_folder, args.min_reads, args.min_samples, - threads=args.threads, plot_folder=args.plot, ) diff --git a/src/coptr/coptr_contig.py b/src/coptr/coptr_contig.py index ecb3334..f360a90 100644 --- a/src/coptr/coptr_contig.py +++ b/src/coptr/coptr_contig.py @@ -3,8 +3,6 @@ ====================== Estimate peak-to-trough ratios from assemblies. """ -import sys - """ This file is part of CoPTR. @@ -27,6 +25,8 @@ import multiprocessing as mp import os.path import pickle as pkl +import struct +import sys import numpy as np import scipy.special @@ -604,7 +604,6 @@ def estimate_ptrs_coptr_contig( grouped_coverage_map_folder, min_reads, min_samples, - threads, plot_folder=None, ): """Estimate Peak-to-Trough ratios across samples. @@ -634,46 +633,22 @@ def estimate_ptrs_coptr_contig( coptr_contig_estimates = {} coptr_contig = CoPTRContig(min_reads, min_samples) - if threads == 1: - for ref_id in sorted(assembly_genome_ids): - with open( - os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb" - ) as f: - coverage_maps = load_coverage_maps(f, ref_id) + for ref_id in sorted(assembly_genome_ids): + with open( + os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb" + ) as f: + coverage_maps = load_coverage_maps(f, ref_id) - if plot_folder is not None: - estimates, parameters, reordered_bins = coptr_contig.estimate_ptrs( - coverage_maps, return_bins=True - ) - plot_fit( - estimates, parameters, coverage_maps, reordered_bins, plot_folder - ) - else: - estimates = coptr_contig.estimate_ptrs(coverage_maps) - - coptr_contig_estimates[ref_id] = estimates - - else: - # workers for multiprocessing - pool = mp.Pool(threads) - coverage_maps = {} - - for i, ref_id in enumerate(sorted(assembly_genome_ids)): - with open( - os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb" - ) as f: - coverage_maps[ref_id] = load_coverage_maps(f, ref_id) - - if (i % threads == 0 and i > 0) or i == len(assembly_genome_ids) - 1: - flat_coverage_maps = [ - (ref_id, coverage_maps[ref_id], plot_folder) - for ref_id in coverage_maps - ] - flat_results = pool.map( - coptr_contig._parallel_helper, flat_coverage_maps - ) - coptr_contig_estimates.update(dict(flat_results)) + if plot_folder is not None: + estimates, parameters, reordered_bins = coptr_contig.estimate_ptrs( + coverage_maps, return_bins=True + ) + plot_fit( + estimates, parameters, coverage_maps, reordered_bins, plot_folder + ) + else: + estimates = coptr_contig.estimate_ptrs(coverage_maps) - coverage_maps = {} + coptr_contig_estimates[ref_id] = estimates return coptr_contig_estimates diff --git a/src/coptr/coptr_ref.py b/src/coptr/coptr_ref.py index 8e5efce..a8f46b7 100644 --- a/src/coptr/coptr_ref.py +++ b/src/coptr/coptr_ref.py @@ -3,8 +3,6 @@ ====================== Estimate peak-to-trough ratios using complete reference genomes. """ -import sys - """ This file is part of CoPTR. 
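The rationale in this commit message, that memory limits on multiprocessing outweigh the performance gains, follows from how ``multiprocessing.Pool`` dispatches work: every argument is pickled and copied into a child process, so the pooled code paths deleted here could hold coverage maps for up to ``threads`` genomes in memory at once, while the serial loops that replace them load a single ``<ref_id>.cm.pkl`` at a time. A toy illustration of the copying behavior (assumed sizes and names, not CoPTR code):

    import multiprocessing as mp


    def estimate(coverage_map):
        # stand-in for estimate_ptrs; the argument was pickled and
        # copied into this worker process before the call
        return len(coverage_map["read_positions"])


    if __name__ == "__main__":
        # pretend each genome carries one million read positions
        maps = [{"read_positions": list(range(1_000_000))} for _ in range(4)]

        with mp.Pool(4) as pool:
            # parallel: each worker holds its own unpickled copy of a map
            print(pool.map(estimate, maps))

        # serial: the same work with no per-worker copies
        print([estimate(m) for m in maps])

When the per-genome pickles are loaded from disk one at a time, the serial version keeps peak memory near a single genome's coverage maps, at the cost of the speedup the commit message weighs against it.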
@@ -28,6 +26,8 @@ import multiprocessing as mp import os.path import pickle as pkl +import struct +import sys import numpy as np import scipy.optimize @@ -1018,7 +1018,6 @@ def estimate_ptrs_coptr_ref( grouped_coverage_map_folder, min_reads, min_cov, - threads, plot_folder=None, mem_limit=None, ): @@ -1053,60 +1052,24 @@ def estimate_ptrs_coptr_ref( coptr_ref_estimates = {} coptr_ref = CoPTRRef(min_reads, min_cov) - if threads > 1: - # workers for multiprocessing - pool = mp.Pool(threads) - coverage_maps = {} - - for i, ref_id in enumerate(sorted(ref_genome_ids)): - with open( - os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb" - ) as f: - coverage_maps[ref_id] = load_coverage_maps(f, ref_id) - - if (i % threads == 0 and i > 0) or i == len(ref_genome_ids) - 1: - flat_coverage_maps = [ - (ref_id, coverage_maps[ref_id]) for ref_id in coverage_maps - ] - flat_results = pool.map(coptr_ref._parallel_helper, flat_coverage_maps) - coptr_ref_estimates.update(dict(flat_results)) - - for ref_id in sorted(coverage_maps): - for sample, est in zip( - coverage_maps[ref_id], coptr_ref_estimates[ref_id] - ): - # only plot samples with a PTR estimate - if plot_folder is not None and not np.isnan(est.estimate): - plot_fit( - est, - sample.read_positions, - sample.length, - min_reads, - min_cov, - plot_folder, - ) - - coverage_maps = {} - - else: - for ref_id in sorted(ref_genome_ids): - with open( - os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb" - ) as f: - coverage_maps = load_coverage_maps(f, ref_id) - - coptr_ref_estimates[ref_id] = coptr_ref.estimate_ptrs(coverage_maps) - - for sample, est in zip(coverage_maps, coptr_ref_estimates[ref_id]): - # only plot samples with a PTR estimate - if plot_folder is not None and not np.isnan(est.estimate): - plot_fit( - est, - sample.read_positions, - sample.length, - min_reads, - min_cov, - plot_folder, - ) + for ref_id in sorted(ref_genome_ids): + with open( + os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb" + ) as f: + coverage_maps = load_coverage_maps(f, ref_id) + + coptr_ref_estimates[ref_id] = coptr_ref.estimate_ptrs(coverage_maps) + + for sample, est in zip(coverage_maps, coptr_ref_estimates[ref_id]): + # only plot samples with a PTR estimate + if plot_folder is not None and not np.isnan(est.estimate): + plot_fit( + est, + sample.read_positions, + sample.length, + min_reads, + min_cov, + plot_folder, + ) return coptr_ref_estimates From ecc79a6f717273ed55eec8c1e661421e94614051 Mon Sep 17 00:00:00 2001 From: Tyler Joseph Date: Fri, 27 Aug 2021 19:08:13 -0400 Subject: [PATCH 42/48] updated docs --- docs/modules.rst | 3 --- docs/tutorial.rst | 27 +++++++++++++-------------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/docs/modules.rst b/docs/modules.rst index 738633f..c035a51 100644 --- a/docs/modules.rst +++ b/docs/modules.rst @@ -13,9 +13,6 @@ Modules .. automodule:: src.coptr.poisson_pca :members: -.. automodule:: src.coptr.print - :members: - .. automodule:: src.coptr.read_mapper :members: diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 6678006..f930324 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -282,20 +282,19 @@ on all samples at once.** usage: coptr estimate [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--threads THREADS] coverage-map-folder out-file - positional arguments: - coverage_map_folder Folder with coverage maps computed from 'extract'. - out_file Filename to store PTR table. 
-
-   optional arguments:
-     -h, --help            show this help message and exit
-     --min-reads MIN_READS
-                           Minimum number of reads required to compute a PTR
-                           (default 5000).
-     --min-cov MIN_COV     Fraction of nonzero 10Kb bins required to compute a
-                           PTR (default 0.75).
-     --min-samples MIN_SAMPLES
-                           CoPTRContig only. Minimum number of samples required
-                           to reorder bins (default 5).
+
+   positional arguments:
+     coverage_map_folder   Folder with coverage maps computed from 'extract'.
+     out_file              Filename to store PTR table.
+
+   optional arguments:
+     -h, --help            show this help message and exit
+     --min-reads MIN_READS
+                           Minimum number of reads required to compute a PTR (default 5000).
+     --min-cov MIN_COV     Fraction of nonzero bins required to compute a PTR (default 0.75).
+     --min-samples MIN_SAMPLES
+                           CoPTRContig only. Minimum number of samples required to reorder bins (default 5).
+     --plot PLOT           Plot model fit and save the results.
+     --restart             Restarts the estimation step using the genomes in the coverage-maps-genome folder.
 
 This combines all coverage maps by species, then estimates PTRs for each species.
 We have tried to set sensible default parameters for PTR estimation. We set

From 3a937bba6db7eb279cbe5d531c32cc2df32c6b2f Mon Sep 17 00:00:00 2001
From: Tyler Joseph
Date: Fri, 27 Aug 2021 19:16:04 -0400
Subject: [PATCH 43/48] updated docs

---
 docs/tutorial.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index f930324..db7b53c 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -280,7 +280,8 @@ on all samples at once.**
 
 .. code-block:: text
 
-    usage: coptr estimate [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--threads THREADS] coverage-map-folder out-file
+    usage: coptr estimate [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] [--threads THREADS] [--plot OUTFOLDER] [--restart] coverage-map-folder out-file
+
 
    positional arguments:
      coverage_map_folder   Folder with coverage maps computed from 'extract'.
      out_file              Filename to store PTR table.
 
    optional arguments:
      -h, --help            show this help message and exit
      --min-reads MIN_READS
                            Minimum number of reads required to compute a PTR (default 5000).
      --min-cov MIN_COV     Fraction of nonzero bins required to compute a PTR (default 0.75).
      --min-samples MIN_SAMPLES
                            CoPTRContig only. Minimum number of samples required to reorder bins (default 5).
-     --plot PLOT           Plot model fit and save the results.
+     --plot OUTFOLDER      Plot model fit for each PTR.
      --restart             Restarts the estimation step using the genomes in the coverage-maps-genome folder.
 
 This combines all coverage maps by species, then estimates PTRs for each species.

From 9ece9ccf73711f258f82113735c57540da353c85 Mon Sep 17 00:00:00 2001
From: Tyler Joseph
Date: Fri, 27 Aug 2021 19:17:03 -0400
Subject: [PATCH 44/48] better help message for the --plot argument

---
 src/coptr/cli.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/coptr/cli.py b/src/coptr/cli.py
index c6c9305..2afc067 100644
--- a/src/coptr/cli.py
+++ b/src/coptr/cli.py
@@ -282,7 +282,10 @@ def estimate(self):
             default=5,
         )
         parser.add_argument(
-            "--plot", default=None, help="Plot model fit and save the results."
+            "--plot",
+            metavar="OUTFOLDER",
+            default=None,
+            help="Plot model fit for each PTR."
        )
        parser.add_argument(
            "--restart",

From e13eb68f8ab337c488e1b6e2418c9409bf5559ea Mon Sep 17 00:00:00 2001
From: Tyler Joseph
Date: Sun, 10 Oct 2021 09:40:09 -0400
Subject: [PATCH 45/48] relaxed filtering criteria for CoPTR-Contig

---
 src/coptr/__init__.py     | 2 +-
 src/coptr/coptr_contig.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/coptr/__init__.py b/src/coptr/__init__.py
index c72e379..9b102be 100644
--- a/src/coptr/__init__.py
+++ b/src/coptr/__init__.py
@@ -1 +1 @@
-__version__ = "1.1.4"
+__version__ = "1.1.5"

diff --git a/src/coptr/coptr_contig.py b/src/coptr/coptr_contig.py
index f360a90..f1d04c4 100644
--- a/src/coptr/coptr_contig.py
+++ b/src/coptr/coptr_contig.py
@@ -415,14 +415,15 @@ def estimate_ptrs(self, coverage_maps, return_bins=False):
 
         ppca_failed = False
         try:
-            if keep_rows.sum() >= 0.5 * keep_rows.size:
+            if keep_rows.sum() >= self.min_samples:
                 poisson_pca = PoissonPCA()
                 W, V = poisson_pca.pca(A, k=1)
         except:
             ppca_failed = True
             logger.warn("PoissonPCA failed for assembly " + genome_ids[0])
 
-        if keep_rows.sum() < 0.5 * keep_rows.size or ppca_failed:
+
+        if keep_rows.sum() < self.min_samples or ppca_failed:
             for j, col in enumerate(A.T):
                 reads = col[np.isfinite(col)].sum()
                 frac_nonzero = (col[np.isfinite(col)] > 0).sum() / col[

From 68f6659f332d183e5335c02595592f3bc113059d Mon Sep 17 00:00:00 2001
From: Tyler Joseph
Date: Thu, 7 Apr 2022 18:49:34 -0400
Subject: [PATCH 46/48] bug fix: invalid format string for logger

---
 src/coptr/__init__.py    | 2 +-
 src/coptr/read_mapper.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/coptr/__init__.py b/src/coptr/__init__.py
index 9b102be..1436d8f 100644
--- a/src/coptr/__init__.py
+++ b/src/coptr/__init__.py
@@ -1 +1 @@
-__version__ = "1.1.5"
+__version__ = "1.1.6"

diff --git a/src/coptr/read_mapper.py b/src/coptr/read_mapper.py
index df5a645..b4e2d78 100644
--- a/src/coptr/read_mapper.py
+++ b/src/coptr/read_mapper.py
@@ -201,7 +201,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k):
                 outfile.close()
 
                 # now remove sam file
-                logger.info("Cleaning up {}.", str(out_sam))
+                logger.info("Cleaning up %s.", str(out_sam))
                 out_sam.unlink()
 
         # single end sequencing

From e826cd8c04890583048bbbaa52a98b706f405502 Mon Sep 17 00:00:00 2001
From: Tyler Joseph
Date: Fri, 12 Feb 2021 13:01:17 -0500
Subject: [PATCH 47/48] output oriC predictions for complete reference genomes; output bin order for assemblies

---
 src/coptr/cli.py            | 26 ++++++++++++++-
 src/coptr/coptr_contig.py   | 56 ++++++++++++++++++++++-----------
 src/coptr/coptr_ref.py      | 63 ++++++++++++++++++++++++++++++-------
 test/test_bam_processor.py  |  2 +-
 test/test_coordinate_map.py | 50 +++++++++++++++++++++++++++++
 test/test_coptr.py          |  6 ++--
 6 files changed, 168 insertions(+), 35 deletions(-)
 create mode 100644 test/test_coordinate_map.py

diff --git a/src/coptr/cli.py b/src/coptr/cli.py
index 2afc067..1c383fb 100644
--- a/src/coptr/cli.py
+++ b/src/coptr/cli.py
@@ -293,6 +293,12 @@ def estimate(self):
             action="store_true",
             help="Restarts the estimation step using the genomes in the coverage-maps-genome folder.",
         )
+        parser.add_argument(
+            "--oriC",
+            default=False,
+            action="store_true",
+            help="Compute oriC estimates."
+ ) if len(sys.argv[2:]) < 1: parser.print_help() @@ -386,7 +392,7 @@ def estimate(self): args.min_cov, plot_folder=args.plot, ) - results_contig = estimate_ptrs_coptr_contig( + results_contig, ref_id_to_bin_coords = estimate_ptrs_coptr_contig( assembly_genome_ids, grouped_coverage_map_folder, args.min_reads, @@ -434,6 +440,24 @@ def estimate(self): row[sample_ids.index(result.sample_id)] = str(result.estimate) f.write(",".join(row) + "\n") + if args.oriC: + oric_file = os.path.splitext(out_file)[0] + "-oriC.csv" + with open(oric_file, "w") as f: + f.write("genome_id,oriC\n") + + for genome_id in sorted(results_ref): + orics = [ + result.ori_estimate_coord + for result in results_ref[genome_id] + if np.isfinite(result.ori_estimate_coord) + ] + oric_est = np.mean(orics) + f.write("{},{}\n".format(genome_id, oric_est)) + + for genome_id in sorted(ref_id_to_bin_coords): + bin_coords = ref_id_to_bin_coords[genome_id] + f.write("{},{}\n".format(genome_id, "\t".join(bin_coords))) + logger.info("Done.") logger.info("You may now remove the folder %s.", grouped_coverage_map_folder) diff --git a/src/coptr/coptr_contig.py b/src/coptr/coptr_contig.py index f1d04c4..e16587f 100644 --- a/src/coptr/coptr_contig.py +++ b/src/coptr/coptr_contig.py @@ -146,6 +146,9 @@ def construct_coverage_matrix(self, coverage_maps): contig_ids = coverage_maps[0].contig_ids ref_genome = coverage_maps[0].genome_id + bin_coords = [] + contigs_seen = set() + for cm in coverage_maps: binned_reads = [] @@ -161,17 +164,28 @@ def construct_coverage_matrix(self, coverage_maps): sys.exit(1) length = cm.get_length(contig_id) + bin_size = cm.compute_bin_size() if length < 11000: continue else: - binned_reads.append(cm.bin_reads(contig_id)) + bins = cm.bin_reads(contig_id) + binned_reads.append(bins) + + if contig_id not in contigs_seen: + pos = 0 + for b in bins: + bin_coords.append(contig_id + "-{}".format(int(pos))) + pos += bin_size + contigs_seen.add(contig_id) row = np.concatenate(binned_reads) A.append(row) # bin by sample A = np.array(A).T - return A + bin_coords = np.array(bin_coords) + + return A, bin_coords def estimate_slope_intercept(self, bin_log_probs): """Estimate the slope along reordering bins using least squares. 
@@ -343,7 +357,7 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): genome_id = coverage_maps[0].genome_id # bin by sample - A = self.construct_coverage_matrix(passing_coverage_maps) + A, bin_coords = self.construct_coverage_matrix(passing_coverage_maps) nbins = A.shape[0] # filter out low coverage and high coverage bins, @@ -412,6 +426,7 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): # filter out rows where too many bins are missing keep_rows = np.sum(np.isnan(A), axis=1) < 0.025 * A.shape[1] A = A[keep_rows, :] + bin_coords = bin_coords[keep_rows] ppca_failed = False try: @@ -448,7 +463,9 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): sorted_idx = np.argsort(W.flatten()) A = A[sorted_idx, :] + bin_coords = bin_coords[sorted_idx] + flipped_bins = False for j, col in enumerate(A.T): col = col[np.isfinite(col)] reads = col.sum() @@ -473,15 +490,19 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): bc = col if flip: + flipped_bins = True bc = np.flip(bc) binned_counts.append(bc) + if flipped_bins: + bin_coords = np.flip(bin_coords) + logger.info("Finished %s.", genome_id) if return_bins: - return estimates, parameters, binned_counts + return estimates, parameters, binned_counts, bin_coords else: - return estimates + return estimates, bin_coords def _parallel_helper(self, x): ref_genome = x[0] @@ -489,14 +510,14 @@ def _parallel_helper(self, x): plot_folder = x[2] if plot_folder is not None: - estimates, parameters, reordered_bins = self.estimate_ptrs( + estimates, parameters, reordered_bins, bin_coords = self.estimate_ptrs( coverage_maps, return_bins=True ) plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder) else: - estimates = self.estimate_ptrs(coverage_maps) + estimates, bin_coords = self.estimate_ptrs(coverage_maps) - return (ref_genome, estimates) + return (ref_genome, estimates), bin_coords def plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder): @@ -634,22 +655,21 @@ def estimate_ptrs_coptr_contig( coptr_contig_estimates = {} coptr_contig = CoPTRContig(min_reads, min_samples) + ref_id_to_bin_coords = {} + for ref_id in sorted(assembly_genome_ids): - with open( - os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb" - ) as f: + with open(os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb") as f: coverage_maps = load_coverage_maps(f, ref_id) if plot_folder is not None: - estimates, parameters, reordered_bins = coptr_contig.estimate_ptrs( - coverage_maps, return_bins=True - ) - plot_fit( - estimates, parameters, coverage_maps, reordered_bins, plot_folder + estimates, parameters, reordered_bins, bin_coords = ( + coptr_contig.estimate_ptrs(coverage_maps, return_bins=True) ) + plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder) else: - estimates = coptr_contig.estimate_ptrs(coverage_maps) + estimates, bin_coords = coptr_contig.estimate_ptrs(coverage_maps) coptr_contig_estimates[ref_id] = estimates + ref_id_to_bin_coords[ref_id] = bin_coords - return coptr_contig_estimates + return coptr_contig_estimates, ref_id_to_bin_coords diff --git a/src/coptr/coptr_ref.py b/src/coptr/coptr_ref.py index a8f46b7..18029f4 100644 --- a/src/coptr/coptr_ref.py +++ b/src/coptr/coptr_ref.py @@ -67,6 +67,27 @@ def __repr__(self): return self.__str__() +class CoordinateMap: + """Map post-filtering genomic coordinates to pre-filtering + genomic coordinates. 
+ """ + + def __init__(self): + self.break_points = [] + + def update_break_points(self, break_point, shift_size): + self.break_points.append((break_point, shift_size)) + + def translate(self, filtered_coord): + unfiltered_coord = filtered_coord + for break_point, shift_size in self.break_points: + if unfiltered_coord >= break_point: + unfiltered_coord += shift_size + return unfiltered_coord + + + + class ReadFilterRef: """Read filtering steps for CoPTR Ref. @@ -111,10 +132,11 @@ def filter_reads(self, read_positions, genome_length): QCResult(-1, len(read_positions), -1, False), ) + coord_map = CoordinateMap() bin_size = self.compute_bin_size(genome_length) read_positions = np.copy(read_positions) filtered_read_positions, filtered_genome_length = self.filter_reads_phase1( - read_positions, genome_length, bin_size + read_positions, genome_length, bin_size, coord_map ) # more than 25% of the genome is removed frac_removed = 1 - filtered_genome_length / genome_length @@ -126,17 +148,16 @@ def filter_reads(self, read_positions, genome_length): ) filtered_read_positions, filtered_genome_length = self.filter_reads_phase2( - filtered_read_positions, filtered_genome_length, bin_size + filtered_read_positions, filtered_genome_length, bin_size, coord_map ) qc_result = self.quality_check( filtered_read_positions, filtered_genome_length, - genome_length, - bin_size, + genome_length, bin_size, self.min_reads, - self.min_cov, + self.min_cov ) - return filtered_read_positions, filtered_genome_length, qc_result + return filtered_read_positions, filtered_genome_length, qc_result, coord_map def compute_bin_size(self, genome_length): """Compute bin size for read counts. @@ -303,7 +324,8 @@ def compute_rolling_sum(self, read_positions, genome_length, bin_size): return rolling_counts, endpoints - def remove_reads_by_region(self, read_positions, genome_length, regions): + + def remove_reads_by_region(self, read_positions, genome_length, regions, coord_map): """Remove reads that overlap a region in regions. Parameters @@ -340,9 +362,12 @@ def remove_reads_by_region(self, read_positions, genome_length, regions): read_positions = read_positions[keep] read_positions[read_positions > (right - adjustment)] -= remove_size + coord_map.update_break_points(right - remove_size, remove_size) + return read_positions, new_genome_length - def filter_reads_phase1(self, read_positions, genome_length, bin_size): + + def filter_reads_phase1(self, read_positions, genome_length, bin_size, coord_map): """A coarse-grained genomewide filter that removes reads in ultra-high or ultra-low coverage regions. @@ -393,10 +418,15 @@ def filter_reads_phase1(self, read_positions, genome_length, bin_size): read_positions, genome_length, remove_regions ) + read_positions, new_genome_length = self.remove_reads_by_region( + read_positions, genome_length, remove_regions, coord_map + ) + binned_reads = self.bin_reads(read_positions, genome_length, bin_size) return read_positions, new_genome_length - def filter_reads_phase2(self, read_positions, genome_length, bin_size): + + def filter_reads_phase2(self, read_positions, genome_length, bin_size, coord_map): """A fine-grained filter that removes reads in localized regions with too-high or too-low coverage. For each 10Kb, looks 6.25% of the genome length ahead and 6.25% of the genome length behind. 
@@ -469,7 +499,7 @@ def filter_reads_phase2(self, read_positions, genome_length, bin_size): remove_regions.append((remove_start, remove_end)) read_positions, new_genome_length = self.remove_reads_by_region( - read_positions, genome_length, remove_regions + read_positions, genome_length, remove_regions, coord_map ) return read_positions, new_genome_length @@ -532,6 +562,7 @@ def __init__( ter_estimate, nreads, cov_frac, + ori_estimate_coord ): self.bam_file = bam_file self.genome_id = genome_id @@ -541,6 +572,7 @@ def __init__( self.ter_estimate = ter_estimate self.nreads = nreads self.cov_frac = cov_frac + self.ori_estimate_coord = ori_estimate_coord def __str__(self): return "CoPTRRefEstimate(bam_file={}, genome_id={}, sample_id={}, estimate={:.3f}, nreads={}, cov_frac={})".format( @@ -688,7 +720,7 @@ def estimate_ptr( """ if filter_reads: rf = ReadFilterRef(self.min_reads, self.min_cov) - read_positions, ref_genome_len, qc_result = rf.filter_reads( + read_positions, ref_genome_len, qc_result, coord_map = rf.filter_reads( read_positions, ref_genome_len ) if not qc_result.passed_qc: @@ -844,11 +876,15 @@ def estimate_ptrs(self, coverage_maps): sample_ids = [] read_positions_list = [] lengths = [] + filtered_genomes_lengths = [] + coord_maps = [] genome_id = coverage_maps[0].genome_id for cm in coverage_maps: - read_positions, ref_genome_len, qc_result = rf.filter_reads( + read_positions, ref_genome_len, qc_result, coord_map = rf.filter_reads( cm.read_positions, cm.length ) + coord_maps.append(coord_map) + filtered_genomes_lengths.append(ref_genome_len) if qc_result.passed_qc: read_positions_list.append(np.array(read_positions)) lengths.append(ref_genome_len) @@ -864,6 +900,7 @@ def estimate_ptrs(self, coverage_maps): np.nan, qc_result.nreads, qc_result.frac_nonzero, + np.nan ) ) @@ -896,10 +933,12 @@ def estimate_ptrs(self, coverage_maps): estimates[i].ori_estimate = ori estimates[i].ter_estimate = ter estimates[i].estimate = np.nan + estimates[i].ori_estimate_coord = coord_maps[i].translate(ori*filtered_genomes_lengths[i]) else: estimates[i].ori_estimate = ori estimates[i].ter_estimate = ter estimates[i].estimate = log2_ptrs[n] + estimates[i].ori_estimate_coord = coord_maps[i].translate(ori*filtered_genomes_lengths[i]) n += 1 logger.info("Finished %s.", genome_id) diff --git a/test/test_bam_processor.py b/test/test_bam_processor.py index 76f60b9..cf71312 100644 --- a/test/test_bam_processor.py +++ b/test/test_bam_processor.py @@ -3,7 +3,7 @@ import pysam -from src.coptr.bam_processor import BamProcessor +from coptr.bam_processor import BamProcessor def read_to_dict(read): diff --git a/test/test_coordinate_map.py b/test/test_coordinate_map.py new file mode 100644 index 0000000..cbb3d3e --- /dev/null +++ b/test/test_coordinate_map.py @@ -0,0 +1,50 @@ +import numpy as np +import unittest + +from coptr.coptr_ref import CoordinateMap +from coptr.coptr_ref import ReadFilterRef + + +class TestCoordinateMap(unittest.TestCase): + + def test_coordinate_map1(self): + rf = ReadFilterRef(0, 0) + + coord = 3 + read_positions = np.array([coord]) + genome_length = 10 + regions = [(0, 1)] + + read_positions, new_genome_length = rf.remove_reads_by_region(read_positions, genome_length, regions) + + coord_map = CoordinateMap() + coord_map.update_break_points(regions[0][1], regions[0][1] - regions[0][0]) + + unfiltered_coord = coord_map.translate(read_positions[0]) + + self.assertTrue(unfiltered_coord == coord) + + def test_coordinate_map2(self): + + rf = ReadFilterRef(0, 1) + + coordinates = [i for i in 
range(6, 25)] + \ + [i for i in range(28, 30)] + \ + [i for i in range(46, 60)] + \ + [i for i in range(68, 83)] + \ + [i for i in range(90, 100)] + read_positions = np.array(coordinates) + genome_length = 100 + regions = [(1, 5), (25, 27), (30, 45), (60, 67), (83, 89)] + + new_read_positions, new_genome_length = rf.remove_reads_by_region(read_positions, genome_length, regions) + self.assertTrue(new_read_positions.size == read_positions.size) + + for i,filtered_coord in enumerate(new_read_positions): + unfiltered_coord = rf.coord_map.translate(filtered_coord) + self.assertTrue(unfiltered_coord == coordinates[i]) + + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_coptr.py b/test/test_coptr.py index c850178..0b0d4d3 100644 --- a/test/test_coptr.py +++ b/test/test_coptr.py @@ -4,9 +4,9 @@ import numpy as np import scipy.stats -from src.coptr.bam_processor import CoverageMapContig -from src.coptr.coptr_contig import CoPTRContig -from src.coptr.coptr_ref import CoPTRRef +from coptr.bam_processor import CoverageMapContig +from coptr.coptr_contig import CoPTRContig +from coptr.coptr_ref import CoPTRRef class TestCoPTR(unittest.TestCase): From 730d2b39e6ea3ff03ca58fc45ad41eef065e4df8 Mon Sep 17 00:00:00 2001 From: Christian Diener Date: Wed, 11 May 2022 14:18:12 -0700 Subject: [PATCH 48/48] fixes to make it work --- src/coptr/coptr_ref.py | 4 ---- test/test_coordinate_map.py | 8 +++++--- test/test_coptr.py | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/coptr/coptr_ref.py b/src/coptr/coptr_ref.py index 18029f4..6424818 100644 --- a/src/coptr/coptr_ref.py +++ b/src/coptr/coptr_ref.py @@ -414,10 +414,6 @@ def filter_reads_phase1(self, read_positions, genome_length, bin_size, coord_map remove_end = genome_length remove_regions.append((remove_start, remove_end)) - read_positions, new_genome_length = self.remove_reads_by_region( - read_positions, genome_length, remove_regions - ) - read_positions, new_genome_length = self.remove_reads_by_region( read_positions, genome_length, remove_regions, coord_map ) diff --git a/test/test_coordinate_map.py b/test/test_coordinate_map.py index cbb3d3e..a6bac27 100644 --- a/test/test_coordinate_map.py +++ b/test/test_coordinate_map.py @@ -15,7 +15,8 @@ def test_coordinate_map1(self): genome_length = 10 regions = [(0, 1)] - read_positions, new_genome_length = rf.remove_reads_by_region(read_positions, genome_length, regions) + coord_map = CoordinateMap() + read_positions, new_genome_length = rf.remove_reads_by_region(read_positions, genome_length, regions, coord_map) coord_map = CoordinateMap() coord_map.update_break_points(regions[0][1], regions[0][1] - regions[0][0]) @@ -37,11 +38,12 @@ def test_coordinate_map2(self): genome_length = 100 regions = [(1, 5), (25, 27), (30, 45), (60, 67), (83, 89)] - new_read_positions, new_genome_length = rf.remove_reads_by_region(read_positions, genome_length, regions) + coord_map = CoordinateMap() + new_read_positions, new_genome_length = rf.remove_reads_by_region(read_positions, genome_length, regions, coord_map) self.assertTrue(new_read_positions.size == read_positions.size) for i,filtered_coord in enumerate(new_read_positions): - unfiltered_coord = rf.coord_map.translate(filtered_coord) + unfiltered_coord = coord_map.translate(filtered_coord) self.assertTrue(unfiltered_coord == coordinates[i]) diff --git a/test/test_coptr.py b/test/test_coptr.py index 0b0d4d3..c500193 100644 --- a/test/test_coptr.py +++ b/test/test_coptr.py @@ -48,7 +48,7 @@ def test_coptr(self): if 
len(coverage_map_contig) % 10 == 0: coptr_contig_estimates = sorted( - coptr_contig.estimate_ptrs(coverage_map_contig), + coptr_contig.estimate_ptrs(coverage_map_contig)[0], key=lambda e: e.bam_file, ) coptr_contig_results += [e.estimate for e in coptr_contig_estimates]
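
A note on the CoordinateMap introduced in PATCH 47: read filtering (ReadFilterRef) cuts
high- and low-coverage regions out of the genome, so coordinates estimated afterwards no
longer line up with the reference. CoordinateMap records a (break_point, shift_size) pair
for every region that remove_reads_by_region() excises, and translate() adds back the
width of each removed region at or before a coordinate, recovering the pre-filtering
position. A minimal usage sketch, mirroring test_coordinate_map1 from the patch (the read
position, genome length, and region are illustrative values taken from that test):

    import numpy as np

    from coptr.coptr_ref import CoordinateMap, ReadFilterRef

    rf = ReadFilterRef(0, 0)  # min_reads/min_cov thresholds are irrelevant here
    coord_map = CoordinateMap()

    read_positions = np.array([3])
    genome_length = 10
    regions = [(0, 1)]  # region of the genome to cut out

    # Removing the region shifts the surviving read left and records a
    # (break_point, shift_size) pair on coord_map.
    filtered, new_length = rf.remove_reads_by_region(
        read_positions, genome_length, regions, coord_map
    )

    # translate() re-applies the recorded shifts, recovering the original
    # coordinate on the unfiltered genome.
    assert coord_map.translate(filtered[0]) == 3

This is also the mechanism behind the new ori_estimate_coord field: estimate_ptrs()
scales the fractional origin estimate by the filtered genome length and maps it back with
coord_map.translate(ori * filtered_genome_length), so the oriC position written to the
-oriC.csv file refers to the unfiltered reference.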
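
PATCH 46 deserves one line of explanation, because the bug is easy to reintroduce: the
standard logging module formats messages lazily with the %-operator (msg % args), so
str.format-style {} placeholders are never substituted. A short standalone illustration
of the pre- and post-patch calls (not CoPTR code; the sample filename is made up):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("coptr.read_mapper")

    # Post-patch: the %s placeholder is filled in when the record is emitted,
    # printing "Cleaning up example.sam."
    logger.info("Cleaning up %s.", "example.sam")

    # Pre-patch bug: logging never calls str.format, so "msg % args" raises
    # TypeError while the record is formatted, and logging prints
    # "--- Logging error ---" with a traceback to stderr instead of the message.
    logger.info("Cleaning up {}.", "example.sam")

Deferring the formatting also means the message string is only built if the record passes
the logger's level check, which is why the two-argument form is preferred over f-strings
in logging calls.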