diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..711388e --- /dev/null +++ b/.editorconfig @@ -0,0 +1,20 @@ +# Editor Configuration (http://editorconfig.org) +root = true + +[*] +charset = utf-8 +indent_style = space +indent_size = 4 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +max_line_length = 88 + +[*.{json,yml}] +indent_size = 2 + +[*.{md,rst}] +trim_trailing_whitespace = false + +[Makefile] +indent_style = tab diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..83f1817 --- /dev/null +++ b/.gitignore @@ -0,0 +1,209 @@ +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +### macOS template +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### Linux template +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### Windows template +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + diff --git a/README.md b/README.md index f28093d..fcb875f 100644 --- a/README.md +++ b/README.md @@ -1 +1,3 @@ -Our documentation is on Read The Docs: \ No newline at end of file +CoPTR is a tool for estimating peak-to-trough ratios from metagenomic sequencing. + +You can find our documentation on Read The Docs: \ No newline at end of file diff --git a/coptr.py b/coptr.py deleted file mode 100644 index 2d1161e..0000000 --- a/coptr.py +++ /dev/null @@ -1,387 +0,0 @@ -""" -This file is part of CoPTR. - -CoPTR is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -CoPTR is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with CoPTR. If not, see . 
-""" - -import argparse -import numpy as np -import os -import os.path -import pickle as pkl -import sys - -from src.bam_processor import BamProcessor, CoverageMapRef, CoverageMapContig -from src.coptr_contig import estimate_ptrs_coptr_contig -from src.coptr_ref import estimate_ptrs_coptr_ref -from src.compute_read_counts import compute_read_counts -from src.print import print_error, print_info -from src.read_mapper import ReadMapper -from src.util import get_fastq_name - -VERSION="1.0.0" - -class ProgramOptions: - - def __init__(self): - parser = argparse.ArgumentParser( - description="CoPTR (v{}): Compute PTRs from complete reference genomes and assemblies.".format(VERSION), - usage="""coptr.py [options] - -command: index create a bowtie2 index for a reference database - map map reads against a reference database - merge merge BAM files from reads mapped to multiple indexes - extract compute coverage maps from bam files - estimate estimate PTRs from coverage maps - count compute read counts for each genome after filtering -""" - ) - self.default_bt2_k = 10 - - if len(sys.argv[1:]) < 1: - parser.print_help() - exit(1) - - parser.add_argument("command", type=str, help="Command to run.") - args = parser.parse_args(sys.argv[1:2]) - - if not hasattr(self, args.command): - print_error("Main", "Unrecognized command.", quit=False) - parser.print_help() - exit(1) - getattr(self, args.command)() - - - def index(self): - parser = argparse.ArgumentParser(usage="coptr.py index [-h] [--bt2-bmax BT2_BMAX] [--bt2-dcv BT2_DCV] [--bt2-threads BT2_THREADS] [--bt2-packed] ref-fasta index-out") - parser.add_argument("ref_fasta", help= - """File or folder containing fasta to index. If a folder, the extension for each - fasta must be one of [.fasta, .fna, .fa] - """ - ) - parser.add_argument("index_out", help="Filepath to store index.") - parser.add_argument("--bt2-bmax", default=None, help="Set the --bmax arguement for bowtie2-build. Used to control memory useage.") - parser.add_argument("--bt2-dcv", default=None, help="Set the --dcv argument for bowtie2-build. Used to control memory usage.") - parser.add_argument("--bt2-threads", default="1", help="Number of threads to pass to bowtie2-build.") - parser.add_argument("--bt2-packed", action="store_true", help="Set the --packed flag for bowtie2-build. Used to control memory usage.") - - if len(sys.argv[2:]) < 1: - parser.print_help() - exit(1) - - args = parser.parse_args(sys.argv[2:]) - read_mapper = ReadMapper() - read_mapper.index(args.ref_fasta, args.index_out, args.bt2_bmax, args.bt2_dcv, args.bt2_threads, args.bt2_packed) - - - def map(self): - parser = argparse.ArgumentParser(usage="coptr.py map [-h] [--threads INT] [--bt2-k INT] [--paired] index input out-folder") - parser.add_argument("index", help="Name of database index.") - parser.add_argument("input", help= - """File or folder containing fastq reads to map. If a folder, the extension for - each fastq must be one of [.fastq, .fq, .fastq.gz, fq.gz] - """ - ) - parser.add_argument("out_folder", - help="Folder to save mapped reads. BAM files are output here." - ) - parser.add_argument("--paired", action="store_true", - help="Set for paired end reads. Assumes fastq files end in _1.* and _2.*") - parser.add_argument("--threads", type=int, default=1, - help="Number of threads for bowtie2 mapping." - ) - parser.add_argument("--bt2-k", type=int, default=self.default_bt2_k, - help="(Default 10). Number of alignments to report. 
Passed to -k flag of bowtie2.", - - ) - - if len(sys.argv[2:]) < 1: - parser.print_help() - exit(1) - - - args = parser.parse_args(sys.argv[2:]) - read_mapper = ReadMapper() - read_mapper.map(args.index, args.input, args.out_folder, args.paired, args.threads, args.bt2_k) - - - def merge(self): - parser = argparse.ArgumentParser(usage="coptr.py merge [-h] in-bam1 in-bam2 ... in-bamN out-bam") - parser.add_argument("in-bams", nargs="+", - help="A space separated list of BAM files to merge. Assumes same reads were mapped against different indexes. " + - "Only keeps read 1 of paired end sequencing, since this is used downstream.") - parser.add_argument("out-bam", help="Path to merged BAM.") - - if len(sys.argv[2:]) < 2: - parser.print_help() - exit(1) - - args = vars(parser.parse_args(sys.argv[2:])) - in_bams = args["in-bams"] - out_bam = args["out-bam"] - bam_processor = BamProcessor() - bam_processor.merge(in_bams, out_bam) - - - def extract(self): - parser = argparse.ArgumentParser(usage="coptr.py extract [-h] [--ref-genome-regex REF_GENOME_REGEX] [--check-regex] in-folder out-folder") - parser.add_argument("in_folder", help="Folder with BAM files.") - parser.add_argument("out_folder", help="Folder to store coverage maps.") - parser.add_argument("--ref-genome-regex", default="[^\|]+", - help="Regular expression extracting a reference genome id from the sequence id in a bam file.", - ) - parser.add_argument("--check-regex", action="store_true", default=False, - help="Check the regular expression by counting reference genomes without processing." - ) - parser.add_argument("--bt2-k", type=int, default=self.default_bt2_k, - help="Maximum number of alignments.", - - ) - - if len(sys.argv[2:]) < 1: - parser.print_help() - exit(1) - - args = parser.parse_args(sys.argv[2:]) - - bam_processor = BamProcessor(args.ref_genome_regex) - ref_sequences = set() - ref_genomes = set() - for f in sorted(os.listdir(args.in_folder)): - fname, ext = os.path.splitext(f) - if ext == ".bam": - fpath = os.path.join(args.in_folder, f) - seq, gen = bam_processor.get_ref_names(fpath) - ref_sequences.update(seq) - ref_genomes.update(gen) - - if os.path.isfile(os.path.join(args.out_folder, get_fastq_name(f) + ".cm.pkl")): - print_info("BamProcessor", "output for {} already found, skipping".format(fname)) - continue - - # don't process the rest of the bam file if we just want to - # sanity check the regular expression - if args.check_regex: - continue - - coverage_maps = bam_processor.process_bam(fpath, args.bt2_k) - with open(os.path.join(args.out_folder, get_fastq_name(f) + ".cm.pkl"), "wb") as f: - pkl.dump(coverage_maps, f) - - print_info("BamProcessor", "found {} reference sequences corresponding to {} genomes".format(len(ref_sequences), len(ref_genomes))) - if args.check_regex: - print_info("BamProcessor", "reference genome ids:") - for ref in sorted(ref_genomes): - print("\t", ref, file=sys.stderr) - - - def estimate(self): - parser = argparse.ArgumentParser(usage= - """usage: coptr.py estimate [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--threads THREADS] [--plot OUTFOLDER] [--batch-size INT] [--restart] coverage-map-folder out-file - """ - ) - parser.add_argument("coverage_map_folder", help="Folder with coverage maps computed from 'extract'.") - parser.add_argument("out_file", help="Filename to store PTR table.") - parser.add_argument("--min-reads", type=float, - help="Minimum number of reads required to compute a PTR (default 5000).", default=5000 - ) - parser.add_argument("--min-cov", type=float, - 
help="Fraction of nonzero bins required to compute a PTR (default 0.75).", default=0.75 - ) - parser.add_argument("--min-samples", type=float, - help="CoPTRContig only. Minimum number of samples required to reorder bins (default 5).", default=5 - ) - parser.add_argument("--threads", type=int, help="Number of threads to use (default 1).", default=1) - parser.add_argument("--plot", default=None, help="Plot model fit and save the results.") - parser.add_argument("--restart", default=False, action="store_true", - help="Restarts the estimation step using the genomes in the coverage-maps-genome folder." - ) - - if len(sys.argv[2:]) < 1: - parser.print_help() - exit(1) - - args = parser.parse_args(sys.argv[2:]) - sample_ids = set() - ref_genome_ids = set() - assembly_genome_ids = set() - - - grouped_coverage_map_folder = os.path.join(args.coverage_map_folder, "coverage-maps-genome") - if not args.restart and not os.path.isdir(grouped_coverage_map_folder): - os.mkdir(grouped_coverage_map_folder) - - if args.restart: - print_info("CoPTR", "restarting from files in " + grouped_coverage_map_folder) - for cm_file in sorted(os.listdir(grouped_coverage_map_folder)): - _, ext = os.path.splitext(cm_file) - if ext != ".pkl": continue - - print_info("CoPTR", "checking " + cm_file) - with open(os.path.join(grouped_coverage_map_folder, cm_file), "rb") as f: - try: - while True: - cm = pkl.load(f) - if cm.is_assembly: - assembly_genome_ids.add(cm.genome_id) - else: - ref_genome_ids.add(cm.genome_id) - sample_ids.add(cm.sample_id) - - except EOFError: - pass - - else: - print_info("CoPTR", "grouping reads by reference genome") - print_info("CoPTR", "saving to " + grouped_coverage_map_folder) - - # first construct a list of genome_ids for ptr estimates - for f in sorted(os.listdir(args.coverage_map_folder)): - fname, ext = os.path.splitext(f) - if ext != ".pkl": continue - fpath = os.path.join(args.coverage_map_folder, f) - - print_info("CoPTR", "\tprocessing {}".format(f)) - - with open(fpath, "rb") as file: - coverage_maps = pkl.load(file) - - for ref_id in coverage_maps: - sample_ids.add(coverage_maps[ref_id].sample_id) - - # don't load coverage maps of species in reference database without reads - if coverage_maps[ref_id].count_reads() < args.min_reads: - continue - - write_mode = "ab+" if ref_id in ref_genome_ids or ref_id in assembly_genome_ids else "wb" - - # append to genome file - with open(os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), write_mode) as tofile: - pkl.dump(coverage_maps[ref_id], tofile) - - if coverage_maps[ref_id].is_assembly: - assembly_genome_ids.add(ref_id) - else: - ref_genome_ids.add(ref_id) - - del coverage_maps - print_info("CoPTR", "done grouping by reference genome") - print_info("CoPTR", "the --restart flag can be used to start from here") - - sample_ids = sorted(list(sample_ids)) - results_ref = estimate_ptrs_coptr_ref( - ref_genome_ids, grouped_coverage_map_folder, args.min_reads, args.min_cov, - threads=args.threads, plot_folder=args.plot - ) - results_contig = estimate_ptrs_coptr_contig( - assembly_genome_ids, grouped_coverage_map_folder, args.min_reads, args.min_samples, - threads=args.threads, plot_folder=args.plot - ) - - out_file = args.out_file - _, ext = os.path.splitext(out_file) - if ext != ".csv": - out_file += ".csv" - - print_info("CoPTR", "writing {}".format(out_file)) - - with open(out_file, "w") as f: - # write the header - f.write("log2(PTR):genome_id/sample_id") - for sample_id in sample_ids: - f.write(",{}".format(sample_id)) - f.write("\n") 
- - for genome_id in sorted(results_ref): - # don't write rows without estimates - estimates = [result.estimate for result in results_ref[genome_id]] - if np.all(np.isnan(estimates)): - continue - - f.write(genome_id + ",") - row = ["" for s in sample_ids] - for result in results_ref[genome_id]: - if not np.isnan(result.estimate): - row[sample_ids.index(result.sample_id)] = str(result.estimate) - f.write(",".join(row) + "\n") - - for genome_id in sorted(results_contig): - # don't write rows without estimates - estimates = [result.estimate for result in results_contig[genome_id]] - if np.all(np.isnan(estimates)): - continue - - f.write(genome_id + ",") - row = ["" for s in sample_ids] - for result in results_contig[genome_id]: - if not np.isnan(result.estimate): - row[sample_ids.index(result.sample_id)] = str(result.estimate) - f.write(",".join(row) + "\n") - - print_info("CoPTR", "done!") - print_info("CoPTR", "you may remove the folder {}".format(grouped_coverage_map_folder)) - # print_info("CoPTR", "cleaning up {}".format(grouped_coverage_map_folder)) - # for ref_id in ref_genome_ids.union(assembly_genome_ids): - # filename = os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl") - # os.remove(filename) - # os.rmdir(grouped_coverage_map_folder) - - - def count(self): - parser = argparse.ArgumentParser(usage= - """usage: coptr.py count [-h] coverage-map-folder out-file - """ - ) - parser.add_argument("coverage_map_folder", help="Folder with coverage maps computed from 'extract'.") - parser.add_argument("out_file", help="Filename to store PTR table.") - - if len(sys.argv[2:]) < 1: - parser.print_help() - exit(1) - - args = parser.parse_args(sys.argv[2:]) - - print_info("CoPTR", "estimating relative abundances") - - counts, genome_ids = compute_read_counts(args.coverage_map_folder) - - out_file = args.out_file - _, ext = os.path.splitext(out_file) - if ext != ".csv": - out_file += ".csv" - - print_info("CoPTR", "writing {}".format(out_file)) - - with open(out_file, "w") as f: - # write the header - f.write("count:genome_id/sample_id") - for sample_id in counts: - f.write(",{}".format(sample_id)) - f.write("\n") - - for genome_id in sorted(genome_ids): - row = [genome_id] - for sample_id in counts: - if genome_id in counts[sample_id]: - row.append(str(counts[sample_id][genome_id])) - else: - row.append(str(0)) - row = ",".join(row) + "\n" - f.write(row) - - print_info("CoPTR", "done!") - - -if __name__ == "__main__": - ProgramOptions() \ No newline at end of file diff --git a/coptr-env.yml b/coptr.yml similarity index 100% rename from coptr-env.yml rename to coptr.yml diff --git a/docs/conf.py b/docs/conf.py index 2502cab..451768a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -16,6 +16,8 @@ import os import sys + + sys.path.insert(0, os.path.abspath('..')) diff --git a/docs/installation.rst b/docs/installation.rst index 34cd198..822cbb1 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -18,17 +18,18 @@ You can then create a new environment to run coPTR: .. code-block:: # creates an environment called coptr - $ conda env create -f coptr-env.yml + $ conda env create -f coptr.yml # you only need to create this once # to activate the environment: $ conda activate coptr + $ pip install . -To check if you can run coPTR: +To check if you can run CoPTR: .. code-block:: - $ python coptr.py + $ coptr usage: coptr.py [options] @@ -50,7 +51,7 @@ To check if you can run coPTR: Dependencies ------------ -Installation using conda will install all required dependencies. 
However, +Installation using conda will install all required dependencies. However, CoPTR's dependencies are listed below if you wish to install them manually. * bowtie2 (>=2.4.1): CoPTR assumes bowtie2 is install and accessible from your PATH variable diff --git a/docs/modules.rst b/docs/modules.rst index 675f313..c035a51 100644 --- a/docs/modules.rst +++ b/docs/modules.rst @@ -1,26 +1,23 @@ Modules ======= -.. automodule:: src.bam_processor +.. automodule:: src.coptr.bam_processor :members: -.. automodule:: src.coptr_contig +.. automodule:: src.coptr.coptr_contig :members: -.. automodule:: src.coptr_ref +.. automodule:: src.coptr.coptr_ref :members: -.. automodule:: src.poisson_pca +.. automodule:: src.coptr.poisson_pca :members: -.. automodule:: src.print +.. automodule:: src.coptr.read_mapper :members: -.. automodule:: src.read_mapper +.. automodule:: src.coptr.read_assigner :members: -.. automodule:: src.read_assigner - :members: - -.. automodule:: src.util +.. automodule:: src.coptr.util :members: \ No newline at end of file diff --git a/docs/quick-start.rst b/docs/quick-start.rst index 9e7df36..233d129 100644 --- a/docs/quick-start.rst +++ b/docs/quick-start.rst @@ -18,21 +18,24 @@ Detailed instructions: :doc:`Tutorial `. tar -xzvf example-data.tar.gz # Set up the environment: - conda env create -f coptr-env.yml + conda env create -f coptr.yml conda activate coptr + + # Install coptr + pip install . # Index the reference database: - python coptr.py index example-data/ref-db example-data/ref-db/example-index + coptr index example-data/ref-db example-data/ref-db/example-index # Map reads against the reference: - python coptr.py map example-data/ref-db/example-index example-data/fastq example-data/bam + coptr map example-data/ref-db/example-index example-data/fastq example-data/bam # Extract read positions: - python coptr.py extract example-data/bam example-data/coverage-maps + coptr extract example-data/bam example-data/coverage-maps # Estimate ptrs: # Note the min-reads flag is optional. We recommend the default setting (5000 reads). - python coptr.py estimate example-data/coverage-maps out.csv --min-reads 2500 + coptr estimate example-data/coverage-maps out.csv --min-reads 2500 # View the output: cat out.csv diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 97afc88..db7b53c 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -58,14 +58,14 @@ Indexing the reference database .. code-block:: text - $ python coptr.py index example-data/ref-db example-data/ref-db/example-index - [INFO] (Sep 10, 2020 18:06:41) ReadMapper: found 2 files totaling 0.006 Gb - [INFO] (Sep 10, 2020 18:06:41) ReadMapper: copying to fasta files to coptr-fna-1599775601.629647.fna with prepended genome ids (filenames) - [INFO] (Sep 10, 2020 18:06:41) ReadMapper: writing 2 reference genome ids to example-data/ref-db/example-index.genomes - [INFO] (Sep 10, 2020 18:06:41) ReadMapper: bowtie2-build coptr-fna-1599775601.629647.fna example-data/ref-db/example-index --noref --threads 1 + $ coptr index example-data/ref-db example-data/ref-db/example-index + [INFO] [Jun 28, 2021 15:46:57] [coptr.read_mapper] Found 2 files totaling 0.00611 GB. + [INFO] [Jun 28, 2021 15:46:57] [coptr.read_mapper] Copying FASTA files to coptr-fna-2021-06-28T19:46:57+00:00.fna with prepended genome ids (filenames). + [INFO] [Jun 28, 2021 15:46:57] [coptr.read_mapper] Writing 2 reference genome ids to example-data/ref-db/example-index.genomes.
+ [INFO] [Jun 28, 2021 15:46:57] [coptr.read_mapper] bowtie2-build coptr-fna-2021-06-28T19:46:57+00:00.fna example-data/ref-db/example-index --threads 1 ...bowtie2 output... - [INFO] (Sep 10, 2020 18:06:44) ReadMapper: indexed 2 fasta files for reference database. - [INFO] (Sep 10, 2020 18:06:44) ReadMapper: cleaning up coptr-fna-1599775601.629647.fna + [INFO] [Jun 28, 2021 15:47:00] [coptr.read_mapper] Indexed 2 FASTA files for the reference database. + [INFO] [Jun 28, 2021 15:47:00] [coptr.read_mapper] Cleaning up coptr-fna-2021-06-28T19:46:57+00:00.fna. @@ -76,11 +76,11 @@ file, with the filename prepended to each sequencing id. This is how CoPTR keeps track of contigs from the same assembly. Indexing a database of fasta files can be accomplished by calling -``coptr.py index``: +``coptr index``: .. code-block:: text - usage: coptr.py index [-h] [--bt2-bmax BT2_BMAX] [--bt2-dcv BT2_DCV] [--bt2-threads BT2_THREADS] [--bt2-packed] ref-fasta index-out + usage: coptr index [-h] [--bt2-bmax BT2_BMAX] [--bt2-dcv BT2_DCV] [--bt2-threads BT2_THREADS] [--bt2-packed] ref-fasta index-out positional arguments: ref_fasta File or folder containing fasta to index. If a folder, @@ -100,20 +100,20 @@ Indexing a database of fasta files can be accomplished by calling control memory usage. It takes two arguments. The first ``ref-fasta`` is either a folder containing -fasta for the database. If it is a folder, coPTR will scan the folder for +fasta for the database. If it is a folder, CoPTR will scan the folder for all files ending in ``.fasta``, ``.fna``, or ``.fa``. CoPTR will combine these into a single fasta file, prepending the filename to each sequence id in order -to track contigs from the same reference genome. It then calls the ```bowtie2-build``` +to track contigs from the same reference genome. It then calls the ``bowtie2-build`` to index this file. -coPTR additionally outputs an ```index_name.genomes``` file with a list of ids for each +coPTR additionally outputs an ``index_name.genomes`` file with a list of ids for each reference genome. Notes on memory usage --------------------- Sometimes the database is too large for a single index. You can split the reference database into multiple parts, and index each separately. After -read mapping, the resulting BAM files can be merged with ```coptr.py merge```. +read mapping, the resulting BAM files can be merged with ```coptr merge```. Mapping reads ============= @@ -122,20 +122,20 @@ Mapping reads .. 
code-block:: text - $ python coptr.py map example-data/ref-db/example-index example-data/fastq example-data/bam - [INFO] (Aug 31, 2020 12:12:10) ReadMapper: mapping example-data/fastq/ERR969281.fastq.gz to example-data/bam/ERR969281.sam - [INFO] (Aug 31, 2020 12:12:10) ReadMapper: bowtie2 -x example-data/ref-db/example-index example-data/fastq/ERR969281.fastq.gz --no-unal -p 1 + $ coptr map example-data/ref-db/example-index example-data/fastq example-data/bam + [INFO] [Jun 28, 2021 15:48:26] [coptr.read_mapper] Mapping example-data/fastq to example-data/bam/ERR969281.sam + [INFO] [Jun 28, 2021 15:48:26] [coptr.read_mapper] bowtie2 -x example-data/ref-db/example-index example-data/fastq/ERR969281.fastq.gz --no-unal -p 1 -k 10 10818 reads; of these: 10818 (100.00%) were unpaired; of these: 4071 (37.63%) aligned 0 times 6709 (62.02%) aligned exactly 1 time 38 (0.35%) aligned >1 times 62.37% overall alignment rate - [INFO] (Aug 31, 2020 12:12:11) ReadMapper: converting example-data/bam/ERR969281.sam to example-data/bam/ERR969281.bam - [INFO] (Aug 31, 2020 12:12:11) ReadMapper: cleaning up example-data/bam/ERR969281.sam + [INFO] [Jun 28, 2021 15:48:27] [coptr.read_mapper] Converting example-data/bam/ERR969281.sam to example-data/bam/ERR969281.bam. + [INFO] [Jun 28, 2021 15:48:27] [coptr.read_mapper] Cleaning up example-data/bam/ERR969281.sam. .... - [INFO] (Aug 31, 2020 12:12:24) ReadMapper: converting example-data/bam/ERR969285.sam to example-data/bam/ERR969285.bam - [INFO] (Aug 31, 2020 12:12:24) ReadMapper: cleaning up example-data/bam/ERR969285.sam + [INFO] [Jun 28, 2021 15:48:40] [coptr.read_mapper] Converting example-data/bam/ERR969430.sam to example-data/bam/ERR969430.bam. + [INFO] [Jun 28, 2021 15:48:40] [coptr.read_mapper] Cleaning up example-data/bam/ERR969430.sam. Once you have indexed a reference database. You can then map reads against @@ -144,7 +144,7 @@ convenient: .. code-block:: text - usage: coptr.py map [-h] [--threads INT] [--paired] index input out-folder + usage: coptr map [-h] [--threads INT] [--paired] index input out-folder positional arguments: index Name of database index. @@ -161,12 +161,13 @@ convenient: --bt2-k BT2_K (Default 10). Number of alignments to report. Passed to -k flag of bowtie2. -The name of the database index corresponds to the name used from ``coptr.py index``. +The name of the database index corresponds to the name used from ``coptr index``. The input can either be a single fastq file, or a folder of fastq files to map. It also takes an optional ``--threads`` argument that allows bowtie2 to use multiple threads. Reads are output as ``bam`` files to save space. -For paired end sequencing, it is recommend to only map reads from a single mate-pair. +For paired-end sequencing, we recommand mapping reads from end only (e.g. the files +ending in either _1.* or _2.*). Merging mapped reads from multiple indexes @@ -174,11 +175,11 @@ Merging mapped reads from multiple indexes For large reference databases, it is sometimes necessary to create several indexes for subsets of the data and map reads against each index. Results from each index need to be merged to select reads with the best MAPQ across -indexes. You can use ```coptr.py merge``` to merge multiple bam files. +indexes. You can use ```coptr merge``` to merge multiple bam files. .. code-block:: text - usage: coptr.py merge [-h] in-bam1 in-bam2 ... in-bamN out-bam + usage: coptr merge [-h] in-bam1 in-bam2 ... in-bamN out-bam positional arguments: in-bams A space separateed list of BAM files to merge. 
Assumes same @@ -195,13 +196,13 @@ Computing coverage maps .. code-block:: text - $ python coptr.py extract example-data/bam example-data/coverage-maps - [INFO] (Jan 18, 2021 10:31:43) BamProcessor: processing example-data/bam/ERR969281.bam - [INFO] (Jan 18, 2021 10:31:43) BamProcessor: determining reference genomes - [INFO] (Jan 18, 2021 10:31:43) BamProcessor: collecting multi-mapped reads - [INFO] (Jan 18, 2021 10:31:43) BamProcessor: grouping reads by reference genome + $ coptr extract example-data/bam example-data/coverage-maps + [INFO] [Jun 28, 2021 15:49:48] [coptr.bam_processor] Processing example-data/bam/ERR969281.bam. + [INFO] [Jun 28, 2021 15:49:48] [coptr.bam_processor] Determining reference genomes. + [INFO] [Jun 28, 2021 15:49:48] [coptr.bam_processor] Collecting multi-mapped reads. + [INFO] [Jun 28, 2021 15:49:48] [coptr.bam_processor] Grouping reads by reference genome. ... - [INFO] (Aug 31, 2020 12:13:56) BamProcessor: found 190 reference sequences corresponding to 2 genomes + [INFO] [Jun 28, 2021 15:50:00] [coptr.cli] Found 190 reference sequences corresponding to 2 genomes. Once reads have been mapped, the next step is to compute the coverage along each reference genome. In this step, starting positions of each read are @@ -210,7 +211,7 @@ assembly are collected. .. code-block:: text - usage: usage: coptr.py extract [-h] [--ref-genome-regex REF_GENOME_REGEX] [--check-regex] + usage: usage: coptr extract [-h] [--ref-genome-regex REF_GENOME_REGEX] [--check-regex] in-folder out-folder positional arguments: @@ -227,7 +228,7 @@ assembly are collected. The important argument here is the ``--ref-genome-regex``. This is a regular expression that extracts the reference genome id from a sequence id. The default -argument will work with the index created by ```coptr.py index```, and works by +argument will work with the index created by ```coptr index```, and works by prepending the name of the fasta file, and special character ```|``` to each sequence id. @@ -239,7 +240,7 @@ extracts a genome id from a sequence id in a fasta file. It is used to group contigs together. The default argument ``[^\|]+`` matches all characters up to the first ``|``, and uses them as a genome id. -You can check your regular expression using the ``--check-regex-flag``, which +You can check your regular expression using the ``--check-regex`` flag, which skips the extract step and instead outputs a list of all genome ids. @@ -250,28 +251,28 @@ Estimating PTRs .. 
code-block:: text - # python coptr.py estimate example-data/coverage-maps out --min-reads 2500 - [INFO] (Jan 18, 2021 10:36:05) CoPTR: grouping reads by reference genome - [INFO] (Jan 18, 2021 10:36:05) CoPTR: saving to example-data/coverage-maps/coverage-maps-genome - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969281.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969282.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969283.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969285.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969286.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969428.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969429.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: processing ERR969430.cm.pkl - [INFO] (Jan 18, 2021 10:36:05) CoPTR: done grouping by reference genome - [INFO] (Jan 18, 2021 10:36:05) CoPTR: the --restart flag can be used to start from here - [INFO] (Jan 18, 2021 10:36:05) CoPTRRef: checking reference genomes - [INFO] (Jan 18, 2021 10:36:15) CoPTRRef: running l-gasseri-ref - [INFO] (Jan 18, 2021 10:36:16) CoPTRRef: finished l-gasseri-ref - [INFO] (Jan 18, 2021 10:36:16) CoPTRContig: checking reference genomes - [INFO] (Jan 18, 2021 10:36:16) CoPTRContig: running e-coli-mag - [INFO] (Jan 18, 2021 10:36:16) CoPTRContig: finished e-coli-mag - [INFO] (Jan 18, 2021 10:36:16) CoPTR: writing out.csv - [INFO] (Jan 18, 2021 10:36:16) CoPTR: done! - [INFO] (Jan 18, 2021 10:36:16) CoPTR: you may remove the folder example-data/coverage-maps/coverage-maps-genome + $ coptr estimate example-data/coverage-maps out --min-reads 2500 + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] Grouping reads by reference genome. + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] Saving to example-data/coverage-maps/coverage-maps-genome: + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969281.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969282.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969283.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969285.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969286.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969428.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969429.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] ERR969430.cm.pkl + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] Grouping by reference genome: Complete. + [INFO] [Jun 28, 2021 15:50:15] [coptr.cli] The --restart flag can be used to start from here. + [INFO] [Jun 28, 2021 15:50:15] [coptr.coptr_ref] Checking reference genomes. + [INFO] [Jun 28, 2021 15:50:24] [coptr.coptr_ref] Running l-gasseri-ref. + [INFO] [Jun 28, 2021 15:50:25] [coptr.coptr_ref] Finished l-gasseri-ref. + [INFO] [Jun 28, 2021 15:50:25] [coptr.coptr_contig] Checking reference genomes. + [INFO] [Jun 28, 2021 15:50:25] [coptr.coptr_contig] Running e-coli-mag. + [INFO] [Jun 28, 2021 15:50:25] [coptr.coptr_contig] Finished e-coli-mag. + [INFO] [Jun 28, 2021 15:50:25] [coptr.cli] Writing out.csv. + [INFO] [Jun 28, 2021 15:50:25] [coptr.cli] Done. + [INFO] [Jun 28, 2021 15:50:25] [coptr.cli] You may now remove the folder example-data/coverage-maps/coverage-maps-genome. The final stage is to estimate PTR ratios from coverage maps. This is accomplished with the ``estimate`` command. **It is strongly recommended that you perform this step @@ -279,23 +280,22 @@ on all samples at once.** .. 
code-block:: text - usage: coptr.py estimate [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--threads THREADS] coverage-map-folder out-file - - positional arguments: - coverage_map_folder Folder with coverage maps computed from 'extract'. - out_file Filename to store PTR table. - - optional arguments: - -h, --help show this help message and exit - --min-reads MIN_READS - Minimum number of reads required to compute a PTR - (default 5000). - --min-cov MIN_COV Fraction of nonzero 10Kb bins required to compute a - PTR (default 0.75). - --min-samples MIN_SAMPLES - CoPTRContig only. Minimum number of samples required - to reorder bins (default 5). - --threads THREADS Number of threads to use (default 1). + usage: usage: coptr estimate [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] [--threads THREADS] [--plot OUTFOLDER] [--restart] coverage-map-folder out-file + + + positional arguments: + coverage_map_folder Folder with coverage maps computed from 'extract'. + out_file Filename to store PTR table. + + optional arguments: + -h, --help show this help message and exit + --min-reads MIN_READS + Minimum number of reads required to compute a PTR (default 5000). + --min-cov MIN_COV Fraction of nonzero bins required to compute a PTR (default 0.75). + --min-samples MIN_SAMPLES + CoPTRContig only. Minimum number of samples required to reorder bins (default 5). + --plot OUTFOLDER Plot model fit for each PTR. + --restart Restarts the estimation step using the genomes in the coverage-maps-genome folder. This combines all coverage maps by species, then estimates PTRs for each species. We have tried to set sensible default parameters for PTR estimatation. We set diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..2fbbd37 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,17 @@ +[build-system] +build-backend = 'setuptools.build_meta' +requires = [ + 'setuptools>=40.6.0', + 'wheel' +] + +[tool.black] +line-length = 88 +python-version = ['py37'] + +[tool.isort] +profile = 'black' +skip = ['__init__.py'] +lines_after_imports = 2 +known_first_party = 'coptr' +known_third_party = ['numpy', 'pysam', 'scipy'] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..75d0e7e --- /dev/null +++ b/setup.cfg @@ -0,0 +1,58 @@ +[metadata] +name = coptr +version = attr: coptr.__version__ +url = https://github.com/tyjo/coptr +download_url = https://pypi.org/project/coptr/ +project_urls = + Source Code = https://github.com/tyjo/coptr + Documentation = https://coptr.readthedocs.io + Bug Tracker = https://github.com/tyjo/coptr/issues +author = Tyler Joseph +author_email = tj2357@columbia.edu +# Please consult https://pypi.org/classifiers/ for a full list. +classifiers = + Development Status :: 3 - Alpha + Environment :: Console + Intended Audience :: Science/Research + License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+) + Natural Language :: English + Operating System :: OS Independent + Programming Language :: Python :: 3.6 + Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3 :: Only + Topic :: Scientific/Engineering :: Bio-Informatics +license = GPL-3.0-or-later +description = Accurate and robust inference of microbial growth dynamics from metagenomic sequencing reads. 
+long_description = file: README.md +long_description_content_type = text/markdown +keywords = + growth rate + peak-to-trough ratio + +[options] +zip_safe = True +install_requires = + numpy >= 1.19.1 + pysam >= 0.16.0.1 + scipy >= 1.5.2 + matplotlib >= 3.3.2 +python_requires = >=3.6 +tests_require = + tox +packages = find: +package_dir = + = src + +[options.entry_points] +console_scripts = + coptr = coptr.cli:cli + +[options.packages.find] +where = src + +[options.extras_require] +development = + black + isort diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4f876ca --- /dev/null +++ b/setup.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python + + +# This file is part of CoPTR. +# +# CoPTR is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# CoPTR is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with CoPTR. If not, see . + + +"""Set up the CoPTR for Python package.""" + + +from setuptools import setup + + +# All other arguments are defined in `setup.cfg`. +setup() diff --git a/src/__init__.py b/src/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/coptr/__init__.py b/src/coptr/__init__.py new file mode 100644 index 0000000..1436d8f --- /dev/null +++ b/src/coptr/__init__.py @@ -0,0 +1 @@ +__version__ = "1.1.6" diff --git a/src/bam_processor.py b/src/coptr/bam_processor.py similarity index 83% rename from src/bam_processor.py rename to src/coptr/bam_processor.py index 16f9693..1dd377f 100644 --- a/src/bam_processor.py +++ b/src/coptr/bam_processor.py @@ -1,7 +1,7 @@ """ bam_processor.py ====================== -Extract coordinates along each reference sequence and group reference +Extract coordinates along each reference sequence and group reference sequences by genome. """ @@ -24,23 +24,23 @@ import array import bisect +import logging import math -import numpy as np import os.path -import pysam import re -import sys -from scipy.sparse import lil_matrix, csr_matrix +import numpy as np +import pysam +from scipy.sparse import csr_matrix + +from .read_assigner import ReadAssigner -from src.print import print_info -from src.read_assigner import ReadAssigner +logger = logging.getLogger(__name__) class ReadContainer: - """Container to store read positions and metadata. - """ + """Container to store read positions and metadata.""" def __init__(self): # read query_id -> [ids in read_data] @@ -55,9 +55,8 @@ def __init__(self): self.index_ref_name = {} self.next_ref_idx = 0 - def check_add_mapping(self, query_id, ref_name, ref_position, score): - """Stores a mapping if it has an alignment score greater than or equal + """Stores a mapping if it has an alignment score greater than or equal to the mappings seen so far. 
Parameters @@ -76,12 +75,12 @@ def check_add_mapping(self, query_id, ref_name, ref_position, score): best_score = -np.inf else: row_ids = list(self.reads[query_id]) - best_score = np.max(self.read_data[row_ids,2]) + best_score = np.max(self.read_data[row_ids, 2]) # we've run out of room in the read data matrix, if self.next_row_idx == self.read_data.shape[0]: - new_read_data = np.zeros((2*self.next_row_idx, 3)) - new_read_data[:self.next_row_idx, :] = self.read_data + new_read_data = np.zeros((2 * self.next_row_idx, 3)) + new_read_data[: self.next_row_idx, :] = self.read_data self.read_data = new_read_data if ref_name not in self.ref_names_index: @@ -100,10 +99,9 @@ def check_add_mapping(self, query_id, ref_name, ref_position, score): self.reads[query_id].append(self.next_row_idx) self.next_row_idx += 1 - def get_mappings(self, query_id): """Return the highest scoring mappings for a read. - + Parameters ---------- query_id : str @@ -117,10 +115,10 @@ def get_mappings(self, query_id): A list of positions in each reference sequences. """ row_ids = list(self.reads[query_id]) - best_score = np.max(self.read_data[row_ids,2]) + best_score = np.max(self.read_data[row_ids, 2]) ref_names = [] ref_positions = [] - for row in self.read_data[row_ids,:]: + for row in self.read_data[row_ids, :]: ref_name_idx = int(row[0]) ref_pos = int(row[1]) score = int(row[2]) @@ -182,7 +180,6 @@ def __init__(self, regex="[^\|]+", is_bam=True): self.read_mode = "r" self.write_mode = "w" - def get_ref_names(self, bam_file): """Get the names of reference sequences and reference genomes. @@ -220,7 +217,6 @@ def get_ref_names(self, bam_file): return ref_seqs, ref_genomes - def extract_reads(self, bam_file): """Get the read positions from sam_file for each reference genome. @@ -240,7 +236,7 @@ def extract_reads(self, bam_file): and the value is the length of that reference sequence """ lengths = self.get_ref_seq_lengths(bam_file) - + read_container = ReadContainer() alignment_file = pysam.AlignmentFile(bam_file, self.read_mode) @@ -262,7 +258,7 @@ def extract_reads(self, bam_file): # This minimum scores means that reads with perfect # quality scores that fall below min_identity will # be discarded - min_score = -6*(1-self.min_identity)*aln.query_alignment_length + min_score = -6 * (1 - self.min_identity) * aln.query_alignment_length alignment_score = aln.get_tag("AS") if alignment_score < min_score: @@ -275,7 +271,6 @@ def extract_reads(self, bam_file): alignment_file.close() return read_container, lengths - def get_ref_seq_lengths(self, bam_file): """Extract the lengths of each reference sequence from a bam file. @@ -301,8 +296,9 @@ def get_ref_seq_lengths(self, bam_file): alignment_file.close() return ref_seq_lengths - - def compute_bin_coverage(self, genome_ids, read_container, lengths, ref_seq_genome_id): + def compute_bin_coverage( + self, genome_ids, read_container, lengths, ref_seq_genome_id + ): """Compute fraction of bins with reads for each reference genome. 
Parameters @@ -328,7 +324,8 @@ def compute_bin_coverage(self, genome_ids, read_container, lengths, ref_seq_geno for read_id in sorted(read_container.reads): ref_names, ref_positions = read_container.get_mappings(read_id) - if len(ref_names) > 1: continue + if len(ref_names) > 1: + continue bin_idx = math.floor(ref_positions[0] / 5000) binned_counts[ref_names[0]][bin_idx] += 1 @@ -337,8 +334,12 @@ def compute_bin_coverage(self, genome_ids, read_container, lengths, ref_seq_geno for ref_seq in binned_counts: genome_id = ref_seq_genome_id[ref_seq] - covered_bins[genome_id] = covered_bins.get(genome_id, 0) + (binned_counts[ref_seq] != 0).sum() - total_bins[genome_id] = total_bins.get(genome_id, 0) + binned_counts[ref_seq].size + covered_bins[genome_id] = ( + covered_bins.get(genome_id, 0) + (binned_counts[ref_seq] != 0).sum() + ) + total_bins[genome_id] = ( + total_bins.get(genome_id, 0) + binned_counts[ref_seq].size + ) # now add contigs without reads for ref_seq in ref_seq_genome_id: @@ -350,13 +351,13 @@ def compute_bin_coverage(self, genome_ids, read_container, lengths, ref_seq_geno coverage_frac = {} for genome_id in genome_ids: if genome_id in covered_bins: - coverage_frac[genome_id] = covered_bins[genome_id] / total_bins[genome_id] + coverage_frac[genome_id] = ( + covered_bins[genome_id] / total_bins[genome_id] + ) else: coverage_frac[genome_id] = 0 return coverage_frac - - def assign_multimapped_reads(self, read_container, lengths, max_alignments): """Assign multiple mapped reads to a single genome. @@ -372,7 +373,7 @@ def assign_multimapped_reads(self, read_container, lengths, max_alignments): A dictionary whose key is a sequence id, and value are the read positions along that sequence. """ - print_info("BamProcessor", "determining reference genomes") + logger.info("Determining reference genomes.") genome_ids = set() # sequence_id -> genome_id @@ -387,9 +388,11 @@ def assign_multimapped_reads(self, read_container, lengths, max_alignments): ref_seq_genome_id[ref_name] = genome_id genome_ids.add(genome_id) - genome_coverage = self.compute_bin_coverage(genome_ids, read_container, lengths, ref_seq_genome_id) + genome_coverage = self.compute_bin_coverage( + genome_ids, read_container, lengths, ref_seq_genome_id + ) - print_info("BamProcessor", "collecting multi-mapped reads") + logger.info("Collecting multi-mapped reads.") genome_ids = sorted(list(genome_ids)) # reads that fail filtering criteria discarded_reads = set() @@ -405,7 +408,7 @@ def assign_multimapped_reads(self, read_container, lengths, max_alignments): for read_id in sorted(read_container.reads): ref_names, ref_positions = read_container.get_mappings(read_id) ref_genomes = [ref_seq_genome_id[r] for r in ref_names] - + # the read maps twice to the same genome if len(ref_genomes) != np.unique(ref_genomes).size: discarded_reads.add(read_id) @@ -430,11 +433,10 @@ def assign_multimapped_reads(self, read_container, lengths, max_alignments): nreads += 1 indptr.append(len(indices)) - if len(data) > 0: X = csr_matrix((data, indices, indptr), shape=(nreads, len(genome_ids))) - print_info("BamProcessor", "assigning multi-mapped reads") + logger.info("Assigning multi-mapped reads.") read_assigner = ReadAssigner(X, prior_counts) assignments = read_assigner.assign_reads() @@ -464,7 +466,7 @@ def assign_multimapped_reads(self, read_container, lengths, max_alignments): ref_seq_genome_id[ref_name] = genome_id genome_id = ref_seq_genome_id[ref_name] mapping_genome_ids.append(genome_id) - + assignment_id = assignments[current_row] assigned_genome 
= genome_ids[assignment_id] sequence_id = mapping_genome_ids.index(assigned_genome) @@ -482,8 +484,6 @@ def assign_multimapped_reads(self, read_container, lengths, max_alignments): return read_positions, ref_seq_genome_id - - def process_bam(self, bam_file, max_alignments=10): """Extract read coordinates along each reference sequence. @@ -501,7 +501,7 @@ def process_bam(self, bam_file, max_alignments=10): A dictionary whose key is a reference genome id, and value is a CoverageMap. """ - print_info("BamProcessor", "processing {}".format(bam_file)) + logger.info("Processing %s.", bam_file) # extract read positions # reads : dict[query_seq_id] -> Read @@ -511,7 +511,9 @@ def process_bam(self, bam_file, max_alignments=10): # assign multiply mapped reads to reference sequences # 1. dict[ref_seq_id] -> int (read position) # 2. dict[ref_seq_id] -> genome_id - read_positions, ref_seq_genome_id = self.assign_multimapped_reads(reads, lengths, max_alignments) + read_positions, ref_seq_genome_id = self.assign_multimapped_reads( + reads, lengths, max_alignments + ) # fill in read_positions with remaining sequence ids for seq_id in lengths: @@ -524,7 +526,7 @@ def process_bam(self, bam_file, max_alignments=10): contig_read_positions = {} contig_lengths = {} - print_info("BamProcessor", "grouping reads by reference genome") + logger.info("Grouping reads by reference genome.") # group by reference genome # for contigs, this will group all contigs together by species for ref in sorted(lengths): @@ -545,23 +547,29 @@ def process_bam(self, bam_file, max_alignments=10): contig_read_positions[genome_id][ref] = read_positions[ref] contig_lengths[genome_id][ref] = lengths[ref] - # now, store coverage maps for each refence genome coverage_maps = {} for genome_id in contig_read_positions: is_assembly = len(contig_read_positions[genome_id]) > 1 if is_assembly: - coverage_maps[genome_id] = \ - CoverageMapContig(bam_file, genome_id, contig_read_positions[genome_id], contig_lengths[genome_id]) + coverage_maps[genome_id] = CoverageMapContig( + bam_file, + genome_id, + contig_read_positions[genome_id], + contig_lengths[genome_id], + ) else: ref_id = list(contig_read_positions[genome_id].keys())[0] - coverage_maps[genome_id] = \ - CoverageMapRef(bam_file, genome_id, contig_read_positions[genome_id][ref_id], contig_lengths[genome_id][ref_id]) + coverage_maps[genome_id] = CoverageMapRef( + bam_file, + genome_id, + contig_read_positions[genome_id][ref_id], + contig_lengths[genome_id][ref_id], + ) return coverage_maps - def merge(self, bam_files, out_bam): """Merge many bam files from different indexes into one, taking the reads with the highest mapping quality from each bam. @@ -573,8 +581,8 @@ def merge(self, bam_files, out_bam): out_bam : str Location to store the merged bam file. 
""" - print_info("BamProcessor", "merging bam_files {}".format(bam_files)) - print_info("BamProcessor", "keeping reads with highest alignment score") + logger.info("Merging BAM files %s.", ", ".join(bam_files)) + logger.info("Keeping reads with highest alignment score.") # bam header: SN => LN seq_len = {} # read_id => best_score @@ -599,15 +607,14 @@ def merge(self, bam_files, out_bam): bf.close() # combined header - header = { - "HD" : {"VN": "1.0", "SO": "unsorted"}, - "SQ" : [{"SN": sq, "LN" : seq_len[sq]} for sq in sorted(seq_len)] + header = { + "HD": {"VN": "1.0", "SO": "unsorted"}, + "SQ": [{"SN": sq, "LN": seq_len[sq]} for sq in sorted(seq_len)], } seq_names = sorted(seq_len.keys()) - - print_info("BamProcessor", "writing merged file {}".format(out_bam)) + logger.info("Writing merged file %s.", out_bam) out = pysam.AlignmentFile(out_bam, self.write_mode, header=header) for bam_file in bam_files: inf = pysam.AlignmentFile(bam_file, self.read_mode) @@ -616,13 +623,19 @@ def merge(self, bam_files, out_bam): continue alignment_score = in_aln.get_tag("AS") - if in_aln.query_name in score and alignment_score == score[in_aln.query_name] and not in_aln.is_read2: + if ( + in_aln.query_name in score + and alignment_score == score[in_aln.query_name] + and not in_aln.is_read2 + ): # reference name will be wrong if it is not expicility set out_aln = pysam.AlignedSegment() out_aln.query_name = in_aln.query_name out_aln.query_sequence = in_aln.query_sequence out_aln.flag = in_aln.flag - out_aln.reference_id = bisect.bisect_left(seq_names, in_aln.reference_name) + out_aln.reference_id = bisect.bisect_left( + seq_names, in_aln.reference_name + ) out_aln.reference_start = in_aln.reference_start out_aln.cigar = in_aln.cigar out_aln.template_length = in_aln.template_length @@ -630,14 +643,14 @@ def merge(self, bam_files, out_bam): out_aln.tags = in_aln.tags # check that reference sequence is set correctly - assert seq_names[out_aln.reference_id] == in_aln.reference_name, "missing reference sequence from {}".format(bam_file) + assert ( + seq_names[out_aln.reference_id] == in_aln.reference_name + ), "missing reference sequence from {}".format(bam_file) out.write(out_aln) inf.close() out.close() - print_info("BamProcessor", "finished writing {}".format(out_bam)) - - + logger.info("Finished writing %s.", out_bam) class CoverageMap: @@ -664,7 +677,6 @@ def __init__(self, bam_file, genome_id, is_assembly): self.is_assembly = is_assembly - class CoverageMapRef(CoverageMap): """Data structure to store read positions from complete reference genomes. @@ -678,41 +690,57 @@ class CoverageMapRef(CoverageMap): A list of coordinates, one per read. 
""" - __slots__ = ("bam_file", "genome_id", "sample_id", "is_assembly", "read_positions", "length") - + __slots__ = ( + "bam_file", + "genome_id", + "sample_id", + "is_assembly", + "read_positions", + "length", + ) def __getstate__(self): - return self.bam_file, self.genome_id, self.sample_id, self.is_assembly, \ - self.read_positions, self.length - + return ( + self.bam_file, + self.genome_id, + self.sample_id, + self.is_assembly, + self.read_positions, + self.length, + ) def __setstate__(self, state): if type(state) == list or type(state) == tuple: - self.bam_file, self.genome_id, self.sample_id, self.is_assembly, \ - self.read_positions, self.length = state + ( + self.bam_file, + self.genome_id, + self.sample_id, + self.is_assembly, + self.read_positions, + self.length, + ) = state else: for key in state: setattr(self, key, state[key]) - def __init__(self, bam_file, genome_id, read_positions, length): super().__init__(bam_file, genome_id, is_assembly=False) self.read_positions = np.array(read_positions) self.length = length - def __str__(self): - return "CoverageMapRef(bam_file={}, genome_id={}, sample_id={}, reads={})".format( - self.bam_file, self.genome_id, self.sample_id, len(self.read_positions) + return ( + "CoverageMapRef(bam_file={}, genome_id={}, sample_id={}, reads={})".format( + self.bam_file, self.genome_id, self.sample_id, len(self.read_positions) + ) ) def __repr__(self): return self.__str__() - def get_length(self): """Get the lenth of the genome. - + Returns ------- length : int @@ -720,7 +748,6 @@ def get_length(self): """ return self.length - def get_reads(self): """Get the read coordinates along the reference genome. @@ -732,7 +759,6 @@ def get_reads(self): reads = np.copy(self.read_positions) return reads - def count_reads(self): """Return number of mapped reads. @@ -744,7 +770,6 @@ def count_reads(self): return np.array(self.read_positions).size - class CoverageMapContig(CoverageMap): """Data structure to store read positions from assemblies. @@ -763,30 +788,65 @@ class CoverageMapContig(CoverageMap): and value is the length of that contig. 
""" - __slots__ = ("bam_file", "genome_id", "sample_id", "is_assembly", - "contig_ids", "contig_read_positions", "contig_lengths", "bin_size", - "binned_reads", "total_bins", "frac_nonzero", "reads", "passed_qc_flag") - + __slots__ = ( + "bam_file", + "genome_id", + "sample_id", + "is_assembly", + "contig_ids", + "contig_read_positions", + "contig_lengths", + "bin_size", + "binned_reads", + "total_bins", + "frac_nonzero", + "reads", + "passed_qc_flag", + ) def __getstate__(self): - return self.bam_file, self.genome_id, self.sample_id, self.is_assembly, \ - self.contig_ids, self.contig_read_positions, self.contig_lengths, self.bin_size, \ - self.binned_reads, self.total_bins, self.frac_nonzero, self.reads, self.passed_qc_flag - + return ( + self.bam_file, + self.genome_id, + self.sample_id, + self.is_assembly, + self.contig_ids, + self.contig_read_positions, + self.contig_lengths, + self.bin_size, + self.binned_reads, + self.total_bins, + self.frac_nonzero, + self.reads, + self.passed_qc_flag, + ) def __setstate__(self, state): if type(state) == list or type(state) == tuple: - self.bam_file, self.genome_id, self.sample_id, self.is_assembly, \ - self.contig_ids, self.contig_read_positions, self.contig_lengths, self.bin_size, \ - self.binned_reads, self.total_bins, self.frac_nonzero, self.reads, self.passed_qc_flag = state + ( + self.bam_file, + self.genome_id, + self.sample_id, + self.is_assembly, + self.contig_ids, + self.contig_read_positions, + self.contig_lengths, + self.bin_size, + self.binned_reads, + self.total_bins, + self.frac_nonzero, + self.reads, + self.passed_qc_flag, + ) = state else: for key in state: setattr(self, key, state[key]) # load old pickle files for contig in self.contig_read_positions: - self.contig_read_positions[contig] = array.array("I", self.contig_read_positions[contig]) - + self.contig_read_positions[contig] = array.array( + "I", self.contig_read_positions[contig] + ) def __init__(self, bam_file, genome_id, contig_read_positions, contig_lengths): super().__init__(bam_file, genome_id, is_assembly=True) @@ -795,7 +855,9 @@ def __init__(self, bam_file, genome_id, contig_read_positions, contig_lengths): self.contig_read_positions = {} for contig in contig_read_positions: - self.contig_read_positions[contig] = array.array("I", contig_read_positions[contig]) + self.contig_read_positions[contig] = array.array( + "I", contig_read_positions[contig] + ) self.bin_size = None self.binned_reads = {} @@ -808,17 +870,19 @@ def __init__(self, bam_file, genome_id, contig_read_positions, contig_lengths): # flag for qc check self.passed_qc_flag = None - def __str__(self): return "CoverageMapContig(bam_file={}, genome_id={}, ncontigs={}, nreads={}, cov_frac={}, passed_qc={})".format( - self.bam_file, self.genome_id, len(self.contig_ids), self.reads, self.frac_nonzero, self.passed_qc_flag + self.bam_file, + self.genome_id, + len(self.contig_ids), + self.reads, + self.frac_nonzero, + self.passed_qc_flag, ) - def __repr__(self): return self.__str__() - def reset(self): self.bin_size = None self.binned_reads = {} @@ -827,7 +891,6 @@ def reset(self): self.reads = None self.passed_qc_flag = None - def compute_bin_size(self): """Compute bin size for read counts. 
@@ -845,7 +908,8 @@ def compute_bin_size(self): total_length = 0 for contig_id in self.contig_lengths: length = self.contig_lengths[contig_id] - if length >= 11000: total_length += length + if length >= 11000: + total_length += length # bound bin size below by 1000 bin_size = np.max((1000, total_length / target_bins)) @@ -856,10 +920,9 @@ def compute_bin_size(self): return bin_size - def get_length(self, contig_id): """Get the length of a contig. - + Parameters ---------- contig_id : str @@ -873,15 +936,14 @@ def get_length(self, contig_id): length = self.contig_lengths[contig_id] return length - def get_reads(self, contig_id): """Get the coordinates of all reads for a contig. - + Parameters ---------- contig_id : str Reference sequence id of the contig - + Returns ------- reads : numpy.array @@ -890,7 +952,6 @@ def get_reads(self, contig_id): reads = np.copy(self.contig_read_positions[contig_id]) return reads - def bin_reads(self, contig_id): """Count reads in bins. @@ -929,7 +990,6 @@ def bin_reads(self, contig_id): return self.binned_reads[contig_id] - def count_reads(self): """Return number of mapped reads. @@ -943,10 +1003,8 @@ def count_reads(self): nreads += np.array(self.contig_read_positions[contig_id]).size return nreads - def passed_qc(self): - """Run basic quality control checks. Sets the passed_gc_flag attribute. - """ + """Run basic quality control checks. Sets the passed_gc_flag attribute.""" if self.passed_qc_flag is not None: return self.passed_qc_flag total_bins = 0 @@ -954,7 +1012,8 @@ def passed_qc(self): zero_bins = 0 for cid in self.contig_ids: length = self.contig_lengths[cid] - if length < 11000: continue + if length < 11000: + continue bins = self.bin_reads(cid) total_bins += bins.size total_reads += bins.sum() @@ -965,7 +1024,9 @@ def passed_qc(self): else: self.passed_qc_flag = True - self.frac_nonzero = (total_bins - zero_bins) / total_bins if total_bins > 0 else 0 - self.reads = total_reads + self.frac_nonzero = ( + (total_bins - zero_bins) / total_bins if total_bins > 0 else 0 + ) + self.reads = total_reads self.total_bins = total_bins - return self.passed_qc_flag \ No newline at end of file + return self.passed_qc_flag diff --git a/src/coptr/cli.py b/src/coptr/cli.py new file mode 100644 index 0000000..1c383fb --- /dev/null +++ b/src/coptr/cli.py @@ -0,0 +1,600 @@ +""" +This file is part of CoPTR. + +CoPTR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +CoPTR is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with CoPTR. If not, see . +""" + +import argparse +import logging +import os +import os.path +import pickle as pkl +import sys + +import numpy as np + +from . 
import __version__ +from .bam_processor import BamProcessor +from .compute_read_counts import compute_read_counts +from .compute_rel_abun import compute_rel_abun +from .coptr_contig import estimate_ptrs_coptr_contig +from .coptr_ref import estimate_ptrs_coptr_ref +from .read_mapper import ReadMapper +from .util import get_fastq_name + + +logger = logging.getLogger(__name__) + + +class ProgramOptions: + def __init__(self): + parser = argparse.ArgumentParser( + description="CoPTR (v{}): Compute PTRs from complete reference genomes and assemblies.".format( + __version__ + ), + usage="""coptr [options] + +command: index create a bowtie2 index for a reference database + map map reads against a reference database + merge merge BAM files from reads mapped to multiple indexes + extract compute coverage maps from bam files + estimate estimate PTRs from coverage maps + count compute read counts for each genome after filtering + rabun estimate relative abundances for each genomes after filtering +""", + ) + self.default_bt2_k = 10 + + if len(sys.argv[1:]) < 1: + parser.print_help() + sys.exit(2) + + parser.add_argument("command", type=str, help="Command to run.") + args = parser.parse_args(sys.argv[1:2]) + + if not hasattr(self, args.command): + logger.critical("Unrecognized command.") + parser.print_help() + sys.exit(2) + getattr(self, args.command)() + + def index(self): + parser = argparse.ArgumentParser( + usage="coptr index [-h] [--bt2-bmax BT2_BMAX] [--bt2-dcv BT2_DCV] [--bt2-threads BT2_THREADS] [--bt2-packed] ref-fasta index-out" + ) + parser.add_argument( + "ref_fasta", + help="""File or folder containing fasta to index. If a folder, the extension for each + fasta must be one of [.fasta, .fna, .fa] + """, + ) + parser.add_argument("index_out", help="Filepath to store index.") + parser.add_argument( + "--bt2-bmax", + default=None, + help="Set the --bmax arguement for bowtie2-build. Used to control memory useage.", + ) + parser.add_argument( + "--bt2-dcv", + default=None, + help="Set the --dcv argument for bowtie2-build. Used to control memory usage.", + ) + parser.add_argument( + "--bt2-threads", + default="1", + help="Number of threads to pass to bowtie2-build.", + ) + parser.add_argument( + "--bt2-packed", + action="store_true", + help="Set the --packed flag for bowtie2-build. Used to control memory usage.", + ) + + if len(sys.argv[2:]) < 1: + parser.print_help() + sys.exit(2) + + args = parser.parse_args(sys.argv[2:]) + read_mapper = ReadMapper() + read_mapper.index( + args.ref_fasta, + args.index_out, + args.bt2_bmax, + args.bt2_dcv, + args.bt2_threads, + args.bt2_packed, + ) + + def map(self): + parser = argparse.ArgumentParser( + usage="coptr map [-h] [--threads INT] [--bt2-k INT] [--paired] index input out-folder" + ) + parser.add_argument("index", help="Name of database index.") + parser.add_argument( + "input", + help="""File or folder containing fastq reads to map. If a folder, the extension for + each fastq must be one of [.fastq, .fq, .fastq.gz, fq.gz] + """, + ) + parser.add_argument( + "out_folder", help="Folder to save mapped reads. BAM files are output here." + ) + parser.add_argument( + "--paired", + action="store_true", + help="Set for paired end reads. Assumes fastq files end in _1.* and _2.*", + ) + parser.add_argument( + "--threads", + type=int, + default=1, + help="Number of threads for bowtie2 mapping.", + ) + parser.add_argument( + "--bt2-k", + type=int, + default=self.default_bt2_k, + help="(Default 10). Number of alignments to report. 
Passed to -k flag of bowtie2.", + ) + + if len(sys.argv[2:]) < 1: + parser.print_help() + sys.exit(2) + + args = parser.parse_args(sys.argv[2:]) + read_mapper = ReadMapper() + read_mapper.map( + args.index, + args.input, + args.out_folder, + args.paired, + args.threads, + args.bt2_k, + ) + + def merge(self): + parser = argparse.ArgumentParser( + usage="coptr merge [-h] in-bam1 in-bam2 ... in-bamN out-bam" + ) + parser.add_argument( + "in-bams", + nargs="+", + help="A space separated list of BAM files to merge. Assumes same reads were mapped against different indexes. " + + "Only keeps read 1 of paired end sequencing, since this is used downstream.", + ) + parser.add_argument("out-bam", help="Path to merged BAM.") + + if len(sys.argv[2:]) < 2: + parser.print_help() + sys.exit(2) + + args = vars(parser.parse_args(sys.argv[2:])) + in_bams = args["in-bams"] + out_bam = args["out-bam"] + bam_processor = BamProcessor() + bam_processor.merge(in_bams, out_bam) + + def extract(self): + parser = argparse.ArgumentParser( + usage="coptr extract [-h] [--ref-genome-regex REF_GENOME_REGEX] [--check-regex] in-folder out-folder" + ) + parser.add_argument("in_folder", help="Folder with BAM files.") + parser.add_argument("out_folder", help="Folder to store coverage maps.") + parser.add_argument( + "--ref-genome-regex", + default="[^\|]+", + help="Regular expression extracting a reference genome id from the sequence id in a bam file.", + ) + parser.add_argument( + "--check-regex", + action="store_true", + default=False, + help="Check the regular expression by counting reference genomes without processing.", + ) + parser.add_argument( + "--bt2-k", + type=int, + default=self.default_bt2_k, + help="Maximum number of alignments.", + ) + + if len(sys.argv[2:]) < 1: + parser.print_help() + sys.exit(2) + + args = parser.parse_args(sys.argv[2:]) + + bam_processor = BamProcessor(args.ref_genome_regex) + ref_sequences = set() + ref_genomes = set() + for f in sorted(os.listdir(args.in_folder)): + fname, ext = os.path.splitext(f) + if ext == ".bam": + fpath = os.path.join(args.in_folder, f) + seq, gen = bam_processor.get_ref_names(fpath) + ref_sequences.update(seq) + ref_genomes.update(gen) + + if os.path.isfile( + os.path.join(args.out_folder, get_fastq_name(f) + ".cm.pkl") + ): + logger.info("Output for %s already found, skipping.", fname) + continue + + # don't process the rest of the bam file if we just want to + # sanity check the regular expression + if args.check_regex: + continue + + coverage_maps = bam_processor.process_bam(fpath, args.bt2_k) + with open( + os.path.join(args.out_folder, get_fastq_name(f) + ".cm.pkl"), "wb" + ) as f: + pkl.dump(coverage_maps, f) + + logger.info( + "Found %d reference sequences corresponding to %d genomes.", + len(ref_sequences), + len(ref_genomes), + ) + if args.check_regex: + logger.info("Reference genome ids:") + for ref in sorted(ref_genomes): + logger.info("\t%s", ref) + + def estimate(self): + parser = argparse.ArgumentParser( + usage="""usage: coptr estimate [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] [--threads THREADS] [--plot OUTFOLDER] [--restart] coverage-map-folder out-file + """ + ) + parser.add_argument( + "coverage_map_folder", + help="Folder with coverage maps computed from 'extract'.", + ) + parser.add_argument("out_file", help="Filename to store PTR table.") + parser.add_argument( + "--min-reads", + type=float, + help="Minimum number of reads required to compute a PTR (default 5000).", + default=5000, + ) + parser.add_argument( + 
"--min-cov", + type=float, + help="Fraction of nonzero bins required to compute a PTR (default 0.75).", + default=0.75, + ) + parser.add_argument( + "--min-samples", + type=float, + help="CoPTRContig only. Minimum number of samples required to reorder bins (default 5).", + default=5, + ) + parser.add_argument( + "--plot", + metavar="OUTFOLDER", + default=None, + help="Plot model fit for each PTR." + ) + parser.add_argument( + "--restart", + default=False, + action="store_true", + help="Restarts the estimation step using the genomes in the coverage-maps-genome folder.", + ) + parser.add_argument( + "--oriC", + default=False, + action="store_true", + help="Compute OriC estimates." + ) + + if len(sys.argv[2:]) < 1: + parser.print_help() + sys.exit(2) + + args = parser.parse_args(sys.argv[2:]) + sample_ids = set() + ref_genome_ids = set() + assembly_genome_ids = set() + + grouped_coverage_map_folder = os.path.join( + args.coverage_map_folder, "coverage-maps-genome" + ) + if not args.restart and not os.path.isdir(grouped_coverage_map_folder): + os.mkdir(grouped_coverage_map_folder) + + if args.restart: + logger.info("Restarting from files in %s.", grouped_coverage_map_folder) + for cm_file in sorted(os.listdir(grouped_coverage_map_folder)): + _, ext = os.path.splitext(cm_file) + if ext != ".pkl": + continue + + logger.info("Checking %s.", cm_file) + with open( + os.path.join(grouped_coverage_map_folder, cm_file), "rb" + ) as f: + try: + while True: + cm = pkl.load(f) + if cm.is_assembly: + assembly_genome_ids.add(cm.genome_id) + else: + ref_genome_ids.add(cm.genome_id) + sample_ids.add(cm.sample_id) + + except EOFError: + pass + + else: + logger.info("Grouping reads by reference genome.") + logger.info("Saving to %s:", grouped_coverage_map_folder) + + # first construct a list of genome_ids for ptr estimates + for f in sorted(os.listdir(args.coverage_map_folder)): + fname, ext = os.path.splitext(f) + if ext != ".pkl": + continue + fpath = os.path.join(args.coverage_map_folder, f) + + logger.info("\t%s", f) + with open(fpath, "rb") as file: + coverage_maps = pkl.load(file) + + for ref_id in coverage_maps: + sample_ids.add(coverage_maps[ref_id].sample_id) + + # don't load coverage maps of species in reference database without reads + if coverage_maps[ref_id].count_reads() < args.min_reads: + continue + + write_mode = ( + "ab+" + if ref_id in ref_genome_ids or ref_id in assembly_genome_ids + else "wb" + ) + + # append to genome file + with open( + os.path.join( + grouped_coverage_map_folder, ref_id + ".cm.pkl" + ), + write_mode, + ) as tofile: + pkl.dump(coverage_maps[ref_id], tofile) + + if coverage_maps[ref_id].is_assembly: + assembly_genome_ids.add(ref_id) + else: + ref_genome_ids.add(ref_id) + + del coverage_maps + logger.info("Grouping by reference genome: Complete.") + logger.info("The --restart flag can be used to start from here.") + + sample_ids = sorted(list(sample_ids)) + results_ref = estimate_ptrs_coptr_ref( + ref_genome_ids, + grouped_coverage_map_folder, + args.min_reads, + args.min_cov, + plot_folder=args.plot, + ) + results_contig, ref_id_to_bin_coords = estimate_ptrs_coptr_contig( + assembly_genome_ids, + grouped_coverage_map_folder, + args.min_reads, + args.min_samples, + plot_folder=args.plot, + ) + + out_file = args.out_file + _, ext = os.path.splitext(out_file) + if ext != ".csv": + out_file += ".csv" + + logger.info("Writing %s.", out_file) + + with open(out_file, "w") as f: + # write the header + f.write("log2(PTR):genome_id/sample_id") + for sample_id in sample_ids: + 
f.write(",{}".format(sample_id)) + f.write("\n") + + for genome_id in sorted(results_ref): + # don't write rows without estimates + estimates = [result.estimate for result in results_ref[genome_id]] + if np.all(np.isnan(estimates)): + continue + + f.write(genome_id + ",") + row = ["" for s in sample_ids] + for result in results_ref[genome_id]: + if not np.isnan(result.estimate): + row[sample_ids.index(result.sample_id)] = str(result.estimate) + f.write(",".join(row) + "\n") + + for genome_id in sorted(results_contig): + # don't write rows without estimates + estimates = [result.estimate for result in results_contig[genome_id]] + if np.all(np.isnan(estimates)): + continue + + f.write(genome_id + ",") + row = ["" for s in sample_ids] + for result in results_contig[genome_id]: + if not np.isnan(result.estimate): + row[sample_ids.index(result.sample_id)] = str(result.estimate) + f.write(",".join(row) + "\n") + + if args.oriC: + oric_file = os.path.splitext(out_file)[0] + "-oriC.csv" + with open(oric_file, "w") as f: + f.write("genome_id,oriC\n") + + for genome_id in sorted(results_ref): + orics = [ + result.ori_estimate_coord + for result in results_ref[genome_id] + if np.isfinite(result.ori_estimate_coord) + ] + oric_est = np.mean(orics) + f.write("{},{}\n".format(genome_id, oric_est)) + + for genome_id in sorted(ref_id_to_bin_coords): + bin_coords = ref_id_to_bin_coords[genome_id] + f.write("{},{}\n".format(genome_id, "\t".join(bin_coords))) + + logger.info("Done.") + logger.info("You may now remove the folder %s.", grouped_coverage_map_folder) + + def count(self): + parser = argparse.ArgumentParser( + usage="""usage: coptr count [-h] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] coverage-map-folder out-file + """ + ) + parser.add_argument( + "coverage_map_folder", + help="Folder with coverage maps computed from 'extract'.", + ) + parser.add_argument("out_file", help="Filename to store PTR table.") + parser.add_argument( + "--min-cov", + type=float, + help="Fraction of nonzero bins required to compute a PTR (default 0.75).", + default=0.75, + ) + parser.add_argument( + "--min-samples", + type=float, + help="CoPTRContig only. 
Minimum number of samples required to reorder bins (default 5).", + default=5, + ) + + if len(sys.argv[2:]) < 1: + parser.print_help() + exit(1) + + args = parser.parse_args(sys.argv[2:]) + + logger.info("Computing read counts.") + + counts, genome_ids = compute_read_counts( + args.coverage_map_folder, args.min_cov, args.min_samples + ) + + out_file = args.out_file + _, ext = os.path.splitext(out_file) + if ext != ".csv": + out_file += ".csv" + + logger.info("Writing %s.", out_file) + + with open(out_file, "w") as f: + # write the header + f.write("count:genome_id/sample_id") + for sample_id in counts: + f.write(",{}".format(sample_id)) + f.write("\n") + + for genome_id in sorted(genome_ids): + row = [genome_id] + for sample_id in counts: + if genome_id in counts[sample_id]: + row.append(str(counts[sample_id][genome_id])) + else: + row.append(str(0)) + row = ",".join(row) + "\n" + f.write(row) + + logger.info("Done.") + + def rabun(self): + parser = argparse.ArgumentParser( + usage="""usage: coptr rabun [-h] [--min-reads MIN_READS] [--min-cov MIN_COV] [--min-samples MIN_SAMPLES] coverage-map-folder out-file + """ + ) + parser.add_argument( + "coverage_map_folder", + help="Folder with coverage maps computed from 'extract'.", + ) + parser.add_argument("out_file", help="Filename to store PTR table.") + parser.add_argument( + "--min-reads", + type=float, + help="Minimum number of reads required to compute a PTR (default 5000).", + default=5000, + ) + parser.add_argument( + "--min-cov", + type=float, + help="Fraction of nonzero bins required to compute a PTR (default 0.75).", + default=0.75, + ) + parser.add_argument( + "--min-samples", + type=float, + help="CoPTRContig only. Minimum number of samples required to reorder bins (default 5).", + default=5, + ) + + if len(sys.argv[2:]) < 1: + parser.print_help() + exit(1) + + args = parser.parse_args(sys.argv[2:]) + + logger.info("Computing relative abundances.") + + rel_abun, genome_ids = compute_rel_abun( + args.coverage_map_folder, args.min_reads, args.min_cov, args.min_samples + ) + + out_file = args.out_file + _, ext = os.path.splitext(out_file) + if ext != ".csv": + out_file += ".csv" + + logger.info("Writing %s.", out_file) + + with open(out_file, "w") as f: + # write the header + f.write("rel_abun:genome_id/sample_id") + for sample_id in rel_abun: + f.write(",{}".format(sample_id)) + f.write("\n") + + for genome_id in sorted(genome_ids): + row = [genome_id] + for sample_id in rel_abun: + if genome_id in rel_abun[sample_id]: + row.append(str(rel_abun[sample_id][genome_id])) + else: + row.append(str(0)) + row = ",".join(row) + "\n" + f.write(row) + + logger.info("Done.") + + +def cli(): + """Serve as an entry point for command line calls.""" + logging.basicConfig( + level="INFO", + format="[%(levelname)s] [%(asctime)s] [%(name)s] %(message)s", + datefmt="%b %d, %Y %H:%M:%S", + ) + ProgramOptions() diff --git a/src/compute_read_counts.py b/src/coptr/compute_read_counts.py similarity index 63% rename from src/compute_read_counts.py rename to src/coptr/compute_read_counts.py index c8d568c..950f6be 100644 --- a/src/compute_read_counts.py +++ b/src/coptr/compute_read_counts.py @@ -21,20 +21,24 @@ along with CoPTR. If not, see . 
""" -import numpy as np +import logging import os import pickle as pkl -from src.coptr_contig import CoPTRContig -from src.coptr_ref import ReadFilterRef -from src.print import print_info, print_warning +import numpy as np + +from .coptr_contig import CoPTRContig +from .coptr_ref import ReadFilterRef -def compute_read_counts_from_coverage_maps(coverage_maps): +logger = logging.getLogger(__name__) + + +def compute_read_counts_from_coverage_maps(coverage_maps, min_cov, min_samples): # instantiate classes for filtering methods - coptr_contig = CoPTRContig(5000, 5) - rf_ref = ReadFilterRef(5000, 0.75) + coptr_contig = CoPTRContig(1, min_samples) + rf_ref = ReadFilterRef(1, min_cov) total_passing_reads = 0 read_counts = {} sample_id = None @@ -46,49 +50,55 @@ def compute_read_counts_from_coverage_maps(coverage_maps): if cm.is_assembly and cm.passed_qc(): binned_reads = coptr_contig.construct_coverage_matrix([cm]) - lower_bound, upper_bound = coptr_contig.compute_genomewide_bounds(binned_reads) - count = binned_reads[np.logical_and(binned_reads >= lower_bound, binned_reads <= upper_bound)].sum() + lower_bound, upper_bound = coptr_contig.compute_genomewide_bounds( + binned_reads + ) + count = binned_reads[ + np.logical_and(binned_reads >= lower_bound, binned_reads <= upper_bound) + ].sum() read_counts[cm.genome_id] = count total_passing_reads += count genome_ids.add(cm.genome_id) elif not cm.is_assembly: - filtered_reads, filtered_length, qc_result = rf_ref.filter_reads(cm.read_positions, cm.length) + filtered_reads, filtered_length, qc_result = rf_ref.filter_reads( + cm.read_positions, cm.length + ) if qc_result.passed_qc: count = filtered_reads.size read_counts[cm.genome_id] = count total_passing_reads += count + genome_ids.add(cm.genome_id) - genome_ids.add(cm.genome_id) + return sample_id, read_counts, genome_ids - rel_abun = {} - for genome_id in read_counts: - rel_abun[genome_id] = read_counts[genome_id] / total_passing_reads - return sample_id, rel_abun, genome_ids +def compute_read_counts(coverage_map_folder, min_cov, min_samples): - - - -def compute_read_counts(coverage_map_folder): - - rel_abundances = {} + read_counts = {} genome_ids = set() for f in sorted(os.listdir(coverage_map_folder)): fname, ext = os.path.splitext(f) - if ext != ".pkl": continue + if ext != ".pkl": + continue fpath = os.path.join(coverage_map_folder, f) - print_info("Count", "\tprocessing {}".format(f)) + logger.info("\t%s", f) with open(fpath, "rb") as file: coverage_maps = pkl.load(file) - sample_id, sample_rel_abun, sample_genome_ids = compute_read_counts_from_coverage_maps(coverage_maps) + ( + sample_id, + sample_read_counts, + sample_genome_ids, + ) = compute_read_counts_from_coverage_maps( + coverage_maps, min_cov, min_samples + ) if sample_id is not None: - rel_abundances[sample_id] = sample_rel_abun + read_counts[sample_id] = sample_read_counts genome_ids.update(sample_genome_ids) - return rel_abundances, genome_ids \ No newline at end of file + return read_counts, genome_ids diff --git a/src/coptr/compute_rel_abun.py b/src/coptr/compute_rel_abun.py new file mode 100644 index 0000000..ac56fcb --- /dev/null +++ b/src/coptr/compute_rel_abun.py @@ -0,0 +1,119 @@ +""" +compute_rel_abun.py +====================== +Estimate relative abundances. +""" + +""" +This file is part of CoPTR. 
+ +CoPTR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +CoPTR is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with CoPTR. If not, see . +""" + +import logging +import os +import pickle as pkl + +import numpy as np + +from .coptr_contig import CoPTRContig +from .coptr_ref import ReadFilterRef + + +logger = logging.getLogger(__name__) + + +def compute_rel_abun_from_coverage_maps(coverage_maps, min_reads, min_cov, min_samples): + # instantiate classes for filtering methods + + coptr_contig = CoPTRContig(min_reads, min_samples) + rf_ref = ReadFilterRef(min_reads, min_cov) + total_passing_reads = 0 + read_counts = {} + genome_lengths = {} + sample_id = None + genome_ids = set() + for genome_id in coverage_maps: + cm = coverage_maps[genome_id] + sample_id = cm.sample_id + + if cm.is_assembly and cm.passed_qc(): + binned_reads = coptr_contig.construct_coverage_matrix([cm]) + + lower_bound, upper_bound = coptr_contig.compute_genomewide_bounds( + binned_reads + ) + count = binned_reads[ + np.logical_and(binned_reads >= lower_bound, binned_reads <= upper_bound) + ].sum() + read_counts[cm.genome_id] = count + genome_lengths[cm.genome_id] = binned_reads.shape[0] * cm.compute_bin_size() + total_passing_reads += count + genome_ids.add(cm.genome_id) + + elif not cm.is_assembly: + filtered_reads, filtered_length, qc_result = rf_ref.filter_reads( + cm.read_positions, cm.length + ) + + if qc_result.passed_qc: + count = filtered_reads.size + read_counts[cm.genome_id] = count + genome_lengths[cm.genome_id] = filtered_length + total_passing_reads += count + genome_ids.add(cm.genome_id) + + rel_abun = {} + normalizing_constant = 0 + for genome_id in read_counts: + unnormalized_rel_abun = ( + read_counts[genome_id] / total_passing_reads + ) / genome_lengths[genome_id] + rel_abun[genome_id] = unnormalized_rel_abun + normalizing_constant += unnormalized_rel_abun + + for genome_id in read_counts: + rel_abun[genome_id] = rel_abun[genome_id] / normalizing_constant + + return sample_id, rel_abun, genome_ids + + +def compute_rel_abun(coverage_map_folder, min_reads, min_cov, min_samples): + + rel_abun = {} + genome_ids = set() + for f in sorted(os.listdir(coverage_map_folder)): + fname, ext = os.path.splitext(f) + if ext != ".pkl": + continue + fpath = os.path.join(coverage_map_folder, f) + + logger.info("\t%s", f) + + with open(fpath, "rb") as file: + coverage_maps = pkl.load(file) + + ( + sample_id, + sample_rel_abun, + sample_genome_ids, + ) = compute_rel_abun_from_coverage_maps( + coverage_maps, min_reads, min_cov, min_samples + ) + + if sample_id is not None: + rel_abun[sample_id] = sample_rel_abun + genome_ids.update(sample_genome_ids) + + return rel_abun, genome_ids diff --git a/src/coptr_contig.py b/src/coptr/coptr_contig.py similarity index 71% rename from src/coptr_contig.py rename to src/coptr/coptr_contig.py index 11c4ded..e16587f 100644 --- a/src/coptr_contig.py +++ b/src/coptr/coptr_contig.py @@ -21,19 +21,22 @@ along with CoPTR. If not, see . 
""" -import math +import logging import multiprocessing as mp -import numpy as np import os.path import pickle as pkl +import struct +import sys + +import numpy as np import scipy.special import scipy.stats -import sys -from src.poisson_pca import PoissonPCA -from src.print import print_info, print_warning, print_error +from .poisson_pca import PoissonPCA +logger = logging.getLogger(__name__) + class CoPTRContigEstimate: """Data structure to store CoPTRContig estimates. @@ -67,8 +70,13 @@ def __init__(self, bam_file, genome_id, sample_id, estimate, nreads, cov_frac): def __str__(self): return "CoPTRContigEstimate(bam_file={}, genome_id={}, sample_id={}, estimate={:.3f}, nreads={}, cov_frac={})".format( - self.bam_file, self.genome_id, self.sample_id, self.estimate, self.nreads, self.cov_frac - ) + self.bam_file, + self.genome_id, + self.sample_id, + self.estimate, + self.nreads, + self.cov_frac, + ) def __repr__(self): return self.__str__() @@ -76,7 +84,7 @@ def __repr__(self): class CoPTRContig: """Estimate PTRs from draft assemblies. - + Parameters ---------- min_reads : float @@ -89,7 +97,6 @@ def __init__(self, min_reads, min_samples): self.min_reads = min_reads self.min_samples = min_samples - def compute_genomewide_bounds(self, binned_counts, crit_region=0.05): """Compute bounds on read counts in bins. @@ -113,14 +120,13 @@ def compute_genomewide_bounds(self, binned_counts, crit_region=0.05): std = np.max((1, std)) norm = scipy.stats.norm() - m = norm.ppf(1-crit_region) + m = norm.ppf(1 - crit_region) - lower_bound = np.power(2, median - m*std) - upper_bound = np.power(2, median + m*std) + lower_bound = np.power(2, median - m * std) + upper_bound = np.power(2, median + m * std) return lower_bound, upper_bound - def construct_coverage_matrix(self, coverage_maps): """Aggregate coverage maps into a bin by sample matrix. @@ -137,31 +143,49 @@ def construct_coverage_matrix(self, coverage_maps): A = [] # find all common contig ids - contig_id_list = [cm.contig_ids for cm in coverage_maps] contig_ids = coverage_maps[0].contig_ids ref_genome = coverage_maps[0].genome_id + bin_coords = [] + contigs_seen = set() + for cm in coverage_maps: binned_reads = [] for contig_id in sorted(contig_ids): if contig_id not in cm.contig_ids: - print_error("CoPTRContig", "missing contig {} from {} in {}".format(contig_id, ref_genome, cm.sample_id), quit=True) + logger.critical( + "Missing contig %s from %s in %s", + contig_id, + ref_genome, + cm.sample_id, + ) + sys.exit(1) length = cm.get_length(contig_id) + bin_size = cm.compute_bin_size() if length < 11000: continue else: - binned_reads.append(cm.bin_reads(contig_id)) + bins = cm.bin_reads(contig_id) + binned_reads.append(bins) + + if contig_id not in contigs_seen: + pos = 0 + for b in bins: + bin_coords.append(contig_id + "-{}".format(int(pos))) + pos += bin_size + contigs_seen.add(contig_id) row = np.concatenate(binned_reads) A.append(row) # bin by sample A = np.array(A).T - return A + bin_coords = np.array(bin_coords) + return A, bin_coords def estimate_slope_intercept(self, bin_log_probs): """Estimate the slope along reordering bins using least squares. 
@@ -181,16 +205,15 @@ def estimate_slope_intercept(self, bin_log_probs): rng = np.linspace(0, 1, bin_log_probs.size) y = bin_log_probs x = np.ones((y.size, 2)) - x[:,0] = rng + x[:, 0] = rng sol, residuals, rank, s = np.linalg.lstsq(x, y, rcond=None) - m,b = sol + m, b = sol - pred = m*rng + b + pred = m * rng + b residuals = y - pred - return m,b,residuals - + return m, b, residuals def compute_log_likelihood(self, log2_ptr, binned_counts): """Compute the log likelihood of reordered bin probabilities given a log2(PTR). @@ -214,7 +237,7 @@ def compute_log_likelihood(self, log2_ptr, binned_counts): x = np.linspace(0, 1, binned_counts.size) # change to base e loge_ptr = log2_ptr / np.log2(np.e) - log_probs = np.array([loge_ptr*xi for xi in x]) + log_probs = np.array([loge_ptr * xi for xi in x]) log_norm = scipy.special.logsumexp(log_probs) log_probs -= log_norm @@ -224,11 +247,10 @@ def compute_log_likelihood(self, log2_ptr, binned_counts): # log_norm = np.log2(np.power(2, log_probs).sum()) # log_probs -= log_norm - log_lk = (binned_counts*log_probs).sum() + log_lk = (binned_counts * log_probs).sum() return log_lk - def estimate_ptr_maximum_likelihood(self, binned_counts): """Compute maximum likelihood estimates of log2(PTR) gived observed reads. @@ -255,7 +277,7 @@ def estimate_ptr_maximum_likelihood(self, binned_counts): x = np.linspace(0, 1, binned_counts.size) # compute normalized probabilities loge_ptr = log2_ptr / np.log2(np.e) - log_probs = np.array([loge_ptr*xi for xi in x]) + log_probs = np.array([loge_ptr * xi for xi in x]) log_norm = scipy.special.logsumexp(log_probs) log_probs -= log_norm probs = np.exp(log_probs) @@ -271,10 +293,9 @@ def estimate_ptr_maximum_likelihood(self, binned_counts): return log2_ptr, intercept, log_lk, flip - def estimate_ptrs(self, coverage_maps, return_bins=False): """Estimate PTRs across multiple samples of the same reference genome. 
- + Parameters ---------- coverage_maps : list[CoverageMapContig] @@ -313,7 +334,14 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): else: bam_file = cm.bam_file genome_id = cm.genome_id - estimate = CoPTRContigEstimate(cm.bam_file, cm.genome_id, cm.sample_id, np.nan, cm.reads, cm.frac_nonzero) + estimate = CoPTRContigEstimate( + cm.bam_file, + cm.genome_id, + cm.sample_id, + np.nan, + cm.reads, + cm.frac_nonzero, + ) estimates.append(estimate) parameters.append((np.nan, np.nan)) binned_counts.append([np.nan]) @@ -329,10 +357,9 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): genome_id = coverage_maps[0].genome_id # bin by sample - A = self.construct_coverage_matrix(passing_coverage_maps) + A, bin_coords = self.construct_coverage_matrix(passing_coverage_maps) nbins = A.shape[0] - # filter out low coverage and high coverage bins, # remove samples with too few reads A_filtered = [] @@ -342,9 +369,11 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): tmp = [] np.set_printoptions(suppress=True) - for i,col in enumerate(A.T): + for i, col in enumerate(A.T): - lower_bound, upper_bound = self.compute_genomewide_bounds(col, crit_region=0.05) + lower_bound, upper_bound = self.compute_genomewide_bounds( + col, crit_region=0.05 + ) col[np.logical_or(col > upper_bound, col < lower_bound)] = np.nan frac_filtered = np.isnan(col).sum() / col.size @@ -361,17 +390,25 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): sample_id = passing_coverage_maps[i].sample_id frac_nonzero = passing_coverage_maps[i].frac_nonzero m = np.nan - estimate = CoPTRContigEstimate(bam_file, genome_id, sample_id, np.nan, reads, frac_nonzero) + estimate = CoPTRContigEstimate( + bam_file, genome_id, sample_id, np.nan, reads, frac_nonzero + ) estimates.append(estimate) parameters.append((np.nan, np.nan)) binned_counts.append([np.nan]) - if len(A_filtered) < self.min_samples: for cm in tmp: bam_file = cm.bam_file genome_id = cm.genome_id - estimate = CoPTRContigEstimate(cm.bam_file, cm.genome_id, cm.sample_id, np.nan, cm.reads, cm.frac_nonzero) + estimate = CoPTRContigEstimate( + cm.bam_file, + cm.genome_id, + cm.sample_id, + np.nan, + cm.reads, + cm.frac_nonzero, + ) estimates.append(estimate) parameters.append((np.nan, np.nan)) binned_counts.append([np.nan]) @@ -382,19 +419,39 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): else: tmp = None - print_info("CoPTRContig", "running {}".format(genome_id)) + logger.info("Running %s.", genome_id) A = np.array(A_filtered).T # filter out rows where too many bins are missing - keep_rows = np.sum(np.isnan(A), axis=1) < 0.025*A.shape[1] - A = A[keep_rows,:] - - if keep_rows.sum() < 0.5*keep_rows.size: - for j,col in enumerate(A.T): + keep_rows = np.sum(np.isnan(A), axis=1) < 0.025 * A.shape[1] + A = A[keep_rows, :] + bin_coords = bin_coords[keep_rows] + + ppca_failed = False + try: + if keep_rows.sum() >= self.min_samples: + poisson_pca = PoissonPCA() + W, V = poisson_pca.pca(A, k=1) + except: + ppca_failed = True + logger.warn("PoissonPCA failed for assembly " + genome_ids[0]) + + + if keep_rows.sum() < self.min_samples or ppca_failed: + for j, col in enumerate(A.T): reads = col[np.isfinite(col)].sum() - frac_nonzero = (col[np.isfinite(col)] > 0).sum() / col[np.isfinite(col)].size - estimate = CoPTRContigEstimate(bam_files[j], genome_ids[j], sample_ids[j], np.nan, reads, frac_nonzero) + frac_nonzero = (col[np.isfinite(col)] > 0).sum() / col[ + np.isfinite(col) + ].size + estimate = CoPTRContigEstimate( + bam_files[j], + 
genome_ids[j], + sample_ids[j], + np.nan, + reads, + frac_nonzero, + ) estimates.append(estimate) parameters.append((np.nan, np.nan)) binned_counts.append([np.nan]) @@ -404,19 +461,19 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): else: return estimates - poisson_pca = PoissonPCA() - W,V = poisson_pca.pca(A,k=1) sorted_idx = np.argsort(W.flatten()) - A = A[sorted_idx,:] + A = A[sorted_idx, :] + bin_coords = bin_coords[sorted_idx] - for j,col in enumerate(A.T): + flipped_bins = False + for j, col in enumerate(A.T): col = col[np.isfinite(col)] reads = col.sum() - lb = int(0.05*col.size) - ub = int(0.95*col.size) + lb = int(0.05 * col.size) + ub = int(0.95 * col.size) col = col[lb:ub] - m,b,log_lk,flip = self.estimate_ptr_maximum_likelihood(col) + m, b, log_lk, flip = self.estimate_ptr_maximum_likelihood(col) # m,b,res = self.estimate_slope_intercept(col) # flip = m < 0 @@ -424,23 +481,28 @@ def estimate_ptrs(self, coverage_maps, return_bins=False): # so let's look at positive slopes only m = np.abs(m) frac_nonzero = (col > 0).sum() / col.size - estimate = CoPTRContigEstimate(bam_files[j], genome_ids[j], sample_ids[j], m, reads, frac_nonzero) + estimate = CoPTRContigEstimate( + bam_files[j], genome_ids[j], sample_ids[j], m, reads, frac_nonzero + ) estimates.append(estimate) parameters.append((m, b)) bc = col if flip: + flipped_bins = True bc = np.flip(bc) binned_counts.append(bc) + if flipped_bins: + bin_coords = np.flip(bin_coords) - print_info("CoPTRContig", "finished {}".format(genome_id)) + logger.info("Finished %s.", genome_id) if return_bins: - return estimates, parameters, binned_counts + return estimates, parameters, binned_counts, bin_coords else: - return estimates + return estimates, bin_coords def _parallel_helper(self, x): ref_genome = x[0] @@ -448,13 +510,14 @@ def _parallel_helper(self, x): plot_folder = x[2] if plot_folder is not None: - estimates, parameters, reordered_bins = self.estimate_ptrs(coverage_maps, return_bins=True) + estimates, parameters, reordered_bins, bin_coords = self.estimate_ptrs( + coverage_maps, return_bins=True + ) plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder) else: - estimates = self.estimate_ptrs(coverage_maps) - - return (ref_genome, estimates) + estimates, bin_coords = self.estimate_ptrs(coverage_maps) + return (ref_genome, estimates), bin_coords def plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder): @@ -463,16 +526,17 @@ def plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder): coptr_contig = CoPTRContig(min_reads=5000, min_samples=5) # make sure esimates and coverage maps are in the same order - order = dict([ (est.sample_id, i) for i,est in enumerate(estimates)]) + order = dict([(est.sample_id, i) for i, est in enumerate(estimates)]) tmp = [[] for cm in coverage_maps] for cm in coverage_maps: idx = order[cm.sample_id] tmp[idx] = cm coverage_maps = tmp - nplots = 0 - for estimate,p,cm,rbins in zip(estimates, parameters, coverage_maps, reordered_bins): + for estimate, p, cm, rbins in zip( + estimates, parameters, coverage_maps, reordered_bins + ): if np.isnan(estimate.estimate): continue @@ -481,7 +545,8 @@ def plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder): binned_counts = [] for cid in contig_ids: length = cm.get_length(cid) - if length < 11000: continue + if length < 11000: + continue binned_counts += cm.bin_reads(cid).tolist() @@ -490,25 +555,37 @@ def plot_fit(estimates, parameters, coverage_maps, reordered_bins, 
plot_folder): median = np.median(binned_counts[binned_counts != 0]) - fig,ax = plt.subplots(nrows=2,ncols=1, figsize=(9,6)) + fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(9, 6)) # plot unordered bins x1 = np.linspace(0, 1, binned_counts.size) ax[0].scatter(x1, binned_counts) ax[0].set_yscale("log", base=2) - ax[0].plot(x1, np.ones(binned_counts.size)*median, c="red", linewidth=3) - ax[0].plot(x1, np.ones(binned_counts.size)*upper_bound, c="red", linestyle=":", linewidth=3) - ax[0].plot(x1, np.ones(binned_counts.size)*lower_bound, c="red", linestyle=":", linewidth=3) + ax[0].plot(x1, np.ones(binned_counts.size) * median, c="red", linewidth=3) + ax[0].plot( + x1, + np.ones(binned_counts.size) * upper_bound, + c="red", + linestyle=":", + linewidth=3, + ) + ax[0].plot( + x1, + np.ones(binned_counts.size) * lower_bound, + c="red", + linestyle=":", + linewidth=3, + ) y_lim = ax[0].get_ylim() ax[0].set_ylabel("Read Count") ax[0].set_xlabel("Unordered Bins") # plot model fit emp_probs = rbins / rbins.sum() - m,b = p # slope intercept in log space + m, b = p # slope intercept in log space x2 = np.linspace(0, 1, emp_probs.size) - y = np.array([m*xi + b for xi in x2]) + y = np.array([m * xi + b for xi in x2]) y = np.power(2, y) y /= y.sum() ax[1].scatter(x2, emp_probs, c="C1") @@ -516,15 +593,20 @@ def plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder): ax[1].set_yscale("log", base=2) ax[1].set_ylabel("Density") ax[1].set_xlabel("Reordered Bins") - ax[1].set_title("\nlog2(PTR)={:.3f} (Reads = {})".format(m, int(estimate.nreads))) + ax[1].set_title( + "\nlog2(PTR)={:.3f} (Reads = {})".format(m, int(estimate.nreads)) + ) plt.tight_layout() - plt.savefig(os.path.join(plot_folder, estimate.sample_id + "-" + estimate.genome_id + ".pdf")) + plt.savefig( + os.path.join( + plot_folder, estimate.sample_id + "-" + estimate.genome_id + ".pdf" + ) + ) plt.close() nplots += 1 - def load_coverage_maps(file_handle, ref_id): reached_end = False coverage_maps = [] @@ -539,8 +621,13 @@ def load_coverage_maps(file_handle, ref_id): return coverage_maps - -def estimate_ptrs_coptr_contig(assembly_genome_ids, grouped_coverage_map_folder, min_reads, min_samples, threads, plot_folder=None): +def estimate_ptrs_coptr_contig( + assembly_genome_ids, + grouped_coverage_map_folder, + min_reads, + min_samples, + plot_folder=None, +): """Estimate Peak-to-Trough ratios across samples. Parameters @@ -564,37 +651,25 @@ def estimate_ptrs_coptr_contig(assembly_genome_ids, grouped_coverage_map_folder, A dictionary with key reference genome id and value a list of CoPTRContigEstimate for that reference genome. 
""" - print_info("CoPTRContig", "checking reference genomes") + logger.info("Checking reference genomes.") coptr_contig_estimates = {} coptr_contig = CoPTRContig(min_reads, min_samples) - if threads == 1: - for ref_id in sorted(assembly_genome_ids): - with open(os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb") as f: - coverage_maps = load_coverage_maps(f, ref_id) + ref_id_to_bin_coords = {} - if plot_folder is not None: - estimates, parameters, reordered_bins = coptr_contig.estimate_ptrs(coverage_maps, return_bins=True) - plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder) - else: - estimates = coptr_contig.estimate_ptrs(coverage_maps) - - coptr_contig_estimates[ref_id] = estimates - - else: - # workers for multiprocessing - pool = mp.Pool(threads) - coverage_maps = {} - - for i,ref_id in enumerate(sorted(assembly_genome_ids)): - with open(os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb") as f: - coverage_maps[ref_id] = load_coverage_maps(f, ref_id) + for ref_id in sorted(assembly_genome_ids): + with open(os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb") as f: + coverage_maps = load_coverage_maps(f, ref_id) - if (i % threads == 0 and i > 0) or i == len(assembly_genome_ids) - 1: - flat_coverage_maps = [(ref_id, coverage_maps[ref_id], plot_folder) for ref_id in coverage_maps] - flat_results = pool.map(coptr_contig._parallel_helper, flat_coverage_maps) - coptr_contig_estimates.update(dict(flat_results)) + if plot_folder is not None: + estimates, parameters, reordered_bins, bin_coords = ( + coptr_contig.estimate_ptrs(coverage_maps, return_bins=True) + ) + plot_fit(estimates, parameters, coverage_maps, reordered_bins, plot_folder) + else: + estimates, bin_coords = coptr_contig.estimate_ptrs(coverage_maps) - coverage_maps = {} + coptr_contig_estimates[ref_id] = estimates + ref_id_to_bin_coords[ref_id] = bin_coords - return coptr_contig_estimates \ No newline at end of file + return coptr_contig_estimates, ref_id_to_bin_coords diff --git a/src/coptr_ref.py b/src/coptr/coptr_ref.py similarity index 72% rename from src/coptr_ref.py rename to src/coptr/coptr_ref.py index 798e6a6..6424818 100644 --- a/src/coptr_ref.py +++ b/src/coptr/coptr_ref.py @@ -21,16 +21,20 @@ along with CoPTR. If not, see . """ +import logging import math import multiprocessing as mp -import numpy as np import os.path import pickle as pkl +import struct +import sys + +import numpy as np import scipy.optimize import scipy.stats -import sys -from src.print import print_info, print_warning + +logger = logging.getLogger(__name__) class QCResult: @@ -62,6 +66,28 @@ def __str__(self): def __repr__(self): return self.__str__() + +class CoordinateMap: + """Map post-filtering genomic coordinates to pre-filtering + genomic coordinates. + """ + + def __init__(self): + self.break_points = [] + + def update_break_points(self, break_point, shift_size): + self.break_points.append((break_point, shift_size)) + + def translate(self, filtered_coord): + unfiltered_coord = filtered_coord + for break_point, shift_size in self.break_points: + if unfiltered_coord >= break_point: + unfiltered_coord += shift_size + return unfiltered_coord + + + + class ReadFilterRef: """Read filtering steps for CoPTR Ref. @@ -77,7 +103,6 @@ def __init__(self, min_reads, min_cov): self.min_reads = min_reads self.min_cov = min_cov - def filter_reads(self, read_positions, genome_length): """Filter out reads in regions of the genome where the coverage is either too high or too low. 
@@ -101,20 +126,38 @@ def filter_reads(self, read_positions, genome_length): # don't filter if there are too few reads # just return that the sample failed QC if len(read_positions) < self.min_reads: - return np.array([]), genome_length, QCResult(-1, len(read_positions), -1, False) + return ( + np.array([]), + genome_length, + QCResult(-1, len(read_positions), -1, False), + ) + coord_map = CoordinateMap() bin_size = self.compute_bin_size(genome_length) read_positions = np.copy(read_positions) - filtered_read_positions, filtered_genome_length = self.filter_reads_phase1(read_positions, genome_length, bin_size) + filtered_read_positions, filtered_genome_length = self.filter_reads_phase1( + read_positions, genome_length, bin_size, coord_map + ) # more than 25% of the genome is removed frac_removed = 1 - filtered_genome_length / genome_length - if frac_removed > 0.25: - return np.array(filtered_read_positions), filtered_genome_length, QCResult(-1, filtered_read_positions.size, frac_removed, False) - - filtered_read_positions, filtered_genome_length = self.filter_reads_phase2(filtered_read_positions, filtered_genome_length, bin_size) - qc_result = self.quality_check(filtered_read_positions, filtered_genome_length, genome_length, bin_size, self.min_reads, self.min_cov) - return filtered_read_positions, filtered_genome_length, qc_result - + if frac_removed > 0.25: + return ( + np.array(filtered_read_positions), + filtered_genome_length, + QCResult(-1, filtered_read_positions.size, frac_removed, False), + ) + + filtered_read_positions, filtered_genome_length = self.filter_reads_phase2( + filtered_read_positions, filtered_genome_length, bin_size, coord_map + ) + qc_result = self.quality_check( + filtered_read_positions, + filtered_genome_length, + genome_length, bin_size, + self.min_reads, + self.min_cov + ) + return filtered_read_positions, filtered_genome_length, qc_result, coord_map def compute_bin_size(self, genome_length): """Compute bin size for read counts. @@ -136,10 +179,24 @@ def compute_bin_size(self, genome_length): # want a number divisible by 100 for downstream steps bin_size = bin_size - (bin_size % 100) - return bin_size + if bin_size < 1: + logger.error( + "Found complete reference genome with <500bp. " + "This is probably due to a mislabeled contig. " + "Please check your .genomes file." + ) + return bin_size - def quality_check(self, filtered_read_positions, filtered_genome_length, original_genome_length, bin_size, min_reads, frac_nonzero): + def quality_check( + self, + filtered_read_positions, + filtered_genome_length, + original_genome_length, + bin_size, + min_reads, + frac_nonzero, + ): """A basic quality check for required coverage of a genome in a sample. 
Parameters @@ -161,11 +218,15 @@ def quality_check(self, filtered_read_positions, filtered_genome_length, origina qc_result : QCResult Object that stores the result of the quality check """ - binned_counts = self.bin_reads(filtered_read_positions, filtered_genome_length, bin_size) + binned_counts = self.bin_reads( + filtered_read_positions, filtered_genome_length, bin_size + ) nonzero_bins = (binned_counts > 0).sum() / np.max((1, binned_counts.size)) nreads = binned_counts.sum() - passed_qc = True if nonzero_bins >= frac_nonzero and nreads >= min_reads else False + passed_qc = ( + True if nonzero_bins >= frac_nonzero and nreads >= min_reads else False + ) frac_removed = 1 - filtered_genome_length / original_genome_length if frac_removed > 0.25: @@ -173,7 +234,6 @@ def quality_check(self, filtered_read_positions, filtered_genome_length, origina return QCResult(nonzero_bins, nreads, frac_removed, passed_qc) - def compute_genomewide_bounds(self, binned_counts): """Compute bounds on read counts in bins. @@ -198,12 +258,11 @@ def compute_genomewide_bounds(self, binned_counts): norm = scipy.stats.norm() m = norm.ppf(0.95) - lower_bound = np.power(2, median - m*std) - upper_bound = np.power(2, median + m*std) + lower_bound = np.power(2, median - m * std) + upper_bound = np.power(2, median + m * std) return lower_bound, upper_bound - def compute_bin_bounds(self, binned_counts): """Compute bounds on read counts in a smaller window along the genome. @@ -226,12 +285,11 @@ def compute_bin_bounds(self, binned_counts): norm = scipy.stats.norm() m = norm.ppf(0.975) - lower_bound = np.power(2, median - m*std) - upper_bound = np.power(2, median + m*std) + lower_bound = np.power(2, median - m * std) + upper_bound = np.power(2, median + m * std) return lower_bound, upper_bound - def compute_rolling_sum(self, read_positions, genome_length, bin_size): """Compute a rolling sum of read counts in 10Kb bins, sliding 1Kb at a time. @@ -249,14 +307,14 @@ def compute_rolling_sum(self, read_positions, genome_length, bin_size): rolling_counts : np.array of float The read count in each rolling bin endpoints : np.array of tuple - Each tuple gives the left (inclusive) and right (exclusive) + Each tuple gives the left (inclusive) and right (exclusive) end point of each bin. """ - step = math.ceil(bin_size/100) + step = math.ceil(bin_size / 100) s = read_positions.size sorted_reads = np.sort(read_positions) - endpoints = np.array([(i, i+bin_size) for i in range(0, genome_length, step)]) + endpoints = np.array([(i, i + bin_size) for i in range(0, genome_length, step)]) left_endpoints = [e[0] for e in endpoints] right_endpoints = [e[1] for e in endpoints] @@ -267,9 +325,9 @@ def compute_rolling_sum(self, read_positions, genome_length, bin_size): return rolling_counts, endpoints - def remove_reads_by_region(self, read_positions, genome_length, regions): + def remove_reads_by_region(self, read_positions, genome_length, regions, coord_map): """Remove reads that overlap a region in regions. 
- + Parameters ---------- read_positions : np.array of int @@ -291,20 +349,25 @@ def remove_reads_by_region(self, read_positions, genome_length, regions): new_genome_length = genome_length filter_count = 0 adjustment = 0 - for left,right in regions: + for left, right in regions: remove_size = int(right - left) new_genome_length -= remove_size - keep = np.logical_or(read_positions < (left - adjustment), read_positions >= (right - adjustment)) + keep = np.logical_or( + read_positions < (left - adjustment), + read_positions >= (right - adjustment), + ) adjustment += remove_size - filter_count += (read_positions.size - keep.sum()) + filter_count += read_positions.size - keep.sum() read_positions = read_positions[keep] read_positions[read_positions > (right - adjustment)] -= remove_size + coord_map.update_break_points(right - remove_size, remove_size) + return read_positions, new_genome_length - def filter_reads_phase1(self, read_positions, genome_length, bin_size): + def filter_reads_phase1(self, read_positions, genome_length, bin_size, coord_map): """A coarse-grained genomewide filter that removes reads in ultra-high or ultra-low coverage regions. @@ -326,36 +389,40 @@ def filter_reads_phase1(self, read_positions, genome_length, bin_size): binned_reads = self.bin_reads(read_positions, genome_length, bin_size) lower_bound, upper_bound = self.compute_genomewide_bounds(binned_reads) - rolling_counts, endpoints = self.compute_rolling_sum(read_positions, genome_length, bin_size) + rolling_counts, endpoints = self.compute_rolling_sum( + read_positions, genome_length, bin_size + ) # find regions to remove remove_start = None remove_end = None remove_regions = [] - for i,c in enumerate(rolling_counts): + for i, c in enumerate(rolling_counts): left, right = endpoints[i] if (c > upper_bound or c < lower_bound) and remove_start is None: remove_start = left elif c < upper_bound and c > lower_bound and remove_start is not None: - remove_end = left # want endpoint of previous bin - remove_regions.append( (remove_start, remove_end) ) + remove_end = left # want endpoint of previous bin + remove_regions.append((remove_start, remove_end)) remove_start = None remove_end = None # get endpoint if remove_start is not None: remove_end = genome_length - remove_regions.append( (remove_start, remove_end) ) + remove_regions.append((remove_start, remove_end)) + + read_positions, new_genome_length = self.remove_reads_by_region( + read_positions, genome_length, remove_regions, coord_map + ) - read_positions, new_genome_length = self.remove_reads_by_region(read_positions, genome_length, remove_regions) - binned_reads = self.bin_reads(read_positions, genome_length, bin_size) return read_positions, new_genome_length - def filter_reads_phase2(self, read_positions, genome_length, bin_size): + def filter_reads_phase2(self, read_positions, genome_length, bin_size, coord_map): """A fine-grained filter that removes reads in localized regions with too-high or too-low coverage. For each 10Kb, looks 6.25% of the genome length ahead and 6.25% of the genome length behind. 
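Both filtering phases rely on compute_rolling_sum, which slides a bin_size window along the genome in steps of bin_size/100 and counts the reads inside each window. A minimal sketch of that idea, assuming the per-window counts come from searchsorted over the sorted read positions (the counting lines themselves are context this diff does not show):

import math
import numpy as np

def sketch_rolling_sum(read_positions, genome_length, bin_size):
    step = math.ceil(bin_size / 100)
    sorted_reads = np.sort(read_positions)
    endpoints = np.array([(i, i + bin_size) for i in range(0, genome_length, step)])
    # a read falls in a window when left <= position < right
    left = np.searchsorted(sorted_reads, endpoints[:, 0], side="left")
    right = np.searchsorted(sorted_reads, endpoints[:, 1], side="left")
    return right - left, endpoints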
@@ -377,11 +444,13 @@ def filter_reads_phase2(self, read_positions, genome_length, bin_size): """ # rolling counts in 10Kb bins, shifting by 1000bp - rolling_counts, endpoints = self.compute_rolling_sum(read_positions, genome_length, bin_size) + rolling_counts, endpoints = self.compute_rolling_sum( + read_positions, genome_length, bin_size + ) nbins = rolling_counts.size # how many bins to use - window = np.max((4,math.ceil(0.0625*genome_length / bin_size))) + window = np.max((4, math.ceil(0.0625 * genome_length / bin_size))) window = int(window) # find regions to remove @@ -391,14 +460,16 @@ def filter_reads_phase2(self, read_positions, genome_length, bin_size): bounds = {} - for i,c in enumerate(rolling_counts): + for i, c in enumerate(rolling_counts): # endpoints left, right = endpoints[i] # take bins in window to the left and right - left_bins = [(i - 100*j) for j in range(window)] - right_bins = [(i + 100*j) % nbins for j in range(window)] - bins = np.concatenate((rolling_counts[left_bins], rolling_counts[right_bins])) + left_bins = [(i - 100 * j) for j in range(window)] + right_bins = [(i + 100 * j) % nbins for j in range(window)] + bins = np.concatenate( + (rolling_counts[left_bins], rolling_counts[right_bins]) + ) median = np.median(bins) # computing these bounds is expensive, so @@ -413,8 +484,8 @@ def filter_reads_phase2(self, read_positions, genome_length, bin_size): remove_start = left elif (c < upper_bound and c > lower_bound) and remove_start is not None: - remove_end = left # want endpoint of previous bin - remove_regions.append( (remove_start, remove_end) ) + remove_end = left # want endpoint of previous bin + remove_regions.append((remove_start, remove_end)) remove_start = None remove_end = None @@ -423,13 +494,14 @@ def filter_reads_phase2(self, read_positions, genome_length, bin_size): remove_end = genome_length remove_regions.append((remove_start, remove_end)) - read_positions, new_genome_length = self.remove_reads_by_region(read_positions, genome_length, remove_regions) + read_positions, new_genome_length = self.remove_reads_by_region( + read_positions, genome_length, remove_regions, coord_map + ) return read_positions, new_genome_length - def bin_reads(self, read_positions, genome_length, bin_size=10000): """Aggregate reads into bin_size windows and compute the read count in each window. - + Parameters ---------- read_positions : np.array of int @@ -452,7 +524,6 @@ def bin_reads(self, read_positions, genome_length, bin_size=10000): return bin_counts - class CoPTRRefEstimate: """Data structure to store CoPTRRef estimates. @@ -477,7 +548,18 @@ class CoPTRRefEstimate: The fraction of nonzero bins in 10Kb windows. 
""" - def __init__(self, bam_file, genome_id, sample_id, estimate, ori_estimate, ter_estimate, nreads, cov_frac): + def __init__( + self, + bam_file, + genome_id, + sample_id, + estimate, + ori_estimate, + ter_estimate, + nreads, + cov_frac, + ori_estimate_coord + ): self.bam_file = bam_file self.genome_id = genome_id self.sample_id = sample_id @@ -486,18 +568,22 @@ def __init__(self, bam_file, genome_id, sample_id, estimate, ori_estimate, ter_e self.ter_estimate = ter_estimate self.nreads = nreads self.cov_frac = cov_frac - + self.ori_estimate_coord = ori_estimate_coord def __str__(self): return "CoPTRRefEstimate(bam_file={}, genome_id={}, sample_id={}, estimate={:.3f}, nreads={}, cov_frac={})".format( - self.bam_file, self.genome_id, self.sample_id, self.estimate, self.nreads, self.cov_frac - ) + self.bam_file, + self.genome_id, + self.sample_id, + self.estimate, + self.nreads, + self.cov_frac, + ) def __repr__(self): return self.__str__() - class CoPTRRef: """CoPTR estimator for complete reference genomes. @@ -513,7 +599,6 @@ def __init__(self, min_reads, min_cov): self.min_reads = min_reads self.min_cov = min_cov - def compute_log_likelihood(self, ori_loc, ter_loc, log2_ptr, read_locs): """Model log-likelihood for one sample. @@ -528,27 +613,52 @@ def compute_log_likelihood(self, ori_loc, ter_loc, log2_ptr, read_locs): read_locs : np.array of float A 1D numpy array giving read positions in [0, 1] """ - if ori_loc > 1 or ori_loc < 0 or ter_loc > 1 or ter_loc < 0 or log2_ptr > np.log2(16) or log2_ptr <= np.log2(1): + if ( + ori_loc > 1 + or ori_loc < 0 + or ter_loc > 1 + or ter_loc < 0 + or log2_ptr > np.log2(16) + or log2_ptr <= np.log2(1) + ): return -np.inf - assert read_locs.size == 1 or np.all(read_locs[:-1] <= read_locs[1:]), "reads must be sorted" + assert read_locs.size == 1 or np.all( + read_locs[:-1] <= read_locs[1:] + ), "reads must be sorted" x1 = np.min([ori_loc, ter_loc]) x2 = np.max([ori_loc, ter_loc]) - dist = np.abs(x2-x1) + dist = np.abs(x2 - x1) if dist < 0.45 or dist > 0.55: return -np.inf # compute log2_p(x_i), log2_p(x_t) so that the density normalizes alpha = log2_ptr / (ori_loc - ter_loc) if ori_loc < ter_loc: - c1 = np.log2(np.log(2)) - \ - np.log2((1/alpha)*(2**(alpha*x1) + 2**(alpha*(x2 - x1)) - 2 - 2**(-alpha*(1-x2) - log2_ptr) + 2**(-log2_ptr))) + c1 = np.log2(np.log(2)) - np.log2( + (1 / alpha) + * ( + 2 ** (alpha * x1) + + 2 ** (alpha * (x2 - x1)) + - 2 + - 2 ** (-alpha * (1 - x2) - log2_ptr) + + 2 ** (-log2_ptr) + ) + ) c2 = c1 - log2_ptr else: - c1 = np.log2(np.log(2)) - \ - np.log2((1/alpha)*(2**(alpha*x1) + 2**(alpha*(x2 - x1)) - 2 - 2**(-alpha*(1-x2) + log2_ptr) + 2**(log2_ptr))) + c1 = np.log2(np.log(2)) - np.log2( + (1 / alpha) + * ( + 2 ** (alpha * x1) + + 2 ** (alpha * (x2 - x1)) + - 2 + - 2 ** (-alpha * (1 - x2) + log2_ptr) + + 2 ** (log2_ptr) + ) + ) c2 = c1 + log2_ptr log_prob = 0 @@ -562,21 +672,23 @@ def compute_log_likelihood(self, ori_loc, ter_loc, log2_ptr, read_locs): left_part = read_locs[:left_x] middle_part = read_locs[left_x:right_x] right_part = read_locs[right_x:] - log_prob += (-alpha*(left_part - x1) + c1).sum() - log_prob += ( alpha*(middle_part - x1) + c1).sum() - log_prob += (-alpha*(right_part - x2) + c2).sum() + log_prob += (-alpha * (left_part - x1) + c1).sum() + log_prob += (alpha * (middle_part - x1) + c1).sum() + log_prob += (-alpha * (right_part - x2) + c2).sum() return log_prob - - def compute_multisample_log_likelihood(self, ori_loc, ter_loc, log2_ptrs, read_locs_list): + def compute_multisample_log_likelihood( + self, 
ori_loc, ter_loc, log2_ptrs, read_locs_list + ): log_lk = 0 for log2_ptr, read_locs in zip(log2_ptrs, read_locs_list): log_lk += self.compute_log_likelihood(ori_loc, ter_loc, log2_ptr, read_locs) return log_lk - - def estimate_ptr(self, read_positions, ref_genome_len, filter_reads=True, estimate_terminus=False): + def estimate_ptr( + self, read_positions, ref_genome_len, filter_reads=True, estimate_terminus=False + ): """Estimate the PTR for a single sample. Parameters @@ -604,7 +716,9 @@ def estimate_ptr(self, read_positions, ref_genome_len, filter_reads=True, estima """ if filter_reads: rf = ReadFilterRef(self.min_reads, self.min_cov) - read_positions, ref_genome_len, qc_result = rf.filter_reads(read_positions, ref_genome_len) + read_positions, ref_genome_len, qc_result, coord_map = rf.filter_reads( + read_positions, ref_genome_len + ) if not qc_result.passed_qc: return np.nan, np.nan, np.nan, np.nan, qc_result @@ -616,8 +730,8 @@ def estimate_ptr(self, read_positions, ref_genome_len, filter_reads=True, estima if estimate_terminus: f = lambda x: -self.compute_log_likelihood(x[0], x[1], x[2], read_locs) - log_ptrs = [0.2*(i+1) for i in range(5)] - oris = [0.2*i for i in range(5)] + log_ptrs = [0.2 * (i + 1) for i in range(5)] + oris = [0.2 * i for i in range(5)] np.random.shuffle(log_ptrs) np.random.shuffle(oris) xs = [] @@ -628,10 +742,12 @@ def estimate_ptr(self, read_positions, ref_genome_len, filter_reads=True, estima xs = np.array(xs) else: - f = lambda x: -self.compute_log_likelihood(x[0], (x[0] + 0.5) % 1, x[1], read_locs) + f = lambda x: -self.compute_log_likelihood( + x[0], (x[0] + 0.5) % 1, x[1], read_locs + ) - log_ptrs = [0.2*(i+1) for i in range(5)] - oris = [0.2*i for i in range(5)] + log_ptrs = [0.2 * (i + 1) for i in range(5)] + oris = [0.2 * i for i in range(5)] np.random.shuffle(log_ptrs) np.random.shuffle(oris) xs = [] @@ -639,7 +755,7 @@ def estimate_ptr(self, read_positions, ref_genome_len, filter_reads=True, estima for ori in oris: xs.append([ori, log_ptr]) xs = np.array(xs) - + # initial ori estimate, initial ptr estimate best_x = 0 best_f = np.inf @@ -661,17 +777,15 @@ def estimate_ptr(self, read_positions, ref_genome_len, filter_reads=True, estima ter = (ori + 0.5) % 1 log2_ptr = np.clip(best_x[1], 0, np.inf) - np.seterr(invalid="warn") if best_f == np.inf: - print_warning("CoPTRRef", "PTR optimization failed") + logger.error("PTR optimization failed.") return np.nan, np.nan, np.nan, np.nan elif not filter_reads: return log2_ptr, ori, ter, -best_f else: return log2_ptr, ori, ter, -best_f, qc_result - def update_ori_ter(self, log2_ptrs, read_locs_list): """Compute maximum likelihood ori estimates across samples given estimated log2(PTR)s across samples. @@ -692,8 +806,10 @@ def update_ori_ter(self, log2_ptrs, read_locs_list): """ np.seterr(invalid="ignore") - xs = [0.1*i for i in range(10)] - f = lambda x: -self.compute_multisample_log_likelihood(x[0], (x[0] + 0.5 ) % 1, log2_ptrs, read_locs_list) + xs = [0.1 * i for i in range(10)] + f = lambda x: -self.compute_multisample_log_likelihood( + x[0], (x[0] + 0.5) % 1, log2_ptrs, read_locs_list + ) best_x = None best_f = np.inf @@ -705,13 +821,12 @@ def update_ori_ter(self, log2_ptrs, read_locs_list): best_f = res.fun np.seterr(invalid="warn") - return best_x, (best_x+0.5)%1 - + return best_x, (best_x + 0.5) % 1 def update_ptrs(self, ori, ter, prv_log2_ptrs, read_locs_list): """Compute maximum likelihood PTR estimates across samples given an estimated replication origin and terminus. 
- + Parameters ---------- ori : float @@ -731,14 +846,13 @@ def update_ptrs(self, ori, ter, prv_log2_ptrs, read_locs_list): np.seterr(invalid="ignore") log2_ptrs = [] - for i,read_locs in enumerate(read_locs_list): + for i, read_locs in enumerate(read_locs_list): f = lambda x: -self.compute_log_likelihood(ori, ter, x, read_locs) res = scipy.optimize.minimize(f, prv_log2_ptrs[i], method="SLSQP") log2_ptrs.append(res.x[0]) np.seterr(invalid="warn") return log2_ptrs - def estimate_ptrs(self, coverage_maps): """Compute maximum likelihood PTR estimates across samples. @@ -758,26 +872,46 @@ def estimate_ptrs(self, coverage_maps): sample_ids = [] read_positions_list = [] lengths = [] + filtered_genomes_lengths = [] + coord_maps = [] genome_id = coverage_maps[0].genome_id for cm in coverage_maps: - read_positions, ref_genome_len, qc_result = rf.filter_reads(cm.read_positions, cm.length) + read_positions, ref_genome_len, qc_result, coord_map = rf.filter_reads( + cm.read_positions, cm.length + ) + coord_maps.append(coord_map) + filtered_genomes_lengths.append(ref_genome_len) if qc_result.passed_qc: read_positions_list.append(np.array(read_positions)) lengths.append(ref_genome_len) sample_ids.append(cm.sample_id) - estimates.append(CoPTRRefEstimate(cm.bam_file, cm.genome_id, cm.sample_id, np.nan, np.nan, np.nan, qc_result.nreads, qc_result.frac_nonzero)) + estimates.append( + CoPTRRefEstimate( + cm.bam_file, + cm.genome_id, + cm.sample_id, + np.nan, + np.nan, + np.nan, + qc_result.nreads, + qc_result.frac_nonzero, + np.nan + ) + ) if len(sample_ids) == 0: return estimates - print_info("CoPTRRef", "running {}".format(genome_id)) + logger.info("Running %s.", genome_id) # first, compute individiual log2_ptr, ori, ter estimates log2_ptrs = [] read_locs_list = [] - for read_positions,length in zip(read_positions_list, lengths): - log2_ptr, ori, ter, f = self.estimate_ptr(read_positions, length, filter_reads=False) + for read_positions, length in zip(read_positions_list, lengths): + log2_ptr, ori, ter, f = self.estimate_ptr( + read_positions, length, filter_reads=False + ) log2_ptrs.append(log2_ptr) read_locs_list.append(np.sort(read_positions / length)) @@ -788,24 +922,25 @@ def estimate_ptrs(self, coverage_maps): log2_ptrs = self.update_ptrs(ori, ter, log2_ptrs, read_locs_list) n = 0 - for i,cm in enumerate(coverage_maps): + for i, cm in enumerate(coverage_maps): if cm.sample_id in sample_ids: # sanity check that the ptr estimate is good if np.abs(log2_ptrs[n] - 4) < 1e-4: estimates[i].ori_estimate = ori estimates[i].ter_estimate = ter estimates[i].estimate = np.nan + estimates[i].ori_estimate_coord = coord_maps[i].translate(ori*filtered_genomes_lengths[i]) else: estimates[i].ori_estimate = ori estimates[i].ter_estimate = ter estimates[i].estimate = log2_ptrs[n] + estimates[i].ori_estimate_coord = coord_maps[i].translate(ori*filtered_genomes_lengths[i]) n += 1 - print_info("CoPTRRef", "finished {}".format(genome_id)) + logger.info("Finished %s.", genome_id) return estimates - def _parallel_helper(self, x): ref_genome = x[0] coverage_maps = x[1] @@ -813,8 +948,14 @@ def _parallel_helper(self, x): return (ref_genome, self.estimate_ptrs(coverage_maps)) -def plot_fit(coptr_ref_est, read_positions, genome_length, min_reads, min_cov, plot_folder): - import matplotlib.pyplot as plt +def plot_fit( + coptr_ref_est, read_positions, genome_length, min_reads, min_cov, plot_folder +): + try: + import matplotlib.pyplot as plt + except ModuleNotFoundError: + logger.critical("Please install matplotlib to enable 
plotting.") + raise coptr = CoPTRRef(min_reads, min_cov) rf = ReadFilterRef(min_reads, min_cov) @@ -822,7 +963,7 @@ def plot_fit(coptr_ref_est, read_positions, genome_length, min_reads, min_cov, p rp = np.array(read_positions) length = genome_length - fig,ax = plt.subplots(nrows=2,ncols=1, figsize=(9,6)) + fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(9, 6)) # first plot unfiltered bins bin_size = rf.compute_bin_size(genome_length) @@ -832,9 +973,9 @@ def plot_fit(coptr_ref_est, read_positions, genome_length, min_reads, min_cov, p lower_bound, upper_bound = rf.compute_genomewide_bounds(binned_counts) min_y = np.min(binned_counts[binned_counts != 0]) - min_y -= 0.5*min_y - max_y = np.max(binned_counts[binned_counts != 0]) + 0.008*binned_counts.sum() - + min_y -= 0.5 * min_y + max_y = np.max(binned_counts[binned_counts != 0]) + 0.008 * binned_counts.sum() + # plot bins ax[0].scatter(x, binned_counts) ax[0].set_yscale("log", base=2) @@ -842,9 +983,13 @@ def plot_fit(coptr_ref_est, read_positions, genome_length, min_reads, min_cov, p y_lim = ax[0].get_ylim() # plot upper and lower bounds - ax[0].plot(x, median*np.ones(binned_counts.size), c="red", linewidth=3) - ax[0].plot(x, lower_bound*np.ones(binned_counts.size), c="red", ls=":", linewidth=3) - ax[0].plot(x, upper_bound*np.ones(binned_counts.size), c="red", ls=":", linewidth=3) + ax[0].plot(x, median * np.ones(binned_counts.size), c="red", linewidth=3) + ax[0].plot( + x, lower_bound * np.ones(binned_counts.size), c="red", ls=":", linewidth=3 + ) + ax[0].plot( + x, upper_bound * np.ones(binned_counts.size), c="red", ls=":", linewidth=3 + ) ax[0].set_ylabel("Read Count") ax[0].set_xlabel("Position Along Genome") @@ -858,25 +1003,40 @@ def plot_fit(coptr_ref_est, read_positions, genome_length, min_reads, min_cov, p ax[1].scatter(x, probs, c="C1") ax[1].set_yscale("log", base=2) min_y = np.min(probs[probs != 0]) - min_y -= 0.5*min_y + min_y -= 0.5 * min_y max_y = np.max(probs) + 0.008 ax[1].set_ylim((min_y, max_y)) # plot fit - y = [ coptr.compute_log_likelihood(coptr_ref_est.ori_estimate, coptr_ref_est.ter_estimate, coptr_ref_est.estimate, xi) - for xi in x ] - y = np.power(2, y)*(x[1] - x[0]) + y = [ + coptr.compute_log_likelihood( + coptr_ref_est.ori_estimate, + coptr_ref_est.ter_estimate, + coptr_ref_est.estimate, + xi, + ) + for xi in x + ] + y = np.power(2, y) * (x[1] - x[0]) ax[1].plot(x, y, c="black", linewidth=3) ax[1].set_ylabel("Density") ax[1].set_xlabel("Position Along Genome") - ax[1].set_title("\nlog2(PTR)={:.3f} (Reads = {})".format(coptr_ref_est.estimate, int(binned_counts.sum()))) + ax[1].set_title( + "\nlog2(PTR)={:.3f} (Reads = {})".format( + coptr_ref_est.estimate, int(binned_counts.sum()) + ) + ) plt.tight_layout() - plt.savefig(os.path.join(plot_folder, coptr_ref_est.sample_id + "-" + coptr_ref_est.genome_id + ".pdf")) + plt.savefig( + os.path.join( + plot_folder, + coptr_ref_est.sample_id + "-" + coptr_ref_est.genome_id + ".pdf", + ) + ) plt.close() - def load_coverage_maps(file_handle, ref_id): coverage_maps = [] try: @@ -888,8 +1048,14 @@ def load_coverage_maps(file_handle, ref_id): return coverage_maps - -def estimate_ptrs_coptr_ref(ref_genome_ids, grouped_coverage_map_folder, min_reads, min_cov, threads, plot_folder=None, mem_limit=None): +def estimate_ptrs_coptr_ref( + ref_genome_ids, + grouped_coverage_map_folder, + min_reads, + min_cov, + plot_folder=None, + mem_limit=None, +): """Estimate Peak-to-Trough ratios across samples. 
Parameters @@ -917,42 +1083,28 @@ def estimate_ptrs_coptr_ref(ref_genome_ids, grouped_coverage_map_folder, min_rea A dictionary with key reference genome id and value a list of CoPTRRefEstimate for that reference genome. """ - print_info("CoPTRRef", "checking reference genomes") + logger.info("Checking reference genomes.") coptr_ref_estimates = {} coptr_ref = CoPTRRef(min_reads, min_cov) - if threads > 1: - # workers for multiprocessing - pool = mp.Pool(threads) - coverage_maps = {} - - for i,ref_id in enumerate(sorted(ref_genome_ids)): - with open(os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb") as f: - coverage_maps[ref_id] = load_coverage_maps(f, ref_id) - - if (i % threads == 0 and i > 0) or i == len(ref_genome_ids) - 1: - flat_coverage_maps = [(ref_id, coverage_maps[ref_id]) for ref_id in coverage_maps] - flat_results = pool.map(coptr_ref._parallel_helper, flat_coverage_maps) - coptr_ref_estimates.update(dict(flat_results)) - - for ref_id in sorted(coverage_maps): - for sample,est in zip(coverage_maps[ref_id], coptr_ref_estimates[ref_id]): - # only plot samples with a PTR estimate - if plot_folder is not None and not np.isnan(est.estimate): - plot_fit(est, sample.read_positions, sample.length, min_reads, min_cov, plot_folder) - - coverage_maps = {} - - else: - for ref_id in sorted(ref_genome_ids): - with open(os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb") as f: - coverage_maps = load_coverage_maps(f, ref_id) - - coptr_ref_estimates[ref_id] = coptr_ref.estimate_ptrs(coverage_maps) - - for sample,est in zip(coverage_maps, coptr_ref_estimates[ref_id]): - # only plot samples with a PTR estimate - if plot_folder is not None and not np.isnan(est.estimate): - plot_fit(est, sample.read_positions, sample.length, min_reads, min_cov, plot_folder) - - return coptr_ref_estimates \ No newline at end of file + for ref_id in sorted(ref_genome_ids): + with open( + os.path.join(grouped_coverage_map_folder, ref_id + ".cm.pkl"), "rb" + ) as f: + coverage_maps = load_coverage_maps(f, ref_id) + + coptr_ref_estimates[ref_id] = coptr_ref.estimate_ptrs(coverage_maps) + + for sample, est in zip(coverage_maps, coptr_ref_estimates[ref_id]): + # only plot samples with a PTR estimate + if plot_folder is not None and not np.isnan(est.estimate): + plot_fit( + est, + sample.read_positions, + sample.length, + min_reads, + min_cov, + plot_folder, + ) + + return coptr_ref_estimates diff --git a/src/poisson_pca.py b/src/coptr/poisson_pca.py similarity index 76% rename from src/poisson_pca.py rename to src/coptr/poisson_pca.py index f45b1f5..c5add7a 100644 --- a/src/poisson_pca.py +++ b/src/coptr/poisson_pca.py @@ -21,15 +21,19 @@ along with CoPTR. If not, see . """ +import logging + import numpy as np import scipy.optimize -import sys + + +logger = logging.getLogger(__name__) + class PoissonPCA: - """Principal Component Analysis with Poisson observations. - """ + """Principal Component Analysis with Poisson observations.""" - def pca(self, X, k, tol=1E-3, verbose=False): + def pca(self, X, k, tol=1e-3): """Run the PCA. 
For a data matrix X (sample by observations), finds two matrices W and V such that: @@ -46,34 +50,31 @@ def pca(self, X, k, tol=1E-3, verbose=False): """ assert k <= X.shape[1], "k must be smaller than the number of columns of X" assert k > 0 and k == int(k), "k must be a positive integer" - assert np.all(X[np.isfinite(X)] >= 0), "X must have nonnegative entries or be missing" + assert np.all( + X[np.isfinite(X)] >= 0 + ), "X must have nonnegative entries or be missing" X = X.astype(float) # use PCA to initialize matrices W,V # initializing missing values to mean Xt = np.copy(X) Xt[Xt == 0] = 0.1 - for i,row in enumerate(Xt): + for i, row in enumerate(Xt): if np.isnan(row).sum() > 0: - Xt[i,np.isnan(row)] = np.mean(row[np.isfinite(row)]) - u,s,vh = np.linalg.svd(np.log2(Xt)) - u = u[:,:k] + Xt[i, np.isnan(row)] = np.mean(row[np.isfinite(row)]) + u, s, vh = np.linalg.svd(np.log2(Xt)) + u = u[:, :k] s = s[:k] - vh = vh[:k,:] - W = u*s + vh = vh[:k, :] + W = u * s V = vh - # W = np.random.normal(loc=0, scale=0.1, size=(X.shape[0], k)) - # V = np.random.normal(loc=0, scale=0.1, size=(k, X.shape[1])) - # W = np.zeros((X.shape[0], k)) - # V = np.zeros((k, X.shape[1])) it = 0 prv_log_lk = -np.inf log_lk = self.log_likelihood(X, W, V) while np.abs(prv_log_lk - log_lk) > tol: - if verbose: - print("iteration:", it, "log likelihood:", log_lk, file=sys.stderr) + logger.debug("Iteration: %d; Log likelihood: %.3G", it, log_lk) prv_log_lk = log_lk W = self.update_W(X, W, V) @@ -82,9 +83,10 @@ def pca(self, X, k, tol=1E-3, verbose=False): it += 1 - assert prv_log_lk <= log_lk or np.abs(prv_log_lk - log_lk) < 1E-6, "log likelihood must be increasing ({} > {})".format(prv_log_lk, log_lk) - return W,V - + assert ( + prv_log_lk <= log_lk or np.abs(prv_log_lk - log_lk) < 1e-6 + ), "log likelihood must be increasing ({} > {})".format(prv_log_lk, log_lk) + return W, V def update_W(self, X, W, V): """Optimize the log-likelihood with respect to W. @@ -98,15 +100,20 @@ def update_W(self, X, W, V): V : np.array The current estimate of V """ - loss_fn = lambda Wx : -self.log_likelihood(X, Wx.reshape(X.shape[0], V.shape[0]), V) - grad_fn = lambda Wx : -self.grad_W(X, Wx.reshape(X.shape[0], V.shape[0]), V).flatten() + loss_fn = lambda Wx: -self.log_likelihood( + X, Wx.reshape(X.shape[0], V.shape[0]), V + ) + grad_fn = lambda Wx: -self.grad_W( + X, Wx.reshape(X.shape[0], V.shape[0]), V + ).flatten() # these are handled by the minimizer np.seterr(over="ignore") - res = scipy.optimize.minimize(loss_fn, np.copy(W).flatten(), jac=grad_fn, method="CG") + res = scipy.optimize.minimize( + loss_fn, np.copy(W).flatten(), jac=grad_fn, method="CG" + ) np.seterr(over="warn") return res.x.reshape(X.shape[0], V.shape[0]) - def update_V(self, X, W, V): """Optimize the log-likelihood with respect to V. @@ -121,7 +128,6 @@ def update_V(self, X, W, V): """ return self.update_W(X.T, V.T, W.T).T - def log_likelihood(self, X, W, V): """Compute the log likelihood (minus a constant). @@ -135,8 +141,7 @@ def log_likelihood(self, X, W, V): A k x m matrix """ param = np.exp(W.dot(V)) - return (X*W.dot(V) - param)[np.isfinite(X)].sum() - + return (X * W.dot(V) - param)[np.isfinite(X)].sum() def grad_W(self, X, W, V): """Compute the gradient of the log likelihood wrt W. 
@@ -164,11 +169,11 @@ def grad_W(self, X, W, V): # generate 100 random test sets for i in range(100): - print("running test set", i+1, "of", 100) + print("running test set", i + 1, "of", 100) n = np.random.randint(10, 100) m = np.random.randint(5, n) k = np.random.randint(1, np.min((m, 10))) - print("\tn {} m {} k {}".format(n,m,k)) + print("\tn {} m {} k {}".format(n, m, k)) # generate test data W = np.random.normal(loc=0, scale=1, size=(n, k)) @@ -176,13 +181,12 @@ def grad_W(self, X, W, V): X = np.random.poisson(np.exp(W.dot(V))).astype(float) - coords = [(i,j) for i in range(n) for j in range(m)] - nmissing = int(0.25*X.size) + coords = [(i, j) for i in range(n) for j in range(m)] + nmissing = int(0.25 * X.size) for l in range(nmissing): coord = coords[np.random.randint(len(coords))] X[coord] = np.nan - X_finite = X[np.isfinite(X)] print("\tmin {}".format(X_finite.min())) print("\tmax {}".format(X_finite.max())) diff --git a/src/read_assigner.py b/src/coptr/read_assigner.py similarity index 70% rename from src/read_assigner.py rename to src/coptr/read_assigner.py index 19f71d6..dd2e41a 100644 --- a/src/read_assigner.py +++ b/src/coptr/read_assigner.py @@ -22,15 +22,17 @@ along with CoPTR. If not, see . """ +import logging + import numpy as np +import scipy.misc import scipy.special import scipy.stats -import scipy.misc - +from scipy.sparse import csr_matrix from scipy.special import digamma -from scipy.special import loggamma -from scipy.sparse import csr_matrix + +logger = logging.getLogger(__name__) class ReadAssigner(object): @@ -47,7 +49,9 @@ def __init__(self, X, prior_counts): self.X = csr_matrix(X) self.nreads = X.shape[0] self.ngenomes = X.shape[1] - assert np.all(self.X.sum(axis=1) > 0), "each read must have at least one valid mapping" + assert np.all( + self.X.sum(axis=1) > 0 + ), "each read must have at least one valid mapping" # set alpha based on unambiguously assigned reads self.alpha = np.ones(self.ngenomes) + prior_counts @@ -56,14 +60,12 @@ def __init__(self, X, prior_counts): self.phi = self.X.multiply(1 / self.X.sum(axis=1)) self.eta = np.copy(self.alpha) - def compute_elbo(self): - """Compute the variational objective function. - """ + """Compute the variational objective function.""" elbo = 0 # E_q[log p(pi)] - elbo += ((self.alpha - 1)*(digamma(self.eta) - digamma(self.eta.sum()))).sum() + elbo += ((self.alpha - 1) * (digamma(self.eta) - digamma(self.eta.sum()))).sum() # E_q[log p(X | Z)] elbo += (self.phi.multiply(self.X)).sum() @@ -75,49 +77,44 @@ def compute_elbo(self): elbo += scipy.stats.dirichlet(self.eta).entropy() # entropy of categorical - tmp = (self.phi.data*np.log(self.phi.data)) + tmp = self.phi.data * np.log(self.phi.data) elbo += -tmp.sum() return elbo - def update_phi(self): - """Optimize phi with fixed eta. - """ + """Optimize phi with fixed eta.""" log_phi = self.X.multiply(digamma(self.eta)) - phi = csr_matrix((np.exp(log_phi.data), (log_phi.row, log_phi.col)), shape=log_phi.shape) - self.phi = phi.multiply(1/phi.sum(axis=1)) - + phi = csr_matrix( + (np.exp(log_phi.data), (log_phi.row, log_phi.col)), shape=log_phi.shape + ) + self.phi = phi.multiply(1 / phi.sum(axis=1)) def update_eta(self): - """Optimize eta with fixed phi. - """ + """Optimize eta with fixed phi.""" self.eta = np.array(self.alpha + self.phi.sum(axis=0))[0] - - def run_vi(self, tol=1E-3, verbose=False): - """Optimize variational parameters with respect to elbo. 
- """ + def run_vi(self, tol=1e-3): + """Optimize variational parameters with respect to elbo.""" prv_elbo = -np.inf elbo = self.compute_elbo() it = 0 while np.abs(prv_elbo - elbo) > tol: - if verbose: - print("it:", it, "elbo:", elbo) + logger.debug("Iteration: %d; elbo: %.3G", it, elbo) self.update_eta() self.update_phi() prv_elbo = elbo elbo = self.compute_elbo() it += 1 - assert elbo >= prv_elbo - 1E-3, "elbo must be strictly increasing ({} > {})".format(elbo, prv_elbo) + assert ( + elbo >= prv_elbo - 1e-3 + ), "elbo must be strictly increasing ({} > {})".format(elbo, prv_elbo) - if verbose: - print("it:", it, "elbo:", elbo) + logger.debug("Iteration: %d; elbo: %.3G", it, elbo) - - def assign_reads(self, verbose=False): + def assign_reads(self): """Compute read assignments. Return @@ -126,7 +123,7 @@ def assign_reads(self, verbose=False): A 1D array. Each entry gives a genome id that is the genome assignment for each read. """ - self.run_vi(verbose=verbose) + self.run_vi() self.phi = csr_matrix(self.phi) assignments = [] for i in range(self.nreads): @@ -140,7 +137,7 @@ def assign_reads(self, verbose=False): # test a large number of cases for i in range(100): - print("running test set", i+1) + print("running test set", i + 1) nreads = np.random.randint(10000, 100000) ngenomes = np.random.randint(10, 50) Z = np.zeros((nreads, ngenomes)) @@ -150,25 +147,30 @@ def assign_reads(self, verbose=False): for n in range(nreads): true_assignment = np.random.randint(5) - Z[n,true_assignment] = 1 + Z[n, true_assignment] = 1 X = np.copy(Z) - multimapped_reads = int(0.1*nreads) + multimapped_reads = int(0.1 * nreads) for n in range(multimapped_reads): true_assignment = np.argwhere(Z[n] == 1).flatten()[0] - additional_mappings = np.max((1,np.random.poisson(1))) + additional_mappings = np.max((1, np.random.poisson(1))) for i in range(additional_mappings): mapping_id = np.random.randint(ngenomes) - X[n,mapping_id] = 1 - X = X[:multimapped_reads,:] + X[n, mapping_id] = 1 + X = X[:multimapped_reads, :] - prior = Z[multimapped_reads:,].sum(axis=0) + prior = Z[ + multimapped_reads:, + ].sum(axis=0) read_assigner = ReadAssigner(X[:multimapped_reads], prior) assignments = read_assigner.assign_reads() correct_assignments = 0 - for i,assignment in enumerate(assignments): - if Z[i,assignment] == 1: + for i, assignment in enumerate(assignments): + if Z[i, assignment] == 1: correct_assignments += 1 - print("\t{} of reads correctly mapped".format(correct_assignments / len(assignments))) + print( + "\t{} of reads correctly mapped".format( + correct_assignments / len(assignments) + ) + ) print("done!") - diff --git a/src/read_mapper.py b/src/coptr/read_mapper.py similarity index 53% rename from src/read_mapper.py rename to src/coptr/read_mapper.py index 3ea51fb..b4e2d78 100644 --- a/src/read_mapper.py +++ b/src/coptr/read_mapper.py @@ -22,21 +22,23 @@ along with CoPTR. If not, see . """ -import numpy as np +import logging import os import os.path -import pysam import subprocess as sub -import sys -import time +from datetime import datetime, timezone +from pathlib import Path + +import pysam + +from .util import get_fastq_name -from src.print import print_error, print_info -from src.util import get_fastq_name + +logger = logging.getLogger(__name__) class ReadMapper: - """Wrapper around bowtie2. - """ + """Wrapper around bowtie2.""" def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed): """Build a bowtie2 index from ref_fasta. 
@@ -57,57 +59,68 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed bt2_packed : str Parameter to pass to bowtie2-build --packed argument. """ + files_found = 0 + total_size = 0 + ref_files = [] if os.path.isfile(ref_fasta): - ref_files = [ref_fasta] - files_found = 1 - total_size = os.stat(ref_fasta).st_size + ref_files.append(ref_fasta) + files_found += 1 + total_size += os.stat(ref_fasta).st_size elif os.path.isdir(ref_fasta): valid_ext = [".fasta", ".fa", ".fna"] # for multiple fasta files, bowtie2 takes a comma # separated list - ref_files = [] - files_found = 0 - total_size = 0 - for f in os.listdir(ref_fasta): - fname,ext = os.path.splitext(f) - fpath = os.path.join(ref_fasta,f) + for in_handle in os.listdir(ref_fasta): + fname, ext = os.path.splitext(in_handle) + fpath = os.path.join(ref_fasta, in_handle) if os.path.isfile(fpath) and ext in valid_ext: ref_files.append(fpath) total_size += os.stat(fpath).st_size - files_found += 1 else: - print_error("ReadMapper", "index must either be file or folder.") - + logger.error("The index must be either a file or directory.") - print_info("ReadMapper", "found {} files totaling {:.3f} Gb".format(len(ref_files), total_size / (1024**3))) + logger.info( + "Found %d files totaling %.3g GB.", len(ref_files), total_size / (1024 ** 3) + ) - fna_out = open("coptr-fna-{}.fna".format(time.time()), "w") - genomes_out = open("{}.genomes".format(index_out), "w") - n_genomes = 0 + sequence_collection = Path( + f"coptr-fna-{datetime.now(timezone.utc).isoformat(timespec='seconds')}.fna" + ) + genomes = Path(f"{index_out}.genomes") - print_info("ReadMapper", "copying to fasta files to {} with prepended genome ids (filenames)".format(fna_out.name)) + logger.info( + "Copying FASTA files to %s with prepended genome ids (filenames).", + str(sequence_collection), + ) # assume 1 genome per fasta # let's set the filename as the genome identifier - for fpath in ref_files: - fname = os.path.basename(os.path.splitext(fpath)[0]) - genomes_out.write(os.path.basename(fname) + "\n") - n_genomes += 1 - - with open(fpath, "r") as f: - for line in f: - if ">" == line[0]: - # prepend the filename as the identifier for the genome - line = line[0] + fname + "|" + line[1:] - fna_out.write(line) - - fna_out.close() - genomes_out.close() - - print_info("ReadMapper", "writing {} reference genome ids to {}".format(n_genomes, genomes_out.name)) - call = ["bowtie2-build", fna_out.name, index_out, "--threads", bt2_threads] + n_genomes = 0 + with sequence_collection.open("w") as out_handle, genomes.open( + "w" + ) as genomes_handle: + for fpath in ref_files: + fname = os.path.basename(os.path.splitext(fpath)[0]) + genomes_handle.write(os.path.basename(fname) + "\n") + n_genomes += 1 + + with open(fpath, "r") as in_handle: + for line in in_handle: + if line.startswith(">"): + # prepend the filename as the identifier for the genome + line = f">{fname}|{line[1:]}" + out_handle.write(line) + + logger.info("Writing %d reference genome ids to %s.", n_genomes, str(genomes)) + call = [ + "bowtie2-build", + str(sequence_collection), + index_out, + "--threads", + bt2_threads, + ] if bt2_bmax is not None: call += ["--bmax", bt2_bmax] if bt2_dcv is not None: @@ -116,15 +129,17 @@ def index(self, ref_fasta, index_out, bt2_bmax, bt2_dcv, bt2_threads, bt2_packed call += ["--packed"] try: - print_info("ReadMapper", " ".join(call)) + logger.info("%s", " ".join(call)) sub.check_call(call) - print_info("ReadMapper", "indexed {} fasta files for reference 
database.".format(files_found)) - except Exception as e: - print(e, file=sys.stderr) + logger.info( + "Indexed %d FASTA files for the reference database.", len(ref_files) + ) + except Exception as error: + logger.error("An error occurred while indexing with bowtie2.") + logger.info("Detailed information:", exc_info=error) finally: - print_info("ReadMapper", "cleaning up {}".format(fna_out.name)) - sub.check_call(["rm", fna_out.name]) - + logger.info("Cleaning up %s.", str(sequence_collection)) + sequence_collection.unlink() def map(self, index, inputf, outfolder, paired, threads, bt2_k): """Map reads from infile against reference database using bowtie2, then @@ -147,75 +162,100 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): """ bt2_k = str(bt2_k) - if not os.path.isdir(outfolder): - print_error("ReadMapper", "output folder does not exist.") + outfolder = Path(outfolder) + if not outfolder.is_dir(): + logger.error("The output directory %s does not exist.", str(outfolder)) if os.path.isfile(inputf): bn = os.path.basename(inputf) - bn,ext = os.path.splitext(bn) - out_sam = os.path.join(outfolder, bn + ".sam") - out_bam = os.path.join(outfolder, bn + ".bam") + bn, ext = os.path.splitext(bn) + out_sam = outfolder / f"{bn}.sam" + out_bam = outfolder / f"{bn}.bam" # first map to a sam file with bowtie2 - print_info("ReadMapper", "mapping {} to {}".format(inputf, out_sam)) - call = ["bowtie2", "-x", index, inputf, "--no-unal", "-p", str(threads), "-k", bt2_k] - print_info("ReadMapper", " ".join(call)) - with open(out_sam, "w") as out: + logger.info("Mapping {} to {}".format(inputf, str(out_sam))) + call = [ + "bowtie2", + "-x", + index, + inputf, + "--no-unal", + "-p", + str(threads), + "-k", + bt2_k, + ] + logger.info(" ".join(call)) + with out_sam.open("w") as out: sub.check_call(call, stdout=out) # then convert to a bam file - print_info("ReadMapper", "converting {} to {}".format(out_sam, out_bam)) + logger.info("Converting {} to {}.".format(str(out_sam), str(out_bam))) infile = pysam.AlignmentFile(out_sam, "r") outfile = pysam.AlignmentFile(out_bam, "wb", template=infile) - for s in infile: - outfile.write(s) - infile.close() - outfile.close() + try: + for s in infile: + outfile.write(s) + finally: + infile.close() + outfile.close() # now remove sam file - print_info("ReadMapper", "cleaning up {}".format(out_sam)) - call = ["rm", out_sam] - sub.check_call(call) + logger.info("Cleaning up %s.", str(out_sam)) + out_sam.unlink() # single end sequencing elif os.path.isdir(inputf) and not paired: valid_ext = [".fastq", ".fq", ".gz"] files_found = 0 for f in sorted(os.listdir(inputf)): - fname,ext1 = os.path.splitext(f) + fname, ext1 = os.path.splitext(f) if ext1 == ".gz": ext2 = fname.split(".")[1] else: ext2 = "" - fpath = os.path.join(inputf,f) + fpath = os.path.join(inputf, f) if ext1 in valid_ext or ext2 in valid_ext: bn = os.path.basename(f) - bn,ext = os.path.splitext(bn) - out_sam = os.path.join(outfolder, get_fastq_name(bn) + ".sam") - out_bam = os.path.join(outfolder, get_fastq_name(bn) + ".bam") + bn, ext = os.path.splitext(bn) + out_sam = outfolder / f"{get_fastq_name(bn)}.sam" + out_bam = outfolder / f"{get_fastq_name(bn)}.bam" # map reads with bowtie2 - print_info("ReadMapper", "mapping {} to {}".format(fpath, out_sam)) - call = ["bowtie2", "-x", index, fpath, "--no-unal", "-p", str(threads), "-k", bt2_k] - print_info("ReadMapper", " ".join(call)) - with open(out_sam, "w") as out: + logger.info("Mapping {} to {}".format(inputf, str(out_sam))) + call = [ + 
"bowtie2", + "-x", + index, + fpath, + "--no-unal", + "-p", + str(threads), + "-k", + bt2_k, + ] + logger.info(" ".join(call)) + with out_sam.open("w") as out: sub.check_call(call, stdout=out) # then convert to a bam file - print_info("ReadMapper", "converting {} to {}".format(out_sam, out_bam)) + logger.info( + "Converting {} to {}.".format(str(out_sam), str(out_bam)) + ) infile = pysam.AlignmentFile(out_sam, "r") outfile = pysam.AlignmentFile(out_bam, "wb", template=infile) - for s in infile: - outfile.write(s) - infile.close() - outfile.close() + try: + for s in infile: + outfile.write(s) + finally: + infile.close() + outfile.close() # now remove sam file - print_info("ReadMapper", "cleaning up {}".format(out_sam)) - call = ["rm", out_sam] - sub.check_call(call) - + logger.info("Cleaning up %s.", str(out_sam)) + out_sam.unlink() + # paired end sequencing elif os.path.isdir(inputf) and paired: valid_ext = [".fastq", "fq", ".gz"] @@ -223,7 +263,7 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): # file prefix -> [pair_1, pair_2] read_pairs = {} for f in sorted(os.listdir(inputf)): - fname,ext1 = os.path.splitext(f) + fname, ext1 = os.path.splitext(f) if ext1 == ".gz": ext2 = fname.split(".")[1] else: @@ -248,29 +288,45 @@ def map(self, index, inputf, outfolder, paired, threads, bt2_k): f2 = os.path.join(inputf, read_pairs[pair_name][1]) bn = os.path.basename(f1) bn = bn.split("_")[0] - out_sam = os.path.join(outfolder, get_fastq_name(bn) + ".sam") - out_bam = os.path.join(outfolder, get_fastq_name(bn) + ".bam") + out_sam = outfolder / f"{get_fastq_name(bn)}.sam" + out_bam = outfolder / f"{get_fastq_name(bn)}.bam" # map reads with bowtie2 - print_info("ReadMapper", "mapping {},{} to {}".format(f1, f2, out_sam)) - call = ["bowtie2", "-x", index, "-1", f1, "-2", f2, "--no-unal", "-p", str(threads), "-k", bt2_k] - print_info("ReadMapper", " ".join(call)) - with open(out_sam, "w") as out: + logger.info("Mapping {} to {}".format(inputf, str(out_sam))) + call = [ + "bowtie2", + "-x", + index, + "-1", + f1, + "-2", + f2, + "--no-unal", + "-p", + str(threads), + "-k", + bt2_k, + ] + logger.info(" ".join(call)) + with out_sam.open("w") as out: sub.check_call(call, stdout=out) # then convert to a bam file - print_info("ReadMapper", "converting {} to {}".format(out_sam, out_bam)) + logger.info( + "Converting {} to {}.".format(str(out_sam), str(out_bam)) + ) infile = pysam.AlignmentFile(out_sam, "r") outfile = pysam.AlignmentFile(out_bam, "wb", template=infile) - for s in infile: - outfile.write(s) - infile.close() - outfile.close() + try: + for s in infile: + outfile.write(s) + finally: + infile.close() + outfile.close() # now remove sam file - print_info("ReadMapper", "cleaning up {}".format(out_sam)) - call = ["rm", out_sam] - sub.check_call(call) + logger.info("Cleaning up %s.", str(out_sam)) + out_sam.unlink() else: - print_error("ReadMapper", "input must either be a file or folder.") + logger.error("Input must be either a file or a folder.") diff --git a/src/util.py b/src/coptr/util.py similarity index 92% rename from src/util.py rename to src/coptr/util.py index a3ec115..9a2a5e9 100644 --- a/src/util.py +++ b/src/coptr/util.py @@ -23,6 +23,7 @@ import os.path + def get_fastq_name(fpath): """Remove the path and extension from a fastq file. 
@@ -35,7 +36,7 @@ def get_fastq_name(fpath): fname : str the name of the file with its path and extension removeed """ - bn,ex = os.path.splitext(fpath) + bn, ex = os.path.splitext(fpath) if ex == ".gz": - bn,ex = os.path.splitext(bn) - return bn \ No newline at end of file + bn, ex = os.path.splitext(bn) + return bn diff --git a/src/print.py b/src/print.py deleted file mode 100644 index dbc1975..0000000 --- a/src/print.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -print.py -====================== -Print error, warning, and info messages. -""" - -""" -This file is part of CoPTR. - -CoPTR is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -CoPTR is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with CoPTR. If not, see . -""" - -import sys -import time - -def print_error(module, msg, quit=True): - """Print an error message to the standard error. - - Parameters - ---------- - module : str - Which module generated the error message - msg : str - The error message itself - exit : bool - If TRUE, the program terminates after printing the error - """ - print("[ERROR] ({}) {}: {}".format(time.strftime("%b %d, %Y %X"), module, msg), file=sys.stderr, flush=True) - if quit: - exit(1) - - -def print_warning(module, msg): - """Print a warning message to the standard error. - - Parameters - ---------- - module : str - Which module generated the message - msg : str - The message itself - """ - - print("[WARNING] ({}) {}: {}".format(time.strftime("%b %d, %Y %X"), module, msg), file=sys.stderr, flush=True) - - -def print_info(module, msg): - """Print an info message to the standard error. 
- - Parameters - ---------- - module : str - Which module generated the message - msg : str - The message itself - """ - - print("[INFO] ({}) {}: {}".format(time.strftime("%b %d, %Y %X"), module, msg), file=sys.stderr, flush=True) \ No newline at end of file diff --git a/test/test_bam_processor.py b/test/test_bam_processor.py index c4c0dc7..cf71312 100644 --- a/test/test_bam_processor.py +++ b/test/test_bam_processor.py @@ -1,46 +1,46 @@ import os -import pysam import unittest -from src.bam_processor import BamProcessor +import pysam + +from coptr.bam_processor import BamProcessor + def read_to_dict(read): read = read.strip("\n").split() read_dict = { - "name" : read[0], - "flag" : read[1], - "ref_name" : read[2], - "ref_pos" : read[3], - "map_quality" : read[4], - "cigar" : read[5], - "next_ref_name" : read[6], - "next_ref_pos" : read[7], - "length" : read[8], - "seq" : read[9], - "qual" : read[10], - "tags" : read[11:] + "name": read[0], + "flag": read[1], + "ref_name": read[2], + "ref_pos": read[3], + "map_quality": read[4], + "cigar": read[5], + "next_ref_name": read[6], + "next_ref_pos": read[7], + "length": read[8], + "seq": read[9], + "qual": read[10], + "tags": read[11:], } return read_dict class TestBamProcessor(unittest.TestCase): - def setUp(self): - header = { - "HD" : {"VN": "1.0", "SO": "unsorted"}, - "SQ" : - [ - {"SN": "ref1|seq1", "LN" : 1000000}, - {"SN": "ref1|seq2", "LN" : 1000000}, - {"SN": "ref2|seq1", "LN" : 1000000} - ] + header = { + "HD": {"VN": "1.0", "SO": "unsorted"}, + "SQ": [ + {"SN": "ref1|seq1", "LN": 1000000}, + {"SN": "ref1|seq2", "LN": 1000000}, + {"SN": "ref2|seq1", "LN": 1000000}, + ], } bam1_reads = [ "read1 0 ref1|seq1 24975 42 80M * 0 0 TGGGCCAGAAAAAATGACTTCTCCATCTCGCTGCCGGTAGACCGACTCTCTTTTCTGCTGGCGGTTGCCACGCTGAGCGG AAAAAF.A.FFAFFFFFAFFFFFFFFFFFFFF 0.9) - - class Simulator: - """Simulate sequencing reads using the density of reads along an E. coli genome. - """ + """Simulate sequencing reads using the density of reads along an E. coli genome.""" def __init__(self): f = open("test/e-coli-NZ_CP011495.1.pkl", "rb") read_density_map, ptr = pkl.load(f) f.close() - self.bin_edges = read_density_map[:,0] - self.probs = read_density_map[:,1] # left bin edges + self.bin_edges = read_density_map[:, 0] + self.probs = read_density_map[:, 1] # left bin edges self.min_log2_ptr = 0.05 self.max_log2_ptr = 1.25 - def get_genome_length(self): - """Return the number of bases in the genome. - """ - return 100*self.probs.size - + """Return the number of bases in the genome.""" + return 100 * self.probs.size - def simulate_reads(self, nreads, ptr = None, ori_pos = None): + def simulate_reads(self, nreads, ptr=None, ori_pos=None): """Simulate nreads along 100bp bins with specified ptr and origin position. If ptr or ori_pos are None, they are chosen randomly. 
@@ -98,12 +106,14 @@ def simulate_reads(self, nreads, ptr = None, ori_pos = None): The simulated replication terminus position as a fraction along the genome """ if ptr is None: - ptr = np.random.random()*(self.max_log2_ptr - self.min_log2_ptr) + self.min_log2_ptr - ptr = 2**ptr + ptr = ( + np.random.random() * (self.max_log2_ptr - self.min_log2_ptr) + + self.min_log2_ptr + ) + ptr = 2 ** ptr else: assert np.log2(ptr) > self.min_log2_ptr, "ptr < 2^0.5 may be unreliable" - if ori_pos is None: ori_pos = np.random.random() else: @@ -117,14 +127,13 @@ def simulate_reads(self, nreads, ptr = None, ori_pos = None): # convert counts to coordinates read_positions = [] - for i,c in enumerate(binned_counts): - size = binned_counts.size - m = (0.5*(2*i + 1)/size) + for i, c in enumerate(binned_counts): + size = binned_counts.size + m = 0.5 * (2 * i + 1) / size read_positions += [m for i in range(c)] - read_positions = self.get_genome_length()*np.array(read_positions) - - return read_positions, ptr, ori_pos, ter_pos # 100bp bins + read_positions = self.get_genome_length() * np.array(read_positions) + return read_positions, ptr, ori_pos, ter_pos # 100bp bins def adjust_read_probs(self, ptr, ori_pos, ter_pos): """Scale bin probabilities based on the PTR. @@ -147,16 +156,15 @@ def adjust_read_probs(self, ptr, ori_pos, ter_pos): # (bin_edges[i], bin_edges[i+1]), so let's # take the midpoint of each bin if i == self.bin_edges.size - 1: - m = 0.5*(self.bin_edges[i] + 1) - length = (1 - self.bin_edges[i]) + m = 0.5 * (self.bin_edges[i] + 1) + length = 1 - self.bin_edges[i] else: - m = 0.5*(self.bin_edges[i] + self.bin_edges[i+1]) - length = (self.bin_edges[i+1] - self.bin_edges[i]) + m = 0.5 * (self.bin_edges[i] + self.bin_edges[i + 1]) + length = self.bin_edges[i + 1] - self.bin_edges[i] x1 = np.min([ori_pos, ter_pos]) x2 = np.max([ori_pos, ter_pos]) - # since we normalize bins at the end, # we can compute c1 and c2 + a constant # in this case we can use c - log p(x_t) @@ -168,11 +176,11 @@ def adjust_read_probs(self, ptr, ori_pos, ter_pos): c2 = np.log2(ptr) if m <= x1: - adj_probs[i] = -alpha*(m - x1) + c1 + adj_probs[i] = -alpha * (m - x1) + c1 elif x1 < m and m < x2: - adj_probs[i] = alpha*(m - x1) + c1 + adj_probs[i] = alpha * (m - x1) + c1 else: - adj_probs[i] = -alpha*(m - x2) + c2 + adj_probs[i] = -alpha * (m - x2) + c2 adj_probs[i] += np.log2(length) # normalize @@ -183,8 +191,12 @@ def adjust_read_probs(self, ptr, ori_pos, ter_pos): # reweight and normalize new_probs = np.zeros(self.probs.size) - new_probs[self.probs != 0] = np.log(self.probs[self.probs != 0]) + adj_probs[self.probs != 0] - new_probs[self.probs != 0] -= scipy.special.logsumexp(new_probs[self.probs != 0]) + new_probs[self.probs != 0] = ( + np.log(self.probs[self.probs != 0]) + adj_probs[self.probs != 0] + ) + new_probs[self.probs != 0] -= scipy.special.logsumexp( + new_probs[self.probs != 0] + ) new_probs[self.probs != 0] = np.exp(new_probs[self.probs != 0]) - return new_probs \ No newline at end of file + return new_probs
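
A usage sketch (not itself part of this patch): since src/print.py and its print_info/print_warning/print_error helpers are removed in favor of module-level logging loggers, a calling script now has to configure logging itself to see the progress messages, and the modules are imported from the renamed coptr package. The module path coptr.coptr_ref, the genome id, the folder name, and the min_reads/min_cov values below are illustrative assumptions rather than values taken from this diff.

    import logging

    # Module path assumed from the src/ -> src/coptr/ renames in this patch;
    # adjust it to wherever estimate_ptrs_coptr_ref actually lives.
    from coptr.coptr_ref import estimate_ptrs_coptr_ref

    # Route the library's INFO messages (e.g. "Running <genome_id>.") to the console.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(name)s %(levelname)s: %(message)s",
    )

    # Hypothetical inputs: one reference genome id and the folder holding the
    # grouped coverage maps ("<genome_id>.cm.pkl" files) from an earlier step.
    estimates = estimate_ptrs_coptr_ref(
        ref_genome_ids=["example-genome"],
        grouped_coverage_map_folder="coverage-maps-grouped",
        min_reads=5000,
        min_cov=0.75,
    )

    # Each CoPTRRefEstimate now also carries the new ori_estimate_coord field.
    for genome_id, genome_estimates in estimates.items():
        for est in genome_estimates:
            print(genome_id, est.sample_id, est.estimate, est.ori_estimate_coord)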