From 2db8657edf822878e05d3ce319d5729803728bb7 Mon Sep 17 00:00:00 2001 From: Don Freed Date: Mon, 19 Jan 2026 15:12:23 -0800 Subject: [PATCH 1/8] Add the argument `--skip_model_apply` to the pangenome pipeline - Add some unit tests for the pangenome pipeline --- sentieon_cli/sentieon_pangenome.py | 12 ++- tests/unit/test_sentieon_pangenome.py | 131 ++++++++++++++++++++++++++ 2 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 tests/unit/test_sentieon_pangenome.py diff --git a/sentieon_cli/sentieon_pangenome.py b/sentieon_cli/sentieon_pangenome.py index 862d481..b8b6b9c 100644 --- a/sentieon_cli/sentieon_pangenome.py +++ b/sentieon_cli/sentieon_pangenome.py @@ -227,6 +227,10 @@ class SentieonPangenome(BasePangenome): "help": argparse.SUPPRESS, "action": "store_true", }, + "skip_model_apply": { + "help": argparse.SUPPRESS, + "action": "store_true", + }, } ) @@ -243,6 +247,7 @@ def __init__(self) -> None: self.skip_contig_checks: bool = False self.skip_pangenome_name_checks: bool = False self.skip_pop_vcf_id_check: bool = False + self.skip_model_apply = False def main(self, args: argparse.Namespace) -> None: """Run the pipeline""" @@ -468,6 +473,7 @@ def build_first_dag(self) -> DAG: """Build the main DAG for the Sentieon pangenome pipeline""" assert self.reference assert self.model_bundle + assert self.output_vcf self.logger.info("Building the Sentieon pangenome DAG") dag = DAG() @@ -597,13 +603,17 @@ def build_first_dag(self) -> DAG: # transfer annotations from the pop_vcf if self.pop_vcf: transfer_jobs, concat_job = self.build_transfer_jobs( - transfer_vcf, raw_vcf + transfer_vcf if not self.skip_model_apply else self.output_vcf, + raw_vcf, ) for job in transfer_jobs: dag.add_job(job, {dnascope_job}) dag.add_job(concat_job, set(transfer_jobs)) apply_dependencies.add(concat_job) + if self.skip_model_apply: + return dag + # DNAModelApply apply_job = self.build_dnamodelapply_job(transfer_vcf) dag.add_job(apply_job, apply_dependencies) diff --git a/tests/unit/test_sentieon_pangenome.py b/tests/unit/test_sentieon_pangenome.py new file mode 100644 index 0000000..7ec84da --- /dev/null +++ b/tests/unit/test_sentieon_pangenome.py @@ -0,0 +1,131 @@ +""" +Unit tests for SentieonPangenome pipeline logic +""" + +import pathlib +import pytest +import tempfile +import sys +import os +import json +from unittest.mock import patch, MagicMock + +# Add the parent directory to the path to import sentieon_cli +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) + +from sentieon_cli.sentieon_pangenome import SentieonPangenome +from sentieon_cli.dag import DAG + + +class TestSentieonPangenome: + """Test Sentieon pangenome pipeline logic""" + + def setup_method(self): + """Setup test fixtures""" + self.temp_dir = tempfile.mkdtemp() + self.mock_dir = pathlib.Path(self.temp_dir) + + # Create mock files + self.mock_vcf = self.mock_dir / "output.vcf.gz" + self.mock_ref = self.mock_dir / "reference.fa" + self.mock_bam = self.mock_dir / "sample.bam" + self.mock_bundle = self.mock_dir / "model.bundle" + self.mock_gbz = self.mock_dir / "pangenome.grch38.gbz" + self.mock_hapl = self.mock_dir / "pangenome.grch38.hapl" + self.mock_pop_vcf = self.mock_dir / "population.vcf.gz" + + # Create empty files + for file_path in [ + self.mock_ref, + self.mock_bam, + self.mock_bundle, + self.mock_gbz, + self.mock_hapl, + self.mock_pop_vcf, + ]: + file_path.touch() + + # Create reference index + with open(str(self.mock_ref) + ".fai", "w") as f: + f.write("chr1\t1000\t0\t80\t81\n") + + def create_pipeline(self): + """Create a SentieonPangenome pipeline for testing""" + pipeline = SentieonPangenome() + + # Setup mocks + pipeline.logger = MagicMock() + + # Configure arguments + pipeline.output_vcf = self.mock_vcf + pipeline.reference = self.mock_ref + pipeline.model_bundle = self.mock_bundle + pipeline.sample_input = [self.mock_bam] + pipeline.gbz = self.mock_gbz + pipeline.hapl = self.mock_hapl + pipeline.pop_vcf = self.mock_pop_vcf + pipeline.cores = 2 + pipeline.dry_run = True + pipeline.skip_version_check = True + pipeline.skip_contig_checks = True + pipeline.skip_pangenome_name_checks = True + pipeline.skip_pop_vcf_id_check = True + pipeline.tmp_dir = self.mock_dir + + # Mock parsing fai + pipeline.fai_data = {"chr1": {"length": 1000}} + pipeline.shards = [MagicMock()] + pipeline.shards[0].contig = "chr1" + pipeline.shards[0].start = 1 + pipeline.shards[0].stop = 1000 + + # Mock pop vcf contigs + pipeline.pop_vcf_contigs = {"chr1": 1000} + + # Mock collection readgroups + pipeline.bam_readgroups = [{"ID": "rg1", "SM": "sample1"}] + pipeline.fastq_readgroup = {} + pipeline.tech = "Illumina" + pipeline.pcr_free = False + + return pipeline + + def test_model_apply_default(self): + """Test that model apply job is created by default""" + pipeline = self.create_pipeline() + dag = pipeline.build_first_dag() + + # Verify DAG is created + assert isinstance(dag, DAG) + + # Check for model-apply job + job_names = [job.name for job in dag.waiting_jobs.keys()] + assert "model-apply" in job_names + + def test_skip_model_apply(self): + """Test that model apply job is skipped when requested""" + pipeline = self.create_pipeline() + pipeline.skip_model_apply = True + dag = pipeline.build_first_dag() + + # Verify DAG is created + assert isinstance(dag, DAG) + + # Check that model-apply job is NOT present + job_names = [job.name for job in dag.waiting_jobs] + assert "model-apply" not in job_names + + # Verify that the concat job writes to the final output + # Find the merge-trim-concat job + concat_job = None + for job in dag.waiting_jobs: + if job.name == "merge-trim-concat": + concat_job = job + break + + assert concat_job is not None + # Check that the first argument (output file) is set to the final output VCF + # Check that the output file is in the arguments + args_str = " ".join([str(arg) for arg in concat_job.shell.nodes[0].args]) + assert str(pipeline.output_vcf) in args_str + From 8bfacb879a54594638b99738770dbc38d99bb861 Mon Sep 17 00:00:00 2001 From: Don Freed Date: Mon, 19 Jan 2026 16:17:43 -0800 Subject: [PATCH 2/8] Add a `--version` argument --- sentieon_cli/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sentieon_cli/__init__.py b/sentieon_cli/__init__.py index 5a2e86a..54b80b5 100644 --- a/sentieon_cli/__init__.py +++ b/sentieon_cli/__init__.py @@ -4,6 +4,7 @@ from .dnascope_longread import DNAscopeLRPipeline from .pangenome import PangenomePipeline from .sentieon_pangenome import SentieonPangenome +from .util import __version__ def main(): @@ -26,6 +27,11 @@ def main(): dest="loglevel", const="DEBUG", ) + parser.add_argument( + "--version", + action="version", + version=__version__, + ) subparsers = parser.add_subparsers(required=True) # DNAscope parser From 0793cf7d986db0298d1a02e7822cb5bdaaa2f1ee Mon Sep 17 00:00:00 2001 From: Don Freed Date: Wed, 21 Jan 2026 15:44:05 -0800 Subject: [PATCH 3/8] Add the `--pop_vcf` argument to the hybrid pipeline --- sentieon_cli/dnascope_hybrid.py | 115 +++++++++- sentieon_cli/sentieon_pangenome.py | 248 ++------------------- sentieon_cli/shard.py | 117 ++++++++++ sentieon_cli/transfer.py | 135 +++++++++++ sentieon_cli/util.py | 11 + tests/unit/test_dnascope_hybrid_pop_vcf.py | 167 ++++++++++++++ tests/unit/test_pipeline_validation.py | 3 +- tests/utils/test_helpers.py | 1 + 8 files changed, 558 insertions(+), 239 deletions(-) create mode 100644 sentieon_cli/shard.py create mode 100644 sentieon_cli/transfer.py create mode 100644 tests/unit/test_dnascope_hybrid_pop_vcf.py diff --git a/sentieon_cli/dnascope_hybrid.py b/sentieon_cli/dnascope_hybrid.py index d268a6a..4fe078c 100644 --- a/sentieon_cli/dnascope_hybrid.py +++ b/sentieon_cli/dnascope_hybrid.py @@ -6,13 +6,17 @@ import copy import json import pathlib +import re +import shutil +import subprocess as sp import sys -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple import packaging.version from importlib_resources import files +from .logging import get_logger from .archive import ar_load from . import command_strings as cmds from .dag import DAG @@ -37,7 +41,19 @@ parse_rg_line, path_arg, split_alignment, + vcf_id, ) +from .shard import ( + GRCH38_CONTIGS, + Shard, + determine_shards_from_fai, + parse_fai, + vcf_contigs, +) +from .transfer import build_transfer_jobs + + +logger = get_logger(__name__) CALLING_MIN_VERSIONS = { @@ -113,6 +129,7 @@ class DNAscopeHybridPipeline(DNAscopePipeline, DNAscopeLRPipeline): params = copy.deepcopy(BasePipeline.params) params.update( { + # Required arguments "lr_aln": { "nargs": "*", "help": "Long-read BAM or CRAM files.", @@ -144,6 +161,7 @@ class DNAscopeHybridPipeline(DNAscopePipeline, DNAscopeLRPipeline): "nargs": "*", "help": "Readgroup information for the short-read fastq files", }, + # Additional arguments "bam_format": { "help": ( "Use the BAM format instead of CRAM for output aligned " @@ -189,6 +207,13 @@ class DNAscopeHybridPipeline(DNAscopePipeline, DNAscopeLRPipeline): ), "type": path_arg(exists=True, is_file=True), }, + "pop_vcf": { + "flags": ["--pop_vcf"], + "help": ( + "A VCF containing annotations for use with DNAModelApply." + ), + "type": path_arg(exists=True, is_file=True), + }, "rgsm": { "help": ( "Overwrite the SM tag of the input readgroups for " @@ -220,6 +245,7 @@ class DNAscopeHybridPipeline(DNAscopePipeline, DNAscopeLRPipeline): "choices": ["markdup", "rmdup", "none"], "default": "markdup", }, + # Hidden arguments "bwa_args": { # help="Extra arguments for sentieon bwa", "help": argparse.SUPPRESS, @@ -256,6 +282,10 @@ class DNAscopeHybridPipeline(DNAscopePipeline, DNAscopeLRPipeline): "help": argparse.SUPPRESS, "action": "store_true", }, + "skip_pop_vcf_id_check": { + "help": argparse.SUPPRESS, + "action": "store_true", + }, "sr_read_filter": { "help": argparse.SUPPRESS, }, @@ -277,6 +307,7 @@ def __init__(self) -> None: self.lr_aln: List[pathlib.Path] = [] self.lr_align_input = False self.lr_input_ref: Optional[pathlib.Path] = None + self.pop_vcf: Optional[pathlib.Path] = None self.bam_format = False self.rgsm: Optional[str] = None self.lr_fastq_taglist = "*" @@ -284,8 +315,18 @@ def __init__(self) -> None: self.lr_read_filter: Optional[str] = None self.assay = "WGS" self.skip_model_apply = False + self.skip_pop_vcf_id_check = False def validate(self) -> None: + self.fai_data = parse_fai(pathlib.Path(str(self.reference) + ".fai")) + self.pop_vcf_contigs: Dict[str, Optional[int]] = {} + if self.pop_vcf: + self.pop_vcf_contigs = vcf_contigs(self.pop_vcf, self.dry_run) + self.logger.debug("VCF contigs are: %s", self.pop_vcf_contigs) + self.shards = determine_shards_from_fai( + self.fai_data, 10 * 1000 * 1000 + ) + self.validate_bundle() self.collect_readgroups() self.validate_readgroups() @@ -329,6 +370,24 @@ def validate_bundle(self) -> None: bundle_info = json.loads(bundle_info_bytes.decode()) self.longread_tech = bundle_info.get("longReadPlatform") self.shortread_tech = bundle_info.get("shortReadPlatform") + bundle_vcf_id = bundle_info.get("SentieonVcfID") + + if bundle_vcf_id: + if not self.pop_vcf: + self.logger.error( + "The model bundle requires a population VCF file. Please " + "supply the `--pop_vcf` argument." + ) + sys.exit(2) + if not self.skip_pop_vcf_id_check and not self.dry_run: + pop_vcf_id = vcf_id(self.pop_vcf) + if bundle_vcf_id != pop_vcf_id: + self.logger.error( + "The ID of the `--pop_vcf` does not match the model " + "bundle" + ) + sys.exit(2) + if not self.longread_tech or not self.shortread_tech: self.logger.error( "The bundle file does not have the expected attributes. " @@ -586,6 +645,8 @@ def build_dag(self) -> DAG: concat_job, rm_job5, anno_job, + transfer_jobs, + transfer_concat, apply_job, norm_job, ) = self.call_variants(sr_aln, lr_aln, rg_info) @@ -601,8 +662,16 @@ def build_dag(self) -> DAG: dag.add_job(subset_job, {second_stage_job}) dag.add_job(concat_job, {subset_job, call2_job}) dag.add_job(anno_job, {concat_job}) + + apply_dependencies = {anno_job} + if transfer_jobs and transfer_concat: + for job in transfer_jobs: + dag.add_job(job, {anno_job}) + dag.add_job(transfer_concat, set(transfer_jobs)) + apply_dependencies = {transfer_concat} + if apply_job: - dag.add_job(apply_job, {anno_job}) + dag.add_job(apply_job, apply_dependencies) if apply_job and norm_job: dag.add_job(norm_job, {apply_job}) @@ -640,6 +709,8 @@ def call_variants( Job, Job, Job, + Optional[List[Job]], + Optional[Job], Optional[Job], Optional[Job], ]: @@ -910,12 +981,13 @@ def call_variants( hybrid_anno = pathlib.Path( str(files("sentieon_cli.scripts").joinpath("hybrid_anno.py")) ) - combined_anno_vcf = self.tmp_dir.joinpath("combined_tmp_anno.vcf.gz") - if self.skip_model_apply: - combined_anno_vcf = self.output_vcf + anno_target = self.tmp_dir.joinpath("combined_tmp_anno.vcf.gz") + if self.skip_model_apply and not self.pop_vcf: + anno_target = self.output_vcf + anno_job = Job( cmds.cmd_pyexec_hybrid_anno( - combined_anno_vcf, + anno_target, combined_tmp_vcf, stage1_hap_bed, hybrid_anno, @@ -924,6 +996,31 @@ def call_variants( "anno-calls", 0, ) + + transfer_jobs: Optional[List[Job]] = None + transfer_concat_job: Optional[Job] = None + input_to_apply = anno_target + + if self.pop_vcf: + transfer_target = self.tmp_dir.joinpath( + "combined_tmp_transfer.vcf.gz" + ) + if self.skip_model_apply: + transfer_target = self.output_vcf + + transfer_jobs, transfer_concat_job = build_transfer_jobs( + transfer_target, + self.pop_vcf, + anno_target, + self.tmp_dir, + self.shards, + self.pop_vcf_contigs, + self.fai_data, + self.dry_run, + self.cores, + ) + input_to_apply = transfer_target + if self.skip_model_apply: return ( call_job, @@ -943,6 +1040,8 @@ def call_variants( concat_job, rm_job5, anno_job, + transfer_jobs, + transfer_concat_job, None, None, ) @@ -957,7 +1056,7 @@ def call_variants( driver.add_algo( DNAModelApply( model=self.model_bundle.joinpath("hybrid.model"), - vcf=combined_anno_vcf, + vcf=input_to_apply, output=apply_vcf, ) ) @@ -994,6 +1093,8 @@ def call_variants( concat_job, rm_job5, anno_job, + transfer_jobs, + transfer_concat_job, apply_job, norm_job, ) diff --git a/sentieon_cli/sentieon_pangenome.py b/sentieon_cli/sentieon_pangenome.py index b8b6b9c..a322808 100644 --- a/sentieon_cli/sentieon_pangenome.py +++ b/sentieon_cli/sentieon_pangenome.py @@ -6,11 +6,9 @@ import copy import json import pathlib -import re import shutil -import subprocess as sp import sys -from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union +from typing import Dict, List, Optional, Set, Tuple, Union import packaging.version @@ -45,7 +43,15 @@ path_arg, tmp, total_memory, + vcf_id, ) +from .shard import ( + GRCH38_CONTIGS, + determine_shards_from_fai, + parse_fai, + vcf_contigs, +) +from .transfer import build_transfer_jobs SENT_PANGENOME_MIN_VERSIONS = { "kmc": None, @@ -55,124 +61,9 @@ "samtools": packaging.version.Version("1.16"), } -GRCH38_CONTIGS: Dict[str, int] = { - "chr1": 248956422, - "chr2": 242193529, - "chr3": 198295559, - "chr4": 190214555, - "chr5": 181538259, - "chr6": 170805979, - "chr7": 159345973, - "chr8": 145138636, - "chr9": 138394717, - "chr10": 133797422, - "chr11": 135086622, - "chr12": 133275309, - "chr13": 114364328, - "chr14": 107043718, - "chr15": 101991189, - "chr16": 90338345, - "chr17": 83257441, - "chr18": 80373285, - "chr19": 58617616, - "chr20": 64444167, - "chr21": 46709983, - "chr22": 50818468, - "chrX": 156040895, - "chrY": 57227415, - "chrM": 16569, -} - logger = get_logger(__name__) -class Shard(NamedTuple): - contig: str - start: int - stop: int - - def __str__(self) -> str: - return f"{self.contig}:{self.start}-{self.stop}" - - def bcftools_str(self) -> str: - return f"{{{self.contig}}}:{self.start}-{self.stop}" - - -def parse_fai(ref_fai: pathlib.Path) -> Dict[str, Dict[str, int]]: - """Parse a faidx index""" - contigs: Dict[str, Dict[str, int]] = {} - with open(ref_fai) as fh: - for line in fh: - try: - chrom, length, offset, lb, lw = line.rstrip().split() - except ValueError as err: - logger.error( - "Reference fasta index (.fai) does not have the expected " - "format" - ) - raise err - contigs[chrom] = { - "length": int(length), - "offset": int(offset), - "linebases": int(lb), - "linewidth": int(lw), - } - return contigs - - -def determine_shards_from_fai( - fai_data: Dict[str, Dict[str, int]], step: int -) -> List[Shard]: - """Generate shards of the genome from the fasta index""" - shards: List[Shard] = [] - for ctg, d in fai_data.items(): - pos = 1 - length = d["length"] - while pos <= length: - end = pos + step - 1 - end = end if end < length else length - shards.append(Shard(ctg, pos, end)) - pos = end + 1 - return shards - - -def vcf_contigs( - in_vcf: pathlib.Path, dry_run=False -) -> Dict[str, Optional[int]]: - """Report the contigs in the input VCF""" - if dry_run: - return { - "chr1": 100, - "chr2": 200, - "chr3": 300, - } - kvpat = re.compile(r'(.*?)=(".*?"|.*?)(?:,|$)') - cmd = ["bcftools", "view", "-h", str(in_vcf)] - p = sp.run(cmd, capture_output=True, text=True) - contigs: Dict[str, Optional[int]] = {} - for line in p.stdout.split("\n"): - if not line.startswith("##contig"): - continue - s = line.index("<") - e = line.index(">") - d = dict(kvpat.findall(line[s + 1 : e])) # noqa: E203 - ctg: str = d["ID"] - length: Optional[str] = d.get("length", None) - contigs[ctg] = int(length) if length else None - return contigs - - -def vcf_id(in_vcf: pathlib.Path) -> Optional[str]: - """Collect the SentieonVcfID header""" - cmd = ["bcftools", "view", "-h", str(in_vcf)] - p = sp.run(cmd, capture_output=True, text=True) - for line in p.stdout.split("\n"): - if line.startswith("##SentieonVcfID="): - i = line.index("=") - return line[i + 1 :] # noqa: E203 - return None - - class SentieonPangenome(BasePangenome): """The Sentieon pangenome pipeline""" @@ -602,9 +493,16 @@ def build_first_dag(self) -> DAG: # transfer annotations from the pop_vcf if self.pop_vcf: - transfer_jobs, concat_job = self.build_transfer_jobs( + transfer_jobs, concat_job = build_transfer_jobs( transfer_vcf if not self.skip_model_apply else self.output_vcf, + self.pop_vcf, raw_vcf, + self.tmp_dir, + self.shards, + self.pop_vcf_contigs, + self.fai_data, + self.dry_run, + self.cores, ) for job in transfer_jobs: dag.add_job(job, {dnascope_job}) @@ -903,118 +801,6 @@ def build_dnascope_job( self.cores, ) - def build_transfer_jobs( - self, out_vcf: pathlib.Path, raw_vcf: pathlib.Path - ) -> Tuple[List[Job], Job]: - """Transfer annotations from the pop_vcf to the raw_vcf""" - assert self.pop_vcf - - # Generate merge rules from the population VCF - merge_rules = "AC_v20:sum,AF_v20:sum,AC_genomes:sum,AF_genomes:sum" - if not self.dry_run: - kvpat = re.compile(r'(.*?)=(".*?"|.*?)(?:,|$)') - cmd = ["bcftools", "view", "-h", str(self.pop_vcf)] - p = sp.run(cmd, capture_output=True, text=True) - id_fields: List[str] = [] - for line in p.stdout.split("\n"): - if not line.startswith("##INFO"): - continue - if ",Number=A" not in line: - continue - s = line.index("<") - e = line.index(">") - d = dict(kvpat.findall(line[s + 1 : e])) # noqa: E203 - id_fields.append(d["ID"]) - merge_rules = ",".join([x + ":sum" for x in id_fields]) - - # Merge VCFs by shards - sharded_vcfs: List[pathlib.Path] = [] - sharded_merge_jobs: List[Job] = [] - trim_script = pathlib.Path( - str(files("sentieon_cli.scripts").joinpath("trimalt.py")) - ).resolve() - seen_contigs: Set[str] = set() - for i, shard in enumerate(self.shards): - # Use a BED file for unusual contig names - subset_bed = self.tmp_dir.joinpath( - f"sample-dnascope_transfer-subset{i}.bed" - ) - - # Extract contigs not in the pop vcf as merge will fail - if shard.contig not in self.pop_vcf_contigs: - if shard.contig in seen_contigs: - continue - self.logger.info( - "Skipping transfer for contig: %s", shard.contig - ) - seen_contigs.add(shard.contig) - subset_vcf = self.tmp_dir.joinpath( - f"sample-dnascope_transfer-subset{i}.vcf.gz" - ) - - ctg_len = self.fai_data[shard.contig]["length"] - if not self.dry_run: - with open(subset_bed, "w") as fh: - print(f"{shard.contig}\t0\t{ctg_len}", file=fh) - - view_job = Job( - cmds.cmd_bcftools_view_regions( - subset_vcf, - raw_vcf, - regions_file=subset_bed, - ), - "merge-trim-extra", - 1, - ) - sharded_merge_jobs.append(view_job) - sharded_vcfs.append(subset_vcf) - else: - if not self.dry_run: - with open(subset_bed, "w") as fh: - print( - f"{shard.contig}\t{shard.start}\t{shard.stop}", - file=fh, - ) - - self.logger.debug("Transferring shard: %s", shard) - shard_vcf = self.tmp_dir.joinpath( - f"sample-dnascope_transfer-shard{i}.vcf.gz" - ) - merge_job = Job( - cmds.cmd_bcftools_merge_trim( - shard_vcf, - raw_vcf, - self.pop_vcf, - trim_script, - subset_bed, - merge_rules=merge_rules, - merge_xargs=[ - "--no-version", - "--regions-overlap", - "pos", - "-m", - "all", - ], - view_xargs=["--no-version"], - ), - f"merge-trim-{i}", - 1, - ) - sharded_merge_jobs.append(merge_job) - sharded_vcfs.append(shard_vcf) - - # Concat all shards - concat_job = Job( - cmds.bcftools_concat( - out_vcf, - sharded_vcfs, - xargs=["--no-version", "--threads", str(self.cores)], - ), - "merge-trim-concat", - self.cores, - ) - return (sharded_merge_jobs, concat_job) - def build_dnamodelapply_job(self, in_vcf) -> Job: assert self.output_vcf assert self.model_bundle diff --git a/sentieon_cli/shard.py b/sentieon_cli/shard.py new file mode 100644 index 0000000..ee1af4f --- /dev/null +++ b/sentieon_cli/shard.py @@ -0,0 +1,117 @@ +""" +Sharding functionality for Sentieon pipelines +""" + +import pathlib +import re +import subprocess as sp +from typing import Dict, List, NamedTuple, Optional + +from .logging import get_logger + +logger = get_logger(__name__) + + +GRCH38_CONTIGS: Dict[str, int] = { + "chr1": 248956422, + "chr2": 242193529, + "chr3": 198295559, + "chr4": 190214555, + "chr5": 181538259, + "chr6": 170805979, + "chr7": 159345973, + "chr8": 145138636, + "chr9": 138394717, + "chr10": 133797422, + "chr11": 135086622, + "chr12": 133275309, + "chr13": 114364328, + "chr14": 107043718, + "chr15": 101991189, + "chr16": 90338345, + "chr17": 83257441, + "chr18": 80373285, + "chr19": 58617616, + "chr20": 64444167, + "chr21": 46709983, + "chr22": 50818468, + "chrX": 156040895, + "chrY": 57227415, + "chrM": 16569, +} + + +class Shard(NamedTuple): + contig: str + start: int + stop: int + + def __str__(self) -> str: + return f"{self.contig}:{self.start}-{self.stop}" + + def bcftools_str(self) -> str: + return f"{{{self.contig}}}:{self.start}-{self.stop}" + + +def parse_fai(ref_fai: pathlib.Path) -> Dict[str, Dict[str, int]]: + """Parse a faidx index""" + contigs: Dict[str, Dict[str, int]] = {} + with open(ref_fai) as fh: + for line in fh: + try: + chrom, length, offset, lb, lw = line.rstrip().split() + except ValueError as err: + logger.error( + "Reference fasta index (.fai) does not have the expected " + "format" + ) + raise err + contigs[chrom] = { + "length": int(length), + "offset": int(offset), + "linebases": int(lb), + "linewidth": int(lw), + } + return contigs + + +def determine_shards_from_fai( + fai_data: Dict[str, Dict[str, int]], step: int +) -> List[Shard]: + """Generate shards of the genome from the fasta index""" + shards: List[Shard] = [] + for ctg, d in fai_data.items(): + pos = 1 + length = d["length"] + while pos <= length: + end = pos + step - 1 + end = end if end < length else length + shards.append(Shard(ctg, pos, end)) + pos = end + 1 + return shards + + +def vcf_contigs( + in_vcf: pathlib.Path, dry_run=False +) -> Dict[str, Optional[int]]: + """Report the contigs in the input VCF""" + if dry_run: + return { + "chr1": 100, + "chr2": 200, + "chr3": 300, + } + kvpat = re.compile(r'(.*?)=(".*?"|.*?)(?:,|$)') + cmd = ["bcftools", "view", "-h", str(in_vcf)] + p = sp.run(cmd, capture_output=True, text=True) + contigs: Dict[str, Optional[int]] = {} + for line in p.stdout.split("\n"): + if not line.startswith("##contig"): + continue + s = line.index("<") + e = line.index(">") + d = dict(kvpat.findall(line[s + 1 : e])) # noqa: E203 + ctg: str = d["ID"] + length: Optional[str] = d.get("length", None) + contigs[ctg] = int(length) if length else None + return contigs diff --git a/sentieon_cli/transfer.py b/sentieon_cli/transfer.py new file mode 100644 index 0000000..8526f09 --- /dev/null +++ b/sentieon_cli/transfer.py @@ -0,0 +1,135 @@ +""" +Annotation transfer functionality +""" + +import pathlib +import re +import subprocess as sp +from typing import Any, Dict, List, Optional, Set, Tuple + +from importlib_resources import files + +from . import command_strings as cmds +from .job import Job +from .logging import get_logger +from .shard import Shard + +logger = get_logger(__name__) + + +def build_transfer_jobs( + out_vcf: pathlib.Path, + pop_vcf: pathlib.Path, + raw_vcf: pathlib.Path, + tmp_dir: pathlib.Path, + shards: List[Shard], + pop_vcf_contigs: Dict[str, Optional[int]], + fai_data: Dict[str, Dict[str, int]], + dry_run: bool = False, + cores: int = 1, +) -> Tuple[List[Job], Job]: + """Transfer annotations from the pop_vcf to the raw_vcf""" + + # Generate merge rules from the population VCF + merge_rules = "AC_v20:sum,AF_v20:sum,AC_genomes:sum,AF_genomes:sum" + if not dry_run: + kvpat = re.compile(r'(.*?)=(".*?"|.*?)(?:,|$)') + cmd = ["bcftools", "view", "-h", str(pop_vcf)] + p = sp.run(cmd, capture_output=True, text=True) + id_fields: List[str] = [] + for line in p.stdout.split("\n"): + if not line.startswith("##INFO"): + continue + if ",Number=A" not in line: + continue + s = line.index("<") + e = line.index(">") + d = dict(kvpat.findall(line[s + 1 : e])) # noqa: E203 + id_fields.append(d["ID"]) + merge_rules = ",".join([x + ":sum" for x in id_fields]) + + # Merge VCFs by shards + sharded_vcfs: List[pathlib.Path] = [] + sharded_merge_jobs: List[Job] = [] + trim_script = pathlib.Path( + str(files("sentieon_cli.scripts").joinpath("trimalt.py")) + ).resolve() + seen_contigs: Set[str] = set() + for i, shard in enumerate(shards): + # Use a BED file for unusual contig names + subset_bed = tmp_dir.joinpath( + f"sample-dnascope_transfer-subset{i}.bed" + ) + + # Extract contigs not in the pop vcf as merge will fail + if shard.contig not in pop_vcf_contigs: + if shard.contig in seen_contigs: + continue + logger.info("Skipping transfer for contig: %s", shard.contig) + seen_contigs.add(shard.contig) + subset_vcf = tmp_dir.joinpath( + f"sample-dnascope_transfer-subset{i}.vcf.gz" + ) + + ctg_len = fai_data[shard.contig]["length"] + if not dry_run: + with open(subset_bed, "w") as fh: + print(f"{shard.contig}\t0\t{ctg_len}", file=fh) + + view_job = Job( + cmds.cmd_bcftools_view_regions( + subset_vcf, + raw_vcf, + regions_file=subset_bed, + ), + "merge-trim-extra", + 1, + ) + sharded_merge_jobs.append(view_job) + sharded_vcfs.append(subset_vcf) + else: + if not dry_run: + with open(subset_bed, "w") as fh: + print( + f"{shard.contig}\t{shard.start}\t{shard.stop}", + file=fh, + ) + + logger.debug("Transferring shard: %s", shard) + shard_vcf = tmp_dir.joinpath( + f"sample-dnascope_transfer-shard{i}.vcf.gz" + ) + merge_job = Job( + cmds.cmd_bcftools_merge_trim( + shard_vcf, + raw_vcf, + pop_vcf, + trim_script, + subset_bed, + merge_rules=merge_rules, + merge_xargs=[ + "--no-version", + "--regions-overlap", + "pos", + "-m", + "all", + ], + view_xargs=["--no-version"], + ), + f"merge-trim-{i}", + 1, + ) + sharded_merge_jobs.append(merge_job) + sharded_vcfs.append(shard_vcf) + + # Concat all shards + concat_job = Job( + cmds.bcftools_concat( + out_vcf, + sharded_vcfs, + xargs=["--no-version", "--threads", str(cores)], + ), + "merge-trim-concat", + cores, + ) + return (sharded_merge_jobs, concat_job) diff --git a/sentieon_cli/util.py b/sentieon_cli/util.py index 32395e1..825514d 100644 --- a/sentieon_cli/util.py +++ b/sentieon_cli/util.py @@ -228,3 +228,14 @@ def get_read_length_aln( if m: return int(m.groupdict()["length"]) return 151 + + +def vcf_id(in_vcf: pathlib.Path) -> Optional[str]: + """Collect the SentieonVcfID header""" + cmd = ["bcftools", "view", "-h", str(in_vcf)] + p = sp.run(cmd, capture_output=True, text=True) + for line in p.stdout.split("\n"): + if line.startswith("##SentieonVcfID="): + i = line.index("=") + return line[i + 1 :] # noqa: E203 + return None diff --git a/tests/unit/test_dnascope_hybrid_pop_vcf.py b/tests/unit/test_dnascope_hybrid_pop_vcf.py new file mode 100644 index 0000000..661d277 --- /dev/null +++ b/tests/unit/test_dnascope_hybrid_pop_vcf.py @@ -0,0 +1,167 @@ +""" +Unit tests for DNAscopeHybridPipeline pop_vcf logic +""" + +import pathlib +import pytest +import tempfile +import sys +import os +import json +from unittest.mock import patch, MagicMock + +# Add the parent directory to the path to import sentieon_cli +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) + +from sentieon_cli.dnascope_hybrid import DNAscopeHybridPipeline +from sentieon_cli.dag import DAG + + +class TestDNAscopeHybridPopVcf: + """Test DNAscope hybrid pipeline pop_vcf logic""" + + def setup_method(self): + """Setup test fixtures""" + self.temp_dir = tempfile.mkdtemp() + self.mock_dir = pathlib.Path(self.temp_dir) + + # Create mock files + self.mock_vcf = self.mock_dir / "output.vcf.gz" + self.mock_ref = self.mock_dir / "reference.fa" + self.mock_sr_aln = [self.mock_dir / "short.bam"] + self.mock_lr_aln = [self.mock_dir / "long.bam"] + self.mock_bundle = self.mock_dir / "model.bundle" + self.mock_pop_vcf = self.mock_dir / "population.vcf.gz" + self.mock_bed = self.mock_dir / "interval.bed" + + # Create empty files + for file_path in [ + self.mock_ref, + self.mock_sr_aln[0], + self.mock_lr_aln[0], + self.mock_bundle, + self.mock_pop_vcf, + self.mock_bed, + ]: + file_path.touch() + + def create_pipeline(self): + """Create a DNAscopeHybridPipeline for testing""" + # We need to mock sys.exit to avoid exiting during initialization if validation fails + with patch("sys.exit"): + pipeline = DNAscopeHybridPipeline() + + # Setup mocks + pipeline.logger = MagicMock() + + # Configure arguments + pipeline.output_vcf = self.mock_vcf + pipeline.reference = self.mock_ref + pipeline.model_bundle = self.mock_bundle + pipeline.bed = self.mock_bed + pipeline.cores = 2 + pipeline.dry_run = True + pipeline.skip_version_check = True + pipeline.tmp_dir = self.mock_dir + pipeline.pop_vcf = None + pipeline.skip_pop_vcf_id_check = False + + # Mock fai related + pipeline.fai_data = {"chr1": {"length": 1000}} + pipeline.shards = [MagicMock()] + pipeline.shards[0].contig = "chr1" + pipeline.shards[0].start = 1 + pipeline.shards[0].stop = 1000 + + # Mock pop vcf contigs + pipeline.pop_vcf_contigs = {"chr1": 1000} + + # Mock readgroup info (needed for build_dag) + pipeline.lr_aln_readgroups = [[{"ID": "lr_rg1", "SM": "sample1"}]] + pipeline.sr_aln_readgroups = [[{"ID": "sr_rg1", "SM": "sample1"}]] + pipeline.hybrid_rg_sm = "sample1" + pipeline.hybrid_set_rg = False + pipeline.shortread_tech = "Illumina" + pipeline.longread_tech = "ONT" + pipeline.sr_aln = self.mock_sr_aln + pipeline.lr_aln = self.mock_lr_aln + + return pipeline + + @patch("sentieon_cli.dnascope_hybrid.ar_load") + def test_validation_requires_pop_vcf(self, mock_ar_load): + """Test that validation fails if model bundle requires pop_vcf but none provided""" + pipeline = self.create_pipeline() + + # Mock bundle info requiring SentieonVcfID + mock_bundle_info = { + "longReadPlatform": "ONT", + "shortReadPlatform": "Illumina", + "SentieonVcfID": "some_id", + "minScriptVersion": "2.0" + } + mock_ar_load.return_value = json.dumps(mock_bundle_info).encode() + pipeline.dry_run = False + + with patch("sys.exit") as mock_exit: + pipeline.validate_bundle() + mock_exit.assert_called_with(2) + + @patch("sentieon_cli.dnascope_hybrid.ar_load") + def test_validation_pop_vcf_mismatch(self, mock_ar_load): + """Test validation fails if pop_vcf ID mismatches""" + pipeline = self.create_pipeline() + pipeline.pop_vcf = self.mock_pop_vcf + + mock_bundle_info = { + "longReadPlatform": "ONT", + "shortReadPlatform": "Illumina", + "SentieonVcfID": "expected_id", + "minScriptVersion": "2.0" + } + mock_ar_load.return_value = json.dumps(mock_bundle_info).encode() + pipeline.dry_run = False + + with patch("sentieon_cli.dnascope_hybrid.vcf_id", return_value="wrong_id"), \ + patch("sys.exit") as mock_exit: + + pipeline.validate_bundle() + mock_exit.assert_called_with(2) + + def test_transfer_jobs_creation(self): + """Test that transfer jobs are added to the DAG when pop_vcf is present""" + pipeline = self.create_pipeline() + pipeline.pop_vcf = self.mock_pop_vcf + + # Build DAG + with patch("sentieon_cli.dnascope_hybrid.check_version", return_value=True): + dag = pipeline.build_dag() + + # Verify transfer jobs exist + job_names = [job.name for job in dag.waiting_jobs] + + # Expect merge-trim jobs + assert any("merge-trim" in name for name in job_names) + assert "merge-trim-concat" in job_names + + def test_model_apply_input_with_pop_vcf(self): + """Test that model apply uses the transfer output when pop_vcf is present""" + pipeline = self.create_pipeline() + pipeline.pop_vcf = self.mock_pop_vcf + + with patch("sentieon_cli.dnascope_hybrid.check_version", return_value=True): + dag = pipeline.build_dag() + + # Find model-apply job + apply_job = None + for job in dag.waiting_jobs: + if job.name == "model-apply": + apply_job = job + break + + assert apply_job is not None + + # Check dependency: apply_job should depend on merge-trim-concat + deps = dag.waiting_jobs[apply_job] + dep_names = [j.name for j in deps] + assert "merge-trim-concat" in dep_names diff --git a/tests/unit/test_pipeline_validation.py b/tests/unit/test_pipeline_validation.py index d9418c8..d8b5f69 100644 --- a/tests/unit/test_pipeline_validation.py +++ b/tests/unit/test_pipeline_validation.py @@ -236,12 +236,13 @@ def setup_method(self): # Create mock files self.mock_vcf = pathlib.Path(self.temp_dir) / "output.vcf.gz" self.mock_ref = pathlib.Path(self.temp_dir) / "reference.fa" + self.mock_fai = pathlib.Path(self.temp_dir) / "reference.fa.fai" self.mock_lr_bam = pathlib.Path(self.temp_dir) / "longread.bam" self.mock_sr_bam = pathlib.Path(self.temp_dir) / "shortread.bam" self.mock_fastq = pathlib.Path(self.temp_dir) / "sample_R1.fastq.gz" # Create empty files - for file_path in [self.mock_ref, self.mock_lr_bam, self.mock_sr_bam, self.mock_fastq]: + for file_path in [self.mock_ref, self.mock_fai, self.mock_lr_bam, self.mock_sr_bam, self.mock_fastq]: file_path.touch() # Create mock bundle file (content will be mocked by ar_load) diff --git a/tests/utils/test_helpers.py b/tests/utils/test_helpers.py index 8c214bc..cf057b7 100644 --- a/tests/utils/test_helpers.py +++ b/tests/utils/test_helpers.py @@ -194,6 +194,7 @@ def create_hybrid_pipeline(self, **kwargs) -> DNAscopeHybridPipeline: # Set default files pipeline.output_vcf = self.fs.create_file("output.vcf.gz") pipeline.reference = self.fs.create_file("reference.fa") + _fai = self.fs.create_file("reference.fa.fai") pipeline.model_bundle = self.fs.create_model_bundle("model.bundle", "DNAscope Hybrid") # Default configuration From c923ac7019788435883c108d5fcbea7066a2fe52 Mon Sep 17 00:00:00 2001 From: Don Freed Date: Wed, 21 Jan 2026 15:50:23 -0800 Subject: [PATCH 4/8] Formatting fixes --- sentieon_cli/dnascope_hybrid.py | 7 +------ sentieon_cli/transfer.py | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/sentieon_cli/dnascope_hybrid.py b/sentieon_cli/dnascope_hybrid.py index 4fe078c..284d3a7 100644 --- a/sentieon_cli/dnascope_hybrid.py +++ b/sentieon_cli/dnascope_hybrid.py @@ -6,11 +6,8 @@ import copy import json import pathlib -import re -import shutil -import subprocess as sp import sys -from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple import packaging.version @@ -44,8 +41,6 @@ vcf_id, ) from .shard import ( - GRCH38_CONTIGS, - Shard, determine_shards_from_fai, parse_fai, vcf_contigs, diff --git a/sentieon_cli/transfer.py b/sentieon_cli/transfer.py index 8526f09..6d5df85 100644 --- a/sentieon_cli/transfer.py +++ b/sentieon_cli/transfer.py @@ -5,7 +5,7 @@ import pathlib import re import subprocess as sp -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import Dict, List, Optional, Set, Tuple from importlib_resources import files From 1f3caa1dda632a9bc21e61e81bfad1d096910e8a Mon Sep 17 00:00:00 2001 From: Don Freed Date: Wed, 21 Jan 2026 16:15:54 -0800 Subject: [PATCH 5/8] Fix a test --- tests/unit/test_dnascope_hybrid_pop_vcf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_dnascope_hybrid_pop_vcf.py b/tests/unit/test_dnascope_hybrid_pop_vcf.py index 661d277..7add9f1 100644 --- a/tests/unit/test_dnascope_hybrid_pop_vcf.py +++ b/tests/unit/test_dnascope_hybrid_pop_vcf.py @@ -89,7 +89,8 @@ def create_pipeline(self): return pipeline @patch("sentieon_cli.dnascope_hybrid.ar_load") - def test_validation_requires_pop_vcf(self, mock_ar_load): + @patch("sentieon_cli.dnascope_hybrid.vcf_id") + def test_validation_requires_pop_vcf(self, mock_vcf_id, mock_ar_load): """Test that validation fails if model bundle requires pop_vcf but none provided""" pipeline = self.create_pipeline() From df415c04a5dc642858b004aa174ae5b857c613fd Mon Sep 17 00:00:00 2001 From: Don Freed Date: Thu, 22 Jan 2026 11:22:15 -0800 Subject: [PATCH 6/8] Increment version --- README.md | 4 ++-- pyproject.toml | 2 +- sentieon_cli/util.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1711756..f58b8b8 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ A command-line interface for the Sentieon software Download the latest tar.gz file from the GitHub release page, https://github.com/sentieon/sentieon-cli/releases/ and install the package with pip: ```sh -curl -LO https://github.com/Sentieon/sentieon-cli/releases/download/v1.5.0/sentieon_cli-1.5.0.tar.gz -pip install sentieon_cli-1.5.0.tar.gz +curl -LO https://github.com/Sentieon/sentieon-cli/releases/download/v1.5.1/sentieon_cli-1.5.1.tar.gz +pip install sentieon_cli-1.5.1.tar.gz ``` ## Installation with Poetry diff --git a/pyproject.toml b/pyproject.toml index daac4e4..1296c34 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "sentieon_cli" -version = "1.5.0" +version = "1.5.1" description = "entry point for sentieon command-line tools" authors = [ {name = "Don Freed", email = "don.freed@sentieon.com"}, diff --git a/sentieon_cli/util.py b/sentieon_cli/util.py index 825514d..2d4cea3 100644 --- a/sentieon_cli/util.py +++ b/sentieon_cli/util.py @@ -17,7 +17,7 @@ from .logging import get_logger -__version__ = "1.5.0" +__version__ = "1.5.1" logger = get_logger(__name__) From 4635465acedf9c03121f7c2b3f204b4ffc01c315 Mon Sep 17 00:00:00 2001 From: Don Freed Date: Fri, 23 Jan 2026 13:51:34 -0800 Subject: [PATCH 7/8] Add a check for the kmc binary --- sentieon_cli/sentieon_pangenome.py | 11 ++++++++ sentieon_cli/util.py | 41 ++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/sentieon_cli/sentieon_pangenome.py b/sentieon_cli/sentieon_pangenome.py index a322808..7ac9cc1 100644 --- a/sentieon_cli/sentieon_pangenome.py +++ b/sentieon_cli/sentieon_pangenome.py @@ -39,6 +39,7 @@ from .util import ( __version__, check_version, + check_kmc_patch, parse_rg_line, path_arg, tmp, @@ -194,6 +195,16 @@ def validate(self) -> None: if not check_version(cmd, min_version): sys.exit(2) + if self.sample_input: + if not check_kmc_patch("kmc"): + self.logger.error( + "Error: The 'kmc' executable in the PATH does not " + "support reading from stdin. Please ensure " + "you are using the patched version of KMC from " + "https://github.com/Sentieon/KMC/releases." + ) + sys.exit(2) + if self.bed is None: self.logger.info( "A BED file is recommended to avoid variant calling " diff --git a/sentieon_cli/util.py b/sentieon_cli/util.py index 2d4cea3..2e27fcf 100644 --- a/sentieon_cli/util.py +++ b/sentieon_cli/util.py @@ -239,3 +239,44 @@ def vcf_id(in_vcf: pathlib.Path) -> Optional[str]: i = line.index("=") return line[i + 1 :] # noqa: E203 return None + + +def check_kmc_patch(kmc_cmd: str = "kmc") -> bool: + """Check if the KMC version supports the required patch""" + # Create a temporary directory for the test + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = pathlib.Path(temp_dir) + output_prefix = temp_path / "kmc_test" + + # Test input sequence + test_input = ( + ">206B4ABXX100825:7:1:1360:6029/1\n" + "TGATTTTNNNNNNNNNNNTGAAGAACGCACCCATGTTAAAGAGCATGACAAANNNANNACAAGGCTAAGNGGCGNG\n" + ">206B4ABXX100825:7:1:1362:4449/1\n" + "ATTCCCCNNNNNNNNNNNCCACAGCCGGAGGAGCTGACCAACATCCTGGAGATNTGNAATGTGGTCTTANCCAGNA" + ) + + cmd = [ + kmc_cmd, + "-k29", + "-m4", + "-okff", + "-t1", + "-fa", + "/dev/stdin", + str(output_prefix), + str(temp_path), + ] + + try: + sp.run( + cmd, + input=test_input, + text=True, + check=True, + stdout=sp.DEVNULL, + stderr=sp.DEVNULL, + ) + return True + except (sp.CalledProcessError, FileNotFoundError): + return False From 35f5f6d355667d911bc0cbd078dbcb1f26885f40 Mon Sep 17 00:00:00 2001 From: Don Freed Date: Fri, 23 Jan 2026 14:00:18 -0800 Subject: [PATCH 8/8] Formatting --- sentieon_cli/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sentieon_cli/util.py b/sentieon_cli/util.py index 2e27fcf..8abc80a 100644 --- a/sentieon_cli/util.py +++ b/sentieon_cli/util.py @@ -251,9 +251,9 @@ def check_kmc_patch(kmc_cmd: str = "kmc") -> bool: # Test input sequence test_input = ( ">206B4ABXX100825:7:1:1360:6029/1\n" - "TGATTTTNNNNNNNNNNNTGAAGAACGCACCCATGTTAAAGAGCATGACAAANNNANNACAAGGCTAAGNGGCGNG\n" + "TGATTTTNNNNNNNNNNNTGAAGAACGCACCCATGTTAAAGAGCATGACAAANNNANNACAAGGCTAAGNGGCGNG\n" # noqa: E501 ">206B4ABXX100825:7:1:1362:4449/1\n" - "ATTCCCCNNNNNNNNNNNCCACAGCCGGAGGAGCTGACCAACATCCTGGAGATNTGNAATGTGGTCTTANCCAGNA" + "ATTCCCCNNNNNNNNNNNCCACAGCCGGAGGAGCTGACCAACATCCTGGAGATNTGNAATGTGGTCTTANCCAGNA" # noqa: E501 ) cmd = [