From 2db8657edf822878e05d3ce319d5729803728bb7 Mon Sep 17 00:00:00 2001
From: Don Freed <donfreed12@gmail.com>
Date: Mon, 19 Jan 2026 15:12:23 -0800
Subject: [PATCH 1/8] Add the argument `--skip_model_apply` to the pangenome
 pipeline

- Add some unit tests for the pangenome pipeline
---
 sentieon_cli/sentieon_pangenome.py    |  12 ++-
 tests/unit/test_sentieon_pangenome.py | 131 ++++++++++++++++++++++++++
 2 files changed, 142 insertions(+), 1 deletion(-)
 create mode 100644 tests/unit/test_sentieon_pangenome.py

diff --git a/sentieon_cli/sentieon_pangenome.py b/sentieon_cli/sentieon_pangenome.py
index 862d481..b8b6b9c 100644
--- a/sentieon_cli/sentieon_pangenome.py
+++ b/sentieon_cli/sentieon_pangenome.py
@@ -227,6 +227,10 @@ class SentieonPangenome(BasePangenome):
                 "help": argparse.SUPPRESS,
                 "action": "store_true",
             },
+            "skip_model_apply": {
+                "help": argparse.SUPPRESS,
+                "action": "store_true",
+            },
         }
     )
 
@@ -243,6 +247,7 @@ def __init__(self) -> None:
         self.skip_contig_checks: bool = False
         self.skip_pangenome_name_checks: bool = False
         self.skip_pop_vcf_id_check: bool = False
+        self.skip_model_apply = False
 
     def main(self, args: argparse.Namespace) -> None:
         """Run the pipeline"""
@@ -468,6 +473,7 @@ def build_first_dag(self) -> DAG:
         """Build the main DAG for the Sentieon pangenome pipeline"""
         assert self.reference
         assert self.model_bundle
+        assert self.output_vcf
 
         self.logger.info("Building the Sentieon pangenome DAG")
         dag = DAG()
@@ -597,13 +603,17 @@ def build_first_dag(self) -> DAG:
         # transfer annotations from the pop_vcf
         if self.pop_vcf:
             transfer_jobs, concat_job = self.build_transfer_jobs(
-                transfer_vcf, raw_vcf
+                transfer_vcf if not self.skip_model_apply else self.output_vcf,
+                raw_vcf,
             )
             for job in transfer_jobs:
                 dag.add_job(job, {dnascope_job})
             dag.add_job(concat_job, set(transfer_jobs))
             apply_dependencies.add(concat_job)
 
+        if self.skip_model_apply:
+            return dag
+
         # DNAModelApply
         apply_job = self.build_dnamodelapply_job(transfer_vcf)
         dag.add_job(apply_job, apply_dependencies)
diff --git a/tests/unit/test_sentieon_pangenome.py b/tests/unit/test_sentieon_pangenome.py
new file mode 100644
index 0000000..7ec84da
--- /dev/null
+++ b/tests/unit/test_sentieon_pangenome.py
@@ -0,0 +1,131 @@
+"""
+Unit tests for SentieonPangenome pipeline logic
+"""
+
+import pathlib
+import pytest
+import tempfile
+import sys
+import os
+import json
+from unittest.mock import patch, MagicMock
+
+# Add the parent directory to the path to import sentieon_cli
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
+
+from sentieon_cli.sentieon_pangenome import SentieonPangenome
+from sentieon_cli.dag import DAG
+
+
+class TestSentieonPangenome:
+    """Test Sentieon pangenome pipeline logic"""
+
+    def setup_method(self):
+        """Setup test fixtures"""
+        self.temp_dir = tempfile.mkdtemp()
+        self.mock_dir = pathlib.Path(self.temp_dir)
+
+        # Create mock files
+        self.mock_vcf = self.mock_dir / "output.vcf.gz"
+        self.mock_ref = self.mock_dir / "reference.fa"
+        self.mock_bam = self.mock_dir / "sample.bam"
+        self.mock_bundle = self.mock_dir / "model.bundle"
+        self.mock_gbz = self.mock_dir / "pangenome.grch38.gbz"
+        self.mock_hapl = self.mock_dir / "pangenome.grch38.hapl"
+        self.mock_pop_vcf = self.mock_dir / "population.vcf.gz"
+
+        # Create empty files
+        for file_path in [
+            self.mock_ref,
+            self.mock_bam,
+            self.mock_bundle,
+            self.mock_gbz,
+            self.mock_hapl,
+            self.mock_pop_vcf,
+        ]:
+            file_path.touch()
+
+        # Create reference index
+        with open(str(self.mock_ref) + ".fai", "w") as f:
+            f.write("chr1\t1000\t0\t80\t81\n")
+
+    def create_pipeline(self):
+        """Create a SentieonPangenome pipeline for testing"""
+        pipeline = SentieonPangenome()
+        
+        # Setup mocks
+        pipeline.logger = MagicMock()
+        
+        # Configure arguments
+        pipeline.output_vcf = self.mock_vcf
+        pipeline.reference = self.mock_ref
+        pipeline.model_bundle = self.mock_bundle
+        pipeline.sample_input = [self.mock_bam]
+        pipeline.gbz = self.mock_gbz
+        pipeline.hapl = self.mock_hapl
+        pipeline.pop_vcf = self.mock_pop_vcf
+        pipeline.cores = 2
+        pipeline.dry_run = True
+        pipeline.skip_version_check = True
+        pipeline.skip_contig_checks = True
+        pipeline.skip_pangenome_name_checks = True
+        pipeline.skip_pop_vcf_id_check = True
+        pipeline.tmp_dir = self.mock_dir
+        
+        # Mock parsing fai
+        pipeline.fai_data = {"chr1": {"length": 1000}}
+        pipeline.shards = [MagicMock()]
+        pipeline.shards[0].contig = "chr1"
+        pipeline.shards[0].start = 1
+        pipeline.shards[0].stop = 1000
+        
+        # Mock pop vcf contigs
+        pipeline.pop_vcf_contigs = {"chr1": 1000}
+
+        # Mock collection readgroups
+        pipeline.bam_readgroups = [{"ID": "rg1", "SM": "sample1"}]
+        pipeline.fastq_readgroup = {}
+        pipeline.tech = "Illumina"
+        pipeline.pcr_free = False
+
+        return pipeline
+
+    def test_model_apply_default(self):
+        """Test that model apply job is created by default"""
+        pipeline = self.create_pipeline()
+        dag = pipeline.build_first_dag()
+
+        # Verify DAG is created
+        assert isinstance(dag, DAG)
+
+        # Check for model-apply job
+        job_names = [job.name for job in dag.waiting_jobs.keys()]
+        assert "model-apply" in job_names
+
+    def test_skip_model_apply(self):
+        """Test that model apply job is skipped when requested"""
+        pipeline = self.create_pipeline()
+        pipeline.skip_model_apply = True
+        dag = pipeline.build_first_dag()
+
+        # Verify DAG is created
+        assert isinstance(dag, DAG)
+
+        # Check that model-apply job is NOT present
+        job_names = [job.name for job in dag.waiting_jobs]
+        assert "model-apply" not in job_names
+        
+        # Verify that the concat job writes to the final output
+        # Find the merge-trim-concat job
+        concat_job = None
+        for job in dag.waiting_jobs:
+            if job.name == "merge-trim-concat":
+                concat_job = job
+                break
+        
+        assert concat_job is not None
+        # Check that the first argument (output file) is set to the final output VCF
+        # Check that the output file is in the arguments
+        args_str = " ".join([str(arg) for arg in concat_job.shell.nodes[0].args])
+        assert str(pipeline.output_vcf) in args_str
+

From 8bfacb879a54594638b99738770dbc38d99bb861 Mon Sep 17 00:00:00 2001
From: Don Freed <donfreed12@gmail.com>
Date: Mon, 19 Jan 2026 16:17:43 -0800
Subject: [PATCH 2/8] Add a `--version` argument

---
 sentieon_cli/__init__.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/sentieon_cli/__init__.py b/sentieon_cli/__init__.py
index 5a2e86a..54b80b5 100644
--- a/sentieon_cli/__init__.py
+++ b/sentieon_cli/__init__.py
@@ -4,6 +4,7 @@
 from .dnascope_longread import DNAscopeLRPipeline
 from .pangenome import PangenomePipeline
 from .sentieon_pangenome import SentieonPangenome
+from .util import __version__
 
 
 def main():
@@ -26,6 +27,11 @@ def main():
         dest="loglevel",
         const="DEBUG",
     )
+    parser.add_argument(
+        "--version",
+        action="version",
+        version=__version__,
+    )
     subparsers = parser.add_subparsers(required=True)
 
     # DNAscope parser

From 0793cf7d986db0298d1a02e7822cb5bdaaa2f1ee Mon Sep 17 00:00:00 2001
From: Don Freed <donfreed12@gmail.com>
Date: Wed, 21 Jan 2026 15:44:05 -0800
Subject: [PATCH 3/8] Add the `--pop_vcf` argument to the hybrid pipeline

---
 sentieon_cli/dnascope_hybrid.py            | 115 +++++++++-
 sentieon_cli/sentieon_pangenome.py         | 248 ++-------------------
 sentieon_cli/shard.py                      | 117 ++++++++++
 sentieon_cli/transfer.py                   | 135 +++++++++++
 sentieon_cli/util.py                       |  11 +
 tests/unit/test_dnascope_hybrid_pop_vcf.py | 167 ++++++++++++++
 tests/unit/test_pipeline_validation.py     |   3 +-
 tests/utils/test_helpers.py                |   1 +
 8 files changed, 558 insertions(+), 239 deletions(-)
 create mode 100644 sentieon_cli/shard.py
 create mode 100644 sentieon_cli/transfer.py
 create mode 100644 tests/unit/test_dnascope_hybrid_pop_vcf.py

diff --git a/sentieon_cli/dnascope_hybrid.py b/sentieon_cli/dnascope_hybrid.py
index d268a6a..4fe078c 100644
--- a/sentieon_cli/dnascope_hybrid.py
+++ b/sentieon_cli/dnascope_hybrid.py
@@ -6,13 +6,17 @@
 import copy
 import json
 import pathlib
+import re
+import shutil
+import subprocess as sp
 import sys
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple
 
 import packaging.version
 
 from importlib_resources import files
 
+from .logging import get_logger
 from .archive import ar_load
 from . import command_strings as cmds
 from .dag import DAG
@@ -37,7 +41,19 @@
     parse_rg_line,
     path_arg,
     split_alignment,
+    vcf_id,
 )
+from .shard import (
+    GRCH38_CONTIGS,
+    Shard,
+    determine_shards_from_fai,
+    parse_fai,
+    vcf_contigs,
+)
+from .transfer import build_transfer_jobs
+
+
+logger = get_logger(__name__)
 
 
 CALLING_MIN_VERSIONS = {
@@ -113,6 +129,7 @@ class DNAscopeHybridPipeline(DNAscopePipeline, DNAscopeLRPipeline):
     params = copy.deepcopy(BasePipeline.params)
     params.update(
         {
+            # Required arguments
             "lr_aln": {
                 "nargs": "*",
                 "help": "Long-read BAM or CRAM files.",
@@ -144,6 +161,7 @@ class DNAscopeHybridPipeline(DNAscopePipeline, DNAscopeLRPipeline):
                 "nargs": "*",
                 "help": "Readgroup information for the short-read fastq files",
             },
+            # Additional arguments
             "bam_format": {
                 "help": (
                     "Use the BAM format instead of CRAM for output aligned "
@@ -189,6 +207,13 @@ class DNAscopeHybridPipeline(DNAscopePipeline, DNAscopeLRPipeline):
                 ),
                 "type": path_arg(exists=True, is_file=True),
             },
+            "pop_vcf": {
+                "flags": ["--pop_vcf"],
+                "help": (
+                    "A VCF containing annotations for use with DNAModelApply."
+                ),
+                "type": path_arg(exists=True, is_file=True),
+            },
             "rgsm": {
                 "help": (
                     "Overwrite the SM tag of the input readgroups for "
@@ -220,6 +245,7 @@ class DNAscopeHybridPipeline(DNAscopePipeline, DNAscopeLRPipeline):
                 "choices": ["markdup", "rmdup", "none"],
                 "default": "markdup",
             },
+            # Hidden arguments
             "bwa_args": {
                 # help="Extra arguments for sentieon bwa",
                 "help": argparse.SUPPRESS,
@@ -256,6 +282,10 @@ class DNAscopeHybridPipeline(DNAscopePipeline, DNAscopeLRPipeline):
                 "help": argparse.SUPPRESS,
                 "action": "store_true",
             },
+            "skip_pop_vcf_id_check": {
+                "help": argparse.SUPPRESS,
+                "action": "store_true",
+            },
             "sr_read_filter": {
                 "help": argparse.SUPPRESS,
             },
@@ -277,6 +307,7 @@ def __init__(self) -> None:
         self.lr_aln: List[pathlib.Path] = []
         self.lr_align_input = False
         self.lr_input_ref: Optional[pathlib.Path] = None
+        self.pop_vcf: Optional[pathlib.Path] = None
         self.bam_format = False
         self.rgsm: Optional[str] = None
         self.lr_fastq_taglist = "*"
@@ -284,8 +315,18 @@ def __init__(self) -> None:
         self.lr_read_filter: Optional[str] = None
         self.assay = "WGS"
         self.skip_model_apply = False
+        self.skip_pop_vcf_id_check = False
 
     def validate(self) -> None:
+        self.fai_data = parse_fai(pathlib.Path(str(self.reference) + ".fai"))
+        self.pop_vcf_contigs: Dict[str, Optional[int]] = {}
+        if self.pop_vcf:
+            self.pop_vcf_contigs = vcf_contigs(self.pop_vcf, self.dry_run)
+            self.logger.debug("VCF contigs are: %s", self.pop_vcf_contigs)
+        self.shards = determine_shards_from_fai(
+            self.fai_data, 10 * 1000 * 1000
+        )
+
         self.validate_bundle()
         self.collect_readgroups()
         self.validate_readgroups()
@@ -329,6 +370,24 @@ def validate_bundle(self) -> None:
         bundle_info = json.loads(bundle_info_bytes.decode())
         self.longread_tech = bundle_info.get("longReadPlatform")
         self.shortread_tech = bundle_info.get("shortReadPlatform")
+        bundle_vcf_id = bundle_info.get("SentieonVcfID")
+
+        if bundle_vcf_id:
+            if not self.pop_vcf:
+                self.logger.error(
+                    "The model bundle requires a population VCF file. Please "
+                    "supply the `--pop_vcf` argument."
+                )
+                sys.exit(2)
+            if not self.skip_pop_vcf_id_check and not self.dry_run:
+                pop_vcf_id = vcf_id(self.pop_vcf)
+                if bundle_vcf_id != pop_vcf_id:
+                    self.logger.error(
+                        "The ID of the `--pop_vcf` does not match the model "
+                        "bundle"
+                    )
+                    sys.exit(2)
+
         if not self.longread_tech or not self.shortread_tech:
             self.logger.error(
                 "The bundle file does not have the expected attributes. "
@@ -586,6 +645,8 @@ def build_dag(self) -> DAG:
             concat_job,
             rm_job5,
             anno_job,
+            transfer_jobs,
+            transfer_concat,
             apply_job,
             norm_job,
         ) = self.call_variants(sr_aln, lr_aln, rg_info)
@@ -601,8 +662,16 @@ def build_dag(self) -> DAG:
         dag.add_job(subset_job, {second_stage_job})
         dag.add_job(concat_job, {subset_job, call2_job})
         dag.add_job(anno_job, {concat_job})
+
+        apply_dependencies = {anno_job}
+        if transfer_jobs and transfer_concat:
+            for job in transfer_jobs:
+                dag.add_job(job, {anno_job})
+            dag.add_job(transfer_concat, set(transfer_jobs))
+            apply_dependencies = {transfer_concat}
+
         if apply_job:
-            dag.add_job(apply_job, {anno_job})
+            dag.add_job(apply_job, apply_dependencies)
         if apply_job and norm_job:
             dag.add_job(norm_job, {apply_job})
 
@@ -640,6 +709,8 @@ def call_variants(
         Job,
         Job,
         Job,
+        Optional[List[Job]],
+        Optional[Job],
         Optional[Job],
         Optional[Job],
     ]:
@@ -910,12 +981,13 @@ def call_variants(
         hybrid_anno = pathlib.Path(
             str(files("sentieon_cli.scripts").joinpath("hybrid_anno.py"))
         )
-        combined_anno_vcf = self.tmp_dir.joinpath("combined_tmp_anno.vcf.gz")
-        if self.skip_model_apply:
-            combined_anno_vcf = self.output_vcf
+        anno_target = self.tmp_dir.joinpath("combined_tmp_anno.vcf.gz")
+        if self.skip_model_apply and not self.pop_vcf:
+            anno_target = self.output_vcf
+
         anno_job = Job(
             cmds.cmd_pyexec_hybrid_anno(
-                combined_anno_vcf,
+                anno_target,
                 combined_tmp_vcf,
                 stage1_hap_bed,
                 hybrid_anno,
@@ -924,6 +996,31 @@ def call_variants(
             "anno-calls",
             0,
         )
+
+        transfer_jobs: Optional[List[Job]] = None
+        transfer_concat_job: Optional[Job] = None
+        input_to_apply = anno_target
+
+        if self.pop_vcf:
+            transfer_target = self.tmp_dir.joinpath(
+                "combined_tmp_transfer.vcf.gz"
+            )
+            if self.skip_model_apply:
+                transfer_target = self.output_vcf
+
+            transfer_jobs, transfer_concat_job = build_transfer_jobs(
+                transfer_target,
+                self.pop_vcf,
+                anno_target,
+                self.tmp_dir,
+                self.shards,
+                self.pop_vcf_contigs,
+                self.fai_data,
+                self.dry_run,
+                self.cores,
+            )
+            input_to_apply = transfer_target
+
         if self.skip_model_apply:
             return (
                 call_job,
@@ -943,6 +1040,8 @@ def call_variants(
                 concat_job,
                 rm_job5,
                 anno_job,
+                transfer_jobs,
+                transfer_concat_job,
                 None,
                 None,
             )
@@ -957,7 +1056,7 @@ def call_variants(
         driver.add_algo(
             DNAModelApply(
                 model=self.model_bundle.joinpath("hybrid.model"),
-                vcf=combined_anno_vcf,
+                vcf=input_to_apply,
                 output=apply_vcf,
             )
         )
@@ -994,6 +1093,8 @@ def call_variants(
             concat_job,
             rm_job5,
             anno_job,
+            transfer_jobs,
+            transfer_concat_job,
             apply_job,
             norm_job,
         )
diff --git a/sentieon_cli/sentieon_pangenome.py b/sentieon_cli/sentieon_pangenome.py
index b8b6b9c..a322808 100644
--- a/sentieon_cli/sentieon_pangenome.py
+++ b/sentieon_cli/sentieon_pangenome.py
@@ -6,11 +6,9 @@
 import copy
 import json
 import pathlib
-import re
 import shutil
-import subprocess as sp
 import sys
-from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union
+from typing import Dict, List, Optional, Set, Tuple, Union
 
 import packaging.version
 
@@ -45,7 +43,15 @@
     path_arg,
     tmp,
     total_memory,
+    vcf_id,
 )
+from .shard import (
+    GRCH38_CONTIGS,
+    determine_shards_from_fai,
+    parse_fai,
+    vcf_contigs,
+)
+from .transfer import build_transfer_jobs
 
 SENT_PANGENOME_MIN_VERSIONS = {
     "kmc": None,
@@ -55,124 +61,9 @@
     "samtools": packaging.version.Version("1.16"),
 }
 
-GRCH38_CONTIGS: Dict[str, int] = {
-    "chr1": 248956422,
-    "chr2": 242193529,
-    "chr3": 198295559,
-    "chr4": 190214555,
-    "chr5": 181538259,
-    "chr6": 170805979,
-    "chr7": 159345973,
-    "chr8": 145138636,
-    "chr9": 138394717,
-    "chr10": 133797422,
-    "chr11": 135086622,
-    "chr12": 133275309,
-    "chr13": 114364328,
-    "chr14": 107043718,
-    "chr15": 101991189,
-    "chr16": 90338345,
-    "chr17": 83257441,
-    "chr18": 80373285,
-    "chr19": 58617616,
-    "chr20": 64444167,
-    "chr21": 46709983,
-    "chr22": 50818468,
-    "chrX": 156040895,
-    "chrY": 57227415,
-    "chrM": 16569,
-}
-
 logger = get_logger(__name__)
 
 
-class Shard(NamedTuple):
-    contig: str
-    start: int
-    stop: int
-
-    def __str__(self) -> str:
-        return f"{self.contig}:{self.start}-{self.stop}"
-
-    def bcftools_str(self) -> str:
-        return f"{{{self.contig}}}:{self.start}-{self.stop}"
-
-
-def parse_fai(ref_fai: pathlib.Path) -> Dict[str, Dict[str, int]]:
-    """Parse a faidx index"""
-    contigs: Dict[str, Dict[str, int]] = {}
-    with open(ref_fai) as fh:
-        for line in fh:
-            try:
-                chrom, length, offset, lb, lw = line.rstrip().split()
-            except ValueError as err:
-                logger.error(
-                    "Reference fasta index (.fai) does not have the expected "
-                    "format"
-                )
-                raise err
-            contigs[chrom] = {
-                "length": int(length),
-                "offset": int(offset),
-                "linebases": int(lb),
-                "linewidth": int(lw),
-            }
-    return contigs
-
-
-def determine_shards_from_fai(
-    fai_data: Dict[str, Dict[str, int]], step: int
-) -> List[Shard]:
-    """Generate shards of the genome from the fasta index"""
-    shards: List[Shard] = []
-    for ctg, d in fai_data.items():
-        pos = 1
-        length = d["length"]
-        while pos <= length:
-            end = pos + step - 1
-            end = end if end < length else length
-            shards.append(Shard(ctg, pos, end))
-            pos = end + 1
-    return shards
-
-
-def vcf_contigs(
-    in_vcf: pathlib.Path, dry_run=False
-) -> Dict[str, Optional[int]]:
-    """Report the contigs in the input VCF"""
-    if dry_run:
-        return {
-            "chr1": 100,
-            "chr2": 200,
-            "chr3": 300,
-        }
-    kvpat = re.compile(r'(.*?)=(".*?"|.*?)(?:,|$)')
-    cmd = ["bcftools", "view", "-h", str(in_vcf)]
-    p = sp.run(cmd, capture_output=True, text=True)
-    contigs: Dict[str, Optional[int]] = {}
-    for line in p.stdout.split("\n"):
-        if not line.startswith("##contig"):
-            continue
-        s = line.index("<")
-        e = line.index(">")
-        d = dict(kvpat.findall(line[s + 1 : e]))  # noqa: E203
-        ctg: str = d["ID"]
-        length: Optional[str] = d.get("length", None)
-        contigs[ctg] = int(length) if length else None
-    return contigs
-
-
-def vcf_id(in_vcf: pathlib.Path) -> Optional[str]:
-    """Collect the SentieonVcfID header"""
-    cmd = ["bcftools", "view", "-h", str(in_vcf)]
-    p = sp.run(cmd, capture_output=True, text=True)
-    for line in p.stdout.split("\n"):
-        if line.startswith("##SentieonVcfID="):
-            i = line.index("=")
-            return line[i + 1 :]  # noqa: E203
-    return None
-
-
 class SentieonPangenome(BasePangenome):
     """The Sentieon pangenome pipeline"""
 
@@ -602,9 +493,16 @@ def build_first_dag(self) -> DAG:
 
         # transfer annotations from the pop_vcf
         if self.pop_vcf:
-            transfer_jobs, concat_job = self.build_transfer_jobs(
+            transfer_jobs, concat_job = build_transfer_jobs(
                 transfer_vcf if not self.skip_model_apply else self.output_vcf,
+                self.pop_vcf,
                 raw_vcf,
+                self.tmp_dir,
+                self.shards,
+                self.pop_vcf_contigs,
+                self.fai_data,
+                self.dry_run,
+                self.cores,
             )
             for job in transfer_jobs:
                 dag.add_job(job, {dnascope_job})
@@ -903,118 +801,6 @@ def build_dnascope_job(
             self.cores,
         )
 
-    def build_transfer_jobs(
-        self, out_vcf: pathlib.Path, raw_vcf: pathlib.Path
-    ) -> Tuple[List[Job], Job]:
-        """Transfer annotations from the pop_vcf to the raw_vcf"""
-        assert self.pop_vcf
-
-        # Generate merge rules from the population VCF
-        merge_rules = "AC_v20:sum,AF_v20:sum,AC_genomes:sum,AF_genomes:sum"
-        if not self.dry_run:
-            kvpat = re.compile(r'(.*?)=(".*?"|.*?)(?:,|$)')
-            cmd = ["bcftools", "view", "-h", str(self.pop_vcf)]
-            p = sp.run(cmd, capture_output=True, text=True)
-            id_fields: List[str] = []
-            for line in p.stdout.split("\n"):
-                if not line.startswith("##INFO"):
-                    continue
-                if ",Number=A" not in line:
-                    continue
-                s = line.index("<")
-                e = line.index(">")
-                d = dict(kvpat.findall(line[s + 1 : e]))  # noqa: E203
-                id_fields.append(d["ID"])
-            merge_rules = ",".join([x + ":sum" for x in id_fields])
-
-        # Merge VCFs by shards
-        sharded_vcfs: List[pathlib.Path] = []
-        sharded_merge_jobs: List[Job] = []
-        trim_script = pathlib.Path(
-            str(files("sentieon_cli.scripts").joinpath("trimalt.py"))
-        ).resolve()
-        seen_contigs: Set[str] = set()
-        for i, shard in enumerate(self.shards):
-            # Use a BED file for unusual contig names
-            subset_bed = self.tmp_dir.joinpath(
-                f"sample-dnascope_transfer-subset{i}.bed"
-            )
-
-            # Extract contigs not in the pop vcf as merge will fail
-            if shard.contig not in self.pop_vcf_contigs:
-                if shard.contig in seen_contigs:
-                    continue
-                self.logger.info(
-                    "Skipping transfer for contig: %s", shard.contig
-                )
-                seen_contigs.add(shard.contig)
-                subset_vcf = self.tmp_dir.joinpath(
-                    f"sample-dnascope_transfer-subset{i}.vcf.gz"
-                )
-
-                ctg_len = self.fai_data[shard.contig]["length"]
-                if not self.dry_run:
-                    with open(subset_bed, "w") as fh:
-                        print(f"{shard.contig}\t0\t{ctg_len}", file=fh)
-
-                view_job = Job(
-                    cmds.cmd_bcftools_view_regions(
-                        subset_vcf,
-                        raw_vcf,
-                        regions_file=subset_bed,
-                    ),
-                    "merge-trim-extra",
-                    1,
-                )
-                sharded_merge_jobs.append(view_job)
-                sharded_vcfs.append(subset_vcf)
-            else:
-                if not self.dry_run:
-                    with open(subset_bed, "w") as fh:
-                        print(
-                            f"{shard.contig}\t{shard.start}\t{shard.stop}",
-                            file=fh,
-                        )
-
-                self.logger.debug("Transferring shard: %s", shard)
-                shard_vcf = self.tmp_dir.joinpath(
-                    f"sample-dnascope_transfer-shard{i}.vcf.gz"
-                )
-                merge_job = Job(
-                    cmds.cmd_bcftools_merge_trim(
-                        shard_vcf,
-                        raw_vcf,
-                        self.pop_vcf,
-                        trim_script,
-                        subset_bed,
-                        merge_rules=merge_rules,
-                        merge_xargs=[
-                            "--no-version",
-                            "--regions-overlap",
-                            "pos",
-                            "-m",
-                            "all",
-                        ],
-                        view_xargs=["--no-version"],
-                    ),
-                    f"merge-trim-{i}",
-                    1,
-                )
-                sharded_merge_jobs.append(merge_job)
-                sharded_vcfs.append(shard_vcf)
-
-        # Concat all shards
-        concat_job = Job(
-            cmds.bcftools_concat(
-                out_vcf,
-                sharded_vcfs,
-                xargs=["--no-version", "--threads", str(self.cores)],
-            ),
-            "merge-trim-concat",
-            self.cores,
-        )
-        return (sharded_merge_jobs, concat_job)
-
     def build_dnamodelapply_job(self, in_vcf) -> Job:
         assert self.output_vcf
         assert self.model_bundle
diff --git a/sentieon_cli/shard.py b/sentieon_cli/shard.py
new file mode 100644
index 0000000..ee1af4f
--- /dev/null
+++ b/sentieon_cli/shard.py
@@ -0,0 +1,117 @@
+"""
+Sharding functionality for Sentieon pipelines
+"""
+
+import pathlib
+import re
+import subprocess as sp
+from typing import Dict, List, NamedTuple, Optional
+
+from .logging import get_logger
+
+logger = get_logger(__name__)
+
+
+GRCH38_CONTIGS: Dict[str, int] = {
+    "chr1": 248956422,
+    "chr2": 242193529,
+    "chr3": 198295559,
+    "chr4": 190214555,
+    "chr5": 181538259,
+    "chr6": 170805979,
+    "chr7": 159345973,
+    "chr8": 145138636,
+    "chr9": 138394717,
+    "chr10": 133797422,
+    "chr11": 135086622,
+    "chr12": 133275309,
+    "chr13": 114364328,
+    "chr14": 107043718,
+    "chr15": 101991189,
+    "chr16": 90338345,
+    "chr17": 83257441,
+    "chr18": 80373285,
+    "chr19": 58617616,
+    "chr20": 64444167,
+    "chr21": 46709983,
+    "chr22": 50818468,
+    "chrX": 156040895,
+    "chrY": 57227415,
+    "chrM": 16569,
+}
+
+
+class Shard(NamedTuple):
+    contig: str
+    start: int
+    stop: int
+
+    def __str__(self) -> str:
+        return f"{self.contig}:{self.start}-{self.stop}"
+
+    def bcftools_str(self) -> str:
+        return f"{{{self.contig}}}:{self.start}-{self.stop}"
+
+
+def parse_fai(ref_fai: pathlib.Path) -> Dict[str, Dict[str, int]]:
+    """Parse a faidx index"""
+    contigs: Dict[str, Dict[str, int]] = {}
+    with open(ref_fai) as fh:
+        for line in fh:
+            try:
+                chrom, length, offset, lb, lw = line.rstrip().split()
+            except ValueError as err:
+                logger.error(
+                    "Reference fasta index (.fai) does not have the expected "
+                    "format"
+                )
+                raise err
+            contigs[chrom] = {
+                "length": int(length),
+                "offset": int(offset),
+                "linebases": int(lb),
+                "linewidth": int(lw),
+            }
+    return contigs
+
+
+def determine_shards_from_fai(
+    fai_data: Dict[str, Dict[str, int]], step: int
+) -> List[Shard]:
+    """Generate shards of the genome from the fasta index"""
+    shards: List[Shard] = []
+    for ctg, d in fai_data.items():
+        pos = 1
+        length = d["length"]
+        while pos <= length:
+            end = pos + step - 1
+            end = end if end < length else length
+            shards.append(Shard(ctg, pos, end))
+            pos = end + 1
+    return shards
+
+
+def vcf_contigs(
+    in_vcf: pathlib.Path, dry_run=False
+) -> Dict[str, Optional[int]]:
+    """Report the contigs in the input VCF"""
+    if dry_run:
+        return {
+            "chr1": 100,
+            "chr2": 200,
+            "chr3": 300,
+        }
+    kvpat = re.compile(r'(.*?)=(".*?"|.*?)(?:,|$)')
+    cmd = ["bcftools", "view", "-h", str(in_vcf)]
+    p = sp.run(cmd, capture_output=True, text=True)
+    contigs: Dict[str, Optional[int]] = {}
+    for line in p.stdout.split("\n"):
+        if not line.startswith("##contig"):
+            continue
+        s = line.index("<")
+        e = line.index(">")
+        d = dict(kvpat.findall(line[s + 1 : e]))  # noqa: E203
+        ctg: str = d["ID"]
+        length: Optional[str] = d.get("length", None)
+        contigs[ctg] = int(length) if length else None
+    return contigs
diff --git a/sentieon_cli/transfer.py b/sentieon_cli/transfer.py
new file mode 100644
index 0000000..8526f09
--- /dev/null
+++ b/sentieon_cli/transfer.py
@@ -0,0 +1,135 @@
+"""
+Annotation transfer functionality
+"""
+
+import pathlib
+import re
+import subprocess as sp
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+from importlib_resources import files
+
+from . import command_strings as cmds
+from .job import Job
+from .logging import get_logger
+from .shard import Shard
+
+logger = get_logger(__name__)
+
+
+def build_transfer_jobs(
+    out_vcf: pathlib.Path,
+    pop_vcf: pathlib.Path,
+    raw_vcf: pathlib.Path,
+    tmp_dir: pathlib.Path,
+    shards: List[Shard],
+    pop_vcf_contigs: Dict[str, Optional[int]],
+    fai_data: Dict[str, Dict[str, int]],
+    dry_run: bool = False,
+    cores: int = 1,
+) -> Tuple[List[Job], Job]:
+    """Transfer annotations from the pop_vcf to the raw_vcf"""
+
+    # Generate merge rules from the population VCF
+    merge_rules = "AC_v20:sum,AF_v20:sum,AC_genomes:sum,AF_genomes:sum"
+    if not dry_run:
+        kvpat = re.compile(r'(.*?)=(".*?"|.*?)(?:,|$)')
+        cmd = ["bcftools", "view", "-h", str(pop_vcf)]
+        p = sp.run(cmd, capture_output=True, text=True)
+        id_fields: List[str] = []
+        for line in p.stdout.split("\n"):
+            if not line.startswith("##INFO"):
+                continue
+            if ",Number=A" not in line:
+                continue
+            s = line.index("<")
+            e = line.index(">")
+            d = dict(kvpat.findall(line[s + 1 : e]))  # noqa: E203
+            id_fields.append(d["ID"])
+        merge_rules = ",".join([x + ":sum" for x in id_fields])
+
+    # Merge VCFs by shards
+    sharded_vcfs: List[pathlib.Path] = []
+    sharded_merge_jobs: List[Job] = []
+    trim_script = pathlib.Path(
+        str(files("sentieon_cli.scripts").joinpath("trimalt.py"))
+    ).resolve()
+    seen_contigs: Set[str] = set()
+    for i, shard in enumerate(shards):
+        # Use a BED file for unusual contig names
+        subset_bed = tmp_dir.joinpath(
+            f"sample-dnascope_transfer-subset{i}.bed"
+        )
+
+        # Extract contigs not in the pop vcf as merge will fail
+        if shard.contig not in pop_vcf_contigs:
+            if shard.contig in seen_contigs:
+                continue
+            logger.info("Skipping transfer for contig: %s", shard.contig)
+            seen_contigs.add(shard.contig)
+            subset_vcf = tmp_dir.joinpath(
+                f"sample-dnascope_transfer-subset{i}.vcf.gz"
+            )
+
+            ctg_len = fai_data[shard.contig]["length"]
+            if not dry_run:
+                with open(subset_bed, "w") as fh:
+                    print(f"{shard.contig}\t0\t{ctg_len}", file=fh)
+
+            view_job = Job(
+                cmds.cmd_bcftools_view_regions(
+                    subset_vcf,
+                    raw_vcf,
+                    regions_file=subset_bed,
+                ),
+                "merge-trim-extra",
+                1,
+            )
+            sharded_merge_jobs.append(view_job)
+            sharded_vcfs.append(subset_vcf)
+        else:
+            if not dry_run:
+                with open(subset_bed, "w") as fh:
+                    print(
+                        f"{shard.contig}\t{shard.start}\t{shard.stop}",
+                        file=fh,
+                    )
+
+            logger.debug("Transferring shard: %s", shard)
+            shard_vcf = tmp_dir.joinpath(
+                f"sample-dnascope_transfer-shard{i}.vcf.gz"
+            )
+            merge_job = Job(
+                cmds.cmd_bcftools_merge_trim(
+                    shard_vcf,
+                    raw_vcf,
+                    pop_vcf,
+                    trim_script,
+                    subset_bed,
+                    merge_rules=merge_rules,
+                    merge_xargs=[
+                        "--no-version",
+                        "--regions-overlap",
+                        "pos",
+                        "-m",
+                        "all",
+                    ],
+                    view_xargs=["--no-version"],
+                ),
+                f"merge-trim-{i}",
+                1,
+            )
+            sharded_merge_jobs.append(merge_job)
+            sharded_vcfs.append(shard_vcf)
+
+    # Concat all shards
+    concat_job = Job(
+        cmds.bcftools_concat(
+            out_vcf,
+            sharded_vcfs,
+            xargs=["--no-version", "--threads", str(cores)],
+        ),
+        "merge-trim-concat",
+        cores,
+    )
+    return (sharded_merge_jobs, concat_job)
diff --git a/sentieon_cli/util.py b/sentieon_cli/util.py
index 32395e1..825514d 100644
--- a/sentieon_cli/util.py
+++ b/sentieon_cli/util.py
@@ -228,3 +228,14 @@ def get_read_length_aln(
         if m:
             return int(m.groupdict()["length"])
     return 151
+
+
+def vcf_id(in_vcf: pathlib.Path) -> Optional[str]:
+    """Collect the SentieonVcfID header"""
+    cmd = ["bcftools", "view", "-h", str(in_vcf)]
+    p = sp.run(cmd, capture_output=True, text=True)
+    for line in p.stdout.split("\n"):
+        if line.startswith("##SentieonVcfID="):
+            i = line.index("=")
+            return line[i + 1 :]  # noqa: E203
+    return None
diff --git a/tests/unit/test_dnascope_hybrid_pop_vcf.py b/tests/unit/test_dnascope_hybrid_pop_vcf.py
new file mode 100644
index 0000000..661d277
--- /dev/null
+++ b/tests/unit/test_dnascope_hybrid_pop_vcf.py
@@ -0,0 +1,167 @@
+"""
+Unit tests for DNAscopeHybridPipeline pop_vcf logic
+"""
+
+import pathlib
+import pytest
+import tempfile
+import sys
+import os
+import json
+from unittest.mock import patch, MagicMock
+
+# Add the parent directory to the path to import sentieon_cli
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
+
+from sentieon_cli.dnascope_hybrid import DNAscopeHybridPipeline
+from sentieon_cli.dag import DAG
+
+
+class TestDNAscopeHybridPopVcf:
+    """Test DNAscope hybrid pipeline pop_vcf logic"""
+
+    def setup_method(self):
+        """Setup test fixtures"""
+        self.temp_dir = tempfile.mkdtemp()
+        self.mock_dir = pathlib.Path(self.temp_dir)
+
+        # Create mock files
+        self.mock_vcf = self.mock_dir / "output.vcf.gz"
+        self.mock_ref = self.mock_dir / "reference.fa"
+        self.mock_sr_aln = [self.mock_dir / "short.bam"]
+        self.mock_lr_aln = [self.mock_dir / "long.bam"]
+        self.mock_bundle = self.mock_dir / "model.bundle"
+        self.mock_pop_vcf = self.mock_dir / "population.vcf.gz"
+        self.mock_bed = self.mock_dir / "interval.bed"
+
+        # Create empty files
+        for file_path in [
+            self.mock_ref,
+            self.mock_sr_aln[0],
+            self.mock_lr_aln[0],
+            self.mock_bundle,
+            self.mock_pop_vcf,
+            self.mock_bed,
+        ]:
+            file_path.touch()
+
+    def create_pipeline(self):
+        """Create a DNAscopeHybridPipeline for testing"""
+        # We need to mock sys.exit to avoid exiting during initialization if validation fails
+        with patch("sys.exit"):
+            pipeline = DNAscopeHybridPipeline()
+        
+        # Setup mocks
+        pipeline.logger = MagicMock()
+        
+        # Configure arguments
+        pipeline.output_vcf = self.mock_vcf
+        pipeline.reference = self.mock_ref
+        pipeline.model_bundle = self.mock_bundle
+        pipeline.bed = self.mock_bed
+        pipeline.cores = 2
+        pipeline.dry_run = True
+        pipeline.skip_version_check = True
+        pipeline.tmp_dir = self.mock_dir
+        pipeline.pop_vcf = None
+        pipeline.skip_pop_vcf_id_check = False
+        
+        # Mock fai related
+        pipeline.fai_data = {"chr1": {"length": 1000}}
+        pipeline.shards = [MagicMock()]
+        pipeline.shards[0].contig = "chr1"
+        pipeline.shards[0].start = 1
+        pipeline.shards[0].stop = 1000
+        
+        # Mock pop vcf contigs
+        pipeline.pop_vcf_contigs = {"chr1": 1000}
+
+        # Mock readgroup info (needed for build_dag)
+        pipeline.lr_aln_readgroups = [[{"ID": "lr_rg1", "SM": "sample1"}]]
+        pipeline.sr_aln_readgroups = [[{"ID": "sr_rg1", "SM": "sample1"}]]
+        pipeline.hybrid_rg_sm = "sample1"
+        pipeline.hybrid_set_rg = False
+        pipeline.shortread_tech = "Illumina"
+        pipeline.longread_tech = "ONT"
+        pipeline.sr_aln = self.mock_sr_aln
+        pipeline.lr_aln = self.mock_lr_aln
+        
+        return pipeline
+
+    @patch("sentieon_cli.dnascope_hybrid.ar_load")
+    def test_validation_requires_pop_vcf(self, mock_ar_load):
+        """Test that validation fails if model bundle requires pop_vcf but none provided"""
+        pipeline = self.create_pipeline()
+        
+        # Mock bundle info requiring SentieonVcfID
+        mock_bundle_info = {
+            "longReadPlatform": "ONT",
+            "shortReadPlatform": "Illumina",
+            "SentieonVcfID": "some_id",
+            "minScriptVersion": "2.0"
+        }
+        mock_ar_load.return_value = json.dumps(mock_bundle_info).encode()
+        pipeline.dry_run = False
+        
+        with patch("sys.exit") as mock_exit:
+             pipeline.validate_bundle()
+             mock_exit.assert_called_with(2)
+
+    @patch("sentieon_cli.dnascope_hybrid.ar_load")
+    def test_validation_pop_vcf_mismatch(self, mock_ar_load):
+        """Test validation fails if pop_vcf ID mismatches"""
+        pipeline = self.create_pipeline()
+        pipeline.pop_vcf = self.mock_pop_vcf
+        
+        mock_bundle_info = {
+            "longReadPlatform": "ONT",
+            "shortReadPlatform": "Illumina",
+            "SentieonVcfID": "expected_id",
+            "minScriptVersion": "2.0"
+        }
+        mock_ar_load.return_value = json.dumps(mock_bundle_info).encode()
+        pipeline.dry_run = False
+
+        with patch("sentieon_cli.dnascope_hybrid.vcf_id", return_value="wrong_id"), \
+             patch("sys.exit") as mock_exit:
+                 
+            pipeline.validate_bundle()
+            mock_exit.assert_called_with(2)
+
+    def test_transfer_jobs_creation(self):
+        """Test that transfer jobs are added to the DAG when pop_vcf is present"""
+        pipeline = self.create_pipeline()
+        pipeline.pop_vcf = self.mock_pop_vcf
+        
+        # Build DAG
+        with patch("sentieon_cli.dnascope_hybrid.check_version", return_value=True):
+             dag = pipeline.build_dag()
+        
+        # Verify transfer jobs exist
+        job_names = [job.name for job in dag.waiting_jobs]
+        
+        # Expect merge-trim jobs
+        assert any("merge-trim" in name for name in job_names)
+        assert "merge-trim-concat" in job_names
+
+    def test_model_apply_input_with_pop_vcf(self):
+        """Test that model apply uses the transfer output when pop_vcf is present"""
+        pipeline = self.create_pipeline()
+        pipeline.pop_vcf = self.mock_pop_vcf
+        
+        with patch("sentieon_cli.dnascope_hybrid.check_version", return_value=True):
+             dag = pipeline.build_dag()
+        
+        # Find model-apply job
+        apply_job = None
+        for job in dag.waiting_jobs:
+            if job.name == "model-apply":
+                apply_job = job
+                break
+        
+        assert apply_job is not None
+        
+        # Check dependency: apply_job should depend on merge-trim-concat
+        deps = dag.waiting_jobs[apply_job]
+        dep_names = [j.name for j in deps]
+        assert "merge-trim-concat" in dep_names
diff --git a/tests/unit/test_pipeline_validation.py b/tests/unit/test_pipeline_validation.py
index d9418c8..d8b5f69 100644
--- a/tests/unit/test_pipeline_validation.py
+++ b/tests/unit/test_pipeline_validation.py
@@ -236,12 +236,13 @@ def setup_method(self):
         # Create mock files
         self.mock_vcf = pathlib.Path(self.temp_dir) / "output.vcf.gz"
         self.mock_ref = pathlib.Path(self.temp_dir) / "reference.fa"
+        self.mock_fai = pathlib.Path(self.temp_dir) / "reference.fa.fai"
         self.mock_lr_bam = pathlib.Path(self.temp_dir) / "longread.bam"
         self.mock_sr_bam = pathlib.Path(self.temp_dir) / "shortread.bam"
         self.mock_fastq = pathlib.Path(self.temp_dir) / "sample_R1.fastq.gz"
 
         # Create empty files
-        for file_path in [self.mock_ref, self.mock_lr_bam, self.mock_sr_bam, self.mock_fastq]:
+        for file_path in [self.mock_ref, self.mock_fai, self.mock_lr_bam, self.mock_sr_bam, self.mock_fastq]:
             file_path.touch()
 
         # Create mock bundle file (content will be mocked by ar_load)
diff --git a/tests/utils/test_helpers.py b/tests/utils/test_helpers.py
index 8c214bc..cf057b7 100644
--- a/tests/utils/test_helpers.py
+++ b/tests/utils/test_helpers.py
@@ -194,6 +194,7 @@ def create_hybrid_pipeline(self, **kwargs) -> DNAscopeHybridPipeline:
         # Set default files
         pipeline.output_vcf = self.fs.create_file("output.vcf.gz")
         pipeline.reference = self.fs.create_file("reference.fa")
+        _fai = self.fs.create_file("reference.fa.fai")
         pipeline.model_bundle = self.fs.create_model_bundle("model.bundle", "DNAscope Hybrid")
 
         # Default configuration

From c923ac7019788435883c108d5fcbea7066a2fe52 Mon Sep 17 00:00:00 2001
From: Don Freed <donfreed12@gmail.com>
Date: Wed, 21 Jan 2026 15:50:23 -0800
Subject: [PATCH 4/8] Formatting fixes

---
 sentieon_cli/dnascope_hybrid.py | 7 +------
 sentieon_cli/transfer.py        | 2 +-
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/sentieon_cli/dnascope_hybrid.py b/sentieon_cli/dnascope_hybrid.py
index 4fe078c..284d3a7 100644
--- a/sentieon_cli/dnascope_hybrid.py
+++ b/sentieon_cli/dnascope_hybrid.py
@@ -6,11 +6,8 @@
 import copy
 import json
 import pathlib
-import re
-import shutil
-import subprocess as sp
 import sys
-from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple
 
 import packaging.version
 
@@ -44,8 +41,6 @@
     vcf_id,
 )
 from .shard import (
-    GRCH38_CONTIGS,
-    Shard,
     determine_shards_from_fai,
     parse_fai,
     vcf_contigs,
diff --git a/sentieon_cli/transfer.py b/sentieon_cli/transfer.py
index 8526f09..6d5df85 100644
--- a/sentieon_cli/transfer.py
+++ b/sentieon_cli/transfer.py
@@ -5,7 +5,7 @@
 import pathlib
 import re
 import subprocess as sp
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Dict, List, Optional, Set, Tuple
 
 from importlib_resources import files
 

From 1f3caa1dda632a9bc21e61e81bfad1d096910e8a Mon Sep 17 00:00:00 2001
From: Don Freed <donfreed12@gmail.com>
Date: Wed, 21 Jan 2026 16:15:54 -0800
Subject: [PATCH 5/8] Fix a test

---
 tests/unit/test_dnascope_hybrid_pop_vcf.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/unit/test_dnascope_hybrid_pop_vcf.py b/tests/unit/test_dnascope_hybrid_pop_vcf.py
index 661d277..7add9f1 100644
--- a/tests/unit/test_dnascope_hybrid_pop_vcf.py
+++ b/tests/unit/test_dnascope_hybrid_pop_vcf.py
@@ -89,7 +89,8 @@ def create_pipeline(self):
         return pipeline
 
     @patch("sentieon_cli.dnascope_hybrid.ar_load")
-    def test_validation_requires_pop_vcf(self, mock_ar_load):
+    @patch("sentieon_cli.dnascope_hybrid.vcf_id")
+    def test_validation_requires_pop_vcf(self, mock_vcf_id, mock_ar_load):
         """Test that validation fails if model bundle requires pop_vcf but none provided"""
         pipeline = self.create_pipeline()
         

From df415c04a5dc642858b004aa174ae5b857c613fd Mon Sep 17 00:00:00 2001
From: Don Freed <donfreed12@gmail.com>
Date: Thu, 22 Jan 2026 11:22:15 -0800
Subject: [PATCH 6/8] Increment version

---
 README.md            | 4 ++--
 pyproject.toml       | 2 +-
 sentieon_cli/util.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 1711756..f58b8b8 100644
--- a/README.md
+++ b/README.md
@@ -8,8 +8,8 @@ A command-line interface for the Sentieon software
 
 Download the latest tar.gz file from the GitHub release page, https://github.com/sentieon/sentieon-cli/releases/ and install the package with pip:
 ```sh
-curl -LO https://github.com/Sentieon/sentieon-cli/releases/download/v1.5.0/sentieon_cli-1.5.0.tar.gz
-pip install sentieon_cli-1.5.0.tar.gz
+curl -LO https://github.com/Sentieon/sentieon-cli/releases/download/v1.5.1/sentieon_cli-1.5.1.tar.gz
+pip install sentieon_cli-1.5.1.tar.gz
 ```
 
 ## Installation with Poetry
diff --git a/pyproject.toml b/pyproject.toml
index daac4e4..1296c34 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 
 [project]
 name = "sentieon_cli"
-version = "1.5.0"
+version = "1.5.1"
 description = "entry point for sentieon command-line tools"
 authors = [
     {name = "Don Freed", email = "don.freed@sentieon.com"},
diff --git a/sentieon_cli/util.py b/sentieon_cli/util.py
index 825514d..2d4cea3 100644
--- a/sentieon_cli/util.py
+++ b/sentieon_cli/util.py
@@ -17,7 +17,7 @@
 
 from .logging import get_logger
 
-__version__ = "1.5.0"
+__version__ = "1.5.1"
 
 logger = get_logger(__name__)
 

From 4635465acedf9c03121f7c2b3f204b4ffc01c315 Mon Sep 17 00:00:00 2001
From: Don Freed <donfreed12@gmail.com>
Date: Fri, 23 Jan 2026 13:51:34 -0800
Subject: [PATCH 7/8] Add a check for the kmc binary

---
 sentieon_cli/sentieon_pangenome.py | 11 ++++++++
 sentieon_cli/util.py               | 41 ++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/sentieon_cli/sentieon_pangenome.py b/sentieon_cli/sentieon_pangenome.py
index a322808..7ac9cc1 100644
--- a/sentieon_cli/sentieon_pangenome.py
+++ b/sentieon_cli/sentieon_pangenome.py
@@ -39,6 +39,7 @@
 from .util import (
     __version__,
     check_version,
+    check_kmc_patch,
     parse_rg_line,
     path_arg,
     tmp,
@@ -194,6 +195,16 @@ def validate(self) -> None:
                 if not check_version(cmd, min_version):
                     sys.exit(2)
 
+            if self.sample_input:
+                if not check_kmc_patch("kmc"):
+                    self.logger.error(
+                        "Error: The 'kmc' executable in the PATH does not "
+                        "support reading from stdin. Please ensure "
+                        "you are using the patched version of KMC from "
+                        "https://github.com/Sentieon/KMC/releases."
+                    )
+                    sys.exit(2)
+
         if self.bed is None:
             self.logger.info(
                 "A BED file is recommended to avoid variant calling "
diff --git a/sentieon_cli/util.py b/sentieon_cli/util.py
index 2d4cea3..2e27fcf 100644
--- a/sentieon_cli/util.py
+++ b/sentieon_cli/util.py
@@ -239,3 +239,44 @@ def vcf_id(in_vcf: pathlib.Path) -> Optional[str]:
             i = line.index("=")
             return line[i + 1 :]  # noqa: E203
     return None
+
+
+def check_kmc_patch(kmc_cmd: str = "kmc") -> bool:
+    """Check if the KMC version supports the required patch"""
+    # Create a temporary directory for the test
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_path = pathlib.Path(temp_dir)
+        output_prefix = temp_path / "kmc_test"
+
+        # Test input sequence
+        test_input = (
+            ">206B4ABXX100825:7:1:1360:6029/1\n"
+            "TGATTTTNNNNNNNNNNNTGAAGAACGCACCCATGTTAAAGAGCATGACAAANNNANNACAAGGCTAAGNGGCGNG\n"
+            ">206B4ABXX100825:7:1:1362:4449/1\n"
+            "ATTCCCCNNNNNNNNNNNCCACAGCCGGAGGAGCTGACCAACATCCTGGAGATNTGNAATGTGGTCTTANCCAGNA"
+        )
+
+        cmd = [
+            kmc_cmd,
+            "-k29",
+            "-m4",
+            "-okff",
+            "-t1",
+            "-fa",
+            "/dev/stdin",
+            str(output_prefix),
+            str(temp_path),
+        ]
+
+        try:
+            sp.run(
+                cmd,
+                input=test_input,
+                text=True,
+                check=True,
+                stdout=sp.DEVNULL,
+                stderr=sp.DEVNULL,
+            )
+            return True
+        except (sp.CalledProcessError, FileNotFoundError):
+            return False

From 35f5f6d355667d911bc0cbd078dbcb1f26885f40 Mon Sep 17 00:00:00 2001
From: Don Freed <donfreed12@gmail.com>
Date: Fri, 23 Jan 2026 14:00:18 -0800
Subject: [PATCH 8/8] Formatting

---
 sentieon_cli/util.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sentieon_cli/util.py b/sentieon_cli/util.py
index 2e27fcf..8abc80a 100644
--- a/sentieon_cli/util.py
+++ b/sentieon_cli/util.py
@@ -251,9 +251,9 @@ def check_kmc_patch(kmc_cmd: str = "kmc") -> bool:
         # Test input sequence
         test_input = (
             ">206B4ABXX100825:7:1:1360:6029/1\n"
-            "TGATTTTNNNNNNNNNNNTGAAGAACGCACCCATGTTAAAGAGCATGACAAANNNANNACAAGGCTAAGNGGCGNG\n"
+            "TGATTTTNNNNNNNNNNNTGAAGAACGCACCCATGTTAAAGAGCATGACAAANNNANNACAAGGCTAAGNGGCGNG\n"  # noqa: E501
             ">206B4ABXX100825:7:1:1362:4449/1\n"
-            "ATTCCCCNNNNNNNNNNNCCACAGCCGGAGGAGCTGACCAACATCCTGGAGATNTGNAATGTGGTCTTANCCAGNA"
+            "ATTCCCCNNNNNNNNNNNCCACAGCCGGAGGAGCTGACCAACATCCTGGAGATNTGNAATGTGGTCTTANCCAGNA"  # noqa: E501
         )
 
         cmd = [