4 changes: 2 additions & 2 deletions README.md
@@ -8,8 +8,8 @@ A command-line interface for the Sentieon software

Download the latest tar.gz file from the GitHub release page, https://github.com/sentieon/sentieon-cli/releases/ and install the package with pip:
```sh
-curl -LO https://github.com/Sentieon/sentieon-cli/releases/download/v1.5.0/sentieon_cli-1.5.0.tar.gz
-pip install sentieon_cli-1.5.0.tar.gz
+curl -LO https://github.com/Sentieon/sentieon-cli/releases/download/v1.5.1/sentieon_cli-1.5.1.tar.gz
+pip install sentieon_cli-1.5.1.tar.gz
```
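After installing, the release can be sanity-checked from Python; a minimal sketch, assuming the package is installed under the `sentieon_cli` distribution name declared in pyproject.toml:

```python
# Hedged sketch: read the installed distribution version (Python 3.8+).
from importlib.metadata import version

print(version("sentieon_cli"))  # expected to print "1.5.1" after this release
```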

## Installation with Poetry
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,7 +1,7 @@

[project]
name = "sentieon_cli"
version = "1.5.0"
version = "1.5.1"
description = "entry point for sentieon command-line tools"
authors = [
{name = "Don Freed", email = "don.freed@sentieon.com"},
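The `version` field above is the value the release bump changes; a quick sketch of reading it back programmatically, assuming Python 3.11+ for the standard-library `tomllib`:

```python
# Hedged sketch: read the project version straight from pyproject.toml.
import tomllib

with open("pyproject.toml", "rb") as fh:
    data = tomllib.load(fh)

print(data["project"]["version"])  # "1.5.1" after this change
```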
6 changes: 6 additions & 0 deletions sentieon_cli/__init__.py
@@ -4,6 +4,7 @@
from .dnascope_longread import DNAscopeLRPipeline
from .pangenome import PangenomePipeline
from .sentieon_pangenome import SentieonPangenome
from .util import __version__


def main():
@@ -26,6 +27,11 @@ def main():
dest="loglevel",
const="DEBUG",
)
parser.add_argument(
"--version",
action="version",
version=__version__,
)
subparsers = parser.add_subparsers(required=True)

# DNAscope parser
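The new flag relies on argparse's built-in `version` action, which prints the supplied string and exits. A standalone sketch of that behavior; the `__version__` value below is a stand-in for the one the package imports from `.util`:

```python
# Hedged sketch of argparse's "version" action as wired up in the diff above.
import argparse

__version__ = "1.5.1"  # stand-in for sentieon_cli.util.__version__

parser = argparse.ArgumentParser(prog="sentieon-cli")
parser.add_argument("--version", action="version", version=__version__)

# parser.parse_args(["--version"]) prints "1.5.1" to stdout and exits with
# status 0; any other arguments continue through normal parsing.
```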
108 changes: 102 additions & 6 deletions sentieon_cli/dnascope_hybrid.py
@@ -13,6 +13,7 @@

from importlib_resources import files

from .logging import get_logger
from .archive import ar_load
from . import command_strings as cmds
from .dag import DAG
@@ -37,7 +38,17 @@
parse_rg_line,
path_arg,
split_alignment,
vcf_id,
)
from .shard import (
determine_shards_from_fai,
parse_fai,
vcf_contigs,
)
from .transfer import build_transfer_jobs


logger = get_logger(__name__)


CALLING_MIN_VERSIONS = {
@@ -113,6 +124,7 @@ class DNAscopeHybridPipeline(DNAscopePipeline, DNAscopeLRPipeline):
params = copy.deepcopy(BasePipeline.params)
params.update(
{
# Required arguments
"lr_aln": {
"nargs": "*",
"help": "Long-read BAM or CRAM files.",
@@ -144,6 +156,7 @@ class DNAscopeHybridPipeline(DNAscopePipeline, DNAscopeLRPipeline):
"nargs": "*",
"help": "Readgroup information for the short-read fastq files",
},
# Additional arguments
"bam_format": {
"help": (
"Use the BAM format instead of CRAM for output aligned "
@@ -189,6 +202,13 @@ class DNAscopeHybridPipeline(DNAscopePipeline, DNAscopeLRPipeline):
),
"type": path_arg(exists=True, is_file=True),
},
"pop_vcf": {
"flags": ["--pop_vcf"],
"help": (
"A VCF containing annotations for use with DNAModelApply."
),
"type": path_arg(exists=True, is_file=True),
},
"rgsm": {
"help": (
"Overwrite the SM tag of the input readgroups for "
@@ -220,6 +240,7 @@ class DNAscopeHybridPipeline(DNAscopePipeline, DNAscopeLRPipeline):
"choices": ["markdup", "rmdup", "none"],
"default": "markdup",
},
# Hidden arguments
"bwa_args": {
# help="Extra arguments for sentieon bwa",
"help": argparse.SUPPRESS,
@@ -256,6 +277,10 @@ class DNAscopeHybridPipeline(DNAscopePipeline, DNAscopeLRPipeline):
"help": argparse.SUPPRESS,
"action": "store_true",
},
"skip_pop_vcf_id_check": {
"help": argparse.SUPPRESS,
"action": "store_true",
},
"sr_read_filter": {
"help": argparse.SUPPRESS,
},
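For orientation, each entry in the `params` table above reads like a bundle of `argparse.add_argument` keyword arguments, with `argparse.SUPPRESS` keeping the hidden options out of `--help`. A simplified sketch of that pattern; the project's real argument wiring is not shown in this diff:

```python
# Hedged sketch: a params mapping driving argparse, including a hidden option.
import argparse

params = {
    "pop_vcf": {
        "flags": ["--pop_vcf"],
        "help": "A VCF containing annotations for use with DNAModelApply.",
    },
    "skip_pop_vcf_id_check": {
        "help": argparse.SUPPRESS,  # hidden from --help output
        "action": "store_true",
    },
}

parser = argparse.ArgumentParser(prog="dnascope-hybrid-sketch")
for name, kwargs in params.items():
    flags = kwargs.pop("flags", ["--" + name])
    parser.add_argument(*flags, **kwargs)

args = parser.parse_args(["--pop_vcf", "annotations.vcf.gz"])
print(args.pop_vcf, args.skip_pop_vcf_id_check)
```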
@@ -277,15 +302,26 @@ def __init__(self) -> None:
self.lr_aln: List[pathlib.Path] = []
self.lr_align_input = False
self.lr_input_ref: Optional[pathlib.Path] = None
self.pop_vcf: Optional[pathlib.Path] = None
self.bam_format = False
self.rgsm: Optional[str] = None
self.lr_fastq_taglist = "*"
self.sr_read_filter: Optional[str] = None
self.lr_read_filter: Optional[str] = None
self.assay = "WGS"
self.skip_model_apply = False
self.skip_pop_vcf_id_check = False

def validate(self) -> None:
self.fai_data = parse_fai(pathlib.Path(str(self.reference) + ".fai"))
self.pop_vcf_contigs: Dict[str, Optional[int]] = {}
if self.pop_vcf:
self.pop_vcf_contigs = vcf_contigs(self.pop_vcf, self.dry_run)
self.logger.debug("VCF contigs are: %s", self.pop_vcf_contigs)
self.shards = determine_shards_from_fai(
self.fai_data, 10 * 1000 * 1000
)

self.validate_bundle()
self.collect_readgroups()
self.validate_readgroups()
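The new `validate` steps parse the reference `.fai` index, optionally collect contigs from the population VCF, and split the genome into 10 Mb shards. The `.shard` helpers themselves are not part of this diff; a hedged sketch of what fixed-size sharding from a `.fai` could look like (the real `parse_fai`/`determine_shards_from_fai` may differ in detail):

```python
# Hedged sketch of .fai parsing and fixed-size sharding; the real helpers in
# sentieon_cli.shard are not shown in this diff and may behave differently.
from typing import Dict, List, Tuple


def parse_fai_sketch(fai_text: str) -> Dict[str, int]:
    """Map contig name -> length from .fai lines (name, length, offset, ...)."""
    lengths: Dict[str, int] = {}
    for line in fai_text.splitlines():
        if line.strip():
            name, length = line.split("\t")[:2]
            lengths[name] = int(length)
    return lengths


def shards_sketch(lengths: Dict[str, int], size: int) -> List[Tuple[str, int, int]]:
    """Cut every contig into half-open (contig, start, end) windows of `size`."""
    shards: List[Tuple[str, int, int]] = []
    for name, length in lengths.items():
        for start in range(0, length, size):
            shards.append((name, start, min(start + size, length)))
    return shards


lengths = parse_fai_sketch("chr20\t64444167\t0\t60\t61\nchrM\t16569\t0\t70\t71")
print(shards_sketch(lengths, 10 * 1000 * 1000))  # 7 shards for chr20, 1 for chrM
```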
@@ -329,6 +365,24 @@ def validate_bundle(self) -> None:
bundle_info = json.loads(bundle_info_bytes.decode())
self.longread_tech = bundle_info.get("longReadPlatform")
self.shortread_tech = bundle_info.get("shortReadPlatform")
bundle_vcf_id = bundle_info.get("SentieonVcfID")

if bundle_vcf_id:
if not self.pop_vcf:
self.logger.error(
"The model bundle requires a population VCF file. Please "
"supply the `--pop_vcf` argument."
)
sys.exit(2)
if not self.skip_pop_vcf_id_check and not self.dry_run:
pop_vcf_id = vcf_id(self.pop_vcf)
if bundle_vcf_id != pop_vcf_id:
self.logger.error(
"The ID of the `--pop_vcf` does not match the model "
"bundle"
)
sys.exit(2)

if not self.longread_tech or not self.shortread_tech:
self.logger.error(
"The bundle file does not have the expected attributes. "
@@ -586,6 +640,8 @@ def build_dag(self) -> DAG:
concat_job,
rm_job5,
anno_job,
transfer_jobs,
transfer_concat,
apply_job,
norm_job,
) = self.call_variants(sr_aln, lr_aln, rg_info)
@@ -601,8 +657,16 @@
dag.add_job(subset_job, {second_stage_job})
dag.add_job(concat_job, {subset_job, call2_job})
dag.add_job(anno_job, {concat_job})

apply_dependencies = {anno_job}
if transfer_jobs and transfer_concat:
for job in transfer_jobs:
dag.add_job(job, {anno_job})
dag.add_job(transfer_concat, set(transfer_jobs))
apply_dependencies = {transfer_concat}

if apply_job:
-dag.add_job(apply_job, {anno_job})
+dag.add_job(apply_job, apply_dependencies)
if apply_job and norm_job:
dag.add_job(norm_job, {apply_job})

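When a population VCF is supplied, `apply_job` no longer depends directly on `anno_job`: the per-shard transfer jobs hang off the annotation job, the transfer concat waits on all of them, and model apply then waits on the concat. A toy topological-order illustration of that reshaped graph (using the standard library, not the project's `DAG` class):

```python
# Toy illustration (not the project's DAG class) of the reshaped dependencies:
# anno -> per-shard transfer jobs -> transfer concat -> model apply -> norm.
from graphlib import TopologicalSorter

deps = {
    "transfer-shard-0": {"anno-calls"},
    "transfer-shard-1": {"anno-calls"},
    "transfer-concat": {"transfer-shard-0", "transfer-shard-1"},
    "apply": {"transfer-concat"},  # was {"anno-calls"} without --pop_vcf
    "norm": {"apply"},
}
print(list(TopologicalSorter(deps).static_order()))
```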
@@ -640,6 +704,8 @@ def call_variants(
Job,
Job,
Job,
Optional[List[Job]],
Optional[Job],
Optional[Job],
Optional[Job],
]:
@@ -910,12 +976,13 @@ def call_variants(
hybrid_anno = pathlib.Path(
str(files("sentieon_cli.scripts").joinpath("hybrid_anno.py"))
)
-combined_anno_vcf = self.tmp_dir.joinpath("combined_tmp_anno.vcf.gz")
-if self.skip_model_apply:
-combined_anno_vcf = self.output_vcf
+anno_target = self.tmp_dir.joinpath("combined_tmp_anno.vcf.gz")
+if self.skip_model_apply and not self.pop_vcf:
+anno_target = self.output_vcf

anno_job = Job(
cmds.cmd_pyexec_hybrid_anno(
-combined_anno_vcf,
+anno_target,
combined_tmp_vcf,
stage1_hap_bed,
hybrid_anno,
@@ -924,6 +991,31 @@
"anno-calls",
0,
)

transfer_jobs: Optional[List[Job]] = None
transfer_concat_job: Optional[Job] = None
input_to_apply = anno_target

if self.pop_vcf:
transfer_target = self.tmp_dir.joinpath(
"combined_tmp_transfer.vcf.gz"
)
if self.skip_model_apply:
transfer_target = self.output_vcf

transfer_jobs, transfer_concat_job = build_transfer_jobs(
transfer_target,
self.pop_vcf,
anno_target,
self.tmp_dir,
self.shards,
self.pop_vcf_contigs,
self.fai_data,
self.dry_run,
self.cores,
)
input_to_apply = transfer_target

if self.skip_model_apply:
return (
call_job,
Expand All @@ -943,6 +1035,8 @@ def call_variants(
concat_job,
rm_job5,
anno_job,
transfer_jobs,
transfer_concat_job,
None,
None,
)
@@ -957,7 +1051,7 @@
driver.add_algo(
DNAModelApply(
model=self.model_bundle.joinpath("hybrid.model"),
-vcf=combined_anno_vcf,
+vcf=input_to_apply,
output=apply_vcf,
)
)
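Which file feeds `DNAModelApply`, and which one becomes the final output when model apply is skipped, depends on whether `--pop_vcf` was given. A condensed restatement of just that routing from the code above; the file names are illustrative stand-ins for the `self.tmp_dir` / `self.output_vcf` paths:

```python
# Condensed restatement of the routing above; paths are illustrative stand-ins.
def routing(skip_model_apply: bool, have_pop_vcf: bool) -> dict:
    anno_target = "combined_tmp_anno.vcf.gz"
    if skip_model_apply and not have_pop_vcf:
        anno_target = "output.vcf.gz"  # annotation step writes the final VCF

    input_to_apply = anno_target
    if have_pop_vcf:
        transfer_target = "combined_tmp_transfer.vcf.gz"
        if skip_model_apply:
            transfer_target = "output.vcf.gz"  # transfer concat writes the final VCF
        input_to_apply = transfer_target

    return {
        "anno_target": anno_target,
        "apply_input": None if skip_model_apply else input_to_apply,
    }


for skip in (False, True):
    for pop in (False, True):
        print(f"skip_model_apply={skip} pop_vcf={pop} -> {routing(skip, pop)}")
```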
@@ -994,6 +1088,8 @@
concat_job,
rm_job5,
anno_job,
transfer_jobs,
transfer_concat_job,
apply_job,
norm_job,
)