diff --git a/fastdfe/annotation.py b/fastdfe/annotation.py index 4861438f9..d86eda9ee 100644 --- a/fastdfe/annotation.py +++ b/fastdfe/annotation.py @@ -1860,7 +1860,10 @@ def _setup(self, handler: MultiHandler): }) # set reader - self._reader = self._handler.load_vcf() + if self._handler.is_zarr_store: + self._reader = self._handler.load_zarr() + else: + self._reader = self._handler.load_vcf() # prepare masks self._prepare_masks(handler._reader.samples) @@ -4452,7 +4455,8 @@ def _setup(self): annotation._setup(self) # create the writer - self._writer = Writer(self.output, self._reader) + if not self.is_zarr_store: + self._writer = Writer(self.output, self._reader) def _teardown(self): """ @@ -4462,8 +4466,9 @@ def _teardown(self): annotation._teardown() # close the writer and reader - self._writer.close() - self._reader.close() + if not self.is_zarr_store: + self._writer.close() + self._reader.close() def annotate(self): """ @@ -4485,7 +4490,8 @@ def annotate(self): annotation.annotate_site(variant) # write the variant - self._writer.write_record(variant) + if not self.is_zarr_store: + self._writer.write_record(variant) # update the progress bar pbar.update() diff --git a/fastdfe/io_handlers.py b/fastdfe/io_handlers.py index 5ac73e5b0..e5614f1e1 100644 --- a/fastdfe/io_handlers.py +++ b/fastdfe/io_handlers.py @@ -27,6 +27,9 @@ from pandas.errors import SettingWithCopyWarning from tqdm import tqdm +import zarr +from vcztools import retrieval + from .settings import Settings #: The DNA bases @@ -616,6 +619,8 @@ def __init__( #: Random generator instance self.rng = np.random.default_rng(seed=seed) + self.is_zarr_store = False + @cached_property def _reader(self) -> 'cyvcf2.VCF': """ @@ -623,6 +628,9 @@ def _reader(self) -> 'cyvcf2.VCF': :return: The VCF reader. """ + if os.path.isdir(self.vcf) and self.vcf.endswith('.vcz'): + self.is_zarr_store = True + return self.load_zarr() return self.load_vcf() def _rewind(self): @@ -651,6 +659,18 @@ def load_vcf(self) -> 'cyvcf2.VCF': return VCF(self.download_if_url(self.vcf)) + + def load_zarr(self): + """ + Load a Zarr file into a dictionary. + + :return: The Zarr reader. + """ + self._logger.info("Loading Zarr file") + + return ZarrStore(self.vcf) + + @cached_property def n_sites(self) -> int: """ @@ -666,6 +686,9 @@ def count_sites(self) -> int: :return: Number of sites """ + if self.is_zarr_store: + return self._reader.count_sites() + return count_sites( vcf=self.download_if_url(self.vcf), max_sites=self.max_sites, @@ -687,6 +710,81 @@ def get_pbar(self, desc: str = "Processing sites", total: int | None = 0) -> tqd ) +class ZarrVariant: + def __init__( + self, + ref: str, + chrom: str, + pos: int, + gt_bases: np.ndarray, + ): + self.REF = ref + self.CHROM = chrom + self.POS = pos + self.INFO = {} + self.gt_bases = gt_bases + self.is_snp = True + + +class ZarrStore: + + def __init__(self, fname: str, samples: List[str] = None): + """ + Initialize a Zarr store. + + The iterator iterates over Variant objects. + + :param fname: The file name + """ + self.fname = fname + self.root = zarr.open(fname, mode='r') + all_samples = self.root["sample_id"][:] + self.samples = list(retrieval.parse_samples(samples, all_samples)[0]) + self.iter = retrieval.variant_iter( + fname, + samples=samples, + fields=["variant_contig", "variant_position", "variant_allele", + "call_genotype", "call_genotype_phased"] + ) + + def __enter__(self): + return self + + def __iter__(self): + return self + + def __next__(self): + v = next(self.iter) + alleles = v["variant_allele"] + phased = v["call_genotype_phased"] + gt_bases = np.array([ + ["/", "|"][np.int8(ph)].join(alleles[gt]) for ph, gt in zip(phased, v["call_genotype"]) + ]) + return ZarrVariant( + ref=alleles[0], + chrom=v["variant_contig"], + pos=v["variant_position"], + gt_bases=gt_bases, + ) + + def count_sites(self) -> int: + """ + Count the number of sites in the Zarr store. + + :return: Number of sites + """ + return self.root["variant_id"].shape[0] + + def add_info_to_header(self, header): + """ + Add the info fields to the header. + + :param header: The header + """ + print("Adding info to header: ", header) + pass + + class MultiHandler(VCFHandler, FASTAHandler, GFFHandler): """ Handle VCF, FASTA and GFF files. diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zattrs b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zattrs new file mode 100644 index 000000000..39802e39a --- /dev/null +++ b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zattrs @@ -0,0 +1,34 @@ +{ + "source": "bio2zarr-0.1.6", + "vcf_meta_information": [ + [ + "fileformat", + "VCFv4.2" + ], + [ + "ALT", + "" + ], + [ + "GATKCommandLine", + "" + ], + [ + "VEP", + "\"v104\" time=\"2022-10-02 10:23:11\" ensembl-funcgen=104.59ae779 ensembl=104.1af1dce ensembl-variation=104.6154f8b ensembl-io=104.1d3bb6e" + ], + [ + "bcftools_viewCommand", + "view -Oz --apply-filters .,PASS --max-alleles 2 --exclude-types indels,mnps,other results/default/variants/vcfs/variants72.quality.tagged.vcf.gz" + ], + [ + "bcftools_viewVersion", + "1.3.1+htslib-1.3.1" + ], + [ + "source", + "GenomicsDBImport" + ] + ], + "vcf_zarr_version": "0.4" +} \ No newline at end of file diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zgroup b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zgroup new file mode 100644 index 000000000..3b7daf227 --- /dev/null +++ b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zgroup @@ -0,0 +1,3 @@ +{ + "zarr_format": 2 +} \ No newline at end of file diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zmetadata b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zmetadata new file mode 100644 index 000000000..929b77906 --- /dev/null +++ b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zmetadata @@ -0,0 +1,1603 @@ +{ + "metadata": { + ".zattrs": { + "source": "bio2zarr-0.1.6", + "vcf_meta_information": [ + [ + "fileformat", + "VCFv4.2" + ], + [ + "ALT", + "" + ], + [ + "GATKCommandLine", + "" + ], + [ + "VEP", + "\"v104\" time=\"2022-10-02 10:23:11\" ensembl-funcgen=104.59ae779 ensembl=104.1af1dce ensembl-variation=104.6154f8b ensembl-io=104.1d3bb6e" + ], + [ + "bcftools_viewCommand", + "view -Oz --apply-filters .,PASS --max-alleles 2 --exclude-types indels,mnps,other results/default/variants/vcfs/variants72.quality.tagged.vcf.gz" + ], + [ + "bcftools_viewVersion", + "1.3.1+htslib-1.3.1" + ], + [ + "source", + "GenomicsDBImport" + ] + ], + "vcf_zarr_version": "0.4" + }, + ".zgroup": { + "zarr_format": 2 + }, + "call_AD/.zarray": { + "chunks": [ + 1000, + 379, + 2 + ], + "compressor": { + "blocksize": 0, + "clevel": 7, + "cname": "zstd", + "id": "blosc", + "shuffle": 1 + }, + "dimension_separator": "/", + "dtype": "