Sendrowski · percyfal · Dec 1, 2025 · Dec 1, 2025
diff --git a/fastdfe/annotation.py b/fastdfe/annotation.py
@@ -1860,7 +1860,10 @@ def _setup(self, handler: MultiHandler):
         })
 
         # set reader
-        self._reader = self._handler.load_vcf()
+        if self._handler.is_zarr_store:
+            self._reader = self._handler.load_zarr()
+        else:
+            self._reader = self._handler.load_vcf()
 
         # prepare masks
         self._prepare_masks(handler._reader.samples)
@@ -4452,7 +4455,8 @@ def _setup(self):
             annotation._setup(self)
 
         # create the writer
-        self._writer = Writer(self.output, self._reader)
+        if not self.is_zarr_store:
+            self._writer = Writer(self.output, self._reader)
 
     def _teardown(self):
         """
@@ -4462,8 +4466,9 @@ def _teardown(self):
             annotation._teardown()
 
         # close the writer and reader
-        self._writer.close()
-        self._reader.close()
+        if not self.is_zarr_store:
+            self._writer.close()
+            self._reader.close()
 
     def annotate(self):
         """
@@ -4485,7 +4490,8 @@ def annotate(self):
                     annotation.annotate_site(variant)
 
                 # write the variant
-                self._writer.write_record(variant)
+                if not self.is_zarr_store:
+                    self._writer.write_record(variant)
 
                 # update the progress bar
                 pbar.update()

diff --git a/fastdfe/io_handlers.py b/fastdfe/io_handlers.py
@@ -27,6 +27,9 @@
 from pandas.errors import SettingWithCopyWarning
 from tqdm import tqdm
 
+import zarr
+from vcztools import retrieval
+
 from .settings import Settings
 
 #: The DNA bases
@@ -616,13 +619,18 @@ def __init__(
         #: Random generator instance
         self.rng = np.random.default_rng(seed=seed)
 
+        self.is_zarr_store = False
+
     @cached_property
     def _reader(self) -> 'cyvcf2.VCF':
         """
         Get the VCF reader.
 
         :return: The VCF reader.
         """
+        if os.path.isdir(self.vcf) and self.vcf.endswith('.vcz'):
+            self.is_zarr_store = True
+            return self.load_zarr()
         return self.load_vcf()
 
     def _rewind(self):
@@ -651,6 +659,18 @@ def load_vcf(self) -> 'cyvcf2.VCF':
 
         return VCF(self.download_if_url(self.vcf))
 
+
+    def load_zarr(self):
+        """
+        Load a Zarr file into a dictionary.
+
+        :return: The Zarr reader.
+        """
+        self._logger.info("Loading Zarr file")
+
+        return ZarrStore(self.vcf)
+
+
     @cached_property
     def n_sites(self) -> int:
         """
@@ -666,6 +686,9 @@ def count_sites(self) -> int:
 
         :return: Number of sites
         """
+        if self.is_zarr_store:
+            return self._reader.count_sites()
+
         return count_sites(
             vcf=self.download_if_url(self.vcf),
             max_sites=self.max_sites,
@@ -687,6 +710,81 @@ def get_pbar(self, desc: str = "Processing sites", total: int | None = 0) -> tqd
         )
 
 
+class ZarrVariant:
+    def __init__(
+        self,
+        ref: str,
+        chrom: str,
+        pos: int,
+        gt_bases: np.ndarray,
+    ):
+        self.REF = ref
+        self.CHROM = chrom
+        self.POS = pos
+        self.INFO = {}
+        self.gt_bases = gt_bases
+        self.is_snp = True
+
+
+class ZarrStore:
+
+    def __init__(self, fname: str, samples: List[str] = None):
+        """
+        Initialize a Zarr store.
+
+        The iterator iterates over Variant objects.
+
+        :param fname: The file name
+        """
+        self.fname = fname
+        self.root = zarr.open(fname, mode='r')
+        all_samples = self.root["sample_id"][:]
+        self.samples = list(retrieval.parse_samples(samples, all_samples)[0])
+        self.iter = retrieval.variant_iter(
+            fname,
+            samples=samples,
+            fields=["variant_contig", "variant_position", "variant_allele",
+                    "call_genotype", "call_genotype_phased"]
+        )
+
+    def __enter__(self):
+        return self
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        v = next(self.iter)
+        alleles = v["variant_allele"]
+        phased = v["call_genotype_phased"]
+        gt_bases = np.array([
+            ["/", "|"][np.int8(ph)].join(alleles[gt]) for ph, gt in zip(phased, v["call_genotype"])
+        ])
+        return ZarrVariant(
+            ref=alleles[0],
+            chrom=v["variant_contig"],
+            pos=v["variant_position"],
+            gt_bases=gt_bases,
+        )
+
+    def count_sites(self) -> int:
+        """
+        Count the number of sites in the Zarr store.
+
+        :return: Number of sites
+        """
+        return self.root["variant_id"].shape[0]
+
+    def add_info_to_header(self, header):
+        """
+        Add the info fields to the header.
+
+        :param header: The header
+        """
+        print("Adding info to header: ", header)
+        pass
+
+
 class MultiHandler(VCFHandler, FASTAHandler, GFFHandler):
     """
     Handle VCF, FASTA and GFF files.

diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zattrs b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zattrs
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zgroup b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zgroup
@@ -0,0 +1,3 @@
+{
+    "zarr_format": 2
+}
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zmetadata b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zmetadata
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/.zarray b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/.zarray
@@ -0,0 +1,25 @@
+{
+    "chunks": [
+        1000,
+        379,
+        2
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 7,
+        "cname": "zstd",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dimension_separator": "/",
+    "dtype": "<i2",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        10000,
+        379,
+        2
+    ],
+    "zarr_format": 2
+}
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/.zattrs b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/.zattrs
@@ -0,0 +1,8 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants",
+        "samples",
+        "alleles"
+    ],
+    "description": "Allelic depths for the ref and alt alleles in the order listed"
+}
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/0/0/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/0/0/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/1/0/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/1/0/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/2/0/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/2/0/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/3/0/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/3/0/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/4/0/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/4/0/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/5/0/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/5/0/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/6/0/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/6/0/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/7/0/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/7/0/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/8/0/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/8/0/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/9/0/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_AD/9/0/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/.zarray b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/.zarray
@@ -0,0 +1,23 @@
+{
+    "chunks": [
+        1000,
+        379
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 7,
+        "cname": "zstd",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dimension_separator": "/",
+    "dtype": "<i2",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        10000,
+        379
+    ],
+    "zarr_format": 2
+}
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/.zattrs b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/.zattrs
@@ -0,0 +1,7 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants",
+        "samples"
+    ],
+    "description": "Approximate read depth (reads with MQ=255 or with bad mates are filtered)"
+}
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/0/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/0/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/1/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/1/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/2/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/2/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/3/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/3/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/4/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/4/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/5/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/5/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/6/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/6/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/7/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/7/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/8/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/8/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/9/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_DP/9/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/.zarray b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/.zarray
@@ -0,0 +1,23 @@
+{
+    "chunks": [
+        1000,
+        379
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 7,
+        "cname": "zstd",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dimension_separator": "/",
+    "dtype": "|i1",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        10000,
+        379
+    ],
+    "zarr_format": 2
+}
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/.zattrs b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/.zattrs
@@ -0,0 +1,7 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants",
+        "samples"
+    ],
+    "description": "Genotype Quality"
+}
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/0/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/0/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/1/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/1/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/2/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/2/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/3/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/3/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/4/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/4/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/5/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/5/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/6/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/6/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/7/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/7/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/8/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/8/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/9/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_GQ/9/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/.zarray b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/.zarray
@@ -0,0 +1,23 @@
+{
+    "chunks": [
+        1000,
+        379
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 7,
+        "cname": "zstd",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dimension_separator": "/",
+    "dtype": "|i1",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        10000,
+        379
+    ],
+    "zarr_format": 2
+}
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/.zattrs b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/.zattrs
@@ -0,0 +1,7 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants",
+        "samples"
+    ],
+    "description": "Minimum DP observed within the GVCF block"
+}
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/0/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/0/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/1/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/1/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/2/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/2/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/3/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/3/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/4/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/4/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/5/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/5/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/6/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/6/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/7/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/7/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/8/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/8/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/9/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_MIN_DP/9/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/.zarray b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/.zarray
@@ -0,0 +1,27 @@
+{
+    "chunks": [
+        1000,
+        379
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 7,
+        "cname": "zstd",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dimension_separator": "/",
+    "dtype": "|O",
+    "fill_value": null,
+    "filters": [
+        {
+            "id": "vlen-utf8"
+        }
+    ],
+    "order": "C",
+    "shape": [
+        10000,
+        379
+    ],
+    "zarr_format": 2
+}
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/.zattrs b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/.zattrs
@@ -0,0 +1,7 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants",
+        "samples"
+    ],
+    "description": "Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles"
+}
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/0/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/0/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/1/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/1/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/2/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/2/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/3/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/3/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/4/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/4/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/5/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/5/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/6/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/6/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/7/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/7/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/8/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/8/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/9/0 b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PGT/9/0
diff --git a/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PID/.zarray b/resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/call_PID/.zarray
@@ -0,0 +1,27 @@
+{
+    "chunks": [
+        1000,
+        379
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 7,
+        "cname": "zstd",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dimension_separator": "/",
+    "dtype": "|O",
+    "fill_value": null,
+    "filters": [
+        {
+            "id": "vlen-utf8"
+        }
+    ],
+    "order": "C",
+    "shape": [
+        10000,
+        379
+    ],
+    "zarr_format": 2
+}