Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
16 changes: 11 additions & 5 deletions fastdfe/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1860,7 +1860,10 @@ def _setup(self, handler: MultiHandler):
})

# set reader
self._reader = self._handler.load_vcf()
if self._handler.is_zarr_store:
self._reader = self._handler.load_zarr()
else:
self._reader = self._handler.load_vcf()

# prepare masks
self._prepare_masks(handler._reader.samples)
Expand Down Expand Up @@ -4452,7 +4455,8 @@ def _setup(self):
annotation._setup(self)

# create the writer
self._writer = Writer(self.output, self._reader)
if not self.is_zarr_store:
self._writer = Writer(self.output, self._reader)

def _teardown(self):
"""
Expand All @@ -4462,8 +4466,9 @@ def _teardown(self):
annotation._teardown()

# close the writer and reader
self._writer.close()
self._reader.close()
if not self.is_zarr_store:
self._writer.close()
self._reader.close()

def annotate(self):
"""
Expand All @@ -4485,7 +4490,8 @@ def annotate(self):
annotation.annotate_site(variant)

# write the variant
self._writer.write_record(variant)
if not self.is_zarr_store:
self._writer.write_record(variant)

# update the progress bar
pbar.update()
Expand Down
98 changes: 98 additions & 0 deletions fastdfe/io_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
from pandas.errors import SettingWithCopyWarning
from tqdm import tqdm

import zarr
from vcztools import retrieval

from .settings import Settings

#: The DNA bases
Expand Down Expand Up @@ -616,13 +619,18 @@ def __init__(
#: Random generator instance
self.rng = np.random.default_rng(seed=seed)

self.is_zarr_store = False

@cached_property
def _reader(self) -> 'cyvcf2.VCF':
"""
Get the VCF reader.

:return: The VCF reader.
"""
if os.path.isdir(self.vcf) and self.vcf.endswith('.vcz'):
self.is_zarr_store = True
return self.load_zarr()
return self.load_vcf()

def _rewind(self):
Expand Down Expand Up @@ -651,6 +659,18 @@ def load_vcf(self) -> 'cyvcf2.VCF':

return VCF(self.download_if_url(self.vcf))


def load_zarr(self):
"""
Load a Zarr file into a dictionary.

:return: The Zarr reader.
"""
self._logger.info("Loading Zarr file")

return ZarrStore(self.vcf)


@cached_property
def n_sites(self) -> int:
"""
Expand All @@ -666,6 +686,9 @@ def count_sites(self) -> int:

:return: Number of sites
"""
if self.is_zarr_store:
return self._reader.count_sites()

return count_sites(
vcf=self.download_if_url(self.vcf),
max_sites=self.max_sites,
Expand All @@ -687,6 +710,81 @@ def get_pbar(self, desc: str = "Processing sites", total: int | None = 0) -> tqd
)


class ZarrVariant:
def __init__(
self,
ref: str,
chrom: str,
pos: int,
gt_bases: np.ndarray,
):
self.REF = ref
self.CHROM = chrom
self.POS = pos
self.INFO = {}
self.gt_bases = gt_bases
self.is_snp = True


class ZarrStore:

def __init__(self, fname: str, samples: List[str] = None):
"""
Initialize a Zarr store.

The iterator iterates over Variant objects.

:param fname: The file name
"""
self.fname = fname
self.root = zarr.open(fname, mode='r')
all_samples = self.root["sample_id"][:]
self.samples = list(retrieval.parse_samples(samples, all_samples)[0])
self.iter = retrieval.variant_iter(
fname,
samples=samples,
fields=["variant_contig", "variant_position", "variant_allele",
"call_genotype", "call_genotype_phased"]
)

def __enter__(self):
return self

def __iter__(self):
return self

def __next__(self):
v = next(self.iter)
alleles = v["variant_allele"]
phased = v["call_genotype_phased"]
gt_bases = np.array([
["/", "|"][np.int8(ph)].join(alleles[gt]) for ph, gt in zip(phased, v["call_genotype"])
])
return ZarrVariant(
ref=alleles[0],
chrom=v["variant_contig"],
pos=v["variant_position"],
gt_bases=gt_bases,
)

def count_sites(self) -> int:
"""
Count the number of sites in the Zarr store.

:return: Number of sites
"""
return self.root["variant_id"].shape[0]

def add_info_to_header(self, header):
"""
Add the info fields to the header.

:param header: The header
"""
print("Adding info to header: ", header)
pass


class MultiHandler(VCFHandler, FASTAHandler, GFFHandler):
"""
Handle VCF, FASTA and GFF files.
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"zarr_format": 2
}
1,603 changes: 1,603 additions & 0 deletions resources/genome/betula/biallelic.with_outgroups.subset.10000.vcz/.zmetadata

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"chunks": [
1000,
379,
2
],
"compressor": {
"blocksize": 0,
"clevel": 7,
"cname": "zstd",
"id": "blosc",
"shuffle": 1
},
"dimension_separator": "/",
"dtype": "<i2",
"fill_value": null,
"filters": null,
"order": "C",
"shape": [
10000,
379,
2
],
"zarr_format": 2
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"_ARRAY_DIMENSIONS": [
"variants",
"samples",
"alleles"
],
"description": "Allelic depths for the ref and alt alleles in the order listed"
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"chunks": [
1000,
379
],
"compressor": {
"blocksize": 0,
"clevel": 7,
"cname": "zstd",
"id": "blosc",
"shuffle": 1
},
"dimension_separator": "/",
"dtype": "<i2",
"fill_value": null,
"filters": null,
"order": "C",
"shape": [
10000,
379
],
"zarr_format": 2
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"_ARRAY_DIMENSIONS": [
"variants",
"samples"
],
"description": "Approximate read depth (reads with MQ=255 or with bad mates are filtered)"
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"chunks": [
1000,
379
],
"compressor": {
"blocksize": 0,
"clevel": 7,
"cname": "zstd",
"id": "blosc",
"shuffle": 1
},
"dimension_separator": "/",
"dtype": "|i1",
"fill_value": null,
"filters": null,
"order": "C",
"shape": [
10000,
379
],
"zarr_format": 2
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"_ARRAY_DIMENSIONS": [
"variants",
"samples"
],
"description": "Genotype Quality"
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"chunks": [
1000,
379
],
"compressor": {
"blocksize": 0,
"clevel": 7,
"cname": "zstd",
"id": "blosc",
"shuffle": 1
},
"dimension_separator": "/",
"dtype": "|i1",
"fill_value": null,
"filters": null,
"order": "C",
"shape": [
10000,
379
],
"zarr_format": 2
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"_ARRAY_DIMENSIONS": [
"variants",
"samples"
],
"description": "Minimum DP observed within the GVCF block"
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"chunks": [
1000,
379
],
"compressor": {
"blocksize": 0,
"clevel": 7,
"cname": "zstd",
"id": "blosc",
"shuffle": 1
},
"dimension_separator": "/",
"dtype": "|O",
"fill_value": null,
"filters": [
{
"id": "vlen-utf8"
}
],
"order": "C",
"shape": [
10000,
379
],
"zarr_format": 2
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"_ARRAY_DIMENSIONS": [
"variants",
"samples"
],
"description": "Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles"
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"chunks": [
1000,
379
],
"compressor": {
"blocksize": 0,
"clevel": 7,
"cname": "zstd",
"id": "blosc",
"shuffle": 1
},
"dimension_separator": "/",
"dtype": "|O",
"fill_value": null,
"filters": [
{
"id": "vlen-utf8"
}
],
"order": "C",
"shape": [
10000,
379
],
"zarr_format": 2
}
Loading