-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
126 lines (100 loc) · 3.68 KB
/
utils.py
File metadata and controls
126 lines (100 loc) · 3.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
Utility classes.
"""
from typing import Union, List
import fastdfe as fd
import numpy as np
import pandas as pd
from fastdfe.io_handlers import DummyVariant
from matplotlib import pyplot as plt
class RecombinationIntensityAnnotation(fd.Annotation):
"""
Recombination intensity annotation using a genetic map.
"""
def __init__(self, map_file: str, bins_file: str = None, n_bins: int = 10):
"""
Create a new annotation instance.
"""
super().__init__()
self.bins_file = bins_file
self.n_bins = n_bins
self.map: pd.DataFrame = pd.read_csv(map_file, comment='#', sep='\t')
self.chr_cache = {chr_: df for chr_, df in self.map.groupby("Chr")}
self.values: List[float] = []
self.current: pd.Series = self.map.iloc[0]
def _setup(self, handler: fd.io_handlers.MultiHandler):
"""
Provide context to the annotator.
"""
super()._setup(handler)
handler._reader.add_info_to_header({
'ID': 'Recombination_intensity',
'Number': '.',
'Type': 'Float',
'Description': 'Recombination intensity in cM/Mb.'
})
def _teardown(self):
"""
Finalize the annotation. Called after all sites have been annotated.
"""
super()._teardown()
if self.bins_file is not None:
pd.DataFrame(np.quantile(self.values, np.linspace(0, 1, self.n_bins + 1))).to_csv(self.bins_file)
def annotate_site(self, variant: Union['cyvcf2.Variant', DummyVariant]):
"""
Annotate site.
"""
if (self.current is not None and variant.CHROM == self.current.Chr and
self.current.Begin <= variant.POS < self.current.End):
variant.INFO['Recombination_intensity'] = float(self.current.cMperMb)
self.values.append(self.current.cMperMb)
self.n_annotated += 1
return
# get chromosome-specific dataframe
chr_map = self.chr_cache.get(variant.CHROM)
if chr_map is not None:
mask = (chr_map.Begin <= variant.POS) & (variant.POS < chr_map.End)
if mask.any():
self.current = chr_map[mask].iloc[0]
self.annotate_site(variant)
def plot_histogram(self, show: bool = True, file: str = None):
"""
Plot histogram of recombination intensity values.
:param show: Show the plot.
:param file: Save the plot to a file.
"""
edges = [0] + list(np.logspace(-6, 2, 30))
hist, _ = np.histogram(self.values, bins=edges)
plt.bar(edges[:-1], hist, width=np.diff(edges), align='edge')
plt.xlabel("Recombination intensity (cM/Mb)")
plt.ylabel("Frequency")
plt.xscale('log')
plt.tight_layout()
if file is not None:
plt.savefig(file)
if show:
plt.show()
class RecombinationIntensityStratification(fd.Stratification):
"""
Recombination intensity stratification based annotation.
"""
def __init__(self, bins_file: str):
"""
Initialize stratification.
"""
super().__init__()
self.bins: pd.Series = pd.read_csv(bins_file).iloc[:, 1]
def get_types(self) -> List[str]:
"""
Get all possible types.
"""
return [f"bin{i}" for i in range(len(self.bins) - 1)]
def get_type(self, variant: Union['cyvcf2.Variant', DummyVariant]) -> str:
"""
Get type.
"""
try:
rho = variant.INFO['Recombination_intensity']
return f"bin{sum(rho >= self.bins) - 1}"
except KeyError:
raise fd.io_handlers.NoTypeException()