
Commit afb3ac4

add data processing scripts
1 parent b02be5b commit afb3ac4


156 files changed (+6539, -0 lines changed)


model_evaluate/DNV/.DS_Store

-6 KB
Binary file not shown.

model_evaluate/gene_s/.DS_Store

-6 KB
Binary file not shown.

variant/AOU/AN_exome.py

Lines changed: 80 additions & 0 deletions
import gzip

# chromosome ordering used for genome-wide comparisons; the empty string
# sorts last and serves as the end-of-file sentinel from process_region()
all_chromosomes = [str(i) for i in range(1, 23)] + ["MT", "X", "Y", ""]

class genome_position:
    def __init__(self, chromosome, position):
        self.chromosome = str(chromosome)
        self.position = int(position)

    def __eq__(self, other):
        return (self.chromosome == other.chromosome) and (self.position == other.position)

    def __lt__(self, other):
        if all_chromosomes.index(self.chromosome) < all_chromosomes.index(other.chromosome):
            return True
        return (all_chromosomes.index(self.chromosome) == all_chromosomes.index(other.chromosome)) and (self.position < other.position)

    def __gt__(self, other):
        if all_chromosomes.index(self.chromosome) > all_chromosomes.index(other.chromosome):
            return True
        return (all_chromosomes.index(self.chromosome) == all_chromosomes.index(other.chromosome)) and (self.position > other.position)

    def __le__(self, other):
        return (self < other) or (self == other)

    def __ge__(self, other):
        return (self > other) or (self == other)


def process_region(region_file):
    # read one tab-separated region (chrom, start, end); return sentinel
    # positions on the empty chromosome at end of file
    region = region_file.readline()
    if region == "":
        return genome_position("", 0), genome_position("", 0)
    region_split = region.strip().split("\t")
    region_start = genome_position(region_split[0], region_split[1])
    region_end = genome_position(region_split[0], region_split[2])
    return region_start, region_end


def main():
    pos_file = gzip.open("AOU_af_cds_eur.txt.gz", "rt")
    region_file = open("../gnomad_AF/exome_regions.txt", "r")
    output_file = open("exome_AN_AOU_eur.bed", "w")

    region_start, region_end = process_region(region_file)
    curr_AN = None
    marker_pos = None
    pos_file.readline()  # skip the header line
    for line in pos_file:
        line_split = line.strip().split("\t")
        AN_pos = genome_position(line_split[0], line_split[1])
        # emit and advance past any regions that end before this variant
        while region_end <= AN_pos:
            if curr_AN is not None:
                print(region_start.chromosome, str(region_start.position - 1), str(region_end.position - 1), curr_AN, sep="\t", file=output_file, flush=True)
            region_start, region_end = process_region(region_file)
            curr_AN = None
            marker_pos = None
        if region_start > AN_pos:
            continue
        AN = line_split[4]
        # when AN changes inside a region, split the interval halfway
        # between the previous marker and the current position
        if (curr_AN is not None) and (curr_AN != AN):
            print(region_start.chromosome, str(region_start.position - 1), str((marker_pos + AN_pos.position + 1) // 2 - 1), curr_AN, sep="\t", file=output_file, flush=True)
            region_start.position = (marker_pos + AN_pos.position + 1) // 2
        marker_pos = AN_pos.position
        curr_AN = AN
    if curr_AN is not None:
        print(region_start.chromosome, str(region_start.position - 1), str(region_end.position - 1), curr_AN, sep="\t", file=output_file, flush=True)

    pos_file.close()
    region_file.close()
    output_file.close()


if __name__ == "__main__":
    main()
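The midpoint-split arithmetic is easy to sanity-check in isolation. A minimal sketch, assuming genome_position from this file is in scope; the coordinates are invented:

# toy check of the ordering hooks and of the midpoint boundary
a = genome_position("1", 100)
b = genome_position("1", 250)
assert a < b and b > a and a <= a and b >= a

# when AN changes between markers at positions 100 and 250, the split
# boundary lands halfway between them
assert (100 + 250 + 1) // 2 == 175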
Lines changed: 10 additions & 0 deletions
import hail

# import the AOU variant annotation table, keyed by variant, keeping the
# European and overall allele numbers/counts, and restrict to SNVs
dt = hail.import_table("gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/vat/vat_complete.bgz.tsv.gz", force_bgz=True, types={'position': hail.tint32})
dt = dt.key_by('contig', 'position', 'ref_allele', 'alt_allele').select("gvs_eur_an", "gvs_eur_ac", "gvs_all_an", "gvs_all_ac").distinct()
dt = dt.filter((dt['ref_allele'].length() == 1) & (dt['alt_allele'].length() == 1))

# should write to a cloud workspace; cloud_location is a placeholder
# for the workspace bucket path
dt.write(f"{cloud_location}/AOU_af_SNV.ht")
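Downstream steps can read the written table back and export a TSV; a minimal sketch of that round trip, with cloud_location again standing in for the workspace bucket:

import hail

cloud_location = "gs://my-workspace-bucket"  # placeholder bucket path
dt = hail.read_table(f"{cloud_location}/AOU_af_SNV.ht")
dt.export(f"{cloud_location}/AOU_af_SNV.tsv.bgz")  # block-gzipped TSV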

variant/AOU/get_cds_pos.py

Lines changed: 43 additions & 0 deletions
import re
import gzip

def process_gff(line):
    # parse one GFF3 record; return (chrom, start, end) for CDS features
    # and (None, None, None) for everything else
    line_split = line.strip().split("\t")
    chrom = line_split[0]
    feature = line_split[2]
    start = line_split[3]
    end = line_split[4]

    if feature == "CDS":
        return chrom, start, end
    else:
        return None, None, None

def main():
    # inputs
    annot_filename = "gencode.v38.basic.annotation.gff3.gz"
    annot_file = gzip.open(annot_filename, "rt")

    # outputs
    pos_filename = "all_cds_regions.txt"
    pos_file = open(pos_filename, "w")

    for line in annot_file:
        if re.search(r"^#", line):  # skip GFF3 header/comment lines
            continue
        chrom, start, end = process_gff(line)
        if chrom is None:
            continue
        # pad each CDS by 2 bp on both sides to capture the splice donor
        # and splice acceptor sites
        start = int(start) - 2
        end = int(end) + 2
        print(f"{chrom}\t{start}\t{end}", file=pos_file)

    annot_file.close()
    pos_file.close()

if __name__ == "__main__":
    main()
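A worked example of what the parser emits, using an invented GFF3 CDS record:

# a toy GFF3 line (tab-separated: seqid, source, feature, start, end,
# score, strand, frame, attributes); the coordinates are invented
line = "chr1\tHAVANA\tCDS\t65565\t65573\t.\t+\t0\tID=CDS:ENST00000641515"
chrom, start, end = process_gff(line)
assert (chrom, start, end) == ("chr1", "65565", "65573")
# after the +/-2 padding the emitted region is chr1 65563 65575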

variant/AOU/rename_chr.py

Lines changed: 22 additions & 0 deletions
import gzip

def main():
    input_file = gzip.open("AOU_af_cds.txt.gz", "rt")
    output_file = open("AOU_af_cds_eur.txt", "w")
    # map "chr1".."chr22", "chrX", "chrY" to bare chromosome names;
    # records on any other contig are dropped
    chrom_dict = {}
    for chrom in [str(i) for i in range(1, 23)] + ["X", "Y"]:
        chrom_dict["chr" + chrom] = chrom
    print("Chrom\tPos\tRef\tAlt\tAN_AOU_eur\tAC_AOU_eur", file=output_file)
    input_file.readline()  # skip the header line
    for line in input_file:
        line_split = line.strip().split("\t")
        if line_split[0] in chrom_dict:
            line_split[0] = chrom_dict[line_split[0]]
            print(*line_split[0:6], sep="\t", file=output_file)
    input_file.close()
    output_file.close()

if __name__ == "__main__":
    main()
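The mapping simply strips the "chr" prefix for the 24 canonical chromosomes and drops everything else; spelled out:

# the same mapping as above, checked for two cases
chrom_dict = {"chr" + c: c for c in [str(i) for i in range(1, 23)] + ["X", "Y"]}
assert chrom_dict["chr1"] == "1"
assert "chrM" not in chrom_dict  # mitochondrial and alt contigs are skipped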

variant/ESM-1b/combine_frac.py

Lines changed: 52 additions & 0 deletions
import pandas as pd
import numpy as np

L = 1000          # window length in residues
SLIDING = 800     # step between consecutive windows
OVERLAP = L - SLIDING
OVERLAP_TYPE = "weighted"
AA_list = "ACDEFGHIKLMNPQRSTVWY"

def _gen_overlap_weight(start, end, mask_start, mask_end):
    # build per-position weights for one window: a linear ramp over the
    # overlap with the previous/next window, zeros past the segment end
    full_weights = np.ones(shape=(L, 1))
    full_weights[(end - start + 1):, :] = 0.
    if OVERLAP_TYPE == "weighted":
        weights = np.array([(i + 1) / (OVERLAP + 1) for i in range(OVERLAP)])
        weights = np.reshape(weights, (OVERLAP, 1))
        if mask_start > start:
            full_weights[0:OVERLAP] = weights                 # ramp up at the left edge
        if mask_end < end:
            full_weights[(L - OVERLAP):L] = weights[::-1, :]  # ramp down at the right edge
    else:
        full_weights[:(mask_start - start), :] = 0.
        full_weights[(mask_end - start + 1):, :] = 0.
    return full_weights[:(end - start + 1), :].astype(np.float32)

def main():
    segset = pd.read_table(f"seg_list_{L}_{OVERLAP}.txt")
    genes = segset['UniprotID'].unique()
    for gene in genes:
        subset = segset[segset['UniprotID'] == gene]
        l = subset['end'].max()
        full_score = np.zeros((l, len(AA_list)))
        for _, row in subset.iterrows():
            frac = row['frac']
            start = row['start']
            end = row['end']
            mask_start = row['unmask_start']
            mask_end = row['unmask_end']
            weight = _gen_overlap_weight(start, end, mask_start, mask_end)
            frac_score = np.load(f"logits_{L}_{OVERLAP}/{gene}_{frac}.npy")
            frac_score *= weight
            full_score[(start - 1):end] += frac_score  # (l, A)
        result = {
            "Protein_position": [i // len(AA_list) + 1 for i in range(len(AA_list) * l)],  # 1-based
            "AA_alt": list(AA_list) * l,
            "ESM": np.reshape(full_score, (-1)),
        }
        pd.DataFrame(result).to_csv(f"logits_merged/{gene}.txt.gz", sep="\t", index=False)


if __name__ == "__main__":
    main()
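The weighted ramp makes consecutive windows cross-fade: across the 200-residue overlap, the rising ramp of one window and the falling ramp of the next sum to exactly 1 at every position, so merged scores stay on the original scale. A quick check:

import numpy as np

# rising ramp of one window plus falling ramp of the next equals 1
OVERLAP = 200
up = np.array([(i + 1) / (OVERLAP + 1) for i in range(OVERLAP)])
down = up[::-1]
assert np.allclose(up + down, 1.0)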

variant/ESM-1b/get_probs.py

Lines changed: 71 additions & 0 deletions
import torch
import numpy as np
from Bio import SeqIO
import gzip
import esm

cuda = torch.device('cuda:3')

def main():

    model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
    batch_converter = alphabet.get_batch_converter()
    model = model.eval().to(cuda)
    token_dict = alphabet.tok_to_idx
    AA_list = "ACDEFGHIKLMNPQRSTVWY"
    indices = [token_dict[a] for a in AA_list]

    # prepare data
    # ESM-1b sequences cannot exceed a length of 1024, so tile each protein
    # into 1000-residue windows that step by 800 (200-residue overlap)
    MAX_L = 1000
    SLIDING = 800
    OVERLAP = MAX_L - SLIDING
    data = []
    l = []

    input_file = open("../list/geneset_uniprot_len.txt", "r")
    input_file.readline()  # skip the header line
    for line in input_file:
        uniprot_id = line.split("\t")[0]
        length = int(line.split("\t")[-1])
        seq_filename = "../pep/uniprot_seq/" + uniprot_id + ".fasta.gz"
        with gzip.open(seq_filename, "rt") as f:
            record = SeqIO.read(f, "fasta")
        for part in range(length // SLIDING + 1):
            start = SLIDING * part
            end = start + MAX_L
            if end >= length:
                data.append((uniprot_id + "_" + str(part + 1), str(record.seq)[start:]))
                l.append(len(data[-1][1]))
                break
            else:
                data.append((uniprot_id + "_" + str(part + 1), str(record.seq)[start:end]))
                l.append(len(data[-1][1]))
    input_file.close()

    batch_size = 100

    for batch_index in range(0, len(data), batch_size):
        batch_labels, batch_strs, batch_tokens = batch_converter(data[batch_index:(batch_index + batch_size)])
        batch_tokens = batch_tokens.to(cuda)
        batch_l = l[batch_index:(batch_index + batch_size)]
        # forward pass; only the output logits are used below
        with torch.no_grad():
            results = model(batch_tokens, repr_layers=[33])

        logits = results["logits"]
        # dimension: batch * (MAX_L + 2) * 33, where 33 is the token vocabulary size
        repr_np = logits.cpu().numpy()
        # reference logit at each position, i.e. the logit of the observed token
        ref = np.take_along_axis(repr_np, np.expand_dims(batch_tokens.cpu().numpy(), -1), -1)
        repr_np = np.take(repr_np, indices, axis=2)  # B * (MAX_L + 2) * 20
        repr_np = ref - repr_np  # reference-minus-alternate logit differences
        for i, label in enumerate(batch_labels):
            # drop BOS at index 0 and keep only the segment's true length
            np.save("logits_" + str(MAX_L) + "_" + str(OVERLAP) + "/" + label + ".npy", repr_np[i, 1:(batch_l[i] + 1), :])

        if (batch_index % 1000 == 0):
            print(str(batch_index) + " sequences processed.")

if __name__ == "__main__":
    main()
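The tiling loop cuts each protein into MAX_L = 1000 residue windows stepping by SLIDING = 800, so adjacent windows share 200 residues. A worked example for an invented 1,900-residue protein:

# how a hypothetical 1900-residue protein is tiled by the loop above
MAX_L, SLIDING = 1000, 800
length = 1900
segments = []
for part in range(length // SLIDING + 1):
    start = SLIDING * part
    end = start + MAX_L
    if end >= length:
        segments.append((start, length))
        break
    segments.append((start, end))
assert segments == [(0, 1000), (800, 1800), (1600, 1900)]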
Lines changed: 61 additions & 0 deletions
import torch
import numpy as np
from Bio import SeqIO
import gzip
import esm

cuda = torch.device('cuda:3')

def main():

    model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
    batch_converter = alphabet.get_batch_converter()
    model = model.eval().to(cuda)

    # prepare data
    # ESM-1b sequences cannot exceed a length of 1024, so tile each protein
    # into 1000-residue windows that step by 800, as in get_probs.py
    MAX_L = 1000
    SLIDING = 800
    OVERLAP = MAX_L - SLIDING
    data = []

    input_file = open("../list/geneset_uniprot_len.txt", "r")
    input_file.readline()  # skip the header line
    for line in input_file:
        uniprot_id = line.split("\t")[0]
        length = int(line.split("\t")[-1])
        seq_filename = "../pep/uniprot_seq/" + uniprot_id + ".fasta.gz"
        with gzip.open(seq_filename, "rt") as f:
            record = SeqIO.read(f, "fasta")
        for part in range(length // SLIDING + 1):
            start = SLIDING * part
            end = start + MAX_L
            if end >= length:
                data.append((uniprot_id + "_" + str(part + 1), str(record.seq)[start:]))
                break
            else:
                data.append((uniprot_id + "_" + str(part + 1), str(record.seq)[start:end]))
    input_file.close()

    batch_size = 100

    for batch_index in range(0, len(data), batch_size):
        batch_labels, batch_strs, batch_tokens = batch_converter(data[batch_index:(batch_index + batch_size)])
        batch_tokens = batch_tokens.to(cuda)

        # get per-token representations from the last (33rd) layer
        with torch.no_grad():
            results = model(batch_tokens, repr_layers=[33])

        representation = results["representations"][33]
        # representation dimension: batch * (MAX_L + 2) * 1280
        repr_np = representation.cpu().numpy()
        for i, label in enumerate(batch_labels):
            np.save("repr_" + str(MAX_L) + "_" + str(OVERLAP) + "/" + label + ".npy", repr_np[i, :, :])

        if (batch_index % 1000 == 0):
            print(str(batch_index) + " sequences processed.")

if __name__ == "__main__":
    main()
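Unlike the logits script, this one saves the full token axis, so each array keeps the BOS position (and the EOS plus any batch padding after the segment). A minimal sketch of trimming that downstream; the file name and segment length are invented:

import numpy as np

# saved shape is (padded_length + 2, 1280); index 0 is BOS and the
# segment's residues occupy indices 1..seg_len
repr_full = np.load("repr_1000_200/P12345_1.npy")  # hypothetical file
seg_len = 300  # the segment's true length, known from the input table
per_residue = repr_full[1:seg_len + 1, :]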

variant/ESM-2/build_logits.py

Lines changed: 21 additions & 0 deletions
import numpy as np
import pandas as pd

A = 20  # number of alternate amino acids per position

# input files are already ordered by position and alternate AA
def main():
    geneset = pd.read_table("geneset_uniprot_len.txt")
    for _, row in geneset.iterrows():
        uniprot_id = row['UniprotID']
        data = pd.read_table(f"logits_merged/{uniprot_id}.txt.gz")
        logits = data['ESM'].to_numpy(dtype=np.float32)
        logits = np.reshape(logits, (-1, A))  # one row per protein position
        np.save(f"logits_np/{uniprot_id}.npy", logits)

if __name__ == "__main__":
    main()
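The flat ESM column is position-major and alternate-AA-minor, so the row-major reshape yields one row per protein position with columns in AA_list order. A quick check of the index arithmetic on toy data:

import numpy as np

# row i of the reshaped array starts at flat index i * A
A = 20
l = 3  # toy protein length
flat = np.arange(A * l, dtype=np.float32)
logits = np.reshape(flat, (-1, A))
assert logits.shape == (l, A)
assert logits[1, 0] == flat[A]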
