
Commit afb3ac4

add data processing scripts
1 parent b02be5b commit afb3ac4


156 files changed (+6539, -0 lines changed)


model_evaluate/DNV/.DS_Store

-6 KB
Binary file not shown.

model_evaluate/gene_s/.DS_Store

-6 KB
Binary file not shown.

variant/AOU/AN_exome.py

Lines changed: 80 additions & 0 deletions
import gzip

# chromosome ordering used for genome-wide comparisons; the empty string
# sorts last and serves as the end-of-file sentinel from process_region()
all_chromosomes = [str(i) for i in range(1, 23)] + ["MT", "X", "Y", ""]

class genome_position:
    def __init__(self, chromosome, position):
        self.chromosome = str(chromosome)
        self.position = int(position)

    def __eq__(self, other):
        return (self.chromosome == other.chromosome) and (self.position == other.position)

    def __lt__(self, other):
        if all_chromosomes.index(self.chromosome) < all_chromosomes.index(other.chromosome):
            return True
        return (all_chromosomes.index(self.chromosome) == all_chromosomes.index(other.chromosome)) and (self.position < other.position)

    def __gt__(self, other):
        if all_chromosomes.index(self.chromosome) > all_chromosomes.index(other.chromosome):
            return True
        return (all_chromosomes.index(self.chromosome) == all_chromosomes.index(other.chromosome)) and (self.position > other.position)

    def __le__(self, other):
        return (self < other) or (self == other)

    def __ge__(self, other):
        return (self > other) or (self == other)


def process_region(region_file):
    # read one tab-separated region (chrom, start, end); return sentinel
    # positions on the empty chromosome at end of file
    region = region_file.readline()
    if region == "":
        return genome_position("", 0), genome_position("", 0)
    region_split = region.strip().split("\t")
    region_start = genome_position(region_split[0], region_split[1])
    region_end = genome_position(region_split[0], region_split[2])
    return region_start, region_end


def main():
    pos_file = gzip.open("AOU_af_cds_eur.txt.gz", "rt")
    region_file = open("../gnomad_AF/exome_regions.txt", "r")
    output_file = open("exome_AN_AOU_eur.bed", "w")

    region_start, region_end = process_region(region_file)
    curr_AN = None
    marker_pos = None
    pos_file.readline()  # skip the header line
    for line in pos_file:
        line_split = line.strip().split("\t")
        AN_pos = genome_position(line_split[0], line_split[1])
        # emit and advance past any regions that end before this variant
        while region_end <= AN_pos:
            if curr_AN is not None:
                print(region_start.chromosome, str(region_start.position - 1), str(region_end.position - 1), curr_AN, sep="\t", file=output_file, flush=True)
            region_start, region_end = process_region(region_file)
            curr_AN = None
            marker_pos = None
        if region_start > AN_pos:
            continue
        AN = line_split[4]
        # when AN changes inside a region, split the interval halfway
        # between the previous marker and the current position
        if (curr_AN is not None) and (curr_AN != AN):
            print(region_start.chromosome, str(region_start.position - 1), str((marker_pos + AN_pos.position + 1) // 2 - 1), curr_AN, sep="\t", file=output_file, flush=True)
            region_start.position = (marker_pos + AN_pos.position + 1) // 2
        marker_pos = AN_pos.position
        curr_AN = AN
    if curr_AN is not None:
        print(region_start.chromosome, str(region_start.position - 1), str(region_end.position - 1), curr_AN, sep="\t", file=output_file, flush=True)

    pos_file.close()
    region_file.close()
    output_file.close()


if __name__ == "__main__":
    main()
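The midpoint-split arithmetic is easy to sanity-check in isolation. A minimal sketch, assuming genome_position from this file is in scope; the coordinates are invented:

# toy check of the ordering hooks and of the midpoint boundary
a = genome_position("1", 100)
b = genome_position("1", 250)
assert a < b and b > a and a <= a and b >= a

# when AN changes between markers at positions 100 and 250, the split
# boundary lands halfway between them
assert (100 + 250 + 1) // 2 == 175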
Lines changed: 10 additions & 0 deletions
import hail

# import the AOU variant annotation table, keyed by variant, keeping the
# European and overall allele numbers/counts, and restrict to SNVs
dt = hail.import_table("gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/vat/vat_complete.bgz.tsv.gz", force_bgz=True, types={'position': hail.tint32})
dt = dt.key_by('contig', 'position', 'ref_allele', 'alt_allele').select("gvs_eur_an", "gvs_eur_ac", "gvs_all_an", "gvs_all_ac").distinct()
dt = dt.filter((dt['ref_allele'].length() == 1) & (dt['alt_allele'].length() == 1))

# should write to a cloud workspace; cloud_location is a placeholder
# for the workspace bucket path
dt.write(f"{cloud_location}/AOU_af_SNV.ht")
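Downstream steps can read the written table back and export a TSV; a minimal sketch of that round trip, with cloud_location again standing in for the workspace bucket:

import hail

cloud_location = "gs://my-workspace-bucket"  # placeholder bucket path
dt = hail.read_table(f"{cloud_location}/AOU_af_SNV.ht")
dt.export(f"{cloud_location}/AOU_af_SNV.tsv.bgz")  # block-gzipped TSV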

variant/AOU/get_cds_pos.py

Lines changed: 43 additions & 0 deletions
import re
import gzip

def process_gff(line):
    # parse one GFF3 record; return (chrom, start, end) for CDS features
    # and (None, None, None) for everything else
    line_split = line.strip().split("\t")
    chrom = line_split[0]
    feature = line_split[2]
    start = line_split[3]
    end = line_split[4]

    if feature == "CDS":
        return chrom, start, end
    else:
        return None, None, None

def main():
    # inputs
    annot_filename = "gencode.v38.basic.annotation.gff3.gz"
    annot_file = gzip.open(annot_filename, "rt")

    # outputs
    pos_filename = "all_cds_regions.txt"
    pos_file = open(pos_filename, "w")

    for line in annot_file:
        if re.search(r"^#", line):  # skip GFF3 header/comment lines
            continue
        chrom, start, end = process_gff(line)
        if chrom is None:
            continue
        # pad each CDS by 2 bp on both sides to capture the splice donor
        # and splice acceptor sites
        start = int(start) - 2
        end = int(end) + 2
        print(f"{chrom}\t{start}\t{end}", file=pos_file)

    annot_file.close()
    pos_file.close()

if __name__ == "__main__":
    main()
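A worked example of what the parser emits, using an invented GFF3 CDS record:

# a toy GFF3 line (tab-separated: seqid, source, feature, start, end,
# score, strand, frame, attributes); the coordinates are invented
line = "chr1\tHAVANA\tCDS\t65565\t65573\t.\t+\t0\tID=CDS:ENST00000641515"
chrom, start, end = process_gff(line)
assert (chrom, start, end) == ("chr1", "65565", "65573")
# after the +/-2 padding the emitted region is chr1 65563 65575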

variant/AOU/rename_chr.py

Lines changed: 22 additions & 0 deletions
import gzip

def main():
    input_file = gzip.open("AOU_af_cds.txt.gz", "rt")
    output_file = open("AOU_af_cds_eur.txt", "w")
    # map "chr1".."chr22", "chrX", "chrY" to bare chromosome names;
    # records on any other contig are dropped
    chrom_dict = {}
    for chrom in [str(i) for i in range(1, 23)] + ["X", "Y"]:
        chrom_dict["chr" + chrom] = chrom
    print("Chrom\tPos\tRef\tAlt\tAN_AOU_eur\tAC_AOU_eur", file=output_file)
    input_file.readline()  # skip the header line
    for line in input_file:
        line_split = line.strip().split("\t")
        if line_split[0] in chrom_dict:
            line_split[0] = chrom_dict[line_split[0]]
            print(*line_split[0:6], sep="\t", file=output_file)
    input_file.close()
    output_file.close()

if __name__ == "__main__":
    main()
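The mapping simply strips the "chr" prefix for the 24 canonical chromosomes and drops everything else; spelled out:

# the same mapping as above, checked for two cases
chrom_dict = {"chr" + c: c for c in [str(i) for i in range(1, 23)] + ["X", "Y"]}
assert chrom_dict["chr1"] == "1"
assert "chrM" not in chrom_dict  # mitochondrial and alt contigs are skipped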

variant/ESM-1b/combine_frac.py

Lines changed: 52 additions & 0 deletions
import pandas as pd
import numpy as np

L = 1000          # window length in residues
SLIDING = 800     # step between consecutive windows
OVERLAP = L - SLIDING
OVERLAP_TYPE = "weighted"
AA_list = "ACDEFGHIKLMNPQRSTVWY"

def _gen_overlap_weight(start, end, mask_start, mask_end):
    # build per-position weights for one window: a linear ramp over the
    # overlap with the previous/next window, zeros past the segment end
    full_weights = np.ones(shape=(L, 1))
    full_weights[(end - start + 1):, :] = 0.
    if OVERLAP_TYPE == "weighted":
        weights = np.array([(i + 1) / (OVERLAP + 1) for i in range(OVERLAP)])
        weights = np.reshape(weights, (OVERLAP, 1))
        if mask_start > start:
            full_weights[0:OVERLAP] = weights                 # ramp up at the left edge
        if mask_end < end:
            full_weights[(L - OVERLAP):L] = weights[::-1, :]  # ramp down at the right edge
    else:
        full_weights[:(mask_start - start), :] = 0.
        full_weights[(mask_end - start + 1):, :] = 0.
    return full_weights[:(end - start + 1), :].astype(np.float32)

def main():
    segset = pd.read_table(f"seg_list_{L}_{OVERLAP}.txt")
    genes = segset['UniprotID'].unique()
    for gene in genes:
        subset = segset[segset['UniprotID'] == gene]
        l = subset['end'].max()
        full_score = np.zeros((l, len(AA_list)))
        for _, row in subset.iterrows():
            frac = row['frac']
            start = row['start']
            end = row['end']
            mask_start = row['unmask_start']
            mask_end = row['unmask_end']
            weight = _gen_overlap_weight(start, end, mask_start, mask_end)
            frac_score = np.load(f"logits_{L}_{OVERLAP}/{gene}_{frac}.npy")
            frac_score *= weight
            full_score[(start - 1):end] += frac_score  # (l, A)
        result = {
            "Protein_position": [i // len(AA_list) + 1 for i in range(len(AA_list) * l)],  # 1-based
            "AA_alt": list(AA_list) * l,
            "ESM": np.reshape(full_score, (-1)),
        }
        pd.DataFrame(result).to_csv(f"logits_merged/{gene}.txt.gz", sep="\t", index=False)


if __name__ == "__main__":
    main()
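The weighted ramp makes consecutive windows cross-fade: across the 200-residue overlap, the rising ramp of one window and the falling ramp of the next sum to exactly 1 at every position, so merged scores stay on the original scale. A quick check:

import numpy as np

# rising ramp of one window plus falling ramp of the next equals 1
OVERLAP = 200
up = np.array([(i + 1) / (OVERLAP + 1) for i in range(OVERLAP)])
down = up[::-1]
assert np.allclose(up + down, 1.0)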

variant/ESM-1b/get_probs.py

Lines changed: 71 additions & 0 deletions
import torch
import numpy as np
from Bio import SeqIO
import gzip
import esm

cuda = torch.device('cuda:3')

def main():

    model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
    batch_converter = alphabet.get_batch_converter()
    model = model.eval().to(cuda)
    token_dict = alphabet.tok_to_idx
    AA_list = "ACDEFGHIKLMNPQRSTVWY"
    indices = [token_dict[a] for a in AA_list]

    # prepare data
    # ESM-1b sequences cannot exceed a length of 1024, so tile each protein
    # into 1000-residue windows that step by 800 (200-residue overlap)
    MAX_L = 1000
    SLIDING = 800
    OVERLAP = MAX_L - SLIDING
    data = []
    l = []

    input_file = open("../list/geneset_uniprot_len.txt", "r")
    input_file.readline()  # skip the header line
    for line in input_file:
        uniprot_id = line.split("\t")[0]
        length = int(line.split("\t")[-1])
        seq_filename = "../pep/uniprot_seq/" + uniprot_id + ".fasta.gz"
        with gzip.open(seq_filename, "rt") as f:
            record = SeqIO.read(f, "fasta")
        for part in range(length // SLIDING + 1):
            start = SLIDING * part
            end = start + MAX_L
            if end >= length:
                data.append((uniprot_id + "_" + str(part + 1), str(record.seq)[start:]))
                l.append(len(data[-1][1]))
                break
            else:
                data.append((uniprot_id + "_" + str(part + 1), str(record.seq)[start:end]))
                l.append(len(data[-1][1]))
    input_file.close()

    batch_size = 100

    for batch_index in range(0, len(data), batch_size):
        batch_labels, batch_strs, batch_tokens = batch_converter(data[batch_index:(batch_index + batch_size)])
        batch_tokens = batch_tokens.to(cuda)
        batch_l = l[batch_index:(batch_index + batch_size)]
        # forward pass; only the output logits are used below
        with torch.no_grad():
            results = model(batch_tokens, repr_layers=[33])

        logits = results["logits"]
        # dimension: batch * (MAX_L + 2) * 33, where 33 is the token vocabulary size
        repr_np = logits.cpu().numpy()
        # reference logit at each position, i.e. the logit of the observed token
        ref = np.take_along_axis(repr_np, np.expand_dims(batch_tokens.cpu().numpy(), -1), -1)
        repr_np = np.take(repr_np, indices, axis=2)  # B * (MAX_L + 2) * 20
        repr_np = ref - repr_np  # reference-minus-alternate logit differences
        for i, label in enumerate(batch_labels):
            # drop BOS at index 0 and keep only the segment's true length
            np.save("logits_" + str(MAX_L) + "_" + str(OVERLAP) + "/" + label + ".npy", repr_np[i, 1:(batch_l[i] + 1), :])

        if (batch_index % 1000 == 0):
            print(str(batch_index) + " sequences processed.")

if __name__ == "__main__":
    main()
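The tiling loop cuts each protein into MAX_L = 1000 residue windows stepping by SLIDING = 800, so adjacent windows share 200 residues. A worked example for an invented 1,900-residue protein:

# how a hypothetical 1900-residue protein is tiled by the loop above
MAX_L, SLIDING = 1000, 800
length = 1900
segments = []
for part in range(length // SLIDING + 1):
    start = SLIDING * part
    end = start + MAX_L
    if end >= length:
        segments.append((start, length))
        break
    segments.append((start, end))
assert segments == [(0, 1000), (800, 1800), (1600, 1900)]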
Lines changed: 61 additions & 0 deletions
import torch
import numpy as np
from Bio import SeqIO
import gzip
import esm

cuda = torch.device('cuda:3')

def main():

    model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
    batch_converter = alphabet.get_batch_converter()
    model = model.eval().to(cuda)

    # prepare data
    # ESM-1b sequences cannot exceed a length of 1024, so tile each protein
    # into 1000-residue windows that step by 800, as in get_probs.py
    MAX_L = 1000
    SLIDING = 800
    OVERLAP = MAX_L - SLIDING
    data = []

    input_file = open("../list/geneset_uniprot_len.txt", "r")
    input_file.readline()  # skip the header line
    for line in input_file:
        uniprot_id = line.split("\t")[0]
        length = int(line.split("\t")[-1])
        seq_filename = "../pep/uniprot_seq/" + uniprot_id + ".fasta.gz"
        with gzip.open(seq_filename, "rt") as f:
            record = SeqIO.read(f, "fasta")
        for part in range(length // SLIDING + 1):
            start = SLIDING * part
            end = start + MAX_L
            if end >= length:
                data.append((uniprot_id + "_" + str(part + 1), str(record.seq)[start:]))
                break
            else:
                data.append((uniprot_id + "_" + str(part + 1), str(record.seq)[start:end]))
    input_file.close()

    batch_size = 100

    for batch_index in range(0, len(data), batch_size):
        batch_labels, batch_strs, batch_tokens = batch_converter(data[batch_index:(batch_index + batch_size)])
        batch_tokens = batch_tokens.to(cuda)

        # get per-token representations from the last (33rd) layer
        with torch.no_grad():
            results = model(batch_tokens, repr_layers=[33])

        representation = results["representations"][33]
        # representation dimension: batch * (MAX_L + 2) * 1280
        repr_np = representation.cpu().numpy()
        for i, label in enumerate(batch_labels):
            np.save("repr_" + str(MAX_L) + "_" + str(OVERLAP) + "/" + label + ".npy", repr_np[i, :, :])

        if (batch_index % 1000 == 0):
            print(str(batch_index) + " sequences processed.")

if __name__ == "__main__":
    main()
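Unlike the logits script, this one saves the full token axis, so each array keeps the BOS position (and the EOS plus any batch padding after the segment). A minimal sketch of trimming that downstream; the file name and segment length are invented:

import numpy as np

# saved shape is (padded_length + 2, 1280); index 0 is BOS and the
# segment's residues occupy indices 1..seg_len
repr_full = np.load("repr_1000_200/P12345_1.npy")  # hypothetical file
seg_len = 300  # the segment's true length, known from the input table
per_residue = repr_full[1:seg_len + 1, :]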

variant/ESM-2/build_logits.py

Lines changed: 21 additions & 0 deletions
import numpy as np
import pandas as pd

A = 20  # number of alternate amino acids per position

# input files are already ordered by position and alternate AA
def main():
    geneset = pd.read_table("geneset_uniprot_len.txt")
    for _, row in geneset.iterrows():
        uniprot_id = row['UniprotID']
        data = pd.read_table(f"logits_merged/{uniprot_id}.txt.gz")
        logits = data['ESM'].to_numpy(dtype=np.float32)
        logits = np.reshape(logits, (-1, A))  # one row per protein position
        np.save(f"logits_np/{uniprot_id}.npy", logits)

if __name__ == "__main__":
    main()
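The flat ESM column is position-major and alternate-AA-minor, so the row-major reshape yields one row per protein position with columns in AA_list order. A quick check of the index arithmetic on toy data:

import numpy as np

# row i of the reshaped array starts at flat index i * A
A = 20
l = 3  # toy protein length
flat = np.arange(A * l, dtype=np.float32)
logits = np.reshape(flat, (-1, A))
assert logits.shape == (l, A)
assert logits[1, 0] == flat[A]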
