diff --git a/BioCompass/BioCompass.py b/BioCompass/BioCompass.py index 4dcc3db..0507e6e 100644 --- a/BioCompass/BioCompass.py +++ b/BioCompass/BioCompass.py @@ -102,9 +102,9 @@ def parse_antiSMASH(content): QueryCluster = OrderedDict() - for k in re.search( + for k in list(re.search( rule_query_cluster, parsed['QueryCluster'], - re.VERBOSE).groupdict().keys(): + re.VERBOSE).groupdict().keys()): QueryCluster[k] = [] for row in re.finditer( rule_query_cluster, parsed['QueryCluster'], re.VERBOSE): @@ -186,7 +186,7 @@ def __getitem__(self, item): return self.data[item] def keys(self): - return self.data.keys() + return list(self.data.keys()) def load(self, filename): self.data = {} @@ -221,8 +221,8 @@ def efetch_hit(term, seq_start, seq_stop): downloaded = True except: nap = ntry*3 - print "Fail to download (term). I'll take a nap of %s seconds ", \ - " and try again." + print("Fail to download (term). I'll take a nap of %s seconds ", \ + " and try again.") time.sleep(ntry*3) return content @@ -233,7 +233,7 @@ def download_hits(filename, output_path): """ c = antiSMASH_file(filename) - for cs in c['SignificantHits'].keys(): + for cs in list(c['SignificantHits'].keys()): locus = c['SignificantHits'][cs]['locus'] table_genes = c['SignificantHits'][cs]['TableGenes'] @@ -244,25 +244,25 @@ def download_hits(filename, output_path): max(table_genes['location_end']))) if os.path.isfile(filename_out): - print "Already downloaded %s" % filename_out + print("Already downloaded %s" % filename_out) else: - print "Requesting cluster_subject: %s, start: %s, end: %s" % ( + print("Requesting cluster_subject: %s, start: %s, end: %s" % ( locus, min(table_genes['location_start']), - max(table_genes['location_end'])) + max(table_genes['location_end']))) content = efetch_hit( term=locus, seq_start=min(table_genes['location_start']), seq_stop=max(table_genes['location_end'])) - print "Saving %s" % filename_out + print("Saving %s" % filename_out) with open(filename_out, 'w') as f: f.write(content) -import urlparse -import urllib2 +import urllib.parse +import urllib.request, urllib.error, urllib.parse import tempfile import tarfile import os @@ -274,10 +274,10 @@ def download_mibig(outputdir, version='1.3'): server = 'http://mibig.secondarymetabolites.org' filename = "mibig_gbk_%s.tar.gz" % version - url = urlparse.urljoin(server, filename) + url = urllib.parse.urljoin(server, filename) with tempfile.NamedTemporaryFile(delete=True) as f: - u = urllib2.urlopen(url) + u = urllib.request.urlopen(url) f.write(u.read()) f.file.flush() tar = tarfile.open(f.name) diff --git a/BioCompass/BioCompass.py.bak b/BioCompass/BioCompass.py.bak new file mode 100644 index 0000000..4dcc3db --- /dev/null +++ b/BioCompass/BioCompass.py.bak @@ -0,0 +1,353 @@ + +import os +import re +import time +import json +from collections import OrderedDict +import pkg_resources + +import pandas as pd +from Bio import SeqIO +from Bio import Entrez + +Entrez.email = "testing@ucsd.edu" + + +def untag(rule): + return re.sub('\?P<.*?>', '', rule) + +def parse_antiSMASH(content): + """ Parse antiSMASH output + """ + + rule_table_genes = r""" + (?P \w+ \"?) \t + \w+ \t + (?P \d+) \t + (?P \d+) \t + (?P [+|-]) \t + (?P .*) \n + """ + + rule_table_blasthit = r""" + (?P \w+ )\"? \t + (?P \w+ )\"? \t + (?P \d+) \t + (?P \d+) \t + (?P \d+(?:\.\d+)?) \t + (?P \d+\.\d+e[+|-]\d+) \t + \n + """ + + rule_query_cluster = r""" + (?P \w+) \s+ + (?P \d+) \s + (?P \d+) \s + (?P [+|-]) \s + (?P \w+ (?:\s \w+)?) \s* \n+ + """ + + rule_detail = r""" + >>\n + (?P\d+) \. \s+ + (?P (?P\w+)_(?P\w+)) \n + Source: \s+ (?P.+?) \s* \n + Type: \s+ (?P.+) \s* \n + Number\ of\ proteins\ with\ BLAST\ hits\ to\ this\ cluster:\ (?P \d+ ) \n + Cumulative\ BLAST\ score:\ (?P \d+ ) + \n \n + Table\ of\ genes,\ locations,\ strands\ and\ annotations\ of\ subject\ cluster:\n + (?P + ( + """ + untag(rule_table_genes) + r""" + )+ + ) + \n + Table\ of\ Blast\ hits\ \(query\ gene,\ subject\ gene,\ %identity,\ blast\ score,\ %coverage,\ e-value\): \n + (?P + (\w+ \t \w+ \"? \t \d+ \t \d+ \t \d+\.\d+ \t \d+\.\d+e[+|-]\d+ \t \n)+ + ) + \n+ + """ + + rule = r""" + ^ + ClusterBlast\ scores\ for\ (?P.*)\n+ + Table\ of\ genes,\ locations,\ strands\ and\ annotations\ of\ query\ cluster:\n+ + (?P + ( + """ + untag(rule_query_cluster) + r""" + )+ + ) + \n \n+ + Significant \ hits:\ \n + (?P + (\d+ \. \ \w+ \t .* \n+)+ + ) + \n \n + (?P
+ Details:\n\n + ( + """ + untag(rule_detail) + r""" + )+ + ) + \n* + $ + """ + parsed = re.search(rule, content, re.VERBOSE).groupdict() + + output = {} + for k in ['target', 'QueryCluster', 'SignificantHits']: + output[k] = parsed[k] + + + QueryCluster = OrderedDict() + for k in re.search( + rule_query_cluster, parsed['QueryCluster'], + re.VERBOSE).groupdict().keys(): + QueryCluster[k] = [] + for row in re.finditer( + rule_query_cluster, parsed['QueryCluster'], re.VERBOSE): + row = row.groupdict() + for k in row: + QueryCluster[k].append(row[k]) + output['QueryCluster'] = QueryCluster + + + output['SignificantHits'] = OrderedDict() + for row in re.finditer( + r"""(?P\d+) \. \ (?P (?P\w+)_(?P\w+)) \t (?P.*) \n+""", parsed['SignificantHits'], re.VERBOSE): + hit = row.groupdict() + cs = hit['cluster_subject'] + + if cs not in output['SignificantHits']: + output['SignificantHits'][cs] = OrderedDict() + + for v in ['id', 'description', 'locus', 'locus_cluster']: + output['SignificantHits'][cs][v] = hit[v] + + for block in re.finditer(rule_detail, parsed['Details'], re.VERBOSE): + block = dict(block.groupdict()) + + content = block['TableGenes'] + block['TableGenes'] = OrderedDict() + for k in re.findall('\(\?P<(.*?)>', rule_table_genes): + block['TableGenes'][k] = [] + for row in re.finditer(rule_table_genes, content, re.VERBOSE): + row = row.groupdict() + for k in row: + block['TableGenes'][k].append(row[k]) + + content = block['BlastHit'] + block['BlastHit'] = OrderedDict() + for k in re.findall('\(\?P<(.*?)>', rule_table_blasthit): + block['BlastHit'][k] = [] + for row in re.finditer(rule_table_blasthit, content, re.VERBOSE): + row = row.groupdict() + for k in row: + block['BlastHit'][k].append(row[k]) + + for k in block: + output['SignificantHits'][block['cluster_subject']][k] = block[k] + + return output + + +def antiSMASH_to_dataFrame(content): + """ Extract an antiSMASH file as a pandas.DataFrame + """ + parsed = parse_antiSMASH(content) + output = pd.DataFrame() + for cs in parsed['SignificantHits']: + clusterSubject = parsed['SignificantHits'][cs].copy() + df = pd.merge( + pd.DataFrame(clusterSubject['BlastHit']), + pd.DataFrame(clusterSubject['TableGenes']), + on='subject_gene', how='outer') + + del(clusterSubject['BlastHit']) + del(clusterSubject['TableGenes']) + + for v in clusterSubject: + df[v] = clusterSubject[v] + output = output.append(df, ignore_index=True) + + return output + + +class antiSMASH_file(object): + """ A class to handle antiSMASH file output. + """ + def __init__(self, filename): + self.data = {} + self.load(filename) + + def __getitem__(self, item): + return self.data[item] + + def keys(self): + return self.data.keys() + + def load(self, filename): + self.data = {} + with open(filename, 'r') as f: + parsed = parse_antiSMASH(f.read()) + for v in parsed: + self.data[v] = parsed[v] + + +def efetch_hit(term, seq_start, seq_stop): + """ Fetch the relevant part of a hit + """ + db = "nucleotide" + + maxtry = 3 + ntry = -1 + downloaded = False + + while ~downloaded and (ntry <= maxtry): + ntry += 1 + try: + handle = Entrez.esearch(db=db, term=term) + record = Entrez.read(handle) + + assert len(record['IdList']) == 1, \ + "Sorry, I'm not ready to handle more than one record" + + handle = Entrez.efetch(db=db, rettype="gb", retmode="text", + id=record['IdList'][0], + seq_start=seq_start, seq_stop=seq_stop) + content = handle.read() + downloaded = True + except: + nap = ntry*3 + print "Fail to download (term). I'll take a nap of %s seconds ", \ + " and try again." + time.sleep(ntry*3) + + return content + + +def download_hits(filename, output_path): + """ Download the GenBank block for all hits by antiSMASH + """ + c = antiSMASH_file(filename) + + for cs in c['SignificantHits'].keys(): + locus = c['SignificantHits'][cs]['locus'] + table_genes = c['SignificantHits'][cs]['TableGenes'] + + filename_out = os.path.join( + output_path, + "%s_%s-%s.gbk" % (locus, + min(table_genes['location_start']), + max(table_genes['location_end']))) + + if os.path.isfile(filename_out): + print "Already downloaded %s" % filename_out + else: + print "Requesting cluster_subject: %s, start: %s, end: %s" % ( + locus, + min(table_genes['location_start']), + max(table_genes['location_end'])) + + content = efetch_hit( + term=locus, + seq_start=min(table_genes['location_start']), + seq_stop=max(table_genes['location_end'])) + + print "Saving %s" % filename_out + with open(filename_out, 'w') as f: + f.write(content) + + +import urlparse +import urllib2 +import tempfile +import tarfile +import os +def download_mibig(outputdir, version='1.3'): + """ Download and extract MIBiG files into outputdir + """ + assert version in ['1.0', '1.1', '1.2', '1.3'], \ + "Invalid version of MIBiG" + + server = 'http://mibig.secondarymetabolites.org' + filename = "mibig_gbk_%s.tar.gz" % version + url = urlparse.urljoin(server, filename) + + with tempfile.NamedTemporaryFile(delete=True) as f: + u = urllib2.urlopen(url) + f.write(u.read()) + f.file.flush() + tar = tarfile.open(f.name) + tar.extractall(path=outputdir) + tar.close() + + # MIBiG was packed with strange files ._*gbk. Let's remove it + for f in [f for f in os.listdir(outputdir) if f[:2] == '._']: + os.remove(os.path.join(outputdir, f)) + +#def gbk2tablegen(gb_file, strain_id=None): +#def cds_from_gbk(gb_file, strain_id=None): +def cds_from_gbk(gb_file): + gb_record = SeqIO.read(open(gb_file,"rU"), "genbank") + + #if strain_id is not None: + # gb_record.id = strain_id + + output = pd.DataFrame() + sign = lambda x: '+' if x > 0 else '-' + for feature in gb_record.features: + if feature.type == "CDS": + tmp = {} + tmp = {'BGC': gb_record.id, + 'locus_tag': feature.qualifiers['locus_tag'][0], + 'start': feature.location.start.position, + 'stop': feature.location.end.position, + 'strand': sign(feature.location.strand) } + if 'note' in feature.qualifiers: + for note in feature.qualifiers['note']: + product = re.search( r"""smCOG: \s (?P.*?) \s+ \(Score: \s* (?P.*); \s* E-value: \s (?P.*?)\);""", note, re.VERBOSE) + if product is not None: + product = product.groupdict() + product['score'] = float(product['score']) + product['e_value'] = float(product['e_value']) + for p in product: + tmp[p] = product[p] + output = output.append(pd.Series(tmp), ignore_index=True) + return output + + +def find_category_from_product(df): + subcluster = json.loads( + pkg_resources.resource_string( + __name__, 'subcluster_dictionary.json')) + def get_category(product): + for s in subcluster: + if re.search(s, product): + return subcluster[s] + return 'hypothetical' + + idx = df['product'].notnull() + df['category'] = df.loc[idx, 'product'].apply(get_category) + df['category'].fillna('hypothetical', inplace=True) + return df + + +def get_hits(filename, criteria='cum_BLAST_score'): + """ + + Reproduces original Tiago's code: table_1_extender.py + + In the future allow different criteria. Right now it takes + from the very first block, which has the highest Cumulative + BLAST. + """ + with open(filename) as f: + df = antiSMASH_to_dataFrame(f.read()) + + df.dropna(subset=['query_gene'], inplace=True) + df.sort_values(by=criteria, ascending=False, na_position='last', + inplace=True) + return df.groupby('query_gene', as_index=False).first() diff --git a/BioCompass/cli.py b/BioCompass/cli.py index df4db02..02a28d5 100644 --- a/BioCompass/cli.py +++ b/BioCompass/cli.py @@ -21,7 +21,7 @@ def downloadHits(mgbfile, outputdir): @main.command(name="download-MIBiG") @click.option('--outputdir', default='./', type=click.Path(exists=True), help="Path to save the MIBig genbank files.") -@click.option('--version', type=unicode, default='1.3', +@click.option('--version', type=str, default='1.3', help="Version of MIBiG to download.") def downloadMIBiG(outputdir, version): """Download MIBiG gbk database.""" diff --git a/BioCompass/cli.py.bak b/BioCompass/cli.py.bak new file mode 100644 index 0000000..df4db02 --- /dev/null +++ b/BioCompass/cli.py.bak @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- + +import click + +from .BioCompass import download_hits +from .BioCompass import download_mibig + +@click.group() +def main(): + pass + +@main.command(name="download-hits") +@click.option('--outputdir', default='./', type=click.Path(exists=True), + help="Path to save the NCBI clusters.") +@click.argument('mgbfile', type=click.Path(exists=True)) + #help="Multigeneblast file containing NCBI references to be downloaded.") +def downloadHits(mgbfile, outputdir): + """Download NCBI clusters listed t in multigeneblast file.""" + download_hits(mgbfile, outputdir) + +@main.command(name="download-MIBiG") +@click.option('--outputdir', default='./', type=click.Path(exists=True), + help="Path to save the MIBig genbank files.") +@click.option('--version', type=unicode, default='1.3', + help="Version of MIBiG to download.") +def downloadMIBiG(outputdir, version): + """Download MIBiG gbk database.""" + download_mibig(outputdir, version=version) + +if __name__ == "__main__": + main() diff --git a/BioCompass/edges_gen.py b/BioCompass/edges_gen.py index e1c3e78..3d05ea7 100644 --- a/BioCompass/edges_gen.py +++ b/BioCompass/edges_gen.py @@ -21,7 +21,7 @@ for index, row in table2_df.iterrows(): file_name = './%s_%s_%s/clusterblast_output.txt'%(table2_df.BGC.loc[index],itineration,table2_df.subcluster.loc[index]) if os.path.isfile(file_name): - with open(file_name).xreadlines() as f: + with open(file_name) as f: for line in f: hit = re.search(r'^(\d*). (\S*)_(\S*)$',line) matching_genes = re.search(r'^Number of proteins with BLAST hits to this cluster: (\d*)$', line) diff --git a/BioCompass/edges_gen.py.bak b/BioCompass/edges_gen.py.bak new file mode 100644 index 0000000..e1c3e78 --- /dev/null +++ b/BioCompass/edges_gen.py.bak @@ -0,0 +1,63 @@ +import pandas as pd +import re +import os.path +import itertools +from sys import argv + +script, cluster_name, itineration = argv + +table2_df = pd.read_csv('../tables/%s_table2_%s.csv'%(cluster_name,itineration), sep='\t') + +col1 = [] +col2 = [] +col3 = [] +col4 = [] +col5 = [] +col6 = [] +col7 = [] +col8 = [] +col9 = [] + +for index, row in table2_df.iterrows(): + file_name = './%s_%s_%s/clusterblast_output.txt'%(table2_df.BGC.loc[index],itineration,table2_df.subcluster.loc[index]) + if os.path.isfile(file_name): + with open(file_name).xreadlines() as f: + for line in f: + hit = re.search(r'^(\d*). (\S*)_(\S*)$',line) + matching_genes = re.search(r'^Number of proteins with BLAST hits to this cluster: (\d*)$', line) + mgb_score = re.search(r'^MultiGeneBlast score: (\d*).(\d*)$', line) + blast_score = re.search(r'^Cumulative Blast bit score: (\d*)$',line) + if hit != None: + col1.append('%s'%table2_df.BGC.loc[index]) + col2.append('%s'%table2_df.subcluster.loc[index]) + col3.append('%s'%table2_df.category.loc[index]) + col4.append('%s'%table2_df.CDSs.loc[index]) + col5.append(hit.group(2)) + col9.append('%s'%itineration) + if mgb_score != None: + col6.append('%s.%s'%mgb_score.group(1,2)) + if blast_score != None: + col7.append(blast_score.group(1)) + if matching_genes != None: + perc_matching = int(matching_genes.group(1))*100/(table2_df.CDSs.loc[index]) + col8.append(perc_matching) + + else: + col1.append('%s'%table2_df.BGC.loc[index]) + col2.append('%s'%table2_df.subcluster.loc[index]) + col3.append('%s'%table2_df.category.loc[index]) + col4.append('%s'%table2_df.CDSs.loc[index]) + col5.append('%s'%table2_df.BGC.loc[index]) + col6.append('NA') + col7.append('NA') + col8.append('NA') + col9.append('%s'%itineration) + +frames = {'BGC':col1, 'BGC_subcluster':col2, 'BGC_subcluster_category':col3, 'BGC_subcluster_genes':col4, 'BLAST_hit':col5, 'MultiGeneBlast_score':col6, 'BLAST_score':col7, 'percent_matching_genes':col8, 'BGC_iteration':col9} + +edges_df = pd.DataFrame(frames) + +edges_handle = open('%s_edges_%s.txt'%(cluster_name,itineration), "w") +edges_df.to_csv(edges_handle, sep='\t', index=False) +edges_handle.close() + diff --git a/BioCompass/filter_edges.py b/BioCompass/filter_edges.py index 42122fe..200976b 100644 --- a/BioCompass/filter_edges.py +++ b/BioCompass/filter_edges.py @@ -9,9 +9,9 @@ while True: try: - per_genes = int(raw_input("\n What minimum percentage of matching genes would you like [from 0 to 100]? ")) - blast_score = int(raw_input("\n What minimum cumulative BLAST score would you like [from %s to %s]? "%(edges_final.BLAST_score.min(),edges_final.BLAST_score.max()))) - mgb_score = int(raw_input("\n What minimum multigeneBLAST score [from %s to %s]? "%(edges_final.MultiGeneBlast_score.min(),edges_final.MultiGeneBlast_score.max()))) + per_genes = int(input("\n What minimum percentage of matching genes would you like [from 0 to 100]? ")) + blast_score = int(input("\n What minimum cumulative BLAST score would you like [from %s to %s]? "%(edges_final.BLAST_score.min(),edges_final.BLAST_score.max()))) + mgb_score = int(input("\n What minimum multigeneBLAST score [from %s to %s]? "%(edges_final.MultiGeneBlast_score.min(),edges_final.MultiGeneBlast_score.max()))) break except ValueError: print("Sorry, I didn't understand that. Try again using INTERGERS ONLY.") @@ -40,7 +40,7 @@ edges_final = edges_final.drop(edges_final.index[indexes_to_drop]) while True: - self_loop = raw_input("\n Would you like to include subclusters with no matches?[Yes or No only!] ") + self_loop = input("\n Would you like to include subclusters with no matches?[Yes or No only!] ") if self_loop == "Yes" or self_loop == "yes": edges_final.MultiGeneBlast_score.replace(to_replace='NaN', value=mgb_score, inplace=True) edges_final.BLAST_score.replace(to_replace='NaN', value=blast_score, inplace=True) diff --git a/BioCompass/filter_edges.py.bak b/BioCompass/filter_edges.py.bak new file mode 100644 index 0000000..42122fe --- /dev/null +++ b/BioCompass/filter_edges.py.bak @@ -0,0 +1,57 @@ +import pandas as pd +import itertools +from sys import argv + +script, strain_name = argv + +edges_final = pd.read_csv('%s_edges_best_itineration.txt'%strain_name, sep='\t') + + +while True: + try: + per_genes = int(raw_input("\n What minimum percentage of matching genes would you like [from 0 to 100]? ")) + blast_score = int(raw_input("\n What minimum cumulative BLAST score would you like [from %s to %s]? "%(edges_final.BLAST_score.min(),edges_final.BLAST_score.max()))) + mgb_score = int(raw_input("\n What minimum multigeneBLAST score [from %s to %s]? "%(edges_final.MultiGeneBlast_score.min(),edges_final.MultiGeneBlast_score.max()))) + break + except ValueError: + print("Sorry, I didn't understand that. Try again using INTERGERS ONLY.") + continue + else: + break + +seen = [] +indexes_to_drop = [] + +for index,row in edges_final.iterrows(): + query = '%s_%s'%(edges_final.BGC.loc[index],edges_final.BGC_subcluster.loc[index]) + if edges_final.percent_matching_genes.loc[index] >= per_genes and edges_final.BLAST_score.loc[index] >= blast_score and edges_final.MultiGeneBlast_score.loc[index] >= mgb_score: + seen.append(query) + continue + else: + if query in seen: + indexes_to_drop.append(index) + else: + seen.append(query) + edges_final.set_value(index,'BLAST_hit','%s'%edges_final.BGC.loc[index]) + edges_final.set_value(index,'percent_matching_genes',None) + edges_final.set_value(index,'BLAST_score',None) + edges_final.set_value(index,'MultiGeneBlast_score',None) + +edges_final = edges_final.drop(edges_final.index[indexes_to_drop]) + +while True: + self_loop = raw_input("\n Would you like to include subclusters with no matches?[Yes or No only!] ") + if self_loop == "Yes" or self_loop == "yes": + edges_final.MultiGeneBlast_score.replace(to_replace='NaN', value=mgb_score, inplace=True) + edges_final.BLAST_score.replace(to_replace='NaN', value=blast_score, inplace=True) + edges_final.percent_matching_genes.replace(to_replace='NaN', value=100, inplace=True) + break + elif self_loop == "No" or self_loop == "no": + break + else: + print("Sorry, answer Yes or No only") + continue + +edges_final_handle = open('%s_edges_filtered.txt'%strain_name, "w") +edges_final.to_csv(edges_final_handle, sep='\t', index=False) +edges_final_handle.close() \ No newline at end of file diff --git a/BioCompass/subcluster_gen.py b/BioCompass/subcluster_gen.py index 1ba90fa..5971273 100644 --- a/BioCompass/subcluster_gen.py +++ b/BioCompass/subcluster_gen.py @@ -67,7 +67,7 @@ def parse_db(db): D = defaultdict(list) for i,item in enumerate(db): D[item].append(i) - D = {k:v for k,v in D.items() if len(v)>1} + D = {k:v for k,v in list(D.items()) if len(v)>1} return D def find_category(categories,col5): @@ -97,9 +97,9 @@ def find_category(categories,col5): col3 = [] col4 = [] col5 = [] - for key, value in subcluster_dict.iteritems(): + for key, value in subcluster_dict.items(): col1.append(strain_name) - col2.append(string.ascii_uppercase[subcluster_dict.keys().index(key)]) + col2.append(string.ascii_uppercase[list(subcluster_dict.keys()).index(key)]) col3.append(len(value)) categories = [] genes = [] diff --git a/BioCompass/subcluster_gen.py.bak b/BioCompass/subcluster_gen.py.bak new file mode 100644 index 0000000..1ba90fa --- /dev/null +++ b/BioCompass/subcluster_gen.py.bak @@ -0,0 +1,115 @@ +from collections import defaultdict +from sys import argv +import pandas as pd +import re +import string +import numpy as np +from sklearn.cluster import DBSCAN +from collections import Counter + +script, strain_name = argv + +table1_df = pd.read_csv('%s_table1.csv' % strain_name, sep='\t') +table1_df['product'].fillna('None', inplace=True) + +#This first portion will create the distance matrix + +def score_match(table, index_gene1, index_gene2): + score = 0 + gene1_category = table1_df.category.loc[index_gene1] + gene2_category = table1_df.category.loc[index_gene2] + if gene1_category == 'hypothetical' or gene2_category == 'hypothetical': + if gene1_category == gene2_category: + score = score + 1 + elif gene1_category != gene2_category: + score = score + 2 + else: + if gene1_category == gene2_category: + score = score + 5 + gene1_best_BGC = table1_df.best_hit_BGC.loc[index_gene1] + gene2_best_BGC = table1_df.best_hit_BGC.loc[index_gene2] + if gene1_best_BGC != 'None' and gene2_best_BGC != 'None': + if gene1_best_BGC == gene2_best_BGC: + score = score + 2 + gene1_best_hit_pos = re.search(r'^\D*([0-9]*)',table1_df.best_hit_gene_loc.loc[index_gene1]) + gene2_best_hit_pos = re.search(r'^\D*([0-9]*)',table1_df.best_hit_gene_loc.loc[index_gene2]) + dif_best_hit_pos = abs(abs((int(gene2_best_hit_pos.group(1)) - int(gene1_best_hit_pos.group(1)))) - abs((index_gene2-index_gene1))) + if dif_best_hit_pos == 0: + score = score + 3 + elif dif_best_hit_pos == 1: + score = score + 2 + elif dif_best_hit_pos == 2: + score = score + 1 + else: + score = score + 1 + return score + +for index,row in table1_df.iterrows(): + scores = [] + for gene in range(0,len(table1_df)): + scores.append(score_match(table1_df,gene,index)) + if index == 0: + A = np.vstack([scores]) + else: + A = np.vstack([A,scores]) + +#This second portion will run dbscan to create a subclusters possibilities + +def repeated(db_arrays,db): + for i in range(0,len(db_arrays)-1): + if np.array_equal(db_arrays[i],db) == False: + continue + else: + return True + break + +def parse_db(db): + D = defaultdict(list) + for i,item in enumerate(db): + D[item].append(i) + D = {k:v for k,v in D.items() if len(v)>1} + return D + +def find_category(categories,col5): + if 'biosynthetic' in categories: + col5.append('biosynthetic') + else: + if len(categories) > 1: + category = re.search(r'^\[\(\'(\S*)\'',str(Counter(categories).most_common(1))).group(1) + col5.append('%s'%category) + else: + col5.append('%s'%categories[0]) + +count = 0 + +for itn in range(1,len(A)): + db = DBSCAN(eps=itn, min_samples=2).fit_predict(A) + if itn == 1: + db_arrays = np.vstack([db]) + else: + db_arrays = np.vstack([db_arrays,db]) + if repeated(db_arrays,db) == True: + continue + else: + subcluster_dict = parse_db(db) + col1 = [] + col2 = [] + col3 = [] + col4 = [] + col5 = [] + for key, value in subcluster_dict.iteritems(): + col1.append(strain_name) + col2.append(string.ascii_uppercase[subcluster_dict.keys().index(key)]) + col3.append(len(value)) + categories = [] + genes = [] + for item in value: + categories.append(table1_df.category.loc[item]) + genes.append(table1_df.locus_tag.loc[item]) + genes = ','.join(genes) + col4.append(genes) + find_category(categories,col5) + frames = {'BGC':col1, 'subcluster':col2, 'CDSs':col3, 'loci':col4, 'category':col5} + count = count + 1 + table2_df = pd.DataFrame(frames, index=None) + table2_df.to_csv('%s_table2_%d.csv' % (strain_name,count), sep='\t', index=False) diff --git a/BioCompass/table_1_extender.py b/BioCompass/table_1_extender.py index ef12199..13613b7 100644 --- a/BioCompass/table_1_extender.py +++ b/BioCompass/table_1_extender.py @@ -10,13 +10,13 @@ table1_df = pd.read_csv('../outputs/tables/%s_%s_table1.csv'%(strain_name,cluster_number), sep='\t') def find_boarders(file_name): - with open(file_name).xreadlines() as f: + with open(file_name) as f: for num, line in enumerate(f): header = re.search(r'^(\d*). (\S*)_(\S*)$',line) if header != None and num not in numbers: numbers.append(num) headers.append(header.group()) - temp_dict = dict(zip(numbers, headers)) + temp_dict = dict(list(zip(numbers, headers))) return temp_dict def itinerate_temp_file(file_name,temp_dict): @@ -38,7 +38,7 @@ def find_best_hits(table1_df): """ For each ctg1_* @ table1_df['locus_tag'], get """ for item in table1_df['locus_tag']: - with open("temp.txt", "r").xreadlines() as tf: + with open("temp.txt", "r") as tf: for line in tf: # query gene, subject gene, %identity, blast score, %coverage, e-value # e.g line <- 'ctg1_160\tSCATT_35940\t69\t169\t100.0\t2.3e-40\t\n' @@ -70,11 +70,11 @@ def find_best_hits(table1_df): col9 = [] col10 = [] col11 = [] - it = iter(sorted(temp_dict.iteritems())) - i = it.next() + it = iter(sorted(temp_dict.items())) + i = next(it) # e.g. i <- (139, '1. CP003219_c13') if len(temp_dict) > 1: - j = it.next() + j = next(it) while True: if i[0] == sorted(temp_dict)[-1]: itinerate_temp_file(file_name,temp_dict) @@ -85,7 +85,7 @@ def find_best_hits(table1_df): find_best_hits(table1_df) i = j if j[0] != sorted(temp_dict)[-1]: - j = it.next() + j = next(it) """ ctg1_160 SCATT_35940 69 169 100.0 2.3e-40 hence: @@ -94,7 +94,7 @@ def find_best_hits(table1_df): """ col12 = [] for item in col9: - with open(file_name, "r").xreadlines() as tf: + with open(file_name, "r") as tf: seen = [] for line in tf: best_hit_loc = re.search(r'^%s\t(\S*)\t(.*)'%item,line) diff --git a/BioCompass/table_1_extender.py.bak b/BioCompass/table_1_extender.py.bak new file mode 100644 index 0000000..ef12199 --- /dev/null +++ b/BioCompass/table_1_extender.py.bak @@ -0,0 +1,126 @@ +from sys import argv +from Bio import SeqIO +import pandas as pd +import re +import itertools +import os.path + +script, strain_name, cluster_number = argv + +table1_df = pd.read_csv('../outputs/tables/%s_%s_table1.csv'%(strain_name,cluster_number), sep='\t') + +def find_boarders(file_name): + with open(file_name).xreadlines() as f: + for num, line in enumerate(f): + header = re.search(r'^(\d*). (\S*)_(\S*)$',line) + if header != None and num not in numbers: + numbers.append(num) + headers.append(header.group()) + temp_dict = dict(zip(numbers, headers)) + return temp_dict + +def itinerate_temp_file(file_name,temp_dict): + """ Extract from file_name a section defined by i into temp.txt""" + with open(file_name) as fp: + temp_file = open("temp.txt", "w") + for num, line in enumerate(fp): + # Is not the last one + if i[0] != sorted(temp_dict)[-1]: + if num >= i[0] and num < j[0]: + temp_file.writelines(line) + else: + if num >= i[0]: + temp_file.writelines(line) + temp_file.close() + + +def find_best_hits(table1_df): + """ For each ctg1_* @ table1_df['locus_tag'], get + """ + for item in table1_df['locus_tag']: + with open("temp.txt", "r").xreadlines() as tf: + for line in tf: + # query gene, subject gene, %identity, blast score, %coverage, e-value + # e.g line <- 'ctg1_160\tSCATT_35940\t69\t169\t100.0\t2.3e-40\t\n' + best_hit = re.search(r'^%s\t(\S*)\t(\d*)\t(\d*)\t(\d*)'%item,line) + if best_hit and item not in col7: + # e.g. ctg1_160 + col7.append(item) + # e.g i[1] <- '1. CP003219_c13' + hit = re.search(r'^(\d*). (\S*)_(\S*)',i[1]) + # e.g. 'CP003219' + col8.append(hit.group(2)) + # e.g. 'SCATT_35940' + col9.append(best_hit.group(1)) + # e.g. '69' + col10.append(best_hit.group(2)) + # e.g. '100' + col11.append(best_hit.group(4)) + +#find boarders for temp_file +short_cluster_number = re.search(r'0*([0-9]*)',cluster_number).group(1) +file_name = '../antiSMASH_input/%s/clusterblast/cluster%s.txt' % (strain_name,short_cluster_number) +numbers = [] +headers = [] +temp_dict = [] +if os.path.isfile(file_name): + temp_dict = find_boarders(file_name) + col7 = [] + col8 = [] + col9 = [] + col10 = [] + col11 = [] + it = iter(sorted(temp_dict.iteritems())) + i = it.next() + # e.g. i <- (139, '1. CP003219_c13') + if len(temp_dict) > 1: + j = it.next() + while True: + if i[0] == sorted(temp_dict)[-1]: + itinerate_temp_file(file_name,temp_dict) + find_best_hits(table1_df) + break + else: + itinerate_temp_file(file_name,temp_dict) + find_best_hits(table1_df) + i = j + if j[0] != sorted(temp_dict)[-1]: + j = it.next() + """ + ctg1_160 SCATT_35940 69 169 100.0 2.3e-40 + hence: + best_hit_loc -> + SCATT_35940 AEW95965 3878018 3878386 + 50S_ribosomal_protein_L14 + """ + col12 = [] + for item in col9: + with open(file_name, "r").xreadlines() as tf: + seen = [] + for line in tf: + best_hit_loc = re.search(r'^%s\t(\S*)\t(.*)'%item,line) + if best_hit_loc and item not in seen: + col12.append(best_hit_loc.group(1)) + seen.append(item) + frames = {'locus_tag':col7,'best_hit_BGC':col8,'best_hit_gene':col9,'best_hit_%id':col10,'best_hit_%cov':col11,'best_hit_gene_loc':col12} + new_cols_df = pd.DataFrame(frames, index=None) + table1_df = pd.merge(table1_df, new_cols_df, on='locus_tag', how='outer') + table1_df.fillna('None', inplace=True) +else: + col7 = [] + col8 = [] + for item in table1_df['locus_tag']: + col7.append(item) + col8.append('None') + frames = {'locus_tag':col7,'best_hit_BGC':col8} + new_cols_df = pd.DataFrame(frames, index=None) + table1_df = pd.merge(table1_df, new_cols_df, on='locus_tag', how='outer') + + +table1_handle = pd.HDFStore('../outputs/tables/table1.h5') +table1_handle['%s_%s' % (strain_name,cluster_number)] = table1_df +table1_handle.close() + + +table1_handle = open('../outputs/tables/%s_%s_table1.csv'%(strain_name,cluster_number), "w") +table1_df.to_csv(table1_handle, sep='\t', index=False) +table1_handle.close()