Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149,703 changes: 149,703 additions & 0 deletions python/PSI-MOD.obo.xml

Large diffs are not rendered by default.

745 changes: 461 additions & 284 deletions python/proBAM.py

Large diffs are not rendered by default.

Binary file modified python/proBAM_ENSEMBL.pyc
Binary file not shown.
Binary file modified python/proBAM_IDparser.pyc
Binary file not shown.
212 changes: 117 additions & 95 deletions python/proBAM_biomart.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,59 +17,72 @@

from __future__ import division

from bioservices import BioMart

__author__ = 'Volodimir Olexiouk'

from bioservices import BioMart

#
#Function that links the correct database archive with version number
#
def _get_ensembl_archive_(version,species):
def _get_ensembl_archive_(version, species):
'''
:param version: Ensembl version
:return: ENSEMBL repository for a specific version
'''
version=int(version)
if species=='arabidopsis_thaliana':
version = int(version)
if species == 'arabidopsis_thaliana':
return 'plants.ensembl.org'
else:
d={}
d[89]="www.ensembl.org"
d[88]="mar2017.archive.ensembl.org"
d[87]="dec2016.archive.ensembl.org"
d[86]="oct2016.archive.ensembl.org"
d[85]="jul2016.archive.ensembl.org"
d[84]="mar2016.archive.ensembl.org"
d[83]="dec2015.archive.ensembl.org"
d[82]="sep2015.archive.ensembl.org"
d[81]="jul2015.archive.ensembl.org"
d[80]="may2015.archive.ensembl.org"
d[79]="mar2015.archive.ensembl.org"
d[78]="dec2014.archive.ensembl.org"
d[77]="oct2014.archive.ensembl.org"
d[76]="aug2014.archive.ensembl.org"
d[75]="feb2014.archive.ensembl.org"
d[74]="dec2013.archive.ensembl.org"
d[73]="sep2013.archive.ensembl.org"
d[72]="jun2013.archive.ensembl.org"
d[71]="apr2013.archive.ensembl.org"
d[70]="jan2013.archive.ensembl.org"
d[69]="oct2012.archive.ensembl.org"
d[68]="jul2012.archive.ensembl.org"
d[67]="may2012.archive.ensembl.org"
d[66]="feb2012.archive.ensembl.org"
d[65]="dec2011.archive.ensembl.org"
d[64]="sep2011.archive.ensembl.org"
d[63]="jun2011.archive.ensembl.org"
d[62]="apr2011.archive.ensembl.org"
d[61]="feb2011.archive.ensembl.org"
d[60]="nov2010.archive.ensembl.org"
d[59]="aug2010.archive.ensembl.org"
d[58]="may2010.archive.ensembl.org"
d[57]="mar2010.archive.ensembl.org"
d[56]="sep2009.archive.ensembl.org"
d[55]="jul2009.archive.ensembl.org"
d[54]="may2009.archive.ensembl.org"
d = {}
d[101] = "www.ensembl.org"
d[100] = "apr2020.archive.ensembl.org"
d[99] = "jan2020.archive.ensembl.org"
d[98] = "sep2019.archive.ensembl.org"
d[97] = "jul2019.archive.ensembl.org"
d[96] = "apr2019.archive.ensembl.org"
d[95] = "jan2019.archive.ensembl.org"
d[94] = "oct2018.archive.ensembl.org"
d[93] = "jul2018.archive.ensembl.org"
d[92] = "apr2018.archive.ensembl.org"
d[91] = "dec2017.archive.ensembl.org"
d[90] = "aug2017.archive.ensembl.org"
d[89] = "may2017.archive.ensembl.org"
d[88] = "mar2017.archive.ensembl.org"
d[87] = "dec2016.archive.ensembl.org"
d[86] = "oct2016.archive.ensembl.org"
d[85] = "jul2016.archive.ensembl.org"
d[84] = "mar2016.archive.ensembl.org"
d[83] = "dec2015.archive.ensembl.org"
d[82] = "sep2015.archive.ensembl.org"
d[81] = "jul2015.archive.ensembl.org"
d[80] = "may2015.archive.ensembl.org"
d[79] = "mar2015.archive.ensembl.org"
d[78] = "dec2014.archive.ensembl.org"
d[77] = "oct2014.archive.ensembl.org"
d[76] = "aug2014.archive.ensembl.org"
d[75] = "feb2014.archive.ensembl.org"
d[74] = "dec2013.archive.ensembl.org"
d[73] = "sep2013.archive.ensembl.org"
d[72] = "jun2013.archive.ensembl.org"
d[71] = "apr2013.archive.ensembl.org"
d[70] = "jan2013.archive.ensembl.org"
d[69] = "oct2012.archive.ensembl.org"
d[68] = "jul2012.archive.ensembl.org"
d[67] = "may2012.archive.ensembl.org"
d[66] = "feb2012.archive.ensembl.org"
d[65] = "dec2011.archive.ensembl.org"
d[64] = "sep2011.archive.ensembl.org"
d[63] = "jun2011.archive.ensembl.org"
d[62] = "apr2011.archive.ensembl.org"
d[61] = "feb2011.archive.ensembl.org"
d[60] = "nov2010.archive.ensembl.org"
d[59] = "aug2010.archive.ensembl.org"
d[58] = "may2010.archive.ensembl.org"
d[57] = "mar2010.archive.ensembl.org"
d[56] = "sep2009.archive.ensembl.org"
d[55] = "jul2009.archive.ensembl.org"
d[54] = "may2009.archive.ensembl.org"
if version in d:
return d[version]
else:
Expand All @@ -84,19 +97,21 @@ def _get_ensembl_dataset_(species):
:param species: full species name
:return: ensembl species name
'''
d={}
d['homo_sapiens']='hsapiens_gene_ensembl'
d['mus_musculus']='mmusculus_gene_ensembl'
d['drosophila_melanogaster']='dmelanogaster_gene_ensembl'
d['danio_rerio']='drerio_gene_ensembl'
d['arabidopsis_thaliana']='athaliana_eg_gene'
d = {}
d['homo_sapiens'] = 'hsapiens_gene_ensembl'
d['mus_musculus'] = 'mmusculus_gene_ensembl'
d['drosophila_melanogaster'] = 'dmelanogaster_gene_ensembl'
d['danio_rerio'] = 'drerio_gene_ensembl'
d['arabidopsis_thaliana'] = 'athaliana_eg_gene'

if species not in d:
print 'Error: unsupported species'
print 'Currently supported species:'
print d.keys()
raise ValueError('unsupported species')
return d[species]


#
# Function to create XML readable transcript_id query string
#
Expand All @@ -105,16 +120,18 @@ def _id_in_xml_query_(transcipt_id):
:param transcipt_id: list of transcrip IDs
:return: XML readable transcript ID query string
'''
query=""
query = ""
for tr in transcipt_id:
query+=(str(tr)+",")
query=query[:-1]
query += (str(tr) + ",")
query = query[:-1]
return query


#
# Function that retrieves cds,strand,chr and ensembl_transcript_id from BioMart
#
def retrieve_data_from_biomart(version, species, transcript_id,
                               three_frame_translation):
    '''
    Query BioMart for transcript id, chromosome name, strand and sequence.

    :param version: Ensembl version, resolved to a host name through
        _get_ensembl_archive_
    :param species: full species name
    :param transcript_id: transcript IDs used as the
        "ensembl_transcript_id" filter
    :param three_frame_translation: "Y" requests the "cdna" attribute,
        any other value requests "coding"
    :return: BioMart response text split on newlines
    '''
    # resolve inputs and open the connection
    id_filter = _id_in_xml_query_(transcript_id)
    host = _get_ensembl_archive_(version, species)
    dataset = _get_ensembl_dataset_(species)
    mart = BioMart(host=host)

    # dataset and filter
    mart.add_dataset_to_xml(dataset)
    mart.add_filter_to_xml("ensembl_transcript_id", id_filter)

    # requested columns, in output order
    sequence_attribute = "cdna" if three_frame_translation == "Y" else "coding"
    for attribute in ('ensembl_transcript_id', "chromosome_name", "strand",
                      sequence_attribute):
        mart.add_attribute_to_xml(attribute)

    # NOTE(review): the returned attribute listing is never used here; the
    # call is kept because it may validate the dataset - confirm before
    # removing it.
    mart.attributes(dataset)

    xml_query = mart.get_xml()

    # the plants mart is addressed through a different virtual schema
    if species == "arabidopsis_thaliana":
        xml_query = xml_query.replace('virtualSchemaName = "default"',
                                      'virtualSchemaName = "plants_mart_30"')

    return mart.query(xml_query).split("\n")


#
# Function that maps identifiers to Ensembl
#
def id_map_ensembl(to_annotation, version, species, psm_protein_id):
    '''
    Map protein identifiers to Ensembl transcripts through BioMart.

    :param to_annotation: identifier annotation used as BioMart filter and
        attribute (i.e. uniprot_swissprot)
    :param version: Ensembl database version
    :param species: full species name
    :param psm_protein_id: protein IDs to convert
    :return: for "arabidopsis_thaliana", whatever id_map_ensembl_plants
        returns; otherwise a list with one entry per response line:
        [transcript_id, transcript_end - transcript_start, matched_id]
        for lines with a non-empty first column, and the split line itself
        (e.g. ['']) for the others
    '''
    # If species is in plantsDB, execute plants adjusted function
    if species == "arabidopsis_thaliana":
        return id_map_ensembl_plants(to_annotation, version, species,
                                     psm_protein_id)

    # adjust UniProt xml annotation for BioMart version >87
    if int(version) > 87 and "uniprot" in to_annotation:
        to_annotation = to_annotation.replace('_', '')

    # create connection
    query_string = _id_in_xml_query_(psm_protein_id)
    host = _get_ensembl_archive_(version, species)
    dataset = _get_ensembl_dataset_(species)
    biomart = BioMart(host=host)

    # add filters
    biomart.add_dataset_to_xml(dataset)
    biomart.add_filter_to_xml(to_annotation, query_string)

    # add attributes (this order fixes the column order parsed below)
    biomart.add_attribute_to_xml("ensembl_transcript_id")
    biomart.add_attribute_to_xml("transcript_start")
    biomart.add_attribute_to_xml("transcript_end")
    biomart.add_attribute_to_xml(to_annotation)

    # NOTE(review): the returned attribute listing is never used; the call
    # is kept because it may validate the dataset - confirm before removing.
    biomart.attributes(dataset)

    # execute query
    raw_result = biomart.query(biomart.get_xml())
    # NOTE(review): this tests the length of the raw response *string*;
    # presumably a one-character response means "no hits" - confirm.
    if len(raw_result) == 1:
        # Parenthesised single-argument print is valid on Python 2 and 3.
        print("ERROR: could not convert ID's trough BioMart, "
              "Please check whether Ensembl version/species were correctly supplied")

    # str.split always yields at least one element, so the former
    # "tmp_result != []" guard was always true and has been dropped.
    result = []
    for line in raw_result.split("\n"):
        fields = line.split("\t")
        if fields[0] != "":
            # NOTE(review): id_map_ensembl_plants computes end - start + 1;
            # this branch omits the + 1 - confirm which length is intended.
            result.append([fields[0],
                           int(fields[2]) - int(fields[1]),
                           fields[3]])
        else:
            result.append(fields)
    return result


#
# Function that maps identifiers to Ensembl, adjusted for plant DB compatibility
#

def id_map_ensembl_plants(to_annotation,version,species,psm_protein_id):

def id_map_ensembl_plants(to_annotation, version, species, psm_protein_id):
'''
:param to_annotation: to which annotation
:param version: ensembl version
Expand All @@ -222,14 +244,14 @@ def id_map_ensembl_plants(to_annotation,version,species,psm_protein_id):
:return: list of protein ID's converted to ENSEMBL
'''
#create connection
query_string=_id_in_xml_query_(psm_protein_id)
version=_get_ensembl_archive_(version,species)
dataset=_get_ensembl_dataset_(species)
query_string = _id_in_xml_query_(psm_protein_id)
version = _get_ensembl_archive_(version, species)
dataset = _get_ensembl_dataset_(species)
biomart = BioMart(host=version)

#add filters
biomart.add_dataset_to_xml(dataset)
biomart.add_filter_to_xml(to_annotation+"_accession",query_string)
biomart.add_filter_to_xml(to_annotation + "_accession", query_string)

#add attributs
biomart.add_attribute_to_xml("ensembl_transcript_id")
Expand All @@ -238,17 +260,17 @@ def id_map_ensembl_plants(to_annotation,version,species,psm_protein_id):
biomart.add_attribute_to_xml("transcript_end")

#execute query
xml_query=biomart.get_xml()
xml_query=xml_query.replace('virtualSchemaName = "default"','virtualSchemaName = "plants_mart_30"')
xml_query = biomart.get_xml()
xml_query = xml_query.replace('virtualSchemaName = "default"',
'virtualSchemaName = "plants_mart_30"')

#parse results and adjust length
temp_result=biomart.query(xml_query).split("\n")
result=[]
temp_result = biomart.query(xml_query).split("\n")
result = []
for row in temp_result:
items=row.split("\t")
items = row.split("\t")
# print row
if len(items)==4:
length=int(items[3])-int(items[1])+1
result.append(items[0]+"\t"+str(length)+"\t"+items[2])
if len(items) == 4:
length = int(items[3]) - int(items[1]) + 1
result.append(items[0] + "\t" + str(length) + "\t" + items[2])
return result

Binary file modified python/proBAM_biomart.pyc
Binary file not shown.
Binary file modified python/proBAM_coref.pyc
Binary file not shown.
40 changes: 20 additions & 20 deletions python/proBAM_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,32 +31,32 @@ def get_PSM_hash(psm_file,decoy_annotation,validated_only):
:return: dictionairy of parsed psm file,
'''
print "Reading PSM-file"
try:
# try:
# catch mzid file format and parse
if re.match('^.*\.(mzid)$',psm_file.lower())!=None:
PSM=proBAM_mzid.get_PSM_mzid(psm_file,validated_only)
if re.match('^.*\.(mzid)$',psm_file.lower())!=None:
PSM=proBAM_mzid.get_PSM_mzid(psm_file,validated_only)

# catch pepxml file format and parse
elif re.match('^.*\.(pepxml|pep.xml|xml)$',psm_file.lower())!=None:
PSM=proBAM_pepxml.get_PSM_pepxml(psm_file)
# catch pepxml file format and parse
elif re.match('^.*\.(pepxml|pep.xml|xml)$',psm_file.lower())!=None:
PSM=proBAM_pepxml.get_PSM_pepxml(psm_file)


# catch mztab file format and parse
elif re.match('^.*\.(mztab)$',psm_file.lower())!=None:
PSM=proBAM_mzTab.get_PSM_mztab(psm_file)
# catch mztab file format and parse
elif re.match('^.*\.(mztab)$',psm_file.lower())!=None:
PSM=proBAM_mzTab.get_PSM_mztab(psm_file)

else:
raise IOError('Unrecognized file extension, \n ' \
'Accepted file extensions: .mzid/.pepxml/.pep.xml/.xml')
else:
raise IOError('Unrecognized file extension, \n ' \
'Accepted file extensions: .mzid/.pepxml/.pep.xml/.xml')

except Exception as e:
print "ERROR: Unable to the PSM file : \n"
print e.__doc__
print e.message
print "\nPlease confirm that the file is conform with the document specification." \
"If the error keeps occuring contact the developers at https://github.com/Biobix/proBAMconvert/issues " \
"and supply this error message along with the file and used settings"
raise IOError()
# except Exception as e:
# print "ERROR: Unable to the PSM file : \n"
# print e.__doc__
# print e.message
# print "\nPlease confirm that the file is conform with the document specification." \
# "If the error keeps occuring contact the developers at https://github.com/Biobix/proBAMconvert/issues " \
# "and supply this error message along with the file and used settings"
# raise IOError()



Expand Down
Binary file modified python/proBAM_input.pyc
Binary file not shown.
Loading