From d1ddd83bd9ed0f84d7649acf01ef6f3b04ac41d1 Mon Sep 17 00:00:00 2001
From: "Ian M. B." <99409346+iPsychonaut@users.noreply.github.com>
Date: Thu, 13 Feb 2025 15:17:02 -0700
Subject: [PATCH 1/2] Create setup.py

Initial generation of setup.py for Python 3 packaging.
---
 setup.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 setup.py

diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..91f8ddd
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,44 @@
+from setuptools import setup
+import os
+
+# Read the long description from the README file.
+this_directory = os.path.abspath(os.path.dirname(__file__))
+with open(os.path.join(this_directory, "README.md"), encoding="utf-8") as fh:
+    long_description = fh.read()
+
+setup(
+    name="GFFtoolsGX",
+    version="0.0.1",
+    description="A collection of tools for converting genome annotation between GTF, BED, GenBank and GFF.",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    author="Vipin T Sreedharan",
+    author_email="vipin@cbio.mskcc.org",
+    url="https://github.com/iPsychonaut/GFFtools-GX",
+    # The importable code lives in top-level modules (there is no GFFtoolsGX
+    # package directory), so the modules are listed explicitly.
+    py_modules=[
+        "GFFParser",
+        "helper",
+    ],
+    # The converters expose their command line behaviour through __main__()
+    # functions or "if __name__" blocks, so they are installed as scripts.
+    scripts=[
+        "bed_to_gff.py",
+        "gbk_to_gff.py",
+        "gff_to_bed.py",
+        "gff_to_gtf.py",
+        "gtf_to_gff.py",
+    ],
+    install_requires=[
+        "biopython",
+        "numpy",
+    ],
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: BSD License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.6",
+)

From da582942318e4d06155d1d5dac845468c2c5f0fa Mon Sep 17 00:00:00 2001
From: iPsychonaut
Date: Thu, 13 Feb 2025 15:25:49 -0700
Subject: [PATCH 2/2] Update code for Python 3 compatibility

---
 GFFParser.py      |  16 +-
 GFFParser.py.bak  | 498 ++++++++++++++++++++++++++++++++++++++++++++++
 bed_to_gff.py     |   2 +-
 bed_to_gff.py.bak |  71 +++++++
 gbk_to_gff.py     |   8 +-
 gbk_to_gff.py.bak | 216 ++++++++++++++++++++
 gff_to_bed.py     |   6 +-
 gff_to_bed.py.bak | 116 +++++++++++
 gff_to_gtf.py     |   2 +-
 gff_to_gtf.py.bak |  79 ++++++++
 gtf_to_gff.py     |   2 +-
 gtf_to_gff.py.bak |  78 ++++++++
 12 files changed, 1076 insertions(+), 18 deletions(-)
 create mode 100644 GFFParser.py.bak
 create mode 100644 bed_to_gff.py.bak
 create mode 100644 gbk_to_gff.py.bak
 create mode 100644 gff_to_bed.py.bak
 create mode 100644 gff_to_gtf.py.bak
 create mode 100644 gtf_to_gff.py.bak

diff --git a/GFFParser.py b/GFFParser.py
index f85b274..78b40e7 100644
--- a/GFFParser.py
+++ b/GFFParser.py
@@ -14,7 +14,7 @@
 import re
 import os
 import sys
-import urllib
+import urllib.request, urllib.parse, urllib.error
 import numpy as np
 import helper as utils
 from collections import defaultdict
@@ -62,7 +62,7 @@ def attribute_tags(col9):
         # replace the double qoutes from feature identifier
         val = re.sub('"', '', val)
         # replace the web formating place holders to plain text format
-        info[key].extend([urllib.unquote(v) for v in val.split(',') if v])
+        info[key].extend([urllib.parse.unquote(v) for v in val.split(',') if v])
 
     return is_gff, info
 
@@ -87,7 +87,7 @@ def spec_features_keywd(gff_parts):
             pass
     ## TODO key words
     for flat_name in ["Transcript", "CDS"]:
-        if gff_parts["info"].has_key(flat_name):
+        if flat_name in gff_parts["info"]:
             # parents
             if gff_parts['type'] in [flat_name] or re.search(r'transcript', gff_parts['type'], re.IGNORECASE):
                 if not gff_parts['id']:
@@ -158,7 +158,7 @@ def Parse(ga_file):
gff_info = spec_features_keywd(gff_info) # link the feature relationships - if gff_info['info'].has_key('Parent'): + if 'Parent' in gff_info['info']: for p in gff_info['info']['Parent']: if p == gff_info['id']: gff_info['id'] = '' @@ -217,7 +217,7 @@ def format_gene_models(parent_nf_map, child_nf_map): g_cnt = 0 gene = np.zeros((len(parent_nf_map),), dtype = utils.init_gene()) - for pkey, pdet in parent_nf_map.items(): + for pkey, pdet in list(parent_nf_map.items()): # considering only gene features #if not re.search(r'gene', pdet.get('type', '')): # continue @@ -284,7 +284,7 @@ def format_gene_models(parent_nf_map, child_nf_map): # make general ascending order of coordinates if orient == '-': - for etype, excod in child_feat.items(): + for etype, excod in list(child_feat.items()): if len(excod) > 1: if excod[0][0] > excod[-1][0]: excod.reverse() @@ -412,7 +412,7 @@ def format_gene_models(parent_nf_map, child_nf_map): break if XPFLG==1: - XQC = range(XP, len(gene)+1) + XQC = list(range(XP, len(gene)+1)) gene = np.delete(gene, XQC) return gene @@ -445,7 +445,7 @@ def create_missing_feature_type(p_feat, c_feat): """ child_n_map = defaultdict(list) - for fid, det in c_feat.items(): + for fid, det in list(c_feat.items()): # get the details from grand child GID = STRD = SCR = None SPOS, EPOS = [], [] diff --git a/GFFParser.py.bak b/GFFParser.py.bak new file mode 100644 index 0000000..f85b274 --- /dev/null +++ b/GFFParser.py.bak @@ -0,0 +1,498 @@ +#!/usr/bin/env python +""" +Extract genome annotation from a GFF (a tab delimited format for storing sequence features and annotations) file. + +Requirements: + Numpy :- http://numpy.org/ + +Copyright (C) + +2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. +2012-2015 Memorial Sloan Kettering Cancer Center, New York City, USA. 
+""" + +import re +import os +import sys +import urllib +import numpy as np +import helper as utils +from collections import defaultdict + +def attribute_tags(col9): + """ + Split the key-value tags from the attribute column, it takes column number 9 from GTF/GFF file + + @args col9: attribute column from GFF file + @type col9: str + """ + info = defaultdict(list) + is_gff = False + + if not col9: + return is_gff, info + + # trim the line ending semi-colon ucsc may have some white-space + col9 = col9.rstrip(';| ') + # attributes from 9th column + atbs = col9.split(" ; ") + if len(atbs) == 1: + atbs = col9.split("; ") + if len(atbs) == 1: + atbs = col9.split(";") + # check the GFF3 pattern which has key value pairs like: + gff3_pat = re.compile("\w+=") + # sometime GTF have: gene_id uc002zkg.1; + gtf_pat = re.compile("\s?\w+\s") + + key_vals = [] + + if gff3_pat.match(atbs[0]): # gff3 pattern + is_gff = True + key_vals = [at.split('=') for at in atbs] + elif gtf_pat.match(atbs[0]): # gtf pattern + for at in atbs: + key_vals.append(at.strip().split(" ",1)) + else: + # to handle attribute column has only single value + key_vals.append(['ID', atbs[0]]) + # get key, val items + for item in key_vals: + key, val = item + # replace the double qoutes from feature identifier + val = re.sub('"', '', val) + # replace the web formating place holders to plain text format + info[key].extend([urllib.unquote(v) for v in val.split(',') if v]) + + return is_gff, info + +def spec_features_keywd(gff_parts): + """ + Specify the feature key word according to the GFF specifications + + @args gff_parts: attribute field key + @type gff_parts: str + """ + for t_id in ["transcript_id", "transcriptId", "proteinId"]: + try: + gff_parts["info"]["Parent"] = gff_parts["info"][t_id] + break + except KeyError: + pass + for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]: + try: + gff_parts["info"]["GParent"] = gff_parts["info"][g_id] + break + except KeyError: + pass + ## TODO key words + for flat_name in ["Transcript", "CDS"]: + if gff_parts["info"].has_key(flat_name): + # parents + if gff_parts['type'] in [flat_name] or re.search(r'transcript', gff_parts['type'], re.IGNORECASE): + if not gff_parts['id']: + gff_parts['id'] = gff_parts['info'][flat_name][0] + #gff_parts["info"]["ID"] = [gff_parts["id"]] + # children + elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR", + "coding_exon", "five_prime_UTR", "CDS", "stop_codon", + "start_codon"]: + gff_parts["info"]["Parent"] = gff_parts["info"][flat_name] + break + return gff_parts + +def Parse(ga_file): + """ + Parsing GFF/GTF file based on feature relationship, it takes the input file. + + @args ga_file: input file name + @type ga_file: str + """ + child_map = defaultdict(list) + parent_map = dict() + + ga_handle = utils.open_file(ga_file) + + for rec in ga_handle: + rec = rec.strip('\n\r') + + # skip empty line fasta identifier and commented line + if not rec or rec[0] in ['#', '>']: + continue + # skip the genome sequence + if not re.search('\t', rec): + continue + + parts = rec.split('\t') + assert len(parts) >= 8, rec + + # process the attribute column (9th column) + ftype, tags = attribute_tags(parts[-1]) + if not tags: # skip the line if no attribute column. 
+ continue + + # extract fields + if parts[1]: + tags["source"] = parts[1] + if parts[7]: + tags["phase"] = parts[7] + + gff_info = dict() + gff_info['info'] = dict(tags) + gff_info["is_gff3"] = ftype + gff_info['chr'] = parts[0] + gff_info['score'] = parts[5] + + if parts[3] and parts[4]: + gff_info['location'] = [int(parts[3]) , + int(parts[4])] + gff_info['type'] = parts[2] + gff_info['id'] = tags.get('ID', [''])[0] + if parts[6] in ['?', '.']: + parts[6] = None + gff_info['strand'] = parts[6] + + # key word according to the GFF spec. + # is_gff3 flag is false check this condition and get the attribute fields + if not ftype: + gff_info = spec_features_keywd(gff_info) + + # link the feature relationships + if gff_info['info'].has_key('Parent'): + for p in gff_info['info']['Parent']: + if p == gff_info['id']: + gff_info['id'] = '' + break + rec_category = 'child' + elif gff_info['id']: + rec_category = 'parent' + else: + rec_category = 'record' + + # depends on the record category organize the features + if rec_category == 'child': + for p in gff_info['info']['Parent']: + # create the data structure based on source and feature id + child_map[(gff_info['chr'], gff_info['info']['source'], p)].append( + dict( type = gff_info['type'], + location = gff_info['location'], + strand = gff_info['strand'], + score = gff_info['score'], + ID = gff_info['id'], + gene_name = gff_info['info'].get('gene_name', [''])[0], + gene_id = gff_info['info'].get('GParent', [''])[0] + )) + elif rec_category == 'parent': + parent_map[(gff_info['chr'], gff_info['info']['source'], gff_info['id'])] = dict( + type = gff_info['type'], + location = gff_info['location'], + strand = gff_info['strand'], + score = gff_info['score'], + name = tags.get('Name', [''])[0]) + elif rec_category == 'record': + #TODO how to handle plain records? 
+ considering_soon = 1 + ga_handle.close() + + # depends on file type create parent feature + if not ftype: + parent_map, child_map = create_missing_feature_type(parent_map, child_map) + + # connecting parent child relations + # essentially the parent child features are here from any type of GTF/GFF2/GFF3 file + gene_mat = format_gene_models(parent_map, child_map) + + return gene_mat + +def format_gene_models(parent_nf_map, child_nf_map): + """ + Genarate GeneObject based on the parsed file contents + + @args parent_nf_map: parent features with source and chromosome information + @type parent_nf_map: collections defaultdict + @args child_nf_map: transctipt and exon information are encoded + @type child_nf_map: collections defaultdict + """ + + g_cnt = 0 + gene = np.zeros((len(parent_nf_map),), dtype = utils.init_gene()) + + for pkey, pdet in parent_nf_map.items(): + # considering only gene features + #if not re.search(r'gene', pdet.get('type', '')): + # continue + + # infer the gene start and stop if not there in the + if not pdet.get('location', []): + GNS, GNE = [], [] + # multiple number of transcripts + for L1 in child_nf_map[pkey]: + GNS.append(L1.get('location', [])[0]) + GNE.append(L1.get('location', [])[1]) + GNS.sort() + GNE.sort() + pdet['location'] = [GNS[0], GNE[-1]] + + orient = pdet.get('strand', '') + gene[g_cnt]['id'] = g_cnt +1 + gene[g_cnt]['chr'] = pkey[0] + gene[g_cnt]['source'] = pkey[1] + gene[g_cnt]['name'] = pkey[-1] + gene[g_cnt]['start'] = pdet.get('location', [])[0] + gene[g_cnt]['stop'] = pdet.get('location', [])[1] + gene[g_cnt]['strand'] = orient + gene[g_cnt]['score'] = pdet.get('score','') + + # default value + gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 0 + if len(child_nf_map[pkey]) > 1: + gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 1 + + # complete sub-feature for all transcripts + dim = len(child_nf_map[pkey]) + TRS = np.zeros((dim,), dtype=np.object) + TR_TYP = np.zeros((dim,), dtype=np.object) + EXON = np.zeros((dim,), dtype=np.object) + UTR5 = np.zeros((dim,), dtype=np.object) + UTR3 = np.zeros((dim,), dtype=np.object) + CDS = np.zeros((dim,), dtype=np.object) + TISc = np.zeros((dim,), dtype=np.object) + TSSc = np.zeros((dim,), dtype=np.object) + CLV = np.zeros((dim,), dtype=np.object) + CSTOP = np.zeros((dim,), dtype=np.object) + TSTAT = np.zeros((dim,), dtype=np.object) + TSCORE = np.zeros((dim,), dtype=np.object) + + # fetching corresponding transcripts + for xq, Lv1 in enumerate(child_nf_map[pkey]): + + TID = Lv1.get('ID', '') + TRS[xq]= np.array([TID]) + + TYPE = Lv1.get('type', '') + TR_TYP[xq] = np.array('') + TR_TYP[xq] = np.array(TYPE) if TYPE else TR_TYP[xq] + + orient = Lv1.get('strand', '') + tr_score = Lv1.get('score', '') + + # fetching different sub-features + child_feat = defaultdict(list) + for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]: + E_TYP = Lv2.get('type', '') + child_feat[E_TYP].append(Lv2.get('location')) + + # make general ascending order of coordinates + if orient == '-': + for etype, excod in child_feat.items(): + if len(excod) > 1: + if excod[0][0] > excod[-1][0]: + excod.reverse() + child_feat[etype] = excod + + # make exon coordinate from cds and utr regions + if not child_feat.get('exon'): + if child_feat.get('CDS'): + exon_cod = utils.make_Exon_cod( orient, + NonetoemptyList(child_feat.get('five_prime_UTR')), + NonetoemptyList(child_feat.get('CDS')), + NonetoemptyList(child_feat.get('three_prime_UTR'))) + child_feat['exon'] = exon_cod + else: + # TODO only UTR's + # searching through keys to 
find a pattern describing exon feature + ex_key_pattern = [k for k in child_feat if k.endswith("exon")] + if ex_key_pattern: + child_feat['exon'] = child_feat[ex_key_pattern[0]] + + # stop_codon are seperated from CDS, add the coordinates based on strand + if child_feat.get('stop_codon'): + if orient == '+': + if child_feat.get('stop_codon')[0][0] - child_feat.get('CDS')[-1][1] == 1: + child_feat['CDS'][-1] = [child_feat.get('CDS')[-1][0], child_feat.get('stop_codon')[0][1]] + else: + child_feat['CDS'].append(child_feat.get('stop_codon')[0]) + elif orient == '-': + if child_feat.get('CDS')[0][0] - child_feat.get('stop_codon')[0][1] == 1: + child_feat['CDS'][0] = [child_feat.get('stop_codon')[0][0], child_feat.get('CDS')[0][1]] + else: + child_feat['CDS'].insert(0, child_feat.get('stop_codon')[0]) + + # transcript signal sites + TIS, cdsStop, TSS, cleave = [], [], [], [] + cds_status, exon_status, utr_status = 0, 0, 0 + + if child_feat.get('exon'): + TSS = [child_feat.get('exon')[-1][1]] + TSS = [child_feat.get('exon')[0][0]] if orient == '+' else TSS + cleave = [child_feat.get('exon')[0][0]] + cleave = [child_feat.get('exon')[-1][1]] if orient == '+' else cleave + exon_status = 1 + + if child_feat.get('CDS'): + if orient == '+': + TIS = [child_feat.get('CDS')[0][0]] + cdsStop = [child_feat.get('CDS')[-1][1]-3] + else: + TIS = [child_feat.get('CDS')[-1][1]] + cdsStop = [child_feat.get('CDS')[0][0]+3] + cds_status = 1 + # cds phase calculation + child_feat['CDS'] = utils.add_CDS_phase(orient, child_feat.get('CDS')) + + # sub-feature status + if child_feat.get('three_prime_UTR') or child_feat.get('five_prime_UTR'): + utr_status =1 + + if utr_status == cds_status == exon_status == 1: + t_status = 1 + else: + t_status = 0 + + # add sub-feature # make array for export to different out + TSTAT[xq] = t_status + EXON[xq] = np.array(child_feat.get('exon'), np.float64) + UTR5[xq] = np.array(NonetoemptyList(child_feat.get('five_prime_UTR'))) + UTR3[xq] = np.array(NonetoemptyList(child_feat.get('three_prime_UTR'))) + CDS[xq] = np.array(NonetoemptyList(child_feat.get('CDS'))) + TISc[xq] = np.array(TIS) + CSTOP[xq] = np.array(cdsStop) + TSSc[xq] = np.array(TSS) + CLV[xq] = np.array(cleave) + TSCORE[xq] = tr_score + + # add sub-features to the parent gene feature + gene[g_cnt]['transcript_status'] = TSTAT + gene[g_cnt]['transcripts'] = TRS + gene[g_cnt]['exons'] = EXON + gene[g_cnt]['utr5_exons'] = UTR5 + gene[g_cnt]['cds_exons'] = CDS + gene[g_cnt]['utr3_exons'] = UTR3 + gene[g_cnt]['transcript_type'] = TR_TYP + gene[g_cnt]['tis'] = TISc + gene[g_cnt]['cdsStop'] = CSTOP + gene[g_cnt]['tss'] = TSSc + gene[g_cnt]['cleave'] = CLV + gene[g_cnt]['transcript_score'] = TSCORE + + gene[g_cnt]['gene_info'] = dict( ID = pkey[-1], + Name = pdet.get('name'), + Source = pkey[1]) + # few empty fields // TODO fill this: + gene[g_cnt]['anno_id'] = [] + gene[g_cnt]['confgenes_id'] = [] + gene[g_cnt]['alias'] = '' + gene[g_cnt]['name2'] = [] + gene[g_cnt]['chr_num'] = [] + gene[g_cnt]['paralogs'] = [] + gene[g_cnt]['transcript_valid'] = [] + gene[g_cnt]['exons_confirmed'] = [] + gene[g_cnt]['tis_conf'] = [] + gene[g_cnt]['tis_info'] = [] + gene[g_cnt]['cdsStop_conf'] = [] + gene[g_cnt]['cdsStop_info'] = [] + gene[g_cnt]['tss_info'] = [] + gene[g_cnt]['tss_conf'] = [] + gene[g_cnt]['cleave_info'] = [] + gene[g_cnt]['cleave_conf'] = [] + gene[g_cnt]['polya_info'] = [] + gene[g_cnt]['polya_conf'] = [] + gene[g_cnt]['is_valid'] = [] + gene[g_cnt]['transcript_complete'] = [] + gene[g_cnt]['is_complete'] = [] + 
gene[g_cnt]['is_correctly_gff3_referenced'] = '' + gene[g_cnt]['splicegraph'] = [] + g_cnt += 1 + + ## deleting empty gene records from the main array + XPFLG=0 + for XP, ens in enumerate(gene): + if ens[0]==0: + XPFLG=1 + break + + if XPFLG==1: + XQC = range(XP, len(gene)+1) + gene = np.delete(gene, XQC) + + return gene + +def NonetoemptyList(XS): + """ + Convert a None type to empty list + + @args XS: None type + @type XS: str + """ + return [] if XS is None else XS + +def create_missing_feature_type(p_feat, c_feat): + """ + GFF/GTF file defines only child features. This function tries to create + the parent feature from the information provided in the attribute column. + + example: + chr21 hg19_knownGene exon 9690071 9690100 0.000000 + . gene_id "uc002zkg.1"; transcript_id "uc002zkg.1"; + chr21 hg19_knownGene exon 9692178 9692207 0.000000 + . gene_id "uc021wgt.1"; transcript_id "uc021wgt.1"; + chr21 hg19_knownGene exon 9711935 9712038 0.000000 + . gene_id "uc011abu.2"; transcript_id "uc011abu.2"; + + This function gets the parsed feature annotations. + + @args p_feat: Parent feature map + @type p_feat: collections defaultdict + @args c_feat: Child feature map + @type c_feat: collections defaultdict + """ + + child_n_map = defaultdict(list) + for fid, det in c_feat.items(): + # get the details from grand child + GID = STRD = SCR = None + SPOS, EPOS = [], [] + TYP = dict() + for gchild in det: + GID = gchild.get('gene_id', '') + GNAME = gchild.get('gene_name', '') + SPOS.append(gchild.get('location', [])[0]) + EPOS.append(gchild.get('location', [])[1]) + STRD = gchild.get('strand', '') + SCR = gchild.get('score', '') + if gchild.get('type', '') == "gene": ## gencode GTF file has this problem + continue + TYP[gchild.get('type', '')] = 1 + SPOS.sort() + EPOS.sort() + + # infer transcript type + transcript_type = 'transcript' + transcript_type = 'mRNA' if TYP.get('CDS', '') or TYP.get('cds', '') else transcript_type + + # gene id and transcript id are same + transcript_id = fid[-1] + if GID == transcript_id: + transcript_id = 'Transcript:' + str(GID) + + # level -1 feature type + p_feat[(fid[0], fid[1], GID)] = dict( type = 'gene', + location = [], ## infer location based on multiple transcripts + strand = STRD, + name = GNAME ) + # level -2 feature type + child_n_map[(fid[0], fid[1], GID)].append( + dict( type = transcript_type, + location = [SPOS[0], EPOS[-1]], + strand = STRD, + score = SCR, + ID = transcript_id, + gene_id = '' )) + # reorganizing the grand child + for gchild in det: + child_n_map[(fid[0], fid[1], transcript_id)].append( + dict( type = gchild.get('type', ''), + location = gchild.get('location'), + strand = gchild.get('strand'), + ID = gchild.get('ID'), + score = gchild.get('score'), + gene_id = '' )) + return p_feat, child_n_map + diff --git a/bed_to_gff.py b/bed_to_gff.py index 461ae72..4c582d8 100644 --- a/bed_to_gff.py +++ b/bed_to_gff.py @@ -25,7 +25,7 @@ def __main__(): try: bed_fname = sys.argv[1] except: - print __doc__ + print(__doc__) sys.exit(-1) bed_fh = helper.open_file(bed_fname) diff --git a/bed_to_gff.py.bak b/bed_to_gff.py.bak new file mode 100644 index 0000000..461ae72 --- /dev/null +++ b/bed_to_gff.py.bak @@ -0,0 +1,71 @@ +#!/usr/bin/env python +""" +Convert genome annotation data in a 12 column BED format to GFF3. 
+ +Usage: + python bed_to_gff.py in.bed > out.gff + +Requirement: + helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py + +Copyright (C) + 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. + 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA. +""" + +import re +import sys +import helper + +def __main__(): + """ + main function + """ + + try: + bed_fname = sys.argv[1] + except: + print __doc__ + sys.exit(-1) + + bed_fh = helper.open_file(bed_fname) + + for line in bed_fh: + line = line.strip( '\n\r' ) + + if not line or line[0] in ['#']: + continue + + parts = line.split('\t') + assert len(parts) >= 12, line + + rstarts = parts[-1].split(',') + rstarts.pop() if rstarts[-1] == '' else rstarts + + exon_lens = parts[-2].split(',') + exon_lens.pop() if exon_lens[-1] == '' else exon_lens + + if len(rstarts) != len(exon_lens): + continue # checking the consistency col 11 and col 12 + + if len(rstarts) != int(parts[-3]): + continue # checking the number of exons and block count are same + + if not parts[5] in ['+', '-']: + parts[5] = '.' # replace the unknown strand with '.' + + # bed2gff result line + sys.stdout.write('%s\tbed2gff\tgene\t%d\t%s\t%s\t%s\t.\tID=Gene:%s;Name=Gene:%s\n' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3])) + sys.stdout.write('%s\tbed2gff\ttranscript\t%d\t%s\t%s\t%s\t.\tID=%s;Name=%s;Parent=Gene:%s\n' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3], parts[3])) + + st = int(parts[1]) + for ex_cnt in range(int(parts[-3])): + start = st + int(rstarts[ex_cnt]) + 1 + stop = start + int(exon_lens[ex_cnt]) - 1 + sys.stdout.write('%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\tParent=%s\n' % (parts[0], start, stop, parts[4], parts[5], parts[3])) + + bed_fh.close() + + +if __name__ == "__main__": + __main__() diff --git a/gbk_to_gff.py b/gbk_to_gff.py index 5220676..067e2bc 100644 --- a/gbk_to_gff.py +++ b/gbk_to_gff.py @@ -25,7 +25,7 @@ def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk): """ Write the feature information """ - for gname, ginfo in genes.items(): + for gname, ginfo in list(genes.items()): line = [str(chr_id), 'gbk2gff', ginfo[3], @@ -84,11 +84,11 @@ def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk): line = [str(chr_id), 'gbk2gff', source, 0, 1, '.', orient, '.'] fStart = fStop = None - for eid, ex in cds.items(): + for eid, ex in list(cds.items()): fStart = ex[0][0] fStop = ex[0][-1] - for eid, ex in exons.items(): + for eid, ex in list(exons.items()): fStart = ex[0][0] fStop = ex[0][-1] @@ -209,7 +209,7 @@ def gbk_parse(fname): try: gbkfname = sys.argv[1] except: - print __doc__ + print(__doc__) sys.exit(-1) ## extract gbk records diff --git a/gbk_to_gff.py.bak b/gbk_to_gff.py.bak new file mode 100644 index 0000000..5220676 --- /dev/null +++ b/gbk_to_gff.py.bak @@ -0,0 +1,216 @@ +#!/usr/bin/env python +""" +Convert data from Genbank format to GFF. + +Usage: +python gbk_to_gff.py in.gbk > out.gff + +Requirements: + BioPython:- http://biopython.org/ + helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py + +Copyright (C) + 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. + 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA. 
+""" + +import os +import re +import sys +import helper +import collections +from Bio import SeqIO + +def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk): + """ + Write the feature information + """ + for gname, ginfo in genes.items(): + line = [str(chr_id), + 'gbk2gff', + ginfo[3], + str(ginfo[0]), + str(ginfo[1]), + '.', + ginfo[2], + '.', + 'ID=%s;Name=%s' % (str(gname), str(gname))] + sys.stdout.write('\t'.join(line)+"\n") + ## construct the transcript line is not defined in the original file + t_line = [str(chr_id), 'gbk2gff', source, '0', '1', '.', ginfo[2], '.'] + + if not transcripts: + t_line.append('ID=Transcript:%s;Parent=%s' % (str(gname), str(gname))) + + if exons: ## get the entire transcript region from the defined feature + t_line[3] = str(exons[gname][0][0]) + t_line[4] = str(exons[gname][0][-1]) + elif cds: + t_line[3] = str(cds[gname][0][0]) + t_line[4] = str(cds[gname][0][-1]) + + if not cds: + t_line[2] = 'transcript' + else: + t_line[2] = 'mRNA' + sys.stdout.write('\t'.join(t_line)+"\n") + + if exons: + exon_line_print(t_line, exons[gname], 'Transcript:'+str(gname), 'exon') + + if cds: + exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'CDS') + if not exons: + exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'exon') + + else: ## transcript is defined + for idx in transcripts[gname]: + t_line[2] = idx[3] + t_line[3] = str(idx[0]) + t_line[4] = str(idx[1]) + t_line.append('ID='+str(idx[2])+';Parent='+str(gname)) + sys.stdout.write('\t'.join(t_line)+"\n") + + ## feature line print call + if exons: + exon_line_print(t_line, exons[gname], str(idx[2]), 'exon') + if cds: + exon_line_print(t_line, cds[gname], str(idx[2]), 'CDS') + if not exons: + exon_line_print(t_line, cds[gname], str(idx[2]), 'exon') + + if len(genes) == 0: ## feature entry with fragment information + + line = [str(chr_id), 'gbk2gff', source, 0, 1, '.', orient, '.'] + fStart = fStop = None + + for eid, ex in cds.items(): + fStart = ex[0][0] + fStop = ex[0][-1] + + for eid, ex in exons.items(): + fStart = ex[0][0] + fStop = ex[0][-1] + + if fStart or fStart: + + line[2] = 'gene' + line[3] = str(fStart) + line[4] = str(fStop) + line.append('ID=Unknown_Gene_' + str(unk) + ';Name=Unknown_Gene_' + str(unk)) + sys.stdout.write('\t'.join(line)+"\n") + + if not cds: + line[2] = 'transcript' + else: + line[2] = 'mRNA' + + line[8] = 'ID=Unknown_Transcript_' + str(unk) + ';Parent=Unknown_Gene_' + str(unk) + sys.stdout.write('\t'.join(line)+"\n") + + if exons: + exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon') + + if cds: + exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'CDS') + if not exons: + exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon') + + unk +=1 + + return unk + + +def exon_line_print(temp_line, trx_exons, parent, ftype): + """ + Print the EXON feature line + """ + for ex in trx_exons: + temp_line[2] = ftype + temp_line[3] = str(ex[0]) + temp_line[4] = str(ex[1]) + temp_line[8] = 'Parent=%s' % parent + sys.stdout.write('\t'.join(temp_line)+"\n") + + +def gbk_parse(fname): + """ + Extract genome annotation recods from genbank format + + @args fname: gbk file name + @type fname: str + """ + fhand = helper.open_file(gbkfname) + unk = 1 + + for record in SeqIO.parse(fhand, "genbank"): + gene_tags = dict() + tx_tags = collections.defaultdict(list) + exon = collections.defaultdict(list) + cds = collections.defaultdict(list) + mol_type, chr_id = None, None + + for rec in record.features: + + if 
rec.type == 'source': + try: + mol_type = rec.qualifiers['mol_type'][0] + except: + mol_type = '.' + pass + try: + chr_id = rec.qualifiers['chromosome'][0] + except: + chr_id = record.name + continue + + strand='-' + strand='+' if rec.strand>0 else strand + + fid = None + try: + fid = rec.qualifiers['gene'][0] + except: + pass + + transcript_id = None + try: + transcript_id = rec.qualifiers['transcript_id'][0] + except: + pass + + if re.search(r'gene', rec.type): + gene_tags[fid] = (rec.location._start.position+1, + rec.location._end.position, + strand, + rec.type + ) + elif rec.type == 'exon': + exon[fid].append((rec.location._start.position+1, + rec.location._end.position)) + elif rec.type=='CDS': + cds[fid].append((rec.location._start.position+1, + rec.location._end.position)) + else: + # get all transcripts + if transcript_id: + tx_tags[fid].append((rec.location._start.position+1, + rec.location._end.position, + transcript_id, + rec.type)) + # record extracted, generate feature table + unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk) + + fhand.close() + + +if __name__=='__main__': + + try: + gbkfname = sys.argv[1] + except: + print __doc__ + sys.exit(-1) + + ## extract gbk records + gbk_parse(gbkfname) diff --git a/gff_to_bed.py b/gff_to_bed.py index b5cc912..bdc24bc 100644 --- a/gff_to_bed.py +++ b/gff_to_bed.py @@ -25,9 +25,9 @@ def limitBEDWrite(tinfo): @type tinfo: numpy object """ - for contig_id, feature in tinfo.items(): + for contig_id, feature in list(tinfo.items()): uns_line = dict() - for tid, tloc in feature.items(): + for tid, tloc in list(feature.items()): uns_line[(int(tloc[0])-1, int(tloc[1]))]=1 for ele in sorted(uns_line): pline = [contig_id, @@ -106,7 +106,7 @@ def __main__(): try: query_file = sys.argv[1] except: - print __doc__ + print(__doc__) sys.exit(-1) Transcriptdb = GFFParser.Parse(query_file) diff --git a/gff_to_bed.py.bak b/gff_to_bed.py.bak new file mode 100644 index 0000000..b5cc912 --- /dev/null +++ b/gff_to_bed.py.bak @@ -0,0 +1,116 @@ +#!/usr/bin/env python +""" +Convert genome annotation data in GFF/GTF to a 12 column BED format. +BED format typically represents the transcript models. + +Usage: python gff_to_bed.py in.gff > out.bed + +Requirement: + GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py + +Copyright (C) + 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. + 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA. 
+""" + +import re +import sys +import GFFParser + +def limitBEDWrite(tinfo): + """ + Write a three column BED file + + @args tinfo: list of genes + @type tinfo: numpy object + """ + + for contig_id, feature in tinfo.items(): + uns_line = dict() + for tid, tloc in feature.items(): + uns_line[(int(tloc[0])-1, int(tloc[1]))]=1 + for ele in sorted(uns_line): + pline = [contig_id, + str(ele[0]-1), + str(ele[1])] + + sys.stdout.write('\t'.join(pline)+"\n") + + +def writeBED(tinfo): + """ + writing result files in bed format + + @args tinfo: list of genes + @type tinfo: numpy object + """ + + for ent1 in tinfo: + child_flag = False + + for idx, tid in enumerate(ent1['transcripts']): + child_flag = True + exon_cnt = len(ent1['exons'][idx]) + exon_len = '' + exon_cod = '' + rel_start = None + rel_stop = None + for idz, ex_cod in enumerate(ent1['exons'][idx]):#check for exons of corresponding transcript + exon_len += '%d,' % (ex_cod[1]-ex_cod[0]+1) + if idz == 0: #calculate the relative start position + exon_cod += '0,' + rel_start = int(ex_cod[0])-1 + rel_stop = int(ex_cod[1]) + else: + exon_cod += '%d,' % (ex_cod[0]-1-rel_start) ## shifting the coordinates to zero + rel_stop = int(ex_cod[1]) + + if exon_len: + score = 0 + score = ent1['transcript_score'][idx] if ent1['transcript_score'].any() else score ## getting the transcript score + out_print = [ent1['chr'], + str(rel_start), + str(rel_stop), + tid[0], + str(score), + ent1['strand'], + str(rel_start), + str(rel_stop), + '0', + str(exon_cnt), + exon_len, + exon_cod] + sys.stdout.write('\t'.join(out_print)+"\n") + + if not child_flag: # file just contains only a single parent type i.e, gff3 defines only one feature type + score = 0 + score = ent1['transcript_score'][0] if ent1['transcript_score'].any() else score + + out_print = [ent1['chr'], + '%d' % (int(ent1['start'])-1), + '%d' % int(ent1['stop']), + ent1['name'], + str(score), + ent1['strand'], + '%d' % int(ent1['start']), + '%d' % int(ent1['stop']), + '0', + '1', + '%d,' % (int(ent1['stop'])-int(ent1['start'])+1), + '0,'] + + sys.stdout.write('\t'.join(out_print)+"\n") + + +def __main__(): + try: + query_file = sys.argv[1] + except: + print __doc__ + sys.exit(-1) + + Transcriptdb = GFFParser.Parse(query_file) + writeBED(Transcriptdb) + +if __name__ == "__main__": + __main__() diff --git a/gff_to_gtf.py b/gff_to_gtf.py index d1423c8..1ad57ac 100644 --- a/gff_to_gtf.py +++ b/gff_to_gtf.py @@ -71,7 +71,7 @@ def printGTF(tinfo): try: gff_fname = sys.argv[1] except: - print __doc__ + print(__doc__) sys.exit(-1) Transcriptdb = GFFParser.Parse(gff_fname) diff --git a/gff_to_gtf.py.bak b/gff_to_gtf.py.bak new file mode 100644 index 0000000..d1423c8 --- /dev/null +++ b/gff_to_gtf.py.bak @@ -0,0 +1,79 @@ +#!/usr/bin/env python +""" +Program to convert data from GFF to GTF + +Usage: python gff_to_gtf.py in.gff > out.gtf + +Requirement: + GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py + +Copyright (C) + 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. + 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA. 
+""" + +import re +import sys +import numpy +import GFFParser + +def printGTF(tinfo): + """ + writing result file in GTF format + + @args tinfo: parsed object from gff file + @type tinfo: numpy array + """ + + for ent1 in tinfo: + for idx, tid in enumerate(ent1['transcripts']): + + exons = ent1['exons'][idx] + if numpy.isnan(numpy.min(exons)): + continue + cds_exons = ent1['cds_exons'][idx] + + stop_codon = start_codon = () + + if ent1['strand'] == '+': + if cds_exons.any(): + start_codon = (cds_exons[0][0], cds_exons[0][0]+2) + stop_codon = (cds_exons[-1][1]-2, cds_exons[-1][1]) + elif ent1['strand'] == '-': + if cds_exons.any(): + start_codon = (cds_exons[-1][1]-2, cds_exons[-1][1]) + stop_codon = (cds_exons[0][0], cds_exons[0][0]+2) + else: + sys.stdout.write('STRAND information missing - %s, skip the transcript - %s\n' % (ent1['strand'], tid[0])) + pass + + last_cds_cod = 0 + for idz, ex_cod in enumerate(exons): + + sys.stdout.write('%s\t%s\texon\t%d\t%d\t.\t%s\t.\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; \n' % (ent1['chr'], ent1['source'], ex_cod[0], ex_cod[1], ent1['strand'], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])) + + if cds_exons.any(): + try: + sys.stdout.write('%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; \n' % (ent1['chr'], ent1['source'], cds_exons[idz][0], cds_exons[idz][1], ent1['strand'], cds_exons[idz][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])) + last_cds_cod = idz + except: + pass + + if idz == 0: + sys.stdout.write('%s\t%s\tstart_codon\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; \n' % (ent1['chr'], ent1['source'], start_codon[0], start_codon[1], ent1['strand'], cds_exons[idz][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])) + + if stop_codon: + sys.stdout.write('%s\t%s\tstop_codon\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; \n' % (ent1['chr'], ent1['source'], stop_codon[0], stop_codon[1], ent1['strand'], cds_exons[last_cds_cod][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])) + + +if __name__ == "__main__": + + try: + gff_fname = sys.argv[1] + except: + print __doc__ + sys.exit(-1) + + Transcriptdb = GFFParser.Parse(gff_fname) + + printGTF(Transcriptdb) diff --git a/gtf_to_gff.py b/gtf_to_gff.py index 6d78669..5e6c7e8 100644 --- a/gtf_to_gff.py +++ b/gtf_to_gff.py @@ -67,7 +67,7 @@ def __main__(): try: gtf_fname = sys.argv[1] except: - print __doc__ + print(__doc__) sys.exit(-1) gtf_file_content = GFFParser.Parse(gtf_fname) diff --git a/gtf_to_gff.py.bak b/gtf_to_gff.py.bak new file mode 100644 index 0000000..6d78669 --- /dev/null +++ b/gtf_to_gff.py.bak @@ -0,0 +1,78 @@ +#!/usr/bin/env python +""" +Convert Gene Transfer Format [GTF] to Generic Feature Format Version 3 [GFF3]. + +Usage: python gtf_to_gff.py in.gtf > out.gff3 + +Requirement: + GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py + helper.py: https://github.com/vipints/GFFtools-GX/blob/master/helper.py + +Copyright (C) + 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. + 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA. 
+""" + +import re +import sys +import helper +import GFFParser + +def GFFWriter(gtf_content): + """ + write the feature information to GFF format + + @args gtf_content: Parsed object from gtf file + @type gtf_content: numpy array + """ + + sys.stdout.write('##gff-version 3\n') + for ent1 in gtf_content: + chr_name = ent1['chr'] + strand = ent1['strand'] + start = ent1['start'] + stop = ent1['stop'] + source = ent1['source'] + ID = ent1['name'] + Name = ent1['gene_info']['Name'] + Name = ID if not Name else Name + + sys.stdout.write('%s\t%s\tgene\t%d\t%d\t.\t%s\t.\tID=%s;Name=%s\n' % (chr_name, source, start, stop, strand, ID, Name)) + for idx, tid in enumerate(ent1['transcripts']): + + t_start = ent1['exons'][idx][0][0] + t_stop = ent1['exons'][idx][-1][-1] + t_type = ent1['transcript_type'][idx] + + utr5_exons, utr3_exons = [], [] + if ent1['exons'][idx].any() and ent1['cds_exons'][idx].any(): + utr5_exons, utr3_exons = helper.buildUTR(ent1['cds_exons'][idx], ent1['exons'][idx], strand) + + sys.stdout.write('%s\t%s\t%s\t%d\t%d\t.\t%s\t.\tID=%s;Parent=%s\n' % (chr_name, source, t_type, t_start, t_stop, strand, tid[0], ID)) + for ex_cod in utr5_exons: + sys.stdout.write('%s\t%s\tfive_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s\n' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0])) + + for ex_cod in ent1['cds_exons'][idx]: + sys.stdout.write('%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tParent=%s\n' % (chr_name, source, ex_cod[0], ex_cod[1], strand, ex_cod[2], tid[0])) + + for ex_cod in utr3_exons: + sys.stdout.write('%s\t%s\tthree_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s\n' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0])) + + for ex_cod in ent1['exons'][idx]: + sys.stdout.write('%s\t%s\texon\t%d\t%d\t.\t%s\t.\tParent=%s\n' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0])) + + +def __main__(): + + try: + gtf_fname = sys.argv[1] + except: + print __doc__ + sys.exit(-1) + + gtf_file_content = GFFParser.Parse(gtf_fname) + + GFFWriter(gtf_file_content) + +if __name__ == "__main__": + __main__()