[Bio] / Babel / bin / source2ach.py Repository:
ViewVC logotype

View of /Babel/bin/source2ach.py

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.8 - (download) (as text) (annotate)
Fri Nov 12 19:42:55 2010 UTC (9 years, 8 months ago) by tharriso
Branch: MAIN
Changes since 1.7: +7 -3 lines
made obtaining contig data optional

#!/usr/bin/env python

import os, sys, re, hashlib
from optparse import OptionParser
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

__doc__ = """
Script to take a repository file from the following formats:
   genbank, kegg, swiss, fasta
and output the following tab seperated files:

   md5, id, function, organism, source, beg_pos*, end_pos*, strand*, contig_id*, contig_desc*, contig_length*
   md5, id, function, ontology, source
   md5, sequence
   id, external refrence 1, [external refrence 2, external refrence 3, ...]

Values with * are optional, only genbank files contain that data."""

desc_re  = re.compile('(Rec|Sub)Name: Full=(.*?);')
entry_re = re.compile('^ENTRY\s+(\S+)\s+CDS\s+(.*)$', re.MULTILINE)
name_re  = re.compile('^NAME\s+(.*)$', re.MULTILINE)
func_re  = re.compile('^DEFINITION\s+(.*?)^\S', re.DOTALL | re.MULTILINE)
orth_re  = re.compile('^ORTHOLOGY\s+(.*?)^\S', re.DOTALL | re.MULTILINE)
dbref_re = re.compile('^DBLINKS\s+(.*?)^\S', re.DOTALL | re.MULTILINE)
seq_re   = re.compile('^AASEQ\s+\d+\s+(.*?)^\S', re.DOTALL | re.MULTILINE)
up_go_re = re.compile('^GO:GO:(.*)')
gb_go_re = re.compile('^GO:(\d+)\s+-\s+(.*)')
gb_go_rm = re.compile('( \[Evidence [A-Z]+\])')
ko_re    = re.compile('^(K\d+)\s+(.*)')
ogid_re  = re.compile('^(\S+?OG)\d+')
go_types = ['GO_function', 'GO_process', 'GO_component']

def get_eggnog_map(filename, verb):
    emap = {}
    if filename is None:
        return emap
    if verb: sys.stdout.write("Parsing file %s ... " %filename)
    fhdl = open(filename, 'r')
        for line in fhdl:
            if line.startswith("#"): continue
            parts = line.strip().split("\t")
            if parts[0] in emap:
                emap[ parts[0] ].append( [parts[3], parts[4]] )
            else :
                emap[ parts[0] ] = [ [parts[3], parts[4]] ]
    if verb: sys.stdout.write("Done\n")
    return emap

def get_kegg_map(filename, verb):
    kmap = {}
    if filename is None:
        return kmap
    if verb: sys.stdout.write("Parsing file %s ... " %filename)
    for text in kegg_iter(filename):
        rec = get_kegg_rec(text)
        if (not rec.name) or (not rec.description): continue
        names = rec.name.split(', ')
        if len(names) > 1:
            kmap[ names[1] ] = rec.description
    if verb: sys.stdout.write("Done\n")
    return kmap

def kegg_iter(filename):
    fhdl = file(filename, 'rb')
    recs = ['']
    for data in iter(lambda: fhdl.read(100000), ''):
        recs = (recs[-1] + data).split("\n///\n")
        for rec in recs[:-1]:
            yield rec

def get_kegg_rec(text):
    record  = SeqRecord(Seq("",IUPAC.protein), id='', name='', description='', dbxrefs=[], annotations={'organism':''})
    entry_s = entry_re.search(text)
    name_s  = name_re.search(text)
    func_s  = func_re.search(text)
    orth_s  = orth_re.search(text)
    dbref_s = dbref_re.search(text)
    seq_s   = seq_re.search(text)
    if entry_s:
        record.id = entry_s.group(1)
        record.annotations['organism'] = entry_s.group(2).strip()
    if name_s:
        record.name = name_s.group(1).strip()
    if func_s:
        record.description = re.sub('\s+', ' ', func_s.group(1).strip())
    if orth_s:
        orths = {}
        for o in filter( lambda y: y, map(lambda x: ko_re.search(x.strip()), orth_s.group(1).strip().split("\n")) ):
            orths[ o.group(1) ] = o.group(2)
        record.annotations['orthology'] = orths
    if dbref_s:
        parts = dbref_s.group(1).split()
        if (len(parts) % 2) == 0:
            for i in range(0, len(parts)-1, 2):
                record.dbxrefs.append(parts[i].replace('NCBI-', '') + parts[i+1])
    if seq_s:
        record.seq = Seq( re.sub('\s+', '', seq_s.group(1)), IUPAC.protein )
    return record

def format_factory(prot_f, ont_f, seq_f, ref_f, source, form, org_desc, get_ont, get_ctg):
    if form == 'genbank':    
        def parse_genbank(rec, amap):
            if (not rec) or (not rec.id) or (not rec.seq): return
            cid   = rec.id
            cdesc = rec.description.rstrip('.')
            clen  = len(rec.seq)
            org   = rec.annotations['organism']
            # only one entry on rec (first feat is source and is not a feature), use first real feature, no contig data
            if len(rec.features) == 2:
                md5  = hashlib.md5(str(rec.seq)).hexdigest()
                func = ''
                if 'product' in rec.features[1].qualifiers:
                    func = rec.features[1].qualifiers['product'][0]
                seq_f.write("%s\t%s\n" %(md5, rec.seq))
                prot_f.write("\t".join([md5, rec.name, func, org, source]) + "\n")
            # multiple entries on rec, use CDS features, get contig data
            for feat in rec.features:
                if feat.type == 'CDS':
                    if ( (not feat.qualifiers) or (not feat.location) or
                         ('translation' not in feat.qualifiers) or
                         ('product' not in feat.qualifiers) ): continue
                    fid = ''
                    if 'protein_id' in feat.qualifiers:
                        fid = feat.qualifiers['protein_id'][0]
                    elif 'locus_tag' in feat.qualifiers:
                        fid = feat.qualifiers['locus_tag'][0]
                    seq  = feat.qualifiers['translation'][0]
                    md5  = hashlib.md5(seq).hexdigest()
                    func = feat.qualifiers['product'][0]
                    beg  = feat.location.start.position
                    end  = feat.location.end.position
                    if feat.strand:
                        strd = feat.strand
                        strd = 1
                        if (beg > end):
                            strd = -1
                            beg  = feat.location.end.position
                            end  = feat.location.start.position
                    # output GO ontology
                    if get_ont:
                        for gtype in go_types:
                            if gtype in feat.qualifiers:
                                for goid in feat.qualifiers[gtype]:
                                    go_m = gb_go_re.match(goid)
                                    if go_m:
                                        desc = gb_go_rm.sub('', go_m.group(2))
                                        ont_f.write("\t".join([md5, go_m.group(1), desc, gtype.replace('_',':')]) + "\n")
                    seq_f.write("%s\t%s\n" %(md5, seq))
                    if get_ctg:
                        prot_f.write("\t".join([md5, fid, func, org, source, str(beg), str(end), str(strd), cid, cdesc, str(clen)]) + "\n")
                        prot_f.write("\t".join([md5, fid, func, org, source]) + "\n")
                    if ('db_xref' in feat.qualifiers) and (len(feat.qualifiers['db_xref']) > 0):
                        ref_f.write(fid + "\t" + "\t".join(feat.qualifiers['db_xref']) + "\n")
        return parse_genbank
    elif form == 'swiss':
        def parse_swiss(rec, amap):
            if (not rec) or (not rec.id) or (not rec.seq): return
            md5  = hashlib.md5(str(rec.seq)).hexdigest()
            func = rec.description
            srch = desc_re.search(func)
            if srch: func = srch.group(2)
            seq_f.write("%s\t%s\n" %(md5, str(rec.seq)))
            prot_f.write("\t".join([md5, rec.id, func, rec.annotations['organism'], source]) + "\n")
            if len(rec.dbxrefs) > 0:
                ref_str = rec.id
                for ref in rec.dbxrefs:
                    go_m = up_go_re.match(ref)
                    # output GO ontology
                    if go_m and get_ont:
                        ont_f.write("\t".join([md5, go_m.group(1), '', 'GO']) + "\n")
                        ref_str += "\t" + ref
                ref_f.write(ref_str + "\n")
        return parse_swiss
    elif form == 'fasta':
        def parse_fasta(rec, amap):
            if (not rec) or (not rec.id) or (not rec.seq): return
            desc = ''
            org  = ''
            func = ''
            hdrs = rec.description.split()
            md5  = hashlib.md5(str(rec.seq)).hexdigest()
            seq_f.write("%s\t%s\n" %(md5, str(rec.seq)))
            if len(hdrs) > 1:
                desc = " ".join(hdrs)
                if desc.startswith("("): desc = desc.strip(')(')
            if org_desc: org  = desc
            else:        func = desc
            if get_ont and (rec.id in amap):
                for f in amap[rec.id]:
                    prot_f.write("\t".join([md5, rec.id, f[1], org, source]) + "\n")
                    ogid_m = ogid_re.match(f[0])
                    if ogid_m:
                        ont_f.write("\t".join([md5, f[0], f[1], ogid_m.group(1)]) + "\n")
                prot_f.write("\t".join([md5, rec.id, func, org, source]) + "\n")
        return parse_fasta

    elif form == 'kegg':
        def parse_kegg(text, amap):
            rec = get_kegg_rec(text)
            if (not rec.id) or (not rec.description) or (not rec.seq): return
            md5  = hashlib.md5(str(rec.seq)).hexdigest()
            org  = rec.annotations['organism']
            orth = []
            if org in amap:
                org = amap[org]
            # output KO ontology
            if get_ont and ('orthology' in rec.annotations):
                orth = rec.annotations['orthology']
                for oid, odesc in orth.iteritems():
                    ont_f.write("\t".join([md5, oid, odesc, 'KO']) + "\n")
            seq_f.write("%s\t%s\n" %(md5, str(rec.seq)))
            prot_f.write("\t".join([md5, rec.id, rec.description, org, source]) + "\n")
            if len(rec.dbxrefs) > 0:
                ref_f.write(rec.id + "\t" + "\t".join(rec.dbxrefs) + "\n")
        return parse_kegg

        return None

usage = "usage: %prog [options] source input_file1 [input_file2 input_file3 ...]" + __doc__

def main(args):
    parser = OptionParser(usage=usage)
    parser.add_option("-f", "--format", dest="format", metavar="FORMAT", default="fasta",
                      help="FORMAT inputed file is in. Must be one of following: genbank, kegg, swiss, fasta [default is fasta]")
    parser.add_option("-k", "--kegg_map", dest="keggmap", metavar="FILE", default=None, help="Optional KEGG genome FILE for org name mapping")
    parser.add_option("-e", "--eggnog_map", dest="eggnogmap", metavar="FILE", default=None, help="Optional eggNOG orthgroups tabed FILE for func mapping")
    parser.add_option("-d", "--out_dir", dest="outdir", metavar="DIR", default=None, help="DIR to write output [default is current dir]")
    parser.add_option("-o", "--org_header", dest="orgheader", action="store_true", default=False, help="For fasta files, header description is organism [default is function]")
    parser.add_option("-g", "--get_ontology", dest="getont", action="store_true", default=False, help="Output ontology (id, type) for proteins with mapped ontology [default is off]")
    parser.add_option("-c", "--get_contig", dest="getcontig", action="store_true", default=False, help="Output contig info for organism genbank files [default is off]")
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Wordy [default is off]")
    (opts, args) = parser.parse_args()
    if len(args) < 2:
        parser.error("Incorrect number of arguments")
    source = args[0]
    files  = args[1:]
    outdir = ''
    if opts.outdir: outdir = opts.outdir

    keggmap   = get_kegg_map(opts.keggmap, opts.verbose)
    eggnogmap = get_eggnog_map(opts.eggnogmap, opts.verbose)

    prot_file = open( os.path.join(outdir, source + ".md52id2func"), 'w' )
    ont_file  = open( os.path.join(outdir, source + ".md52id2ont"), 'w' )
    seq_file  = open( os.path.join(outdir, source + ".md52seq"), 'w' )
    ref_file  = open( os.path.join(outdir, source + ".id2xref"), 'w' )
    parse_format = format_factory( prot_file, ont_file, seq_file, ref_file, source, opts.format, opts.orgheader, opts.getont, opts.getcontig )
    if not parse_format:
        parser.error("Incorrect format entered")
    for f in files:
        if opts.verbose:
            sys.stdout.write("Parsing file %s ...\n" %f)
        if (opts.format == 'kegg'):
            for rec in kegg_iter(f):
                parse_format(rec, keggmap)
            for rec in SeqIO.parse(f, opts.format):
                parse_format(rec, eggnogmap)

    if opts.verbose:
        sys.stdout.write("All Done\n")

if __name__ == "__main__":

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3