[Bio] / Babel / bin / source2ach.py Repository:
ViewVC logotype

Diff of /Babel/bin/source2ach.py

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Thu Sep 16 21:31:24 2010 UTC revision 1.2, Wed Sep 22 15:04:24 2010 UTC
# Line 23  Line 23 
23    
24  Values with * are optional, only genbank files contain that data."""  Values with * are optional, only genbank files contain that data."""
25    
26  desc_re  = re.compile('RecName: Full=(.*?);')  desc_re  = re.compile('(Rec|Sub)Name: Full=(.*?);')
27  entry_re = re.compile('^ENTRY\s+(\S+)\s+CDS\s+(.*)$', re.MULTILINE)  entry_re = re.compile('^ENTRY\s+(\S+)\s+CDS\s+(.*)$', re.MULTILINE)
28  name_re  = re.compile('^NAME\s+(.*)$', re.MULTILINE)  name_re  = re.compile('^NAME\s+(.*)$', re.MULTILINE)
29  func_re  = re.compile('^DEFINITION\s+(.*?)^\S', re.DOTALL | re.MULTILINE)  func_re  = re.compile('^DEFINITION\s+(.*?)^\S', re.DOTALL | re.MULTILINE)
# Line 31  Line 31 
31  seq_re   = re.compile('^AASEQ\s+\d+\s+(.*?)^\S', re.DOTALL | re.MULTILINE)  seq_re   = re.compile('^AASEQ\s+\d+\s+(.*?)^\S', re.DOTALL | re.MULTILINE)
32  go_re    = re.compile('^GO:(.*)')  go_re    = re.compile('^GO:(.*)')
33    
34  def get_org_map(filename):  def get_eggnog_map(filename):
35      omap = {}      emap = {}
36      if filename is None:      if filename is None:
37          return omap          return emap
38        fhdl = open(filename, 'r')
39        try:
40            for line in fhdl:
41                if line.startswith("#"): continue
42                parts = line.strip().split("\t")
43                if parts[0] in emap:
44                    emap[ parts[0] ].append( "%s (%s)"%(parts[4], parts[3]) )
45                else :
46                    emap[ parts[0] ] = [ "%s (%s)"%(parts[4], parts[3]) ]
47        finally:
48            fhdl.close()
49        return emap
50    
51    def get_kegg_map(filename):
52        kmap = {}
53        if filename is None:
54            return kmap
55      for text in kegg_iter(filename):      for text in kegg_iter(filename):
56          rec = get_kegg_rec(text)          rec = get_kegg_rec(text)
57          if (not rec.name) or (not rec.description): continue          if (not rec.name) or (not rec.description): continue
58          names = rec.name.split(', ')          names = rec.name.split(', ')
59          if len(names) > 1:          if len(names) > 1:
60              omap[ names[1] ] = rec.description              kmap[ names[1] ] = rec.description
61      return omap      return kmap
62    
63  def kegg_iter(filename):  def kegg_iter(filename):
64      fhdl = file(filename, 'rb')      fhdl = file(filename, 'rb')
# Line 76  Line 93 
93    
94  def format_factory(data_f, seq_f, ref_f, source, form, org_desc, get_go):  def format_factory(data_f, seq_f, ref_f, source, form, org_desc, get_go):
95      if form == 'genbank':      if form == 'genbank':
96          def parse_genbank(rec):          def parse_genbank(rec, amap):
97              if (not rec) or (not rec.id) or (not rec.seq): return              if (not rec) or (not rec.id) or (not rec.seq): return
98              cid   = rec.id              cid   = rec.id
99              cdesc = rec.description.rstrip('.')              cdesc = rec.description.rstrip('.')
# Line 109  Line 126 
126          return parse_genbank          return parse_genbank
127    
128      elif form == 'swiss':      elif form == 'swiss':
129          def parse_swiss(rec):          def parse_swiss(rec, amap):
130              if (not rec) or (not rec.id) or (not rec.seq): return              if (not rec) or (not rec.id) or (not rec.seq): return
131              md5  = hashlib.md5(str(rec.seq)).hexdigest()              md5  = hashlib.md5(str(rec.seq)).hexdigest()
132              func = rec.description              func = rec.description
133              srch = desc_re.search(func)              srch = desc_re.search(func)
134              if srch: func = srch.group(1)              if srch: func = srch.group(2)
135              seq_f.write("%s\t%s\n" %(md5, str(rec.seq)))              seq_f.write("%s\t%s\n" %(md5, str(rec.seq)))
136              data_f.write("\t".join([md5, rec.id, func, rec.annotations['organism'], source]) + "\n")              data_f.write("\t".join([md5, rec.id, func, rec.annotations['organism'], source]) + "\n")
137              if len(rec.dbxrefs) > 0:              if len(rec.dbxrefs) > 0:
# Line 131  Line 148 
148          return parse_swiss          return parse_swiss
149    
150      elif form == 'fasta':      elif form == 'fasta':
151          def parse_fasta(rec):          def parse_fasta(rec, amap):
152              if (not rec) or (not rec.id) or (not rec.seq): return              if (not rec) or (not rec.id) or (not rec.seq): return
153              func = rec.description              desc = ''
154                hdrs = rec.description.split()
155              md5  = hashlib.md5(str(rec.seq)).hexdigest()              md5  = hashlib.md5(str(rec.seq)).hexdigest()
156              seq_f.write("%s\t%s\n" %(md5, str(rec.seq)))              seq_f.write("%s\t%s\n" %(md5, str(rec.seq)))
157              if func.startswith("("): func = rec.description.strip(')(')              if len(hdrs) > 1:
158                    hdrs.pop(0)
159                    desc = " ".join(hdrs)
160                    if desc.startswith("("): desc = desc.strip(')(')
161              if org_desc:              if org_desc:
162                  data_f.write("\t".join([md5, rec.id, '', func, source]) + "\n")                  if rec.id in amap:
163                        for f in amap[rec.id]:
164                            data_f.write("\t".join([md5, rec.id, f, desc, source]) + "\n")
165                    else:
166                        data_f.write("\t".join([md5, rec.id, '', desc, source]) + "\n")
167              else:              else:
168                  data_f.write("\t".join([md5, rec.id, func, '', source]) + "\n")                  data_f.write("\t".join([md5, rec.id, desc, '', source]) + "\n")
169          return parse_fasta          return parse_fasta
170    
171      elif form == 'kegg':      elif form == 'kegg':
172          def parse_kegg(text, orgmap):          def parse_kegg(text, amap):
173              rec = get_kegg_rec(text)              rec = get_kegg_rec(text)
174              if (not rec.id) or (not rec.description) or (not rec.seq): return              if (not rec.id) or (not rec.description) or (not rec.seq): return
175              md5 = hashlib.md5(str(rec.seq)).hexdigest()              md5 = hashlib.md5(str(rec.seq)).hexdigest()
176              org = rec.annotations['organism']              org = rec.annotations['organism']
177              if org in orgmap:              if org in amap:
178                  org = orgmap[org]                  org = amap[org]
179              seq_f.write("%s\t%s\n" %(md5, str(rec.seq)))              seq_f.write("%s\t%s\n" %(md5, str(rec.seq)))
180              data_f.write("\t".join([md5, rec.id, rec.description, org, source]) + "\n")              data_f.write("\t".join([md5, rec.id, rec.description, org, source]) + "\n")
181              if len(rec.dbxrefs) > 0:              if len(rec.dbxrefs) > 0:
# Line 167  Line 192 
192      parser = OptionParser(usage=usage)      parser = OptionParser(usage=usage)
193      parser.add_option("-f", "--format", dest="format", metavar="FORMAT", default="fasta",      parser.add_option("-f", "--format", dest="format", metavar="FORMAT", default="fasta",
194                        help="FORMAT inputed file is in. Must be one of following: genbank, kegg, swiss, fasta [default is fasta]")                        help="FORMAT inputed file is in. Must be one of following: genbank, kegg, swiss, fasta [default is fasta]")
195      parser.add_option("-m", "--org_map", dest="orgmap", metavar="FILE", default=None, help="Optional KEGG genome FILE for org name mapping")      parser.add_option("-k", "--kegg_map", dest="keggmap", metavar="FILE", default=None, help="Optional KEGG genome FILE for org name mapping")
196        parser.add_option("-e", "--eggnog_map", dest="eggnogmap", metavar="FILE", default=None, help="Optional eggNOG orthgroups tabed FILE for func mapping")
197      parser.add_option("-d", "--out_dir", dest="outdir", metavar="DIR", default=None, help="DIR to write output [default is current dir]")      parser.add_option("-d", "--out_dir", dest="outdir", metavar="DIR", default=None, help="DIR to write output [default is current dir]")
198      parser.add_option("-o", "--org_header", dest="orgheader", action="store_true", default=False, help="For fasta files, header description is organism [default is function]")      parser.add_option("-o", "--org_header", dest="orgheader", action="store_true", default=False, help="For fasta files, header description is organism [default is function]")
199      parser.add_option("-g", "--go_entry", dest="goentry", action="store_true", default=False, help="For swiss files, output for each GO alias in entry [default is off]")      parser.add_option("-g", "--go_entry", dest="goentry", action="store_true", default=False, help="For swiss files, output for each GO alias in entry [default is off]")
# Line 178  Line 204 
204          parser.error("Incorrect number of arguments")          parser.error("Incorrect number of arguments")
205      source = args[0]      source = args[0]
206      files  = args[1:]      files  = args[1:]
     orgmap = get_org_map(opts.orgmap)  
207      outdir = ''      outdir = ''
208      if opts.outdir: outdir = opts.outdir      if opts.outdir: outdir = opts.outdir
209    
210        keggmap   = get_kegg_map(opts.keggmap)
211        eggnogmap = get_eggnog_map(opts.eggnogmap)
212    
213      data_file = open( os.path.join(outdir, source + ".md52id2func"), 'w' )      data_file = open( os.path.join(outdir, source + ".md52id2func"), 'w' )
214      seq_file  = open( os.path.join(outdir, source + ".md52seq"), 'w' )      seq_file  = open( os.path.join(outdir, source + ".md52seq"), 'w' )
215      ref_file  = open( os.path.join(outdir, source + ".id2xref"), 'w' )      ref_file  = open( os.path.join(outdir, source + ".id2xref"), 'w' )
# Line 195  Line 223 
223              sys.stdout.write("Parsing file %s ...\n" %f)              sys.stdout.write("Parsing file %s ...\n" %f)
224          if (opts.format == 'kegg'):          if (opts.format == 'kegg'):
225              for rec in kegg_iter(f):              for rec in kegg_iter(f):
226                  parse_format(rec, orgmap)                  parse_format(rec, keggmap)
227          else:          else:
228              for rec in SeqIO.parse(f, opts.format):              for rec in SeqIO.parse(f, opts.format):
229                  parse_format(rec)                  parse_format(rec, eggnogmap)
230    
231      data_file.close()      data_file.close()
232      seq_file.close()      seq_file.close()

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.2

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3