[Bio] / Babel / bin / source2ach.py Repository:
ViewVC logotype

Diff of /Babel/bin/source2ach.py

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.8, Fri Nov 12 19:42:55 2010 UTC revision 1.9, Wed Dec 1 15:45:57 2010 UTC
# Line 20  Line 20 
20     md5, sequence     md5, sequence
21  SOURCE.id2xref:  SOURCE.id2xref:
22     id, external refrence 1, [external refrence 2, external refrence 3, ...]     id, external refrence 1, [external refrence 2, external refrence 3, ...]
23    SOURCE.id2tax:
24       id, genbank tax string
25    
26  Values with * are optional, only genbank files contain that data."""  Values with * are optional, only genbank files contain that data."""
27    
# Line 36  Line 38 
38  ko_re    = re.compile('^(K\d+)\s+(.*)')  ko_re    = re.compile('^(K\d+)\s+(.*)')
39  ogid_re  = re.compile('^(\S+?OG)\d+')  ogid_re  = re.compile('^(\S+?OG)\d+')
40  go_types = ['GO_function', 'GO_process', 'GO_component']  go_types = ['GO_function', 'GO_process', 'GO_component']
41    file_ext = [".md52id2func", ".md52id2ont", ".md52seq", ".id2xref", ".id2tax"]
42    
43  def get_eggnog_map(filename, verb):  def get_eggnog_map(filename, verb):
44      emap = {}      emap = {}
# Line 107  Line 110 
110          record.seq = Seq( re.sub('\s+', '', seq_s.group(1)), IUPAC.protein )          record.seq = Seq( re.sub('\s+', '', seq_s.group(1)), IUPAC.protein )
111      return record      return record
112    
113  def format_factory(prot_f, ont_f, seq_f, ref_f, source, form, org_desc, get_ont, get_ctg):  def format_factory(out_files, source, form, org_desc, get_ont, get_ctg, get_tax):
114        (prot_f, ont_f, seq_f, ref_f, tax_f) = out_files
115      if form == 'genbank':      if form == 'genbank':
116          def parse_genbank(rec, amap):          def parse_genbank(rec, amap):
117              if (not rec) or (not rec.id) or (not rec.seq): return              if (not rec) or (not rec.id) or (not rec.seq): return
# Line 123  Line 127 
127                      func = rec.features[1].qualifiers['product'][0]                      func = rec.features[1].qualifiers['product'][0]
128                  seq_f.write("%s\t%s\n" %(md5, rec.seq))                  seq_f.write("%s\t%s\n" %(md5, rec.seq))
129                  prot_f.write("\t".join([md5, rec.name, func, org, source]) + "\n")                  prot_f.write("\t".join([md5, rec.name, func, org, source]) + "\n")
130                    if get_tax and ('taxonomy' in rec.annotations):
131                        if rec.annotations['taxonomy'][0] == 'Root':
132                            x = rec.annotations['taxonomy'].pop(0)
133                        tax_f.write("%s\t%s;%s\n" %(rec.name, ";".join(rec.annotations['taxonomy']), org))
134                  return                  return
135              # multiple entries on rec, use CDS features, get contig data              # multiple entries on rec, use CDS features, get contig data
136              for feat in rec.features:              for feat in rec.features:
# Line 202  Line 210 
210                  hdrs.pop(0)                  hdrs.pop(0)
211                  desc = " ".join(hdrs)                  desc = " ".join(hdrs)
212                  if desc.startswith("("): desc = desc.strip(')(')                  if desc.startswith("("): desc = desc.strip(')(')
213                    if desc.startswith("|"): desc = desc.strip('| ')
214              if org_desc: org  = desc              if org_desc: org  = desc
215              else:        func = desc              else:        func = desc
216              if get_ont and (rec.id in amap):              if get_ont and (rec.id in amap):
# Line 250  Line 259 
259      parser.add_option("-o", "--org_header", dest="orgheader", action="store_true", default=False, help="For fasta files, header description is organism [default is function]")      parser.add_option("-o", "--org_header", dest="orgheader", action="store_true", default=False, help="For fasta files, header description is organism [default is function]")
260      parser.add_option("-g", "--get_ontology", dest="getont", action="store_true", default=False, help="Output ontology (id, type) for proteins with mapped ontology [default is off]")      parser.add_option("-g", "--get_ontology", dest="getont", action="store_true", default=False, help="Output ontology (id, type) for proteins with mapped ontology [default is off]")
261      parser.add_option("-c", "--get_contig", dest="getcontig", action="store_true", default=False, help="Output contig info for organism genbank files [default is off]")      parser.add_option("-c", "--get_contig", dest="getcontig", action="store_true", default=False, help="Output contig info for organism genbank files [default is off]")
262        parser.add_option("-t", "--get_tax", dest="gettax", action="store_true", default=False, help="Output taxonomy string for genbank files [default is off]")
263      parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Wordy [default is off]")      parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Wordy [default is off]")
264    
265      (opts, args) = parser.parse_args()      (opts, args) = parser.parse_args()
# Line 258  Line 268 
268      source = args[0]      source = args[0]
269      files  = args[1:]      files  = args[1:]
270      outdir = ''      outdir = ''
271      if opts.outdir: outdir = opts.outdir      if opts.outdir:
272            outdir = opts.outdir
273    
274      keggmap   = get_kegg_map(opts.keggmap, opts.verbose)      keggmap   = get_kegg_map(opts.keggmap, opts.verbose)
275      eggnogmap = get_eggnog_map(opts.eggnogmap, opts.verbose)      eggnogmap = get_eggnog_map(opts.eggnogmap, opts.verbose)
276    
277      prot_file = open( os.path.join(outdir, source + ".md52id2func"), 'w' )      o_files = []
278      ont_file  = open( os.path.join(outdir, source + ".md52id2ont"), 'w' )      o_hdls  = []
     seq_file  = open( os.path.join(outdir, source + ".md52seq"), 'w' )  
     ref_file  = open( os.path.join(outdir, source + ".id2xref"), 'w' )  
279    
280      parse_format = format_factory( prot_file, ont_file, seq_file, ref_file, source, opts.format, opts.orgheader, opts.getont, opts.getcontig )      for e in file_ext:
281            o_files.append( os.path.join(outdir, source + e) )
282        for f in o_files:
283            o_hdls.append( open(f, 'w') )
284    
285        parse_format = format_factory( o_hdls, source, opts.format, opts.orgheader, opts.getont, opts.getcontig, opts.gettax )
286      if not parse_format:      if not parse_format:
287          parser.error("Incorrect format entered")          parser.error("Incorrect format entered")
288    
# Line 282  Line 296 
296              for rec in SeqIO.parse(f, opts.format):              for rec in SeqIO.parse(f, opts.format):
297                  parse_format(rec, eggnogmap)                  parse_format(rec, eggnogmap)
298    
299      prot_file.close()      for h in o_hdls:
300      ont_file.close()          h.close()
301      seq_file.close()      for f in o_files:
302      ref_file.close()          if os.path.isfile(f) and (os.path.getsize(f) == 0):
303                if opts.verbose:
304                    sys.stdout.write("Removing empty file %s\n" %f)
305                os.remove(f)
306    
307      if opts.verbose:      if opts.verbose:
308          sys.stdout.write("All Done\n")          sys.stdout.write("All Done\n")
309    

Legend:
Removed from v.1.8  
changed lines
  Added in v.1.9

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3