[Bio] / Babel / bin / source2ach.py Repository:
ViewVC logotype

Diff of /Babel/bin/source2ach.py

Parent Directory | Revision Log | View Patch

revision 1.11, Mon May 2 21:40:07 2011 UTC | revision 1.12, Wed Jul 20 16:41:16 2011 UTC
# Line 36  Line 36 
36          self.getont  = False          self.getont  = False
37          self.getctg  = False          self.getctg  = False
38          self.gettax  = False          self.gettax  = False
39            self.rnagg   = False
40          self.amap    = {}          self.amap    = {}
41    
42    func_map = {'LSU': '23S/28S ribosomal RNA', 'SSU': '16S/18S ribosomal RNA'}
43  desc_re  = re.compile('(Rec|Sub)Name: Full=(.*?);')  desc_re  = re.compile('(Rec|Sub)Name: Full=(.*?);')
44  entry_re = re.compile('^ENTRY\s+(\S+)\s+CDS\s+(.*)$', re.MULTILINE)  entry_re = re.compile('^ENTRY\s+(\S+)\s+CDS\s+(.*)$', re.MULTILINE)
45  name_re  = re.compile('^NAME\s+(.*)$', re.MULTILINE)  name_re  = re.compile('^NAME\s+(.*)$', re.MULTILINE)
# Line 48  Line 50 
50  up_go_re = re.compile('^GO:GO:(.*)')  up_go_re = re.compile('^GO:GO:(.*)')
51  gb_go_re = re.compile('^GO:(\d+)\s+-\s+(.*)')  gb_go_re = re.compile('^GO:(\d+)\s+-\s+(.*)')
52  gb_go_rm = re.compile('( \[Evidence [A-Z]+\])')  gb_go_rm = re.compile('( \[Evidence [A-Z]+\])')
53    gg_re    = re.compile('^[kpcofgs]__(\S+)$')
54  ko_re    = re.compile('^(K\d+)\s+(.*)')  ko_re    = re.compile('^(K\d+)\s+(.*)')
55  ogid_re  = re.compile('^(\S+?OG)\d+')  ogid_re  = re.compile('^(\S+?OG)\d+')
56  nr_re    = re.compile('^gi\|(\d+)\|(\w+)\|(\S+)\|\S*?\s+(.*)\]$')  nr_re    = re.compile('^gi\|(\d+)\|(\w+)\|(\S+)\|\S*?\s+(.*)\]$')
# Line 157  Line 160 
160      get_ont  = params.getont      get_ont  = params.getont
161      get_ctg  = params.getctg      get_ctg  = params.getctg
162      get_tax  = params.gettax      get_tax  = params.gettax
163        rna_gg   = params.rnagg
164      amap     = params.amap      amap     = params.amap
165      if form == 'genbank':      if form == 'genbank':
166          def parse_genbank(rec):          def parse_genbank(rec):
# Line 255  Line 259 
259              seq  = str(rec.seq).upper()              seq  = str(rec.seq).upper()
260              md5  = hashlib.md5(seq).hexdigest()              md5  = hashlib.md5(seq).hexdigest()
261              seq_f.write("%s\t%s\n" %(md5, seq))              seq_f.write("%s\t%s\n" %(md5, seq))
262                if rna_gg:
263                    hdrs.pop(0)
264                    hdrs.pop(0)
265                    hdrs.pop()
266                    tax_list = []
267                    org_list = []
268                    for x in hdrs:
269                        gg_m = gg_re.match(x)
270                        if (gg_m): tax_list.append(gg_m.group(1))
271                        else:      org_list.append(x)
272                    org  = " ".join(org_list).split(";")[0]
273                    func = "16S/18S ribosomal RNA"
274                    if get_tax:
275                        tax_f.write("%s\t%s;%s\n" %(rec.id, "".join(tax_list), org))
276                else:
277              if len(hdrs) > 1:              if len(hdrs) > 1:
278                  hdrs.pop(0)                  hdrs.pop(0)
279                  desc = " ".join(hdrs)                  desc = " ".join(hdrs)
280                  if desc.startswith("("): desc = desc.strip(')(')                  if desc.startswith("("): desc = desc.strip(')(')
281                  if desc.startswith("|"): desc = desc.strip('| ')                  if desc.startswith("|"): desc = desc.strip('| ')
282              if org_desc: org  = desc                  if org_desc:
283              else:        func = desc                      org = desc
284                        if source in func_map:
285                            func = func_map[source]
286                    else:
287                        func = desc
288              if get_ont and (rec.id in amap):              if get_ont and (rec.id in amap):
289                  for f in amap[rec.id]:                  for f in amap[rec.id]:
290                      prot_f.write("\t".join([md5, rec.id, f[1], org, source]) + "\n")                      prot_f.write("\t".join([md5, rec.id, f[1], org, source]) + "\n")
# Line 360  Line 383 
383      parser.add_option("-g", "--get_ontology", dest="getont", action="store_true", default=False, help="Output ontology (id, type) for proteins with mapped ontology [default is off]")      parser.add_option("-g", "--get_ontology", dest="getont", action="store_true", default=False, help="Output ontology (id, type) for proteins with mapped ontology [default is off]")
384      parser.add_option("-c", "--get_contig", dest="getcontig", action="store_true", default=False, help="Output contig info for organism genbank files [default is off]")      parser.add_option("-c", "--get_contig", dest="getcontig", action="store_true", default=False, help="Output contig info for organism genbank files [default is off]")
385      parser.add_option("-t", "--get_tax", dest="gettax", action="store_true", default=False, help="Output taxonomy string for genbank files [default is off]")      parser.add_option("-t", "--get_tax", dest="gettax", action="store_true", default=False, help="Output taxonomy string for genbank files [default is off]")
386        parser.add_option("-r", "--rna_gg", dest="rnagg", action="store_true", default=False, help="fasta file is greengenes format (header has tax string) [default is false]")
387      parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Wordy [default is off]")      parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Wordy [default is off]")
388    
389      (opts, args) = parser.parse_args()      (opts, args) = parser.parse_args()
# Line 376  Line 400 
400      params.getont  = opts.getont      params.getont  = opts.getont
401      params.getctg  = opts.getcontig      params.getctg  = opts.getcontig
402      params.gettax  = opts.gettax      params.gettax  = opts.gettax
403        params.rnagg   = opts.rnagg
404    
405      if (params.format == 'nr') and opts.nrdbs:      if (params.format == 'nr') and opts.nrdbs:
406          for d in opts.nrdbs.split(','):          for d in opts.nrdbs.split(','):

Legend:
- Removed from v.1.11
- Changed lines
- Added in v.1.12

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3