[Bio] / Babel / bin / source2ach.py Repository:
ViewVC logotype

Annotation of /Babel/bin/source2ach.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.16 - (view) (download) (as text)

1 : tharriso 1.1 #!/usr/bin/env python
2 :    
3 : tharriso 1.13 import os, sys, shutil, re, hashlib
4 : tharriso 1.1 from optparse import OptionParser
5 : tharriso 1.10 from multiprocessing import Pool
6 : tharriso 1.1 from Bio import SeqIO
7 :     from Bio.Seq import Seq
8 :     from Bio.SeqRecord import SeqRecord
9 :     from Bio.Alphabet import IUPAC
10 :    
11 :     __doc__ = """
12 :     Script to take a repository file from the following formats:
13 :     genbank, kegg, swiss, fasta
14 :     and output the following tab seperated files:
15 :    
16 :     SOURCE.md52id2func:
17 :     md5, id, function, organism, source, beg_pos*, end_pos*, strand*, contig_id*, contig_desc*, contig_length*
18 : tharriso 1.5 SOURCE.ms52id2ont:
19 :     md5, id, function, ontology, source
20 : tharriso 1.1 SOURCE.md52seq:
21 :     md5, sequence
22 : tharriso 1.4 SOURCE.id2xref:
23 : tharriso 1.1 id, external refrence 1, [external refrence 2, external refrence 3, ...]
24 : tharriso 1.9 SOURCE.id2tax:
25 :     id, genbank tax string
26 : tharriso 1.1
27 :     Values with * are optional, only genbank files contain that data."""
28 :    
29 : tharriso 1.10 class Info:
30 :     def __init__(self):
31 :     self.source = ''
32 :     self.format = 'fasta'
33 :     self.verbose = False
34 :     self.outdir = ''
35 : tharriso 1.13 self.annhead = ''
36 : tharriso 1.10 self.getont = False
37 :     self.getctg = False
38 :     self.gettax = False
39 :     self.amap = {}
40 : tharriso 1.14 self.interpro = False
41 : tharriso 1.10
42 : tharriso 1.12 func_map = {'LSU': '23S/28S ribosomal RNA', 'SSU': '16S/18S ribosomal RNA'}
43 : tharriso 1.2 desc_re = re.compile('(Rec|Sub)Name: Full=(.*?);')
44 : tharriso 1.1 entry_re = re.compile('^ENTRY\s+(\S+)\s+CDS\s+(.*)$', re.MULTILINE)
45 :     name_re = re.compile('^NAME\s+(.*)$', re.MULTILINE)
46 :     func_re = re.compile('^DEFINITION\s+(.*?)^\S', re.DOTALL | re.MULTILINE)
47 : tharriso 1.5 orth_re = re.compile('^ORTHOLOGY\s+(.*?)^\S', re.DOTALL | re.MULTILINE)
48 : tharriso 1.1 dbref_re = re.compile('^DBLINKS\s+(.*?)^\S', re.DOTALL | re.MULTILINE)
49 :     seq_re = re.compile('^AASEQ\s+\d+\s+(.*?)^\S', re.DOTALL | re.MULTILINE)
50 : tharriso 1.14 up_ip_re = re.compile('^InterPro:(.*)')
51 :     up_go_re = re.compile('^GO:(GO:.*)')
52 :     gb_go_re = re.compile('^(GO:\d+)\s+-\s+(.*)')
53 : tharriso 1.6 gb_go_rm = re.compile('( \[Evidence [A-Z]+\])')
54 : tharriso 1.12 gg_re = re.compile('^[kpcofgs]__(\S+)$')
55 : tharriso 1.5 ko_re = re.compile('^(K\d+)\s+(.*)')
56 :     ogid_re = re.compile('^(\S+?OG)\d+')
57 : tharriso 1.14 nr_re = re.compile('^gi\|(\d+)\|(\w+)\|(\S+)\|\S*?\s+(.*\])$')
58 : tharriso 1.10 nr_types = {'ref': 'RefSeq', 'gb': 'GenBank', 'emb': 'EMBL', 'dbj': 'DDBJ', 'pir': 'PIR', 'prf': 'PRF', 'pdb': 'PDB'}
59 : tharriso 1.5 go_types = ['GO_function', 'GO_process', 'GO_component']
60 : tharriso 1.9 file_ext = [".md52id2func", ".md52id2ont", ".md52seq", ".id2xref", ".id2tax"]
61 : tharriso 1.10 params = Info()
62 :    
63 :     def parse_nr_header(line):
64 :     items = []
65 :     hdr_set = nr_re.match(line)
66 :     if hdr_set:
67 :     (gi_id, nrdb, nrdb_id, func_org) = hdr_set.groups()
68 :     if nrdb in params.amap:
69 : tharriso 1.14 func_txt, org_txt = parse_bracket_line(func_org, '[', ']', 1)
70 : tharriso 1.13 items = [ gi_id, params.amap[nrdb], nrdb_id, func_txt, org_txt ]
71 : tharriso 1.10 return items
72 :    
73 : tharriso 1.14 def parse_bracket_line(line, beg_delim, end_delim, rev):
74 :     in_bracket = ''
75 :     if rev:
76 :     line = reverse_string(line)
77 :     end_break = 0
78 : tharriso 1.13 beg_break = 0
79 : tharriso 1.14 for i, c in enumerate(line):
80 : tharriso 1.13 if c == end_delim: end_break += 1
81 :     if c == beg_delim: beg_break += 1
82 :     if beg_break == end_break: break
83 : tharriso 1.14 if i > 0: in_bracket += c
84 :     if rev:
85 :     bracket_txt = reverse_string(in_bracket).strip()
86 :     remain_txt = reverse_string(line[i+1:]).strip()
87 :     return remain_txt, bracket_txt
88 :     else:
89 :     bracket_txt = in_bracket.strip()
90 :     remain_txt = line[i+1:].strip()
91 :     return bracket_txt, remain_txt
92 : tharriso 1.13
93 : tharriso 1.10 def reverse_string(string):
94 :     return string[::-1]
95 : tharriso 1.1
96 : tharriso 1.14 def get_interpro_map(filename):
97 :     imap = {}
98 :     if filename is None:
99 :     return imap
100 :     if params.verbose: sys.stdout.write("Parsing file %s ... " %filename)
101 :     fhdl = open(filename, 'r')
102 :     try:
103 :     for line in fhdl:
104 :     parts = line.strip().split("\t")
105 :     imap[ parts[0] ] = parts[1]
106 :     finally:
107 :     fhdl.close()
108 :     if params.verbose: sys.stdout.write("Done\n")
109 :     return imap
110 :    
111 : tharriso 1.10 def get_eggnog_map(filename):
112 : tharriso 1.2 emap = {}
113 : tharriso 1.1 if filename is None:
114 : tharriso 1.2 return emap
115 : tharriso 1.10 if params.verbose: sys.stdout.write("Parsing file %s ... " %filename)
116 : tharriso 1.2 fhdl = open(filename, 'r')
117 :     try:
118 :     for line in fhdl:
119 :     if line.startswith("#"): continue
120 :     parts = line.strip().split("\t")
121 :     if parts[0] in emap:
122 : tharriso 1.5 emap[ parts[0] ].append( [parts[3], parts[4]] )
123 : tharriso 1.2 else :
124 : tharriso 1.5 emap[ parts[0] ] = [ [parts[3], parts[4]] ]
125 : tharriso 1.2 finally:
126 :     fhdl.close()
127 : tharriso 1.10 if params.verbose: sys.stdout.write("Done\n")
128 : tharriso 1.2 return emap
129 :    
130 : tharriso 1.10 def get_kegg_map(filename):
131 : tharriso 1.2 kmap = {}
132 :     if filename is None:
133 :     return kmap
134 : tharriso 1.10 if params.verbose: sys.stdout.write("Parsing file %s ... " %filename)
135 : tharriso 1.1 for text in kegg_iter(filename):
136 :     rec = get_kegg_rec(text)
137 :     if (not rec.name) or (not rec.description): continue
138 :     names = rec.name.split(', ')
139 :     if len(names) > 1:
140 : tharriso 1.11 kmap[ names[1] ] = [ rec.description, names[0] ]
141 : tharriso 1.10 if params.verbose: sys.stdout.write("Done\n")
142 : tharriso 1.2 return kmap
143 : tharriso 1.1
144 :     def kegg_iter(filename):
145 :     fhdl = file(filename, 'rb')
146 :     recs = ['']
147 :     for data in iter(lambda: fhdl.read(100000), ''):
148 :     recs = (recs[-1] + data).split("\n///\n")
149 :     for rec in recs[:-1]:
150 :     yield rec
151 :    
152 :     def get_kegg_rec(text):
153 :     record = SeqRecord(Seq("",IUPAC.protein), id='', name='', description='', dbxrefs=[], annotations={'organism':''})
154 :     entry_s = entry_re.search(text)
155 :     name_s = name_re.search(text)
156 :     func_s = func_re.search(text)
157 : tharriso 1.5 orth_s = orth_re.search(text)
158 : tharriso 1.1 dbref_s = dbref_re.search(text)
159 :     seq_s = seq_re.search(text)
160 :     if entry_s:
161 :     record.id = entry_s.group(1)
162 :     record.annotations['organism'] = entry_s.group(2).strip()
163 :     if name_s:
164 :     record.name = name_s.group(1).strip()
165 :     if func_s:
166 :     record.description = re.sub('\s+', ' ', func_s.group(1).strip())
167 : tharriso 1.5 if orth_s:
168 :     orths = {}
169 :     for o in filter( lambda y: y, map(lambda x: ko_re.search(x.strip()), orth_s.group(1).strip().split("\n")) ):
170 :     orths[ o.group(1) ] = o.group(2)
171 :     record.annotations['orthology'] = orths
172 : tharriso 1.1 if dbref_s:
173 :     parts = dbref_s.group(1).split()
174 :     if (len(parts) % 2) == 0:
175 :     for i in range(0, len(parts)-1, 2):
176 :     record.dbxrefs.append(parts[i].replace('NCBI-', '') + parts[i+1])
177 :     if seq_s:
178 :     record.seq = Seq( re.sub('\s+', '', seq_s.group(1)), IUPAC.protein )
179 :     return record
180 :    
181 : tharriso 1.10 def format_factory(out_files):
182 : tharriso 1.9 (prot_f, ont_f, seq_f, ref_f, tax_f) = out_files
183 : tharriso 1.10 source = params.source
184 :     form = params.format
185 : tharriso 1.13 ann_desc = params.annhead
186 : tharriso 1.10 get_ont = params.getont
187 :     get_ctg = params.getctg
188 :     get_tax = params.gettax
189 : tharriso 1.14 get_ip = params.interpro
190 : tharriso 1.10 amap = params.amap
191 : tharriso 1.1 if form == 'genbank':
192 : tharriso 1.10 def parse_genbank(rec):
193 : tharriso 1.1 if (not rec) or (not rec.id) or (not rec.seq): return
194 :     cid = rec.id
195 :     cdesc = rec.description.rstrip('.')
196 : tharriso 1.10 seq = str(rec.seq).upper()
197 :     clen = len(seq)
198 : tharriso 1.14 if 'organism' in rec.annotations:
199 :     org = rec.annotations['organism']
200 :     elif (rec.features[0].type == 'source') and ('organism' in rec.features[0].qualifiers):
201 :     org = rec.features[0].qualifiers['organism'][0]
202 :     else:
203 :     return
204 : tharriso 1.3 # only one entry on rec (first feat is source and is not a feature), use first real feature, no contig data
205 :     if len(rec.features) == 2:
206 : tharriso 1.10 md5 = hashlib.md5(seq).hexdigest()
207 : tharriso 1.3 func = ''
208 :     if 'product' in rec.features[1].qualifiers:
209 :     func = rec.features[1].qualifiers['product'][0]
210 : tharriso 1.10 seq_f.write("%s\t%s\n" %(md5, seq))
211 : tharriso 1.5 prot_f.write("\t".join([md5, rec.name, func, org, source]) + "\n")
212 : tharriso 1.9 if get_tax and ('taxonomy' in rec.annotations):
213 :     if rec.annotations['taxonomy'][0] == 'Root':
214 :     x = rec.annotations['taxonomy'].pop(0)
215 :     tax_f.write("%s\t%s;%s\n" %(rec.name, ";".join(rec.annotations['taxonomy']), org))
216 : tharriso 1.3 return
217 :     # multiple entries on rec, use CDS features, get contig data
218 : tharriso 1.1 for feat in rec.features:
219 :     if feat.type == 'CDS':
220 :     if ( (not feat.qualifiers) or (not feat.location) or
221 :     ('translation' not in feat.qualifiers) or
222 :     ('product' not in feat.qualifiers) ): continue
223 : tharriso 1.6 fid = ''
224 :     if 'protein_id' in feat.qualifiers:
225 :     fid = feat.qualifiers['protein_id'][0]
226 :     elif 'locus_tag' in feat.qualifiers:
227 :     fid = feat.qualifiers['locus_tag'][0]
228 :     else:
229 :     continue
230 : tharriso 1.1 seq = feat.qualifiers['translation'][0]
231 :     md5 = hashlib.md5(seq).hexdigest()
232 :     func = feat.qualifiers['product'][0]
233 :     beg = feat.location.start.position
234 :     end = feat.location.end.position
235 :     if feat.strand:
236 :     strd = feat.strand
237 :     else:
238 :     strd = 1
239 :     if (beg > end):
240 :     strd = -1
241 :     beg = feat.location.end.position
242 :     end = feat.location.start.position
243 : tharriso 1.5 # output GO ontology
244 :     if get_ont:
245 :     for gtype in go_types:
246 :     if gtype in feat.qualifiers:
247 :     for goid in feat.qualifiers[gtype]:
248 :     go_m = gb_go_re.match(goid)
249 :     if go_m:
250 : tharriso 1.6 desc = gb_go_rm.sub('', go_m.group(2))
251 : tharriso 1.10 ont_f.write("\t".join([md5, go_m.group(1), desc, 'GO']) + "\n")
252 : tharriso 1.1 seq_f.write("%s\t%s\n" %(md5, seq))
253 : tharriso 1.8 if get_ctg:
254 :     prot_f.write("\t".join([md5, fid, func, org, source, str(beg), str(end), str(strd), cid, cdesc, str(clen)]) + "\n")
255 :     else:
256 :     prot_f.write("\t".join([md5, fid, func, org, source]) + "\n")
257 : tharriso 1.1 if ('db_xref' in feat.qualifiers) and (len(feat.qualifiers['db_xref']) > 0):
258 :     ref_f.write(fid + "\t" + "\t".join(feat.qualifiers['db_xref']) + "\n")
259 :     return parse_genbank
260 :    
261 :     elif form == 'swiss':
262 : tharriso 1.10 def parse_swiss(rec):
263 : tharriso 1.1 if (not rec) or (not rec.id) or (not rec.seq): return
264 : tharriso 1.10 seq = str(rec.seq).upper()
265 :     md5 = hashlib.md5(seq).hexdigest()
266 : tharriso 1.14 if not get_ip:
267 :     func = rec.description
268 :     srch = desc_re.search(func)
269 :     if srch: func = srch.group(2)
270 :     seq_f.write("%s\t%s\n" %(md5, seq))
271 :     prot_f.write("\t".join([md5, rec.id, func, rec.annotations['organism'], source]) + "\n")
272 :     if len(rec.dbxrefs) > 0:
273 :     ref_str = rec.id
274 :     for ref in rec.dbxrefs:
275 :     go_m = up_go_re.match(ref)
276 :     # output GO ontology
277 :     if go_m and get_ont:
278 :     ont_f.write("\t".join([md5, go_m.group(1), func, 'GO']) + "\n")
279 :     else:
280 :     ref_str += "\t" + ref
281 :     ref_f.write(ref_str + "\n")
282 :     elif get_ip and (len(rec.dbxrefs) > 0):
283 :     has_ip = 0
284 : tharriso 1.1 for ref in rec.dbxrefs:
285 : tharriso 1.14 ip_m = up_ip_re.match(ref)
286 :     if ip_m and (ip_m.group(1) in amap):
287 :     has_ip = 1
288 :     prot_f.write("\t".join([md5, ip_m.group(1), amap[ip_m.group(1)], rec.annotations['organism'], source]) + "\n")
289 :     if has_ip:
290 :     seq_f.write("%s\t%s\n" %(md5, seq))
291 : tharriso 1.1 return parse_swiss
292 :    
293 :     elif form == 'fasta':
294 : tharriso 1.10 def parse_fasta(rec):
295 : tharriso 1.1 if (not rec) or (not rec.id) or (not rec.seq): return
296 : tharriso 1.2 desc = ''
297 : tharriso 1.6 org = ''
298 :     func = ''
299 : tharriso 1.2 hdrs = rec.description.split()
300 : tharriso 1.10 seq = str(rec.seq).upper()
301 :     md5 = hashlib.md5(seq).hexdigest()
302 :     seq_f.write("%s\t%s\n" %(md5, seq))
303 : tharriso 1.13 if ann_desc == 'greengenes':
304 : tharriso 1.2 hdrs.pop(0)
305 : tharriso 1.12 hdrs.pop(0)
306 :     hdrs.pop()
307 :     tax_list = []
308 :     org_list = []
309 :     for x in hdrs:
310 :     gg_m = gg_re.match(x)
311 : tharriso 1.13 if gg_m: tax_list.append(gg_m.group(1))
312 :     else: org_list.append(x)
313 : tharriso 1.12 org = " ".join(org_list).split(";")[0]
314 :     func = "16S/18S ribosomal RNA"
315 :     if get_tax:
316 :     tax_f.write("%s\t%s;%s\n" %(rec.id, "".join(tax_list), org))
317 : tharriso 1.16 elif ann_desc == 'fungal':
318 : tharriso 1.15 hdrs.pop(0)
319 :     tax_id = 0
320 :     if hdrs[0].isdigit():
321 :     tax_id = int( hdrs.pop(0) )
322 :     tax_str = " ".join(hdrs).rstrip('.')
323 :     org = tax_str.split(";")[-1]
324 :     func = "ITS ribosomal RNA"
325 :     if get_tax:
326 :     if tax_id > 0:
327 :     tax_f.write("%s\t%d\n" %(rec.id, tax_id))
328 :     else:
329 :     tax_f.write("%s\t%s\n" %(rec.id, tax_str))
330 : tharriso 1.12 else:
331 :     if len(hdrs) > 1:
332 :     hdrs.pop(0)
333 :     desc = " ".join(hdrs)
334 :     if desc.startswith("("): desc = desc.strip(')(')
335 :     if desc.startswith("|"): desc = desc.strip('| ')
336 : tharriso 1.13 if ann_desc == 'img':
337 :     hdrs.pop(0)
338 :     desc = " ".join(hdrs)
339 :     if desc[-1] != ']':
340 :     return
341 : tharriso 1.14 func, org = parse_bracket_line(desc, '[', ']', 1)
342 : tharriso 1.13 elif ann_desc == 'seed':
343 :     if desc[-1] != ')':
344 :     return
345 : tharriso 1.14 func, org = parse_bracket_line(desc, '(', ')', 1)
346 :     elif ann_desc == 'phantome':
347 :     if (desc[-1] != ']') and (desc[0] != '['):
348 :     return
349 :     _junk, org = parse_bracket_line(desc, '[', ']', 1)
350 :     func, _junk = parse_bracket_line(desc, '[', ']', 0)
351 : tharriso 1.13 elif ann_desc == 'organism':
352 :     org = desc
353 :     if source in func_map:
354 :     func = func_map[source]
355 :     elif ann_desc == 'function':
356 :     func = desc
357 : tharriso 1.6 if get_ont and (rec.id in amap):
358 :     for f in amap[rec.id]:
359 :     prot_f.write("\t".join([md5, rec.id, f[1], org, source]) + "\n")
360 :     ogid_m = ogid_re.match(f[0])
361 :     if ogid_m:
362 : tharriso 1.7 ont_f.write("\t".join([md5, f[0], f[1], ogid_m.group(1)]) + "\n")
363 : tharriso 1.1 else:
364 : tharriso 1.6 prot_f.write("\t".join([md5, rec.id, func, org, source]) + "\n")
365 : tharriso 1.1 return parse_fasta
366 :    
367 : tharriso 1.10 elif form == 'nr':
368 :     def parse_nr(rec):
369 :     if (not rec) or (not rec.description) or (not rec.seq): return
370 :     get_seq = False
371 :     hdrs = rec.description.split('\x01')
372 :     seq = str(rec.seq).upper()
373 :     md5 = hashlib.md5(seq).hexdigest()
374 :     if len(hdrs) > 0:
375 :     for h in hdrs:
376 :     info = parse_nr_header(h)
377 :     if len(info) > 0:
378 :     (gi_id, nrdb, nrdb_id, func, org) = info
379 :     get_seq = True
380 :     prot_f.write("\t".join([md5, nrdb_id, func, org, nrdb]) + "\n")
381 :     ref_f.write("%s\tGI:%s\n"%(nrdb_id, gi_id))
382 :     if get_seq:
383 :     seq_f.write("%s\t%s\n" %(md5, seq))
384 :     return parse_nr
385 :    
386 : tharriso 1.1 elif form == 'kegg':
387 : tharriso 1.10 def parse_kegg(text):
388 : tharriso 1.1 rec = get_kegg_rec(text)
389 :     if (not rec.id) or (not rec.description) or (not rec.seq): return
390 : tharriso 1.10 seq = str(rec.seq).upper()
391 :     md5 = hashlib.md5(seq).hexdigest()
392 : tharriso 1.5 org = rec.annotations['organism']
393 : tharriso 1.11 code = ""
394 : tharriso 1.5 orth = []
395 : tharriso 1.2 if org in amap:
396 : tharriso 1.11 (org, code) = amap[org]
397 :     code += ":"
398 : tharriso 1.5 # output KO ontology
399 :     if get_ont and ('orthology' in rec.annotations):
400 :     orth = rec.annotations['orthology']
401 :     for oid, odesc in orth.iteritems():
402 : tharriso 1.7 ont_f.write("\t".join([md5, oid, odesc, 'KO']) + "\n")
403 : tharriso 1.10 seq_f.write("%s\t%s\n" %(md5, seq))
404 : tharriso 1.11 prot_f.write("\t".join([md5, code + rec.id, rec.description, org, source]) + "\n")
405 : tharriso 1.1 if len(rec.dbxrefs) > 0:
406 :     ref_f.write(rec.id + "\t" + "\t".join(rec.dbxrefs) + "\n")
407 :     return parse_kegg
408 :    
409 :     else:
410 :     return None
411 :    
412 : tharriso 1.10 def process_file(infile):
413 :     o_files = []
414 :     o_hdls = []
415 :     fformat = params.format
416 :     fname = os.path.basename(infile)
417 :     for e in file_ext: o_files.append( os.path.join(params.outdir, fname + e) )
418 :     for f in o_files: o_hdls.append( open(f, 'w') )
419 :    
420 :     parse_format = format_factory( o_hdls )
421 :     if not parse_format:
422 :     sys.stderr.write("Invalid format %s\n"%fformat)
423 :     sys.exit(1)
424 :    
425 :     if params.verbose: sys.stdout.write("Parsing file %s ...\n" %infile)
426 :     if fformat == 'kegg':
427 :     for rec in kegg_iter(infile):
428 :     parse_format(rec)
429 :     else:
430 :     if fformat == 'nr': fformat = 'fasta'
431 :     for rec in SeqIO.parse(infile, fformat):
432 :     parse_format(rec)
433 :    
434 :     for h in o_hdls: h.close()
435 :     return os.path.join(params.outdir, fname)
436 :    
437 : tharriso 1.1
438 :     usage = "usage: %prog [options] source input_file1 [input_file2 input_file3 ...]" + __doc__
439 :    
440 :     def main(args):
441 : tharriso 1.10 global params
442 : tharriso 1.1 parser = OptionParser(usage=usage)
443 :     parser.add_option("-f", "--format", dest="format", metavar="FORMAT", default="fasta",
444 : tharriso 1.10 help="FORMAT inputed file is in. Must be one of following: genbank, kegg, swiss, nr, fasta [default is 'fasta']")
445 :     parser.add_option("-n", "--nr_dbs", dest="nrdbs", metavar="DBs", default="ref,gb",
446 :     help="Comma seperated list of databases to parse out of ncbi nr, from following: ref, gb, emb, dbj, pir, prf, pdb [default is 'ref,gb']")
447 :     parser.add_option("-p", "--processes", dest="processes", metavar="NUM_PROCESSES", type="int", default=4, help="Number of processes to use [default '4']")
448 : tharriso 1.14 parser.add_option("-i", "--interpro_map", dest="intermap", metavar="FILE", default=None, help="Optional InterPro names FILE for func mapping with UniProt")
449 : tharriso 1.2 parser.add_option("-k", "--kegg_map", dest="keggmap", metavar="FILE", default=None, help="Optional KEGG genome FILE for org name mapping")
450 :     parser.add_option("-e", "--eggnog_map", dest="eggnogmap", metavar="FILE", default=None, help="Optional eggNOG orthgroups tabed FILE for func mapping")
451 : tharriso 1.10 parser.add_option("-d", "--out_dir", dest="outdir", metavar="DIR", default="", help="DIR to write output [default is current dir]")
452 : tharriso 1.13 parser.add_option("-a", "--annotation_header", dest="annheader", metavar="ANNOTATION", default="function",
453 : tharriso 1.16 help="For fasta files, header description contains annotation. Must be one of following: greengenes, fungal, seed, img, phantome, organism, function [default is function]")
454 : tharriso 1.13 parser.add_option("-o", "--get_ontology", dest="getont", action="store_true", default=False, help="Output ontology (id, type) for proteins with mapped ontology [default is off]")
455 : tharriso 1.8 parser.add_option("-c", "--get_contig", dest="getcontig", action="store_true", default=False, help="Output contig info for organism genbank files [default is off]")
456 : tharriso 1.9 parser.add_option("-t", "--get_tax", dest="gettax", action="store_true", default=False, help="Output taxonomy string for genbank files [default is off]")
457 : tharriso 1.1 parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Wordy [default is off]")
458 :    
459 :     (opts, args) = parser.parse_args()
460 :     if len(args) < 2:
461 :     parser.error("Incorrect number of arguments")
462 : tharriso 1.10 sfiles = filter(lambda x: os.path.isfile(x), args[1:])
463 :     scount = len(sfiles)
464 :    
465 :     params.source = args[0]
466 :     params.format = opts.format
467 :     params.verbose = opts.verbose
468 :     params.outdir = opts.outdir
469 : tharriso 1.13 params.annhead = opts.annheader
470 : tharriso 1.10 params.getont = opts.getont
471 :     params.getctg = opts.getcontig
472 :     params.gettax = opts.gettax
473 :    
474 :     if (params.format == 'nr') and opts.nrdbs:
475 :     for d in opts.nrdbs.split(','):
476 :     if d in nr_types:
477 :     params.amap[d] = nr_types[d]
478 :     elif (params.format == 'kegg') and opts.keggmap and os.path.isfile(opts.keggmap):
479 :     params.amap = get_kegg_map(opts.keggmap)
480 :     elif (params.format == 'fasta') and opts.eggnogmap and os.path.isfile(opts.eggnogmap):
481 :     params.amap = get_eggnog_map(opts.eggnogmap)
482 : tharriso 1.14 elif (params.format == 'swiss') and opts.intermap and os.path.isfile(opts.intermap):
483 :     params.amap = get_interpro_map(opts.intermap)
484 :     params.interpro = True
485 : tharriso 1.1
486 : tharriso 1.10 if scount < opts.processes:
487 :     min_proc = scount
488 :     else:
489 :     min_proc = opts.processes
490 : tharriso 1.2
491 : tharriso 1.10 if min_proc == 1:
492 :     if params.verbose: sys.stdout.write("Parsing %d %s files, single threaded\n"%(scount, params.format))
493 :     rfiles = []
494 :     for f in sfiles:
495 :     r = process_file(f)
496 :     rfiles.append(r)
497 :     else:
498 :     if params.verbose: sys.stdout.write("Parsing %d %s files using %d threades\n"%(scount, params.format, min_proc))
499 :     pool = Pool(processes=min_proc)
500 :     rfiles = pool.map(process_file, sfiles, 1)
501 :     pool.close()
502 :     pool.join()
503 :     if params.verbose: sys.stdout.write("Done\n")
504 : tharriso 1.9
505 : tharriso 1.10 if params.verbose: sys.stdout.write("Merging %d files ... "%len(rfiles))
506 : tharriso 1.9 for e in file_ext:
507 : tharriso 1.10 out_file = os.path.join(params.outdir, params.source + e)
508 : tharriso 1.13 out_tmp = open(out_file+'.tmp', 'w')
509 : tharriso 1.10 file_set = map( lambda x: x + e, rfiles )
510 :     if os.path.isfile(out_file):
511 :     os.remove(out_file)
512 :     for f in file_set:
513 : tharriso 1.13 shutil.copyfileobj( open(f, 'r'), out_tmp )
514 : tharriso 1.9 os.remove(f)
515 : tharriso 1.13 out_tmp.close()
516 :     os.system( "sort -u %s.tmp > %s"%(out_file, out_file) )
517 :     os.remove(out_file+'.tmp')
518 : tharriso 1.10 if os.path.isfile(out_file) and (os.path.getsize(out_file) == 0):
519 :     os.remove(out_file)
520 :     if params.verbose: sys.stdout.write("Done\n")
521 : tharriso 1.9
522 : tharriso 1.10 if params.verbose: sys.stdout.write("All Done\n")
523 : tharriso 1.1
524 :     if __name__ == "__main__":
525 : tharriso 1.10 sys.exit( main(sys.argv) )

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3