[Bio] / Babel / bin / source2ach.py Repository:
ViewVC logotype

Annotation of /Babel/bin/source2ach.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (view) (download) (as text)

1 : tharriso 1.1 #!/usr/bin/env python
2 :    
3 :     import os, sys, re, hashlib
4 :     from optparse import OptionParser
5 :     from Bio import SeqIO
6 :     from Bio.Seq import Seq
7 :     from Bio.SeqRecord import SeqRecord
8 :     from Bio.Alphabet import IUPAC
9 :    
10 :     __doc__ = """
11 :     Script to take a repository file from the following formats:
12 :     genbank, kegg, swiss, fasta
13 :     and output the following tab seperated files:
14 :    
15 :     SOURCE.md52id2func:
16 :     md5, id, function, organism, source, beg_pos*, end_pos*, strand*, contig_id*, contig_desc*, contig_length*
17 : tharriso 1.5 SOURCE.ms52id2ont:
18 :     md5, id, function, ontology, source
19 : tharriso 1.1 SOURCE.md52seq:
20 :     md5, sequence
21 : tharriso 1.4 SOURCE.id2xref:
22 : tharriso 1.1 id, external refrence 1, [external refrence 2, external refrence 3, ...]
23 : tharriso 1.9 SOURCE.id2tax:
24 :     id, genbank tax string
25 : tharriso 1.1
26 :     Values with * are optional, only genbank files contain that data."""
27 :    
28 : tharriso 1.2 desc_re = re.compile('(Rec|Sub)Name: Full=(.*?);')
29 : tharriso 1.1 entry_re = re.compile('^ENTRY\s+(\S+)\s+CDS\s+(.*)$', re.MULTILINE)
30 :     name_re = re.compile('^NAME\s+(.*)$', re.MULTILINE)
31 :     func_re = re.compile('^DEFINITION\s+(.*?)^\S', re.DOTALL | re.MULTILINE)
32 : tharriso 1.5 orth_re = re.compile('^ORTHOLOGY\s+(.*?)^\S', re.DOTALL | re.MULTILINE)
33 : tharriso 1.1 dbref_re = re.compile('^DBLINKS\s+(.*?)^\S', re.DOTALL | re.MULTILINE)
34 :     seq_re = re.compile('^AASEQ\s+\d+\s+(.*?)^\S', re.DOTALL | re.MULTILINE)
35 : tharriso 1.5 up_go_re = re.compile('^GO:GO:(.*)')
36 :     gb_go_re = re.compile('^GO:(\d+)\s+-\s+(.*)')
37 : tharriso 1.6 gb_go_rm = re.compile('( \[Evidence [A-Z]+\])')
38 : tharriso 1.5 ko_re = re.compile('^(K\d+)\s+(.*)')
39 :     ogid_re = re.compile('^(\S+?OG)\d+')
40 :     go_types = ['GO_function', 'GO_process', 'GO_component']
41 : tharriso 1.9 file_ext = [".md52id2func", ".md52id2ont", ".md52seq", ".id2xref", ".id2tax"]
42 : tharriso 1.1
43 : tharriso 1.6 def get_eggnog_map(filename, verb):
44 : tharriso 1.2 emap = {}
45 : tharriso 1.1 if filename is None:
46 : tharriso 1.2 return emap
47 : tharriso 1.6 if verb: sys.stdout.write("Parsing file %s ... " %filename)
48 : tharriso 1.2 fhdl = open(filename, 'r')
49 :     try:
50 :     for line in fhdl:
51 :     if line.startswith("#"): continue
52 :     parts = line.strip().split("\t")
53 :     if parts[0] in emap:
54 : tharriso 1.5 emap[ parts[0] ].append( [parts[3], parts[4]] )
55 : tharriso 1.2 else :
56 : tharriso 1.5 emap[ parts[0] ] = [ [parts[3], parts[4]] ]
57 : tharriso 1.2 finally:
58 :     fhdl.close()
59 : tharriso 1.6 if verb: sys.stdout.write("Done\n")
60 : tharriso 1.2 return emap
61 :    
62 : tharriso 1.6 def get_kegg_map(filename, verb):
63 : tharriso 1.2 kmap = {}
64 :     if filename is None:
65 :     return kmap
66 : tharriso 1.6 if verb: sys.stdout.write("Parsing file %s ... " %filename)
67 : tharriso 1.1 for text in kegg_iter(filename):
68 :     rec = get_kegg_rec(text)
69 :     if (not rec.name) or (not rec.description): continue
70 :     names = rec.name.split(', ')
71 :     if len(names) > 1:
72 : tharriso 1.2 kmap[ names[1] ] = rec.description
73 : tharriso 1.6 if verb: sys.stdout.write("Done\n")
74 : tharriso 1.2 return kmap
75 : tharriso 1.1
76 :     def kegg_iter(filename):
77 :     fhdl = file(filename, 'rb')
78 :     recs = ['']
79 :     for data in iter(lambda: fhdl.read(100000), ''):
80 :     recs = (recs[-1] + data).split("\n///\n")
81 :     for rec in recs[:-1]:
82 :     yield rec
83 :    
84 :     def get_kegg_rec(text):
85 :     record = SeqRecord(Seq("",IUPAC.protein), id='', name='', description='', dbxrefs=[], annotations={'organism':''})
86 :     entry_s = entry_re.search(text)
87 :     name_s = name_re.search(text)
88 :     func_s = func_re.search(text)
89 : tharriso 1.5 orth_s = orth_re.search(text)
90 : tharriso 1.1 dbref_s = dbref_re.search(text)
91 :     seq_s = seq_re.search(text)
92 :     if entry_s:
93 :     record.id = entry_s.group(1)
94 :     record.annotations['organism'] = entry_s.group(2).strip()
95 :     if name_s:
96 :     record.name = name_s.group(1).strip()
97 :     if func_s:
98 :     record.description = re.sub('\s+', ' ', func_s.group(1).strip())
99 : tharriso 1.5 if orth_s:
100 :     orths = {}
101 :     for o in filter( lambda y: y, map(lambda x: ko_re.search(x.strip()), orth_s.group(1).strip().split("\n")) ):
102 :     orths[ o.group(1) ] = o.group(2)
103 :     record.annotations['orthology'] = orths
104 : tharriso 1.1 if dbref_s:
105 :     parts = dbref_s.group(1).split()
106 :     if (len(parts) % 2) == 0:
107 :     for i in range(0, len(parts)-1, 2):
108 :     record.dbxrefs.append(parts[i].replace('NCBI-', '') + parts[i+1])
109 :     if seq_s:
110 :     record.seq = Seq( re.sub('\s+', '', seq_s.group(1)), IUPAC.protein )
111 :     return record
112 :    
113 : tharriso 1.9 def format_factory(out_files, source, form, org_desc, get_ont, get_ctg, get_tax):
114 :     (prot_f, ont_f, seq_f, ref_f, tax_f) = out_files
115 : tharriso 1.1 if form == 'genbank':
116 : tharriso 1.2 def parse_genbank(rec, amap):
117 : tharriso 1.1 if (not rec) or (not rec.id) or (not rec.seq): return
118 :     cid = rec.id
119 :     cdesc = rec.description.rstrip('.')
120 :     clen = len(rec.seq)
121 :     org = rec.annotations['organism']
122 : tharriso 1.3 # only one entry on rec (first feat is source and is not a feature), use first real feature, no contig data
123 :     if len(rec.features) == 2:
124 :     md5 = hashlib.md5(str(rec.seq)).hexdigest()
125 :     func = ''
126 :     if 'product' in rec.features[1].qualifiers:
127 :     func = rec.features[1].qualifiers['product'][0]
128 :     seq_f.write("%s\t%s\n" %(md5, rec.seq))
129 : tharriso 1.5 prot_f.write("\t".join([md5, rec.name, func, org, source]) + "\n")
130 : tharriso 1.9 if get_tax and ('taxonomy' in rec.annotations):
131 :     if rec.annotations['taxonomy'][0] == 'Root':
132 :     x = rec.annotations['taxonomy'].pop(0)
133 :     tax_f.write("%s\t%s;%s\n" %(rec.name, ";".join(rec.annotations['taxonomy']), org))
134 : tharriso 1.3 return
135 :     # multiple entries on rec, use CDS features, get contig data
136 : tharriso 1.1 for feat in rec.features:
137 :     if feat.type == 'CDS':
138 :     if ( (not feat.qualifiers) or (not feat.location) or
139 :     ('translation' not in feat.qualifiers) or
140 :     ('product' not in feat.qualifiers) ): continue
141 : tharriso 1.6 fid = ''
142 :     if 'protein_id' in feat.qualifiers:
143 :     fid = feat.qualifiers['protein_id'][0]
144 :     elif 'locus_tag' in feat.qualifiers:
145 :     fid = feat.qualifiers['locus_tag'][0]
146 :     else:
147 :     continue
148 : tharriso 1.1 seq = feat.qualifiers['translation'][0]
149 :     md5 = hashlib.md5(seq).hexdigest()
150 :     func = feat.qualifiers['product'][0]
151 :     beg = feat.location.start.position
152 :     end = feat.location.end.position
153 :     if feat.strand:
154 :     strd = feat.strand
155 :     else:
156 :     strd = 1
157 :     if (beg > end):
158 :     strd = -1
159 :     beg = feat.location.end.position
160 :     end = feat.location.start.position
161 : tharriso 1.5 # output GO ontology
162 :     if get_ont:
163 :     for gtype in go_types:
164 :     if gtype in feat.qualifiers:
165 :     for goid in feat.qualifiers[gtype]:
166 :     go_m = gb_go_re.match(goid)
167 :     if go_m:
168 : tharriso 1.6 desc = gb_go_rm.sub('', go_m.group(2))
169 : tharriso 1.7 ont_f.write("\t".join([md5, go_m.group(1), desc, gtype.replace('_',':')]) + "\n")
170 : tharriso 1.1 seq_f.write("%s\t%s\n" %(md5, seq))
171 : tharriso 1.8 if get_ctg:
172 :     prot_f.write("\t".join([md5, fid, func, org, source, str(beg), str(end), str(strd), cid, cdesc, str(clen)]) + "\n")
173 :     else:
174 :     prot_f.write("\t".join([md5, fid, func, org, source]) + "\n")
175 : tharriso 1.1 if ('db_xref' in feat.qualifiers) and (len(feat.qualifiers['db_xref']) > 0):
176 :     ref_f.write(fid + "\t" + "\t".join(feat.qualifiers['db_xref']) + "\n")
177 :     return parse_genbank
178 :    
179 :     elif form == 'swiss':
180 : tharriso 1.2 def parse_swiss(rec, amap):
181 : tharriso 1.1 if (not rec) or (not rec.id) or (not rec.seq): return
182 :     md5 = hashlib.md5(str(rec.seq)).hexdigest()
183 :     func = rec.description
184 :     srch = desc_re.search(func)
185 : tharriso 1.2 if srch: func = srch.group(2)
186 : tharriso 1.1 seq_f.write("%s\t%s\n" %(md5, str(rec.seq)))
187 : tharriso 1.5 prot_f.write("\t".join([md5, rec.id, func, rec.annotations['organism'], source]) + "\n")
188 : tharriso 1.1 if len(rec.dbxrefs) > 0:
189 :     ref_str = rec.id
190 :     for ref in rec.dbxrefs:
191 : tharriso 1.5 go_m = up_go_re.match(ref)
192 :     # output GO ontology
193 :     if go_m and get_ont:
194 : tharriso 1.7 ont_f.write("\t".join([md5, go_m.group(1), '', 'GO']) + "\n")
195 : tharriso 1.1 else:
196 :     ref_str += "\t" + ref
197 :     ref_f.write(ref_str + "\n")
198 :     return parse_swiss
199 :    
200 :     elif form == 'fasta':
201 : tharriso 1.2 def parse_fasta(rec, amap):
202 : tharriso 1.1 if (not rec) or (not rec.id) or (not rec.seq): return
203 : tharriso 1.2 desc = ''
204 : tharriso 1.6 org = ''
205 :     func = ''
206 : tharriso 1.2 hdrs = rec.description.split()
207 : tharriso 1.1 md5 = hashlib.md5(str(rec.seq)).hexdigest()
208 :     seq_f.write("%s\t%s\n" %(md5, str(rec.seq)))
209 : tharriso 1.2 if len(hdrs) > 1:
210 :     hdrs.pop(0)
211 :     desc = " ".join(hdrs)
212 :     if desc.startswith("("): desc = desc.strip(')(')
213 : tharriso 1.9 if desc.startswith("|"): desc = desc.strip('| ')
214 : tharriso 1.6 if org_desc: org = desc
215 :     else: func = desc
216 :     if get_ont and (rec.id in amap):
217 :     for f in amap[rec.id]:
218 :     prot_f.write("\t".join([md5, rec.id, f[1], org, source]) + "\n")
219 :     ogid_m = ogid_re.match(f[0])
220 :     if ogid_m:
221 : tharriso 1.7 ont_f.write("\t".join([md5, f[0], f[1], ogid_m.group(1)]) + "\n")
222 : tharriso 1.1 else:
223 : tharriso 1.6 prot_f.write("\t".join([md5, rec.id, func, org, source]) + "\n")
224 : tharriso 1.1 return parse_fasta
225 :    
226 :     elif form == 'kegg':
227 : tharriso 1.2 def parse_kegg(text, amap):
228 : tharriso 1.1 rec = get_kegg_rec(text)
229 :     if (not rec.id) or (not rec.description) or (not rec.seq): return
230 : tharriso 1.5 md5 = hashlib.md5(str(rec.seq)).hexdigest()
231 :     org = rec.annotations['organism']
232 :     orth = []
233 : tharriso 1.2 if org in amap:
234 :     org = amap[org]
235 : tharriso 1.5 # output KO ontology
236 :     if get_ont and ('orthology' in rec.annotations):
237 :     orth = rec.annotations['orthology']
238 :     for oid, odesc in orth.iteritems():
239 : tharriso 1.7 ont_f.write("\t".join([md5, oid, odesc, 'KO']) + "\n")
240 : tharriso 1.1 seq_f.write("%s\t%s\n" %(md5, str(rec.seq)))
241 : tharriso 1.5 prot_f.write("\t".join([md5, rec.id, rec.description, org, source]) + "\n")
242 : tharriso 1.1 if len(rec.dbxrefs) > 0:
243 :     ref_f.write(rec.id + "\t" + "\t".join(rec.dbxrefs) + "\n")
244 :     return parse_kegg
245 :    
246 :     else:
247 :     return None
248 :    
249 :    
250 :     usage = "usage: %prog [options] source input_file1 [input_file2 input_file3 ...]" + __doc__
251 :    
252 :     def main(args):
253 :     parser = OptionParser(usage=usage)
254 :     parser.add_option("-f", "--format", dest="format", metavar="FORMAT", default="fasta",
255 :     help="FORMAT inputed file is in. Must be one of following: genbank, kegg, swiss, fasta [default is fasta]")
256 : tharriso 1.2 parser.add_option("-k", "--kegg_map", dest="keggmap", metavar="FILE", default=None, help="Optional KEGG genome FILE for org name mapping")
257 :     parser.add_option("-e", "--eggnog_map", dest="eggnogmap", metavar="FILE", default=None, help="Optional eggNOG orthgroups tabed FILE for func mapping")
258 : tharriso 1.1 parser.add_option("-d", "--out_dir", dest="outdir", metavar="DIR", default=None, help="DIR to write output [default is current dir]")
259 :     parser.add_option("-o", "--org_header", dest="orgheader", action="store_true", default=False, help="For fasta files, header description is organism [default is function]")
260 : tharriso 1.5 parser.add_option("-g", "--get_ontology", dest="getont", action="store_true", default=False, help="Output ontology (id, type) for proteins with mapped ontology [default is off]")
261 : tharriso 1.8 parser.add_option("-c", "--get_contig", dest="getcontig", action="store_true", default=False, help="Output contig info for organism genbank files [default is off]")
262 : tharriso 1.9 parser.add_option("-t", "--get_tax", dest="gettax", action="store_true", default=False, help="Output taxonomy string for genbank files [default is off]")
263 : tharriso 1.1 parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Wordy [default is off]")
264 :    
265 :     (opts, args) = parser.parse_args()
266 :     if len(args) < 2:
267 :     parser.error("Incorrect number of arguments")
268 :     source = args[0]
269 :     files = args[1:]
270 :     outdir = ''
271 : tharriso 1.9 if opts.outdir:
272 :     outdir = opts.outdir
273 : tharriso 1.1
274 : tharriso 1.6 keggmap = get_kegg_map(opts.keggmap, opts.verbose)
275 :     eggnogmap = get_eggnog_map(opts.eggnogmap, opts.verbose)
276 : tharriso 1.2
277 : tharriso 1.9 o_files = []
278 :     o_hdls = []
279 :    
280 :     for e in file_ext:
281 :     o_files.append( os.path.join(outdir, source + e) )
282 :     for f in o_files:
283 :     o_hdls.append( open(f, 'w') )
284 : tharriso 1.1
285 : tharriso 1.9 parse_format = format_factory( o_hdls, source, opts.format, opts.orgheader, opts.getont, opts.getcontig, opts.gettax )
286 : tharriso 1.1 if not parse_format:
287 :     parser.error("Incorrect format entered")
288 :    
289 :     for f in files:
290 :     if opts.verbose:
291 :     sys.stdout.write("Parsing file %s ...\n" %f)
292 :     if (opts.format == 'kegg'):
293 :     for rec in kegg_iter(f):
294 : tharriso 1.2 parse_format(rec, keggmap)
295 : tharriso 1.1 else:
296 :     for rec in SeqIO.parse(f, opts.format):
297 : tharriso 1.2 parse_format(rec, eggnogmap)
298 : tharriso 1.1
299 : tharriso 1.9 for h in o_hdls:
300 :     h.close()
301 :     for f in o_files:
302 :     if os.path.isfile(f) and (os.path.getsize(f) == 0):
303 :     if opts.verbose:
304 :     sys.stdout.write("Removing empty file %s\n" %f)
305 :     os.remove(f)
306 :    
307 : tharriso 1.1 if opts.verbose:
308 :     sys.stdout.write("All Done\n")
309 :    
310 :     if __name__ == "__main__":
311 :     sys.exit(main(sys.argv))

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3