[Bio] / FigKernelScripts / html_parser.py Repository:
ViewVC logotype

Annotation of /FigKernelScripts/html_parser.py

Parent Directory | Revision Log


Revision 1.2 - (view) (download) (as text)

1 : disz 1.1 #!/usr/bin/python
2 : olson 1.2 #
3 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     #
18 :    
19 : disz 1.1 from FigKernelPackages.FIG2 import FIG
20 :     from HTMLParser import HTMLParser
21 :     import sys, string
22 :    
23 :     fig = FIG()
24 :    
25 :     # create our own "parser"
class MyParser(HTMLParser):
    """HTML image-map parser that reports each <area> as tab-separated lines.

    Every line written to standard output has the form
    ``<area id>\t<record type>\t<payload>``, where the area id is ``A0``,
    ``A1``, ... in document order.  The record types emitted are
    ``compound``, ``role``, ``shape`` and ``link``; a ``role`` record is
    followed by a two-column ``abbrev`` line.

    All ``print`` calls use the parenthesised single-argument form, which
    produces the same output as the Python 2 print statement.
    """

    # "kegg,<type>,<id>" hrefs are rewritten to a KEGG dbget URL; this maps
    # the <type> field to the database name used in that URL.
    _KEGG_DATABASES = {"ec": "enzyme", "compound": "compound"}
    _KEGG_URL = "http://www.genome.ad.jp/dbget-bin/www_bget?%s+%s"

    def __init__(self, subsystem):
        """Set up the base parser and this parser's own state.

        subsystem -- subsystem name used for role lookups until a <map>
                     tag replaces it (see handle_map).
        """
        HTMLParser.__init__(self)
        self.area_num = 0
        self.subsystem = subsystem
        # tagprint() reads these two lists; they were never created before,
        # so calling tagprint() raised AttributeError.
        self.TAGDATA = []
        self.TAG2DATA = []

    def get_role(self, subsystem, abbrev):
        """Return the role for ``abbrev`` in ``subsystem``.

        The lookup goes through the module-level ``fig`` object.  When
        ``fig.get_subsystem`` returns None, a "No Role for ..." placeholder
        string is returned instead.
        """
        sub = fig.get_subsystem(subsystem)
        if sub is None:
            return "No Role for %s:%s" % (subsystem, abbrev)
        return sub.get_role_from_abbr(abbrev)

    def handle_area(self, attrs):
        """Print the records for one <area> tag and advance the area counter.

        attrs -- list of (name, value) pairs as supplied by HTMLParser;
                 a value is None for an attribute written without "=".
        """
        area = "A%d" % self.area_num

        # Find the shape before walking the attributes so that the "shape"
        # record does not depend on "shape" appearing before "coords".
        # An area without a shape attribute is a rectangle in HTML.
        shape = "rect"
        for name, value in attrs:
            if name.lower() == "shape" and value:
                shape = value.strip().lower()

        for name, value in attrs:
            name = name.lower()
            value = value or ""  # valueless attribute -> empty string
            if name == "alt":
                self._report_alt(area, value)
            elif name == "coords":
                print('\t'.join((area, "shape", "%s:%s" % (shape, value))))
            elif name == "href":
                url = self._link_url(value)
                if url is not None:
                    print('\t'.join((area, 'link', '"KEGG entry",%s' % url)))

        self.area_num += 1

    def _report_alt(self, area, alt):
        """Print the compound/role records encoded as "<kind>,<value>"."""
        # Split on the first comma only: the value itself may contain
        # commas.  Text without any comma is not in this format; skip it.
        parts = alt.split(",", 1)
        if len(parts) != 2:
            return
        kind = parts[0].lower()
        value = parts[1]
        if kind == "compound":
            print('\t'.join((area, kind, value)))
        elif kind == "role":
            full_role = self.get_role(self.subsystem, value.strip())
            print('\t'.join((area, kind, full_role)))
            # NOTE(review): this line has two columns ("abbrev <value>" is
            # one field), unlike the other records -- kept as is; confirm
            # against whatever consumes this output.
            print('\t'.join((area, "abbrev %s" % value)))

    def _link_url(self, href):
        """Return the URL to report for ``href``, or None for an empty href.

        "none..." maps to "none", "http..." is passed through, and
        "kegg,ec,<id>" / "kegg,compound,<id>" become KEGG dbget URLs.  Any
        other "kegg,<rest>" yields <rest>; every other value is returned
        unchanged rather than failing.
        """
        if not href:
            return None
        prefix = href[0:4]
        if prefix == "none":
            return "none"
        if prefix.lower() == "http":
            return href

        parts = href.split(",", 1)
        if len(parts) != 2 or parts[0].lower() != "kegg":
            return href

        entry = parts[1]
        fields = entry.split(",")
        if len(fields) == 2:
            database = self._KEGG_DATABASES.get(fields[0].strip().lower())
            if database is not None:
                return self._KEGG_URL % (database, fields[1].strip())
        return entry

    def handle_map(self, attrs):
        """Replace the current subsystem with the <map> tag's attribute value.

        Every attribute value is assigned in turn, so the last attribute
        wins.
        """
        # NOTE(review): presumably only the "name" attribute is meant here;
        # confirm before restricting it.
        for _name, value in attrs:
            self.subsystem = value

    def handle_img(self, attrs):
        """Print "IMG" followed by one "name : value" line per attribute."""
        print("IMG")
        for name, value in attrs:
            print("%s : %s" % (name, value))

    def handle_unknown(self, tag, attrs):
        """Ignore any tag other than area, map and img."""
        return

    def handle_starttag(self, tag, attrs):
        """Route a start tag to its handler (HTMLParser override)."""
        handlers = {
            "area": self.handle_area,
            "map": self.handle_map,
            "img": self.handle_img,
        }
        handler = handlers.get(tag)
        if handler is None:
            self.handle_unknown(tag, attrs)
        else:
            handler(attrs)

    def handle_endtag(self, tag):
        """Ignore end tags (HTMLParser override)."""
        pass

    def handle_data(self, data):
        """Ignore text between tags (HTMLParser override)."""
        pass

    def tagprint(self):
        """Print the TAGDATA list and the joined TAG2DATA strings."""
        print("TAG :  %s" % (self.TAGDATA,))
        print("TAG2 :  %s" % "".join(self.TAG2DATA))
def main():
    """Parse the HTML file named on the command line with MyParser.

    Expects exactly one argument, the path of the file to read.  With any
    other argument count a usage line is printed and the process exits.
    The parser writes its records to standard output as it is fed.
    """
    if len(sys.argv) != 2:
        print("Usage: html_parser file")
        # NOTE(review): exit status 0 on a usage error is kept from the
        # original; confirm no caller relies on it before changing it.
        sys.exit(0)

    # Close the input file even if reading fails; it used to be left open.
    source = open(sys.argv[1], "r")
    try:
        markup = source.read()
    finally:
        source.close()

    # The placeholder name is replaced when a <map> tag is encountered.
    parser = MyParser("subsystem name")
    parser.feed(markup)
    parser.close()


if __name__ == "__main__":
    main()
148 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3