[Bio] / FigKernelScripts / html_parser.py Repository:
ViewVC logotype

View of /FigKernelScripts/html_parser.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Mon Dec 5 18:56:37 2005 UTC (14 years, 3 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.1: +17 -0 lines
Add license words.

#!/usr/bin/python

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info or the Fellowship for Interpretation of
# Genomes at veronika or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

from FigKernelPackages.FIG2 import FIG
from HTMLParser import HTMLParser
import sys, string

# Module-level FIG handle, constructed once when this module is loaded.
# MyParser.get_role uses it to look up subsystems by name.
fig = FIG()

# create our own "parser"

class MyParser(HTMLParser):
    """Turn the <area> tags of an HTML image map into tab-separated records.

    Each <area> start tag is numbered in document order (A0, A1, ...) and
    its ``alt``, ``coords`` and ``href`` attributes are printed to standard
    output as tab-separated records that begin with that area id.  <img>
    tags have their attributes dumped; every other tag is ignored.
    """

    # Maps the entry type used in "kegg,<type>,<id>" hrefs to the database
    # name expected by the KEGG dbget URL.
    _KEGG_DATABASES = {"ec": "enzyme", "compound": "compound"}
    _KEGG_URL = "http://www.genome.ad.jp/dbget-bin/www_bget?%s+%s"

    # Shape reported for an <area> that carries no usable shape attribute;
    # the HTML specification treats a missing shape as a rectangle.
    _DEFAULT_SHAPE = "rect"

    def __init__(self, subsystem):
        """Create a parser.

        subsystem -- name of the subsystem used to expand role
                     abbreviations until a <map> tag replaces it.
        """
        # HTMLParser is an old-style class under Python 2, so its
        # initializer is called explicitly rather than through super().
        HTMLParser.__init__(self)
        self.area_num = 0
        self.subsystem = subsystem
        # Buffers read by tagprint(); nothing in this class fills them, but
        # they must exist so that tagprint() does not raise AttributeError.
        self.TAGDATA = []
        self.TAG2DATA = []

    def get_role(self, subsystem, abbrev):
        """Return the role registered for abbrev in the named subsystem.

        When the subsystem cannot be found, a "No Role for ..." placeholder
        string is returned instead.
        """
        sub = fig.get_subsystem(subsystem)
        if sub is not None:
            return sub.get_role_from_abbr(abbrev)
        return "No Role for %s:%s" % (subsystem, abbrev)

    def _emit(self, *fields):
        """Print one tab-separated output record."""
        print('\t'.join(fields))

    def _first_shape(self, attrs):
        """Return the normalized value of the first shape attribute.

        Falls back to _DEFAULT_SHAPE when no shape attribute has a value.
        """
        for name, value in attrs:
            if name.lower() == "shape" and value is not None:
                return value.strip().lower()
        return self._DEFAULT_SHAPE

    def _emit_alt(self, area, alt):
        """Print the records described by an "<kind>,<value>" alt text.

        Only the kinds "compound" and "role" produce output.  Text without
        a comma has no kind and is ignored.
        """
        # Split on the first comma only: the value part may itself contain
        # commas, as compound names such as "2,3-..." do.
        kind, comma, value = alt.partition(",")
        if not comma:
            return
        kind = kind.lower()
        if kind == "compound":
            self._emit(area, kind, value)
        elif kind == "role":
            full_role = self.get_role(self.subsystem, value.strip())
            self._emit(area, kind, full_role)
            # NOTE(review): unlike the other records this one has only two
            # tab-separated fields ("abbrev <value>" is a single field).
            # Kept as-is because the consumer's expectations are unknown.
            self._emit(area, "abbrev %s" % value)

    def _resolve_url(self, href):
        """Translate an href attribute value into the URL to report.

        "none..."             -> "none"
        "http..."             -> the href itself
        "kegg,ec,<id>"        -> KEGG enzyme lookup URL
        "kegg,compound,<id>"  -> KEGG compound lookup URL
        "kegg,<anything else>"-> the text after "kegg,"
        anything else         -> the href itself
        """
        prefix = href[0:4]
        if prefix == "none":
            return "none"
        if prefix.lower() == "http":
            return href

        head, comma, the_rest = href.partition(",")
        if not comma or head.lower() != "kegg":
            # Unrecognized form: report it unchanged rather than failing.
            return href

        fields = the_rest.split(",")
        if len(fields) == 2:
            database = self._KEGG_DATABASES.get(fields[0].strip().lower())
            if database is not None:
                return self._KEGG_URL % (database, fields[1].strip())
        return the_rest

    def handle_area(self, attrs):
        """Print the records for one <area> tag and advance the area count.

        attrs -- list of (name, value) pairs as passed to handle_starttag.
        """
        area = "A%d" % self.area_num
        # Attribute order is not fixed in HTML, so find the shape up front;
        # coords may legitimately appear before it.
        shape = self._first_shape(attrs)

        for name, value in attrs:
            key = name.lower()
            if key == "coords":
                self._emit(area, "shape", "%s:%s" % (shape, value))
            elif value is None:
                # Valueless attribute (e.g. a bare "href"): nothing to parse.
                continue
            elif key == "alt":
                self._emit_alt(area, value)
            elif key == "shape":
                shape = value.strip().lower()
            elif key == "href":
                # The link label is always "KEGG entry", whatever the target.
                self._emit(area, 'link',
                           '"KEGG entry",%s' % self._resolve_url(value))

        self.area_num += 1

    def handle_map(self, attrs):
        """Take the subsystem name from a <map> tag.

        NOTE(review): every attribute overwrites self.subsystem, so the
        value of the last attribute wins regardless of its name.  This
        presumably expects a <map> tag carrying a single name attribute;
        confirm against the input files before restricting it.
        """
        for _name, value in attrs:
            self.subsystem = value

    def handle_img(self, attrs):
        """Print an "IMG" marker followed by each attribute of the tag."""
        print("IMG")
        for name, value in attrs:
            print("%s : %s" % (name, value))

    def handle_unknown(self, tag, attrs):
        """Ignore a tag that has no dedicated handler."""
        return

    def handle_starttag(self, tag, attrs):
        """Route a start tag to its handler (HTMLParser override)."""
        handlers = {
            "area": self.handle_area,
            "map": self.handle_map,
            "img": self.handle_img,
        }
        handler = handlers.get(tag)
        if handler is None:
            self.handle_unknown(tag, attrs)
        else:
            handler(attrs)

    def handle_endtag(self, tag):
        """Ignore end tags (HTMLParser override)."""
        pass

    def handle_data(self, data):
        """Ignore text content (HTMLParser override)."""
        pass

    def tagprint(self):
        """Print the contents of the TAGDATA and TAG2DATA buffers."""
        # The doubled space after the colon reproduces the output of the
        # original two-item print statements.
        print("TAG :  %s" % (self.TAGDATA,))
        print("TAG2 :  %s" % "".join(self.TAG2DATA))


def main():
    """Parse the HTML file named on the command line with MyParser.

    Expects exactly one command-line argument, the path of the HTML file.
    With any other argument count a usage message is printed and the
    process exits with status 0.
    """
    if len(sys.argv) != 2:
        print("Usage: html_parser file")
        # NOTE(review): status 0 on a usage error is unusual; it is kept
        # because existing callers may rely on it.
        sys.exit(0)

    # The with-block guarantees the file is closed even if reading fails.
    with open(sys.argv[1], "r") as handle:
        html_text = handle.read()

    # "subsystem name" is only a placeholder; the parser replaces it when
    # it meets a <map> tag.
    parser = MyParser("subsystem name")
    parser.feed(html_text)
    parser.close()


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3