Parent Directory
|
Revision Log
Revision 1.2 - (view) (download) (as text)
1 : | olson | 1.2 | # |
2 : | # Copyright (c) 2003-2006 University of Chicago and Fellowship | ||
3 : | # for Interpretations of Genomes. All Rights Reserved. | ||
4 : | # | ||
5 : | # This file is part of the SEED Toolkit. | ||
6 : | # | ||
7 : | # The SEED Toolkit is free software. You can redistribute | ||
8 : | # it and/or modify it under the terms of the SEED Toolkit | ||
9 : | # Public License. | ||
10 : | # | ||
11 : | # You should have received a copy of the SEED Toolkit Public License | ||
12 : | # along with this program; if not write to the University of Chicago | ||
13 : | # at info@ci.uchicago.edu or the Fellowship for Interpretation of | ||
14 : | # Genomes at veronika@thefig.info or download a copy from | ||
15 : | # http://www.theseed.org/LICENSE.TXT. | ||
16 : | # | ||
17 : | |||
18 : | olson | 1.1 | |
19 : | # package main; | ||
20 : | |||
21 : | # use Data::Dumper; | ||
22 : | # use strict; | ||
23 : | |||
24 : | # my $p = HtmlSplitter->new(); | ||
25 : | # my $ret = $p->parse_file(shift || die); | ||
26 : | # print "Done parsing: $ret\n"; | ||
27 : | |||
28 : | # $p->{head} =~ s/\r\n/\n/gm; | ||
29 : | # $p->{head} =~ s/\r/\n/gm; | ||
30 : | |||
31 : | # $p->{body} =~ s/\r\n/\n/gm; | ||
32 : | # $p->{body} =~ s/\r/\n/gm; | ||
33 : | |||
34 : | # #print "HEAD: $p->{head}\n"; | ||
35 : | # #print "BODY: $p->{body}\n"; | ||
36 : | |||
37 : | # my @maps = @{$p->{map_names}}; | ||
38 : | |||
39 : | # print "maps: @maps\n"; | ||
40 : | # for my $map (@maps) | ||
41 : | # { | ||
42 : | # print "$map:\n"; | ||
43 : | # print $p->{map}->{$map}, "\n"; | ||
44 : | |||
45 : | # } | ||
46 : | |||
47 : | package HtmlPageParser; | ||
48 : | |||
49 : | use strict; | ||
50 : | use Data::Dumper; | ||
51 : | use HTML::Parser (); | ||
52 : | |||
53 : | use base 'HTML::Parser'; | ||
54 : | |||
55 : | sub new | ||
56 : | { | ||
57 : | my($class) = @_; | ||
58 : | |||
59 : | my $self = $class->SUPER::new(api_version => 3, | ||
60 : | start_h => ["start_handler", "self,tagname,text,attr"], | ||
61 : | end_h => ["end_handler", "self,tagname,text,attr"], | ||
62 : | text_h => ["text_handler", "self,tagname,text"], | ||
63 : | default_h => ["default_handler", "self,text"]); | ||
64 : | |||
65 : | $self->{state} = 'start'; | ||
66 : | return bless($self, $class); | ||
67 : | } | ||
68 : | |||
69 : | sub start_handler | ||
70 : | { | ||
71 : | my($self, $tag, $txt, $attr) = @_; | ||
72 : | |||
73 : | # print "Start tag=$tag txt=$txt state=" . $self->state . "\n"; | ||
74 : | |||
75 : | if ($tag eq 'map') | ||
76 : | { | ||
77 : | my $name = $attr->{name}; | ||
78 : | $self->{in_map} = $name; | ||
79 : | push(@{$self->{map_names}}, $name); | ||
80 : | } | ||
81 : | elsif ($tag eq 'img') | ||
82 : | { | ||
83 : | my $src = $attr->{src}; | ||
84 : | } | ||
85 : | |||
86 : | if (my $map = $self->{in_map}) | ||
87 : | { | ||
88 : | $self->{map}->{$map} .= $txt; | ||
89 : | } | ||
90 : | |||
91 : | # | ||
92 : | # If we're gathering information from the <HEAD> block, just accumulate text. | ||
93 : | # | ||
94 : | if ($self->state eq 'head') | ||
95 : | { | ||
96 : | $self->{head} .= $txt; | ||
97 : | } | ||
98 : | elsif ($self->state eq 'body') | ||
99 : | { | ||
100 : | $self->{body} .= $txt; | ||
101 : | } | ||
102 : | # | ||
103 : | # Otherwise, if we see a <head>, start gathering | ||
104 : | # | ||
105 : | elsif ($tag eq 'head') | ||
106 : | { | ||
107 : | $self->state('head'); | ||
108 : | } | ||
109 : | elsif ($tag eq 'body') | ||
110 : | { | ||
111 : | $self->state('body'); | ||
112 : | } | ||
113 : | } | ||
114 : | |||
115 : | sub end_handler | ||
116 : | { | ||
117 : | my($self, $tag, $txt, $attr) = @_; | ||
118 : | |||
119 : | if (my $map = $self->{in_map}) | ||
120 : | { | ||
121 : | $self->{map}->{$map} .= $txt; | ||
122 : | } | ||
123 : | |||
124 : | if ($tag eq 'map') | ||
125 : | { | ||
126 : | delete $self->{in_map}; | ||
127 : | } | ||
128 : | |||
129 : | |||
130 : | # | ||
131 : | # If we've finished the head, switch out of head state. | ||
132 : | # | ||
133 : | if ($tag eq 'head') | ||
134 : | { | ||
135 : | $self->state('none'); | ||
136 : | } | ||
137 : | elsif ($tag eq 'body') | ||
138 : | { | ||
139 : | $self->state('none'); | ||
140 : | } | ||
141 : | elsif ($self->state eq 'head') | ||
142 : | { | ||
143 : | $self->{head} .= $txt; | ||
144 : | } | ||
145 : | elsif ($self->state eq 'body') | ||
146 : | { | ||
147 : | $self->{body} .= $txt; | ||
148 : | } | ||
149 : | |||
150 : | } | ||
151 : | |||
152 : | sub text_handler | ||
153 : | { | ||
154 : | my($self, $tag, $txt) = @_; | ||
155 : | |||
156 : | # print "txt tag=$tag txt='$txt'\n"; | ||
157 : | |||
158 : | if (my $map = $self->{in_map}) | ||
159 : | { | ||
160 : | $self->{map}->{$map} .= $txt; | ||
161 : | } | ||
162 : | |||
163 : | if ($self->state eq 'head') | ||
164 : | { | ||
165 : | $self->{head} .= $txt; | ||
166 : | } | ||
167 : | elsif ($self->state eq 'body') | ||
168 : | { | ||
169 : | $self->{body} .= $txt; | ||
170 : | } | ||
171 : | |||
172 : | } | ||
173 : | sub default_handler | ||
174 : | { | ||
175 : | my($self, $tag, $txt) = @_; | ||
176 : | |||
177 : | # print "def tag=$tag txt='$txt'\n"; | ||
178 : | |||
179 : | if (my $map = $self->{in_map}) | ||
180 : | { | ||
181 : | $self->{map}->{$map} .= $txt; | ||
182 : | } | ||
183 : | |||
184 : | if ($self->state eq 'head') | ||
185 : | { | ||
186 : | $self->{head} .= $txt; | ||
187 : | } | ||
188 : | elsif ($self->state eq 'body') | ||
189 : | { | ||
190 : | $self->{body} .= $txt; | ||
191 : | } | ||
192 : | } | ||
193 : | |||
194 : | sub state | ||
195 : | { | ||
196 : | my($self, $s) = @_; | ||
197 : | |||
198 : | if (defined($s)) | ||
199 : | { | ||
200 : | # cluck "set state to $s"; | ||
201 : | my $old = $self->{state}; | ||
202 : | $self->{state} = $s; | ||
203 : | return $old; | ||
204 : | } | ||
205 : | else | ||
206 : | { | ||
207 : | return $self->{state}; | ||
208 : | } | ||
209 : | } | ||
210 : | |||
211 : | 1; |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |