Parent Directory
|
Revision Log
Revision 1.1 - (view) (download) (as text)
1 : | olson | 1.1 | |
2 : | # package main; | ||
3 : | |||
4 : | # use Data::Dumper; | ||
5 : | # use strict; | ||
6 : | |||
7 : | # my $p = HtmlPageParser->new(); | ||
8 : | # my $ret = $p->parse_file(shift || die); | ||
9 : | # print "Done parsing: $ret\n"; | ||
10 : | |||
11 : | # $p->{head} =~ s/\r\n/\n/gm; | ||
12 : | # $p->{head} =~ s/\r/\n/gm; | ||
13 : | |||
14 : | # $p->{body} =~ s/\r\n/\n/gm; | ||
15 : | # $p->{body} =~ s/\r/\n/gm; | ||
16 : | |||
17 : | # #print "HEAD: $p->{head}\n"; | ||
18 : | # #print "BODY: $p->{body}\n"; | ||
19 : | |||
20 : | # my @maps = @{$p->{map_names}}; | ||
21 : | |||
22 : | # print "maps: @maps\n"; | ||
23 : | # for my $map (@maps) | ||
24 : | # { | ||
25 : | # print "$map:\n"; | ||
26 : | # print $p->{map}->{$map}, "\n"; | ||
27 : | |||
28 : | # } | ||
29 : | |||
30 : | package HtmlPageParser; | ||
31 : | |||
32 : | use strict; | ||
33 : | use Data::Dumper; | ||
34 : | use HTML::Parser (); | ||
35 : | |||
36 : | use base 'HTML::Parser'; | ||
37 : | |||
#
# Constructor. Builds the underlying HTML::Parser object (version 3 API)
# with each event type routed to the like-named method of this class, and
# puts the new parser into the initial 'start' state.
#
# Called as a class method with no further arguments; returns the parser
# object blessed into the invoking class.
#
sub new
{
    my $class = shift;

    #
    # Handler name and argspec for each event type, kept as an ordered
    # list so the options reach HTML::Parser in a fixed order.
    #
    my @handler_opts = (
        start_h   => ["start_handler",   "self,tagname,text,attr"],
        end_h     => ["end_handler",     "self,tagname,text,attr"],
        text_h    => ["text_handler",    "self,tagname,text"],
        default_h => ["default_handler", "self,text"],
    );

    my $parser = $class->SUPER::new(api_version => 3, @handler_opts);

    $parser->{state} = 'start';

    bless($parser, $class);
    return $parser;
}
51 : | |||
#
# Handler for start tags.
#
# Arguments: the parser object, the tag name, the original text of the
# tag, and a hash reference of the tag's attributes.
#
# Effects on the parser object:
#   - a <map> tag makes its name attribute the current map ({in_map});
#     the name is also appended to {map_names}, but only when it is a
#     usable (true) name, i.e. one for which text will be gathered;
#   - while a map is current, the tag text is appended to {map}->{name};
#   - in the 'head' or 'body' state the tag text is appended to {head}
#     or {body} respectively;
#   - in any other state, a <head> or <body> tag switches the state.
#
sub start_handler
{
    my($self, $tag, $txt, $attr) = @_;

    if ($tag eq 'map')
    {
        my $name = $attr->{name};
        $self->{in_map} = $name;

        #
        # Record the name under the same condition that is used below
        # for gathering, so that a map without a name attribute does
        # not leave an undefined entry in the list of map names.
        #
        push(@{$self->{map_names}}, $name) if $name;
    }

    if (my $map = $self->{in_map})
    {
        $self->{map}->{$map} .= $txt;
    }

    #
    # If we're gathering information from the <head> or <body> block,
    # just accumulate text. Because these tests come first, a <head> or
    # <body> tag seen while already gathering is accumulated like any
    # other tag and does not change the state.
    #
    if ($self->state eq 'head')
    {
        $self->{head} .= $txt;
    }
    elsif ($self->state eq 'body')
    {
        $self->{body} .= $txt;
    }
    #
    # Otherwise, if we see a <head> or <body>, start gathering. The
    # opening tag itself is not added to the gathered text.
    #
    elsif ($tag eq 'head')
    {
        $self->state('head');
    }
    elsif ($tag eq 'body')
    {
        $self->state('body');
    }
}
97 : | |||
#
# Handler for end tags.
#
# Arguments: the parser object, the tag name and the original text of
# the end tag (any further argument is ignored).
#
# While a map is current the tag text is appended to that map's text;
# a </map> then ends the current map. A </head> or </body> switches the
# state to 'none' without being gathered; any other end tag is appended
# to {head} or {body} when the state is 'head' or 'body'.
#
sub end_handler
{
    my($self, $tag, $txt) = @_;

    #
    # The closing </map> tag is still part of the map's text, so gather
    # first and only then leave the map.
    #
    my $open_map = $self->{in_map};
    $self->{map}->{$open_map} .= $txt if $open_map;
    delete $self->{in_map} if $tag eq 'map';

    if ($tag eq 'head' || $tag eq 'body')
    {
        #
        # Finished the head or the body: stop gathering.
        #
        $self->state('none');
    }
    else
    {
        my $section = $self->state;
        if ($section eq 'head' || $section eq 'body')
        {
            $self->{$section} .= $txt;
        }
    }
}
134 : | |||
#
# Handler for plain text between tags.
#
# Arguments: the parser object, a tag-name slot that is not used here,
# and the text itself.
#
# The text is appended to the current map's text when a map is current,
# and to {head} or {body} when the state is 'head' or 'body'.
#
sub text_handler
{
    my($self, undef, $txt) = @_;

    my $open_map = $self->{in_map};
    $self->{map}->{$open_map} .= $txt if $open_map;

    #
    # Append to whichever section matches the current state, if any.
    #
    for my $section (qw(head body))
    {
        next unless $self->state eq $section;
        $self->{$section} .= $txt;
        last;
    }
}
#
# Handler for every event that has no dedicated handler.
#
# Arguments: the parser object followed by the original text of the
# event. The constructor registers this handler with the argspec
# "self,text", so the text arrives as the second argument; unpacking it
# as a third argument (after a tag name) left it undefined and the text
# was silently dropped. The text is therefore taken from the LAST
# argument, which is correct for the registered two-argument form and
# also for a three-argument (self, tag, text) call.
#
# The text is appended to the current map's text when a map is current,
# and to {head} or {body} when the state is 'head' or 'body'.
#
sub default_handler
{
    my $self = shift;
    my $txt  = pop;

    if (my $map = $self->{in_map})
    {
        $self->{map}->{$map} .= $txt;
    }

    if ($self->state eq 'head')
    {
        $self->{head} .= $txt;
    }
    elsif ($self->state eq 'body')
    {
        $self->{body} .= $txt;
    }
}
176 : | |||
#
# Combined getter/setter for the parser state.
#
# With a defined argument the state is replaced by it and the PREVIOUS
# state is returned; without one (or with undef) the state is left alone
# and the current state is returned. Either way the value returned is
# the state as it was on entry.
#
sub state
{
    my $self = shift;
    my($s) = @_;

    my $on_entry = $self->{state};
    $self->{state} = $s if defined($s);

    return $on_entry;
}
193 : | |||
194 : | 1; |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |