Parent Directory
|
Revision Log
Update of the tutorial publication stuff.
use strict; use Data::Dumper; use File::Copy; use Getopt::Long; use XML::LibXML; use HTML::TreeBuilder; #/home/parrello/CdmiData/Published/KSaplingDBD.xml @ARGV == 2 or die "Usage: $0 DBD-xml-file destination_dir < terms\n"; my $in_file = shift; my $dest = shift; my %terms; my $doc = XML::LibXML->new->parse_file($in_file); $doc or die "cannot parse $in_file\n"; while (<>) { chomp; $terms{$_} = 1; } for my $r ($doc->findnodes('//Relationships/Relationship')) { my $n = $r->getAttribute("name"); $terms{$n} = 1; } for my $e ($doc->findnodes('//Entities/Entity')) { my $n = $e->getAttribute("name"); $terms{$n} = 1; } my $term_ctr = 1; my $terms_re = join("|", map { my $x = quotemeta $_; "(?:$x)" } sort { length($b) <=> length($a) } keys %terms); open(ORDER, "<", "ORDER") or die "cannot open ORDER: $!"; my @dirs = <ORDER>; chomp @dirs; close(ORDER); copy("ORDER", "$dest/ORDER"); my @files; for my $dir (@dirs) { unless(-d "$dest/$dir"){ mkdir "$dest/$dir" or die "cannot make dir $dir: $!"; } @files = find_files($dir); my %seen; for my $html (@files) { my $tree = HTML::TreeBuilder->new; $tree->parse_file("$dir/$html"); $tree->objectify_text; my $title; my $cur_h2 = "Here"; $tree->root->traverse([sub { my($h) = @_; if (!ref($h)) { return 0; } if ($h->tag eq 'h1') { my $hx = $h->clone; $hx->deobjectify_text; $title = $hx->as_text; $hx->delete; return 0; } if ($h->tag eq 'h2') { my $hx = $h->clone; $hx->deobjectify_text; $cur_h2 = $hx->as_text; $hx->delete; # print STDERR "got h2 $cur_h2\n"; return 0; } elsif ($h->tag ne '~text') { return 1; } if ($h->parent->tag =~ /^(pre|h)/) { return 0; } my $t = $h->attr("text"); # print STDERR "Process '$t'\n"; # # We need to process names inside links specially. Here, # we just add name= to the link instead of replacing the text. # my $inside_link; if ($h->parent->tag eq 'a') { $inside_link = 1; } my @new; my $last = 0; while ($t =~ /\b($terms_re)\b/gc) { my $term = $1; # print "'$1'\n"; my $ms = $-[0]; my $me = $+[0]; my $xx = substr($t, $last, ($ms - $last)); $last = $me; # print "$ms $me '$xx'\n"; my $v = $term_ctr++; if ($inside_link) { $h->parent->attr(name => "$term$v"); } else { my $ne = HTML::Element->new("~text", text => $xx); push(@new, $ne); my $ne = HTML::Element->new("a", name => "$term$v"); $ne->push_content($term); push(@new, $ne); } print join("\t", $title, "$dir/$html", $cur_h2, $term, "$term$v"), "\n"; } my $xx = substr($t, $last); #print "'$xx'\n"; if (!$inside_link) { if ($xx) { my $ne = HTML::Element->new("~text", text => $xx); push(@new, $ne); } $h->replace_with(@new); } return 0; }, undef]); if (0) { for my $h ($tree->look_down('_tag', '~text')) { if ($h->parent->tag =~ /^(pre|h)/) { next; } my $t = $h->attr("text"); # print STDERR "Process '$t'\n"; my $h2_txt = "Here"; my $h2 = $h->look_up('_tag', 'h2'); if ($h2) { $h2 = $h2->clone; $h2->deobjectify_text; $h2_txt = $h2->as_text; #$h2->dump(\*STDERR); #print STDERR "got h2 txt $h2_txt\n"; } else { print STDERR "No h2 for $html "; $h->dump(\*STDERR); } my @new; my $last = 0; while ($t =~ /\b($terms_re)\b/gc) { my $term = $1; # print "'$1'\n"; my $ms = $-[0]; my $me = $+[0]; my $xx = substr($t, $last, ($ms - $last)); $last = $me; # print "$ms $me '$xx'\n"; my $ne = HTML::Element->new("~text", text => $xx); push(@new, $ne); my $v = $term_ctr++; my $ne = HTML::Element->new("a", name => "$term$v"); $ne->push_content($term); push(@new, $ne); print join("\t", $title, "$dir/$html", $h2_txt, $term, "$term$v"), "\n"; } my $xx = substr($t, $last); #print "'$xx'\n"; if ($xx) { my $ne = HTML::Element->new("~text", text => $xx); push(@new, $ne); } $h->replace_with(@new); }} $tree->deobjectify_text(); open (T, ">", "$dest/$dir/$html") or die "cannot open $html: $!"; my $body = $tree->look_down("_tag", "body"); if ($body) { for my $c ($body->content_list) { if (ref($c)) { print T $c->as_HTML(undef, ' '); } else { print T $c; } } } else { print T $tree->as_HTML(undef, ' '); } close(T); $seen{$html}++; next; my $h2 = "Here"; my $title = get_title("$dir/$html"); open(F, "<", "$dir/$html") or die "Cannot open $html: $!"; open (T, ">", "$dest/$dir/$html") or die "cannot open $html: $!"; while (<F>) { my $x = $_; if ($x =~ m,<h2>(.*)</h2>,) { $h2 = $1; } $x =~ s!(\w+)!if ($terms{$1}) {my $term = $1; my $v= $term_ctr++; print "$title\t$dir/$html\t$h2\t$term\t$term$v\n"; "<a name=\"$term".$v."\"> $term</a> " } else {$1}!eg; print T $x; } } opendir(D, $dir) or die "Cannot opendir $dir: $!"; for my $f (grep { -f "$dir/$_" && ! $seen{$_} && !/~$/ && !/^\#/ && !/^\./ } readdir(D)) { print STDERR "Copy $dir $f\n"; copy("$dir/$f", "$dest/$dir/$f"); } } sub find_files { # get titles too from publish_site my($dir) = @_; my @files; if (open(O, "<", "$dir/ORDER")) { @files = <O>; chomp @files; close(O); } else { opendir(D, $dir) or die "cannot opendir $dir: $!"; @files = sort { $a cmp $b } grep { /\.html$/ && -f "$dir/$_" } readdir(D); closedir(D); } return @files; } sub get_title { my($file) = @_; open(F, "<", $file) or die "Cannot read $file: $!"; my $title; while (<F>) { if (/<h1>(.*?)</) { $title = $1; last; } } close(F); return $title; }
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |