[Bio] / KBaseTutorials / index_dbd_terms.pl Repository:
ViewVC logotype

View of /KBaseTutorials/index_dbd_terms.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Wed Jun 13 21:09:01 2012 UTC (7 years, 4 months ago) by olson
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +214 -22 lines
Update of the tutorial publication stuff.

use strict;
use Data::Dumper;
use File::Copy;
use Getopt::Long;
use XML::LibXML;
use HTML::TreeBuilder;

#/home/parrello/CdmiData/Published/KSaplingDBD.xml

@ARGV == 2 or die "Usage: $0 DBD-xml-file destination_dir < terms\n";

my $in_file = shift;
my $dest = shift;
my %terms;
my $doc = XML::LibXML->new->parse_file($in_file);
$doc or die "cannot parse $in_file\n";

while (<>) {
        chomp;
        $terms{$_} = 1;
}
for my $r ($doc->findnodes('//Relationships/Relationship'))
{
    my $n = $r->getAttribute("name");
    $terms{$n} = 1;
}

for my $e ($doc->findnodes('//Entities/Entity'))
{
    my $n = $e->getAttribute("name");
    $terms{$n} = 1;
}

my $term_ctr = 1;	

my $terms_re = join("|", map { my $x = quotemeta $_; "(?:$x)"  } sort { length($b) <=> length($a) }  keys %terms);

open(ORDER, "<", "ORDER") or die "cannot open ORDER: $!";

my @dirs = <ORDER>;
chomp @dirs;
close(ORDER);

copy("ORDER", "$dest/ORDER");

my @files;

for my $dir (@dirs) {
    unless(-d "$dest/$dir"){
	mkdir "$dest/$dir" or die "cannot make dir $dir: $!";
    }
    
    @files = find_files($dir);

    my %seen;
    for my $html (@files) {

	my $tree = HTML::TreeBuilder->new;
	$tree->parse_file("$dir/$html");
	$tree->objectify_text;

	my $title;
	my $cur_h2 = "Here";
	$tree->root->traverse([sub {
	    my($h) = @_;

	    if (!ref($h))
	    {
		return 0;
	    }
	    if ($h->tag eq 'h1')
	    {
		my $hx = $h->clone;
		$hx->deobjectify_text;
		$title = $hx->as_text;
		$hx->delete;
		return 0;
	    }
		
	    if ($h->tag eq 'h2')
	    {
		my $hx = $h->clone;
		$hx->deobjectify_text;
		$cur_h2 = $hx->as_text;
		$hx->delete;
		# print STDERR "got h2 $cur_h2\n";

		return 0;
	    }
	    elsif ($h->tag ne '~text')
	    {
		return 1;
	    }
	    
	    if ($h->parent->tag =~ /^(pre|h)/)
	    {
		return 0;
	    }
	    my $t = $h->attr("text");
	    # print STDERR "Process '$t'\n";

	    #
	    # We need to process names inside links specially. Here,
	    # we just add name= to the link instead of replacing the text.
	    #
	    my $inside_link;
	    if ($h->parent->tag eq 'a')
	    {
		$inside_link = 1;
	    }
	    
	    my @new;
	    my $last = 0;
	    while ($t =~ /\b($terms_re)\b/gc)
	    {
		my $term = $1;
		# print "'$1'\n";
		my $ms = $-[0];
		my $me = $+[0];
		my $xx = substr($t, $last, ($ms - $last));
		$last = $me;
		# print "$ms $me '$xx'\n";

		my $v = $term_ctr++;

		if ($inside_link)
		{
		    $h->parent->attr(name => "$term$v");
		}
		else
		{
		    my $ne = HTML::Element->new("~text", text => $xx);
		    push(@new, $ne);
		    my $ne = HTML::Element->new("a", name => "$term$v");
		    $ne->push_content($term);
		    push(@new, $ne);
		}
		print join("\t", $title, "$dir/$html", $cur_h2, $term, "$term$v"), "\n";
	    }
	    my $xx = substr($t, $last);
	    #print "'$xx'\n";
	    if (!$inside_link)
	    {
		if ($xx)
		{
		    my $ne = HTML::Element->new("~text", text => $xx);
		    push(@new, $ne);
		}
		$h->replace_with(@new);
	    }
	    return 0;
	}, undef]);

	if (0) {
	for my $h ($tree->look_down('_tag', '~text'))
	{
	    if ($h->parent->tag =~ /^(pre|h)/)
	    {
		next;
	    }
	    my $t = $h->attr("text");
	    # print STDERR "Process '$t'\n";

	    my $h2_txt = "Here";
	    my $h2 = $h->look_up('_tag', 'h2');
	    if ($h2)
	    {
		$h2 = $h2->clone;
		$h2->deobjectify_text;
		$h2_txt = $h2->as_text;
		#$h2->dump(\*STDERR);
		#print STDERR "got h2 txt $h2_txt\n";
	    }
	    else
	    {
		print STDERR "No h2 for $html ";
		$h->dump(\*STDERR);
	    }

	    my @new;
	    my $last = 0;
	    while ($t =~ /\b($terms_re)\b/gc)
	    {
		my $term = $1;
		# print "'$1'\n";
		my $ms = $-[0];
		my $me = $+[0];
		my $xx = substr($t, $last, ($ms - $last));
		$last = $me;
		# print "$ms $me '$xx'\n";

		my $ne = HTML::Element->new("~text", text => $xx);
		push(@new, $ne);
		my $v = $term_ctr++;
		my $ne = HTML::Element->new("a", name => "$term$v");
		$ne->push_content($term);
		push(@new, $ne);
		print join("\t", $title, "$dir/$html", $h2_txt, $term, "$term$v"), "\n";
	    }
	    my $xx = substr($t, $last);
	    #print "'$xx'\n";
	    if ($xx)
	    {
		my $ne = HTML::Element->new("~text", text => $xx);
		push(@new, $ne);
	    }
	    $h->replace_with(@new);
	}}
	$tree->deobjectify_text();

	open (T, ">", "$dest/$dir/$html") or die "cannot open $html: $!";

	my $body = $tree->look_down("_tag", "body");
	if ($body)
	{
	    for my $c ($body->content_list)
	    {
		if (ref($c))
		{
		    print T $c->as_HTML(undef, ' ');
		}
		else
		{
		    print T $c;
		}
	    }
	}
	else
	{
	    print T $tree->as_HTML(undef, '  ');
	}
	close(T);

	$seen{$html}++;

	next;
	my $h2 = "Here";
	my $title = get_title("$dir/$html");
	open(F, "<", "$dir/$html") or die "Cannot open $html: $!";
	open (T, ">", "$dest/$dir/$html") or die "cannot open $html: $!";
	while (<F>) {
	    my $x = $_;
	    if ($x =~ m,<h2>(.*)</h2>,) {
		$h2 = $1;
	    }
	    $x =~ s!(\w+)!if ($terms{$1}) {my $term = $1;
					   my $v= $term_ctr++;
					   print "$title\t$dir/$html\t$h2\t$term\t$term$v\n"; 
					   "<a name=\"$term".$v."\"> $term</a> "
					   } else {$1}!eg;
	    print T $x;
	}
    }
    opendir(D, $dir) or die "Cannot opendir $dir: $!";
    for my $f (grep { -f "$dir/$_" && ! $seen{$_} && !/~$/ && !/^\#/  && !/^\./ } readdir(D))
    {
	print STDERR "Copy $dir $f\n";
	copy("$dir/$f", "$dest/$dir/$f");
    }
}			


	

sub find_files {
# get titles too from publish_site
    my($dir) = @_;
    my @files;
    if (open(O, "<", "$dir/ORDER"))
    {
        @files = <O>;
        chomp @files;
        close(O);
    }
    else
    {
        opendir(D, $dir) or die "cannot opendir $dir: $!";
        @files = sort { $a cmp $b } grep { /\.html$/ && -f "$dir/$_" } readdir(D);
        closedir(D);
    }
    return @files;
}
sub get_title
{
    my($file) = @_;
    open(F, "<", $file) or die "Cannot read $file: $!";
    my $title;
    while (<F>)
    {   
        if (/<h1>(.*?)</)
        {   
            $title = $1;
            last;
        }
    }
    close(F);
    return $title;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3