[Bio] / FigKernelScripts / sphinx_index_genome.pl Repository:
ViewVC logotype

View of /FigKernelScripts/sphinx_index_genome.pl

Parent Directory | Revision Log


Revision 1.8 - (download) (as text) (annotate)
Sat May 7 23:27:39 2011 UTC (9 years ago) by olson
Branch: MAIN
Changes since 1.7: +162 -62 lines
Doh. Check in the latest indexer, things will be happier then.

use Data::Dumper;

use SeedSearch;

use strict;
use Encode;
use FIG;
# Handle on the FIG data store; every indexing mode below queries it.
my $fig = FIG->new;

# Exactly one argument selects which index to build.
@ARGV == 1 or die "Usage: sphinx_index_genome function|genome|attrib|subsystem\n";

my $which = shift;

#
# Per-mode schema description.
#   fields - field names declared in the schema header, in output order.
#   string - the subset of those fields declared with attr="string".
#
my %schema_for = (
    function  => { fields => [qw(annotation fid genome)],
                   string => [qw(annotation fid genome)] },
    genome    => { fields => [qw(genome name taxonomy)],
                   string => [qw(genome name)] },
    attrib    => { fields => [qw(genome alias subsystem annotation fid)],
                   string => [qw(annotation)] },
    subsystem => { fields => [qw(subsystem curator version classification)],
                   string => [qw(subsystem curator version classification)] },
);

my $schema = $schema_for{$which} or die "Unknown type $which\n";

# @fields drives both the schema header and index_subsystems();
# %attr_types holds the extra attribute text for a field's declaration
# (fields without an entry are declared with no extra attribute).
my @fields = @{$schema->{fields}};
my %attr_types = map { $_ => 'attr="string"' } @{$schema->{string}};

# Open the docset and declare the schema: one <sphinx:field> element per
# entry in @fields, in order.
print qq(<?xml version="1.0" encoding="utf-8"?>\n),
      "<sphinx:docset>\n",
      "<sphinx:schema>\n";
for my $field (@fields)
{
    # A field with no %attr_types entry gets an empty attribute slot, which
    # leaves a single space before the closing "/>".
    my $attr = defined($attr_types{$field}) ? $attr_types{$field} : "";
    print qq(<sphinx:field name="$field" $attr/>\n);
}
print "</sphinx:schema>\n";

#
# The subsystem index does not use the genome list, so build it first and
# avoid enumerating genomes at all in that mode.
#
if ($which eq 'subsystem')
{
    index_subsystems();
    print "</sphinx:docset>\n";
    exit;
}

#
# Genomes to index: the comma-separated list in the SPHINX_INDEX_ONLY
# environment variable when it is set, otherwise whatever
# $fig->genomes(1) returns.
#
my @genomes;
if (my $glist = $ENV{SPHINX_INDEX_ONLY})
{
    # Tolerate whitespace around the commas and ignore empty entries, so
    # that "a, b," yields ("a", "b") rather than ids with stray blanks.
    (my $trimmed = $glist) =~ s/^\s+|\s+$//g;
    @genomes = grep { length } split(/\s*,\s*/, $trimmed);
}
else
{
    @genomes = $fig->genomes(1);
}

if ($which eq 'genome')
{
    index_genomes(\@genomes);
    print "</sphinx:docset>\n";
    exit;
}


#
# Emit one document per feature for the 'function' and 'attrib' indexes.
# The 'subsystem' and 'genome' modes have already been handled and exited
# by the time execution reaches this point.
#

for my $genome (@genomes)
{
    # Progress indicator on STDERR; STDOUT carries the XML stream.
    print STDERR "$genome\n";

    # Escaped result of genus_species; used by both document layouts.
    my $gs = escape($fig->genus_species($genome));

    if ($which eq 'function')
    {
        my @fids = $fig->all_features($genome);
        my $fns = $fig->function_of_bulk(\@fids);

        for my $fid (keys %$fns)
        {
            my $fn = escape($fns->{$fid});
            my $docid = SeedSearch::fid_to_docid($fid);
            # NOTE(review): the 'function' schema declares a fid field, but
            # no <fid> element is written here -- confirm this is intended.
            print <<END;
<sphinx:document id="$docid">
<genome>$gs</genome>
<annotation>$fn</annotation>
</sphinx:document>
END
        }
    }
    else
    {
        #
        # Collect protein => { subsystem => 1 } for this genome's pegs,
        # keeping only subsystem_index rows for which the LEFT JOIN to
        # aux_roles leaves a.subsystem NULL.  The LIKE pattern is passed
        # as a bind value rather than interpolated into the SQL text.
        #
        my %ss_info;

        my $sth = $fig->db_handle->{_dbh}->prepare(qq(SELECT i.protein, i.subsystem
                                                      FROM subsystem_index i LEFT JOIN aux_roles a ON i.role = a.role
                                                      WHERE i.protein LIKE ? AND a.subsystem IS NULL),
                                                   { mysql_use_result => 1});
        $sth->execute("fig|$genome.peg.%");
        while (my $ent = $sth->fetchrow_arrayref())
        {
            my($prot, $ss) = @$ent;
            $ss_info{$prot}->{$ss} = 1;
        }

        my $all_data = $fig->all_features_detailed_fast($genome);

        # NOTE(review): $genome is interpolated into this statement; if the
        # SQL() helper accepts bind values, prefer them here as well.
        my $ext_aliases_l = $fig->db_handle->SQL(qq(SELECT id, alias
                                                    FROM ext_alias
                                                    WHERE id like 'fig|${genome}.%'));
        # feature id => { alias => count }
        my %ext_aliases;
        for my $row (@$ext_aliases_l)
        {
            $ext_aliases{$row->[0]}->{$row->[1]}++;
        }

        for my $feature (@$all_data)
        {
            # Only the id (column 0), the comma-separated alias string
            # (column 2) and the function (column 6) are used.
            my($fid, $aliases, $func) = @$feature[0, 2, 6];

            # Subsystem names, with underscores shown as spaces.  escape()
            # performs the UTF-8 encoding, so the names must not be encoded
            # beforehand (that would encode them twice).
            my @ss = keys %{ $ss_info{$fid} || {} };
            tr/_/ / for @ss;
            my $ss = escape(join("\n", @ss));

            # escape() maps an undefined function to the empty string.
            my $efunc = escape($func);
            my $efid = escape($fid);

            # Union of the feature's own aliases and its ext_alias rows.
            my %aliases = map { $_ => 1 } split(",", defined($aliases) ? $aliases : "");
            $aliases{$_} = 1 for keys %{ $ext_aliases{$fid} || {} };

            # Escape exactly once; escaping the individual aliases first
            # and then the joined text would turn "&" into "&amp;amp;".
            my $alias_txt = escape(join("\n", keys %aliases));

            my $docid = SeedSearch::fid_to_docid($fid);
            print <<END;
<sphinx:document id="$docid">
<genome>$genome $gs</genome>
<fid>$efid</fid>
<annotation>$efunc</annotation>
<alias>$alias_txt</alias>
<subsystem>$ss</subsystem>
</sphinx:document>
END
        }
    }
}
print "</sphinx:docset>\n";

#
# Write one document per genome id in the array reference passed in.
#
# Document ids are assigned sequentially from 1 in list order.  Each
# document carries the genome id as given, plus the escaped results of
# $fig->genus_species and $fig->taxonomy_of for that id.
#
sub index_genomes
{
    my($genomes) = @_;

    my $docid = 0;
    for my $gid (@$genomes)
    {
        $docid++;
        my $name = escape($fig->genus_species($gid));
        my $lineage = escape($fig->taxonomy_of($gid));

        # The trailing empty element supplies the final newline.
        print join("\n",
                   qq(<sphinx:document id="$docid">),
                   "<genome>$gid</genome>",
                   "<name>$name</name>",
                   "<taxonomy>$lineage</taxonomy>",
                   "</sphinx:document>",
                   "");
    }
}

#
# Write one document per entry returned by $fig->all_subsystems_detailed(),
# numbering the documents from 1.
#
# Underscores in an entry's subsystem value are replaced by spaces (the
# entry is modified in place), then every field named in the file-level
# @fields list is written as an element holding the escaped value.
#
sub index_subsystems
{
    my $docid = 0;
    for my $entry ($fig->all_subsystems_detailed())
    {
        $docid++;
        $entry->{subsystem} =~ s/_/ /g;

        my $body = join("",
                        map { "<$_>" . escape($entry->{$_}) . "</$_>\n" } @fields);

        print qq(<sphinx:document id="$docid">\n), $body, "</sphinx:document>\n";
    }
}

#
# Make a value safe to place inside an XML element of the output stream.
#
# Takes a single scalar.  Returns "" when it is undefined.  Otherwise the
# value is passed through encode_utf8, control characters that XML 1.0
# does not allow (everything below 0x20 except tab, LF and CR) are removed,
# and &, < and > are replaced by their entity references.
#
sub escape
{
    my($s) = @_;
    return "" unless defined($s);
    $s = encode_utf8($s);

    # One forbidden control character is enough to make the whole document
    # stream ill-formed, so drop them rather than pass them through.
    $s =~ tr/\x00-\x08\x0B\x0C\x0E-\x1F//d;

    # "&" has to be handled first so that the entities introduced for
    # "<" and ">" are not themselves escaped.
    $s =~ s/&/&amp;/g;
    $s =~ s/</&lt;/g;
    $s =~ s/>/&gt;/g;
    return $s;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3