[Bio] / FortyEightMeta / mg_load_sims_file.pl Repository:
ViewVC logotype

View of /FortyEightMeta/mg_load_sims_file.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Wed Jun 11 21:16:42 2008 UTC (11 years, 7 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_rel_2008_0919, mgrast_rel_2008_1110, mgrast_rel_2008_0917
Changes since 1.1: +87 -8 lines
Commit of initial MGRAST2 code.

#!/usr/bin/perl

#
# Load a file of sims data into the relational database for use in the frontend analysis software.
#
# We assume the incoming data is grouped by id1 (the fragment id), and within the groups
# it is sorted in order of increasing e-value. The data as returned from the underlying
# blast runs should have these properties.
#
# We analyze each group of hits in turn, assigning ranks within the group for
# the e-value ordering and the percent-identity ordering. We also look up
# the id2 values for each hit in order to assign a taxonomic identifier for the
# hit.
#
# For FIG hits, we expand the id2 into a fig| identifier if it is an xxx identifier, and look up
# a current subsystem membership for the id. If it has one, map the ss name into
# the "tax string" for that subsystem.
#
# The table is to have already been created, and the table name passed in on the
# command line.
#
# We also must be passed the database name, version, and sequence data file name
# (the logical name, not the pathname, thought it may be possible to work
# backwards from the pathname to the rest; hm. Short term that is easiest, but requires
# constraints on the storage of data - no two entries in databases.xml
# may have the same fasta pathname).
#

use strict;
use DBI;
use Data::Dumper;
use FIG;
use FIG_Config;
use DB_File;
use DBrtns;
use File::Basename;
use Cwd 'abs_path';
use FortyEightMeta::TaxStr;
use FortyEightMeta::SimDB;


@ARGV == 5 or die "Usage: $0 NR-file sims-file table-name best-by-iden-table best-by-psc-table\n";

my $nr_file = shift;
my $sims_file = shift;
my $tbl = shift;
my $best_by_iden_tbl = shift;
my $best_by_psc_tbl = shift;

-f $sims_file or die "Sims file $sims_file does not exist\n";

my($mgdb, $mgdb2);
eval {
    $mgdb = new DBrtns($FIG_Config::mgrast_dbms, $FIG_Config::mgrast_db,
		       $FIG_Config::mgrast_dbuser, $FIG_Config::mgrast_dbpass,
		       $FIG_Config::mgrast_dbport, $FIG_Config::mgrast_dbhost,
		       $FIG_Config::mgrast_dbsock);

    $mgdb2 = new DBrtns($FIG_Config::mgrast_dbms, $FIG_Config::mgrast_db,
		       $FIG_Config::mgrast_dbuser, $FIG_Config::mgrast_dbpass,
		       $FIG_Config::mgrast_dbport, $FIG_Config::mgrast_dbhost,
		       $FIG_Config::mgrast_dbsock);

};
if ($@ or !defined($mgdb))
{
    die "cannot connect to database: $@";
}

my $simdb = FortyEightMeta::SimDB->new($FIG_Config::mgrast_database_def);

my $dbh = $mgdb->{_dbh};
my $dbh2 = $mgdb2->{_dbh};
$dbh2->{RaiseError} = 1;
my $tax_lookup = $dbh->prepare(qq(SELECT tax_str
				  FROM rdp_to_tax
				  WHERE dbid = ? AND seq_num = ?));

#
# Determine the dbid for the data we are processing. If we are
# given an NR file, use the simdb to determine the database name
# and version that it corresponds to.
#
# (In the current version that is the only option)
#

my($db_name, $db_version, $tax_info_list) = $simdb->db_files_for_fasta_file($nr_file);

if (!defined($db_name))
{
    die "Could not find database info for NR file $nr_file";
}

if (@$tax_info_list == 0)
{
    die "No tax databases registered for database $db_name version $db_version (NR file $nr_file)";
}

#
# Look up the dbid(s) for the data for this file.
#
my @db_ids;
my %specials;

for my $tinfo (@$tax_info_list)
{
    my $res = $mgdb->SQL(qq(SELECT dbid
			    FROM seq_db
			    WHERE name = ? AND version = ? AND tax_db_name = ?), undef, 
			 $db_name, $db_version, $tinfo->{name});
    if (@$res == 0)
    {
	die "No dbid found for database $db_name version $db_version  tax data $tinfo->{name}";
    }
    my $dbid = $res->[0]->[0];
    
    if ($tinfo->{special})
    {
	$specials{$dbid} = $tinfo->{special};
    }
    push(@db_ids, $dbid);
}

print "Found dbids @db_ids\n";

my %rdp_cache;
fill_rdp_cache($dbh, \@db_ids, \%rdp_cache);

#
# If we are processing SEED data, tie in the pegsyn index.
#
my %peg_syn;
if ($db_name eq 'SEED')
{
    my $pegsyn = $simdb->get_pegsyn($db_name, $db_version);
    #
    # Need the index.
    #
    $pegsyn or die "peg synonyms not found for db $db_name version $db_version";
    $pegsyn = "$pegsyn.index.t";
    
    my $tied = tie %peg_syn, 'DB_File', $pegsyn, O_RDONLY, 0, $DB_BTREE;
    $tied or die "Cannot tie $pegsyn: $!";
}

my $l10 = 10 / log(10);

#
# sim-block columns to rank by.
# column index is as pushed into the @block  in the sim processing loops.
# third column is 1 for ascending, -1 for descending.
#
my @rank = ([1, 'rank_iden', -1],
	    [7, 'rank_logpsc', 1],
	   );

$| = 1;

load_tax(\@db_ids, \%specials, $db_name, $sims_file, $tbl, $best_by_iden_tbl, $best_by_psc_tbl);

exit;

#
# Fill the rdp data cache.
#
sub fill_rdp_cache
{
    my($dbh, $ids, $cache) = @_;

    my $s = join(", ", @$ids);
    my $sth = $dbh->prepare(qq(SELECT dbid, seq_num, tax_str
			       FROM rdp_to_tax
			       WHERE dbid IN ($s)));
    $sth->execute;
    my($dbid, $seq_num, $tax_str);
    $sth->bind_columns(\$dbid, \$seq_num, \$tax_str);
    while ($sth->fetch)
    {
	$cache->{$dbid}->{$seq_num} = $tax_str;
    }
}

#
# Map the given database and sequence id to the corresponding tax str.
#
sub id_to_tax_str
{
    my($dbid, $seq) = @_;
    return $rdp_cache{$dbid}->{$seq};
}

sub id_to_tax_str_dblookup
{
    my($dbid, $seq) = @_;

    $tax_lookup->execute($dbid, $seq);
    my $res = $tax_lookup->fetchall_arrayref();
    if (@$res)
    {
	return $res->[0]->[0];
    }
    else
    {
#	print "NO for $dbid $seq\n";
	return undef;
    }
}

sub expand_seed_id
{
    my($id) = @_;
    my $ret = $id;
    my $genome;
    if ($id =~ /^xxx/)
    {
	my $x = $peg_syn{$id};
	if ($x =~ /(fig\|(\d+\.\d+)\.peg\.\d+)/)
	{
	    $ret = $1;
	    $genome = $2;
	}
    }
    return($ret, $genome);
}

sub load_tax
{
    my($db_ids, $specials, $db_name, $file, $tbl, $best_by_iden_tbl, $best_by_psc_tbl) = @_;

    my($best_iden_fh, $best_psc_fh);
    
    my $tmp_best_iden = "$FIG_Config::temp/best_by_iden.$$";
    my $tmp_best_psc = "$FIG_Config::temp/best_by_psc.$$";

    open($best_iden_fh, ">", $tmp_best_iden) or die "Cannot open $tmp_best_iden: $!";
    open($best_psc_fh, ">", $tmp_best_psc) or die "Cannot open $tmp_best_psc: $!";

    $dbh2->begin_work();
    $dbh2->do("COPY $tbl FROM STDIN");

    #
    # SEED is handled specially. We need to expand xxx id's using the
    # peg.synonyms, and to pull the genome ID in order to map fid to
    # tax-via-genome.
    #
    my $proc_id;
    if ($db_name eq 'SEED')
    {
	$proc_id = \&expand_seed_id;
    }
    else
    {
	$proc_id = sub { return ($_[0]); };
    }

    open(F, "<$file") or die "cannot open $file: $!";

    print "Load $file\n";
    
    my $last_id;
    my %blocks;
    map { $blocks{$_} = [] } @$db_ids;
    while (<F>)
    {
	chomp;
	my($id1, $id2, $iden, $ali, $mismatch, $gaps, $b1, $e1, $b2, $e2, $psc, $bsc) = split(/\t/);
	$iden = int($iden);
	$bsc = int($bsc);

	my ($id2_proc, $genome) = &$proc_id($id2);
#	print "Have: $id1, $id2, $id2_proc, $iden, $ali, $mismatch, $gaps, $b1, $e1, $b2, $e2, $psc, $bsc\n";

	if ($id1 ne $last_id)
	{
	    for my $ident (@$db_ids)
	    {
		my $block = $blocks{$ident};
		handle_block($ident, $last_id, $block, $best_iden_fh, $best_psc_fh) if @$block;
		@$block = ();
	    }
	    $last_id = $id1;
	}
	
	for my $ident (@$db_ids)
	{
	    my $block = $blocks{$ident};

	    my $tstr;
	    if ($specials->{$ident} eq 'seed_genomes')
	    {
		$tstr = id_to_tax_str($ident, $genome);
	    }
	    else
	    {
		$tstr = id_to_tax_str($ident, $id2_proc);
	    }
	    if ($tstr)
	    {
		# print "Lookup '$ident' '$id2_proc' => '$tstr'\n";
		my(@g) = split(/:/, $tstr);
		$#g = 3;
		my $lpsc = $psc == 0.0 ? -3500 : int($l10 * log($psc));
		push(@$block, [$id2_proc, $iden, $ali, $b1, $e1, $b2, $e2, $lpsc, $bsc, $tstr, @g[0..2]]);
	    }
	}
    }
    for my $ident (@$db_ids)
    {
	my $block = $blocks{$ident};
	handle_block($ident, $last_id, $block, $best_iden_fh, $best_psc_fh) if @$block;
    }

    $dbh2->pg_endcopy();
    $dbh2->commit();

    close($best_iden_fh);
    close($best_psc_fh);

    open($best_iden_fh, "<", $tmp_best_iden) or die "cannot open $tmp_best_iden for reading $!";
    open($best_psc_fh, "<", $tmp_best_psc) or die "cannot open $tmp_best_psc for reading $!";

    print "load $tmp_best_iden\n";
    $dbh2->begin_work();
    $dbh2->do("COPY $best_by_iden_tbl FROM STDIN");
    while (<$best_iden_fh>)
    {
	$dbh2->pg_putline($_);
    }
    $dbh2->pg_endcopy();
    $dbh2->commit();
    close($best_iden_fh);
    
    print "load $tmp_best_psc\n";
    $dbh2->begin_work();
    $dbh2->do("COPY $best_by_psc_tbl FROM STDIN");
    while (<$best_psc_fh>)
    {
	$dbh2->pg_putline($_);
    }
    $dbh2->pg_endcopy();
    $dbh2->commit();
    close($best_psc_fh);
    close(F);
}

#
# Process a block of sims from the same id1.
#
# We sort by each of the fields we are interested in ranking on
# (pct identity, bitscore, pscore) and store that rank as well.
#
# We actually don't have to sort on bitscore/pscore since we assume
# our data comes in that way. Those ranks are just the indexes of the
# rows in the block.
#
sub handle_block
{
    my($dbid, $id1, $block, $best_iden_fh, $best_psc_fh) = @_;

    # print "Norm: " . Dumper($block);

#    return if @$block < 10 || $block->[0]->[1] > 90;

    my @r;
    for my $rent (@rank)
    {
	my($col_idx, $col_name, $direction) = @$rent;

	# print "@$rent\n";
	my @s = sort { $direction * ($a->[$col_idx] <=> $b->[$col_idx]) } @$block;
	map { push(@{$s[$_]}, $_) } 0..$#s;

	if ($col_name eq 'rank_iden' and @s)
	{
	    my $r = $s[0];
	    print $best_iden_fh join("\t", $dbid, $id1, @$r[0..12]), "\n";
	}
	elsif ($col_name eq 'rank_logpsc' and @s)
	{
	    my $r = $s[0];
	    print $best_psc_fh join("\t", $dbid, $id1, @$r[0..12]), "\n";
	}

#	for my $i (0..$#s)
#	{
#	    print join("\t", $i, @{$s[$i]}, @{$block->[$i]}), "\n";
#	}
    }

    for my $i (0..$#$block)
    {
	my $l = join("\t", $dbid, $id1, @{$block->[$i]}) . "\n";
	$dbh2->pg_putline($l);
    }
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3