[Bio] / FortyEightMeta / load_sim_data.pl Repository:
ViewVC logotype

View of /FortyEightMeta/load_sim_data.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Wed Apr 2 18:20:38 2008 UTC (11 years, 10 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_rel_2008_0806, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_rel_2008_0923, mgrast_release_3_0, mgrast_dev_03252011, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, mgrast_rel_2008_0919, mgrast_rel_2008_1110, myrast_33, mgrast_rel_2008_0917, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
Changes since 1.3: +13 -3 lines
Loader tweaks.

#!/usr/bin/perl

#
# Load sims data into the relational database for use in the frontend analysis software.
#
# We assume the incoming data is grouped by id1 (the fragment id), and within the groups
# it is sorted in order of increasing e-value. The data as returned from the underlying
# blast runs should have these properties.
#
# We analyze each group of hits in turn, assigning ranks within the group for
# the e-value ordering and the percent-identity ordering. We also look up
# the id2 values for each hit in order to assign a taxonomic identifier for the
# hit.
#
# For FIG hits, we expand the id2 into a fig| identifier if it is an xxx identifier, and look up
# a current subsystem membership for the id. If it has one, map the ss name into
# the "tax string" for that subsystem.
#

use strict;
use DBI;
use Data::Dumper;
use FIG;
use FIG_Config;
use DB_File;
use File::Basename;
use Cwd 'abs_path';
use TaxStr;

my $mysql = "$FIG_Config::ext_bin/mysql";
my $mysql_sock = $FIG_Config::dbsock;
my $dbname = $FIG_Config::mgrast_db;
my $dbuser = $FIG_Config::mgrast_dbuser;

my $metagenome_data = $FIG_Config::rast_mg_data;
my $seed_data = $FIG_Config::organisms;
my $pegsyn = "/vol/48-hour/Data/peg.synonyms.index.t";


my($dbh, $dbh2);

&init_db;

my $tax_mapper = new TaxStr($dbh2);

sub init_db
{
    $dbh = DBI->connect_cached("dbi:mysql:$dbname;mysql_socket=$mysql_sock;mysql_local_infile=1", $dbuser);
    $dbh2 = DBI->connect_cached("dbi:mysql:$dbname;mysql_socket=$mysql_sock;mysql_local_infile=1", $dbuser);
    $tax_mapper->dbh($dbh2);
}

#my $jobdir = abs_path(".");

my $job = shift;
my $jobdir = "$FIG_Config::rast_jobs/$job";

#my $job = basename($jobdir);

my(%rdp_to_tax, %key_cache, %fid_to_ss_info);

load_ss_cache(\%key_cache, \%fid_to_ss_info);
load_rdp_cache(\%rdp_to_tax);

my $tbl = create_table($job);
print "Created $tbl\n";

my $test_mode;
my $fifo = "/tmp/load.fifo.$$";
my $load_pid;
my $cmdfile;

if (!$test_mode)
{
    if (system('mkfifo', $fifo) != 0)
    {
	die "error creating FIFO\n";
    }
    
    $cmdfile = "/tmp/load.cmd.$$";
    open(C, ">$cmdfile") or die "cannot write $cmdfile: $!";
    print C "LOAD DATA LOCAL INFILE '$fifo' INTO TABLE $tbl FIELDS ESCAPED BY '' ENCLOSED BY ''\n";
    close(C);
    
    
    $load_pid = fork;
    
    if ($load_pid == 0)
    {
	my $cmd = "$mysql --local-infile -S $mysql_sock -u $dbuser $dbname < $cmdfile";
	print "Child running $cmd\n";
	my $rc = system($cmd);
	print "Child finished running $cmd with rc=$rc\n";
	exit;
    }
    sleep(1);

    open(L, ">$fifo") or die "Cannot write to fifo $fifo: $!\n";
}
else
{
    print "Test mode writing to /tmp/test.out\n";
    open(L, ">/tmp/test.out") or die "cannot open /tmp/test.out: $!";
}

print "Started child $load_pid\n";

my $l10 = 10 / log(10);
my ($maxlen1, $maxlen2, $maxlen3);

#
# sim-block columns to rank by.
# column index is as pushed into the @block  in the sim processing loops.
# third column is 1 for ascending, -1 for descending.
#
my @rank = ([1, 'rank_iden', -1],
	    [6, 'rank_logpsc', 1],
	   );

#load_seed("t");

$| = 1;

load_seed("$jobdir/proc/sims.seed.raw");

if (1)
{
load_rdp('16s', "$jobdir/proc/sims.16s.raw");
load_rdp('gg', "$jobdir/proc/sims.gg.raw");
load_rdp('ssu', "$jobdir/proc/sims.ssu.raw");
load_rdp('lsu', "$jobdir/proc/sims.lsu.raw");
}
close(L);

if (!$test_mode)
{
    print "Waiting for child $load_pid to finish\n";
    my $proc = waitpid($load_pid, 0);
    print "Child $proc finished with retcode $?\n";
    
    unlink($fifo);
    unlink($cmdfile);
}

exit;

sub load_rdp
{
    my($db, $file) = @_;

    my $ident = get_db_ident($db);

    open(F, "<$file") or die "cannot open $file: $!";

    print "Load $file\n";
    
    my $last_id;
    my @block;
    while (<F>)
    {
	chomp;
	my($id1, $id2, $iden, $ali, $mismatch, $gaps, $b1, $e1, $b2, $e2, $psc, $bsc) = split(/\t/);
	my $tstr = $rdp_to_tax{$ident, $id2};
	if ($tstr)
	{
	    update_maxlen($id1, $id2, $tstr);

	    if ($id1 ne $last_id)
	    {
		handle_block($ident, $last_id, \@block) if @block;
		@block = ();
		$last_id = $id1;
	    }
	    my(@g) = split(/:/, $tstr, 4);
	    my $lpsc = $psc == 0.0 ? -3500 : int($l10 * log($psc));
	    push(@block, [$id2, $iden, $ali, $b1, $e1, $b2, $e2, $lpsc, $bsc, $tstr, @g[0..2]]);
	}
    }
    handle_block($ident, $last_id, \@block);
    close(F);
}

#
# Process a block of sims from the same id1.
#
# We sort by each of the fields we are interested in ranking on
# (pct identity, bitscore, pscore) and store that rank as well.
#
# We actually don't have to sort on bitscore/pscore since we assume
# our data comes in that way. Those ranks are just the indexes of the
# rows in the block.
#
sub handle_block
{
    my($dbid, $id1, $block) = @_;

#    print "Norm: " . Dumper($block);

#    return if @$block < 10 || $block->[0]->[1] > 90;

    my @r;
    for my $rent (@rank)
    {
	my($col_idx, $col_name, $direction) = @$rent;

#	print "@$rent\n";
	my @s = sort { $direction * ($a->[$col_idx] <=> $b->[$col_idx]) } @$block;
	map { push(@{$s[$_]}, $_) } 0..$#s;

#	for my $i (0..$#s)
#	{
#	    print join("\t", $i, @{$s[$i]}, @{$block->[$i]}), "\n";
#	}
    }

    for my $i (0..$#$block)
    {
	if (!print L join("\t", $dbid, $id1, @{$block->[$i]}), "\n")
	{
	    die "Error printing to fifo: $!";
	}
    }
}

sub handle_sim_block
{
    my($dbid, $id1, $block) = @_;

#    return if @$block < 10 || $block->[0]->[1] > 90;
#    print "Sim: " . Dumper($block);

    my @r;
    for my $rent (@rank)
    {
	my($col_idx, $col_name, $direction) = @$rent;

	my @s = sort { $direction * ($a->[$col_idx] <=> $b->[$col_idx]) } @$block;
	map { push(@{$s[$_]}, $_) } 0..$#s;

    }

    my $nr = @rank;
#    print Dumper($block);
    for my $i (0..$#$block)
    {
	my $l = $block->[$i];
	my @ranks = splice(@$l, -($nr), $nr);
	my $slist = pop(@$l);
#	print Dumper($l, \@ranks, $slist);
	for my $sent (@$slist)
	{
	    if (!print L join("\t", $dbid, $id1, @$l, @$sent, @ranks), "\n")
	    {
		die "Error printing to fifo: $!";
	    }
	}
    }
}

sub load_seed
{
    my($file) = @_;

    my $tax_ident = get_db_ident('seed');
    my $sim_ident = get_db_ident('seed_subsystem');

    open(F, "<$file") or die "cannot open $file: $!";
    print "load $file\n";

    my %ps;
    my $tied = tie %ps, 'DB_File', $pegsyn, O_RDONLY, 0, $DB_BTREE;
    $tied or die "cannot tie $pegsyn: $!";

    my $last_id;
    my @tax_block;
    my @sim_block;
    while (<F>)
    {
	chomp;
	my($id1, $id2, $iden, $ali, $mismatch, $gaps, $b1, $e1, $b2, $e2, $psc, $bsc) = split(/\t/);

	print "." if $. % 10000 == 0;

	my $xid = $id2;
	if ($id2 =~ /^xxx/)
	{
	    $xid = $ps{$id2};
	}
	if ($xid =~ /(fig\|(\d+\.\d+)\.peg\.\d+)/)
	{
	    my $peg = $1;
	    my $genome = $2;
#	    print "Mapped $id2 to $xid to $peg to $genome\n";

	    my $ss_list = $fid_to_ss_info{$peg};
	    my $tstr = $rdp_to_tax{$tax_ident, $genome};

	    my $lpsc = $psc == 0.0 ? -3500 : int($l10 * log($psc));
	    
	    if ($id1 ne $last_id)
	    {
		handle_block($tax_ident, $last_id, \@tax_block);
		handle_sim_block($sim_ident, $last_id, \@sim_block);
		@tax_block = ();
		@sim_block = ();
		$last_id = $id1;
	    }

	    if ($tstr)
	    {
		update_maxlen($id1, $peg, $tstr);

		my(@g) = split(/:/, $tstr, 4);
		push(@tax_block, [$peg, $iden, $ali, $b1, $e1, $b2, $e2, $lpsc, $bsc, $tstr, @g[0..2]]);
	    }
	    if ($ss_list)
	    {
		push(@sim_block, [$peg, $iden, $ali, $b1, $e1, $b2, $e2, $lpsc, $bsc, $ss_list]);
	    }
	}
#	last if $. > 100;
    }
    handle_block($tax_ident, $last_id, \@tax_block);
    handle_sim_block($sim_ident, $last_id, \@sim_block);
    close(F);
}


sub get_db_ident
{
    my($str) = @_;
    my $res = $dbh->selectcol_arrayref(qq(SELECT dbid
					 FROM db_type
					 WHERE name = ?), undef, $str);
    if (@$res)
    {
	return $res->[0];
    }

    $dbh->do(qq(INSERT INTO db_type (name) VALUES (?)), undef, $str);
    my $id = $dbh->{mysql_insertid};
    return $id;
}

#
# Create a sims data table for the given job.
#
sub create_table
{
    my($job) = @_;

    my $tbl_name = "tax_sim_${job}";

    $dbh->do(qq(DROP TABLE IF EXISTS $tbl_name));
    my $qry = qq(CREATE TABLE $tbl_name
		 (
		  dbid	TINYINT,
		  id1	CHAR(10),
		  id2	CHAR(32),
		  iden	TINYINT UNSIGNED,
		  ali_ln	SMALLINT UNSIGNED,
		  b1	SMALLINT UNSIGNED,
		  e1	SMALLINT UNSIGNED,
		  b2	SMALLINT UNSIGNED,
		  e2	SMALLINT UNSIGNED,
		  logpsc	SMALLINT,
		  bsc	SMALLINT UNSIGNED,
		  tax_str	CHAR(160),
		  tax_group_1	CHAR(3),
		  tax_group_2	CHAR(3),
		  tax_group_3	CHAR(3),
		  rank_iden	SMALLINT UNSIGNED,
		  rank_psc	SMALLINT UNSIGNED,
		  KEY(id1),
		  KEY(id2),
		  KEY(iden),
		  KEY(ali_ln),
		  KEY(logpsc),
		  KEY(bsc),
		  KEY(rank_iden),
		  KEY(rank_psc),
		  KEY(tax_str),
		  KEY(tax_group_1, tax_group_2, tax_group_3)
		 ) ENGINE = InnoDB;
		);
    
    print $qry;

    $dbh->do($qry);

    return $tbl_name;
}

sub update_maxlen
{
    my $l1 = length($_[0]);
    my $l2 = length($_[1]);
    my $l3 = length($_[2]);
    $maxlen1 = $l1 if $l1 > $maxlen1;
    $maxlen2 = $l2 if $l2 > $maxlen2;
    $maxlen3 = $l3 if $l3 > $maxlen3;
}

sub load_ss_cache
{
    my($key_cache, $ss_cache) = @_;

    #
    # First verify we have tax_items for all of our functional roles. We suck in all the
    # dbkeys so we can just do an in-memory lookup.
    #
    
    my $sth = $dbh->prepare(qq(SELECT dbkey, str FROM tax_item), { mysql_use_result => 1 });
    $sth->execute;
    while (my $row = $sth->fetchrow_arrayref())
    {
	$key_cache->{$row->[1]} = $row->[0];
    }
    
    my $sth = $dbh->prepare(qq(SELECT protein, subsystem, tax_str, role
			       FROM $FIG_Config::db.subsystem_index JOIN ss_to_tax ON subsystem = name), { mysql_use_result => 1});
    $sth->execute;
    my($protein, $subsystem, $tax_str, $role);
    $sth->bind_columns(\ ($protein, $subsystem, $tax_str, $role));
    while ($sth->fetch)
    {
	#
	# Try the local cache first.
	#
	my $role_id = $key_cache{$role};
	if (!$role_id)
	{
	    # print "Inserting role $role\n";
	    $role_id = $tax_mapper->tax_name_to_id($role);
	}

	my @groups = split(/:/, $tax_str, 3);
	$tax_str .= ":$role_id";
	push @{$ss_cache->{$protein}}, [$tax_str, @groups];
    }
}


sub load_rdp_cache
{
    my ($rdp_to_tax) = @_;
    my $sth = $dbh->prepare(qq(SELECT dbid, seq_num, tax_str
			       FROM rdp_to_tax), { mysql_use_result => 1 });
    $sth->execute();
    my($dbid, $seq, $str);
    $sth->bind_columns(\ ($dbid, $seq, $str));
    while ($sth->fetch)
    {
	$rdp_to_tax->{$dbid, $seq} = $str;
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3