[Bio] / FortyEightMeta / load_sim_data_noncached.pl Repository:
ViewVC logotype

View of /FortyEightMeta/load_sim_data_noncached.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Fri Mar 14 15:53:38 2008 UTC (12 years ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_rel_2008_0806, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_rel_2008_0923, mgrast_release_3_0, mgrast_dev_03252011, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, mgrast_rel_2008_0919, mgrast_rel_2008_1110, myrast_33, mgrast_rel_2008_0917, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
Rename sim data loader scripts.

#!/usr/bin/perl

#
# Load sims data into the relational database for use in the frontend analysis software.
#
# We assume the incoming data is grouped by id1 (the fragment id), and within the groups
# it is sorted in order of increasing e-value. The data as returned from the underlying
# blast runs should have these properties.
#
# We analyze each group of hits in turn, assigning ranks within the group for
# the e-value ordering and the percent-identity ordering. We also look up
# the id2 values for each hit in order to assign a taxonomic identifier for the
# hit.
#
# For FIG hits, we expand the id2 into a fig| identifier if it is an xxx identifier, and look up
# a current subsystem membership for the id. If it has one, map the ss name into
# the "tax string" for that subsystem.
#

use strict;
use DBI;
use Data::Dumper;
use FIG;
use FIG_Config;
use DB_File;
use File::Basename;
use Cwd 'abs_path';

my $mysql = "$FIG_Config::ext_bin/mysql";
my $mysql_sock = "/var/run/mysqld/mysqld.sock";
my $dbname = "bobtest2";

my $metagenome_data = "/vol/metagenome-48-hour/Data";
my $seed_data = "/vol/seed-data-anno-mirror/Data.Jan3/Organisms";
my $pegsyn = "/vol/48-hour/Data/peg.synonyms.index.t";

my $dbh = DBI->connect("dbi:mysql:$dbname;mysql_socket=$mysql_sock;mysql_local_infile=1", "olson");

my $jobdir = abs_path(".");

my $job = basename($jobdir);


my $fig = new FIG;
my $ss_lookup = $fig->db_handle->{_dbh}->prepare(qq(SELECT subsystem FROM subsystem_index WHERE protein = ?));

my $tbl = create_table($job);
print "Created $tbl\n";

my $fifo = "/tmp/load.fifo.$$";

if (system('mkfifo', $fifo) != 0)
{
    die "error creating FIFO\n";
}

my $cmdfile = "/tmp/load.cmd.$$";
open(C, ">$cmdfile") or die "cannot write $cmdfile: $!";
print C "LOAD DATA LOCAL INFILE '$fifo' INTO TABLE $tbl FIELDS ESCAPED BY '' ENCLOSED BY ''\n";
close(C);

#my $load_file = "/tmp/load_rdp.$$";
#open(L, ">$load_file") or die "cannot write $load_file: $!";

my $load_pid = fork;

if ($load_pid == 0)
{
    my $cmd = "$mysql --local-infile -S $mysql_sock $dbname < $cmdfile";
    print "Child running $cmd\n";
    my $rc = system($cmd);
    print "Child finished running $cmd with rc=$rc\n";
    exit;
}
sleep(1);

open(L, ">$fifo") or die "Cannot write to fifo $fifo: $!\n";

print "Started child $load_pid\n";

my $l10 = 10 / log(10);
my ($maxlen1, $maxlen2, $maxlen3);

#
# sim-block columns to rank by.
# column index is as pushed into the @block  in the sim processing loops.
# third column is 1 for ascending, -1 for descending.
#
my @rank = ([1, 'rank_iden', -1],
	    [6, 'rank_logpsc', 1],
	   );

#load_seed("t");

if (1)
{
load_seed("$jobdir/proc/sims.seed.raw");

load_rdp('gg', "$jobdir/proc/sims.gg.raw");
load_rdp('ssu', "$jobdir/proc/sims.ssu.raw");
load_rdp('lsu', "$jobdir/proc/sims.lsu.raw");
}
close(L);

print "Waiting for child $load_pid to finish\n";
my $proc = waitpid($load_pid, 0);
print "Child $proc finished with retcode $?\n";
exit;

#my $tbl = create_table($job, $maxlen1, $maxlen2, $maxlen3);

#print "Loading into table $tbl\n";
#my $n = $dbh->do(qq(LOAD DATA INFILE '$load_file' INTO TABLE $tbl FIELDS  ESCAPED BY '' ENCLOSED BY ''));
#print "load returns $n\n";

sub load_rdp
{
    my($db, $file) = @_;

    my $ident = get_db_ident($db);

    my $lookup_sth = $dbh->prepare(qq(SELECT tax_str
				      FROM rdp_to_tax
				      WHERE dbid = $ident AND seq_num = ?));

    open(F, "<$file") or die "cannot open $file: $!";

    print "Load $file\n";
    
    my $last_id;
    my @block;
    while (<F>)
    {
	chomp;
	my($id1, $id2, $iden, $ali, $mismatch, $gaps, $b1, $e1, $b2, $e2, $psc, $bsc) = split(/\t/);
	$lookup_sth->execute($id2);
	my $res = $lookup_sth->fetchrow_arrayref();
	if ($res)
	{
	    my $tstr = $res->[0];

	    update_maxlen($id1, $id2, $tstr);

	    if ($id1 ne $last_id)
	    {
		handle_block($ident, $last_id, \@block) if @block;
		@block = ();
		$last_id = $id1;
	    }
	    my(@g) = split(/:/, $tstr, 4);
	    my $lpsc = $psc == 0.0 ? -3500 : int($l10 * log($psc));
	    push(@block, [$id2, $iden, $ali, $b1, $e1, $b2, $e2, $lpsc, $bsc, $tstr, @g[0..2]]);
	}
    }
    handle_block($ident, $last_id, \@block);
    close(F);
}

#
# Process a block of sims from the same id1.
#
# We sort by each of the fields we are interested in ranking on
# (pct identity, bitscore, pscore) and store that rank as well.
#
# We actually don't have to sort on bitscore/pscore since we assume
# our data comes in that way. Those ranks are just the indexes of the
# rows in the block.
#
sub handle_block
{
    my($dbid, $id1, $block) = @_;

#    return if @$block < 10 || $block->[0]->[1] > 90;

    my @r;
    for my $rent (@rank)
    {
	my($col_idx, $col_name, $direction) = @$rent;

#	print "@$rent\n";
	my @s = sort { $direction * ($a->[$col_idx] <=> $b->[$col_idx]) } @$block;
	map { push(@{$s[$_]}, $_) } 0..$#s;

#	for my $i (0..$#s)
#	{
#	    print join("\t", $i, @{$s[$i]}, @{$block->[$i]}), "\n";
#	}
    }

    for my $i (0..$#$block)
    {
	print L join("\t", $dbid, $id1, @{$block->[$i]}), "\n";
    }
}

sub load_seed
{
    my($file) = @_;

    my $ident = get_db_ident('seed');

    my $lookup_sth = $dbh->prepare(qq(SELECT tax_str
				      FROM rdp_to_tax
				      WHERE dbid = $ident AND seq_num = ?));
    my $ss_lookup_tax = $dbh->prepare(qq(SELECT tax_str
					 FROM ss_to_tax
					 WHERE name = ?));

    open(F, "<$file") or die "cannot open $file: $!";
    print "load $file\n";

    my %ps;
    my $tied = tie %ps, 'DB_File', $pegsyn, O_RDONLY, 0, $DB_BTREE;
    $tied or die "cannot tie $pegsyn: $!";

    my $last_id;
    my @block;
    while (<F>)
    {
	chomp;
	my($id1, $id2, $iden, $ali, $mismatch, $gaps, $b1, $e1, $b2, $e2, $psc, $bsc) = split(/\t/);

	my $xid = $id2;
	if ($id2 =~ /^xxx/)
	{
	    $xid = $ps{$id2};
	}
	if ($xid =~ /(fig\|(\d+\.\d+)\.peg\.\d+)/)
	{
	    my $peg = $1;
	    my $genome = $2;
#	    print "Mapped $id2 to $xid to $peg to $genome\n";

	    my($ss, $ss_tax);
	    
	    $ss_lookup->execute($peg);
	    my $sres = $ss_lookup->fetchrow_arrayref();
	    if ($sres)
	    {
		$ss = $sres->[0];
		$ss_lookup_tax->execute($ss);
		my $ssres = $ss_lookup_tax->fetchrow_arrayref();
		$ss_tax = $ssres->[0] if $ssres;
#		print "Got ss $ss $ss_tax\n";
	    }

	    $lookup_sth->execute($genome);
	    my $res = $lookup_sth->fetchrow_arrayref();
	    if ($res)
	    {
		my $tstr = $res->[0];
		update_maxlen($id1, $peg, $tstr);

		if ($id1 ne $last_id)
		{
		    handle_block($ident, $last_id, \@block);
		    @block = ();
		    $last_id = $id1;
		}

		my $lpsc = $psc == 0.0 ? -3500 : int($l10 * log($psc));
		my(@g) = split(/:/, $tstr, 4);
		my(@s) = split(/:/, $ss_tax, 4);
		push(@block, [$peg, $iden, $ali, $b1, $e1, $b2, $e2, $lpsc, $bsc, $tstr, @g[0..2], $ss_tax, @s[0..2]]);
	    }
	}
#	last if $. > 100;
    }
    handle_block($ident, $last_id, \@block);
    close(F);
}


sub get_db_ident
{
    my($str) = @_;
    my $res = $dbh->selectcol_arrayref(qq(SELECT dbid
					 FROM db_type
					 WHERE name = ?), undef, $str);
    if (@$res)
    {
	return $res->[0];
    }

    $dbh->do(qq(INSERT INTO db_type (name) VALUES (?)), undef, $str);
    my $id = $dbh->{mysql_insertid};
    return $id;
}

#
# Create a sims data table for the given job.
#
sub create_table
{
    my($job) = @_;

    my $tbl_name = "tax_sim_${job}_a";

    $dbh->do(qq(DROP TABLE IF EXISTS $tbl_name));
    my $qry = qq(CREATE TABLE $tbl_name
		 (
		  dbid	TINYINT,
		  id1	CHAR(32),
		  id2	CHAR(32),
		  iden	TINYINT UNSIGNED,
		  ali_ln	SMALLINT UNSIGNED,
		  b1	SMALLINT UNSIGNED,
		  e1	SMALLINT UNSIGNED,
		  b2	SMALLINT UNSIGNED,
		  e2	SMALLINT UNSIGNED,
		  logpsc	SMALLINT,
		  bsc	SMALLINT UNSIGNED,
		  tax_str	CHAR(160),
		  tax_group_1	CHAR(2),
		  tax_group_2	CHAR(2),
		  tax_group_3	CHAR(2),
		  ss_str	CHAR(12),
		  ss_group_1	CHAR(2),
		  ss_group_2	CHAR(2),
		  ss_group_3	CHAR(2),
		  rank_iden	TINYINT UNSIGNED,
		  rank_psc	TINYINT UNSIGNED,
		  KEY(id1),
		  KEY(id2),
		  KEY(iden),
		  KEY(ali_ln),
		  KEY(logpsc),
		  KEY(bsc),
		  KEY(rank_iden),
		  KEY(rank_psc),
		  KEY(tax_str),
		  KEY(tax_group_1, tax_group_2, tax_group_3),
		  KEY(ss_str),
		  KEY(ss_group_1, ss_group_2, ss_group_3)
		 ) ENGINE = InnoDB;
		);
    
    print $qry;

    $dbh->do($qry);

    return $tbl_name;
}

sub update_maxlen
{
    my $l1 = length($_[0]);
    my $l2 = length($_[1]);
    my $l3 = length($_[2]);
    $maxlen1 = $l1 if $l1 > $maxlen1;
    $maxlen2 = $l2 if $l2 > $maxlen2;
    $maxlen3 = $l3 if $l3 > $maxlen3;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3