[Bio] / FortyEightMeta / mg_load_tax_db.pl Repository:
ViewVC logotype

View of /FortyEightMeta/mg_load_tax_db.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Wed Jun 11 21:16:42 2008 UTC (11 years, 8 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_rel_2008_0806, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_rel_2008_0923, mgrast_release_3_0, mgrast_dev_03252011, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, mgrast_rel_2008_0919, mgrast_rel_2008_1110, myrast_33, mgrast_rel_2008_0917, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
Changes since 1.1: +1 -1 lines
Commit of initial MGRAST2 code.

#
# Load the taxonomy for a new sequence database for use
# with MG-RAST.
#
# The database must have already been defined in the databases.xml.
#
# Specify the database with the name and version as specified in databases.xml.
#

use Data::Dumper;
use FortyEightMeta::SimDB;
use FortyEightMeta::TaxStr;
use FIG_Config;
use DBrtns;
use FIG;

use strict;

@ARGV == 2 or die "Usage: $0 db-name db-version\n";

#
# Open two independent connections to the MG-RAST database using the same
# FIG_Config parameters. Later in this file the first handle carries the
# transactions and COPY streams, and the second is handed to
# FortyEightMeta::TaxStr.
#
my($mgdb, $mgdb2);
eval {
    my @connect_args = ($FIG_Config::mgrast_dbms, $FIG_Config::mgrast_db,
                        $FIG_Config::mgrast_dbuser, $FIG_Config::mgrast_dbpass,
                        $FIG_Config::mgrast_dbport, $FIG_Config::mgrast_dbhost,
                        $FIG_Config::mgrast_dbsock);

    $mgdb  = DBrtns->new(@connect_args);
    $mgdb2 = DBrtns->new(@connect_args);
};

# Both connections are required; a missing second connection would otherwise
# only surface later as a dereference of undef.
if ($@ or !defined($mgdb) or !defined($mgdb2))
{
    die "cannot connect to database: $@";
}

# Database name and version, consumed from the command line in that order.
my($db_name, $db_version) = splice(@ARGV, 0, 2);

# Database definitions are read from the location configured in FIG_Config.
my $simdb = FortyEightMeta::SimDB->new($FIG_Config::mgrast_database_def);

# One hash per taxonomy source; this script reads the keys
# file, name and special from each.
my @db_files = $simdb->db_files_for_database_version($db_name, $db_version);

@db_files
    or die "No files found for database $db_name version $db_version\n";

#
# Validate every entry before touching the database. An entry needs a
# non-empty name, and either an existing regular tax file or a "special"
# marker. All problems are reported (naming the offending entry) before
# the script gives up.
#
my $errs = 0;
for my $d (@db_files)
{
    my $tax       = $d->{file};
    my $have_file = (defined($tax) and $tax ne '');
    my $have_name = (defined($d->{name}) and $d->{name} ne '');

    if (!$d->{special} and !($have_file and -f $tax))
    {
        my $shown_file = $have_file ? "'$tax'" : '(no file given)';
        my $shown_name = $have_name ? "'$d->{name}'" : '(unnamed)';
        warn "Tax file missing or cannot be opened: $shown_file for tax db $shown_name\n";
        $errs++;
    }
    if (!$have_name)
    {
        my $where = $have_file ? " for file '$tax'" : '';
        warn "Missing tax db name$where\n";
        $errs++;
    }
}

die "Validation errors occurred, exiting.\n" if $errs;

#
# We're good to go.
#
# See if we have a dbid assigned for this database version yet.
#

# Primary handle: carries the transactions and COPY streams for rdp_to_tax.
my $dbh = $mgdb->{_dbh};

# The taxonomy-string helper is given the second connection's handle.
my $tax_str = FortyEightMeta::TaxStr->new($mgdb2->{_dbh});

# Row-by-row insert statement; its only use visible in this file is a
# commented-out line in load_file.
my $tax_insert = $dbh->prepare(qq(INSERT INTO rdp_to_tax(dbid, seq_num, tax_str) VALUES (?, ?, ?)));

for my $entry (@db_files)
{
    my $dbid = get_or_assign_dbid($dbh, $db_name, $db_version, $entry->{name});

    # A tax file takes precedence over a "special" source; entries with
    # neither are skipped after their dbid has been looked up/assigned.
    my $loader = $entry->{file}    ? sub { load_file($dbid, $entry->{file}) }
               : $entry->{special} ? sub { handle_special($dbid, $entry) }
               :                     undef;
    next unless $loader;

    # Replace this dbid's rows in a single transaction.
    $dbh->begin_work();
    $dbh->do(qq(DELETE FROM rdp_to_tax WHERE dbid = ?), undef, $dbid);
    $loader->();
    $dbh->commit();
}

# Release the statement and helper before closing the primary connection.
$tax_insert->finish();
undef $tax_insert;
undef $tax_str;
$dbh->disconnect();

#
# Load taxonomy for a database entry that has a "special" source type
# instead of a tax file.
#
#   $dbid - database id the loaded rows are tagged with
#   $d    - database entry hash; only $d->{special} is examined
#
# Returns whatever the selected loader returns, or nothing if the special
# type is not recognized. The only type handled is 'seed_genomes'.
#
# The caller deletes the existing rows for $dbid before calling this, so an
# unrecognized type is reported with a warning instead of passing silently.
#
sub handle_special
{
    my($dbid, $d) = @_;

    my %loader_for = (
        seed_genomes => \&load_seed_genomes,
    );

    my $type   = $d->{special};
    my $loader = defined($type) ? $loader_for{$type} : undef;

    if (!$loader)
    {
        my $shown = defined($type) ? $type : '';
        warn "Unknown special tax source '$shown' for dbid $dbid; no taxonomy loaded\n";
        return;
    }

    return $loader->($dbid);
}

#
# Load tax information from the SEED genomes.
#
# For every genome reported by FIG, in FIG::by_genome_id order, the
# taxonomy string is split on ";" (plus any following whitespace), each
# piece is mapped through $tax_str->tax_name_to_id, and the results are
# joined with ":". One tab-separated row (dbid, genome id, tax string) per
# genome is streamed into rdp_to_tax via COPY on the file-level $dbh.
#
#   $dbid - database id to tag the rows with
#
sub load_seed_genomes
{
    my($dbid) = @_;

    my $fig = FIG->new;
    die "cannot create FIG object" unless $fig;

    $dbh->do(qq(COPY rdp_to_tax FROM STDIN));

    foreach my $genome (sort { FIG::by_genome_id($a, $b) } $fig->genomes())
    {
        my $lineage = $fig->taxonomy_of($genome);

        my @names = split(/;\s*/, $lineage);
        my @ids   = map { $tax_str->tax_name_to_id($_) } @names;

        my $row = join("\t", $dbid, $genome, join(":", @ids));
        $dbh->pg_putline($row . "\n");
    }
    $dbh->pg_endcopy();
}
    

#
# Bulk-load one taxonomy file into rdp_to_tax under the given dbid.
#
#   $dbid - database id to tag the rows with
#   $file - path of a tab-separated file; on each line the first field is
#           the sequence id and the remaining fields are taxonomy names
#
# Each taxonomy name is mapped through $tax_str->tax_name_to_id and the
# results are joined with ":". Rows are streamed through COPY on the
# file-level $dbh rather than inserted one by one. Dies if $file cannot
# be opened.
#
sub load_file
{
    my($dbid, $file) = @_;

    # Open the input before putting the connection into COPY mode, so an
    # unreadable file cannot leave a COPY dangling on the handle.
    open(my $in, "<", $file) or die "Cannot open $file: $!";

    print "Load $file\n";

    $dbh->do(qq(COPY rdp_to_tax FROM STDIN));

    while (my $line = <$in>)
    {
        chomp $line;
        my($id, @tax) = split(/\t/, $line);

        my $tstr = join(":", map { $tax_str->tax_name_to_id($_) } @tax);

        $dbh->pg_putline(join("\t", $dbid, $id, $tstr) . "\n");
    }
    $dbh->pg_endcopy();

    close($in);
    print "Finished $file\n";
}


#
# Return the dbid registered in seq_db for (name, version, tax_db_name),
# registering a new one if no row exists yet.
#
#   $dbh     - database handle; the lookup/insert runs in its own transaction
#   $name    - sequence database name
#   $version - sequence database version
#   $dbname  - taxonomy database name (stored in seq_db.tax_db_name)
#
# A new id is MAX(dbid) + 1, or 1 when seq_db holds no dbid yet.
#
# Dies if any statement fails, whether the handle raises an exception or
# just returns undef. The transaction is rolled back before the error is
# rethrown, so a failed MAX query can no longer be mistaken for an empty
# table and silently yield id 1.
#
sub get_or_assign_dbid
{
    my($dbh, $name, $version, $dbname) = @_;

    # Turn an undef statement result into an exception carrying the
    # handle's error text.
    my $fail = sub {
        my($what) = @_;
        my $why = $dbh->errstr;
        $why = 'unknown error' unless defined($why);
        die "$what: $why\n";
    };

    $dbh->begin_work();

    my $id;
    my $ok = eval {
        my $rows = $dbh->selectall_arrayref(qq(SELECT dbid
                                              FROM seq_db
                                              WHERE name = ? AND version = ? AND tax_db_name = ?), undef,
                                           $name, $version, $dbname);
        defined($rows) or $fail->("seq_db lookup failed");

        if (@$rows == 0)
        {
            #
            # Assign.
            #
            my $max = $dbh->selectcol_arrayref(qq(SELECT MAX(dbid) FROM seq_db));
            defined($max) or $fail->("cannot determine next dbid");

            $id = defined($max->[0]) ? $max->[0] + 1 : 1;
            print "new id=$id\n";

            my $rv = $dbh->do(qq(INSERT INTO seq_db (dbid, name, version, tax_db_name) VALUES (?, ?, ?, ?)),
                              undef, $id, $name, $version, $dbname);
            defined($rv) or $fail->("cannot register dbid $id in seq_db");
        }
        else
        {
            $id = $rows->[0]->[0];
            print "Found existing id $id\n";
        }

        $dbh->commit();
        1;
    };
    if (!$ok)
    {
        # Save the error first: the rollback's own eval resets $@.
        my $err = $@ || "unknown error in get_or_assign_dbid\n";
        eval { $dbh->rollback() };
        die $err;
    }

    return $id;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3