[Bio] / FortyEightMeta / load_tax.pl Repository:
ViewVC logotype

View of /FortyEightMeta/load_tax.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Fri Mar 28 20:18:49 2008 UTC (11 years, 9 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_rel_2008_0806, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_rel_2008_0923, mgrast_release_3_0, mgrast_dev_03252011, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, mgrast_rel_2008_0919, mgrast_rel_2008_1110, myrast_33, mgrast_rel_2008_0917, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
Changes since 1.2: +65 -65 lines
Updates to mgrast sim loader.

#!/usr/bin/perl

use strict;
use DBI;
use Data::Dumper;
use FIG;
use FIG_Config;
use TaxStr;

#
# Script-wide state shared by the loader subs in this file.
#

# Roots of the metagenome reference data and of the SEED data tree,
# both taken from the site configuration.
my $metagenome_data = $FIG_Config::rast_mg_data;

my $seed_data = $FIG_Config::data;

# Connect to the configured MySQL database over the configured socket.
# Fail right here with the driver's message if the connection cannot be
# made; otherwise the failure would only show up later as a method call
# on an undefined handle.
my $dbh = DBI->connect("dbi:mysql:$FIG_Config::mgrast_db;mysql_socket=$FIG_Config::dbsock", $FIG_Config::mgrast_dbuser)
    or die "cannot connect to database $FIG_Config::mgrast_db: $DBI::errstr";

# Helper used by insert_tax/insert_ss to turn taxonomy names into ids.
my $tax_db = TaxStr->new($dbh);

# Length of the longest tax string written by insert_tax; reported at
# the end of the run.  Starts at 0 so the report is numeric even when
# nothing was inserted.
my $maxl = 0;

# Insert statements, prepared once and reused for every row.
my $tax_insert = $dbh->prepare(qq(INSERT INTO rdp_to_tax(dbid, seq_num, tax_str) VALUES (?, ?, ?)));
my $ss_insert = $dbh->prepare(qq(INSERT INTO ss_to_tax(name, tax_str) VALUES (?, ?)));

#
# Load the tax data mappings into the database.
#
# Each stage pairs the banner printed on stdout with the loader call(s)
# it announces.  Stages run strictly in the order listed.
#

my @stages = (
    [ "Load subsystems\n", sub { load_subsystems("$seed_data/Subsystems") } ],
    [ "load 16s\n",        sub { load_16s("$metagenome_data/release9_27_unaligned.gb", '16s') } ],
    [ "Load seed\n",       sub { load_seed("$seed_data/Organisms") } ],
    [ "load gg\n",         sub { load_gg("$metagenome_data/greengenes_part2.tab", 'gg') } ],
    [ "Load lsu/ssu\n",    sub {
          load_xsu("$metagenome_data/lsu_headers.txt", 'lsu');
          load_xsu("$metagenome_data/ssu_headers.txt", 'ssu');
      } ],
);

for my $stage (@stages) {
    my ($banner, $run) = @$stage;
    print $banner;
    $run->();
}

# Longest tax string length recorded by insert_tax during the run.
print "max length $maxl\n";

#
# Load subsystem classifications from the subsystem directory $dir.
#
# Every non-hidden subdirectory of $dir is treated as one subsystem and
# its name is printed to stdout.  If the subdirectory has a readable
# CLASSIFICATION file, the first line of that file is split at the first
# tab into at most two fields, the subsystem name is stored as the third
# element, and the result is handed to insert_ss.  Subsystems without a
# readable CLASSIFICATION file are printed but not inserted.
#
# Dies if $dir cannot be opened.
#
sub load_subsystems {
    my($dir) = @_;

    opendir(my $dh, $dir) or die "cannot opendir $dir: $!";
    # Take the whole (sorted) listing up front so the directory handle
    # is released before any database work starts.
    my @names = sort readdir($dh);
    closedir($dh);

    for my $ss (@names) {
        next if $ss =~ /^\./;
        my $sdir = "$dir/$ss";
        next unless -d $sdir;
        print "$ss\n";

        # A missing or unreadable CLASSIFICATION file is not an error.
        open(my $fh, '<', "$sdir/CLASSIFICATION") or next;
        my $c = <$fh>;
        close($fh);

        # An empty file yields no line at all; treat it as an empty
        # classification so the slots below are simply left undefined.
        $c = '' unless defined $c;
        chomp $c;

        my @a = split(/\t/, $c, 2);
        # The subsystem itself is always the third level, even when the
        # classification supplied fewer than two fields.
        $a[2] = $ss;
        insert_ss($ss, @a);
    }
}

#
# Load the taxonomy of every genome directory found under $seed.
#
# Only entries whose name looks like "<digits>.<digits>" and which are
# directories are considered.  The value obtained from
# FIG::file_head("<dir>/TAXONOMY", 1) (presumably the first line of the
# file -- confirm against FIG.pm) is split on ";" plus optional
# whitespace and stored via insert_tax under the 'seed' database id,
# keyed by the directory name.  Entries with an empty TAXONOMY value
# are skipped.
#
# Dies if $seed cannot be opened.
#
sub load_seed {
    my ($seed) = @_;

    my $ident = get_db_ident('seed');

    opendir(my $org_dh, $seed) or die "cannot opendir $seed: $!";

    ENTRY:
    for my $genome (readdir($org_dh)) {
        next ENTRY unless $genome =~ /^\d+\.\d+$/;

        my $genome_dir = "$seed/$genome";
        next ENTRY unless -d $genome_dir;

        my $lineage = &FIG::file_head("$genome_dir/TAXONOMY", 1);
        next ENTRY if $lineage eq '';
        chomp $lineage;

        my @levels = split(/;\s*/, $lineage);
        insert_tax($ident, $genome, \@levels);
    }

    closedir($org_dh);
}

#
# Load a tab-separated lsu/ssu header file.
#
#   $file   path of the header file
#   $idstr  db_type name under which the rows are stored
#
# For every line, the first field is used as the sequence id.  Fields
# two to four are ignored, as is the final field.  Whatever lies between
# the fourth and the final field is the list of taxonomy names passed to
# insert_tax.  Lines without a sequence id (blank lines, for example)
# are skipped instead of being inserted with an undefined id.
#
# Prints "Load $file" to stdout.  Dies if $file cannot be opened.
#
sub load_xsu {
    my($file, $idstr) = @_;

    my $ident = get_db_ident($idstr);

    open(my $fh, '<', $file) or die "cannot open $file: $!";
    print "Load $file\n";

    while (my $line = <$fh>) {
        chomp $line;

        my($id, undef, undef, undef, @tax) = split(/\t/, $line);
        next unless defined $id and $id ne '';

        # The last column is not part of the taxonomy.
        pop(@tax);

        insert_tax($ident, $id, \@tax);
    }
    close($fh);
}


#
# Load taxonomy strings from a flat file made of LOCUS / SOURCE /
# ORGANISM records (the layout looks like GenBank format).
#
#   $file   path of the flat file
#   $idstr  db_type name under which the rows are stored
#
# The identifier on each LOCUS line becomes the sequence id.  Repeated
# identifiers are reported on stdout as "DUP ..." but are still loaded.
# After a SOURCE line the ORGANISM line is located, and the indented
# lines that follow it are joined into one taxonomy string, which is
# split on ";" plus whitespace and stored via insert_tax.
#
# The parsing below reads ahead on the same handle from several places
# and relies on the exact order of those reads; change it with care.
#
# Dies if $file cannot be opened.  Parse problems are only warned about.
#
sub load_16s {
    my($file, $idstr) = @_;

    my $ident = get_db_ident($idstr);

    open(my $gb, '<', $file) or die "cannot open $file: $!";

    my %seen;		# LOCUS ids met so far, for duplicate reporting

    my $cur;		# id from the most recent LOCUS line
    while (<$gb>) {
        chomp;
        if (/^LOCUS\s+(\S+)/) {
            $cur = $1;
            if ($seen{$cur}) {
                print "DUP $cur $.\n";
            }
            $seen{$cur}++;
        }
        elsif (/^SOURCE/) {
            # Count the continuation lines of SOURCE (those starting
            # with 12 spaces).  The loop stops on the first line that is
            # not a continuation, which is left in $_.
            my $skip = 0;
            while (<$gb>) {
                if (/^ {12}/) {
                    $skip++;
                }
                else {
                    last;
                }
            }
            chomp;
            if (/^\s+ORGANISM\s+(.*)\s*$/) {
                # Discard as many lines after ORGANISM as SOURCE had
                # continuation lines, then read the first taxonomy line.
                # NOTE(review): this presumes the organism name wraps
                # exactly like the SOURCE text did -- confirm against
                # the data.
                for my $s (1..($skip + 1)) {
                    $_ = <$gb>;
                }

                # Join every following indented line into one string.
                # The line that ends the run is consumed here and is not
                # looked at again by the outer loop.
                my $tax = '';
                while (defined($_) and /^\s+(.*)\S*/) {
                    $tax .= " $1";
                    $_ = <$gb>;
                }
                if ($tax eq '') {
                    warn "empty tax at $.\n";
                }
                $tax =~ s/\.$//;

                # NOTE(review): every fragment is prefixed with a space,
                # so the first name keeps a leading blank -- verify that
                # tax_name_to_id copes with that before changing it.
                my @tax = split(/;\s+/, $tax);

                insert_tax($ident, $cur, \@tax);
            }
            else {
                warn "bad parse 1 for org at $.\n";
            }
        }
    }
    close($gb);
}

#
# Load a tab-separated greengenes table.
#
#   $file   path of the table; its first line is a header and is skipped
#   $idstr  db_type name under which the rows are stored
#
# On every data line the first column is ignored, the second column is
# the sequence id, and each remaining column is a taxonomy string.  Each
# taxonomy string is split on ";" plus whitespace, entries starting with
# "otu" are dropped, and what is left is stored via insert_tax.  A line
# with several taxonomy columns therefore produces several rows for the
# same sequence id.
#
# Dies if $file cannot be opened.
#
sub load_gg {
    my($file, $idstr) = @_;

    my $ident = get_db_ident($idstr);

    open(my $fh, '<', $file) or die "cannot open $file: $!";
    scalar(<$fh>);		# skip header

    while (my $line = <$fh>) {
        chomp $line;
        my(undef, $seq, @tax) = split(/\t/, $line);

        for my $tax (@tax) {
            my @ent = grep { !/^otu/ } split(/;\s+/, $tax);
            insert_tax($ident, $seq, \@ent);
        }
    }
    close($fh);
}

#
# Store one sequence-to-taxonomy row in rdp_to_tax.
#
#   $dbid  db_type id of the data source
#   $seq   sequence identifier within that source
#   $tax   reference to the list of taxonomy names
#
# The names are converted with $tax_db->tax_name_to_id and the results
# are joined with ":" to form the stored tax string.  The file-level
# $maxl is raised whenever a longer tax string is seen.  Returns
# whatever the prepared statement's execute returns.
#
sub insert_tax {
    my($dbid, $seq, $tax) = @_;

    my @ids = map { $tax_db->tax_name_to_id($_) } @$tax;
    my $tstr = join(":", @ids);

    my $len = length($tstr);
    if ($len > $maxl) {
        $maxl = $len;
    }

    return $tax_insert->execute($dbid, $seq, $tstr);
}

#
# Store one subsystem row in ss_to_tax.
#
#   $name  subsystem name
#   @tax   classification names for that subsystem
#
# The names are converted with $tax_db->tax_name_to_id and the results
# are joined with ":" to form the stored tax string.  Returns whatever
# the prepared statement's execute returns.
#
sub insert_ss {
    my($name, @tax) = @_;

    my @ids = map { $tax_db->tax_name_to_id($_) } @tax;
    my $tstr = join(":", @ids);

    return $ss_insert->execute($name, $tstr);
}

#
# Return the dbid registered in db_type for the name $str.
#
# If db_type has no row with that name, one is inserted and the id
# reported by the driver for that insert (mysql_insertid) is returned.
#
sub get_db_ident {
    my($str) = @_;

    my $found = $dbh->selectcol_arrayref(qq(SELECT dbid
					 FROM db_type
					 WHERE name = ?), undef, $str);

    # Already registered: use the first matching id.
    return $found->[0] if @$found;

    # Not registered yet: create the row and hand back its new id.
    $dbh->do(qq(INSERT INTO db_type (name) VALUES (?)), undef, $str);
    return $dbh->{mysql_insertid};
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3