[Bio] / FigMetagenomeTools / 16s_taxa_mysql.pl Repository:
ViewVC logotype

View of /FigMetagenomeTools/16s_taxa_mysql.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Fri Jul 6 21:21:51 2007 UTC (12 years, 7 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_rel_2008_0806, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_rel_2008_0923, mgrast_release_3_0, mgrast_dev_03252011, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, mgrast_rel_2008_0919, mgrast_rel_2008_1110, myrast_33, mgrast_rel_2008_0917, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
Changes since 1.1: +19 -9 lines
MG server updates

#!/usr/bin/perl -w

# look through the -m 8 16S hits and find out the tax that have the best hits and the number of best hits to them
# this should compensate for multiple hits to the same organism

use strict;
use Data::Dumper;
use DBI;
use FIG_Config;

my $db='rdp';
my $usage=<<EOF;
$0 

-d database (default: $db) [alternatives  rdp, lsu, ssu]
-b blast file (can use multiple -b's)
-c cutoff e value default = 0.01
-l minimum length of hit (otherwise all hits)
-ah all hits output file
-bh best hits output file

EOF

my $cutoff=0.01;
my $minlen=0;
my @blastf;

my ($ahof, $bhof);

while (@ARGV) {
 my $t=shift;
 if ($t eq "-b") {push @blastf, shift}
 elsif ($t eq "-c") {$cutoff=shift}
 elsif ($t eq "-l") {$minlen=shift}
 elsif ($t eq "-ah") {$ahof=shift}
 elsif ($t eq "-bh") {$bhof=shift}
 elsif ($t eq "-d") {$db=shift}
}

die "$blastf[0] && $ahof && $bhof && $db\n".$usage unless ($blastf[0] && $ahof && $bhof && ($db eq "rdp" || $db eq "lsu" || $db eq "ssu"));



my $best; my $all; my $seen;

foreach my $blastf (@blastf) {
    die "blast file not found" unless (-e $blastf);
    print STDERR "Working on $blastf\n";
    if ($blastf =~ /gz$/) {open(IN, "gunzip -c $blastf|") || die "Can't open pipe"}
    else {open(IN, $blastf) || die "Can't open $blastf"}
    while (<IN>)
    {
        chomp;
        my @line=split /\t/;
        $line[1] =~ s/\|.*$//;
        next unless ($line[10] < $cutoff);
        next unless ($line[3] > $minlen);
        unless (exists $best->{$line[0]}) {push @{$best->{$line[0]}}, $line[1]}
        push @{$all->{$line[0]}}, $line[1];
        $seen->{$line[1]}=1;
    }
}
print STDERR "Getting mysql data\n";
my $taxa;
if ($db eq "ssu" || $db eq "lsu") {$taxa=&euro16Staxa($seen, $db)}
elsif ($db eq "rdp") {$taxa=&rdp16Staxa($seen)}

print STDERR "Done\n";

open(OUT, ">$bhof") || die "Can't write $bhof";
my($count, $leaf) = &countthem($best, $taxa);

for my $key (sort {uc($a) cmp uc($b)} keys %$count)
{
    print OUT join("\t", $key, $count->{$key}, $leaf->{$key}), "\n";
}
close OUT;

open(OUT, ">$ahof") || die "Can't write $ahof";
my ($count, $leaf) = &countthem($all, $taxa);
for my $key (sort {uc($a) cmp uc($b)} keys %$count)
{
    print OUT join("\t", $key, $count->{$key}, $leaf->{$key}), "\n";
}
close OUT;



sub countthem {
    my ($n, $taxa)=@_;

    my $count = {};
    my $leaf = {};
    foreach my $k (keys %$n)
    {
        foreach my $hit (@{$n->{$k}})
        {
            unless ($taxa->{$hit}) {print STDERR "No taxa for ", $hit, "\n"; $taxa->{$hit} = "Unclassified; $hit"}
            my $tx=$taxa->{$hit};
            $count->{$tx}++;
	    $leaf->{$tx}++;
            while ($tx =~ /;\s*/)
            {
                $tx =~ s/^(.*)\;\s+.*?$/$1/;
                $count->{$tx}++ if ($tx);
            }
        }
    }
    return $count, $leaf;
}


sub euro16Staxa {
    my ($need, $db)=@_;

    my $taxa;
    my @allk=sort keys  %$need;
    my $nsql=0;
    print STDERR "There are ", scalar(@allk), " keys\n";

    #my $dbh=DBI->connect("DBI:mysql:European_rRNA", "rob", "forestry")|| die "Can't connect to db";
    my $dbh = DBI->connect("DBI:${FIG_Config::dbms}:European_rRNA;$FIG_Config::metagenome_db_opts", $FIG_Config::metagenome_dbuser, $FIG_Config::metagenome_dbpass);
    
    while (@allk)
    {
        my @five=splice(@allk, 0, 5);
        my $sql="select  accession, organism, taxonomy from $db where ";
        map {$sql .= " accession='$_' or "} @five;
        $nsql++;
        $sql =~ s/\s*or\s*$//;

        my $exc=$dbh->prepare($sql);
        $exc->execute || die $dbh->errstr;
        while (my @ret=$exc->fetchrow_array)
        {
            while ($ret[2] =~ s/;\s*$//) {1};
            $taxa->{$ret[0]}="$ret[2]; $ret[1]";
        }
    }
    print STDERR "Tried $nsql queries\n";
    return $taxa;
}

sub rdp16Staxa {
    my $need=shift;
    my $taxa;
    my @allk=sort keys  %$need;
    my $nsql=0;
    print STDERR "There are ", scalar(@allk), " keys\n";
    while (@allk)
    {
        my @five=splice(@allk, 0, 5);
        my $sql="select  * from organism where ";
        map {$sql .= " accession='$_' or "} @five;
        $nsql++;
        $sql =~ s/\s*or\s*$//;

        #my $dbh=DBI->connect("DBI:mysql:16S", "rob", "forestry")|| die "Can't connect to db";

	my $dbh = DBI->connect("DBI:${FIG_Config::dbms}:16S;$FIG_Config::metagenome_db_opts", $FIG_Config::metagenome_dbuser, $FIG_Config::metagenome_dbpass);

        my $exc=$dbh->prepare($sql);
        $exc->execute || die $dbh->errstr;
        while (my @ret=$exc->fetchrow_array)
        {
            $ret[3] =~ s/^.*?Bacteria\;/Bacteria\;/;
            $ret[3] =~ s/FEATURES.*$//;
            $taxa->{$ret[1]}="$ret[3]; $ret[2]";
        }
    }
    print STDERR "Tried $nsql queries\n";
    return $taxa;
}



MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3