[Bio] / FigMetagenomeTools / blast2taxa.pl Repository:
ViewVC logotype

View of /FigMetagenomeTools/blast2taxa.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Fri Jul 6 21:21:51 2007 UTC (12 years, 7 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_rel_2008_0806, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_rel_2008_0923, mgrast_release_3_0, mgrast_dev_03252011, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, mgrast_rel_2008_0919, mgrast_rel_2008_1110, myrast_33, mgrast_rel_2008_0917, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
Changes since 1.1: +4 -2 lines
MG server updates

#!/usr/bin/perl -w

# convert the blast against greengenes into a taxonomy profile
# the greengenes data is available from http://greengenes.lbl.gov/ and I converted
# this into a simple tab separated file that has the following columns:
#
#     number (arbitary)
#     prokMSA_id (a unique ID)
#     Hugenholtz taxonomy
#     RDP taxonomy
#
# I like Phil's more than the RDP one.
#
# Usage: blast2taxa sims-output besthits-file allhits-file
#

use FIG_Config;
use strict;

@ARGV == 3 or die "usage: $0 sims-output besthits-file allhits-file\n";

my $sims_file = shift;
my $besthits = shift;
my $allhits = shift;

my $gg_data = "$FIG_Config::metagenome_data/greengenes_part2.tab";

-f $sims_file or die "Sims file $sims_file does not exist\n";
if ($sims_file =~ /gz$/)
{
    
    open(SIMS, "gunzip -c $sims_file |") or die "Cannot open unzip pipe from $sims_file\n";
}
else
{
    open(SIMS, "<$sims_file") or die "Cannot open sims data $sims_file: $!\n";
}

open(IN, $gg_data) or
    die "Can't open the file $gg_data. Please check where it is, or grab it from Rob";

my %taxa;
my %leaf;
while (<IN>)
{
    chomp;
    my @a=split /\t/;
    $taxa{$a[1]}=$a[2];
    $leaf{$a[2]} = 1;
}
close(IN);

my ($mine, $minlength)=(1e-5, 50);

my %best;
my %all;
my %seen;

while (<SIMS>)
{
    chomp;
    my @a=split /\t/;
    next unless ($a[10] < $mine && $a[3] > $minlength);
    my $taxonomy=$taxa{$a[1]};
    unless ($taxonomy) {$taxonomy="Unclassified"}
    print STDERR "q: $a[1] tax: $taxonomy\n" unless ($taxonomy);
    while ($taxonomy =~ /;\s*/) 
    {
        $best{$taxonomy}++ unless ($seen{$a[0]});
        $all{$taxonomy}++;
        $taxonomy =~ s/^(.*)\;\s*.*?$/$1/;
    }
    $best{$taxonomy}++ unless ($seen{$a[0]});
    $all{$taxonomy}++;
    $seen{$a[0]}=1;
}
close SIMS;

open(OUT, ">$besthits") || die "Can't write to $besthits: $!";
map {print OUT "$_\t$best{$_}\t$leaf{$_}\n"} sort {uc($a) cmp uc($b)} keys %best;
close OUT;

open(OUT, ">$allhits") || die "Can't write to $allhits: $!";
map {print OUT "$_\t$all{$_}\t$leaf{$_}\n"} sort {uc($a) cmp uc($b)} keys %all;
close OUT;



MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3