[Bio] / FigMetagenomeTools / 16s_by_taxa.pl Repository:
ViewVC logotype

View of /FigMetagenomeTools/16s_by_taxa.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Tue Apr 10 16:37:39 2007 UTC (12 years, 10 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_rel_2008_0806, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_rel_2008_0923, mgrast_release_3_0, mgrast_dev_03252011, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, mgrast_rel_2008_0919, mgrast_rel_2008_1110, myrast_33, mgrast_rel_2008_0917, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
new stuff that didn't make it in before

#!/usr/bin/perl -w

# look through the -m 8 16S hits and find out the tax that have the best hits and the number of best hits to them
# this should compensate for multiple hits to the same organism

use FIG_Config;

use strict;
use Data::Dumper;

my ($taxaf, @blastf)=("$FIG_Config::metagenome_data/rel9taxa.txt");

my $usage=<<EOF;
$0 
-t taxonomy file (default $taxaf)
-b blast file (can use multiple -b's)
-c cutoff e value default = 0.01
-s short report if available
-l minimum length of hit (otherwise all hits)

EOF

my $cutoff=0.01;
my $short; my $minlen=0;

while (@ARGV) {
 my $t=shift;
 if ($t eq "-t") {$taxaf=shift}
 elsif ($t eq "-b") {push @blastf, shift}
 elsif ($t eq "-c") {$cutoff=shift}
 elsif ($t eq "-s") {$short=1}
 elsif ($t eq "-l") {$minlen=shift}
}

die $usage unless ($taxaf && $blastf[0]);

my $genus; my $species; my $wholeline;
if ($taxaf =~ /gz$/) {open(IN, "gunzip -c $taxaf|") || die "Can't open pipe"}
else {open(IN, $taxaf) || die "Can't open $taxaf"}

while (<IN>) 
{
 chomp;
 s/\s*REFERENCE.*?$//;
 my @line=split /\t/;
 $wholeline->{$line[0]}=\@line;
 my @geni=split /\;\s+/, $line[2];
 
 my $b=$geni[0];
 $b =~ s/sp\..*$//; $b =~ s/unclassified_//;
 $geni[$#geni] =~ s/sp\..*$//; $geni[$#geni] =~ s/unclassified_//;
 foreach my $qw (qw[uncultured bacteria bacterium])
 {
  $b =~ s/\b$qw\b//i;
  $geni[$#geni] =~ s/\b$qw\b//;
 }
 unless ($b) {die "Obliterated everything from $geni[0]"}
 #if ($b =~ /^(.*?\_)/) {print STDERR "ERR: $1\n"}
 #if ($b =~ /(\_.*?)$/) {print STDERR "ERR: $1\n"}
 
 #$species->{$line[0]}=shift @geni;
 shift @geni;
 $species->{$line[0]}=$b;
 $genus->{$line[0]}=\@geni;
}
close IN;


my $allbest; my %best;
foreach my $blastf (@blastf) {
 print STDERR "Working on $blastf\n";
 if ($blastf =~ /gz$/) {open(IN, "gunzip -c $blastf|") || die "Can't open pipe"}
 else {open(IN, $blastf) || die "Can't open $blastf"}
 while (<IN>)
 {
  chomp;
  my @line=split /\t/;
  next unless ($line[10] < $cutoff);
  next unless ($line[3] > $minlen);
  unless (exists $best{$line[0]}) {$best{$line[0]}=11000}
  next unless ($line[10] <= $best{$line[0]});
  if ($line[10] < $best{$line[0]})
  {
   $allbest->{$line[0]}=[\@line];
   $best{$line[0]}=$line[10];
  }
  elsif ($line[10] == $best{$line[0]})  
  {
   push @{$allbest->{$line[0]}}, \@line;
  }
 }
}

my $count; my $total; my $speciescount;
my $longest; my $bestspecies;
foreach my $hit (keys %$allbest)
{
 if (scalar(@{$allbest->{$hit}}) == 1)
 {
  $longest->{$hit}=join("\t", @{$genus->{$allbest->{$hit}->[0]->[1]}});
  $bestspecies->{$hit}=$species->{$allbest->{$hit}->[0]->[1]};
  $count->{$hit}->{$longest->{$hit}}=1;
  $speciescount->{$hit}->{$bestspecies->{$hit}}=1;
  $total->{$hit}=1;
  next;
 }
 foreach my $line (@{$allbest->{$hit}})
 {
  my $type;
  eval {$speciescount->{$hit}->{$species->{$line->[1]}}++};
  if ($@) 
  {
    print STDERR join("\n", $@, "LINE: ", Dumper($line), "HIT", Dumper($hit), "Speciescount", Dumper($speciescount), "", "");

  }
  $total->{$hit}++;
  foreach my $g (@{$genus->{$line->[1]}})
  {
   $type .= "\t$g";
   $count->{$hit}->{$type}++;
  }
 }
}

foreach my $hit (keys %$allbest)
{
 foreach my $species (keys %{$speciescount->{$hit}}) 
 {
  if ($speciescount->{$hit}->{$species} >= (0.5 * $total->{$hit})) 
  {
   $bestspecies->{$hit}=$species;
  }
 }
 unless ($longest->{$hit}) {$longest->{$hit}=0}
 foreach my $ty (keys %{$count->{$hit}})
 {
  #print STDERR $hit, $ty, "\t", $count->{$hit}->{$ty}, "\t", $total->{$hit}, "\n"; 
  #if (($count->{$hit}->{$ty} == $total->{$hit}) && (length($ty) > length($longest->{$hit})))
  #if ($hit eq "A05_SM45BB11.27F_001") {print join("\t", $hit, $ty, $count->{$hit}->{$ty}), "\n"}
  if (($count->{$hit}->{$ty} >= (0.5 * $total->{$hit})) && (length($ty) > length($longest->{$hit})))
  {
   $longest->{$hit}=$ty;
  }
 }
}

foreach my $hit (keys %$allbest)
{
 if ($bestspecies->{$hit} && $short && $wholeline->{$bestspecies->{$hit}})
 {
   unless ($wholeline->{$bestspecies->{$hit}}) {$wholeline->{$bestspecies->{$hit}}=[]}
   print join("\t", $hit, $bestspecies->{$hit}, $speciescount->{$hit}->{$bestspecies->{$hit}}, $total->{$hit}, @{$wholeline->{$bestspecies->{$hit}}}), "\n";
 }
 elsif ($longest->{$hit})
 {
  print join("\t", $hit, $longest->{$hit}), "\n";
 }
 else 
 {
  print STDERR "No unique for $hit\n";
 }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3