[Bio] / FigKernelScripts / summarize_taxonomy_by_blast.pl Repository:
ViewVC logotype

View of /FigKernelScripts/summarize_taxonomy_by_blast.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (download) (as text) (annotate)
Sun Mar 16 17:07:24 2008 UTC (11 years, 8 months ago) by redwards
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.8: +3 -0 lines
removing fatal error if 0 or 1 hits

#__perl__
#

=pod

=head1 summarize_taxonomy_by_blast.pl

Generate a summary of the taxonomic profile of a genome [e.g. a metagenome] based on the best or all blast hits

=cut


use strict;
use FIG;
use FIGV;
use FileHandle;
use Data::Dumper;

use DB_File;

my $usage = "$0 [-orgdir organism-dir] [-n maxhits] [-b btree-file] genome";

my $orgdir;
my $n = 1;
my $btree_file;

while (@ARGV)
{
    my $arg = $ARGV[0];
    if ($arg =~ /^-(.*)/)
    {
	my $flag = $1;
	shift;
	
	if ($flag eq 'orgdir')
	{
	    $orgdir = shift;
	}
	elsif ($flag eq 'n')
	{
	    $n = shift;
	}
	elsif ($flag eq 'b')
	{
	    $btree_file = shift;
	}
	else
	{
	    die $usage;
	}
    }
    else
    {
	last;
    }
}

my $fig;
if ($orgdir)
{
    $fig=new FIGV($orgdir);
}
else
{
    $fig=new FIG;
}

my $genome=shift || die "$usage\n";

my($btree_tie, %btree, $hits_fh, $hits_tmp);

if ($btree_file)
{
    #
    # Must unlink otherwise we get dups?
    #
    unlink($btree_file);
    $hits_tmp = "$FIG_Config::temp/tax.$$.tmp";
    $hits_fh = new FileHandle("|sort > $hits_tmp") or die "Cannot open sort to $hits_tmp for writing: $!\n";

    $btree_tie = tie %btree, 'DB_File', $btree_file, O_RDWR | O_CREAT, 0666, $DB_BTREE;
    
    $btree_tie or die("Creation of btree $btree_file failed: $!\n");
}
     

print STDERR "Looking through $n hits\n";
my $at; my $tot;
my %leaf;

if ($orgdir and open(EXP, "<$orgdir/expanded_similarities"))
{
    #
    # Rapid scan.
    #
#    warn "Scanning sims\n";

    #
    # We need to sort the sim blocks because the expanded sims aren't necessarily
    # sorted in match order.
    #	

    my @block;

    my $last;

    while (<EXP>)
    {
	chomp;

	if (/^([^\t]+)\t(fig\|(\d+\.\d+)[^\t]+)\t([^\t]*)\t[^\t]*\t[^\t]*\t[^\t]*\t[^\t]*\t[^\t]*\t[^\t]*\t[^\t]*\t([^\t]*)\t([^\t]*)/ and $fig->is_genome($3))
	{
	    my($id1, $id2, $genome, $iden, $psc, $bsc) = ($1, $2, $3, $4, $5, $6);
	    my $ent = [$id1, $id2, $genome, $iden, $psc, $bsc];
		
	    if (defined($last) and $id1 ne $last)
	    {
		handle_block(\@block, $n);
		@block = ();
	    }
	    $last = $id1;
	    push(@block, $ent);
	}
    }
    handle_block(\@block, $n);
    close(EXP);
}
else
{
    my @pegs = $fig->pegs_of($genome);
    
    foreach my $sim ($fig->sims(\@pegs, $n, 0.01, "figx"))
    {
	my $tax=$fig->taxonomy_of($fig->genome_of($sim->[1]));

	print $hits_fh "$tax\t$sim->[0]\n" if $hits_fh;
	
	$leaf{$tax} = 1;
	
	while ($tax =~ /\;/)
	{
	    $at->{$tax}++;
	    $tax =~ s/(.*)\s*\;.*?$/$1/;
	}
	$tot++;
    }
}

print STDERR "Read $tot pegs for $genome\n";

map {print join("\t", $_, $at->{$_}, exists($leaf{$_})), "\n"} sort {uc($a) cmp uc($b)} keys %$at;

if ($hits_fh)
{
#    warn "Awaiting sort finish\n";
    close($hits_fh) or die "Close of sort pipe failed: \$!=$! \$?=$?\n";

    #
    # Read sorted tmpfile and index.
    #

#    warn "Writing btree\n";
    open(S, "<$hits_tmp") or die "Cannot read $hits_tmp: $!\n";

    my $last;
    my @list;
    while (<S>)
    {
	chomp;
	my($tax, $id) = split(/\t/);
	if (defined($last) and $tax ne $last)
	{
	    my $str = join($;, @list);
	    $btree{$last} = $str;
	    @list = ();
	}
	$last = $tax;
	push(@list, $id);
    }
    $btree{$last} = join($;, @list);
    
    untie %btree;

    close(S);
    # unlink($hits_tmp);
}

sub handle_block
{
    my($block, $n) = @_;

    my @b = sort { $a->[4] <=> $b->[4] } @$block;

    for (my $i = 0; $i < $n; $i++)
    {
    	
	next unless (defined $b[$i]);

	my($id1, $id2, $genome) = @{$b[$i]};

	my $tax=$fig->taxonomy_of($genome);

	my $show = $tax eq 'Bacteria; Actinobacteria; Symbiobacterium; Symbiobacterium thermophilum IAM 14863.';
	
	print $hits_fh "$tax\t$id1\n" if $hits_fh;
	$leaf{$tax} = 1;
	
	while ($tax =~ /\;/)
	{
	    $at->{$tax}++;
	    $tax =~ s/(.*)\s*\;.*?$/$1/;
	}
	$tot++;
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3