[Bio] / FigKernelScripts / find_poss_errs.pl Repository:
ViewVC logotype

View of /FigKernelScripts/find_poss_errs.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Thu Nov 11 17:41:49 2010 UTC (9 years ago) by disz
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.2: +1 -1 lines
FigFam 6 digit problem

########################################################################

use strict;
use FIG;
use FIGV;
use FS_RAST;
use FIG_Config;
use Data::Dumper;
use Carp;
use gjoseqlib;
use FF;
use FFs;
my $ffs = new FFs($FIG_Config::FigfamsData);

my $usage = "find_poss_errs -g Genome [-d RASTdir] > table.of.suggestions";

my($figvD,$this_genome,$dir);
while (@ARGV && ($ARGV[0] =~ /^-/))
{
    $_ = shift @ARGV;
    if    ($_ =~ s/^-d//) { $figvD        = ($_ || shift @ARGV) }
    elsif ($_ =~ s/^-g//) { $this_genome  = ($_ || shift @ARGV) }
    else                  { die "Bad Flag: $_" }
}

my $fig;
if ($this_genome && $figvD && (-d $figvD))
{
    $dir = $figvD;
    $fig = new FIGV($dir);
}
elsif ($this_genome && (-d "$FIG_Config::organisms/$this_genome"))
{
    $dir = "$FIG_Config::organisms/$this_genome";
    $fig = new FIG;
}
else
{
    die "$usage";
}

my @famids = map { ($_ =~ /^(FIG\d+)/) ? $1 : () } <DATA>;
$#famids = 5;
my @genomes = &get_neighbors(\@famids);

my @to_look_for = &get_genes_to_seek($fig,$this_genome,\@genomes);
my @contigs = &gjoseqlib::read_fasta("$dir/contigs");

foreach my $entry (@to_look_for)
{
    my($role,$pegs) = @$entry;
    &process_one_search($fig,$ffs,\@contigs,$this_genome,$role,$pegs);
}

sub process_one_search {
    my($fig,$ffs,$contigs,$genome,$role,$pegs) = @_;

    my @pegs = $fig->seqs_with_role($role,undef,$this_genome);
    if (@pegs > 0) { die "Bad Role: $role" }
    
    my $fam;
    my %fams = map { ($fam = $ffs->family_containing_peg($_)) ? ($fam => 1) : () } @$pegs;
    my @fams = sort keys(%fams);

    foreach $fam (@fams)
    {
	my $params = { contigs => $contigs };
	my @results = &FS_RAST::figfam_search($fam,$params);
	foreach my $result (@results)
	{
	    my($location,$translation,$famid,$infam,$iden,$func,$genomes,$annotation) = @$result;
	    print "$infam\t$famid\t$iden\t$func\t$translation\n$location\n",
	          join(",",@$genomes),"\n$annotation\n//\n";
        }
    }
}

sub get_genes_to_seek {
    my($fig,$this_genome,$genomes) = @_;
    
    my %to_seek;
    my $hits_in_this_genome = {};
    foreach my $genome (@$genomes)
    {
#	print STDERR "processing $genome\n";
	my $activeSS = $fig->active_subsystems($genome);
	my $hits_in_genome = {};
	my $subsys_data = $fig->get_genome_subsystem_data($genome);
	foreach my $tuple (@$subsys_data)
	{
	    my($subsys,$role,$peg) = @$tuple;
	    if ($fig->usable_subsystem($subsys) && $activeSS->{$subsys})
	    {
		$hits_in_genome->{$subsys}->{$role}->{$peg} = 1;
		if (! defined($hits_in_this_genome->{$role}))
		{
		    my @pegs = $fig->seqs_with_role($role,undef,$this_genome);
		    if (@pegs > 0)
		    {
			$hits_in_this_genome->{$role} = 1;
		    }
		    else
		    {
#			print STDERR &Dumper(['seeks_with_role',$role,\@pegs]);
		    }

		}
	    }
	}

	foreach my $ss (keys(%$hits_in_genome))
	{
	    my @roles = sort keys(%{$hits_in_genome->{$ss}});
	    my $rolesN = @roles;
	    my @got   = grep { $hits_in_this_genome->{$_} } @roles;
	    my $gotN = @got;
	    
	    if (@got >= (@roles / 2))
	    {
		my @missed = grep { ! $hits_in_this_genome->{$_} } @roles;
		foreach my $role (@missed)
		{
		    foreach my $peg (keys(%{$hits_in_genome->{$ss}->{$role}}))
		    {
			$to_seek{$role}->{$peg} = 1;
		    }
		}
	    }
	}
    }
    my @to_look_for = ();
    foreach my $role (keys(%to_seek))
    {
	my $x = $to_seek{$role};
	push(@to_look_for,[$role,[sort keys(%$x)]]);
    }
    return @to_look_for;
}

sub get_neighbors {
    my($famids) = @_;

    my %genomes_hit;
    while (my $famid = shift @$famids)
    {
	my $famidM   = substr($famid,6,3);
	my $contigsF = "$dir/contigs";
	my $repsF    = "$FIG_Config::FigfamsData/FIGFAMS/$famidM/$famid/reps";
	my $familyDB = "$FIG_Config::FigfamsData/FIGFAMS/$famidM/$famid/PEGs.fasta";
	my @hits     = ();
	my $params   = {};
	if ((-s $contigsF) && (-s $repsF) && (-s $familyDB))
	{
	    my @contigs = &gjoseqlib::read_fasta($contigsF);
	    my @reps = &gjoseqlib::read_fasta($repsF);
	    my @fam  = &gjoseqlib::read_fasta($familyDB);
	    
	    $params->{contigs} = \@contigs;
	    $params->{family}  = \@fam;
	    $params->{reps}    = \@reps;
	    @hits = &FS_RAST::find_and_correct($params);
	}

	my $famO;
	if (@hits > 0) 
	{
	    $famO = new FF($famid,$FIG_Config::FigfamsData);
	}

	foreach my $hit (@hits)
	{
	    my $func;
	    my($infam,$sims) = $famO->should_be_member($hit->{translation});
	    my $iden = $sims->[0] ? $sims->[0]->iden : 0;
	    
	    if ($infam)
	    {
		my $i;
		for ($i=0; ($i < @$sims) && ($i < 4); $i++)
		{
		    if ($sims->[$i]->[1] =~ /^fig\|(\d+\.\d+)/)
		    {
			$genomes_hit{$1}++;
		    }
		}
	    }
	}
    }
    my @genomes = sort { $genomes_hit{$b} <=> $genomes_hit{$a} } keys(%genomes_hit);
    if (@genomes > 5) { $#genomes = 4 }
    return @genomes;
}

sub figfam_search {
    my($famid,$params) = @_;

    $params->{contigs} || confess "missing contigs";

    my $famidM = substr($famid,6,3);
    my $repsF    = "$FIG_Config::FigfamsData/FIGFAMS/$famidM/$famid/reps";
    my $familyDB = "$FIG_Config::FigfamsData/FIGFAMS/$famidM/$famid/PEGs.fasta";
    my @hits = ();
    if ((-s $repsF) && (-s $familyDB))
    {
	my @reps = &gjoseqlib::read_fasta($repsF);
	my @fam  = &gjoseqlib::read_fasta($familyDB);

	$params->{family}  = \@fam;
	$params->{reps}    = \@reps;
	@hits = &FS_RAST::find_and_correct($params);
    }

    my $famO;
    if (@hits > 0) 
    {
	$famO = new FF($famid,$FIG_Config::FigfamsData);
    }

    my @results = ();
    foreach my $hit (@hits)
    {
	my $func;
	my($infam,$sims) = $famO->should_be_member($hit->{translation});
	my $iden = $sims->[0] ? $sims->[0]->iden : 0;

	if ($infam)
	{
	    $func = $famO->family_function(1);
	}
	else
	{
	    $infam = 0;
	    $func = "";
	}
	push(@results,[$hit->{location},
		       $hit->{translation},
		       $famid,
		       $infam,
		       $iden,
		       $func,
		       $hit->{genomes},
		       $hit->{annotation} ? $hit->{annotation} : ''
                      ]);
    }
    return @results;
}

__DATA__
FIG000087
FIG000040
FIG000194
FIG000153
FIG000127
FIG000069
FIG000001
FIG069855
FIG000144
FIG000129
FIG000121
FIG000098

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3