[Bio] / FigKernelScripts / generate_partitions_worker.pl Repository:
ViewVC logotype

View of /FigKernelScripts/generate_partitions_worker.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Thu Nov 11 17:39:53 2010 UTC (9 years ago) by disz
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.3: +2 -2 lines
FigFam 6 digit problem

use FIG;
use Digest::MD5;
my $fig = new FIG;
my $dbf = $fig->db_handle;
use FIGO;
my $figO = new FIGO;

use strict;

my $sim_thresh = 1.0e-10;
my($usage,$fams,$outD,$proc,$log_file);
$usage = "usage: generate_partition_worker Families OutDir Process < FIGfams > Partitions";

while ( $ARGV[0] =~ /^-/ )
{
    $_ = shift @ARGV;
    if ($_ =~ s/^-log//)     { $log_file      = ($_ || shift @ARGV) }
    else                  { print STDERR  "Bad flag: '$_'\n$usage"; exit 1 }
}

(
 ($fams = shift @ARGV) && open(FAMS,"<$fams") &&
 ($outD = shift @ARGV) &&
 ($proc = shift @ARGV)
)
    || die $usage;

open(OUT,">>$outD/$proc") || die "could not open $outD/$proc";
my $ofh = select OUT; $| = 1; select $ofh;
my %done_fam;

my(%fam_to_pegs,%peg_to_fams,%peg_to_md5);
while (defined($_ = <FAMS>))
{
    if ($_ =~ /^(\S+)\t(\S+)/)
    {
	my($fam,$peg) = ($1,$2);
	if (((is_fid($peg)) && (! $fig->is_deleted_fid($peg)) && (! $done_fam{$fam})) ||
	    (!is_fid($peg)))
	{
	    push(@{$fam_to_pegs{$fam}},$peg);
	    push(@{$peg_to_fams{$peg}},$fam);
	    #my $md5 = Digest::MD5::md5_hex( uc $seq );
	    #$peg_to_md5{$peg}=$md5;
	}
    }
}
close(FAMS);
#print STDERR "loaded FIGfamsd\n";

my $peg;
while (defined($_ = <STDIN>))
{
    if (!$fig){
	$fig = new FIG;
	$dbf = $fig->db_handle;
    }

    chomp $_;
    my $cond = "fam = '".$_."'";
    my $x = $dbf->SQL("select fam from tmp_sync_partitions where $cond");

#    if (($_ =~ /(FIG\d+)/) && (! $done_fam{$1}))
    if (($_ =~ /(FIG\d+)/) && (! $done_fam{$1}) && (! scalar @$x > 0))
    {
	my $ff = $1;
	print STDERR "$proc processing $ff\n";

	my $todo = { $ff => 1 };
	my $loaded_fams = {};
	my $state = [$fig,$figO,\%done_fam,$todo,$loaded_fams,\%fam_to_pegs,\%peg_to_fams,$sim_thresh,\%peg_to_md5];
	&load_fam($ff,$state);
	
	my @todo;
	while ((@todo = keys(%$todo)) > 0)
	{
	    my $fam = $todo[0];
	    &process($fam,$state);
#	    print STDERR &Dumper(['todo',$todo]);
	}
    }
}

sub load_fam {
    my($ff,$state) = @_;

    my($fig,$figO,$done_fam,$todo,$loaded_fams,$fam_to_pegs,$peg_to_fams,$sim_thresh,$peg_to_md5) = @$state;
   
    my $loaded_fam = $loaded_fams->{$ff};
    if (! $loaded_fam)
    {
	my $pegs = $fam_to_pegs{$ff};
	my @clean_pegs = ();
	my $id_flag = 'fid';
	foreach my $peg (@$pegs)
	{
	    my $featureO = new FeatureO($figO,$peg) if (&is_fid($peg));
	    if ($featureO && (! $fig->possibly_truncated($peg)) && (! $featureO->possible_frameshift))
	    {
		push(@clean_pegs,$peg);
	    }
	    #elsif (! is_fid($peg))
	    #{
	    #	# get the md5 sequence
	    #	my $md5 = $peg_to_md5->{$peg};
	    #	push(@clean_pegs, "gnl|md5|".$md5);
	    #	$id_flag = 'non_fid';
	    #}							
	}

	my @sims = ();
	my @presims = ();
	if ($id_flag eq 'fid')
	{
	    @presims = $fig->sims(\@clean_pegs,100000,$sim_thresh,'fig');
	}
	else {
	    @presims = $fig->sims(\@clean_pegs,100000,$sim_thresh,'raw');
	}

	foreach my $sim (@presims)
	{
	    my $match1 = $sim->e1 - $sim->b1;
	    my $match2 = $sim->e2 - $sim->b2;
	    if ((($match1 / $sim->ln1) > 0.7) &&
		(($match2 / $sim->ln2) > 0.7))
	    {
		push(@sims,$sim);
	    }
	}

	my $others_not_in_fam = {};
	my $other_fams        = {};
	my $worst_hit = 1000;
	foreach my $sim (sort { ($a->id1 cmp $b->id1) or ($b->bsc <=> $a->bsc) } @sims)
	{
	    my $id1 = $sim->id1;
	    my $id2 = $sim->id2;
	    my $in  = $peg_to_fams{$id2};
	    my $matched1 = ($sim->e1 + 1) - $sim->b1;
	    if (! $in)
	    {
		$others_not_in_fam->{$id2} = 1;
	    }
	    else
	    {
		foreach my $fam_in (@$in)
		{
		    if ($fam_in eq $ff)
		    {
#			print STDERR &Dumper([$worst_hit,$sim->bsc/$matched1]);
			if (((! defined($worst_hit)) || (($sim->bsc / $matched1) < $worst_hit)))
			{
			    $worst_hit = sprintf("%5.3f",($sim->bsc / $matched1));
			}
		    }
		    else
		    {
			if (! $other_fams->{$fam_in})
			{
#			    print STDERR "$ff - $fam_in $id1 $id2\n";
			}
			$other_fams->{$fam_in} = 1;
		    }
		}
	    }
	}
	$loaded_fams->{$ff} = [$worst_hit,$other_fams,$others_not_in_fam];
    }
#   print STDERR &Dumper(["loaded $ff",$loaded_fams->{$ff}]);
}

sub process {
    my($fam,$state) = @_;

    my($fig,$figO,$done_fam,$todo,$loaded_fams,$fam_to_pegs,$peg_to_fams,$sim_thresh,$peg_to_md5) = @$state;
    my $loaded = $loaded_fams->{$fam};
    my($worst_hit,$other_fams,$others_not_in_fam) = @$loaded;

    my @contained_in = ();
    my @not_contained_in = ();
    foreach my $famO (keys(%$other_fams))
    {
	&load_fam($famO,$state);
	my $other_fams2 = $loaded_fams->{$famO}->[1];
	my $contained = 1;
	foreach my $famO2 (keys(%$other_fams2))
	{
	    if (($famO2 ne $fam) && (! $other_fams->{$famO2}))
	    {
		$contained = 0;
	    }
	}

	if ($contained)
	{
	    push(@contained_in,$famO);
	}
	else
	{
	    push(@not_contained_in,$famO);
	}
    }

    my $set_to_write = [$fam,@contained_in];
    &write_set($set_to_write,$state,\@not_contained_in);
    foreach my $famD (@$set_to_write)
    {
	if ($todo->{$famD}) { delete $todo->{$famD} }
	$done_fam->{$famD} = 1;
	
	# add to db_handle to show that figfam is done for all other processes
	if (!$fig){
	    $fig = new FIG;
	    $dbf = $fig->db_handle;
	}
	$dbf->SQL("insert into tmp_sync_partitions (fam) values ('$famD')");
    }

    foreach my $famO (@not_contained_in)
    {
	if (!$fig){
            $fig = new FIG;
            $dbf = $fig->db_handle;
        }

	my $cond = "fam = '".$famO."'";
	my $x = $dbf->SQL("select fam from tmp_sync_partitions where $cond");

#	if (! $done_fam->{$famO})
	if ( (! $done_fam->{$famO}) && (! scalar @$x > 0) )
	{
	    $todo->{$famO} = 1;
	}
    }
}

sub write_set {
    my($set,$state,$not_contained_in) = @_;

    my($fig,$figO,$done_fam,$todo,$loaded_fams,$fam_to_pegs,$peg_to_fams,$sim_thresh,$peg_to_md5) = @$state;
    my %all_pegs;

    my @fams_with_worst = ();
    foreach my $fam (@$set)
    {
	my $loaded = $loaded_fams->{$fam};
	my($worst_hit,$other_fams,$others_not_in_fam) = @$loaded;
	push(@fams_with_worst,"$fam,$worst_hit");
	my $pegs = $fam_to_pegs->{$fam};
	foreach my $peg (@$pegs,keys(%$others_not_in_fam))
	{
	    $all_pegs{$peg} = 1;
	}
    }

    foreach my $fam (@$not_contained_in)
    {
	my $loaded = $loaded_fams->{$fam};
	my($worst_hit,$other_fams,$others_not_in_fam) = @$loaded;
	my $pegs = $fam_to_pegs->{$fam};
	foreach my $peg (@$pegs,keys(%$others_not_in_fam))
	{
	    $all_pegs{$peg} = 1;
	}
    }

    foreach $peg (sort { &FIG::by_fig_id($a,$b) } keys(%all_pegs))
    {
	print OUT "$peg\n";
    }
    print OUT "//\t",join(" ",@fams_with_worst),"\n";
}

sub is_fid {
    my ($fid) = @_;

    if ($fid =~ /^fig\|\d+\.\d+\.peg.\d+/){
	return 1;
    }
    return 0;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3