[Bio] / FigKernelScripts / make_partition_reps_worker.pl Repository:
ViewVC logotype

View of /FigKernelScripts/make_partition_reps_worker.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Thu Oct 8 18:58:31 2009 UTC (10 years, 4 months ago) by arodri7
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.2: +3 -1 lines
commit change

use strict;

use FIG;
# FIG object and its database handle.  The handle is used further down to
# query and extend the tmp_sync_reps table.
my $fig = FIG->new;

my $dbf = $fig->db_handle;

use representative_sequences;
use gjoseqlib;

# Command line: the FigFams data directory (which must contain a Partitions
# subdirectory), the output directory, and this worker's name.  The worker
# name is used as the output file name inside OutputDir and as the prefix of
# the progress messages written to STDERR.  Partition numbers come from STDIN.
my $usage = "usage: make_partition_reps_worker FigFamsData OutputDir Proc < partition_numbers";

# Consume the three positional arguments.  They are tested for being defined
# and non-empty rather than for truth, so that a worker named "0" is accepted.
my($ff_data,$outD,$proc) = splice(@ARGV, 0, 3);
foreach my $arg ($ff_data, $outD, $proc)
{
    (defined($arg) && length($arg)) || die $usage;
}

# Report a bad data directory explicitly instead of only printing the usage.
(-d "$ff_data/Partitions")
    || die "$ff_data/Partitions is not a directory\n$usage";

print STDERR "$proc starting\n";

# Resume support: every file already present in the output directory holds
# "peg<TAB>partition" lines.  Any partition mentioned in any of those files is
# remembered in $seen_partitions so that the main loop can skip it.
my $seen_partitions = {};
if (-d $outD)
{
    # List the directory directly (rather than glob on an interpolated path)
    # so that whitespace or wildcard characters in $outD cannot change which
    # files are read.  Dot-files are skipped, as "$outD/*" would skip them.
    opendir(my $dh, $outD) || die "could not read directory $outD: $!";
    my @all_procs = map { "$outD/$_" } grep { ! /^\./ } readdir($dh);
    closedir($dh);

    foreach my $file (@all_procs)
    {
        open(my $fh, '<', $file) || die "could not open $file: $!";
        while (my $line = <$fh>)
        {
            chomp $line;
            my (undef, $partition) = split(/\t/, $line);
            # Ignore lines that do not carry a partition column.
            next unless defined($partition);
            $seen_partitions->{$partition} = 1;
        }
        close($fh);
    }
}
else
{
    # The output directory does not exist yet, so create it.  Another worker
    # may create it at the same moment, hence the second -d test.
    mkdir($outD) || (-d $outD) || die "could not create $outD: $!";
}

# Append to this worker's own file.  For a file that does not exist yet this
# is the same as creating it.  OUT stays a bareword handle because the main
# loop below prints to it by that name.
open(OUT, '>>', "$outD/$proc") || die "could not open $outD/$proc: $!";

# Unbuffer OUT so that every completed line reaches the file immediately.
my $ofh = select OUT; $| = 1; select $ofh;

# Main loop.  Each line of STDIN names one partition.  For every partition that
# has not been recorded before:
#   1. read its sequences from Partitions/<partition % 1000>/<partition>/fasta,
#   2. look up which of those ids are already in the tmp_sync_reps table,
#   3. ask representative_sequences::rep_seq for representatives, passing the
#      already-known sequences first and the remaining ones second,
#   4. insert every representative that is not yet in tmp_sync_reps, and
#   5. write one "peg<TAB>partition" line to OUT per representative.
while (defined(my $line = <STDIN>))
{
    # The first run of digits on the line is the partition number; lines
    # without any digits are ignored.
    next unless ($line =~ /(\d+)/);
    my $partition = $1;
    next if ($seen_partitions->{$partition});
    print STDERR "$proc: $partition\n";

    my $subdir   = $partition % 1000;
    my $part_dir = "$ff_data/Partitions/$subdir/$partition";
    (-d $part_dir)
        || die "$proc: partition directory $part_dir does not exist";

    # Each entry of @seqs is an array reference whose element 0 is the id.
    my @seqs = &gjoseqlib::read_fasta("$part_dir/fasta");
    if (! @seqs)
    {
        # With no sequences the "where" clause below would be empty, which is
        # not valid SQL.  Nothing is written to OUT, so the partition will be
        # looked at again on the next run.
        print STDERR "$proc: $partition: no sequences in $part_dir/fasta, skipping\n";
        next;
    }

    # NOTE(review): ids are interpolated into the SQL text unquoted-checked;
    # this relies on FASTA ids never containing a single quote -- confirm.
    my $constraint = join(" OR ", map { "(fid = '$_->[0]')" } @seqs);

    # NOTE(review): partition 2505 is hard-coded to skip the lookup, so all of
    # its sequences are treated as new.  The reason is not recorded here.
    my $x = [];
    $x = $dbf->SQL("select fid from tmp_sync_reps where $constraint") if ($partition ne "2505");
    $x ||= [];

    # Ids of this partition that are already present in tmp_sync_reps.
    my %exists = map { $_->[0] => 1 } @$x;

    my @exist_already = grep {   $exists{$_->[0]} } @seqs;
    my @new           = grep { ! $exists{$_->[0]} } @seqs;
    my $reps = &representative_sequences::rep_seq(\@exist_already,\@new,{max_sim => 0.3});

    # Representatives that still have to be added to the table.
    my @new_reps = grep { ! $exists{$_->[0]} } @$reps;

    my $n1 = @seqs;
    my $n2 = @new_reps;
    my $n3 = @exist_already;
    print STDERR "$proc: counts seqs=$n1 new.reps=$n2 exists=$n3\n";

    foreach my $rep (@$reps)
    {
        my $peg = $rep->[0];
        if (! $exists{$peg})
        {
            $dbf->SQL("insert into tmp_sync_reps (fid) values ('$peg')");
        }

        # OUT is the record used to skip finished partitions on a later run,
        # so a failed write must not go unnoticed.
        (print OUT "$peg\t$partition\n")
            || die "could not write to $outD/$proc: $!";
    }
}

close(OUT) || die "could not close $outD/$proc: $!";

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3