[Bio] / FigKernelScripts / partition_prots.pl Repository:
ViewVC logotype

View of /FigKernelScripts/partition_prots.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Mon Mar 9 14:50:34 2009 UTC (11 years, 2 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2009_07_09, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011
Changes since 1.3: +17 -2 lines
check to make sure sims are working

use DB_File;
use strict;

use FIG;
my $fig = new FIG;

my $max_sz;
my $usage = "usage: partition_prots MaxPartSz";
(
 ($max_sz = shift @ARGV)
)
    || die $usage;


my %prot_hash;
my %seen_hash;
&build_hashes(\%prot_hash,\%seen_hash,$max_sz);

my @all = sort keys(%prot_hash);
my $n = @all;
print STDERR "$n proteins to process\n";

my $got_sims = 0;
my $no_sims = 0;
my $i;
for ($i=0; ($i < @all); $i++)
{
    if (! $seen_hash{$all[$i]})
    {
	print STDERR "processing $all[$i], $i of $#all\n";
	&process($fig,$all[$i],\%seen_hash,\%prot_hash,$max_sz,\$n,\$got_sims,\$no_sims);
    }
    else
    {
	print STDERR "seen $all[$i]\n";
    }
}

sub process {
    my($fig,$prot,$seen,$prot_hash,$max_sz,$nP,$gotP,$nosimsP) = @_;

    my %in;
    $in{$prot} = 1;
    my %closest;
    $closest{$prot} = 0;

    my $sz = 1;
    my $peg = $prot;
    while (($sz < $max_sz) && defined($peg))
    {
	$$nP--;
	delete $closest{$peg};
	my @sims = $fig->sims($peg,10000,1,'raw');
	if (@sims > 0)
	{
	    $$got_sims++;
	}
	else
	{
	    $$no_sims++;
	    if ($$no_sims > (10 * $$got_sims))
	    {
		die "sims do not seem to be working properly";
	    }
	}

	foreach my $sim (@sims)
	{
	    my $id2 = $sim->id2;
	    if ($prot_hash->{$id2})
	    {
		if (! $in{$id2})
		{
		    $in{$id2} = 1;
		    $sz++;
		    $closest{$id2} = $sim->bsc;
		}
		elsif (defined($closest{$id2}) && ($closest{$id2} < $sim->bsc))
		{
		    $closest{$id2} = $sim->bsc;
		}
	    }
	}
	$seen->{$peg} = 1;
	print "IN\t$prot\t$peg\n";
	$peg = &the_closest(\%closest,$seen);
    }
    print STDERR "$$nP left\n";
    foreach $_ (sort keys(%in))
    {
	print join("\t",('SET',$prot,$_)),"\n";
    }
}

sub the_closest {
    my($closest,$seen) = @_;

    my($peg,$x,$peg1,$x1);
    while (($peg1,$x1) = each %$closest)
    {
	if ((! $seen->{$peg1}) && ((! defined($peg1)) || ($x1 > $x)))
	{
	    $peg = $peg1;
	    $x = $x1;
	}
    }
    return $peg;
}

sub build_hashes {
    my($prot_hash,$seen_hash,$sz) = @_;

    my($prot_hash_tie,$seen_hash_tie);
#    $prot_hash_tie = tie %$prot_hash, 'DB_File',"partition_prot_hash_$sz.db",O_RDWR,0666,$DB_BTREE;
#    $seen_hash_tie = tie %$seen_hash, 'DB_File',"seen_hash_$sz.db",O_RDWR,0666,$DB_BTREE;
#    return;

    $prot_hash_tie = tie %$prot_hash, 'DB_File',"partition_prot_hash_$sz.db",O_CREAT,0666,$DB_BTREE;
    $prot_hash_tie || die "tie failed";

    $seen_hash_tie = tie %$seen_hash, 'DB_File',"seen_hash_$sz.db",O_CREAT,0666,$DB_BTREE;
    $seen_hash_tie || die "tie failed";

    open(SYMS,"<$FIG_Config::global/peg.synonyms")
	|| die "could not open peg.synonyms";
    while (defined($_ = <SYMS>))
    {
	chop;
	my($head,$rest) = split(/\t/,$_);
	if ($rest =~ /fig\|/)
	{
	    my @fig = map { ($_ =~ /(fig\|\d+\.\d+\.peg\.\d+)/) ? $1 : () } split(/;/,$rest);
	    @fig = grep { $fig->is_complete(&FIG::genome_of($_)) } @fig;
	    if (@fig > 0)
	    {
		$head =~ s/,\d+$//;
		$prot_hash->{$head} = 1;
		foreach my $peg (@fig)
		{
		    $seen_hash->{$peg} = 1;
		}
	    }
	}
    }
    foreach my $genome ($fig->genomes('complete'))
    {
	foreach my $peg ($fig->all_features($genome,'peg'))
	{
	    if (! $seen_hash->{$peg})
	    {
		$prot_hash->{$peg} = 1;
	    }
	}
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3