[Bio] / FigKernelScripts / make_sim_part_data.pl Repository:
ViewVC logotype

View of /FigKernelScripts/make_sim_part_data.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Sun Jan 11 14:49:16 2009 UTC (11 years, 2 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2009_07_09, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, HEAD
add command to build similarity partitions

########################################################################

use DB_File;

use FIG;
my $fig = new FIG;

use strict;
my $usage = "usage: make_sim_part_data Dir < stdout.from.partition_prots";

my $dir;
($dir = shift @ARGV)
    || die $usage;

if (-d $dir) { die "$dir already exists" }

mkdir($dir,0777) || die "could not make $dir";

my $partD = "$dir/Partitions";
my $simF  = "$dir/SimilarityPartitions";
my $relF  = "$dir/ToSimilarityPartition";

my %prot_hash;
&build_expand_hash($fig,\%prot_hash);

open(SIMPART,">$simF")
    || die "could not open $simF";
open(REL,">$relF")
    || die "could not open $relF";

mkdir("$partD",0777) || die "could not make $partD";

my $n = 1;

my $x = <STDIN>;
while (defined($x) && ($x =~ /^IN\t(\S+)\t(\S+)/))
{
    my $currset = $1;
    my @setI = ();
    while (defined($x) && ($x =~ /^IN\t(\S+)\t(\S+)/) && ($1 eq $currset))
    {
	push(@setI,$2);
	$x = <STDIN>;
    }

    my $bug;
    my @setS = ();
    while (defined($x) && ($x =~ /^SET\t(\S+)\t(\S+)/) && ($1 eq $currset))
    {
#	if ($1 eq 'fig|314291.3.peg.2784') { $bug = 1 }
	push(@setS,$2);
	$x = <STDIN>;
    }
    &process_set($fig,\*SIMPART,\*REL,$partD,\@setI,\@setS,\$n,\%prot_hash,$bug);
}
close(SIMPART);
close(REL);

sub process_set {
    my($fig,$entity_fh,$rel_fh,$partD,$setI,$setS,$nP,$prot_hash,$bug) = @_;
#    if ($bug) { print STDERR &Dumper($setI,$setS) }

    my @reduced = &reduce($setS,$prot_hash);
#    if ($bug) { print STDERR &Dumper('reduced1',\@reduced); }
    if (@reduced > 1)
    {
	my $md5;
	foreach $md5 (@reduced)
	{
	    print $entity_fh join("\t",($$nP,$md5)),"\n";
	}
	my @reduced_in = &reduce($setI,$prot_hash);
#	if ($bug) { print STDERR &Dumper('reduced_in',\@reduced_in); }
	foreach $md5 (@reduced_in)
	{
	    print $rel_fh join("\t",($$nP,$md5)),"\n";
	}
	my $dir1 = $$nP % 1000;
	&FIG::verify_dir("$partD/$dir1");
	open(FASTA,">$partD/$dir1/$$nP") || die "could not make $partD/$dir1/$$nP";
	foreach $md5 (@reduced)
	{
	    my @pegs = $fig->pegs_with_md5($md5);
#	    if ($bug) { print STDERR &Dumper([$md5,\@pegs]); }
	    if (@pegs == 0)
	    {
		print STDERR "could not handle hash=$md5\n";
	    }
	    else
	    {
		my($i,$x);
		for ($i=0; ($i < @pegs) && (! ($x = $fig->get_translation($pegs[$i]))); $i++) {}
		if ($i < @pegs)
		{
		    print FASTA ">$md5\n$x\n";
		}
	    }
	}
	close(FASTA);

	$$nP++;
    }
#   if ($bug) { die "aborted" }
}

sub reduce {
    my($set,$prot_hash) = @_;

    my @setE = ();
    my $x;
    foreach $x (@$set)
    {
	my $toL;
	if ($toL = $prot_hash->{$x})
	{
	    push(@setE,split(/,/,$toL));
	}
	elsif ($x =~ /^fig\|/)
	{
	    push(@setE,$x);
	}
    }
#    print STDERR &Dumper(['expanded',\@setE]);
    my %reduced = map { $_ => 1 } @setE;
    my @reduced_set = sort { &FIG::by_fig_id($a,$b) } keys(%reduced);

    my %md5s;
    foreach my $peg (@reduced_set)
    {
	my $md5 = $fig->md5_of_peg($peg);
	if ($md5)
	{
	    push(@{$md5s{$md5}},$peg);
	}
    }
    my @md5_set = sort keys(%md5s);
    return @md5_set;
}

sub build_expand_hash {
    my($fig,$prot_hash) = @_;

#   my $prot_hash_tie = tie %$prot_hash, 'DB_File','make_sim_part_prot_hash.db',O_RDWR,0666,$DB_BTREE; return;
    my $prot_hash_tie = tie %$prot_hash, 'DB_File','make_sim_part_prot_hash.db',O_CREAT,0666,$DB_BTREE;
    $prot_hash_tie || die "tie failed";

    open(SYMS,"<$FIG_Config::global/peg.synonyms")
	|| die "could not open peg.synonyms";
    while (defined($_ = <SYMS>))
    {
	chop;
	my($head,$rest) = split(/\t/,$_);
	if ($rest =~ /fig\|/)
	{
	    my @fig = map { ($_ =~ /(fig\|\d+\.\d+\.peg\.\d+)/) ? $1 : () } split(/;/,$rest);
	    @fig = grep { $fig->is_complete(&FIG::genome_of($_)) } @fig;
	    if (@fig > 0)
	    {
		$head =~ s/,\d+$//;
		if ($head =~ /^fig\|/) { push(@fig,$head) }
		$prot_hash->{$head} = join(",",@fig);
	    }
	}
    }
    close(SYMS);
#    print STDERR "prot_hash has been constructed\n";
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3