[Bio] / FigKernelScripts / set_figfam_functions.pl Repository:
ViewVC logotype

View of /FigKernelScripts/set_figfam_functions.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Wed Jan 25 19:44:06 2012 UTC (7 years, 9 months ago) by overbeek
Branch: MAIN
CVS Tags: rast_rel_2014_0729, mgrast_version_3_2, rast_rel_2014_0912, HEAD
to support assigning FigFam functions

use strict;
use Data::Dumper;
use SeedEnv;

my $usage = "usage: set_figfam_functions PegMd5FuncSubsys RecentAnnotations";
my($tab_pmfs,$recent_ann);
(
 ($tab_pmfs    = shift @ARGV) && (-s $tab_pmfs) &&
 ($recent_ann  = shift @ARGV) && (-s $recent_ann)
)
    || die $usage;

my $three_months = (60*60*24*90);
my $now = time;

# built using get_peg_md5_func_subsystem > $tab_pfms
open(PMFS,"<$tab_pmfs") || die "could not open $tab_pmfs";
my %in_sub   = map { (($_ =~ /^(\S+)\t.*(\d+)$/) && $2) ? ($1 => $2) : () } <PMFS>;
close(PMFS);

my %manual;
# built using get_current_annotations get_recent_annotations > $recent_ann
open(ANN,"<$recent_ann") || die "bad";
while (defined($_ = <ANN>))
{
    if (($_ =~ /^(\S+)\t([^\t]*)\t(\S+)\t(\d+)/) && (($now - $4) < $three_months))
    {
	my $ann = $3;
	if ($ann !~ /(rapid|auto|repair|but not)/i)
	{
	    if ((! $manual{$1}) || ($manual{$1} > $4))
	    {
		$manual{$1} = [$3,$4];
	    }
	}
    }
}
close(ANN);

my $x = <STDIN>;
while ($x && ($x =~ /^(\S+)/))
{
    my $fam = $1;
    my @set;
    while ($x && ($x =~ /^(\S+)\t(\S+)\t(\S.*\S)?/) && ($1 eq $fam))
    {
	push(@set,[$2,$3 ? $3 : '']);
	$x = <STDIN>;
    }
    &process(\@set,$fam,\%in_sub,\%manual);
}


# $set points to a list of [peg,function] tuples
sub process {
    my($set,$fam,$in_sub,$manual) = @_;
    
    my %occ;
    foreach $_ (@$set)
    {
	$occ{$_->[1]}++;
    }
    $set = [map { [$_->[0],$_->[1],$occ{$_->[1]}] } @$set];
#   $set now points to a list of [peg,function,#occ-of-function] tuples

    my $func = &pick_best($set,$in_sub,$manual);
    $func = $func ? $func : '';
    $func =~ s/\s+\#.*$//;
    print join("\t",($fam,$func)),"\n";
}

sub pick_best {
    my($set,$in_sub,$manual) = @_;

    my $sofar = 0;
    for (my $i = 1; ($i < @$set); $i++)
    {
	my $rc;
	if (($set->[$i]->[1] ne $set->[$sofar]->[1]) &&  # if (! same function as best-so-far
	    ($rc = &better($set->[$i],$set->[$sofar],$in_sub,$manual)) && $rc->[0])
	{
#	    print STDERR join("\t",("why-better=$rc->[1]","peg1=$set->[$i]->[0]","func1=$set->[$i]->[1]","func2=$set->[$sofar]->[1]")),"\n";
	    $sofar = $i;
	}
	else
	{
#	    print STDERR &Dumper($i,$sofar,$rc,$set->[$i],$set->[$sofar]);
	}
    }
    return $set->[$sofar]->[1];
}

# better returns a 2-tuple: [first-is-better,why]
sub better {
    my($x,$y,$in_sub,$manual) = @_;

    my $insubX = $in_sub->{$x->[0]};     # insubX is the number of subsystems x is in
    my $insubY = $in_sub->{$y->[0]};     # insubY is the number of subsystems y is in
    my $manualX = $manual->{$x->[0]};    # manualX is a 2-tuple: [who,timestamp]
    my $manualY = $manual->{$y->[0]};    # manualY is a 2-tuple: [who,timestamp]
    
    my $rc;
    if ($rc = &criterion_1($x,$y,$insubX,$insubY,$manualX,$manualY)) { return [($rc == -1),1] }
    if ($rc = &criterion_2($x,$y,$insubX,$insubY,$manualX,$manualY)) { return [($rc == -1),2] }
    if ($rc = &criterion_3($x,$y,$insubX,$insubY,$manualX,$manualY)) { return [($rc == -1),3] }
    if ($rc = &criterion_4($x,$y,$insubX,$insubY,$manualX,$manualY)) { return [($rc == -1),4] }
    if ($rc = &criterion_5($x,$y,$insubX,$insubY,$manualX,$manualY)) { return [($rc == -1),5] }
    if ($rc = &criterion_6($x,$y,$insubX,$insubY,$manualX,$manualY)) { return [($rc == -1),6] }
    if ($rc = &criterion_7($x,$y,$insubX,$insubY,$manualX,$manualY)) { return [($rc == -1),7] }
    if ($rc = &criterion_8($x,$y,$insubX,$insubY,$manualX,$manualY)) { return [($rc == -1),8] }
    if ($rc = &criterion_9($x,$y,$insubX,$insubY,$manualX,$manualY)) { return [($rc == -1),9] }
    return [(($y->[0] cmp $x->[0]) == -1),10];
}

sub criterion_1 {
    my($x,$y,$insubX,$insubY,$manualX,$manualY) = @_;

    my($xN,$yN) = ($x->[2],$y->[2]); 
    my($xOK,$yOK) = (($xN > (0.2 * $yN)),($yN > (0.2 * $xN)));
    my %preferred = ('gjo' => 1, 'VeronikaV' => 1, 'SvetaG' => 1);
    if ($xOK && &preferred($manualX,\%preferred) && (! &preferred($manualY,\%preferred))) { return -1 }
    if ($yOK && &preferred($manualY,\%preferred) && (! &preferred($manualX,\%preferred))) { return  1 }
    if (&preferred($manualX,\%preferred) && &preferred($manualY,\%preferred))     
    { 
	if ($yOK && ($manualY->[1] < $manualX->[1])) { return -1 }
	if ($xOK && ($manualX->[1] < $manualY->[1])) { return  1 }
    }
    return 0;
}

sub preferred {
    my($tuple,$preferred) = @_;
    return ($tuple && $preferred->{$tuple->[0]});
}

sub criterion_2 {
    my($x,$y,$insubX,$insubY,$manualX,$manualY) = @_;

    my($xN,$yN) = ($x->[2],$y->[2]); 
    my($xOK,$yOK) = (($xN > (0.2 * $yN)),($yN > (0.2 * $xN)));

    if ($xOK && $manualX && (! $manualY))           { return -1 }
    if ($yOK && $manualY && (! $manualX))           { return  1 }
    if ($manualX && $manualY) 
    {
	if ($yOK && ($manualY->[1] < $manualX->[1])) { return -1 }
	if ($xOK && ($manualX->[1] < $manualY->[1])) { return  1 }
    }
    return 0;
}

sub criterion_3 {
    my($x,$y,$insubX,$insubY,$manualX,$manualY) = @_;

    if (! $insubX) { $insubX = 0 }
    if (! $insubY) { $insubY = 0 }
    return ($insubY <=> $insubX);
}

sub criterion_4 {
    my($x,$y,$insubX,$insubY,$manualX,$manualY) = @_;

    if ((index($x->[1],'#') > 0) && (index($y->[1],'#') < 0)) { return -1 }
    if ((index($y->[1],'#') > 0) && (index($x->[1],'#') < 0)) { return  1 }
    return 0;
}

sub criterion_5 {
    my($x,$y,$insubX,$insubY,$manualX,$manualY) = @_;

    if (&SeedUtils::hypo($x->[1]) && ($y->[1] =~ /^FIG\d+:/))  { return  1 }
    if (&SeedUtils::hypo($y->[1]) && ($x->[1] =~ /^FIG\d+:/))  { return -1 }
    return 0;
}


sub criterion_6 {
    my($x,$y,$insubX,$insubY,$manualX,$manualY) = @_;

    if (lc $x->[1] eq lc $y->[1])
    {
	my %words1 = map { $_ => 1 } grep { length($_) == 4 } split(/\s+/,$x->[1]);
	my %words2 = map { $_ => 1 } grep { length($_) == 4 } split(/\s+/,$y->[1]);
	my @inX    = grep { ! $words2{$_} } grep { $_ =~ /^[A-Z][a-z][a-z][A-Z]$/ } keys(%words1);
	if (@inX > 0) { return -1 }
	my @inY    = grep { ! $words1{$_} } grep { $_ =~ /^[A-Z][a-z][a-z][A-Z]$/ } keys(%words2);
	if (@inY > 0) { return 1 }
    }
    return 0;
}

sub criterion_7 {
    my($x,$y,$insubX,$insubY,$manualX,$manualY) = @_;

    if (($x->[1] =~ /\(EC \d+\.\d+\.\d+\.\d+\)/) && ($y->[1] !~ /\(EC \d+\.\d+\.\d+\.\d+\)/)) { return -1 }
    if (($y->[1] =~ /\(EC \d+\.\d+\.\d+\.\d+\)/) && ($x->[1] !~ /\(EC \d+\.\d+\.\d+\.\d+\)/)) { return  1 }
    return 0;
}

sub criterion_8 {
    my($x,$y,$insubX,$insubY,$manualX,$manualY) = @_;

    return ($y->[2] <=> $x->[2]);
}

sub criterion_9 {
    my($x,$y,$insubX,$insubY,$manualX,$manualY) = @_;

    return abs(30 - length($x->[1])) <=> abs(30 - length($y->[1]));
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3