[Bio] / FigKernelPackages / Assignments.pm Repository:
ViewVC logotype

View of /FigKernelPackages/Assignments.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (download) (as text) (annotate)
Wed May 3 13:18:22 2006 UTC (13 years, 9 months ago) by overbeek
Branch: MAIN
Changes since 1.10: +65 -23 lines
 update genome parameters in default assignments

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

package Assignments;

use Carp;
use Data::Dumper;
use FIG;
use SameFunc;

sub default_parms {

    my $x = <<END
genome	198214.1	5	Shigella flexneri 2a str. 301
genome	198215.1	5	Shigella flexneri 2a str. 2457T
genome	216598.1	5	Shigella dysenteriae M131649
genome	216599.1	5	Shigella sonnei 53G
genome	630.2	5	Yersinia enterocolitica 8081
genome	633.2	5	Yersinia pseudotuberculosis (Livermore)
genome	187410.1	5	Yersinia pestis KIM
genome	214092.1	5	Yersinia pestis CO92
genome	229193.1	5	Yersinia pestis biovar Medievalis str. 91001
genome	273123.1	5	Yersinia pseudotuberculosis IP 32953
genome	594.1	5	Salmonella enterica subsp. enterica serovar Gallinarum
genome	99287.1	5	Salmonella typhimurium LT2
genome	119912.1	5	Salmonella enterica serovar Choleraesuis SC-B67
genome	209261.1	5	Salmonella enterica subsp. enterica serovar Typhi Ty2
genome	220341.1	5	Salmonella enterica subsp. enterica serovar Typhi str. CT18
genome	83333.1	5	Escherichia coli K12
genome	83334.1	5	Escherichia coli O157:H7
genome	155864.1	5	Escherichia coli O157:H7 EDL933
genome	199310.1	5	Escherichia coli CFT073
genome	216592.1	5	Escherichia coli 042
genome	216593.1	5	Escherichia coli E2348/69
genome	192222.1	5	Campylobacter jejuni subsp. jejuni NCTC 11168
genome	224308.1	5	Bacillus subtilis subsp. subtilis str. 168
genome	196600.1       	5	Vibrio vulnificus YJ016
genome	216895.1       	5	Vibrio vulnificus CMCP6
genome	223926.1       	5	Vibrio parahaemolyticus RIMD 2210633
genome	243277.1       	5	Vibrio cholerae O1 biovar eltor str. N16961
genome	312309.3       	5	Vibrio fischeri ES114
genome	314290.3       	5	Vibrio sp. MED222
genome	314291.3       	5	Vibrio splendidus 12B01
genome	192222.1       	5	Campylobacter jejuni subsp. jejuni NCTC 11168
genome	195099.3       	5	Campylobacter jejuni RM1221
genome	306254.1       	5	Campylobacter coli RM2228
genome	306263.1       	5	Campylobacter lari RM2100
genome	306264.1       	5	Campylobacter upsaliensis RM3195
genome	169963.1       	5	Listeria monocytogenes EGD-e
genome	265669.1       	5	Listeria monocytogenes str. 4b F2365
genome	267409.1       	5	Listeria monocytogenes str. 1/2a F6854
genome	267410.1       	5	Listeria monocytogenes str. 4b H7858
genome	272626.1       	5	Listeria innocua Clip11262
genome	1314.1         	5	Streptococcus pyogenes M5
genome	160490.1       	5	Streptococcus pyogenes M1 GAS
genome	170187.1       	5	Streptococcus pneumoniae TIGR4
genome	171101.1       	5	Streptococcus pneumoniae R6
genome	186103.1       	5	Streptococcus pyogenes MGAS8232
genome	193567.1       	5	Streptococcus pyogenes SSI-1
genome	198466.1       	5	Streptococcus pyogenes MGAS315
genome	205921.3       	5	Streptococcus agalactiae A909
genome	208435.1       	5	Streptococcus agalactiae 2603V/R
genome	210007.1       	5	Streptococcus mutans UA159
genome	211110.1       	5	Streptococcus agalactiae NEM316
genome	246201.1       	5	Streptococcus mitis NCTC 12261
genome	264199.3       	5	Streptococcus thermophilus LMG 18311
genome	286636.1       	5	Streptococcus pyogenes MGAS10394
genome	293653.3       	5	Streptococcus pyogenes MGAS5005
genome	299768.3       	5	Streptococcus thermophilus CNRZ1066
genome	319701.3       	5	Streptococcus pyogenes MGAS6180
genome	93062.4        	5	Staphylococcus aureus subsp. aureus COL
genome	158878.1       	5	Staphylococcus aureus subsp. aureus Mu50
genome	158879.1       	5	Staphylococcus aureus subsp. aureus N315
genome	176279.3       	5	Staphylococcus epidermidis RP62A
genome	176280.1       	5	Staphylococcus epidermidis ATCC 12228
genome	196620.1       	5	Staphylococcus aureus subsp. aureus MW2
genome	282458.1       	5	Staphylococcus aureus subsp. aureus MRSA252
genome	282459.1       	5	Staphylococcus aureus subsp. aureus MSSA476
external	sp	4
external	uni	1.3
external	kegg	1
subsystems	trusted	20
END
;
    my @parms = split(/\n/,$x);
    my $fig = new FIG;
    my @trusted_subsystems = map { my $sub = $_; my $curr = $fig->subsystem_curator($sub); 
				   "$sub\t$curr\n" 
				 } 
                             grep { $fig->usable_subsystem($_) } 
                             $fig->all_subsystems;
    push(@parms,@trusted_subsystems,"//\n");
    return @parms;
}


sub choose_best_assignment {
    my($fig,$parms,$pegs,$external_ids,$ignore) = @_;
    my($peg,$id);

    my $functions = {};
    foreach $peg (@$pegs)
    {
	&load_peg_function($fig,$parms,$peg,$functions);
    }
    my @tmp = keys(%$functions);
#   print &Dumper(['peg check',\@tmp,$functions]);

    if ((@tmp == 1) && (@$pegs >= 5)) { return $tmp[0] }

    foreach $id (@$external_ids)
    {
	&load_ext_function($fig,$parms,$id,$functions);
    }

    return &cleanup(&pick_function($fig,$parms,$functions));
}


sub cleanup {
    my($func) = @_;

    if (! $func)                                           { return "hypothetical protein" }
    if ($func =~ /^hypothetical (\S+ )?protein .*$/i)      { return "hypothetical protein" }
    if ($func =~ /^[a-zA-Z]{1,2}\d{2,5}( protein)?$/i)     { return "hypothetical protein" }
    if ($func =~ /^similar to ORF\d+$/)                    { return "hypothetical protein" }
    if ($func =~ /^(Alr|As|All|Tlr|Tll|Glr|Blr|Slr|SEW|pANL)\d+( protein)?$/i) { return "hypothetical protein" }
    if ($func =~ /^\d{5}/)                                 { return "hypothetical protein" }
    if ($func =~ /unknown protein/)                        { return "hypothetical protein" }
    
    return $func;
}

sub pick_function {
    my($fig,$parms,$functions) = @_;
    my($set,$score,$best_source,$poss_function);
    my(@scored);

    my @partitions = &SameFunc::group_funcs(keys(%$functions));
    if ($ENV{'VERBOSE'}) {  print STDERR "partition: ",&Dumper(\@partitions,$functions); }

    foreach $set (@partitions)
    {
	$score = &score_set($set,$functions);
#	print STDERR &Dumper([$score,$set]);

#       print STDERR "picking from set ",&Dumper($set);
	($poss_function,$best_source) = &pick_specific($fig,$parms,$set,$functions);
#	print STDERR "picked $poss_function from $best_source\n";
	push(@scored,[$score,$poss_function,$best_source]);
    }
    @scored = sort { $b->[0] <=> $a->[0] } @scored;

    if ((@scored > 1) && $ENV{'VERBOSE'})
    {
	foreach $_ (@scored)
	{
	    print STDERR join("\t",@$_),"\n";
	}
	print STDERR "//\n";
    }
    return (@scored > 0) ? $scored[0]->[1] : "";
}

sub score_set {
    my($set,$functions) = @_;
    my($func,$x);

    my $score = 0;
    foreach $func (@$set)
    {
	if ($x = $functions->{$func})
	{
	    foreach $_ (@$x)
	    {
		$score += $_->[0];
	    }
	}
    }
    return $score;
}

sub pick_specific {
    my($fig,$parms,$set,$functions) = @_;
    my($best_func,$best_score,$func,$x,$best_source);

    $best_func  = "";
    $best_score = 0;
    $best_source = "";

    foreach $func (@$set)
    {
	if ($x = $functions->{$func})
	{
	    my $incr = @$x;
	    foreach $_ (@$x)
	    {
		my($sc,$peg,$in_sub) = @$_;
		$sc += $in_sub ? 10000 : 0;

		if (((100 * $sc) + $incr) > $best_score)
		{
		    $best_score = (100 * $sc) + $incr;
		    $best_func  = $func;
		    $best_source = $peg;
		}
	    }
	}
    }
    if ($ENV{'VERBOSE'}) { print STDERR &Dumper(["picked best source",$set,$functions,$best_func,$best_source]) }
    return ($best_func,$best_source);
}

sub load_ext_function {
    my($fig,$parms,$id,$functions) = @_;

    my $func = $fig->function_of($id);
    if ($func && # (! &FIG::hypo($func)) && 
	($id =~ /^([A-Za-z]{2,4})\|/) && ($_ = $parms->{'external'}->{$1}))
    {
	push(@{$functions->{$func}},[$_,$id]);
    }
}

sub load_peg_function {
    my($fig,$parms,$peg,$functions) = @_;

    my $func = $fig->function_of($peg);
    if ($func) # (! &FIG::hypo($func))
    {
	my $value = 1;

	my $genome = &FIG::genome_of($peg);
	if ($_ = $parms->{'genome'}->{$genome})
	{
	    $value += $_;
	}

	my $subv = 0;
	my @subs = $fig->peg_to_subsystems($peg);
	my $sub;
	my $in_sub = 0;
	foreach $sub (@subs)
	{
	    if ($_ = $parms->{'subsystems'}->{$sub})
	    {
		if ($_ > $subv)
		{
		    $subv = $_;
		}
		$in_sub = 1;
	    }
	}
	$value += $subv;
	push(@{$functions->{$func}},[$value,$peg,$in_sub]);
    }
}

sub equivalent_ids {
    my($fig,$parms,$pegs) = @_;
    my($peg,@aliases,$alias,%external_ids,%pegs,$tuple);

    foreach $peg (@$pegs)
    {
	$pegs{$peg} = 1;
	@aliases = $fig->feature_aliases($peg);
	foreach $alias (@aliases)
	{
	    if (($alias =~ /^([A-Za-z]{2,4})\|\S+$/) && $parms->{"external"}->{$1})
	    {
		$external_ids{$alias} = 1;
	    }
	}
	foreach $tuple ($fig->mapped_prot_ids($peg))
	{
	    if (($tuple->[0] =~ /^fig\|/) && $fig->is_real_feature($tuple->[0]))
	    {
		$pegs{$tuple->[0]} = 1;
	    }
	    elsif (($tuple->[0] =~ /^([A-Za-z]{2,4})\|\S+$/) && $parms->{"external"}->{$1})
	    {
		$external_ids{$tuple->[0]} = 1;
	    }
	}
    }
    return ([sort { &FIG::by_fig_id($a,$b) }  keys(%pegs)],[sort keys(%external_ids)]);
}

sub load_parms {
    my($parmsF) = @_;
    my @parmsS;

    my $wts = {};

    if ($parmsF)
    {
	@parmsS = `cat $parmsF`;
    }
    else
    {
	@parmsS = &default_parms;
    }
    while ($_ = shift @parmsS)
    {
	chomp;
	my($type,$data,$val) = split(/\t/,$_);
	if ($type eq 'subsystems')
	{
	    my $x;
	    while (($x = shift @parmsS) && ($x !~ /^\/\//))
	    {
		if ($x =~ /^(\S[^\t]+\S)/)
		{
		    $wts->{$type}->{$1} = $val;
		}
	    }
	}
	else
	{
	    $wts->{$type}->{$data} = $val;
	}
    }
    return $wts;
}

sub print_parms {
    my($parms) = @_;
    my($type,$data,$val,$wt_by_type);

    print STDERR "Parameters:\n";
    foreach $type (sort keys(%$parms))
    {
	print STDERR "\n\t$type\n";
	$wt_by_type = $parms->{$type};
	foreach $data (sort keys(%$wt_by_type))
	{
	    $val = $wt_by_type->{$data};
	    print STDERR "\t\t$data\t$val\n";
	}
    }
    print STDERR "\n";
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3