[Bio] / FigKernelPackages / Assignments.pm Repository:
ViewVC logotype

View of /FigKernelPackages/Assignments.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Fri Jun 3 14:32:27 2005 UTC (14 years, 11 months ago) by overbeek
Branch: MAIN
CVS Tags: merge-trunktag-bobdev_news-2, Root-bobdev_news, merge-bobdev_news-1, merge-trunktag-bobdev_news-1, merge-bodev_news-3, merge-bobdev_news-2, merge-trunktag-bodev_news-3
Branch point for: Branch-bobdev_news
clean up of auto assignments routines

package Assignments;

use Carp;
use Data::Dumper;
use FIG;
use SameFunc;

# Return the built-in parameter table as a list of lines (newlines removed).
#
# The table has the same layout that load_parms reads from a parameter file:
#
#   genome<TAB>GENOME_ID<TAB>WEIGHT<TAB>NAME
#   external<TAB>ID_PREFIX<TAB>WEIGHT
#   subsystems<TAB>trusted<TAB>WEIGHT
#       followed by one "SUBSYSTEM_NAME<TAB>..." line per subsystem
#       (only the first column is read by load_parms) and closed by "//"
#
# The heredoc is single-quoted so nothing in the table is ever interpolated.
# The variable that holds the table is the same one that is split and
# returned; the earlier code stored the text in one variable and split a
# different, empty one, so the defaults were never returned.
sub default_parms {

    my $default_parms = <<'END';
genome	198214.1	4	Shigella flexneri 2a str. 301
genome	198215.1	4	Shigella flexneri 2a str. 2457T
genome	216598.1	4	Shigella dysenteriae M131649
genome	216599.1	4	Shigella sonnei 53G
genome	630.2	4	Yersinia enterocolitica 8081
genome	633.2	4	Yersinia pseudotuberculosis (Livermore)
genome	187410.1	4	Yersinia pestis KIM
genome	214092.1	4	Yersinia pestis CO92
genome	229193.1	4	Yersinia pestis biovar Medievalis str. 91001
genome	273123.1	4	Yersinia pseudotuberculosis IP 32953
genome	594.1	4	Salmonella enterica subsp. enterica serovar Gallinarum
genome	99287.1	4	Salmonella typhimurium LT2
genome	119912.1	4	Salmonella enterica serovar Choleraesuis SC-B67
genome	209261.1	4	Salmonella enterica subsp. enterica serovar Typhi Ty2
genome	220341.1	4	Salmonella enterica subsp. enterica serovar Typhi str. CT18
genome	83333.1	4	Escherichia coli K12
genome	83334.1	4	Escherichia coli O157:H7
genome	155864.1	4	Escherichia coli O157:H7 EDL933
genome	199310.1	4	Escherichia coli CFT073
genome	216592.1	4	Escherichia coli 042
genome	216593.1	4	Escherichia coli E2348/69
genome	192222.1	4	Campylobacter jejuni subsp. jejuni NCTC 11168
genome	224308.1	2	Bacillus subtilis subsp. subtilis str. 168
external	sp	4
external	uni	2
external	kegg	1
subsystems	trusted	8
ABC_transporter_L-proline_glycine_betaine_(TC_3.A.1.12.1)	MattC
ABC_transporter_alkylphosphonate_(TC_3.A.1.9.1)	MattC
ABC_transporter_arabinose_(TC_3.A.1.2.2)	MattC
ABC_transporter_branched-chain_amino_acid_(TC_3.A.1.4.1)	MattC
ABC_transporter_dipeptide_(TC_3.A.1.5.2)	MattC
ABC_transporter_ferric_enterobactin_(TC_3.A.1.14.2)	MattC
ABC_transporter_ferrichrome_(TC_3.A.1.14.3)	MattC
ABC_transporter_galactose_(TC_3.A.1.2.3)	MattC
ABC_transporter_glutamate_aspartate_(TC_3.A.1.3.4)	MattC
ABC_transporter_glutamine_(TC_3.A.1.3.2)	MattC
ABC_transporter_glycerol_(TC_3.A.1.1.3)	MattC
ABC_transporter_heme_(TC3.A.1.107.1)	MattC
ABC_transporter_histidine_lysine_arginine_ornithine_(TC_3.A.1.3.1)	MattC
ABC_transporter_iron(III)_dicitrate_(TC_3.A.1.14.1)	MattC
ABC_transporter_macrolide	MattC
ABC_transporter_maltose	MattC
ABC_transporter_molybdenum_(TC_3.A.1.8.1)	MattC
ABC_transporter_nickel_(TC_3.A.1.5.3)	MattC
ABC_transporter_oligopeptide_(TC_3.A.1.5.1)	MattC
ABC_transporter_peptide_(TC_3.A.1.5.5)	MattC
ABC_transporter_phosphate_(TC_3.A.1.7.1)	MattC
ABC_transporter_polyamine_putrescine_spermidine_(TC_3.A.1.11.1)	MattC
ABC_transporter_putrescine_(TC_3.A.1.11.2)	MattC
ABC_transporter_ribose_(TC_3.A.1.2.1)	MattC
Adhesion_to_eukaryotic_cell	MikeK
Alanine_Biosynthesis	Straw
Allantoin_degradation	MattC
Ammonia_assimilation	EdF
Anaerobic_respiratory_reductases	OlgaV
Arginine_Biosynthesis	RickS
Arginine_Putrescine_and_4-aminobutyrate_degradation	MattC
Asp-Glu-tRNA(Asn-Gln)_transamidation	gjo
Bacterial_Cell_Division	RickS
Betaine_biosynthesis	MattC
Bilin_Biosynthesis	OlgaZ
Biotin_biosynthesis	rodionov
Branched-Chain_Amino_Acid_Biosynthesis	RossO
CMP-N-acetylneuraminate_Biosynthesis	OlgaZ
Calvin-Benson_cycle	SvetaG
Carotenoids	OlgaV
Chlorophyll_Biosynthesis	VeronikaV
Chorismate_Synthesis	VeronikaV
Coenzyme_A_Biosynthesis	AndreiO
Cyanobacterial_CO2_uptake	OlgaV
Cyanobacterial_Circadian_Clock	OlgaZ
Cyanophycin_metabolism	MikeR
Cytochrome_B6-F_complex	SvetaG
Cytolethal_distending_toxin_of_Campylobacter_jejuni	OlgaZ
D-arabinose_degradation	MattC
D-galactarate_degradation	MattC
D-galacturonate_degradation	MattC
DNA-replication	RickS
DNA_Repair_Base_Excision	MikeK
De_Novo_Purine_Biosynthesis	RossO
De_Novo_Pyrimidine_Synthesis	RossO
Denitrification	rodionov
Embden-Meyerhof_and_Gluconeogenesis	SvetaG
F0F1-type_ATP_synthase	RickS
FMN_and_FAD_biosynthesis	AndreiO
Fatty_Acid_Biosynthesis_FASII	AndreiO
Fe-S_cluster_assembly	rodionov
Flagellum	RickS
Folate_Biosynthesis	vcrecy
Fucose_and_rhamnose_degradation	MattC
Galactitol_degradation	MattC
Galactose_degradation	MattC
General_secretory_pathway_(Sec-SRP)_complex_(TC_3.A.5.1.1)	MattC
Glutamate,_aspartate_and_asparagine_biosynthisis	MattC
Glutamate_biosynthesis	MattC
Glutathione_Redox_Metabolism	Neema_UCSD
Glycerol_Metabolism	MattC
Glycerolipid_and_glycerphospholipid_metabolism	VasiliyP
Glycine_synthesis	MikeK
Glyoxylate_Synthesis	RickS
GroEL_GroES	MikeK
HMG_CoA_Synthesis	Veronika
Hexitol_degradation	MattC
Histidine_Biosynthesis	RossO
Histidine_Degradation	RossO
Inorganic_Sulfur_Assimilation	ChristianR
Inositol_catabolism	VeronikaV
Isoprenoid_Biosynthesis	OlgaZ
Ketogluconate_metabolism	MattC
L-ascorbate_degradation	MattC
Lactose_degradation	MattC
Leucine_Degradation_and_HMG-CoA_Metabolism	VeronikaV
Lysine_Biosynthesis_DAP_Pathway	AndreiO
Mannose-sensitive_hemagglutinin_type_4_pilus	RobE
Mannose_and_fructose_metabolism	HanYuC_UCSD
Menaquinone_and_Phylloquinone_Biosynthesis	OlgaZ
Methanogenesis	gjo
Methionine_Biosynthesis	rodionov
Methylcitrate_cycle	MattC
N-Acetyl-D-Glucosamine_Utilization	OlgaZ
N-linked_Glycosylation_in_Bacteria	OlgaZ
NAD_and_NADP_cofactor_biosynthesis_global	AndreiO
Na(+)-translocating_NADH-quinone_oxidoreductase_and_rnf-like_group_of_electron_transport_complexes	OlgaV
Nitrate_and_nitrite_ammonification	rodionov
Nitrosative_stress	rodionov
P-type_ATPase_transporter_potassium_(TC_3.A.3.7.1)	MattC
Pentose_phosphate_pathway	SvetaG
Peptidoglycan_Biosynthesis	RickS
Phenylalanine_synthesis	MikeK
Photosystem_I	SvetaG
Photosystem_II	SvetaG
Phycobilisome	OlgaZ
Plastoquinone_Biosynthesis	OlgaZ
Polyamine_Metabolism	InesT_UCSD
Porphyrin,_Heme,_and_Siroheme_Biosynthesis	SvetaG
Proline_Synthesis	RickS
Proteasome_archaeal	gjo
Proteasome_eukaryotic	gjo
Pterin_biosynthesis	vcrecy
Purine_conversions	OlgaV
Pyruvate_Alanine_Serine_Interconversions	JasonS_UCSD
Queuosine-Archaeosine_Biosynthesis	vcrecy
RNA_polymerase_I	gjo
RNA_polymerase_II	gjo
RNA_polymerase_III	gjo
RNA_polymerase_II_initiation_factors	gjo
RNA_polymerase_archaeal	gjo
RNA_polymerase_archaeal_initiation_factors	gjo
RNA_polymerase_bacterial	gjo
RNA_polymerase_chloroplast	gjo
Resistance_to_fluoroquinolones	MattC
Respiratory_Complex_I	OlgaV
Respiratory_dehydrogenases_1	OlgaV
Ribonucleotide_reduction	rodionov
Ribose_and_deoxyribose_phosphate_metabolism	MattC
Ribosome_LSU_bacterial	gjo
Ribosome_LSU_eukaryotic_and_archaeal	gjo
Ribosome_SSU_bacterial	gjo
Ribosome_SSU_chloroplast	gjo
Ribosome_SSU_eukaryotic_and_archaeal	gjo
Ribosome_biogenesis_bacterial	gjo
Serine_Biosynthesis	MikeK
Siderophore_Aerobactin_and_Ferrichrome_Biosynthesis	MattC
Soluble_cytochromes_and_functionally_related_electron_carriers	OlgaV
Succinate_dehydrogenase	OlgaV
Sulfate_assimilation	MattC
Sulfur_Metabolism	RobE
TCA_Cycle	OlgaV
Terminal_cytochrome_C_oxidases	OlgaV
Terminal_cytochrome_oxidases	OlgaV
Thiamin_biosynthesis	rodionov
Threonine_synthesis	MikeK
Tocopherol_Biosynthesis	OlgaZ
Transcription_factors_bacterial	gjo
Translation_elongation_factors_eukaryotic_and_archaeal	gjo
Translation_factors_bacterial	gjo
Translation_initiation_factors_eukaryotic_and_archaeal	gjo
Transport_of_Nickel_and_Cobalt	rodionov
Trehalose_biosynthesis	MattC
Tricarballylate_Utilization	RossO
Tryptophan_synthesis	VeronikaV
Tyrosine_synthesis	MikeK
UDP-N-acetylmuramate_from_Fructose-6-phosphate_Biosynthesis	VasiliyP
Ubiquinone_Biosynthesis	OlgaZ
Ubiquinone_Menaquinone-cytochrome_c_reductase_complexes	OlgaV
Urea_decomposition	rodionov
V-Type_ATP_synthase	RickS
carnitine_metabolism	MattC
cysteine_biosynthesis	RobE
dTDP-rhamnose_synthesis	MikeK
fatty_acid_metabolism	MattC
fatty_acid_oxidation_pathway	MattC
glyoxylate_degradation	MattC
mannose_and_GDP-mannose_metabolism	MattC
polyisoprenoid_biosynthesis	MattC
ppGpp_biosynthesis	MikeK
tRNA_aminoacylation	gjo
tRNA_processing	gjo
tRNA_splicing	gjo
//
END

    return split(/\n/,$default_parms);
}

# Pick a single function for a set of equivalent protein ids.
#
#   $fig           - FIG object, passed through to the loaders
#   $parms         - weight table (as produced by load_parms)
#   $pegs          - array ref of peg ids
#   $external_ids  - array ref of external ids
#
# Every peg and external id contributes its current function (with a weight)
# to a shared table; the best-scoring function is then chosen and normalized
# by cleanup before being returned.
sub choose_best_assignment {
    my($fig,$parms,$pegs,$external_ids) = @_;

    # function => [ [weight,source_id], ... ], filled in by the loaders
    my %votes;

    for my $peg_id (@{$pegs})
    {
        &load_peg_function($fig,$parms,$peg_id,\%votes);
    }

    for my $ext_id (@{$external_ids})
    {
        &load_ext_function($fig,$parms,$ext_id,\%votes);
    }

    my($winner) = &pick_function($fig,$parms,\%votes);
    return &cleanup($winner);
}


# Normalize a function string.
#
# Returns "hypothetical protein" when $func is false, when it starts with
# "hypothetical [WORD ]protein " followed by anything, or when it consists of
# one or two letters plus two to five digits, optionally followed by
# " protein" (both patterns are case-insensitive).  Any other value is
# returned unchanged.
sub cleanup {
    my($func) = @_;

    my $uninformative =
           (! $func)
        || ($func =~ /^hypothetical (\S+ )?protein .*$/i)
        || ($func =~ /^[a-zA-Z]{1,2}\d{2,5}( protein)?$/i);

    return $uninformative ? "hypothetical protein" : $func;
}

# Choose the winning function from the collected candidates.
#
#   $functions - hash ref: function => [ [weight,source_id], ... ]
#
# The function names are grouped by &SameFunc::group_funcs.  Each group gets
# a total score (score_set) and a representative function with its source
# (pick_specific).  The representative of the highest-scoring group is
# returned, or "" when there are no groups.  When $ENV{VERBOSE} is set, the
# grouping and (if there is more than one group) the ranked list are written
# to STDERR.
sub pick_function {
    my($fig,$parms,$functions) = @_;

    my @groups = &SameFunc::group_funcs(keys(%$functions));
    if ($ENV{'VERBOSE'}) {  print STDERR "partition: ",&Dumper(\@groups,$functions); }

    # one [score, function, source] triple per group
    my @candidates;
    for my $group (@groups)
    {
        my $total = &score_set($group,$functions);
        my($function,$source) = &pick_specific($fig,$parms,$group,$functions);
        push(@candidates,[$total,$function,$source]);
    }

    # highest total score first
    my @ranked = sort { $b->[0] <=> $a->[0] } @candidates;

    if ($ENV{'VERBOSE'} && (@ranked > 1))
    {
        print STDERR join("\t",@$_),"\n" for @ranked;
        print STDERR "//\n";
    }

    return "" unless (@ranked > 0);
    return $ranked[0]->[1];
}

# Total score of one group of function names.
#
#   $set       - array ref of function names
#   $functions - hash ref: function => [ [weight,source_id], ... ]
#
# Returns the sum of the weights (element 0 of each entry) over every
# function in $set that has entries in $functions; names without entries
# contribute nothing.
sub score_set {
    my($set,$functions) = @_;

    my $total = 0;
    for my $name (@$set)
    {
        my $entries = $functions->{$name};
        next unless $entries;
        $total += $_->[0] for @$entries;
    }
    return $total;
}

# Choose the representative function (and its source id) within one group.
#
#   $set       - array ref of function names
#   $functions - hash ref: function => [ [weight,source_id], ... ]
#
# Each entry is ranked by 100 * weight + (number of entries recorded for that
# function).  The first entry with the strictly highest rank wins.  Returns
# (function, source_id); both are "" when no entry ranks above zero.  When
# $ENV{VERBOSE} is set, a dump of the decision is written to STDERR.
sub pick_specific {
    my($fig,$parms,$set,$functions) = @_;

    my($best_func,$best_source) = ("","");
    my $best_score = "";    # compares as 0 in the numeric test below

    for my $name (@$set)
    {
        my $entries = $functions->{$name};
        next unless $entries;

        my $support = @$entries;
        for my $entry (@$entries)
        {
            my $rank = (100 * $entry->[0]) + $support;
            next unless ($rank > $best_score);

            $best_score  = $rank;
            $best_func   = $name;
            $best_source = $entry->[1];
        }
    }

    if ($ENV{'VERBOSE'}) { print STDERR &Dumper(["picked best source",$set,$functions,$best_func,$best_source]) }
    return ($best_func,$best_source);
}

# Record the function of one external id as a weighted candidate.
#
#   $fig       - object providing function_of($id)
#   $parms     - weight table; $parms->{external}->{PREFIX} is the weight for
#                ids of the form "PREFIX|..." (PREFIX is 2 to 4 letters)
#   $id        - the external id
#   $functions - hash ref: function => [ [weight,source_id], ... ]; an entry
#                [weight,$id] is appended when the id has a function and its
#                prefix has a true weight
#
# The weight is held in a lexical.  The earlier code assigned it to the global
# $_, which overwrote the caller's $_ (and, inside a caller's implicit
# foreach, the caller's list element).
#
# Note: a &FIG::hypo($func) filter is not applied here (it was commented out),
# so hypothetical functions are recorded like any other.
sub load_ext_function {
    my($fig,$parms,$id,$functions) = @_;

    my $func = $fig->function_of($id);
    if ($func && ($id =~ /^([A-Za-z]{2,4})\|/))
    {
        my $weight = $parms->{'external'}->{$1};
        if ($weight)
        {
            push(@{$functions->{$func}},[$weight,$id]);
        }
    }
}

# Record the function of one peg as a weighted candidate.
#
#   $fig       - object providing function_of($peg) and peg_to_subsystems($peg)
#   $parms     - weight table; uses $parms->{genome}->{GENOME_ID} and
#                $parms->{subsystems}->{SUBSYSTEM_NAME}
#   $peg       - the peg id
#   $functions - hash ref: function => [ [weight,source_id], ... ]; an entry
#                [weight,$peg] is appended when the peg has a function
#
# The weight is 1, plus the weight of the peg's genome (if any), plus the
# largest weight among the subsystems the peg belongs to (if any).
#
# All intermediate weights are lexicals.  The earlier code assigned them to
# the global $_, which overwrote the caller's $_ (and, inside a caller's
# implicit foreach, the caller's list element).
#
# Note: a &FIG::hypo($func) filter is not applied here (it was commented out),
# so hypothetical functions are recorded like any other.
sub load_peg_function {
    my($fig,$parms,$peg,$functions) = @_;

    my $func = $fig->function_of($peg);
    if ($func)
    {
        my $value = 1;

        my $genome    = &FIG::genome_of($peg);
        my $genome_wt = $parms->{'genome'}->{$genome};
        if ($genome_wt)
        {
            $value += $genome_wt;
        }

        # only the best subsystem counts, not the sum over all of them
        my $subv = 0;
        foreach my $sub ($fig->peg_to_subsystems($peg))
        {
            my $sub_wt = $parms->{'subsystems'}->{$sub};
            if ($sub_wt && ($sub_wt > $subv))
            {
                $subv = $sub_wt;
            }
        }
        $value += $subv;

        push(@{$functions->{$func}},[$value,$peg]);
    }
}

# Expand a list of pegs into the ids considered equivalent to them.
#
#   $fig   - object providing feature_aliases($peg) and mapped_prot_ids($peg)
#   $parms - weight table; only prefixes with a true value in
#            $parms->{external} are kept as external ids
#   $pegs  - array ref of peg ids
#
# Returns two array refs: the pegs (the input plus every mapped id starting
# with "fig|"), sorted with &FIG::by_fig_id, and the external ids (aliases
# and mapped ids of the form "PREFIX|..." with a weighted prefix), sorted
# as strings.  Both lists are free of duplicates.
sub equivalent_ids {
    my($fig,$parms,$pegs) = @_;
    my(%seen_peg,%seen_external);

    for my $peg (@$pegs)
    {
        $seen_peg{$peg} = 1;

        for my $alias ($fig->feature_aliases($peg))
        {
            next unless ($alias =~ /^([A-Za-z]{2,4})\|\S+$/);
            next unless $parms->{"external"}->{$1};
            $seen_external{$alias} = 1;
        }

        for my $tuple ($fig->mapped_prot_ids($peg))
        {
            # the fig| test comes first: such ids are pegs, never externals
            if ($tuple->[0] =~ /^fig\|/)
            {
                $seen_peg{$tuple->[0]} = 1;
                next;
            }
            if (($tuple->[0] =~ /^([A-Za-z]{2,4})\|\S+$/) && $parms->{"external"}->{$1})
            {
                $seen_external{$tuple->[0]} = 1;
            }
        }
    }

    my @all_pegs      = sort { &FIG::by_fig_id($a,$b) } keys(%seen_peg);
    my @all_externals = sort keys(%seen_external);
    return (\@all_pegs,\@all_externals);
}

# Build the weight table used to score candidate functions.
#
#   $parmsF - path of a parameter file; when false, the lines returned by
#             &default_parms are used instead
#
# Returns a hash ref  $wts->{TYPE}->{KEY} = WEIGHT.  Input lines are
# tab-separated "TYPE<TAB>KEY<TAB>WEIGHT[<TAB>...]".  A line whose TYPE is
# "subsystems" is a header: its WEIGHT is assigned to every following line
# (keyed by that line's first column) up to a line starting with "//".
#
# Changes from the earlier code:
#   * the file is opened directly instead of through a shell "cat", so paths
#     containing spaces or shell metacharacters are read correctly and cannot
#     be interpreted as commands;
#   * an unreadable file produces a warning (via carp) and an empty table,
#     rather than relying on cat's own error output;
#   * lines are held in lexicals, so the caller's $_ is not overwritten;
#   * loops test for defined lines, so a line that is "" or "0" no longer
#     ends parsing early;
#   * blank lines are skipped instead of creating a {''}{''} entry.
sub load_parms {
    my($parmsF) = @_;
    my @parmsS;

    my $wts = {};

    if ($parmsF)
    {
        if (open(my $fh,'<',$parmsF))
        {
            @parmsS = <$fh>;
            close($fh);
        }
        else
        {
            carp "could not read parameter file '$parmsF': $!";
        }
    }
    else
    {
        @parmsS = &default_parms;
    }

    while (defined(my $line = shift @parmsS))
    {
        chomp $line;
        next if ($line !~ /\S/);

        my($type,$data,$val) = split(/\t/,$line);
        if ($type eq 'subsystems')
        {
            # consume the subsystem names up to the "//" terminator
            my $entry;
            while (defined($entry = shift @parmsS) && ($entry !~ /^\/\//))
            {
                if ($entry =~ /^(\S[^\t]+\S)/)
                {
                    $wts->{$type}->{$1} = $val;
                }
            }
        }
        else
        {
            $wts->{$type}->{$data} = $val;
        }
    }
    return $wts;
}

# Write the weight table to STDERR in a readable, sorted layout.
#
#   $parms - hash ref  $parms->{TYPE}->{KEY} = WEIGHT
#
# Output is a "Parameters:" heading, then for every type (sorted) a blank
# line, the type indented by one tab, and one "KEY<TAB>WEIGHT" line per key
# (sorted) indented by two tabs; a final blank line ends the listing.
sub print_parms {
    my($parms) = @_;

    print STDERR "Parameters:\n";
    for my $type (sort keys(%$parms))
    {
        my $weights = $parms->{$type};

        print STDERR "\n\t$type\n";
        for my $data (sort keys(%$weights))
        {
            print STDERR "\t\t$data\t$weights->{$data}\n";
        }
    }
    print STDERR "\n";
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3