[Bio] / DeJonghStuff / map_model_to_kegg.pl Repository:
ViewVC logotype

View of /DeJonghStuff/map_model_to_kegg.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Thu Jan 12 19:53:18 2006 UTC (13 years, 9 months ago) by dejongh
Branch: MAIN
CVS Tags: HEAD
Changes since 1.2: +32 -32 lines
updated

# -*- perl -*-

###########################################
use strict;

use FIG;
my $fig = new FIG;

my ($genome_id) = @ARGV;

# map from model's compound abbreviations to KEGG compound ids 
# and reaction abbreviations to KEGG reaction ids

our (%compound_abbrev_2_name, %compound_abbrev_2_id, %compound_id_2_abbrev, %reaction_abbrev_2_id, %reaction_abbrev_2_name, %reaction_2_substrates, %reaction_2_products, %reversibility);

open(MODEL_COMPOUNDS, "<$FIG_Config::global/Models/$genome_id/compound_name") or die("Run parse_model first");
open(MODEL_REACTIONS, "<$FIG_Config::global/Models/$genome_id/reaction") or die("Run parse_model first");
open(MODEL_REACTION_NAMES, "<$FIG_Config::global/Models/$genome_id/reaction_to_role") or die("Run parse_model first");
open(MODEL_REACTION_COMPOUNDS, "<$FIG_Config::global/Models/$genome_id/reaction_to_compound") or die("Run parse_model first");

open(COMPOUND_OUT, ">$FIG_Config::global/Models/$genome_id/compound_to_kegg");
open(REAC_OUT, ">$FIG_Config::global/Models/$genome_id/reaction_to_kegg");

while (my $line = <MODEL_REACTIONS>)
{
    chomp $line;
    my ($rabbrev, $rev) = split "\t", $line;
    $reversibility{$rabbrev} = $rev;
}

# hard code some KEGG compound to model compound abbreviations
# Ammonia for Ammonium
$compound_id_2_abbrev{"C00014"} = "nh4";
$compound_id_2_abbrev{"C00093"} = "glyc3p";
$compound_id_2_abbrev{"C00536"} = "pppi";

while (my $line = <MODEL_COMPOUNDS>)
{
    chomp($line);
    my ($cabbrev, undef, $cname) = split /\t/, $line;
    $compound_abbrev_2_name{$cabbrev} = $cname;

    my @cids = $fig->ids_of_compound_like_name($cname);

    # some common compounds like nadh match on the abbreviation
    if (@cids == 0)
    {
	@cids = $fig->ids_of_compound_like_name($cabbrev);
    }

    $compound_abbrev_2_id{$cabbrev} = [];

    if (@cids > 0)
    {
	foreach my $cid (@cids)
	{
	    $compound_id_2_abbrev{$cid} = $cabbrev;
	    push @{$compound_abbrev_2_id{$cabbrev}}, $cid;
	}
    }
}

while (my $line = <MODEL_REACTION_NAMES>)
{
    chomp $line;
    my ($rabbrev, $rname) = split "\t", $line;
    $reaction_abbrev_2_name{$rabbrev} = $rname;
}

while (<MODEL_REACTION_COMPOUNDS>)
{
    my ($rabbrev, $cabbrev,  $setn) = split "\t", $_;

    if ($setn == 1)
    {
	$reaction_2_substrates{$rabbrev}{$cabbrev} = 1;
    }
    else
    {
	$reaction_2_products{$rabbrev}{$cabbrev} = 1;
    }
}

our %cabbrevs_used_in_reactions;

foreach my $rabbrev (keys %reaction_abbrev_2_name)
{
    my $rname = $reaction_abbrev_2_name{$rabbrev};

    # first look up the KEGG reaction id based on the given name
    my $reaction_id = &get_reaction_id($rname);

    if (defined $reaction_id)
    {
	my @tmplist = ($reaction_id);
        $reaction_abbrev_2_id{$rabbrev} = \@tmplist;
	print "$rabbrev matches $reaction_id on KEGG name ($rname)\n";

	my $kegg_rev = $fig->reversible($reaction_id);
	if ($kegg_rev == $reversibility{$rabbrev})
	{
	    print REAC_OUT "$rabbrev\t$reaction_id\t1\n";
	}
	else
	{
	    print REAC_OUT "$rabbrev\t$reaction_id\t0\n";
	}
    }
    else
    {
	print "$rabbrev, $rname, ($reversibility{$rabbrev}): No reaction id matched on KEGG name\n";

	# Now see if the name is associated with an EC number
	my @ec_info = `grep -i "$rname" $FIG_Config::data/KEGG/ECtable`;
	my %check_reactions;

	if (@ec_info == 0)
	{
	    #try removing anything in parentheses to get rid of comments
	    my $tmp_name = $rname;
	    $tmp_name =~ s/\s+\(.*\)$//;
	    # replace "b-" with "beta-"
	    $tmp_name =~ s/^b-/beta-/;

	    if ($tmp_name ne $rname)
	    {
		print "\t Trying $tmp_name\n";
		@ec_info = `grep -i "$tmp_name" $FIG_Config::data/KEGG/ECtable`;
	    }
	}

	if (@ec_info > 0)
	{
	    foreach my $ec_line (@ec_info)
	    {
		$ec_line =~ /\s+(\d+\.\d+\.\d+\.\d+)/;
		my $ec = $1;
		print "\t Found EC: $ec\n";
		my @reactions = $fig->catalyzes($ec);
		print "\t Found reactions: @reactions\n";
		
		foreach my $reaction (@reactions)
		{
		    $check_reactions{$reaction} = 1;
		}
	    }

	}
	else
	{
	    print "\t No EC found\n";
	}

	# also try matching on compounds
	my @compounds = keys %{$reaction_2_substrates{$rabbrev}};
	push @compounds, keys %{$reaction_2_products{$rabbrev}};
	my %cids;
	foreach my $cabbrev (@compounds)
	{
	    my @cids = @{$compound_abbrev_2_id{$cabbrev}};
	    map { $cids{$_} = 1 } @cids;
	}
	delete @cids{ ('C00001', 'C00002', 'C00003', 'C00004', 'C00005', 'C00006', 'C00007', 'C00008', 
		       'C00009', 'C00010', 'C00011', 'C00012', 'C00013', 'C00014', 'C00015', 'C00016', 
		       'C00017', 'C00018', 'C00019', 'C00020', 'C00021', 'C00080') };
	print "\t Found @{[keys %cids]}\n";

	my %no_reactions;

	foreach my $cid (keys %cids)
	{
	    my @rids = $fig->comp2react($cid);

	    foreach my $rid (@rids)
	    {
		if (! exists $check_reactions{$rid} && ! exists $no_reactions{$rid})
		{
		    my $main = 0;
		    my @tuples = $fig->reaction2comp($rid, 0);
		    push @tuples, $fig->reaction2comp($rid, 1);

		    for my $tuple (@tuples)
		    {
			if (@$tuple[0] eq $cid && @$tuple[2])
			{
			    $main = 1;
			    last;
			}
		    }

		    $main ? $check_reactions{$rid} = 1 : $no_reactions{$rid} = 1;
		}
	    }
	}

	my %reaction_scores = %{score_reactions($rabbrev, keys %check_reactions)};

	my @reaction_list = sort { $reaction_scores{$a} <=> $reaction_scores{$b} } keys %reaction_scores;

	if (@reaction_list > 0)
	{
	    $reaction_abbrev_2_id{$rabbrev} = \@reaction_list;
	    print "\t reaction list: @reaction_list\n";
	    
	    if (@reaction_list == 1 && $reaction_scores{$reaction_list[0]} == 0)
	    {
		my $kegg_rev = $fig->reversible($reaction_list[0]);
		if ($kegg_rev == $reversibility{$rabbrev})
		{
		    print REAC_OUT "$rabbrev\t@reaction_list\t1\n";
		}
		else
		{
		    print REAC_OUT "$rabbrev\t@reaction_list\t0\n";
		}
	    }
	    else
	    {
		print REAC_OUT "$rabbrev\t@reaction_list\n";
	    }
	}
	else
	{
	    print "\t No matching reactions\n";
	    print REAC_OUT "$rabbrev\n";
	}

    }
}

close(REAC_OUT);

print "\n\nNumber of reactions with matches: ", scalar(keys(%reaction_abbrev_2_id)), "\n";

foreach my $cabbrev (keys %cabbrevs_used_in_reactions)
{
    my @cids = keys %{$cabbrevs_used_in_reactions{$cabbrev}};
    print COMPOUND_OUT "$cabbrev\t@cids\n";
}

sub get_reaction_id
{
    my $match_name = shift;

    my ($reaction_id, $entry, $name);

    open(KEGG_REACTIONS, "<$FIG_Config::data/KEGG/reaction") or die("Couldn't open KEGG reaction file");

    $/ = "\n///\n";
    while (<KEGG_REACTIONS>)
    {
	if ($_ =~ /ENTRY\s+(\S+)\s+Reaction\nNAME\s+(\S[^\n]+\n)((\s+(\S[^\n]+\n))*)/s)
	{
	    $reaction_id = $1;

	    if ($3)
	    {
		$name = "$2 $3";
	    }
	    else
	    {
		$name = $2;
	    }
	    $name =~ s/(\s+\$|\s+)/ /g;
	    chop $name;

	    if ($name eq $match_name)
	    {
		$/ = "\n";
		return $reaction_id;
	    }
	}
    }

    $/ = "\n";
    return;
}

sub score_reactions
{
    my $rabbrev = shift @_;
    my %reaction_scores;

    # try to select reaction based on compounds
    foreach my $reaction_id (@_)
    {
	my $kegg_rev = $fig->reversible($reaction_id);
	my $better_reversed = 0;
	print "\t Checking reaction $reaction_id ($kegg_rev)\n";
	my %substrates = %{$reaction_2_substrates{$rabbrev}};
	my %products = %{$reaction_2_products{$rabbrev}};
	my %substrates_as_products = %{$reaction_2_substrates{$rabbrev}};
	my %products_as_substrates = %{$reaction_2_products{$rabbrev}};
	my ($score, $reverse_score);

	my @kegg_substrate_info = $fig->reaction2comp($reaction_id, 0);
	my @kegg_product_info = $fig->reaction2comp($reaction_id, 1);

	my (@unmatched_kegg_substrates, @unmatched_kegg_products);

	foreach my $kegg_substrate (@kegg_substrate_info)
	{
	    my $cid = @$kegg_substrate[0];
	    my @cnames = $fig->names_of_compound($cid);
	    print "\t\t Checking $cid (@cnames[0]) ... ";
	    if (my $cabbrev = $compound_id_2_abbrev{$cid})
	    {
		print "Found $cabbrev ($compound_abbrev_2_name{$cabbrev})\n";
		my $in_substrates = delete $substrates{$cabbrev};
		my $in_products_as_substrates = delete $products_as_substrates{$cabbrev};

		if (! $in_substrates && ! $in_products_as_substrates)
		{
		    print "\t\t\t No match (main = @$kegg_substrate[2])\n";
		    push @unmatched_kegg_substrates, $cid;
		}
	    }
	    else
	    {
		print "No compound found (main = @$kegg_substrate[2])\n";
		push @unmatched_kegg_substrates, $cid;
	    }
	}

	foreach my $kegg_product (@kegg_product_info)
	{
	    my $cid = @$kegg_product[0];
	    my @cnames = $fig->names_of_compound($cid);
	    print "\t\t Checking $cid (@cnames[0]) ... ";
	    if (my $cabbrev = $compound_id_2_abbrev{$cid})
	    {
		print "Found $cabbrev ($compound_abbrev_2_name{$cabbrev})\n";
		my $in_products = delete $products{$cabbrev};
		my $in_substrates_as_products = delete $substrates_as_products{$cabbrev};

		if (! $in_products && ! $in_substrates_as_products)
		{
		    print "\t\t\t No match (main = @$kegg_product[2])\n";
		    push @unmatched_kegg_products, $cid;
		}
	    }
	    else
	    {
		print "No compound found (main = @$kegg_product[2])\n";
		push @unmatched_kegg_products, $cid;
	    }
	}

	# remove left over hydrogen since KEGG doesn't always include it
	delete $substrates{"h"};
	delete $substrates_as_products{"h"};
	delete $products{"h"};
	delete $products_as_substrates{"h"};

	$score = scalar(keys(%substrates)) + scalar(keys(%products)) + scalar(@unmatched_kegg_substrates)
	    + scalar(@unmatched_kegg_products);
	$reverse_score = scalar(keys(%substrates_as_products))
	    + scalar(keys(%products_as_substrates)) + scalar(@unmatched_kegg_substrates)
		+ scalar(@unmatched_kegg_products);

	if ($reverse_score < $score)
	{
	    print "\t\t Better score reversing reaction ($reverse_score < $score)\n";
	    $better_reversed = 1;
	    $score = $reverse_score;
	    %substrates = %substrates_as_products;
	    %products = %products_as_substrates;
	}
	else
	{
	    print "\t\t No better score reversing reaction ($reverse_score >= $score)\n";
	}

	if (%substrates == 0 && scalar(@unmatched_kegg_substrates) == 0)
	{
	    print "\t\t Matched all the substrates\n";
	}
	else
	{
	    print "\t\t Remaining substrates: ";
	    foreach my $sabbrev (keys %substrates)
	    {
		print "$sabbrev ($compound_abbrev_2_name{$sabbrev}) ";
	    }
	    print "\n\t\t Number of unmatched KEGG substrates: ", scalar(@unmatched_kegg_substrates), "\n";
	}

	if (%products == 0 && scalar(@unmatched_kegg_products) == 0)
	{
	    print "\t\t Matched all the products\n";
	}
	else
	{
	    print "\t\t Remaining products: ";
	    foreach my $pabbrev (keys %products)
	    {
		print "$pabbrev ($compound_abbrev_2_name{$pabbrev}) ";
	    }
	    print "\n\t\t Number of unmatched KEGG products: ", scalar(@unmatched_kegg_products), "\n";
	}

	print "\t\t Score: $score\n";

	# found the reaction.  throw out other contenders and keep track
	# of compounds that were used to make the match
	if ($score == 0)
	{
	    undef %reaction_scores;
	    $reaction_scores{$reaction_id} = $score;

	    foreach my $kegg_substrate (@kegg_substrate_info)
	    {
		my $cid = @$kegg_substrate[0];
		if (my $cabbrev = $compound_id_2_abbrev{$cid})
		{
		    $cabbrevs_used_in_reactions{$cabbrev}{$cid}=1;
		}
	    }

	    foreach my $kegg_product (@kegg_product_info)
	    {
		my $cid = @$kegg_product[0];
		if (my $cabbrev = $compound_id_2_abbrev{$cid})
		{
		    $cabbrevs_used_in_reactions{$cabbrev}{$cid}=1;
		}
	    }

	    if ($better_reversed && $reversibility{$rabbrev} == 0 && $kegg_rev == 0)
	    {
		$reversibility{$rabbrev} = -1;
		print "\t\t Model and KEGG reactions have different directions\n";
	    }

	    last;
	}
	else
	{
	    $reaction_scores{$reaction_id} = $score;
	}			    
    }

    return \%reaction_scores;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3