[Bio] / DeJonghStuff / map_model_to_kegg.pl Repository:
ViewVC logotype

View of /DeJonghStuff/map_model_to_kegg.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Fri Nov 4 16:22:36 2005 UTC (14 years, 1 month ago) by dejongh
Branch: MAIN
Files for mapping models to KEGG reactions

# -*- perl -*-

###########################################
use strict;

use FIG;
my $fig = new FIG;

my ($genome_id) = @ARGV;

# map from model's compound abbreviations to KEGG compound ids 
# and reaction abbreviations to KEGG reaction ids

my (%compound_abbrev_2_name, %compound_id_2_abbrev, %reaction_abbrev_2_id, %reaction_abbrev_2_name, %reaction_2_substrates, %reaction_2_products);

open(MODEL_COMPOUNDS, "<$FIG_Config::global/Models/$genome_id/compound_name") or die("Run parse_model first");
open(MODEL_REACTIONS, "<$FIG_Config::global/Models/$genome_id/reaction") or die("Run parse_model first");
open(MODEL_REACTION_NAMES, "<$FIG_Config::global/Models/$genome_id/reaction_to_role") or die("Run parse_model first");
open(MODEL_REACTION_COMPOUNDS, "<$FIG_Config::global/Models/$genome_id/reaction_to_compound") or die("Run parse_model first");

open(COMPOUND_OUT, ">$FIG_Config::global/Models/$genome_id/compound_to_kegg");
open(REAC_OUT, ">$FIG_Config::global/Models/$genome_id/reaction_to_kegg");

my %reversibility;

while (my $line = <MODEL_REACTIONS>)
{
    chomp $line;
    my ($rabbrev, $rev) = split "\t", $line;
    $reversibility{$rabbrev} = $rev;
}

# hard code some KEGG compound to model compound abbreviations
# Ammonia for Ammonium
$compound_id_2_abbrev{"C00014"} = "nh4";
$compound_id_2_abbrev{"C00093"} = "glyc3p";
$compound_id_2_abbrev{"C00536"} = "pppi";

while (my $line = <MODEL_COMPOUNDS>)
{
    chomp($line);
    my ($cabbrev, undef, $cname) = split /\t/, $line;
    $compound_abbrev_2_name{$cabbrev} = $cname;

    my @cids = $fig->ids_of_compound_like_name($cname);

    # some common compounds like nadh match on the abbreviation
    if (@cids == 0)
    {
	@cids = $fig->ids_of_compound_like_name($cabbrev);
    }

    if (@cids > 0)
    {
	foreach my $cid (@cids)
	{
	    $compound_id_2_abbrev{$cid} = $cabbrev;
	}
    }
}

while (my $line = <MODEL_REACTION_NAMES>)
{
    chomp $line;
    my ($rabbrev, $rname) = split "\t", $line;
    $reaction_abbrev_2_name{$rabbrev} = $rname;
}

while (<MODEL_REACTION_COMPOUNDS>)
{
    my ($rabbrev, $cabbrev,  $setn) = split "\t", $_;

    if ($setn == 1)
    {
	$reaction_2_substrates{$rabbrev}{$cabbrev} = 1;
    }
    else
    {
	$reaction_2_products{$rabbrev}{$cabbrev} = 1;
    }
}

my %cabbrevs_used_in_reactions;

foreach my $rabbrev (keys %reaction_abbrev_2_name)
{
    my $rname = $reaction_abbrev_2_name{$rabbrev};

    # first look up the KEGG reaction id based on the given name
    my $reaction_id = &get_reaction_id($rname);

    if (defined $reaction_id)
    {
	my @tmplist = ($reaction_id);
        $reaction_abbrev_2_id{$rabbrev} = \@tmplist;
	print "$rabbrev matches $reaction_id on KEGG name ($rname)\n";

	my $kegg_rev = $fig->reversible($reaction_id);
	if ($kegg_rev == $reversibility{$rabbrev})
	{
	    print REAC_OUT "$rabbrev\t$reaction_id\t1\n";
	}
	else
	{
	    print REAC_OUT "$rabbrev\t$reaction_id\t0\n";
	}
    }
    else
    {
	print "$rabbrev, $rname, ($reversibility{$rabbrev}): No reaction id matched on KEGG name\n";

	# Now see if the name is associated with an EC number
	my @tmp = `grep -i "$rname" $FIG_Config::data/KEGG/ECtable`;

	if (@tmp == 0)
	{
	    #try removing anything in parentheses to get rid of comments
	    my $tmp_name = $rname;
	    $tmp_name =~ s/\s+\(.*\)$//;
	    # replace "b-" with "beta-"
	    $tmp_name =~ s/^b-/beta-/;

	    if ($tmp_name ne $rname)
	    {
		print "\t Trying $tmp_name\n";
		@tmp = `grep -i "$tmp_name" $FIG_Config::data/KEGG/ECtable`;
	    }
	}

	if (@tmp > 0)
	{
	    my %reaction_scores;

	  ec_loop: foreach my $ec_line (@tmp)
	  {
	      $ec_line =~ /\s+(\d+\.\d+\.\d+\.\d+)/;
	      my $ec = $1;
	      print "\t Found EC: $ec\n";
	      my @reactions = $fig->catalyzes($ec);
	      print "\t Found reactions: @reactions\n";

	      if (@reactions > 0)
	      {
		  # try to select reaction based on compounds
		  foreach $reaction_id (@reactions)
		  {
		      my $kegg_rev = $fig->reversible($reaction_id);
		      my $better_reversed = 0;
		      print "\t Checking reaction $reaction_id ($kegg_rev)\n";
		      my %substrates = %{$reaction_2_substrates{$rabbrev}};
		      my %products = %{$reaction_2_products{$rabbrev}};
		      my %substrates_as_products = %{$reaction_2_substrates{$rabbrev}};
		      my %products_as_substrates = %{$reaction_2_products{$rabbrev}};
		      my ($score, $reverse_score);

		      my @kegg_substrate_info = $fig->reaction2comp($reaction_id, 0);
		      my @kegg_product_info = $fig->reaction2comp($reaction_id, 1);

		      my (@unmatched_kegg_substrates, @unmatched_kegg_products);

		      foreach my $kegg_substrate (@kegg_substrate_info)
		      {
			  my $cid = @$kegg_substrate[0];
			  my @cnames = $fig->names_of_compound($cid);
			  print "\t\t Checking $cid (@cnames[0]) ... ";
			  if (my $cabbrev = $compound_id_2_abbrev{$cid})
			  {
			      print "Found $cabbrev ($compound_abbrev_2_name{$cabbrev})\n";
			      my $in_substrates = delete $substrates{$cabbrev};
			      my $in_products_as_substrates = delete $products_as_substrates{$cabbrev};

			      if (! $in_substrates && ! $in_products_as_substrates)
			      {
				  print "\t\t\t No match (main = @$kegg_substrate[2])\n";
				  push @unmatched_kegg_substrates, $cid;
			      }
			  }
			  else
			  {
			      print "No compound found (main = @$kegg_substrate[2])\n";
			      push @unmatched_kegg_substrates, $cid;
			  }
		      }

		      foreach my $kegg_product (@kegg_product_info)
		      {
			  my $cid = @$kegg_product[0];
			  my @cnames = $fig->names_of_compound($cid);
			  print "\t\t Checking $cid (@cnames[0]) ... ";
			  if (my $cabbrev = $compound_id_2_abbrev{$cid})
			  {
			      print "Found $cabbrev ($compound_abbrev_2_name{$cabbrev})\n";
			      my $in_products = delete $products{$cabbrev};
			      my $in_substrates_as_products = delete $substrates_as_products{$cabbrev};

			      if (! $in_products && ! $in_substrates_as_products)
			      {
				  print "\t\t\t No match (main = @$kegg_product[2])\n";
				  push @unmatched_kegg_products, $cid;
			      }
			  }
			  else
			  {
			      print "No compound found (main = @$kegg_product[2])\n";
			      push @unmatched_kegg_products, $cid;
			  }
		      }

		      # remove left over hydrogen since KEGG doesn't always include it
		      delete $substrates{"h"};
		      delete $substrates_as_products{"h"};
		      delete $products{"h"};
		      delete $products_as_substrates{"h"};

		      $score = scalar(keys(%substrates)) + scalar(keys(%products)) + scalar(@unmatched_kegg_substrates)
			  + scalar(@unmatched_kegg_products);
		      $reverse_score = scalar(keys(%substrates_as_products))
			  + scalar(keys(%products_as_substrates)) + scalar(@unmatched_kegg_substrates)
			      + scalar(@unmatched_kegg_products);

		      if ($reverse_score < $score)
		      {
			  print "\t\t Better score reversing reaction ($reverse_score < $score)\n";
			  $better_reversed = 1;
			  $score = $reverse_score;
			  %substrates = %substrates_as_products;
			  %products = %products_as_substrates;
		      }
		      else
		      {
			  print "\t\t No better score reversing reaction ($reverse_score >= $score)\n";
		      }

		      if (%substrates == 0 && scalar(@unmatched_kegg_substrates) == 0)
		      {
			  print "\t\t Matched all the substrates\n";
		      }
		      else
		      {
			  print "\t\t Remaining substrates: ";
			  foreach my $sabbrev (keys %substrates)
			  {
			      print "$sabbrev ($compound_abbrev_2_name{$sabbrev}) ";
			  }
			  print "\n\t\t Number of unmatched KEGG substrates: ", scalar(@unmatched_kegg_substrates), "\n";
		      }

		      if (%products == 0 && scalar(@unmatched_kegg_products) == 0)
		      {
			  print "\t\t Matched all the products\n";
		      }
		      else
		      {
			  print "\t\t Remaining products: ";
			  foreach my $pabbrev (keys %products)
			  {
			      print "$pabbrev ($compound_abbrev_2_name{$pabbrev}) ";
			  }
			  print "\n\t\t Number of unmatched KEGG products: ", scalar(@unmatched_kegg_products), "\n";
		      }

		      print "\t\t Score: $score\n";

		      # found the reaction.  throw out other contenders and keep track
		      # of compounds that were used to make the match
		      if ($score == 0)
		      {
			  undef %reaction_scores;
			  $reaction_scores{$reaction_id} = $score;

			  foreach my $kegg_substrate (@kegg_substrate_info)
			  {
			      my $cid = @$kegg_substrate[0];
			      if (my $cabbrev = $compound_id_2_abbrev{$cid})
			      {
				  $cabbrevs_used_in_reactions{$cabbrev}{$cid}=1;
			      }
			  }

			  foreach my $kegg_product (@kegg_product_info)
			  {
			      my $cid = @$kegg_product[0];
			      if (my $cabbrev = $compound_id_2_abbrev{$cid})
			      {
				  $cabbrevs_used_in_reactions{$cabbrev}{$cid}=1;
			      }
			  }

			  if ($better_reversed && $reversibility{$rabbrev} == 0 && $kegg_rev == 0)
			  {
			      $reversibility{$rabbrev} = -1;
			      print "\t\t Model and KEGG reactions have different directions\n";
			  }

			  last ec_loop;
		      }
		      else
		      {
			  $reaction_scores{$reaction_id} = $score;
		      }			    
		  }
	      }
	  }

	    my @reaction_list = sort { $reaction_scores{$a} <=> $reaction_scores{$b} } keys %reaction_scores;

	    if (@reaction_list > 0)
	    {
		$reaction_abbrev_2_id{$rabbrev} = \@reaction_list;
		print "\t reaction list: @reaction_list\n";
		
		if (@reaction_list == 1)
		{
		    my $kegg_rev = $fig->reversible($reaction_list[0]);
		    if ($kegg_rev == $reversibility{$rabbrev})
		    {
			print REAC_OUT "$rabbrev\t@reaction_list\t1\n";
		    }
		    else
		    {
			print REAC_OUT "$rabbrev\t@reaction_list\t0\n";
		    }
		}
		else
		{
		    print REAC_OUT "$rabbrev\t@reaction_list\n";
		}
	    }
	    else
	    {
		print "\t No matching reactions\n";
		print REAC_OUT "$rabbrev\n";
	    }
	}
	else
	{
	    print "\t No EC found\n";
	    print REAC_OUT "$rabbrev\n";
	}
    }
}

close(REAC_OUT);

print "\n\nNumber of reactions with matches: ", scalar(keys(%reaction_abbrev_2_id)), "\n";

foreach my $cabbrev (keys %cabbrevs_used_in_reactions)
{
    my @cids = keys %{$cabbrevs_used_in_reactions{$cabbrev}};
    print COMPOUND_OUT "$cabbrev\t@cids\n";
}

sub get_reaction_id
{
    my $match_name = shift;

    my ($reaction_id, $entry, $name);

    open(KEGG_REACTIONS, "<$FIG_Config::data/KEGG/reaction") or die("Couldn't open KEGG reaction file");

    $/ = "\n///\n";
    while (<KEGG_REACTIONS>)
    {
	if ($_ =~ /ENTRY\s+(\S+)\s+Reaction\nNAME\s+(\S[^\n]+\n)((\s+(\S[^\n]+\n))*)/s)
	{
	    $reaction_id = $1;

	    if ($3)
	    {
		$name = "$2 $3";
	    }
	    else
	    {
		$name = $2;
	    }
	    $name =~ s/(\s+\$|\s+)/ /g;
	    chop $name;

	    if ($name eq $match_name)
	    {
		$/ = "\n";
		return $reaction_id;
	    }
	}
    }

    $/ = "\n";
    return;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3