[Bio] / DeJonghStuff / parse_model.pl Repository:
ViewVC logotype

View of /DeJonghStuff/parse_model.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Thu Jan 12 19:53:18 2006 UTC (13 years, 9 months ago) by dejongh
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +44 -21 lines
updated

# -*- perl -*-

###########################################
# usage:  parse_model <genome_id> <model_id> <kegg organism id>
# models for a given genome should be stored in Data/Global/Models/<genome_id>/
# models should be prepended with the model id (e.g., iSB615).  Three files are expected:
# ${model_id}-metabolites.txt:  contains two-column, tab-separated file with compound ids
#     in the first column and compound names in the second column
# ${model_id}-reactions.txt:  contains three-column, tab-separated file with reaction ids
#     in the first column, reaction names in the second column, and reaction equations in 
#     the third column
# ${model_id}-gene-reaction-assoc.txt:  contains two-column, tab-separated file with 
#     reaction ids in the first column and list of gene ids in the second column

use strict;
use FIG;

my ($genome_id, $model_id, $kegg_org_id) = @ARGV;

unless ($genome_id && $model_id && $kegg_org_id) 
{
    print "usage:  parse_model <genome_id> <model_id> <kegg organism id>\n";
    exit(-1);
}

my $fig = new FIG;

open(MODEL_METABOLITES, "<$FIG_Config::global/Models/$genome_id/${model_id}-metabolites.txt")
    or die("Couldn't open $genome_id/${model_id}-metabolites.txt");
open(MODEL_REACTIONS, "<$FIG_Config::global/Models/$genome_id/${model_id}-reactions.txt") 
    or die("Couldn't open $genome_id/${model_id}-reactions.txt");
open(MODEL_GRA, "<$FIG_Config::global/Models/$genome_id/${model_id}-gene-reaction-assoc.txt")
    or die("Couldn't open $genome_id/${model_id}-gene-reaction-assoc.txt");
open(COMPOUND, ">$FIG_Config::global/Models/$genome_id/compound");
open(COMPOUND_NAME, ">$FIG_Config::global/Models/$genome_id/compound_name");
open(REACTION_TO_ROLE, ">$FIG_Config::global/Models/$genome_id/reaction_to_role");
open(REACTION, ">$FIG_Config::global/Models/$genome_id/reaction");
open(REACTION_TO_COMPOUND, ">$FIG_Config::global/Models/$genome_id/reaction_to_compound");
open(GENE_TO_ROLE, ">$FIG_Config::global/Models/$genome_id/gene_to_role");
open(GENE_TO_REACTION, ">$FIG_Config::global/Models/$genome_id/gene_to_reaction");
open(GENE_TO_PEG, ">$FIG_Config::global/Models/$genome_id/gene_to_peg");

my (%compound_abbrev_2_name, %reaction_abbrev_2_role);

while (my $metabolite = <MODEL_METABOLITES>)
{
    chomp($metabolite);
    if (my ($abbrev, $name) = split /\t/, $metabolite) 
    {
	$abbrev =~ s/\"//g;
	$name =~ s/\"//g;
	$name =~ s/'/\\'/g;

	if (! defined($compound_abbrev_2_name{$abbrev}))
	{
	    $compound_abbrev_2_name{$abbrev} = $name;
	    print COMPOUND "$abbrev\n";
	    print COMPOUND_NAME "$abbrev\t1\t$compound_abbrev_2_name{$abbrev}\n";
	}
	else
	{
	    print "Warning, found multiple names for $abbrev\n";
	}
    }
    else
    {
	print "Can't parse metabolite: $metabolite\n";
    }
}

close(COMPOUND);
close(COMPOUND_NAME);

while (my $reaction = <MODEL_REACTIONS>)
{
    chomp($reaction);

    if (my ($rabbrev, $rname, $equation) = split /\t/, $reaction)
    {
	$rabbrev =~ s/\"//g;
	$rname =~ s/\"//g;
	$equation =~ s/\"//g;

	if (! defined($reaction_abbrev_2_role{$rabbrev}))
	{
	    $reaction_abbrev_2_role{$rabbrev} = $rname;
	    print REACTION_TO_ROLE "$rabbrev\t$rname\n";
	}
	else
	{
	    print "Warning, found multiple names for $rabbrev\n";	
	}

	my ($loc, $substrate_list, $direction, $product_list);

	if ($equation =~ /^\[(.)\]( : )?(.*)(-->|<==>)(.*)/)
	{
	    $loc = $1;
	    ($substrate_list, $direction, $product_list) = ($3, $4, $5);
	}
	elsif ($equation =~ /(.*)(-->|<==>)(.*)/)
	{
	    $loc = "";
	    ($substrate_list, $direction, $product_list) = ($1, $2, $3);
	}
	else
	{
	    print "Warning, unparseable equation: $equation\n";
	    next;
	}

	if ($direction eq "-->")
	{
	    print REACTION "$rabbrev\t0\n";
	}
	else
	{
	    print REACTION "$rabbrev\t1\n";
	}

	$substrate_list =~ s/\s//g;
	$product_list =~ s/\s//g;

	for (my $setn = 1; $setn <= 2; $setn++)
	{
	    my $compound_list = ($setn == 1 ? $substrate_list : $product_list);

	    foreach my $cabbrev_info (split /\+/, $compound_list) 
	    { 
		my ($cid, $stoich, $cloc);

		if ($cabbrev_info =~ /(.+)\[(.)\]$/)
		{
		    $cabbrev_info = $1;
		    $cloc = $2;
		}
		else
		{
		    $cloc = $loc;
		}

		if ($cabbrev_info =~ /\((\d.*)\)(.+)/)
		{
		    $stoich = $1;
		    $cid = $2;
		}
		else
		{
		    $stoich = "1";
		    $cid = $cabbrev_info;
		}

		if (defined($compound_abbrev_2_name{$cid}))
		{
		    print REACTION_TO_COMPOUND "$rabbrev\t$cid\t$setn\t$stoich\t\t$cloc\n";
		}
		else
		{
		    print "Warning, undefined compound $cid in reaction $equation\n";
		}
	    }
	}
    }
    else
    {
	print "Warning, unparseable reaction: $reaction\n";
    }
}

close(REACTION);
close(REACTION_TO_ROLE);
close(REACTION_TO_COMPOUND);

# use hash for tracking gene/peg and gene/reaction correspondences since genes occur in multiple reactions
my (%gene_2_pegs, %gene_2_reactions);

while (<MODEL_GRA>)
{
    my ($rabbrev, $genes) = split "\t", $_;
    my @rids;

    if (defined $reaction_abbrev_2_role{$rabbrev})
    {
	chomp $genes;
	$genes =~ s/\(//g;
	$genes =~ s/\)//g;
	$genes =~ s/and//gi;
	$genes =~ s/or//gi;
	$genes =~ s/^\s+//;
	$genes =~ s/\s+$//;

	foreach my $gene (split /\s+/ , $genes)
	{
	    print "Checking $gene ...\n";
	    push @{$gene_2_reactions{$gene}}, $rabbrev;
	    my $rname = $reaction_abbrev_2_role{$rabbrev};
	    print GENE_TO_ROLE "$gene\t$rname\n"; # this table is redundant

	    if (! defined($gene_2_pegs{$gene}))
	    {
		$gene_2_pegs{$gene} = ();
		# search on kegg gene identifiers because the seem to limit results to the correct peg
		my ($peg_index_data,undef) = $fig->search_index("$kegg_org_id:".$gene);

		foreach my $peg (map { $_->[0] } @$peg_index_data)
		{
		    if ($peg =~ /$genome_id/ )
		    {
			push @{$gene_2_pegs{$gene}}, $peg;
		    }
		}
	    }
	}
    }
    else
    {
	print "Warning, unknown reaction abbreviation in GRA file: $rabbrev\n";
    }
}

foreach my $gene (keys %gene_2_pegs)
{
    if (defined $gene_2_pegs{$gene})
    {
	print GENE_TO_PEG "$gene\t@{$gene_2_pegs{$gene}}\n";
    }
    else
    {
	print "Can't find peg for $gene\n";
    }
}

foreach my $gene (keys %gene_2_reactions)
{
    print GENE_TO_REACTION "$gene\t@{$gene_2_reactions{$gene}}\n";
}

close(GENE_TO_REACTION);
close(GENE_TO_ROLE);
close(GENE_TO_PEG);

exit;


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3