[Bio] / DeJonghStuff / load_kegg.pl Repository:
ViewVC logotype

View of /DeJonghStuff/load_kegg.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Tue Nov 15 20:09:41 2005 UTC (14 years ago) by dejongh
Branch: MAIN
Changes since 1.1: +2 -1 lines
inadvertently deleted line, now restored

# -*- perl -*-

###########################################
use strict;

# KEY RELATIONAL TABLES:
#    
#   1.  compound(Cid,Priority,Name)    % we have a prioritized set of names for a compound
#   2.  comp_to_CAS(Cid,CASid)         % connection to chemical abstract society [optional]
#   3.  reaction(Rid,Reversible)       % nonreversible go from substrates to products
#   4.  reaction_to_compound(Rid,0/1[substrate or product],Cid,Stoich,main_compound [part of major transformation])
#   5.  reaction_to_role(Rid,FunctionalRole)
#

use FIG;
my $fig = new FIG;

my $usage = "usage: load_kegg";

use Tracer;
#TSetup('2 *', 'WARN');
&load_ec_and_map_data;
&load_compounds;
&load_reactions;
&load_catalyzes;

undef $fig;

sub load_ec_and_map_data {

    Open(\*TMPIN, "<$FIG_Config::data/KEGG/enzyme");
    Open(\*ECMAP,">$FIG_Config::temp/ec_map.table");

	Trace("Reading KEGG enzymes.") if T(2);
    my($ec,%name,$map);
    $/ = "\n///\n";
    while (defined($_ = <TMPIN>))
    {
		if ($_ =~ /ENTRY\s+EC\s+(\d+\.\d+\.\d+\.\d+)/s)
		{
			$ec = $1;
			while ($_ =~ /PATH:\s+(map\d+)\s+(\S[^\n]+\S)/sg)
			{
			        print ECMAP "$ec\t$1\n";
				$name{$1} = $2;
			}
		}
    }
    $/ = "\n";
    close(TMPIN);
    close(ECMAP);
	
	Trace("Writing map table.") if T(2);
    Open(\*MAP, ">$FIG_Config::temp/map_name.table");

    foreach $map (keys(%name))
    {
		print MAP "$map\t$name{$map}\n";
    }
    close(MAP);

    $fig->reload_table('all', "ec_map",
					   "ec varchar(100), map varchar(100)",
					   { index_ec_map_ec => "ec", index_ec_map_map => "map" },
					   "$FIG_Config::temp/ec_map.table");
    unlink("$FIG_Config::temp/ec_map.table");
	$fig->reload_table('all', "map_name",
					   "map varchar(100) UNIQUE NOT NULL, mapname varchar(200), primary key ( map )",
					   { }, "$FIG_Config::temp/map_name.table");
    unlink("$FIG_Config::temp/map_name.table");
}

sub load_compounds {
    
	Trace("Loading compounds.") if T(2);
	
    Open(\*TMPIN, "<$FIG_Config::data/KEGG/compound");
    Open(\*COMP, ">$FIG_Config::temp/comp_name.table");
    Open(\*CAS, ">$FIG_Config::temp/comp_cas.table");

    my($cid,$name,$cas,$names,$tmp,$n,$entry);
    $/ = "\n///\n";
    while (defined($entry = <TMPIN>))
    {
		if ($entry =~ /ENTRY\s+(C\d+)\s*\nNAME\s+(\S[^\n]*)\n((\s+(\S[^\n]*\S)\n)*)/s)
		{
			$cid = $1;
			$names = $2;

			if ($3)
			{
				$tmp = $3;
				chop $tmp;
				$tmp =~ s/^\s+/ /;
				$names = $names . $tmp;
				$names =~ s/\n\s+/ /g;
				$names =~ s/- /-/g;
			}
	
			$n = 1;
			foreach $name (map { $_ =~ s/^\s+//; $_ =~ s/\s+$//; $_ } split(/;/,$names))
			{
				print COMP "$cid\t$n\t$name\n";
				$n++;
				if (length $name > 200) { print "$cid, $name\n" }
			}
		}

		if ($entry =~ /DBLINKS\s+CAS:\s+(\S+)/s)
		{
			print CAS "$cid\t$1\n";
		}
    }
    $/ = "\n";
    close(TMPIN);
    close(COMP);
    close(CAS);

	$fig->reload_table('all', "comp_name",
					   "cid varchar(7), pos integer, name varchar(200)",
					   { index_comp_name_cid => "cid",
						 index_comp_name_name => "name" },
					"$FIG_Config::temp/comp_name.table");
    unlink("$FIG_Config::temp/comp_name.table");

	$fig->reload_table('all', "comp_cas",
					   "cid varchar(7), cas varchar(100)",
					   { index_comp_cas_cid => "cid",
						 index_comp_cas_cas => "cas" },
					   "$FIG_Config::temp/comp_cas.table");
    unlink("$FIG_Config::temp/comp_cas.table");
}

sub load_reactions {

    my($react,$sub,$prod,@sub,@prod,$subs,$prods,$dir);
    my($cid,$n,$main,%reaction,$x);

	Trace("Loading reactions.") if T(2);
    Open(\*REACTION, "<$FIG_Config::data/KEGG/reaction.lst");
    Open(\*RMAIN, "<$FIG_Config::data/KEGG/reaction_main.lst");
    Open(\*R2C, ">$FIG_Config::temp/reaction_to_compound.table");
    Open(\*REV, ">$FIG_Config::temp/rev.table");

	Trace("Reading reaction list file.") if T(2);
    while (defined($_ = <REACTION>))
    {
		if ($_ =~ /(R\d+):\s+(\S.*\S)\s+<=>\s+(\S.*\S)/)
		{
			$react = $1;
			$sub   = $2;
			$prod  = $3;
			@sub = split(/\s+\+\s+/,$sub);
			@prod = split(/\s+\+\s+/,$prod);
			@sub   = map { $_ =~ /^(([\dmn\(]\S*)\s+)?([CG]\d+)/; $2 ? [$3,$2,0] : [$3,1,0] } @sub;
			@prod  = map { $_ =~ /^(([\dmn\(]\S*)\s+)?([CG]\d+)/; $2 ? [$3,$2,0] : [$3,1,0] } @prod;
			$reaction{$react} = [[@sub],[@prod]];
		}
		else
		{
			Trace("Invalid reaction format: $_") if T(1);
		}
    }
    close(REACTION);

	Trace("Reading main reaction file.") if T(2);
    while (defined($_ = <RMAIN>))
    {
		if ($_ =~ /^(R\d+):\s+(\S.*\S)\s(\<?=\>?)\s(\S.*\S)/)
		{
			$react = $1;
			$sub   = $2;
			$dir   = $3;
			$prod  = $4;
	
			if (exists($reaction{$react}))
			{
			    $subs  = $reaction{$react}->[0];
			    $prods = $reaction{$react}->[1];
			    &mark_main($sub,$subs);
			    &mark_main($prod,$prods);
			    if (($dir eq "<=") || ($dir eq "=>"))
			    {
				print REV "$react\t0\n";
			    }
			    
			    if ($dir eq "<=")
			    {
				$reaction{$react}->[0] = $prods;
				$reaction{$react}->[1] = $subs;
			    }
			}
		}
    }
    close(RMAIN);
    close(REV);

	Trace("Connecting reactions to compounds.") if T(2);
    foreach $react (sort keys(%reaction))
    {
		($subs,$prods) = @{$reaction{$react}};
		foreach $x (@$subs)
		{
			($cid,$n,$main) = @$x;
			print R2C "$react\t0\t$cid\t$n\t$main\n";
		}
	
		foreach $x (@$prods)
		{
			($cid,$n,$main) = @$x;
			print R2C "$react\t1\t$cid\t$n\t$main\n";
		}
    }
    close(R2C);

	$fig->reload_table('all', "reaction_to_compound",
					   "rid varchar(8), setn char(1), cid varchar(8), stoich char(6), main char(1)",
					   { index_reaction_to_compound_rid => "rid",
						 index_reaction_to_compound_cid => "cid" },
					   "$FIG_Config::temp/reaction_to_compound.table"); 
    unlink("$FIG_Config::temp/reaction_to_compound.table");

	$fig->reload_table('all', "reversible",
					   "rid varchar(8) UNIQUE NOT NULL, reversible char(1), primary key(rid)",
					   { }, "$FIG_Config::temp/rev.table");
    unlink("$FIG_Config::temp/rev.table");
	Trace("Reactions processed.") if T(2);
}

sub mark_main {
    my($main,$set) = @_;
    my($cid,$i);

    foreach $cid (split(/\s+\+\s+/,$main))
    {
		for ($i=0; ($i < @$set) && ($set->[$i]->[0] ne $cid); $i++) {}
		if ($i == @$set)
		{
			Confess("Cannot handle $cid in $_\n" . Dumper($set));
		}
		else
		{
			$set->[$i]->[2] = 1;
		}
    }
}

sub load_catalyzes {

    my($entry);
    Open(\*REAC, "<$FIG_Config::data/KEGG/reaction");
    Open(\*REAC2ENZ, ">$FIG_Config::temp/reaction_to_enzyme.table");

	Trace("Reading KEGG reaction file.") if T(2);
    my($rid,$ec,@ecs,$ecs);
    $/ = "\n///\n";
    while (defined($entry = <REAC>))
    {
		if ($entry =~ /ENTRY\s+(R\d+).*\nENZYME\s+(\S[^a-zA-Z\/]+)/s)
		{
			$rid = $1;
			$ecs = $2;
			print "$ecs\n";
			foreach $ec (split(/\s+/,$ecs))
			{
			    print REAC2ENZ "$rid\t$ec\n";
			}
		}
    }
    $/ = "\n";
    close(REAC);
    close(REAC2ENZ);

	$fig->reload_table('all', "reaction_to_enzyme",
						"rid varchar(8), role varchar(100)",
						{ index_reaction_to_enzyme_rid => "rid",
						  index_reaction_to_enzyme_role => "role" },
						"$FIG_Config::temp/reaction_to_enzyme.table");
    unlink("$FIG_Config::temp/reaction_to_enzyme.table");
	Trace("Enzyme reactions loaded.") if T(2);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3