[Bio] / FigKernelScripts / make_kegg_files.pl Repository:
ViewVC logotype

View of /FigKernelScripts/make_kegg_files.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Wed Jan 17 19:23:23 2007 UTC (13 years, 1 month ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.4: +9 -8 lines
Update parsing of NAME field to reflect change in source file. -- /gdp

# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#


use FIG;

use Cwd;
use Carp;
use Data::Dumper;
use File::Path;

$0 =~ m/([^\/]+)$/;
$usage = "$1 [-noget] InDir OutDir";

$noget = "";
if ($ARGV[0] && ($ARGV[0] =~ m/-noget/)) { $noget = shift; }

(($indir = shift @ARGV) && ($outdir = shift @ARGV))
    || die "\n\tusage: $usage\n\n";

if ($noget) {
    (-d    "$indir/KEGG") || die "Input directory $indir/KEGG does not exist";
} else {
    (!-d   "$indir/KEGG") || die "Input directory $indir/KEGG already exists";
    mkpath("$indir/KEGG",  1, 0777) || die "Could not create $indir/KEGG";
}

(!-d   "$outdir/KEGG")    || die "Output directory $outdir/KEGG already exists";
mkpath("$outdir/KEGG", 1, 0777) || die "Could not create $outdir/KEGG";

$base = getcwd();


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# get data from ftp://ftp.genome.ad.jp/pub/kegg/tarfiles/
#     genes.tar.gz, genome.gz, ko.gz
#-----------------------------------------------------------------------
$kegg_url = "ftp://ftp.genome.ad.jp/pub/kegg/tarfiles/";
unless ($noget)
{
    print STDERR "getting data from $kegg_url ...\n";
    chdir("$indir/KEGG") || die "Could not chdir to $indir/KEGG/, return-code $!";
    if (($test = `which wget`) && ($test !~ m/^which: no wget in/))
    {
	&FIG::run("wget --passive-ftp $kegg_url/genome.gz");
	&FIG::run("wget --passive-ftp $kegg_url/genes.tar.gz");
	&FIG::run("wget --passive-ftp $kegg_url/ko.gz");
    }
    elsif (($test = `which curl`) && ($test !~ m/^which: no curl in/))
    {
	&FIG::run("curl --ftp-pasv -f -O $kegg_url/genome.gz");
	&FIG::run("curl --ftp-pasv -f -O $kegg_url/genes.tar.gz");
	&FIG::run("curl --ftp-pasv -f -O $kegg_url/ko.gz");
    }
    else
    {
	die "Sorry, could not find either \`wget\` or \`curl\` in path $ENV{PATH}";
    }
    chdir($base)  || die "Could not chdir to $base: $!";
}

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# ... Extract genome information ...
#-----------------------------------------------------------------------
print STDERR "Extracting genome-name information ...\n";

if (-s "$indir/KEGG/genome.gz")
{
    open(GENOME, "gunzip -c $indir/KEGG/genome |") || die "could not read-open $indir/$genome";
}
elsif (-s "$indir/KEGG/genome")
{
    open(GENOME, "<$indir/KEGG/genome") || die "could not read-open $indir/$genome";
}
else
{
    die "Could not find either $indir/KEGG/genome.gz or $indir/KEGG/genome";
}

$/ = "\n///\n";
while (defined($record = <GENOME>))
{
    chomp $record;
    %parse = map { m/^(\S+)\s+(.*)$/so; $1 => $2 } split( /\n\b/, $record );
    
    if (defined($parse{ENTRY}) && defined($parse{NAME}) && defined($parse{DEFINITION}))
    {
	$abbrev      =  $parse{ENTRY};
	$parse{NAME} =~ m/^([^,\s]+)/;
	$short_name  =  $1;
	$full_name   =  $parse{DEFINITION};
	$full_name   =~ s/[\s\n]+/ /gso;
	
	$abbrev_of{$short_name}    = $abbrev;
	$full_name_of{$short_name} = $full_name;
    }
    else
    {
	die "Could not parse record $.:\n$record";
    }
}
close(GENOME);
$/ = "\n";


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# ... Extract sequence information ...
#-----------------------------------------------------------------------
print STDERR "Extracting sequence information ...\n";

if (-s "$indir/KEGG/genes.tar.gz")
{
    if ($ENV{VERBOSE}) {
	open(GENES, "tar -xzvf $indir/KEGG/genes.tar.gz -O |")  || die "could not pipe-open $indir/KEGG/genes.tar.gz";
    } else {
	open(GENES, "tar -xzf  $indir/KEGG/genes.tar.gz -O |")  || die "could not pipe-open $indir/KEGG/genes.tar.gz";
    }
}
elsif (-s "$indir/KEGG/genes.tar") {
    if ($ENV{VERBOSE}) {
	open(GENES,  "tar -xvf $indir/KEGG/genes.tar -O |")  || die "could not pipe-open $indir/KEGG/genes.tar";
    } else {
	open(GENES,  "tar -xf  $indir/KEGG/genes.tar -O |")  || die "could not pipe-open $indir/KEGG/genes.tar";
    }
}
else
{
    die "Could not find either '$indir/KEGG/genes.tar.gz' or '$indir/KEGG/genes.tar.gz'";
}

open(FASTA, ">$outdir/KEGG/fasta")
    || die "could not write-open $outdir/KEGG/fasta";
open(FUNC,  ">$outdir/KEGG/assigned_functions")
    || die "could not write-open $outdir/KEGG/assigned_functions";
open(ORGS,  ">$outdir/KEGG/org.table")
    || die "could not write-open $outdir/KEGG/org.table";

$/ = "\n///\n";
while (defined($record = <GENES>))
{
    chomp $record;
    %parse = map { m/^(\S+)\s+(.*)$/so; $1 => $2 } split( /\n\b/, $record );
    
    $gene_id = $short_name = "";
    if ($parse{ENTRY} =~ m/^(\S+)\s+CDS\s+(.*)$/o) {
	($gene_id, $short_name) = ($1, $2);
	die "For record $., org $short_name was not in 'genomes' file; record:$record\n\t" 
	    unless defined($abbrev_of{$short_name});
    } else {
	print STDERR "Skipping non-CDS ENTRY field: $parse{ENTRY}\n" if (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1));
	next;
    }
    
    
    $gene_name = "";
    if (defined($parse{NAME}))
    {
	if ($parse{NAME} =~ m/^(.*)$/so) {
	    $gene_name =  $1;
	    $gene_name =~ s/[\s\n]+/ /gso;
	} else {
	    die "For record $., org $short_name, could not parse NAME field $parse{NAME} in record:$record\n\t";
	}
    }

    $func = "";
    if (defined($parse{DEFINITION}))
    {
	if ($parse{DEFINITION} =~ m/^(.*)$/mso) {
	    $func = $1;
	    $func =~ s/\s+$//so;
	    $func =~ s/\s+/ /gso;
	} else {
	    die "For record $., org $short_name, could not parse DEFINITION field $parse{DEFINITION} in record:$record\n\t";
	}
    }
    
    $cluster = $clustfunc = "";
    if (defined($parse{ORTHOLOG}))
    {
	if ($parse{ORTHOLOG} =~ m/^KO:\s+(\S+)\s+(.*)$/so) {
	    ($cluster, $clustfunc) = ($1, $2);
	} else {
	    die "For record $., org $short_name, could not parse ORTHOLOG field $parse{ORTHOLOG} in record:$record\n\t";
	}
    }
	    
    if ($parse{AASEQ} =~ m/^\d+\s+(.*)$/so)
    {
	$seq =  $1;
	$seq =~ s/[\s\n]//gso;
	$seq =~ s/[^ARNDCQEGHILKMFPSTWYUVBZX]/x/igo;   #...Change invalid chars to 'x's.
    }
    else
    {
	print STDERR "For record $., org $short_name, skipping apparent pseudogene, name='$parse{NAME}', func='$parse{DEFINITION}' (no AASEQ); record:\n$record\n";
	next;
    }
    
    $kegg_id = "kegg|$abbrev_of{$short_name}:$gene_id";
    &FIG::display_id_and_seq($kegg_id, \$seq, \*FASTA);

    print FUNC "$kegg_id\t$func\n";
    
    print ORGS "$kegg_id\t$full_name_of{$short_name}\n";
}
close(GENES) || die "Could not close pipe from genes.tar or genes.tar.gz";

exit(0);

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3