[Bio] / FigKernelScripts / bielefeld_tarfile_to_seed.pl Repository:
ViewVC logotype

View of /FigKernelScripts/bielefeld_tarfile_to_seed.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (download) (as text) (annotate)
Sun Sep 10 15:08:01 2006 UTC (13 years, 2 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.6: +1 -1 lines
Tweaked usage msg. -- /gdp

# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

use FIG;
$fig = new FIG;

use File::Path;

$usage = "bielefeld_tarfile_to_seed.pl [-update] [-discard] input_tarfile output_taxdir";

$trouble = 0;
while ($ARGV[0] =~ m/^-/)
{
    if    ($ARGV[0] =~ m/-update/)
    { 
	$update  = shift @ARGV;
    }
    elsif ($ARGV[0] =~ m/-discard/)
    {
	$discard = shift @ARGV;
    }
    else
    { 
	warn "\tInvalid argument $ARGV[0]\n"; 
	$trouble = shift @ARGV; 
    }
}

if ($trouble)  { die "\nThere were invalid arguments\n\n\tusage = $usage\n\n"; }

(($tarfile = shift) && (-s $tarfile)) 
    || die "Input tarfile $tarfile does not exist.\n\n\tusage: $usage\n\n";
print STDERR "Reading data from $tarfile\n";

($output_dir = shift @ARGV) || die "no Output-Dir given\n\usage:$usage\n\n";

if ($output_dir  =~ m/(\d+\.\d+)$/)
{
    $tax_id = $1;
    $org    = quotemeta($tax_id);
}
else
{
    die "Output-dir $output_dir must end in a valid format Taxon-ID, e.g. \"83333.1\"";
}

($base_dir = `pwd`) || die "Could not get current working directory";
chomp $base_dir;
print STDERR "\nSetting base_dir = $base_dir\n";

$dir = "$base_dir/$output_dir/RAW_DATA";
(-d $dir) || mkpath($dir, 1, 0777) || die "could not create $dir";
# chdir($dir) || die "Could not change working-directory to $dir"; 

if ($tarfile !~ m{^\/})
{
    if (-s "$base_dir/$tarfile")
    {
	$tarfile = "$base_dir/$tarfile";
    }
    else
    {
	die "Could not locate $base_dir/$tarfile";
    }
}
else
{
    print STDERR "Reading data from $tarfile\n";
}

if ($tarfile =~ m/gz$/) {
    &FIG::run("tar xpzvf $tarfile -C $dir");
} else {
    &FIG::run("tar xpvf  $tarfile -C $dir");
}

chdir("$base_dir/$output_dir") || "Could not change working-directory to $output_dir";
&FIG::run("cat RAW_DATA/*\.fas | reformat_contigs > contigs");
open(TMP, "<contigs") || die "could not read-open contigs";
while (($id, $seqP) = &read_fasta_record(\*TMP)) { $len{$id} = length($$seqP); }
close(TMP) || die "could not close contigs";

if ($num_contigs = (scalar keys %len))
{
    print STDERR "Read $num_contigs contigs\n";
}
else
{
    die "ERROR: no contigs found in contigs file\n";
}


$dir = "Features/peg";
(-d $dir) || mkpath($dir, 1, 0777) || die "could not create $dir";
open(CDS_TBL, ">$dir/tbl")         || die "could not open $dir/tbl";

$dir = "Features/rna";
(-d $dir) || mkpath($dir, 1, 0777) || die "could not create $dir";
open(RNA_TBL, ">$dir/tbl")         || die "could not open $dir/tbl";

$peg_num = 0;
if ($update && (-d ($dir = "$FIG_Config::organisms/$tax_id/Features/peg")))
{
    open(TMP, "<$dir/tbl") || die "Could not open $dir/tbl";
    while (defined($line = <TMP>))
    {
	if ($line =~ m/^fig\|$org\.peg\.(\d+)/) { $peg_num = &FIG::max($peg_num, $1); } else { die "PEG match failed for $line"; }
    }
    close(TMP) || die "Could not close $FIG_Config::organisms/$tax_id/Features/peg/tbl";
}

$rna_num = 0;
if ($update && (-d ($dir = "$FIG_Config::organisms/$tax_id/Features/rna")))
{
    open(TMP, "<$dir/tbl") || die "Could not open $dir/tbl";
    while (defined($line = <TMP>))
    {
	if ($line =~ m/^fig\|\d+\.\d+\.rna\.(\d+)/) { $rna_num = &FIG::max($rna_num, $1); } else { die "RNA match failed for $line"; }
    }
    close(TMP) || die "Could not close $FIG_Config::organisms/$tax_id/Features/rna/tbl";
}

opendir(RAW_DATA, "RAW_DATA")           || die "Could not opendir $dir/RAW_DATA";
(@gff_files = grep /_genes.gff$/, readdir(RAW_DATA)) || die "Could not readdir GFF files";
foreach $file (@gff_files)
{
    print STDERR "\nParsing RAW_DATA/$file ...\n";
    open(GFF, "<RAW_DATA/$file") || die "Could not read-open RAW_DATA/$file";
    while (defined($line = <GFF>))
    {
	chomp $line;
	
	next if ($line =~ m/^\#/);
	next if ($line !~ m/\S/);
	
	($contig, undef, $type, $left, $right, undef, $strand, undef, $comment) = split /\t/, $line;
	
	next if ($discard && ($comment =~ m/^discarded/));
	
	if ($left <= 0)
	{
	    ++$bad;
	    while ($left <= 0)  { $left += 3; }    #...Hack
	}
	
	if ($right > $len{$contig})
	{
	    ++$bad;
	    while ($right > $len{$contig})  { $right -= 3; }    #...Hack
	}
	
	if    ($strand eq '+')
	{
	    ($beg, $end) = ($left, $right);
	}
	elsif ($strand eq '-')
	{
	    ($beg, $end) = ($right, $left);
	}
	else
	{
	    die "Invalid strand \'$strand\' in line $.: $line\n"; 
	}
	
	if ($type =~ m/CDS/)
	{
	    ++$peg_num;
	    print CDS_TBL "fig|$tax_id.peg.$peg_num\t$contig\_$beg\_$end\t\n";
	}
	elsif ($type =~ m/RNA/)
	{
	    ++$rna_num;
	    $comment =~ m/^([^\;]+)/;
	    print RNA_TBL "fig|$tax_id.rna.$rna_num\t$contig\_$beg\_$end\t$1\n";
	    # die "$line\n";
	}
	else
	{
	    print STDERR "Could not handle $line\n\n";
	}
    }
}
print STDERR "bad=$bad\n" if $bad;

close(CDS_TBL) || die "Could not close peg tbl";
close(RNA_TBL) || die "Could not close rna tbl";

$dir = "$base_dir/$output_dir";
if (-s "$dir/Features/rna/tbl") 
{
    print STDERR "\nGetting RNA FASTA sequences ...\n";
    &FIG::run("get_dna $dir/contigs $dir/Features/rna/tbl > $dir/Features/rna/fasta");
}
else
{
    warn "\'$output_dir/Features/rna/tbl\' is empty --- deleting Features/rna/\n";
#    system("rm -R $output_dir/Features/rna") && die("could not delete empty Features/rna"); 
}

if (-s "$dir/Features/peg/tbl") 
{
    print STDERR "\nGetting PEG FASTA sequences ...\n";
    &FIG::run("get_fasta_for_tbl_entries $dir/contigs < $dir/Features/peg/tbl > $dir/Features/peg/fasta");
}
else
{ 
    die "Features/peg/tbl is empty --- something is wrong";
}
print STDERR "\nSuccessfully completed processing $tarfile\n\n";
exit(0);



sub read_fasta_record
{
    my ($file_handle) = @_;
    my ( $old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record );
    
    if (not defined($file_handle))  { $file_handle = \*STDIN; }
    
    $old_end_of_record = $/;
    $/ = "\n>";
    
    if (defined($fasta_record = <$file_handle>))
    {
	chomp $fasta_record;
	@lines  =  split( /\n/, $fasta_record );
	$head   =  shift @lines;
	$head   =~ s/^>?//;
	$head   =~ m/^(\S+)/;
	$seq_id = $1;
	
	if ($head  =~ m/^\S+\s+(.*)$/)  { $comment = $1; } else { $comment = ""; }
	
	$sequence  =  join( "", @lines );
	
	@parsed_fasta_record = ( $seq_id, \$sequence, $comment );
    }
    else
    {
	@parsed_fasta_record = ();
    }
    
    $/ = $old_end_of_record;
    
    return @parsed_fasta_record;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3