[Bio] / FigKernelScripts / bielefeld_tarfile_to_seed.pl Repository:
ViewVC logotype

View of /FigKernelScripts/bielefeld_tarfile_to_seed.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Wed Sep 28 21:45:36 2005 UTC (14 years, 8 months ago) by overbeek
Branch: MAIN
CVS Tags: caBIG-dataload-0
Processes a Bielefeld gene-call tarfile to produce a SEED skeleton directory. -- /gdp

# -*- perl -*-

use FIG;
$fig = new FIG;

use File::Path;

$usage = "bielefeld_gff_to_seed.pl [-update] input_tarfile output_dir";

if ($ARGV[0] =~ m/-update/) { $update = shift; }

(($tarfile = shift) && (-s $tarfile)) 
    || die "Input tarfile $tarfile does not exist.\n\n\tusage: $usage\n\n";

($output_dir = shift @ARGV) || die "no Output-Dir given\n\usage:$usage\n\n";

if ($output_dir  =~ m/(\d+\.\d+)$/)
{
    $tax_id = $1;
    $org    = quotemeta($tax_id);
}
else
{
    die "Output-dir $output_dir must end in a valid format Taxon-ID, e.g. \"83333.1\"";
}

($base_dir = `pwd`) || die "Could not get current working directory";
chomp $base_dir;
print STDERR "\nSetting base_dir = $base_dir\n";

$dir = "$base_dir/$output_dir/RAW_DATA";
(-d $dir) || mkpath($dir, 1, 0777) || die "could not create $dir";
chdir($dir) || die "Could not change working-directory to $dir"; 

&FIG::run("tar xpzvf $tarfile");

chdir("$base_dir/$output_dir") || "Could not change working-directory to $output_dir";
&FIG::run("cat RAW_DATA/*_contig.fas | reformat_contigs > contigs");

open(TMP, "<contigs") || die "could not read-open contigs";
while (($id, $seqP) = &read_fasta_record(\*TMP)) { $len{$id} = length($$seqP); }
close(TMP) || die "could not close contigs";

$dir = "Features/peg";
(-d $dir) || mkpath($dir, 1, 0777) || die "could not create $dir";
open(CDS_TBL, ">$dir/tbl")         || die "could not open $dir/tbl";

$dir = "Features/rna";
(-d $dir) || mkpath($dir, 1, 0777) || die "could not create $dir";
open(RNA_TBL, ">$dir/tbl")         || die "could not open $dir/tbl";

$peg_num = 0;
if ($update && (-d ($dir = "$FIG_Config::organisms/$tax_id/Features/peg")))
{
    open(TMP, "<$dir/tbl") || die "Could not open $dir/tbl";
    while (defined($line = <TMP>))
    {
	if ($line =~ m/^fig\|$org\.peg\.(\d+)/) { $peg_num = &FIG::max($peg_num, $1); } else { die "PEG match failed for $line"; }
    }
    close(TMP) || die "Could not close $FIG_Config::organisms/$tax_id/Features/peg/tbl";
}

$rna_num = 0;
if ($update && (-d ($dir = "$FIG_Config::organisms/$tax_id/Features/rna")))
{
    open(TMP, "<$dir/tbl") || die "Could not open $dir/tbl";
    while (defined($line = <TMP>))
    {
	if ($line =~ m/^fig\|\d+\.\d+\.rna\.(\d+)/) { $rna_num = &FIG::max($rna_num, $1); } else { die "RNA match failed for $line"; }
    }
    close(TMP) || die "Could not close $FIG_Config::organisms/$tax_id/Features/rna/tbl";
}

opendir(RAW_DATA, "RAW_DATA")           || die "Could not opendir $dir/RAW_DATA";
(@gff_files = grep /_genes.gff$/, readdir(RAW_DATA)) || die "Could not readdir GFF files";
foreach $file (@gff_files)
{
    print STDERR "\nParsing RAW_DATA/$file ...\n";
    open(GFF, "<RAW_DATA/$file") || die "Could not read-open RAW_DATA/$file";
    while (defined($line = <GFF>))
    {
	chomp $line;
	
	next if ($line =~ m/^\#/);
	next if ($line !~ m/\S/);
	
	($contig, undef, $type, $left, $right, undef, $strand, undef, $comment) = split /\t/, $line;
	
	if ($left <= 0)
	{
	    ++$bad;
	    while ($left <= 0)  { $left += 3; }    #...Hack
	}
	
	if ($right > $len{$contig})
	{
	    ++$bad;
	    while ($right > $len{$contig})  { $right -= 3; }    #...Hack
	}
	
	if    ($strand eq '+')
	{
	    ($beg, $end) = ($left, $right);
	}
	elsif ($strand eq '-')
	{
	    ($beg, $end) = ($right, $left);
	}
	else
	{
	    die "Invalid strand \'$strand\' in line $.: $line\n"; 
	}
	
	if ($type =~ m/CDS/)
	{
	    ++$peg_num;
	    print CDS_TBL "fig|$tax_id.peg.$peg_num\t$contig\_$beg\_$end\t\n";
	}
	elsif ($type =~ m/RNA/)
	{
	    ++$rna_num;
	    $comment =~ m/^([^\;]+)/;
	    print RNA_TBL "fig|$tax_id.rna.$rna_num\t$contig\_$beg\_$end\t$1\n";
	    # die "$line\n";
	}
	else
	{
	    print STDERR "Could not handle $line\n\n";
	}
    }
}
print STDERR "bad=$bad\n" if $bad;

$dir = "$base_dir/$output_dir";
if (-s "$dir/Features/rna/tbl") 
{
    print STDERR "\nGetting RNA FASTA sequences ...\n";
    &FIG::run("get_dna $dir/contigs $dir/Features/rna/tbl > $dir/Features/rna/fasta");
}
else
{
    warn "\'$output_dir/Features/rna/tbl\' is empty --- deleting Features/rna/\n";
#    system("rm -R $output_dir/Features/rna") && die("could not delete empty Features/rna"); 
}

if (-s "$dir/Features/peg/tbl") 
{
    print STDERR "\nGetting PEG FASTA sequences ...\n";
    &FIG::run("get_fasta_for_tbl_entries $dir/contigs < $dir/Features/peg/tbl > $dir/Features/peg/fasta");
}
else
{ 
    die "Features/peg/tbl is empty --- something is wrong";
}
print STDERR "\nSuccessfully completed processing $tarfile\n\n";
exit(0);



sub read_fasta_record
{
    my ($file_handle) = @_;
    my ( $old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record );
    
    if (not defined($file_handle))  { $file_handle = \*STDIN; }
    
    $old_end_of_record = $/;
    $/ = "\n>";
    
    if (defined($fasta_record = <$file_handle>))
    {
	chomp $fasta_record;
	@lines  =  split( /\n/, $fasta_record );
	$head   =  shift @lines;
	$head   =~ s/^>?//;
	$head   =~ m/^(\S+)/;
	$seq_id = $1;
	
	if ($head  =~ m/^\S+\s+(.*)$/)  { $comment = $1; } else { $comment = ""; }
	
	$sequence  =  join( "", @lines );
	
	@parsed_fasta_record = ( $seq_id, \$sequence, $comment );
    }
    else
    {
	@parsed_fasta_record = ();
    }
    
    $/ = $old_end_of_record;
    
    return @parsed_fasta_record;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3