[Bio] / StandaloneTools / bielefeld_gff_to_seed.pl Repository:
ViewVC logotype

View of /StandaloneTools/bielefeld_gff_to_seed.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Wed Oct 26 17:03:36 2005 UTC (13 years, 11 months ago) by overbeek
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +96 -17 lines
Fixed RNA size bug; improved path handling. -- /gdp

#!/usr/bin/env /Volumes/raid/FIGdisk.v3/env/mac/bin/perl

BEGIN {
    @INC = qw(
              /Volumes/raid/FIGdisk.v3/dist/releases/dev/mac/lib
              /Volumes/raid/FIGdisk.v3/dist/releases/dev/mac/lib/FigKernelPackages
              /Volumes/raid/FIGdisk.v3/dist/dev/mac/lib
              /Volumes/raid/FIGdisk.v3/dist/dev/mac/lib/FigKernelPackages
              /Volumes/raid/FIGdisk.v3/env/mac/lib/perl5/5.8.7/darwin-2level
              /Volumes/raid/FIGdisk.v3/env/mac/lib/perl5/5.8.7
              /Volumes/raid/FIGdisk.v3/env/mac/lib/perl5/site_perl/5.8.7/darwin-2level
              /Volumes/raid/FIGdisk.v3/env/mac/lib/perl5/site_perl/5.8.7
              /Volumes/raid/FIGdisk.v3/env/mac/lib/perl5/site_perl
              .
              /Volumes/raid/FIGdisk.v3/config
 
);
}
use Data::Dumper;
use Carp;
use FIG_Config;
$ENV{'BLASTMAT'} = "/Volumes/raid/FIGdisk.v3/BLASTMAT";
$ENV{'FIG_HOME'} = "/Volumes/raid/FIGdisk.v3";
# end of tool_hdr
########################################################################
# -*- perl -*-

use FIG;

use File::Path;

$usage = "bielefeld_gff_to_seed.pl [-update] [-discard] Taxon_ID contigs_file < gff_file";

$trouble = 0;
while ($ARGV[0] =~ m/^-/)
{
    if    ($ARGV[0] =~ m/-update/)
    { 
	$update  = shift @ARGV;
    }
    elsif ($ARGV[0] =~ m/-discard/)
    {
	$discard = shift @ARGV;
    }
    else
    { 
	warn "\tInvalid argument $ARGV[0]\n"; 
	$trouble = shift @ARGV; 
    }
}
if ($trouble)  { die "\nThere were invalid arguments\n\n\tusage = $usage\n\n"; }

($base_dir = shift @ARGV) || die "no taxon-ID given\n\usage:$usage\n\n";
$base_dir  =~ m/(\d+\.\d+)$/; 
$tax_id = $1;
$org    = quotemeta($tax_id);

($contigs = shift @ARGV) || die "no contigs-file given\n\usage:$usage\n\n";

open(TMP, "<$contigs") || die "could not read-open $contigs";
while (($id, $seqP) = &read_fasta_record(\*TMP)) { $len{$id} = length($$seqP); }
close(TMP) || die "could not close $contigs";

$dir = "$base_dir/Features/peg";
(-d $dir) || mkpath($dir, 1, 0777) || die "could not create $dir";
open(CDS_TBL, ">$dir/tbl")         || die "could not open $dir/tbl";

$dir = "$base_dir/Features/rna";
(-d $dir) || mkpath($dir, 1, 0777) || die "could not create $dir";
open(RNA_TBL, ">$dir/tbl")         || die "could not open $dir/tbl";

$peg_num = 0;
if ($update && (-d ($dir = "$FIG_Config::organisms/$tax_id/Features/peg")))
{
    open(TMP, "<$dir/tbl") || die "Could not open $dir/tbl";
    while (defined($line = <TMP>))
    {
	if ($line =~ m/^fig\|$org\.peg\.(\d+)/) { $peg_num = &FIG::max($peg_num, $1); } else { die "PEG match failed for $line"; }
    }
    close(TMP) || die "Could not close $FIG_Config::organisms/$tax_id/Features/peg/tbl";
}

$rna_num = 0;
if ($update && (-d ($dir = "$FIG_Config::organisms/$tax_id/Features/rna")))
{
    open(TMP, "<$dir/tbl") || die "Could not open $dir/tbl";
    while (defined($line = <TMP>))
    {
	if ($line =~ m/^fig\|\d+\.\d+\.rna\.(\d+)/) { $rna_num = &FIG::max($rna_num, $1); } else { die "RNA match failed for $line"; }
    }
    close(TMP) || die "Could not close $FIG_Config::organisms/$tax_id/Features/rna/tbl";
}

while (defined($line = <STDIN>))
{
    chomp $line;
    
    next if ($line =~ m/^\#/);
    next if ($line !~ m/\S/);
    
    ($contig, undef, $type, $left, $right, undef, $strand, undef, $comment) = split /\t/, $line;
	
    next if ($discard && ($comment =~ m/^discarded/));
    
    if ($left <= 0)
    {
	++$bad;
	while ($left <= 0)  { $left += 3; }    #...Hack
    }
    
    if ($right > $len{$contig})
    {
	++$bad;
	while ($right > $len{$contig})  { $right -= 3; }    #...Hack
    }
    
    if    ($strand eq '+')
    {
	($beg, $end) = ($left, $right);
    }
    elsif ($strand eq '-')
    {
	($beg, $end) = ($right, $left);
    }
    else
    {
	die "Invalid strand \'$strand\' in line $.: $line\n"; 
    }
    
    
    if ($type =~ m/CDS/)
    {
	++$peg_num;
	print CDS_TBL "fig|$tax_id.peg.$peg_num\t$contig\_$beg\_$end\t\n";
    }
    elsif ($type =~ m/RNA/)
    {
	++$rna_num;
	$comment =~ m/^([^\;]+)/;
	print RNA_TBL "fig|$tax_id.rna.$rna_num\t$contig\_$beg\_$end\t$1\n";
	# die "$line\n";
    }
    else
    {
	print STDERR "Could not handle $line\n";
    }
}
print STDERR "bad=$bad\n" if $bad;

close(CDS_TBL) || die "Could not close peg tbl";
close(RNA_TBL) || die "Could not close rna tbl";

if (! ($x = -s "$base_dir/Features/rna/tbl")) { 
    warn "\'$base_dir/Features/rna/tbl\' is empty ($x) --- deleting Features/rna/\n";
#    system("rm -R $base_dir/Features/rna") && die("could not delete empty Features/rna"); 
}

if (!-s "$base_dir/Features/peg/tbl") { die "Features/peg/tbl is empty --- something is wrong"; }

sub read_fasta_record
{
    my ($file_handle) = @_;
    my ( $old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record );
    
    if (not defined($file_handle))  { $file_handle = \*STDIN; }
    
    $old_end_of_record = $/;
    $/ = "\n>";
    
    if (defined($fasta_record = <$file_handle>))
    {
	chomp $fasta_record;
	@lines  =  split( /\n/, $fasta_record );
	$head   =  shift @lines;
	$head   =~ s/^>?//;
	$head   =~ m/^(\S+)/;
	$seq_id = $1;
	
	if ($head  =~ m/^\S+\s+(.*)$/)  { $comment = $1; } else { $comment = ""; }
	
	$sequence  =  join( "", @lines );
	
	@parsed_fasta_record = ( $seq_id, \$sequence, $comment );
    }
    else
    {
	@parsed_fasta_record = ();
    }
    
    $/ = $old_end_of_record;
    
    return @parsed_fasta_record;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3