[Bio] / FigKernelScripts / run_glimmer2.pl Repository:
ViewVC logotype

View of /FigKernelScripts/run_glimmer2.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (download) (as text) (annotate)
Sat May 5 21:34:18 2007 UTC (12 years, 9 months ago) by gdpusch
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.6: +91 -87 lines
Added support for multiple genetic codes. -- /gdp

# -*- perl -*-

use strict;
use warnings;

&make_sure(qw(long-orfs get-putative build-icm glimmer2));

$0 =~ m/([^\/]+)$/;
my $usage = "$1 Org-ID contigs [-code=genetic_code] [-train=training_tbl[,training_contigs]] [-skip]";

my ($train, $training_tbl, $training_contigs);
$train = $training_tbl = $training_contigs = "";

my $tmp_train  = "$FIG_Config::temp/tmp$$.train";
my $tmp_contig = "$FIG_Config::temp/tmp$$.contig";
my $tmp_coord  = "$FIG_Config::temp/tmp$$.coord";
my $tmp_model  = "$FIG_Config::temp/tmp$$.model";

my $org;
my $contigs;
my $trouble = 0;
my $skip    = "";

if (@ARGV >= 2) 
{
    if ($ARGV[0] && (!-f $ARGV[0])) {
	$org = $ARGV[0];
	print STDERR "Org-ID is $org\n";
    }
    else {
	$trouble = 1;
	warn "Bad Org-ID $ARGV[0]";
    }
    
    if ($ARGV[1] && (-s $ARGV[1])) {
	$contigs = $ARGV[1];
	print STDERR "Contigs file is $contigs\n";
    }
    else {
	$trouble = 1;
	warn "Missing contigs file $ARGV[1]";
    }
}
else
{
    die "usage: $usage";
}


my $code = 11;
my $glimmer  = "glimmer2";
my $longorfs = "long-orfs";

my $i = 2;
while ($i < @ARGV)
{
    if ($ARGV[$i] =~ m/-code=(\d+)/) {
	if (($1 == 4) || ($1 == 11)) {
	    $code = $1;
	    $glimmer  = "glimmer2.gc$code";
	    $longorfs = "long-orfs.gc$code";
	    &make_sure($longorfs, $glimmer);
	}
	else {
	    die "Sorry, we currently only support Genetic Codes 11 and 4";
	}
    }
    elsif ($ARGV[$i] =~ m/-train=(\S+)/) {
	$train =  $1;
	if ($train =~ m/^([^,]+)/) {
	    $training_tbl = $1;
	    if (-s $training_tbl) {
		print STDERR "Training set will be taken from $training_tbl\n";
	    }
	    else {
		$trouble = 1;
		warn "Training tbl $training_tbl does not exist";
	    }
	    
	    if ($train =~ m/,([^,]+)$/)
	    {
		$training_contigs = $1;
		if (-s $training_contigs) {
		    print STDERR "Training set will be extracted from $training_contigs\n";
		}
		else {
		    $trouble = 1;
		    warn "Training contigs $training_contigs do not exist";
		}
	    }
	}
	else {
	    $trouble = 1;
	    warn "Training argument $train is invalid";
	}
    }
    elsif ($ARGV[$i] =~ m/-skip/) {
	$skip = $ARGV[$i];
	print STDERR "Skip option is set\n";
    }
    else {
	$trouble = 1;
	warn "Unrecognized argument $ARGV[$i]\n";
    }
    
    ++$i;
}
die "\n\tusage: $usage\n\n" if $trouble;

if (not $training_contigs) {
    $training_contigs = $contigs;
}

my %len_of;
my %skip;
my @tbl;
my %tbl;
my ($contig_id, $beg, $end);

my $orf_num = 0;
if ($train) {
    @tbl = `cat $training_tbl`;
    foreach my $entry (@tbl) {
	my ($fid, $locus) = split /\t/, $entry;
	$fid =~ m/\.(\d+)$/;
	$orf_num = ($1 > $orf_num) ? $1 : $orf_num;

	$locus =~ m/^(\S+)_(\d+)_(\d+)$/;
	($contig_id, $beg, $end) = ($1, $2, $3);
	
	if ($skip) {
	    $skip{$contig_id} = 1;
	}
	
	if (not defined($tbl{$contig_id})) { $tbl{$contig_id} = []; }
	my $x = $tbl{$contig_id};
	push @$x, [$beg, $end];
    }
    
    my $n = 0;
    open(CONTIGS, "<$training_contigs") 
	|| die "Could not read-open $training_contigs";
    open(TRAIN,   ">$tmp_train")
	|| die "Could not write-open $tmp_train";
    
    while (my ($contig_id, $seqP) = &read_fasta_record(\*CONTIGS)) {
	$len_of{$contig_id} = length($$seqP);
	
	my $x = $tbl{$contig_id};
	foreach my $y (@$x) {
	    my ($beg, $end) = @$y;
	    
	    my $dna;
	    if ($beg < $end) {
		$dna = substr($$seqP,$beg-1,($end+1-$beg));
	    }
	    else {
		$dna = substr($$seqP,$end-1,($beg+1-$end));
		$dna = $ { &rev_comp(\$dna) };
	    }
	    $dna = lc($dna);
	    
	    ++$n;
	    $_ = "T$n";
	    print TRAIN  $_ . (" " x (11-length($_))) . $dna . "\n";
	}
    }
    close(CONTIGS) || die "Could not close $contigs";
}
else {
    print STDERR "\nFinding training ORFs using default procedure\n";
    if (-s "$tmp_train") {
	system("rm -f $tmp_train") 
	    && die "Could not remove $tmp_train";
    }
    
    open(CONTIGS, "<$training_contigs")
	|| die "could not read-open $training_contigs";
    while (my ($contig_id, $seqP) = &read_fasta_record(\*CONTIGS))
    {
	$len_of{$contig_id} = length($$seqP);
	
	open( TMP, ">$tmp_contig") || die "Could not write-open $tmp_contig";
	&display_id_and_seq($contig_id, $seqP, \*TMP);
	close(TMP) || die "Could not close $tmp_contig";
	
	print STDERR "\nScanning contig $contig_id\n";
	system("$longorfs $tmp_contig -l | get-putative > $tmp_coord") 
	    && die "Could not extract training ORFs from contig $contig_id";
	system("extract $tmp_contig $tmp_coord >> $tmp_train") 
	    && die "Could not extract training sequences from contig $contig_id";
    }
    close(CONTIGS) || die "Could not close $contigs";
    
    if (-s "$tmp_coord")  { unlink("$tmp_coord")  || warn "Could not remove $tmp_coord";  }
    if (-s "$tmp_contig") { unlink("$tmp_contig") || warn "Could not remove $tmp_contig"; }
}

if (($_ =  `wc $tmp_train`) && ($_ =~ m/^\s+(\d+)/)) {
    print STDERR "\nExtracted $1 training sequences\n\n";
} 
else { 
    die "\nCould not extract any training sequences";
}

print STDERR "Building interpolated context model in $tmp_model\n";

if (-s "$tmp_model") {
    system("rm -f $tmp_model") && die "Could not remove $tmp_model";
}

system("build-icm <$tmp_train >$tmp_model") 
    && die "Failure at build-icm <$tmp_train >$tmp_model";

#open(TBL,     ">tbl")      || die "Could not write-open tbl";
open(CONTIGS, "<$contigs") || die "Could not read-open $contigs";
while (my ($contig_id, $seqP) = &read_fasta_record(\*CONTIGS)) {
    if ($skip{$contig_id}) {
	print STDERR "Skipping $contig_id\n" if $ENV{VERBOSE};
	next;
    }
    
    $len_of{$contig_id} = length($$seqP);
    
    open( TMP, ">$tmp_contig") || die "Could not write-open $tmp_contig";
    &display_id_and_seq($contig_id, $seqP, \*TMP);
    close(TMP) || die "Could not close $tmp_contig";
    
    print STDERR "\nFinding ORFs on contig $contig_id, len=", length($$seqP), "\n";
    if (my @calls = `$FIG_Config::ext_bin/$glimmer $tmp_contig $tmp_model -l | get-putative`) {
	foreach my $call (@calls) {
	    if ($call =~ m/^\s*\d+\s+(\d+)\s+(\d+)/o) {
		if ($1 < $2) { 
		    ($beg, $end) = ($1, &min(($2+3), $len_of{$contig_id}));
		} else {
		    ($beg, $end) = ($1, &max(($2-3), 1));
		}
	    }
	    else {
		die "Could not parse GLIMMER2 output line:\n$call";
	    }
	    
	    ++$orf_num;
	    print STDOUT "fig|$org.peg.$orf_num\t$contig_id\_$beg\_$end\t\n"
	}
    }
    else {
	warn "glimmer2 found no ORFs on contig $contig_id";
    }
}
close(CONTIGS) || die "Could not close $contigs";

if (-s "$tmp_contig") { unlink("$tmp_contig") || warn "Could not remove $tmp_contig"; }
if (-s "$tmp_train")  { unlink("$tmp_train")  || warn "Could not remove $tmp_train";  }
if (-s "$tmp_model")  { unlink("$tmp_model")  || warn "Could not remove $tmp_model";  }

exit 0;



sub read_fasta_record {
    my ($file_handle) = @_;
    my ( $old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record );

    if (not defined($file_handle))  { $file_handle = \*STDIN; }

    $old_end_of_record = $/;
    $/ = "\n>";

    if (defined($fasta_record = <$file_handle>)) {
        chomp $fasta_record;
        @lines  =  split( /\n/, $fasta_record );
        $head   =  shift @lines;
        $head   =~ s/^>?//;
        $head   =~ m/^(\S+)/;
        $seq_id = $1;

        if ($head  =~ m/^\S+\s+(.*)$/)  { $comment = $1; } else { $comment = ""; }

        $sequence  =  join( "", @lines );

        @parsed_fasta_record = ( $seq_id, \$sequence, $comment );
    }
    else {
        @parsed_fasta_record = ();
    }

    $/ = $old_end_of_record;

    return @parsed_fasta_record;
}

sub display_id_and_seq {
    my( $id, $seq, $fh ) = @_;

    if (! defined($fh) )  { $fh = \*STDOUT; }

    print $fh ">$id\n";
    &display_seq($seq, $fh);
}

sub display_seq {
    my ( $seq, $fh ) = @_;
    my ( $i, $n, $ln );

    if (! defined($fh) )  { $fh = \*STDOUT; }

    $n = length($$seq);
#   confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
    for ($i=0; ($i < $n); $i += 60) {
        if (($i + 60) <= $n)
        {
            $ln = substr($$seq,$i,60);
        }
        else
        {
            $ln = substr($$seq,$i,($n-$i));
        }
        print $fh "$ln\n";
    }
}

sub rev_comp {
    my( $seqP ) = @_;
    my( $rev  );

    $rev =  reverse( $$seqP );
    $rev =~ tr/a-z/A-Z/;
    $rev =~ tr/ACGTUMRWSYKBDHV/TGCAAKYWSRMVHDB/;
    return \$rev;
}

sub min { my ($x, $y) = @_; return (($x < $y) ? $x : $y); }

sub max { my ($x, $y) = @_; return (($x > $y) ? $x : $y); }

sub make_sure {
    my(@progs) = @_;

    my $prog;
    foreach $prog (@progs)
    {
	my @tmp = `which $prog`;
	if ($tmp[0] =~ /^no $prog/)
	{
	    print STDERR $tmp[0];
	    exit(1);
	}
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3