[Bio] / StandaloneTools / run-glimmer.pl Repository:
ViewVC logotype

View of /StandaloneTools/run-glimmer.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Sat Dec 24 22:32:44 2005 UTC (13 years, 10 months ago) by overbeek
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +98 -72 lines
Added capability to train from an external set of calls and contigs. -- /gdp

#!/usr/bin/perl -w

$ENV{PATH} = "$ENV{HOME}/glimmer2.13:$ENV{PATH}";

$usage = "$0 Org-ID contigs [-train=training_tbl[,training_contigs]] [-skip]";

$train = $training_tbl = $training_contigs = $skip = "";

$trouble = 0;
if (@ARGV >= 2) 
{
    if ($ARGV[0] && (!-f $ARGV[0])) {
	$org = $ARGV[0];
	print STDERR "Org-ID is $org\n";
    } else {
	$trouble = 1;
	warn "Bad Org-ID $ARGV[0]";
    }
    
    if ($ARGV[1] && (-s $ARGV[1])) {
	$contigs = $ARGV[1];
	print STDERR "Contigs file is $contigs\n";
    } else {
	$trouble = 1;
	warn "Missing contigs file $ARGV[1]";
    }
}
else
{
    die "usage: $usage";
}

$i = 2;
while ($i < @ARGV)
{
    if ($ARGV[$i] =~ m/-train=(\S+)/)
    {
	++$i;
	$train =  $1;
	if ($train =~ m/^([^,]+)/)
	{
	    $training_tbl = $1;
	    if (-s $training_tbl)
	    {
		print STDERR "Training set will be taken from $training_tbl\n";
	    }
	    else
	    {
		$trouble = 1;
		warn "Training tbl $training_tbl does not exist";
	    }
	    
	    if ($train =~ m/,([^,]+)$/)
	    {
		$training_contigs = $1;
		if (-s $training_contigs)
		{
		    print STDERR "Training set will be extracted from $training_contigs\n";
		}
		else
		{
		    $trouble = 1;
		    warn "Training contigs $training_contigs do not exist";
		}
	    }
	}
	else
	{
	    $trouble = 1;
	    warn "Training argument $train is invalid";
	}
    }
    elsif ($ARGV[$i] =~ m/-skip/)
    {
	$skip = $ARGV[$i];
	print STDERR "Skip option is set\n";
	++$i;
    }
    else
    {
	die "Unrecognized argument $ARGV[$i]";
    }
}
die "\n\tusage: $usage\n\n" if $trouble;

if (not $training_contigs)
{
    $training_contigs = $contigs;
}


$orf_num = 0;
if ($train)
{
    @tbl = `cat $training_tbl`;
    foreach $entry (@tbl)
    {
	($fid, $locus) = split /\t/, $entry;
	$fid =~ m/\.(\d+)$/;
	$orf_num = ($1 > $orf_num) ? $1 : $orf_num;

	$locus =~ m/^(\S+)_(\d+)_(\d+)$/;
	($contig_id, $beg, $end) = ($1, $2, $3);
	
	if ($skip) { $skip{$contig_id} = 1; }
	
	if (not defined($tbl{$contig_id})) { $tbl{$contig_id} = []; }
	$x = $tbl{$contig_id};
	push @$x, [$beg, $end];
    }
    
    $n = 0;
    open(CONTIGS, "<$training_contigs") 
	|| die "Could not read-open $training_contigs";
    open(TRAIN,   ">tmp$$.train")
	|| die "Could not write-open tmp$$.train";
    
    while (($contig_id, $seqP) = &read_fasta_record(\*CONTIGS))
    {
	$len_of{$contig_id} = length($$seqP);
	
	$x = $tbl{$contig_id};
	foreach $y (@$x)
	{
	    ($beg, $end) = @$y;
	    
	    if ($beg < $end)
	    {
		$dna = substr($$seqP,$beg-1,($end+1-$beg));
	    }
	    else
	    {
		$dna = substr($$seqP,$end-1,($beg+1-$end));
		$dna = $ { &rev_comp(\$dna) };
	    }
	    $dna = lc($dna);
	    
	    ++$n;
	    $_ = "T$n";
	    print TRAIN  $_ . (" " x (11-length($_))) . $dna . "\n";
	}
    }
    close(CONTIGS) || die "Could not close $contigs";
}
else
{
    print STDERR "\nFinding training ORFs using default procedure\n";
    if (-s "tmp$$.train") {
	system("rm -f tmp$$.train") 
	    && die "Could not remove tmp$$.train";
    }
    
    open(CONTIGS, "<$training_contigs")
	|| die "could not read-open $training_contigs";
    
    while (($contig_id, $seqP) = &read_fasta_record(\*CONTIGS))
    {
	$len_of{$contig_id} = length($$seqP);
	
	open( TMP, ">tmp$$.contig") || die "Could not write-open tmp$$.contig";
	&display_id_and_seq($contig_id, $seqP, \*TMP);
	close(TMP) || die "Could not close tmp$$.contig";
	
	print STDERR "\nScanning contig $contig_id\n";
	system("long-orfs tmp$$.contig -l | get-putative > tmp$$.coord") 
	    && die "Could not extract training ORFs from contig $contig_id";
	system("extract tmp$$.contig tmp$$.coord >> tmp$$.train") 
	    && die "Could not extract training sequences from contig $contig_id";
    }
    close(CONTIGS) || die "Could not close $contigs";
    
    if (-s "tmp$$.coord")  { unlink("tmp$$.coord")  || warn "Could not remove tmp$$.coord";  }
    if (-s "tmp$$.contig") { unlink("tmp$$.contig") || warn "Could not remove tmp$$.contig"; }
}

if (($_ =  `wc tmp$$.train`) && ($_ =~ m/^\s+(\d+)/))
{
    print STDERR "\nExtracted $1 training sequences\n\n";
} 
else 
{ 
    die "\nCould not extract any training sequences";
}

print STDERR "Building interpolated context model in tmp$$.model\n";

if (-s "tmp$$.model")
{
    system("rm -f tmp$$.model") && die "Could not remove tmp$$.model";
}

system("build-icm <tmp$$.train >tmp$$.model") 
    && die "Failure at build-icm <tmp$$.train >tmp$$.model";

#open(TBL,     ">tbl")      || die "Could not write-open tbl";
open(CONTIGS, "<$contigs") || die "Could not read-open $contigs";
while (($contig_id, $seqP) = &read_fasta_record(\*CONTIGS))
{
    if ($skip{$contig_id})
    {
	print STDERR "Skipping $contig_id\n" if $ENV{VERBOSE};
	next;
    }
    
    $len_of{$contig_id} = length($$seqP);
    
    open( TMP, ">tmp$$.contig") || die "Could not write-open tmp$$.contig";
    &display_id_and_seq($contig_id, $seqP, \*TMP);
    close(TMP) || die "Could not close tmp$$.contig";
    
    print STDERR "\nFinding ORFs on contig $contig_id, len=", length($$seqP), "\n";
    if (@calls = `glimmer2 tmp$$.contig tmp$$.model -l | get-putative`)
    {
	foreach $call (@calls)
	{
	    $call =~ m/^\s+\d+\s+(\d+)\s+(\d+)/;
	    
	    if ($1 < $2) { 
		($beg, $end) = ($1, &min(($2+3), $len_of{$contig_id}));
	    } else {
		($beg, $end) = ($1, &max(($2-3), 1));
	    }
	    
	    ++$orf_num;
	    print STDOUT "fig|$org.peg.$orf_num\t$contig_id\_$beg\_$end\t\n"
	}
    }
    else
    {
	warn "glimmer2 found no ORFs on contig $contig_id";
    }
}
close(CONTIGS) || die "Could not close $contigs";

if (-s "tmp$$.contig") { unlink("tmp$$.contig") || warn "Could not remove tmp$$.contig"; }
if (-s "tmp$$.train")  { unlink("tmp$$.train")  || warn "Could not remove tmp$$.train";  }
if (-s "tmp$$.model")  { unlink("tmp$$.model")  || warn "Could not remove tmp$$.model";  }

exit 0;



sub read_fasta_record
{
    my ($file_handle) = @_;
    my ( $old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record );

    if (not defined($file_handle))  { $file_handle = \*STDIN; }

    $old_end_of_record = $/;
    $/ = "\n>";

    if (defined($fasta_record = <$file_handle>))
    {
        chomp $fasta_record;
        @lines  =  split( /\n/, $fasta_record );
        $head   =  shift @lines;
        $head   =~ s/^>?//;
        $head   =~ m/^(\S+)/;
        $seq_id = $1;

        if ($head  =~ m/^\S+\s+(.*)$/)  { $comment = $1; } else { $comment = ""; }

        $sequence  =  join( "", @lines );

        @parsed_fasta_record = ( $seq_id, \$sequence, $comment );
    }
    else
    {
        @parsed_fasta_record = ();
    }

    $/ = $old_end_of_record;

    return @parsed_fasta_record;
}

sub display_id_and_seq 
{
    my( $id, $seq, $fh ) = @_;

    if (! defined($fh) )  { $fh = \*STDOUT; }

    print $fh ">$id\n";
    &display_seq($seq, $fh);
}

sub display_seq
{
    my ( $seq, $fh ) = @_;
    my ( $i, $n, $ln );

    if (! defined($fh) )  { $fh = \*STDOUT; }

    $n = length($$seq);
#   confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
    for ($i=0; ($i < $n); $i += 60)
    {
        if (($i + 60) <= $n)
        {
            $ln = substr($$seq,$i,60);
        }
        else
        {
            $ln = substr($$seq,$i,($n-$i));
        }
        print $fh "$ln\n";
    }
}

sub rev_comp 
{
    my( $seqP ) = @_;
    my( $rev  );

    $rev =  reverse( $$seqP );
    $rev =~ tr/a-z/A-Z/;
    $rev =~ tr/ACGTUMRWSYKBDHV/TGCAAKYWSRMVHDB/;
    return \$rev;
}

sub min { my ($x, $y) = @_; return (($x < $y) ? $x : $y); }

sub max { my ($x, $y) = @_; return (($x > $y) ? $x : $y); }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3