[Bio] / FigKernelScripts / backfill_gaps.pl Repository:
ViewVC logotype

View of /FigKernelScripts/backfill_gaps.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.21 - (download) (as text) (annotate)
Wed Feb 11 21:44:29 2015 UTC (4 years, 9 months ago) by gdpusch
Branch: MAIN
CVS Tags: HEAD
Changes since 1.20: +10 -1 lines
Accept as few as a single template if it is in a subsystem.

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

use strict;
use warnings;

use Carp;

use FIG;
use FIGV;
my $fig = FIG->new();

use constant LEFT   =>  0;
use constant RIGHT  =>  1;
use constant FID    =>  2;

use constant TRAN   =>  0;
use constant OTHERS =>  1;

$0 =~ m/([^\/]+)$/o;
my $self  = $1;
my $usage = "$self [-h(elp)] [-orgdir=OrgDir] [-genetic_code=Code_Number] close_genomes_file OrgID contigs tbl_1 tbl_2 ... > new_tbl_entries";


my $min_len        = 90;
my $genetic_code   = 11;

my $trouble = 0;
while ($ARGV[0] =~ m/^-/) {
    if ($ARGV[0] =~ m/-h(elp)?/) {
	print STDERR "\n   usage: $usage\n\n";
	exit(0);
    }
    elsif ($ARGV[0] =~ /-orgdir=(\S+)/) {
	warn "$self is using $ARGV[0]\n";
	$fig = new FIGV($1);
    }
    elsif ($ARGV[0] =~ /-genetic_code=(\d+)/) {
	warn "$self is using $ARGV[0]\n";
	$genetic_code = $1;
    }
    else {
	$trouble = 1;
	warn "Invalid argument $ARGV[0]\n";
    }
    
    shift @ARGV;
}

for (my $i=0; $i < @ARGV; ++$i) {
    if ($i == 1) {
	if ($ARGV[$i] !~ m/^\d+\.\d+$/) {
	    $trouble = 1;
	    warn "ERROR: OrgID $ARGV[$i] has an invalid format\n"; 
	}
    }
    elsif (!-e $ARGV[$i]) {
	$trouble = 1;
	warn "ERROR: File $ARGV[$i] does not exist\n";
    }
}
die qq(\n\nAborting due to invalid args\n\nusage: $usage\n\n) if $trouble;


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Look up the translation table for this code for later use.
#-----------------------------------------------------------------------
my $trans_table = &FIG::genetic_code($genetic_code);

my $close_genomes_file;
(($close_genomes_file = shift @ARGV) && (-s $close_genomes_file))
    || die "Close genomes file $close_genomes_file has zero size\n\n\tusage: $usage\n\n";

my $taxon_id;
($taxon_id      = shift @ARGV) || die "No Taxon-ID given\n\n\tusage: $usage\n\n";

my $contigs_file;
(($contigs_file = shift @ARGV) && (-s $contigs_file)) 
    || die "Contigs file $contigs_file has zero size\n\n\tusage: $usage\n\n";

my @tbls;
((@tbls = @ARGV) > 0) || die "No tbl files given\n\tusage: $usage\n\n";

my ($seq_of, $len_of) = &load_contigs($contigs_file);

my ($max_peg_num, $regions, $features) = &load_tbls(@tbls);
# die Dumper($max_peg_num, $regions, $features);
print STDERR "Max PEG-num read = $max_peg_num\n";

my @close_orgs;
(@close_orgs = $fig->file_read($close_genomes_file))
    || die qq(No close genomes read from close_genomes_file \'$close_genomes_file\');
@close_orgs  =  map { m/^(\S+)/o ? ($1) : () } @close_orgs;

my $tmp_db      = qq($FIG_Config::temp/tmp_backfill_gaps_db.$$);
my $tmp_query   = qq($FIG_Config::temp/tmp_backfill_gaps_query.$$);

my $orgs_loaded = 0;
unlink($tmp_db) if (-e $tmp_db);
foreach my $org (@close_orgs) {
    my $fasta_file;
    if (-d $org) {
	$fasta_file = qq($org/Features/peg/fasta);
    }
    elsif ($org =~ m/^\d+\.\d+$/o) {
	if ($fig->is_genome($org)) {
	    $fasta_file = qq($FIG_Config::organisms/$org/Features/peg/fasta);
	}
	else {
	    warn qq(Genome \'$org\' is not installed --- skipping\n);
	    next;
	}
    }
    else {
	warn qq(Cannot handle close_genomes_file entry: \'$org\' --- skipping\n);
	next;
    }
    
    if (-s $fasta_file) {
	++$orgs_loaded;
	&FIG::run(qq(cat $fasta_file >> $tmp_db));
    }
    else {
	warn "Close genome FASTA file $fasta_file has zero size --- skipping\n";
    }
}
print STDERR "Loaded $orgs_loaded genomes\n";


if (!-s $tmp_db) {
    unlink($tmp_db, $tmp_query) unless $ENV{DEBUG};
    die qq(\nAborted due to zero-size or nonexistent reference-sequence file \$tmp_db\'\n\n);
}
else {
    &FIG::run("formatdb -i $tmp_db -p T");
}


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Foreach contig, find gaps between features and BLAST them...
#-----------------------------------------------------------------------
foreach my $contig (keys %$regions)
{
    my $x = $regions->{$contig};
    
    if ($x->[0]  > 1)                  { unshift @$x, [0, 0]; }
    if ($x->[-1] < $len_of->{$contig}) { push    @$x, [1+$len_of->{$contig}, 1+$len_of->{$contig}]; }
    
    print STDERR "Processing ", (scalar @$x), " gaps on contig $contig:\n" if $ENV{VERBOSE};
    for (my $i=1; $i < @$x; ++$i) {
	my $gap_beg = $x->[$i-1]->[RIGHT] + 1;
	my $gap_end = $x->[$i]->[LEFT] - 1;
	my $gap_len = $gap_end - $gap_beg + 1;
	my $gap_loc = "$contig\_$gap_beg\_$gap_end";
	
	if ($gap_len < $min_len) {
	    print STDERR "Skipping short gap $gap_loc ($gap_len char)\n" if $ENV{FIG_VERBOSE};
	    next;
	}
	else {
	    print STDERR "BLASTing gap $gap_loc ($gap_len char)\n"       if $ENV{FIG_VERBOSE};
	}
	
	my $gap_seq = &get_dna($contig, $gap_beg, $gap_end);
	die "Could not get seq for $gap_loc" unless $gap_seq;
	
	open(TMP, ">$tmp_query") || die "Could not write-open $tmp_query";
	&FIG::display_id_and_seq( $gap_loc, \$gap_seq, \*TMP);
	
	my @hits = `blastall -i $tmp_query -d $tmp_db -p blastx -Q$genetic_code -e1.0e-10 -m8 -FF -gF`;
	chomp @hits;
	
	my %candidates;
	my ($new_fid, $new_loc, $new_tran, $new_beg, $new_end, $new_len, $strand);
	if (@hits) {
	    undef %candidates;
	    foreach my $hit (@hits) {
		my ($fid2, $hit_beg, $hit_end, $hit_len, $hit_loc, $bsc);
		(undef, $fid2, undef,undef,undef,undef, $hit_beg,$hit_end, undef,undef, undef,$bsc) = split /\t/, $hit;
		print STDERR "$hit\n$contig, $hit_beg, $hit_end\n";
		
		$hit_beg = $gap_beg + ($hit_beg - 1);
		$hit_end = $gap_beg + ($hit_end - 1);
		$hit_len = abs($hit_end - $hit_beg) + 1;
		
		$hit_loc = "$contig\_$hit_beg\_$hit_end";
		$strand  = ($hit_beg < $hit_end) ? '+' : '-';
		
		if ($hit_len < $min_len/2) {
		    print STDERR "Skipping $hit_loc due to too-short BLAST hit\n\n" if $ENV{FIG_VERBOSE};
		    next;
		}
		
		my $search_region = join( qq(_),
					  ($contig,
					   $fig->max(($gap_beg - 150), 1),
					   $fig->min(($gap_end + 150), $len_of->{$contig})
					   )
					  );
		
		my $tran = $fig->translate( &get_dna($contig, $hit_beg, $hit_end), $trans_table );
		print STDERR "hit = $hit_loc\t$tran\n";
		if ($tran =~ m/\*/o) {
		    print STDERR "Skipping $hit_loc due to embedded STOPs\n\n" if $ENV{FIG_VERBOSE};
		    next;
		}
		
		unless ((($new_loc, $new_tran)
			 = $fig->pick_gene_boundaries($taxon_id, $hit_loc, undef, $trans_table, $search_region))
			&& $new_loc && $new_tran
			) {
		    print STDERR "Could not find gene-boundaries for loc=$hit_loc, tran=$tran --- Skipping\n\n";
		    next;
		}
		
		if ($new_tran =~ m/\*/o) {
		    die "Embedded STOPs in loc=$hit_loc --> new_loc=$new_loc,\n"
			, "\ttran=$tran --> new_tran=$new_tran";
		}
		
		unless (((undef, $new_beg, $new_end) = $fig->boundaries_of($new_loc)) && $new_beg && $new_end) {
		    die "Could not extract boundaries of loc=$hit_loc --> new_loc=$new_loc,"
			, " tran=$tran --> new_tran=$new_tran";
		}
		
		if (my $fid = $features->{"$contig$strand$new_end"}) {
		    print STDERR "Skipping $new_loc since it corresponds to $fid\n\n" if $ENV{VERBOSE};
		    next;
		}
		
		$new_len = abs($new_end - $new_beg) + 1;
		if ($new_len > 2*$hit_len) {
		    print STDERR "Skipping $new_loc since new_len=$new_len is more than twice hit_len=$hit_len\n\n"
			if $ENV{VERBOSE};
                    next;
		}
		
		unless (defined($candidates{$new_loc})) {
		    $candidates{$new_loc} = [ $new_tran, {} ]; 
		    
		    if (length($new_tran) < 500) {
			print STDERR "new_loc = $new_loc\t$new_tran\n" if $ENV{VERBOSE};
		    }
		    else {
			print STDERR "new_loc = $new_loc\t(trans too long to display)\n" if $ENV{VERBOSE};
		    }
		}
		
		$candidates{$new_loc}->[OTHERS]->{$fid2} += $bsc;
		print STDERR "\n" if $ENV{VERBOSE};
	    }
	    
	    
	    print STDERR "------------------------------\n\n"
		, "Processing BLAST candidates for gap = $gap_loc:\n"
		if $ENV{VERBOSE};
	    
	    foreach my $new_loc (keys %candidates) {
		my $tran   = $candidates{$new_loc}->[TRAN];
		my $others = $candidates{$new_loc}->[OTHERS];
		my @others = sort { $others->{$b} <=> $others->{$a} } keys %$others;
		my @in_ss  = grep { $fig->subsystems_for_peg( $_, 1, 1) } @others;
		print STDERR ("Locus $new_loc is supported by ",
			      (scalar @others), " hits, with ",
			      (scalar @in_ss),  " in subsystems\n"
			      ) if $ENV{VERBOSE};
		
		print STDERR "Recalling $new_loc ";
		if ((@in_ss == 0) && (@others < 3)) {
		    print STDERR "--- Skipping due to too few similar FIDs: "
			, join(" ", @others)
			, "\n\n" if $ENV{VERBOSE};
		    next;
		}
		elsif (@others > 10) {
		    print STDERR "--- Too many hits (", (scalar @others)
			, "), truncating to 10 FIDs: " if $ENV{VERBOSE};
		    $#others = 9;
		}
		else {
		    print STDERR "using " if $ENV{VERBOSE};
		}
		print STDERR join(" ", @others), "\n\n" if $ENV{VERBOSE};
		
		my $recalled = $fig->call_start($taxon_id, $new_loc, $new_tran, \@others);
		if ($recalled) {
		    my ($beg, $end, $len);
		    (undef, $beg, $end) = $fig->boundaries_of($recalled);
		    $len = abs($end - $beg) + 1;
		    
		    my $fid;
		    if ($fid = $features->{"$contig$strand$end"}) {
			print STDERR "Existing feature $fid\n\n" if $ENV{FIG_VERBOSE};
		    }
		    else {
			if ($len >= $min_len) {
			    ++$max_peg_num;
			    $new_fid = "fig|$taxon_id.peg.$max_peg_num";
			    print STDERR "Accepting $recalled as $new_fid\n\n" if $ENV{VERBOSE};
			    print STDOUT "$new_fid\t$recalled\t\n";
			}
			else {
			    print STDERR "Skipping short recalled PEG ($len chars)\n\n" if $ENV{VERBOSE};
			}
		    }
		}
		else
		{
		    print STDERR "Recall failed for $new_loc\n\n" if $ENV{VERBOSE};
		}
	    }
	}
	print STDERR "//\n\n" if $ENV{VERBOSE};
    }
    print STDERR "///\n\n" if $ENV{VERBOSE};
}
unlink($tmp_db, $tmp_query) || die "Could not unlink $tmp_db and/or $tmp_query";
print STDERR "\n$self done\n\n";
exit(0);



sub load_contigs
{
    my ($contigs_file) = @_;
    
    my $seq_of = {};
    my $len_of = {};
    
    open (CONTIGS, "<$contigs_file") or die "could not open $contigs_file to read";
    while ( (! eof(CONTIGS)) && ( my ($id, $seqP) = &FIG::read_fasta_record(\*CONTIGS) ) )
    {
    	$$seqP =~ tr/a-z/A-Z/;
	$seq_of->{$id} = $$seqP;
	$len_of->{$id} = length($$seqP);
    }
    close(CONTIGS) or die "could not close $contigs_file";
    
    return ($seq_of, $len_of);
}

sub load_tbls
{
    my (@tbl_files) = @_;
    my ($fid, $org, $locus, $contig, $beg, $end, $strand, $len);
    
    my $regions  = {};
    my $features = {};
    my $file     = 0;
    my $max_peg  = 0;
    foreach my $tbl_file (@tbl_files)
    {
	my $num_fids = 0;
	open(TBL,"<$tbl_file") || die "could not open $tbl_file";
	print STDERR "Loading $tbl_file ...\n" if $ENV{VERBOSE};
	
	my $entry;
	while (defined($entry = <TBL>))
	{
	    ++$num_fids;
	    chomp $entry;
	    if ($entry =~ /^(\S+)\s+(\S+)/)
	    {
		($fid, $locus) = ($1, $2);
		if ($fid =~ m/^fig\|(\d+\.\d+)\.(rna|peg|orf)\.(\d+)$/) 
		{ 
		    $org = $1; 
		    $max_peg = &FIG::max($max_peg, $3) if ($2 eq 'peg');
		} else { 
		    die "Invalid FID $fid"; 
		}
		
		($contig, $beg, $end) = $fig->boundaries_of($locus);
		
		if ($beg < $end) { $strand = '+'; } else { $strand = '-'; }
		
		$len = 1 + abs($end-$beg);
		
		$features->{"$contig$strand$end"} = $fid;
		
		unless (defined($regions->{$contig})) { $regions->{$contig} = []; }
		push(@ { $regions->{$contig} }, [ &FIG::min($beg, $end), &FIG::max($beg, $end), $fid ] );
	    }
	    else
	    {
		print STDERR "Skipping invalid entry $entry\n";
	    }
	}
	print STDERR "Loaded $num_fids features from $tbl_file\n\n" if $ENV{VERBOSE};
	close(TBL);
	++$file;
    }
    
    foreach my $contig (keys(%$regions))
    {
	my $x = [ sort { $a->[LEFT] <=> $b->[LEFT] }  @ { $regions->{$contig} } ];
	for (my $i=1; $i < @$x; ++$i)
	{
	    while (($i < @$x) && ($x->[$i-1]->[RIGHT] >= $x->[$i]->[LEFT]))
	    {
		print STDERR "Merging $i:\t$x->[$i-1]->[FID]\t$x->[$i]->[FID]\n"
		    if ($ENV{VERBOSE} && ($ENV{VERBOSE} > 1));
		$x->[$i-1]->[RIGHT] = &FIG::max( $x->[$i-1]->[RIGHT], $x->[$i]->[RIGHT] );
		splice @$x, $i, 1;
	    }
	}
	
	$regions->{$contig} = $x;
	print STDERR "After merger, $contig has ", (scalar @$x), " regions\n\n" if $ENV{VERBOSE}; 
    }

    return ($max_peg, $regions, $features);
}

sub get_dna
{
    my ($contig, $beg, $end) = @_;
    my $len = abs($end-$beg) + 1;
    my $seq = "";
    
    if (defined($$seq_of{$contig}) && $$seq_of{$contig})
    {
	if ($beg < $end)
	{
	    $seq = substr($$seq_of{$contig}, $beg-1, $len);
	} 
	else
	{
	    $seq = substr($$seq_of{$contig}, $end-1, $len);
	    $seq = $ { &FIG::rev_comp(\$seq) };
	}
    }
    else
    {
	die "No sequence loaded for contig $contig";
    }
    
    return uc( $seq );
}    


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3