[Bio] / FigKernelScripts / find_gaps.pl Repository:
ViewVC logotype

View of /FigKernelScripts/find_gaps.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (download) (as text) (annotate)
Wed Oct 17 08:53:20 2007 UTC (12 years, 7 months ago) by gdpusch
Branch: MAIN
Changes since 1.5: +65 -26 lines
Added 5prime padding. -- /gdp

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

use FIG;
$fig = new FIG;

use constant LEFT   =>  0;
use constant RIGHT  =>  1;
use constant FID    =>  2;

use constant TRAN   =>  0;
use constant OTHERS =>  1;

$0 =~ m/([^\/]+)$/;   $this_tool = $1;
$usage = "$this_tool  [-extend_5p=num] org_id  contigs  tbl_1 tbl_2 ... > gap_tbl";

if (not @ARGV) {
    die "\n\n\tusage: $usage\n\n";
}

$trouble = 0;
$extend_5prime_end = 0;

#...Process any flags and switches...
while ($ARGV[0] =~ m/^-/o) {
    if ($ARGV[$i] =~ m/-help/) {
	print STDERR "\n\n\tusage: $usage\n\n";
	exit(0);
    }
    elsif ($ARGV[$i] =~ m/-extend_5p=(\d+)/o) {
	$extend_5prime_end = $1;
	print STDERR "PEG 5-prime end will be extended $extend_5prime_end bp\n" if $ENV{VERBOSE};
    }
    else {
	$trouble = 1;
	warn "Unrecognized flag: $ARGV[$i]\n";
    }
    
    shift @ARGV;
}


#...Process Org-ID argument...
if (not @ARGV) {
    warn "\n\nNo argument list given\n\n";
    die "\n\n\tusage: $usage\n\n";
}
else {
    if ($ARGV[0] !~ m/^\d+\.\d+$/o) {
	$trouble = 1;
	warn "\nOrg-ID $ARGV[0] is malformed\n";
    }
    else {
	$org_id = shift @ARGV;
    }
}


#...Check existence of file arguments...
for ($i=0; $i < @ARGV; ++$i) {
    if (!-e $ARGV[$i]) {
	$trouble = 1;
	warn "\nERROR: File $ARGV[$i] does not exist\n";
    }
}
die "\n\nAborting due to invalid args\n\n   usage: $usage\n\n" if $trouble;

(($contigs_file = shift @ARGV) && (-s $contigs_file)) 
    || die "Contigs file $contigs_file has zero size\n\n\tusage: $usage\n\n";

((@tbls = @ARGV) > 0) || die "\n\tusage: $usage\n\n";

$len_of  = &load_contig_lens($contigs_file);
$regions = &load_regions($len_of, @tbls);
# die Dumper($features);

$gap_num = 0;
foreach $contig (sort keys %$len_of) {
    print STDERR "Scanning $contig ...\n" if $ENV{VERBOSE};
    
    $x = $regions->{$contig};
    $x = defined($x) ? $x : [];
    
#   if ($x->[0]  > 1)                  { unshift @$x, [0, 0]; }
#   if ($x->[-1] > $len_of->{$contig}) { push    @$x, [1+$len_of->{$contig}, 1+$len_of->{$contig}]; }
    
    unshift @$x, [0, 0];
    push    @$x, [1+$len_of->{$contig}, 1+$len_of->{$contig}];
    
    for ($i=1; $i < @$x; ++$i) {
	$gap_beg = $x->[$i-1]->[RIGHT] + 1;
	$gap_end = $x->[$i]->[LEFT] - 1;
#	$gap_len = $gap_end - $gap_beg + 1;
	$gap_loc = "$contig\_$gap_beg\_$gap_end";
	
	print "fig|$org_id.gap.".(++$gap_num) . "\t$gap_loc\n";
    }
}
print STDERR "\n$this_tool done\n\n" if $ENV{VERBOSE};
exit(0);


sub load_contig_lens
{
    my ($contigs_file) = @_;
    
    my $num_contigs = 0;
    my $num_chars   = 0;
    
    my $len_of = {};
    
    print STDERR "Loading $contigs_file ...\n" if $ENV{VERBOSE};
    open (CONTIGS, "<$contigs_file") or die "could not open $contigs_file to read";
    while (($id, $seqP) = &FIG::read_fasta_record(\*CONTIGS)) {
	++$num_contigs;
	$num_chars += $len_of->{$id} = length($$seqP);
    }
    close(CONTIGS) or die "could not close $contigs_file";
    print STDERR "Loaded $num_contigs contig lengths (total $num_chars chars) from $contigs_file\n\n" if $ENV{VERBOSE};

    return $len_of;
}

sub load_regions
{
    my ($len_of, @tbl_files) = @_;
    my ($fid, $org, $locus, $contig, $beg, $end, $len);
    
    my $regions  = {};
    foreach my $tbl_file (@tbl_files) {
	my $num_fids = 0;
	open(TBL,"<$tbl_file") || die "could not open $tbl_file";
	print STDERR "Loading $tbl_file ...\n" if $ENV{VERBOSE};
	
	while (defined($entry = <TBL>))
	{
	    ++$num_fids;
	    chomp $entry;
	    if ($entry =~ m/^(\S+)\s+(\S+)/o) {
		($fid, $locus) = ($1, $2);
		
		$fid_type = "";
		if ($fid =~ m/^fig\|\d+\.\d+\.(rna|peg|orf)\.\d+$/) {
		    $fid_type = $1;
		}
		else {
		    die "Unrecognized FID type: $fid"; 
		}
		
		($contig, $beg, $end) = $fig->boundaries_of($locus);
		
		if ($extend_5prime_end && ($fid_type eq qq(peg))) {
		    print STDERR "$fid:\t$locus\tbeg = $beg --> " if $ENV{VERBOSE};
		    $beg -= ($beg < $end) ? +$extend_5prime_end : -$extend_5prime_end;
		    $beg = &FIG::max(0, &FIG::min($beg, $len_of->{$contig}));
		    print STDERR "$beg\n" if $ENV{VERBOSE};
		}
		$len = 1 + abs($end-$beg);
		
		unless (defined($regions->{$contig})) { $regions->{$contig} = []; }
		push(@ { $regions->{$contig} }, [ &FIG::min($beg, $end), &FIG::max($beg, $end), $fid ] );
	    }
	    else
	    {
		print STDERR "Skipping invalid entry $entry\n";
	    }
	}
	print STDERR "Loaded $num_fids features from $tbl_file\n\n" if $ENV{VERBOSE};
	close(TBL);
    }
    
    foreach my $contig (keys(%$regions))
    {
	my $x = [ sort { $a->[LEFT] <=> $b->[LEFT] }  @ { $regions->{$contig} } ];
	for (my $i=1; $i < @$x; ++$i)
	{
	    while (($i < @$x) && ($x->[$i-1]->[RIGHT] >= $x->[$i]->[LEFT]))
	    {
		print STDERR "Merging $i:\t$x->[$i-1]->[FID]\t$x->[$i]->[FID]\n"
		    if ($ENV{VERBOSE} && ($ENV{VERBOSE} > 1));
		$x->[$i-1]->[RIGHT] = &FIG::max( $x->[$i-1]->[RIGHT], $x->[$i]->[RIGHT] );
		splice @$x, $i, 1;
	    }
	}
	
	$regions->{$contig} = $x;
	print STDERR "After merger, $contig has ", (scalar @$x), " regions\n\n" if $ENV{VERBOSE}; 
    }

    return $regions;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3