[Bio] / FigKernelScripts / trim_sequences.pl Repository:
ViewVC logotype

View of /FigKernelScripts/trim_sequences.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (download) (as text) (annotate)
Sat Sep 12 15:43:30 2009 UTC (10 years, 2 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.10: +3 -4 lines
fixes bug in handling multidomain prots

########################################################################
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

use strict;

# usage: trim_sequences [-pad_left=nL] [-pad_right=nR] [SimilarityCutoff FracCoverage] < full_seqs > trimmed_seqs

my $sim_cutoff = 1.0e-30;
my $frac_cov   = 0.80;
my $pad_left   = 0;
my $pad_right  = 0;

while ((@ARGV > 0) && ($ARGV[0] =~ /^-/))
{
    my $arg = shift @ARGV;
    if    ($arg =~ /^-pad_left=(\d+)/)  { $pad_left = $1 }
    elsif ($arg =~ /^-pad_right=(\d+)/) { $pad_right = $1 }
    else { die "bad argument $arg" }
}

if (@ARGV == 2)
{
    ($sim_cutoff,$frac_cov) = @ARGV;
}

&make_sure("formatdb","blastall");

my $min_sz  = 30;
my $fullF   = "/tmp/seqs.$$";
my $singleF = "/tmp/seq.$$";

my %seq_of;
my %ln;

open(TMP,">$fullF") || die "could not open $fullF";
$/ = "\n>";
while (defined($_ = <STDIN>))
{
    chomp;
    if ($_ =~ /^>?(\S+)[^\n]*\n(.*)/s)
    {
	my $id  =  $1;
	my $seq =  $2;
	$seq =~ s/\s//gs;
	$seq_of{$id} = $seq;
	$ln{$id} = length($seq);
	print TMP ">$id\n$seq\n";
    }
}
close(TMP);
$/ = "\n";

&run("formatdb -i $fullF -p T");
my @sims =  grep { ($_->[0] ne $_->[1]) && ($_->[6] <= $sim_cutoff) 
        	   # && ((($_->[3] - $_->[2]) / $ln{$_->[0]}) >= $frac_cov) 
                 }	

            map { $_ =~ /^(\S+)\t(\S+)\t(\S+\t){4}(\S+)\t(\S+)\t(\S+)\t(\S+)\s+(\S+)/;
	          [$1,$2,$4,$5,$6,$7,$8] 
                }
            `blastall -i $fullF -d $fullF -FF -m 8 -p blastp -b 400000`;

if (@sims == 0)
{
    print STDERR "No similarities worth processing\n";
    exit(1);
}

my %seen;
my @sims1 = ();
foreach my $sim (@sims)
{
    if (! $seen{"$sim->[0]\t$sim->[1]"})  # pick only the best similarity between two ids
    {
	push(@sims1,$sim);
	$seen{"$sim->[0]\t$sim->[1]"} = 1;
    }
}

my %counts;
foreach my $sim (@sims1)
{
    my($id1,$id2,$b1,$e1,$b2,$e2,$sc) = @$sim;
    if (((($e1 - $b1)+1) / $ln{$id1}) >= $frac_cov) { $counts{$id1}++ }
}

my @ids = sort { $counts{$b} <=> $counts{$a} } keys(%counts);
my $i;
for ($i=1; ($i < @ids) && &close_ln($ids[0],$ids[$i],\%ln) && 
                          &close_count($ids[0],$ids[1],\%counts); $i++) {}

if ($i < @ids) { $#ids = $i-1 }
foreach my $id (@ids)
{
#   print STDERR "$id\t$counts{$id}\n";
}

my %short = map { $_ => 1 } @ids;
my @short_sims = grep { $short{$_->[1]} } @sims1;

my %starts;
my %ends;

foreach my $sim (@short_sims)
{
    my($id1,$id2,$b1,$e1,$b2,$e2,$sc) = @$sim;
    push(@{$starts{$id1}},$b1);
    push(@{$ends{$id1}},$e1);
}

foreach my $id (keys(%starts))
{
    my @x  = sort { $a <=> $b } @{$starts{$id}};
    my $b1 = $x[int(@x/2)];
    $b1    = ($b1 > $pad_left) ? $b1 - $pad_left : 1;

    @x     = sort { $a <=> $b } @{$ends{$id}};
    my $e1 = $x[int(@x/2)];
    $e1    = ($e1 < ($ln{$id} - $pad_right)) ? $e1 + $pad_right : $ln{$id};
    my $s  = substr($seq_of{$id},($b1-1),($e1+1-$b1));
    print ">$id $b1-$e1\n$s\n";
}

system("rm $fullF\*");

sub run {
    my($cmd) = @_;

#   my @tmp = `date`; chomp @tmp; print STDERR "$tmp[0]: running $cmd\n";
    (system($cmd) == 0) || confess "FAILED: $cmd";
}

sub make_sure {
    my(@progs) = @_;

    my $prog;
    foreach $prog (@progs)
    {
	my @tmp = `which $prog`;
	if ($tmp[0] =~ /^no $prog/)
	{
	    print STDERR $tmp[0];
	    exit(1);
	}
    }
}

sub close_ln {
    my($id1,$id2,$lns) = @_;

    return (abs(($lns->{$id1} - $lns->{$id2})/$lns->{$id1}) < 0.8);
}

sub close_count {
    my($id1,$id2,$counts) = @_;

    return (abs(($counts->{$id1} - $counts->{$id2})/$counts->{$id1}) < 0.8);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3