[Bio] / FigKernelScripts / split_sequences_into_sets.pl Repository:
ViewVC logotype

View of /FigKernelScripts/split_sequences_into_sets.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Wed Aug 19 16:11:28 2009 UTC (10 years, 3 months ago) by overbeek
Branch: MAIN
Changes since 1.1: +76 -186 lines
complete rewrite

########################################################################
# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

use strict;
use FIG;

use ProtSims;
use gjoseqlib;

my $min_sz  = 30;
my $fullF   = "/tmp/seqs.$$";

my $usage = "usage: split_sequences_into_sets OutputDirectory [SimilarityCutoff FracCoverage] < full_seqs";

my $output;
(
 ($output = shift @ARGV)
)
    || die $usage;

(! -e $output) || die "$output already exists -- check it (and delete it, if that is what you really wish)";
&FIG::verify_dir($output);

my @seqs = &gjoseqlib::read_fasta();
my %seq_of = map { $_->[0] => $_->[2] } @seqs;

my($sim_cutoff,$frac_cov);
if (@ARGV == 2)
{
    $sim_cutoff = shift @ARGV;
    $frac_cov   = shift @ARGV;
}
else
{
    $sim_cutoff = 1.0e-30;
    $frac_cov   = 0.70;
}

my $min_size = (@seqs > 20) ? (@seqs / 10) : (@seqs / 2);
my @all = &ProtSims::blastP(\@seqs,\@seqs,$min_size);
# &print_sims(\@all);

# $_ = @all; print STDERR "computed $_ similarities\n";

my @sims = grep { ($_->id1 ne $_->id2) && 
	          ($_->psc <= $sim_cutoff) &&
                  ((($_->e1 - $_->b1) / $_->ln1) >= $frac_cov) &&
                  ((($_->e2 - $_->b2) / $_->ln2) >= $frac_cov)
                } @all;

# print STDERR "=======\n";
# &print_sims(\@sims);

my(%seen,%conn);
foreach my $sim (@sims)
{
    if (! $seen{"$sim->id1\t$sim->id2"})  # pick only the best similarity between two ids
    {
	push(@{$conn{$sim->id1}},$sim);
	$seen{"$sim->id1\t$sim->id2"} = 1;
    }
}
undef %seen;
# print STDERR "built connections\n";

my @ids = sort { &cov($conn{$b}) <=> &cov($conn{$a}) } 
          keys(%seq_of);  
# sort by sum of coverage

my $n = 1;
my @sets = ();

open(SZ,">$output/set.sizes") || die "could not open $output/set.sizes";

foreach my $central_id (@ids)
{
    if (! $seen{$central_id})
    {

	my $set = &extract_set($central_id,\%conn,\%seen);
	print SZ join("\t",($n,scalar @$set)),"\n";

	push(@sets,[$n,$set]);

	open(OUT,">$output/$n") || die "could not open $output/$n";
	$n++;
	foreach my $peg (@$set)
	{
	    $seen{$peg} = 1;
	    print OUT ">$peg\n$seq_of{$peg}\n";
	}
	close(OUT);
    }
}
close(SZ);
# print STDERR "built set sizes\n";

my @distances = &construct_distance_matrix(\@sets,\@all); # entries are [n1,n2,sc]
open(DIST,">$output/distance.matrix\n") || die "could not open $output/distance.matrix";
foreach my $tuple (@distances)
{
    print DIST join("\t",@$tuple),"\n";
}
close(DIST);
# print STDERR "built distance matrix\n";

sub print_sims {
    my($sims) = @_;
    
    foreach my $sim (@$sims)
    {
	print STDERR join("\t",($sim->id1,$sim->id2,$sim->iden,$sim->b1,$sim->e1,$sim->b2,$sim->e2,$sim->psc,$sim->ln1,$sim->ln2)),"\n";
    }
}

sub cov {
    my($sims) = @_;
    my($x,$n);

    $n = 0;
    foreach $x (@$sims)
    {
	$n += ($x->e1 - $x->b1);
    }
    return $n;
}

sub run {
    my($cmd) = @_;

#   my @tmp = `date`; chomp @tmp; print STDERR "$tmp[0]: running $cmd\n";
    (system($cmd) == 0) || confess "FAILED: $cmd";
}

sub extract_set {
    my($central_id,$conn,$seen) = @_;

    my $cluster = [$central_id];
    my %in      = ($central_id,1);

    my $i;
    while ($i < @$cluster)
    {
	my $x = $conn->{$cluster->[$i]};
	$i++;
	foreach my $peg (map { $_->id2} @$x)
	{
	    if (! $in{$peg})
	    {
		push(@$cluster,$peg);
		$in{$peg} = 1;
	    }
	}
    }
    return $cluster;
}

sub construct_distance_matrix {
    my($sets,$sims) = @_;
    my($tuple,$n,$set,$id1,$id2,$sc,$sim,%best,$key,$n1,$n2);
    my(@distances,$sofar,%in);

    foreach my $tuple (@$sets)
    {
	my($n,$set) = @$tuple;
	foreach my $id (@$set)
	{
	    $in{$id} = $n;
	}
    }

    my %best;
    foreach my $sim (@$sims)
    {
	my $id1 = $sim->id1;
	my $id2 = $sim->id2;
	my $sc = $sim->psc;
	if ($in{$id1} && $in{$id2} && ($in{$id1} != $in{$id2}))
	{
	    my $key = join("\t",($in{$id1},$in{$id2}));
	    if ((! ($sofar = $best{$key})) || ($sofar > $sc))
	    {
		$best{$key} = $sc;
	    }

	    $key = join("\t",($in{$id2},$in{$id1}));
	    if ((! ($sofar = $best{$key})) || ($sofar > $sc))
	    {
		$best{$key} = $sc;
	    }
	}
    }

    my @distances = ();
    foreach my $key (keys(%best))
    {
	my($n1,$n2) = split(/\t/,$key);
	push(@distances,[$n1,$n2,$best{$key}]);
    }
    return sort { ($a->[0] <=> $b->[0]) or ($a->[1] <=> $b->[1]) } @distances;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3