[Bio] / FigKernelScripts / build_solid_set.pl Repository:
ViewVC logotype

View of /FigKernelScripts/build_solid_set.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Fri Feb 1 16:49:36 2013 UTC (6 years, 9 months ago) by overbeek
Branch: MAIN
CVS Tags: rast_rel_2014_0729, rast_rel_2014_0912, HEAD
Changes since 1.2: +16 -6 lines
avoid including transport or regulatory clusters

use strict;
use Data::Dumper;
use Carp;
use SeedUtils;
use GenerateClusters;
use FIG;
my $fig = new FIG;

=head1 svr_build_solid_set -g CloseGenomes < clustersRel

Produces a set of "solid rectangles" separated by "//" lines.  Each group of lines
is the set of rows in one rectangle.   Each line in a rectangle begins with a genome ID, and is followed
by an ordered set of tab-separated Pegs (one
line per genome.  The first line will be an input cluster from the input set
(from the reference genome)

------

Example:

    svr_build_solid_set -g close.genomes < RefClusters > solid.rectangles

=head2 Command-Line Options

=over 4

=item -g GenomesToTest

=back

=head2 Output Format

The output is a set of lines, one per genome projected to.
Each line is a set of tab-separated Pegs from a single genome

=cut

use SeedUtils;
use SAPserver;
my $sapO = SAPserver->new();
use Getopt::Long;
use GenerateClusters;

my $usage = "usage: svr_build_solid_set -g CloseGenomesFile < clusters_from_reference_genome";

my $file;

my $rc  = GetOptions('g=s' => \$file);
if (! $rc) { print STDERR $usage; exit }

if (! $file) { die "you need to give a file of genomes -f  argument" }

my @genomes = map { ($_ =~ /^(\S+)/) ? $1 : () } `cat $file`;
print STDERR scalar @genomes, " genomes to check\n";

my @sets;
my $last = <STDIN>;
while ($last && ($last =~ /^(\S+)\t(\S+)/))
{
    my $curr = $1;
    my $set = [];
    while ($last && ($last =~ /^(\S+)\t(\S+)/) && ($1 eq $curr))
    {
	push(@$set,$2);
	$last = <STDIN>;
    }
    if ($set = &ok_set($set,$fig)) 
    {
	push(@sets,$set);
    }
}
print STDERR scalar @sets, " sets were read\n";
my $ref = &SeedUtils::genome_of($sets[0]->[0]);
print STDERR "reference genome = $ref\n";
my $corrH = &GenerateClusters::bbhs($ref,\@genomes,$sapO);
print STDERR "read all correspondence tables\n";

foreach my $set (@sets)
{
    print join("\t",($ref,@$set)),"\n";
    foreach my $g (@genomes)
    {
	my $instance = &project($set,$ref,$g,$corrH,$sapO);
	if ($instance)
	{
	    print join("\t",($g,@$instance)),"\n";
	}
    }
    print "//\n";
}

sub ok_set {
    my($set,$fig) = @_;

    my @ok_pegs = grep { my $f = $fig->function_of($_); $f !~ /transport|regulat/ } @$set;
    if     (@ok_pegs >= (0.5 * @$set))  { return $set }
    elsif  (@ok_pegs > 2)               { return \@ok_pegs }
    else                                { return undef }
}
    

sub project {
    my($set,$ref,$g,$corrH,$sapO) = @_;


    my @set2 = map { ($corrH->{$g}->{$_}) ? $corrH->{$g}->{$_} : () } @$set;
    if (@set2 == @$set)
    {
	my $sz = &sz_of_cluster($set,$sapO);
	my $sz2 = &sz_of_cluster(\@set2,$sapO);
	if ($sz && ((($sz2-$sz) / $sz) < 0.2))
	{
	    return \@set2;
	}
    }
    return undef;
}

sub sz_of_cluster {
    my($set,$sapO) = @_;

    my $locH = $sapO->fid_locations( -ids => $set, -boundaries => 1);
    my $min;
    my $max;
    foreach my $peg (keys(%$locH))
    {
	my $loc = $locH->{$peg};
	if ($loc && ($loc =~ /^.*:(\S+)_(\d+)[+-](\d+)$/))
	{
	    my($contig,$beg,$strand,$len) = ($1,$2,$3,$4);
	    if ($strand eq '+')
	    {
		if ((! $min) || ($min > $beg)) { $min = $beg }
		if ((! $max) || ($max < ($beg + $len - 1))) { $max = $beg + $len - 1 }
	    }
	    else
	    {
		if ((! $min) || ($min > ($beg - ($len - 1)))) { $min = $beg-($len - 1) }
		if ((! $max) || ($max < $beg))  { $max = $beg }
	    }
	}
    }
    return ($max - $min) + 1;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3