[Bio] / FigKernelScripts / get_families_final.pl Repository:
ViewVC logotype

View of /FigKernelScripts/get_families_final.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Fri Mar 7 16:04:44 2014 UTC (5 years, 9 months ago) by overbeek
Branch: MAIN
Changes since 1.1: +15 -4 lines
handle 1-char functions

use strict;
use Data::Dumper;
use Getopt::Long;
use SeedEnv;
use gjoseqlib;
use gjostat;

my $usage = "usage: get_families_final -f families -s Seq\n";
my $families;
my $seqD;
my $rc  = GetOptions('f=s' => \$families,
                     's=s' => \$seqD);

if ((! $rc) || (! $families) || (! $seqD))
{ 
    print STDERR $usage; exit ;
}

open(IN,"cat $families.good $families.bad.fixed $families.missed |")
    || die "could not read families";

my %genomes;
my %needed_pegs;

while (defined($_ = <IN>))
{
    chomp;
    my(undef,undef,$peg) = split(/\t/,$_);
    my $g = &SeedUtils::genome_of($peg);
    $genomes{$g} = 1;
    $needed_pegs{$peg} = 1;
}
close(IN);
my %to_seq;
foreach my $g (keys(%genomes))
{
    my @tuples = grep { $needed_pegs{$_->[0]} } &gjoseqlib::read_fasta("$seqD/$g");
    foreach my $tuple (@tuples)
    {
	my($peg,undef,$seq) = @$tuple;
	$to_seq{$peg} = $seq;
    }
}

open(FAMS,"cat $families.good $families.bad.fixed $families.missed |")
    || die "could not open families";

my @sets;
my $last = <FAMS>;
while ($last && ($last =~ /^(\S[^\t]*)\t(\d+)\t(\S+)/))
{
    my $fam = $1;
    my $subfam = $2;
    my @set;
    while ($last && ($last =~ /^(\S[^\t]*\S)\t(\d+)\t(\S+)/) && ($fam eq $1) && ($subfam == $2))
    {
	push(@set,[$1,$2,$3]);
	$last = <FAMS>;
    }
    push(@sets,\@set);
}
if ($last) { die "POORLY FORMATTED FAMILY: CHECK $last" }
my $famN = 1;
foreach my $set (sort { (@$b <=> @$a) or 
			($a->[0] cmp $b->[0]) or
		        ($a->[1] <=> $b->[1])
		      } @sets)
{
    my @lens;
    foreach my $tuple (@$set)
    {
	my $ln = length($to_seq{$tuple->[2]});
	push(@lens,$ln);
    }
    my($mean,$std_dev) = &mean_stddev(@lens);

    foreach my $tuple (@$set)
    {
	my($fam,$subfam,$peg) = @$tuple;
	my $ln = length($to_seq{$peg});
	if (! $ln) { print STDERR &Dumper($peg,$to_seq{$peg}); die "HERE"; }
	my $z_sc = ($ln - $mean) / ($std_dev + 0.000000001);
	print join("\t",($famN,$fam,$subfam,$peg,$ln,sprintf("%0.3f",$mean),sprintf("%0.3f",$std_dev),sprintf("%0.3f",$z_sc))),"\n";
    }
    $famN++;
}

sub mean_stddev
{
    my $n = @_;
    return $n ? ( shift, undef) : ( undef, undef ) if $n < 2;
    my ( $sum, $sum2 ) = ( 0, 0 );
    foreach ( @_ ) { $sum += $_; $sum2 += $_ * $_ }
    my $x = $sum / $n;
    my $to_check = ( $sum2 - ($n * $x * $x )) / ( $n - 1 );
    ( $x, ($to_check > 0) ? sqrt($to_check) : 0);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3