[Bio] / FigKernelScripts / get_families_final.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/get_families_final.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : overbeek 1.3 #
2 :     # This is a SAS Component
3 :     #
4 :    
5 :    
6 :     =head1 get_families_final
7 :    
8 :     Generate protein families (of isofunctional homologs) using kmer technology.
9 :    
10 :     ------
11 :    
12 :     Example:
13 :    
14 :     get_families_final -f Families/families -s Seqs.Fasta
15 :    
16 :     This simple program gathers the families from
17 :    
18 :     good families (those which were assigned funtions and usually have a unique PEG)
19 :     bad.fixed families (those assigned a function and then subjected to a splitting test)
20 :     missed (those families of PEGs with no assigned functions and clustered by similarity)
21 :    
22 :     and builds families.all, the final set of families.
23 :    
24 :     =head2 Command-Line Options
25 :    
26 :     =over 4
27 :    
28 :     =item -f FamilyFilesPrefix
29 :    
30 :     The prefix used when writing files recording subfamilies (and the final
31 :     families.all)
32 :    
33 :     =item -s Seqs.Fasta
34 :    
35 :     The directory from which the translations of PEGs from each genome are
36 :     used.
37 :    
38 :     =back
39 :    
40 :     =head2 Output Format
41 :    
42 :     Output is written to STDOUT and constitutes the derived protein families (which
43 :     include singletons). An 8-column, tab-separated table is written:
44 :    
45 :     FamilyID - an integer
46 :     Function - function assigned to family
47 :     SubFunction - the Function and an integer (SubFunction) together uniquely
48 :     determine the FamilyID. Another way to look at it is
49 :    
50 :     a) each family is assigned a unique ID and a function
51 :     b) multiple families can have the same function (consider
52 :     "hypothetical protein")
53 :     c) the Function+SubFunction uniquely determine the FamilyID
54 :     PEG
55 :     LengthProt - the length of the translated PEG
56 :     Mean - the mean length of PEGs in the family
57 :     StdDev - standard deviation of lengths for family
58 :     Z-sc - the Z-score associated with the length of this PEG
59 :    
60 :     =cut
61 :    
62 : overbeek 1.1 use strict;
63 :     use Data::Dumper;
64 :     use Getopt::Long;
65 :     use SeedEnv;
66 :     use gjoseqlib;
67 :     use gjostat;
68 :    
69 :     my $usage = "usage: get_families_final -f families -s Seq\n";
70 :     my $families;
71 :     my $seqD;
72 :     my $rc = GetOptions('f=s' => \$families,
73 :     's=s' => \$seqD);
74 :    
75 :     if ((! $rc) || (! $families) || (! $seqD))
76 :     {
77 :     print STDERR $usage; exit ;
78 :     }
79 :    
80 :     open(IN,"cat $families.good $families.bad.fixed $families.missed |")
81 :     || die "could not read families";
82 :    
83 :     my %genomes;
84 :     my %needed_pegs;
85 :    
86 :     while (defined($_ = <IN>))
87 :     {
88 :     chomp;
89 :     my(undef,undef,$peg) = split(/\t/,$_);
90 :     my $g = &SeedUtils::genome_of($peg);
91 :     $genomes{$g} = 1;
92 :     $needed_pegs{$peg} = 1;
93 :     }
94 :     close(IN);
95 :     my %to_seq;
96 :     foreach my $g (keys(%genomes))
97 :     {
98 :     my @tuples = grep { $needed_pegs{$_->[0]} } &gjoseqlib::read_fasta("$seqD/$g");
99 :     foreach my $tuple (@tuples)
100 :     {
101 :     my($peg,undef,$seq) = @$tuple;
102 :     $to_seq{$peg} = $seq;
103 :     }
104 :     }
105 :    
106 :     open(FAMS,"cat $families.good $families.bad.fixed $families.missed |")
107 :     || die "could not open families";
108 :    
109 :     my @sets;
110 :     my $last = <FAMS>;
111 : overbeek 1.2 while ($last && ($last =~ /^(\S[^\t]*)\t(\d+)\t(\S+)/))
112 : overbeek 1.1 {
113 :     my $fam = $1;
114 :     my $subfam = $2;
115 :     my @set;
116 : overbeek 1.2 while ($last && ($last =~ /^(\S[^\t]*\S)\t(\d+)\t(\S+)/) && ($fam eq $1) && ($subfam == $2))
117 : overbeek 1.1 {
118 :     push(@set,[$1,$2,$3]);
119 :     $last = <FAMS>;
120 :     }
121 :     push(@sets,\@set);
122 :     }
123 : overbeek 1.2 if ($last) { die "POORLY FORMATTED FAMILY: CHECK $last" }
124 : overbeek 1.1 my $famN = 1;
125 :     foreach my $set (sort { (@$b <=> @$a) or
126 :     ($a->[0] cmp $b->[0]) or
127 :     ($a->[1] <=> $b->[1])
128 :     } @sets)
129 :     {
130 :     my @lens;
131 :     foreach my $tuple (@$set)
132 :     {
133 :     my $ln = length($to_seq{$tuple->[2]});
134 :     push(@lens,$ln);
135 :     }
136 : overbeek 1.2 my($mean,$std_dev) = &mean_stddev(@lens);
137 : overbeek 1.1
138 :     foreach my $tuple (@$set)
139 :     {
140 :     my($fam,$subfam,$peg) = @$tuple;
141 :     my $ln = length($to_seq{$peg});
142 :     if (! $ln) { print STDERR &Dumper($peg,$to_seq{$peg}); die "HERE"; }
143 :     my $z_sc = ($ln - $mean) / ($std_dev + 0.000000001);
144 :     print join("\t",($famN,$fam,$subfam,$peg,$ln,sprintf("%0.3f",$mean),sprintf("%0.3f",$std_dev),sprintf("%0.3f",$z_sc))),"\n";
145 :     }
146 :     $famN++;
147 :     }
148 : overbeek 1.2
149 :     sub mean_stddev
150 :     {
151 :     my $n = @_;
152 :     return $n ? ( shift, undef) : ( undef, undef ) if $n < 2;
153 :     my ( $sum, $sum2 ) = ( 0, 0 );
154 :     foreach ( @_ ) { $sum += $_; $sum2 += $_ * $_ }
155 :     my $x = $sum / $n;
156 :     my $to_check = ( $sum2 - ($n * $x * $x )) / ( $n - 1 );
157 :     ( $x, ($to_check > 0) ? sqrt($to_check) : 0);
158 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3