[Bio] / FigKernelPackages / Signatures.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/Signatures.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #
2 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
3 :     # for Interpretations of Genomes. All Rights Reserved.
4 :     #
5 :     # This file is part of the SEED Toolkit.
6 :     #
7 :     # The SEED Toolkit is free software. You can redistribute
8 :     # it and/or modify it under the terms of the SEED Toolkit
9 :     # Public License.
10 :     #
11 :     # You should have received a copy of the SEED Toolkit Public License
12 :     # along with this program; if not write to the University of Chicago
13 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14 :     # Genomes at veronika@thefig.info or download a copy from
15 :     # http://www.theseed.org/LICENSE.TXT.
16 :     #
17 :     package Signatures;
18 :    
19 :     use strict;
20 :     use Tracer;
21 :    
22 :     =head1 Genome Signatures Computation Object
23 :    
24 :     This object is used to support the computation of genome signatures. It
25 :     takes as input two genome sets and produces a statistical analysis of the
26 :     protein families that distinguish the two sets.
27 :    
28 :     A genome in this context is defined as a genome ID and a list of protein
29 :     family IDs. Of course, anything could be used for the genome ID and anything
30 :     could be used for the family IDs, so long as it is done consistently.
31 :     Nonetheless, to keep the documentation grounded, we will refer to the
32 :     elements of discourse as I<genomes> and I<families>.
33 :    
34 :     The basic strategy is to count the number of occurrences of each family in
35 :     each set and assign a score ranging from 0 to 2 based on whether the family
36 :     tends to occur in both sets or tends to occur in only one set. A score of 1
37 :     or more indicates that the family is considered a signature for its
38 :     particular set.
39 :    
40 :     The main entry method for this module is the static L</ComputeSignatures>.
41 :     This builds the object and runs the computations. The object itself
42 :     contains the following fields.
43 :    
44 :     =over 4
45 :    
46 :     =item sets
47 :    
48 :     A 2-tuple containing the hashes for the two genome sets (known as set C<0>
49 :     and set C<1>). Each hash is keyed by genome ID and maps each genome in the
50 :     set to a list of the IDs of all the families present in the genome.
51 :    
52 :     =item counts
53 :    
54 :     Hash mapping each family ID to a 2-tuple consisting of (0) the number of
55 :     genomes in set C<0> that contain the family and (1) the number of genomes
56 :     in set C<1> that contain the family.
57 :    
58 :     =back
59 :    
60 :     =head2 Special Methods
61 :    
62 :     =head3 new
63 :    
64 :     my $sigO = Signatures->new(\%set0, \%set1);
65 :    
66 :     Create a new signature object for the two specified genome sets. The new
67 :     object will contain the genome set information, but the family count and
68 :     score tables will be empty.
69 :    
70 :     =over 4
71 :    
72 :     =item set0
73 :    
74 :     Reference to a hash mapping the ID of each genome in set 0 to a list of
75 :     the families it contains.
76 :    
77 :     =item set1
78 :    
79 :     Reference to a hash mapping the ID of each genome in set 1 to a list of
80 :     the families it contains.
81 :    
82 :     =back
83 :    
84 :     =cut
85 :    
86 :     sub new {
87 :     # Get the parameters.
88 :     my ($class, $set0, $set1) = @_;
89 :     # Create the signatures object.
90 :     my $retVal = {
91 :     sets => [$set0, $set1],
92 :     counts => {},
93 :     };
94 :     # Bless and return it.
95 :     bless $retVal, $class;
96 :     return $retVal;
97 :     }
98 :    
99 :     =head2 Public Methods
100 :    
101 :     =head3 ComputeSignatures
102 :    
103 :     my ($families0, $families1) = ComputeSignatures(\%set0, \%set1);
104 :    
105 :     Compute the signature protein families for the two specified genome sets.
106 :     Each genome set is specified in the form of a hash that maps the ID of
107 :     each genome in the set to a list of the IDs for families represented in the
108 :     genome. The output hashes list the families common in each set that are
109 :     uncommon in the other set. They map each relevant family ID to a score
110 :     ranging from 1 to 2 indicating the degree to which the family can be considered
111 :     a signature family.
112 :    
113 :     =over 4
114 :    
115 :     =item set0
116 :    
117 :     Reference to a hash mapping the ID of each genome in set 0 to a list of
118 :     the families it contains.
119 :    
120 :     =item set1
121 :    
122 :     Reference to a hash mapping the ID of each genome in set 1 to a list of
123 :     the families it contains.
124 :    
125 :     =item RETURN
126 :    
127 :     Returns a list of two hash references, the first (index 0) containing families
128 :     that are significant to set 0 and the second (index 1) containing families that
129 :     are significant to set 1. Each hash maps family IDs to significance scores
130 :     ranging from 1 to 2. The higher the score, the more significant the family
131 :     is in distinguishing the indicated set from the other set.
132 :    
133 :     =back
134 :    
135 :     =cut
136 :    
137 :     sub ComputeSignatures {
138 :     # Get the parameters.
139 :     my ($set0, $set1) = @_;
140 :     # Create the signatures object.
141 :     my $sig = Signatures->new($set0, $set1);
142 :     # Count the families.
143 :     $sig->CountFamilies(0);
144 :     $sig->CountFamilies(1);
145 :     # Loop through the families, computing scores.
146 :     my ($families0, $families1) = $sig->ScoreFamilies();
147 :     # Return the result.
148 :     return ($families0, $families1);
149 :     }
150 :    
151 :     =head2 Internal Methods
152 :    
153 :     =head3 CountFamilies
154 :    
155 :     $sig->CountFamilies($idx);
156 :    
157 :     Count the families found in the specified genome set. The counts will be stored
158 :     in the C<counts> member of this object.
159 :    
160 :     =over 4
161 :    
162 :     =item idx
163 :    
164 :     Index of the set to be counted: C<0> for set 0 or C<1> for set 1.
165 :    
166 :     =back
167 :    
168 :     =cut
169 :    
170 :     sub CountFamilies {
171 :     # Get the parameters.
172 :     my ($self, $idx) = @_;
173 :     # Get the indicated set.
174 :     my $set = $self->{sets}[$idx];
175 :     # Get the count hash.
176 :     my $counts = $self->{counts};
177 :     # Loop through the genomes in the specified set.
178 :     for my $genomeID (keys %$set) {
179 : parrello 1.2 Trace("Processing $genomeID for set $idx.") if T(3);
180 : parrello 1.1 # Get the genome's families.
181 :     my $families = $set->{$genomeID};
182 :     # Loop through them.
183 :     for my $family (@$families) {
184 :     # Insure we have a hash entry for this family.
185 :     if (! exists $counts->{$family}) {
186 :     $counts->{$family} = [0,0];
187 :     }
188 :     # Count this family for this set.
189 :     $counts->{$family}[$idx]++;
190 :     }
191 :     }
192 :     }
193 :    
194 :     =head3 ScoreFamilies
195 :    
196 :     my ($families0, $families1) = $sig->ScoreFamilies();
197 :    
198 :     Compute the score for each family and place it in the appropriate output
199 :     hash. This method should be called after L</CountFamilies> has been used
200 :     to fill in the C<counts> member.
201 :    
202 :     =over 4
203 :    
204 :     =item RETURN
205 :    
206 :     Returns a list of two hash references, the first (index 0) containing families
207 :     that are significant to set 0 and the second (index 1) containing families that
208 :     are significant to set 1. Each hash maps family IDs to significance scores
209 :     ranging from 1 to 2. The higher the score, the more significant the family
210 :     is in distinguishing the indicated set from the other set.
211 :    
212 :     =back
213 :    
214 :     =cut
215 :    
216 :     sub ScoreFamilies {
217 :     # Get the parameters.
218 :     my ($self) = @_;
219 :     # Create the return hashes.
220 :     my $families0 = {};
221 :     my $families1 = {};
222 :     # Get the count hash.
223 :     my $counts = $self->{counts};
224 :     # Compute the size of each set.
225 :     my $size0 = scalar keys %{$self->{sets}[0]};
226 :     my $size1 = scalar keys %{$self->{sets}[1]};
227 :     # Loop through the families, computing scores.
228 :     for my $family (keys %$counts) {
229 :     # Get the counts for this family.
230 :     my ($count0, $count1) = @{$counts->{$family}};
231 :     # Compute the occurrence ratios for the two sets.
232 :     my $ratio0 = $count0 / $size0;
233 :     my $ratio1 = $count1 / $size1;
234 :     my $comp0 = 1 - $ratio0;
235 :     my $comp1 = 1 - $ratio1;
236 :     # Get the three variances.
237 :     my $var0 = $ratio0 * $ratio0 + $comp0 * $comp0;
238 :     my $var1 = $ratio1 * $ratio1 + $comp1 * $comp1;
239 :     my $mix = $ratio0 * $ratio1 + $comp0 * $comp1;
240 :     # Compute the score.
241 :     my $score = 2 - $mix / $var0 - $mix / $var1;
242 :     # If we have a score, save this family.
243 :     if ($score > 1) {
244 :     if ($ratio0 > $ratio1) {
245 :     $families0->{$family} = $score;
246 :     } else {
247 :     $families1->{$family} = $score;
248 :     }
249 :     }
250 : parrello 1.2 Trace("Family counts for $family are $count0, $count1 with score $score.") if T(3);
251 : parrello 1.1 }
252 :     # Return the family hashes.
253 :     return ($families0, $families1);
254 :    
255 :     }
256 :    
257 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3