[Bio] / FigKernelPackages / ANNO.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/ANNO.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (view) (download) (as text)

1 : olson 1.1 #!/usr/bin/perl -w
2 :     use strict;
3 :    
4 :     #
5 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
6 :     # for Interpretations of Genomes. All Rights Reserved.
7 :     #
8 :     # This file is part of the SEED Toolkit.
9 :     #
10 :     # The SEED Toolkit is free software. You can redistribute
11 :     # it and/or modify it under the terms of the SEED Toolkit
12 :     # Public License.
13 :     #
14 :     # You should have received a copy of the SEED Toolkit Public License
15 :     # along with this program; if not write to the University of Chicago
16 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
17 :     # Genomes at veronika@thefig.info or download a copy from
18 :     # http://www.theseed.org/LICENSE.TXT.
19 :     #
20 :     package ANNO;
21 :    
22 :     use strict;
23 :     use ERDB;
24 :     use Tracer;
25 :     use SeedUtils;
26 :     use ServerThing;
27 :    
# Constructor: create an ANNO object wrapping a Sapling database handle.
#
# Parameters:
#   $class - name of the class the new object is blessed into.
#
# Returns a blessed hash reference whose 'db' entry holds the value returned
# by ERDB::GetDatabase('Sapling').
sub new {
    my $class = shift;
    # The database handle is the only state stored at construction time.
    my $database = ERDB::GetDatabase('Sapling');
    return bless { db => $database }, $class;
}
38 :    
39 :    
40 :     =head2 Primary Methods
41 :    
42 :     =head3 methods
43 :    
    my $methodList = $annoObject->methods();
45 :    
46 :     Return a list of the methods allowed on this object.
47 :    
48 :     =cut
49 :    
# Names of the methods reported by methods(). The array is built once;
# every use of METHODS yields the same array reference.
use constant METHODS => [
    'metabolic_reconstruction',
    'assign_function_to_prot',
    'call_genes',
    'find_rnas',
    'assign_functions_to_DNA',
    'find_special_proteins',
    'assign_functions_to_dna_small',
];

# Return the reference to the list of method names held in METHODS.
#
# Parameters:
#   $self - the object the method is invoked on (not used).
sub methods {
    my $self = shift;
    return METHODS;
}
65 :    
66 :     #
67 :     # Docs are in ANNOserver.pm.
68 :     #
69 :    
# Translate the caller's dash-prefixed options into the hash expected by
# find_special_proteins::find_selenoproteins, call it, and return its results.
#
# Parameters:
#   $self - the object the method is invoked on (not used here).
#   $args - reference to a hash of options. The -contigs, -is_init, -is_alt,
#           -is_term and -comment values are passed through under the same
#           names without the dash. If -templates is present and is an array
#           reference it is passed as 'references'; otherwise, if it matches
#           /^pyr/, the 'pyrrolysine' flag is set to 1.
#
# Returns a reference to the list returned by find_selenoproteins.
sub find_special_proteins {
    my ($self, $args) = @_;
    # The finder module is loaded on demand rather than at compile time.
    require find_special_proteins;
    # Copy the pass-through options, dropping the leading dash from each name.
    my %finder_args = map { $_ => $args->{"-$_"} }
        qw(contigs is_init is_alt is_term comment);
    if (exists $args->{-templates}) {
        my $template_spec = $args->{-templates};
        if (ref $template_spec eq 'ARRAY') {
            $finder_args{references} = $template_spec;
        }
        elsif ($template_spec =~ /^pyr/) {
            $finder_args{pyrrolysine} = 1;
        }
    }
    # Collect the finder's output in list context and hand back a reference.
    my @found = find_special_proteins::find_selenoproteins(\%finder_args);
    return \@found;
}
96 :    
# For each usable subsystem that includes at least one of the caller's roles,
# ask get_max_subset() for a matching variant and report which of the
# caller's roles fall in that subsystem.
#
# Parameters:
#   $self - the ANNO object; $self->{db} is the database handle queried.
#   $args - reference to a hash; -roles is a reference to a list whose
#           elements are either a function string or a [function, id] pair.
#           A bare string is given a generated ID ("FR1001", "FR1002", ...).
#
# Returns a reference to a list of [variant, role, id] triples, one for each
# ID attached to each matched role of each subsystem that has a variant.
sub metabolic_reconstruction {
    my ($self, $args) = @_;

    my $sapling = $self->{db};
    my $retVal = [];

    # This counter is used to generate IDs for roles supplied without one.
    my $next = 1000;

    # Map each individual role to the list of IDs it was supplied with.
    my %big;
    my $id_roles = $args->{-roles};
    for my $entry (@$id_roles) {
        my ($function, $id) = ref($entry) ? @$entry : ($entry, "FR" . ++$next);
        # A function may name several roles; split it into its parts.
        # NOTE(review): the character class matches "]" and "@" only --
        # confirm that "]" (rather than "/") is the intended separator.
        for my $role (split(/(?:; )|(?: [\]\@] )/, $function)) {
            push @{$big{$role}}, $id;
        }
    }

    # Read the role-to-abbreviation table of every usable subsystem.
    my @resultRows = $sapling->GetAll("Subsystem Includes Role",
        'Subsystem(usable) = ? ORDER BY Subsystem(id), Includes(sequence)',
        [1], [qw(Subsystem(id) Role(id) Includes(abbreviation))]);
    my %ss_roles;
    for my $row (@resultRows) {
        my ($sub, $role, $abbr) = @$row;
        $ss_roles{$sub}->{$role} = $abbr;
    }

    for my $sub (keys %ss_roles) {
        my $roles = $ss_roles{$sub};
        # Only the subsystem roles that the caller supplied are of interest.
        my @matched = grep { $big{$_} } keys %$roles;
        next if !@matched;
        # The role set is described by its space-delimited abbreviations.
        my $set = join(" ", map { $roles->{$_} } @matched);
        my ($variant) = $self->get_max_subset($sub, $set);
        next if !$variant;
        for my $role (@matched) {
            for my $id (@{$big{$role}}) {
                push @$retVal, [$variant, $role, $id];
            }
        }
    }
    # Return the result.
    return $retVal;
}
156 :    
157 : parrello 1.6 =head3 assign_functions_to_dna_small
158 :    
159 :     my $idHash = $annoObject->assign_functions_to_dna_small({
160 :     -seqs => [[$id1, $comment1, $seq1],
161 :     [$id2, $comment2, $seq2],
162 :     ... ],
163 :     -kmer => 10,
164 :     -minHits => 3,
165 :     -maxGap => 600,
166 :     });
167 :    
168 :     This method uses FIGfams to assign functions to sequences. It is intended for smaller
169 :     sequence sets than the main method, because it eschews the normal flow control; however,
170 :     it is easier to use for things like the EXCEL interface.
171 :    
The parameters are as follows.

=over 4

=item parameter
175 :    
176 :     The parameter should be a reference to a hash with the following keys.
177 :    
178 :     =over 8
179 :    
180 :     =item -seqs
181 :    
Reference to a list of 3-tuples, each consisting of (0) an arbitrary unique ID,
(1) a comment, and (2) a sequence associated with the ID.
184 :    
185 :     =item -kmer
186 :    
187 :     KMER size (7 to 12) to use for the FIGfam analysis. Larger sizes are faster, smaller
188 :     sizes are more accurate.
189 :    
190 :     =item -minHits (optional)
191 :    
192 :     A number from 1 to 10, indicating the minimum number of matches required to
193 :     consider a protein as a candidate for assignment to a FIGfam. A higher value
194 :     indicates a more reliable matching algorithm; the default is C<3>.
195 :    
196 :     =item -maxGap (optional)
197 :    
198 :     When looking for a match, if two sequence elements match and are closer than
199 :     this distance, then they will be considered part of a single match. Otherwise,
200 :     the match will be split. The default is C<600>.
201 :    
202 :     =back
203 :    
204 :     =item RETURN
205 :    
206 :     Returns a hash mapping each incoming ID to a list of hit regions. Each hit
207 :     region is a 5-tuple consisting of (0) the number of matches to the function, (1) the
208 :     start location, (2) the stop location, (3) the proposed function, and (4) the name
209 :     of the Genome Set from which the gene is likely to have originated.
210 :    
211 :     =back
212 :    
213 :     =cut
214 :    
# Assign functions to each incoming sequence using the stored KMER object.
#
# Parameters:
#   $self - the ANNO object; $self->{kmers} is the object whose
#           assign_functions_to_PEGs_in_DNA method does the work.
#   $args - reference to a hash with the keys -seqs, -kmer, and optionally
#           -minHits (default 3) and -maxGap (default 600).
#
# Returns a reference to a hash mapping each sequence ID to the value
# returned by assign_functions_to_PEGs_in_DNA for that sequence.
sub assign_functions_to_dna_small {
    my ($self, $args) = @_;
    # The KMER object that performs the analysis.
    my $kmers = $self->{kmers};
    # The KMER size has no default; the tuning options do.
    my $kmer = $args->{-kmer};
    my $minHits = $args->{-minHits} || 3;
    my $maxGap = $args->{-maxGap} || 600;
    # Fetch the list of [id, comment, sequence] tuples.
    my $seqs = ServerThing::GetIdList(-seqs => $args);
    my %assignments;
    for my $seqTuple (@$seqs) {
        # The comment in the middle of the tuple is not needed here.
        my ($id, undef, $seq) = @$seqTuple;
        $assignments{$id} =
            $kmers->assign_functions_to_PEGs_in_DNA($kmer, $seq, $minHits, $maxGap);
    }
    return \%assignments;
}
242 :    
243 :    
244 : olson 1.1 =head2 Internal Utility Methods
245 :    
246 : parrello 1.6 =head3 set_kmer_data
247 :    
248 :     $annoObject->set_kmer_data($kmers);
249 :    
250 :     Store the default KMER object for this annotation service.
251 :    
252 :     =cut
253 :    
# Remember the KMER object this annotation service should use.
#
# Parameters:
#   $self  - the ANNO object to update.
#   $kmers - the KMER object to store under the 'kmers' key.
sub set_kmer_data {
    my $self = shift;
    my $kmers = shift;
    $self->{kmers} = $kmers;
}
260 :    
261 : olson 1.1 =head3 get_max_subset
262 :    
    my ($max_variant, $max_size) = $annoObject->get_max_subset($sub, $setA);
264 :    
265 :     Given a subsystem ID and a role rule, return the ID of the variant for
266 :     the subsystem that matches the most roles in the rule and the number of
267 :     roles matched.
268 :    
269 :     =over 4
270 :    
271 :     =item sub
272 :    
273 :     Name (ID) of the subsystem whose variants are to be examined.
274 :    
275 :     =item setA
276 :    
277 :     A space-delimited list of role abbreviations, lexically ordered. This provides
278 :     a unique specification of the roles in the set.
279 :    
280 :     =item RETURN
281 :    
Returns a 2-element list consisting of the name of the variant found (subsystem
name, colon, and variant code) and the number of roles matched.
284 : olson 1.1
285 :     =back
286 :    
287 :     =cut
288 :    
# Find the 'normal' variant of a subsystem whose role rule is the largest
# one fully contained in the given role set.
#
# Parameters:
#   $self - the ANNO object; $self->{db} is the database handle queried.
#   $sub  - ID of the subsystem whose variants are examined.
#   $setA - space-delimited list of role abbreviations.
#
# Returns a two-element list: the variant name ("subsystem:code") and the
# number of roles in the matching rule. When no rule matches, the variant
# is undef and the size is 0.
sub get_max_subset {
    my ($self, $sub, $setA) = @_;
    my $sapling = $self->{db};
    my $max_size = 0;
    my $max_variant;
    my $qh = $sapling->Get("Subsystem Describes Variant",
        'Subsystem(id) = ? AND Variant(type) = ?', [$sub, 'normal']);
    while (my $resultRow = $qh->Fetch()) {
        # A variant may carry several role rules; each one is tried.
        my @variantRoleRule = $resultRow->Value('Variant(role-rule)');
        my ($variantCode) = $resultRow->Value('Variant(code)');
        my $variantId = $sub . ":" . $variantCode;
        for my $setB (@variantRoleRule) {
            # A size of 0 means $setA does not cover the rule.
            my $size = is_A_a_superset_of_B($setA, $setB);
            # Strictly greater: on a tie the earlier variant is kept.
            if ($size > $max_size) {
                $max_size = $size;
                $max_variant = $variantId;
            }
        }
    }
    return ($max_variant, $max_size);
}
315 :    
316 :    
317 :     =head3 is_A_a_superset_of_B
318 :    
    my $size = ANNO::is_A_a_superset_of_B($a, $b);

This function takes as input two role rules, and returns 0 if the first
role rule is NOT a superset of the second; otherwise, it returns the size
of the second rule. A role rule is a space-delimited list of role
abbreviations in lexical order. This provides a unique identifier for a
set of roles in a subsystem.
326 :    
327 :     =over 4
328 :    
329 :     =item a
330 :    
331 :     First role rule.
332 :    
333 :     =item b
334 :    
335 :     Second role rule.
336 :    
337 :     =item RETURN
338 :    
339 :     Returns 0 if the first rule is NOT a superset of the second and the size of the
340 :     second rule if it is. As a result, if the first rule IS a superset, this method
341 :     will evaluate to TRUE, and to FALSE otherwise.
342 :    
343 :     =back
344 :    
345 :     =cut
346 :    
# Test whether the first role rule contains every role of the second.
#
# Parameters (positional):
#   first  - space-delimited list of role abbreviations.
#   second - space-delimited list of role abbreviations.
#
# Returns 0 if some role of the second rule is absent from the first;
# otherwise returns the number of roles in the second rule.
sub is_A_a_superset_of_B {
    # Avoid the names $a and $b, which are reserved for sort comparators.
    my ($set_a, $set_b) = @_;
    my @roles_a = split(" ", $set_a);
    my @roles_b = split(" ", $set_b);
    # A longer rule can never be contained in a shorter one.
    return 0 if @roles_b > @roles_a;
    my %in_a = map { $_ => 1 } @roles_a;
    for my $role (@roles_b) {
        return 0 if !$in_a{$role};
    }
    return scalar(@roles_b);
}
360 :    
361 :    
362 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3