[Bio] / FigKernelPackages / ANNO.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/ANNO.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (view) (download) (as text)

1 : olson 1.1 #!/usr/bin/perl -w
2 :     use strict;
3 :    
4 :     #
5 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
6 :     # for Interpretations of Genomes. All Rights Reserved.
7 :     #
8 :     # This file is part of the SEED Toolkit.
9 :     #
10 :     # The SEED Toolkit is free software. You can redistribute
11 :     # it and/or modify it under the terms of the SEED Toolkit
12 :     # Public License.
13 :     #
14 :     # You should have received a copy of the SEED Toolkit Public License
15 :     # along with this program; if not write to the University of Chicago
16 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
17 :     # Genomes at veronika@thefig.info or download a copy from
18 :     # http://www.theseed.org/LICENSE.TXT.
19 :     #
20 :     package ANNO;
21 :    
22 :     use strict;
23 :     use ERDB;
24 :     use Tracer;
25 :     use SeedUtils;
26 :     use ServerThing;
27 :    
# Constructor.  Builds an annotation-service object whose "db" slot holds
# the handle returned by ERDB::GetDatabase('Sapling').
#
# Parameters:
#   $class - name of the class to bless the new object into.
#
# Returns the blessed object.
sub new {
    my ($class) = @_;
    # Fetch the Sapling handle and wrap it in a blessed hash in one step.
    return bless { db => ERDB::GetDatabase('Sapling') }, $class;
}
38 :    
39 :    
40 :     =head2 Primary Methods
41 :    
42 :     =head3 methods
43 :    
44 :     my $methodList = $ssObject->methods();
45 :    
46 :     Return a list of the methods allowed on this object.
47 :    
48 :     =cut
49 :    
# Names of the methods a client is permitted to call on this object.
use constant METHODS => [
    qw(
        metabolic_reconstruction
        assign_function_to_prot
        call_genes
        find_rnas
        assign_functions_to_DNA
        find_special_proteins
        assign_functions_to_dna_small
    )
];

# Return a reference to the list of permitted method names (the METHODS
# constant itself, not a copy).
sub methods {
    return METHODS();
}
65 :    
66 :     #
67 :     # Docs are in ANNOserver.pm.
68 :     #
69 :    
# Run the special-protein finder over a set of contigs.
#
# Parameters:
#   $self - ANNO object (not otherwise used here).
#   $args - hash reference of dash-prefixed options: -contigs, -is_init,
#           -is_alt, -is_term and -comment are passed through under the
#           same names without the dash.  -templates, when present, is
#           either an array reference (passed on as "references") or a
#           string beginning with "pyr" (sets the "pyrrolysine" flag).
#
# Returns a reference to the list produced by
# find_special_proteins::find_selenoproteins.
sub find_special_proteins {
    my ($self, $args) = @_;
    # The finder module is loaded at call time rather than at compile time.
    require find_special_proteins;
    # Strip the leading dash from each pass-through option name.
    my %finderArgs =
        map { $_ => $args->{"-$_"} } qw(contigs is_init is_alt is_term comment);
    # Interpret the optional template specification.
    if (exists $args->{-templates}) {
        my $templateSpec = $args->{-templates};
        if (ref $templateSpec eq 'ARRAY') {
            $finderArgs{references} = $templateSpec;
        }
        elsif ($templateSpec =~ /^pyr/) {
            $finderArgs{pyrrolysine} = 1;
        }
    }
    # Call the finder in list context and hand back its results by reference.
    my @found = find_special_proteins::find_selenoproteins(\%finderArgs);
    return \@found;
}
96 :    
# Work out which subsystem variants are supported by a set of roles.
#
# Parameters:
#   $self - ANNO object; $self->{db} is used as the Sapling database handle
#           and $self->get_max_subset() picks the variant for a subsystem.
#   $args - hash reference.  -roles is a reference to a list whose elements
#           are either a function string or a [function, id] pair.  A bare
#           string is given a generated ID "FRnnnn" (the first is FR1001).
#           A missing -roles is treated as an empty list.
#
# Each function string is split into roles; every role found in a usable
# subsystem contributes to that subsystem's abbreviation set, which is
# passed to get_max_subset().  When a variant comes back, one
# [variant, role, id] triple is emitted for every (role, id) pair supplied
# by the caller for that subsystem.
#
# Returns a reference to the list of triples.  Subsystems and roles are
# visited in sorted order so that the result is deterministic.
sub metabolic_reconstruction {
    my ($self, $args) = @_;

    my $sapling = $self->{db};
    my $retVal  = [];

    # Counter used to generate IDs for roles supplied without one.
    my $next = 1000;

    # Tolerate a missing role list instead of dying on an undef dereference.
    my $id_roles = $args->{-roles} || [];

    # Map each individual role to the list of IDs it was supplied under.
    my %big;
    for my $entry (@$id_roles) {
        my ($function, $id) = ref $entry ? @$entry : ($entry, "FR" . ++$next);
        # NOTE(review): the character class below matches "]" or "@"; if the
        # intent was to split on " / " as well as " @ ", "]" may be a typo
        # for "/".  Left unchanged pending confirmation.
        for my $role (split(/(?:; )|(?: [\]\@] )/, $function)) {
            push @{ $big{$role} }, $id;
        }
    }

    # Load the role abbreviations of every usable subsystem.
    my @resultRows = $sapling->GetAll("Subsystem Includes Role",
        'Subsystem(usable) = ? ORDER BY Subsystem(id), Includes(sequence)',
        [1], [qw(Subsystem(id) Role(id) Includes(abbreviation))]);
    my %ss_roles;
    for my $row (@resultRows) {
        my ($sub, $role, $abbr) = @$row;
        $ss_roles{$sub}{$role} = $abbr;
    }

    for my $sub (sort keys %ss_roles) {
        my $roles = $ss_roles{$sub};
        # Roles of this subsystem that the caller actually supplied.
        my @present = grep { $big{$_} } sort keys %$roles;
        next unless @present;
        # get_max_subset documents its rule as lexically ordered abbreviations.
        my $set = join(" ", sort map { $roles->{$_} } @present);
        my ($variant) = $self->get_max_subset($sub, $set);
        next unless $variant;
        for my $role (@present) {
            push @$retVal, map { [$variant, $role, $_] } @{ $big{$role} };
        }
    }
    return $retVal;
}
156 :    
157 : parrello 1.6 =head3 assign_functions_to_dna_small
158 :    
159 :     my $idHash = $annoObject->assign_functions_to_dna_small({
160 :     -seqs => [[$id1, $comment1, $seq1],
161 :     [$id2, $comment2, $seq2],
162 :     ... ],
163 :     -kmer => 10,
164 :     -minHits => 3,
165 :     -maxGap => 600,
166 :     });
167 :    
168 :     This method uses FIGfams to assign functions to sequences. It is intended for smaller
169 :     sequence sets than the main method, because it eschews the normal flow control; however,
170 :     it is easier to use for things like the EXCEL interface.
171 :    
172 :     The parameters are as follows.
173 :    
174 :     =item parameter
175 :    
176 :     The parameter should be a reference to a hash with the following keys.
177 :    
178 :     =over 8
179 :    
180 :     =item -seqs
181 :    
182 :     Reference to a list of 3-tuples, each consisting of (0) an arbitrary unique ID,
183 :     (1) a comment, and (2) a sequence associated with the ID.
184 :    
185 :     =item -kmer
186 :    
187 :     KMER size (7 to 12) to use for the FIGfam analysis. Larger sizes are faster, smaller
188 :     sizes are more accurate.
189 :    
190 :     =item -minHits (optional)
191 :    
192 :     A number from 1 to 10, indicating the minimum number of matches required to
193 :     consider a protein as a candidate for assignment to a FIGfam. A higher value
194 :     indicates a more reliable matching algorithm; the default is C<3>.
195 :    
196 :     =item -maxGap (optional)
197 :    
198 :     When looking for a match, if two sequence elements match and are closer than
199 :     this distance, then they will be considered part of a single match. Otherwise,
200 :     the match will be split. The default is C<600>.
201 :    
202 :     =back
203 :    
204 :     =item RETURN
205 :    
206 :     Returns a hash mapping each incoming ID to a list of hit regions. Each hit
207 : parrello 1.7 region is an n-tuple consisting of (0) the number of matches to the function, (1) the
208 :     start location, (2) the stop location, (3) the proposed function, (4) the name
209 :     of the Genome Set from which the gene is likely to have originated, (5) the ID
210 :     number of the OTU (or C<undef> if the OTU was not found), and (6) the IDs of the
211 :     roles represented in the function, if any of them have IDs.
212 :    
213 : parrello 1.6
214 :     =back
215 :    
216 :     =cut
217 :    
218 :     sub assign_functions_to_dna_small {
219 :     # Get the parameters.
220 :     my ($self, $args) = @_;
221 :     # Get the Kmers object.
222 :     my $kmers = $self->{kmers};
223 :     # Analyze the options.
224 :     my $maxGap = $args->{-maxGap} || 600;
225 :     my $minHits = $args->{-minHits} || 3;
226 :     # Get the KMER size.
227 :     my $kmer = $args->{-kmer};
228 :     # Declare the return variable.
229 :     my $retVal = {};
230 : parrello 1.7 # Get the sapling database.
231 :     my $sap = $self->{db};
232 : parrello 1.6 # Get the sequence tuples.
233 :     my $seqs = ServerThing::GetIdList(-seqs => $args);
234 :     # Loop through the sequences, finding assignments.
235 :     for my $seqTuple (@$seqs) {
236 :     # Extract the ID and sequence.
237 :     my ($id, undef, $seq) = @$seqTuple;
238 :     # Compute the assignment.
239 :     my $assignment = $kmers->assign_functions_to_PEGs_in_DNA($kmer, $seq,
240 :     $minHits, $maxGap);
241 : parrello 1.7 # Loop through the assignments, adding the function and OTU IDs.
242 :     for my $tuple (@$assignment) {
243 :     # Extract the function and OTU.
244 :     my $function = $tuple->[3];
245 :     my $otu = $tuple->[4];
246 :     # Get the IDs for the roles (if any).
247 :     my @roleIdx;
248 :     if ($function) {
249 :     # We have a function, so split it into roles.
250 :     my @roles = roles_of_function($function);
251 :     # Accumulate the IDs for the roles found.
252 :     for my $role (@roles) {
253 :     push @roleIdx, $sap->GetEntityValues(Role => $role, ['role-index']);
254 :     }
255 :     }
256 :     # Get the ID for the OTU (if any).
257 :     my $otuIdx;
258 :     if ($otu) {
259 :     ($otuIdx) = $sap->GetFlat("Genome IsCollectedInto",
260 :     'Genome(scientific-name) = ?', [$otu],
261 :     'IsCollectedInto(to-link)');
262 :     }
263 :     # Update the tuple.
264 :     splice @$tuple, 5, undef, $otuIdx, @roleIdx;
265 :     }
266 : parrello 1.6 # Store the result.
267 :     $retVal->{$id} = $assignment;
268 :     }
269 :     # Return the results.
270 :     return $retVal;
271 :     }
272 :    
273 :    
274 : olson 1.1 =head2 Internal Utility Methods
275 :    
276 : parrello 1.6 =head3 set_kmer_data
277 :    
278 :     $annoObject->set_kmer_data($kmers);
279 :    
280 :     Store the default KMER object for this annotation service.
281 :    
282 :     =cut
283 :    
# Store the KMER object that later annotation calls will read from the
# object's "kmers" slot.
sub set_kmer_data {
    my $self  = shift;
    my $kmers = shift;
    # Keep the supplied object on the instance.
    $self->{kmers} = $kmers;
}
290 :    
291 : olson 1.1 =head3 get_max_subset
292 :    
293 :     my ($max_variant, $max_size) = $ssObject->get_max_subset($sub, $setA);
294 :    
295 :     Given a subsystem ID and a role rule, return the ID of the variant for
296 :     the subsystem that matches the most roles in the rule and the number of
297 :     roles matched.
298 :    
299 :     =over 4
300 :    
301 :     =item sub
302 :    
303 :     Name (ID) of the subsystem whose variants are to be examined.
304 :    
305 :     =item setA
306 :    
307 :     A space-delimited list of role abbreviations, lexically ordered. This provides
308 :     a unique specification of the roles in the set.
309 :    
310 :     =item RETURN
311 :    
312 : parrello 1.2 Returns a 2-element list consisting of the name of the variant found (subsystem name, colon,
313 :     and variant code) and the number of roles matched.
314 : olson 1.1
315 :     =back
316 :    
317 :     =cut
318 :    
# Find the "normal" variant of a subsystem whose role rule is the largest
# one fully contained in the supplied rule.  Returns the pair
# (variant ID, rule size); the variant ID is "<subsystem>:<variant code>"
# and is undef (with size 0) when no rule matches.  On ties the first
# rule encountered wins.
sub get_max_subset {
    my ($self, $sub, $setA) = @_;
    my ($bestVariant, $bestSize) = (undef, 0);
    # Walk every normal variant of the subsystem.
    my $query = $self->{db}->Get("Subsystem Describes Variant",
        'Subsystem(id) = ? AND Variant(type) = ?', [$sub, 'normal']);
    while (my $record = $query->Fetch()) {
        my @rules  = $record->Value('Variant(role-rule)');
        my ($code) = $record->Value('Variant(code)');
        my $candidate = $sub . ":" . $code;
        for my $rule (@rules) {
            # Non-zero only when the rule is entirely covered by $setA.
            my $matched = is_A_a_superset_of_B($setA, $rule);
            next unless $matched && $matched > $bestSize;
            ($bestVariant, $bestSize) = ($candidate, $matched);
        }
    }
    return ($bestVariant, $bestSize);
}
345 :    
346 :    
347 :     =head3 is_A_a_superset_of_B
348 :    
349 :     my $size = ANNO::is_A_a_superset_of_B($a, $b);
350 :    
351 :     This method takes as input two role rules, and returns 0 if the first
352 :     role rule is NOT a superset of the second; otherwise, it returns the size
353 :     of the second rule. A role rule is a space-delimited list of role
354 :     abbreviations in lexical order. This provides a unique identifier for a
355 :     set of roles in a subsystem.
356 :    
357 :     =over 4
358 :    
359 :     =item a
360 :    
361 :     First role rule.
362 :    
363 :     =item b
364 :    
365 :     Second role rule.
366 :    
367 :     =item RETURN
368 :    
369 :     Returns 0 if the first rule is NOT a superset of the second and the size of the
370 :     second rule if it is. As a result, if the first rule IS a superset, this method
371 :     will evaluate to TRUE, and to FALSE otherwise.
372 :    
373 :     =back
374 :    
375 :     =cut
376 :    
# Test whether the first role rule contains every abbreviation of the second.
#
# Parameters (both are whitespace-delimited lists of role abbreviations):
#   first rule  - the candidate superset.
#   second rule - the rule that must be fully contained in the first.
#
# Returns the number of abbreviations in the second rule if all of them
# occur in the first, and 0 otherwise.  An empty second rule therefore
# also yields 0.
sub is_A_a_superset_of_B {
    # Named lexicals are used instead of $a/$b, which are reserved for sort.
    my ($rule_a, $rule_b) = @_;
    my @roles_a = split(" ", $rule_a);
    my @roles_b = split(" ", $rule_b);
    # A longer list cannot be contained in a shorter one.
    return 0 if @roles_b > @roles_a;
    my %given = map { $_ => 1 } @roles_a;
    for my $role (@roles_b) {
        return 0 unless $given{$role};
    }
    return scalar(@roles_b);
}
390 :    
391 :    
392 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3