[Bio] / FigKernelPackages / ANNO.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/ANNO.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (view) (download) (as text)

1 : olson 1.1 #!/usr/bin/perl -w
2 :     use strict;
3 :    
4 :     #
5 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
6 :     # for Interpretations of Genomes. All Rights Reserved.
7 :     #
8 :     # This file is part of the SEED Toolkit.
9 :     #
10 :     # The SEED Toolkit is free software. You can redistribute
11 :     # it and/or modify it under the terms of the SEED Toolkit
12 :     # Public License.
13 :     #
14 :     # You should have received a copy of the SEED Toolkit Public License
15 :     # along with this program; if not write to the University of Chicago
16 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
17 :     # Genomes at veronika@thefig.info or download a copy from
18 :     # http://www.theseed.org/LICENSE.TXT.
19 :     #
20 :     package ANNO;
21 :    
22 :     use strict;
23 :     use ERDB;
24 :     use Tracer;
25 :     use SeedUtils;
26 :     use ServerThing;
27 :    
28 :     sub new {
29 : parrello 1.9 my ($class, $sapDB) = @_;
30 : olson 1.1 # Create the sapling object.
31 : parrello 1.9 my $sap = $sapDB || ERDB::GetDatabase('Sapling');
32 : olson 1.1 # Create the server object.
33 :     my $retVal = { db => $sap };
34 :     # Bless and return it.
35 :     bless $retVal, $class;
36 :     return $retVal;
37 :     }
38 :    
39 :    
40 :     =head2 Primary Methods
41 :    
42 :     =head3 methods
43 :    
44 :     my $methodList = $ssObject->methods();
45 :    
46 :     Return a list of the methods allowed on this object.
47 :    
48 :     =cut
49 :    
50 :     use constant METHODS => [qw(metabolic_reconstruction
51 :     assign_function_to_prot
52 :     call_genes
53 :     find_rnas
54 :     assign_functions_to_DNA
55 : parrello 1.4 find_special_proteins
56 : parrello 1.6 assign_functions_to_dna_small
57 : olson 1.8 get_dataset
58 :     get_vector_basis_sets
59 :     get_active_datasets
60 : olson 1.1 )];
61 :    
62 :     sub methods {
63 :     # Get the parameters.
64 :     my ($self) = @_;
65 :     # Return the result.
66 :     return METHODS;
67 :     }
68 :    
69 :     #
70 :     # Docs are in ANNOserver.pm.
71 :     #
72 :    
73 : parrello 1.4 sub find_special_proteins {
74 :     # Get the parameters.
75 :     my ($self, $args) = @_;
76 :     # Pull in the special protein finder.
77 :     require find_special_proteins;
78 :     # Convert the hash to the form expected by find_special_proteins.
79 :     my $params = {
80 :     contigs => $args->{-contigs},
81 :     is_init => $args->{-is_init},
82 :     is_alt => $args->{-is_alt},
83 :     is_term => $args->{-is_term},
84 :     comment => $args->{-comment}
85 :     };
86 :     if (exists $args->{-templates}) {
87 :     my $templates = $args->{-templates};
88 :     if (ref $templates eq 'ARRAY') {
89 :     $params->{references} = $templates;
90 :     } elsif ($templates =~ /^pyr/) {
91 :     $params->{pyrrolysine} = 1
92 :     }
93 :     }
94 :     # Process the input.
95 :     my @retVal = find_special_proteins::find_selenoproteins($params);
96 :     # Return the result.
97 :     return \@retVal;
98 :     }
99 :    
100 : olson 1.1 sub metabolic_reconstruction {
101 :     # Get the parameters.
102 :     my ($self, $args) = @_;
103 :    
104 :     my $sapling = $self->{db};
105 :     my $retVal = [];
106 :    
107 :     # This counter will be used to generate user IDs for roles without them.
108 :     my $next = 1000;
109 :    
110 :     my $id_roles = $args->{-roles};
111 :     my @id_roles1 = map { (ref $_ ? $_ : [$_, "FR" . ++$next]) } @$id_roles;
112 :    
113 :     my @id_roles = ();
114 :     foreach my $tuple (@id_roles1)
115 :     {
116 :     my($function,$id) = @$tuple;
117 : parrello 1.3 foreach my $role (split(/(?:; )|(?: [\]\@] )/,$function))
118 : olson 1.1 {
119 :     push(@id_roles,[$role,$id]);
120 :     }
121 :     }
122 :    
123 :     my %big;
124 :     my $id_display = 1;
125 :     map {push(@{$big{$_->[0]}}, $_->[1])} @id_roles;
126 :     my @resultRows = $sapling->GetAll("Subsystem Includes Role",
127 : parrello 1.5 'Subsystem(usable) = ? ORDER BY Subsystem(id), Includes(sequence)',
128 :     [1], [qw(Subsystem(id) Role(id) Includes(abbreviation))]);
129 : olson 1.1 my %ss_roles;
130 :     foreach my $row (@resultRows) {
131 :     my ($sub, $role, $abbr) = @$row;
132 :     $ss_roles{$sub}->{$role} = $abbr;
133 :     }
134 :     foreach my $sub (keys %ss_roles) {
135 :     my $roles = $ss_roles{$sub};
136 : parrello 1.3 my @rolesubset = grep { $big{$_} } keys %$roles;
137 :     my @abbr = map{$roles->{$_}} @rolesubset;
138 : olson 1.1 my $set = join(" ", @abbr);
139 :     if (@abbr > 0) {
140 :     my ($variant, $size) = $self->get_max_subset($sub, $set);
141 :     if ($variant) {
142 :     foreach my $role (keys %$roles) {
143 :     if ($id_display) {
144 : parrello 1.3 if (exists $big{$role}) {
145 :     foreach my $id (@{$big{$role}}) {
146 :     push (@$retVal, [$variant, $role, $id]);
147 :     }
148 :     }
149 : olson 1.1 } else {
150 :     push (@$retVal, [$variant, $role]);
151 :     }
152 :     }
153 :     }
154 :     }
155 :     }
156 :     # Return the result.
157 :     return $retVal;
158 :     }
159 :    
160 : parrello 1.6 =head3 assign_functions_to_dna_small
161 :    
162 :     my $idHash = $annoObject->assign_functions_to_dna_small({
163 :     -seqs => [[$id1, $comment1, $seq1],
164 :     [$id2, $comment2, $seq2],
165 :     ... ],
166 :     -kmer => 10,
167 :     -minHits => 3,
168 :     -maxGap => 600,
169 :     });
170 :    
171 :     This method uses FIGfams to assign functions to sequences. It is intended for smaller
172 :     sequence sets than the main method, because it eschews the normal flow control; however,
173 :     it is easier to use for things like the EXCEL interface.
174 :    
175 :     The parameters are as follows.
176 :    
177 :     =item parameter
178 :    
179 :     The parameter should be a reference to a hash with the following keys.
180 :    
181 :     =over 8
182 :    
183 :     =item -seqs
184 :    
185 :     Reference to a list of 3-tuples, each consisting of (0) an arbitrary unique ID and
186 :     (1) a comment, and (2) a sequence associated with the ID.
187 :    
188 :     =item -kmer
189 :    
190 :     KMER size (7 to 12) to use for the FIGfam analysis. Larger sizes are faster, smaller
191 :     sizes are more accurate.
192 :    
193 :     =item -minHits (optional)
194 :    
195 :     A number from 1 to 10, indicating the minimum number of matches required to
196 :     consider a protein as a candidate for assignment to a FIGfam. A higher value
197 :     indicates a more reliable matching algorithm; the default is C<3>.
198 :    
199 :     =item -maxGap (optional)
200 :    
201 :     When looking for a match, if two sequence elements match and are closer than
202 :     this distance, then they will be considered part of a single match. Otherwise,
203 :     the match will be split. The default is C<600>.
204 :    
205 :     =back
206 :    
207 :     =item RETURN
208 :    
209 :     Returns a hash mapping each incoming ID to a list of hit regions. Each hit
210 : parrello 1.7 region is a n-tuple consisting of (0) the number of matches to the function, (1) the
211 :     start location, (2) the stop location, (3) the proposed function, (4) the name
212 :     of the Genome Set from which the gene is likely to have originated, (5) the ID
213 :     number of the OTU (or C<undef> if the OTU was not found), and (6) the IDs of the
214 :     roles represented in the function, if any of them have IDs.
215 :    
216 : parrello 1.6
217 :     =back
218 :    
219 :     =cut
220 :    
221 :     sub assign_functions_to_dna_small {
222 :     # Get the parameters.
223 :     my ($self, $args) = @_;
224 :     # Get the Kmers object.
225 :     my $kmers = $self->{kmers};
226 :     # Analyze the options.
227 :     my $maxGap = $args->{-maxGap} || 600;
228 :     my $minHits = $args->{-minHits} || 3;
229 :     # Get the KMER size.
230 :     my $kmer = $args->{-kmer};
231 :     # Declare the return variable.
232 :     my $retVal = {};
233 : parrello 1.7 # Get the sapling database.
234 :     my $sap = $self->{db};
235 : parrello 1.6 # Get the sequence tuples.
236 :     my $seqs = ServerThing::GetIdList(-seqs => $args);
237 :     # Loop through the sequences, finding assignments.
238 :     for my $seqTuple (@$seqs) {
239 :     # Extract the ID and sequence.
240 :     my ($id, undef, $seq) = @$seqTuple;
241 :     # Compute the assignment.
242 :     my $assignment = $kmers->assign_functions_to_PEGs_in_DNA($kmer, $seq,
243 :     $minHits, $maxGap);
244 : parrello 1.7 # Loop through the assignments, adding the function and OTU IDs.
245 :     for my $tuple (@$assignment) {
246 :     # Extract the function and OTU.
247 :     my $function = $tuple->[3];
248 :     my $otu = $tuple->[4];
249 :     # Get the IDs for the roles (if any).
250 :     my @roleIdx;
251 :     if ($function) {
252 :     # We have a function, so split it into roles.
253 :     my @roles = roles_of_function($function);
254 :     # Accumulate the IDs for the roles found.
255 :     for my $role (@roles) {
256 :     push @roleIdx, $sap->GetEntityValues(Role => $role, ['role-index']);
257 :     }
258 :     }
259 :     # Get the ID for the OTU (if any).
260 :     my $otuIdx;
261 :     if ($otu) {
262 :     ($otuIdx) = $sap->GetFlat("Genome IsCollectedInto",
263 :     'Genome(scientific-name) = ?', [$otu],
264 :     'IsCollectedInto(to-link)');
265 :     }
266 :     # Update the tuple.
267 :     splice @$tuple, 5, undef, $otuIdx, @roleIdx;
268 :     }
269 : parrello 1.6 # Store the result.
270 :     $retVal->{$id} = $assignment;
271 :     }
272 :     # Return the results.
273 :     return $retVal;
274 :     }
275 :    
276 :    
277 : olson 1.1 =head2 Internal Utility Methods
278 :    
279 : parrello 1.6 =head3 set_kmer_data
280 :    
281 :     $annoObject->set_kmer_data($kmers);
282 :    
283 :     Store the default KMER object for this annotation service.
284 :    
285 :     =cut
286 :    
287 :     sub set_kmer_data {
288 :     # Get the parameters.
289 :     my ($self, $kmers) = @_;
290 :     # Store the specified object.
291 :     $self->{kmers} = $kmers;
292 :     }
293 :    
294 : olson 1.1 =head3 get_max_subset
295 :    
296 :     my ($max_variant, $max_size) = $ssObject->get_max_subset($sub, $setA);
297 :    
298 :     Given a subsystem ID and a role rule, return the ID of the variant for
299 :     the subsystem that matches the most roles in the rule and the number of
300 :     roles matched.
301 :    
302 :     =over 4
303 :    
304 :     =item sub
305 :    
306 :     Name (ID) of the subsystem whose variants are to be examined.
307 :    
308 :     =item setA
309 :    
310 :     A space-delimited list of role abbreviations, lexically ordered. This provides
311 :     a unique specification of the roles in the set.
312 :    
313 :     =item RETURN
314 :    
315 : parrello 1.2 Returns a 2-element list consisting of name variant found (subsystem name, colon,
316 :     and variant code) and the number of roles matched.
317 : olson 1.1
318 :     =back
319 :    
320 :     =cut
321 :    
322 :     sub get_max_subset {
323 :     my ($self, $sub, $setA) = @_;
324 :     my $sapling = $self->{db};
325 :     my $max_size = 0;
326 :     my $max_set;
327 :     my $max_variant;
328 :     my %set_hash;
329 :     my $qh = $sapling->Get("Subsystem Describes Variant", 'Subsystem(id) = ? AND Variant(type) = ?', [$sub, 'normal']);
330 :     while (my $resultRow = $qh->Fetch()) {
331 :     my @variantRoleRule = $resultRow->Value('Variant(role-rule)');
332 :     my ($variantCode) = $resultRow->Value('Variant(code)');
333 :     my $variantId = $sub.":".$variantCode;
334 :     foreach my $setB (@variantRoleRule) {
335 :     my $size = is_A_a_superset_of_B($setA, $setB);
336 :     if ($size && $size > $max_size) {
337 :     $max_size = $size;
338 :     $max_set = $setB;
339 :     $max_variant = $variantId;
340 :     }
341 :     }
342 :     }
343 :     #if ($max_size) {
344 :     #print STDERR "Success $max_variant, $max_set\n";
345 :     #}
346 :     return($max_variant, $max_size);
347 :     }
348 :    
349 :    
350 :     =head3 is_A_a_superset_of_B
351 :    
352 :     my $size = SS::is_A_a_superset_of_B($a, $b);
353 :    
354 :     This method takes as input two role rules, and returns 0 if the first
355 :     role rule is NOT a superset of the second; otherwise, it returns the size
356 :     of the second rule. A role rule is a space-delimited list of role
357 :     abbreviations in lexical order. This provides a unique identifier for a
358 :     set of roles in a subsystem.
359 :    
360 :     =over 4
361 :    
362 :     =item a
363 :    
364 :     First role rule.
365 :    
366 :     =item b
367 :    
368 :     Second role rule.
369 :    
370 :     =item RETURN
371 :    
372 :     Returns 0 if the first rule is NOT a superset of the second and the size of the
373 :     second rule if it is. As a result, if the first rule IS a superset, this method
374 :     will evaluate to TRUE, and to FALSE otherwise.
375 :    
376 :     =back
377 :    
378 :     =cut
379 :    
380 :     sub is_A_a_superset_of_B {
381 :     my ($a, $b) = @_;
382 :     my @a = split(" ", $a);
383 :     my @b = split(" ", $b);
384 :     if (@b > @a) {
385 :     return(0);
386 :     }
387 :     my %given;
388 :     map { $given{$_} = 1} @a;
389 :     map { if (! $given{$_}) {return 0}} split(" ", $b);
390 :     my $l = scalar(@b);
391 :     return scalar(@b);
392 :     }
393 :    
394 :    
395 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3