[Bio] / FigKernelPackages / UnvSubsys.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/UnvSubsys.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.13 - (view) (download) (as text)

1 : overbeek 1.1 package UnvSubsys;
2 :    
3 : overbeek 1.2 use Subsystem;
4 : overbeek 1.1 use Carp;
5 :     use FIG;
6 : parrello 1.12 use SFXlate;
7 : overbeek 1.1 use Data::Dumper;
8 :     use strict;
9 : parrello 1.12 use Tracer;
10 : overbeek 1.1
11 : parrello 1.12 =head1 Universal Subsystem Object
12 :    
13 :     =head2 Introduction
14 :    
15 :     The universal subsystem object provides methods used to display useful information
16 :     about a subsystem. Its intent is to support both Sprout and SEED.
17 :    
18 :     The soul of a subsystem is its spreadsheet. The columns of the spreadsheet are
19 :     the subsystem roles. The rows are the genomes that contain the subsystem. PEGs
20 :     are stored in the spreadsheet's cells. Each genome can be identified by genome
21 :     ID or row index. Each role can be identified by role name or column index.
22 :    
23 :     It is worth noting that the term I<subset> has two meanings-- a subset of roles
24 :     or a subset of genomes. A subset of roles is called a I<column subset>. A
25 :     subset of genomes is called a I<row subset>.
26 :    
27 :     The object created contains a great deal of information, so it's worthwhile to
28 :     stop for a moment and discuss it. The object will have the following members.
29 :    
30 :     =over 4
31 :    
32 :     =item Roles
33 :    
34 :     A list of 3-tuples, each consisting of a role name, the abbreviated role name,
35 :     and a list of URLs for the reactions. Each role is a column in the
36 :     subsystem spreadsheet. Indexing into the list by column index yields the
37 :     ID, abbreviation, and reactions for the role in that column.
38 :    
39 :     =item RoleIndex
40 :    
41 :     A hash mapping each role to its column index in the spreadsheet.
42 :    
43 :     =item RoleSubsets
44 :    
45 :     A list of 2-tuples, each consisting of a subset name followed by a list of the
46 :     column indices for the roles in the subset.
47 :    
48 :     =item Genomes
49 :    
50 :     A list of 2-tuples, each containing the genome ID for the relevant row and the
51 :     variant code for that genome. Subsystems can have several variations, and the
52 :     variant code indicates which variant of the subsystem that the genome uses.
53 :     There is one genome for each row in the spreadsheet. Indexing into the list
54 :     yields the ID of the genome for that row and its variant code.
55 :    
56 :     =item GenomeIndex
57 :    
58 :     A hash mapping each genome ID to its row index in the spreadsheet.
59 :    
60 :     =item PegHash
61 :    
62 :     A hash of hashes containing the spreadsheet cells. If C<$pegHash> is the hash
63 :     of hashes, C<$row> is a genome index, and C<$col> is a role index, then
64 :    
65 :     $pegHash->{$row}->{$col}
66 :    
67 :     returns a reference to a list of the IDs for the PEGs in the relevant
68 :     spreadsheet cell.
69 :    
70 :     =item ColorHash
71 :    
72 :     A hash mapping each PEG ID to the color that should be used to represent that
73 :     PEG in the display.
74 :    
75 :     =item AliasHash
76 :    
77 :     A hash mapping each PEG ID to a list of its aliases.
78 :    
79 :     =item ReactionHash
80 :    
81 :     A hash mapping each role to the list of reactions it catalyzes.
82 :    
83 :     =back
84 :    
85 :     =head2 Public Methods
86 :    
87 :     =head3 new
88 :    
89 :     C<< my $usub = UnvSubsys->new($ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, \@peg_colors); >>
90 :    
91 :     Construct a new universal subsystem object for a specified subsystem.
92 :    
93 :     =over 4
94 :    
95 :     =item ssa
96 :    
97 :     Name of the subsystem.
98 :    
99 :     =item fig
100 :    
101 :     Data access object, either a FIG object or an SFXlate object.
102 :    
103 :     =item active_subsetR
104 :    
105 :     Name of the active row subset. A row subset names a group of genomes.
106 :    
107 :     =item focus
108 :    
109 :     ID of the genome currently in focus.
110 :    
111 :     =item show_clusters
112 :    
113 :     TRUE if clusters should be painted by color, else FALSE.
114 :    
115 :     =item aliases
116 :    
117 :     TRUE if PEG aliases should be shown, else FALSE.
118 :    
119 :     =item peg_colors (optional)
120 : overbeek 1.1
121 : parrello 1.12 Reference to a list of 2-tuples, each 2-tuple consisting of a PEG ID and the color
122 :     to be assigned to the PEG.
123 : overbeek 1.1
124 : parrello 1.12 =back
125 : overbeek 1.1
126 : parrello 1.12 =cut
127 : overbeek 1.1
128 : parrello 1.12 sub new {
129 :     # Get the parameters.
130 :     my($class, $ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, $peg_colors) = @_;
131 :     # Fix the subsystem name. Spaces are replaced by underscores to avoid naming problems
132 :     # in the seed's file system.
133 :     $ssa =~ s/ /_/g;
134 :     # Get the FIG object. At this point in time, we're only getting data from the SEED. On
135 :     # a future pass through the code, we'll get data from a SEED or Sprout.
136 :     if ((ref($fig) eq "FIG") || (ref($fig) eq 'SFXlate')) {
137 :     # Create a subsystem object. The "get_subsystem" method is provided by both FIG
138 :     # and SFXlate; however, the object returned in each case is different: FIG
139 :     # returns a "Subsystem" object; SFXlate returns a "SproutSubsys" object.
140 :     my $subsystem = $fig->get_subsystem($ssa);
141 :     # Get the key subsystem data. Note that CRs in the notes are converted to LFs
142 :     # in case the notes come from a Mac.
143 : parrello 1.5 my $curator = $subsystem->get_curator;
144 :     my $notes = $subsystem->get_notes;
145 : parrello 1.12 $notes =~ s/\r/\n/g;
146 : parrello 1.5 my @roles = $subsystem->get_roles;
147 :     my $reactions = $subsystem->get_reactions;
148 :     my @genomes = $subsystem->get_genomes;
149 : overbeek 1.1 my @col_subsets = $subsystem->get_subset_namesC;
150 : parrello 1.12 # Create the data structures for the role list and the role index.
151 : parrello 1.5 my $role_info = [];
152 :     my $roleH = {};
153 : parrello 1.12 # Loop through the roles to create the role list. The $i index will move
154 :     # through the columns of the spreadsheet.
155 : parrello 1.5 my($i,$j,$subset,$peg);
156 :     for ($i=0; ($i < @roles); $i++)
157 :     {
158 : parrello 1.12 # Get the ID of the role in the current column.
159 : parrello 1.5 my $role = $roles[$i];
160 : parrello 1.12 # Extract its abbreviation.
161 :     my $abbrev = $subsystem->get_role_abbr($i);
162 :     # Get the reactions. Note that its possible no reactions were found. If there were
163 :     # reactions found, however, we convert from reaction IDs to a comma-delimited
164 :     # list of HTML code full of hyperlinks.
165 : parrello 1.5 my $react = $reactions ? join(",", map { &HTML::reaction_link($_) } @{$reactions->{$role}}) : [];
166 : parrello 1.12 # Form the role name, abbreviation, and reaction list into a 3-tuple and push
167 :     # them onto the main role list.
168 : parrello 1.5 push(@$role_info,[$role,$abbrev,$react]);
169 : parrello 1.12 # Set the role hash so that we can get back the column index for a given
170 :     # role name.
171 : parrello 1.5 $roleH->{$role} = $i;
172 :     }
173 : parrello 1.12 # Get the column subsets. A column subset is a list of role IDs, so we need to
174 :     # convert it to a set of column indices. We ignore the special "All" subset
175 :     # that contains everything.
176 : parrello 1.5 my $subset_info = [];
177 :     foreach $subset (@col_subsets)
178 :     {
179 :     if ($subset ne 'All')
180 :     {
181 :     push(@$subset_info,[$subset,[map { $roleH->{$_} } $subsystem->get_subsetC_roles($subset)]]);
182 :     }
183 :     }
184 : parrello 1.12 # Now we create the genome directories. For each genome we need to be able to
185 :     # hash from the genomeID to the specified role index and we must be able to
186 :     # get the genome's variant code. (The variant code indicates which subsystem
187 :     # variant the genome uses.
188 : parrello 1.5 my $genomes_info = [];
189 :     my $genomeH = {};
190 :     for ($i=0; ($i < @genomes); $i++)
191 :     {
192 : parrello 1.12 # Get the genome ID for this row.
193 : parrello 1.5 my $genome = $genomes[$i];
194 : parrello 1.12 # Get its variant code.
195 :     my $variant = $subsystem->get_variant_code($i);
196 :     # Form them into a 2-tuple and add the result to the genome list.
197 : parrello 1.5 push(@$genomes_info,[$genome,$variant]);
198 : parrello 1.12 # Set up the hash to get from the genome ID to the row index.
199 : parrello 1.5 $genomeH->{$genome} = $i;
200 :     }
201 : parrello 1.12
202 :     # Next we gather the data from the actual spreadsheet cells. For an SFXlate
203 :     # object, this is the most expensive part of the operation, since it requires
204 :     # a database call for each cell.
205 : parrello 1.5 my $pegH = {};
206 : parrello 1.12 # $i is the row index, which cycles through genomes.
207 : parrello 1.5 for ($i=0; ($i < @genomes); $i++)
208 :     {
209 : parrello 1.12 # $j is the column index, which cycles through roles.
210 : parrello 1.5 for ($j=0; ($j < @roles); $j++)
211 :     {
212 : parrello 1.12 my @pegs = $subsystem->get_pegs_from_cell($i,$j);
213 : parrello 1.5 $pegH->{$i}->{$j} = [@pegs];
214 :     }
215 :     }
216 : overbeek 1.7
217 : parrello 1.12 # Get the row subsets. Row subsets are determined by the genome
218 :     # taxonomy, so these are different from the row subsets stored
219 :     # in the subsystem object.
220 :     my $row_subsets = &row_subsets($fig, $genomeH);
221 :     # Here we try to get a list of the active genomes. The caller
222 :     # gives us hints in the form of the "$focus" and "$active_subsetR"
223 :     # parameters. If, for example, we are using this object to generate
224 :     # a subsystem web page, the focus information would steer us to
225 :     # whatever the user wants to look at on the page.
226 : olson 1.13 my $active_genomes = &active_genomes($fig, $row_subsets,$active_subsetR,$focus,
227 : parrello 1.12 $genomeH,$genomes_info);
228 :    
229 :     # Now we generate a table of colors for the various PEGs. If the
230 :     # caller gave us a map of peg IDs to colors, we use that. Otherwise,
231 :     # we allow the option of painting the PEGs by cluster number. (The
232 :     # caller indicates this by setting the "show_clusters" flag in the
233 :     # parameter list.)
234 :     my $colorsH;
235 :     if ($peg_colors)
236 :     {
237 :     # Here the caller gave us a list of peg colors. The list contains
238 :     # 2-tuples, each consisting of a PEG ID followed by the
239 :     # color value. The loop below extracts the pairs and stuffs them
240 :     # into the color hash.
241 :     $colorsH = {};
242 :     foreach $_ (@$peg_colors)
243 :     {
244 :     my($peg,$color) = @$_;
245 :     $colorsH->{$peg} = $color;
246 :     }
247 :     }
248 :     elsif ($show_clusters)
249 :     {
250 :     # Here the user wants us to base the colors on the genome clustering
251 :     # information.
252 :     $colorsH = &set_colors($fig,$subsystem,$pegH,$active_genomes);
253 :     }
254 :     else
255 :     {
256 :     # Here the user is not interested in coloring the PEGs.
257 :     $colorsH = {};
258 :     }
259 :     # If the user wants to see aliases, compute the alias hash. Aliases
260 :     # will only be computed for PEGs belonging to active (highlighted)
261 :     # genomes, and there is a maximum of one alias per PEG.
262 : overbeek 1.7 my $aliasesH = $aliases ? &set_aliases($fig,$pegH,$active_genomes) : {};
263 : parrello 1.12 # Create and bless the UnvSubsys object.
264 : parrello 1.5 my $self = { Roles => $role_info,
265 :     RoleIndex => $roleH,
266 :     RoleSubsets => $subset_info,
267 :     Genomes => $genomes_info,
268 :     GenomeIndex => $genomeH,
269 : parrello 1.12 GenomeSubsets => $row_subsets,
270 : parrello 1.5 PegHash => $pegH,
271 :     Colors => $colorsH,
272 :     Aliases => $aliasesH,
273 :     Curator => $curator,
274 :     Notes => $notes,
275 :     Reactions => $reactions
276 :     };
277 :     bless($self, $class);
278 : parrello 1.12 # Return the object.
279 : parrello 1.5 return $self;
280 : overbeek 1.1 }
281 :     else
282 :     {
283 : parrello 1.12 # Here the FIG-like object was not recognized, so we return an
284 :     # undefined value.
285 : parrello 1.5 return undef;
286 : overbeek 1.1 }
287 :     }
288 :    
289 : parrello 1.12 =head3 get_subset_namesR
290 :    
291 :     C<< my @names = $unvsub->get_subset_namesR(); >>
292 :    
293 :     Return the names of the genome (row) subsets.
294 :    
295 :     =cut
296 :    
297 : overbeek 1.6 sub get_subset_namesR {
298 :     my($self) = @_;
299 :    
300 :     return map { $_->[0] } @{$self->{GenomeSubsets}};
301 :     }
302 :    
303 : parrello 1.12 =head3 get_subsetR
304 :    
305 :     C<< my @genomes = $unvsub->get_subsetR($set); >>
306 :    
307 :     Return a list of the genome IDs covered by a row subset.
308 :    
309 :     =over 4
310 :    
311 :     =item set
312 :    
313 :     Name of the row subset whose genomes are desired.
314 :    
315 :     =item RETURN
316 :    
317 :     Returns a list of the IDs for the genomes found in the specified row
318 :     set.
319 :    
320 :     =back
321 :    
322 :     =cut
323 :    
324 : overbeek 1.6 sub get_subsetR {
325 : parrello 1.12 # Get the parameters.
326 : overbeek 1.6 my($self,$set) = @_;
327 :     my($i);
328 : parrello 1.12 # Get the list of row subsets.
329 : overbeek 1.6 my $sets = $self->{GenomeSubsets};
330 : parrello 1.12 # Find the row subset with the specified name. The row subset list is a
331 :     # list of 2-tuples, and the first element of each tuple is the set
332 :     # name.
333 : overbeek 1.6 for ($i=0; ($i < @$sets) && ($sets->[$i]->[0] ne $set); $i++) {}
334 :     if ($i < @$sets)
335 :     {
336 : parrello 1.12 # Here we found the named subset. The subset tuple's second element is
337 :     # the list of row indices. We map these to genome IDs before returning
338 :     # them.
339 :     return map { $self->{Genomes}->[$_]->[0] } @{$sets->[$i]->[1]}
340 : overbeek 1.6 }
341 : parrello 1.12 # Here we subset was not found, so we return the undefined value.
342 : overbeek 1.6 return undef;
343 :     }
344 :    
345 : parrello 1.12 =head3 get_subsetR
346 :    
347 :     C<< my @pairs = $unvsub->get_subsetsR(); >>
348 :    
349 :     Return a list of all the row subsets. The subsets are returned in the form
350 :     of 2-tuples, each consisting of a subset name followed by a reference to a
351 :     list of genome IDs. The genome IDs correspond to the rows in the subset.
352 :    
353 :     =cut
354 :    
355 : overbeek 1.6 sub get_subsetsR {
356 : parrello 1.12 # Get the parameters.
357 : overbeek 1.6 my($self) = @_;
358 : parrello 1.12 # Extract the list of genome subsets. This list is in the form of
359 :     # 2-tuples, but the rows are identified by row index, not genome ID.
360 : overbeek 1.6 my $sets = $self->{GenomeSubsets};
361 : parrello 1.12 # Create the return list.
362 : overbeek 1.6 my @pairs = ();
363 : parrello 1.12 # Loop through the subsets.
364 : overbeek 1.6 my $pair;
365 :     foreach $pair (@$sets)
366 :     {
367 : parrello 1.12 # Convert this subset's member list from row indices to genome IDs
368 :     # and stash the result in the return list.
369 :     my($id,$members) = @$pair;
370 :     push(@pairs,[$id,[map { $self->{Genomes}->[$_]->[0] } @$members]]);
371 : overbeek 1.6 }
372 : parrello 1.12 # Return the list constructed.
373 : overbeek 1.6 return @pairs;
374 :     }
375 :    
376 : parrello 1.12 =head3 row_subsets
377 :    
378 :     C<< my $subsetList = UnvSubsys::row_subsets($fig, \%genomeH); >>
379 :    
380 :     This method computes the taxonomic row subsets for a subsystem. It takes
381 :     as input a hash that maps genome IDs to column indices and a FIG object.
382 :     The FIG object provides a list of taxonomic groups of 10 or more complete
383 :     genomes. From the list, we extract subsets which have more than 10
384 :     genomes in the list of subsystem genomes. If no such subsets exist,
385 :     we extract subsets which have at least 1 genome from the list of
386 :     subsystem genomes. The subsets are returned as 2-tuples, the first
387 :     element being the subset ID and the second being a list of row indices
388 :     for the genomes in the subset.
389 :    
390 :     =over 4
391 :    
392 :     =item fig
393 :    
394 :     A FIG-like object for accessing the data store.
395 :    
396 :     =item genomeH
397 :    
398 :     Reference to a hash that maps each genome ID to its row index in the
399 :     subsystem spreadsheet.
400 :    
401 :     =item RETURN
402 :    
403 :     Returns a reference to a list of 2-tuples. Each 2-tuple consists of a
404 :     subset ID followed by a list of the row indices for the genomes in the
405 :     subset.
406 :    
407 :     =back
408 :    
409 :     =cut
410 :    
411 : overbeek 1.6 sub row_subsets {
412 : parrello 1.12 my ($fig, $genomeH) = @_;
413 : overbeek 1.6
414 : parrello 1.12 # We need a real FIG object, since SFXlate does not yet support
415 :     # taxonomy trees. The "FIG" method does this for us.
416 :     $fig = $fig->FIG();
417 :     # Initialize the return value.
418 : overbeek 1.6 my $subsets = [];
419 : parrello 1.12 # Get a list of taxonomic groups. This will come back as a list of
420 :     # 2-tuples.
421 : overbeek 1.11 my $taxonomic_groups = $fig->taxonomic_groups_of_complete(5);
422 : parrello 1.12 # Loop through the 2-tuples. We're looking for subsets which
423 :     # contain at least one genome on the subsystem's spreadsheet.
424 : overbeek 1.6 my($pair,$id,$members);
425 :     foreach $pair (@$taxonomic_groups)
426 :     {
427 :     ($id,$members) = @$pair;
428 : parrello 1.12 # Extract the genomes in the member list that participate in this
429 :     # subsystem. To do this, we convert each genome ID to its row
430 :     # index. If no row index exists, the GREP condition discards the
431 :     # member.
432 :     my @mem = grep { defined($_) } map { $genomeH->{$_} } @$members;
433 :     # If there are enough members, save the subset.
434 :     if (@mem > 0)
435 :     {
436 :     push(@$subsets,[$id,[@mem]]);
437 :     }
438 : overbeek 1.6 }
439 : parrello 1.12 # Return the list of row subsets.
440 : overbeek 1.6 return $subsets;
441 :     }
442 :    
443 : parrello 1.12 =head3 set_aliases
444 :    
445 :     C<< my $aliasHash = UnvSubsys::set_aliases($fig, $pegH, $active_genomes); >>
446 :    
447 :     Return a hash mapping PEG IDs to aliases.
448 :    
449 :     =over 4
450 :    
451 :     =item fig
452 :    
453 :     FIG-like object that can be used to access the data store.
454 :    
455 :     =item pegH
456 :    
457 :     Reference to the spreadsheet hash table. Given a row index I<$row> and a
458 :     column index I<$col>,
459 :    
460 :     $pegH->{$row}->{$col}
461 :    
462 :     will return a reference to a list of PEGs in the specified spreadsheet cell.
463 :    
464 :     =item active_genomes
465 :    
466 :     Reference to a hash whose keys correspond to the spreadsheet row indices
467 :     of genomes that should be highlighted.
468 :    
469 :     =item RETURN
470 :    
471 :     Returns a hash that takes as input a PEG ID and returns an alias. Only PEGs
472 :     for active genomes will be processed.
473 :    
474 :     =back
475 :    
476 :     =cut
477 :    
478 : overbeek 1.4 sub set_aliases {
479 : parrello 1.12 # Get the parameters.
480 : overbeek 1.7 my($fig,$pegH,$active_genomes) = @_;
481 : overbeek 1.4 my($genomeI,$roleI,$pegs,$peg,$roleH);
482 :    
483 : parrello 1.12 # Create the return hash.
484 : overbeek 1.4 my $aliasesH = {};
485 :    
486 : parrello 1.12 # Loop through each row that corresponds to an active genome.
487 :     # The active genome list contains row indices, and there is
488 :     # one genome per row. Note that the genome ID is never used,
489 :     # only the row index.
490 : overbeek 1.7 foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))
491 : overbeek 1.4 {
492 : parrello 1.12 # Get the role hash for the specified genome. The role hash
493 :     # maps column indices (representing roles) to lists of PEGs.
494 : parrello 1.5 $roleH = $pegH->{$genomeI};
495 : parrello 1.12 # Loop through the role (column) indices.
496 : parrello 1.5 foreach $roleI (keys(%$roleH))
497 :     {
498 : parrello 1.12 # Get the PEG list for this row/column combination.
499 : parrello 1.5 $pegs = $roleH->{$roleI};
500 : parrello 1.12 # Only proceed if data was found in the cell.
501 :     if (defined $pegs) {
502 :     # Loop through the pegs in the cell.
503 :     foreach $peg (@$pegs)
504 : parrello 1.5 {
505 : parrello 1.12 # If we do not already have an alias for this PEG,
506 :     # compute one.
507 :     if (! $aliasesH->{$peg})
508 :     {
509 :     $aliasesH->{$peg} = scalar &ext_id($fig,$peg);
510 :     }
511 : parrello 1.5 }
512 :     }
513 :     }
514 : overbeek 1.4 }
515 : parrello 1.12 # Return the hash we built.
516 : overbeek 1.4 return $aliasesH;
517 :     }
518 :    
519 : parrello 1.12 =head3 set_colors
520 :    
521 :     C<< my $colorHash = UnvSubsys::set_colors($fig, $sub, \%pegH, \%active_genomes); >>
522 :    
523 :     Return a hash that maps each PEG in the subsystem spreadsheet to a display
524 :     color. Not all PEGs need to be mapped. Those that do not have a color
525 :     assigned will generally be displayed in white.
526 :    
527 :     =over 4
528 :    
529 :     =item fig
530 :    
531 :     FIG-like object that can be used to access the data store.
532 :    
533 :     =item sub
534 :    
535 :     Subsystem object for the current subsystem.
536 :    
537 :     =item pegH
538 :    
539 :     Reference to the spreadsheet hash table. Given a row index I<$row> and a
540 :     column index I<$col>,
541 :    
542 :     $pegH->{$row}->{$col}
543 :    
544 :     will return a reference to a list of PEGs in the specified spreadsheet cell.
545 :    
546 :     =item active_genomes
547 :    
548 :     Reference to a hash whose keys correspond to the spreadsheet row indices
549 :     of genomes that should be highlighted.
550 :    
551 :     =item RETURN
552 :    
553 :     Returns a hash that takes as input a PEG ID and returns a color. These colors
554 :     are used when displaying the PEGs in the subsystem spreadsheet. Only PEGs
555 :     for active genomes will be colored.
556 :    
557 :     =back
558 :    
559 :     =cut
560 :    
561 : overbeek 1.4 sub set_colors {
562 : parrello 1.12 # Get the parameters.
563 :     my($fig,$sub,$pegH,$active_genomes) = @_;
564 :    
565 :     my($genomeI,$roleI,$pegs,$peg,$roleH,%pegs_in_genome);
566 :     # Create the return hash.
567 : overbeek 1.4 my $colorsH = {};
568 : parrello 1.12 # Loop through the active genomes. The keys of "%$pegH" are the row indices
569 :     # for rows that have at least one occupied cell in the spreadsheet. The
570 :     # Grep then reduces this list to those rows that are highlighted.
571 : overbeek 1.7 foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))
572 : overbeek 1.4 {
573 : parrello 1.12 # We will use the following hash to compile a list of all the PEGs
574 :     # in the spreadsheet for the current genome, that is, the genome
575 :     # represented by row index "$genomeI".
576 : parrello 1.5 undef %pegs_in_genome;
577 : parrello 1.12 # Get the hash for the current row. This hash maps column indices to
578 :     # lists of pegs.
579 : parrello 1.5 $roleH = $pegH->{$genomeI};
580 : parrello 1.12 # Loop through the column indices for the specified row.
581 : parrello 1.5 foreach $roleI (keys(%$roleH))
582 :     {
583 : parrello 1.12 # Here we can finally get a list of the pegs in this spreadsheet
584 :     # cell. We loop through them, marking them in the "%pegs_in_genome"
585 :     # hash.
586 : parrello 1.5 $pegs = $roleH->{$roleI};
587 :     foreach $peg (@$pegs)
588 :     {
589 :     $pegs_in_genome{$peg} = 1;
590 :     }
591 :     }
592 : parrello 1.12 # Extract the "%pegs_in_genome" hash keys. This gives us a duplicate-free
593 :     # list of all the pegs for the current spreadsheet role.
594 : parrello 1.5 my @pegs = keys(%pegs_in_genome);
595 :     my($tuple,$peg,$color);
596 : parrello 1.12 # Get a hash that maps the PEG IDs to colors.
597 :     my $colors_for_one_genome = &set_colors_for_genome($fig,$sub, \@pegs);
598 :     # Loop through the hash we got back and assign the colors from that
599 :     # hash to the master hash we're returning to the caller.
600 : parrello 1.5 while (($peg,$color) = each %$colors_for_one_genome)
601 :     {
602 :     $colorsH->{$peg} = $colors_for_one_genome->{$peg};
603 :     }
604 : overbeek 1.4 }
605 : parrello 1.12 # Return the color hash.
606 : overbeek 1.4 return $colorsH;
607 :     }
608 :    
609 : parrello 1.12 =head3 set_colors_for_genome
610 :    
611 :     C<< my $colorHash = UnvSubsys::set_colors_for_genome($fig, $sub, \@pegs); >>
612 :    
613 :     Return a reference to a hash mapping the specified pegs to colors. PEGs that
614 :     are physically close to each other will be painted the same color.
615 :    
616 :     =over 4
617 :    
618 :     =item fig
619 :    
620 :     A fig-like object that can be used to access the data store.
621 :    
622 :     =item sub
623 :    
624 :     Subsystem object for the relevant subsystem.
625 : overbeek 1.4
626 : parrello 1.12 =item pegs
627 : overbeek 1.4
628 : parrello 1.12 Reference to a list of PEG IDs. All of the peg IDs should be for the
629 :     same genome.
630 : overbeek 1.4
631 : parrello 1.12 =item RETURN
632 : overbeek 1.4
633 : parrello 1.12 Returns a reference to a hash that maps each PEG ID to a color.
634 : overbeek 1.4
635 : parrello 1.12 =back
636 : overbeek 1.4
637 : parrello 1.12 =cut
638 : overbeek 1.4
639 : parrello 1.12 sub set_colors_for_genome {
640 :     # Get the parameters.
641 :     my($fig, $sub, $pegs) = @_;
642 :     # Default all the PEGs to white.
643 :     my %color_of = map { $_ => '#FFFFFF' } @$pegs;
644 :     # Divide the pegs into clusters.
645 :     my @clusters = $fig->compute_clusters($pegs, $sub);
646 :     # Get a list of useful colors.
647 :     my @colors = &cool_colors();
648 :     # If we have too many clusters, chop off the big ones at the end. These
649 :     # are least likely to be important.
650 :     if (@clusters > @colors) { splice(@clusters, 0, (@clusters - @colors)) }
651 :     # Loop through the clusters.
652 :     for my $cluster (@clusters) {
653 :     # Get the color for this cluster.
654 :     my $color = shift @colors;
655 :     # Loop through this cluster, putting this color into the color_of
656 :     # entries for each PEG.
657 :     for my $peg (@$cluster) {
658 :     $color_of{$peg} = $color;
659 : parrello 1.5 }
660 : overbeek 1.4 }
661 : parrello 1.12 # Return the color map.
662 :     return \%color_of;
663 : overbeek 1.4 }
664 :    
665 : parrello 1.12 =head3 cool_colors
666 :    
667 :     C<< my @colorList = UnvSubsys::cool_colors(); >>
668 :    
669 :     Return a list of web-safe colors.
670 :    
671 :     =cut
672 :    
673 : overbeek 1.4 sub cool_colors {
674 :     # just an array of "websafe" colors or whatever colors we want to use. Feel free to remove bad colors (hence the lines not being equal length!)
675 :     return (
676 :     '#C0C0C0', '#FF40C0', '#FF8040', '#FF0080', '#FFC040', '#40C0FF', '#40FFC0', '#C08080', '#C0FF00', '#00FF80', '#00C040',
677 :     "#6B8E23", "#483D8B", "#2E8B57", "#008000", "#006400", "#800000", "#00FF00", "#7FFFD4",
678 :     "#87CEEB", "#A9A9A9", "#90EE90", "#D2B48C", "#8DBC8F", "#D2691E", "#87CEFA", "#E9967A", "#FFE4C4", "#FFB6C1",
679 :     "#E0FFFF", "#FFA07A", "#DB7093", "#9370DB", "#008B8B", "#FFDEAD", "#DA70D6", "#DCDCDC", "#FF00FF", "#6A5ACD",
680 :     "#00FA9A", "#228B22", "#1E90FF", "#FA8072", "#CD853F", "#DC143C", "#FF6347", "#98FB98", "#4682B4",
681 :     "#D3D3D3", "#7B68EE", "#2F4F4F", "#FF7F50", "#FF69B4", "#BC8F8F", "#A0522D", "#DEB887", "#00DED1",
682 :     "#6495ED", "#800080", "#FFD700", "#F5DEB3", "#66CDAA", "#FF4500", "#4B0082", "#CD5C5C",
683 :     "#EE82EE", "#7CFC00", "#FFFF00", "#191970", "#FFFFE0", "#DDA0DD", "#00BFFF", "#DAA520", "#008080",
684 :     "#00FF7F", "#9400D3", "#BA55D3", "#D8BFD8", "#8B4513", "#3CB371", "#00008B", "#5F9EA0",
685 :     "#4169E1", "#20B2AA", "#8A2BE2", "#ADFF2F", "#556B2F",
686 :     "#F0FFFF", "#B0E0E6", "#FF1493", "#B8860B", "#FF0000", "#F08080", "#7FFF00", "#8B0000",
687 :     "#40E0D0", "#0000CD", "#48D1CC", "#8B008B", "#696969", "#AFEEEE", "#FF8C00", "#EEE8AA", "#A52A2A",
688 :     "#FFE4B5", "#B0C4DE", "#FAF0E6", "#9ACD32", "#B22222", "#FAFAD2", "#808080", "#0000FF",
689 :     "#000080", "#32CD32", "#FFFACD", "#9932CC", "#FFA500", "#F0E68C", "#E6E6FA", "#F4A460", "#C71585",
690 :     "#BDB76B", "#00FFFF", "#FFDAB9", "#ADD8E6", "#778899",
691 :     );
692 :     }
693 :    
694 : parrello 1.12 =head3 ext_id
695 :    
696 :     C<< my $externalID = UnvSubsys::ext_id($fig, $peg); >>
697 :    
698 :     or
699 :    
700 :     C<< my @externalIDs = UnvSubsys::ext_id($fig, $peg); >>
701 :    
702 :     Return a list of non-FIG IDs for the specified feature. In a scalar context, return
703 :     a single non-FIG ID for the specified feature.
704 :    
705 :     This method returns IDs that are all of the same type, that is, all UniProt IDs, or
706 :     all KEGG IDs, and so forth. To do this, it checks the feature's alias list for IDs
707 :     of a given type. If it finds at least one, then all IDs of that type are returned.
708 :     Highest priority is given to the UniProt IDs, then SP IDs, GI IDs, and finally
709 :     KEGG IDs.
710 :    
711 :     =over 4
712 :    
713 :     =item fig
714 :    
715 :     A FIG-like object for accessing the data store.
716 :    
717 :     =item peg
718 :    
719 :     ID of the feature whose aliases are desired.
720 :    
721 :     =item RETURN
722 :    
723 :     In list context, a list of non-FIG IDs for the feature that are all of the same
724 :     type. In scalar context, the first non-FIG ID for the feature of the
725 :     highest-priority available type.
726 :    
727 :     =back
728 :    
729 :     =cut
730 :    
731 : overbeek 1.4 sub ext_id {
732 :     my($fig,$peg) = @_;
733 :    
734 :     my @tmp;
735 :     my @aliases = $fig->feature_aliases($peg);
736 :     if ((@tmp = grep { $_ =~ /^uni\|/ } @aliases) > 0)
737 :     {
738 : parrello 1.5 @aliases = @tmp;
739 : overbeek 1.4 }
740 :     elsif ((@tmp = grep { $_ =~ /^sp\|/ } @aliases) > 0)
741 :     {
742 : parrello 1.5 @aliases = @tmp;
743 : overbeek 1.4 }
744 :     elsif ((@tmp = grep { $_ =~ /^gi\|/ } @aliases) > 0)
745 :     {
746 : parrello 1.5 @aliases = @tmp;
747 : overbeek 1.4 }
748 :     elsif ((@tmp = grep { $_ =~ /^kegg\|/ } @aliases) > 0)
749 :     {
750 : parrello 1.5 @aliases = @tmp;
751 : overbeek 1.4 }
752 :     else
753 :     {
754 : parrello 1.5 @aliases = ();
755 : overbeek 1.4 }
756 :    
757 :     if (wantarray())
758 :     {
759 : parrello 1.5 return @aliases;
760 : overbeek 1.4 }
761 :     else
762 :     {
763 : parrello 1.5 return $aliases[0];
764 : overbeek 1.4 }
765 :     }
766 :    
767 : parrello 1.12 =head3 subsystem_curator
768 :    
769 :     C<< my $name = $unvsub->subsystem_curator(); >>
770 :    
771 :     Return the name of the subsystem curator. The database stores user names as
772 :     C<master:>I<name>. This method strips off the C<master:> prefix before it
773 :     passes the result back to the caller.
774 :    
775 :     =cut
776 : overbeek 1.4
777 : overbeek 1.2 sub subsystem_curator {
778 :     my($self) = @_;
779 :    
780 :     my $curator = $self->{Curator};
781 :     $curator =~ s/master://;
782 :     return $curator;
783 :     }
784 :    
785 : parrello 1.12 =head3 get_roles
786 :    
787 :     C<< my @roles = $unvsub->get_roles(); >>
788 :    
789 :     Return a list of the roles (columns) for this subsystem. The roles will be
790 :     returned in column order, so that if you access the sixth element of the
791 :     return list, you'll get the name of the role for the sixth column.
792 :    
793 :     =cut
794 :    
795 : overbeek 1.2 sub get_roles {
796 :     my($self) = @_;
797 : parrello 1.12 # The role index in this object is a list of 3-tuples. The caller only
798 :     # wants the first element of each tuple, which is the role name.
799 : overbeek 1.2 return map { $_->[0] } @{$self->{Roles}};
800 :     }
801 :    
802 : parrello 1.12 =head3 get_genome_index
803 :    
804 :     C<< my $index = $unvsub->get_genome_index($genome); >>
805 :    
806 :     Return the row index of the specified genome.
807 :    
808 :     =over 4
809 :    
810 :     =item genome
811 :    
812 :     ID of the genome whose row index is desired.
813 :    
814 :     =item RETURN
815 :    
816 :     Returns the index of the row corresponding to the specified genome, or an
817 :     undefined value if the genome is not represented in the subsystem
818 :     spreadsheet.
819 :    
820 :     =back
821 :    
822 :     =cut
823 :    
824 : overbeek 1.3 sub get_genome_index {
825 :     my($self,$genome) = @_;
826 :    
827 :     return $self->{GenomeIndex}->{$genome};
828 :     }
829 :    
830 : parrello 1.12 =head3 get_genomes
831 :    
832 :     C<< my @genomes = $unvsub->get_genomes(); >>
833 :    
834 :     Return a list of the genome IDs for the subsystem. The genomes will be
835 :     presented in row order. In other words, if you index into the sixth
836 :     element of the return list, you will retrieve the ID of the genome for
837 :     the sixth row.
838 :    
839 :     =cut
840 :    
841 : overbeek 1.3 sub get_genomes {
842 :     my($self) = @_;
843 : parrello 1.12 # The genome array is a list of 2-tuples. We extract the first
844 :     # element of each tuple, which is the genome ID.
845 : overbeek 1.3 return map { $_->[0] } @{$self->{Genomes}};
846 :     }
847 :    
848 : parrello 1.12 =head3 get_variant_code
849 :    
850 :     C<< my $code = $unvsub->get_variant_code($genome); >>
851 :    
852 :     Return the variant code for a genome. Each subsystem has several variations.
853 :     The variant code indicates which variation of a subsystem is used by a
854 :     particular genome.
855 :    
856 :     Genome data is stored in a list of 2-tuples. The first element is the genome
857 :     ID; the second is the variant code.
858 :    
859 :     =over 4
860 :    
861 :     =item genome
862 :    
863 :     ID or row index of the genome whose variant code is desired.
864 :    
865 :     =item RETURN
866 :    
867 :     Returns the variant code for the specified genome.
868 :    
869 :     =back
870 :    
871 :     =cut
872 :    
873 : overbeek 1.3 sub get_variant_code {
874 :     my($self,$genome) = @_;
875 : parrello 1.12 # Check to see if we have a row index.
876 : overbeek 1.3 if ($genome =~ /^\d+$/)
877 :     {
878 : parrello 1.12 # Here we have a row index, so use it to find the genome's variant
879 :     # code.
880 : parrello 1.5 return $self->{Genomes}->[$genome]->[1];
881 : overbeek 1.3 }
882 :     else
883 :     {
884 : parrello 1.12 # Here we have a genome ID, so we need to convert it to a row index.
885 : parrello 1.5 my $genomeI = $self->{GenomeIndex}->{$genome};
886 :     return $self->{Genomes}->[$genomeI]->[1];
887 : overbeek 1.3 }
888 :     }
889 :    
890 : parrello 1.12 =head3 get_pegs_from_cell
891 :    
892 :     C<< my @pegs = $unvsub->get_pegs_from_cell($genome, $role); >>
893 :    
894 :     Return a list of the features in a specified spreadsheet cell. The cell is specified
895 :     by genome ID and role ID.
896 :    
897 :     =over 4
898 :    
899 :     =item genome
900 :    
901 :     ID of the genome relevant to the cell.
902 :    
903 :     =item role
904 :    
905 :     ID of the role relevant to the cell.
906 :    
907 :     =item RETURN
908 :    
909 :     Returns a list of the features in the cell, or an empty list if the cell is empty.
910 :    
911 :     =back
912 :    
913 :     =cut
914 :    
915 : overbeek 1.3 sub get_pegs_from_cell {
916 :     my($self,$genome,$role) = @_;
917 : parrello 1.12 # Convert the genome and role IDs to row and column indices.
918 : overbeek 1.3 my $genomeI = $self->{GenomeIndex}->{$genome};
919 :     my $roleI = $self->{RoleIndex}->{$role};
920 : parrello 1.12 # Get the pegs from the cell and return them.
921 : overbeek 1.3 my $pegs = $self->{PegHash}->{$genomeI}->{$roleI};
922 :     return $pegs ? @$pegs : ();
923 :     }
924 :    
925 :     sub get_notes {
926 :     my($self) = @_;
927 :    
928 :     return $self->{Notes};
929 :     }
930 :    
931 : overbeek 1.2 sub get_role_index {
932 :     my($self,$role) = @_;
933 :    
934 :     return $self->{RoleIndex}->{$role};
935 :     }
936 :    
937 :     sub get_role_abbr {
938 :     my($self,$roleI) = @_;
939 :    
940 :     if ($roleI !~ /^\d+$/)
941 :     {
942 : parrello 1.5 $roleI = $self->{RoleIndex}->{$roleI};
943 : overbeek 1.2 }
944 :     my $roles = $self->{Roles};
945 :     return $roles->[$roleI]->[1];
946 :     }
947 :    
948 :     sub get_reactions {
949 :     my($self) = @_;
950 :    
951 :     return $self->{Reactions};
952 :     }
953 :    
954 :     sub get_subset_namesC {
955 :     my($self) = @_;
956 :    
957 :     return map { $_->[0] } @{$self->{RoleSubsets}};
958 :     }
959 :    
960 :     sub get_subsetC_roles {
961 :     my($self,$subset) = @_;
962 :     my($i,$j);
963 :    
964 :     my $subset_info = $self->{RoleSubsets};
965 :     for ($i=0; ($i < @$subset_info) && ($subset_info->[$i]->[0] ne $subset); $i++) {}
966 :     if ($i < @$subset_info)
967 :     {
968 : parrello 1.5 my @roles = ();
969 :     foreach $j (@{$subset_info->[$i]->[1]})
970 :     {
971 :     push(@roles,$self->{Roles}->[$j]->[0]);
972 :     }
973 :     return @roles;
974 : overbeek 1.2 }
975 :     return undef;
976 :     }
977 :    
978 : overbeek 1.4 sub get_color_of {
979 :     my($self,$peg) = @_;
980 :    
981 :     return $self->{Colors}->{$peg};
982 :     }
983 :    
984 : parrello 1.12 =head3 active_genomes
985 :    
986 :     C<< my $activeHash = UnvSubsys::active_genomes(\@row_subsets, $active_subsetR, $focus, \%genomeH, \@genomes_info); >>
987 :    
988 :     Return a hash containing the active genomes for this subsystem display. The
989 :     keys of the hash will be the row indices of the genomes to be highlighted on the
990 :     display. Each genome ID will map to 1. Thus, if C<< $activeHash->{3} >>
991 :     tests TRUE, the fourth row should be highlighted.
992 :    
993 :     The rules for determining the active genomes are as follows. If I<$active_subsetR> is
994 :     specified, it is presumed to be the ID of the subset containing the active genomes.
995 :     If it is not specified and I<$focus> is specified, then I<$focus> is presumed to be the
996 :     ID of the genome currently in focus, and the active genomes will be the ones in the
997 :     smallest row subset containing the genome in focus. If neither I<$active_subsetR> nor
998 :     I<$focus> are specified, then all genomes are active.
999 :    
1000 :     =over 4
1001 :    
1002 :     =item row_subsets
1003 :    
1004 :     Reference to a list of 2-tuples. Each tuple consists of a row subset ID followed by
1005 :     a reference to a list of the row indices for the rows in the identified subset.
1006 :    
1007 :     =item active_subsetR (optional)
1008 :    
1009 :     ID of the active subset (if any).
1010 :    
1011 :     =item focus
1012 :    
1013 :     ID of the genome currently in focus (if any). If there is no active subset, then
1014 :     the smallest subset containing this genome will be made active.
1015 :    
1016 :     =item genomeH
1017 :    
1018 :     Reference to a hash of genome IDs to row indices. The keys of this hash are
1019 :     the genomes in this subsystem, which also form the subsystem spreadsheet's
1020 :     rows.
1021 :    
1022 :     =item genomes_info
1023 :    
1024 :     Reference to a list of 2-tuples. The first element of each 2-tuple is the
1025 :     ID of a genome; the second is the variant code for the subsystem variant
1026 :     used by the genome. The tuples are ordered by row index, so that the ID
1027 :     and variant code of the genome in a particular row can be located by indexing
1028 :     into this parameter using the subsystem spreadsheet row number.
1029 :    
1030 :     =item RETURN
1031 :    
1032 :     Returns a reference to a hash that maps the row indices of the active genomes
1033 :     to 1. This hash can be used to quickly determine whether or not a particular
1034 :     row is to be highlighted.
1035 :    
1036 :     =back
1037 :    
1038 :     =cut
1039 :    
1040 : overbeek 1.7 sub active_genomes {
1041 : parrello 1.12 # Get the parameters.
1042 :     my($fig, $row_subsets, $active_subsetR, $focus, $genomeH, $genomes_info) = @_;
1043 : overbeek 1.7 my($i,@bestL);
1044 : parrello 1.12 # Declare the return variable.
1045 : overbeek 1.7 my $active_genomes = {};
1046 : parrello 1.12 # Check for an active subset.
1047 : overbeek 1.7 if ($active_subsetR)
1048 :     {
1049 : parrello 1.12 # Search for the active subset in the row subset array.
1050 :     for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}
1051 :     if ($i < @$row_subsets)
1052 :     {
1053 :     # Here we found the active subset, so we extract its list of row indices.
1054 :     @bestL = @{$row_subsets->[$i]->[1]};
1055 :     }
1056 :     else {
1057 :     # Here we have the ID of the active subset. First, we search for that ID
1058 :     # in the row subset list.
1059 :     for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}
1060 :     if ($i < @$row_subsets)
1061 :     {
1062 :     # Here we found the named subset, so we return its member list.
1063 :     @bestL = @{$row_subsets->[$i]->[1]};
1064 :     }
1065 :     else
1066 :     {
1067 :     # Here the active subset does not exist. We punt by extracting a
1068 :     # list of all the row indices in the spreadsheet.
1069 :     $active_subsetR = 'All';
1070 :     @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1071 :     }
1072 :     }
1073 : overbeek 1.7 }
1074 : parrello 1.12 elsif ($focus)
1075 : overbeek 1.7 {
1076 : parrello 1.12 # Here we don't have an active row subset, but a particular genome is in
1077 :     # focus. We'll look for the smallest row subset containing the genome
1078 :     # in focus. First, we need to prime the loop. "$bestN" will be the ID
1079 :     # of the best subset found so far; "@bestL" is where we stash the list
1080 :     # of IDs in the subset. Our initial selection, then, will be the
1081 :     # fake "All" subset, which contains the entire collection of rows.
1082 :     if (! $fig->is_complete($focus))
1083 :     {
1084 :     # Here the gnome in focus is incomplete, so it won't be anywhere
1085 :     # in our list. We default to making everything active.
1086 :     $active_subsetR = 'All';
1087 :     @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1088 :     } else {
1089 :     my $bestN = "All";
1090 :     @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1091 :     # Next, we get the row index for the genome in focus.
1092 :     my $focusIndex = $genomeH->{$focus};
1093 :     # Now we loop through all the row subsets.
1094 :     my $tuple;
1095 :     foreach $tuple (@$row_subsets)
1096 :     {
1097 :     # Extract the subset ID and its list of row indices. The latter is
1098 :     # in "$genomeIs".
1099 :     my($id,$genomeIs) = @$tuple;
1100 :     # Search for the index of the focus row in the current subset's list
1101 :     # of row indices.
1102 :     for ($i=0; ($i < @$genomeIs) && ($genomeIs->[$i] != $focusIndex); $i++) {}
1103 :     # Now either $i will be the index of the focus row in the subset, or
1104 :     # it is going to be equal to the number of rows in the subset.
1105 :     if ($i < @$genomeIs)
1106 :     {
1107 :     # We've found the focus row in this subset. Select it as the new
1108 :     # best subset if it's smaller than the last one we found.
1109 :     if (@$genomeIs < @bestL)
1110 :     {
1111 :     $bestN = $id;
1112 :     @bestL = @$genomeIs;
1113 :     }
1114 :     }
1115 :     }
1116 :     # Save the best subset found as the active one.
1117 :     $active_subsetR = $bestN;
1118 :     }
1119 :     } else {
1120 :     # Here we have nothing: no active subset, and no focus row. We make
1121 :     # all rows active.
1122 :     $active_subsetR = 'All';
1123 :     @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1124 : overbeek 1.7 }
1125 : parrello 1.12 # "@bestL" now contains a list of the row indices for the active genomes.
1126 :     # We convert it to a hash and return it.
1127 : overbeek 1.7 my %active_genomes = map { $_ => 1 } @bestL;
1128 :     return \%active_genomes;
1129 :     }
1130 :    
1131 : olson 1.13 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3