[Bio] / FigKernelPackages / UnvSubsys.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/UnvSubsys.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.10, Tue Sep 27 00:58:27 2005 UTC revision 1.12, Wed Oct 12 02:47:56 2005 UTC
# Line 3  Line 3 
3  use Subsystem;  use Subsystem;
4  use Carp;  use Carp;
5  use FIG;  use FIG;
6    use SFXlate;
7    
8  use Data::Dumper;  use Data::Dumper;
9  use strict;  use strict;
10    use Tracer;
11    
12  sub new  =head1 Universal Subsystem Object
 {  
     my($class, $ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, $peg_colors) = @_;  
13    
14      $ssa =~ s/ /_/g;  =head2 Introduction
15    
16    The universal subsystem object provides methods used to display useful information
17    about a subsystem. Its intent is to support both Sprout and SEED.
18    
19          ### { Roles =>Roles,  The soul of a subsystem is its spreadsheet. The columns of the spreadsheet are
20          ###   RoleIndex => ToRoleIndexHash,  the subsystem roles. The rows are the genomes that contain the subsystem. PEGs
21          ###   RoleSubsets => ColSubsets,  are stored in the spreadsheet's cells. Each genome can be identified by genome
22          ###   Genomes => Genomes,  ID or row index. Each role can be identified by role name or column index.
         ###   GenomeIndex => ToGenomeIndexHash,  
         ###   PegHash => PegHash,  
         ###   Colors  => ColorHash,  
         ###   Aliases => AliasHash,  
         ###   Curator => Curator,  
         ###   Notes => Notes,  
         ###   Reactions => ReactionHash  
         ### }  
         ###  
         ### Roles = pointer to a list of [Role,Abbrev,[ReactionURLs]]  
         ###  
         ### ToRoleIndexHash = a pointer to a hash: key=Role Value=RoleIndex  
         ###  
         ### ColSubsets = pointer to a list of [SubsetName,[RoleIndexesFrom0]]  
         ###  
         ### RowSubSets = pointer to a list of [SubsetName,[GenomeIndexesFrom0]]  
         ###  
         ### Genomes is a pointer to a list of [Genome,Variant]  
         ###  
         ### ToGenomeIndexHash = a pointer to a hash: key=Genome value=GenomeIndex  
         ###  
         ### PegHash = a pointer to a hash of hashes such that $peg_hash->{$genome_index}->{$role_index} = a  
         ###           pointer to a list of PEGs  
         ###  
         ### ColorHash is a hash: key=PEG value=color  
         ###  
         ### AliasHash is a hash: key=PEG value=aliases  
         ###  
         ### ReactionHash is a hash: key=Role value=[reaction-ids]  
23    
24      if ((ref($fig) eq "FIG") || ((ref($fig) eq 'SFXlate') && ($fig = $fig->{'fig'})))  It is worth noting that the term I<subset> has two meanings-- a subset of roles
25      {  or a subset of genomes. A subset of roles is called a I<column subset>. A
26          my $subsystem = new Subsystem($ssa,$fig,0);  subset of genomes is called a I<row subset>.
27    
28    The object created contains a great deal of information, so it's worthwhile to
29    stop for a moment and discuss it. The object will have the following members.
30    
31    =over 4
32    
33    =item Roles
34    
35    A list of 3-tuples, each consisting of a role name, the abbreviated role name,
36    and a list of URLs for the reactions. Each role is a column in the
37    subsystem spreadsheet. Indexing into the list by column index yields the
38    ID, abbreviation, and reactions for the role in that column.
39    
40    =item RoleIndex
41    
42    A hash mapping each role to its column index in the spreadsheet.
43    
44    =item RoleSubsets
45    
46    A list of 2-tuples, each consisting of a subset name followed by a list of the
47    column indices for the roles in the subset.
48    
49    =item Genomes
50    
51    A list of 2-tuples, each containing the genome ID for the relevant row and the
52    variant code for that genome. Subsystems can have several variations, and the
53    variant code indicates which variant of the subsystem that the genome uses.
54    There is one genome for each row in the spreadsheet. Indexing into the list
55    yields the ID of the genome for that row and its variant code.
56    
57    =item GenomeIndex
58    
59    A hash mapping each genome ID to its row index in the spreadsheet.
60    
61    =item PegHash
62    
63    A hash of hashes containing the spreadsheet cells. If C<$pegHash> is the hash
64    of hashes, C<$row> is a genome index, and C<$col> is a role index, then
65    
66        $pegHash->{$row}->{$col}
67    
68    returns a reference to a list of the IDs for the PEGs in the relevant
69    spreadsheet cell.
70    
71    =item ColorHash
72    
73    A hash mapping each PEG ID to the color that should be used to represent that
74    PEG in the display.
75    
76    =item AliasHash
77    
78    A hash mapping each PEG ID to a list of its aliases.
79    
80    =item ReactionHash
81    
82    A hash mapping each role to the list of reactions it catalyzes.
83    
84    =back
85    
86    =head2 Public Methods
87    
88    =head3 new
89    
90    C<< my $usub = UnvSubsys->new($ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, \@peg_colors); >>
91    
92    Construct a new universal subsystem object for a specified subsystem.
93    
94    =over 4
95    
96    =item ssa
97    
98    Name of the subsystem.
99    
100    =item fig
101    
102    Data access object, either a FIG object or an SFXlate object.
103    
104    =item active_subsetR
105    
106    Name of the active row subset. A row subset names a group of genomes.
107    
108    =item focus
109    
110    ID of the genome currently in focus.
111    
112    =item show_clusters
113    
114    TRUE if clusters should be painted by color, else FALSE.
115    
116    =item aliases
117    
118    TRUE if PEG aliases should be shown, else FALSE.
119    
120    =item peg_colors (optional)
121    
122    Reference to a list of 2-tuples, each 2-tuple consisting of a PEG ID and the color
123    to be assigned to the PEG.
124    
125    =back
126    
127    =cut
128    
129    sub new {
130        # Get the parameters.
131        my($class, $ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, $peg_colors) = @_;
132        # Fix the subsystem name. Spaces are replaced by underscores to avoid naming problems
133        # in the seed's file system.
134        $ssa =~ s/ /_/g;
135        # Get the FIG object. At this point in time, we're only getting data from the SEED. On
136        # a future pass through the code, we'll get data from a SEED or Sprout.
137        if ((ref($fig) eq "FIG") || (ref($fig) eq 'SFXlate')) {
138            # Create a subsystem object. The "get_subsystem" method is provided by both FIG
139            # and SFXlate; however, the object returned in each case is different: FIG
140            # returns a "Subsystem" object; SFXlate returns a "SproutSubsys" object.
141            my $subsystem = $fig->get_subsystem($ssa);
142            # Get the key subsystem data. Note that CRs in the notes are converted to LFs
143            # in case the notes come from a Mac.
144          my $curator = $subsystem->get_curator;          my $curator = $subsystem->get_curator;
145          my $notes = $subsystem->get_notes;          my $notes = $subsystem->get_notes;
146          $notes =~ s/ /\n/g;          $notes =~ s/\r/\n/g;
147          my @roles = $subsystem->get_roles;          my @roles = $subsystem->get_roles;
148          my $reactions = $subsystem->get_reactions;          my $reactions = $subsystem->get_reactions;
149          my @genomes = $subsystem->get_genomes;          my @genomes = $subsystem->get_genomes;
150          my @col_subsets = $subsystem->get_subset_namesC;          my @col_subsets = $subsystem->get_subset_namesC;
151            # Create the data structures for the role list and the role index.
152          my $role_info = [];          my $role_info = [];
153          my $roleH     = {};          my $roleH     = {};
154            # Loop through the roles to create the role list. The $i index will move
155            # through the columns of the spreadsheet.
156          my($i,$j,$subset,$peg);          my($i,$j,$subset,$peg);
157          for ($i=0; ($i < @roles); $i++)          for ($i=0; ($i < @roles); $i++)
158          {          {
159                # Get the ID of the role in the current column.
160              my $role = $roles[$i];              my $role = $roles[$i];
161              my $abbrev = $subsystem->get_role_abbr( $subsystem->get_role_index( $role ) );              # Extract its abbreviation.
162                my $abbrev = $subsystem->get_role_abbr($i);
163                # Get the reactions. Note that its possible no reactions were found. If there were
164                # reactions found, however, we convert from reaction IDs to a comma-delimited
165                # list of HTML code full of hyperlinks.
166              my $react = $reactions ? join(",", map { &HTML::reaction_link($_) } @{$reactions->{$role}}) : [];              my $react = $reactions ? join(",", map { &HTML::reaction_link($_) } @{$reactions->{$role}}) : [];
167                # Form the role name, abbreviation, and reaction list into a 3-tuple and push
168                # them onto the main role list.
169              push(@$role_info,[$role,$abbrev,$react]);              push(@$role_info,[$role,$abbrev,$react]);
170                # Set the role hash so that we can get back the column index for a given
171                # role name.
172              $roleH->{$role} = $i;              $roleH->{$role} = $i;
173          }          }
174            # Get the column subsets. A column subset is a list of role IDs, so we need to
175            # convert it to a set of column indices. We ignore the special "All" subset
176            # that contains everything.
177          my $subset_info = [];          my $subset_info = [];
178          foreach $subset (@col_subsets)          foreach $subset (@col_subsets)
179          {          {
# Line 80  Line 182 
182                  push(@$subset_info,[$subset,[map { $roleH->{$_} } $subsystem->get_subsetC_roles($subset)]]);                  push(@$subset_info,[$subset,[map { $roleH->{$_} } $subsystem->get_subsetC_roles($subset)]]);
183              }              }
184          }          }
185            # Now we create the genome directories. For each genome we need to be able to
186            # hash from the genomeID to the specified role index and we must be able to
187            # get the genome's variant code. (The variant code indicates which subsystem
188            # variant the genome uses.
189          my $genomes_info = [];          my $genomes_info = [];
190          my $genomeH      = {};          my $genomeH      = {};
191          for ($i=0; ($i < @genomes); $i++)          for ($i=0; ($i < @genomes); $i++)
192          {          {
193                # Get the genome ID for this row.
194              my $genome  = $genomes[$i];              my $genome  = $genomes[$i];
195              my $variant = $subsystem->get_variant_code( $subsystem->get_genome_index( $genome ) );              # Get its variant code.
196                my $variant = $subsystem->get_variant_code($i);
197                # Form them into a 2-tuple and add the result to the genome list.
198              push(@$genomes_info,[$genome,$variant]);              push(@$genomes_info,[$genome,$variant]);
199                # Set up the hash to get from the genome ID to the row index.
200              $genomeH->{$genome} = $i;              $genomeH->{$genome} = $i;
201          }          }
202    
203            # Next we gather the data from the actual spreadsheet cells. For an SFXlate
204            # object, this is the most expensive part of the operation, since it requires
205            # a database call for each cell.
206          my $pegH = {};          my $pegH = {};
207            # $i is the row index, which cycles through genomes.
208          for ($i=0; ($i < @genomes); $i++)          for ($i=0; ($i < @genomes); $i++)
209          {          {
210              my $genome = $genomes[$i];              # $j is the column index, which cycles through roles.
211              for ($j=0; ($j < @roles); $j++)              for ($j=0; ($j < @roles); $j++)
212              {              {
213                  my $role = $roles[$j];                  my @pegs = $subsystem->get_pegs_from_cell($i,$j);
                 my @pegs = $subsystem->get_pegs_from_cell($genome,$role);  
214                  $pegH->{$i}->{$j} = [@pegs];                  $pegH->{$i}->{$j} = [@pegs];
215              }              }
216          }          }
217    
218          my $row_subsets    = &row_subsets($fig,$genomeH,$genomes_info);          # Get the row subsets. Row subsets are determined by the genome
219          my $active_genomes = &active_genomes($row_subsets,$active_subsetR,$focus,$genomeH,$genomes_info);          # taxonomy, so these are different from the row subsets stored
220            # in the subsystem object.
221            my $row_subsets    = &row_subsets($fig, $genomeH);
222            # Here we try to get a list of the active genomes. The caller
223            # gives us hints in the form of the "$focus" and "$active_subsetR"
224            # parameters. If, for example, we are using this object to generate
225            # a subsystem web page, the focus information would steer us to
226            # whatever the user wants to look at on the page.
227            my $active_genomes = &active_genomes($row_subsets,$active_subsetR,$focus,
228                                                 $genomeH,$genomes_info);
229    
230            # Now we generate a table of colors for the various PEGs. If the
231            # caller gave us a map of peg IDs to colors, we use that. Otherwise,
232            # we allow the option of painting the PEGs by cluster number. (The
233            # caller indicates this by setting the "show_clusters" flag in the
234            # parameter list.)
235          my $colorsH;          my $colorsH;
236          if ($peg_colors)          if ($peg_colors)
237          {          {
238                # Here the caller gave us a list of peg colors. The list contains
239                # 2-tuples, each consisting of a PEG ID followed by the
240                # color value. The loop below extracts the pairs and stuffs them
241                # into the color hash.
242              $colorsH = {};              $colorsH = {};
243              foreach $_ (@$peg_colors)              foreach $_ (@$peg_colors)
244              {              {
# Line 118  Line 248 
248          }          }
249          elsif ($show_clusters)          elsif ($show_clusters)
250          {          {
251              $colorsH  = &set_colors($fig,$pegH,$active_genomes);              # Here the user wants us to base the colors on the genome clustering
252                # information.
253                $colorsH  = &set_colors($fig,$subsystem,$pegH,$active_genomes);
254          }          }
255          else          else
256          {          {
257                # Here the user is not interested in coloring the PEGs.
258              $colorsH = {};              $colorsH = {};
259          }          }
260            # If the user wants to see aliases, compute the alias hash. Aliases
261            # will only be computed for PEGs belonging to active (highlighted)
262            # genomes, and there is a maximum of one alias per PEG.
263          my $aliasesH = $aliases ? &set_aliases($fig,$pegH,$active_genomes) : {};          my $aliasesH = $aliases ? &set_aliases($fig,$pegH,$active_genomes) : {};
264          my $reactions = $subsystem->get_reactions;          # Create and bless the UnvSubsys object.
265          my $self = { Roles => $role_info,          my $self = { Roles => $role_info,
266                       RoleIndex => $roleH,                       RoleIndex => $roleH,
267                       RoleSubsets => $subset_info,                       RoleSubsets => $subset_info,
# Line 141  Line 276 
276                       Reactions => $reactions                       Reactions => $reactions
277                     };                     };
278          bless($self, $class);          bless($self, $class);
279            # Return the object.
280          return $self;          return $self;
281      }      }
282      else      else
283      {      {
284            # Here the FIG-like object was not recognized, so we return an
285            # undefined value.
286          return undef;          return undef;
287      }      }
288  }  }
289    
290    =head3 get_subset_namesR
291    
292    C<< my @names = $unvsub->get_subset_namesR(); >>
293    
294    Return the names of the genome (row) subsets.
295    
296    =cut
297    
298  sub get_subset_namesR {  sub get_subset_namesR {
299      my($self) = @_;      my($self) = @_;
300    
301      return map { $_->[0] } @{$self->{GenomeSubsets}};      return map { $_->[0] } @{$self->{GenomeSubsets}};
302  }  }
303    
304    =head3 get_subsetR
305    
306    C<< my @genomes = $unvsub->get_subsetR($set); >>
307    
308    Return a list of the genome IDs covered by a row subset.
309    
310    =over 4
311    
312    =item set
313    
314    Name of the row subset whose genomes are desired.
315    
316    =item RETURN
317    
318    Returns a list of the IDs for the genomes found in the specified row
319    set.
320    
321    =back
322    
323    =cut
324    
325  sub get_subsetR {  sub get_subsetR {
326        # Get the parameters.
327      my($self,$set) = @_;      my($self,$set) = @_;
328      my($i);      my($i);
329        # Get the list of row subsets.
330      my $sets = $self->{GenomeSubsets};      my $sets = $self->{GenomeSubsets};
331        # Find the row subset with the specified name. The row subset list is a
332        # list of 2-tuples, and the first element of each tuple is the set
333        # name.
334      for ($i=0; ($i < @$sets) && ($sets->[$i]->[0] ne $set); $i++) {}      for ($i=0; ($i < @$sets) && ($sets->[$i]->[0] ne $set); $i++) {}
335      if ($i < @$sets)      if ($i < @$sets)
336      {      {
337            # Here we found the named subset. The subset tuple's second element is
338            # the list of row indices. We map these to genome IDs before returning
339            # them.
340          return map { $self->{Genomes}->[$_]->[0] } @{$sets->[$i]->[1]}          return map { $self->{Genomes}->[$_]->[0] } @{$sets->[$i]->[1]}
341      }      }
342        # Here we subset was not found, so we return the undefined value.
343      return undef;      return undef;
344  }  }
345    
346    =head3 get_subsetR
347    
348    C<< my @pairs = $unvsub->get_subsetsR(); >>
349    
350    Return a list of all the row subsets. The subsets are returned in the form
351    of 2-tuples, each consisting of a subset name followed by a reference to a
352    list of genome IDs. The genome IDs correspond to the rows in the subset.
353    
354    =cut
355    
356  sub get_subsetsR {  sub get_subsetsR {
357        # Get the parameters.
358      my($self) = @_;      my($self) = @_;
359        # Extract the list of genome subsets. This list is in the form of
360        # 2-tuples, but the rows are identified by row index, not genome ID.
361      my $sets = $self->{GenomeSubsets};      my $sets = $self->{GenomeSubsets};
362        # Create the return list.
363      my @pairs = ();      my @pairs = ();
364        # Loop through the subsets.
365      my $pair;      my $pair;
366      foreach $pair (@$sets)      foreach $pair (@$sets)
367      {      {
368            # Convert this subset's member list from row indices to genome IDs
369            # and stash the result in the return list.
370          my($id,$members) = @$pair;          my($id,$members) = @$pair;
371          push(@pairs,[$id,[map { $self->{Genomes}->[$_]->[0] } @$members]]);          push(@pairs,[$id,[map { $self->{Genomes}->[$_]->[0] } @$members]]);
372      }      }
373        # Return the list constructed.
374      return @pairs;      return @pairs;
375  }  }
376    
377    =head3 row_subsets
378    
379    C<< my $subsetList = UnvSubsys::row_subsets($fig, \%genomeH); >>
380    
381    This method computes the taxonomic row subsets for a subsystem. It takes
382    as input a hash that maps genome IDs to column indices and a FIG object.
383    The FIG object provides a list of taxonomic groups of 10 or more complete
384    genomes. From the list, we extract subsets which have more than 10
385    genomes in the list of subsystem genomes. If no such subsets exist,
386    we extract subsets which have at least 1 genome from the list of
387    subsystem genomes. The subsets are returned as 2-tuples, the first
388    element being the subset ID and the second being a list of row indices
389    for the genomes in the subset.
390    
391    =over 4
392    
393    =item fig
394    
395    A FIG-like object for accessing the data store.
396    
397    =item genomeH
398    
399    Reference to a hash that maps each genome ID to its row index in the
400    subsystem spreadsheet.
401    
402    =item RETURN
403    
404    Returns a reference to a list of 2-tuples. Each 2-tuple consists of a
405    subset ID followed by a list of the row indices for the genomes in the
406    subset.
407    
408    =back
409    
410    =cut
411    
412  sub row_subsets {  sub row_subsets {
413      my ($fig,$genomeH,$genomes_info) = @_;      my ($fig, $genomeH) = @_;
414    
415        # We need a real FIG object, since SFXlate does not yet support
416        # taxonomy trees. The "FIG" method does this for us.
417        $fig = $fig->FIG();
418        # Initialize the return value.
419      my $subsets = [];      my $subsets = [];
420      my $taxonomic_groups = $fig->taxonomic_groups_of_complete(10);      # Get a list of taxonomic groups. This will come back as a list of
421        # 2-tuples.
422        my $taxonomic_groups = $fig->taxonomic_groups_of_complete(5);
423        # Loop through the 2-tuples. We're looking for subsets which
424        # contain at least one genome on the subsystem's spreadsheet.
425      my($pair,$id,$members);      my($pair,$id,$members);
426      foreach $pair (@$taxonomic_groups)      foreach $pair (@$taxonomic_groups)
427      {      {
428          ($id,$members) = @$pair;          ($id,$members) = @$pair;
429            # Extract the genomes in the member list that participate in this
430            # subsystem. To do this, we convert each genome ID to its row
431            # index. If no row index exists, the GREP condition discards the
432            # member.
433          my @mem = grep { defined($_) } map { $genomeH->{$_} } @$members;          my @mem = grep { defined($_) } map { $genomeH->{$_} } @$members;
434            # If there are enough members, save the subset.
435          if (@mem > 0)          if (@mem > 0)
436          {          {
437              push(@$subsets,[$id,[@mem]]);              push(@$subsets,[$id,[@mem]]);
438          }          }
439      }      }
440        # Return the list of row subsets.
441      return $subsets;      return $subsets;
442  }  }
443    
444    =head3 set_aliases
445    
446    C<< my $aliasHash = UnvSubsys::set_aliases($fig, $pegH, $active_genomes); >>
447    
448    Return a hash mapping PEG IDs to aliases.
449    
450    =over 4
451    
452    =item fig
453    
454    FIG-like object that can be used to access the data store.
455    
456    =item pegH
457    
458    Reference to the spreadsheet hash table. Given a row index I<$row> and a
459    column index I<$col>,
460    
461        $pegH->{$row}->{$col}
462    
463    will return a reference to a list of PEGs in the specified spreadsheet cell.
464    
465    =item active_genomes
466    
467    Reference to a hash whose keys correspond to the spreadsheet row indices
468    of genomes that should be highlighted.
469    
470    =item RETURN
471    
472    Returns a hash that takes as input a PEG ID and returns an alias. Only PEGs
473    for active genomes will be processed.
474    
475    =back
476    
477    =cut
478    
479  sub set_aliases {  sub set_aliases {
480        # Get the parameters.
481      my($fig,$pegH,$active_genomes) = @_;      my($fig,$pegH,$active_genomes) = @_;
482      my($genomeI,$roleI,$pegs,$peg,$roleH);      my($genomeI,$roleI,$pegs,$peg,$roleH);
483    
484        # Create the return hash.
485      my $aliasesH = {};      my $aliasesH = {};
486    
487        # Loop through each row that corresponds to an active genome.
488        # The active genome list contains row indices, and there is
489        # one genome per row. Note that the genome ID is never used,
490        # only the row index.
491      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))
492      {      {
493            # Get the role hash for the specified genome. The role hash
494            # maps column indices (representing roles) to lists of PEGs.
495          $roleH = $pegH->{$genomeI};          $roleH = $pegH->{$genomeI};
496            # Loop through the role (column) indices.
497          foreach $roleI (keys(%$roleH))          foreach $roleI (keys(%$roleH))
498          {          {
499                # Get the PEG list for this row/column combination.
500              $pegs = $roleH->{$roleI};              $pegs = $roleH->{$roleI};
501                # Only proceed if data was found in the cell.
502                if (defined $pegs) {
503                    # Loop through the pegs in the cell.
504              foreach $peg (@$pegs)              foreach $peg (@$pegs)
505              {              {
506                        # If we do not already have an alias for this PEG,
507                        # compute one.
508                  if (! $aliasesH->{$peg})                  if (! $aliasesH->{$peg})
509                  {                  {
510                      $aliasesH->{$peg} = scalar &ext_id($fig,$peg);                      $aliasesH->{$peg} = scalar &ext_id($fig,$peg);
# Line 222  Line 512 
512              }              }
513          }          }
514      }      }
515        }
516        # Return the hash we built.
517      return $aliasesH;      return $aliasesH;
518  }  }
519    
520    =head3 set_colors
521    
522    C<< my $colorHash = UnvSubsys::set_colors($fig, $sub, \%pegH, \%active_genomes); >>
523    
524    Return a hash that maps each PEG in the subsystem spreadsheet to a display
525    color. Not all PEGs need to be mapped. Those that do not have a color
526    assigned will generally be displayed in white.
527    
528    =over 4
529    
530    =item fig
531    
532    FIG-like object that can be used to access the data store.
533    
534    =item sub
535    
536    Subsystem object for the current subsystem.
537    
538    =item pegH
539    
540    Reference to the spreadsheet hash table. Given a row index I<$row> and a
541    column index I<$col>,
542    
543        $pegH->{$row}->{$col}
544    
545    will return a reference to a list of PEGs in the specified spreadsheet cell.
546    
547    =item active_genomes
548    
549    Reference to a hash whose keys correspond to the spreadsheet row indices
550    of genomes that should be highlighted.
551    
552    =item RETURN
553    
554    Returns a hash that takes as input a PEG ID and returns a color. These colors
555    are used when displaying the PEGs in the subsystem spreadsheet. Only PEGs
556    for active genomes will be colored.
557    
558    =back
559    
560    =cut
561    
562  sub set_colors {  sub set_colors {
563      my($fig,$pegH,$active_genomes) = @_;      # Get the parameters.
564      my($genomeI,$roleI,$pegs,$peg,$roleH,$peg,%pegs_in_genome);      my($fig,$sub,$pegH,$active_genomes) = @_;
565    
566        my($genomeI,$roleI,$pegs,$peg,$roleH,%pegs_in_genome);
567        # Create the return hash.
568      my $colorsH = {};      my $colorsH = {};
569        # Loop through the active genomes. The keys of "%$pegH" are the row indices
570        # for rows that have at least one occupied cell in the spreadsheet. The
571        # Grep then reduces this list to those rows that are highlighted.
572      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))
573      {      {
574            # We will use the following hash to compile a list of all the PEGs
575            # in the spreadsheet for the current genome, that is, the genome
576            # represented by row index "$genomeI".
577          undef %pegs_in_genome;          undef %pegs_in_genome;
578            # Get the hash for the current row. This hash maps column indices to
579            # lists of pegs.
580          $roleH = $pegH->{$genomeI};          $roleH = $pegH->{$genomeI};
581            # Loop through the column indices for the specified row.
582          foreach $roleI (keys(%$roleH))          foreach $roleI (keys(%$roleH))
583          {          {
584                # Here we can finally get a list of the pegs in this spreadsheet
585                # cell. We loop through them, marking them in the "%pegs_in_genome"
586                # hash.
587              $pegs = $roleH->{$roleI};              $pegs = $roleH->{$roleI};
588              foreach $peg (@$pegs)              foreach $peg (@$pegs)
589              {              {
590                  $pegs_in_genome{$peg} = 1;                  $pegs_in_genome{$peg} = 1;
591              }              }
592          }          }
593            # Extract the "%pegs_in_genome" hash keys. This gives us a duplicate-free
594            # list of all the pegs for the current spreadsheet role.
595          my @pegs = keys(%pegs_in_genome);          my @pegs = keys(%pegs_in_genome);
596          my($tuple,$peg,$color);          my($tuple,$peg,$color);
597          my $colors_for_one_genome = &set_colors_for_genome($fig,\@pegs);          # Get a hash that maps the PEG IDs to colors.
598            my $colors_for_one_genome = &set_colors_for_genome($fig,$sub, \@pegs);
599            # Loop through the hash we got back and assign the colors from that
600            # hash to the master hash we're returning to the caller.
601          while (($peg,$color) = each %$colors_for_one_genome)          while (($peg,$color) = each %$colors_for_one_genome)
602          {          {
603              $colorsH->{$peg} = $colors_for_one_genome->{$peg};              $colorsH->{$peg} = $colors_for_one_genome->{$peg};
604          }          }
605      }      }
606        # Return the color hash.
607      return $colorsH;      return $colorsH;
608  }  }
609    
610  sub set_colors_for_genome {  =head3 set_colors_for_genome
     my($fig,$pegs) = @_;  
     my($peg,@clusters,$cluster,@colors,$color,%seen,%conn,$x,$peg1,@pegs,$i);  
611    
612      my $color_of = {};  C<< my $colorHash = UnvSubsys::set_colors_for_genome($fig, $sub, \@pegs); >>
     foreach $peg (@$pegs) { $color_of->{$peg} = '#FFFFFF' }  
613    
614      @pegs = keys(%$color_of);  #  Use of keys makes @pegs entries unique  Return a reference to a hash mapping the specified pegs to colors. PEGs that
615    are physically close to each other will be painted the same color.
616    
617      foreach $peg (@pegs)  =over 4
     {  
         $conn{$peg} = [grep { $color_of->{$_} && ($_ ne $peg) } $fig->close_genes($peg,5000)];  
     }  
618    
619      @clusters = ();  =item fig
     while ($peg = shift @pegs)  
     {  
         if (! $seen{$peg})  
         {  
             $cluster = [$peg];  
             $seen{$peg} = 1;  
             for ($i=0; ($i < @$cluster); $i++)  
             {  
                 my @tmp = grep { ! $seen{$_} } @{$conn{$cluster->[$i]}};  
                 if (@tmp > 0)  
                 {  
                     foreach my $peg1 (@tmp) { $seen{$peg1} = 1 }  
                     push(@$cluster,@tmp);  
                 }  
             }  
             push(@clusters,$cluster);  
         }  
     }  
620    
621      @colors =  &cool_colors();  A fig-like object that can be used to access the data store.
622    
623      @clusters = grep { @$_ > 1 } sort { @$a <=> @$b } @clusters;  =item sub
624    
625      if (@clusters > @colors) { splice(@clusters,0,(@clusters - @colors)) }  # make sure we have enough colors  Subsystem object for the relevant subsystem.
626    
627      my($cluster);  =item pegs
628      foreach $cluster (@clusters)  
629      {  Reference to a list of PEG IDs. All of the peg IDs should be for the
630          $color = shift @colors;  same genome.
631          foreach $peg (@$cluster)  
632          {  =item RETURN
633              $color_of->{$peg} = $color;  
634          }  Returns a reference to a hash that maps each PEG ID to a color.
635      }  
636      return $color_of;  =back
637    
638    =cut
639    
640    sub set_colors_for_genome {
641        # Get the parameters.
642        my($fig, $sub, $pegs) = @_;
643        # Default all the PEGs to white.
644        my %color_of = map { $_ => '#FFFFFF' } @$pegs;
645        # Divide the pegs into clusters.
646        my @clusters = $fig->compute_clusters($pegs, $sub);
647        # Get a list of useful colors.
648        my @colors =  &cool_colors();
649        # If we have too many clusters, chop off the big ones at the end. These
650        # are least likely to be important.
651        if (@clusters > @colors) { splice(@clusters, 0, (@clusters - @colors)) }
652        # Loop through the clusters.
653        for my $cluster (@clusters) {
654            # Get the color for this cluster.
655            my $color = shift @colors;
656            # Loop through this cluster, putting this color into the color_of
657            # entries for each PEG.
658            for my $peg (@$cluster) {
659                $color_of{$peg} = $color;
660            }
661        }
662        # Return the color map.
663        return \%color_of;
664  }  }
665    
666    =head3 cool_colors
667    
668    C<< my @colorList = UnvSubsys::cool_colors(); >>
669    
670    Return a list of web-safe colors.
671    
672    =cut
673    
674  sub cool_colors {  sub cool_colors {
675   # just an array of "websafe" colors or whatever colors we want to use. Feel free to remove bad colors (hence the lines not being equal length!)   # just an array of "websafe" colors or whatever colors we want to use. Feel free to remove bad colors (hence the lines not being equal length!)
676   return (   return (
# Line 327  Line 692 
692   );   );
693  }  }
694    
695    =head3 ext_id
696    
697    C<< my $externalID = UnvSubsys::ext_id($fig, $peg); >>
698    
699    or
700    
701    C<< my @externalIDs = UnvSubsys::ext_id($fig, $peg); >>
702    
703    Return a list of non-FIG IDs for the specified feature. In a scalar context, return
704    a single non-FIG ID for the specified feature.
705    
706    This method returns IDs that are all of the same type, that is, all UniProt IDs, or
707    all KEGG IDs, and so forth. To do this, it checks the feature's alias list for IDs
708    of a given type. If it finds at least one, then all IDs of that type are returned.
709    Highest priority is given to the UniProt IDs, then SP IDs, GI IDs, and finally
710    KEGG IDs.
711    
712    =over 4
713    
714    =item fig
715    
716    A FIG-like object for accessing the data store.
717    
718    =item peg
719    
720    ID of the feature whose aliases are desired.
721    
722    =item RETURN
723    
724    In list context, a list of non-FIG IDs for the feature that are all of the same
725    type. In scalar context, the first non-FIG ID for the feature of the
726    highest-priority available type.
727    
728    =back
729    
730    =cut
731    
732  sub ext_id {  sub ext_id {
733      my($fig,$peg) = @_;      my($fig,$peg) = @_;
734    
# Line 363  Line 765 
765      }      }
766  }  }
767    
768    =head3 subsystem_curator
769    
770    C<< my $name = $unvsub->subsystem_curator(); >>
771    
772    Return the name of the subsystem curator. The database stores user names as
773    C<master:>I<name>. This method strips off the C<master:> prefix before it
774    passes the result back to the caller.
775    
776    =cut
777    
778  sub subsystem_curator {  sub subsystem_curator {
779      my($self) = @_;      my($self) = @_;
# Line 372  Line 783 
783      return $curator;      return $curator;
784  }  }
785    
786    =head3 get_roles
787    
788    C<< my @roles = $unvsub->get_roles(); >>
789    
790    Return a list of the roles (columns) for this subsystem. The roles will be
791    returned in column order, so that if you access the sixth element of the
792    return list, you'll get the name of the role for the sixth column.
793    
794    =cut
795    
796  sub get_roles {  sub get_roles {
797      my($self) = @_;      my($self) = @_;
798        # The role index in this object is a list of 3-tuples. The caller only
799        # wants the first element of each tuple, which is the role name.
800      return map { $_->[0] } @{$self->{Roles}};      return map { $_->[0] } @{$self->{Roles}};
801  }  }
802    
803    =head3 get_genome_index
804    
805    C<< my $index = $unvsub->get_genome_index($genome); >>
806    
807    Return the row index of the specified genome.
808    
809    =over 4
810    
811    =item genome
812    
813    ID of the genome whose row index is desired.
814    
815    =item RETURN
816    
817    Returns the index of the row corresponding to the specified genome, or an
818    undefined value if the genome is not represented in the subsystem
819    spreadsheet.
820    
821    =back
822    
823    =cut
824    
825  sub get_genome_index {  sub get_genome_index {
826      my($self,$genome) = @_;      my($self,$genome) = @_;
827    
828      return $self->{GenomeIndex}->{$genome};      return $self->{GenomeIndex}->{$genome};
829  }  }
830    
831    =head3 get_genomes
832    
833    C<< my @genomes = $unvsub->get_genomes(); >>
834    
835    Return a list of the genome IDs for the subsystem. The genomes will be
836    presented in row order. In other words, if you index into the sixth
837    element of the return list, you will retrieve the ID of the genome for
838    the sixth row.
839    
840    =cut
841    
842  sub get_genomes {  sub get_genomes {
843      my($self) = @_;      my($self) = @_;
844        # The genome array is a list of 2-tuples. We extract the first
845        # element of each tuple, which is the genome ID.
846      return map { $_->[0] } @{$self->{Genomes}};      return map { $_->[0] } @{$self->{Genomes}};
847  }  }
848    
849    =head3 get_variant_code
850    
851    C<< my $code = $unvsub->get_variant_code($genome); >>
852    
853    Return the variant code for a genome. Each subsystem has several variations.
854    The variant code indicates which variation of a subsystem is used by a
855    particular genome.
856    
857    Genome data is stored in a list of 2-tuples. The first element is the genome
858    ID; the second is the variant code.
859    
860    =over 4
861    
862    =item genome
863    
864    ID or row index of the genome whose variant code is desired.
865    
866    =item RETURN
867    
868    Returns the variant code for the specified genome.
869    
870    =back
871    
872    =cut
873    
874  sub get_variant_code {  sub get_variant_code {
875      my($self,$genome) = @_;      my($self,$genome) = @_;
876        # Check to see if we have a row index.
877      if ($genome =~ /^\d+$/)      if ($genome =~ /^\d+$/)
878      {      {
879            # Here we have a row index, so use it to find the genome's variant
880            # code.
881          return $self->{Genomes}->[$genome]->[1];          return $self->{Genomes}->[$genome]->[1];
882      }      }
883      else      else
884      {      {
885            # Here we have a genome ID, so we need to convert it to a row index.
886          my $genomeI = $self->{GenomeIndex}->{$genome};          my $genomeI = $self->{GenomeIndex}->{$genome};
887          return $self->{Genomes}->[$genomeI]->[1];          return $self->{Genomes}->[$genomeI]->[1];
888      }      }
889  }  }
890    
891    =head3 get_pegs_from_cell
892    
893    C<< my @pegs = $unvsub->get_pegs_from_cell($genome, $role); >>
894    
895    Return a list of the features in a specified spreadsheet cell. The cell is specified
896    by genome ID and role ID.
897    
898    =over 4
899    
900    =item genome
901    
902    ID of the genome relevant to the cell.
903    
904    =item role
905    
906    ID of the role relevant to the cell.
907    
908    =item RETURN
909    
910    Returns a list of the features in the cell, or an empty list if the cell is empty.
911    
912    =back
913    
914    =cut
915    
916  sub get_pegs_from_cell {  sub get_pegs_from_cell {
917      my($self,$genome,$role) = @_;      my($self,$genome,$role) = @_;
918        # Convert the genome and role IDs to row and column indices.
919      my $genomeI = $self->{GenomeIndex}->{$genome};      my $genomeI = $self->{GenomeIndex}->{$genome};
920      my $roleI   = $self->{RoleIndex}->{$role};      my $roleI   = $self->{RoleIndex}->{$role};
921        # Get the pegs from the cell and return them.
922      my $pegs    = $self->{PegHash}->{$genomeI}->{$roleI};      my $pegs    = $self->{PegHash}->{$genomeI}->{$roleI};
923      return $pegs ? @$pegs : ();      return $pegs ? @$pegs : ();
924  }  }
# Line 473  Line 982 
982      return $self->{Colors}->{$peg};      return $self->{Colors}->{$peg};
983  }  }
984    
985    =head3 active_genomes
986    
987    C<< my $activeHash = UnvSubsys::active_genomes(\@row_subsets, $active_subsetR, $focus, \%genomeH, \@genomes_info); >>
988    
989    Return a hash containing the active genomes for this subsystem display. The
990    keys of the hash will be the row indices of the genomes to be highlighted on the
991    display. Each genome ID will map to 1. Thus, if C<< $activeHash->{3} >>
992    tests TRUE, the fourth row should be highlighted.
993    
994    The rules for determining the active genomes are as follows. If I<$active_subsetR> is
995    specified, it is presumed to be the ID of the subset containing the active genomes.
996    If it is not specified and I<$focus> is specified, then I<$focus> is presumed to be the
997    ID of the genome currently in focus, and the active genomes will be the ones in the
998    smallest row subset containing the genome in focus. If neither I<$active_subsetR> nor
999    I<$focus> are specified, then all genomes are active.
1000    
1001    =over 4
1002    
1003    =item row_subsets
1004    
1005    Reference to a list of 2-tuples. Each tuple consists of a row subset ID followed by
1006    a reference to a list of the row indices for the rows in the identified subset.
1007    
1008    =item active_subsetR (optional)
1009    
1010    ID of the active subset (if any).
1011    
1012    =item focus
1013    
1014    ID of the genome currently in focus (if any). If there is no active subset, then
1015    the smallest subset containing this genome will be made active.
1016    
1017    =item genomeH
1018    
1019    Reference to a hash of genome IDs to row indices. The keys of this hash are
1020    the genomes in this subsystem, which also form the subsystem spreadsheet's
1021    rows.
1022    
1023    =item genomes_info
1024    
1025    Reference to a list of 2-tuples. The first element of each 2-tuple is the
1026    ID of a genome; the second is the variant code for the subsystem variant
1027    used by the genome. The tuples are ordered by row index, so that the ID
1028    and variant code of the genome in a particular row can be located by indexing
1029    into this parameter using the subsystem spreadsheet row number.
1030    
1031    =item RETURN
1032    
1033    Returns a reference to a hash that maps the row indices of the active genomes
1034    to 1. This hash can be used to quickly determine whether or not a particular
1035    row is to be highlighted.
1036    
1037    =back
1038    
1039    =cut
1040    
1041  sub active_genomes {  sub active_genomes {
1042      my($row_subsets,$active_subsetR,$focus,$genomeH,$genomes_info) = @_;      # Get the parameters.
1043        my($fig, $row_subsets, $active_subsetR, $focus, $genomeH, $genomes_info) = @_;
1044      my($i,@bestL);      my($i,@bestL);
1045        # Declare the return variable.
1046      my $active_genomes = {};      my $active_genomes = {};
1047        # Check for an active subset.
1048      if ($active_subsetR)      if ($active_subsetR)
1049      {      {
1050            # Search for the active subset in the row subset array.
1051            for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}
1052            if ($i < @$row_subsets)
1053            {
1054                # Here we found the active subset, so we extract its list of row indices.
1055                @bestL = @{$row_subsets->[$i]->[1]};
1056            }
1057            else {
1058                # Here we have the ID of the active subset. First, we search for that ID
1059                # in the row subset list.
1060          for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}          for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}
1061          if ($i < @$row_subsets)          if ($i < @$row_subsets)
1062          {          {
1063                    # Here we found the named subset, so we return its member list.
1064              @bestL = @{$row_subsets->[$i]->[1]};              @bestL = @{$row_subsets->[$i]->[1]};
1065          }          }
1066          else          else
1067          {          {
1068                    # Here the active subset does not exist. We punt by extracting a
1069                    # list of all the row indices in the spreadsheet.
1070              $active_subsetR = 'All';              $active_subsetR = 'All';
1071              @bestL = map { $genomeH->{$_} } keys(%$genomeH);              @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1072          }          }
1073      }      }
1074      else      }
1075        elsif ($focus)
1076      {      {
1077          if (! $focus)          # Here we don't have an active row subset, but a particular genome is in
1078            # focus. We'll look for the smallest row subset containing the genome
1079            # in focus. First, we need to prime the loop. "$bestN" will be the ID
1080            # of the best subset found so far; "@bestL" is where we stash the list
1081            # of IDs in the subset. Our initial selection, then, will be the
1082            # fake "All" subset, which contains the entire collection of rows.
1083            if (! $fig->is_complete($focus))
1084          {          {
1085                # Here the gnome in focus is incomplete, so it won't be anywhere
1086                # in our list. We default to making everything active.
1087              $active_subsetR = 'All';              $active_subsetR = 'All';
1088              @bestL = map { $genomeH->{$_} } keys(%$genomeH);              @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1089          }          } else {
1090          else              my $bestN   = "All";
1091          {              @bestL   = map { $genomeH->{$_} } keys(%$genomeH);
1092              my $bestN   = undef;              # Next, we get the row index for the genome in focus.
1093              @bestL   = ();              my $focusIndex = $genomeH->{$focus};
1094                # Now we loop through all the row subsets.
1095              my $tuple;              my $tuple;
1096              foreach $tuple (@$row_subsets)              foreach $tuple (@$row_subsets)
1097              {              {
1098                    # Extract the subset ID and its list of row indices. The latter is
1099                    # in "$genomeIs".
1100                  my($id,$genomeIs) = @$tuple;                  my($id,$genomeIs) = @$tuple;
1101                  for ($i=0; ($i < @$genomeIs) && ($genomes_info->[$genomeIs->[$i]]->[0] ne $focus); $i++) {}                  # Search for the index of the focus row in the current subset's list
1102                    # of row indices.
1103                    for ($i=0; ($i < @$genomeIs) && ($genomeIs->[$i] != $focusIndex); $i++) {}
1104                    # Now either $i will be the index of the focus row in the subset, or
1105                    # it is going to be equal to the number of rows in the subset.
1106                  if ($i < @$genomeIs)                  if ($i < @$genomeIs)
1107                  {                  {
1108                      if ((! $bestN) || (@$genomeIs < @bestL))                      # We've found the focus row in this subset. Select it as the new
1109                        # best subset if it's smaller than the last one we found.
1110                        if (@$genomeIs < @bestL)
1111                      {                      {
1112                          $bestN  = $id;                          $bestN  = $id;
1113                          @bestL = @$genomeIs;                          @bestL = @$genomeIs;
1114                      }                      }
1115                  }                  }
1116              }              }
1117                # Save the best subset found as the active one.
1118              $active_subsetR = $bestN;              $active_subsetR = $bestN;
1119          }          }
1120        } else {
1121            # Here we have nothing: no active subset, and no focus row. We make
1122            # all rows active.
1123            $active_subsetR = 'All';
1124            @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1125      }      }
1126        # "@bestL" now contains a list of the row indices for the active genomes.
1127        # We convert it to a hash and return it.
1128      my %active_genomes = map { $_ => 1 } @bestL;      my %active_genomes = map { $_ => 1 } @bestL;
1129      return \%active_genomes;      return \%active_genomes;
1130  }  }
1131    
1132  1;  1;
   
   

Legend:
Removed from v.1.10  
changed lines
  Added in v.1.12

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3