[Bio] / FigKernelPackages / UnvSubsys.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/UnvSubsys.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.11, Wed Sep 28 17:21:54 2005 UTC revision 1.15, Mon Dec 5 19:06:30 2005 UTC
# Line 1  Line 1 
1    #
2    # Copyright (c) 2003-2006 University of Chicago and Fellowship
3    # for Interpretations of Genomes. All Rights Reserved.
4    #
5    # This file is part of the SEED Toolkit.
6    #
7    # The SEED Toolkit is free software. You can redistribute
8    # it and/or modify it under the terms of the SEED Toolkit
9    # Public License.
10    #
11    # You should have received a copy of the SEED Toolkit Public License
12    # along with this program; if not write to the University of Chicago
13    # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14    # Genomes at veronika@thefig.info or download a copy from
15    # http://www.theseed.org/LICENSE.TXT.
16    #
17    
18  package UnvSubsys;  package UnvSubsys;
19    
20  use Subsystem;  use Subsystem;
21  use Carp;  use Carp;
22  use FIG;  use FIG;
23    use SFXlate;
24  use Data::Dumper;  use Data::Dumper;
25  use strict;  use strict;
26    use Tracer;
27    
28  sub new  =head1 Universal Subsystem Object
 {  
     my($class, $ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, $peg_colors) = @_;  
29    
30      $ssa =~ s/ /_/g;  =head2 Introduction
31    
32    The universal subsystem object provides methods used to display useful information
33    about a subsystem. Its intent is to support both Sprout and SEED.
34    
35          ### { Roles =>Roles,  The soul of a subsystem is its spreadsheet. The columns of the spreadsheet are
36          ###   RoleIndex => ToRoleIndexHash,  the subsystem roles. The rows are the genomes that contain the subsystem. PEGs
37          ###   RoleSubsets => ColSubsets,  are stored in the spreadsheet's cells. Each genome can be identified by genome
38          ###   Genomes => Genomes,  ID or row index. Each role can be identified by role name or column index.
         ###   GenomeIndex => ToGenomeIndexHash,  
         ###   PegHash => PegHash,  
         ###   Colors  => ColorHash,  
         ###   Aliases => AliasHash,  
         ###   Curator => Curator,  
         ###   Notes => Notes,  
         ###   Reactions => ReactionHash  
         ### }  
         ###  
         ### Roles = pointer to a list of [Role,Abbrev,[ReactionURLs]]  
         ###  
         ### ToRoleIndexHash = a pointer to a hash: key=Role Value=RoleIndex  
         ###  
         ### ColSubsets = pointer to a list of [SubsetName,[RoleIndexesFrom0]]  
         ###  
         ### RowSubSets = pointer to a list of [SubsetName,[GenomeIndexesFrom0]]  
         ###  
         ### Genomes is a pointer to a list of [Genome,Variant]  
         ###  
         ### ToGenomeIndexHash = a pointer to a hash: key=Genome value=GenomeIndex  
         ###  
         ### PegHash = a pointer to a hash of hashes such that $peg_hash->{$genome_index}->{$role_index} = a  
         ###           pointer to a list of PEGs  
         ###  
         ### ColorHash is a hash: key=PEG value=color  
         ###  
         ### AliasHash is a hash: key=PEG value=aliases  
         ###  
         ### ReactionHash is a hash: key=Role value=[reaction-ids]  
39    
40      if ((ref($fig) eq "FIG") || ((ref($fig) eq 'SFXlate') && ($fig = $fig->{'fig'})))  It is worth noting that the term I<subset> has two meanings-- a subset of roles
41      {  or a subset of genomes. A subset of roles is called a I<column subset>. A
42          my $subsystem = new Subsystem($ssa,$fig,0);  subset of genomes is called a I<row subset>.
43    
44    The object created contains a great deal of information, so it's worthwhile to
45    stop for a moment and discuss it. The object will have the following members.
46    
47    =over 4
48    
49    =item Roles
50    
51    A list of 3-tuples, each consisting of a role name, the abbreviated role name,
52    and a list of URLs for the reactions. Each role is a column in the
53    subsystem spreadsheet. Indexing into the list by column index yields the
54    ID, abbreviation, and reactions for the role in that column.
55    
56    =item RoleIndex
57    
58    A hash mapping each role to its column index in the spreadsheet.
59    
60    =item RoleSubsets
61    
62    A list of 2-tuples, each consisting of a subset name followed by a list of the
63    column indices for the roles in the subset.
64    
65    =item Genomes
66    
67    A list of 2-tuples, each containing the genome ID for the relevant row and the
68    variant code for that genome. Subsystems can have several variations, and the
69    variant code indicates which variant of the subsystem that the genome uses.
70    There is one genome for each row in the spreadsheet. Indexing into the list
71    yields the ID of the genome for that row and its variant code.
72    
73    =item GenomeIndex
74    
75    A hash mapping each genome ID to its row index in the spreadsheet.
76    
77    =item PegHash
78    
79    A hash of hashes containing the spreadsheet cells. If C<$pegHash> is the hash
80    of hashes, C<$row> is a genome index, and C<$col> is a role index, then
81    
82        $pegHash->{$row}->{$col}
83    
84    returns a reference to a list of the IDs for the PEGs in the relevant
85    spreadsheet cell.
86    
87    =item ColorHash
88    
89    A hash mapping each PEG ID to the color that should be used to represent that
90    PEG in the display.
91    
92    =item AliasHash
93    
94    A hash mapping each PEG ID to a list of its aliases.
95    
96    =item ReactionHash
97    
98    A hash mapping each role to the list of reactions it catalyzes.
99    
100    =back
101    
102    =head2 Public Methods
103    
104    =head3 new
105    
106    C<< my $usub = UnvSubsys->new($ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, \@peg_colors); >>
107    
108    Construct a new universal subsystem object for a specified subsystem.
109    
110    =over 4
111    
112    =item ssa
113    
114    Name of the subsystem.
115    
116    =item fig
117    
118    Data access object, either a FIG object or an SFXlate object.
119    
120    =item active_subsetR
121    
122    Name of the active row subset. A row subset names a group of genomes.
123    
124    =item focus
125    
126    ID of the genome currently in focus.
127    
128    =item show_clusters
129    
130    TRUE if clusters should be painted by color, else FALSE.
131    
132    =item aliases
133    
134    TRUE if PEG aliases should be shown, else FALSE.
135    
136    =item peg_colors (optional)
137    
138    Reference to a list of 2-tuples, each 2-tuple consisting of a PEG ID and the color
139    to be assigned to the PEG.
140    
141    =back
142    
143    =cut
144    
145    sub new {
146        # Get the parameters.
147        my($class, $ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, $peg_colors) = @_;
148        # Fix the subsystem name. Spaces are replaced by underscores to avoid naming problems
149        # in the seed's file system.
150        $ssa =~ s/ /_/g;
151        # Get the FIG object. At this point in time, we're only getting data from the SEED. On
152        # a future pass through the code, we'll get data from a SEED or Sprout.
153        if ((ref($fig) eq "FIG") || (ref($fig) eq 'SFXlate')) {
154            # Create a subsystem object. The "get_subsystem" method is provided by both FIG
155            # and SFXlate; however, the object returned in each case is different: FIG
156            # returns a "Subsystem" object; SFXlate returns a "SproutSubsys" object.
157            my $subsystem = $fig->get_subsystem($ssa);
158            # Get the key subsystem data. Note that CRs in the notes are converted to LFs
159            # in case the notes come from a Mac.
160          my $curator = $subsystem->get_curator;          my $curator = $subsystem->get_curator;
161          my $notes = $subsystem->get_notes;          my $notes = $subsystem->get_notes;
162          $notes =~ s/ /\n/g;          $notes =~ s/\r/\n/g;
163          my @roles = $subsystem->get_roles;          my @roles = $subsystem->get_roles;
164          my $reactions = $subsystem->get_reactions;          my $reactions = $subsystem->get_reactions;
165          my @genomes = $subsystem->get_genomes;          my @genomes = $subsystem->get_genomes;
166          my @col_subsets = $subsystem->get_subset_namesC;          my @col_subsets = $subsystem->get_subset_namesC;
167            my @diagrams = $subsystem->get_diagrams();
168            # Create the data structures for the role list and the role index.
169          my $role_info = [];          my $role_info = [];
170          my $roleH     = {};          my $roleH     = {};
171            # Loop through the roles to create the role list. The $i index will move
172            # through the columns of the spreadsheet.
173          my($i,$j,$subset,$peg);          my($i,$j,$subset,$peg);
174          for ($i=0; ($i < @roles); $i++)          for ($i=0; ($i < @roles); $i++)
175          {          {
176                # Get the ID of the role in the current column.
177              my $role = $roles[$i];              my $role = $roles[$i];
178              my $abbrev = $subsystem->get_role_abbr( $subsystem->get_role_index( $role ) );              # Extract its abbreviation.
179                my $abbrev = $subsystem->get_role_abbr($i);
180                # Get the reactions. Note that its possible no reactions were found. If there were
181                # reactions found, however, we convert from reaction IDs to a comma-delimited
182                # list of HTML code full of hyperlinks.
183              my $react = $reactions ? join(",", map { &HTML::reaction_link($_) } @{$reactions->{$role}}) : [];              my $react = $reactions ? join(",", map { &HTML::reaction_link($_) } @{$reactions->{$role}}) : [];
184                # Form the role name, abbreviation, and reaction list into a 3-tuple and push
185                # them onto the main role list.
186              push(@$role_info,[$role,$abbrev,$react]);              push(@$role_info,[$role,$abbrev,$react]);
187                # Set the role hash so that we can get back the column index for a given
188                # role name.
189              $roleH->{$role} = $i;              $roleH->{$role} = $i;
190          }          }
191            # Get the column subsets. A column subset is a list of role IDs, so we need to
192            # convert it to a set of column indices. We ignore the special "All" subset
193            # that contains everything.
194          my $subset_info = [];          my $subset_info = [];
195          foreach $subset (@col_subsets)          foreach $subset (@col_subsets)
196          {          {
# Line 80  Line 199 
199                  push(@$subset_info,[$subset,[map { $roleH->{$_} } $subsystem->get_subsetC_roles($subset)]]);                  push(@$subset_info,[$subset,[map { $roleH->{$_} } $subsystem->get_subsetC_roles($subset)]]);
200              }              }
201          }          }
202            # Now we create the genome directories. For each genome we need to be able to
203            # hash from the genomeID to the specified role index and we must be able to
204            # get the genome's variant code. (The variant code indicates which subsystem
205            # variant the genome uses.
206          my $genomes_info = [];          my $genomes_info = [];
207          my $genomeH      = {};          my $genomeH      = {};
208          for ($i=0; ($i < @genomes); $i++)          for ($i=0; ($i < @genomes); $i++)
209          {          {
210                # Get the genome ID for this row.
211              my $genome  = $genomes[$i];              my $genome  = $genomes[$i];
212              my $variant = $subsystem->get_variant_code( $subsystem->get_genome_index( $genome ) );              # Get its variant code.
213                my $variant = $subsystem->get_variant_code($i);
214                # Form them into a 2-tuple and add the result to the genome list.
215              push(@$genomes_info,[$genome,$variant]);              push(@$genomes_info,[$genome,$variant]);
216                # Set up the hash to get from the genome ID to the row index.
217              $genomeH->{$genome} = $i;              $genomeH->{$genome} = $i;
218          }          }
219    
220            # Next we gather the data from the actual spreadsheet cells. For an SFXlate
221            # object, this is the most expensive part of the operation, since it requires
222            # a database call for each cell.
223          my $pegH = {};          my $pegH = {};
224            # $i is the row index, which cycles through genomes.
225          for ($i=0; ($i < @genomes); $i++)          for ($i=0; ($i < @genomes); $i++)
226          {          {
227              my $genome = $genomes[$i];              # $j is the column index, which cycles through roles.
228              for ($j=0; ($j < @roles); $j++)              for ($j=0; ($j < @roles); $j++)
229              {              {
230                  my $role = $roles[$j];                  my @pegs = $subsystem->get_pegs_from_cell($i,$j);
                 my @pegs = $subsystem->get_pegs_from_cell($genome,$role);  
231                  $pegH->{$i}->{$j} = [@pegs];                  $pegH->{$i}->{$j} = [@pegs];
232              }              }
233          }          }
234    
235          my $row_subsets    = &row_subsets($fig,$genomeH,$genomes_info);          # Get the row subsets. Row subsets are determined by the genome
236          my $active_genomes = &active_genomes($fig,$row_subsets,$active_subsetR,$focus,$genomeH,$genomes_info);          # taxonomy, so these are different from the row subsets stored
237            # in the subsystem object.
238            my $row_subsets    = &row_subsets($fig, $genomeH);
239            # Here we try to get a list of the active genomes. The caller
240            # gives us hints in the form of the "$focus" and "$active_subsetR"
241            # parameters. If, for example, we are using this object to generate
242            # a subsystem web page, the focus information would steer us to
243            # whatever the user wants to look at on the page.
244            my $active_genomes = &active_genomes($fig, $row_subsets,$active_subsetR,$focus,
245                                                 $genomeH,$genomes_info);
246    
247            # Now we generate a table of colors for the various PEGs. If the
248            # caller gave us a map of peg IDs to colors, we use that. Otherwise,
249            # we allow the option of painting the PEGs by cluster number. (The
250            # caller indicates this by setting the "show_clusters" flag in the
251            # parameter list.)
252          my $colorsH;          my $colorsH;
253          if ($peg_colors)          if ($peg_colors)
254          {          {
255                # Here the caller gave us a list of peg colors. The list contains
256                # 2-tuples, each consisting of a PEG ID followed by the
257                # color value. The loop below extracts the pairs and stuffs them
258                # into the color hash.
259              $colorsH = {};              $colorsH = {};
260              foreach $_ (@$peg_colors)              foreach $_ (@$peg_colors)
261              {              {
# Line 118  Line 265 
265          }          }
266          elsif ($show_clusters)          elsif ($show_clusters)
267          {          {
268              $colorsH  = &set_colors($fig,$pegH,$active_genomes);              # Here the user wants us to base the colors on the genome clustering
269                # information.
270                $colorsH  = &set_colors($fig,$subsystem,$pegH,$active_genomes);
271          }          }
272          else          else
273          {          {
274                # Here the user is not interested in coloring the PEGs.
275              $colorsH = {};              $colorsH = {};
276          }          }
277            # If the user wants to see aliases, compute the alias hash. Aliases
278            # will only be computed for PEGs belonging to active (highlighted)
279            # genomes, and there is a maximum of one alias per PEG.
280          my $aliasesH = $aliases ? &set_aliases($fig,$pegH,$active_genomes) : {};          my $aliasesH = $aliases ? &set_aliases($fig,$pegH,$active_genomes) : {};
281          my $reactions = $subsystem->get_reactions;          # Create and bless the UnvSubsys object.
282          my $self = { Roles => $role_info,          my $self = { Roles => $role_info,
283                       RoleIndex => $roleH,                       RoleIndex => $roleH,
284                       RoleSubsets => $subset_info,                       RoleSubsets => $subset_info,
# Line 138  Line 290 
290                       Aliases => $aliasesH,                       Aliases => $aliasesH,
291                       Curator => $curator,                       Curator => $curator,
292                       Notes => $notes,                       Notes => $notes,
293                       Reactions => $reactions                       Reactions => $reactions,
294                         Diagrams => \@diagrams,
295                     };                     };
296          bless($self, $class);          bless($self, $class);
297            # Return the object.
298          return $self;          return $self;
299      }      }
300      else      else
301      {      {
302            # Here the FIG-like object was not recognized, so we return an
303            # undefined value.
304          return undef;          return undef;
305      }      }
306  }  }
307    
308    =head3 get_subset_namesR
309    
310    C<< my @names = $unvsub->get_subset_namesR(); >>
311    
312    Return the names of the genome (row) subsets.
313    
314    =cut
315    
316  sub get_subset_namesR {  sub get_subset_namesR {
317      my($self) = @_;      my($self) = @_;
318    
319      return map { $_->[0] } @{$self->{GenomeSubsets}};      return map { $_->[0] } @{$self->{GenomeSubsets}};
320  }  }
321    
322    =head3 get_subsetR
323    
324    C<< my @genomes = $unvsub->get_subsetR($set); >>
325    
326    Return a list of the genome IDs covered by a row subset.
327    
328    =over 4
329    
330    =item set
331    
332    Name of the row subset whose genomes are desired.
333    
334    =item RETURN
335    
336    Returns a list of the IDs for the genomes found in the specified row
337    set.
338    
339    =back
340    
341    =cut
342    
343  sub get_subsetR {  sub get_subsetR {
344        # Get the parameters.
345      my($self,$set) = @_;      my($self,$set) = @_;
346      my($i);      my($i);
347        # Get the list of row subsets.
348      my $sets = $self->{GenomeSubsets};      my $sets = $self->{GenomeSubsets};
349        # Find the row subset with the specified name. The row subset list is a
350        # list of 2-tuples, and the first element of each tuple is the set
351        # name.
352      for ($i=0; ($i < @$sets) && ($sets->[$i]->[0] ne $set); $i++) {}      for ($i=0; ($i < @$sets) && ($sets->[$i]->[0] ne $set); $i++) {}
353      if ($i < @$sets)      if ($i < @$sets)
354      {      {
355            # Here we found the named subset. The subset tuple's second element is
356            # the list of row indices. We map these to genome IDs before returning
357            # them.
358          return map { $self->{Genomes}->[$_]->[0] } @{$sets->[$i]->[1]}          return map { $self->{Genomes}->[$_]->[0] } @{$sets->[$i]->[1]}
359      }      }
360        # Here we subset was not found, so we return the undefined value.
361      return undef;      return undef;
362  }  }
363    
364    =head3 get_subsetR
365    
366    C<< my @pairs = $unvsub->get_subsetsR(); >>
367    
368    Return a list of all the row subsets. The subsets are returned in the form
369    of 2-tuples, each consisting of a subset name followed by a reference to a
370    list of genome IDs. The genome IDs correspond to the rows in the subset.
371    
372    =cut
373    
374  sub get_subsetsR {  sub get_subsetsR {
375        # Get the parameters.
376      my($self) = @_;      my($self) = @_;
377        # Extract the list of genome subsets. This list is in the form of
378        # 2-tuples, but the rows are identified by row index, not genome ID.
379      my $sets = $self->{GenomeSubsets};      my $sets = $self->{GenomeSubsets};
380        # Create the return list.
381      my @pairs = ();      my @pairs = ();
382        # Loop through the subsets.
383      my $pair;      my $pair;
384      foreach $pair (@$sets)      foreach $pair (@$sets)
385      {      {
386            # Convert this subset's member list from row indices to genome IDs
387            # and stash the result in the return list.
388          my($id,$members) = @$pair;          my($id,$members) = @$pair;
389          push(@pairs,[$id,[map { $self->{Genomes}->[$_]->[0] } @$members]]);          push(@pairs,[$id,[map { $self->{Genomes}->[$_]->[0] } @$members]]);
390      }      }
391        # Return the list constructed.
392      return @pairs;      return @pairs;
393  }  }
394    
395    =head3 row_subsets
396    
397    C<< my $subsetList = UnvSubsys::row_subsets($fig, \%genomeH); >>
398    
399    This method computes the taxonomic row subsets for a subsystem. It takes
400    as input a hash that maps genome IDs to column indices and a FIG object.
401    The FIG object provides a list of taxonomic groups of 10 or more complete
402    genomes. From the list, we extract subsets which have more than 10
403    genomes in the list of subsystem genomes. If no such subsets exist,
404    we extract subsets which have at least 1 genome from the list of
405    subsystem genomes. The subsets are returned as 2-tuples, the first
406    element being the subset ID and the second being a list of row indices
407    for the genomes in the subset.
408    
409    =over 4
410    
411    =item fig
412    
413    A FIG-like object for accessing the data store.
414    
415    =item genomeH
416    
417    Reference to a hash that maps each genome ID to its row index in the
418    subsystem spreadsheet.
419    
420    =item RETURN
421    
422    Returns a reference to a list of 2-tuples. Each 2-tuple consists of a
423    subset ID followed by a list of the row indices for the genomes in the
424    subset.
425    
426    =back
427    
428    =cut
429    
430  sub row_subsets {  sub row_subsets {
431      my ($fig,$genomeH,$genomes_info) = @_;      my ($fig, $genomeH) = @_;
432    
433        # We need a real FIG object, since SFXlate does not yet support
434        # taxonomy trees. The "FIG" method does this for us.
435        $fig = $fig->FIG();
436        # Initialize the return value.
437      my $subsets = [];      my $subsets = [];
438        # Get a list of taxonomic groups. This will come back as a list of
439        # 2-tuples.
440      my $taxonomic_groups = $fig->taxonomic_groups_of_complete(5);      my $taxonomic_groups = $fig->taxonomic_groups_of_complete(5);
441        # Loop through the 2-tuples. We're looking for subsets which
442        # contain at least one genome on the subsystem's spreadsheet.
443      my($pair,$id,$members);      my($pair,$id,$members);
444      foreach $pair (@$taxonomic_groups)      foreach $pair (@$taxonomic_groups)
445      {      {
446          ($id,$members) = @$pair;          ($id,$members) = @$pair;
447            # Extract the genomes in the member list that participate in this
448            # subsystem. To do this, we convert each genome ID to its row
449            # index. If no row index exists, the GREP condition discards the
450            # member.
451          my @mem = grep { defined($_) } map { $genomeH->{$_} } @$members;          my @mem = grep { defined($_) } map { $genomeH->{$_} } @$members;
452            # If there are enough members, save the subset.
453          if (@mem > 0)          if (@mem > 0)
454          {          {
455              push(@$subsets,[$id,[@mem]]);              push(@$subsets,[$id,[@mem]]);
456          }          }
457      }      }
458        # Return the list of row subsets.
459      return $subsets;      return $subsets;
460  }  }
461    
462    =head3 set_aliases
463    
464    C<< my $aliasHash = UnvSubsys::set_aliases($fig, $pegH, $active_genomes); >>
465    
466    Return a hash mapping PEG IDs to aliases.
467    
468    =over 4
469    
470    =item fig
471    
472    FIG-like object that can be used to access the data store.
473    
474    =item pegH
475    
476    Reference to the spreadsheet hash table. Given a row index I<$row> and a
477    column index I<$col>,
478    
479        $pegH->{$row}->{$col}
480    
481    will return a reference to a list of PEGs in the specified spreadsheet cell.
482    
483    =item active_genomes
484    
485    Reference to a hash whose keys correspond to the spreadsheet row indices
486    of genomes that should be highlighted.
487    
488    =item RETURN
489    
490    Returns a hash that takes as input a PEG ID and returns an alias. Only PEGs
491    for active genomes will be processed.
492    
493    =back
494    
495    =cut
496    
497  sub set_aliases {  sub set_aliases {
498        # Get the parameters.
499      my($fig,$pegH,$active_genomes) = @_;      my($fig,$pegH,$active_genomes) = @_;
500      my($genomeI,$roleI,$pegs,$peg,$roleH);      my($genomeI,$roleI,$pegs,$peg,$roleH);
501    
502        # Create the return hash.
503      my $aliasesH = {};      my $aliasesH = {};
504    
505        # Loop through each row that corresponds to an active genome.
506        # The active genome list contains row indices, and there is
507        # one genome per row. Note that the genome ID is never used,
508        # only the row index.
509      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))
510      {      {
511            # Get the role hash for the specified genome. The role hash
512            # maps column indices (representing roles) to lists of PEGs.
513          $roleH = $pegH->{$genomeI};          $roleH = $pegH->{$genomeI};
514            # Loop through the role (column) indices.
515          foreach $roleI (keys(%$roleH))          foreach $roleI (keys(%$roleH))
516          {          {
517                # Get the PEG list for this row/column combination.
518              $pegs = $roleH->{$roleI};              $pegs = $roleH->{$roleI};
519                # Only proceed if data was found in the cell.
520                if (defined $pegs) {
521                    # Loop through the pegs in the cell.
522              foreach $peg (@$pegs)              foreach $peg (@$pegs)
523              {              {
524                        # If we do not already have an alias for this PEG,
525                        # compute one.
526                  if (! $aliasesH->{$peg})                  if (! $aliasesH->{$peg})
527                  {                  {
528                      $aliasesH->{$peg} = scalar &ext_id($fig,$peg);                      $aliasesH->{$peg} = scalar &ext_id($fig,$peg);
# Line 222  Line 530 
530              }              }
531          }          }
532      }      }
533        }
534        # Return the hash we built.
535      return $aliasesH;      return $aliasesH;
536  }  }
537    
538    =head3 set_colors
539    
540    C<< my $colorHash = UnvSubsys::set_colors($fig, $sub, \%pegH, \%active_genomes); >>
541    
542    Return a hash that maps each PEG in the subsystem spreadsheet to a display
543    color. Not all PEGs need to be mapped. Those that do not have a color
544    assigned will generally be displayed in white.
545    
546    =over 4
547    
548    =item fig
549    
550    FIG-like object that can be used to access the data store.
551    
552    =item sub
553    
554    Subsystem object for the current subsystem.
555    
556    =item pegH
557    
558    Reference to the spreadsheet hash table. Given a row index I<$row> and a
559    column index I<$col>,
560    
561        $pegH->{$row}->{$col}
562    
563    will return a reference to a list of PEGs in the specified spreadsheet cell.
564    
565    =item active_genomes
566    
567    Reference to a hash whose keys correspond to the spreadsheet row indices
568    of genomes that should be highlighted.
569    
570    =item RETURN
571    
572    Returns a hash that takes as input a PEG ID and returns a color. These colors
573    are used when displaying the PEGs in the subsystem spreadsheet. Only PEGs
574    for active genomes will be colored.
575    
576    =back
577    
578    =cut
579    
580  sub set_colors {  sub set_colors {
581      my($fig,$pegH,$active_genomes) = @_;      # Get the parameters.
582      my($genomeI,$roleI,$pegs,$peg,$roleH,$peg,%pegs_in_genome);      my($fig,$sub,$pegH,$active_genomes) = @_;
583    
584        my($genomeI,$roleI,$pegs,$peg,$roleH,%pegs_in_genome);
585        # Create the return hash.
586      my $colorsH = {};      my $colorsH = {};
587        # Loop through the active genomes. The keys of "%$pegH" are the row indices
588        # for rows that have at least one occupied cell in the spreadsheet. The
589        # Grep then reduces this list to those rows that are highlighted.
590      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))
591      {      {
592            # We will use the following hash to compile a list of all the PEGs
593            # in the spreadsheet for the current genome, that is, the genome
594            # represented by row index "$genomeI".
595          undef %pegs_in_genome;          undef %pegs_in_genome;
596            # Get the hash for the current row. This hash maps column indices to
597            # lists of pegs.
598          $roleH = $pegH->{$genomeI};          $roleH = $pegH->{$genomeI};
599            # Loop through the column indices for the specified row.
600          foreach $roleI (keys(%$roleH))          foreach $roleI (keys(%$roleH))
601          {          {
602                # Here we can finally get a list of the pegs in this spreadsheet
603                # cell. We loop through them, marking them in the "%pegs_in_genome"
604                # hash.
605              $pegs = $roleH->{$roleI};              $pegs = $roleH->{$roleI};
606              foreach $peg (@$pegs)              foreach $peg (@$pegs)
607              {              {
608                  $pegs_in_genome{$peg} = 1;                  $pegs_in_genome{$peg} = 1;
609              }              }
610          }          }
611            # Extract the "%pegs_in_genome" hash keys. This gives us a duplicate-free
612            # list of all the pegs for the current spreadsheet role.
613          my @pegs = keys(%pegs_in_genome);          my @pegs = keys(%pegs_in_genome);
614          my($tuple,$peg,$color);          my($tuple,$peg,$color);
615          my $colors_for_one_genome = &set_colors_for_genome($fig,\@pegs);          # Get a hash that maps the PEG IDs to colors.
616            my $colors_for_one_genome = &set_colors_for_genome($fig,$sub, \@pegs);
617            # Loop through the hash we got back and assign the colors from that
618            # hash to the master hash we're returning to the caller.
619          while (($peg,$color) = each %$colors_for_one_genome)          while (($peg,$color) = each %$colors_for_one_genome)
620          {          {
621              $colorsH->{$peg} = $colors_for_one_genome->{$peg};              $colorsH->{$peg} = $colors_for_one_genome->{$peg};
622          }          }
623      }      }
624        # Return the color hash.
625      return $colorsH;      return $colorsH;
626  }  }
627    
628  sub set_colors_for_genome {  =head3 set_colors_for_genome
     my($fig,$pegs) = @_;  
     my($peg,@clusters,$cluster,@colors,$color,%seen,%conn,$x,$peg1,@pegs,$i);  
629    
630      my $color_of = {};  C<< my $colorHash = UnvSubsys::set_colors_for_genome($fig, $sub, \@pegs); >>
     foreach $peg (@$pegs) { $color_of->{$peg} = '#FFFFFF' }  
631    
632      @pegs = keys(%$color_of);  #  Use of keys makes @pegs entries unique  Return a reference to a hash mapping the specified pegs to colors. PEGs that
633    are physically close to each other will be painted the same color.
634    
635      foreach $peg (@pegs)  =over 4
     {  
         $conn{$peg} = [grep { $color_of->{$_} && ($_ ne $peg) } $fig->close_genes($peg,5000)];  
     }  
636    
637      @clusters = ();  =item fig
     while ($peg = shift @pegs)  
     {  
         if (! $seen{$peg})  
         {  
             $cluster = [$peg];  
             $seen{$peg} = 1;  
             for ($i=0; ($i < @$cluster); $i++)  
             {  
                 my @tmp = grep { ! $seen{$_} } @{$conn{$cluster->[$i]}};  
                 if (@tmp > 0)  
                 {  
                     foreach my $peg1 (@tmp) { $seen{$peg1} = 1 }  
                     push(@$cluster,@tmp);  
                 }  
             }  
             push(@clusters,$cluster);  
         }  
     }  
638    
639      @colors =  &cool_colors();  A fig-like object that can be used to access the data store.
640    
641      @clusters = grep { @$_ > 1 } sort { @$a <=> @$b } @clusters;  =item sub
642    
643      if (@clusters > @colors) { splice(@clusters,0,(@clusters - @colors)) }  # make sure we have enough colors  Subsystem object for the relevant subsystem.
644    
645      my($cluster);  =item pegs
646      foreach $cluster (@clusters)  
647      {  Reference to a list of PEG IDs. All of the peg IDs should be for the
648          $color = shift @colors;  same genome.
649          foreach $peg (@$cluster)  
650          {  =item RETURN
651              $color_of->{$peg} = $color;  
652          }  Returns a reference to a hash that maps each PEG ID to a color.
653      }  
654      return $color_of;  =back
655    
656    =cut
657    
658    sub set_colors_for_genome {
659        # Get the parameters.
660        my($fig, $sub, $pegs) = @_;
661        # Default all the PEGs to white.
662        my %color_of = map { $_ => '#FFFFFF' } @$pegs;
663        # Divide the pegs into clusters.
664        my @clusters = $fig->compute_clusters($pegs, $sub);
665        # Get a list of useful colors.
666        my @colors =  &cool_colors();
667        # If we have too many clusters, chop off the big ones at the end. These
668        # are least likely to be important.
669        if (@clusters > @colors) { splice(@clusters, 0, (@clusters - @colors)) }
670        # Loop through the clusters.
671        for my $cluster (@clusters) {
672            # Get the color for this cluster.
673            my $color = shift @colors;
674            # Loop through this cluster, putting this color into the color_of
675            # entries for each PEG.
676            for my $peg (@$cluster) {
677                $color_of{$peg} = $color;
678            }
679        }
680        # Return the color map.
681        return \%color_of;
682  }  }
683    
684    =head3 cool_colors
685    
686    C<< my @colorList = UnvSubsys::cool_colors(); >>
687    
688    Return a list of web-safe colors.
689    
690    =cut
691    
692  sub cool_colors {  sub cool_colors {
693   # just an array of "websafe" colors or whatever colors we want to use. Feel free to remove bad colors (hence the lines not being equal length!)   # just an array of "websafe" colors or whatever colors we want to use. Feel free to remove bad colors (hence the lines not being equal length!)
694   return (   return (
# Line 327  Line 710 
710   );   );
711  }  }
712    
713    =head3 ext_id
714    
715    C<< my $externalID = UnvSubsys::ext_id($fig, $peg); >>
716    
717    or
718    
719    C<< my @externalIDs = UnvSubsys::ext_id($fig, $peg); >>
720    
721    Return a list of non-FIG IDs for the specified feature. In a scalar context, return
722    a single non-FIG ID for the specified feature.
723    
724    This method returns IDs that are all of the same type, that is, all UniProt IDs, or
725    all KEGG IDs, and so forth. To do this, it checks the feature's alias list for IDs
726    of a given type. If it finds at least one, then all IDs of that type are returned.
727    Highest priority is given to the UniProt IDs, then SP IDs, GI IDs, and finally
728    KEGG IDs.
729    
730    =over 4
731    
732    =item fig
733    
734    A FIG-like object for accessing the data store.
735    
736    =item peg
737    
738    ID of the feature whose aliases are desired.
739    
740    =item RETURN
741    
742    In list context, a list of non-FIG IDs for the feature that are all of the same
743    type. In scalar context, the first non-FIG ID for the feature of the
744    highest-priority available type.
745    
746    =back
747    
748    =cut
749    
750  sub ext_id {  sub ext_id {
751      my($fig,$peg) = @_;      my($fig,$peg) = @_;
752    
# Line 363  Line 783 
783      }      }
784  }  }
785    
786    =head3 subsystem_curator
787    
788    C<< my $name = $unvsub->subsystem_curator(); >>
789    
790    Return the name of the subsystem curator. The database stores user names as
791    C<master:>I<name>. This method strips off the C<master:> prefix before it
792    passes the result back to the caller.
793    
794    =cut
795    
796  sub subsystem_curator {  sub subsystem_curator {
797      my($self) = @_;      my($self) = @_;
# Line 372  Line 801 
801      return $curator;      return $curator;
802  }  }
803    
804    =head3 get_roles
805    
806    C<< my @roles = $unvsub->get_roles(); >>
807    
808    Return a list of the roles (columns) for this subsystem. The roles will be
809    returned in column order, so that if you access the sixth element of the
810    return list, you'll get the name of the role for the sixth column.
811    
812    =cut
813    
814  sub get_roles {  sub get_roles {
815      my($self) = @_;      my($self) = @_;
816        # The role index in this object is a list of 3-tuples. The caller only
817        # wants the first element of each tuple, which is the role name.
818      return map { $_->[0] } @{$self->{Roles}};      return map { $_->[0] } @{$self->{Roles}};
819  }  }
820    
821    =head3 get_genome_index
822    
823    C<< my $index = $unvsub->get_genome_index($genome); >>
824    
825    Return the row index of the specified genome.
826    
827    =over 4
828    
829    =item genome
830    
831    ID of the genome whose row index is desired.
832    
833    =item RETURN
834    
835    Returns the index of the row corresponding to the specified genome, or an
836    undefined value if the genome is not represented in the subsystem
837    spreadsheet.
838    
839    =back
840    
841    =cut
842    
843  sub get_genome_index {  sub get_genome_index {
844      my($self,$genome) = @_;      my($self,$genome) = @_;
845    
846      return $self->{GenomeIndex}->{$genome};      return $self->{GenomeIndex}->{$genome};
847  }  }
848    
849    =head3 get_genomes
850    
851    C<< my @genomes = $unvsub->get_genomes(); >>
852    
853    Return a list of the genome IDs for the subsystem. The genomes will be
854    presented in row order. In other words, if you index into the sixth
855    element of the return list, you will retrieve the ID of the genome for
856    the sixth row.
857    
858    =cut
859    
860  sub get_genomes {  sub get_genomes {
861      my($self) = @_;      my($self) = @_;
862        # The genome array is a list of 2-tuples. We extract the first
863        # element of each tuple, which is the genome ID.
864      return map { $_->[0] } @{$self->{Genomes}};      return map { $_->[0] } @{$self->{Genomes}};
865  }  }
866    
867    =head3 get_variant_code
868    
869    C<< my $code = $unvsub->get_variant_code($genome); >>
870    
871    Return the variant code for a genome. Each subsystem has several variations.
872    The variant code indicates which variation of a subsystem is used by a
873    particular genome.
874    
875    Genome data is stored in a list of 2-tuples. The first element is the genome
876    ID; the second is the variant code.
877    
878    =over 4
879    
880    =item genome
881    
882    ID or row index of the genome whose variant code is desired.
883    
884    =item RETURN
885    
886    Returns the variant code for the specified genome.
887    
888    =back
889    
890    =cut
891    
892  sub get_variant_code {  sub get_variant_code {
893      my($self,$genome) = @_;      my($self,$genome) = @_;
894        # Check to see if we have a row index.
895      if ($genome =~ /^\d+$/)      if ($genome =~ /^\d+$/)
896      {      {
897            # Here we have a row index, so use it to find the genome's variant
898            # code.
899          return $self->{Genomes}->[$genome]->[1];          return $self->{Genomes}->[$genome]->[1];
900      }      }
901      else      else
902      {      {
903            # Here we have a genome ID, so we need to convert it to a row index.
904          my $genomeI = $self->{GenomeIndex}->{$genome};          my $genomeI = $self->{GenomeIndex}->{$genome};
905          return $self->{Genomes}->[$genomeI]->[1];          return $self->{Genomes}->[$genomeI]->[1];
906      }      }
907  }  }
908    
909    =head3 get_pegs_from_cell
910    
911    C<< my @pegs = $unvsub->get_pegs_from_cell($genome, $role); >>
912    
913    Return a list of the features in a specified spreadsheet cell. The cell is specified
914    by genome ID and role ID.
915    
916    =over 4
917    
918    =item genome
919    
920    ID of the genome relevant to the cell.
921    
922    =item role
923    
924    ID of the role relevant to the cell.
925    
926    =item RETURN
927    
928    Returns a list of the features in the cell, or an empty list if the cell is empty.
929    
930    =back
931    
932    =cut
933    
934  sub get_pegs_from_cell {  sub get_pegs_from_cell {
935      my($self,$genome,$role) = @_;      my($self,$genome,$role) = @_;
936        # Convert the genome and role IDs to row and column indices.
937      my $genomeI = $self->{GenomeIndex}->{$genome};      my $genomeI = $self->{GenomeIndex}->{$genome};
938      my $roleI   = $self->{RoleIndex}->{$role};      my $roleI   = $self->{RoleIndex}->{$role};
939        # Get the pegs from the cell and return them.
940      my $pegs    = $self->{PegHash}->{$genomeI}->{$roleI};      my $pegs    = $self->{PegHash}->{$genomeI}->{$roleI};
941      return $pegs ? @$pegs : ();      return $pegs ? @$pegs : ();
942  }  }
943    
944    =head3 get_diagrams
945    
946    C<< my @list = $sub->get_diagrams(); >>
947    
948    Return a list of the diagrams associated with this subsystem. Each diagram
949    is represented in the return list as a 4-tuple C<[diagram_id, diagram_name,
950    page_link, img_link]> where
951    
952    =over 4
953    
954    =item diagram_id
955    
956    ID code for this diagram.
957    
958    =item diagram_name
959    
960    Displayable name of the diagram.
961    
962    =item page_link
963    
964    URL of an HTML page containing information about the diagram.
965    
966    =item img_link
967    
968    URL of an HTML page containing an image for the diagram.
969    
970    =back
971    
972    Note that the URLs are in fact for CGI scripts with parameters that point them
973    to the correct place. Though Sprout has diagram information in it, it has
974    no relationship to the diagrams displayed in SEED, so the work is done entirely
975    on the SEED side.
976    
977    =cut
978    
979    sub get_diagrams {
980        # Get the parameters.
981        my ($self) = @_;
982        # Return the diagram list.
983        return @{$self->{Diagrams}};
984    }
985    
986  sub get_notes {  sub get_notes {
987      my($self) = @_;      my($self) = @_;
988    
# Line 473  Line 1042 
1042      return $self->{Colors}->{$peg};      return $self->{Colors}->{$peg};
1043  }  }
1044    
1045    =head3 active_genomes
1046    
1047    C<< my $activeHash = UnvSubsys::active_genomes(\@row_subsets, $active_subsetR, $focus, \%genomeH, \@genomes_info); >>
1048    
1049    Return a hash containing the active genomes for this subsystem display. The
1050    keys of the hash will be the row indices of the genomes to be highlighted on the
1051    display. Each genome ID will map to 1. Thus, if C<< $activeHash->{3} >>
1052    tests TRUE, the fourth row should be highlighted.
1053    
1054    The rules for determining the active genomes are as follows. If I<$active_subsetR> is
1055    specified, it is presumed to be the ID of the subset containing the active genomes.
1056    If it is not specified and I<$focus> is specified, then I<$focus> is presumed to be the
1057    ID of the genome currently in focus, and the active genomes will be the ones in the
1058    smallest row subset containing the genome in focus. If neither I<$active_subsetR> nor
1059    I<$focus> are specified, then all genomes are active.
1060    
1061    =over 4
1062    
1063    =item row_subsets
1064    
1065    Reference to a list of 2-tuples. Each tuple consists of a row subset ID followed by
1066    a reference to a list of the row indices for the rows in the identified subset.
1067    
1068    =item active_subsetR (optional)
1069    
1070    ID of the active subset (if any).
1071    
1072    =item focus
1073    
1074    ID of the genome currently in focus (if any). If there is no active subset, then
1075    the smallest subset containing this genome will be made active.
1076    
1077    =item genomeH
1078    
1079    Reference to a hash of genome IDs to row indices. The keys of this hash are
1080    the genomes in this subsystem, which also form the subsystem spreadsheet's
1081    rows.
1082    
1083    =item genomes_info
1084    
1085    Reference to a list of 2-tuples. The first element of each 2-tuple is the
1086    ID of a genome; the second is the variant code for the subsystem variant
1087    used by the genome. The tuples are ordered by row index, so that the ID
1088    and variant code of the genome in a particular row can be located by indexing
1089    into this parameter using the subsystem spreadsheet row number.
1090    
1091    =item RETURN
1092    
1093    Returns a reference to a hash that maps the row indices of the active genomes
1094    to 1. This hash can be used to quickly determine whether or not a particular
1095    row is to be highlighted.
1096    
1097    =back
1098    
1099    =cut
1100    
1101  sub active_genomes {  sub active_genomes {
1102        # Get the parameters.
1103      my($fig,$row_subsets,$active_subsetR,$focus,$genomeH,$genomes_info) = @_;      my($fig,$row_subsets,$active_subsetR,$focus,$genomeH,$genomes_info) = @_;
1104      my($i,@bestL);      my($i,@bestL);
1105        # Declare the return variable.
1106      my $active_genomes = {};      my $active_genomes = {};
1107        # Check for an active subset.
1108      if ($active_subsetR)      if ($active_subsetR)
1109      {      {
1110            # Search for the active subset in the row subset array.
1111            for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}
1112            if ($i < @$row_subsets)
1113            {
1114                # Here we found the active subset, so we extract its list of row indices.
1115                @bestL = @{$row_subsets->[$i]->[1]};
1116            }
1117            else {
1118                # Here we have the ID of the active subset. First, we search for that ID
1119                # in the row subset list.
1120          for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}          for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}
1121          if ($i < @$row_subsets)          if ($i < @$row_subsets)
1122          {          {
1123                    # Here we found the named subset, so we return its member list.
1124              @bestL = @{$row_subsets->[$i]->[1]};              @bestL = @{$row_subsets->[$i]->[1]};
1125          }          }
1126          else          else
1127          {          {
1128                    # Here the active subset does not exist. We punt by extracting a
1129                    # list of all the row indices in the spreadsheet.
1130              $active_subsetR = 'All';              $active_subsetR = 'All';
1131              @bestL = map { $genomeH->{$_} } keys(%$genomeH);              @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1132          }          }
1133      }      }
1134      else      }
1135        elsif ($focus)
1136      {      {
1137          if ((! $focus) || (! $fig->is_complete($focus)))          # Here we don't have an active row subset, but a particular genome is in
1138            # focus. We'll look for the smallest row subset containing the genome
1139            # in focus. First, we need to prime the loop. "$bestN" will be the ID
1140            # of the best subset found so far; "@bestL" is where we stash the list
1141            # of IDs in the subset. Our initial selection, then, will be the
1142            # fake "All" subset, which contains the entire collection of rows.
1143            if (! $fig->is_complete($focus))
1144          {          {
1145                # Here the gnome in focus is incomplete, so it won't be anywhere
1146                # in our list. We default to making everything active.
1147              $active_subsetR = 'All';              $active_subsetR = 'All';
1148              @bestL = map { $genomeH->{$_} } keys(%$genomeH);              @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1149          }          } else {
1150          else              my $bestN   = "All";
1151          {              @bestL   = map { $genomeH->{$_} } keys(%$genomeH);
1152              my $bestN   = undef;              # Next, we get the row index for the genome in focus.
1153              @bestL   = ();              my $focusIndex = $genomeH->{$focus};
1154                # Now we loop through all the row subsets.
1155              my $tuple;              my $tuple;
1156              foreach $tuple (@$row_subsets)              foreach $tuple (@$row_subsets)
1157              {              {
1158                    # Extract the subset ID and its list of row indices. The latter is
1159                    # in "$genomeIs".
1160                  my($id,$genomeIs) = @$tuple;                  my($id,$genomeIs) = @$tuple;
1161                  for ($i=0; ($i < @$genomeIs) && ($genomes_info->[$genomeIs->[$i]]->[0] ne $focus); $i++) {}                  # Search for the index of the focus row in the current subset's list
1162                    # of row indices.
1163                    for ($i=0; ($i < @$genomeIs) && ($genomeIs->[$i] != $focusIndex); $i++) {}
1164                    # Now either $i will be the index of the focus row in the subset, or
1165                    # it is going to be equal to the number of rows in the subset.
1166                  if ($i < @$genomeIs)                  if ($i < @$genomeIs)
1167                  {                  {
1168                      if ((! $bestN) || (@$genomeIs < @bestL))                      # We've found the focus row in this subset. Select it as the new
1169                        # best subset if it's smaller than the last one we found.
1170                        if (@$genomeIs < @bestL)
1171                      {                      {
1172                          $bestN  = $id;                          $bestN  = $id;
1173                          @bestL = @$genomeIs;                          @bestL = @$genomeIs;
1174                      }                      }
1175                  }                  }
                 else  
                 {  
 #                   print &Dumper($id);  
                 }  
1176              }              }
1177                # Save the best subset found as the active one.
1178              $active_subsetR = $bestN;              $active_subsetR = $bestN;
1179          }          }
1180        } else {
1181            # Here we have nothing: no active subset, and no focus row. We make
1182            # all rows active.
1183            $active_subsetR = 'All';
1184            @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1185      }      }
1186        # "@bestL" now contains a list of the row indices for the active genomes.
1187        # We convert it to a hash and return it.
1188      my %active_genomes = map { $_ => 1 } @bestL;      my %active_genomes = map { $_ => 1 } @bestL;
1189      return \%active_genomes;      return \%active_genomes;
1190  }  }
1191    
1192  1;  1;
   
   

Legend:
Removed from v.1.11  
changed lines
  Added in v.1.15

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3