[Bio] / FigKernelPackages / UnvSubsys.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/UnvSubsys.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.4, Mon Sep 12 20:45:51 2005 UTC revision 1.14, Mon Nov 28 22:41:16 2005 UTC
# Line 3  Line 3 
3  use Subsystem;  use Subsystem;
4  use Carp;  use Carp;
5  use FIG;  use FIG;
6    use SFXlate;
7  use Data::Dumper;  use Data::Dumper;
8  use strict;  use strict;
9    use Tracer;
10    
11  sub new  =head1 Universal Subsystem Object
 {  
     my($class, $ssa, $fig, $show_clusters, $aliases) = @_;  
12    
13      $ssa =~ s/ /_/g;  =head2 Introduction
14    
15    The universal subsystem object provides methods used to display useful information
16    about a subsystem. Its intent is to support both Sprout and SEED.
17    
18          ### { Roles =>Roles,  The soul of a subsystem is its spreadsheet. The columns of the spreadsheet are
19          ###   RoleIndex => ToRoleIndexHash,  the subsystem roles. The rows are the genomes that contain the subsystem. PEGs
20          ###   RoleSubsets => ColSubsets,  are stored in the spreadsheet's cells. Each genome can be identified by genome
21          ###   Genomes => Genomes,  ID or row index. Each role can be identified by role name or column index.
         ###   GenomeIndex => ToGenomeIndexHash,  
         ###   PegHash => PegHash,  
         ###   Colors  => ColorHash,  
         ###   Aliases => AliasHash,  
         ###   Curator => Curator,  
         ###   Notes => Notes,  
         ###   Reactions => ReactionHash  
         ### }  
         ###  
         ### Roles = pointer to a list of [Role,Abbrev,[ReactionURLs]]  
         ###  
         ### ToRoleIndexHash = a pointer to a hash: key=Role Value=RoleIndex  
         ###  
         ### ColSubsets = pointer to a list of [SubsetName,[RoleIndexesFrom0]]  
         ###  
         ### Genomes is a pointer to a list of [Genome,Variant]  
         ###  
         ### ToGenomeIndexHash = a pointer to a hash: key=Genome value=GenomeIndex  
         ###  
         ### PegHash = a pointer to a hash of hashes such that $peg_hash->{$genome_index}->{$role_index} = a  
         ###           pointer to a list of PEGs  
         ###  
         ### ColorHash is a hash: key=PEG value=color  
         ###  
         ### AliasHash is a hash: key=PEG value=aliases  
         ###  
         ### ReactionHash is a hash: key=Role value=[reaction-ids]  
22    
23      if (ref($fig) eq "FIG")  It is worth noting that the term I<subset> has two meanings-- a subset of roles
24      {  or a subset of genomes. A subset of roles is called a I<column subset>. A
25    subset of genomes is called a I<row subset>.
26    
27    The object created contains a great deal of information, so it's worthwhile to
28    stop for a moment and discuss it. The object will have the following members.
29    
30    =over 4
31    
32    =item Roles
33    
34    A list of 3-tuples, each consisting of a role name, the abbreviated role name,
35    and a list of URLs for the reactions. Each role is a column in the
36    subsystem spreadsheet. Indexing into the list by column index yields the
37    ID, abbreviation, and reactions for the role in that column.
38    
39    =item RoleIndex
40    
41    A hash mapping each role to its column index in the spreadsheet.
42    
43    =item RoleSubsets
44    
45    A list of 2-tuples, each consisting of a subset name followed by a list of the
46    column indices for the roles in the subset.
47    
48    =item Genomes
49    
50    A list of 2-tuples, each containing the genome ID for the relevant row and the
51    variant code for that genome. Subsystems can have several variations, and the
52    variant code indicates which variant of the subsystem that the genome uses.
53    There is one genome for each row in the spreadsheet. Indexing into the list
54    yields the ID of the genome for that row and its variant code.
55    
56    =item GenomeIndex
57    
58    A hash mapping each genome ID to its row index in the spreadsheet.
59    
60    =item PegHash
61    
62    A hash of hashes containing the spreadsheet cells. If C<$pegHash> is the hash
63    of hashes, C<$row> is a genome index, and C<$col> is a role index, then
64    
65        $pegHash->{$row}->{$col}
66    
67    returns a reference to a list of the IDs for the PEGs in the relevant
68    spreadsheet cell.
69    
70    =item ColorHash
71    
72    A hash mapping each PEG ID to the color that should be used to represent that
73    PEG in the display.
74    
75    =item AliasHash
76    
77    A hash mapping each PEG ID to a list of its aliases.
78    
79    =item ReactionHash
80    
81    A hash mapping each role to the list of reactions it catalyzes.
82    
83    =back
84    
85    =head2 Public Methods
86    
87    =head3 new
88    
89    C<< my $usub = UnvSubsys->new($ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, \@peg_colors); >>
90    
91    Construct a new universal subsystem object for a specified subsystem.
92    
93    =over 4
94    
95    =item ssa
96    
97    Name of the subsystem.
98    
99    =item fig
100    
101    Data access object, either a FIG object or an SFXlate object.
102    
103    =item active_subsetR
104    
105    Name of the active row subset. A row subset names a group of genomes.
106    
107          my $subsystem = new Subsystem($ssa,$fig,0);  =item focus
108    
109    ID of the genome currently in focus.
110    
111    =item show_clusters
112    
113    TRUE if clusters should be painted by color, else FALSE.
114    
115    =item aliases
116    
117    TRUE if PEG aliases should be shown, else FALSE.
118    
119    =item peg_colors (optional)
120    
121    Reference to a list of 2-tuples, each 2-tuple consisting of a PEG ID and the color
122    to be assigned to the PEG.
123    
124    =back
125    
126    =cut
127    
128    sub new {
129        # Get the parameters.
130        my($class, $ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, $peg_colors) = @_;
131        # Fix the subsystem name. Spaces are replaced by underscores to avoid naming problems
132        # in the seed's file system.
133        $ssa =~ s/ /_/g;
134        # Get the FIG object. At this point in time, we're only getting data from the SEED. On
135        # a future pass through the code, we'll get data from a SEED or Sprout.
136        if ((ref($fig) eq "FIG") || (ref($fig) eq 'SFXlate')) {
137            # Create a subsystem object. The "get_subsystem" method is provided by both FIG
138            # and SFXlate; however, the object returned in each case is different: FIG
139            # returns a "Subsystem" object; SFXlate returns a "SproutSubsys" object.
140            my $subsystem = $fig->get_subsystem($ssa);
141            # Get the key subsystem data. Note that CRs in the notes are converted to LFs
142            # in case the notes come from a Mac.
143          my $curator = $subsystem->get_curator;          my $curator = $subsystem->get_curator;
144          my $notes = $subsystem->get_notes;          my $notes = $subsystem->get_notes;
145          $notes =~ s/ /\n/g;          $notes =~ s/\r/\n/g;
146          my @roles = $subsystem->get_roles;          my @roles = $subsystem->get_roles;
147          my $reactions = $subsystem->get_reactions;          my $reactions = $subsystem->get_reactions;
148          my @genomes = $subsystem->get_genomes;          my @genomes = $subsystem->get_genomes;
149          my @col_subsets = $subsystem->get_subset_namesC;          my @col_subsets = $subsystem->get_subset_namesC;
150            my @diagrams = $subsystem->get_diagrams();
151            # Create the data structures for the role list and the role index.
152          my $role_info = [];          my $role_info = [];
153          my $roleH     = {};          my $roleH     = {};
154            # Loop through the roles to create the role list. The $i index will move
155            # through the columns of the spreadsheet.
156          my($i,$j,$subset,$peg);          my($i,$j,$subset,$peg);
157          for ($i=0; ($i < @roles); $i++)          for ($i=0; ($i < @roles); $i++)
158          {          {
159                # Get the ID of the role in the current column.
160              my $role = $roles[$i];              my $role = $roles[$i];
161              my $abbrev = $subsystem->get_role_abbr( $subsystem->get_role_index( $role ) );              # Extract its abbreviation.
162                my $abbrev = $subsystem->get_role_abbr($i);
163                # Get the reactions. Note that its possible no reactions were found. If there were
164                # reactions found, however, we convert from reaction IDs to a comma-delimited
165                # list of HTML code full of hyperlinks.
166              my $react = $reactions ? join(",", map { &HTML::reaction_link($_) } @{$reactions->{$role}}) : [];              my $react = $reactions ? join(",", map { &HTML::reaction_link($_) } @{$reactions->{$role}}) : [];
167                # Form the role name, abbreviation, and reaction list into a 3-tuple and push
168                # them onto the main role list.
169              push(@$role_info,[$role,$abbrev,$react]);              push(@$role_info,[$role,$abbrev,$react]);
170                # Set the role hash so that we can get back the column index for a given
171                # role name.
172              $roleH->{$role} = $i;              $roleH->{$role} = $i;
173          }          }
174            # Get the column subsets. A column subset is a list of role IDs, so we need to
175            # convert it to a set of column indices. We ignore the special "All" subset
176            # that contains everything.
177          my $subset_info = [];          my $subset_info = [];
178          foreach $subset (@col_subsets)          foreach $subset (@col_subsets)
179          {          {
# Line 80  Line 182 
182                  push(@$subset_info,[$subset,[map { $roleH->{$_} } $subsystem->get_subsetC_roles($subset)]]);                  push(@$subset_info,[$subset,[map { $roleH->{$_} } $subsystem->get_subsetC_roles($subset)]]);
183              }              }
184          }          }
185            # Now we create the genome directories. For each genome we need to be able to
186            # hash from the genomeID to the specified role index and we must be able to
187            # get the genome's variant code. (The variant code indicates which subsystem
188            # variant the genome uses.
189          my $genomes_info = [];          my $genomes_info = [];
190          my $genomeH      = {};          my $genomeH      = {};
191          for ($i=0; ($i < @genomes); $i++)          for ($i=0; ($i < @genomes); $i++)
192          {          {
193                # Get the genome ID for this row.
194              my $genome  = $genomes[$i];              my $genome  = $genomes[$i];
195              my $variant = $subsystem->get_variant_code( $subsystem->get_genome_index( $genome ) );              # Get its variant code.
196                my $variant = $subsystem->get_variant_code($i);
197                # Form them into a 2-tuple and add the result to the genome list.
198              push(@$genomes_info,[$genome,$variant]);              push(@$genomes_info,[$genome,$variant]);
199                # Set up the hash to get from the genome ID to the row index.
200              $genomeH->{$genome} = $i;              $genomeH->{$genome} = $i;
201          }          }
202    
203            # Next we gather the data from the actual spreadsheet cells. For an SFXlate
204            # object, this is the most expensive part of the operation, since it requires
205            # a database call for each cell.
206          my $pegH = {};          my $pegH = {};
207            # $i is the row index, which cycles through genomes.
208          for ($i=0; ($i < @genomes); $i++)          for ($i=0; ($i < @genomes); $i++)
209          {          {
210              my $genome = $genomes[$i];              # $j is the column index, which cycles through roles.
211              for ($j=0; ($j < @roles); $j++)              for ($j=0; ($j < @roles); $j++)
212              {              {
213                  my $role = $roles[$j];                  my @pegs = $subsystem->get_pegs_from_cell($i,$j);
                 my @pegs = $subsystem->get_pegs_from_cell($genome,$role);  
214                  $pegH->{$i}->{$j} = [@pegs];                  $pegH->{$i}->{$j} = [@pegs];
215              }              }
216          }          }
217          my $colorsH  = $show_clusters  ? &set_colors($fig,$pegH)  : {};  
218          my $aliasesH = $aliases ? &set_aliases($fig,$pegH) : {};          # Get the row subsets. Row subsets are determined by the genome
219          my $reactions = $subsystem->get_reactions;          # taxonomy, so these are different from the row subsets stored
220            # in the subsystem object.
221            my $row_subsets    = &row_subsets($fig, $genomeH);
222            # Here we try to get a list of the active genomes. The caller
223            # gives us hints in the form of the "$focus" and "$active_subsetR"
224            # parameters. If, for example, we are using this object to generate
225            # a subsystem web page, the focus information would steer us to
226            # whatever the user wants to look at on the page.
227            my $active_genomes = &active_genomes($fig, $row_subsets,$active_subsetR,$focus,
228                                                 $genomeH,$genomes_info);
229    
230            # Now we generate a table of colors for the various PEGs. If the
231            # caller gave us a map of peg IDs to colors, we use that. Otherwise,
232            # we allow the option of painting the PEGs by cluster number. (The
233            # caller indicates this by setting the "show_clusters" flag in the
234            # parameter list.)
235            my $colorsH;
236            if ($peg_colors)
237            {
238                # Here the caller gave us a list of peg colors. The list contains
239                # 2-tuples, each consisting of a PEG ID followed by the
240                # color value. The loop below extracts the pairs and stuffs them
241                # into the color hash.
242                $colorsH = {};
243                foreach $_ (@$peg_colors)
244                {
245                    my($peg,$color) = @$_;
246                    $colorsH->{$peg} = $color;
247                }
248            }
249            elsif ($show_clusters)
250            {
251                # Here the user wants us to base the colors on the genome clustering
252                # information.
253                $colorsH  = &set_colors($fig,$subsystem,$pegH,$active_genomes);
254            }
255            else
256            {
257                # Here the user is not interested in coloring the PEGs.
258                $colorsH = {};
259            }
260            # If the user wants to see aliases, compute the alias hash. Aliases
261            # will only be computed for PEGs belonging to active (highlighted)
262            # genomes, and there is a maximum of one alias per PEG.
263            my $aliasesH = $aliases ? &set_aliases($fig,$pegH,$active_genomes) : {};
264            # Create and bless the UnvSubsys object.
265          my $self = { Roles => $role_info,          my $self = { Roles => $role_info,
266                       RoleIndex => $roleH,                       RoleIndex => $roleH,
267                       RoleSubsets => $subset_info,                       RoleSubsets => $subset_info,
268                       Genomes => $genomes_info,                       Genomes => $genomes_info,
269                       GenomeIndex => $genomeH,                       GenomeIndex => $genomeH,
270                         GenomeSubsets => $row_subsets,
271                       PegHash => $pegH,                       PegHash => $pegH,
272                       Colors => $colorsH,                       Colors => $colorsH,
273                       Aliases => $aliasesH,                       Aliases => $aliasesH,
274                       Curator => $curator,                       Curator => $curator,
275                       Notes => $notes,                       Notes => $notes,
276                       Reactions => $reactions                       Reactions => $reactions,
277                         Diagrams => \@diagrams,
278                     };                     };
279          bless($self, $class);          bless($self, $class);
280            # Return the object.
281          return $self;          return $self;
282      }      }
283      else      else
284      {      {
285            # Here the FIG-like object was not recognized, so we return an
286            # undefined value.
287            return undef;
288        }
289    }
290    
291    =head3 get_subset_namesR
292    
293    C<< my @names = $unvsub->get_subset_namesR(); >>
294    
295    Return the names of the genome (row) subsets.
296    
297    =cut
298    
299    sub get_subset_namesR {
300        my($self) = @_;
301    
302        return map { $_->[0] } @{$self->{GenomeSubsets}};
303    }
304    
305    =head3 get_subsetR
306    
307    C<< my @genomes = $unvsub->get_subsetR($set); >>
308    
309    Return a list of the genome IDs covered by a row subset.
310    
311    =over 4
312    
313    =item set
314    
315    Name of the row subset whose genomes are desired.
316    
317    =item RETURN
318    
319    Returns a list of the IDs for the genomes found in the specified row
320    set.
321    
322    =back
323    
324    =cut
325    
326    sub get_subsetR {
327        # Get the parameters.
328        my($self,$set) = @_;
329        my($i);
330        # Get the list of row subsets.
331        my $sets = $self->{GenomeSubsets};
332        # Find the row subset with the specified name. The row subset list is a
333        # list of 2-tuples, and the first element of each tuple is the set
334        # name.
335        for ($i=0; ($i < @$sets) && ($sets->[$i]->[0] ne $set); $i++) {}
336        if ($i < @$sets)
337        {
338            # Here we found the named subset. The subset tuple's second element is
339            # the list of row indices. We map these to genome IDs before returning
340            # them.
341            return map { $self->{Genomes}->[$_]->[0] } @{$sets->[$i]->[1]}
342        }
343        # Here we subset was not found, so we return the undefined value.
344          return undef;          return undef;
345      }      }
346    
347    =head3 get_subsetR
348    
349    C<< my @pairs = $unvsub->get_subsetsR(); >>
350    
351    Return a list of all the row subsets. The subsets are returned in the form
352    of 2-tuples, each consisting of a subset name followed by a reference to a
353    list of genome IDs. The genome IDs correspond to the rows in the subset.
354    
355    =cut
356    
357    sub get_subsetsR {
358        # Get the parameters.
359        my($self) = @_;
360        # Extract the list of genome subsets. This list is in the form of
361        # 2-tuples, but the rows are identified by row index, not genome ID.
362        my $sets = $self->{GenomeSubsets};
363        # Create the return list.
364        my @pairs = ();
365        # Loop through the subsets.
366        my $pair;
367        foreach $pair (@$sets)
368        {
369            # Convert this subset's member list from row indices to genome IDs
370            # and stash the result in the return list.
371            my($id,$members) = @$pair;
372            push(@pairs,[$id,[map { $self->{Genomes}->[$_]->[0] } @$members]]);
373        }
374        # Return the list constructed.
375        return @pairs;
376    }
377    
378    =head3 row_subsets
379    
380    C<< my $subsetList = UnvSubsys::row_subsets($fig, \%genomeH); >>
381    
382    This method computes the taxonomic row subsets for a subsystem. It takes
383    as input a hash that maps genome IDs to column indices and a FIG object.
384    The FIG object provides a list of taxonomic groups of 10 or more complete
385    genomes. From the list, we extract subsets which have more than 10
386    genomes in the list of subsystem genomes. If no such subsets exist,
387    we extract subsets which have at least 1 genome from the list of
388    subsystem genomes. The subsets are returned as 2-tuples, the first
389    element being the subset ID and the second being a list of row indices
390    for the genomes in the subset.
391    
392    =over 4
393    
394    =item fig
395    
396    A FIG-like object for accessing the data store.
397    
398    =item genomeH
399    
400    Reference to a hash that maps each genome ID to its row index in the
401    subsystem spreadsheet.
402    
403    =item RETURN
404    
405    Returns a reference to a list of 2-tuples. Each 2-tuple consists of a
406    subset ID followed by a list of the row indices for the genomes in the
407    subset.
408    
409    =back
410    
411    =cut
412    
413    sub row_subsets {
414        my ($fig, $genomeH) = @_;
415    
416        # We need a real FIG object, since SFXlate does not yet support
417        # taxonomy trees. The "FIG" method does this for us.
418        $fig = $fig->FIG();
419        # Initialize the return value.
420        my $subsets = [];
421        # Get a list of taxonomic groups. This will come back as a list of
422        # 2-tuples.
423        my $taxonomic_groups = $fig->taxonomic_groups_of_complete(5);
424        # Loop through the 2-tuples. We're looking for subsets which
425        # contain at least one genome on the subsystem's spreadsheet.
426        my($pair,$id,$members);
427        foreach $pair (@$taxonomic_groups)
428        {
429            ($id,$members) = @$pair;
430            # Extract the genomes in the member list that participate in this
431            # subsystem. To do this, we convert each genome ID to its row
432            # index. If no row index exists, the GREP condition discards the
433            # member.
434            my @mem = grep { defined($_) } map { $genomeH->{$_} } @$members;
435            # If there are enough members, save the subset.
436            if (@mem > 0)
437            {
438                push(@$subsets,[$id,[@mem]]);
439            }
440        }
441        # Return the list of row subsets.
442        return $subsets;
443  }  }
444    
445    =head3 set_aliases
446    
447    C<< my $aliasHash = UnvSubsys::set_aliases($fig, $pegH, $active_genomes); >>
448    
449    Return a hash mapping PEG IDs to aliases.
450    
451    =over 4
452    
453    =item fig
454    
455    FIG-like object that can be used to access the data store.
456    
457    =item pegH
458    
459    Reference to the spreadsheet hash table. Given a row index I<$row> and a
460    column index I<$col>,
461    
462        $pegH->{$row}->{$col}
463    
464    will return a reference to a list of PEGs in the specified spreadsheet cell.
465    
466    =item active_genomes
467    
468    Reference to a hash whose keys correspond to the spreadsheet row indices
469    of genomes that should be highlighted.
470    
471    =item RETURN
472    
473    Returns a hash that takes as input a PEG ID and returns an alias. Only PEGs
474    for active genomes will be processed.
475    
476    =back
477    
478    =cut
479    
480  sub set_aliases {  sub set_aliases {
481      my($fig,$pegH) = @_;      # Get the parameters.
482        my($fig,$pegH,$active_genomes) = @_;
483      my($genomeI,$roleI,$pegs,$peg,$roleH);      my($genomeI,$roleI,$pegs,$peg,$roleH);
484    
485        # Create the return hash.
486      my $aliasesH = {};      my $aliasesH = {};
487    
488      foreach $genomeI (keys(%$pegH))      # Loop through each row that corresponds to an active genome.
489        # The active genome list contains row indices, and there is
490        # one genome per row. Note that the genome ID is never used,
491        # only the row index.
492        foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))
493      {      {
494            # Get the role hash for the specified genome. The role hash
495            # maps column indices (representing roles) to lists of PEGs.
496          $roleH = $pegH->{$genomeI};          $roleH = $pegH->{$genomeI};
497            # Loop through the role (column) indices.
498          foreach $roleI (keys(%$roleH))          foreach $roleI (keys(%$roleH))
499          {          {
500                # Get the PEG list for this row/column combination.
501              $pegs = $roleH->{$roleI};              $pegs = $roleH->{$roleI};
502                # Only proceed if data was found in the cell.
503                if (defined $pegs) {
504                    # Loop through the pegs in the cell.
505              foreach $peg (@$pegs)              foreach $peg (@$pegs)
506              {              {
507                        # If we do not already have an alias for this PEG,
508                        # compute one.
509                  if (! $aliasesH->{$peg})                  if (! $aliasesH->{$peg})
510                  {                  {
511                      $aliasesH->{$peg} = scalar &ext_id($fig,$peg);                      $aliasesH->{$peg} = scalar &ext_id($fig,$peg);
# Line 147  Line 513 
513              }              }
514          }          }
515      }      }
516        }
517        # Return the hash we built.
518      return $aliasesH;      return $aliasesH;
519  }  }
520    
521    =head3 set_colors
522    
523    C<< my $colorHash = UnvSubsys::set_colors($fig, $sub, \%pegH, \%active_genomes); >>
524    
525    Return a hash that maps each PEG in the subsystem spreadsheet to a display
526    color. Not all PEGs need to be mapped. Those that do not have a color
527    assigned will generally be displayed in white.
528    
529    =over 4
530    
531    =item fig
532    
533    FIG-like object that can be used to access the data store.
534    
535    =item sub
536    
537    Subsystem object for the current subsystem.
538    
539    =item pegH
540    
541    Reference to the spreadsheet hash table. Given a row index I<$row> and a
542    column index I<$col>,
543    
544        $pegH->{$row}->{$col}
545    
546    will return a reference to a list of PEGs in the specified spreadsheet cell.
547    
548    =item active_genomes
549    
550    Reference to a hash whose keys correspond to the spreadsheet row indices
551    of genomes that should be highlighted.
552    
553    =item RETURN
554    
555    Returns a hash that takes as input a PEG ID and returns a color. These colors
556    are used when displaying the PEGs in the subsystem spreadsheet. Only PEGs
557    for active genomes will be colored.
558    
559    =back
560    
561    =cut
562    
563  sub set_colors {  sub set_colors {
564      my($fig,$pegH) = @_;      # Get the parameters.
565      my($genomeI,$roleI,$pegs,$peg,$roleH,$peg,%pegs_in_genome);      my($fig,$sub,$pegH,$active_genomes) = @_;
566    
567        my($genomeI,$roleI,$pegs,$peg,$roleH,%pegs_in_genome);
568        # Create the return hash.
569      my $colorsH = {};      my $colorsH = {};
570        # Loop through the active genomes. The keys of "%$pegH" are the row indices
571      foreach $genomeI (keys(%$pegH))      # for rows that have at least one occupied cell in the spreadsheet. The
572      {      # Grep then reduces this list to those rows that are highlighted.
573        foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))
574        {
575            # We will use the following hash to compile a list of all the PEGs
576            # in the spreadsheet for the current genome, that is, the genome
577            # represented by row index "$genomeI".
578          undef %pegs_in_genome;          undef %pegs_in_genome;
579            # Get the hash for the current row. This hash maps column indices to
580            # lists of pegs.
581          $roleH = $pegH->{$genomeI};          $roleH = $pegH->{$genomeI};
582            # Loop through the column indices for the specified row.
583          foreach $roleI (keys(%$roleH))          foreach $roleI (keys(%$roleH))
584          {          {
585                # Here we can finally get a list of the pegs in this spreadsheet
586                # cell. We loop through them, marking them in the "%pegs_in_genome"
587                # hash.
588              $pegs = $roleH->{$roleI};              $pegs = $roleH->{$roleI};
589              foreach $peg (@$pegs)              foreach $peg (@$pegs)
590              {              {
591                  $pegs_in_genome{$peg} = 1;                  $pegs_in_genome{$peg} = 1;
592              }              }
593          }          }
594            # Extract the "%pegs_in_genome" hash keys. This gives us a duplicate-free
595            # list of all the pegs for the current spreadsheet role.
596          my @pegs = keys(%pegs_in_genome);          my @pegs = keys(%pegs_in_genome);
597          my($tuple,$peg,$color);          my($tuple,$peg,$color);
598          my $colors_for_one_genome = &set_colors_for_genome($fig,\@pegs);          # Get a hash that maps the PEG IDs to colors.
599            my $colors_for_one_genome = &set_colors_for_genome($fig,$sub, \@pegs);
600            # Loop through the hash we got back and assign the colors from that
601            # hash to the master hash we're returning to the caller.
602          while (($peg,$color) = each %$colors_for_one_genome)          while (($peg,$color) = each %$colors_for_one_genome)
603          {          {
604              $colorsH->{$peg} = $colors_for_one_genome->{$peg};              $colorsH->{$peg} = $colors_for_one_genome->{$peg};
605          }          }
606      }      }
607        # Return the color hash.
608      return $colorsH;      return $colorsH;
609  }  }
610    
611  sub set_colors_for_genome {  =head3 set_colors_for_genome
     my($fig,$pegs) = @_;  
     my($peg,@clusters,$cluster,@colors,$color,%seen,%conn,$x,$peg1,@pegs,$i);  
612    
613      my $color_of = {};  C<< my $colorHash = UnvSubsys::set_colors_for_genome($fig, $sub, \@pegs); >>
     foreach $peg (@$pegs) { $color_of->{$peg} = '#FFFFFF' }  
614    
615      @pegs = keys(%$color_of);  #  Use of keys makes @pegs entries unique  Return a reference to a hash mapping the specified pegs to colors. PEGs that
616    are physically close to each other will be painted the same color.
617    
618      foreach $peg (@pegs)  =over 4
     {  
         $conn{$peg} = [grep { $color_of->{$_} && ($_ ne $peg) } $fig->close_genes($peg,5000)];  
     }  
619    
620      @clusters = ();  =item fig
     while ($peg = shift @pegs)  
     {  
         if (! $seen{$peg})  
         {  
             $cluster = [$peg];  
             $seen{$peg} = 1;  
             for ($i=0; ($i < @$cluster); $i++)  
             {  
                 my @tmp = grep { ! $seen{$_} } @{$conn{$cluster->[$i]}};  
                 if (@tmp > 0)  
                 {  
                     foreach my $peg1 (@tmp) { $seen{$peg1} = 1 }  
                     push(@$cluster,@tmp);  
                 }  
             }  
             push(@clusters,$cluster);  
         }  
     }  
621    
622      @colors =  &cool_colors();  A fig-like object that can be used to access the data store.
623    
624      @clusters = grep { @$_ > 1 } sort { @$a <=> @$b } @clusters;  =item sub
625    
626      if (@clusters > @colors) { splice(@clusters,0,(@clusters - @colors)) }  # make sure we have enough colors  Subsystem object for the relevant subsystem.
627    
628      my($cluster);  =item pegs
629      foreach $cluster (@clusters)  
630      {  Reference to a list of PEG IDs. All of the peg IDs should be for the
631          $color = shift @colors;  same genome.
632          foreach $peg (@$cluster)  
633          {  =item RETURN
634              $color_of->{$peg} = $color;  
635          }  Returns a reference to a hash that maps each PEG ID to a color.
636      }  
637      return $color_of;  =back
638    
639    =cut
640    
641    sub set_colors_for_genome {
642        # Get the parameters.
643        my($fig, $sub, $pegs) = @_;
644        # Default all the PEGs to white.
645        my %color_of = map { $_ => '#FFFFFF' } @$pegs;
646        # Divide the pegs into clusters.
647        my @clusters = $fig->compute_clusters($pegs, $sub);
648        # Get a list of useful colors.
649        my @colors =  &cool_colors();
650        # If we have too many clusters, chop off the big ones at the end. These
651        # are least likely to be important.
652        if (@clusters > @colors) { splice(@clusters, 0, (@clusters - @colors)) }
653        # Loop through the clusters.
654        for my $cluster (@clusters) {
655            # Get the color for this cluster.
656            my $color = shift @colors;
657            # Loop through this cluster, putting this color into the color_of
658            # entries for each PEG.
659            for my $peg (@$cluster) {
660                $color_of{$peg} = $color;
661            }
662        }
663        # Return the color map.
664        return \%color_of;
665  }  }
666    
667    =head3 cool_colors
668    
669    C<< my @colorList = UnvSubsys::cool_colors(); >>
670    
671    Return a list of web-safe colors.
672    
673    =cut
674    
675  sub cool_colors {  sub cool_colors {
676   # just an array of "websafe" colors or whatever colors we want to use. Feel free to remove bad colors (hence the lines not being equal length!)   # just an array of "websafe" colors or whatever colors we want to use. Feel free to remove bad colors (hence the lines not being equal length!)
677   return (   return (
# Line 254  Line 693 
693   );   );
694  }  }
695    
696    =head3 ext_id
697    
698    C<< my $externalID = UnvSubsys::ext_id($fig, $peg); >>
699    
700    or
701    
702    C<< my @externalIDs = UnvSubsys::ext_id($fig, $peg); >>
703    
704    Return a list of non-FIG IDs for the specified feature. In a scalar context, return
705    a single non-FIG ID for the specified feature.
706    
707    This method returns IDs that are all of the same type, that is, all UniProt IDs, or
708    all KEGG IDs, and so forth. To do this, it checks the feature's alias list for IDs
709    of a given type. If it finds at least one, then all IDs of that type are returned.
710    Highest priority is given to the UniProt IDs, then SP IDs, GI IDs, and finally
711    KEGG IDs.
712    
713    =over 4
714    
715    =item fig
716    
717    A FIG-like object for accessing the data store.
718    
719    =item peg
720    
721    ID of the feature whose aliases are desired.
722    
723    =item RETURN
724    
725    In list context, a list of non-FIG IDs for the feature that are all of the same
726    type. In scalar context, the first non-FIG ID for the feature of the
727    highest-priority available type.
728    
729    =back
730    
731    =cut
732    
733  sub ext_id {  sub ext_id {
734      my($fig,$peg) = @_;      my($fig,$peg) = @_;
735    
# Line 290  Line 766 
766      }      }
767  }  }
768    
769    =head3 subsystem_curator
770    
771    C<< my $name = $unvsub->subsystem_curator(); >>
772    
773    Return the name of the subsystem curator. The database stores user names as
774    C<master:>I<name>. This method strips off the C<master:> prefix before it
775    passes the result back to the caller.
776    
777    =cut
778    
779  sub subsystem_curator {  sub subsystem_curator {
780      my($self) = @_;      my($self) = @_;
# Line 299  Line 784 
784      return $curator;      return $curator;
785  }  }
786    
787    =head3 get_roles
788    
789    C<< my @roles = $unvsub->get_roles(); >>
790    
791    Return a list of the roles (columns) for this subsystem. The roles will be
792    returned in column order, so that if you access the sixth element of the
793    return list, you'll get the name of the role for the sixth column.
794    
795    =cut
796    
797  sub get_roles {  sub get_roles {
798      my($self) = @_;      my($self) = @_;
799        # The role index in this object is a list of 3-tuples. The caller only
800        # wants the first element of each tuple, which is the role name.
801      return map { $_->[0] } @{$self->{Roles}};      return map { $_->[0] } @{$self->{Roles}};
802  }  }
803    
804    =head3 get_genome_index
805    
806    C<< my $index = $unvsub->get_genome_index($genome); >>
807    
808    Return the row index of the specified genome.
809    
810    =over 4
811    
812    =item genome
813    
814    ID of the genome whose row index is desired.
815    
816    =item RETURN
817    
818    Returns the index of the row corresponding to the specified genome, or an
819    undefined value if the genome is not represented in the subsystem
820    spreadsheet.
821    
822    =back
823    
824    =cut
825    
826  sub get_genome_index {  sub get_genome_index {
827      my($self,$genome) = @_;      my($self,$genome) = @_;
828    
829      return $self->{GenomeIndex}->{$genome};      return $self->{GenomeIndex}->{$genome};
830  }  }
831    
832    =head3 get_genomes
833    
834    C<< my @genomes = $unvsub->get_genomes(); >>
835    
836    Return a list of the genome IDs for the subsystem. The genomes will be
837    presented in row order. In other words, if you index into the sixth
838    element of the return list, you will retrieve the ID of the genome for
839    the sixth row.
840    
841    =cut
842    
843  sub get_genomes {  sub get_genomes {
844      my($self) = @_;      my($self) = @_;
845        # The genome array is a list of 2-tuples. We extract the first
846        # element of each tuple, which is the genome ID.
847      return map { $_->[0] } @{$self->{Genomes}};      return map { $_->[0] } @{$self->{Genomes}};
848  }  }
849    
850    =head3 get_variant_code
851    
852    C<< my $code = $unvsub->get_variant_code($genome); >>
853    
854    Return the variant code for a genome. Each subsystem has several variations.
855    The variant code indicates which variation of a subsystem is used by a
856    particular genome.
857    
858    Genome data is stored in a list of 2-tuples. The first element is the genome
859    ID; the second is the variant code.
860    
861    =over 4
862    
863    =item genome
864    
865    ID or row index of the genome whose variant code is desired.
866    
867    =item RETURN
868    
869    Returns the variant code for the specified genome.
870    
871    =back
872    
873    =cut
874    
875  sub get_variant_code {  sub get_variant_code {
876      my($self,$genome) = @_;      my($self,$genome) = @_;
877        # Check to see if we have a row index.
878      if ($genome =~ /^\d+$/)      if ($genome =~ /^\d+$/)
879      {      {
880            # Here we have a row index, so use it to find the genome's variant
881            # code.
882          return $self->{Genomes}->[$genome]->[1];          return $self->{Genomes}->[$genome]->[1];
883      }      }
884      else      else
885      {      {
886            # Here we have a genome ID, so we need to convert it to a row index.
887          my $genomeI = $self->{GenomeIndex}->{$genome};          my $genomeI = $self->{GenomeIndex}->{$genome};
888          return $self->{Genomes}->[$genomeI]->[1];          return $self->{Genomes}->[$genomeI]->[1];
889      }      }
890  }  }
891    
892    =head3 get_pegs_from_cell
893    
894    C<< my @pegs = $unvsub->get_pegs_from_cell($genome, $role); >>
895    
896    Return a list of the features in a specified spreadsheet cell. The cell is specified
897    by genome ID and role ID.
898    
899    =over 4
900    
901    =item genome
902    
903    ID of the genome relevant to the cell.
904    
905    =item role
906    
907    ID of the role relevant to the cell.
908    
909    =item RETURN
910    
911    Returns a list of the features in the cell, or an empty list if the cell is empty.
912    
913    =back
914    
915    =cut
916    
917  sub get_pegs_from_cell {  sub get_pegs_from_cell {
918      my($self,$genome,$role) = @_;      my($self,$genome,$role) = @_;
919        # Convert the genome and role IDs to row and column indices.
920      my $genomeI = $self->{GenomeIndex}->{$genome};      my $genomeI = $self->{GenomeIndex}->{$genome};
921      my $roleI   = $self->{RoleIndex}->{$role};      my $roleI   = $self->{RoleIndex}->{$role};
922        # Get the pegs from the cell and return them.
923      my $pegs    = $self->{PegHash}->{$genomeI}->{$roleI};      my $pegs    = $self->{PegHash}->{$genomeI}->{$roleI};
924      return $pegs ? @$pegs : ();      return $pegs ? @$pegs : ();
925  }  }
926    
927    =head3 get_diagrams
928    
929    C<< my @list = $sub->get_diagrams(); >>
930    
931    Return a list of the diagrams associated with this subsystem. Each diagram
932    is represented in the return list as a 4-tuple C<[diagram_id, diagram_name,
933    page_link, img_link]> where
934    
935    =over 4
936    
937    =item diagram_id
938    
939    ID code for this diagram.
940    
941    =item diagram_name
942    
943    Displayable name of the diagram.
944    
945    =item page_link
946    
947    URL of an HTML page containing information about the diagram.
948    
949    =item img_link
950    
951    URL of an HTML page containing an image for the diagram.
952    
953    =back
954    
955    Note that the URLs are in fact for CGI scripts with parameters that point them
956    to the correct place. Though Sprout has diagram information in it, it has
957    no relationship to the diagrams displayed in SEED, so the work is done entirely
958    on the SEED side.
959    
960    =cut
961    
962    sub get_diagrams {
963        # Get the parameters.
964        my ($self) = @_;
965        # Return the diagram list.
966        return @{$self->{Diagrams}};
967    }
968    
969  sub get_notes {  sub get_notes {
970      my($self) = @_;      my($self) = @_;
971    
# Line 400  Line 1025 
1025      return $self->{Colors}->{$peg};      return $self->{Colors}->{$peg};
1026  }  }
1027    
1028  1;  =head3 active_genomes
1029    
1030    C<< my $activeHash = UnvSubsys::active_genomes(\@row_subsets, $active_subsetR, $focus, \%genomeH, \@genomes_info); >>
1031    
1032    Return a hash containing the active genomes for this subsystem display. The
1033    keys of the hash will be the row indices of the genomes to be highlighted on the
1034    display. Each genome ID will map to 1. Thus, if C<< $activeHash->{3} >>
1035    tests TRUE, the fourth row should be highlighted.
1036    
1037    The rules for determining the active genomes are as follows. If I<$active_subsetR> is
1038    specified, it is presumed to be the ID of the subset containing the active genomes.
1039    If it is not specified and I<$focus> is specified, then I<$focus> is presumed to be the
1040    ID of the genome currently in focus, and the active genomes will be the ones in the
1041    smallest row subset containing the genome in focus. If neither I<$active_subsetR> nor
1042    I<$focus> are specified, then all genomes are active.
1043    
1044    =over 4
1045    
1046    =item row_subsets
1047    
1048    Reference to a list of 2-tuples. Each tuple consists of a row subset ID followed by
1049    a reference to a list of the row indices for the rows in the identified subset.
1050    
1051    =item active_subsetR (optional)
1052    
1053    ID of the active subset (if any).
1054    
1055    =item focus
1056    
1057    ID of the genome currently in focus (if any). If there is no active subset, then
1058    the smallest subset containing this genome will be made active.
1059    
1060    =item genomeH
1061    
1062    Reference to a hash of genome IDs to row indices. The keys of this hash are
1063    the genomes in this subsystem, which also form the subsystem spreadsheet's
1064    rows.
1065    
1066    =item genomes_info
1067    
1068    Reference to a list of 2-tuples. The first element of each 2-tuple is the
1069    ID of a genome; the second is the variant code for the subsystem variant
1070    used by the genome. The tuples are ordered by row index, so that the ID
1071    and variant code of the genome in a particular row can be located by indexing
1072    into this parameter using the subsystem spreadsheet row number.
1073    
1074    =item RETURN
1075    
1076    Returns a reference to a hash that maps the row indices of the active genomes
1077    to 1. This hash can be used to quickly determine whether or not a particular
1078    row is to be highlighted.
1079    
1080    =back
1081    
1082    =cut
1083    
1084    sub active_genomes {
1085        # Get the parameters.
1086        my($fig, $row_subsets, $active_subsetR, $focus, $genomeH, $genomes_info) = @_;
1087        my($i,@bestL);
1088        # Declare the return variable.
1089        my $active_genomes = {};
1090        # Check for an active subset.
1091        if ($active_subsetR)
1092        {
1093            # Search for the active subset in the row subset array.
1094            for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}
1095            if ($i < @$row_subsets)
1096            {
1097                # Here we found the active subset, so we extract its list of row indices.
1098                @bestL = @{$row_subsets->[$i]->[1]};
1099            }
1100            else {
1101                # Here we have the ID of the active subset. First, we search for that ID
1102                # in the row subset list.
1103                for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}
1104                if ($i < @$row_subsets)
1105                {
1106                    # Here we found the named subset, so we return its member list.
1107                    @bestL = @{$row_subsets->[$i]->[1]};
1108                }
1109                else
1110                {
1111                    # Here the active subset does not exist. We punt by extracting a
1112                    # list of all the row indices in the spreadsheet.
1113                    $active_subsetR = 'All';
1114                    @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1115                }
1116            }
1117        }
1118        elsif ($focus)
1119        {
1120            # Here we don't have an active row subset, but a particular genome is in
1121            # focus. We'll look for the smallest row subset containing the genome
1122            # in focus. First, we need to prime the loop. "$bestN" will be the ID
1123            # of the best subset found so far; "@bestL" is where we stash the list
1124            # of IDs in the subset. Our initial selection, then, will be the
1125            # fake "All" subset, which contains the entire collection of rows.
1126            if (! $fig->is_complete($focus))
1127            {
1128                # Here the gnome in focus is incomplete, so it won't be anywhere
1129                # in our list. We default to making everything active.
1130                $active_subsetR = 'All';
1131                @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1132            } else {
1133                my $bestN   = "All";
1134                @bestL   = map { $genomeH->{$_} } keys(%$genomeH);
1135                # Next, we get the row index for the genome in focus.
1136                my $focusIndex = $genomeH->{$focus};
1137                # Now we loop through all the row subsets.
1138                my $tuple;
1139                foreach $tuple (@$row_subsets)
1140                {
1141                    # Extract the subset ID and its list of row indices. The latter is
1142                    # in "$genomeIs".
1143                    my($id,$genomeIs) = @$tuple;
1144                    # Search for the index of the focus row in the current subset's list
1145                    # of row indices.
1146                    for ($i=0; ($i < @$genomeIs) && ($genomeIs->[$i] != $focusIndex); $i++) {}
1147                    # Now either $i will be the index of the focus row in the subset, or
1148                    # it is going to be equal to the number of rows in the subset.
1149                    if ($i < @$genomeIs)
1150                    {
1151                        # We've found the focus row in this subset. Select it as the new
1152                        # best subset if it's smaller than the last one we found.
1153                        if (@$genomeIs < @bestL)
1154                        {
1155                            $bestN  = $id;
1156                            @bestL = @$genomeIs;
1157                        }
1158                    }
1159                }
1160                # Save the best subset found as the active one.
1161                $active_subsetR = $bestN;
1162            }
1163        } else {
1164            # Here we have nothing: no active subset, and no focus row. We make
1165            # all rows active.
1166            $active_subsetR = 'All';
1167            @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1168        }
1169        # "@bestL" now contains a list of the row indices for the active genomes.
1170        # We convert it to a hash and return it.
1171        my %active_genomes = map { $_ => 1 } @bestL;
1172        return \%active_genomes;
1173    }
1174    
1175    1;

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.14

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3