[Bio] / FigKernelPackages / UnvSubsys.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/UnvSubsys.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.10, Tue Sep 27 00:58:27 2005 UTC revision 1.19, Thu Dec 6 13:59:34 2007 UTC
# Line 1  Line 1 
1    #
2    # Copyright (c) 2003-2006 University of Chicago and Fellowship
3    # for Interpretations of Genomes. All Rights Reserved.
4    #
5    # This file is part of the SEED Toolkit.
6    #
7    # The SEED Toolkit is free software. You can redistribute
8    # it and/or modify it under the terms of the SEED Toolkit
9    # Public License.
10    #
11    # You should have received a copy of the SEED Toolkit Public License
12    # along with this program; if not write to the University of Chicago
13    # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14    # Genomes at veronika@thefig.info or download a copy from
15    # http://www.theseed.org/LICENSE.TXT.
16    #
17    
18  package UnvSubsys;  package UnvSubsys;
19    
20  use Subsystem;  use Subsystem;
21  use Carp;  use Carp;
22  use FIG;  use FIG;
23    use SFXlate;
24  use Data::Dumper;  use Data::Dumper;
25  use strict;  use strict;
26    use Tracer;
27    
28  sub new  =head1 Universal Subsystem Object
 {  
     my($class, $ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, $peg_colors) = @_;  
29    
30      $ssa =~ s/ /_/g;  =head2 Introduction
31    
32    The universal subsystem object provides methods used to display useful information
33    about a subsystem. Its intent is to support both Sprout and SEED.
34    
35          ### { Roles =>Roles,  The soul of a subsystem is its spreadsheet. The columns of the spreadsheet are
36          ###   RoleIndex => ToRoleIndexHash,  the subsystem roles. The rows are the genomes that contain the subsystem. PEGs
37          ###   RoleSubsets => ColSubsets,  are stored in the spreadsheet's cells. Each genome can be identified by genome
38          ###   Genomes => Genomes,  ID or row index. Each role can be identified by role name or column index.
         ###   GenomeIndex => ToGenomeIndexHash,  
         ###   PegHash => PegHash,  
         ###   Colors  => ColorHash,  
         ###   Aliases => AliasHash,  
         ###   Curator => Curator,  
         ###   Notes => Notes,  
         ###   Reactions => ReactionHash  
         ### }  
         ###  
         ### Roles = pointer to a list of [Role,Abbrev,[ReactionURLs]]  
         ###  
         ### ToRoleIndexHash = a pointer to a hash: key=Role Value=RoleIndex  
         ###  
         ### ColSubsets = pointer to a list of [SubsetName,[RoleIndexesFrom0]]  
         ###  
         ### RowSubSets = pointer to a list of [SubsetName,[GenomeIndexesFrom0]]  
         ###  
         ### Genomes is a pointer to a list of [Genome,Variant]  
         ###  
         ### ToGenomeIndexHash = a pointer to a hash: key=Genome value=GenomeIndex  
         ###  
         ### PegHash = a pointer to a hash of hashes such that $peg_hash->{$genome_index}->{$role_index} = a  
         ###           pointer to a list of PEGs  
         ###  
         ### ColorHash is a hash: key=PEG value=color  
         ###  
         ### AliasHash is a hash: key=PEG value=aliases  
         ###  
         ### ReactionHash is a hash: key=Role value=[reaction-ids]  
39    
40      if ((ref($fig) eq "FIG") || ((ref($fig) eq 'SFXlate') && ($fig = $fig->{'fig'})))  It is worth noting that the term I<subset> has two meanings-- a subset of roles
41      {  or a subset of genomes. A subset of roles is called a I<column subset>. A
42          my $subsystem = new Subsystem($ssa,$fig,0);  subset of genomes is called a I<row subset>.
43    
44    The object created contains a great deal of information, so it's worthwhile to
45    stop for a moment and discuss it. The object will have the following members.
46    
47    =over 4
48    
49    =item Roles
50    
51    A list of 3-tuples, each consisting of a role name, the abbreviated role name,
52    and a list of URLs for the reactions. Each role is a column in the
53    subsystem spreadsheet. Indexing into the list by column index yields the
54    ID, abbreviation, and reactions for the role in that column.
55    
56    =item RoleIndex
57    
58    A hash mapping each role to its column index in the spreadsheet.
59    
60    =item RoleSubsets
61    
62    A list of 2-tuples, each consisting of a subset name followed by a list of the
63    column indices for the roles in the subset.
64    
65    =item Genomes
66    
67    A list of 2-tuples, each containing the genome ID for the relevant row and the
68    variant code for that genome. Subsystems can have several variations, and the
69    variant code indicates which variant of the subsystem that the genome uses.
70    There is one genome for each row in the spreadsheet. Indexing into the list
71    yields the ID of the genome for that row and its variant code.
72    
73    =item GenomeIndex
74    
75    A hash mapping each genome ID to its row index in the spreadsheet.
76    
77    =item PegHash
78    
79    A hash of hashes containing the spreadsheet cells. If C<$pegHash> is the hash
80    of hashes, C<$row> is a genome index, and C<$col> is a role index, then
81    
82        $pegHash->{$row}->{$col}
83    
84    returns a reference to a list of the IDs for the PEGs in the relevant
85    spreadsheet cell.
86    
87    =item ColorHash
88    
89    A hash mapping each PEG ID to the color that should be used to represent that
90    PEG in the display.
91    
92    =item AliasHash
93    
94    A hash mapping each PEG ID to a list of its aliases.
95    
96    =item ReactionHash
97    
98    A hash mapping each role to the list of reactions it catalyzes.
99    
100    =back
101    
102    =head2 Public Methods
103    
104    =head3 new
105    
106        my $usub = UnvSubsys->new($ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, \@peg_colors);
107    
108    Construct a new universal subsystem object for a specified subsystem.
109    
110    =over 4
111    
112    =item ssa
113    
114    Name of the subsystem.
115    
116    =item fig
117    
118    Data access object, either a FIG object or an SFXlate object.
119    
120    =item active_subsetR
121    
122    Name of the active row subset. A row subset names a group of genomes.
123    
124    =item focus
125    
126    ID of the genome currently in focus.
127    
128    =item show_clusters
129    
130    TRUE if clusters should be painted by color, else FALSE.
131    
132    =item aliases
133    
134    TRUE if PEG aliases should be shown, else FALSE.
135    
136    =item peg_colors (optional)
137    
138    Reference to a list of 2-tuples, each 2-tuple consisting of a PEG ID and the color
139    to be assigned to the PEG.
140    
141    =back
142    
143    =cut
144    
145    sub new {
146        # Get the parameters.
147        my($class, $ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, $peg_colors) = @_;
148        # Fix the subsystem name. Spaces are replaced by underscores to avoid naming problems
149        # in the seed's file system.
150        $ssa =~ s/ /_/g;
151        # Get the FIG object. At this point in time, we're only getting data from the SEED. On
152        # a future pass through the code, we'll get data from a SEED or Sprout.
153        if ((ref($fig) eq "FIG") || (ref($fig) eq "FIGV") || (ref($fig) eq 'SFXlate')) {
154            # Create a subsystem object. The "get_subsystem" method is provided by both FIG
155            # and SFXlate; however, the object returned in each case is different: FIG
156            # returns a "Subsystem" object; SFXlate returns a "SproutSubsys" object.
157            my $subsystem = $fig->get_subsystem($ssa);
158            # Get the key subsystem data. Note that CRs in the notes are converted to LFs
159            # in case the notes come from a Mac.
160          my $curator = $subsystem->get_curator;          my $curator = $subsystem->get_curator;
161          my $notes = $subsystem->get_notes;          my $notes = $subsystem->get_notes;
162          $notes =~ s/ /\n/g;          $notes =~ s/\r/\n/g;
163          my @roles = $subsystem->get_roles;          my @roles = $subsystem->get_roles;
164          my $reactions = $subsystem->get_reactions;          my $reactions = $subsystem->get_reactions;
165          my @genomes = $subsystem->get_genomes;          my @genomes = $subsystem->get_genomes;
166          my @col_subsets = $subsystem->get_subset_namesC;          my @col_subsets = $subsystem->get_subset_namesC;
167            my @diagrams = $subsystem->get_diagrams();
168            # Create the data structures for the role list and the role index.
169          my $role_info = [];          my $role_info = [];
170          my $roleH     = {};          my $roleH     = {};
171            # Loop through the roles to create the role list. The $i index will move
172            # through the columns of the spreadsheet.
173          my($i,$j,$subset,$peg);          my($i,$j,$subset,$peg);
174          for ($i=0; ($i < @roles); $i++)          for ($i=0; ($i < @roles); $i++)
175          {          {
176                # Get the ID of the role in the current column.
177              my $role = $roles[$i];              my $role = $roles[$i];
178              my $abbrev = $subsystem->get_role_abbr( $subsystem->get_role_index( $role ) );              # Extract its abbreviation.
179                my $abbrev = $subsystem->get_role_abbr($i);
180                # Get the reactions. Note that its possible no reactions were found. If there were
181                # reactions found, however, we convert from reaction IDs to a comma-delimited
182                # list of HTML code full of hyperlinks.
183              my $react = $reactions ? join(",", map { &HTML::reaction_link($_) } @{$reactions->{$role}}) : [];              my $react = $reactions ? join(",", map { &HTML::reaction_link($_) } @{$reactions->{$role}}) : [];
184                # Form the role name, abbreviation, and reaction list into a 3-tuple and push
185                # them onto the main role list.
186              push(@$role_info,[$role,$abbrev,$react]);              push(@$role_info,[$role,$abbrev,$react]);
187                # Set the role hash so that we can get back the column index for a given
188                # role name.
189              $roleH->{$role} = $i;              $roleH->{$role} = $i;
190          }          }
191            # Get the column subsets. A column subset is a list of role IDs, so we need to
192            # convert it to a set of column indices. We ignore the special "All" subset
193            # that contains everything.
194          my $subset_info = [];          my $subset_info = [];
195          foreach $subset (@col_subsets)          foreach $subset (@col_subsets)
196          {          {
# Line 80  Line 199 
199                  push(@$subset_info,[$subset,[map { $roleH->{$_} } $subsystem->get_subsetC_roles($subset)]]);                  push(@$subset_info,[$subset,[map { $roleH->{$_} } $subsystem->get_subsetC_roles($subset)]]);
200              }              }
201          }          }
202            # Now we create the genome directories. For each genome we need to be able to
203            # hash from the genomeID to the specified role index and we must be able to
204            # get the genome's variant code. (The variant code indicates which subsystem
205            # variant the genome uses.
206          my $genomes_info = [];          my $genomes_info = [];
207          my $genomeH      = {};          my $genomeH      = {};
208          for ($i=0; ($i < @genomes); $i++)          for ($i=0; ($i < @genomes); $i++)
209          {          {
210                # Get the genome ID for this row.
211              my $genome  = $genomes[$i];              my $genome  = $genomes[$i];
212              my $variant = $subsystem->get_variant_code( $subsystem->get_genome_index( $genome ) );              # Get its variant code.
213                my $variant = $subsystem->get_variant_code($i);
214                # Form them into a 2-tuple and add the result to the genome list.
215              push(@$genomes_info,[$genome,$variant]);              push(@$genomes_info,[$genome,$variant]);
216                # Set up the hash to get from the genome ID to the row index.
217              $genomeH->{$genome} = $i;              $genomeH->{$genome} = $i;
218          }          }
219    
220            # Next we gather the data from the actual spreadsheet cells. For an SFXlate
221            # object, this is the most expensive part of the operation, since it requires
222            # a database call for each cell.
223          my $pegH = {};          my $pegH = {};
224            # $i is the row index, which cycles through genomes.
225          for ($i=0; ($i < @genomes); $i++)          for ($i=0; ($i < @genomes); $i++)
226          {          {
227              my $genome = $genomes[$i];              # $j is the column index, which cycles through roles.
228              for ($j=0; ($j < @roles); $j++)              for ($j=0; ($j < @roles); $j++)
229              {              {
230                  my $role = $roles[$j];                  my @pegs = $subsystem->get_pegs_from_cell($i,$j);
                 my @pegs = $subsystem->get_pegs_from_cell($genome,$role);  
231                  $pegH->{$i}->{$j} = [@pegs];                  $pegH->{$i}->{$j} = [@pegs];
232              }              }
233          }          }
234    
235          my $row_subsets    = &row_subsets($fig,$genomeH,$genomes_info);          # Get the row subsets. Row subsets are determined by the genome
236          my $active_genomes = &active_genomes($row_subsets,$active_subsetR,$focus,$genomeH,$genomes_info);          # taxonomy, so these are different from the row subsets stored
237            # in the subsystem object.
238            my $row_subsets    = &row_subsets($fig, $genomeH);
239            # Here we try to get a list of the active genomes. The caller
240            # gives us hints in the form of the "$focus" and "$active_subsetR"
241            # parameters. If, for example, we are using this object to generate
242            # a subsystem web page, the focus information would steer us to
243            # whatever the user wants to look at on the page.
244            my $active_genomes = &active_genomes($fig, $row_subsets,$active_subsetR,$focus,
245                                                 $genomeH,$genomes_info);
246    
247            # Now we generate a table of colors for the various PEGs. If the
248            # caller gave us a map of peg IDs to colors, we use that. Otherwise,
249            # we allow the option of painting the PEGs by cluster number. (The
250            # caller indicates this by setting the "show_clusters" flag in the
251            # parameter list.)
252          my $colorsH;          my $colorsH;
253          if ($peg_colors)          if ($peg_colors)
254          {          {
255                # Here the caller gave us a list of peg colors. The list contains
256                # 2-tuples, each consisting of a PEG ID followed by the
257                # color value. The loop below extracts the pairs and stuffs them
258                # into the color hash.
259              $colorsH = {};              $colorsH = {};
260              foreach $_ (@$peg_colors)              foreach $_ (@$peg_colors)
261              {              {
# Line 118  Line 265 
265          }          }
266          elsif ($show_clusters)          elsif ($show_clusters)
267          {          {
268              $colorsH  = &set_colors($fig,$pegH,$active_genomes);              # Here the user wants us to base the colors on the genome clustering
269                # information.
270                $colorsH  = &set_colors($fig,$subsystem,$pegH,$active_genomes);
271          }          }
272          else          else
273          {          {
274                # Here the user is not interested in coloring the PEGs.
275              $colorsH = {};              $colorsH = {};
276          }          }
277            # If the user wants to see aliases, compute the alias hash. Aliases
278            # will only be computed for PEGs belonging to active (highlighted)
279            # genomes, and there is a maximum of one alias per PEG.
280          my $aliasesH = $aliases ? &set_aliases($fig,$pegH,$active_genomes) : {};          my $aliasesH = $aliases ? &set_aliases($fig,$pegH,$active_genomes) : {};
281          my $reactions = $subsystem->get_reactions;          # Create and bless the UnvSubsys object.
282          my $self = { Roles => $role_info,          my $self = { SSA => $ssa,
283                         Roles => $role_info,
284                       RoleIndex => $roleH,                       RoleIndex => $roleH,
285                       RoleSubsets => $subset_info,                       RoleSubsets => $subset_info,
286                       Genomes => $genomes_info,                       Genomes => $genomes_info,
# Line 138  Line 291 
291                       Aliases => $aliasesH,                       Aliases => $aliasesH,
292                       Curator => $curator,                       Curator => $curator,
293                       Notes => $notes,                       Notes => $notes,
294                       Reactions => $reactions                       Reactions => $reactions,
295                         Diagrams => \@diagrams,
296                     };                     };
297          bless($self, $class);          bless($self, $class);
298            # Return the object.
299          return $self;          return $self;
300      }      }
301      else      else
302      {      {
303            # Here the FIG-like object was not recognized, so we return an
304            # undefined value.
305          return undef;          return undef;
306      }      }
307  }  }
308    
309    =head3 get_ssa
310    
311        my $ssa = $unvsub->get_ssa();
312    
313    Return the name of the subsystem
314    
315    =cut
316    
317    sub get_ssa {
318        my($self) = @_;
319        return $self->{SSA};
320    }
321    
322    =head3 get_ssa_pretty
323    
324        my $ssa = $unvsub->get_ssa_pretty();
325    
326    Return the 'prettyfied' name of the subsystem
327    
328    =cut
329    
330    sub get_ssa_pretty{
331        my($self) = @_;
332        my $ssa = $self->{SSA};
333        $ssa =~ s/_/ /g;
334        return $ssa;
335    }
336    
337    
338    =head3 get_subset_namesR
339    
340        my @names = $unvsub->get_subset_namesR();
341    
342    Return the names of the genome (row) subsets.
343    
344    =cut
345    
346  sub get_subset_namesR {  sub get_subset_namesR {
347      my($self) = @_;      my($self) = @_;
348    
349      return map { $_->[0] } @{$self->{GenomeSubsets}};      return map { $_->[0] } @{$self->{GenomeSubsets}};
350  }  }
351    
352    =head3 get_subsetR
353    
354        my @genomes = $unvsub->get_subsetR($set);
355    
356    Return a list of the genome IDs covered by a row subset.
357    
358    =over 4
359    
360    =item set
361    
362    Name of the row subset whose genomes are desired.
363    
364    =item RETURN
365    
366    Returns a list of the IDs for the genomes found in the specified row
367    set.
368    
369    =back
370    
371    =cut
372    
373  sub get_subsetR {  sub get_subsetR {
374        # Get the parameters.
375      my($self,$set) = @_;      my($self,$set) = @_;
376      my($i);      my($i);
377        # Get the list of row subsets.
378      my $sets = $self->{GenomeSubsets};      my $sets = $self->{GenomeSubsets};
379        # Find the row subset with the specified name. The row subset list is a
380        # list of 2-tuples, and the first element of each tuple is the set
381        # name.
382      for ($i=0; ($i < @$sets) && ($sets->[$i]->[0] ne $set); $i++) {}      for ($i=0; ($i < @$sets) && ($sets->[$i]->[0] ne $set); $i++) {}
383      if ($i < @$sets)      if ($i < @$sets)
384      {      {
385            # Here we found the named subset. The subset tuple's second element is
386            # the list of row indices. We map these to genome IDs before returning
387            # them.
388          return map { $self->{Genomes}->[$_]->[0] } @{$sets->[$i]->[1]}          return map { $self->{Genomes}->[$_]->[0] } @{$sets->[$i]->[1]}
389      }      }
390        # Here we subset was not found, so we return the undefined value.
391      return undef;      return undef;
392  }  }
393    
394    =head3 get_subsetR
395    
396        my @pairs = $unvsub->get_subsetsR();
397    
398    Return a list of all the row subsets. The subsets are returned in the form
399    of 2-tuples, each consisting of a subset name followed by a reference to a
400    list of genome IDs. The genome IDs correspond to the rows in the subset.
401    
402    =cut
403    
404  sub get_subsetsR {  sub get_subsetsR {
405        # Get the parameters.
406      my($self) = @_;      my($self) = @_;
407        # Extract the list of genome subsets. This list is in the form of
408        # 2-tuples, but the rows are identified by row index, not genome ID.
409      my $sets = $self->{GenomeSubsets};      my $sets = $self->{GenomeSubsets};
410        # Create the return list.
411      my @pairs = ();      my @pairs = ();
412        # Loop through the subsets.
413      my $pair;      my $pair;
414      foreach $pair (@$sets)      foreach $pair (@$sets)
415      {      {
416            # Convert this subset's member list from row indices to genome IDs
417            # and stash the result in the return list.
418          my($id,$members) = @$pair;          my($id,$members) = @$pair;
419          push(@pairs,[$id,[map { $self->{Genomes}->[$_]->[0] } @$members]]);          push(@pairs,[$id,[map { $self->{Genomes}->[$_]->[0] } @$members]]);
420      }      }
421        # Return the list constructed.
422      return @pairs;      return @pairs;
423  }  }
424    
425    =head3 row_subsets
426    
427        my $subsetList = UnvSubsys::row_subsets($fig, \%genomeH);
428    
429    This method computes the taxonomic row subsets for a subsystem. It takes
430    as input a hash that maps genome IDs to column indices and a FIG object.
431    The FIG object provides a list of taxonomic groups of 10 or more complete
432    genomes. From the list, we extract subsets which have more than 10
433    genomes in the list of subsystem genomes. If no such subsets exist,
434    we extract subsets which have at least 1 genome from the list of
435    subsystem genomes. The subsets are returned as 2-tuples, the first
436    element being the subset ID and the second being a list of row indices
437    for the genomes in the subset.
438    
439    =over 4
440    
441    =item fig
442    
443    A FIG-like object for accessing the data store.
444    
445    =item genomeH
446    
447    Reference to a hash that maps each genome ID to its row index in the
448    subsystem spreadsheet.
449    
450    =item RETURN
451    
452    Returns a reference to a list of 2-tuples. Each 2-tuple consists of a
453    subset ID followed by a list of the row indices for the genomes in the
454    subset.
455    
456    =back
457    
458    =cut
459    
460  sub row_subsets {  sub row_subsets {
461      my ($fig,$genomeH,$genomes_info) = @_;      my ($fig, $genomeH) = @_;
462    
463        # We need a real FIG object, since SFXlate does not yet support
464        # taxonomy trees. The "FIG" method does this for us.
465        $fig = $fig->FIG();
466        # Initialize the return value.
467      my $subsets = [];      my $subsets = [];
468      my $taxonomic_groups = $fig->taxonomic_groups_of_complete(10);      # Get a list of taxonomic groups. This will come back as a list of
469        # 2-tuples.
470        my $taxonomic_groups = $fig->taxonomic_groups_of_complete(5);
471        # Loop through the 2-tuples. We're looking for subsets which
472        # contain at least one genome on the subsystem's spreadsheet.
473      my($pair,$id,$members);      my($pair,$id,$members);
474      foreach $pair (@$taxonomic_groups)      foreach $pair (@$taxonomic_groups)
475      {      {
476          ($id,$members) = @$pair;          ($id,$members) = @$pair;
477    #       warn "Group $id is @$members\n";
478    #       if ($id eq 'All')
479    #       {
480    #           push(@$members, '372461.6');
481    #       }
482    
483            # Extract the genomes in the member list that participate in this
484            # subsystem. To do this, we convert each genome ID to its row
485            # index. If no row index exists, the GREP condition discards the
486            # member.
487          my @mem = grep { defined($_) } map { $genomeH->{$_} } @$members;          my @mem = grep { defined($_) } map { $genomeH->{$_} } @$members;
488            # If there are enough members, save the subset.
489          if (@mem > 0)          if (@mem > 0)
490          {          {
491              push(@$subsets,[$id,[@mem]]);              push(@$subsets,[$id,[@mem]]);
492          }          }
493      }      }
494        # Return the list of row subsets.
495      return $subsets;      return $subsets;
496  }  }
497    
498    =head3 set_aliases
499    
500        my $aliasHash = UnvSubsys::set_aliases($fig, $pegH, $active_genomes);
501    
502    Return a hash mapping PEG IDs to aliases.
503    
504    =over 4
505    
506    =item fig
507    
508    FIG-like object that can be used to access the data store.
509    
510    =item pegH
511    
512    Reference to the spreadsheet hash table. Given a row index I<$row> and a
513    column index I<$col>,
514    
515        $pegH->{$row}->{$col}
516    
517    will return a reference to a list of PEGs in the specified spreadsheet cell.
518    
519    =item active_genomes
520    
521    Reference to a hash whose keys correspond to the spreadsheet row indices
522    of genomes that should be highlighted.
523    
524    =item RETURN
525    
526    Returns a hash that takes as input a PEG ID and returns an alias. Only PEGs
527    for active genomes will be processed.
528    
529    =back
530    
531    =cut
532    
533  sub set_aliases {  sub set_aliases {
534        # Get the parameters.
535      my($fig,$pegH,$active_genomes) = @_;      my($fig,$pegH,$active_genomes) = @_;
536      my($genomeI,$roleI,$pegs,$peg,$roleH);      my($genomeI,$roleI,$pegs,$peg,$roleH);
537    
538        # Create the return hash.
539      my $aliasesH = {};      my $aliasesH = {};
540    
541        # Loop through each row that corresponds to an active genome.
542        # The active genome list contains row indices, and there is
543        # one genome per row. Note that the genome ID is never used,
544        # only the row index.
545      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))
546      {      {
547            # Get the role hash for the specified genome. The role hash
548            # maps column indices (representing roles) to lists of PEGs.
549          $roleH = $pegH->{$genomeI};          $roleH = $pegH->{$genomeI};
550            # Loop through the role (column) indices.
551          foreach $roleI (keys(%$roleH))          foreach $roleI (keys(%$roleH))
552          {          {
553                # Get the PEG list for this row/column combination.
554              $pegs = $roleH->{$roleI};              $pegs = $roleH->{$roleI};
555                # Only proceed if data was found in the cell.
556                if (defined $pegs) {
557                    # Loop through the pegs in the cell.
558              foreach $peg (@$pegs)              foreach $peg (@$pegs)
559              {              {
560                        # If we do not already have an alias for this PEG,
561                        # compute one.
562                  if (! $aliasesH->{$peg})                  if (! $aliasesH->{$peg})
563                  {                  {
564                      $aliasesH->{$peg} = scalar &ext_id($fig,$peg);                      $aliasesH->{$peg} = scalar &ext_id($fig,$peg);
# Line 222  Line 566 
566              }              }
567          }          }
568      }      }
569        }
570        # Return the hash we built.
571      return $aliasesH;      return $aliasesH;
572  }  }
573    
574    =head3 set_colors
575    
576        my $colorHash = UnvSubsys::set_colors($fig, $sub, \%pegH, \%active_genomes);
577    
578    Return a hash that maps each PEG in the subsystem spreadsheet to a display
579    color. Not all PEGs need to be mapped. Those that do not have a color
580    assigned will generally be displayed in white.
581    
582    =over 4
583    
584    =item fig
585    
586    FIG-like object that can be used to access the data store.
587    
588    =item sub
589    
590    Subsystem object for the current subsystem.
591    
592    =item pegH
593    
594    Reference to the spreadsheet hash table. Given a row index I<$row> and a
595    column index I<$col>,
596    
597        $pegH->{$row}->{$col}
598    
599    will return a reference to a list of PEGs in the specified spreadsheet cell.
600    
601    =item active_genomes
602    
603    Reference to a hash whose keys correspond to the spreadsheet row indices
604    of genomes that should be highlighted.
605    
606    =item RETURN
607    
608    Returns a hash that takes as input a PEG ID and returns a color. These colors
609    are used when displaying the PEGs in the subsystem spreadsheet. Only PEGs
610    for active genomes will be colored.
611    
612    =back
613    
614    =cut
615    
616  sub set_colors {  sub set_colors {
617      my($fig,$pegH,$active_genomes) = @_;      # Get the parameters.
618      my($genomeI,$roleI,$pegs,$peg,$roleH,$peg,%pegs_in_genome);      my($fig,$sub,$pegH,$active_genomes) = @_;
619    
620        my($genomeI,$roleI,$pegs,$peg,$roleH,%pegs_in_genome);
621        # Create the return hash.
622      my $colorsH = {};      my $colorsH = {};
623        # Loop through the active genomes. The keys of "%$pegH" are the row indices
624        # for rows that have at least one occupied cell in the spreadsheet. The
625        # Grep then reduces this list to those rows that are highlighted.
626      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))
627      {      {
628            # We will use the following hash to compile a list of all the PEGs
629            # in the spreadsheet for the current genome, that is, the genome
630            # represented by row index "$genomeI".
631          undef %pegs_in_genome;          undef %pegs_in_genome;
632            # Get the hash for the current row. This hash maps column indices to
633            # lists of pegs.
634          $roleH = $pegH->{$genomeI};          $roleH = $pegH->{$genomeI};
635            # Loop through the column indices for the specified row.
636          foreach $roleI (keys(%$roleH))          foreach $roleI (keys(%$roleH))
637          {          {
638                # Here we can finally get a list of the pegs in this spreadsheet
639                # cell. We loop through them, marking them in the "%pegs_in_genome"
640                # hash.
641              $pegs = $roleH->{$roleI};              $pegs = $roleH->{$roleI};
642              foreach $peg (@$pegs)              foreach $peg (@$pegs)
643              {              {
644                  $pegs_in_genome{$peg} = 1;                  $pegs_in_genome{$peg} = 1;
645              }              }
646          }          }
647            # Extract the "%pegs_in_genome" hash keys. This gives us a duplicate-free
648            # list of all the pegs for the current spreadsheet role.
649          my @pegs = keys(%pegs_in_genome);          my @pegs = keys(%pegs_in_genome);
650          my($tuple,$peg,$color);          my($tuple,$peg,$color);
651          my $colors_for_one_genome = &set_colors_for_genome($fig,\@pegs);          # Get a hash that maps the PEG IDs to colors.
652            my $colors_for_one_genome = &set_colors_for_genome($fig,$sub, \@pegs);
653            # Loop through the hash we got back and assign the colors from that
654            # hash to the master hash we're returning to the caller.
655          while (($peg,$color) = each %$colors_for_one_genome)          while (($peg,$color) = each %$colors_for_one_genome)
656          {          {
657              $colorsH->{$peg} = $colors_for_one_genome->{$peg};              $colorsH->{$peg} = $colors_for_one_genome->{$peg};
658          }          }
659      }      }
660        # Return the color hash.
661      return $colorsH;      return $colorsH;
662  }  }
663    
664  sub set_colors_for_genome {  =head3 set_colors_for_genome
     my($fig,$pegs) = @_;  
     my($peg,@clusters,$cluster,@colors,$color,%seen,%conn,$x,$peg1,@pegs,$i);  
665    
666      my $color_of = {};      my $colorHash = UnvSubsys::set_colors_for_genome($fig, $sub, \@pegs);
     foreach $peg (@$pegs) { $color_of->{$peg} = '#FFFFFF' }  
667    
668      @pegs = keys(%$color_of);  #  Use of keys makes @pegs entries unique  Return a reference to a hash mapping the specified pegs to colors. PEGs that
669    are physically close to each other will be painted the same color.
670    
671      foreach $peg (@pegs)  =over 4
     {  
         $conn{$peg} = [grep { $color_of->{$_} && ($_ ne $peg) } $fig->close_genes($peg,5000)];  
     }  
672    
673      @clusters = ();  =item fig
     while ($peg = shift @pegs)  
     {  
         if (! $seen{$peg})  
         {  
             $cluster = [$peg];  
             $seen{$peg} = 1;  
             for ($i=0; ($i < @$cluster); $i++)  
             {  
                 my @tmp = grep { ! $seen{$_} } @{$conn{$cluster->[$i]}};  
                 if (@tmp > 0)  
                 {  
                     foreach my $peg1 (@tmp) { $seen{$peg1} = 1 }  
                     push(@$cluster,@tmp);  
                 }  
             }  
             push(@clusters,$cluster);  
         }  
     }  
674    
675      @colors =  &cool_colors();  A fig-like object that can be used to access the data store.
676    
677      @clusters = grep { @$_ > 1 } sort { @$a <=> @$b } @clusters;  =item sub
678    
679      if (@clusters > @colors) { splice(@clusters,0,(@clusters - @colors)) }  # make sure we have enough colors  Subsystem object for the relevant subsystem.
680    
681      my($cluster);  =item pegs
682      foreach $cluster (@clusters)  
683      {  Reference to a list of PEG IDs. All of the peg IDs should be for the
684          $color = shift @colors;  same genome.
685          foreach $peg (@$cluster)  
686          {  =item RETURN
687              $color_of->{$peg} = $color;  
688          }  Returns a reference to a hash that maps each PEG ID to a color.
689      }  
690      return $color_of;  =back
691    
692    =cut
693    
694    sub set_colors_for_genome {
695        # Get the parameters.
696        my($fig, $sub, $pegs) = @_;
697        # Default all the PEGs to white.
698        my %color_of = map { $_ => '#FFFFFF' } @$pegs;
699        # Divide the pegs into clusters.
700        my @clusters = $fig->compute_clusters($pegs, $sub);
701        # Get a list of useful colors.
702        my @colors =  &cool_colors();
703        # If we have too many clusters, chop off the big ones at the end. These
704        # are least likely to be important.
705        if (@clusters > @colors) { splice(@clusters, 0, (@clusters - @colors)) }
706        # Loop through the clusters.
707        for my $cluster (@clusters) {
708            # Get the color for this cluster.
709            my $color = shift @colors;
710            # Loop through this cluster, putting this color into the color_of
711            # entries for each PEG.
712            for my $peg (@$cluster) {
713                $color_of{$peg} = $color;
714            }
715        }
716        # Return the color map.
717        return \%color_of;
718  }  }
719    
720    =head3 cool_colors
721    
722        my @colorList = UnvSubsys::cool_colors();
723    
724    Return a list of web-safe colors.
725    
726    =cut
727    
728  sub cool_colors {  sub cool_colors {
729   # just an array of "websafe" colors or whatever colors we want to use. Feel free to remove bad colors (hence the lines not being equal length!)   # just an array of "websafe" colors or whatever colors we want to use. Feel free to remove bad colors (hence the lines not being equal length!)
730   return (   return (
# Line 327  Line 746 
746   );   );
747  }  }
748    
749    =head3 ext_id
750    
751        my $externalID = UnvSubsys::ext_id($fig, $peg);
752    
753    or
754    
755        my @externalIDs = UnvSubsys::ext_id($fig, $peg);
756    
757    Return a list of non-FIG IDs for the specified feature. In a scalar context, return
758    a single non-FIG ID for the specified feature.
759    
760    This method returns IDs that are all of the same type, that is, all UniProt IDs, or
761    all KEGG IDs, and so forth. To do this, it checks the feature's alias list for IDs
762    of a given type. If it finds at least one, then all IDs of that type are returned.
763    Highest priority is given to the UniProt IDs, then SP IDs, GI IDs, and finally
764    KEGG IDs.
765    
766    =over 4
767    
768    =item fig
769    
770    A FIG-like object for accessing the data store.
771    
772    =item peg
773    
774    ID of the feature whose aliases are desired.
775    
776    =item RETURN
777    
778    In list context, a list of non-FIG IDs for the feature that are all of the same
779    type. In scalar context, the first non-FIG ID for the feature of the
780    highest-priority available type.
781    
782    =back
783    
784    =cut
785    
786  sub ext_id {  sub ext_id {
787      my($fig,$peg) = @_;      my($fig,$peg) = @_;
788    
# Line 363  Line 819 
819      }      }
820  }  }
821    
822    =head3 subsystem_curator
823    
824        my $name = $unvsub->subsystem_curator();
825    
826    Return the name of the subsystem curator. The database stores user names as
827    C<master:>I<name>. This method strips off the C<master:> prefix before it
828    passes the result back to the caller.
829    
830    =cut
831    
832  sub subsystem_curator {  sub subsystem_curator {
833      my($self) = @_;      my($self) = @_;
# Line 372  Line 837 
837      return $curator;      return $curator;
838  }  }
839    
840    =head3 get_roles
841    
842        my @roles = $unvsub->get_roles();
843    
844    Return a list of the roles (columns) for this subsystem. The roles will be
845    returned in column order, so that if you access the sixth element of the
846    return list, you'll get the name of the role for the sixth column.
847    
848    =cut
849    
850  sub get_roles {  sub get_roles {
851      my($self) = @_;      my($self) = @_;
852        # The role index in this object is a list of 3-tuples. The caller only
853        # wants the first element of each tuple, which is the role name.
854      return map { $_->[0] } @{$self->{Roles}};      return map { $_->[0] } @{$self->{Roles}};
855  }  }
856    
857    =head3 get_genome_index
858    
859        my $index = $unvsub->get_genome_index($genome);
860    
861    Return the row index of the specified genome.
862    
863    =over 4
864    
865    =item genome
866    
867    ID of the genome whose row index is desired.
868    
869    =item RETURN
870    
871    Returns the index of the row corresponding to the specified genome, or an
872    undefined value if the genome is not represented in the subsystem
873    spreadsheet.
874    
875    =back
876    
877    =cut
878    
879  sub get_genome_index {  sub get_genome_index {
880      my($self,$genome) = @_;      my($self,$genome) = @_;
881    
882      return $self->{GenomeIndex}->{$genome};      return $self->{GenomeIndex}->{$genome};
883  }  }
884    
885    =head3 get_genomes
886    
887        my @genomes = $unvsub->get_genomes();
888    
889    Return a list of the genome IDs for the subsystem. The genomes will be
890    presented in row order. In other words, if you index into the sixth
891    element of the return list, you will retrieve the ID of the genome for
892    the sixth row.
893    
894    =cut
895    
896  sub get_genomes {  sub get_genomes {
897      my($self) = @_;      my($self) = @_;
898        # The genome array is a list of 2-tuples. We extract the first
899        # element of each tuple, which is the genome ID.
900      return map { $_->[0] } @{$self->{Genomes}};      return map { $_->[0] } @{$self->{Genomes}};
901  }  }
902    
903    =head3 get_variant_code
904    
905        my $code = $unvsub->get_variant_code($genome);
906    
907    Return the variant code for a genome. Each subsystem has several variations.
908    The variant code indicates which variation of a subsystem is used by a
909    particular genome.
910    
911    Genome data is stored in a list of 2-tuples. The first element is the genome
912    ID; the second is the variant code.
913    
914    =over 4
915    
916    =item genome
917    
918    ID or row index of the genome whose variant code is desired.
919    
920    =item RETURN
921    
922    Returns the variant code for the specified genome.
923    
924    =back
925    
926    =cut
927    
928  sub get_variant_code {  sub get_variant_code {
929      my($self,$genome) = @_;      my($self,$genome) = @_;
930        # Check to see if we have a row index.
931      if ($genome =~ /^\d+$/)      if ($genome =~ /^\d+$/)
932      {      {
933            # Here we have a row index, so use it to find the genome's variant
934            # code.
935          return $self->{Genomes}->[$genome]->[1];          return $self->{Genomes}->[$genome]->[1];
936      }      }
937      else      else
938      {      {
939            # Here we have a genome ID, so we need to convert it to a row index.
940          my $genomeI = $self->{GenomeIndex}->{$genome};          my $genomeI = $self->{GenomeIndex}->{$genome};
941          return $self->{Genomes}->[$genomeI]->[1];          return $self->{Genomes}->[$genomeI]->[1];
942      }      }
943  }  }
944    
945    =head3 get_pegs_from_cell
946    
947        my @pegs = $unvsub->get_pegs_from_cell($genome, $role);
948    
949    Return a list of the features in a specified spreadsheet cell. The cell is specified
950    by genome ID and role ID.
951    
952    =over 4
953    
954    =item genome
955    
956    ID of the genome relevant to the cell.
957    
958    =item role
959    
960    ID of the role relevant to the cell.
961    
962    =item RETURN
963    
964    Returns a list of the features in the cell, or an empty list if the cell is empty.
965    
966    =back
967    
968    =cut
969    
970  sub get_pegs_from_cell {  sub get_pegs_from_cell {
971      my($self,$genome,$role) = @_;      my($self,$genome,$role) = @_;
972        # Convert the genome and role IDs to row and column indices.
973      my $genomeI = $self->{GenomeIndex}->{$genome};      my $genomeI = $self->{GenomeIndex}->{$genome};
974      my $roleI   = $self->{RoleIndex}->{$role};      my $roleI   = $self->{RoleIndex}->{$role};
975        # Get the pegs from the cell and return them.
976      my $pegs    = $self->{PegHash}->{$genomeI}->{$roleI};      my $pegs    = $self->{PegHash}->{$genomeI}->{$roleI};
977      return $pegs ? @$pegs : ();      return $pegs ? @$pegs : ();
978  }  }
979    
980    =head3 get_diagrams
981    
982        my @list = $sub->get_diagrams();
983    
984    Return a list of the diagrams associated with this subsystem. Each diagram
985    is represented in the return list as a 4-tuple C<[diagram_id, diagram_name,
986    page_link, img_link]> where
987    
988    =over 4
989    
990    =item diagram_id
991    
992    ID code for this diagram.
993    
994    =item diagram_name
995    
996    Displayable name of the diagram.
997    
998    =item page_link
999    
1000    URL of an HTML page containing information about the diagram.
1001    
1002    =item img_link
1003    
1004    URL of an HTML page containing an image for the diagram.
1005    
1006    =back
1007    
1008    Note that the URLs are in fact for CGI scripts with parameters that point them
1009    to the correct place. Though Sprout has diagram information in it, it has
1010    no relationship to the diagrams displayed in SEED, so the work is done entirely
1011    on the SEED side.
1012    
1013    =cut
1014    
1015    sub get_diagrams {
1016        # Get the parameters.
1017        my ($self) = @_;
1018        # Return the diagram list.
1019        return @{$self->{Diagrams}};
1020    }
1021    
1022  sub get_notes {  sub get_notes {
1023      my($self) = @_;      my($self) = @_;
1024    
# Line 473  Line 1078 
1078      return $self->{Colors}->{$peg};      return $self->{Colors}->{$peg};
1079  }  }
1080    
1081    =head3 active_genomes
1082    
1083        my $activeHash = UnvSubsys::active_genomes(\@row_subsets, $active_subsetR, $focus, \%genomeH, \@genomes_info);
1084    
1085    Return a hash containing the active genomes for this subsystem display. The
1086    keys of the hash will be the row indices of the genomes to be highlighted on the
1087    display. Each genome ID will map to 1. Thus, if C<< $activeHash->{3} >>
1088    tests TRUE, the fourth row should be highlighted.
1089    
1090    The rules for determining the active genomes are as follows. If I<$active_subsetR> is
1091    specified, it is presumed to be the ID of the subset containing the active genomes.
1092    If it is not specified and I<$focus> is specified, then I<$focus> is presumed to be the
1093    ID of the genome currently in focus, and the active genomes will be the ones in the
1094    smallest row subset containing the genome in focus. If neither I<$active_subsetR> nor
1095    I<$focus> are specified, then all genomes are active.
1096    
1097    =over 4
1098    
1099    =item row_subsets
1100    
1101    Reference to a list of 2-tuples. Each tuple consists of a row subset ID followed by
1102    a reference to a list of the row indices for the rows in the identified subset.
1103    
1104    =item active_subsetR (optional)
1105    
1106    ID of the active subset (if any).
1107    
1108    =item focus
1109    
1110    ID of the genome currently in focus (if any). If there is no active subset, then
1111    the smallest subset containing this genome will be made active.
1112    
1113    =item genomeH
1114    
1115    Reference to a hash of genome IDs to row indices. The keys of this hash are
1116    the genomes in this subsystem, which also form the subsystem spreadsheet's
1117    rows.
1118    
1119    =item genomes_info
1120    
1121    Reference to a list of 2-tuples. The first element of each 2-tuple is the
1122    ID of a genome; the second is the variant code for the subsystem variant
1123    used by the genome. The tuples are ordered by row index, so that the ID
1124    and variant code of the genome in a particular row can be located by indexing
1125    into this parameter using the subsystem spreadsheet row number.
1126    
1127    =item RETURN
1128    
1129    Returns a reference to a hash that maps the row indices of the active genomes
1130    to 1. This hash can be used to quickly determine whether or not a particular
1131    row is to be highlighted.
1132    
1133    =back
1134    
1135    =cut
1136    
1137  sub active_genomes {  sub active_genomes {
1138      my($row_subsets,$active_subsetR,$focus,$genomeH,$genomes_info) = @_;      # Get the parameters.
1139        my($fig, $row_subsets, $active_subsetR, $focus, $genomeH, $genomes_info) = @_;
1140      my($i,@bestL);      my($i,@bestL);
1141        # Declare the return variable.
1142      my $active_genomes = {};      my $active_genomes = {};
1143        # Check for an active subset.
1144      if ($active_subsetR)      if ($active_subsetR)
1145      {      {
1146            # Search for the active subset in the row subset array.
1147          for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}          for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}
1148          if ($i < @$row_subsets)          if ($i < @$row_subsets)
1149          {          {
1150                # Here we found the active subset, so we extract its list of row indices.
1151                @bestL = @{$row_subsets->[$i]->[1]};
1152            }
1153            else {
1154                # Here we have the ID of the active subset. First, we search for that ID
1155                # in the row subset list.
1156                for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}
1157                if ($i < @$row_subsets)
1158                {
1159                    # Here we found the named subset, so we return its member list.
1160              @bestL = @{$row_subsets->[$i]->[1]};              @bestL = @{$row_subsets->[$i]->[1]};
1161          }          }
1162          else          else
1163          {          {
1164                    # Here the active subset does not exist. We punt by extracting a
1165                    # list of all the row indices in the spreadsheet.
1166              $active_subsetR = 'All';              $active_subsetR = 'All';
1167              @bestL = map { $genomeH->{$_} } keys(%$genomeH);              @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1168          }          }
1169      }      }
1170      else      }
1171        elsif ($focus)
1172      {      {
1173          if (! $focus)          # Here we don't have an active row subset, but a particular genome is in
1174            # focus. We'll look for the smallest row subset containing the genome
1175            # in focus. First, we need to prime the loop. "$bestN" will be the ID
1176            # of the best subset found so far; "@bestL" is where we stash the list
1177            # of IDs in the subset. Our initial selection, then, will be the
1178            # fake "All" subset, which contains the entire collection of rows.
1179    
1180            if (! $fig->is_complete($focus))
1181          {          {
1182                # Here the gnome in focus is incomplete, so it won't be anywhere
1183                # in our list. We default to making everything active.
1184              $active_subsetR = 'All';              $active_subsetR = 'All';
1185              @bestL = map { $genomeH->{$_} } keys(%$genomeH);              @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1186          }          } else {
1187          else              my $bestN   = "All";
1188          {              @bestL   = map { $genomeH->{$_} } keys(%$genomeH);
1189              my $bestN   = undef;              # Next, we get the row index for the genome in focus.
1190              @bestL   = ();              my $focusIndex = $genomeH->{$focus};
1191                # Now we loop through all the row subsets.
1192              my $tuple;              my $tuple;
1193              foreach $tuple (@$row_subsets)              foreach $tuple (@$row_subsets)
1194              {              {
1195                    # Extract the subset ID and its list of row indices. The latter is
1196                    # in "$genomeIs".
1197                  my($id,$genomeIs) = @$tuple;                  my($id,$genomeIs) = @$tuple;
1198                  for ($i=0; ($i < @$genomeIs) && ($genomes_info->[$genomeIs->[$i]]->[0] ne $focus); $i++) {}                  # Search for the index of the focus row in the current subset's list
1199                    # of row indices.
1200                    for ($i=0; ($i < @$genomeIs) && ($genomeIs->[$i] != $focusIndex); $i++) {}
1201                    # Now either $i will be the index of the focus row in the subset, or
1202                    # it is going to be equal to the number of rows in the subset.
1203                  if ($i < @$genomeIs)                  if ($i < @$genomeIs)
1204                  {                  {
1205                      if ((! $bestN) || (@$genomeIs < @bestL))                      # We've found the focus row in this subset. Select it as the new
1206                        # best subset if it's smaller than the last one we found.
1207                        if (@$genomeIs < @bestL)
1208                      {                      {
1209                          $bestN  = $id;                          $bestN  = $id;
1210                          @bestL = @$genomeIs;                          @bestL = @$genomeIs;
1211                      }                      }
1212                  }                  }
1213              }              }
1214                # Save the best subset found as the active one.
1215              $active_subsetR = $bestN;              $active_subsetR = $bestN;
1216          }          }
1217        } else {
1218            # Here we have nothing: no active subset, and no focus row. We make
1219            # all rows active.
1220            $active_subsetR = 'All';
1221            @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1222      }      }
1223        # "@bestL" now contains a list of the row indices for the active genomes.
1224        # We convert it to a hash and return it.
1225      my %active_genomes = map { $_ => 1 } @bestL;      my %active_genomes = map { $_ => 1 } @bestL;
1226      return \%active_genomes;      return \%active_genomes;
1227  }  }
1228    
1229  1;  1;
   
   

Legend:
Removed from v.1.10  
changed lines
  Added in v.1.19

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3