[Bio] / FigKernelPackages / UnvSubsys.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/UnvSubsys.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.10, Tue Sep 27 00:58:27 2005 UTC revision 1.20, Fri Mar 14 16:52:48 2008 UTC
# Line 1  Line 1 
1    #
2    # Copyright (c) 2003-2006 University of Chicago and Fellowship
3    # for Interpretations of Genomes. All Rights Reserved.
4    #
5    # This file is part of the SEED Toolkit.
6    #
7    # The SEED Toolkit is free software. You can redistribute
8    # it and/or modify it under the terms of the SEED Toolkit
9    # Public License.
10    #
11    # You should have received a copy of the SEED Toolkit Public License
12    # along with this program; if not write to the University of Chicago
13    # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14    # Genomes at veronika@thefig.info or download a copy from
15    # http://www.theseed.org/LICENSE.TXT.
16    #
17    
18  package UnvSubsys;  package UnvSubsys;
19    
20  use Subsystem;  use Subsystem;
21  use Carp;  use Carp;
22  use FIG;  use FIG;
23    use SFXlate;
24  use Data::Dumper;  use Data::Dumper;
25  use strict;  use strict;
26    use Tracer;
27    use HTML;
28    
29  sub new  =head1 Universal Subsystem Object
 {  
     my($class, $ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, $peg_colors) = @_;  
30    
31      $ssa =~ s/ /_/g;  =head2 Introduction
32    
33    The universal subsystem object provides methods used to display useful information
34    about a subsystem. Its intent is to support both Sprout and SEED.
35    
36          ### { Roles =>Roles,  The soul of a subsystem is its spreadsheet. The columns of the spreadsheet are
37          ###   RoleIndex => ToRoleIndexHash,  the subsystem roles. The rows are the genomes that contain the subsystem. PEGs
38          ###   RoleSubsets => ColSubsets,  are stored in the spreadsheet's cells. Each genome can be identified by genome
39          ###   Genomes => Genomes,  ID or row index. Each role can be identified by role name or column index.
         ###   GenomeIndex => ToGenomeIndexHash,  
         ###   PegHash => PegHash,  
         ###   Colors  => ColorHash,  
         ###   Aliases => AliasHash,  
         ###   Curator => Curator,  
         ###   Notes => Notes,  
         ###   Reactions => ReactionHash  
         ### }  
         ###  
         ### Roles = pointer to a list of [Role,Abbrev,[ReactionURLs]]  
         ###  
         ### ToRoleIndexHash = a pointer to a hash: key=Role Value=RoleIndex  
         ###  
         ### ColSubsets = pointer to a list of [SubsetName,[RoleIndexesFrom0]]  
         ###  
         ### RowSubSets = pointer to a list of [SubsetName,[GenomeIndexesFrom0]]  
         ###  
         ### Genomes is a pointer to a list of [Genome,Variant]  
         ###  
         ### ToGenomeIndexHash = a pointer to a hash: key=Genome value=GenomeIndex  
         ###  
         ### PegHash = a pointer to a hash of hashes such that $peg_hash->{$genome_index}->{$role_index} = a  
         ###           pointer to a list of PEGs  
         ###  
         ### ColorHash is a hash: key=PEG value=color  
         ###  
         ### AliasHash is a hash: key=PEG value=aliases  
         ###  
         ### ReactionHash is a hash: key=Role value=[reaction-ids]  
40    
41      if ((ref($fig) eq "FIG") || ((ref($fig) eq 'SFXlate') && ($fig = $fig->{'fig'})))  It is worth noting that the term I<subset> has two meanings-- a subset of roles
42      {  or a subset of genomes. A subset of roles is called a I<column subset>. A
43          my $subsystem = new Subsystem($ssa,$fig,0);  subset of genomes is called a I<row subset>.
44    
45    The object created contains a great deal of information, so it's worthwhile to
46    stop for a moment and discuss it. The object will have the following members.
47    
48    =over 4
49    
50    =item Roles
51    
52    A list of 3-tuples, each consisting of a role name, the abbreviated role name,
53    and a list of URLs for the reactions. Each role is a column in the
54    subsystem spreadsheet. Indexing into the list by column index yields the
55    ID, abbreviation, and reactions for the role in that column.
56    
57    =item RoleIndex
58    
59    A hash mapping each role to its column index in the spreadsheet.
60    
61    =item RoleSubsets
62    
63    A list of 2-tuples, each consisting of a subset name followed by a list of the
64    column indices for the roles in the subset.
65    
66    =item Genomes
67    
68    A list of 2-tuples, each containing the genome ID for the relevant row and the
69    variant code for that genome. Subsystems can have several variations, and the
70    variant code indicates which variant of the subsystem that the genome uses.
71    There is one genome for each row in the spreadsheet. Indexing into the list
72    yields the ID of the genome for that row and its variant code.
73    
74    =item GenomeIndex
75    
76    A hash mapping each genome ID to its row index in the spreadsheet.
77    
78    =item PegHash
79    
80    A hash of hashes containing the spreadsheet cells. If C<$pegHash> is the hash
81    of hashes, C<$row> is a genome index, and C<$col> is a role index, then
82    
83        $pegHash->{$row}->{$col}
84    
85    returns a reference to a list of the IDs for the PEGs in the relevant
86    spreadsheet cell.
87    
88    =item ColorHash
89    
90    A hash mapping each PEG ID to the color that should be used to represent that
91    PEG in the display.
92    
93    =item AliasHash
94    
95    A hash mapping each PEG ID to a list of its aliases.
96    
97    =item ReactionHash
98    
99    A hash mapping each role to the list of reactions it catalyzes.
100    
101    =back
102    
103    =head2 Public Methods
104    
105    =head3 new
106    
107        my $usub = UnvSubsys->new($ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, \@peg_colors);
108    
109    Construct a new universal subsystem object for a specified subsystem.
110    
111    =over 4
112    
113    =item ssa
114    
115    Name of the subsystem.
116    
117    =item fig
118    
119    Data access object, either a FIG object or an SFXlate object.
120    
121    =item active_subsetR
122    
123    Name of the active row subset. A row subset names a group of genomes.
124    
125    =item focus
126    
127    ID of the genome currently in focus.
128    
129    =item show_clusters
130    
131    TRUE if clusters should be painted by color, else FALSE.
132    
133    =item aliases
134    
135    TRUE if PEG aliases should be shown, else FALSE.
136    
137    =item peg_colors (optional)
138    
139    Reference to a list of 2-tuples, each 2-tuple consisting of a PEG ID and the color
140    to be assigned to the PEG.
141    
142    =back
143    
144    =cut
145    
146    sub new {
147        # Get the parameters.
148        my($class, $ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, $peg_colors) = @_;
149        # Fix the subsystem name. Spaces are replaced by underscores to avoid naming problems
150        # in the seed's file system.
151        $ssa =~ s/ /_/g;
152        # Get the FIG object. At this point in time, we're only getting data from the SEED. On
153        # a future pass through the code, we'll get data from a SEED or Sprout.
154        if ((ref($fig) eq "FIG") || (ref($fig) eq "FIGV") || (ref($fig) eq 'SFXlate')) {
155            # Create a subsystem object. The "get_subsystem" method is provided by both FIG
156            # and SFXlate; however, the object returned in each case is different: FIG
157            # returns a "Subsystem" object; SFXlate returns a "SproutSubsys" object.
158            my $subsystem = $fig->get_subsystem($ssa);
159            # Get the key subsystem data. Note that CRs in the notes are converted to LFs
160            # in case the notes come from a Mac.
161          my $curator = $subsystem->get_curator;          my $curator = $subsystem->get_curator;
162          my $notes = $subsystem->get_notes;          my $notes = $subsystem->get_notes;
163          $notes =~ s/ /\n/g;          $notes =~ s/\r/\n/g;
164          my @roles = $subsystem->get_roles;          my @roles = $subsystem->get_roles;
165          my $reactions = $subsystem->get_reactions;          my $reactions = $subsystem->get_reactions;
166          my @genomes = $subsystem->get_genomes;          my @genomes = $subsystem->get_genomes;
167          my @col_subsets = $subsystem->get_subset_namesC;          my @col_subsets = $subsystem->get_subset_namesC;
168            my @diagrams = $subsystem->get_diagrams();
169            # Create the data structures for the role list and the role index.
170          my $role_info = [];          my $role_info = [];
171          my $roleH     = {};          my $roleH     = {};
172            # Loop through the roles to create the role list. The $i index will move
173            # through the columns of the spreadsheet.
174          my($i,$j,$subset,$peg);          my($i,$j,$subset,$peg);
175          for ($i=0; ($i < @roles); $i++)          for ($i=0; ($i < @roles); $i++)
176          {          {
177                # Get the ID of the role in the current column.
178              my $role = $roles[$i];              my $role = $roles[$i];
179              my $abbrev = $subsystem->get_role_abbr( $subsystem->get_role_index( $role ) );              # Extract its abbreviation.
180                my $abbrev = $subsystem->get_role_abbr($i);
181                # Get the reactions. Note that its possible no reactions were found. If there were
182                # reactions found, however, we convert from reaction IDs to a comma-delimited
183                # list of HTML code full of hyperlinks.
184              my $react = $reactions ? join(",", map { &HTML::reaction_link($_) } @{$reactions->{$role}}) : [];              my $react = $reactions ? join(",", map { &HTML::reaction_link($_) } @{$reactions->{$role}}) : [];
185                # Form the role name, abbreviation, and reaction list into a 3-tuple and push
186                # them onto the main role list.
187              push(@$role_info,[$role,$abbrev,$react]);              push(@$role_info,[$role,$abbrev,$react]);
188                # Set the role hash so that we can get back the column index for a given
189                # role name.
190              $roleH->{$role} = $i;              $roleH->{$role} = $i;
191          }          }
192            # Get the column subsets. A column subset is a list of role IDs, so we need to
193            # convert it to a set of column indices. We ignore the special "All" subset
194            # that contains everything.
195          my $subset_info = [];          my $subset_info = [];
196          foreach $subset (@col_subsets)          foreach $subset (@col_subsets)
197          {          {
# Line 80  Line 200 
200                  push(@$subset_info,[$subset,[map { $roleH->{$_} } $subsystem->get_subsetC_roles($subset)]]);                  push(@$subset_info,[$subset,[map { $roleH->{$_} } $subsystem->get_subsetC_roles($subset)]]);
201              }              }
202          }          }
203            # Now we create the genome directories. For each genome we need to be able to
204            # hash from the genomeID to the specified role index and we must be able to
205            # get the genome's variant code. (The variant code indicates which subsystem
206            # variant the genome uses.
207          my $genomes_info = [];          my $genomes_info = [];
208          my $genomeH      = {};          my $genomeH      = {};
209          for ($i=0; ($i < @genomes); $i++)          for ($i=0; ($i < @genomes); $i++)
210          {          {
211                # Get the genome ID for this row.
212              my $genome  = $genomes[$i];              my $genome  = $genomes[$i];
213              my $variant = $subsystem->get_variant_code( $subsystem->get_genome_index( $genome ) );              # Get its variant code.
214                my $variant = $subsystem->get_variant_code($i);
215                # Form them into a 2-tuple and add the result to the genome list.
216              push(@$genomes_info,[$genome,$variant]);              push(@$genomes_info,[$genome,$variant]);
217                # Set up the hash to get from the genome ID to the row index.
218              $genomeH->{$genome} = $i;              $genomeH->{$genome} = $i;
219          }          }
220    
221            # Next we gather the data from the actual spreadsheet cells. For an SFXlate
222            # object, this is the most expensive part of the operation, since it requires
223            # a database call for each cell.
224          my $pegH = {};          my $pegH = {};
225            # $i is the row index, which cycles through genomes.
226          for ($i=0; ($i < @genomes); $i++)          for ($i=0; ($i < @genomes); $i++)
227          {          {
228              my $genome = $genomes[$i];              # $j is the column index, which cycles through roles.
229              for ($j=0; ($j < @roles); $j++)              for ($j=0; ($j < @roles); $j++)
230              {              {
231                  my $role = $roles[$j];                  my @pegs = $subsystem->get_pegs_from_cell($i,$j);
                 my @pegs = $subsystem->get_pegs_from_cell($genome,$role);  
232                  $pegH->{$i}->{$j} = [@pegs];                  $pegH->{$i}->{$j} = [@pegs];
233              }              }
234          }          }
235    
236          my $row_subsets    = &row_subsets($fig,$genomeH,$genomes_info);          # Get the row subsets. Row subsets are determined by the genome
237          my $active_genomes = &active_genomes($row_subsets,$active_subsetR,$focus,$genomeH,$genomes_info);          # taxonomy, so these are different from the row subsets stored
238            # in the subsystem object.
239            my $row_subsets    = &row_subsets($fig, $genomeH);
240            # Here we try to get a list of the active genomes. The caller
241            # gives us hints in the form of the "$focus" and "$active_subsetR"
242            # parameters. If, for example, we are using this object to generate
243            # a subsystem web page, the focus information would steer us to
244            # whatever the user wants to look at on the page.
245            my $active_genomes = &active_genomes($fig, $row_subsets,$active_subsetR,$focus,
246                                                 $genomeH,$genomes_info);
247    
248            # Now we generate a table of colors for the various PEGs. If the
249            # caller gave us a map of peg IDs to colors, we use that. Otherwise,
250            # we allow the option of painting the PEGs by cluster number. (The
251            # caller indicates this by setting the "show_clusters" flag in the
252            # parameter list.)
253          my $colorsH;          my $colorsH;
254          if ($peg_colors)          if ($peg_colors)
255          {          {
256                # Here the caller gave us a list of peg colors. The list contains
257                # 2-tuples, each consisting of a PEG ID followed by the
258                # color value. The loop below extracts the pairs and stuffs them
259                # into the color hash.
260              $colorsH = {};              $colorsH = {};
261              foreach $_ (@$peg_colors)              foreach $_ (@$peg_colors)
262              {              {
# Line 118  Line 266 
266          }          }
267          elsif ($show_clusters)          elsif ($show_clusters)
268          {          {
269              $colorsH  = &set_colors($fig,$pegH,$active_genomes);              # Here the user wants us to base the colors on the genome clustering
270                # information.
271                $colorsH  = &set_colors($fig,$subsystem,$pegH,$active_genomes);
272          }          }
273          else          else
274          {          {
275                # Here the user is not interested in coloring the PEGs.
276              $colorsH = {};              $colorsH = {};
277          }          }
278            # If the user wants to see aliases, compute the alias hash. Aliases
279            # will only be computed for PEGs belonging to active (highlighted)
280            # genomes, and there is a maximum of one alias per PEG.
281          my $aliasesH = $aliases ? &set_aliases($fig,$pegH,$active_genomes) : {};          my $aliasesH = $aliases ? &set_aliases($fig,$pegH,$active_genomes) : {};
282          my $reactions = $subsystem->get_reactions;          # Create and bless the UnvSubsys object.
283          my $self = { Roles => $role_info,          my $self = { SSA => $ssa,
284                         Roles => $role_info,
285                       RoleIndex => $roleH,                       RoleIndex => $roleH,
286                       RoleSubsets => $subset_info,                       RoleSubsets => $subset_info,
287                       Genomes => $genomes_info,                       Genomes => $genomes_info,
# Line 138  Line 292 
292                       Aliases => $aliasesH,                       Aliases => $aliasesH,
293                       Curator => $curator,                       Curator => $curator,
294                       Notes => $notes,                       Notes => $notes,
295                       Reactions => $reactions                       Reactions => $reactions,
296                         Diagrams => \@diagrams,
297                     };                     };
298          bless($self, $class);          bless($self, $class);
299            # Return the object.
300          return $self;          return $self;
301      }      }
302      else      else
303      {      {
304            # Here the FIG-like object was not recognized, so we return an
305            # undefined value.
306          return undef;          return undef;
307      }      }
308  }  }
309    
310    =head3 get_ssa
311    
312        my $ssa = $unvsub->get_ssa();
313    
314    Return the name of the subsystem
315    
316    =cut
317    
318    sub get_ssa {
319        my($self) = @_;
320        return $self->{SSA};
321    }
322    
323    =head3 get_ssa_pretty
324    
325        my $ssa = $unvsub->get_ssa_pretty();
326    
327    Return the 'prettyfied' name of the subsystem
328    
329    =cut
330    
331    sub get_ssa_pretty{
332        my($self) = @_;
333        my $ssa = $self->{SSA};
334        $ssa =~ s/_/ /g;
335        return $ssa;
336    }
337    
338    
339    =head3 get_subset_namesR
340    
341        my @names = $unvsub->get_subset_namesR();
342    
343    Return the names of the genome (row) subsets.
344    
345    =cut
346    
347  sub get_subset_namesR {  sub get_subset_namesR {
348      my($self) = @_;      my($self) = @_;
349    
350      return map { $_->[0] } @{$self->{GenomeSubsets}};      return map { $_->[0] } @{$self->{GenomeSubsets}};
351  }  }
352    
353    =head3 get_subsetR
354    
355        my @genomes = $unvsub->get_subsetR($set);
356    
357    Return a list of the genome IDs covered by a row subset.
358    
359    =over 4
360    
361    =item set
362    
363    Name of the row subset whose genomes are desired.
364    
365    =item RETURN
366    
367    Returns a list of the IDs for the genomes found in the specified row
368    set.
369    
370    =back
371    
372    =cut
373    
374  sub get_subsetR {  sub get_subsetR {
375        # Get the parameters.
376      my($self,$set) = @_;      my($self,$set) = @_;
377      my($i);      my($i);
378        # Get the list of row subsets.
379      my $sets = $self->{GenomeSubsets};      my $sets = $self->{GenomeSubsets};
380        # Find the row subset with the specified name. The row subset list is a
381        # list of 2-tuples, and the first element of each tuple is the set
382        # name.
383      for ($i=0; ($i < @$sets) && ($sets->[$i]->[0] ne $set); $i++) {}      for ($i=0; ($i < @$sets) && ($sets->[$i]->[0] ne $set); $i++) {}
384      if ($i < @$sets)      if ($i < @$sets)
385      {      {
386            # Here we found the named subset. The subset tuple's second element is
387            # the list of row indices. We map these to genome IDs before returning
388            # them.
389          return map { $self->{Genomes}->[$_]->[0] } @{$sets->[$i]->[1]}          return map { $self->{Genomes}->[$_]->[0] } @{$sets->[$i]->[1]}
390      }      }
391        # Here we subset was not found, so we return the undefined value.
392      return undef;      return undef;
393  }  }
394    
395    =head3 get_subsetR
396    
397        my @pairs = $unvsub->get_subsetsR();
398    
399    Return a list of all the row subsets. The subsets are returned in the form
400    of 2-tuples, each consisting of a subset name followed by a reference to a
401    list of genome IDs. The genome IDs correspond to the rows in the subset.
402    
403    =cut
404    
405  sub get_subsetsR {  sub get_subsetsR {
406        # Get the parameters.
407      my($self) = @_;      my($self) = @_;
408        # Extract the list of genome subsets. This list is in the form of
409        # 2-tuples, but the rows are identified by row index, not genome ID.
410      my $sets = $self->{GenomeSubsets};      my $sets = $self->{GenomeSubsets};
411        # Create the return list.
412      my @pairs = ();      my @pairs = ();
413        # Loop through the subsets.
414      my $pair;      my $pair;
415      foreach $pair (@$sets)      foreach $pair (@$sets)
416      {      {
417            # Convert this subset's member list from row indices to genome IDs
418            # and stash the result in the return list.
419          my($id,$members) = @$pair;          my($id,$members) = @$pair;
420          push(@pairs,[$id,[map { $self->{Genomes}->[$_]->[0] } @$members]]);          push(@pairs,[$id,[map { $self->{Genomes}->[$_]->[0] } @$members]]);
421      }      }
422        # Return the list constructed.
423      return @pairs;      return @pairs;
424  }  }
425    
426    =head3 row_subsets
427    
428        my $subsetList = UnvSubsys::row_subsets($fig, \%genomeH);
429    
430    This method computes the taxonomic row subsets for a subsystem. It takes
431    as input a hash that maps genome IDs to column indices and a FIG object.
432    The FIG object provides a list of taxonomic groups of 10 or more complete
433    genomes. From the list, we extract subsets which have more than 10
434    genomes in the list of subsystem genomes. If no such subsets exist,
435    we extract subsets which have at least 1 genome from the list of
436    subsystem genomes. The subsets are returned as 2-tuples, the first
437    element being the subset ID and the second being a list of row indices
438    for the genomes in the subset.
439    
440    =over 4
441    
442    =item fig
443    
444    A FIG-like object for accessing the data store.
445    
446    =item genomeH
447    
448    Reference to a hash that maps each genome ID to its row index in the
449    subsystem spreadsheet.
450    
451    =item RETURN
452    
453    Returns a reference to a list of 2-tuples. Each 2-tuple consists of a
454    subset ID followed by a list of the row indices for the genomes in the
455    subset.
456    
457    =back
458    
459    =cut
460    
461  sub row_subsets {  sub row_subsets {
462      my ($fig,$genomeH,$genomes_info) = @_;      my ($fig, $genomeH) = @_;
463    
464        # We need a real FIG object, since SFXlate does not yet support
465        # taxonomy trees. The "FIG" method does this for us.
466        $fig = $fig->FIG();
467        # Initialize the return value.
468      my $subsets = [];      my $subsets = [];
469      my $taxonomic_groups = $fig->taxonomic_groups_of_complete(10);      # Get a list of taxonomic groups. This will come back as a list of
470        # 2-tuples.
471        my $taxonomic_groups = $fig->taxonomic_groups_of_complete(5);
472        # Loop through the 2-tuples. We're looking for subsets which
473        # contain at least one genome on the subsystem's spreadsheet.
474      my($pair,$id,$members);      my($pair,$id,$members);
475      foreach $pair (@$taxonomic_groups)      foreach $pair (@$taxonomic_groups)
476      {      {
477          ($id,$members) = @$pair;          ($id,$members) = @$pair;
478    #       warn "Group $id is @$members\n";
479    #       if ($id eq 'All')
480    #       {
481    #           push(@$members, '372461.6');
482    #       }
483    
484            # Extract the genomes in the member list that participate in this
485            # subsystem. To do this, we convert each genome ID to its row
486            # index. If no row index exists, the GREP condition discards the
487            # member.
488          my @mem = grep { defined($_) } map { $genomeH->{$_} } @$members;          my @mem = grep { defined($_) } map { $genomeH->{$_} } @$members;
489            # If there are enough members, save the subset.
490          if (@mem > 0)          if (@mem > 0)
491          {          {
492              push(@$subsets,[$id,[@mem]]);              push(@$subsets,[$id,[@mem]]);
493          }          }
494      }      }
495        # Return the list of row subsets.
496      return $subsets;      return $subsets;
497  }  }
498    
499    =head3 set_aliases
500    
501        my $aliasHash = UnvSubsys::set_aliases($fig, $pegH, $active_genomes);
502    
503    Return a hash mapping PEG IDs to aliases.
504    
505    =over 4
506    
507    =item fig
508    
509    FIG-like object that can be used to access the data store.
510    
511    =item pegH
512    
513    Reference to the spreadsheet hash table. Given a row index I<$row> and a
514    column index I<$col>,
515    
516        $pegH->{$row}->{$col}
517    
518    will return a reference to a list of PEGs in the specified spreadsheet cell.
519    
520    =item active_genomes
521    
522    Reference to a hash whose keys correspond to the spreadsheet row indices
523    of genomes that should be highlighted.
524    
525    =item RETURN
526    
527    Returns a hash that takes as input a PEG ID and returns an alias. Only PEGs
528    for active genomes will be processed.
529    
530    =back
531    
532    =cut
533    
534  sub set_aliases {  sub set_aliases {
535        # Get the parameters.
536      my($fig,$pegH,$active_genomes) = @_;      my($fig,$pegH,$active_genomes) = @_;
537      my($genomeI,$roleI,$pegs,$peg,$roleH);      my($genomeI,$roleI,$pegs,$peg,$roleH);
538    
539        # Create the return hash.
540      my $aliasesH = {};      my $aliasesH = {};
541    
542        # Loop through each row that corresponds to an active genome.
543        # The active genome list contains row indices, and there is
544        # one genome per row. Note that the genome ID is never used,
545        # only the row index.
546      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))
547      {      {
548            # Get the role hash for the specified genome. The role hash
549            # maps column indices (representing roles) to lists of PEGs.
550          $roleH = $pegH->{$genomeI};          $roleH = $pegH->{$genomeI};
551            # Loop through the role (column) indices.
552          foreach $roleI (keys(%$roleH))          foreach $roleI (keys(%$roleH))
553          {          {
554                # Get the PEG list for this row/column combination.
555              $pegs = $roleH->{$roleI};              $pegs = $roleH->{$roleI};
556                # Only proceed if data was found in the cell.
557                if (defined $pegs) {
558                    # Loop through the pegs in the cell.
559              foreach $peg (@$pegs)              foreach $peg (@$pegs)
560              {              {
561                        # If we do not already have an alias for this PEG,
562                        # compute one.
563                  if (! $aliasesH->{$peg})                  if (! $aliasesH->{$peg})
564                  {                  {
565                      $aliasesH->{$peg} = scalar &ext_id($fig,$peg);                      $aliasesH->{$peg} = scalar &ext_id($fig,$peg);
# Line 222  Line 567 
567              }              }
568          }          }
569      }      }
570        }
571        # Return the hash we built.
572      return $aliasesH;      return $aliasesH;
573  }  }
574    
575    =head3 set_colors
576    
577        my $colorHash = UnvSubsys::set_colors($fig, $sub, \%pegH, \%active_genomes);
578    
579    Return a hash that maps each PEG in the subsystem spreadsheet to a display
580    color. Not all PEGs need to be mapped. Those that do not have a color
581    assigned will generally be displayed in white.
582    
583    =over 4
584    
585    =item fig
586    
587    FIG-like object that can be used to access the data store.
588    
589    =item sub
590    
591    Subsystem object for the current subsystem.
592    
593    =item pegH
594    
595    Reference to the spreadsheet hash table. Given a row index I<$row> and a
596    column index I<$col>,
597    
598        $pegH->{$row}->{$col}
599    
600    will return a reference to a list of PEGs in the specified spreadsheet cell.
601    
602    =item active_genomes
603    
604    Reference to a hash whose keys correspond to the spreadsheet row indices
605    of genomes that should be highlighted.
606    
607    =item RETURN
608    
609    Returns a hash that takes as input a PEG ID and returns a color. These colors
610    are used when displaying the PEGs in the subsystem spreadsheet. Only PEGs
611    for active genomes will be colored.
612    
613    =back
614    
615    =cut
616    
617  sub set_colors {  sub set_colors {
618      my($fig,$pegH,$active_genomes) = @_;      # Get the parameters.
619      my($genomeI,$roleI,$pegs,$peg,$roleH,$peg,%pegs_in_genome);      my($fig,$sub,$pegH,$active_genomes) = @_;
620    
621        my($genomeI,$roleI,$pegs,$peg,$roleH,%pegs_in_genome);
622        # Create the return hash.
623      my $colorsH = {};      my $colorsH = {};
624        # Loop through the active genomes. The keys of "%$pegH" are the row indices
625        # for rows that have at least one occupied cell in the spreadsheet. The
626        # Grep then reduces this list to those rows that are highlighted.
627      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))      foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))
628      {      {
629            # We will use the following hash to compile a list of all the PEGs
630            # in the spreadsheet for the current genome, that is, the genome
631            # represented by row index "$genomeI".
632          undef %pegs_in_genome;          undef %pegs_in_genome;
633            # Get the hash for the current row. This hash maps column indices to
634            # lists of pegs.
635          $roleH = $pegH->{$genomeI};          $roleH = $pegH->{$genomeI};
636            # Loop through the column indices for the specified row.
637          foreach $roleI (keys(%$roleH))          foreach $roleI (keys(%$roleH))
638          {          {
639                # Here we can finally get a list of the pegs in this spreadsheet
640                # cell. We loop through them, marking them in the "%pegs_in_genome"
641                # hash.
642              $pegs = $roleH->{$roleI};              $pegs = $roleH->{$roleI};
643              foreach $peg (@$pegs)              foreach $peg (@$pegs)
644              {              {
645                  $pegs_in_genome{$peg} = 1;                  $pegs_in_genome{$peg} = 1;
646              }              }
647          }          }
648            # Extract the "%pegs_in_genome" hash keys. This gives us a duplicate-free
649            # list of all the pegs for the current spreadsheet role.
650          my @pegs = keys(%pegs_in_genome);          my @pegs = keys(%pegs_in_genome);
651          my($tuple,$peg,$color);          my($tuple,$peg,$color);
652          my $colors_for_one_genome = &set_colors_for_genome($fig,\@pegs);          # Get a hash that maps the PEG IDs to colors.
653            my $colors_for_one_genome = &set_colors_for_genome($fig,$sub, \@pegs);
654            # Loop through the hash we got back and assign the colors from that
655            # hash to the master hash we're returning to the caller.
656          while (($peg,$color) = each %$colors_for_one_genome)          while (($peg,$color) = each %$colors_for_one_genome)
657          {          {
658              $colorsH->{$peg} = $colors_for_one_genome->{$peg};              $colorsH->{$peg} = $colors_for_one_genome->{$peg};
659          }          }
660      }      }
661        # Return the color hash.
662      return $colorsH;      return $colorsH;
663  }  }
664    
665  sub set_colors_for_genome {  =head3 set_colors_for_genome
     my($fig,$pegs) = @_;  
     my($peg,@clusters,$cluster,@colors,$color,%seen,%conn,$x,$peg1,@pegs,$i);  
666    
667      my $color_of = {};      my $colorHash = UnvSubsys::set_colors_for_genome($fig, $sub, \@pegs);
     foreach $peg (@$pegs) { $color_of->{$peg} = '#FFFFFF' }  
668    
669      @pegs = keys(%$color_of);  #  Use of keys makes @pegs entries unique  Return a reference to a hash mapping the specified pegs to colors. PEGs that
670    are physically close to each other will be painted the same color.
671    
672      foreach $peg (@pegs)  =over 4
     {  
         $conn{$peg} = [grep { $color_of->{$_} && ($_ ne $peg) } $fig->close_genes($peg,5000)];  
     }  
673    
674      @clusters = ();  =item fig
     while ($peg = shift @pegs)  
     {  
         if (! $seen{$peg})  
         {  
             $cluster = [$peg];  
             $seen{$peg} = 1;  
             for ($i=0; ($i < @$cluster); $i++)  
             {  
                 my @tmp = grep { ! $seen{$_} } @{$conn{$cluster->[$i]}};  
                 if (@tmp > 0)  
                 {  
                     foreach my $peg1 (@tmp) { $seen{$peg1} = 1 }  
                     push(@$cluster,@tmp);  
                 }  
             }  
             push(@clusters,$cluster);  
         }  
     }  
675    
676      @colors =  &cool_colors();  A fig-like object that can be used to access the data store.
677    
678      @clusters = grep { @$_ > 1 } sort { @$a <=> @$b } @clusters;  =item sub
679    
680      if (@clusters > @colors) { splice(@clusters,0,(@clusters - @colors)) }  # make sure we have enough colors  Subsystem object for the relevant subsystem.
681    
682      my($cluster);  =item pegs
683      foreach $cluster (@clusters)  
684      {  Reference to a list of PEG IDs. All of the peg IDs should be for the
685          $color = shift @colors;  same genome.
686          foreach $peg (@$cluster)  
687          {  =item RETURN
688              $color_of->{$peg} = $color;  
689          }  Returns a reference to a hash that maps each PEG ID to a color.
690      }  
691      return $color_of;  =back
692    
693    =cut
694    
695    sub set_colors_for_genome {
696        # Get the parameters.
697        my($fig, $sub, $pegs) = @_;
698        # Default all the PEGs to white.
699        my %color_of = map { $_ => '#FFFFFF' } @$pegs;
700        # Divide the pegs into clusters.
701        my @clusters = $fig->compute_clusters($pegs, $sub);
702        # Get a list of useful colors.
703        my @colors =  &cool_colors();
704        # If we have too many clusters, chop off the big ones at the end. These
705        # are least likely to be important.
706        if (@clusters > @colors) { splice(@clusters, 0, (@clusters - @colors)) }
707        # Loop through the clusters.
708        for my $cluster (@clusters) {
709            # Get the color for this cluster.
710            my $color = shift @colors;
711            # Loop through this cluster, putting this color into the color_of
712            # entries for each PEG.
713            for my $peg (@$cluster) {
714                $color_of{$peg} = $color;
715            }
716        }
717        # Return the color map.
718        return \%color_of;
719  }  }
720    
721    =head3 cool_colors
722    
723        my @colorList = UnvSubsys::cool_colors();
724    
725    Return a list of web-safe colors.
726    
727    =cut
728    
729  sub cool_colors {  sub cool_colors {
730   # just an array of "websafe" colors or whatever colors we want to use. Feel free to remove bad colors (hence the lines not being equal length!)   # just an array of "websafe" colors or whatever colors we want to use. Feel free to remove bad colors (hence the lines not being equal length!)
731   return (   return (
# Line 327  Line 747 
747   );   );
748  }  }
749    
750    =head3 ext_id
751    
752        my $externalID = UnvSubsys::ext_id($fig, $peg);
753    
754    or
755    
756        my @externalIDs = UnvSubsys::ext_id($fig, $peg);
757    
758    Return a list of non-FIG IDs for the specified feature. In a scalar context, return
759    a single non-FIG ID for the specified feature.
760    
761    This method returns IDs that are all of the same type, that is, all UniProt IDs, or
762    all KEGG IDs, and so forth. To do this, it checks the feature's alias list for IDs
763    of a given type. If it finds at least one, then all IDs of that type are returned.
764    Highest priority is given to the UniProt IDs, then SP IDs, GI IDs, and finally
765    KEGG IDs.
766    
767    =over 4
768    
769    =item fig
770    
771    A FIG-like object for accessing the data store.
772    
773    =item peg
774    
775    ID of the feature whose aliases are desired.
776    
777    =item RETURN
778    
779    In list context, a list of non-FIG IDs for the feature that are all of the same
780    type. In scalar context, the first non-FIG ID for the feature of the
781    highest-priority available type.
782    
783    =back
784    
785    =cut
786    
787  sub ext_id {  sub ext_id {
788      my($fig,$peg) = @_;      my($fig,$peg) = @_;
789    
# Line 363  Line 820 
820      }      }
821  }  }
822    
823    =head3 subsystem_curator
824    
825        my $name = $unvsub->subsystem_curator();
826    
827    Return the name of the subsystem curator. The database stores user names as
828    C<master:>I<name>. This method strips off the C<master:> prefix before it
829    passes the result back to the caller.
830    
831    =cut
832    
833  sub subsystem_curator {  sub subsystem_curator {
834      my($self) = @_;      my($self) = @_;
# Line 372  Line 838 
838      return $curator;      return $curator;
839  }  }
840    
841    =head3 get_roles
842    
843        my @roles = $unvsub->get_roles();
844    
845    Return a list of the roles (columns) for this subsystem. The roles will be
846    returned in column order, so that if you access the sixth element of the
847    return list, you'll get the name of the role for the sixth column.
848    
849    =cut
850    
851  sub get_roles {  sub get_roles {
852      my($self) = @_;      my($self) = @_;
853        # The role index in this object is a list of 3-tuples. The caller only
854        # wants the first element of each tuple, which is the role name.
855      return map { $_->[0] } @{$self->{Roles}};      return map { $_->[0] } @{$self->{Roles}};
856  }  }
857    
858    =head3 get_genome_index
859    
860        my $index = $unvsub->get_genome_index($genome);
861    
862    Return the row index of the specified genome.
863    
864    =over 4
865    
866    =item genome
867    
868    ID of the genome whose row index is desired.
869    
870    =item RETURN
871    
872    Returns the index of the row corresponding to the specified genome, or an
873    undefined value if the genome is not represented in the subsystem
874    spreadsheet.
875    
876    =back
877    
878    =cut
879    
880  sub get_genome_index {  sub get_genome_index {
881      my($self,$genome) = @_;      my($self,$genome) = @_;
882    
883      return $self->{GenomeIndex}->{$genome};      return $self->{GenomeIndex}->{$genome};
884  }  }
885    
886    =head3 get_genomes
887    
888        my @genomes = $unvsub->get_genomes();
889    
890    Return a list of the genome IDs for the subsystem. The genomes will be
891    presented in row order. In other words, if you index into the sixth
892    element of the return list, you will retrieve the ID of the genome for
893    the sixth row.
894    
895    =cut
896    
897  sub get_genomes {  sub get_genomes {
898      my($self) = @_;      my($self) = @_;
899        # The genome array is a list of 2-tuples. We extract the first
900        # element of each tuple, which is the genome ID.
901      return map { $_->[0] } @{$self->{Genomes}};      return map { $_->[0] } @{$self->{Genomes}};
902  }  }
903    
904    =head3 get_variant_code
905    
906        my $code = $unvsub->get_variant_code($genome);
907    
908    Return the variant code for a genome. Each subsystem has several variations.
909    The variant code indicates which variation of a subsystem is used by a
910    particular genome.
911    
912    Genome data is stored in a list of 2-tuples. The first element is the genome
913    ID; the second is the variant code.
914    
915    =over 4
916    
917    =item genome
918    
919    ID or row index of the genome whose variant code is desired.
920    
921    =item RETURN
922    
923    Returns the variant code for the specified genome.
924    
925    =back
926    
927    =cut
928    
929  sub get_variant_code {  sub get_variant_code {
930      my($self,$genome) = @_;      my($self,$genome) = @_;
931        # Check to see if we have a row index.
932      if ($genome =~ /^\d+$/)      if ($genome =~ /^\d+$/)
933      {      {
934            # Here we have a row index, so use it to find the genome's variant
935            # code.
936          return $self->{Genomes}->[$genome]->[1];          return $self->{Genomes}->[$genome]->[1];
937      }      }
938      else      else
939      {      {
940            # Here we have a genome ID, so we need to convert it to a row index.
941          my $genomeI = $self->{GenomeIndex}->{$genome};          my $genomeI = $self->{GenomeIndex}->{$genome};
942          return $self->{Genomes}->[$genomeI]->[1];          return $self->{Genomes}->[$genomeI]->[1];
943      }      }
944  }  }
945    
946    =head3 get_pegs_from_cell
947    
948        my @pegs = $unvsub->get_pegs_from_cell($genome, $role);
949    
950    Return a list of the features in a specified spreadsheet cell. The cell is specified
951    by genome ID and role ID.
952    
953    =over 4
954    
955    =item genome
956    
957    ID of the genome relevant to the cell.
958    
959    =item role
960    
961    ID of the role relevant to the cell.
962    
963    =item RETURN
964    
965    Returns a list of the features in the cell, or an empty list if the cell is empty.
966    
967    =back
968    
969    =cut
970    
971  sub get_pegs_from_cell {  sub get_pegs_from_cell {
972      my($self,$genome,$role) = @_;      my($self,$genome,$role) = @_;
973        # Convert the genome and role IDs to row and column indices.
974      my $genomeI = $self->{GenomeIndex}->{$genome};      my $genomeI = $self->{GenomeIndex}->{$genome};
975      my $roleI   = $self->{RoleIndex}->{$role};      my $roleI   = $self->{RoleIndex}->{$role};
976        # Get the pegs from the cell and return them.
977      my $pegs    = $self->{PegHash}->{$genomeI}->{$roleI};      my $pegs    = $self->{PegHash}->{$genomeI}->{$roleI};
978      return $pegs ? @$pegs : ();      return $pegs ? @$pegs : ();
979  }  }
980    
981    =head3 get_diagrams
982    
983        my @list = $sub->get_diagrams();
984    
985    Return a list of the diagrams associated with this subsystem. Each diagram
986    is represented in the return list as a 4-tuple C<[diagram_id, diagram_name,
987    page_link, img_link]> where
988    
989    =over 4
990    
991    =item diagram_id
992    
993    ID code for this diagram.
994    
995    =item diagram_name
996    
997    Displayable name of the diagram.
998    
999    =item page_link
1000    
1001    URL of an HTML page containing information about the diagram.
1002    
1003    =item img_link
1004    
1005    URL of an HTML page containing an image for the diagram.
1006    
1007    =back
1008    
1009    Note that the URLs are in fact for CGI scripts with parameters that point them
1010    to the correct place. Though Sprout has diagram information in it, it has
1011    no relationship to the diagrams displayed in SEED, so the work is done entirely
1012    on the SEED side.
1013    
1014    =cut
1015    
1016    sub get_diagrams {
1017        # Get the parameters.
1018        my ($self) = @_;
1019        # Return the diagram list.
1020        return @{$self->{Diagrams}};
1021    }
1022    
1023  sub get_notes {  sub get_notes {
1024      my($self) = @_;      my($self) = @_;
1025    
# Line 473  Line 1079 
1079      return $self->{Colors}->{$peg};      return $self->{Colors}->{$peg};
1080  }  }
1081    
1082    =head3 active_genomes
1083    
1084        my $activeHash = UnvSubsys::active_genomes(\@row_subsets, $active_subsetR, $focus, \%genomeH, \@genomes_info);
1085    
1086    Return a hash containing the active genomes for this subsystem display. The
1087    keys of the hash will be the row indices of the genomes to be highlighted on the
1088    display. Each genome ID will map to 1. Thus, if C<< $activeHash->{3} >>
1089    tests TRUE, the fourth row should be highlighted.
1090    
1091    The rules for determining the active genomes are as follows. If I<$active_subsetR> is
1092    specified, it is presumed to be the ID of the subset containing the active genomes.
1093    If it is not specified and I<$focus> is specified, then I<$focus> is presumed to be the
1094    ID of the genome currently in focus, and the active genomes will be the ones in the
1095    smallest row subset containing the genome in focus. If neither I<$active_subsetR> nor
1096    I<$focus> are specified, then all genomes are active.
1097    
1098    =over 4
1099    
1100    =item row_subsets
1101    
1102    Reference to a list of 2-tuples. Each tuple consists of a row subset ID followed by
1103    a reference to a list of the row indices for the rows in the identified subset.
1104    
1105    =item active_subsetR (optional)
1106    
1107    ID of the active subset (if any).
1108    
1109    =item focus
1110    
1111    ID of the genome currently in focus (if any). If there is no active subset, then
1112    the smallest subset containing this genome will be made active.
1113    
1114    =item genomeH
1115    
1116    Reference to a hash of genome IDs to row indices. The keys of this hash are
1117    the genomes in this subsystem, which also form the subsystem spreadsheet's
1118    rows.
1119    
1120    =item genomes_info
1121    
1122    Reference to a list of 2-tuples. The first element of each 2-tuple is the
1123    ID of a genome; the second is the variant code for the subsystem variant
1124    used by the genome. The tuples are ordered by row index, so that the ID
1125    and variant code of the genome in a particular row can be located by indexing
1126    into this parameter using the subsystem spreadsheet row number.
1127    
1128    =item RETURN
1129    
1130    Returns a reference to a hash that maps the row indices of the active genomes
1131    to 1. This hash can be used to quickly determine whether or not a particular
1132    row is to be highlighted.
1133    
1134    =back
1135    
1136    =cut
1137    
1138  sub active_genomes {  sub active_genomes {
1139      my($row_subsets,$active_subsetR,$focus,$genomeH,$genomes_info) = @_;      # Get the parameters.
1140        my($fig, $row_subsets, $active_subsetR, $focus, $genomeH, $genomes_info) = @_;
1141      my($i,@bestL);      my($i,@bestL);
1142        # Declare the return variable.
1143      my $active_genomes = {};      my $active_genomes = {};
1144        # Check for an active subset.
1145      if ($active_subsetR)      if ($active_subsetR)
1146      {      {
1147            # Search for the active subset in the row subset array.
1148          for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}          for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}
1149          if ($i < @$row_subsets)          if ($i < @$row_subsets)
1150          {          {
1151                # Here we found the active subset, so we extract its list of row indices.
1152                @bestL = @{$row_subsets->[$i]->[1]};
1153            }
1154            else {
1155                # Here we have the ID of the active subset. First, we search for that ID
1156                # in the row subset list.
1157                for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}
1158                if ($i < @$row_subsets)
1159                {
1160                    # Here we found the named subset, so we return its member list.
1161              @bestL = @{$row_subsets->[$i]->[1]};              @bestL = @{$row_subsets->[$i]->[1]};
1162          }          }
1163          else          else
1164          {          {
1165                    # Here the active subset does not exist. We punt by extracting a
1166                    # list of all the row indices in the spreadsheet.
1167              $active_subsetR = 'All';              $active_subsetR = 'All';
1168              @bestL = map { $genomeH->{$_} } keys(%$genomeH);              @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1169          }          }
1170      }      }
1171      else      }
1172        elsif ($focus)
1173      {      {
1174          if (! $focus)          # Here we don't have an active row subset, but a particular genome is in
1175            # focus. We'll look for the smallest row subset containing the genome
1176            # in focus. First, we need to prime the loop. "$bestN" will be the ID
1177            # of the best subset found so far; "@bestL" is where we stash the list
1178            # of IDs in the subset. Our initial selection, then, will be the
1179            # fake "All" subset, which contains the entire collection of rows.
1180    
1181            if (! $fig->is_complete($focus))
1182          {          {
1183                # Here the gnome in focus is incomplete, so it won't be anywhere
1184                # in our list. We default to making everything active.
1185              $active_subsetR = 'All';              $active_subsetR = 'All';
1186              @bestL = map { $genomeH->{$_} } keys(%$genomeH);              @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1187          }          } else {
1188          else              my $bestN   = "All";
1189          {              @bestL   = map { $genomeH->{$_} } keys(%$genomeH);
1190              my $bestN   = undef;              # Next, we get the row index for the genome in focus.
1191              @bestL   = ();              my $focusIndex = $genomeH->{$focus};
1192                # Now we loop through all the row subsets.
1193              my $tuple;              my $tuple;
1194              foreach $tuple (@$row_subsets)              foreach $tuple (@$row_subsets)
1195              {              {
1196                    # Extract the subset ID and its list of row indices. The latter is
1197                    # in "$genomeIs".
1198                  my($id,$genomeIs) = @$tuple;                  my($id,$genomeIs) = @$tuple;
1199                  for ($i=0; ($i < @$genomeIs) && ($genomes_info->[$genomeIs->[$i]]->[0] ne $focus); $i++) {}                  # Search for the index of the focus row in the current subset's list
1200                    # of row indices.
1201                    for ($i=0; ($i < @$genomeIs) && ($genomeIs->[$i] != $focusIndex); $i++) {}
1202                    # Now either $i will be the index of the focus row in the subset, or
1203                    # it is going to be equal to the number of rows in the subset.
1204                  if ($i < @$genomeIs)                  if ($i < @$genomeIs)
1205                  {                  {
1206                      if ((! $bestN) || (@$genomeIs < @bestL))                      # We've found the focus row in this subset. Select it as the new
1207                        # best subset if it's smaller than the last one we found.
1208                        if (@$genomeIs < @bestL)
1209                      {                      {
1210                          $bestN  = $id;                          $bestN  = $id;
1211                          @bestL = @$genomeIs;                          @bestL = @$genomeIs;
1212                      }                      }
1213                  }                  }
1214              }              }
1215                # Save the best subset found as the active one.
1216              $active_subsetR = $bestN;              $active_subsetR = $bestN;
1217          }          }
1218        } else {
1219            # Here we have nothing: no active subset, and no focus row. We make
1220            # all rows active.
1221            $active_subsetR = 'All';
1222            @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1223      }      }
1224        # "@bestL" now contains a list of the row indices for the active genomes.
1225        # We convert it to a hash and return it.
1226      my %active_genomes = map { $_ => 1 } @bestL;      my %active_genomes = map { $_ => 1 } @bestL;
1227      return \%active_genomes;      return \%active_genomes;
1228  }  }
1229    
1230  1;  1;
   
   

Legend:
Removed from v.1.10  
changed lines
  Added in v.1.20

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3