[Bio] / FigKernelPackages / UnvSubsys.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/UnvSubsys.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Sun Sep 11 23:44:46 2005 UTC revision 1.16, Tue Jan 31 04:09:13 2006 UTC
# Line 1  Line 1 
1    #
2    # Copyright (c) 2003-2006 University of Chicago and Fellowship
3    # for Interpretations of Genomes. All Rights Reserved.
4    #
5    # This file is part of the SEED Toolkit.
6    #
7    # The SEED Toolkit is free software. You can redistribute
8    # it and/or modify it under the terms of the SEED Toolkit
9    # Public License.
10    #
11    # You should have received a copy of the SEED Toolkit Public License
12    # along with this program; if not write to the University of Chicago
13    # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14    # Genomes at veronika@thefig.info or download a copy from
15    # http://www.theseed.org/LICENSE.TXT.
16    #
17    
18  package UnvSubsys;  package UnvSubsys;
19    
20    use Subsystem;
21  use Carp;  use Carp;
22  use FIG;  use FIG;
23    use SFXlate;
24  use Data::Dumper;  use Data::Dumper;
25  use strict;  use strict;
26    use Tracer;
27    
28  sub new  =head1 Universal Subsystem Object
 {  
     my($class, $ssa, $fig, $colors, $aliases) = @_;  
29    
30      $ssa =~ s/ /_/g;  =head2 Introduction
31    
32    The universal subsystem object provides methods used to display useful information
33    about a subsystem. Its intent is to support both Sprout and SEED.
34    
35          ### format for one_sub = [Roles,ToRoleIndexHash,ColSubsets,Genomes,ToGenomeIndexHash,PegHash,ColorHash,AliasHash]  The soul of a subsystem is its spreadsheet. The columns of the spreadsheet are
36          ###  the subsystem roles. The rows are the genomes that contain the subsystem. PEGs
37          ### Roles = pointer to a list of [Role,Abbrev,[ReactionURLs]]  are stored in the spreadsheet's cells. Each genome can be identified by genome
38          ###  ID or row index. Each role can be identified by role name or column index.
         ### ToRoleIndexHash = a pointer to a hash: key=Role Value=RoleIndex  
         ###  
         ### ColSubsets = pointer to a list of [SubsetName,[RoleIndexesFrom0]]  
         ###  
         ### Genomes is a pointer to a list of [Genome,Variant]  
         ###  
         ### ToGenomeIndexHash = a pointer to a hash: key=Genome value=GenomeIndex  
         ###  
         ### PegHash = a pointer to a hash of hashes such that $peg_hash->{$genome_index}->{$role_index} = a  
         ###           pointer to a list of PEGs  
         ###  
         ### ColorHash is a hash: key=PEG value=color  
         ###  
         ### AliasHash is a hash: key=PEG value=aliases  
         ###  
39    
40      if (ref($fig) eq "FIG")  It is worth noting that the term I<subset> has two meanings-- a subset of roles
41      {  or a subset of genomes. A subset of roles is called a I<column subset>. A
42          my $subsystem = new Subsystem($ssa,$fig,0);  subset of genomes is called a I<row subset>.
43    
44    The object created contains a great deal of information, so it's worthwhile to
45    stop for a moment and discuss it. The object will have the following members.
46    
47    =over 4
48    
49    =item Roles
50    
51    A list of 3-tuples, each consisting of a role name, the abbreviated role name,
52    and a list of URLs for the reactions. Each role is a column in the
53    subsystem spreadsheet. Indexing into the list by column index yields the
54    ID, abbreviation, and reactions for the role in that column.
55    
56    =item RoleIndex
57    
58    A hash mapping each role to its column index in the spreadsheet.
59    
60    =item RoleSubsets
61    
62    A list of 2-tuples, each consisting of a subset name followed by a list of the
63    column indices for the roles in the subset.
64    
65    =item Genomes
66    
67    A list of 2-tuples, each containing the genome ID for the relevant row and the
68    variant code for that genome. Subsystems can have several variations, and the
69    variant code indicates which variant of the subsystem that the genome uses.
70    There is one genome for each row in the spreadsheet. Indexing into the list
71    yields the ID of the genome for that row and its variant code.
72    
73    =item GenomeIndex
74    
75    A hash mapping each genome ID to its row index in the spreadsheet.
76    
77    =item PegHash
78    
79    A hash of hashes containing the spreadsheet cells. If C<$pegHash> is the hash
80    of hashes, C<$row> is a genome index, and C<$col> is a role index, then
81    
82        $pegHash->{$row}->{$col}
83    
84    returns a reference to a list of the IDs for the PEGs in the relevant
85    spreadsheet cell.
86    
87    =item ColorHash
88    
89    A hash mapping each PEG ID to the color that should be used to represent that
90    PEG in the display.
91    
92    =item AliasHash
93    
94    A hash mapping each PEG ID to a list of its aliases.
95    
96    =item ReactionHash
97    
98    A hash mapping each role to the list of reactions it catalyzes.
99    
100    =back
101    
102    =head2 Public Methods
103    
104    =head3 new
105    
106    C<< my $usub = UnvSubsys->new($ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, \@peg_colors); >>
107    
108    Construct a new universal subsystem object for a specified subsystem.
109    
110    =over 4
111    
112    =item ssa
113    
114    Name of the subsystem.
115    
116    =item fig
117    
118    Data access object, either a FIG object or an SFXlate object.
119    
120    =item active_subsetR
121    
122    Name of the active row subset. A row subset names a group of genomes.
123    
124    =item focus
125    
126    ID of the genome currently in focus.
127    
128    =item show_clusters
129    
130    TRUE if clusters should be painted by color, else FALSE.
131    
132    =item aliases
133    
134    TRUE if PEG aliases should be shown, else FALSE.
135    
136    =item peg_colors (optional)
137    
138    Reference to a list of 2-tuples, each 2-tuple consisting of a PEG ID and the color
139    to be assigned to the PEG.
140    
141    =back
142    
143    =cut
144    
145    sub new {
146        # Get the parameters.
147        my($class, $ssa, $fig, $active_subsetR, $focus, $show_clusters, $aliases, $peg_colors) = @_;
148        # Fix the subsystem name. Spaces are replaced by underscores to avoid naming problems
149        # in the seed's file system.
150        $ssa =~ s/ /_/g;
151        # Get the FIG object. At this point in time, we're only getting data from the SEED. On
152        # a future pass through the code, we'll get data from a SEED or Sprout.
153        if ((ref($fig) eq "FIG") || (ref($fig) eq 'SFXlate')) {
154            # Create a subsystem object. The "get_subsystem" method is provided by both FIG
155            # and SFXlate; however, the object returned in each case is different: FIG
156            # returns a "Subsystem" object; SFXlate returns a "SproutSubsys" object.
157            my $subsystem = $fig->get_subsystem($ssa);
158            # Get the key subsystem data. Note that CRs in the notes are converted to LFs
159            # in case the notes come from a Mac.
160            my $curator = $subsystem->get_curator;
161            my $notes = $subsystem->get_notes;
162            $notes =~ s/\r/\n/g;
163          my @roles = $subsystem->get_roles;          my @roles = $subsystem->get_roles;
164          my $reactions = $subsystem->get_reactions;          my $reactions = $subsystem->get_reactions;
165          my @genomes = $subsystem->get_genomes;          my @genomes = $subsystem->get_genomes;
166          my @col_subsets = $subsystem->get_subset_namesC;          my @col_subsets = $subsystem->get_subset_namesC;
167            my @diagrams = $subsystem->get_diagrams();
168            # Create the data structures for the role list and the role index.
169          my $role_info = [];          my $role_info = [];
170          my $roleH     = {};          my $roleH     = {};
171            # Loop through the roles to create the role list. The $i index will move
172            # through the columns of the spreadsheet.
173          my($i,$j,$subset,$peg);          my($i,$j,$subset,$peg);
174          for ($i=0; ($i < @roles); $i++)          for ($i=0; ($i < @roles); $i++)
175          {          {
176                # Get the ID of the role in the current column.
177              my $role = $roles[$i];              my $role = $roles[$i];
178              my $abbrev = $subsystem->get_role_abbr( $subsystem->get_role_index( $role ) );              # Extract its abbreviation.
179                my $abbrev = $subsystem->get_role_abbr($i);
180                # Get the reactions. Note that its possible no reactions were found. If there were
181                # reactions found, however, we convert from reaction IDs to a comma-delimited
182                # list of HTML code full of hyperlinks.
183              my $react = $reactions ? join(",", map { &HTML::reaction_link($_) } @{$reactions->{$role}}) : [];              my $react = $reactions ? join(",", map { &HTML::reaction_link($_) } @{$reactions->{$role}}) : [];
184                # Form the role name, abbreviation, and reaction list into a 3-tuple and push
185                # them onto the main role list.
186              push(@$role_info,[$role,$abbrev,$react]);              push(@$role_info,[$role,$abbrev,$react]);
187                # Set the role hash so that we can get back the column index for a given
188                # role name.
189              $roleH->{$role} = $i;              $roleH->{$role} = $i;
190          }          }
191            # Get the column subsets. A column subset is a list of role IDs, so we need to
192            # convert it to a set of column indices. We ignore the special "All" subset
193            # that contains everything.
194          my $subset_info = [];          my $subset_info = [];
195          foreach $subset (@col_subsets)          foreach $subset (@col_subsets)
196          {          {
197              if ($subset ne 'All')              if ($subset ne 'All')
198              {              {
199                  push(@$subset_info,[$subset,[$subsystem->get_subsetC_roles($subset)]]);                  push(@$subset_info,[$subset,[map { $roleH->{$_} } $subsystem->get_subsetC_roles($subset)]]);
200              }              }
201          }          }
202            # Now we create the genome directories. For each genome we need to be able to
203            # hash from the genomeID to the specified role index and we must be able to
204            # get the genome's variant code. (The variant code indicates which subsystem
205            # variant the genome uses.
206          my $genomes_info = [];          my $genomes_info = [];
207          my $genomeH      = {};          my $genomeH      = {};
208          for ($i=0; ($i < @genomes); $i++)          for ($i=0; ($i < @genomes); $i++)
209          {          {
210                # Get the genome ID for this row.
211              my $genome  = $genomes[$i];              my $genome  = $genomes[$i];
212              my $variant = $subsystem->get_variant_code( $subsystem->get_genome_index( $genome ) );              # Get its variant code.
213                my $variant = $subsystem->get_variant_code($i);
214                # Form them into a 2-tuple and add the result to the genome list.
215              push(@$genomes_info,[$genome,$variant]);              push(@$genomes_info,[$genome,$variant]);
216                # Set up the hash to get from the genome ID to the row index.
217              $genomeH->{$genome} = $i;              $genomeH->{$genome} = $i;
218          }          }
219    
220            # Next we gather the data from the actual spreadsheet cells. For an SFXlate
221            # object, this is the most expensive part of the operation, since it requires
222            # a database call for each cell.
223          my $pegH = {};          my $pegH = {};
224            # $i is the row index, which cycles through genomes.
225          for ($i=0; ($i < @genomes); $i++)          for ($i=0; ($i < @genomes); $i++)
226          {          {
227              my $genome = $genomes[$i];              # $j is the column index, which cycles through roles.
228              for ($j=0; ($j < @roles); $j++)              for ($j=0; ($j < @roles); $j++)
229              {              {
230                  my $role = $roles[$j];                  my @pegs = $subsystem->get_pegs_from_cell($i,$j);
                 my @pegs = $subsystem->get_pegs_from_cell($genome,$role);  
231                  $pegH->{$i}->{$j} = [@pegs];                  $pegH->{$i}->{$j} = [@pegs];
232              }              }
233          }          }
234    
235          my $colorsH  = $colors  ? &set_colors($fig,$pegH)  : {};          # Get the row subsets. Row subsets are determined by the genome
236          my $aliasesH = $aliases ? &set_aliases($fig,$pegH) : {};          # taxonomy, so these are different from the row subsets stored
237            # in the subsystem object.
238            my $row_subsets    = &row_subsets($fig, $genomeH);
239            # Here we try to get a list of the active genomes. The caller
240            # gives us hints in the form of the "$focus" and "$active_subsetR"
241            # parameters. If, for example, we are using this object to generate
242            # a subsystem web page, the focus information would steer us to
243            # whatever the user wants to look at on the page.
244            my $active_genomes = &active_genomes($fig, $row_subsets,$active_subsetR,$focus,
245                                                 $genomeH,$genomes_info);
246    
247          my $self = [$role_info,$roleH,$subset_info,$genomes_info,$genomeH,$pegH,$colorsH,$aliasesH];          # Now we generate a table of colors for the various PEGs. If the
248            # caller gave us a map of peg IDs to colors, we use that. Otherwise,
249            # we allow the option of painting the PEGs by cluster number. (The
250            # caller indicates this by setting the "show_clusters" flag in the
251            # parameter list.)
252            my $colorsH;
253            if ($peg_colors)
254            {
255                # Here the caller gave us a list of peg colors. The list contains
256                # 2-tuples, each consisting of a PEG ID followed by the
257                # color value. The loop below extracts the pairs and stuffs them
258                # into the color hash.
259                $colorsH = {};
260                foreach $_ (@$peg_colors)
261                {
262                    my($peg,$color) = @$_;
263                    $colorsH->{$peg} = $color;
264                }
265            }
266            elsif ($show_clusters)
267            {
268                # Here the user wants us to base the colors on the genome clustering
269                # information.
270                $colorsH  = &set_colors($fig,$subsystem,$pegH,$active_genomes);
271            }
272            else
273            {
274                # Here the user is not interested in coloring the PEGs.
275                $colorsH = {};
276            }
277            # If the user wants to see aliases, compute the alias hash. Aliases
278            # will only be computed for PEGs belonging to active (highlighted)
279            # genomes, and there is a maximum of one alias per PEG.
280            my $aliasesH = $aliases ? &set_aliases($fig,$pegH,$active_genomes) : {};
281            # Create and bless the UnvSubsys object.
282            my $self = { Roles => $role_info,
283                         RoleIndex => $roleH,
284                         RoleSubsets => $subset_info,
285                         Genomes => $genomes_info,
286                         GenomeIndex => $genomeH,
287                         GenomeSubsets => $row_subsets,
288                         PegHash => $pegH,
289                         Colors => $colorsH,
290                         Aliases => $aliasesH,
291                         Curator => $curator,
292                         Notes => $notes,
293                         Reactions => $reactions,
294                         Diagrams => \@diagrams,
295                       };
296          bless($self, $class);          bless($self, $class);
297            # Return the object.
298          return $self;          return $self;
299      }      }
300      else      else
301      {      {
302            # Here the FIG-like object was not recognized, so we return an
303            # undefined value.
304          return undef;          return undef;
305      }      }
306  }  }
307    
308  1;  =head3 get_subset_namesR
309    
310    C<< my @names = $unvsub->get_subset_namesR(); >>
311    
312    Return the names of the genome (row) subsets.
313    
314    =cut
315    
316    sub get_subset_namesR {
317        my($self) = @_;
318    
319        return map { $_->[0] } @{$self->{GenomeSubsets}};
320    }
321    
322    =head3 get_subsetR
323    
324    C<< my @genomes = $unvsub->get_subsetR($set); >>
325    
326    Return a list of the genome IDs covered by a row subset.
327    
328    =over 4
329    
330    =item set
331    
332    Name of the row subset whose genomes are desired.
333    
334    =item RETURN
335    
336    Returns a list of the IDs for the genomes found in the specified row
337    set.
338    
339    =back
340    
341    =cut
342    
343    sub get_subsetR {
344        # Get the parameters.
345        my($self,$set) = @_;
346        my($i);
347        # Get the list of row subsets.
348        my $sets = $self->{GenomeSubsets};
349        # Find the row subset with the specified name. The row subset list is a
350        # list of 2-tuples, and the first element of each tuple is the set
351        # name.
352        for ($i=0; ($i < @$sets) && ($sets->[$i]->[0] ne $set); $i++) {}
353        if ($i < @$sets)
354        {
355            # Here we found the named subset. The subset tuple's second element is
356            # the list of row indices. We map these to genome IDs before returning
357            # them.
358            return map { $self->{Genomes}->[$_]->[0] } @{$sets->[$i]->[1]}
359        }
360        # Here we subset was not found, so we return the undefined value.
361        return undef;
362    }
363    
364    =head3 get_subsetR
365    
366    C<< my @pairs = $unvsub->get_subsetsR(); >>
367    
368    Return a list of all the row subsets. The subsets are returned in the form
369    of 2-tuples, each consisting of a subset name followed by a reference to a
370    list of genome IDs. The genome IDs correspond to the rows in the subset.
371    
372    =cut
373    
374    sub get_subsetsR {
375        # Get the parameters.
376        my($self) = @_;
377        # Extract the list of genome subsets. This list is in the form of
378        # 2-tuples, but the rows are identified by row index, not genome ID.
379        my $sets = $self->{GenomeSubsets};
380        # Create the return list.
381        my @pairs = ();
382        # Loop through the subsets.
383        my $pair;
384        foreach $pair (@$sets)
385        {
386            # Convert this subset's member list from row indices to genome IDs
387            # and stash the result in the return list.
388            my($id,$members) = @$pair;
389            push(@pairs,[$id,[map { $self->{Genomes}->[$_]->[0] } @$members]]);
390        }
391        # Return the list constructed.
392        return @pairs;
393    }
394    
395    =head3 row_subsets
396    
397    C<< my $subsetList = UnvSubsys::row_subsets($fig, \%genomeH); >>
398    
399    This method computes the taxonomic row subsets for a subsystem. It takes
400    as input a hash that maps genome IDs to column indices and a FIG object.
401    The FIG object provides a list of taxonomic groups of 10 or more complete
402    genomes. From the list, we extract subsets which have more than 10
403    genomes in the list of subsystem genomes. If no such subsets exist,
404    we extract subsets which have at least 1 genome from the list of
405    subsystem genomes. The subsets are returned as 2-tuples, the first
406    element being the subset ID and the second being a list of row indices
407    for the genomes in the subset.
408    
409    =over 4
410    
411    =item fig
412    
413    A FIG-like object for accessing the data store.
414    
415    =item genomeH
416    
417    Reference to a hash that maps each genome ID to its row index in the
418    subsystem spreadsheet.
419    
420    =item RETURN
421    
422    Returns a reference to a list of 2-tuples. Each 2-tuple consists of a
423    subset ID followed by a list of the row indices for the genomes in the
424    subset.
425    
426    =back
427    
428    =cut
429    
430    sub row_subsets {
431        my ($fig, $genomeH) = @_;
432    
433        # We need a real FIG object, since SFXlate does not yet support
434        # taxonomy trees. The "FIG" method does this for us.
435        $fig = $fig->FIG();
436        # Initialize the return value.
437        my $subsets = [];
438        # Get a list of taxonomic groups. This will come back as a list of
439        # 2-tuples.
440        my $taxonomic_groups = $fig->taxonomic_groups_of_complete(5);
441        # Loop through the 2-tuples. We're looking for subsets which
442        # contain at least one genome on the subsystem's spreadsheet.
443        my($pair,$id,$members);
444        foreach $pair (@$taxonomic_groups)
445        {
446            ($id,$members) = @$pair;
447            # Extract the genomes in the member list that participate in this
448            # subsystem. To do this, we convert each genome ID to its row
449            # index. If no row index exists, the GREP condition discards the
450            # member.
451            my @mem = grep { defined($_) } map { $genomeH->{$_} } @$members;
452            # If there are enough members, save the subset.
453            if (@mem > 0)
454            {
455                push(@$subsets,[$id,[@mem]]);
456            }
457        }
458        # Return the list of row subsets.
459        return $subsets;
460    }
461    
462    =head3 set_aliases
463    
464    C<< my $aliasHash = UnvSubsys::set_aliases($fig, $pegH, $active_genomes); >>
465    
466    Return a hash mapping PEG IDs to aliases.
467    
468    =over 4
469    
470    =item fig
471    
472    FIG-like object that can be used to access the data store.
473    
474    =item pegH
475    
476    Reference to the spreadsheet hash table. Given a row index I<$row> and a
477    column index I<$col>,
478    
479        $pegH->{$row}->{$col}
480    
481    will return a reference to a list of PEGs in the specified spreadsheet cell.
482    
483    =item active_genomes
484    
485    Reference to a hash whose keys correspond to the spreadsheet row indices
486    of genomes that should be highlighted.
487    
488    =item RETURN
489    
490    Returns a hash that takes as input a PEG ID and returns an alias. Only PEGs
491    for active genomes will be processed.
492    
493    =back
494    
495    =cut
496    
497    sub set_aliases {
498        # Get the parameters.
499        my($fig,$pegH,$active_genomes) = @_;
500        my($genomeI,$roleI,$pegs,$peg,$roleH);
501    
502        # Create the return hash.
503        my $aliasesH = {};
504    
505        # Loop through each row that corresponds to an active genome.
506        # The active genome list contains row indices, and there is
507        # one genome per row. Note that the genome ID is never used,
508        # only the row index.
509        foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))
510        {
511            # Get the role hash for the specified genome. The role hash
512            # maps column indices (representing roles) to lists of PEGs.
513            $roleH = $pegH->{$genomeI};
514            # Loop through the role (column) indices.
515            foreach $roleI (keys(%$roleH))
516            {
517                # Get the PEG list for this row/column combination.
518                $pegs = $roleH->{$roleI};
519                # Only proceed if data was found in the cell.
520                if (defined $pegs) {
521                    # Loop through the pegs in the cell.
522                    foreach $peg (@$pegs)
523                    {
524                        # If we do not already have an alias for this PEG,
525                        # compute one.
526                        if (! $aliasesH->{$peg})
527                        {
528                            $aliasesH->{$peg} = scalar &ext_id($fig,$peg);
529                        }
530                    }
531                }
532            }
533        }
534        # Return the hash we built.
535        return $aliasesH;
536    }
537    
538    =head3 set_colors
539    
540    C<< my $colorHash = UnvSubsys::set_colors($fig, $sub, \%pegH, \%active_genomes); >>
541    
542    Return a hash that maps each PEG in the subsystem spreadsheet to a display
543    color. Not all PEGs need to be mapped. Those that do not have a color
544    assigned will generally be displayed in white.
545    
546    =over 4
547    
548    =item fig
549    
550    FIG-like object that can be used to access the data store.
551    
552    =item sub
553    
554    Subsystem object for the current subsystem.
555    
556    =item pegH
557    
558    Reference to the spreadsheet hash table. Given a row index I<$row> and a
559    column index I<$col>,
560    
561        $pegH->{$row}->{$col}
562    
563    will return a reference to a list of PEGs in the specified spreadsheet cell.
564    
565    =item active_genomes
566    
567    Reference to a hash whose keys correspond to the spreadsheet row indices
568    of genomes that should be highlighted.
569    
570    =item RETURN
571    
572    Returns a hash that takes as input a PEG ID and returns a color. These colors
573    are used when displaying the PEGs in the subsystem spreadsheet. Only PEGs
574    for active genomes will be colored.
575    
576    =back
577    
578    =cut
579    
580    sub set_colors {
581        # Get the parameters.
582        my($fig,$sub,$pegH,$active_genomes) = @_;
583    
584        my($genomeI,$roleI,$pegs,$peg,$roleH,%pegs_in_genome);
585        # Create the return hash.
586        my $colorsH = {};
587        # Loop through the active genomes. The keys of "%$pegH" are the row indices
588        # for rows that have at least one occupied cell in the spreadsheet. The
589        # Grep then reduces this list to those rows that are highlighted.
590        foreach $genomeI (grep { $active_genomes->{$_} } keys(%$pegH))
591        {
592            # We will use the following hash to compile a list of all the PEGs
593            # in the spreadsheet for the current genome, that is, the genome
594            # represented by row index "$genomeI".
595            undef %pegs_in_genome;
596            # Get the hash for the current row. This hash maps column indices to
597            # lists of pegs.
598            $roleH = $pegH->{$genomeI};
599            # Loop through the column indices for the specified row.
600            foreach $roleI (keys(%$roleH))
601            {
602                # Here we can finally get a list of the pegs in this spreadsheet
603                # cell. We loop through them, marking them in the "%pegs_in_genome"
604                # hash.
605                $pegs = $roleH->{$roleI};
606                foreach $peg (@$pegs)
607                {
608                    $pegs_in_genome{$peg} = 1;
609                }
610            }
611            # Extract the "%pegs_in_genome" hash keys. This gives us a duplicate-free
612            # list of all the pegs for the current spreadsheet role.
613            my @pegs = keys(%pegs_in_genome);
614            my($tuple,$peg,$color);
615            # Get a hash that maps the PEG IDs to colors.
616            my $colors_for_one_genome = &set_colors_for_genome($fig,$sub, \@pegs);
617            # Loop through the hash we got back and assign the colors from that
618            # hash to the master hash we're returning to the caller.
619            while (($peg,$color) = each %$colors_for_one_genome)
620            {
621                $colorsH->{$peg} = $colors_for_one_genome->{$peg};
622            }
623        }
624        # Return the color hash.
625        return $colorsH;
626    }
627    
628    =head3 set_colors_for_genome
629    
630    C<< my $colorHash = UnvSubsys::set_colors_for_genome($fig, $sub, \@pegs); >>
631    
632    Return a reference to a hash mapping the specified pegs to colors. PEGs that
633    are physically close to each other will be painted the same color.
634    
635    =over 4
636    
637    =item fig
638    
639    A fig-like object that can be used to access the data store.
640    
641    =item sub
642    
643    Subsystem object for the relevant subsystem.
644    
645    =item pegs
646    
647    Reference to a list of PEG IDs. All of the peg IDs should be for the
648    same genome.
649    
650    =item RETURN
651    
652    Returns a reference to a hash that maps each PEG ID to a color.
653    
654    =back
655    
656    =cut
657    
658    sub set_colors_for_genome {
659        # Get the parameters.
660        my($fig, $sub, $pegs) = @_;
661        # Default all the PEGs to white.
662        my %color_of = map { $_ => '#FFFFFF' } @$pegs;
663        # Divide the pegs into clusters.
664        my @clusters = $fig->compute_clusters($pegs, $sub);
665        # Get a list of useful colors.
666        my @colors =  &cool_colors();
667        # If we have too many clusters, chop off the big ones at the end. These
668        # are least likely to be important.
669        if (@clusters > @colors) { splice(@clusters, 0, (@clusters - @colors)) }
670        # Loop through the clusters.
671        for my $cluster (@clusters) {
672            # Get the color for this cluster.
673            my $color = shift @colors;
674            # Loop through this cluster, putting this color into the color_of
675            # entries for each PEG.
676            for my $peg (@$cluster) {
677                $color_of{$peg} = $color;
678            }
679        }
680        # Return the color map.
681        return \%color_of;
682    }
683    
684    =head3 cool_colors
685    
686    C<< my @colorList = UnvSubsys::cool_colors(); >>
687    
688    Return a list of web-safe colors.
689    
690    =cut
691    
692    sub cool_colors {
693     # just an array of "websafe" colors or whatever colors we want to use. Feel free to remove bad colors (hence the lines not being equal length!)
694     return (
695     '#C0C0C0', '#FF40C0', '#FF8040', '#FF0080', '#FFC040', '#40C0FF', '#40FFC0', '#C08080', '#C0FF00', '#00FF80', '#00C040',
696     "#6B8E23", "#483D8B", "#2E8B57", "#008000", "#006400", "#800000", "#00FF00", "#7FFFD4",
697     "#87CEEB", "#A9A9A9", "#90EE90", "#D2B48C", "#8DBC8F", "#D2691E", "#87CEFA", "#E9967A", "#FFE4C4", "#FFB6C1",
698     "#E0FFFF", "#FFA07A", "#DB7093", "#9370DB", "#008B8B", "#FFDEAD", "#DA70D6", "#DCDCDC", "#FF00FF", "#6A5ACD",
699     "#00FA9A", "#228B22", "#1E90FF", "#FA8072", "#CD853F", "#DC143C", "#FF6347", "#98FB98", "#4682B4",
700     "#D3D3D3", "#7B68EE", "#2F4F4F", "#FF7F50", "#FF69B4", "#BC8F8F", "#A0522D", "#DEB887", "#00DED1",
701     "#6495ED", "#800080", "#FFD700", "#F5DEB3", "#66CDAA", "#FF4500", "#4B0082", "#CD5C5C",
702     "#EE82EE", "#7CFC00", "#FFFF00", "#191970", "#FFFFE0", "#DDA0DD", "#00BFFF", "#DAA520", "#008080",
703     "#00FF7F", "#9400D3", "#BA55D3", "#D8BFD8", "#8B4513", "#3CB371", "#00008B", "#5F9EA0",
704     "#4169E1", "#20B2AA", "#8A2BE2", "#ADFF2F", "#556B2F",
705     "#F0FFFF", "#B0E0E6", "#FF1493", "#B8860B", "#FF0000", "#F08080", "#7FFF00", "#8B0000",
706     "#40E0D0", "#0000CD", "#48D1CC", "#8B008B", "#696969", "#AFEEEE", "#FF8C00", "#EEE8AA", "#A52A2A",
707     "#FFE4B5", "#B0C4DE", "#FAF0E6", "#9ACD32", "#B22222", "#FAFAD2", "#808080", "#0000FF",
708     "#000080", "#32CD32", "#FFFACD", "#9932CC", "#FFA500", "#F0E68C", "#E6E6FA", "#F4A460", "#C71585",
709     "#BDB76B", "#00FFFF", "#FFDAB9", "#ADD8E6", "#778899",
710     );
711    }
712    
713    =head3 ext_id
714    
715    C<< my $externalID = UnvSubsys::ext_id($fig, $peg); >>
716    
717    or
718    
719    C<< my @externalIDs = UnvSubsys::ext_id($fig, $peg); >>
720    
721    Return a list of non-FIG IDs for the specified feature. In a scalar context, return
722    a single non-FIG ID for the specified feature.
723    
724    This method returns IDs that are all of the same type, that is, all UniProt IDs, or
725    all KEGG IDs, and so forth. To do this, it checks the feature's alias list for IDs
726    of a given type. If it finds at least one, then all IDs of that type are returned.
727    Highest priority is given to the UniProt IDs, then SP IDs, GI IDs, and finally
728    KEGG IDs.
729    
730    =over 4
731    
732    =item fig
733    
734    A FIG-like object for accessing the data store.
735    
736    =item peg
737    
738    ID of the feature whose aliases are desired.
739    
740    =item RETURN
741    
742    In list context, a list of non-FIG IDs for the feature that are all of the same
743    type. In scalar context, the first non-FIG ID for the feature of the
744    highest-priority available type.
745    
746    =back
747    
748    =cut
749    
750    sub ext_id {
751        my($fig,$peg) = @_;
752    
753        my @tmp;
754        my @aliases = $fig->feature_aliases($peg);
755        if      ((@tmp = grep { $_ =~ /^uni\|/ } @aliases) > 0)
756        {
757            @aliases =  @tmp;
758        }
759        elsif   ((@tmp = grep { $_ =~ /^sp\|/ } @aliases) > 0)
760        {
761            @aliases = @tmp;
762        }
763        elsif   ((@tmp = grep { $_ =~ /^gi\|/ } @aliases) > 0)
764        {
765            @aliases = @tmp;
766        }
767        elsif   ((@tmp = grep { $_ =~ /^kegg\|/ } @aliases) > 0)
768        {
769            @aliases = @tmp;
770        }
771        else
772        {
773            @aliases = ();
774        }
775    
776        if (wantarray())
777        {
778            return @aliases;
779        }
780        else
781        {
782            return $aliases[0];
783        }
784    }
785    
786    =head3 subsystem_curator
787    
788    C<< my $name = $unvsub->subsystem_curator(); >>
789    
790    Return the name of the subsystem curator. The database stores user names as
791    C<master:>I<name>. This method strips off the C<master:> prefix before it
792    passes the result back to the caller.
793    
794    =cut
795    
796    sub subsystem_curator {
797        my($self) = @_;
798    
799        my $curator = $self->{Curator};
800        $curator =~ s/master://;
801        return $curator;
802    }
803    
804    =head3 get_roles
805    
806    C<< my @roles = $unvsub->get_roles(); >>
807    
808    Return a list of the roles (columns) for this subsystem. The roles will be
809    returned in column order, so that if you access the sixth element of the
810    return list, you'll get the name of the role for the sixth column.
811    
812    =cut
813    
814    sub get_roles {
815        my($self) = @_;
816        # The role index in this object is a list of 3-tuples. The caller only
817        # wants the first element of each tuple, which is the role name.
818        return map { $_->[0] } @{$self->{Roles}};
819    }
820    
821    =head3 get_genome_index
822    
823    C<< my $index = $unvsub->get_genome_index($genome); >>
824    
825    Return the row index of the specified genome.
826    
827    =over 4
828    
829    =item genome
830    
831    ID of the genome whose row index is desired.
832    
833    =item RETURN
834    
835    Returns the index of the row corresponding to the specified genome, or an
836    undefined value if the genome is not represented in the subsystem
837    spreadsheet.
838    
839    =back
840    
841    =cut
842    
843    sub get_genome_index {
844        my($self,$genome) = @_;
845    
846        return $self->{GenomeIndex}->{$genome};
847    }
848    
849    =head3 get_genomes
850    
851    C<< my @genomes = $unvsub->get_genomes(); >>
852    
853    Return a list of the genome IDs for the subsystem. The genomes will be
854    presented in row order. In other words, if you index into the sixth
855    element of the return list, you will retrieve the ID of the genome for
856    the sixth row.
857    
858    =cut
859    
860    sub get_genomes {
861        my($self) = @_;
862        # The genome array is a list of 2-tuples. We extract the first
863        # element of each tuple, which is the genome ID.
864        return map { $_->[0] } @{$self->{Genomes}};
865    }
866    
867    =head3 get_variant_code
868    
869    C<< my $code = $unvsub->get_variant_code($genome); >>
870    
871    Return the variant code for a genome. Each subsystem has several variations.
872    The variant code indicates which variation of a subsystem is used by a
873    particular genome.
874    
875    Genome data is stored in a list of 2-tuples. The first element is the genome
876    ID; the second is the variant code.
877    
878    =over 4
879    
880    =item genome
881    
882    ID or row index of the genome whose variant code is desired.
883    
884    =item RETURN
885    
886    Returns the variant code for the specified genome.
887    
888    =back
889    
890    =cut
891    
892    sub get_variant_code {
893        my($self,$genome) = @_;
894        # Check to see if we have a row index.
895        if ($genome =~ /^\d+$/)
896        {
897            # Here we have a row index, so use it to find the genome's variant
898            # code.
899            return $self->{Genomes}->[$genome]->[1];
900        }
901        else
902        {
903            # Here we have a genome ID, so we need to convert it to a row index.
904            my $genomeI = $self->{GenomeIndex}->{$genome};
905            return $self->{Genomes}->[$genomeI]->[1];
906        }
907    }
908    
909    =head3 get_pegs_from_cell
910    
911    C<< my @pegs = $unvsub->get_pegs_from_cell($genome, $role); >>
912    
913    Return a list of the features in a specified spreadsheet cell. The cell is specified
914    by genome ID and role ID.
915    
916    =over 4
917    
918    =item genome
919    
920    ID of the genome relevant to the cell.
921    
922    =item role
923    
924    ID of the role relevant to the cell.
925    
926    =item RETURN
927    
928    Returns a list of the features in the cell, or an empty list if the cell is empty.
929    
930    =back
931    
932    =cut
933    
934    sub get_pegs_from_cell {
935        my($self,$genome,$role) = @_;
936        # Convert the genome and role IDs to row and column indices.
937        my $genomeI = $self->{GenomeIndex}->{$genome};
938        my $roleI   = $self->{RoleIndex}->{$role};
939        # Get the pegs from the cell and return them.
940        my $pegs    = $self->{PegHash}->{$genomeI}->{$roleI};
941        return $pegs ? @$pegs : ();
942    }
943    
944    =head3 get_diagrams
945    
946    C<< my @list = $sub->get_diagrams(); >>
947    
948    Return a list of the diagrams associated with this subsystem. Each diagram
949    is represented in the return list as a 4-tuple C<[diagram_id, diagram_name,
950    page_link, img_link]> where
951    
952    =over 4
953    
954    =item diagram_id
955    
956    ID code for this diagram.
957    
958    =item diagram_name
959    
960    Displayable name of the diagram.
961    
962    =item page_link
963    
964    URL of an HTML page containing information about the diagram.
965    
966    =item img_link
967    
968    URL of an HTML page containing an image for the diagram.
969    
970    =back
971    
972    Note that the URLs are in fact for CGI scripts with parameters that point them
973    to the correct place. Though Sprout has diagram information in it, it has
974    no relationship to the diagrams displayed in SEED, so the work is done entirely
975    on the SEED side.
976    
977    =cut
978    
979    sub get_diagrams {
980        # Get the parameters.
981        my ($self) = @_;
982        # Return the diagram list.
983        return @{$self->{Diagrams}};
984    }
985    
986    sub get_notes {
987        my($self) = @_;
988    
989        return $self->{Notes};
990    }
991    
992    sub get_role_index {
993        my($self,$role) = @_;
994    
995        return $self->{RoleIndex}->{$role};
996    }
997    
998    sub get_role_abbr {
999        my($self,$roleI) = @_;
1000    
1001        if ($roleI !~ /^\d+$/)
1002        {
1003            $roleI = $self->{RoleIndex}->{$roleI};
1004        }
1005        my $roles = $self->{Roles};
1006        return $roles->[$roleI]->[1];
1007    }
1008    
1009    sub get_reactions {
1010        my($self) = @_;
1011    
1012        return $self->{Reactions};
1013    }
1014    
1015    sub get_subset_namesC {
1016        my($self) = @_;
1017    
1018        return map { $_->[0] } @{$self->{RoleSubsets}};
1019    }
1020    
1021    sub get_subsetC_roles {
1022        my($self,$subset) = @_;
1023        my($i,$j);
1024    
1025        my $subset_info = $self->{RoleSubsets};
1026        for ($i=0; ($i < @$subset_info) && ($subset_info->[$i]->[0] ne $subset); $i++) {}
1027        if ($i < @$subset_info)
1028        {
1029            my @roles = ();
1030            foreach $j (@{$subset_info->[$i]->[1]})
1031            {
1032                push(@roles,$self->{Roles}->[$j]->[0]);
1033            }
1034            return @roles;
1035        }
1036        return undef;
1037    }
1038    
1039    sub get_color_of {
1040        my($self,$peg)  = @_;
1041    
1042        return $self->{Colors}->{$peg};
1043    }
1044    
1045    =head3 active_genomes
1046    
1047    C<< my $activeHash = UnvSubsys::active_genomes(\@row_subsets, $active_subsetR, $focus, \%genomeH, \@genomes_info); >>
1048    
1049    Return a hash containing the active genomes for this subsystem display. The
1050    keys of the hash will be the row indices of the genomes to be highlighted on the
1051    display. Each genome ID will map to 1. Thus, if C<< $activeHash->{3} >>
1052    tests TRUE, the fourth row should be highlighted.
1053    
1054    The rules for determining the active genomes are as follows. If I<$active_subsetR> is
1055    specified, it is presumed to be the ID of the subset containing the active genomes.
1056    If it is not specified and I<$focus> is specified, then I<$focus> is presumed to be the
1057    ID of the genome currently in focus, and the active genomes will be the ones in the
1058    smallest row subset containing the genome in focus. If neither I<$active_subsetR> nor
1059    I<$focus> are specified, then all genomes are active.
1060    
1061    =over 4
1062    
1063    =item row_subsets
1064    
1065    Reference to a list of 2-tuples. Each tuple consists of a row subset ID followed by
1066    a reference to a list of the row indices for the rows in the identified subset.
1067    
1068    =item active_subsetR (optional)
1069    
1070    ID of the active subset (if any).
1071    
1072    =item focus
1073    
1074    ID of the genome currently in focus (if any). If there is no active subset, then
1075    the smallest subset containing this genome will be made active.
1076    
1077    =item genomeH
1078    
1079    Reference to a hash of genome IDs to row indices. The keys of this hash are
1080    the genomes in this subsystem, which also form the subsystem spreadsheet's
1081    rows.
1082    
1083    =item genomes_info
1084    
1085    Reference to a list of 2-tuples. The first element of each 2-tuple is the
1086    ID of a genome; the second is the variant code for the subsystem variant
1087    used by the genome. The tuples are ordered by row index, so that the ID
1088    and variant code of the genome in a particular row can be located by indexing
1089    into this parameter using the subsystem spreadsheet row number.
1090    
1091    =item RETURN
1092    
1093    Returns a reference to a hash that maps the row indices of the active genomes
1094    to 1. This hash can be used to quickly determine whether or not a particular
1095    row is to be highlighted.
1096    
1097    =back
1098    
1099    =cut
1100    
1101    sub active_genomes {
1102        # Get the parameters.
1103        my($fig, $row_subsets, $active_subsetR, $focus, $genomeH, $genomes_info) = @_;
1104        my($i,@bestL);
1105        # Declare the return variable.
1106        my $active_genomes = {};
1107        # Check for an active subset.
1108        if ($active_subsetR)
1109        {
1110            # Search for the active subset in the row subset array.
1111            for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}
1112            if ($i < @$row_subsets)
1113            {
1114                # Here we found the active subset, so we extract its list of row indices.
1115                @bestL = @{$row_subsets->[$i]->[1]};
1116            }
1117            else {
1118                # Here we have the ID of the active subset. First, we search for that ID
1119                # in the row subset list.
1120                for ($i=0; ($i < @$row_subsets) && ($row_subsets->[$i]->[0] ne $active_subsetR); $i++) {}
1121                if ($i < @$row_subsets)
1122                {
1123                    # Here we found the named subset, so we return its member list.
1124                    @bestL = @{$row_subsets->[$i]->[1]};
1125                }
1126                else
1127                {
1128                    # Here the active subset does not exist. We punt by extracting a
1129                    # list of all the row indices in the spreadsheet.
1130                    $active_subsetR = 'All';
1131                    @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1132                }
1133            }
1134        }
1135        elsif ($focus)
1136        {
1137            # Here we don't have an active row subset, but a particular genome is in
1138            # focus. We'll look for the smallest row subset containing the genome
1139            # in focus. First, we need to prime the loop. "$bestN" will be the ID
1140            # of the best subset found so far; "@bestL" is where we stash the list
1141            # of IDs in the subset. Our initial selection, then, will be the
1142            # fake "All" subset, which contains the entire collection of rows.
1143    
1144            if (! $fig->is_complete($focus))
1145            {
1146                # Here the gnome in focus is incomplete, so it won't be anywhere
1147                # in our list. We default to making everything active.
1148                $active_subsetR = 'All';
1149                @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1150            } else {
1151                my $bestN   = "All";
1152                @bestL   = map { $genomeH->{$_} } keys(%$genomeH);
1153                # Next, we get the row index for the genome in focus.
1154                my $focusIndex = $genomeH->{$focus};
1155                # Now we loop through all the row subsets.
1156                my $tuple;
1157                foreach $tuple (@$row_subsets)
1158                {
1159                    # Extract the subset ID and its list of row indices. The latter is
1160                    # in "$genomeIs".
1161                    my($id,$genomeIs) = @$tuple;
1162                    # Search for the index of the focus row in the current subset's list
1163                    # of row indices.
1164                    for ($i=0; ($i < @$genomeIs) && ($genomeIs->[$i] != $focusIndex); $i++) {}
1165                    # Now either $i will be the index of the focus row in the subset, or
1166                    # it is going to be equal to the number of rows in the subset.
1167                    if ($i < @$genomeIs)
1168                    {
1169                        # We've found the focus row in this subset. Select it as the new
1170                        # best subset if it's smaller than the last one we found.
1171                        if (@$genomeIs < @bestL)
1172                        {
1173                            $bestN  = $id;
1174                            @bestL = @$genomeIs;
1175                        }
1176                    }
1177                }
1178                # Save the best subset found as the active one.
1179                $active_subsetR = $bestN;
1180            }
1181        } else {
1182            # Here we have nothing: no active subset, and no focus row. We make
1183            # all rows active.
1184            $active_subsetR = 'All';
1185            @bestL = map { $genomeH->{$_} } keys(%$genomeH);
1186        }
1187        # "@bestL" now contains a list of the row indices for the active genomes.
1188        # We convert it to a hash and return it.
1189        my %active_genomes = map { $_ => 1 } @bestL;
1190        return \%active_genomes;
1191    }
1192    
1193    1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.16

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3