[Bio] / Sprout / Sprout.pm Repository:
ViewVC logotype

Diff of /Sprout/Sprout.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.105, Wed Jan 30 22:14:02 2008 UTC revision 1.111, Wed May 7 23:11:51 2008 UTC
# Line 14  Line 14 
14      use BasicLocation;      use BasicLocation;
15      use CustomAttributes;      use CustomAttributes;
16      use RemoteCustomAttributes;      use RemoteCustomAttributes;
17        use CGI;
18        use WikiTools;
19      use base qw(ERDB);      use base qw(ERDB);
20    
21  =head1 Sprout Database Manipulation Object  =head1 Sprout Database Manipulation Object
# Line 146  Line 148 
148          my $user = ($FIG_Config::arch eq 'win' ? 'self' : scalar(getpwent()));          my $user = ($FIG_Config::arch eq 'win' ? 'self' : scalar(getpwent()));
149          $retVal->{_ca} = CustomAttributes->new(user => $user);          $retVal->{_ca} = CustomAttributes->new(user => $user);
150      }      }
151        # Insure we have access to the stem module.
152        WikiUse('Lingua::Stem');
153        $retVal->{stemmer} = Lingua::Stem->new();
154        $retVal->{stemmer}->stem_caching({ -level => 2 });
155      # Return it.      # Return it.
156      return $retVal;      return $retVal;
157  }  }
158    
159    =head3 CoreGenomes
160    
161        my @genomes = $sprout->CoreGenomes($scope);
162    
163    Return the IDs of NMPDR genomes in the specified scope.
164    
165    =over 4
166    
167    =item scope
168    
169    Scope of the desired genomes. C<core> covers the original core genomes,
170    C<nmpdr> covers all genomes in NMPDR groups, and C<all> covers all
171    genomes in the system.
172    
173    =item RETURN
174    
175    Returns a list of the IDs for the genomes in the specified scope.
176    
177    =back
178    
179    =cut
180    
181    sub CoreGenomes {
182        # Get the parameters.
183        my ($self, $scope) = @_;
184        # Declare the return variable.
185        my @retVal = ();
186        # If we want all genomes, then this is easy.
187        if ($scope eq 'all') {
188            @retVal = $self->Genomes();
189        } else {
190            # Here we're dealing with groups. Get the hash of all the
191            # genome groups.
192            my %groups = $self->GetGroups();
193            # Loop through the groups, keeping the ones that we want.
194            for my $group (keys %groups) {
195                # Decide if we want to keep this group.
196                my $keepGroup = 0;
197                if ($scope eq 'nmpdr') {
198                    # NMPDR mode: keep all groups.
199                    $keepGroup = 1;
200                } elsif ($scope eq 'core') {
201                    # CORE mode. Only keep real core groups.
202                    if (grep { $group =~ /$_/ } @{$FIG_Config::realCoreGroups}) {
203                        $keepGroup = 1;
204                    }
205                }
206                # Add this group if we're keeping it.
207                if ($keepGroup) {
208                    push @retVal, @{$groups{$group}};
209                }
210            }
211        }
212        # Return the result.
213        return @retVal;
214    }
215    
216    =head3 SuperGroup
217    
218        my $superGroup = $sprout->SuperGroup($groupName);
219    
220    Return the name of the super-group containing the specified NMPDR genome
221    group. If no appropriate super-group can be found, an error will be
222    thrown.
223    
224    =over 4
225    
226    =item groupName
227    
228    Name of the group whose super-group is desired.
229    
230    =item RETURN
231    
232    Returns the name of the super-group containing the incoming group.
233    
234    =back
235    
236    =cut
237    
238    sub SuperGroup {
239        # Get the parameters.
240        my ($self, $groupName) = @_;
241        # Declare the return variable.
242        my $retVal;
243        # Get the group hash.
244        my %groupHash = $self->CheckGroupFile();
245        # Find the super-group genus.
246        $groupName =~ /([A-Z]\w+)/;
247        my $nameThing = $1;
248        # See if it's directly in the group hash.
249        if (exists $groupHash{$nameThing}) {
250            # Yes, then it's our result.
251            $retVal = $nameThing;
252        } else {
253            # No, so we have to search.
254            for my $superGroup (keys %groupHash) {
255                # Get this super-group's item list.
256                my $list = $groupHash{$superGroup}->{contents};
257                # Search it.
258                if (grep { $_->[0] eq $nameThing } @{$list}) {
259                    $retVal = $superGroup;
260                }
261            }
262            # Make sure we found something.
263            if (! $retVal) {
264                Confess("No super-group found for \"$groupName\".");
265            }
266        }
267        # Return the result.
268        return $retVal;
269    }
270    
271  =head3 MaxSegment  =head3 MaxSegment
272    
273      my $length = $sprout->MaxSegment();      my $length = $sprout->MaxSegment();
# Line 353  Line 471 
471      return $retVal;      return $retVal;
472  }  }
473    
474  =head3 GeneMenu  =head3 GenomeMenu
475    
476      my $selectHtml = $sprout->GeneMenu(\%attributes, $filterString, \@params, $selected);      my $html = $sprout->GenomeMenu(%options);
477    
478  Return an HTML select menu of genomes. Each genome will be an option in the menu,  Generate a genome selection control with the specified name and options.
479  and will be displayed by name with the ID and a contig count attached. The selection  This control is almost but not quite the same as the genome control in the
480  value will be the genome ID. The genomes will be sorted by genus/species name.  B<SearchHelper> class. Eventually, the two will be combined.
481    
482  =over 4  =over 4
483    
484  =item attributes  =item options
485    
486  Reference to a hash mapping attributes to values for the SELECT tag generated.  Optional parameters for the control (see below).
487    
488  =item filterString  =item RETURN
489    
490  A filter string for use in selecting the genomes. The filter string must conform  Returns the HTML for a genome selection control on a form (sometimes called a popup menu).
 to the rules for the C<< ERDB->Get >> method.  
491    
492  =item params  =back
493    
494  Reference to a list of values to be substituted in for the parameter marks in  The valid options are as follows.
 the filter string.  
495    
496  =item selected (optional)  =over 4
497    
498  ID of the genome to be initially selected.  =item name
499    
500  =item fast (optional)  Name to give this control for use in passing it to the form. The default is C<myGenomeControl>.
501    Terrible things will happen if you have two controls with the same name on the same page.
502    
503  If specified and TRUE, the contig counts will be omitted to improve performance.  =item filter
504    
505  =item RETURN  If specified, a filter for the list of genomes to display. The filter should be in the form of a
506    list reference. The first element of the list should be the filter string, and the remaining elements
507    the filter parameters.
508    
509    =item multiSelect
510    
511    If TRUE, then the user can select multiple genomes. If FALSE, the user can only select one genome.
512    
513    =item size
514    
515  Returns an HTML select menu with the specified genomes as selectable options.  Number of rows to display in the control. The default is C<10>
516    
517    =item id
518    
519    ID to give this control. The default is the value of the C<name> option. Nothing will work correctly
520    unless this ID is unique.
521    
522    =item selected
523    
524    A comma-delimited list of selected genomes, or a reference to a list of selected genomes. The
525    default is none.
526    
527    =item class
528    
529    If specified, a style class to assign to the genome control.
530    
531  =back  =back
532    
533  =cut  =cut
534    
535  sub GeneMenu {  sub GenomeMenu {
536      # Get the parameters.      # Get the parameters.
537      my ($self, $attributes, $filterString, $params, $selected, $fast) = @_;      my ($self, %options) = @_;
538      my $slowMode = ! $fast;      # Get the control's name and ID.
539      # Default to nothing selected. This prevents an execution warning if "$selected"      my $menuName = $options{name} || 'myGenomeControl';
540      # is undefined.      my $menuID = $options{id} || $menuName;
541      $selected = "" unless defined $selected;      # Compute the IDs for the status display.
542      Trace("Gene Menu called with slow mode \"$slowMode\" and selection \"$selected\".") if T(3);      my $divID = "${menuID}_status";
543      # Start the menu.      my $urlID = "${menuID}_url";
544      my $retVal = "<select " .      # Compute the code to show selected genomes in the status area.
545          join(" ", map { "$_=\"$attributes->{$_}\"" } keys %{$attributes}) .      my $showSelect = "showSelected('$menuID', '$divID', '$urlID', 1000)";
546          ">\n";      # Check for single-select or multi-select.
547      # Get the genomes.      my $multiSelect = $options{multiSelect} || 0;
548      my @genomes = $self->GetAll(['Genome'], $filterString, $params, ['Genome(id)',      # Get the style data.
549                                                                       'Genome(genus)',      my $class = $options{class} || '';
550                                                                       'Genome(species)',      # Get the list of pre-selected items.
551                                                                       'Genome(unique-characterization)']);      my $selections = $options{selected} || [];
552      # Sort them by name.      if (ref $selections ne 'ARRAY') {
553      my @sorted = sort { lc("$a->[1] $a->[2]") cmp lc("$b->[1] $b->[2]") } @genomes;          $selections = [ split /\s*,\s*/, $selections ];
554      # Loop through the genomes, creating the option tags.      }
555      for my $genomeData (@sorted) {      my %selected = map { $_ => } @{$selections};
556          # Get the data for this genome.      # Extract the filter information. The default is no filtering. It can be passed as a tab-delimited
557          my ($genomeID, $genus, $species, $strain) = @{$genomeData};      # string or a list reference.
558          # Get the contig count.      my $filterParms = $options{filter} || "";
559          my $contigInfo = "";      if (! ref $filterParms) {
560          if ($slowMode) {          $filterParms = [split /\t|\\t/, $filterParms];
561              my $count = $self->ContigCount($genomeID);      }
562              my $counting = ($count == 1 ? "contig" : "contigs");      my $filterString = shift @{$filterParms};
563              $contigInfo = "[$count $counting]";      # Get a list of all the genomes in group order. In fact, we only need them ordered
564          }      # by name (genus,species,strain), but putting primary-group in front enables us to
565          # Find out if we're selected.      # take advantage of an existing index.
566          my $selectOption = ($selected eq $genomeID ? " selected" : "");      my @genomeList = $self->GetAll(['Genome'], "$filterString ORDER BY Genome(primary-group), Genome(genus), Genome(species), Genome(unique-characterization)",
567          # Build the option tag.                                     $filterParms,
568          $retVal .= "<option value=\"$genomeID\"$selectOption>$genus $species $strain ($genomeID)$contigInfo</option>\n";                                     [qw(Genome(primary-group) Genome(id) Genome(genus) Genome(species) Genome(unique-characterization) Genome(taxonomy) Genome(contigs))]);
569        # Create a hash to organize the genomes by group. Each group will contain a list of
570        # 2-tuples, the first element being the genome ID and the second being the genome
571        # name.
572        my %gHash = ();
573        for my $genome (@genomeList) {
574            # Get the genome data.
575            my ($group, $genomeID, $genus, $species, $strain, $taxonomy, $contigs) = @{$genome};
576            # Compute its name. This is the genus, species, strain (if any), and the contig count.
577            my $name = "$genus $species ";
578            $name .= "$strain " if $strain;
579            my $contigCount = ($contigs == 1 ? "" : ", $contigs contigs");
580            # Now we get the domain. The domain tells us the display style of the organism.
581            my ($domain) = split /\s*;\s*/, $taxonomy, 2;
582            # Now compute the display group. This is normally the primary group, but if the
583            # organism is supporting, we blank it out.
584            my $displayGroup = ($group eq $FIG_Config::otherGroup ? "" : $group);
585            # Push the genome into the group's list. Note that we use the real group
586            # name for the hash key here, not the display group name.
587            push @{$gHash{$group}}, [$genomeID, $name, $contigCount, $domain];
588        }
589        # We are almost ready to unroll the menu out of the group hash. The final step is to separate
590        # the supporting genomes by domain. First, we extract the NMPDR groups and sort them. They
591        # are sorted by the first capitalized word. Groups with "other" are sorted after groups
592        # that aren't "other". At some point, we will want to make this less complicated.
593        my %sortGroups = map { $_ =~ /(other)?(.*)([A-Z].+)/; "$3$1$2" => $_ }
594                             grep { $_ ne $FIG_Config::otherGroup } keys %gHash;
595        my @groups = map { $sortGroups{$_} } sort keys %sortGroups;
596        # Remember the number of NMPDR groups.
597        my $nmpdrGroupCount = scalar @groups;
598        # Loop through the supporting genomes, classifying them by domain. We'll also keep a list
599        # of the domains found.
600        my @otherGenomes = @{$gHash{$FIG_Config::otherGroup}};
601        my @domains = ();
602        for my $genomeData (@otherGenomes) {
603            my ($genomeID, $name, $contigCount, $domain) = @{$genomeData};
604            if (exists $gHash{$domain}) {
605                push @{$gHash{$domain}}, $genomeData;
606            } else {
607                $gHash{$domain} = [$genomeData];
608                push @domains, $domain;
609            }
610        }
611        # Add the domain groups at the end of the main group list. The main group list will now
612        # contain all the categories we need to display the genomes.
613        push @groups, sort @domains;
614        # Delete the supporting group.
615        delete $gHash{$FIG_Config::otherGroup};
616        # Now it gets complicated. We need a way to mark all the NMPDR genomes. We take advantage
617        # of the fact they come first in the list. We'll accumulate a count of the NMPDR genomes
618        # and use that to make the selections.
619        my $nmpdrCount = 0;
620        # Create the type counters.
621        my $groupCount = 1;
622        # Get the number of rows to display.
623        my $rows = $options{size} || 10;
624        # If we're multi-row, create an onChange event.
625        my $onChangeTag = ( $rows > 1 ? " onChange=\"$showSelect;\" onFocus=\"$showSelect;\"" : "" );
626        # Set up the multiple-select flag.
627        my $multipleTag = ($multiSelect ? " multiple" : "" );
628        # Set up the style class.
629        my $classTag = ($class ? " class=\"$class\"" : "" );
630        # Create the SELECT tag and stuff it into the output array.
631        my @lines = ("<SELECT name=\"$menuID\" id=\"$menuID\" $onChangeTag$multipleTag$classTag size=\"$rows\">");
632        # Loop through the groups.
633        for my $group (@groups) {
634            # Get the genomes in the group.
635            for my $genome (@{$gHash{$group}}) {
636                # If this is an NMPDR organism, we add an extra style and count it.
637                my $nmpdrStyle = "";
638                if ($nmpdrGroupCount > 0) {
639                    $nmpdrCount++;
640                    $nmpdrStyle = " Core";
641                }
642                # Get the organism ID, name, contig count, and domain.
643                my ($genomeID, $name, $contigCount, $domain) = @{$genome};
644                # See if we're pre-selected.
645                my $selectTag = ($selected{$genomeID} ? " SELECTED" : "");
646                # Compute the display name.
647                my $nameString = "$name ($genomeID$contigCount)";
648                # Generate the option tag.
649                my $optionTag = "<OPTION class=\"$domain$nmpdrStyle\" title=\"$group\" value=\"$genomeID\"$selectTag>$nameString</OPTION>";
650                push @lines, "    $optionTag";
651            }
652            # Record this group in the nmpdrGroup count. When that gets to 0, we've finished the NMPDR
653            # groups.
654            $nmpdrGroupCount--;
655      }      }
656      # Close the SELECT tag.      # Close the SELECT tag.
657      $retVal .= "</select>\n";      push @lines, "</SELECT>";
658        if ($rows > 1) {
659            # We're in a non-compact mode, so we need to add some selection helpers. First is
660            # the search box. This allows the user to type text and change which genomes are
661            # displayed. For multiple-select mode, we include a button that selects the displayed
662            # genes. For single-select mode, we use a plain label instead.
663            my $searchThingName = "${menuID}_SearchThing";
664            my $searchThingLabel = ($multiSelect ? "<INPUT type=\"button\" name=\"MacroSearch\" class=\"button\" value=\"Select genomes containing\" onClick=\"selectShowing('$menuID', '$searchThingName'); $showSelect;\" />"
665                                                 : "Show genomes containing");
666            push @lines, "<br />$searchThingLabel&nbsp;" .
667                         "<INPUT type=\"text\" id=\"$searchThingName\" name=\"$searchThingName\" size=\"30\" onKeyup=\"showTyped('$menuID', '$searchThingName');\" />";
668            # For multi-select mode, we also have buttons to set and clear selections.
669            if ($multiSelect) {
670                push @lines, "<INPUT type=\"button\" name=\"ClearAll\" class=\"bigButton\"  value=\"Clear All\" onClick=\"clearAll('$menuID'); $showSelect\" />";
671                push @lines, "<INPUT type=\"button\" name=\"SelectAll\" class=\"bigButton\" value=\"Select All\" onClick=\"selectAll('$menuID'); $showSelect\" />";
672                push @lines, "<INPUT type=\"button\" name=\"NMPDROnly\" class=\"bigButton\"  value=\"Select NMPDR\" onClick=\"selectSome('$menuID', $nmpdrCount, true); $showSelect;\" />";
673            }
674            # Add a hidden field we can use to generate organism page hyperlinks.
675            push @lines, "<INPUT type=\"hidden\" id=\"$urlID\" value=\"$FIG_Config::cgi_url/seedviewer.cgi?page=Organism;organism=\" />";
676            # Add the status display. This tells the user what's selected no matter where the list is scrolled.
677            push @lines, "<DIV id=\"$divID\" class=\"Panel\"></DIV>";
678        }
679        # Assemble all the lines into a string.
680        my $retVal = join("\n", @lines, "");
681      # Return the result.      # Return the result.
682      return $retVal;      return $retVal;
683  }  }
684    
685    
686    =head3 Stem
687    
688        my $stem = $sprout->Stem($word);
689    
690    Return the stem of the specified word, or C<undef> if the word is not
691    stemmable. Note that even if the word is stemmable, the stem may be
692    the same as the original word.
693    
694    =over 4
695    
696    =item word
697    
698    Word to convert into a stem.
699    
700    =item RETURN
701    
702    Returns a stem of the word (which may be the word itself), or C<undef> if
703    the word is not stemmable.
704    
705    =back
706    
707    =cut
708    
709    sub Stem {
710        # Get the parameters.
711        my ($self, $word) = @_;
712        # Declare the return variable.
713        my $retVal;
714        # See if it's stemmable.
715        if ($word =~ /^[A-Za-z]+$/) {
716            # Compute the stem.
717            my $stemList = $self->{stemmer}->stem($word);
718            my $stem = $stemList->[0];
719            # Check to see if it's long enough.
720            if (length $stem >= 3) {
721                # Yes, keep it.
722                $retVal = $stem;
723            } else {
724                # No, use the original word.
725                $retVal = $word;
726            }
727        }
728        # Return the result.
729        return $retVal;
730    }
731    
732    
733  =head3 Build  =head3 Build
734    
735      $sprout->Build();      $sprout->Build();
# Line 571  Line 867 
867  =item RETURN  =item RETURN
868    
869  Returns a list of the feature's contig segments. The locations are returned as a list in a list  Returns a list of the feature's contig segments. The locations are returned as a list in a list
870  context and as a comma-delimited string in a scalar context.  context and as a comma-delimited string in a scalar context. An empty list means the feature
871    wasn't found.
872    
873  =back  =back
874    
# Line 580  Line 877 
877  sub FeatureLocation {  sub FeatureLocation {
878      # Get the parameters.      # Get the parameters.
879      my ($self, $featureID) = @_;      my ($self, $featureID) = @_;
880        # Declare the return variable.
881        my @retVal = ();
882      # Get the feature record.      # Get the feature record.
883      my $object = $self->GetEntity('Feature', $featureID);      my $object = $self->GetEntity('Feature', $featureID);
884      Confess("Feature $featureID not found.") if ! defined($object);      # Only proceed if we found it.
885        if (defined $object) {
886      # Get the location string.      # Get the location string.
887      my $locString = $object->PrimaryValue('Feature(location-string)');      my $locString = $object->PrimaryValue('Feature(location-string)');
888      # Create the return list.      # Create the return list.
889      my @retVal = split /\s*,\s*/, $locString;          @retVal = split /\s*,\s*/, $locString;
890        }
891      # Return the list in the format indicated by the context.      # Return the list in the format indicated by the context.
892      return (wantarray ? @retVal : join(',', @retVal));      return (wantarray ? @retVal : join(',', @retVal));
893  }  }
# Line 3621  Line 3922 
3922      my ($self, %groupHash) = @_;      my ($self, %groupHash) = @_;
3923      # Create the result hash.      # Create the result hash.
3924      my %retVal = ();      my %retVal = ();
     # Get the super-group table.  
     my %superTable = $self->CheckGroupFile();  
3925      # Copy over the genomes.      # Copy over the genomes.
3926      for my $groupID (keys %groupHash) {      for my $groupID (keys %groupHash) {
3927          # Get the super-group name.          # Get the super-group name.
3928          my $realGroupID;          my $realGroupID = $self->SuperGroup($groupID);
3929          if ($groupID =~ /([A-Z]\w+)/) {          # Append this group's genomes into the result hash
3930              if (! defined($superTable{$1})) {          # using the super-group name.
                 Confess("Super-group name not found for group $groupID.");  
             } else {  
                 $realGroupID = $1;  
             }  
         } else {  
             Confess("Invalid group name $groupID.");  
         }  
         # Append this group's genomes into the result hash.  
3931          push @{$retVal{$realGroupID}}, @{$groupHash{$groupID}};          push @{$retVal{$realGroupID}}, @{$groupHash{$groupID}};
3932      }      }
3933      # Return the result hash.      # Return the result hash.
# Line 3667  Line 3958 
3958  sub GroupPageName {  sub GroupPageName {
3959      # Get the parameters.      # Get the parameters.
3960      my ($self, $group) = @_;      my ($self, $group) = @_;
     # Declare the return variable.  
     my $retVal;  
3961      # Check for the group file data.      # Check for the group file data.
3962      my %superTable = $self->CheckGroupFile();      my %superTable = $self->CheckGroupFile();
3963      # Compute the real group name.      # Compute the real group name.
3964      if ($group =~ /([A-Z]\w+)/) {      my $realGroup = $self->SuperGroup($group);
3965          my $realGroup = $1;      # Get the associated page name.
3966          if (! defined($superTable{$1})) {      my $retVal = "../content/$superTable{$realGroup}->{page}";
             Confess("No super-group found for \"$group\".");  
         } else {  
             $retVal = "../content/$superTable{$1}->{page}";  
         }  
     } else {  
         Confess("\"group\" is not a valid group name.");  
     }  
3967      # Return the result.      # Return the result.
3968      return $retVal;      return $retVal;
3969  }  }
# Line 3803  Line 4085 
4085      # Get the parameters.      # Get the parameters.
4086      my ($self, $searchExpression) = @_;      my ($self, $searchExpression) = @_;
4087      # Perform the standard cleanup.      # Perform the standard cleanup.
4088      my $retVal = $self->ERDB::CleanKeywords($searchExpression);      my $words = $self->ERDB::CleanKeywords($searchExpression);
4089      # Fix the periods in EC and TC numbers.      # Fix the periods in EC and TC numbers.
4090      $retVal =~ s/(\d+|\-)\.(\d+|-)\.(\d+|-)\.(\d+|-)/$1_$2_$3_$4/g;      $words =~ s/(\d+|\-)\.(\d+|-)\.(\d+|-)\.(\d+|-)/$1_$2_$3_$4/g;
4091      # Fix non-trailing periods.      # Fix non-trailing periods.
4092      $retVal =~ s/\.(\w)/_$1/g;      $words =~ s/\.(\w)/_$1/g;
4093      # Fix non-leading minus signs.      # Fix non-leading minus signs.
4094      $retVal =~ s/(\w)[\-]/$1_/g;      $words =~ s/(\w)[\-]/$1_/g;
4095      # Fix the vertical bars and colons      # Fix the vertical bars and colons
4096      $retVal =~ s/(\w)[|:](\w)/$1'$2/g;      $words =~ s/(\w)[|:](\w)/$1'$2/g;
4097        # Now split up the list so that each keyword is in its own string. We keep the delimiters
4098        # because they may contain boolean expression data.
4099        my @words = split /([^A-Za-z'0-9_]+)/, $words;
4100        # We'll convert the stemmable words into stems and re-assemble the result.
4101        my $retVal = "";
4102        for my $word (@words) {
4103            my $stem = $self->Stem($word);
4104            if (defined $stem) {
4105                $retVal .= $stem;
4106            } else {
4107                $retVal .= $word;
4108            }
4109        }
4110        Trace("Cleaned keyword list for \"$searchExpression\" is \"$retVal\".") if T(3);
4111      # Return the result.      # Return the result.
4112      return $retVal;      return $retVal;
4113  }  }

Legend:
Removed from v.1.105  
changed lines
  Added in v.1.111

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3