[Bio] / Sprout / Sprout.pm Repository:
ViewVC logotype

Diff of /Sprout/Sprout.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.118, Sat Sep 20 14:32:34 2008 UTC revision 1.124, Wed Mar 4 00:09:43 2009 UTC
# Line 4  Line 4 
4      use strict;      use strict;
5      use DBKernel;      use DBKernel;
6      use XML::Simple;      use XML::Simple;
7      use DBQuery;      use ERDBQuery;
8      use ERDBObject;      use ERDBObject;
9      use Tracer;      use Tracer;
10      use FIGRules;      use FIGRules;
# Line 14  Line 14 
14      use BasicLocation;      use BasicLocation;
15      use CustomAttributes;      use CustomAttributes;
16      use RemoteCustomAttributes;      use RemoteCustomAttributes;
17      use CGI;      use CGI qw(-nosticky);
18      use WikiTools;      use WikiTools;
19      use BioWords;      use BioWords;
20      use base qw(ERDB);      use base qw(ERDB);
# Line 41  Line 41 
41    
42  =cut  =cut
43    
 #: Constructor SFXlate->new_sprout_only();  
   
44  =head2 Public Methods  =head2 Public Methods
45    
46  =head3 new  =head3 new
# Line 57  Line 55 
55    
56  =item dbName  =item dbName
57    
58  Name of the database.  Name of the database. If omitted, the default Sprout database name is used.
59    
60  =item options  =item options
61    
# Line 91  Line 89 
89    
90      my $sprout = Sprout->new('Sprout', { userData => 'fig/admin', dataDir => '/usr/fig/SproutData' });      my $sprout = Sprout->new('Sprout', { userData => 'fig/admin', dataDir => '/usr/fig/SproutData' });
91    
92    In order to work properly with [[ERDBGeneratorPl]], the constructor has an alternate
93    form.
94    
95        my $sprout = Sprout->new(dbd => $filename);
96    
97    Where I<$fileName> is the name of the DBD file. This enables us to specify an alternate
98    DBD for the loader, which is important when the database format changes.
99    
100  =cut  =cut
101    
102  sub new {  sub new {
103      # Get the parameters.      # Get the parameters.
104      my ($class, $dbName, $options) = @_;      my ($class, $dbName, $options) = @_;
105        # Check for the alternate signature, and default the database name if it is missing.
106        if ($dbName eq 'dbd') {
107            $dbName = $FIG_Config::sproutDB;
108            $options = { xmlFileName => $options };
109        } elsif (! defined $dbName) {
110            $dbName = $FIG_Config::sproutDB;
111        } elsif (ref $dbName eq 'HASH') {
112            $options = $dbName;
113            $dbName = $FIG_Config::sproutDB;
114        }
115      # Compute the DBD directory.      # Compute the DBD directory.
116      my $dbd_dir = (defined($FIG_Config::dbd_dir) ? $FIG_Config::dbd_dir :      my $dbd_dir = (defined($FIG_Config::dbd_dir) ? $FIG_Config::dbd_dir :
117                                                    $FIG_Config::fig );                                                    $FIG_Config::fig );
# Line 117  Line 133 
133                         maxSegmentLength => 4500,        # maximum feature segment length                         maxSegmentLength => 4500,        # maximum feature segment length
134                         maxSequenceLength => 8000,       # maximum contig sequence length                         maxSequenceLength => 8000,       # maximum contig sequence length
135                         noDBOpen     => 0,               # 1 to suppress the database open                         noDBOpen     => 0,               # 1 to suppress the database open
136                           demandDriven => 0,               # 1 for forward-only queries
137                        }, $options);                        }, $options);
138      # Get the data directory.      # Get the data directory.
139      my $dataDir = $optionTable->{dataDir};      my $dataDir = $optionTable->{dataDir};
# Line 132  Line 149 
149      }      }
150      # Create the ERDB object.      # Create the ERDB object.
151      my $xmlFileName = "$optionTable->{xmlFileName}";      my $xmlFileName = "$optionTable->{xmlFileName}";
152      my $retVal = ERDB::new($class, $dbh, $xmlFileName);      my $retVal = ERDB::new($class, $dbh, $xmlFileName, %$optionTable);
153      # Add the option table and XML file name.      # Add the option table and XML file name.
154      $retVal->{_options} = $optionTable;      $retVal->{_options} = $optionTable;
155      $retVal->{_xmlName} = $xmlFileName;      $retVal->{_xmlName} = $xmlFileName;
156      # Set up space for the group file data.      # Set up space for the group file data.
157      $retVal->{groupHash} = undef;      $retVal->{groupHash} = undef;
158      # Set up space for the genome hash. We use this to identify NMPDR genomes.      # Set up space for the genome hash. We use this to identify NMPDR genomes
159      $retVal->{genomeHash} = undef;      # and remember genome data.
160      # Connect to the attributes.      $retVal->{genomeHash} = {};
161        $retVal->{genomeHashFilled} = 0;
162        # Remember the data directory name.
163        $retVal->{dataDir} = $dataDir;
164        # Return it.
165        return $retVal;
166    }
167    
168    =head3 ca
169    
170        my $ca = $sprout->ca():;
171    
172    Return the [[CustomAttributesPm]] object for retrieving object
173    properties.
174    
175    =cut
176    
177    sub ca {
178        # Get the parameters.
179        my ($self) = @_;
180        # Do we already have an attribute object?
181        my $retVal = $self->{_ca};
182        if (! defined $retVal) {
183            # No, create one. How we do it depends on the configuration.
184      if ($FIG_Config::attrURL) {      if ($FIG_Config::attrURL) {
185          Trace("Remote attribute server $FIG_Config::attrURL chosen.") if T(3);          Trace("Remote attribute server $FIG_Config::attrURL chosen.") if T(3);
186          $retVal->{_ca} = RemoteCustomAttributes->new($FIG_Config::attrURL);              $retVal = RemoteCustomAttributes->new($FIG_Config::attrURL);
187      } elsif ($FIG_Config::attrDbName) {      } elsif ($FIG_Config::attrDbName) {
188          Trace("Local attribute database $FIG_Config::attrDbName chosen.") if T(3);          Trace("Local attribute database $FIG_Config::attrDbName chosen.") if T(3);
189          my $user = ($FIG_Config::arch eq 'win' ? 'self' : scalar(getpwent()));          my $user = ($FIG_Config::arch eq 'win' ? 'self' : scalar(getpwent()));
190          $retVal->{_ca} = CustomAttributes->new(user => $user);              $retVal = CustomAttributes->new(user => $user);
191      }      }
192      # Return it.          # Save it for next time.
193            $self->{_ca} = $retVal;
194        }
195        # Return the result.
196      return $retVal;      return $retVal;
197  }  }
198    
# Line 311  Line 354 
354  The files are loaded based on the presumption that each line of the file is a record in the  The files are loaded based on the presumption that each line of the file is a record in the
355  relation, and the individual fields are delimited by tabs. Tab and new-line characters inside  relation, and the individual fields are delimited by tabs. Tab and new-line characters inside
356  fields must be represented by the escape sequences C<\t> and C<\n>, respectively. The fields must  fields must be represented by the escape sequences C<\t> and C<\n>, respectively. The fields must
357  be presented in the order given in the relation tables produced by the L</ShowMetaData> method.  be presented in the order given in the relation tables produced by the database documentation.
358    
359  =over 4  =over 4
360    
# Line 500  Line 543 
543  =item filter  =item filter
544    
545  If specified, a filter for the list of genomes to display. The filter should be in the form of a  If specified, a filter for the list of genomes to display. The filter should be in the form of a
546  list reference. The first element of the list should be the filter string, and the remaining elements  list reference, a string, or a hash reference. If it is a list reference, the first element
547  the filter parameters.  of the list should be the filter string, and the remaining elements the filter parameters. If it is a
548    string, it will be split into a list at each included tab. If it is a hash reference, it should be
549    a hash that maps genomes which should be included to a TRUE value.
550    
551  =item multiSelect  =item multiSelect
552    
# Line 540  Line 585 
585      my $divID = "${menuID}_status";      my $divID = "${menuID}_status";
586      my $urlID = "${menuID}_url";      my $urlID = "${menuID}_url";
587      # Compute the code to show selected genomes in the status area.      # Compute the code to show selected genomes in the status area.
588      my $showSelect = "showSelected('$menuID', '$divID', '$urlID', 1000)";      my $showSelect = "showSelected('$menuID', '$divID', '$urlID', $FIG_Config::genome_control_cap)";
589      # Check for single-select or multi-select.      # Check for single-select or multi-select.
590      my $multiSelect = $options{multiSelect} || 0;      my $multiSelect = $options{multiSelect} || 0;
591      # Get the style data.      # Get the style data.
# Line 552  Line 597 
597      }      }
598      my %selected = map { $_ => 1 } @{$selections};      my %selected = map { $_ => 1 } @{$selections};
599      # Extract the filter information. The default is no filtering. It can be passed as a tab-delimited      # Extract the filter information. The default is no filtering. It can be passed as a tab-delimited
600      # string or a list reference.      # string, a hash reference, or a list reference.
601        my ($filterHash, $filterString);
602      my $filterParms = $options{filter} || "";      my $filterParms = $options{filter} || "";
603        if (ref $filterParms eq 'HASH') {
604            $filterHash = $filterParms;
605            $filterParms = [];
606            $filterString = "";
607        } else {
608      if (! ref $filterParms) {      if (! ref $filterParms) {
609          $filterParms = [split /\t|\\t/, $filterParms];          $filterParms = [split /\t|\\t/, $filterParms];
610      }      }
611      my $filterString = shift @{$filterParms};          $filterString = shift @{$filterParms};
612        }
613        # Check for possible subsystem filtering. If there is one, we will tack the
614        # relationship onto the object name list.
615        my @objectNames = qw(Genome);
616        if ($filterString =~ /ParticipatesIn\(/) {
617            push @objectNames, 'ParticipatesIn';
618        }
619      # Get a list of all the genomes in group order. In fact, we only need them ordered      # Get a list of all the genomes in group order. In fact, we only need them ordered
620      # by name (genus,species,strain), but putting primary-group in front enables us to      # by name (genus,species,strain), but putting primary-group in front enables us to
621      # take advantage of an existing index.      # take advantage of an existing index.
622      my @genomeList = $self->GetAll(['Genome'], "$filterString ORDER BY Genome(primary-group), Genome(genus), Genome(species), Genome(unique-characterization)",      my @genomeList = $self->GetAll(\@objectNames, "$filterString ORDER BY Genome(primary-group), Genome(genus), Genome(species), Genome(unique-characterization)",
623                                     $filterParms,                                     $filterParms,
624                                     [qw(Genome(primary-group) Genome(id) Genome(genus) Genome(species) Genome(unique-characterization) Genome(taxonomy) Genome(contigs))]);                                     [qw(Genome(primary-group) Genome(id) Genome(genus) Genome(species) Genome(unique-characterization) Genome(taxonomy) Genome(contigs))]);
625        # Apply the hash filter (if any).
626        if (defined $filterHash) {
627            @genomeList = grep { $filterHash->{$_->[1]} } @genomeList;
628        }
629      # Create a hash to organize the genomes by group. Each group will contain a list of      # Create a hash to organize the genomes by group. Each group will contain a list of
630      # 2-tuples, the first element being the genome ID and the second being the genome      # 2-tuples, the first element being the genome ID and the second being the genome
631      # name.      # name.
# Line 593  Line 655 
655      my @groups = map { $sortGroups{$_} } sort keys %sortGroups;      my @groups = map { $sortGroups{$_} } sort keys %sortGroups;
656      # Remember the number of NMPDR groups.      # Remember the number of NMPDR groups.
657      my $nmpdrGroupCount = scalar @groups;      my $nmpdrGroupCount = scalar @groups;
658        # Are there any supporting genomes?
659        if (exists $gHash{$FIG_Config::otherGroup}) {
660      # Loop through the supporting genomes, classifying them by domain. We'll also keep a list      # Loop through the supporting genomes, classifying them by domain. We'll also keep a list
661      # of the domains found.      # of the domains found.
662      my @otherGenomes = @{$gHash{$FIG_Config::otherGroup}};      my @otherGenomes = @{$gHash{$FIG_Config::otherGroup}};
# Line 611  Line 675 
675      push @groups, sort @domains;      push @groups, sort @domains;
676      # Delete the supporting group.      # Delete the supporting group.
677      delete $gHash{$FIG_Config::otherGroup};      delete $gHash{$FIG_Config::otherGroup};
678        }
679      # Now it gets complicated. We need a way to mark all the NMPDR genomes. We take advantage      # Now it gets complicated. We need a way to mark all the NMPDR genomes. We take advantage
680      # of the fact they come first in the list. We'll accumulate a count of the NMPDR genomes      # of the fact they come first in the list. We'll accumulate a count of the NMPDR genomes
681      # and use that to make the selections.      # and use that to make the selections.
# Line 624  Line 689 
689      # Set up the multiple-select flag.      # Set up the multiple-select flag.
690      my $multipleTag = ($multiSelect ? " multiple" : "" );      my $multipleTag = ($multiSelect ? " multiple" : "" );
691      # Set up the style class.      # Set up the style class.
692      my $classTag = ($class ? " class=\"$class\"" : "" );      my $classTag = ($class ? " $class" : "" );
693      # Create the SELECT tag and stuff it into the output array.      # Create the SELECT tag and stuff it into the output array.
694      my @lines = ("<SELECT name=\"$menuName\" id=\"$menuID\" $onChangeTag$multipleTag$classTag size=\"$rows\">");      my @lines = qq(<SELECT name="$menuName" id="$menuID" class="genomeSelect $class" $onChangeTag$multipleTag$classTag size="$rows">);
695      # Loop through the groups.      # Loop through the groups.
696      for my $group (@groups) {      for my $group (@groups) {
697          # Get the genomes in the group.          # Get the genomes in the group.
# Line 659  Line 724 
724          # displayed. For multiple-select mode, we include a button that selects the displayed          # displayed. For multiple-select mode, we include a button that selects the displayed
725          # genes. For single-select mode, we use a plain label instead.          # genes. For single-select mode, we use a plain label instead.
726          my $searchThingName = "${menuID}_SearchThing";          my $searchThingName = "${menuID}_SearchThing";
727          my $searchThingLabel = ($multiSelect ? "<INPUT type=\"button\" name=\"MacroSearch\" class=\"button\" value=\"Select genomes containing\" onClick=\"selectShowing('$menuID', '$searchThingName'); $showSelect;\" />"          my $searchThingLabel = "Type to narrow selection";
728                                               : "Show genomes containing");          my $searchThingButton = "";
729            my $goHint = "";
730            if ($multiSelect) {
731                $searchThingButton = qq(<INPUT type="button" name="MacroSearch" class="button" value="Go" onClick="selectShowing('$menuID', '$searchThingName'); $showSelect;" />);
732                $goHint = " Click <strong>Go</strong> to select them.";
733            }
734          push @lines, "<br />$searchThingLabel&nbsp;" .          push @lines, "<br />$searchThingLabel&nbsp;" .
735                       "<INPUT type=\"text\" id=\"$searchThingName\" name=\"$searchThingName\" size=\"30\" onKeyup=\"showTyped('$menuID', '$searchThingName');\" />" .                       qq(<INPUT type="text" id="$searchThingName" name="$searchThingName" class="genomeSearchThing" onKeyup="showTyped('$menuID', '$searchThingName');" />) .
736                       Hint("GenomeControl", "Type here to filter the genomes displayed.") . "<br />";                       $searchThingButton .
737                         Hint("GenomeControl", "Type a genome ID or part of an organism name to filter the genomes displayed.$goHint") . "<br />";
738          # For multi-select mode, we also have buttons to set and clear selections.          # For multi-select mode, we also have buttons to set and clear selections.
739          if ($multiSelect) {          if ($multiSelect) {
740              push @lines, "<INPUT type=\"button\" name=\"ClearAll\" class=\"bigButton\"  value=\"Clear All\" onClick=\"clearAll('$menuID'); $showSelect\" />";              push @lines, qq(<INPUT type="button" name="ClearAll" class="bigButton genomeButton" value="Clear All" onClick="clearAll(getElementById('$menuID')); $showSelect" />);
741              push @lines, "<INPUT type=\"button\" name=\"SelectAll\" class=\"bigButton\" value=\"Select All\" onClick=\"selectAll('$menuID'); $showSelect\" />";              push @lines, qq(<INPUT type="button" name="SelectAll" class="bigButton genomeButton" value="Select All" onClick="selectAll(getElementById('$menuID')); $showSelect" />);
742              push @lines, "<INPUT type=\"button\" name=\"NMPDROnly\" class=\"bigButton\"  value=\"Select NMPDR\" onClick=\"selectSome('$menuID', $nmpdrCount, true); $showSelect;\" />";              push @lines, qq(<INPUT type="button" name="NMPDROnly" class="bigButton genomeButton" value="Select NMPDR" onClick="selectSome(getElementById('$menuID'), $nmpdrCount, true); $showSelect;" />);
743          }          }
744          # Add a hidden field we can use to generate organism page hyperlinks.          # Add a hidden field we can use to generate organism page hyperlinks.
745          push @lines, "<INPUT type=\"hidden\" id=\"$urlID\" value=\"$FIG_Config::cgi_url/wiki/rest.cgi/NmpdrPlugin/SeedViewer?page=Organism;organism=\" />";          push @lines, qq(<INPUT type="hidden" id="$urlID" value="$FIG_Config::cgi_url/wiki/rest.cgi/NmpdrPlugin/SeedViewer?page=Organism;organism=" />);
746          # Add the status display. This tells the user what's selected no matter where the list is scrolled.          # Add the status display. This tells the user what's selected no matter where the list is scrolled.
747          push @lines, "<DIV id=\"$divID\" class=\"Panel\"></DIV>";          push @lines, qq(<DIV id="$divID" class="Panel"></DIV>);
748      }      }
749      # Assemble all the lines into a string.      # Assemble all the lines into a string.
750      my $retVal = join("\n", @lines, "");      my $retVal = join("\n", @lines, "");
# Line 714  Line 785 
785          # We don't have one pre-built, so we build and save it now.          # We don't have one pre-built, so we build and save it now.
786          $stemmer = BioWords->new(exceptions => "$FIG_Config::sproutData/Exceptions.txt",          $stemmer = BioWords->new(exceptions => "$FIG_Config::sproutData/Exceptions.txt",
787                                   stops => "$FIG_Config::sproutData/StopWords.txt",                                   stops => "$FIG_Config::sproutData/StopWords.txt",
788                                   cache => 1);                                   cache => 0);
789          $self->{stemmer} = $stemmer;          $self->{stemmer} = $stemmer;
790      }      }
791      # Try to stem the word.      # Try to stem the word.
# Line 782  Line 853 
853  sub GenusSpecies {  sub GenusSpecies {
854      # Get the parameters.      # Get the parameters.
855      my ($self, $genomeID) = @_;      my ($self, $genomeID) = @_;
856      # Get the data for the specified genome.      # Declare the return value.
857      my @values = $self->GetEntityValues('Genome', $genomeID, ['Genome(genus)', 'Genome(species)',      my $retVal;
858                                                                'Genome(unique-characterization)']);      # Get the genome data.
859      # Format the result and return it.      my $genomeData = $self->_GenomeData($genomeID);
860      my $retVal = join(' ', @values);      # Only proceed if we found the genome.
861        if (defined $genomeData) {
862            $retVal = $genomeData->PrimaryValue('Genome(scientific-name)');
863        }
864        # Return it.
865      return $retVal;      return $retVal;
866  }  }
867    
# Line 1124  Line 1199 
1199      my ($self, $genomeID) = @_;      my ($self, $genomeID) = @_;
1200      # Declare the return variable.      # Declare the return variable.
1201      my $retVal = 0;      my $retVal = 0;
1202      # Get the genome's contig sequence lengths.      # Get the genome data.
1203      my @lens = $self->GetFlat(['HasContig', 'IsMadeUpOf'], 'HasContig(from-link) = ?',      my $genomeData = $self->_GenomeData($genomeID);
1204                         [$genomeID], 'IsMadeUpOf(len)');      # Only proceed if it exists.
1205      # Sum the lengths.      if (defined $genomeData) {
1206      map { $retVal += $_ } @lens;          $retVal = $genomeData->PrimaryValue('Genome(dna-size)');
1207        }
1208      # Return the result.      # Return the result.
1209      return $retVal;      return $retVal;
1210  }  }
# Line 1795  Line 1871 
1871      # Loop through the incoming features.      # Loop through the incoming features.
1872      for my $featureID (@{$featureList}) {      for my $featureID (@{$featureList}) {
1873          # Ask the server for the feature's best hit.          # Ask the server for the feature's best hit.
1874          my @bbhData = FIGRules::BBHData($featureID);          my $bbhData = FIGRules::BBHData($featureID);
1875          # Peel off the BBHs found.          # Peel off the BBHs found.
1876          my @found = ();          my @found = ();
1877          for my $bbh (@bbhData) {          for my $bbh (@$bbhData) {
1878              my $fid = $bbh->[0];              my $fid = $bbh->[0];
1879              my $bbGenome = $self->GenomeOf($fid);              my $bbGenome = $self->GenomeOf($fid);
1880              if ($bbGenome eq $genomeID) {              if ($bbGenome eq $genomeID) {
# Line 1837  Line 1913 
1913      # Get the parameters.      # Get the parameters.
1914      my ($self, $featureID, $count) = @_;      my ($self, $featureID, $count) = @_;
1915      # Ask for the best hits.      # Ask for the best hits.
1916      my @lists = FIGRules::BBHData($featureID);      my $lists = FIGRules::BBHData($featureID);
1917      # Create the return value.      # Create the return value.
1918      my %retVal = ();      my %retVal = ();
1919      for my $tuple (@lists) {      for my $tuple (@$lists) {
1920          $retVal{$tuple->[0]} = $tuple->[1];          $retVal{$tuple->[0]} = $tuple->[1];
1921      }      }
1922      # Return the result.      # Return the result.
# Line 1874  Line 1950 
1950      # Declare the return variable.      # Declare the return variable.
1951      my $retVal;      my $retVal;
1952      # Get the genome's data.      # Get the genome's data.
1953      my $genomeData = $self->GetEntity('Genome', $genomeID);      my $genomeData = $self->_GenomeData($genomeID);
1954      if ($genomeData) {      # Only proceed if it exists.
1955        if (defined $genomeData) {
1956          # The genome exists, so get the completeness flag.          # The genome exists, so get the completeness flag.
1957          $retVal = $genomeData->PrimaryValue('Genome(complete)');          $retVal = $genomeData->PrimaryValue('Genome(complete)');
1958      }      }
# Line 1991  Line 2068 
2068              $retVal{$featureID2} = $score;              $retVal{$featureID2} = $score;
2069          }          }
2070      }      }
     # Functional coupling is reflexive. If we found at least one coupled feature, we must add  
     # the incoming feature as well.  
     if (keys %retVal) {  
         $retVal{$featureID} = 9999;  
     }  
2071      # Return the hash.      # Return the hash.
2072      return %retVal;      return %retVal;
2073  }  }
# Line 2352  Line 2424 
2424  sub GetGenomeNameData {  sub GetGenomeNameData {
2425      # Get the parameters.      # Get the parameters.
2426      my ($self, $genomeID) = @_;      my ($self, $genomeID) = @_;
2427        # Declare the return variables.
2428        my ($genus, $species, $strain);
2429        # Get the genome's data.
2430        my $genomeData = $self->_GenomeData($genomeID);
2431        # Only proceed if the genome exists.
2432        if (defined $genomeData) {
2433      # Get the desired values.      # Get the desired values.
2434      my ($genus, $species, $strain) = $self->GetEntityValues('Genome', $genomeID =>          ($genus, $species, $strain) = $genomeData->Values(['Genome(genus)',
2435                                                              [qw(Genome(genus) Genome(species) Genome(unique-characterization))]);                                                             'Genome(species)',
2436      # Throw an error if they were not found.                                                             'Genome(unique-characterization)']);
2437      if (! defined $genus) {      } else {
2438            # Throw an error because they were not found.
2439          Confess("Genome $genomeID not found in database.");          Confess("Genome $genomeID not found in database.");
2440      }      }
2441      # Return the results.      # Return the results.
# Line 2657  Line 2736 
2736  sub Taxonomy {  sub Taxonomy {
2737      # Get the parameters.      # Get the parameters.
2738      my ($self, $genome) = @_;      my ($self, $genome) = @_;
     # Find the specified genome's taxonomy string.  
     my ($list) = $self->GetEntityValues('Genome', $genome, ['Genome(taxonomy)']);  
2739      # Declare the return variable.      # Declare the return variable.
2740      my @retVal = ();      my @retVal = ();
2741      # If we found the genome, return its taxonomy string.      # Get the genome data.
2742      if ($list) {      my $genomeData = $self->_GenomeData($genome);
2743          @retVal = split /\s*;\s*/, $list;      # Only proceed if it exists.
2744        if (defined $genomeData) {
2745            # Create the taxonomy from the taxonomy string.
2746            @retVal = split /\s*;\s*/, $genomeData->PrimaryValue('Genome(taxonomy)');
2747      } else {      } else {
2748            # Genome doesn't exist, so emit a warning.
2749          Trace("Genome \"$genome\" does not have a taxonomy in the database.\n") if T(0);          Trace("Genome \"$genome\" does not have a taxonomy in the database.\n") if T(0);
2750      }      }
2751      # Return the value found.      # Return the value found.
# Line 2709  Line 2790 
2790      }      }
2791      my @taxA = $self->Taxonomy($genomeA);      my @taxA = $self->Taxonomy($genomeA);
2792      my @taxB = $self->Taxonomy($genomeB);      my @taxB = $self->Taxonomy($genomeB);
2793      # Initialize the distance to 1. We'll reduce it each time we find a match between the      # Compute the distance.
2794      # taxonomies.      my $retVal = FIGRules::CrudeDistanceFormula(\@taxA, \@taxB);
     my $retVal = 1.0;  
     # Initialize the subtraction amount. This amount determines the distance reduction caused  
     # by a mismatch at the current level.  
     my $v = 0.5;  
     # Loop through the taxonomies.  
     for (my $i = 0; ($i < @taxA) && ($i < @taxB) && ($taxA[$i] eq $taxB[$i]); $i++) {  
         $retVal -= $v;  
         $v /= 2;  
     }  
2795      return $retVal;      return $retVal;
2796  }  }
2797    
# Line 2815  Line 2887 
2887      # Get the parameters.      # Get the parameters.
2888      my ($self, $featureID) = @_;      my ($self, $featureID) = @_;
2889      # Get the properties.      # Get the properties.
2890      my @attributes = $self->{_ca}->GetAttributes($featureID);      my @attributes = $self->ca->GetAttributes($featureID);
2891      # Strip the feature ID off each tuple.      # Strip the feature ID off each tuple.
2892      my @retVal = ();      my @retVal = ();
2893      for my $attributeRow (@attributes) {      for my $attributeRow (@attributes) {
# Line 3184  Line 3256 
3256      # Get the parameters.      # Get the parameters.
3257      my ($self, $featureID, $function, $userID) = @_;      my ($self, $featureID, $function, $userID) = @_;
3258      # Get a list of the features that are BBHs of the incoming feature.      # Get a list of the features that are BBHs of the incoming feature.
3259      my @bbhFeatures = map { $_->[0] } FIGRules::BBHData($featureID);      my $bbhData = FIGRules::BBHData($featureID);
3260        my @bbhFeatures = map { $_->[0] } @$bbhData;
3261      # Now we loop through the features, pulling out the ones that have the correct      # Now we loop through the features, pulling out the ones that have the correct
3262      # functional assignment.      # functional assignment.
3263      my @retVal = ();      my @retVal = ();
# Line 3357  Line 3430 
3430    
3431  =head3 BBHMatrix  =head3 BBHMatrix
3432    
3433      my %bbhMap = $sprout->BBHMatrix($genomeID, $cutoff, @targets);      my $bbhMap = $sprout->BBHMatrix($genomeID, $cutoff, @targets);
3434    
3435  Find all the bidirectional best hits for the features of a genome in a  Find all the bidirectional best hits for the features of a genome in a
3436  specified list of target genomes. The return value will be a hash mapping  specified list of target genomes. The return value will be a hash mapping
# Line 3381  Line 3454 
3454    
3455  =item RETURN  =item RETURN
3456    
3457  Returns a hash mapping each feature in the original genome to a hash mapping its  Returns a reference to a hash mapping each feature in the original genome
3458  BBH pegs in the target genomes to their scores.  to a sub-hash mapping its BBH pegs in the target genomes to their scores.
3459    
3460  =back  =back
3461    
# Line 3395  Line 3468 
3468      my %retVal = ();      my %retVal = ();
3469      # Ask for the BBHs.      # Ask for the BBHs.
3470      my @bbhList = FIGRules::BatchBBHs("fig|$genomeID.%", $cutoff, @targets);      my @bbhList = FIGRules::BatchBBHs("fig|$genomeID.%", $cutoff, @targets);
3471        Trace("Retrieved " . scalar(@bbhList) . " BBH results.") if T(3);
3472      # We now have a set of 4-tuples that we need to convert into a hash of hashes.      # We now have a set of 4-tuples that we need to convert into a hash of hashes.
3473      for my $bbhData (@bbhList) {      for my $bbhData (@bbhList) {
3474          my ($peg1, $peg2, $score) = @{$bbhData};          my ($peg1, $peg2, $score) = @{$bbhData};
# Line 3405  Line 3479 
3479          }          }
3480      }      }
3481      # Return the result.      # Return the result.
3482      return %retVal;      return \%retVal;
3483  }  }
3484    
3485    
# Line 3511  Line 3585 
3585      # Create the return hash.      # Create the return hash.
3586      my %retVal = ();      my %retVal = ();
3587      # Query for the desired BBHs.      # Query for the desired BBHs.
3588      my @bbhList = FIGRules::BBHData($featureID, $cutoff);      my $bbhList = FIGRules::BBHData($featureID, $cutoff);
3589      # Form the results into the return hash.      # Form the results into the return hash.
3590      for my $pair (@bbhList) {      for my $pair (@$bbhList) {
3591          my $fid = $pair->[0];          my $fid = $pair->[0];
3592          if ($self->Exists('Feature', $fid)) {          if ($self->Exists('Feature', $fid)) {
3593              $retVal{$fid} = $pair->[1];              $retVal{$fid} = $pair->[1];
# Line 3891  Line 3965 
3965      # Get the parameters.      # Get the parameters.
3966      my ($self, $featureID, $key, @values) = @_;      my ($self, $featureID, $key, @values) = @_;
3967      # Add the property using the attached attributes object.      # Add the property using the attached attributes object.
3968      $self->{_ca}->AddAttribute($featureID, $key, @values);      $self->ca->AddAttribute($featureID, $key, @values);
3969  }  }
3970    
3971  =head3 CheckGroupFile  =head3 CheckGroupFile
# Line 3978  Line 4052 
4052  sub CleanKeywords {  sub CleanKeywords {
4053      # Get the parameters.      # Get the parameters.
4054      my ($self, $searchExpression) = @_;      my ($self, $searchExpression) = @_;
4055      # Perform the standard cleanup.      # Get the stemmer.
4056      my $words = $self->ERDB::CleanKeywords($searchExpression);      my $stemmer = $self->GetStemmer();
4057      # Fix the periods in EC and TC numbers.      # Convert the search expression using the stemmer.
4058      $words =~ s/(\d+|\-)\.(\d+|-)\.(\d+|-)\.(\d+|-)/$1_$2_$3_$4/g;      my $retVal = $stemmer->PrepareSearchExpression($searchExpression);
     # Fix non-trailing periods.  
     $words =~ s/\.(\w)/_$1/g;  
     # Fix non-leading minus signs.  
     $words =~ s/(\w)[\-]/$1_/g;  
     # Fix the vertical bars and colons  
     $words =~ s/(\w)[|:](\w)/$1'$2/g;  
     # Now split up the list so that each keyword is in its own string. We keep the delimiters  
     # because they may contain boolean expression data.  
     my @words = split /([^A-Za-z'0-9_]+)/, $words;  
     # We'll convert the stemmable words into stems and re-assemble the result.  
     my $retVal = "";  
     for my $word (@words) {  
         my $stem = $self->Stem($word);  
         if (defined $stem) {  
             $retVal .= $stem;  
         } else {  
             $retVal .= $word;  
         }  
     }  
4059      Trace("Cleaned keyword list for \"$searchExpression\" is \"$retVal\".") if T(3);      Trace("Cleaned keyword list for \"$searchExpression\" is \"$retVal\".") if T(3);
4060      # Return the result.      # Return the result.
4061      return $retVal;      return $retVal;
4062  }  }
4063    
4064    =head3 GetSourceObject
4065    
4066        my $source = $erdb->GetSourceObject();
4067    
4068    Return the object to be used in creating load files for this database.
4069    
4070    =cut
4071    
4072    sub GetSourceObject {
4073        # Get the parameters.
4074        my ($self) = @_;
4075        # Check to see if we already have a source object.
4076        my $retVal = $self->{_fig};
4077        if (! defined $retVal) {
4078            # No, so create one.
4079            require FIG;
4080            $retVal = FIG->new();
4081        }
4082        # Return the object.
4083        return $retVal;
4084    }
4085    
4086    =head3 SectionList
4087    
4088        my @sections = $erdb->SectionList();
4089    
4090    Return a list of the names for the different data sections used when loading this database.
4091    The default is a single string, in which case there is only one section representing the
4092    entire database.
4093    
4094    =cut
4095    
4096    sub SectionList {
4097        # Get the parameters.
4098        my ($self, $source) = @_;
4099        # Ask the BaseSproutLoader for a section list.
4100        require BaseSproutLoader;
4101        my @retVal = BaseSproutLoader::GetSectionList($self);
4102        # Return the list.
4103        return @retVal;
4104    }
4105    
4106    =head3 Loader
4107    
4108        my $groupLoader = $erdb->Loader($groupName, $options);
4109    
4110    Return an [[ERDBLoadGroupPm]] object for the specified load group. This method is used
4111    by [[ERDBGeneratorPl]] to create the load group objects. If you are not using
4112    [[ERDBGeneratorPl]], you don't need to override this method.
4113    
4114    =over 4
4115    
4116    =item groupName
4117    
4118    Name of the load group whose object is to be returned. The group name is
4119    guaranteed to be a single word with only the first letter capitalized.
4120    
4121    =item options
4122    
4123    Reference to a hash of command-line options.
4124    
4125    =item RETURN
4126    
4127    Returns an [[ERDBLoadGroupPm]] object that can be used to process the specified load group
4128    for this database.
4129    
4130    =back
4131    
4132    =cut
4133    
4134    sub Loader {
4135        # Get the parameters.
4136        my ($self, $groupName, $options) = @_;
4137        # Compute the loader name.
4138        my $loaderClass = "${groupName}SproutLoader";
4139        # Pull in its definition.
4140        require "$loaderClass.pm";
4141        # Create an object for it.
4142        my $retVal = eval("$loaderClass->new(\$self, \$options)");
4143        # Insure it worked.
4144        Confess("Could not create $loaderClass object: $@") if $@;
4145        # Return it to the caller.
4146        return $retVal;
4147    }
4148    
4149    
4150    =head3 LoadGroupList
4151    
4152        my @groups = $erdb->LoadGroupList();
4153    
4154    Returns a list of the names for this database's load groups. This method is used
4155    by [[ERDBGeneratorPl]] when the user wishes to load all table groups. The default
4156    is a single group called 'All' that loads everything.
4157    
4158    =cut
4159    
4160    sub LoadGroupList {
4161        # Return the list.
4162        return qw(Genome Subsystem Annotation Property Source Reaction Synonym Feature Drug);
4163    }
4164    
4165    =head3 LoadDirectory
4166    
4167        my $dirName = $erdb->LoadDirectory();
4168    
4169    Return the name of the directory in which load files are kept. The default is
4170    the FIG temporary directory, which is a really bad choice, but it's always there.
4171    
4172    =cut
4173    
4174    sub LoadDirectory {
4175        # Get the parameters.
4176        my ($self) = @_;
4177        # Return the directory name.
4178        return $self->{dataDir};
4179    }
4180    
4181  =head2 Internal Utility Methods  =head2 Internal Utility Methods
4182    
4183    =head3 GetStemmer
4184    
4185        my $stermmer = $sprout->GetStemmer();
4186    
4187    Return the stemmer object for this database.
4188    
4189    =cut
4190    
4191    sub GetStemmer {
4192        # Get the parameters.
4193        my ($self) = @_;
4194        # Declare the return variable.
4195        my $retVal = $self->{stemmer};
4196        if (! defined $retVal) {
4197            # We don't have one pre-built, so we build and save it now.
4198            $retVal = BioWords->new(exceptions => "$FIG_Config::sproutData/Exceptions.txt",
4199                                     stops => "$FIG_Config::sproutData/StopWords.txt",
4200                                     cache => 0);
4201            $self->{stemmer} = $retVal;
4202        }
4203        # Return the result.
4204        return $retVal;
4205    }
4206    
4207  =head3 ParseAssignment  =head3 ParseAssignment
4208    
4209  Parse annotation text to determine whether or not it is a functional assignment. If it is,  Parse annotation text to determine whether or not it is a functional assignment. If it is,
# Line 4094  Line 4290 
4290      # Get the parameters.      # Get the parameters.
4291      my ($self, $fid) = @_;      my ($self, $fid) = @_;
4292      # Insure we have a genome hash.      # Insure we have a genome hash.
4293      if (! defined $self->{genomeHash}) {      my $genomes = $self->_GenomeHash();
         my %genomeHash = map { $_ => 1 } $self->GetFlat(['Genome'], "", [], 'Genome(id)');  
         $self->{genomeHash} = \%genomeHash;  
     }  
4294      # Get the feature's genome ID.      # Get the feature's genome ID.
4295      my ($genomeID) = FIGRules::ParseFeatureID($fid);      my ($genomeID) = FIGRules::ParseFeatureID($fid);
4296      # Return an indicator of whether or not the genome ID is in the hash.      # Return an indicator of whether or not the genome ID is in the hash.
# Line 4174  Line 4367 
4367      return $retVal;      return $retVal;
4368  }  }
4369    
4370    =head3 _GenomeHash
4371    
4372        my $gHash = $sprout->_GenomeHash();
4373    
4374    Return a hash mapping all NMPDR genome IDs to [[ERDBObjectPm]] genome objects.
4375    
4376    =cut
4377    
4378    sub _GenomeHash {
4379        # Get the parameters.
4380        my ($self) = @_;
4381        # Do we already have a filled hash?
4382        if (! $self->{genomeHashFilled}) {
4383            # No, create it.
4384            my %gHash = map { $_->PrimaryValue('id') => $_ } $self->GetList("Genome", "", []);
4385            $self->{genomeHash} = \%gHash;
4386            # Denote we have it.
4387            $self->{genomeHashFilled} = 1;
4388        }
4389        # Return the hash.
4390        return $self->{genomeHash};
4391    }
4392    
4393    =head3 _GenomeData
4394    
4395        my $genomeData = $sprout->_GenomeData($genomeID);
4396    
4397    Return an [[ERDBObjectPm]] object for the specified genome, or an undefined
4398    value if the genome does not exist.
4399    
4400    =over 4
4401    
4402    =item genomeID
4403    
4404    ID of the desired genome.
4405    
4406    =item RETURN
4407    
4408    Returns either an [[ERDBObjectPm]] containing the genome, or an undefined value.
4409    If the genome exists, it will have been read into the genome cache.
4410    
4411    =back
4412    
4413    =cut
4414    
4415    sub _GenomeData {
4416        # Get the parameters.
4417        my ($self, $genomeID) = @_;
4418        # Are we in the genome hash?
4419        if (! exists $self->{genomeHash}->{$genomeID} && ! $self->{genomeHashFilled}) {
4420            # The genome isn't in the hash, and the hash is not complete, so we try to
4421            # read it.
4422            $self->{genomeHash}->{$genomeID} = $self->GetEntity(Genome => $genomeID);
4423        }
4424        # Return the result.
4425        return $self->{genomeHash}->{$genomeID};
4426    }
4427    
4428    =head3 _CacheGenome
4429    
4430        $sprout->_CacheGenome($genomeID, $genomeData);
4431    
4432    Store the specified genome object in the genome cache if it is already there.
4433    
4434    =over 4
4435    
4436    =item genomeID
4437    
4438    ID of the genome to store in the cache.
4439    
4440    =item genomeData
4441    
4442    An [[ERDBObjectPm]] containing at least the data for the specified genome.
4443    Note that the Genome may not be the primary object in it, so a fully-qualified
4444    field name has to be used to retrieve data from it.
4445    
4446    =back
4447    
4448    =cut
4449    
4450    sub _CacheGenome {
4451        # Get the parameters.
4452        my ($self, $genomeID, $genomeData) = @_;
4453        # Only proceed if we don't already have the genome.
4454        if (! exists $self->{genomeHash}->{$genomeID}) {
4455            $self->{genomeHash}->{$genomeID} = $genomeData;
4456        }
4457    }
4458    
4459  1;  1;

Legend:
Removed from v.1.118  
changed lines
  Added in v.1.124

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3