[Bio] / Sprout / Sprout.pm Repository:
ViewVC logotype

Diff of /Sprout/Sprout.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.117, Tue Sep 16 18:57:59 2008 UTC revision 1.122, Mon Jan 19 21:46:21 2009 UTC
# Line 4  Line 4 
4      use strict;      use strict;
5      use DBKernel;      use DBKernel;
6      use XML::Simple;      use XML::Simple;
7      use DBQuery;      use ERDBQuery;
8      use ERDBObject;      use ERDBObject;
9      use Tracer;      use Tracer;
10      use FIGRules;      use FIGRules;
# Line 14  Line 14 
14      use BasicLocation;      use BasicLocation;
15      use CustomAttributes;      use CustomAttributes;
16      use RemoteCustomAttributes;      use RemoteCustomAttributes;
17      use CGI;      use CGI qw(-nosticky);
18      use WikiTools;      use WikiTools;
19      use BioWords;      use BioWords;
20      use base qw(ERDB);      use base qw(ERDB);
# Line 41  Line 41 
41    
42  =cut  =cut
43    
 #: Constructor SFXlate->new_sprout_only();  
   
44  =head2 Public Methods  =head2 Public Methods
45    
46  =head3 new  =head3 new
# Line 57  Line 55 
55    
56  =item dbName  =item dbName
57    
58  Name of the database.  Name of the database. If omitted, the default Sprout database name is used.
59    
60  =item options  =item options
61    
# Line 91  Line 89 
89    
90      my $sprout = Sprout->new('Sprout', { userData => 'fig/admin', dataDir => '/usr/fig/SproutData' });      my $sprout = Sprout->new('Sprout', { userData => 'fig/admin', dataDir => '/usr/fig/SproutData' });
91    
92    In order to work properly with [[ERDBGeneratorPl]], the constructor has an alternate
93    form.
94    
95        my $sprout = Sprout->new(dbd => $filename);
96    
97    Where I<$fileName> is the name of the DBD file. This enables us to specify an alternate
98    DBD for the loader, which is important when the database format changes.
99    
100  =cut  =cut
101    
102  sub new {  sub new {
103      # Get the parameters.      # Get the parameters.
104      my ($class, $dbName, $options) = @_;      my ($class, $dbName, $options) = @_;
105        # Check for the alternate signature, and default the database name if it is missing.
106        if ($dbName eq 'dbd') {
107            $dbName = $FIG_Config::sproutDB;
108            $options = { xmlFileName => $options };
109        } elsif (! defined $dbName) {
110            $dbName = $FIG_Config::sproutDB;
111        } elsif (ref $dbName eq 'HASH') {
112            $options = $dbName;
113            $dbName = $FIG_Config::sproutDB;
114        }
115      # Compute the DBD directory.      # Compute the DBD directory.
116      my $dbd_dir = (defined($FIG_Config::dbd_dir) ? $FIG_Config::dbd_dir :      my $dbd_dir = (defined($FIG_Config::dbd_dir) ? $FIG_Config::dbd_dir :
117                                                    $FIG_Config::fig );                                                    $FIG_Config::fig );
# Line 138  Line 154 
154      $retVal->{_xmlName} = $xmlFileName;      $retVal->{_xmlName} = $xmlFileName;
155      # Set up space for the group file data.      # Set up space for the group file data.
156      $retVal->{groupHash} = undef;      $retVal->{groupHash} = undef;
157      # Set up space for the genome hash. We use this to identify NMPDR genomes.      # Set up space for the genome hash. We use this to identify NMPDR genomes
158      $retVal->{genomeHash} = undef;      # and remember genome data.
159      # Connect to the attributes.      $retVal->{genomeHash} = {};
160        $retVal->{genomeHashFilled} = 0;
161        # Remember the data directory name.
162        $retVal->{dataDir} = $dataDir;
163        # Return it.
164        return $retVal;
165    }
166    
167    =head3 ca
168    
169        my $ca = $sprout->ca():;
170    
171    Return the [[CustomAttributesPm]] object for retrieving object
172    properties.
173    
174    =cut
175    
176    sub ca {
177        # Get the parameters.
178        my ($self) = @_;
179        # Do we already have an attribute object?
180        my $retVal = $self->{_ca};
181        if (! defined $retVal) {
182            # No, create one. How we do it depends on the configuration.
183      if ($FIG_Config::attrURL) {      if ($FIG_Config::attrURL) {
184          Trace("Remote attribute server $FIG_Config::attrURL chosen.") if T(3);          Trace("Remote attribute server $FIG_Config::attrURL chosen.") if T(3);
185          $retVal->{_ca} = RemoteCustomAttributes->new($FIG_Config::attrURL);              $retVal = RemoteCustomAttributes->new($FIG_Config::attrURL);
186      } elsif ($FIG_Config::attrDbName) {      } elsif ($FIG_Config::attrDbName) {
187          Trace("Local attribute database $FIG_Config::attrDbName chosen.") if T(3);          Trace("Local attribute database $FIG_Config::attrDbName chosen.") if T(3);
188          my $user = ($FIG_Config::arch eq 'win' ? 'self' : scalar(getpwent()));          my $user = ($FIG_Config::arch eq 'win' ? 'self' : scalar(getpwent()));
189          $retVal->{_ca} = CustomAttributes->new(user => $user);              $retVal = CustomAttributes->new(user => $user);
190      }      }
191      # Return it.          # Save it for next time.
192            $self->{_ca} = $retVal;
193        }
194        # Return the result.
195      return $retVal;      return $retVal;
196  }  }
197    
# Line 311  Line 353 
353  The files are loaded based on the presumption that each line of the file is a record in the  The files are loaded based on the presumption that each line of the file is a record in the
354  relation, and the individual fields are delimited by tabs. Tab and new-line characters inside  relation, and the individual fields are delimited by tabs. Tab and new-line characters inside
355  fields must be represented by the escape sequences C<\t> and C<\n>, respectively. The fields must  fields must be represented by the escape sequences C<\t> and C<\n>, respectively. The fields must
356  be presented in the order given in the relation tables produced by the L</ShowMetaData> method.  be presented in the order given in the relation tables produced by the database documentation.
357    
358  =over 4  =over 4
359    
# Line 500  Line 542 
542  =item filter  =item filter
543    
544  If specified, a filter for the list of genomes to display. The filter should be in the form of a  If specified, a filter for the list of genomes to display. The filter should be in the form of a
545  list reference. The first element of the list should be the filter string, and the remaining elements  list reference, a string, or a hash reference. If it is a list reference, the first element
546  the filter parameters.  of the list should be the filter string, and the remaining elements the filter parameters. If it is a
547    string, it will be split into a list at each included tab. If it is a hash reference, it should be
548    a hash that maps genomes which should be included to a TRUE value.
549    
550  =item multiSelect  =item multiSelect
551    
# Line 540  Line 584 
584      my $divID = "${menuID}_status";      my $divID = "${menuID}_status";
585      my $urlID = "${menuID}_url";      my $urlID = "${menuID}_url";
586      # Compute the code to show selected genomes in the status area.      # Compute the code to show selected genomes in the status area.
587      my $showSelect = "showSelected('$menuID', '$divID', '$urlID', 1000)";      my $showSelect = "showSelected('$menuID', '$divID', '$urlID', $FIG_Config::genome_control_cap)";
588      # Check for single-select or multi-select.      # Check for single-select or multi-select.
589      my $multiSelect = $options{multiSelect} || 0;      my $multiSelect = $options{multiSelect} || 0;
590      # Get the style data.      # Get the style data.
# Line 552  Line 596 
596      }      }
597      my %selected = map { $_ => 1 } @{$selections};      my %selected = map { $_ => 1 } @{$selections};
598      # Extract the filter information. The default is no filtering. It can be passed as a tab-delimited      # Extract the filter information. The default is no filtering. It can be passed as a tab-delimited
599      # string or a list reference.      # string, a hash reference, or a list reference.
600        my ($filterHash, $filterString);
601      my $filterParms = $options{filter} || "";      my $filterParms = $options{filter} || "";
602        if (ref $filterParms eq 'HASH') {
603            $filterHash = $filterParms;
604            $filterParms = [];
605            $filterString = "";
606        } else {
607      if (! ref $filterParms) {      if (! ref $filterParms) {
608          $filterParms = [split /\t|\\t/, $filterParms];          $filterParms = [split /\t|\\t/, $filterParms];
609      }      }
610      my $filterString = shift @{$filterParms};          $filterString = shift @{$filterParms};
611        }
612        # Check for possible subsystem filtering. If there is one, we will tack the
613        # relationship onto the object name list.
614        my @objectNames = qw(Genome);
615        if ($filterString =~ /ParticipatesIn\(/) {
616            push @objectNames, 'ParticipatesIn';
617        }
618      # Get a list of all the genomes in group order. In fact, we only need them ordered      # Get a list of all the genomes in group order. In fact, we only need them ordered
619      # by name (genus,species,strain), but putting primary-group in front enables us to      # by name (genus,species,strain), but putting primary-group in front enables us to
620      # take advantage of an existing index.      # take advantage of an existing index.
621      my @genomeList = $self->GetAll(['Genome'], "$filterString ORDER BY Genome(primary-group), Genome(genus), Genome(species), Genome(unique-characterization)",      my @genomeList = $self->GetAll(\@objectNames, "$filterString ORDER BY Genome(primary-group), Genome(genus), Genome(species), Genome(unique-characterization)",
622                                     $filterParms,                                     $filterParms,
623                                     [qw(Genome(primary-group) Genome(id) Genome(genus) Genome(species) Genome(unique-characterization) Genome(taxonomy) Genome(contigs))]);                                     [qw(Genome(primary-group) Genome(id) Genome(genus) Genome(species) Genome(unique-characterization) Genome(taxonomy) Genome(contigs))]);
624        # Apply the hash filter (if any).
625        if (defined $filterHash) {
626            @genomeList = grep { $filterHash->{$_->[1]} } @genomeList;
627        }
628      # Create a hash to organize the genomes by group. Each group will contain a list of      # Create a hash to organize the genomes by group. Each group will contain a list of
629      # 2-tuples, the first element being the genome ID and the second being the genome      # 2-tuples, the first element being the genome ID and the second being the genome
630      # name.      # name.
# Line 593  Line 654 
654      my @groups = map { $sortGroups{$_} } sort keys %sortGroups;      my @groups = map { $sortGroups{$_} } sort keys %sortGroups;
655      # Remember the number of NMPDR groups.      # Remember the number of NMPDR groups.
656      my $nmpdrGroupCount = scalar @groups;      my $nmpdrGroupCount = scalar @groups;
657        # Are there any supporting genomes?
658        if (exists $gHash{$FIG_Config::otherGroup}) {
659      # Loop through the supporting genomes, classifying them by domain. We'll also keep a list      # Loop through the supporting genomes, classifying them by domain. We'll also keep a list
660      # of the domains found.      # of the domains found.
661      my @otherGenomes = @{$gHash{$FIG_Config::otherGroup}};      my @otherGenomes = @{$gHash{$FIG_Config::otherGroup}};
# Line 611  Line 674 
674      push @groups, sort @domains;      push @groups, sort @domains;
675      # Delete the supporting group.      # Delete the supporting group.
676      delete $gHash{$FIG_Config::otherGroup};      delete $gHash{$FIG_Config::otherGroup};
677        }
678      # Now it gets complicated. We need a way to mark all the NMPDR genomes. We take advantage      # Now it gets complicated. We need a way to mark all the NMPDR genomes. We take advantage
679      # of the fact they come first in the list. We'll accumulate a count of the NMPDR genomes      # of the fact they come first in the list. We'll accumulate a count of the NMPDR genomes
680      # and use that to make the selections.      # and use that to make the selections.
# Line 666  Line 730 
730                       Hint("GenomeControl", "Type here to filter the genomes displayed.") . "<br />";                       Hint("GenomeControl", "Type here to filter the genomes displayed.") . "<br />";
731          # For multi-select mode, we also have buttons to set and clear selections.          # For multi-select mode, we also have buttons to set and clear selections.
732          if ($multiSelect) {          if ($multiSelect) {
733              push @lines, "<INPUT type=\"button\" name=\"ClearAll\" class=\"bigButton\"  value=\"Clear All\" onClick=\"clearAll('$menuID'); $showSelect\" />";              push @lines, "<INPUT type=\"button\" name=\"ClearAll\" class=\"bigButton\"  value=\"Clear All\" onClick=\"clearAll(getElementById('$menuID')); $showSelect\" />";
734              push @lines, "<INPUT type=\"button\" name=\"SelectAll\" class=\"bigButton\" value=\"Select All\" onClick=\"selectAll('$menuID'); $showSelect\" />";              push @lines, "<INPUT type=\"button\" name=\"SelectAll\" class=\"bigButton\" value=\"Select All\" onClick=\"selectAll(getElementById('$menuID')); $showSelect\" />";
735              push @lines, "<INPUT type=\"button\" name=\"NMPDROnly\" class=\"bigButton\"  value=\"Select NMPDR\" onClick=\"selectSome('$menuID', $nmpdrCount, true); $showSelect;\" />";              push @lines, "<INPUT type=\"button\" name=\"NMPDROnly\" class=\"bigButton\"  value=\"Select NMPDR\" onClick=\"selectSome(getElementById('$menuID'), $nmpdrCount, true); $showSelect;\" />";
736          }          }
737          # Add a hidden field we can use to generate organism page hyperlinks.          # Add a hidden field we can use to generate organism page hyperlinks.
738          push @lines, "<INPUT type=\"hidden\" id=\"$urlID\" value=\"$FIG_Config::cgi_url/wiki/rest.cgi/NmpdrPlugin/SeedViewer?page=Organism;organism=\" />";          push @lines, "<INPUT type=\"hidden\" id=\"$urlID\" value=\"$FIG_Config::cgi_url/wiki/rest.cgi/NmpdrPlugin/SeedViewer?page=Organism;organism=\" />";
# Line 714  Line 778 
778          # We don't have one pre-built, so we build and save it now.          # We don't have one pre-built, so we build and save it now.
779          $stemmer = BioWords->new(exceptions => "$FIG_Config::sproutData/Exceptions.txt",          $stemmer = BioWords->new(exceptions => "$FIG_Config::sproutData/Exceptions.txt",
780                                   stops => "$FIG_Config::sproutData/StopWords.txt",                                   stops => "$FIG_Config::sproutData/StopWords.txt",
781                                   cache => 1);                                   cache => 0);
782          $self->{stemmer} = $stemmer;          $self->{stemmer} = $stemmer;
783      }      }
784      # Try to stem the word.      # Try to stem the word.
# Line 782  Line 846 
846  sub GenusSpecies {  sub GenusSpecies {
847      # Get the parameters.      # Get the parameters.
848      my ($self, $genomeID) = @_;      my ($self, $genomeID) = @_;
849      # Get the data for the specified genome.      # Declare the return value.
850      my @values = $self->GetEntityValues('Genome', $genomeID, ['Genome(genus)', 'Genome(species)',      my $retVal;
851                                                                'Genome(unique-characterization)']);      # Get the genome data.
852      # Format the result and return it.      my $genomeData = $self->_GenomeData($genomeID);
853      my $retVal = join(' ', @values);      # Only proceed if we found the genome.
854        if (defined $genomeData) {
855            $retVal = $genomeData->PrimaryValue('Genome(scientific-name)');
856        }
857        # Return it.
858      return $retVal;      return $retVal;
859  }  }
860    
# Line 1124  Line 1192 
1192      my ($self, $genomeID) = @_;      my ($self, $genomeID) = @_;
1193      # Declare the return variable.      # Declare the return variable.
1194      my $retVal = 0;      my $retVal = 0;
1195      # Get the genome's contig sequence lengths.      # Get the genome data.
1196      my @lens = $self->GetFlat(['HasContig', 'IsMadeUpOf'], 'HasContig(from-link) = ?',      my $genomeData = $self->_GenomeData($genomeID);
1197                         [$genomeID], 'IsMadeUpOf(len)');      # Only proceed if it exists.
1198      # Sum the lengths.      if (defined $genomeData) {
1199      map { $retVal += $_ } @lens;          $retVal = $genomeData->PrimaryValue('Genome(dna-size)');
1200        }
1201      # Return the result.      # Return the result.
1202      return $retVal;      return $retVal;
1203  }  }
# Line 1795  Line 1864 
1864      # Loop through the incoming features.      # Loop through the incoming features.
1865      for my $featureID (@{$featureList}) {      for my $featureID (@{$featureList}) {
1866          # Ask the server for the feature's best hit.          # Ask the server for the feature's best hit.
1867          my @bbhData = FIGRules::BBHData($featureID);          my $bbhData = FIGRules::BBHData($featureID);
1868          # Peel off the BBHs found.          # Peel off the BBHs found.
1869          my @found = ();          my @found = ();
1870          for my $bbh (@bbhData) {          for my $bbh (@$bbhData) {
1871              my $fid = $bbh->[0];              my $fid = $bbh->[0];
1872              my $bbGenome = $self->GenomeOf($fid);              my $bbGenome = $self->GenomeOf($fid);
1873              if ($bbGenome eq $genomeID) {              if ($bbGenome eq $genomeID) {
# Line 1837  Line 1906 
1906      # Get the parameters.      # Get the parameters.
1907      my ($self, $featureID, $count) = @_;      my ($self, $featureID, $count) = @_;
1908      # Ask for the best hits.      # Ask for the best hits.
1909      my @lists = FIGRules::BBHData($featureID);      my $lists = FIGRules::BBHData($featureID);
1910      # Create the return value.      # Create the return value.
1911      my %retVal = ();      my %retVal = ();
1912      for my $tuple (@lists) {      for my $tuple (@$lists) {
1913          $retVal{$tuple->[0]} = $tuple->[1];          $retVal{$tuple->[0]} = $tuple->[1];
1914      }      }
1915      # Return the result.      # Return the result.
# Line 1874  Line 1943 
1943      # Declare the return variable.      # Declare the return variable.
1944      my $retVal;      my $retVal;
1945      # Get the genome's data.      # Get the genome's data.
1946      my $genomeData = $self->GetEntity('Genome', $genomeID);      my $genomeData = $self->_GenomeData($genomeID);
1947      if ($genomeData) {      # Only proceed if it exists.
1948        if (defined $genomeData) {
1949          # The genome exists, so get the completeness flag.          # The genome exists, so get the completeness flag.
1950          $retVal = $genomeData->PrimaryValue('Genome(complete)');          $retVal = $genomeData->PrimaryValue('Genome(complete)');
1951      }      }
# Line 1991  Line 2061 
2061              $retVal{$featureID2} = $score;              $retVal{$featureID2} = $score;
2062          }          }
2063      }      }
     # Functional coupling is reflexive. If we found at least one coupled feature, we must add  
     # the incoming feature as well.  
     if (keys %retVal) {  
         $retVal{$featureID} = 9999;  
     }  
2064      # Return the hash.      # Return the hash.
2065      return %retVal;      return %retVal;
2066  }  }
# Line 2352  Line 2417 
2417  sub GetGenomeNameData {  sub GetGenomeNameData {
2418      # Get the parameters.      # Get the parameters.
2419      my ($self, $genomeID) = @_;      my ($self, $genomeID) = @_;
2420        # Declare the return variables.
2421        my ($genus, $species, $strain);
2422        # Get the genome's data.
2423        my $genomeData = $self->_GenomeData($genomeID);
2424        # Only proceed if the genome exists.
2425        if (defined $genomeData) {
2426      # Get the desired values.      # Get the desired values.
2427      my ($genus, $species, $strain) = $self->GetEntityValues('Genome', $genomeID =>          ($genus, $species, $strain) = $genomeData->Values(['Genome(genus)',
2428                                                              [qw(Genome(genus) Genome(species) Genome(unique-characterization))]);                                                             'Genome(species)',
2429      # Throw an error if they were not found.                                                             'Genome(unique-characterization)']);
2430      if (! defined $genus) {      } else {
2431            # Throw an error because they were not found.
2432          Confess("Genome $genomeID not found in database.");          Confess("Genome $genomeID not found in database.");
2433      }      }
2434      # Return the results.      # Return the results.
# Line 2657  Line 2729 
2729  sub Taxonomy {  sub Taxonomy {
2730      # Get the parameters.      # Get the parameters.
2731      my ($self, $genome) = @_;      my ($self, $genome) = @_;
     # Find the specified genome's taxonomy string.  
     my ($list) = $self->GetEntityValues('Genome', $genome, ['Genome(taxonomy)']);  
2732      # Declare the return variable.      # Declare the return variable.
2733      my @retVal = ();      my @retVal = ();
2734      # If we found the genome, return its taxonomy string.      # Get the genome data.
2735      if ($list) {      my $genomeData = $self->_GenomeData($genome);
2736          @retVal = split /\s*;\s*/, $list;      # Only proceed if it exists.
2737        if (defined $genomeData) {
2738            # Create the taxonomy from the taxonomy string.
2739            @retVal = split /\s*;\s*/, $genomeData->PrimaryValue('Genome(taxonomy)');
2740      } else {      } else {
2741            # Genome doesn't exist, so emit a warning.
2742          Trace("Genome \"$genome\" does not have a taxonomy in the database.\n") if T(0);          Trace("Genome \"$genome\" does not have a taxonomy in the database.\n") if T(0);
2743      }      }
2744      # Return the value found.      # Return the value found.
# Line 2709  Line 2783 
2783      }      }
2784      my @taxA = $self->Taxonomy($genomeA);      my @taxA = $self->Taxonomy($genomeA);
2785      my @taxB = $self->Taxonomy($genomeB);      my @taxB = $self->Taxonomy($genomeB);
2786      # Initialize the distance to 1. We'll reduce it each time we find a match between the      # Compute the distance.
2787      # taxonomies.      my $retVal = FIGRules::CrudeDistanceFormula(\@taxA, \@taxB);
     my $retVal = 1.0;  
     # Initialize the subtraction amount. This amount determines the distance reduction caused  
     # by a mismatch at the current level.  
     my $v = 0.5;  
     # Loop through the taxonomies.  
     for (my $i = 0; ($i < @taxA) && ($i < @taxB) && ($taxA[$i] eq $taxB[$i]); $i++) {  
         $retVal -= $v;  
         $v /= 2;  
     }  
2788      return $retVal;      return $retVal;
2789  }  }
2790    
# Line 2815  Line 2880 
2880      # Get the parameters.      # Get the parameters.
2881      my ($self, $featureID) = @_;      my ($self, $featureID) = @_;
2882      # Get the properties.      # Get the properties.
2883      my @attributes = $self->{_ca}->GetAttributes($featureID);      my @attributes = $self->ca->GetAttributes($featureID);
2884      # Strip the feature ID off each tuple.      # Strip the feature ID off each tuple.
2885      my @retVal = ();      my @retVal = ();
2886      for my $attributeRow (@attributes) {      for my $attributeRow (@attributes) {
# Line 3122  Line 3187 
3187      # Declare the return variable.      # Declare the return variable.
3188      my %retVal = ();      my %retVal = ();
3189      # Get a list of the genome features that participate in subsystems. For each      # Get a list of the genome features that participate in subsystems. For each
3190      # feature we get its spreadsheet cells and the corresponding roles.      # feature we get its subsystem ID and the corresponding roles.
3191      my @roleData = $self->GetAll(['HasFeature', 'ContainsFeature', 'IsRoleOf'],      my @roleData = $self->GetAll(['HasFeature', 'ContainsFeature', 'IsRoleOf', 'HasSSCell'],
3192                               "HasFeature(from-link) = ?", [$genomeID],                               "HasFeature(from-link) = ?", [$genomeID],
3193                               ['HasFeature(to-link)', 'IsRoleOf(to-link)', 'IsRoleOf(from-link)']);                                   ['HasFeature(to-link)', 'IsRoleOf(from-link)',  'HasSSCell(from-link)']);
3194      # Now we get a list of the spreadsheet cells and their associated subsystems. Subsystems      # Now we get a list of valid subsystems. These are the subsystems connected to the genome with
3195      # with an unknown variant code (-1) are skipped. Note the genome ID is at both ends of the      # a non-negative variant code.
3196      # list. We use it at the beginning to get all the spreadsheet cells for the genome and      my %subs = map { $_ => 1 } $self->GetFlat(['ParticipatesIn'],
3197      # again at the end to filter out participation in subsystems with a negative variant code.                                                  "ParticipatesIn(from-link) = ? AND ParticipatesIn(variant-code) >= 0",
3198      my @cellData = $self->GetAll(['IsGenomeOf', 'HasSSCell', 'ParticipatesIn'],                                                  [$genomeID], 'ParticipatesIn(to-link)');
3199                                   "IsGenomeOf(from-link) = ? AND ParticipatesIn(variant-code) >= 0 AND ParticipatesIn(from-link) = ?",      # We loop through @roleData to build the hash.
                                  [$genomeID, $genomeID], ['HasSSCell(to-link)', 'HasSSCell(from-link)']);  
     # Now "@roleData" lists the spreadsheet cell and role for each of the genome's features.  
     # "@cellData" lists the subsystem name for each of the genome's spreadsheet cells. We  
     # link these two lists together to create the result. First, we want a hash mapping  
     # spreadsheet cells to subsystem names.  
     my %subHash = map { $_->[0] => $_->[1] } @cellData;  
     # We loop through @cellData to build the hash.  
3200      for my $roleEntry (@roleData) {      for my $roleEntry (@roleData) {
3201          # Get the data for this feature and cell.          # Get the data for this feature and cell.
3202          my ($fid, $cellID, $role) = @{$roleEntry};          my ($fid, $role, $subsys) = @{$roleEntry};
3203          # Check for a subsystem name.          Trace("Subsystem for $fid is $subsys.") if T(4);
3204          my $subsys = $subHash{$cellID};          # Check the subsystem;
3205          if ($subsys) {          if ($subs{$subsys}) {
3206                Trace("Subsystem found.") if T(4);
3207              # Insure this feature has an entry in the return hash.              # Insure this feature has an entry in the return hash.
3208              if (! exists $retVal{$fid}) { $retVal{$fid} = []; }              if (! exists $retVal{$fid}) { $retVal{$fid} = []; }
3209              # Merge in this new data.              # Merge in this new data.
# Line 3190  Line 3249 
3249      # Get the parameters.      # Get the parameters.
3250      my ($self, $featureID, $function, $userID) = @_;      my ($self, $featureID, $function, $userID) = @_;
3251      # Get a list of the features that are BBHs of the incoming feature.      # Get a list of the features that are BBHs of the incoming feature.
3252      my @bbhFeatures = map { $_->[0] } FIGRules::BBHData($featureID);      my $bbhData = FIGRules::BBHData($featureID);
3253        my @bbhFeatures = map { $_->[0] } @$bbhData;
3254      # Now we loop through the features, pulling out the ones that have the correct      # Now we loop through the features, pulling out the ones that have the correct
3255      # functional assignment.      # functional assignment.
3256      my @retVal = ();      my @retVal = ();
# Line 3363  Line 3423 
3423    
3424  =head3 BBHMatrix  =head3 BBHMatrix
3425    
3426      my %bbhMap = $sprout->BBHMatrix($genomeID, $cutoff, @targets);      my $bbhMap = $sprout->BBHMatrix($genomeID, $cutoff, @targets);
3427    
3428  Find all the bidirectional best hits for the features of a genome in a  Find all the bidirectional best hits for the features of a genome in a
3429  specified list of target genomes. The return value will be a hash mapping  specified list of target genomes. The return value will be a hash mapping
# Line 3387  Line 3447 
3447    
3448  =item RETURN  =item RETURN
3449    
3450  Returns a hash mapping each feature in the original genome to a hash mapping its  Returns a reference to a hash mapping each feature in the original genome
3451  BBH pegs in the target genomes to their scores.  to a sub-hash mapping its BBH pegs in the target genomes to their scores.
3452    
3453  =back  =back
3454    
# Line 3401  Line 3461 
3461      my %retVal = ();      my %retVal = ();
3462      # Ask for the BBHs.      # Ask for the BBHs.
3463      my @bbhList = FIGRules::BatchBBHs("fig|$genomeID.%", $cutoff, @targets);      my @bbhList = FIGRules::BatchBBHs("fig|$genomeID.%", $cutoff, @targets);
3464        Trace("Retrieved " . scalar(@bbhList) . " BBH results.") if T(3);
3465      # We now have a set of 4-tuples that we need to convert into a hash of hashes.      # We now have a set of 4-tuples that we need to convert into a hash of hashes.
3466      for my $bbhData (@bbhList) {      for my $bbhData (@bbhList) {
3467          my ($peg1, $peg2, $score) = @{$bbhData};          my ($peg1, $peg2, $score) = @{$bbhData};
# Line 3411  Line 3472 
3472          }          }
3473      }      }
3474      # Return the result.      # Return the result.
3475      return %retVal;      return \%retVal;
3476  }  }
3477    
3478    
# Line 3517  Line 3578 
3578      # Create the return hash.      # Create the return hash.
3579      my %retVal = ();      my %retVal = ();
3580      # Query for the desired BBHs.      # Query for the desired BBHs.
3581      my @bbhList = FIGRules::BBHData($featureID, $cutoff);      my $bbhList = FIGRules::BBHData($featureID, $cutoff);
3582      # Form the results into the return hash.      # Form the results into the return hash.
3583      for my $pair (@bbhList) {      for my $pair (@$bbhList) {
3584          my $fid = $pair->[0];          my $fid = $pair->[0];
3585          if ($self->Exists('Feature', $fid)) {          if ($self->Exists('Feature', $fid)) {
3586              $retVal{$fid} = $pair->[1];              $retVal{$fid} = $pair->[1];
# Line 3897  Line 3958 
3958      # Get the parameters.      # Get the parameters.
3959      my ($self, $featureID, $key, @values) = @_;      my ($self, $featureID, $key, @values) = @_;
3960      # Add the property using the attached attributes object.      # Add the property using the attached attributes object.
3961      $self->{_ca}->AddAttribute($featureID, $key, @values);      $self->ca->AddAttribute($featureID, $key, @values);
3962  }  }
3963    
3964  =head3 CheckGroupFile  =head3 CheckGroupFile
# Line 3984  Line 4045 
4045  sub CleanKeywords {  sub CleanKeywords {
4046      # Get the parameters.      # Get the parameters.
4047      my ($self, $searchExpression) = @_;      my ($self, $searchExpression) = @_;
4048      # Perform the standard cleanup.      # Get the stemmer.
4049      my $words = $self->ERDB::CleanKeywords($searchExpression);      my $stemmer = $self->GetStemmer();
4050      # Fix the periods in EC and TC numbers.      # Convert the search expression using the stemmer.
4051      $words =~ s/(\d+|\-)\.(\d+|-)\.(\d+|-)\.(\d+|-)/$1_$2_$3_$4/g;      my $retVal = $stemmer->PrepareSearchExpression($searchExpression);
     # Fix non-trailing periods.  
     $words =~ s/\.(\w)/_$1/g;  
     # Fix non-leading minus signs.  
     $words =~ s/(\w)[\-]/$1_/g;  
     # Fix the vertical bars and colons  
     $words =~ s/(\w)[|:](\w)/$1'$2/g;  
     # Now split up the list so that each keyword is in its own string. We keep the delimiters  
     # because they may contain boolean expression data.  
     my @words = split /([^A-Za-z'0-9_]+)/, $words;  
     # We'll convert the stemmable words into stems and re-assemble the result.  
     my $retVal = "";  
     for my $word (@words) {  
         my $stem = $self->Stem($word);  
         if (defined $stem) {  
             $retVal .= $stem;  
         } else {  
             $retVal .= $word;  
         }  
     }  
4052      Trace("Cleaned keyword list for \"$searchExpression\" is \"$retVal\".") if T(3);      Trace("Cleaned keyword list for \"$searchExpression\" is \"$retVal\".") if T(3);
4053      # Return the result.      # Return the result.
4054      return $retVal;      return $retVal;
4055  }  }
4056    
4057    =head3 GetSourceObject
4058    
4059        my $source = $erdb->GetSourceObject();
4060    
4061    Return the object to be used in creating load files for this database.
4062    
4063    =cut
4064    
4065    sub GetSourceObject {
4066        # Get the parameters.
4067        my ($self) = @_;
4068        # Check to see if we already have a source object.
4069        my $retVal = $self->{_fig};
4070        if (! defined $retVal) {
4071            # No, so create one.
4072            require FIG;
4073            $retVal = FIG->new();
4074        }
4075        # Return the object.
4076        return $retVal;
4077    }
4078    
4079    =head3 SectionList
4080    
4081        my @sections = $erdb->SectionList();
4082    
4083    Return a list of the names for the different data sections used when loading this database.
4084    The default is a single string, in which case there is only one section representing the
4085    entire database.
4086    
4087    =cut
4088    
4089    sub SectionList {
4090        # Get the parameters.
4091        my ($self, $source) = @_;
4092        # Ask the BaseSproutLoader for a section list.
4093        require BaseSproutLoader;
4094        my @retVal = BaseSproutLoader::GetSectionList($self);
4095        # Return the list.
4096        return @retVal;
4097    }
4098    
4099    =head3 Loader
4100    
4101        my $groupLoader = $erdb->Loader($groupName, $options);
4102    
4103    Return an [[ERDBLoadGroupPm]] object for the specified load group. This method is used
4104    by [[ERDBGeneratorPl]] to create the load group objects. If you are not using
4105    [[ERDBGeneratorPl]], you don't need to override this method.
4106    
4107    =over 4
4108    
4109    =item groupName
4110    
4111    Name of the load group whose object is to be returned. The group name is
4112    guaranteed to be a single word with only the first letter capitalized.
4113    
4114    =item options
4115    
4116    Reference to a hash of command-line options.
4117    
4118    =item RETURN
4119    
4120    Returns an [[ERDBLoadGroupPm]] object that can be used to process the specified load group
4121    for this database.
4122    
4123    =back
4124    
4125    =cut
4126    
4127    sub Loader {
4128        # Get the parameters.
4129        my ($self, $groupName, $options) = @_;
4130        # Compute the loader name.
4131        my $loaderClass = "${groupName}SproutLoader";
4132        # Pull in its definition.
4133        require "$loaderClass.pm";
4134        # Create an object for it.
4135        my $retVal = eval("$loaderClass->new(\$self, \$options)");
4136        # Insure it worked.
4137        Confess("Could not create $loaderClass object: $@") if $@;
4138        # Return it to the caller.
4139        return $retVal;
4140    }
4141    
4142    
4143    =head3 LoadGroupList
4144    
4145        my @groups = $erdb->LoadGroupList();
4146    
4147    Returns a list of the names for this database's load groups. This method is used
4148    by [[ERDBGeneratorPl]] when the user wishes to load all table groups. The default
4149    is a single group called 'All' that loads everything.
4150    
4151    =cut
4152    
4153    sub LoadGroupList {
4154        # Return the list.
4155        return qw(Genome Subsystem Annotation Property Source Reaction Synonym Feature Drug);
4156    }
4157    
4158    =head3 LoadDirectory
4159    
4160        my $dirName = $erdb->LoadDirectory();
4161    
4162    Return the name of the directory in which load files are kept. The default is
4163    the FIG temporary directory, which is a really bad choice, but it's always there.
4164    
4165    =cut
4166    
4167    sub LoadDirectory {
4168        # Get the parameters.
4169        my ($self) = @_;
4170        # Return the directory name.
4171        return $self->{dataDir};
4172    }
4173    
4174  =head2 Internal Utility Methods  =head2 Internal Utility Methods
4175    
4176    =head3 GetStemmer
4177    
4178        my $stermmer = $sprout->GetStemmer();
4179    
4180    Return the stemmer object for this database.
4181    
4182    =cut
4183    
4184    sub GetStemmer {
4185        # Get the parameters.
4186        my ($self) = @_;
4187        # Declare the return variable.
4188        my $retVal = $self->{stemmer};
4189        if (! defined $retVal) {
4190            # We don't have one pre-built, so we build and save it now.
4191            $retVal = BioWords->new(exceptions => "$FIG_Config::sproutData/Exceptions.txt",
4192                                     stops => "$FIG_Config::sproutData/StopWords.txt",
4193                                     cache => 0);
4194            $self->{stemmer} = $retVal;
4195        }
4196        # Return the result.
4197        return $retVal;
4198    }
4199    
4200  =head3 ParseAssignment  =head3 ParseAssignment
4201    
4202  Parse annotation text to determine whether or not it is a functional assignment. If it is,  Parse annotation text to determine whether or not it is a functional assignment. If it is,
# Line 4100  Line 4283 
4283      # Get the parameters.      # Get the parameters.
4284      my ($self, $fid) = @_;      my ($self, $fid) = @_;
4285      # Insure we have a genome hash.      # Insure we have a genome hash.
4286      if (! defined $self->{genomeHash}) {      my $genomes = $self->_GenomeHash();
         my %genomeHash = map { $_ => 1 } $self->GetFlat(['Genome'], "", [], 'Genome(id)');  
         $self->{genomeHash} = \%genomeHash;  
     }  
4287      # Get the feature's genome ID.      # Get the feature's genome ID.
4288      my ($genomeID) = FIGRules::ParseFeatureID($fid);      my ($genomeID) = FIGRules::ParseFeatureID($fid);
4289      # Return an indicator of whether or not the genome ID is in the hash.      # Return an indicator of whether or not the genome ID is in the hash.
# Line 4180  Line 4360 
4360      return $retVal;      return $retVal;
4361  }  }
4362    
4363    =head3 _GenomeHash
4364    
4365        my $gHash = $sprout->_GenomeHash();
4366    
4367    Return a hash mapping all NMPDR genome IDs to [[ERDBObjectPm]] genome objects.
4368    
4369    =cut
4370    
4371    sub _GenomeHash {
4372        # Get the parameters.
4373        my ($self) = @_;
4374        # Do we already have a filled hash?
4375        if (! $self->{genomeHashFilled}) {
4376            # No, create it.
4377            my %gHash = map { $_->PrimaryValue('id') => $_ } $self->GetList("Genome", "", []);
4378            $self->{genomeHash} = \%gHash;
4379            # Denote we have it.
4380            $self->{genomeHashFilled} = 1;
4381        }
4382        # Return the hash.
4383        return $self->{genomeHash};
4384    }
4385    
4386    =head3 _GenomeData
4387    
4388        my $genomeData = $sprout->_GenomeData($genomeID);
4389    
4390    Return an [[ERDBObjectPm]] object for the specified genome, or an undefined
4391    value if the genome does not exist.
4392    
4393    =over 4
4394    
4395    =item genomeID
4396    
4397    ID of the desired genome.
4398    
4399    =item RETURN
4400    
4401    Returns either an [[ERDBObjectPm]] containing the genome, or an undefined value.
4402    If the genome exists, it will have been read into the genome cache.
4403    
4404    =back
4405    
4406    =cut
4407    
4408    sub _GenomeData {
4409        # Get the parameters.
4410        my ($self, $genomeID) = @_;
4411        # Are we in the genome hash?
4412        if (! exists $self->{genomeHash}->{$genomeID} && ! $self->{genomeHashFilled}) {
4413            # The genome isn't in the hash, and the hash is not complete, so we try to
4414            # read it.
4415            $self->{genomeHash}->{$genomeID} = $self->GetEntity(Genome => $genomeID);
4416        }
4417        # Return the result.
4418        return $self->{genomeHash}->{$genomeID};
4419    }
4420    
4421    =head3 _CacheGenome
4422    
4423        $sprout->_CacheGenome($genomeID, $genomeData);
4424    
4425    Store the specified genome object in the genome cache if it is already there.
4426    
4427    =over 4
4428    
4429    =item genomeID
4430    
4431    ID of the genome to store in the cache.
4432    
4433    =item genomeData
4434    
4435    An [[ERDBObjectPm]] containing at least the data for the specified genome.
4436    Note that the Genome may not be the primary object in it, so a fully-qualified
4437    field name has to be used to retrieve data from it.
4438    
4439    =back
4440    
4441    =cut
4442    
4443    sub _CacheGenome {
4444        # Get the parameters.
4445        my ($self, $genomeID, $genomeData) = @_;
4446        # Only proceed if we don't already have the genome.
4447        if (! exists $self->{genomeHash}->{$genomeID}) {
4448            $self->{genomeHash}->{$genomeID} = $genomeData;
4449        }
4450    }
4451    
4452  1;  1;

Legend:
Removed from v.1.117  
changed lines
  Added in v.1.122

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3