[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.84, Thu May 17 23:44:51 2007 UTC revision 1.95, Sat Sep 20 14:33:28 2008 UTC
# Line 7  Line 7 
7      use PageBuilder;      use PageBuilder;
8      use ERDBLoad;      use ERDBLoad;
9      use FIG;      use FIG;
10        use FIGRules;
11      use Sprout;      use Sprout;
12      use Stats;      use Stats;
13      use BasicLocation;      use BasicLocation;
14      use HTML;      use HTML;
15        use AliasAnalysis;
16        use BioWords;
17    
18  =head1 Sprout Load Methods  =head1 Sprout Load Methods
19    
# Line 50  Line 53 
53    
54  =head3 new  =head3 new
55    
56  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>      my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options);
57    
58  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
59  the name of the files containing the list of genomes and subsystems to use.  the name of the files containing the list of genomes and subsystems to use.
# Line 101  Line 104 
104              # Here we want all the complete genomes and an access code of 1.              # Here we want all the complete genomes and an access code of 1.
105              my @genomeList = $fig->genomes(1);              my @genomeList = $fig->genomes(1);
106              %genomes = map { $_ => 1 } @genomeList;              %genomes = map { $_ => 1 } @genomeList;
107                Trace(scalar(keys %genomes) . " genomes found.") if T(3);
108          } else {          } else {
109              my $type = ref $genomeFile;              my $type = ref $genomeFile;
110              Trace("Genome file parameter type is \"$type\".") if T(3);              Trace("Genome file parameter type is \"$type\".") if T(3);
# Line 167  Line 171 
171          for my $subsystem (keys %subsystems) {          for my $subsystem (keys %subsystems) {
172              my $name = $subsystem;              my $name = $subsystem;
173              $name =~ s/_/ /g;              $name =~ s/_/ /g;
             my $classes = $fig->subsystem_classification($subsystem);  
             $name .= " " . join(" ", @{$classes});  
174              $subsystems{$subsystem} = $name;              $subsystems{$subsystem} = $name;
175          }          }
176      }      }
177        # Get the list of NMPDR-oriented attribute keys.
178        my @propKeys = $fig->get_group_keys("NMPDR");
179      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
180      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
181      # Create the Sprout load object.      # Create the Sprout load object.
# Line 183  Line 187 
187                    loadDirectory => $directory,                    loadDirectory => $directory,
188                    erdb => $sprout,                    erdb => $sprout,
189                    loaders => [],                    loaders => [],
190                    options => $options                    options => $options,
191                      propKeys => \@propKeys,
192                   };                   };
193      # Bless and return it.      # Bless and return it.
194      bless $retVal, $class;      bless $retVal, $class;
# Line 192  Line 197 
197    
198  =head3 LoadOnly  =head3 LoadOnly
199    
200  C<< my $flag = $spl->LoadOnly; >>      my $flag = $spl->LoadOnly;
201    
202  Return TRUE if we are in load-only mode, else FALSE.  Return TRUE if we are in load-only mode, else FALSE.
203    
# Line 203  Line 208 
208      return $self->{options}->{loadOnly};      return $self->{options}->{loadOnly};
209  }  }
210    
 =head3 PrimaryOnly  
   
 C<< my $flag = $spl->PrimaryOnly; >>  
   
 Return TRUE if only the main entity is to be loaded, else FALSE.  
   
 =cut  
   
 sub PrimaryOnly {  
     my ($self) = @_;  
     return $self->{options}->{primaryOnly};  
 }  
211    
212  =head3 LoadGenomeData  =head3 LoadGenomeData
213    
214  C<< my $stats = $spl->LoadGenomeData(); >>      my $stats = $spl->LoadGenomeData();
215    
216  Load the Genome, Contig, and Sequence data from FIG into Sprout.  Load the Genome, Contig, and Sequence data from FIG into Sprout.
217    
# Line 255  Line 248 
248      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
249      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
250      my $loadGenome = $self->_TableLoader('Genome');      my $loadGenome = $self->_TableLoader('Genome');
251      my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly);      my $loadHasContig = $self->_TableLoader('HasContig');
252      my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly);      my $loadContig = $self->_TableLoader('Contig');
253      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf');
254      my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly);      my $loadSequence = $self->_TableLoader('Sequence');
255      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
256          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
257      } else {      } else {
258          Trace("Generating genome data.") if T(2);          Trace("Generating genome data.") if T(2);
259            # Get the full info for the FIG genomes.
260            my %genomeInfo = map { $_->[0] => { gname => $_->[1], szdna => $_->[2], maindomain => $_->[3],
261                                                pegs => $_->[4], rnas => $_->[5], complete => $_->[6] } } @{$fig->genome_info()};
262          # Now we loop through the genomes, generating the data for each one.          # Now we loop through the genomes, generating the data for each one.
263          for my $genomeID (sort keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
264              Trace("Generating data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
# Line 292  Line 288 
288                  $group = $FIG_Config::otherGroup;                  $group = $FIG_Config::otherGroup;
289              }              }
290              close TMP;              close TMP;
291                # Get the contigs.
292                my @contigs = $fig->all_contigs($genomeID);
293                # Get this genome's info array.
294                my $info = $genomeInfo{$genomeID};
295              # Output the genome record.              # Output the genome record.
296              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),              $loadGenome->Put($genomeID, $accessCode, $info->{complete}, scalar(@contigs),
297                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);                               $dnaSize, $genus, $info->{pegs}, $group, $info->{rnas}, $species, $extra, $version, $taxonomy);
298              # Now we loop through each of the genome's contigs.              # Now we loop through each of the genome's contigs.
             my @contigs = $fig->all_contigs($genomeID);  
299              for my $contigID (@contigs) {              for my $contigID (@contigs) {
300                  Trace("Processing contig $contigID for $genomeID.") if T(4);                  Trace("Processing contig $contigID for $genomeID.") if T(4);
301                  $loadContig->Add("contigIn");                  $loadContig->Add("contigIn");
# Line 332  Line 331 
331      return $retVal;      return $retVal;
332  }  }
333    
 =head3 LoadCouplingData  
   
 C<< my $stats = $spl->LoadCouplingData(); >>  
   
 Load the coupling and evidence data from FIG into Sprout.  
   
 The coupling data specifies which genome features are functionally coupled. The  
 evidence data explains why the coupling is functional.  
   
 The following relations are loaded by this method.  
   
     Coupling  
     IsEvidencedBy  
     PCH  
     ParticipatesInCoupling  
     UsesAsEvidence  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadCouplingData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the genome hash.  
     my $genomeFilter = $self->{genomes};  
     # Set up an ID counter for the PCHs.  
     my $pchID = 0;  
     # Start the loads.  
     my $loadCoupling = $self->_TableLoader('Coupling');  
     my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);  
     my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly);  
     my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly);  
     my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly);  
     if ($self->{options}->{loadOnly}) {  
         Trace("Loading from existing files.") if T(2);  
     } else {  
         Trace("Generating coupling data.") if T(2);  
         # Loop through the genomes found.  
         for my $genome (sort keys %{$genomeFilter}) {  
             Trace("Generating coupling data for $genome.") if T(3);  
             $loadCoupling->Add("genomeIn");  
             # Create a hash table for holding coupled pairs. We use this to prevent  
             # duplicates. For example, if A is coupled to B, we don't want to also  
             # assert that B is coupled to A, because we already know it. Fortunately,  
             # all couplings occur within a genome, so we can keep the hash table  
             # size reasonably small.  
             my %dupHash = ();  
             # Get all of the genome's PEGs.  
             my @pegs = $fig->pegs_of($genome);  
             # Loop through the PEGs.  
             for my $peg1 (@pegs) {  
                 $loadCoupling->Add("pegIn");  
                 Trace("Processing PEG $peg1 for $genome.") if T(4);  
                 # Get a list of the coupled PEGs.  
                 my @couplings = $fig->coupled_to($peg1);  
                 # For each coupled PEG, we need to verify that a coupling already  
                 # exists. If not, we have to create one.  
                 for my $coupleData (@couplings) {  
                     my ($peg2, $score) = @{$coupleData};  
                     # Compute the coupling ID.  
                     my $coupleID = $self->{erdb}->CouplingID($peg1, $peg2);  
                     if (! exists $dupHash{$coupleID}) {  
                         $loadCoupling->Add("couplingIn");  
                         # Here we have a new coupling to store in the load files.  
                         Trace("Storing coupling ($coupleID) with score $score.") if T(4);  
                         # Ensure we don't do this again.  
                         $dupHash{$coupleID} = $score;  
                         # Write the coupling record.  
                         $loadCoupling->Put($coupleID, $score);  
                         # Connect it to the coupled PEGs.  
                         $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);  
                         $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);  
                         # Get the evidence for this coupling.  
                         my @evidence = $fig->coupling_evidence($peg1, $peg2);  
                         # Organize the evidence into a hash table.  
                         my %evidenceMap = ();  
                         # Process each evidence item.  
                         for my $evidenceData (@evidence) {  
                             $loadPCH->Add("evidenceIn");  
                             my ($peg3, $peg4, $usage) = @{$evidenceData};  
                             # Only proceed if the evidence is from a Sprout  
                             # genome.  
                             if ($genomeFilter->{$fig->genome_of($peg3)}) {  
                                 $loadUsesAsEvidence->Add("evidenceChosen");  
                                 my $evidenceKey = "$coupleID $peg3 $peg4";  
                                 # We store this evidence in the hash if the usage  
                                 # is nonzero or no prior evidence has been found. This  
                                 # insures that if there is duplicate evidence, we  
                                 # at least keep the meaningful ones. Only evidence in  
                                 # the hash makes it to the output.  
                                 if ($usage || ! exists $evidenceMap{$evidenceKey}) {  
                                     $evidenceMap{$evidenceKey} = $evidenceData;  
                                 }  
                             }  
                         }  
                         for my $evidenceID (keys %evidenceMap) {  
                             # Get the ID for this evidence.  
                             $pchID++;  
                             # Create the evidence record.  
                             my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};  
                             $loadPCH->Put($pchID, $usage);  
                             # Connect it to the coupling.  
                             $loadIsEvidencedBy->Put($coupleID, $pchID);  
                             # Connect it to the features.  
                             $loadUsesAsEvidence->Put($pchID, $peg3, 1);  
                             $loadUsesAsEvidence->Put($pchID, $peg4, 2);  
                         }  
                     }  
                 }  
             }  
         }  
     }  
     # All done. Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
334  =head3 LoadFeatureData  =head3 LoadFeatureData
335    
336  C<< my $stats = $spl->LoadFeatureData(); >>      my $stats = $spl->LoadFeatureData();
337    
338  Load the feature data from FIG into Sprout.  Load the feature data from FIG into Sprout.
339    
# Line 470  Line 343 
343    
344      Feature      Feature
345      FeatureAlias      FeatureAlias
346        IsAliasOf
347      FeatureLink      FeatureLink
348      FeatureTranslation      FeatureTranslation
349      FeatureUpstream      FeatureUpstream
# Line 479  Line 353 
353      FeatureEssential      FeatureEssential
354      FeatureVirulent      FeatureVirulent
355      FeatureIEDB      FeatureIEDB
356        CDD
357        IsPresentOnProteinOf
358        CellLocation
359        IsPossiblePlaceFor
360        ExternalDatabase
361        IsAlsoFoundIn
362        Keyword
363    
364  =over 4  =over 4
365    
# Line 500  Line 381 
381      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
382      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
383      my $loadFeature = $self->_TableLoader('Feature');      my $loadFeature = $self->_TableLoader('Feature');
384      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn');
385      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
386        my $loadIsAliasOf = $self->_TableLoader('IsAliasOf');
387      my $loadFeatureLink = $self->_TableLoader('FeatureLink');      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
388      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
389      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
390      my $loadHasFeature = $self->_TableLoader('HasFeature', $self->PrimaryOnly);      my $loadHasFeature = $self->_TableLoader('HasFeature');
391      my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem', $self->PrimaryOnly);      my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem');
392      my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');      my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
393      my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');      my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
394      my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');      my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
395        my $loadCDD = $self->_TableLoader('CDD');
396        my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');
397        my $loadCellLocation = $self->_TableLoader('CellLocation');
398        my $loadIsPossiblePlaceFor = $self->_TableLoader('IsPossiblePlaceFor');
399        my $loadIsAlsoFoundIn = $self->_TableLoader('IsAlsoFoundIn');
400        my $loadExternalDatabase = $self->_TableLoader('ExternalDatabase');
401        my $loadKeyword = $self->_TableLoader('Keyword');
402      # Get the subsystem hash.      # Get the subsystem hash.
403      my $subHash = $self->{subsystems};      my $subHash = $self->{subsystems};
404        # Get the property keys.
405        my $propKeys = $self->{propKeys};
406        # Create a hashes to hold CDD, Cell Location (PSORT), External Database, and alias values.
407        my %CDD = ();
408        my %alias = ();
409        my %cellLocation = ();
410        my %xdb = ();
411        # Create the bio-words object.
412        my $biowords = BioWords->new(exceptions => "$FIG_Config::sproutData/Exceptions.txt",
413                                     stops => "$FIG_Config::sproutData/StopWords.txt",
414                                     cache => 0);
415        # One of the things we have to do here is build the keyword table, and the keyword
416        # table needs to contain the originating text and feature count for each stem. Unfortunately,
417        # the number of distinct keywords is so large it causes PERL to hang if we try to
418        # keep them in memory. As a result, we need to track them using disk files.
419        # Our approach will be to use two sequential files. One will contain stems and phonexes.
420        # Each time a stem occurs in a feature, a record will be written to that file. The stem
421        # file can then be sorted and collated to determine the number of features for each
422        # stem. A separate file will contain keywords and stems. This last file
423        # will be subjected to a sort unique on stem/keyword. The file is then merged
424        # with the stem file to create the keyword table relation (keyword, stem, phonex, count).
425        my $stemFileName = "$FIG_Config::temp/stems$$.tbl";
426        my $keyFileName = "$FIG_Config::temp/keys$$.tbl";
427        my $stemh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -k1,1 >$stemFileName");
428        my $keyh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -u -k1,1 -k2,2 >$keyFileName");
429      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
430      # locations.      # locations.
431      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
# Line 520  Line 434 
434      } else {      } else {
435          Trace("Generating feature data.") if T(2);          Trace("Generating feature data.") if T(2);
436          # Now we loop through the genomes, generating the data for each one.          # Now we loop through the genomes, generating the data for each one.
437          for my $genomeID (sort keys %{$genomeHash}) {          my @allGenomes = sort keys %{$genomeHash};
438            Trace(scalar(@allGenomes) . " genomes found in list.") if T(3);
439            for my $genomeID (@allGenomes) {
440              Trace("Loading features for genome $genomeID.") if T(3);              Trace("Loading features for genome $genomeID.") if T(3);
441              $loadFeature->Add("genomeIn");              $loadFeature->Add("genomeIn");
442              # Get the feature list for this genome.              # Get the feature list for this genome.
# Line 531  Line 447 
447              my @fids = map { $_->[0] } @featureTuples;              my @fids = map { $_->[0] } @featureTuples;
448              Trace("$count features found for genome $genomeID.") if T(3);              Trace("$count features found for genome $genomeID.") if T(3);
449              # Get the attributes for this genome and put them in a hash by feature ID.              # Get the attributes for this genome and put them in a hash by feature ID.
450              my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids);              my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys);
451                Trace("Looping through features for $genomeID.") if T(3);
452              # Set up for our duplicate-feature check.              # Set up for our duplicate-feature check.
453              my $oldFeatureID = "";              my $oldFeatureID = "";
454              # Loop through the features.              # Loop through the features.
# Line 556  Line 473 
473                                      $fig->taxonomy_of($genomeID));                                      $fig->taxonomy_of($genomeID));
474                      # Create the aliases.                      # Create the aliases.
475                      for my $alias ($fig->feature_aliases($featureID)) {                      for my $alias ($fig->feature_aliases($featureID)) {
476                          $loadFeatureAlias->Put($featureID, $alias);                          #Connect this alias to this feature.
477                            $loadIsAliasOf->Put($alias, $featureID);
478                          push @keywords, $alias;                          push @keywords, $alias;
479                            # If this is a locus tag, also add its natural form as a keyword.
480                            my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
481                            if ($naturalName) {
482                                push @keywords, $naturalName;
483                            }
484                            # If this is the first time for the specified alias, create its
485                            # alias record.
486                            if (! exists $alias{$alias}) {
487                                $loadFeatureAlias->Put($alias);
488                                $alias{$alias} = 1;
489                            }
490                        }
491                        # Add the corresponding IDs. We ask for 2-tuples of the form (id, database).
492                        my @corresponders = $fig->get_corresponding_ids($featureID, 1);
493                        for my $tuple (@corresponders) {
494                            my ($id, $xdb) = @{$tuple};
495                            # Ignore SEED: that's us.
496                            if ($xdb ne 'SEED') {
497                                # Connect this ID to the feature.
498                                $loadIsAlsoFoundIn->Put($featureID, $xdb, $id);
499                                # Add it as a keyword.
500                                push @keywords, $id;
501                                # If this is a new database, create a record for it.
502                                if (! exists $xdb{$xdb}) {
503                                    $xdb{$xdb} = 1;
504                                    $loadExternalDatabase->Put($xdb);
505                                }
506                            }
507                      }                      }
508                      Trace("Assignment for $featureID is: $assignment") if T(4);                      Trace("Assignment for $featureID is: $assignment") if T(4);
509                      # Break the assignment into words and shove it onto the                      # Break the assignment into words and shove it onto the
# Line 585  Line 531 
531                      }                      }
532                      # Now we need to find the subsystems this feature participates in.                      # Now we need to find the subsystems this feature participates in.
533                      # We also add the subsystems to the keyword list. Before we do that,                      # We also add the subsystems to the keyword list. Before we do that,
534                      # we must convert underscores to spaces and tack on the classifications.                      # we must convert underscores to spaces.
535                      my @subsystems = $fig->peg_to_subsystems($featureID);                      my @subsystems = $fig->peg_to_subsystems($featureID);
536                      for my $subsystem (@subsystems) {                      for my $subsystem (@subsystems) {
537                          # Only proceed if we like this subsystem.                          # Only proceed if we like this subsystem.
# Line 634  Line 580 
580                          push @keywords, 'iedb';                          push @keywords, 'iedb';
581                          $loadFeature->Add('iedb');                          $loadFeature->Add('iedb');
582                      }                      }
583                      # Now we need to bust up hyphenated words in the keyword                      # Now we have some other attributes we need to process. To get
584                      # list. We keep them separate and put them at the end so                      # through them, we convert the attribute list for this feature
585                      # the original word order is available.                      # into a two-layer hash: key => subkey => value.
586                      my $keywordString = "";                      my %attributeHash = ();
587                      my $bustedString = "";                      for my $attrRow (@{$attributes->{$featureID}}) {
588                      for my $keyword (@keywords) {                          my (undef, $key, @values) = @{$attrRow};
589                          if (length $keyword >= 3) {                          my ($realKey, $subKey);
590                              $keywordString .= " $keyword";                          if ($key =~ /^([^:]+)::(.+)/) {
591                              if ($keyword =~ /-/) {                              ($realKey, $subKey) = ($1, $2);
592                                  my @words = split /-/, $keyword;                          } else {
593                                  $bustedString .= join(" ", "", @words);                              ($realKey, $subKey) = ($key, "");
594                              }                          }
595                            if (exists $attributeHash{$1}) {
596                                $attributeHash{$1}->{$2} = \@values;
597                            } else {
598                                $attributeHash{$1} = {$2 => \@values};
599                            }
600                        }
601                        # First we handle CDD. This is a bit complicated, because
602                        # there are multiple CDDs per protein.
603                        if (exists $attributeHash{CDD}) {
604                            # Get the hash of CDD IDs to scores for this feature. We
605                            # already know it exists because of the above IF.
606                            my $cddHash = $attributeHash{CDD};
607                            my @cddData = sort keys %{$cddHash};
608                            for my $cdd (@cddData) {
609                                # Extract the score for this CDD and decode it.
610                                my ($codeScore) = split(/\s*[,;]\s*/, $cddHash->{$cdd}->[0]);
611                                my $realScore = FIGRules::DecodeScore($codeScore);
612                                # We can't afford to crash because of a bad attribute
613                                # value, hence the IF below.
614                                if (! defined($realScore)) {
615                                    # Bad score, so count it.
616                                    $loadFeature->Add('badCDDscore');
617                                    Trace("CDD score \"$codeScore\" for feature $featureID invalid.") if T(3);
618                                } else {
619                                    # Create the connection.
620                                    $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
621                                    # If this CDD does not yet exist, create its record.
622                                    if (! exists $CDD{$cdd}) {
623                                        $CDD{$cdd} = 1;
624                                        $loadCDD->Put($cdd);
625                                    }
626                                }
627                            }
628                        }
629                        # Next we do PSORT cell locations. here the confidence value
630                        # could have the value "unknown", which we translate to -1.
631                        if (exists $attributeHash{PSORT}) {
632                            # This will be a hash of cell locations to confidence
633                            # factors.
634                            my $psortHash = $attributeHash{PSORT};
635                            for my $psort (keys %{$psortHash}) {
636                                # Get the confidence, and convert it to a number if necessary.
637                                my $confidence = $psortHash->{$psort};
638                                if ($confidence eq 'unknown') {
639                                    $confidence = -1;
640                                }
641                                $loadIsPossiblePlaceFor->Put($psort, $featureID, $confidence);
642                                # If this cell location does not yet exist, create its record.
643                                if (! exists $cellLocation{$psort}) {
644                                    $cellLocation{$psort} = 1;
645                                    $loadCellLocation->Put($psort);
646                                }
647                                # If this is a significant location, add it as a keyword.
648                                if ($confidence > 2.5) {
649                                    push @keywords, $psort;
650                                }
651                            }
652                        }
653                        # Phobius data is next. This consists of the signal peptide location and
654                        # the transmembrane locations.
655                        my $signalList = "";
656                        my $transList = "";
657                        if (exists $attributeHash{Phobius}) {
658                            # This will be a hash of two keys (transmembrane and signal) to
659                            # location strings. If there's no value, we stuff in an empty string.
660                            $signalList = GetCommaList($attributeHash{Phobius}->{signal});
661                            $transList = GetCommaList($attributeHash{Phobius}->{transmembrane});
662                        }
663                        # Here are some more numbers: isoelectric point, molecular weight, and
664                        # the similar-to-human flag.
665                        my $isoelectric = 0;
666                        if (exists $attributeHash{isoelectric_point}) {
667                            $isoelectric = $attributeHash{isoelectric_point}->{""};
668                        }
669                        my $similarToHuman = 0;
670                        if (exists $attributeHash{similar_to_human} && $attributeHash{similar_to_human}->{""} eq 'yes') {
671                            $similarToHuman = 1;
672                        }
673                        my $molecularWeight = 0;
674                        if (exists $attributeHash{molecular_weight}) {
675                            $molecularWeight = $attributeHash{molecular_weight}->{""};
676                        }
677                        # Create the keyword string.
678                        my $keywordString = join(" ", @keywords);
679                        Trace("Real keyword string for $featureID: $keywordString.") if T(4);
680                        # Get rid of annoying punctuation.
681                        $keywordString =~ s/[();@#\/]/ /g;
682                        # Get the list of keywords in the keyword string.
683                        my @realKeywords = grep { $biowords->IsWord($_) } $biowords->Split($keywordString);
684                        # We need to do two things here: create the keyword string for the feature table
685                        # and write records to the keyword and stem files. The stuff we write to
686                        # the files will be taken from the following two hashes. The stuff used
687                        # to create the keyword string will be taken from the list.
688                        my (%keys, %stems, @realStems);
689                        for my $keyword (@realKeywords) {
690                            # Compute the stem and phonex for this keyword.
691                            my ($stem, $phonex) = $biowords->StemLookup($keyword);
692                            # Only proceed if a stem comes back. If no stem came back, it's a
693                            # stop word and we throw it away.
694                            if ($stem) {
695                                $keys{$keyword} = $stem;
696                                $stems{$stem} = $phonex;
697                                push @realStems, $stem;
698                          }                          }
699                      }                      }
700                      $keywordString .= $bustedString;                      # Now create the keyword string.
701                      # Get rid of annoying punctuation.                      my $cleanWords = join(" ", @realStems);
                     $keywordString =~ s/[();]//g;  
                     # Clean the keyword list.  
                     my $cleanWords = $sprout->CleanKeywords($keywordString);  
702                      Trace("Keyword string for $featureID: $cleanWords") if T(4);                      Trace("Keyword string for $featureID: $cleanWords") if T(4);
703                      # Create the feature record.                      # Write the stem and keyword records.
704                      $loadFeature->Put($featureID, 1, $user, $quality, $type, $assignment, $cleanWords);                      for my $stem (keys %stems) {
705                            Tracer::PutLine($stemh, [$stem, $stems{$stem}]);
706                        }
707                        for my $key (keys %keys) {
708                            # The stem goes first in this file, because we want to sort
709                            # by stem and then keyword.
710                            Tracer::PutLine($keyh, [$keys{$key}, $key]);
711                        }
712                        # Now we need to process the feature's locations. First, we split them up.
713                        my @locationList = split /\s*,\s*/, $locations;
714                        # Next, we convert them to Sprout location objects.
715                        my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
716                        # Assemble them into a sprout location string for later.
717                        my $locationString = join(", ", map { $_->String } @locObjectList);
718                        # We'll store the sequence length in here.
719                        my $sequenceLength = 0;
720                      # This part is the roughest. We need to relate the features to contig                      # This part is the roughest. We need to relate the features to contig
721                      # locations, and the locations must be split so that none of them exceed                      # locations, and the locations must be split so that none of them exceed
722                      # the maximum segment size. This simplifies the genes_in_region processing                      # the maximum segment size. This simplifies the genes_in_region processing
723                      # for Sprout.                      # for Sprout. To start, we create the location position indicator.
                     my @locationList = split /\s*,\s*/, $locations;  
                     # Create the location position indicator.  
724                      my $i = 1;                      my $i = 1;
725                      # Loop through the locations.                      # Loop through the locations.
726                      for my $location (@locationList) {                      for my $locObject (@locObjectList) {
727                          # Parse the location.                          # Record the length.
728                          my $locObject = BasicLocation->new("$genomeID:$location");                          $sequenceLength += $locObject->Length;
729                          # Split it into a list of chunks.                          # Split this location into a list of chunks.
730                          my @locOList = ();                          my @locOList = ();
731                          while (my $peeling = $locObject->Peel($chunkSize)) {                          while (my $peeling = $locObject->Peel($chunkSize)) {
732                              $loadIsLocatedIn->Add("peeling");                              $loadIsLocatedIn->Add("peeling");
# Line 682  Line 741 
741                              $i++;                              $i++;
742                          }                          }
743                      }                      }
744                        # Now we get some ancillary flags.
745                        my $locked = $fig->is_locked_fid($featureID);
746                        my $in_genbank = $fig->peg_in_gendb($featureID);
747                        # Create the feature record.
748                        $loadFeature->Put($featureID, 1, $user, $quality, $type, $in_genbank, $isoelectric, $locked, $molecularWeight,
749                                          $sequenceLength, $signalList, $similarToHuman, $assignment, $cleanWords, $locationString,
750                                          $transList);
751                    }
752                }
753                Trace("Genome $genomeID processed.") if T(3);
754            }
755        }
756        Trace("Sorting keywords.") if T(2);
757        # Now we need to load the keyword table from the key and stem files.
758        close $keyh;
759        close $stemh;
760        Trace("Loading keywords.") if T(2);
761        $keyh = Open(undef, "<$keyFileName");
762        $stemh = Open(undef, "<$stemFileName");
763        # We'll count the keywords in here, for tracing purposes.
764        my $count = 0;
765        # These variables track the current stem's data. When an incoming
766        # keyword's stem changes, these will be recomputed.
767        my ($currentStem, $currentPhonex, $currentCount);
768        # Prime the loop by reading the first stem in the stem file.
769        my ($nextStem, $nextPhonex) = Tracer::GetLine($stemh);
770        # Loop through the keyword file.
771        while (! eof $keyh) {
772            # Read this keyword.
773            my ($thisStem, $thisKey) = Tracer::GetLine($keyh);
774            # Check to see if it's the new stem yet.
775            if ($thisStem ne $currentStem) {
776                # Yes. It's a terrible error if it's not also the next stem.
777                if ($thisStem ne $nextStem) {
778                    Confess("Error in stem file. Expected \"$nextStem\", but found \"$thisStem\".");
779                } else {
780                    # Here we're okay.
781                    ($currentStem, $currentPhonex) = ($nextStem, $nextPhonex);
782                    # Count the number of features for this stem.
783                    $currentCount = 0;
784                    while ($nextStem eq $thisStem) {
785                        ($nextStem, $nextPhonex) = Tracer::GetLine($stemh);
786                        $currentCount++;
787                    }
788                  }                  }
789              }              }
790            # Now $currentStem is the same as $thisStem, and the other $current-vars
791            # contain the stem's data (phonex and count).
792            $loadKeyword->Put($thisKey, $currentCount, $currentPhonex, $currentStem);
793            if (++$count % 1000 == 0 && T(3)) {
794                Trace("$count keywords loaded.");
795          }          }
796      }      }
797        Trace("$count keywords loaded into keyword table.") if T(2);
798      # Finish the loads.      # Finish the loads.
799      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
800      return $retVal;      return $retVal;
# Line 693  Line 802 
802    
803  =head3 LoadSubsystemData  =head3 LoadSubsystemData
804    
805  C<< my $stats = $spl->LoadSubsystemData(); >>      my $stats = $spl->LoadSubsystemData();
806    
807  Load the subsystem data from FIG into Sprout.  Load the subsystem data from FIG into Sprout.
808    
# Line 709  Line 818 
818      SubsystemClass      SubsystemClass
819      Role      Role
820      RoleEC      RoleEC
821        IsIdentifiedByEC
822      SSCell      SSCell
823      ContainsFeature      ContainsFeature
824      IsGenomeOf      IsGenomeOf
# Line 722  Line 832 
832      ConsistsOfGenomes      ConsistsOfGenomes
833      GenomeSubset      GenomeSubset
834      HasGenomeSubset      HasGenomeSubset
     Catalyzes  
835      Diagram      Diagram
836      RoleOccursIn      RoleOccursIn
837        SubsystemHopeNotes
838    
839  =over 4  =over 4
840    
# Line 750  Line 860 
860      # Get the map list.      # Get the map list.
861      my @maps = $fig->all_maps;      my @maps = $fig->all_maps;
862      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
863      my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly);      my $loadDiagram = $self->_TableLoader('Diagram');
864      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn');
865      my $loadSubsystem = $self->_TableLoader('Subsystem');      my $loadSubsystem = $self->_TableLoader('Subsystem');
866      my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly);      my $loadRole = $self->_TableLoader('Role');
867      my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly);      my $loadRoleEC = $self->_TableLoader('RoleEC');
868      my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly);      my $loadIsIdentifiedByEC = $self->_TableLoader('IsIdentifiedByEC');
869      my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly);      my $loadCatalyzes = $self->_TableLoader('Catalyzes');
870      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly);      my $loadSSCell = $self->_TableLoader('SSCell');
871      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature');
872      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf');
873      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly);      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf');
874      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly);      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem');
875      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly);      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn');
876      my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly);      my $loadHasSSCell = $self->_TableLoader('HasSSCell');
877      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly);      my $loadRoleSubset = $self->_TableLoader('RoleSubset');
878      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly);      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset');
879      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly);      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles');
880      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly);      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes');
881      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly);      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');
882      my $loadSubsystemClass = $self->_TableLoader('SubsystemClass', $self->PrimaryOnly);      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');
883        my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');
884        my $loadSubsystemHopeNotes = $self->_TableLoader('SubsystemHopeNotes');
885      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
886          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
887      } else {      } else {
888          Trace("Generating subsystem data.") if T(2);          Trace("Generating subsystem data.") if T(2);
889          # This hash will contain the role for each EC. When we're done, this          # This hash will contain the roles for each EC. When we're done, this
890          # information will be used to generate the Catalyzes table.          # information will be used to generate the RoleOccursIn table.
891          my %ecToRoles = ();          my %ecToRoles = ();
892          # Loop through the subsystems. Our first task will be to create the          # Loop through the subsystems. Our first task will be to create the
# Line 794  Line 906 
906                  # Create the subsystem record.                  # Create the subsystem record.
907                  my $curator = $sub->get_curator();                  my $curator = $sub->get_curator();
908                  my $notes = $sub->get_notes();                  my $notes = $sub->get_notes();
909                  $loadSubsystem->Put($subsysID, $curator, $notes);                  my $version = $sub->get_version();
910                    my $description = $sub->get_description();
911                    $loadSubsystem->Put($subsysID, $curator, $version, $description, $notes);
912                    # Add the hope notes.
913                    my $hopeNotes = $sub->get_hope_curation_notes();
914                    if ($hopeNotes) {
915                        $loadSubsystemHopeNotes->Put($sub, $hopeNotes);
916                    }
917                  # Now for the classification string. This comes back as a list                  # Now for the classification string. This comes back as a list
918                  # reference and we convert it to a space-delimited string.                  # reference and we convert it to a space-delimited string.
919                  my $classList = $fig->subsystem_classification($subsysID);                  my $classList = $fig->subsystem_classification($subsysID);
# Line 802  Line 921 
921                  $loadSubsystemClass->Put($subsysID, $classString);                  $loadSubsystemClass->Put($subsysID, $classString);
922                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
923                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
924                        # Get the role's abbreviation.
925                        my $abbr = $sub->get_role_abbr($col);
926                        # Get its essentiality.
927                        my $aux = $fig->is_aux_role_in_subsystem($subsysID, $roleID);
928                        # Get its reaction note.
929                        my $hope_note = $sub->get_hope_reaction_notes($roleID) || "";
930                      # Connect to this role.                      # Connect to this role.
931                      $loadOccursInSubsystem->Add("roleIn");                      $loadOccursInSubsystem->Add("roleIn");
932                      $loadOccursInSubsystem->Put($roleID, $subsysID, $col);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $aux, $col, $hope_note);
933                      # If it's a new role, add it to the role table.                      # If it's a new role, add it to the role table.
934                      if (! exists $roleData{$roleID}) {                      if (! exists $roleData{$roleID}) {
935                          # Get the role's abbreviation.                          # Get the role's abbreviation.
                         my $abbr = $sub->get_role_abbr($col);  
936                          # Add the role.                          # Add the role.
937                          $loadRole->Put($roleID, $abbr);                          $loadRole->Put($roleID);
938                          $roleData{$roleID} = 1;                          $roleData{$roleID} = 1;
939                          # Check for an EC number.                          # Check for an EC number.
940                          if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) {                          if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
941                              my $ec = $1;                              my $ec = $1;
942                              $loadRoleEC->Put($roleID, $ec);                              $loadIsIdentifiedByEC->Put($roleID, $ec);
943                              $ecToRoles{$ec} = $roleID;                              # Check to see if this is our first encounter with this EC.
944                                if (exists $ecToRoles{$ec}) {
945                                    # No, so just add this role to the EC list.
946                                    push @{$ecToRoles{$ec}}, $roleID;
947                                } else {
948                                    # Output this EC.
949                                    $loadRoleEC->Put($ec);
950                                    # Create its role list.
951                                    $ecToRoles{$ec} = [$roleID];
952                                }
953                          }                          }
954                      }                      }
955                  }                  }
# Line 929  Line 1062 
1062              # Now we need to link all the map's roles to it.              # Now we need to link all the map's roles to it.
1063              # A hash is used to prevent duplicates.              # A hash is used to prevent duplicates.
1064              my %roleHash = ();              my %roleHash = ();
1065              for my $role ($fig->map_to_ecs($map)) {              for my $ec ($fig->map_to_ecs($map)) {
1066                  if (exists $ecToRoles{$role} && ! $roleHash{$role}) {                  if (exists $ecToRoles{$ec}) {
1067                      $loadRoleOccursIn->Put($ecToRoles{$role}, $map);                      for my $role (@{$ecToRoles{$ec}}) {
1068                            if (! $roleHash{$role}) {
1069                                $loadRoleOccursIn->Put($role, $map);
1070                      $roleHash{$role} = 1;                      $roleHash{$role} = 1;
1071                  }                  }
1072              }              }
1073          }          }
         # Before we leave, we must create the Catalyzes table. We start with the reactions,  
         # then use the "ecToRoles" table to convert EC numbers to role IDs.  
         my @reactions = $fig->all_reactions();  
         for my $reactionID (@reactions) {  
             # Get this reaction's list of roles. The results will be EC numbers.  
             my @roles = $fig->catalyzed_by($reactionID);  
             # Loop through the roles, creating catalyzation records.  
             for my $thisRole (@roles) {  
                 if (exists $ecToRoles{$thisRole}) {  
                     $loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID);  
                 }  
1074              }              }
1075          }          }
1076      }      }
# Line 957  Line 1081 
1081    
1082  =head3 LoadPropertyData  =head3 LoadPropertyData
1083    
1084  C<< my $stats = $spl->LoadPropertyData(); >>      my $stats = $spl->LoadPropertyData();
1085    
1086  Load the attribute data from FIG into Sprout.  Load the attribute data from FIG into Sprout.
1087    
# Line 993  Line 1117 
1117      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1118      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1119      my $loadProperty = $self->_TableLoader('Property');      my $loadProperty = $self->_TableLoader('Property');
1120      my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly);      my $loadHasProperty = $self->_TableLoader('HasProperty');
1121      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1122          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1123      } else {      } else {
# Line 1002  Line 1126 
1126          my %propertyKeys = ();          my %propertyKeys = ();
1127          my $nextID = 1;          my $nextID = 1;
1128          # Get the attributes we intend to store in the property table.          # Get the attributes we intend to store in the property table.
1129          my @propKeys = $fig->get_group_keys("NMPDR");          my $propKeys = $self->{propKeys};
1130          # Loop through the genomes.          # Loop through the genomes.
1131          for my $genomeID (sort keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
1132              $loadProperty->Add("genomeIn");              $loadProperty->Add("genomeIn");
# Line 1010  Line 1134 
1134              # Initialize a counter.              # Initialize a counter.
1135              my $propertyCount = 0;              my $propertyCount = 0;
1136              # Get the properties for this genome's features.              # Get the properties for this genome's features.
1137              my @attributes = $fig->get_attributes("fig|$genomeID%", \@propKeys);              my @attributes = $fig->get_attributes("fig|$genomeID%", $propKeys);
1138              Trace("Property list built for $genomeID.") if T(3);              Trace("Property list built for $genomeID.") if T(3);
1139              # Loop through the results, creating HasProperty records.              # Loop through the results, creating HasProperty records.
1140              for my $attributeData (@attributes) {              for my $attributeData (@attributes) {
# Line 1045  Line 1169 
1169    
1170  =head3 LoadAnnotationData  =head3 LoadAnnotationData
1171    
1172  C<< my $stats = $spl->LoadAnnotationData(); >>      my $stats = $spl->LoadAnnotationData();
1173    
1174  Load the annotation data from FIG into Sprout.  Load the annotation data from FIG into Sprout.
1175    
# Line 1079  Line 1203 
1203      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1204      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1205      my $loadAnnotation = $self->_TableLoader('Annotation');      my $loadAnnotation = $self->_TableLoader('Annotation');
1206      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation');
1207      my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly);      my $loadSproutUser = $self->_TableLoader('SproutUser');
1208      my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly);      my $loadUserAccess = $self->_TableLoader('UserAccess');
1209      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation');
1210      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1211          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1212      } else {      } else {
# Line 1152  Line 1276 
1276    
1277  =head3 LoadSourceData  =head3 LoadSourceData
1278    
1279  C<< my $stats = $spl->LoadSourceData(); >>      my $stats = $spl->LoadSourceData();
1280    
1281  Load the source data from FIG into Sprout.  Load the source data from FIG into Sprout.
1282    
# Line 1186  Line 1310 
1310      # Get the genome hash.      # Get the genome hash.
1311      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1312      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1313      my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly);      my $loadComesFrom = $self->_TableLoader('ComesFrom');
1314      my $loadSource = $self->_TableLoader('Source');      my $loadSource = $self->_TableLoader('Source');
1315      my $loadSourceURL = $self->_TableLoader('SourceURL');      my $loadSourceURL = $self->_TableLoader('SourceURL');
1316      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
# Line 1230  Line 1354 
1354    
1355  =head3 LoadExternalData  =head3 LoadExternalData
1356    
1357  C<< my $stats = $spl->LoadExternalData(); >>      my $stats = $spl->LoadExternalData();
1358    
1359  Load the external data from FIG into Sprout.  Load the external data from FIG into Sprout.
1360    
# Line 1310  Line 1434 
1434    
1435  =head3 LoadReactionData  =head3 LoadReactionData
1436    
1437  C<< my $stats = $spl->LoadReactionData(); >>      my $stats = $spl->LoadReactionData();
1438    
1439  Load the reaction data from FIG into Sprout.  Load the reaction data from FIG into Sprout.
1440    
# Line 1323  Line 1447 
1447      Compound      Compound
1448      CompoundName      CompoundName
1449      CompoundCAS      CompoundCAS
1450        IsIdentifiedByCAS
1451        HasCompoundName
1452      IsAComponentOf      IsAComponentOf
1453        Scenario
1454        Catalyzes
1455        HasScenario
1456        IsInputFor
1457        IsOutputOf
1458        ExcludesReaction
1459        IncludesReaction
1460        IsOnDiagram
1462    
1463  This method proceeds reaction by reaction rather than genome by genome.  This method proceeds reaction by reaction rather than genome by genome.
1464    
# Line 1344  Line 1479 
1479      my $fig = $self->{fig};      my $fig = $self->{fig};
1480      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1481      my $loadReaction = $self->_TableLoader('Reaction');      my $loadReaction = $self->_TableLoader('Reaction');
1482      my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly);      my $loadReactionURL = $self->_TableLoader('ReactionURL');
1483      my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly);      my $loadCompound = $self->_TableLoader('Compound');
1484      my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly);      my $loadCompoundName = $self->_TableLoader('CompoundName');
1485      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly);      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS');
1486      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly);      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf');
1487        my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS');
1488        my $loadHasCompoundName = $self->_TableLoader('HasCompoundName');
1489        my $loadScenario = $self->_TableLoader('Scenario');
1490        my $loadHasScenario = $self->_TableLoader('HasScenario');
1491        my $loadIsInputFor = $self->_TableLoader('IsInputFor');
1492        my $loadIsOutputOf = $self->_TableLoader('IsOutputOf');
1493        my $loadIsOnDiagram = $self->_TableLoader('IsOnDiagram');
1494        my $loadIncludesReaction = $self->_TableLoader('IncludesReaction');
1495        my $loadExcludesReaction = $self->_TableLoader('ExcludesReaction');
1496        my $loadCatalyzes = $self->_TableLoader('Catalyzes');
1497      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1498          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1499      } else {      } else {
1500          Trace("Generating annotation data.") if T(2);          Trace("Generating reaction data.") if T(2);
1501            # We need some hashes to prevent duplicates.
1502            my %compoundNames = ();
1503            my %compoundCASes = ();
1504          # First we create the compounds.          # First we create the compounds.
1505          my @compounds = $fig->all_compounds();          my %compounds = map { $_ => 1 } $fig->all_compounds();
1506          for my $cid (@compounds) {          for my $cid (keys %compounds) {
1507              # Check for names.              # Check for names.
1508              my @names = $fig->names_of_compound($cid);              my @names = $fig->names_of_compound($cid);
1509              # Each name will be given a priority number, starting with 1.              # Each name will be given a priority number, starting with 1.
1510              my $prio = 1;              my $prio = 1;
1511              for my $name (@names) {              for my $name (@names) {
1512                  $loadCompoundName->Put($cid, $name, $prio++);                  if (! exists $compoundNames{$name}) {
1513                        $loadCompoundName->Put($name);
1514                        $compoundNames{$name} = 1;
1515                    }
1516                    $loadHasCompoundName->Put($cid, $name, $prio++);
1517              }              }
1518              # Create the main compound record. Note that the first name              # Create the main compound record. Note that the first name
1519              # becomes the label.              # becomes the label.
# Line 1370  Line 1522 
1522              # Check for a CAS ID.              # Check for a CAS ID.
1523              my $cas = $fig->cas($cid);              my $cas = $fig->cas($cid);
1524              if ($cas) {              if ($cas) {
1525                  $loadCompoundCAS->Put($cid, $cas);                  $loadIsIdentifiedByCAS->Put($cid, $cas);
1526                    if (! exists $compoundCASes{$cas}) {
1527                        $loadCompoundCAS->Put($cas);
1528                        $compoundCASes{$cas} = 1;
1529                    }
1530              }              }
1531          }          }
1532          # All the compounds are set up, so we need to loop through the reactions next. First,          # All the compounds are set up, so we need to loop through the reactions next. First,
1533          # we initialize the discriminator index. This is a single integer used to insure          # we initialize the discriminator index. This is a single integer used to insure
1534          # duplicate elements in a reaction are not accidentally collapsed.          # duplicate elements in a reaction are not accidentally collapsed.
1535          my $discrim = 0;          my $discrim = 0;
1536          my @reactions = $fig->all_reactions();          my %reactions = map { $_ => 1 } $fig->all_reactions();
1537          for my $reactionID (@reactions) {          for my $reactionID (keys %reactions) {
1538              # Create the reaction record.              # Create the reaction record.
1539              $loadReaction->Put($reactionID, $fig->reversible($reactionID));              $loadReaction->Put($reactionID, $fig->reversible($reactionID));
1540              # Compute the reaction's URL.              # Compute the reaction's URL.
# Line 1401  Line 1557 
1557                  }                  }
1558              }              }
1559          }          }
1560            # Now we run through the subsystems and roles, generating the scenarios
1561            # and connecting the reactions. We'll need some hashes to prevent
1562            # duplicates and a counter for compound group keys.
1563            my %roles = ();
1564            my %scenarios = ();
1565            my @subsystems = $fig->all_subsystems();
1566            for my $subName (@subsystems) {
1567                my $sub = $fig->get_subsystem($subName);
1568                Trace("Processing $subName reactions.") if T(3);
1569                # Get the subsystem's reactions.
1570                my %reactions = $sub->get_hope_reactions();
1571                # Loop through the roles, connecting them to the reactions.
1572                for my $role (keys %reactions) {
1573                    # Only process this role if it is new.
1574                    if (! $roles{$role}) {
1575                        $roles{$role} = 1;
1576                        my @reactions = @{$reactions{$role}};
1577                        for my $reaction (@reactions) {
1578                            $loadCatalyzes->Put($role, $reaction);
1579                        }
1580                    }
1581                }
1582                Trace("Processing $subName scenarios.") if T(3);
1583                # Get the subsystem's scenarios.
1584                my @scenarioNames = $sub->get_hope_scenario_names();
1585                # Loop through the scenarios, creating scenario data.
1586                for my $scenarioName (@scenarioNames) {
1587                    # Link this scenario to this subsystem.
1588                    $loadHasScenario->Put($subName, $scenarioName);
1589                    # If this scenario is new, we need to create it.
1590                    if (! $scenarios{$scenarioName}) {
1591                        Trace("Creating scenario $scenarioName.") if T(3);
1592                        $scenarios{$scenarioName} = 1;
1593                        # Create the scenario itself.
1594                        $loadScenario->Put($scenarioName);
1595                        # Attach the input compounds.
1596                        for my $input ($sub->get_hope_input_compounds($scenarioName)) {
1597                            $loadIsInputFor->Put($input, $scenarioName);
1598                        }
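1599                        # Now we need to set up the output compounds. They come in two
1600                        # groups, intended to be marked 0 and 1. NOTE(review): the loop below re-declares $outputGroup, so Put() receives the group's list reference, not 0 or 1.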
1601                        my $outputGroup = 0;
1602                        # Set up the output compounds.
1603                        for my $outputGroup ($sub->get_hope_output_compounds($scenarioName)) {
1604                            # Attach the compounds.
1605                            for my $compound (@$outputGroup) {
1606                                $loadIsOutputOf->Put($scenarioName, $compound, $outputGroup);
1607                            }
1608                        }
1609                        # Create the reaction lists.
1610                        my @addReactions = $sub->get_hope_additional_reactions($scenarioName);
1611                        for my $reaction (@addReactions) {
1612                            $loadIncludesReaction->Put($scenarioName, $reaction);
1613                        }
1614                        my @notReactions = $sub->get_hope_ignore_reactions($scenarioName);
1615                        for my $reaction (@notReactions) {
1616                            $loadExcludesReaction->Put($scenarioName, $reaction);
1617                        }
1618                        # Link the maps.
1619                        my @maps = $sub->get_hope_map_ids($scenarioName);
1620                        for my $map (@maps) {
1621                            $loadIsOnDiagram->Put($scenarioName, "map$map");
1622                        }
1623                    }
1624      }      }
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
1625  }  }
   
 =head3 LoadGroupData  
   
 C<< my $stats = $spl->LoadGroupData(); >>  
   
 Load the genome Groups into Sprout.  
   
 The following relations are loaded by this method.  
   
     GenomeGroups  
   
 Currently, we do not use groups. We used to use them for NMPDR groups,  
 but there is no direct support for genome groups in FIG, so we access the SEED  
 files directly.  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadGroupData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the genome hash.  
     my $genomeHash = $self->{genomes};  
     # Create a load object for the table we're loading.  
     my $loadGenomeGroups = $self->_TableLoader('GenomeGroups');  
     if ($self->{options}->{loadOnly}) {  
         Trace("Loading from existing files.") if T(2);  
     } else {  
         Trace("Generating group data.") if T(2);  
         # Currently there are no groups.  
1626      }      }
1627      # Finish the load.      # Finish the load.
1628      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 1453  Line 1631 
1631    
1632  =head3 LoadSynonymData  =head3 LoadSynonymData
1633    
1634  C<< my $stats = $spl->LoadSynonymData(); >>      my $stats = $spl->LoadSynonymData();
1635    
1636  Load the synonym groups into Sprout.  Load the synonym groups into Sprout.
1637    
# Line 1492  Line 1670 
1670          Trace("Generating synonym group data.") if T(2);          Trace("Generating synonym group data.") if T(2);
1671          # Get the database handle.          # Get the database handle.
1672          my $dbh = $fig->db_handle();          my $dbh = $fig->db_handle();
1673          # Ask for the synonyms.          # Ask for the synonyms. Note that "maps_to" is a group name, and "syn_id" is a PEG ID or alias.
1674          my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");          my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
1675          my $result = $sth->execute();          my $result = $sth->execute();
1676          if (! defined($result)) {          if (! defined($result)) {
1677              Confess("Database error in Synonym load: " . $sth->errstr());              Confess("Database error in Synonym load: " . $sth->errstr());
1678          } else {          } else {
1679                Trace("Processing synonym results.") if T(2);
1680              # Remember the current synonym.              # Remember the current synonym.
1681              my $current_syn = "";              my $current_syn = "";
1682              # Count the features.              # Count the features.
1683              my $featureCount = 0;              my $featureCount = 0;
1684                my $entryCount = 0;
1685              # Loop through the synonym/peg pairs.              # Loop through the synonym/peg pairs.
1686              while (my @row = $sth->fetchrow()) {              while (my @row = $sth->fetchrow()) {
1687                  # Get the synonym ID and feature ID.                  # Get the synonym group ID and feature ID.
1688                  my ($syn_id, $peg) = @row;                  my ($syn_id, $peg) = @row;
1689                    # Count this row.
1690                    $entryCount++;
1691                    if ($entryCount % 1000 == 0) {
1692                        Trace("$entryCount rows processed.") if T(3);
1693                    }
1694                  # Insure it's for one of our genomes.                  # Insure it's for one of our genomes.
1695                  my $genomeID = FIG::genome_of($peg);                  my $genomeID = FIG::genome_of($peg);
1696                  if (exists $genomeHash->{$genomeID}) {                  if (exists $genomeHash->{$genomeID}) {
# Line 1524  Line 1709 
1709                      }                      }
1710                  }                  }
1711              }              }
1712                Trace("$entryCount rows produced $featureCount features.") if T(2);
1713          }          }
1714      }      }
1715      # Finish the load.      # Finish the load.
# Line 1533  Line 1719 
1719    
1720  =head3 LoadFamilyData  =head3 LoadFamilyData
1721    
1722  C<< my $stats = $spl->LoadFamilyData(); >>      my $stats = $spl->LoadFamilyData();
1723    
1724  Load the protein families into Sprout.  Load the protein families into Sprout.
1725    
# Line 1601  Line 1787 
1787    
1788  =head3 LoadDrugData  =head3 LoadDrugData
1789    
1790  C<< my $stats = $spl->LoadDrugData(); >>      my $stats = $spl->LoadDrugData();
1791    
1792  Load the drug target data into Sprout.  Load the drug target data into Sprout.
1793    
# Line 1735  Line 1921 
1921                          # Decode the score.                          # Decode the score.
1922                          my $realScore = FIGRules::DecodeScore($score);                          my $realScore = FIGRules::DecodeScore($score);
1923                          # Connect the PDB to the feature.                          # Connect the PDB to the feature.
1924                          $loadIsProteinForFeature->Put($pdbData->[0], $pdbID, $start, $realScore, $end);                          $loadIsProteinForFeature->Put($pdbID, $pdbData->[0], $start, $realScore, $end);
1925                      }                      }
1926                  }                  }
1927              }              }
# Line 1800  Line 1986 
1986    
1987  =head3 SpecialAttribute  =head3 SpecialAttribute
1988    
1989  C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >>      my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader);
1990    
1991  Look for special attributes of a given type. A special attribute is found by comparing one of  Look for special attributes of a given type. A special attribute is found by comparing one of
1992  the columns of the incoming attribute list to a search pattern. If a match is found, then  the columns of the incoming attribute list to a search pattern. If a match is found, then
# Line 1890  Line 2076 
2076    
2077  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
2078    
 =item ignore  
   
 TRUE if the table should be ignored entirely, else FALSE.  
   
2079  =item RETURN  =item RETURN
2080    
2081  Returns an ERDBLoad object for loading the specified table.  Returns an ERDBLoad object for loading the specified table.
# Line 1904  Line 2086 
2086    
2087  sub _TableLoader {  sub _TableLoader {
2088      # Get the parameters.      # Get the parameters.
2089      my ($self, $tableName, $ignore) = @_;      my ($self, $tableName) = @_;
2090      # Create the load object.      # Create the load object.
2091      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly,      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly);
                                $ignore);  
2092      # Cache it in the loader list.      # Cache it in the loader list.
2093      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
2094      # Return it to the caller.      # Return it to the caller.
# Line 1981  Line 2162 
2162    
2163  =head3 GetGenomeAttributes  =head3 GetGenomeAttributes
2164    
2165  C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids); >>      my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys);
2166    
2167  Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related  Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
2168  attributes for all the features of a genome in a single call, then organizes them into  attributes for all the features of a genome in a single call, then organizes them into
# Line 2001  Line 2182 
2182    
2183  Reference to a list of the feature IDs whose attributes are to be kept.  Reference to a list of the feature IDs whose attributes are to be kept.
2184    
2185    =item propKeys
2186    
2187    Reference to a list of the attribute keys to retrieve.
2188    
2189  =item RETURN  =item RETURN
2190    
2191  Returns a reference to a hash. The key of the hash is the feature ID. The value is the  Returns a reference to a hash. The key of the hash is the feature ID. The value is the
# Line 2013  Line 2198 
2198    
2199  sub GetGenomeAttributes {  sub GetGenomeAttributes {
2200      # Get the parameters.      # Get the parameters.
2201      my ($fig, $genomeID, $fids) = @_;      my ($fig, $genomeID, $fids, $propKeys) = @_;
2202      # Declare the return variable.      # Declare the return variable.
2203      my $retVal = {};      my $retVal = {};
     # Get a list of the attributes we care about.  
     my @propKeys = $fig->get_group_keys("NMPDR");  
     # Get the attributes.  
     my @aList = $fig->get_attributes("fig|$genomeID%", \@propKeys);  
2204      # Initialize the hash. This not only enables us to easily determine which FIDs to      # Initialize the hash. This not only enables us to easily determine which FIDs to
2205      # keep, it insures that the caller sees a list reference for every known fid,      # keep, it insures that the caller sees a list reference for every known fid,
2206      # simplifying the logic.      # simplifying the logic.
2207      for my $fid (@{$fids}) {      for my $fid (@{$fids}) {
2208          $retVal->{$fid} = [];          $retVal->{$fid} = [];
2209      }      }
2210      # Populate the hash.      # Get the attributes. If ev_code_cron is running, we may get a timeout error, so
2211        # an eval is used.
2212        my @aList = ();
2213        eval {
2214            @aList = $fig->get_attributes("fig|$genomeID%", $propKeys);
2215            Trace(scalar(@aList) . " attributes returned for genome $genomeID.") if T(3);
2216        };
2217        # Check for a problem.
2218        if ($@) {
2219            Trace("Retrying attributes for $genomeID due to error: $@") if T(1);
2220            # Our fallback plan is to process the attributes in blocks of 100. This is much slower,
2221            # but allows us to continue processing.
2222            my $nFids = scalar @{$fids};
2223            for (my $i = 0; $i < $nFids; $i += 100) {
2224                # Determine the index of the last feature ID we'll be specifying on this pass.
2225                # Normally it's $i + 99, but if we're close to the end it may be less.
2226                my $end = ($i + 100 > $nFids ? $nFids - 1 : $i + 99);
2227                # Get a slice of the fid list.
2228                my @slice = @{$fids}[$i .. $end];
2229                # Get the relevant attributes.
2230                Trace("Retrieving attributes for fids $i to $end.") if T(3);
2231                my @aShort = $fig->get_attributes(\@slice, $propKeys);
2232                Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(3);
2233                push @aList, @aShort;
2234            }
2235        }
2236        # Now we should have all the interesting attributes in @aList. Populate the hash with
2237        # them.
2238      for my $aListEntry (@aList) {      for my $aListEntry (@aList) {
2239          my $fid = $aListEntry->[0];          my $fid = $aListEntry->[0];
2240          if (exists $retVal->{$fid}) {          if (exists $retVal->{$fid}) {
# Line 2037  Line 2245 
2245      return $retVal;      return $retVal;
2246  }  }
2247    
2248    =head3 GetCommaList
2249    
2250        my $string = GetCommaList($value);
2251    
2252    Create a comma-separated string from the values in a list reference. If the
2253    input is a scalar rather than a list reference, it is returned as a string. If
2254    it is undefined, an empty string is returned. The idea is that we may be
2255    looking at a string, a list, or nothing, but whatever comes out will be a
2256    string.
2257    
2258    =over 4
2259    
2260    =item value
2261    
2262    Reference to a list of values to be assembled into the return string, or a scalar, or undefined.
2263    
2264    =item RETURN
2265    
2266    Returns a scalar string containing the content of the input value.
2267    
2268    =back
2269    
2270    =cut
2271    
2272    sub GetCommaList {
2273        # Get the parameters.
2274        my ($value) = @_;
2275        # Declare the return variable.
2276        my $retVal = "";
2277        # Only proceed if we have an input value.
2278        if (defined $value) {
2279            # Analyze the input value.
2280            if (ref $value eq 'ARRAY') {
2281                # Here it's a list reference.
2282                $retVal = join(", ", @$value);
2283            } else {
2284                # Here it's not. Flatten it to a scalar.
2285                $retVal = "$value";
2286            }
2287        }
2288        # Return the result.
2289        return $retVal;
2290    }
2291    
2292    
2293  1;  1;

Legend:
Removed from v.1.84  
changed lines
  Added in v.1.95

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3