[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.32, Sun Apr 2 17:34:44 2006 UTC revision 1.96, Thu Sep 25 23:28:28 2008 UTC
# Line 7  Line 7 
7      use PageBuilder;      use PageBuilder;
8      use ERDBLoad;      use ERDBLoad;
9      use FIG;      use FIG;
10        use FIGRules;
11      use Sprout;      use Sprout;
12      use Stats;      use Stats;
13      use BasicLocation;      use BasicLocation;
14      use HTML;      use HTML;
15        use AliasAnalysis;
16        use BioWords;
17    
18  =head1 Sprout Load Methods  =head1 Sprout Load Methods
19    
# Line 30  Line 33 
33      $stats->Accumulate($spl->LoadFeatureData());      $stats->Accumulate($spl->LoadFeatureData());
34      print $stats->Show();      print $stats->Show();
35    
 This module makes use of the internal Sprout property C<_erdb>.  
   
36  It is worth noting that the FIG object does not need to be a real one. Any object  It is worth noting that the FIG object does not need to be a real one. Any object
37  that implements the FIG methods for data retrieval could be used. So, for example,  that implements the FIG methods for data retrieval could be used. So, for example,
38  this object could be used to copy data from one Sprout database to another, or  this object could be used to copy data from one Sprout database to another, or
# Line 52  Line 53 
53    
54  =head3 new  =head3 new
55    
56  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>      my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options);
57    
58  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
59  the names of the files containing the list of genomes and subsystems to use.  the names of the files containing the list of genomes and subsystems to use.
# Line 80  Line 81 
81  =item subsysFile  =item subsysFile
82    
83  Either the name of the file containing the list of trusted subsystems or a reference  Either the name of the file containing the list of trusted subsystems or a reference
84  to a list of subsystem names. If nothing is specified, all known subsystems will be  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
85  considered trusted. Only subsystem data related to the trusted subsystems is loaded.  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
86    in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded.
87    
88  =item options  =item options
89    
# Line 94  Line 96 
96  sub new {  sub new {
97      # Get the parameters.      # Get the parameters.
98      my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;      my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
99      # Load the list of genomes into a hash.      # Create the genome hash.
100      my %genomes;      my %genomes = ();
101        # We only need it if load-only is NOT specified.
102        if (! $options->{loadOnly}) {
103      if (! defined($genomeFile) || $genomeFile eq '') {      if (! defined($genomeFile) || $genomeFile eq '') {
104          # Here we want all the complete genomes and an access code of 1.          # Here we want all the complete genomes and an access code of 1.
105          my @genomeList = $fig->genomes(1);          my @genomeList = $fig->genomes(1);
106          %genomes = map { $_ => 1 } @genomeList;          %genomes = map { $_ => 1 } @genomeList;
107                Trace(scalar(keys %genomes) . " genomes found.") if T(3);
108      } else {      } else {
109          my $type = ref $genomeFile;          my $type = ref $genomeFile;
110          Trace("Genome file parameter type is \"$type\".") if T(3);          Trace("Genome file parameter type is \"$type\".") if T(3);
# Line 119  Line 124 
124                  # an omitted access code can be defaulted to 1.                  # an omitted access code can be defaulted to 1.
125                  for my $genomeLine (@genomeList) {                  for my $genomeLine (@genomeList) {
126                      my ($genomeID, $accessCode) = split("\t", $genomeLine);                      my ($genomeID, $accessCode) = split("\t", $genomeLine);
127                      if (undef $accessCode) {                          if (! defined($accessCode)) {
128                          $accessCode = 1;                          $accessCode = 1;
129                      }                      }
130                      $genomes{$genomeID} = $accessCode;                      $genomes{$genomeID} = $accessCode;
# Line 129  Line 134 
134              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
135          }          }
136      }      }
137        }
138      # Load the list of trusted subsystems.      # Load the list of trusted subsystems.
139      my %subsystems = ();      my %subsystems = ();
140        # We only need it if load-only is NOT specified.
141        if (! $options->{loadOnly}) {
142      if (! defined $subsysFile || $subsysFile eq '') {      if (! defined $subsysFile || $subsysFile eq '') {
143          # Here we want all the subsystems.              # Here we want all the usable subsystems. First we get the whole list.
144          %subsystems = map { $_ => 1 } $fig->all_subsystems();              my @subs = $fig->all_subsystems();
145                # Loop through, checking for the NMPDR file.
146                for my $sub (@subs) {
147                    if ($fig->nmpdr_subsystem($sub)) {
148                        $subsystems{$sub} = 1;
149                    }
150                }
151      } else {      } else {
152          my $type = ref $subsysFile;          my $type = ref $subsysFile;
153          if ($type eq 'ARRAY') {          if ($type eq 'ARRAY') {
# Line 153  Line 167 
167              Confess("Invalid subsystem parameter in SproutLoad constructor.");              Confess("Invalid subsystem parameter in SproutLoad constructor.");
168          }          }
169      }      }
170            # Go through the subsys hash again, creating the keyword list for each subsystem.
171            for my $subsystem (keys %subsystems) {
172                my $name = $subsystem;
173                $name =~ s/_/ /g;
174                $subsystems{$subsystem} = $name;
175            }
176        }
177        # Get the list of NMPDR-oriented attribute keys.
178        my @propKeys = $fig->get_group_keys("NMPDR");
179      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
180      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
181      # Create the Sprout load object.      # Create the Sprout load object.
# Line 162  Line 185 
185                    subsystems => \%subsystems,                    subsystems => \%subsystems,
186                    sprout => $sprout,                    sprout => $sprout,
187                    loadDirectory => $directory,                    loadDirectory => $directory,
188                    erdb => $sprout->{_erdb},                    erdb => $sprout,
189                    loaders => [],                    loaders => [],
190                    options => $options                    options => $options,
191                      propKeys => \@propKeys,
192                   };                   };
193      # Bless and return it.      # Bless and return it.
194      bless $retVal, $class;      bless $retVal, $class;
# Line 173  Line 197 
197    
198  =head3 LoadOnly  =head3 LoadOnly
199    
200  C<< my $flag = $spl->LoadOnly; >>      my $flag = $spl->LoadOnly;
201    
202  Return TRUE if we are in load-only mode, else FALSE.  Return TRUE if we are in load-only mode, else FALSE.
203    
# Line 184  Line 208 
208      return $self->{options}->{loadOnly};      return $self->{options}->{loadOnly};
209  }  }
210    
 =head3 PrimaryOnly  
   
 C<< my $flag = $spl->PrimaryOnly; >>  
   
 Return TRUE if only the main entity is to be loaded, else FALSE.  
   
 =cut  
   
 sub PrimaryOnly {  
     my ($self) = @_;  
     return $self->{options}->{primaryOnly};  
 }  
211    
212  =head3 LoadGenomeData  =head3 LoadGenomeData
213    
214  C<< my $stats = $spl->LoadGenomeData(); >>      my $stats = $spl->LoadGenomeData();
215    
216  Load the Genome, Contig, and Sequence data from FIG into Sprout.  Load the Genome, Contig, and Sequence data from FIG into Sprout.
217    
# Line 236  Line 248 
248      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
249      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
250      my $loadGenome = $self->_TableLoader('Genome');      my $loadGenome = $self->_TableLoader('Genome');
251      my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly);      my $loadHasContig = $self->_TableLoader('HasContig');
252      my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly);      my $loadContig = $self->_TableLoader('Contig');
253      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf');
254      my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly);      my $loadSequence = $self->_TableLoader('Sequence');
255      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
256          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
257      } else {      } else {
258          Trace("Generating genome data.") if T(2);          Trace("Generating genome data.") if T(2);
259            # Get the full info for the FIG genomes.
260            my %genomeInfo = map { $_->[0] => { gname => $_->[1], szdna => $_->[2], maindomain => $_->[3],
261                                                pegs => $_->[4], rnas => $_->[5], complete => $_->[6] } } @{$fig->genome_info()};
262          # Now we loop through the genomes, generating the data for each one.          # Now we loop through the genomes, generating the data for each one.
263          for my $genomeID (sort keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
264              Trace("Generating data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
# Line 255  Line 270 
270              my $extra = join " ", @extraData;              my $extra = join " ", @extraData;
271              # Get the full taxonomy.              # Get the full taxonomy.
272              my $taxonomy = $fig->taxonomy_of($genomeID);              my $taxonomy = $fig->taxonomy_of($genomeID);
273                # Get the version. If no version is specified, we default to the genome ID by itself.
274                my $version = $fig->genome_version($genomeID);
275                if (! defined($version)) {
276                    $version = $genomeID;
277                }
278                # Get the DNA size.
279                my $dnaSize = $fig->genome_szdna($genomeID);
280                # Open the NMPDR group file for this genome.
281                my $group;
282                if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
283                    defined($group = <TMP>)) {
284                    # Clean the line ending.
285                    chomp $group;
286                } else {
287                    # No group, so use the default.
288                    $group = $FIG_Config::otherGroup;
289                }
290                close TMP;
291                # Get the contigs.
292                my @contigs = $fig->all_contigs($genomeID);
293                # Get this genome's info array.
294                my $info = $genomeInfo{$genomeID};
295              # Output the genome record.              # Output the genome record.
296              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $info->{complete}, scalar(@contigs),
297                               $species, $extra, $taxonomy);                               $dnaSize, $genus, $info->{pegs}, $group, $info->{rnas}, $species, $extra, $version, $taxonomy);
298              # Now we loop through each of the genome's contigs.              # Now we loop through each of the genome's contigs.
             my @contigs = $fig->all_contigs($genomeID);  
299              for my $contigID (@contigs) {              for my $contigID (@contigs) {
300                  Trace("Processing contig $contigID for $genomeID.") if T(4);                  Trace("Processing contig $contigID for $genomeID.") if T(4);
301                  $loadContig->Add("contigIn");                  $loadContig->Add("contigIn");
# Line 295  Line 331 
331      return $retVal;      return $retVal;
332  }  }
333    
 =head3 LoadCouplingData  
   
 C<< my $stats = $spl->LoadCouplingData(); >>  
   
 Load the coupling and evidence data from FIG into Sprout.  
   
 The coupling data specifies which genome features are functionally coupled. The  
 evidence data explains why the coupling is functional.  
   
 The following relations are loaded by this method.  
   
     Coupling  
     IsEvidencedBy  
     PCH  
     ParticipatesInCoupling  
     UsesAsEvidence  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadCouplingData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the genome hash.  
     my $genomeFilter = $self->{genomes};  
     my $genomeCount = (keys %{$genomeFilter});  
     my $featureCount = $genomeCount * 4000;  
     # Start the loads.  
     my $loadCoupling = $self->_TableLoader('Coupling');  
     my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);  
     my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly);  
     my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly);  
     my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly);  
     if ($self->{options}->{loadOnly}) {  
         Trace("Loading from existing files.") if T(2);  
     } else {  
         Trace("Generating coupling data.") if T(2);  
         # Loop through the genomes found.  
         for my $genome (sort keys %{$genomeFilter}) {  
             Trace("Generating coupling data for $genome.") if T(3);  
             $loadCoupling->Add("genomeIn");  
             # Create a hash table for holding coupled pairs. We use this to prevent  
             # duplicates. For example, if A is coupled to B, we don't want to also  
             # assert that B is coupled to A, because we already know it. Fortunately,  
             # all couplings occur within a genome, so we can keep the hash table  
             # size reasonably small.  
             my %dupHash = ();  
             # Get all of the genome's PEGs.  
             my @pegs = $fig->pegs_of($genome);  
             # Loop through the PEGs.  
             for my $peg1 (@pegs) {  
                 $loadCoupling->Add("pegIn");  
                 Trace("Processing PEG $peg1 for $genome.") if T(4);  
                 # Get a list of the coupled PEGs.  
                 my @couplings = $fig->coupled_to($peg1);  
                 # For each coupled PEG, we need to verify that a coupling already  
                 # exists. If not, we have to create one.  
                 for my $coupleData (@couplings) {  
                     my ($peg2, $score) = @{$coupleData};  
                     # Compute the coupling ID.  
                     my $coupleID = Sprout::CouplingID($peg1, $peg2);  
                     if (! exists $dupHash{$coupleID}) {  
                         $loadCoupling->Add("couplingIn");  
                         # Here we have a new coupling to store in the load files.  
                         Trace("Storing coupling ($coupleID) with score $score.") if T(4);  
                         # Ensure we don't do this again.  
                         $dupHash{$coupleID} = $score;  
                         # Write the coupling record.  
                         $loadCoupling->Put($coupleID, $score);  
                         # Connect it to the coupled PEGs.  
                         $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);  
                         $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);  
                         # Get the evidence for this coupling.  
                         my @evidence = $fig->coupling_evidence($peg1, $peg2);  
                         # Organize the evidence into a hash table.  
                         my %evidenceMap = ();  
                         # Process each evidence item.  
                         for my $evidenceData (@evidence) {  
                             $loadPCH->Add("evidenceIn");  
                             my ($peg3, $peg4, $usage) = @{$evidenceData};  
                             # Only proceed if the evidence is from a Sprout  
                             # genome.  
                             if ($genomeFilter->{$fig->genome_of($peg3)}) {  
                                 $loadUsesAsEvidence->Add("evidenceChosen");  
                                 my $evidenceKey = "$coupleID $peg3 $peg4";  
                                 # We store this evidence in the hash if the usage  
                                 # is nonzero or no prior evidence has been found. This  
                                 # ensures that if there is duplicate evidence, we  
                                 # at least keep the meaningful ones. Only evidence in  
                                 # the hash makes it to the output.  
                                 if ($usage || ! exists $evidenceMap{$evidenceKey}) {  
                                     $evidenceMap{$evidenceKey} = $evidenceData;  
                                 }  
                             }  
                         }  
                         for my $evidenceID (keys %evidenceMap) {  
                             # Create the evidence record.  
                             my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};  
                             $loadPCH->Put($evidenceID, $usage);  
                             # Connect it to the coupling.  
                             $loadIsEvidencedBy->Put($coupleID, $evidenceID);  
                             # Connect it to the features.  
                             $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);  
                             $loadUsesAsEvidence->Put($evidenceID, $peg4, 2);  
                         }  
                     }  
                 }  
             }  
         }  
     }  
     # All done. Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
334  =head3 LoadFeatureData  =head3 LoadFeatureData
335    
336  C<< my $stats = $spl->LoadFeatureData(); >>      my $stats = $spl->LoadFeatureData();
337    
338  Load the feature data from FIG into Sprout.  Load the feature data from FIG into Sprout.
339    
# Line 431  Line 343 
343    
344      Feature      Feature
345      FeatureAlias      FeatureAlias
346        IsAliasOf
347      FeatureLink      FeatureLink
348      FeatureTranslation      FeatureTranslation
349      FeatureUpstream      FeatureUpstream
350      IsLocatedIn      IsLocatedIn
351      HasFeature      HasFeature
352        HasRoleInSubsystem
353        FeatureEssential
354        FeatureVirulent
355        FeatureIEDB
356        CDD
357        IsPresentOnProteinOf
358        CellLocation
359        IsPossiblePlaceFor
360        ExternalDatabase
361        IsAlsoFoundIn
362        Keyword
363    
364  =over 4  =over 4
365    
# Line 450  Line 374 
374  sub LoadFeatureData {  sub LoadFeatureData {
375      # Get this object instance.      # Get this object instance.
376      my ($self) = @_;      my ($self) = @_;
377      # Get the FIG object.      # Get the FIG and Sprout objects.
378      my $fig = $self->{fig};      my $fig = $self->{fig};
379        my $sprout = $self->{sprout};
380      # Get the table of genome IDs.      # Get the table of genome IDs.
381      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
382      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
383      my $loadFeature = $self->_TableLoader('Feature');      my $loadFeature = $self->_TableLoader('Feature');
384      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn');
385      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
386        my $loadIsAliasOf = $self->_TableLoader('IsAliasOf');
387      my $loadFeatureLink = $self->_TableLoader('FeatureLink');      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
388      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
389      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
390      my $loadHasFeature = $self->_TableLoader('HasFeature');      my $loadHasFeature = $self->_TableLoader('HasFeature');
391        my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem');
392        my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
393        my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
394        my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
395        my $loadCDD = $self->_TableLoader('CDD');
396        my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');
397        my $loadCellLocation = $self->_TableLoader('CellLocation');
398        my $loadIsPossiblePlaceFor = $self->_TableLoader('IsPossiblePlaceFor');
399        my $loadIsAlsoFoundIn = $self->_TableLoader('IsAlsoFoundIn');
400        my $loadExternalDatabase = $self->_TableLoader('ExternalDatabase');
401        my $loadKeyword = $self->_TableLoader('Keyword');
402        # Get the subsystem hash.
403        my $subHash = $self->{subsystems};
404        # Get the property keys.
405        my $propKeys = $self->{propKeys};
406        # Create hashes to hold CDD, Cell Location (PSORT), External Database, and alias values.
407        my %CDD = ();
408        my %alias = ();
409        my %cellLocation = ();
410        my %xdb = ();
411        # Create the bio-words object.
412        my $biowords = BioWords->new(exceptions => "$FIG_Config::sproutData/Exceptions.txt",
413                                     stops => "$FIG_Config::sproutData/StopWords.txt",
414                                     cache => 0);
415        # One of the things we have to do here is build the keyword table, and the keyword
416        # table needs to contain the originating text and feature count for each stem. Unfortunately,
417        # the number of distinct keywords is so large it causes PERL to hang if we try to
418        # keep them in memory. As a result, we need to track them using disk files.
419        # Our approach will be to use two sequential files. One will contain stems and phonexes.
420        # Each time a stem occurs in a feature, a record will be written to that file. The stem
421        # file can then be sorted and collated to determine the number of features for each
422        # stem. A separate file will contain keywords and stems. This last file
423        # will be subjected to a sort unique on stem/keyword. The file is then merged
424        # with the stem file to create the keyword table relation (keyword, stem, phonex, count).
425        my $stemFileName = "$FIG_Config::temp/stems$$.tbl";
426        my $keyFileName = "$FIG_Config::temp/keys$$.tbl";
427        my $stemh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -k1,1 >$stemFileName");
428        my $keyh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -u -k1,1 -k2,2 >$keyFileName");
429      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
430      # locations.      # locations.
431      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
# Line 470  Line 434 
434      } else {      } else {
435          Trace("Generating feature data.") if T(2);          Trace("Generating feature data.") if T(2);
436          # Now we loop through the genomes, generating the data for each one.          # Now we loop through the genomes, generating the data for each one.
437          for my $genomeID (sort keys %{$genomeHash}) {          my @allGenomes = sort keys %{$genomeHash};
438            Trace(scalar(@allGenomes) . " genomes found in list.") if T(3);
439            for my $genomeID (@allGenomes) {
440              Trace("Loading features for genome $genomeID.") if T(3);              Trace("Loading features for genome $genomeID.") if T(3);
441              $loadFeature->Add("genomeIn");              $loadFeature->Add("genomeIn");
442              # Get the feature list for this genome.              # Get the feature list for this genome.
443              my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
444                # Sort and count the list.
445                my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
446                my $count = scalar @featureTuples;
447                my @fids = map { $_->[0] } @featureTuples;
448                Trace("$count features found for genome $genomeID.") if T(3);
449                # Get the attributes for this genome and put them in a hash by feature ID.
450                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys);
451                Trace("Looping through features for $genomeID.") if T(3);
452                # Set up for our duplicate-feature check.
453                my $oldFeatureID = "";
454              # Loop through the features.              # Loop through the features.
455              for my $featureData (@{$features}) {              for my $featureTuple (@featureTuples) {
                 $loadFeature->Add("featureIn");  
456                  # Split the tuple.                  # Split the tuple.
457                  my ($featureID, $locations, undef, $type) = @{$featureData};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
458                  # Create the feature record.                  # Check for duplicates.
459                  $loadFeature->Put($featureID, 1, $type);                  if ($featureID eq $oldFeatureID) {
460                  # Link it to the parent genome.                      Trace("Duplicate feature $featureID found.") if T(1);
461                  $loadHasFeature->Put($genomeID, $featureID, $type);                  } else {
462                        $oldFeatureID = $featureID;
463                        # Count this feature.
464                        $loadFeature->Add("featureIn");
465                        # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
466                        # Sprout database requires a single character.
467                        if (! defined($quality) || $quality eq "") {
468                            $quality = " ";
469                        }
470                        # Begin building the keywords. We start with the genome ID, the
471                        # feature ID, the taxonomy, and the organism name.
472                        my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
473                                        $fig->taxonomy_of($genomeID));
474                  # Create the aliases.                  # Create the aliases.
475                  for my $alias ($fig->feature_aliases($featureID)) {                  for my $alias ($fig->feature_aliases($featureID)) {
476                      $loadFeatureAlias->Put($featureID, $alias);                          #Connect this alias to this feature.
477                            $loadIsAliasOf->Put($alias, $featureID);
478                            push @keywords, $alias;
479                            # If this is a locus tag, also add its natural form as a keyword.
480                            my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
481                            if ($naturalName) {
482                                push @keywords, $naturalName;
483                            }
484                            # If this is the first time for the specified alias, create its
485                            # alias record.
486                            if (! exists $alias{$alias}) {
487                                $loadFeatureAlias->Put($alias);
488                                $alias{$alias} = 1;
489                            }
490                        }
491                        # Add the corresponding IDs. We ask for 2-tuples of the form (id, database).
492                        my @corresponders = $fig->get_corresponding_ids($featureID, 1);
493                        for my $tuple (@corresponders) {
494                            my ($id, $xdb) = @{$tuple};
495                            # Ignore SEED: that's us.
496                            if ($xdb ne 'SEED') {
497                                # Connect this ID to the feature.
498                                $loadIsAlsoFoundIn->Put($featureID, $xdb, $id);
499                                # Add it as a keyword.
500                                push @keywords, $id;
501                                # If this is a new database, create a record for it.
502                                if (! exists $xdb{$xdb}) {
503                                    $xdb{$xdb} = 1;
504                                    $loadExternalDatabase->Put($xdb);
505                                }
506                  }                  }
507                        }
508                        Trace("Assignment for $featureID is: $assignment") if T(4);
509                        # Break the assignment into words and shove it onto the
510                        # keyword list.
511                        push @keywords, split(/\s+/, $assignment);
512                        # Link this feature to the parent genome.
513                        $loadHasFeature->Put($genomeID, $featureID, $type);
514                  # Get the links.                  # Get the links.
515                  my @links = $fig->fid_links($featureID);                  my @links = $fig->fid_links($featureID);
516                  for my $link (@links) {                  for my $link (@links) {
# Line 506  Line 529 
529                          $loadFeatureUpstream->Put($featureID, $upstream);                          $loadFeatureUpstream->Put($featureID, $upstream);
530                      }                      }
531                  }                  }
532                        # Now we need to find the subsystems this feature participates in.
533                        # We also add the subsystems to the keyword list. Before we do that,
534                        # we must convert underscores to spaces.
535                        my @subsystems = $fig->peg_to_subsystems($featureID);
536                        for my $subsystem (@subsystems) {
537                            # Only proceed if we like this subsystem.
538                            if (exists $subHash->{$subsystem}) {
539                                # Store the has-role link.
540                                $loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type);
541                                # Save the subsystem's keyword data.
542                                my $subKeywords = $subHash->{$subsystem};
543                                push @keywords, split /\s+/, $subKeywords;
544                                # Now we need to get this feature's role in the subsystem.
545                                my $subObject = $fig->get_subsystem($subsystem);
546                                my @roleColumns = $subObject->get_peg_roles($featureID);
547                                my @allRoles = $subObject->get_roles();
548                                for my $col (@roleColumns) {
549                                    my $role = $allRoles[$col];
550                                    push @keywords, split /\s+/, $role;
551                                    push @keywords, $subObject->get_role_abbr($col);
552                                }
553                            }
554                        }
555                        # There are three special attributes computed from property
556                        # data that we build next. If the special attribute is non-empty,
557                        # its name will be added to the keyword list. First, we get all
558                        # the attributes for this feature. They will come back as
559                        # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
560                        # [name, value, value with URL]. (We don't need the PEG, since
561                        # we already know it.)
562                        my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
563                                             @{$attributes->{$featureID}};
564                        # Now we process each of the special attributes.
565                        if (SpecialAttribute($featureID, \@attributes,
566                                             1, [0,2], '^(essential|potential_essential)$',
567                                             $loadFeatureEssential)) {
568                            push @keywords, 'essential';
569                            $loadFeature->Add('essential');
570                        }
571                        if (SpecialAttribute($featureID, \@attributes,
572                                             0, [2], '^virulen',
573                                             $loadFeatureVirulent)) {
574                            push @keywords, 'virulent';
575                            $loadFeature->Add('virulent');
576                        }
577                        if (SpecialAttribute($featureID, \@attributes,
578                                             0, [0,2], '^iedb_',
579                                             $loadFeatureIEDB)) {
580                            push @keywords, 'iedb';
581                            $loadFeature->Add('iedb');
582                        }
583                        # Now we have some other attributes we need to process. To get
584                        # through them, we convert the attribute list for this feature
585                        # into a two-layer hash: key => subkey => value.
586                        my %attributeHash = ();
587                        for my $attrRow (@{$attributes->{$featureID}}) {
588                            my (undef, $key, @values) = @{$attrRow};
589                            my ($realKey, $subKey);
590                            if ($key =~ /^([^:]+)::(.+)/) {
591                                ($realKey, $subKey) = ($1, $2);
592                            } else {
593                                ($realKey, $subKey) = ($key, "");
594                            }
595                            if (exists $attributeHash{$1}) {
596                                $attributeHash{$1}->{$2} = \@values;
597                            } else {
598                                $attributeHash{$1} = {$2 => \@values};
599                            }
600                        }
601                        # First we handle CDD. This is a bit complicated, because
602                        # there are multiple CDDs per protein.
603                        if (exists $attributeHash{CDD}) {
604                            # Get the hash of CDD IDs to scores for this feature. We
605                            # already know it exists because of the above IF.
606                            my $cddHash = $attributeHash{CDD};
607                            my @cddData = sort keys %{$cddHash};
608                            for my $cdd (@cddData) {
609                                # Extract the score for this CDD and decode it.
610                                my ($codeScore) = split(/\s*[,;]\s*/, $cddHash->{$cdd}->[0]);
611                                my $realScore = FIGRules::DecodeScore($codeScore);
612                                # We can't afford to crash because of a bad attribute
613                                # value, hence the IF below.
614                                if (! defined($realScore)) {
615                                    # Bad score, so count it.
616                                    $loadFeature->Add('badCDDscore');
617                                    Trace("CDD score \"$codeScore\" for feature $featureID invalid.") if T(3);
618                                } else {
619                                    # Create the connection.
620                                    $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
621                                    # If this CDD does not yet exist, create its record.
622                                    if (! exists $CDD{$cdd}) {
623                                        $CDD{$cdd} = 1;
624                                        $loadCDD->Put($cdd);
625                                    }
626                                }
627                            }
628                        }
629                        # Next we do PSORT cell locations. Here the confidence value
630                        # could have the value "unknown", which we translate to -1.
631                        if (exists $attributeHash{PSORT}) {
632                            # This will be a hash of cell locations to confidence
633                            # factors.
634                            my $psortHash = $attributeHash{PSORT};
635                            for my $psort (keys %{$psortHash}) {
636                                # Get the confidence, and convert it to a number if necessary.
637                                my $confidence = $psortHash->{$psort};
638                                if ($confidence eq 'unknown') {
639                                    $confidence = -1;
640                                }
641                                $loadIsPossiblePlaceFor->Put($psort, $featureID, $confidence);
642                                # If this cell location does not yet exist, create its record.
643                                if (! exists $cellLocation{$psort}) {
644                                    $cellLocation{$psort} = 1;
645                                    $loadCellLocation->Put($psort);
646                                }
647                                # If this is a significant location, add it as a keyword.
648                                if ($confidence > 2.5) {
649                                    push @keywords, $psort;
650                                }
651                            }
652                        }
653                        # Phobius data is next. This consists of the signal peptide location and
654                        # the transmembrane locations.
655                        my $signalList = "";
656                        my $transList = "";
657                        if (exists $attributeHash{Phobius}) {
658                            # This will be a hash of two keys (transmembrane and signal) to
659                            # location strings. If there's no value, we stuff in an empty string.
660                            $signalList = GetCommaList($attributeHash{Phobius}->{signal});
661                            $transList = GetCommaList($attributeHash{Phobius}->{transmembrane});
662                        }
663                        # Here are some more numbers: isoelectric point, molecular weight, and
664                        # the similar-to-human flag.
665                        my $isoelectric = 0;
666                        if (exists $attributeHash{isoelectric_point}) {
667                            $isoelectric = $attributeHash{isoelectric_point}->{""};
668                        }
669                        my $similarToHuman = 0;
670                        if (exists $attributeHash{similar_to_human} && $attributeHash{similar_to_human}->{""} eq 'yes') {
671                            $similarToHuman = 1;
672                        }
673                        my $molecularWeight = 0;
674                        if (exists $attributeHash{molecular_weight}) {
675                            $molecularWeight = $attributeHash{molecular_weight}->{""};
676                        }
677                        # Create the keyword string.
678                        my $keywordString = join(" ", @keywords);
679                        Trace("Real keyword string for $featureID: $keywordString.") if T(4);
680                        # Get rid of annoying punctuation.
681                        $keywordString =~ s/[();@#\/]/ /g;
682                        # Get the list of keywords in the keyword string.
683                        my @realKeywords = grep { $biowords->IsWord($_) } $biowords->Split($keywordString);
684                        # We need to do two things here: create the keyword string for the feature table
685                        # and write records to the keyword and stem files. The stuff we write to
686                        # the files will be taken from the following two hashes. The stuff used
687                        # to create the keyword string will be taken from the list.
688                        my (%keys, %stems, @realStems);
689                        for my $keyword (@realKeywords) {
690                            # Compute the stem and phonex for this keyword.
691                            my ($stem, $phonex) = $biowords->StemLookup($keyword);
692                            # Only proceed if a stem comes back. If no stem came back, it's a
693                            # stop word and we throw it away.
694                            if ($stem) {
695                                $keys{$keyword} = $stem;
696                                $stems{$stem} = $phonex;
697                                push @realStems, $stem;
698                            }
699                        }
700                        # Now create the keyword string.
701                        my $cleanWords = join(" ", @realStems);
702                        Trace("Keyword string for $featureID: $cleanWords") if T(4);
703                        # Write the stem and keyword records.
704                        for my $stem (keys %stems) {
705                            Tracer::PutLine($stemh, [$stem, $stems{$stem}]);
706                        }
707                        for my $key (keys %keys) {
708                            # The stem goes first in this file, because we want to sort
709                            # by stem and then keyword.
710                            Tracer::PutLine($keyh, [$keys{$key}, $key]);
711                        }
712                        # Now we need to process the feature's locations. First, we split them up.
713                        my @locationList = split /\s*,\s*/, $locations;
714                        # Next, we convert them to Sprout location objects.
715                        my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
716                        # Assemble them into a sprout location string for later.
717                        my $locationString = join(", ", map { $_->String } @locObjectList);
718                        # We'll store the sequence length in here.
719                        my $sequenceLength = 0;
720                  # This part is the roughest. We need to relate the features to contig                  # This part is the roughest. We need to relate the features to contig
721                  # locations, and the locations must be split so that none of them exceed                  # locations, and the locations must be split so that none of them exceed
722                  # the maximum segment size. This simplifies the genes_in_region processing                  # the maximum segment size. This simplifies the genes_in_region processing
723                  # for Sprout.                      # for Sprout. To start, we create the location position indicator.
                 my @locationList = split /\s*,\s*/, $locations;  
                 # Create the location position indicator.  
724                  my $i = 1;                  my $i = 1;
725                  # Loop through the locations.                  # Loop through the locations.
726                  for my $location (@locationList) {                      for my $locObject (@locObjectList) {
727                      # Parse the location.                          # Record the length.
728                      my $locObject = BasicLocation->new("$genomeID:$location");                          $sequenceLength += $locObject->Length;
729                      # Split it into a list of chunks.                          # Split this location into a list of chunks.
730                      my @locOList = ();                      my @locOList = ();
731                      while (my $peeling = $locObject->Peel($chunkSize)) {                      while (my $peeling = $locObject->Peel($chunkSize)) {
732                          $loadIsLocatedIn->Add("peeling");                          $loadIsLocatedIn->Add("peeling");
# Line 532  Line 741 
741                          $i++;                          $i++;
742                      }                      }
743                  }                  }
744              }                      # Now we get some ancillary flags.
745          }                      my $locked = $fig->is_locked_fid($featureID);
746      }                      my $in_genbank = $fig->peg_in_gendb($featureID);
747      # Finish the loads.                      # Create the feature record.
748      my $retVal = $self->_FinishAll();                      $loadFeature->Put($featureID, 1, $user, $quality, $type, $in_genbank, $isoelectric, $locked, $molecularWeight,
749      return $retVal;                                        $sequenceLength, $signalList, $similarToHuman, $assignment, $cleanWords, $locationString,
750  }                                        $transList);
751                    }
752  =head3 LoadBBHData              }
753                Trace("Genome $genomeID processed.") if T(3);
754  C<< my $stats = $spl->LoadBBHData(); >>          }
755        }
756  Load the bidirectional best hit data from FIG into Sprout.      Trace("Sorting keywords.") if T(2);
757        # Now we need to load the keyword table from the key and stem files.
758  Sprout does not store information on similarities. Instead, it has only the      close $keyh;
759  bi-directional best hits. Even so, the BBH table is one of the largest in      close $stemh;
760  the database.      Trace("Loading keywords.") if T(2);
761        $keyh = Open(undef, "<$keyFileName");
762  The following relations are loaded by this method.      $stemh = Open(undef, "<$stemFileName");
763        # We'll count the keywords in here, for tracing purposes.
764      IsBidirectionalBestHitOf      my $count = 0;
765        # These variables track the current stem's data. When an incoming
766  =over 4      # keyword's stem changes, these will be recomputed.
767        my ($currentStem, $currentPhonex, $currentCount);
768  =item RETURNS      # Prime the loop by reading the first stem in the stem file.
769        my ($nextStem, $nextPhonex) = Tracer::GetLine($stemh);
770  Returns a statistics object for the loads.      # Loop through the keyword file.
771        while (! eof $keyh) {
772  =back          # Read this keyword.
773            my ($thisStem, $thisKey) = Tracer::GetLine($keyh);
774  =cut          # Check to see if it's the new stem yet.
775  #: Return Type $%;          if ($thisStem ne $currentStem) {
776  sub LoadBBHData {              # Yes. It's a terrible error if it's not also the next stem.
777      # Get this object instance.              if ($thisStem ne $nextStem) {
778      my ($self) = @_;                  Confess("Error in stem file. Expected \"$nextStem\", but found \"$thisStem\".");
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the table of genome IDs.  
     my $genomeHash = $self->{genomes};  
     # Create load objects for each of the tables we're loading.  
     my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf');  
     if ($self->{options}->{loadOnly}) {  
         Trace("Loading from existing files.") if T(2);  
779      } else {      } else {
780          Trace("Generating BBH data.") if T(2);                  # Here we're okay.
781          # Now we loop through the genomes, generating the data for each one.                  ($currentStem, $currentPhonex) = ($nextStem, $nextPhonex);
782          for my $genomeID (sort keys %{$genomeHash}) {                  # Count the number of features for this stem.
783              $loadIsBidirectionalBestHitOf->Add("genomeIn");                  $currentCount = 0;
784              Trace("Processing features for genome $genomeID.") if T(3);                  while ($nextStem eq $thisStem) {
785              # Get the feature list for this genome.                      ($nextStem, $nextPhonex) = Tracer::GetLine($stemh);
786              my $features = $fig->all_features_detailed($genomeID);                      $currentCount++;
             # Loop through the features.  
             for my $featureData (@{$features}) {  
                 # Split the tuple.  
                 my ($featureID, $locations, $aliases, $type) = @{$featureData};  
                 # Get the bi-directional best hits.  
                 my @bbhList = $fig->bbhs($featureID);  
                 for my $bbhEntry (@bbhList) {  
                     # Get the target feature ID and the score.  
                     my ($targetID, $score) = @{$bbhEntry};  
                     # Check the target feature's genome.  
                     my $targetGenomeID = $fig->genome_of($targetID);  
                     # Only proceed if it's one of our genomes.  
                     if ($genomeHash->{$targetGenomeID}) {  
                         $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,  
                                                            $score);  
787                      }                      }
788                  }                  }
789              }              }
790            # Now $currentStem is the same as $thisStem, and the other $current-vars
791            # contain the stem's data (phonex and count).
792            $loadKeyword->Put($thisKey, $currentCount, $currentPhonex, $currentStem);
793            if (++$count % 1000 == 0 && T(3)) {
794                Trace("$count keywords loaded.");
795          }          }
796      }      }
797        Trace("$count keywords loaded into keyword table.") if T(2);
798      # Finish the loads.      # Finish the loads.
799      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
800      return $retVal;      return $retVal;
# Line 610  Line 802 
802    
803  =head3 LoadSubsystemData  =head3 LoadSubsystemData
804    
805  C<< my $stats = $spl->LoadSubsystemData(); >>      my $stats = $spl->LoadSubsystemData();
806    
807  Load the subsystem data from FIG into Sprout.  Load the subsystem data from FIG into Sprout.
808    
# Line 623  Line 815 
815  The following relations are loaded by this method.  The following relations are loaded by this method.
816    
817      Subsystem      Subsystem
818        SubsystemClass
819      Role      Role
820      RoleEC      RoleEC
821        IsIdentifiedByEC
822      SSCell      SSCell
823      ContainsFeature      ContainsFeature
824      IsGenomeOf      IsGenomeOf
# Line 638  Line 832 
832      ConsistsOfGenomes      ConsistsOfGenomes
833      GenomeSubset      GenomeSubset
834      HasGenomeSubset      HasGenomeSubset
     Catalyzes  
835      Diagram      Diagram
836      RoleOccursIn      RoleOccursIn
837        SubsystemHopeNotes
838    
839  =over 4  =over 4
840    
# Line 666  Line 860 
860      # Get the map list.      # Get the map list.
861      my @maps = $fig->all_maps;      my @maps = $fig->all_maps;
862      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
863      my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly);      my $loadDiagram = $self->_TableLoader('Diagram');
864      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn');
865      my $loadSubsystem = $self->_TableLoader('Subsystem');      my $loadSubsystem = $self->_TableLoader('Subsystem');
866      my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly);      my $loadRole = $self->_TableLoader('Role');
867      my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly);      my $loadRoleEC = $self->_TableLoader('RoleEC');
868      my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly);      my $loadIsIdentifiedByEC = $self->_TableLoader('IsIdentifiedByEC');
869      my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly);      my $loadCatalyzes = $self->_TableLoader('Catalyzes');
870      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly);      my $loadSSCell = $self->_TableLoader('SSCell');
871      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature');
872      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf');
873      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly);      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf');
874      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly);      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem');
875      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly);      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn');
876      my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly);      my $loadHasSSCell = $self->_TableLoader('HasSSCell');
877      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly);      my $loadRoleSubset = $self->_TableLoader('RoleSubset');
878      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly);      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset');
879      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly);      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles');
880      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly);      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes');
881      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly);      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');
882        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');
883        my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');
884        my $loadSubsystemHopeNotes = $self->_TableLoader('SubsystemHopeNotes');
885      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
886          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
887      } else {      } else {
888          Trace("Generating subsystem data.") if T(2);          Trace("Generating subsystem data.") if T(2);
889          # This hash will contain the role for each EC. When we're done, this          # This hash will contain the roles for each EC. When we're done, this
890          # information will be used to generate the Catalyzes table.          # information will be used to generate the Catalyzes table.
891          my %ecToRoles = ();          my %ecToRoles = ();
892          # Loop through the subsystems. Our first task will be to create the          # Loop through the subsystems. Our first task will be to create the
# Line 703  Line 900 
900              # Get the subsystem object.              # Get the subsystem object.
901              my $sub = $fig->get_subsystem($subsysID);              my $sub = $fig->get_subsystem($subsysID);
902              # Only proceed if the subsystem has a spreadsheet.              # Only proceed if the subsystem has a spreadsheet.
903              if (! $sub->{empty_ss}) {              if (defined($sub) && ! $sub->{empty_ss}) {
904                  Trace("Creating subsystem $subsysID.") if T(3);                  Trace("Creating subsystem $subsysID.") if T(3);
905                  $loadSubsystem->Add("subsystemIn");                  $loadSubsystem->Add("subsystemIn");
906                  # Create the subsystem record.                  # Create the subsystem record.
907                  my $curator = $sub->get_curator();                  my $curator = $sub->get_curator();
908                  my $notes = $sub->get_notes();                  my $notes = $sub->get_notes();
909                  $loadSubsystem->Put($subsysID, $curator, $notes);                  my $version = $sub->get_version();
910                    my $description = $sub->get_description();
911                    $loadSubsystem->Put($subsysID, $curator, $version, $description, $notes);
912                    # Add the hope notes.
913                    my $hopeNotes = $sub->get_hope_curation_notes();
914                    if ($hopeNotes) {
915                        $loadSubsystemHopeNotes->Put($sub, $hopeNotes);
916                    }
917                    # Now for the classification string. This comes back as a list
918                    # reference and we convert it to a space-delimited string.
919                    my $classList = $fig->subsystem_classification($subsysID);
920                    my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
921                    $loadSubsystemClass->Put($subsysID, $classString);
922                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
923                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
924                        # Get the role's abbreviation.
925                        my $abbr = $sub->get_role_abbr($col);
926                        # Get its essentiality.
927                        my $aux = $fig->is_aux_role_in_subsystem($subsysID, $roleID);
928                        # Get its reaction note.
929                        my $hope_note = $sub->get_hope_reaction_notes($roleID) || "";
930                      # Connect to this role.                      # Connect to this role.
931                      $loadOccursInSubsystem->Add("roleIn");                      $loadOccursInSubsystem->Add("roleIn");
932                      $loadOccursInSubsystem->Put($roleID, $subsysID, $col);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $aux, $col, $hope_note);
933                      # If it's a new role, add it to the role table.                      # If it's a new role, add it to the role table.
934                      if (! exists $roleData{$roleID}) {                      if (! exists $roleData{$roleID}) {
935                          # Get the role's abbreviation.                          # Get the role's abbreviation.
                         my $abbr = $sub->get_role_abbr($col);  
936                          # Add the role.                          # Add the role.
937                          $loadRole->Put($roleID, $abbr);                          $loadRole->Put($roleID);
938                          $roleData{$roleID} = 1;                          $roleData{$roleID} = 1;
939                          # Check for an EC number.                          # Check for an EC number.
940                          if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) {                          if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
941                              my $ec = $1;                              my $ec = $1;
942                              $loadRoleEC->Put($roleID, $ec);                              $loadIsIdentifiedByEC->Put($roleID, $ec);
943                              $ecToRoles{$ec} = $roleID;                              # Check to see if this is our first encounter with this EC.
944                                if (exists $ecToRoles{$ec}) {
945                                    # No, so just add this role to the EC list.
946                                    push @{$ecToRoles{$ec}}, $roleID;
947                                } else {
948                                    # Output this EC.
949                                    $loadRoleEC->Put($ec);
950                                    # Create its role list.
951                                    $ecToRoles{$ec} = [$roleID];
952                                }
953                          }                          }
954                      }                      }
955                  }                  }
# Line 753  Line 976 
976                          # part of the spreadsheet cell ID.                          # part of the spreadsheet cell ID.
977                          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
978                              # Get the features in the spreadsheet cell for this genome and role.                              # Get the features in the spreadsheet cell for this genome and role.
979                              my @pegs = $sub->get_pegs_from_cell($row, $col);                              my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col);
980                              # Only proceed if features exist.                              # Only proceed if features exist.
981                              if (@pegs > 0) {                              if (@pegs > 0) {
982                                  # Create the spreadsheet cell.                                  # Create the spreadsheet cell.
# Line 774  Line 997 
997                          if ($pegCount > 0) {                          if ($pegCount > 0) {
998                              Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);                              Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
999                              $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);                              $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
                             # Partition the PEGs found into clusters.  
                             my @clusters = $fig->compute_clusters(\@pegsFound, $sub);  
1000                              # Create a hash mapping PEG IDs to cluster numbers.                              # Create a hash mapping PEG IDs to cluster numbers.
1001                              # We default to -1 for all of them.                              # We default to -1 for all of them.
1002                              my %clusterOf = map { $_ => -1 } @pegsFound;                              my %clusterOf = map { $_ => -1 } @pegsFound;
1003                                # Partition the PEGs found into clusters.
1004                                my @clusters = $fig->compute_clusters([keys %clusterOf], $sub);
1005                              for (my $i = 0; $i <= $#clusters; $i++) {                              for (my $i = 0; $i <= $#clusters; $i++) {
1006                                  my $subList = $clusters[$i];                                  my $subList = $clusters[$i];
1007                                  for my $peg (@{$subList}) {                                  for my $peg (@{$subList}) {
# Line 826  Line 1049 
1049                      }                      }
1050                  }                  }
1051              }              }
1052            }
1053              # Now we loop through the diagrams. We need to create the diagram records              # Now we loop through the diagrams. We need to create the diagram records
1054              # and link each diagram to its roles. Note that only roles which occur              # and link each diagram to its roles. Note that only roles which occur
1055              # in subsystems (and therefore appear in the %ecToRoles hash) are              # in subsystems (and therefore appear in the %ecToRoles hash) are
# Line 838  Line 1062 
1062                  # Now we need to link all the map's roles to it.                  # Now we need to link all the map's roles to it.
1063                  # A hash is used to prevent duplicates.                  # A hash is used to prevent duplicates.
1064                  my %roleHash = ();                  my %roleHash = ();
1065                  for my $role ($fig->map_to_ecs($map)) {              for my $ec ($fig->map_to_ecs($map)) {
1066                      if (exists $ecToRoles{$role} && ! $roleHash{$role}) {                  if (exists $ecToRoles{$ec}) {
1067                          $loadRoleOccursIn->Put($ecToRoles{$role}, $map);                      for my $role (@{$ecToRoles{$ec}}) {
1068                            if (! $roleHash{$role}) {
1069                                $loadRoleOccursIn->Put($role, $map);
1070                          $roleHash{$role} = 1;                          $roleHash{$role} = 1;
1071                      }                      }
1072                  }                  }
1073              }              }
             # Before we leave, we must create the Catalyzes table. We start with the reactions,  
             # then use the "ecToRoles" table to convert EC numbers to role IDs.  
             my @reactions = $fig->all_reactions();  
             for my $reactionID (@reactions) {  
                 # Get this reaction's list of roles. The results will be EC numbers.  
                 my @roles = $fig->catalyzed_by($reactionID);  
                 # Loop through the roles, creating catalyzation records.  
                 for my $thisRole (@roles) {  
                     if (exists $ecToRoles{$thisRole}) {  
                         $loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID);  
                     }  
                 }  
1074              }              }
1075          }          }
1076      }      }
# Line 867  Line 1081 
1081    
1082  =head3 LoadPropertyData  =head3 LoadPropertyData
1083    
1084  C<< my $stats = $spl->LoadPropertyData(); >>      my $stats = $spl->LoadPropertyData();
1085    
1086  Load the attribute data from FIG into Sprout.  Load the attribute data from FIG into Sprout.
1087    
# Line 903  Line 1117 
1117      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1118      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1119      my $loadProperty = $self->_TableLoader('Property');      my $loadProperty = $self->_TableLoader('Property');
1120      my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly);      my $loadHasProperty = $self->_TableLoader('HasProperty');
1121      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1122          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1123      } else {      } else {
# Line 911  Line 1125 
1125          # Create a hash for storing property IDs.          # Create a hash for storing property IDs.
1126          my %propertyKeys = ();          my %propertyKeys = ();
1127          my $nextID = 1;          my $nextID = 1;
1128            # Get the attributes we intend to store in the property table.
1129            my $propKeys = $self->{propKeys};
1130          # Loop through the genomes.          # Loop through the genomes.
1131          for my $genomeID (keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
1132              $loadProperty->Add("genomeIn");              $loadProperty->Add("genomeIn");
1133              Trace("Generating properties for $genomeID.") if T(3);              Trace("Generating properties for $genomeID.") if T(3);
1134              # Get the genome's features. The feature ID is the first field in the              # Initialize a counter.
             # tuples returned by "all_features_detailed". We use "all_features_detailed"  
             # rather than "all_features" because we want all features regardless of type.  
             my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};  
             my $featureCount = 0;  
1135              my $propertyCount = 0;              my $propertyCount = 0;
1136              # Loop through the features, creating HasProperty records.              # Get the properties for this genome's features.
1137              for my $fid (@features) {              my @attributes = $fig->get_attributes("fig|$genomeID%", $propKeys);
1138                  # Get all attributes for this feature. We do this one feature at a time              Trace("Property list built for $genomeID.") if T(3);
1139                  # to insure we do not get any genome attributes.              # Loop through the results, creating HasProperty records.
1140                  my @attributeList = $fig->get_attributes($fid, '', '', '');              for my $attributeData (@attributes) {
1141                  if (scalar @attributeList) {                  # Pull apart the attribute tuple.
1142                      $featureCount++;                  my ($fid, $key, $value, $url) = @{$attributeData};
                 }  
                 # Loop through the attributes.  
                 for my $tuple (@attributeList) {  
                     $propertyCount++;  
                     # Get this attribute value's data. Note that we throw away the FID,  
                     # since it will always be the same as the value if "$fid".  
                     my (undef, $key, $value, $url) = @{$tuple};  
1143                      # Concatenate the key and value and check the "propertyKeys" hash to                      # Concatenate the key and value and check the "propertyKeys" hash to
1144                      # see if we already have an ID for it. We use a tab for the separator                      # see if we already have an ID for it. We use a tab for the separator
1145                      # character.                      # character.
# Line 951  Line 1156 
1156                      }                      }
1157                      # Create the HasProperty entry for this feature/property association.                      # Create the HasProperty entry for this feature/property association.
1158                      $loadHasProperty->Put($fid, $propertyID, $url);                      $loadHasProperty->Put($fid, $propertyID, $url);
1159                  }                  $propertyCount++;
1160              }              }
1161              # Update the statistics.              # Update the statistics.
1162              Trace("$propertyCount attributes processed for $featureCount features.") if T(3);              Trace("$propertyCount attributes processed.") if T(3);
             $loadHasProperty->Add("featuresIn", $featureCount);  
1163              $loadHasProperty->Add("propertiesIn", $propertyCount);              $loadHasProperty->Add("propertiesIn", $propertyCount);
1164          }          }
1165      }      }
# Line 966  Line 1170 
1170    
1171  =head3 LoadAnnotationData  =head3 LoadAnnotationData
1172    
1173  C<< my $stats = $spl->LoadAnnotationData(); >>      my $stats = $spl->LoadAnnotationData();
1174    
1175  Load the annotation data from FIG into Sprout.  Load the annotation data from FIG into Sprout.
1176    
# Line 1000  Line 1204 
1204      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1205      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1206      my $loadAnnotation = $self->_TableLoader('Annotation');      my $loadAnnotation = $self->_TableLoader('Annotation');
1207      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation');
1208      my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly);      my $loadSproutUser = $self->_TableLoader('SproutUser');
1209      my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly);      my $loadUserAccess = $self->_TableLoader('UserAccess');
1210      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation');
1211      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1212          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1213      } else {      } else {
# Line 1021  Line 1225 
1225          # Loop through the genomes.          # Loop through the genomes.
1226          for my $genomeID (sort keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
1227              Trace("Processing $genomeID.") if T(3);              Trace("Processing $genomeID.") if T(3);
             # Get the genome's PEGs.  
             my @pegs = $fig->pegs_of($genomeID);  
             for my $peg (@pegs) {  
                 Trace("Processing $peg.") if T(4);  
1228                  # Create a hash of timestamps. We use this to prevent duplicate time stamps                  # Create a hash of timestamps. We use this to prevent duplicate time stamps
1229                  # from showing up for a single PEG's annotations.                  # from showing up for a single PEG's annotations.
1230                  my %seenTimestamps = ();                  my %seenTimestamps = ();
1231                  # Loop through the annotations.              # Get the genome's annotations.
1232                  for my $tuple ($fig->feature_annotations($peg, "raw")) {              my @annotations = $fig->read_all_annotations($genomeID);
1233                      my ($fid, $timestamp, $user, $text) = @{$tuple};              Trace("Processing annotations.") if T(2);
1234                for my $tuple (@annotations) {
1235                    # Get the annotation tuple.
1236                    my ($peg, $timestamp, $user, $text) = @{$tuple};
1237                      # Here we fix up the annotation text. "\r" is removed,                      # Here we fix up the annotation text. "\r" is removed,
1238                      # and "\t" and "\n" are escaped. Note we use the "s"                  # and "\t" and "\n" are escaped. Note we use the "gs"
1239                      # modifier so that new-lines inside the text do not                      # modifier so that new-lines inside the text do not
1240                      # stop the substitution search.                      # stop the substitution search.
1241                      $text =~ s/\r//gs;                      $text =~ s/\r//gs;
# Line 1045  Line 1248 
1248                          # Here it's a number. We need to insure the one we use to form                          # Here it's a number. We need to insure the one we use to form
1249                          # the key is unique.                          # the key is unique.
1250                          my $keyStamp = $timestamp;                          my $keyStamp = $timestamp;
1251                          while ($seenTimestamps{$keyStamp}) {                      while ($seenTimestamps{"$peg:$keyStamp"}) {
1252                              $keyStamp++;                              $keyStamp++;
1253                          }                          }
                         $seenTimestamps{$keyStamp} = 1;  
1254                          my $annotationID = "$peg:$keyStamp";                          my $annotationID = "$peg:$keyStamp";
1255                        $seenTimestamps{$annotationID} = 1;
1256                          # Insure the user exists.                          # Insure the user exists.
1257                          if (! $users{$user}) {                          if (! $users{$user}) {
1258                              $loadSproutUser->Put($user, "SEED user");                              $loadSproutUser->Put($user, "SEED user");
# Line 1067  Line 1270 
1270                  }                  }
1271              }              }
1272          }          }
     }  
1273      # Finish the load.      # Finish the load.
1274      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1275      return $retVal;      return $retVal;
# Line 1075  Line 1277 
1277    
1278  =head3 LoadSourceData  =head3 LoadSourceData
1279    
1280  C<< my $stats = $spl->LoadSourceData(); >>      my $stats = $spl->LoadSourceData();
1281    
1282  Load the source data from FIG into Sprout.  Load the source data from FIG into Sprout.
1283    
# Line 1109  Line 1311 
1311      # Get the genome hash.      # Get the genome hash.
1312      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1313      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1314      my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly);      my $loadComesFrom = $self->_TableLoader('ComesFrom');
1315      my $loadSource = $self->_TableLoader('Source');      my $loadSource = $self->_TableLoader('Source');
1316      my $loadSourceURL = $self->_TableLoader('SourceURL');      my $loadSourceURL = $self->_TableLoader('SourceURL');
1317      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
# Line 1153  Line 1355 
1355    
1356  =head3 LoadExternalData  =head3 LoadExternalData
1357    
1358  C<< my $stats = $spl->LoadExternalData(); >>      my $stats = $spl->LoadExternalData();
1359    
1360  Load the external data from FIG into Sprout.  Load the external data from FIG into Sprout.
1361    
# Line 1196  Line 1398 
1398      } else {      } else {
1399          Trace("Generating external data.") if T(2);          Trace("Generating external data.") if T(2);
1400          # We loop through the files one at a time. First, the organism file.          # We loop through the files one at a time. First, the organism file.
1401          Open(\*ORGS, "<$FIG_Config::global/ext_org.table");          Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |");
1402          my $orgLine;          my $orgLine;
1403          while (defined($orgLine = <ORGS>)) {          while (defined($orgLine = <ORGS>)) {
1404              # Clean the input line.              # Clean the input line.
# Line 1208  Line 1410 
1410          close ORGS;          close ORGS;
1411          # Now the function file.          # Now the function file.
1412          my $funcLine;          my $funcLine;
1413          Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");          Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |");
1414          while (defined($funcLine = <FUNCS>)) {          while (defined($funcLine = <FUNCS>)) {
1415              # Clean the line ending.              # Clean the line ending.
1416              chomp $funcLine;              chomp $funcLine;
# Line 1233  Line 1435 
1435    
1436  =head3 LoadReactionData  =head3 LoadReactionData
1437    
1438  C<< my $stats = $spl->LoadReactionData(); >>      my $stats = $spl->LoadReactionData();
1439    
1440  Load the reaction data from FIG into Sprout.  Load the reaction data from FIG into Sprout.
1441    
# Line 1246  Line 1448 
1448      Compound      Compound
1449      CompoundName      CompoundName
1450      CompoundCAS      CompoundCAS
1451        IsIdentifiedByCAS
1452        HasCompoundName
1453      IsAComponentOf      IsAComponentOf
1454        Scenario
1455        Catalyzes
1456        HasScenario
1457        IsInputFor
1458        IsOutputOf
1459        ExcludesReaction
1460        IncludesReaction
1461        IsOnDiagram
1463    
1464  This method proceeds reaction by reaction rather than genome by genome.  This method proceeds reaction by reaction rather than genome by genome.
1465    
# Line 1267  Line 1480 
1480      my $fig = $self->{fig};      my $fig = $self->{fig};
1481      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1482      my $loadReaction = $self->_TableLoader('Reaction');      my $loadReaction = $self->_TableLoader('Reaction');
1483      my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly);      my $loadReactionURL = $self->_TableLoader('ReactionURL');
1484      my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly);      my $loadCompound = $self->_TableLoader('Compound');
1485      my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly);      my $loadCompoundName = $self->_TableLoader('CompoundName');
1486      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly);      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS');
1487      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly);      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf');
1488        my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS');
1489        my $loadHasCompoundName = $self->_TableLoader('HasCompoundName');
1490        my $loadScenario = $self->_TableLoader('Scenario');
1491        my $loadHasScenario = $self->_TableLoader('HasScenario');
1492        my $loadIsInputFor = $self->_TableLoader('IsInputFor');
1493        my $loadIsOutputOf = $self->_TableLoader('IsOutputOf');
1494        my $loadIsOnDiagram = $self->_TableLoader('IsOnDiagram');
1495        my $loadIncludesReaction = $self->_TableLoader('IncludesReaction');
1496        my $loadExcludesReaction = $self->_TableLoader('ExcludesReaction');
1497        my $loadCatalyzes = $self->_TableLoader('Catalyzes');
1498      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1499          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1500      } else {      } else {
1501          Trace("Generating annotation data.") if T(2);          Trace("Generating reaction data.") if T(2);
1502            # We need some hashes to prevent duplicates.
1503            my %compoundNames = ();
1504            my %compoundCASes = ();
1505          # First we create the compounds.          # First we create the compounds.
1506          my @compounds = $fig->all_compounds();          my %compounds = map { $_ => 1 } $fig->all_compounds();
1507          for my $cid (@compounds) {          for my $cid (keys %compounds) {
1508              # Check for names.              # Check for names.
1509              my @names = $fig->names_of_compound($cid);              my @names = $fig->names_of_compound($cid);
1510              # Each name will be given a priority number, starting with 1.              # Each name will be given a priority number, starting with 1.
1511              my $prio = 1;              my $prio = 1;
1512              for my $name (@names) {              for my $name (@names) {
1513                  $loadCompoundName->Put($cid, $name, $prio++);                  if (! exists $compoundNames{$name}) {
1514                        $loadCompoundName->Put($name);
1515                        $compoundNames{$name} = 1;
1516                    }
1517                    $loadHasCompoundName->Put($cid, $name, $prio++);
1518              }              }
1519              # Create the main compound record. Note that the first name              # Create the main compound record. Note that the first name
1520              # becomes the label.              # becomes the label.
# Line 1293  Line 1523 
1523              # Check for a CAS ID.              # Check for a CAS ID.
1524              my $cas = $fig->cas($cid);              my $cas = $fig->cas($cid);
1525              if ($cas) {              if ($cas) {
1526                  $loadCompoundCAS->Put($cid, $cas);                  $loadIsIdentifiedByCAS->Put($cid, $cas);
1527                    if (! exists $compoundCASes{$cas}) {
1528                        $loadCompoundCAS->Put($cas);
1529                        $compoundCASes{$cas} = 1;
1530                    }
1531              }              }
1532          }          }
1533          # All the compounds are set up, so we need to loop through the reactions next. First,          # All the compounds are set up, so we need to loop through the reactions next. First,
1534          # we initialize the discriminator index. This is a single integer used to insure          # we initialize the discriminator index. This is a single integer used to insure
1535          # duplicate elements in a reaction are not accidentally collapsed.          # duplicate elements in a reaction are not accidentally collapsed.
1536          my $discrim = 0;          my $discrim = 0;
1537          my @reactions = $fig->all_reactions();          my %reactions = map { $_ => 1 } $fig->all_reactions();
1538          for my $reactionID (@reactions) {          for my $reactionID (keys %reactions) {
1539              # Create the reaction record.              # Create the reaction record.
1540              $loadReaction->Put($reactionID, $fig->reversible($reactionID));              $loadReaction->Put($reactionID, $fig->reversible($reactionID));
1541              # Compute the reaction's URL.              # Compute the reaction's URL.
# Line 1324  Line 1558 
1558                  }                  }
1559              }              }
1560          }          }
1561            # Now we run through the subsystems and roles, generating the scenarios
1562            # and connecting the reactions. We'll need some hashes to prevent
1563            # duplicates and a counter for compound group keys.
1564            my %roles = ();
1565            my %scenarios = ();
1566            my @subsystems = $fig->all_subsystems();
1567            for my $subName (@subsystems) {
1568                my $sub = $fig->get_subsystem($subName);
1569                Trace("Processing $subName reactions.") if T(3);
1570                # Get the subsystem's reactions.
1571                my %reactions = $sub->get_hope_reactions();
1572                # Loop through the roles, connecting them to the reactions.
1573                for my $role (keys %reactions) {
1574                    # Only process this role if it is new.
1575                    if (! $roles{$role}) {
1576                        $roles{$role} = 1;
1577                        my @reactions = @{$reactions{$role}};
1578                        for my $reaction (@reactions) {
1579                            $loadCatalyzes->Put($role, $reaction);
1580                        }
1581                    }
1582                }
1583                Trace("Processing $subName scenarios.") if T(3);
1584                # Get the subsystem's scenarios.
1585                my @scenarioNames = $sub->get_hope_scenario_names();
1586                # Loop through the scenarios, creating scenario data.
1587                for my $scenarioName (@scenarioNames) {
1588                    # Link this scenario to this subsystem.
1589                    $loadHasScenario->Put($subName, $scenarioName);
1590                    # If this scenario is new, we need to create it.
1591                    if (! $scenarios{$scenarioName}) {
1592                        Trace("Creating scenario $scenarioName.") if T(3);
1593                        $scenarios{$scenarioName} = 1;
1594                        # Create the scenario itself.
1595                        $loadScenario->Put($scenarioName);
1596                        # Attach the input compounds.
1597                        for my $input ($sub->get_hope_input_compounds($scenarioName)) {
1598                            $loadIsInputFor->Put($input, $scenarioName);
1599                        }
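1600                        # Now we need to set up the output compounds. They come in two
1601                        # groups, which are intended to be marked 0 and 1. NOTE(review): the loop below redeclares $outputGroup as its iteration variable, so the group's array reference (not 0 or 1) is what gets passed to Put; confirm and fix.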
1602                        my $outputGroup = 0;
1603                        # Set up the output compounds.
1604                        for my $outputGroup ($sub->get_hope_output_compounds($scenarioName)) {
1605                            # Attach the compounds.
1606                            for my $compound (@$outputGroup) {
1607                                $loadIsOutputOf->Put($scenarioName, $compound, $outputGroup);
1608                            }
1609                        }
1610                        # Create the reaction lists.
1611                        my @addReactions = $sub->get_hope_additional_reactions($scenarioName);
1612                        for my $reaction (@addReactions) {
1613                            $loadIncludesReaction->Put($scenarioName, $reaction);
1614                        }
1615                        my @notReactions = $sub->get_hope_ignore_reactions($scenarioName);
1616                        for my $reaction (@notReactions) {
1617                            $loadExcludesReaction->Put($scenarioName, $reaction);
1618                        }
1619                        # Link the maps.
1620                        my @maps = $sub->get_hope_map_ids($scenarioName);
1621                        for my $map (@maps) {
1622                            $loadIsOnDiagram->Put($scenarioName, "map$map");
1623                        }
1624                    }
1625                }
1626            }
1627      }      }
1628      # Finish the load.      # Finish the load.
1629      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1630      return $retVal;      return $retVal;
1631  }  }
1632    
1633  =head3 LoadGroupData  =head3 LoadSynonymData
1634    
1635  C<< my $stats = $spl->LoadGroupData(); >>      my $stats = $spl->LoadSynonymData();
1636    
1637  Load the genome Groups into Sprout.  Load the synonym groups into Sprout.
1638    
1639  The following relations are loaded by this method.  The following relations are loaded by this method.
1640    
1641      GenomeGroups      SynonymGroup
1642        IsSynonymGroupFor
1643    
1644  There is no direct support for genome groups in FIG, so we access the SEED  The source information for these relations is taken from the C<maps_to_id> method
1645  files directly.  of the B<FIG> object. Unfortunately, to make this work, we need to use direct
1646    SQL against the FIG database.
1647    
1648  =over 4  =over 4
1649    
# Line 1353  Line 1655 
1655    
1656  =cut  =cut
1657  #: Return Type $%;  #: Return Type $%;
1658  sub LoadGroupData {  sub LoadSynonymData {
1659      # Get this object instance.      # Get this object instance.
1660      my ($self) = @_;      my ($self) = @_;
1661      # Get the FIG object.      # Get the FIG object.
# Line 1361  Line 1663 
1663      # Get the genome hash.      # Get the genome hash.
1664      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1665      # Create a load object for the table we're loading.      # Create a load object for the table we're loading.
1666      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups');      my $loadSynonymGroup = $self->_TableLoader('SynonymGroup');
1667        my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor');
1668      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1669          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1670      } else {      } else {
1671          Trace("Generating group data.") if T(2);          Trace("Generating synonym group data.") if T(2);
1672            # Get the database handle.
1673            my $dbh = $fig->db_handle();
1674            # Ask for the synonyms. Note that "maps_to" is a group name, and "syn_id" is a PEG ID or alias.
1675            my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
1676            my $result = $sth->execute();
1677            if (! defined($result)) {
1678                Confess("Database error in Synonym load: " . $sth->errstr());
1679            } else {
1680                Trace("Processing synonym results.") if T(2);
1681                # Remember the current synonym.
1682                my $current_syn = "";
1683                # Count the features.
1684                my $featureCount = 0;
1685                my $entryCount = 0;
1686                # Loop through the synonym/peg pairs.
1687                while (my @row = $sth->fetchrow()) {
1688                    # Get the synonym group ID and feature ID.
1689                    my ($syn_id, $peg) = @row;
1690                    # Count this row.
1691                    $entryCount++;
1692                    if ($entryCount % 1000 == 0) {
1693                        Trace("$entryCount rows processed.") if T(3);
1694                    }
1695                    # Insure it's for one of our genomes.
1696                    my $genomeID = FIG::genome_of($peg);
1697                    if (exists $genomeHash->{$genomeID}) {
1698                        # Verify the synonym.
1699                        if ($syn_id ne $current_syn) {
1700                            # It's new, so put it in the group table.
1701                            $loadSynonymGroup->Put($syn_id);
1702                            $current_syn = $syn_id;
1703                        }
1704                        # Connect the synonym to the peg.
1705                        $loadIsSynonymGroupFor->Put($syn_id, $peg);
1706                        # Count this feature.
1707                        $featureCount++;
1708                        if ($featureCount % 1000 == 0) {
1709                            Trace("$featureCount features processed.") if T(3);
1710                        }
1711                    }
1712                }
1713                Trace("$entryCount rows produced $featureCount features.") if T(2);
1714            }
1715        }
1716        # Finish the load.
1717        my $retVal = $self->_FinishAll();
1718        return $retVal;
1719    }
1720    
1721    =head3 LoadFamilyData
1722    
1723        my $stats = $spl->LoadFamilyData();
1724    
1725    Load the protein families into Sprout.
1726    
1727    The following relations are loaded by this method.
1728    
1729        Family
1730        IsFamilyForFeature
1731    
1732    The source information for these relations is taken from the C<families_for_protein>,
1733    C<family_function>, and C<sz_family> methods of the B<FIG> object.
1734    
1735    =over 4
1736    
1737    =item RETURNS
1738    
1739    Returns a statistics object for the loads.
1740    
1741    =back
1742    
1743    =cut
1744    #: Return Type $%;
1745    sub LoadFamilyData {
1746        # Get this object instance.
1747        my ($self) = @_;
1748        # Get the FIG object.
1749        my $fig = $self->{fig};
1750        # Get the genome hash.
1751        my $genomeHash = $self->{genomes};
1752        # Create load objects for the tables we're loading.
1753        my $loadFamily = $self->_TableLoader('Family');
1754        my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature');
1755        if ($self->{options}->{loadOnly}) {
1756            Trace("Loading from existing files.") if T(2);
1757        } else {
1758            Trace("Generating family data.") if T(2);
1759            # Create a hash for the family IDs.
1760            my %familyHash = ();
1761          # Loop through the genomes.          # Loop through the genomes.
1762          my $line;          for my $genomeID (sort keys %{$genomeHash}) {
1763          for my $genomeID (keys %{$genomeHash}) {              Trace("Processing features for $genomeID.") if T(2);
1764              Trace("Processing $genomeID.") if T(3);              # Loop through this genome's PEGs.
1765              # Open the NMPDR group file for this genome.              for my $fid ($fig->all_features($genomeID, "peg")) {
1766              if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&                  $loadIsFamilyForFeature->Add("features", 1);
1767                  defined($line = <TMP>)) {                  # Get this feature's families.
1768                  # Clean the line ending.                  my @families = $fig->families_for_protein($fid);
1769                  chomp $line;                  # Loop through the families, connecting them to the feature.
1770                  # Add the group to the table. Note that there can only be one group                  for my $family (@families) {
1771                  # per genome.                      $loadIsFamilyForFeature->Put($family, $fid);
1772                  $loadGenomeGroups->Put($genomeID, $line);                      # If this is a new family, create a record for it.
1773                        if (! exists $familyHash{$family}) {
1774                            $familyHash{$family} = 1;
1775                            $loadFamily->Add("families", 1);
1776                            my $size = $fig->sz_family($family);
1777                            my $func = $fig->family_function($family);
1778                            $loadFamily->Put($family, $size, $func);
1779                        }
1780                    }
1781              }              }
             close TMP;  
1782          }          }
1783      }      }
1784      # Finish the load.      # Finish the load.
# Line 1387  Line 1786 
1786      return $retVal;      return $retVal;
1787  }  }
1788    
1789    =head3 LoadDrugData
1790    
1791        my $stats = $spl->LoadDrugData();
1792    
1793    Load the drug target data into Sprout.
1794    
1795    The following relations are loaded by this method.
1796    
1797        PDB
1798        DocksWith
1799        IsProteinForFeature
1800        Ligand
1801    
1802    The source information for these relations is taken from attributes. The
1803    C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>.
1804    The C<zinc_name> attribute describes the ligands. The C<docking_results>
1805    attribute contains the information for the B<DocksWith> relationship. It is
1806    expected that additional attributes and tables will be added in the future.
1807    
1808    =over 4
1809    
1810    =item RETURNS
1811    
1812    Returns a statistics object for the loads.
1813    
1814    =back
1815    
1816    =cut
1817    #: Return Type $%;
sub LoadDrugData {
    # Purpose: generate (or reuse) the load files for the PDB, Ligand,
    # IsProteinForFeature, and DocksWith relations, then finish all the loads
    # and return the accumulated statistics object.
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    # Create load objects for the tables we're loading.
    my $loadPDB = $self->_TableLoader('PDB');
    my $loadLigand = $self->_TableLoader('Ligand');
    my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature');
    my $loadDocksWith = $self->_TableLoader('DocksWith');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating drug target data.") if T(2);
        # First comes the "DocksWith" relationship. This will give us a list of PDBs.
        # We can also encounter PDBs when we process "IsProteinForFeature". To manage
        # this process, PDB information is collected in a hash table and then
        # unspooled after both relationships are created. The hash value is the
        # number of docking results recorded for the PDB.
        my %pdbHash = ();
        Trace("Generating docking data.") if T(2);
        # Get all the docking data. This may cause problems if there are too many PDBs,
        # at which point we'll need another algorithm. The indicator that this is
        # happening will be a timeout error in the next statement.
        my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
                                              ['docking_results', $FIG_Config::dockLimit]);
        Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
        for my $dockData (@dockData) {
            # Get the docking data components.
            my ($pdbID, $docking_key, @valueData) = @{$dockData};
            # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
            $pdbID = lc $pdbID;
            # Strip off the object type.
            $pdbID =~ s/pdb://;
            # Extract the ZINC ID from the docking key. Note that there are two possible
            # formats (with and without the "ZINC" prefix).
            my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/;
            if (! $zinc_id) {
                Trace("Invalid docking result key $docking_key for $pdbID.") if T(0);
                $loadDocksWith->Add("errors");
            } else {
                # Get the pieces of the value and parse the energy.
                # Note that we don't care about the rank, since
                # we can sort on the energy level itself in our database.
                my ($energy, $tool, $type) = @valueData;
                my (undef, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
                # Ignore predicted results.
                if ($type ne "Predicted") {
                    # Count this docking result.
                    $pdbHash{$pdbID}++;
                    # Write the result to the output.
                    $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool,
                                        $total, $vanderwaals);
                }
            }
        }
        Trace("Connecting features.") if T(2);
        # Loop through the genomes.
        for my $genome (sort keys %{$genomeHash}) {
            Trace("Generating PDBs for $genome.") if T(3);
            # Get all of the PDBs that BLAST against this genome's features.
            my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%');
            for my $pdbData (@attributeData) {
                # The PDB ID is coded as a subkey.
                if ($pdbData->[1] !~ /PDB::(.+)/i) {
                    Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0);
                    $loadPDB->Add("errors");
                } else {
                    # NOTE(review): unlike the docking section above, this ID is not
                    # lower-cased; confirm that PDB subkeys are always lower-case.
                    my $pdbID = $1;
                    # Insure the PDB is in the hash.
                    if (! exists $pdbHash{$pdbID}) {
                        $pdbHash{$pdbID} = 0;
                    }
                    # The score and locations are coded in the attribute value.
                    if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
                        Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0);
                        $loadIsProteinForFeature->Add("errors");
                    } else {
                        my ($score, $locData) = ($1,$2);
                        # The location data may not be present, so we have to start with some
                        # defaults and then check. The captures are only used when the
                        # match succeeds; after a failed match $1 and $2 would still hold
                        # the score and raw location text from the match above.
                        my ($start, $end) = (1, 0);
                        if ($locData && $locData =~ /(\d+)-(\d+)/) {
                            ($start, $end) = ($1, $2);
                        }
                        # If we still don't have the end location, compute it from
                        # the feature length.
                        if (! $end) {
                            # Most features have one location, but we do a list iteration
                            # just in case.
                            my @locations = $fig->feature_location($pdbData->[0]);
                            $end = 0;
                            for my $loc (@locations) {
                                my $locObject = BasicLocation->new($loc);
                                $end += $locObject->Length;
                            }
                        }
                        # Decode the score.
                        my $realScore = FIGRules::DecodeScore($score);
                        # Connect the PDB to the feature.
                        $loadIsProteinForFeature->Put($pdbID, $pdbData->[0], $start, $realScore, $end);
                    }
                }
            }
        }
        # We've got all our PDBs now, so we unspool them from the hash.
        Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2);
        my $count = 0;
        for my $pdbID (sort keys %pdbHash) {
            $loadPDB->Put($pdbID, $pdbHash{$pdbID});
            $count++;
            Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0);
        }
        # Finally we create the ligand table. This information can be found in the
        # zinc_name attribute.
        Trace("Loading ligands.") if T(2);
        # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
        my $last_zinc_id = "";
        # The paging cursor is the object ID of the last row seen, valid or not. This
        # guarantees that every non-empty batch moves the cursor forward, so a batch
        # consisting only of malformed IDs cannot make us repeat the same query forever.
        my $cursor = "ZINC:";
        my $done = 0;
        while (! $done) {
            # Get the next 10000 ligands. We insist that the object ID is greater than
            # the last ID we processed.
            Trace("Loading batch starting with $cursor.") if T(3);
            my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
                                                       [$cursor, "zinc_name"]);
            Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
            if (! @attributeData) {
                # Here there are no attributes left, so we quit the loop.
                $done = 1;
            } else {
                # Process the attribute data we've received.
                for my $zinc_data (@attributeData) {
                    # Advance the cursor past this row regardless of its validity.
                    $cursor = $zinc_data->[0];
                    # The ZINC ID is found in the first return column, prefixed with the word ZINC.
                    if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
                        my $zinc_id = $1;
                        # Check for a duplicate.
                        if ($zinc_id eq $last_zinc_id) {
                            $loadLigand->Add("duplicate");
                        } else {
                            # Here it's safe to output the ligand. The ligand name is the attribute value
                            # (third column in the row).
                            $loadLigand->Put($zinc_id, $zinc_data->[2]);
                            # Insure we don't try to add this ID again.
                            $last_zinc_id = $zinc_id;
                        }
                    } else {
                        Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
                        $loadLigand->Add("errors");
                    }
                }
            }
        }
        Trace("Ligands loaded.") if T(2);
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1984    
1985    
1986  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1987    
1988    =head3 SpecialAttribute
1989    
1990        my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader);
1991    
1992    Look for special attributes of a given type. A special attribute is found by comparing one of
1993    the columns of the incoming attribute list to a search pattern. If a match is found, then
1994    a set of columns is put into an output table connected to the specified ID.
1995    
1996    For example, when processing features, the attribute list we look at has three columns: attribute
1997    name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
1998    begins with C<iedb_>. The call signature is therefore
1999    
2000        my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB);
2001    
2002    The pattern is matched against column 0, and if we have a match, then the values of columns 0 and 2
2003    (joined by a single space) are put to the output along with the specified feature ID.
2004    
2005    =over 4
2006    
2007    =item id
2008    
2009    ID of the object whose special attributes are being loaded. This forms the first column of the
2010    output.
2011    
2012    =item attributes
2013    
2014    Reference to a list of tuples.
2015    
2016    =item idxMatch
2017    
2018    Index in each tuple of the column to be matched against the pattern. If the match is
2019    successful, an output record will be generated.
2020    
2021    =item idxValues
2022    
2023    Reference to a list containing the indexes in each tuple of the columns to be put as
2024    the second column of the output.
2025    
2026    =item pattern
2027    
2028    Pattern to be matched against the specified column. The match will be case-insensitive.
2029    
2030    =item loader
2031    
2032    An object to which each output record will be put. Usually this is an B<ERDBLoad> object,
2033    but technically it could be anything with a C<Put> method.
2034    
2035    =item RETURN
2036    
2037    Returns a count of the matches found.
2038    
2039    
2040    
2041    =back
2042    
2043    =cut
2044    
sub SpecialAttribute {
    # Scan a list of attribute tuples and, for every tuple whose match column
    # fits the pattern (case-insensitively), put one record to the loader. The
    # record is the incoming ID plus the selected value columns joined by spaces.
    # The return value is the number of records put.
    my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_;
    # Running count of the matching tuples.
    my $matches = 0;
    for my $tuple (@{$attributes}) {
        # Tuples that fail the pattern produce no output.
        next unless $tuple->[$idxMatch] =~ m/$pattern/i;
        # Collect the requested columns, in the order given, into a single value.
        my @pieces = map { $tuple->[$_] } @{$idxValues};
        $loader->Put($id, join(" ", @pieces));
        $matches++;
    }
    # Only trace when something was actually found.
    if (T(4) && $matches) {
        Trace("$matches special attributes found for $id and loader " . $loader->RelName() . ".");
    }
    return $matches;
}
2065    
2066  =head3 TableLoader  =head3 TableLoader
2067    
2068  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 1403  Line 2077 
2077    
2078  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
2079    
 =item ignore  
   
 TRUE if the table should be ignored entirely, else FALSE.  
   
2080  =item RETURN  =item RETURN
2081    
2082  Returns an ERDBLoad object for loading the specified table.  Returns an ERDBLoad object for loading the specified table.
# Line 1417  Line 2087 
2087    
2088  sub _TableLoader {  sub _TableLoader {
2089      # Get the parameters.      # Get the parameters.
2090      my ($self, $tableName, $ignore) = @_;      my ($self, $tableName) = @_;
2091      # Create the load object.      # Create the load object.
2092      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly,      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly);
                                $ignore);  
2093      # Cache it in the loader list.      # Cache it in the loader list.
2094      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
2095      # Return it to the caller.      # Return it to the caller.
# Line 1454  Line 2123 
2123      my $retVal = Stats->new();      my $retVal = Stats->new();
2124      # Get the loader list.      # Get the loader list.
2125      my $loadList = $self->{loaders};      my $loadList = $self->{loaders};
2126        # Create a hash to hold the statistics objects, keyed on relation name.
2127        my %loaderHash = ();
2128      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
2129      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads more restartable.
2130      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
2131          # Get the relation name.          # Get the relation name.
2132          my $relName = $loader->RelName;          my $relName = $loader->RelName;
# Line 1466  Line 2137 
2137              # Here we really need to finish.              # Here we really need to finish.
2138              Trace("Finishing $relName.") if T(2);              Trace("Finishing $relName.") if T(2);
2139              my $stats = $loader->Finish();              my $stats = $loader->Finish();
2140                $loaderHash{$relName} = $stats;
2141            }
2142        }
2143        # Now we loop through again, actually loading the tables. We want to finish before
2144        # loading so that if something goes wrong at this point, all the load files are usable
2145        # and we don't have to redo all that work.
2146        for my $relName (sort keys %loaderHash) {
2147            # Get the statistics for this relation.
2148            my $stats = $loaderHash{$relName};
2149            # Check for a database load.
2150              if ($self->{options}->{dbLoad}) {              if ($self->{options}->{dbLoad}) {
2151                  # Here we want to use the load file just created to load the database.                  # Here we want to use the load file just created to load the database.
2152                  Trace("Loading relation $relName.") if T(2);                  Trace("Loading relation $relName.") if T(2);
# Line 1476  Line 2157 
2157              $retVal->Accumulate($stats);              $retVal->Accumulate($stats);
2158              Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);              Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
2159          }          }
     }  
2160      # Return the load statistics.      # Return the load statistics.
2161      return $retVal;      return $retVal;
2162  }  }
2163    
2164    =head3 GetGenomeAttributes
2165    
2166        my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys);
2167    
2168    Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
2169    attributes for all the features of a genome in a single call, then organizes them into
2170    a hash.
2171    
2172    =over 4
2173    
2174    =item fig
2175    
2176    FIG-like object for accessing attributes.
2177    
2178    =item genomeID
2179    
2180    ID of the genome whose attributes are desired.
2181    
2182    =item fids
2183    
2184    Reference to a list of the feature IDs whose attributes are to be kept.
2185    
2186    =item propKeys
2187    
2188    Reference to a list of the attribute keys to retrieve.
2189    
2190    =item RETURN
2191    
2192    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
2193    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
2194    the attribute key, and one or more attribute values.
2195    
2196    =back
2197    
2198    =cut
2199    
sub GetGenomeAttributes {
    # Build a hash mapping each requested feature ID to a list of its attribute
    # tuples. All of the genome's attributes are requested in one call; if that
    # call fails, the features are re-queried in blocks of 100.
    my ($fig, $genomeID, $fids, $propKeys) = @_;
    # Every requested feature starts with an empty list. This marks which
    # features we keep and guarantees the caller a list reference for each one.
    my %attributesOf = map { ($_ => []) } @{$fids};
    # First attempt: one query for the whole genome. The eval traps any error
    # thrown by the query (the original author cites timeouts).
    my @tuples = ();
    eval {
        @tuples = $fig->get_attributes("fig|$genomeID%", $propKeys);
        Trace(scalar(@tuples) . " attributes returned for genome $genomeID.") if T(3);
    };
    if ($@) {
        Trace("Retrying attributes for $genomeID due to error: $@") if T(1);
        # Fallback: walk the feature list 100 IDs at a time. Slower, but it lets
        # the load continue.
        my $lastIndex = $#{$fids};
        for (my $first = 0; $first <= $lastIndex; $first += 100) {
            # The block normally ends 99 positions later, clipped at the end of the list.
            my $last = $first + 99;
            $last = $lastIndex if $last > $lastIndex;
            my @block = @{$fids}[$first .. $last];
            Trace("Retrieving attributes for fids $first to $last.") if T(3);
            my @found = $fig->get_attributes(\@block, $propKeys);
            Trace(scalar(@found) . " attributes returned for fids $first to $last.") if T(3);
            push @tuples, @found;
        }
    }
    # File each tuple under its feature ID (the tuple's first element), dropping
    # tuples for features the caller did not ask about.
    for my $tuple (@tuples) {
        my $fid = $tuple->[0];
        next unless exists $attributesOf{$fid};
        push @{$attributesOf{$fid}}, $tuple;
    }
    return \%attributesOf;
}
2248    
2249    =head3 GetCommaList
2250    
2251        my $string = GetCommaList($value);
2252    
2253    Create a comma-separated list of the values in a list reference. If the
2254    value is a plain scalar instead, it will be returned unchanged. If it is
2255    undefined, an empty string will be returned. The idea is that we may be
2256    looking at a string, a list, or nothing, but whatever comes out will be a
2257    string.
2258    
2259    =over 4
2260    
2261    =item value
2262    
2263    Reference to a list of values to be assembled into the return string.
2264    
2265    =item RETURN
2266    
2267    Returns a scalar string containing the content of the input value.
2268    
2269    =back
2270    
2271    =cut
2272    
sub GetCommaList {
    # Turn the incoming value into a string: an undefined value becomes the
    # empty string, an array reference becomes its elements joined by ", ",
    # and anything else is simply stringified.
    my ($value) = @_;
    # Nothing in, empty string out.
    return "" unless defined $value;
    # A list reference gets joined; any other value is flattened to a scalar.
    return (ref $value eq 'ARRAY') ? join(", ", @{$value}) : "$value";
}
2292    
2293    
2294  1;  1;

Legend:
Removed from v.1.32  
changed lines
  Added in v.1.96

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3