[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.7, Tue Sep 13 19:05:20 2005 UTC revision 1.91, Thu Feb 14 19:15:18 2008 UTC
# Line 7  Line 7 
7      use PageBuilder;      use PageBuilder;
8      use ERDBLoad;      use ERDBLoad;
9      use FIG;      use FIG;
10        use FIGRules;
11      use Sprout;      use Sprout;
12      use Stats;      use Stats;
13      use BasicLocation;      use BasicLocation;
14        use HTML;
15        use AliasAnalysis;
16    
17  =head1 Sprout Load Methods  =head1 Sprout Load Methods
18    
# Line 29  Line 32 
32      $stats->Accumulate($spl->LoadFeatureData());      $stats->Accumulate($spl->LoadFeatureData());
33      print $stats->Show();      print $stats->Show();
34    
 This module makes use of the internal Sprout property C<_erdb>.  
   
35  It is worth noting that the FIG object does not need to be a real one. Any object  It is worth noting that the FIG object does not need to be a real one. Any object
36  that implements the FIG methods for data retrieval could be used. So, for example,  that implements the FIG methods for data retrieval could be used. So, for example,
37  this object could be used to copy data from one Sprout database to another, or  this object could be used to copy data from one Sprout database to another, or
# Line 51  Line 52 
52    
53  =head3 new  =head3 new
54    
55  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile); >>      my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options);
56    
57  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
58  the names of the files containing the list of genomes and subsystems to use.  the names of the files containing the list of genomes and subsystems to use.
# Line 79  Line 80 
80  =item subsysFile  =item subsysFile
81    
82  Either the name of the file containing the list of trusted subsystems or a reference  Either the name of the file containing the list of trusted subsystems or a reference
83  to a list of subsystem names. If nothing is specified, all known subsystems will be  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
84  considered trusted. Only subsystem data related to the trusted subsystems is loaded.  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
85    in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded.
86    
87    =item options
88    
89    Reference to a hash of command-line options.
90    
91  =back  =back
92    
# Line 88  Line 94 
94    
95  sub new {  sub new {
96      # Get the parameters.      # Get the parameters.
97      my ($class, $sprout, $fig, $genomeFile, $subsysFile) = @_;      my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
98      # Load the list of genomes into a hash.      # Create the genome hash.
99      my %genomes;      my %genomes = ();
100        # We only need it if load-only is NOT specified.
101        if (! $options->{loadOnly}) {
102      if (! defined($genomeFile) || $genomeFile eq '') {      if (! defined($genomeFile) || $genomeFile eq '') {
103          # Here we want all the complete genomes and an access code of 1.          # Here we want all the complete genomes and an access code of 1.
104          my @genomeList = $fig->genomes(1);          my @genomeList = $fig->genomes(1);
105          %genomes = map { $_ => 1 } @genomeList;          %genomes = map { $_ => 1 } @genomeList;
106                Trace(scalar(keys %genomes) . " genomes found.") if T(3);
107      } else {      } else {
108          my $type = ref $genomeFile;          my $type = ref $genomeFile;
109          Trace("Genome file parameter type is \"$type\".") if T(3);          Trace("Genome file parameter type is \"$type\".") if T(3);
# Line 114  Line 123 
123                  # an omitted access code can be defaulted to 1.                  # an omitted access code can be defaulted to 1.
124                  for my $genomeLine (@genomeList) {                  for my $genomeLine (@genomeList) {
125                      my ($genomeID, $accessCode) = split("\t", $genomeLine);                      my ($genomeID, $accessCode) = split("\t", $genomeLine);
126                      if (undef $accessCode) {                          if (! defined($accessCode)) {
127                          $accessCode = 1;                          $accessCode = 1;
128                      }                      }
129                      $genomes{$genomeID} = $accessCode;                      $genomes{$genomeID} = $accessCode;
# Line 124  Line 133 
133              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
134          }          }
135      }      }
136        }
137      # Load the list of trusted subsystems.      # Load the list of trusted subsystems.
138      my %subsystems = ();      my %subsystems = ();
139        # We only need it if load-only is NOT specified.
140        if (! $options->{loadOnly}) {
141      if (! defined $subsysFile || $subsysFile eq '') {      if (! defined $subsysFile || $subsysFile eq '') {
142          # Here we want all the subsystems.              # Here we want all the usable subsystems. First we get the whole list.
143          %subsystems = map { $_ => 1 } $fig->all_subsystems();              my @subs = $fig->all_subsystems();
144                # Loop through, checking for the NMPDR file.
145                for my $sub (@subs) {
146                    if ($fig->nmpdr_subsystem($sub)) {
147                        $subsystems{$sub} = 1;
148                    }
149                }
150      } else {      } else {
151          my $type = ref $subsysFile;          my $type = ref $subsysFile;
152          if ($type eq 'ARRAY') {          if ($type eq 'ARRAY') {
# Line 148  Line 166 
166              Confess("Invalid subsystem parameter in SproutLoad constructor.");              Confess("Invalid subsystem parameter in SproutLoad constructor.");
167          }          }
168      }      }
169            # Go through the subsys hash again, creating the keyword list for each subsystem.
170            for my $subsystem (keys %subsystems) {
171                my $name = $subsystem;
172                $name =~ s/_/ /g;
173                $subsystems{$subsystem} = $name;
174            }
175        }
176        # Get the list of NMPDR-oriented attribute keys.
177        my @propKeys = $fig->get_group_keys("NMPDR");
178      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
179      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
180      # Create the Sprout load object.      # Create the Sprout load object.
# Line 157  Line 184 
184                    subsystems => \%subsystems,                    subsystems => \%subsystems,
185                    sprout => $sprout,                    sprout => $sprout,
186                    loadDirectory => $directory,                    loadDirectory => $directory,
187                    erdb => $sprout->{_erdb},                    erdb => $sprout,
188                    loaders => []                    loaders => [],
189                      options => $options,
190                      propKeys => \@propKeys,
191                   };                   };
192      # Bless and return it.      # Bless and return it.
193      bless $retVal, $class;      bless $retVal, $class;
194      return $retVal;      return $retVal;
195  }  }
196    
197    =head3 LoadOnly
198    
199        my $flag = $spl->LoadOnly;
200    
201    Return TRUE if we are in load-only mode, else FALSE.
202    
203    =cut
204    
205    sub LoadOnly {
206        my ($self) = @_;
207        return $self->{options}->{loadOnly};
208    }
209    
210    
211  =head3 LoadGenomeData  =head3 LoadGenomeData
212    
213  C<< my $stats = $spl->LoadGenomeData(); >>      my $stats = $spl->LoadGenomeData();
214    
215  Load the Genome, Contig, and Sequence data from FIG into Sprout.  Load the Genome, Contig, and Sequence data from FIG into Sprout.
216    
# Line 192  Line 235 
235    
236  =back  =back
237    
 B<TO DO>  
   
 Real quality vectors instead of C<unknown> for everything.  
   
 GenomeGroup relation. (The original script took group information from the C<NMPDR> file  
 in each genome's main directory, but no such file exists anywhere in my version of the  
 data store.)  
   
238  =cut  =cut
239  #: Return Type $%;  #: Return Type $%;
240  sub LoadGenomeData {  sub LoadGenomeData {
# Line 210  Line 245 
245      # Get the genome count.      # Get the genome count.
246      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
247      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
     Trace("Beginning genome data load.") if T(2);  
248      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
249      my $loadGenome = $self->_TableLoader('Genome', $genomeCount);      my $loadGenome = $self->_TableLoader('Genome');
250      my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300);      my $loadHasContig = $self->_TableLoader('HasContig');
251      my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300);      my $loadContig = $self->_TableLoader('Contig');
252      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf');
253      my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000);      my $loadSequence = $self->_TableLoader('Sequence');
254        if ($self->{options}->{loadOnly}) {
255            Trace("Loading from existing files.") if T(2);
256        } else {
257            Trace("Generating genome data.") if T(2);
258      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
259      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
260          Trace("Loading data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
261          $loadGenome->Add("genomeIn");          $loadGenome->Add("genomeIn");
262          # The access code comes in via the genome hash.          # The access code comes in via the genome hash.
263          my $accessCode = $genomeHash->{$genomeID};          my $accessCode = $genomeHash->{$genomeID};
264          # Get the genus, species, and strain from the scientific name. Note that we append              # Get the genus, species, and strain from the scientific name.
         # the genome ID to the strain. In some cases this is the totality of the strain name.  
265          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);
266          my $extra = join " ", @extraData, "[$genomeID]";              my $extra = join " ", @extraData;
267          # Get the full taxonomy.          # Get the full taxonomy.
268          my $taxonomy = $fig->taxonomy_of($genomeID);          my $taxonomy = $fig->taxonomy_of($genomeID);
269                # Get the version. If no version is specified, we default to the genome ID by itself.
270                my $version = $fig->genome_version($genomeID);
271                if (! defined($version)) {
272                    $version = $genomeID;
273                }
274                # Get the DNA size.
275                my $dnaSize = $fig->genome_szdna($genomeID);
276                # Open the NMPDR group file for this genome.
277                my $group;
278                if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
279                    defined($group = <TMP>)) {
280                    # Clean the line ending.
281                    chomp $group;
282                } else {
283                    # No group, so use the default.
284                    $group = $FIG_Config::otherGroup;
285                }
286                close TMP;
287          # Output the genome record.          # Output the genome record.
288          $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),
289                           $species, $extra, $taxonomy);                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);
290          # Now we loop through each of the genome's contigs.          # Now we loop through each of the genome's contigs.
291          my @contigs = $fig->all_contigs($genomeID);          my @contigs = $fig->all_contigs($genomeID);
292          for my $contigID (@contigs) {          for my $contigID (@contigs) {
# Line 262  Line 317 
317              }              }
318          }          }
319      }      }
320        }
321      # Finish the loads.      # Finish the loads.
322      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
323      # Return the result.      # Return the result.
324      return $retVal;      return $retVal;
325  }  }
326    
 =head3 LoadCouplingData  
   
 C<< my $stats = $spl->LoadCouplingData(); >>  
   
 Load the coupling and evidence data from FIG into Sprout.  
   
 The coupling data specifies which genome features are functionally coupled. The  
 evidence data explains why the coupling is functional.  
   
 The following relations are loaded by this method.  
   
     Coupling  
     IsEvidencedBy  
     PCH  
     ParticipatesInCoupling  
     UsesAsEvidence  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadCouplingData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the genome hash.  
     my $genomeFilter = $self->{genomes};  
     my $genomeCount = (keys %{$genomeFilter});  
     my $featureCount = $genomeCount * 4000;  
     # Start the loads.  
     my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount);  
     my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000);  
     my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000);  
     my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000);  
     my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000);  
     Trace("Beginning coupling data load.") if T(2);  
     # Loop through the genomes found.  
     for my $genome (sort keys %{$genomeFilter}) {  
         Trace("Generating coupling data for $genome.") if T(3);  
         $loadCoupling->Add("genomeIn");  
         # Create a hash table for holding coupled pairs. We use this to prevent  
         # duplicates. For example, if A is coupled to B, we don't want to also  
         # assert that B is coupled to A, because we already know it. Fortunately,  
         # all couplings occur within a genome, so we can keep the hash table  
         # size reasonably small.  
         my %dupHash = ();  
         # Get all of the genome's PEGs.  
         my @pegs = $fig->pegs_of($genome);  
         # Loop through the PEGs.  
         for my $peg1 (@pegs) {  
             $loadCoupling->Add("pegIn");  
             Trace("Processing PEG $peg1 for $genome.") if T(4);  
             # Get a list of the coupled PEGs.  
             my @couplings = $fig->coupled_to($peg1);  
             # For each coupled PEG, we need to verify that a coupling already  
             # exists. If not, we have to create one.  
             for my $coupleData (@couplings) {  
                 my ($peg2, $score) = @{$coupleData};  
                 # Compute the coupling ID.  
                 my $coupleID = Sprout::CouplingID($peg1, $peg2);  
                 if (! exists $dupHash{$coupleID}) {  
                     $loadCoupling->Add("couplingIn");  
                     # Here we have a new coupling to store in the load files.  
                     Trace("Storing coupling ($coupleID) with score $score.") if T(4);  
                     # Ensure we don't do this again.  
                     $dupHash{$coupleID} = $score;  
                     # Write the coupling record.  
                     $loadCoupling->Put($coupleID, $score);  
                     # Connect it to the coupled PEGs.  
                     $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);  
                     $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);  
                     # Get the evidence for this coupling.  
                     my @evidence = $fig->coupling_evidence($peg1, $peg2);  
                     # Organize the evidence into a hash table.  
                     my %evidenceMap = ();  
                     # Process each evidence item.  
                     for my $evidenceData (@evidence) {  
                         $loadPCH->Add("evidenceIn");  
                         my ($peg3, $peg4, $usage) = @{$evidenceData};  
                         # Only proceed if the evidence is from a Sprout  
                         # genome.  
                         if ($genomeFilter->{$fig->genome_of($peg3)}) {  
                             $loadUsesAsEvidence->Add("evidenceChosen");  
                             my $evidenceKey = "$coupleID $peg3 $peg4";  
                             # We store this evidence in the hash if the usage  
                             # is nonzero or no prior evidence has been found. This  
                             # ensures that if there is duplicate evidence, we  
                             # at least keep the meaningful ones. Only evidence in  
                             # the hash makes it to the output.  
                             if ($usage || ! exists $evidenceMap{$evidenceKey}) {  
                                 $evidenceMap{$evidenceKey} = $evidenceData;  
                             }  
                         }  
                     }  
                     for my $evidenceID (keys %evidenceMap) {  
                         # Create the evidence record.  
                         my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};  
                         $loadPCH->Put($evidenceID, $usage);  
                         # Connect it to the coupling.  
                         $loadIsEvidencedBy->Put($coupleID, $evidenceID);  
                         # Connect it to the features.  
                         $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);  
                         $loadUsesAsEvidence->Put($evidenceID, $peg4, 1);  
                     }  
                 }  
             }  
         }  
     }  
     # All done. Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
327  =head3 LoadFeatureData  =head3 LoadFeatureData
328    
329  C<< my $stats = $spl->LoadFeatureData(); >>      my $stats = $spl->LoadFeatureData();
330    
331  Load the feature data from FIG into Sprout.  Load the feature data from FIG into Sprout.
332    
# Line 400  Line 336 
336    
337      Feature      Feature
338      FeatureAlias      FeatureAlias
339        IsAliasOf
340      FeatureLink      FeatureLink
341      FeatureTranslation      FeatureTranslation
342      FeatureUpstream      FeatureUpstream
343      IsLocatedIn      IsLocatedIn
344        HasFeature
345        HasRoleInSubsystem
346        FeatureEssential
347        FeatureVirulent
348        FeatureIEDB
349        CDD
350        IsPresentOnProteinOf
351    
352  =over 4  =over 4
353    
# Line 418  Line 362 
362  sub LoadFeatureData {  sub LoadFeatureData {
363      # Get this object instance.      # Get this object instance.
364      my ($self) = @_;      my ($self) = @_;
365      # Get the FIG object.      # Get the FIG and Sprout objects.
366      my $fig = $self->{fig};      my $fig = $self->{fig};
367        my $sprout = $self->{sprout};
368      # Get the table of genome IDs.      # Get the table of genome IDs.
369      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
370      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
371      my $loadFeature = $self->_TableLoader('Feature', $featureCount);      my $loadFeature = $self->_TableLoader('Feature');
372      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn');
373      my $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
374      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);      my $loadIsAliasOf = $self->_TableLoader('IsAliasOf');
375      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
376      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
377        my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
378        my $loadHasFeature = $self->_TableLoader('HasFeature');
379        my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem');
380        my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
381        my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
382        my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
383        my $loadCDD = $self->_TableLoader('CDD');
384        my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');
385        # Get the subsystem hash.
386        my $subHash = $self->{subsystems};
387        # Get the property keys.
388        my $propKeys = $self->{propKeys};
389        # Create hashes to hold CDD and alias values.
390        my %CDD = ();
391        my %alias = ();
392      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
393      # locations.      # locations.
394      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
395      Trace("Beginning feature data load.") if T(2);      if ($self->{options}->{loadOnly}) {
396            Trace("Loading from existing files.") if T(2);
397        } else {
398            Trace("Generating feature data.") if T(2);
399      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
400      for my $genomeID (sort keys %{$genomeHash}) {          my @allGenomes = sort keys %{$genomeHash};
401            Trace(scalar(@allGenomes) . " genomes found in list.") if T(3);
402            for my $genomeID (@allGenomes) {
403          Trace("Loading features for genome $genomeID.") if T(3);          Trace("Loading features for genome $genomeID.") if T(3);
404          $loadFeature->Add("genomeIn");          $loadFeature->Add("genomeIn");
405          # Get the feature list for this genome.          # Get the feature list for this genome.
406          my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
407                # Sort and count the list.
408                my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
409                my $count = scalar @featureTuples;
410                my @fids = map { $_->[0] } @featureTuples;
411                Trace("$count features found for genome $genomeID.") if T(3);
412                # Get the attributes for this genome and put them in a hash by feature ID.
413                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys);
414                Trace("Looping through features for $genomeID.") if T(3);
415                # Set up for our duplicate-feature check.
416                my $oldFeatureID = "";
417          # Loop through the features.          # Loop through the features.
418          for my $featureData (@{$features}) {              for my $featureTuple (@featureTuples) {
             $loadFeature->Add("featureIn");  
419              # Split the tuple.              # Split the tuple.
420              my ($featureID, $locations, $aliases, $type) = @{$featureData};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
421              # Create the feature record.                  # Check for duplicates.
422              $loadFeature->Put($featureID, 1, $type);                  if ($featureID eq $oldFeatureID) {
423                        Trace("Duplicate feature $featureID found.") if T(1);
424                    } else {
425                        $oldFeatureID = $featureID;
426                        # Count this feature.
427                        $loadFeature->Add("featureIn");
428                        # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
429                        # Sprout database requires a single character.
430                        if (! defined($quality) || $quality eq "") {
431                            $quality = " ";
432                        }
433                        # Begin building the keywords. We start with the genome ID, the
434                        # feature ID, the taxonomy, and the organism name.
435                        my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
436                                        $fig->taxonomy_of($genomeID));
437              # Create the aliases.              # Create the aliases.
438              for my $alias (split /\s*,\s*/, $aliases) {                      for my $alias ($fig->feature_aliases($featureID)) {
439                  $loadFeatureAlias->Put($featureID, $alias);                          #Connect this alias to this feature.
440              }                          $loadIsAliasOf->Put($alias, $featureID);
441                            push @keywords, $alias;
442                            # If this is a locus tag, also add its natural form as a keyword.
443                            my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
444                            if ($naturalName) {
445                                push @keywords, $naturalName;
446                            }
447                            # If this is the first time for the specified alias, create its
448                            # alias record.
449                            if (! exists $alias{$alias}) {
450                                $loadFeatureAlias->Put($alias);
451                                $alias{$alias} = 1;
452                            }
453                        }
454                        Trace("Assignment for $featureID is: $assignment") if T(4);
455                        # Break the assignment into words and shove it onto the
456                        # keyword list.
457                        push @keywords, split(/\s+/, $assignment);
458                        # Link this feature to the parent genome.
459                        $loadHasFeature->Put($genomeID, $featureID, $type);
460              # Get the links.              # Get the links.
461              my @links = $fig->fid_links($featureID);              my @links = $fig->fid_links($featureID);
462              for my $link (@links) {              for my $link (@links) {
# Line 470  Line 475 
475                      $loadFeatureUpstream->Put($featureID, $upstream);                      $loadFeatureUpstream->Put($featureID, $upstream);
476                  }                  }
477              }              }
478                        # Now we need to find the subsystems this feature participates in.
479                        # We also add the subsystems to the keyword list. Before we do that,
480                        # we must convert underscores to spaces.
481                        my @subsystems = $fig->peg_to_subsystems($featureID);
482                        for my $subsystem (@subsystems) {
483                            # Only proceed if we like this subsystem.
484                            if (exists $subHash->{$subsystem}) {
485                                # Store the has-role link.
486                                $loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type);
487                                # Save the subsystem's keyword data.
488                                my $subKeywords = $subHash->{$subsystem};
489                                push @keywords, split /\s+/, $subKeywords;
490                                # Now we need to get this feature's role in the subsystem.
491                                my $subObject = $fig->get_subsystem($subsystem);
492                                my @roleColumns = $subObject->get_peg_roles($featureID);
493                                my @allRoles = $subObject->get_roles();
494                                for my $col (@roleColumns) {
495                                    my $role = $allRoles[$col];
496                                    push @keywords, split /\s+/, $role;
497                                    push @keywords, $subObject->get_role_abbr($col);
498                                }
499                            }
500                        }
501                        # There are three special attributes computed from property
502                        # data that we build next. If the special attribute is non-empty,
503                        # its name will be added to the keyword list. First, we get all
504                        # the attributes for this feature. They will come back as
505                        # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
506                        # [name, value, value with URL]. (We don't need the PEG, since
507                        # we already know it.)
508                        my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
509                                             @{$attributes->{$featureID}};
510                        # Now we process each of the special attributes.
511                        if (SpecialAttribute($featureID, \@attributes,
512                                             1, [0,2], '^(essential|potential_essential)$',
513                                             $loadFeatureEssential)) {
514                            push @keywords, 'essential';
515                            $loadFeature->Add('essential');
516                        }
517                        if (SpecialAttribute($featureID, \@attributes,
518                                             0, [2], '^virulen',
519                                             $loadFeatureVirulent)) {
520                            push @keywords, 'virulent';
521                            $loadFeature->Add('virulent');
522                        }
523                        if (SpecialAttribute($featureID, \@attributes,
524                                             0, [0,2], '^iedb_',
525                                             $loadFeatureIEDB)) {
526                            push @keywords, 'iedb';
527                            $loadFeature->Add('iedb');
528                        }
529                        # Now we have some other attributes we need to process. Currently,
530                        # this is CDD and CELLO, but we expect the number to increase.
531                        my %attributeHash = ();
532                        for my $attrRow (@{$attributes->{$featureID}}) {
533                            my (undef, $key, @values) = @{$attrRow};
534                            $key =~ /^([^:]+)::(.+)/;
535                            if (exists $attributeHash{$1}) {
536                                $attributeHash{$1}->{$2} = \@values;
537                            } else {
538                                $attributeHash{$1} = {$2 => \@values};
539                            }
540                        }
541                        my $celloValue = "unknown";
542                        # Pull in the CELLO attribute. There will never be more than one.
543                        # If we have one, it's a feature attribute AND a keyword.
544                        my @celloData = keys %{$attributeHash{CELLO}};
545                        if (@celloData) {
546                            $celloValue = $celloData[0];
547                            push @keywords, $celloValue;
548                        }
549                        # Now we handle CDD. This is a bit more complicated, because
550                        # there are multiple CDDs per protein.
551                        if (exists $attributeHash{CDD}) {
552                            # Get the hash of CDD IDs to scores for this feature. We
553                            # already know it exists because of the above IF.
554                            my $cddHash = $attributeHash{CDD};
555                            my @cddData = sort keys %{$cddHash};
556                            for my $cdd (@cddData) {
557                                # Extract the score for this CDD and decode it.
558                                my ($codeScore) = split(/\s*,\s*/, $cddHash->{$cdd}->[1]);
559                                my $realScore = FIGRules::DecodeScore($codeScore);
560                                # We can't afford to crash because of a bad attribute
561                                # value, hence the IF below.
562                                if (! defined($realScore)) {
563                                    # Bad score, so count it.
564                                    $loadFeature->Add('badCDDscore');
565                                } else {
566                                    # Create the connection.
567                                    $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
568                                    # If this CDD does not yet exist, create its record.
569                                    if (! exists $CDD{$cdd}) {
570                                        $CDD{$cdd} = 1;
571                                        $loadCDD->Put($cdd);
572                                    }
573                                }
574                            }
575                        }
576                        # Now we need to bust up hyphenated words in the keyword
577                        # list. We keep them separate and put them at the end so
578                        # the original word order is available.
579                        my $keywordString = "";
580                        my $bustedString = "";
581                        for my $keyword (@keywords) {
582                            if (length $keyword >= 3) {
583                                $keywordString .= " $keyword";
584                                if ($keyword =~ /-/) {
585                                    my @words = split /-/, $keyword;
586                                    $bustedString .= join(" ", "", @words);
587                                }
588                            }
589                        }
590                        $keywordString .= $bustedString;
591                        # Get rid of annoying punctuation.
592                        $keywordString =~ s/[();]//g;
593                        # Clean the keyword list.
594                        my $cleanWords = $sprout->CleanKeywords($keywordString);
595                        Trace("Keyword string for $featureID: $cleanWords") if T(4);
596                        # Now we need to process the feature's locations. First, we split them up.
597                        my @locationList = split /\s*,\s*/, $locations;
598                        # Next, we convert them to Sprout location objects.
599                        my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
600                        # Assemble them into a sprout location string for later.
601                        my $locationString = join(", ", map { $_->String } @locObjectList);
602              # This part is the roughest. We need to relate the features to contig              # This part is the roughest. We need to relate the features to contig
603              # locations, and the locations must be split so that none of them exceed              # locations, and the locations must be split so that none of them exceed
604              # the maximum segment size. This simplifies the genes_in_region processing              # the maximum segment size. This simplifies the genes_in_region processing
605              # for Sprout.                      # for Sprout. To start, we create the location position indicator.
606              my @locationList = split /\s*,\s*/, $locations;                      my $i = 1;
607              # Loop through the locations.              # Loop through the locations.
608              for my $location (@locationList) {                      for my $locObject (@locObjectList) {
609                  # Parse the location.                          # Split this location into a list of chunks.
                 my $locObject = BasicLocation->new($location);  
                 # Split it into a list of chunks.  
610                  my @locOList = ();                  my @locOList = ();
611                  while (my $peeling = $locObject->Peel($chunkSize)) {                  while (my $peeling = $locObject->Peel($chunkSize)) {
612                      $loadIsLocatedIn->Add("peeling");                      $loadIsLocatedIn->Add("peeling");
# Line 488  Line 615 
615                  push @locOList, $locObject;                  push @locOList, $locObject;
616                  # Loop through the chunks, creating IsLocatedIn records. The variable                  # Loop through the chunks, creating IsLocatedIn records. The variable
617                  # "$i" will be used to keep the location index.                  # "$i" will be used to keep the location index.
                 my $i = 1;  
618                  for my $locChunk (@locOList) {                  for my $locChunk (@locOList) {
619                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,
620                                            $locChunk->Dir, $locChunk->Length, $i);                                            $locChunk->Dir, $locChunk->Length, $i);
621                      $i++;                      $i++;
622                  }                  }
623              }              }
624                        # The Sprout location string was already assembled above, before the locations were split.
625                        # Create the feature record.
626                        $loadFeature->Put($featureID, 1, $user, $quality, $celloValue, $type, $assignment, $cleanWords, $locationString);
627          }          }
628      }      }
629      # Finish the loads.              Trace("Genome $genomeID processed.") if T(3);
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
 =head3 LoadBBHData  
   
 C<< my $stats = $spl->LoadBBHData(); >>  
   
 Load the bidirectional best hit data from FIG into Sprout.  
   
 Sprout does not store information on similarities. Instead, it has only the  
 bi-directional best hits. Even so, the BBH table is one of the largest in  
 the database.  
   
 The following relations are loaded by this method.  
   
     IsBidirectionalBestHitOf  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadBBHData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the table of genome IDs.  
     my $genomeHash = $self->{genomes};  
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf',  
                                                            $featureCount * $genomeCount);  
     Trace("Beginning BBH load.") if T(2);  
     # Now we loop through the genomes, generating the data for each one.  
     for my $genomeID (sort keys %{$genomeHash}) {  
         $loadIsBidirectionalBestHitOf->Add("genomeIn");  
         Trace("Processing features for genome $genomeID.") if T(3);  
         # Get the feature list for this genome.  
         my $features = $fig->all_features_detailed($genomeID);  
         # Loop through the features.  
         for my $featureData (@{$features}) {  
             # Split the tuple.  
             my ($featureID, $locations, $aliases, $type) = @{$featureData};  
             # Get the bi-directional best hits.  
             my @bbhList = $fig->bbhs($featureID);  
             for my $bbhEntry (@bbhList) {  
                 # Get the target feature ID and the score.  
                 my ($targetID, $score) = @{$bbhEntry};  
                 # Check the target feature's genome.  
                 my $targetGenomeID = $fig->genome_of($targetID);  
                 # Only proceed if it's one of our genomes.  
                 if ($genomeHash->{$targetGenomeID}) {  
                     $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,  
                                                        $score);  
                 }  
             }  
630          }          }
631      }      }
632      # Finish the loads.      # Finish the loads.
# Line 571  Line 636 
636    
637  =head3 LoadSubsystemData  =head3 LoadSubsystemData
638    
639  C<< my $stats = $spl->LoadSubsystemData(); >>      my $stats = $spl->LoadSubsystemData();
640    
641  Load the subsystem data from FIG into Sprout.  Load the subsystem data from FIG into Sprout.
642    
# Line 584  Line 649 
649  The following relations are loaded by this method.  The following relations are loaded by this method.
650    
651      Subsystem      Subsystem
652        SubsystemClass
653      Role      Role
654        RoleEC
655        IsIdentifiedByEC
656      SSCell      SSCell
657      ContainsFeature      ContainsFeature
658      IsGenomeOf      IsGenomeOf
# Line 592  Line 660 
660      OccursInSubsystem      OccursInSubsystem
661      ParticipatesIn      ParticipatesIn
662      HasSSCell      HasSSCell
663        ConsistsOfRoles
664        RoleSubset
665        HasRoleSubset
666        ConsistsOfGenomes
667        GenomeSubset
668        HasGenomeSubset
669        Catalyzes
670        Diagram
671        RoleOccursIn
672    
673  =over 4  =over 4
674    
# Line 601  Line 678 
678    
679  =back  =back
680    
 B<TO DO>  
   
 Generate RoleName table?  
   
681  =cut  =cut
682  #: Return Type $%;  #: Return Type $%;
683  sub LoadSubsystemData {  sub LoadSubsystemData {
# Line 618  Line 691 
691      # Get the subsystem hash. This lists the subsystems we'll process.      # Get the subsystem hash. This lists the subsystems we'll process.
692      my $subsysHash = $self->{subsystems};      my $subsysHash = $self->{subsystems};
693      my @subsysIDs = sort keys %{$subsysHash};      my @subsysIDs = sort keys %{$subsysHash};
694      my $subsysCount = @subsysIDs;      # Get the map list.
695      my $genomeCount = (keys %{$genomeHash});      my @maps = $fig->all_maps;
     my $featureCount = $genomeCount * 4000;  
696      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
697      my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);      my $loadDiagram = $self->_TableLoader('Diagram');
698      my $loadRole = $self->_TableLoader('Role', $featureCount * 6);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn');
699      my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);      my $loadSubsystem = $self->_TableLoader('Subsystem');
700      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);      my $loadRole = $self->_TableLoader('Role');
701      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);      my $loadRoleEC = $self->_TableLoader('RoleEC');
702      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount);      my $loadIsIdentifiedByEC = $self->_TableLoader('IsIdentifiedByEC');
703      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);      my $loadCatalyzes = $self->_TableLoader('Catalyzes');
704      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);      my $loadSSCell = $self->_TableLoader('SSCell');
705      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature');
706      Trace("Beginning subsystem data load.") if T(2);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf');
707        my $loadIsRoleOf = $self->_TableLoader('IsRoleOf');
708        my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem');
709        my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn');
710        my $loadHasSSCell = $self->_TableLoader('HasSSCell');
711        my $loadRoleSubset = $self->_TableLoader('RoleSubset');
712        my $loadGenomeSubset = $self->_TableLoader('GenomeSubset');
713        my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles');
714        my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes');
715        my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');
716        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');
717        my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');
718        if ($self->{options}->{loadOnly}) {
719            Trace("Loading from existing files.") if T(2);
720        } else {
721            Trace("Generating subsystem data.") if T(2);
722            # This hash will contain the roles for each EC. When we're done, this
723            # information will be used to generate the Catalyzes table.
724            my %ecToRoles = ();
725      # Loop through the subsystems. Our first task will be to create the      # Loop through the subsystems. Our first task will be to create the
726      # roles. We do this by looping through the subsystems and creating a      # roles. We do this by looping through the subsystems and creating a
727      # role hash. The hash tracks each role ID so that we don't create      # role hash. The hash tracks each role ID so that we don't create
728      # duplicates. As we move along, we'll connect the roles and subsystems.          # duplicates. As we move along, we'll connect the roles and subsystems
729            # and memorize the reactions.
730            my ($genomeID, $roleID);
731      my %roleData = ();      my %roleData = ();
732      for my $subsysID (@subsysIDs) {      for my $subsysID (@subsysIDs) {
733                # Get the subsystem object.
734                my $sub = $fig->get_subsystem($subsysID);
735                # Only proceed if the subsystem has a spreadsheet.
736                if (defined($sub) && ! $sub->{empty_ss}) {
737          Trace("Creating subsystem $subsysID.") if T(3);          Trace("Creating subsystem $subsysID.") if T(3);
738          $loadSubsystem->Add("subsystemIn");          $loadSubsystem->Add("subsystemIn");
739          # Create the subsystem record.          # Create the subsystem record.
740          $loadSubsystem->Put($subsysID);                  my $curator = $sub->get_curator();
741          # Get the subsystem's roles.                  my $notes = $sub->get_notes();
742          my @roles = $fig->subsystem_to_roles($subsysID);                  $loadSubsystem->Put($subsysID, $curator, $notes);
743          # Connect the roles to the subsystem. If a role is new, we create                  # Now for the classification string. This comes back as a list
744          # a role record for it.                  # reference and we convert it to a string delimited by $FIG_Config::splitter.
745          for my $roleID (@roles) {                  my $classList = $fig->subsystem_classification($subsysID);
746                    my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
747                    $loadSubsystemClass->Put($subsysID, $classString);
748                    # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
749                    for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
750                        # Get the role's abbreviation.
751                        my $abbr = $sub->get_role_abbr($col);
752                        # Connect to this role.
753              $loadOccursInSubsystem->Add("roleIn");              $loadOccursInSubsystem->Add("roleIn");
754              $loadOccursInSubsystem->Put($roleID, $subsysID);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $col);
755                        # If it's a new role, add it to the role table.
756              if (! exists $roleData{$roleID}) {              if (! exists $roleData{$roleID}) {
757                            # Get the role's abbreviation.
758                            # Add the role.
759                  $loadRole->Put($roleID);                  $loadRole->Put($roleID);
760                  $roleData{$roleID} = 1;                  $roleData{$roleID} = 1;
761                            # Check for an EC number.
762                            if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
763                                my $ec = $1;
764                                $loadIsIdentifiedByEC->Put($roleID, $ec);
765                                # Check to see if this is our first encounter with this EC.
766                                if (exists $ecToRoles{$ec}) {
767                                    # No, so just add this role to the EC list.
768                                    push @{$ecToRoles{$ec}}, $roleID;
769                                } else {
770                                    # Output this EC.
771                                    $loadRoleEC->Put($ec);
772                                    # Create its role list.
773                                    $ecToRoles{$ec} = [$roleID];
774                                }
775                            }
776              }              }
777          }          }
778          # Now all roles for this subsystem have been filled in. We create the                  # Now we create the spreadsheet for the subsystem by matching roles to
779          # spreadsheet by matches roles to genomes. To do this, we need to                  # genomes. Each genome is a row and each role is a column. We may need
780          # get the genomes on the sheet.                  # to actually create the roles as we find them.
781          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
782          my @genomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)};                  for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) {
783          for my $genomeID (@genomes) {                      # Only proceed if this is one of our genomes.
             # Only process this genome if it's one of ours.  
784              if (exists $genomeHash->{$genomeID}) {              if (exists $genomeHash->{$genomeID}) {
785                  # Connect the genome to the subsystem.                          # Count the PEGs and cells found for verification purposes.
786                  $loadParticipatesIn->Put($genomeID, $subsysID);                          my $pegCount = 0;
787                            my $cellCount = 0;
788                            # Create a list for the PEGs we find. This list will be used
789                            # to generate cluster numbers.
790                            my @pegsFound = ();
791                            # Create a hash that maps spreadsheet IDs to PEGs. We will
792                            # use this to generate the ContainsFeature data after we have
793                            # the cluster numbers.
794                            my %cellPegs = ();
795                            # Get the genome's variant code for this subsystem.
796                            my $variantCode = $sub->get_variant_code($row);
797                  # Loop through the subsystem's roles. We use an index because it is                  # Loop through the subsystem's roles. We use an index because it is
798                  # part of the spreadsheet cell ID.                  # part of the spreadsheet cell ID.
799                  for (my $i = 0; $i <= $#roles; $i++) {                          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
                     my $role = $roles[$i];  
800                      # Get the features in the spreadsheet cell for this genome and role.                      # Get the features in the spreadsheet cell for this genome and role.
801                      my @pegs = $fig->pegs_in_subsystem_cell($subsysID, $genomeID, $i);                              my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col);
802                      # Only proceed if features exist.                      # Only proceed if features exist.
803                      if (@pegs > 0) {                      if (@pegs > 0) {
804                          # Create the spreadsheet cell.                          # Create the spreadsheet cell.
805                          my $cellID = "$subsysID:$genomeID:$i";                                  $cellCount++;
806                                    my $cellID = "$subsysID:$genomeID:$col";
807                          $loadSSCell->Put($cellID);                          $loadSSCell->Put($cellID);
808                          $loadIsGenomeOf->Put($genomeID, $cellID);                          $loadIsGenomeOf->Put($genomeID, $cellID);
809                          $loadIsRoleOf->Put($role, $cellID);                                  $loadIsRoleOf->Put($roleID, $cellID);
810                          $loadHasSSCell->Put($subsysID, $cellID);                          $loadHasSSCell->Put($subsysID, $cellID);
811                          # Attach the features to it.                                  # Remember its features.
812                          for my $pegID (@pegs) {                                  push @pegsFound, @pegs;
813                              $loadContainsFeature->Put($cellID, $pegID);                                  $cellPegs{$cellID} = \@pegs;
814                                    $pegCount += @pegs;
815                                }
816                            }
817                            # If we found some cells for this genome, we need to compute clusters and
818                            # denote it participates in the subsystem.
819                            if ($pegCount > 0) {
820                                Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
821                                $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
822                                # Create a hash mapping PEG IDs to cluster numbers.
823                                # We default to -1 for all of them.
824                                my %clusterOf = map { $_ => -1 } @pegsFound;
825                                # Partition the PEGs found into clusters.
826                                my @clusters = $fig->compute_clusters([keys %clusterOf], $sub);
827                                for (my $i = 0; $i <= $#clusters; $i++) {
828                                    my $subList = $clusters[$i];
829                                    for my $peg (@{$subList}) {
830                                        $clusterOf{$peg} = $i;
831                                    }
832                                }
833                                # Create the ContainsFeature data.
834                                for my $cellID (keys %cellPegs) {
835                                    my $cellList = $cellPegs{$cellID};
836                                    for my $cellPeg (@$cellList) {
837                                        $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg});
838                          }                          }
839                      }                      }
840                  }                  }
841              }              }
842          }          }
843                    # Now we need to generate the subsets. The subset names must be concatenated to
844                    # the subsystem name to make them unique keys. There are two types of subsets:
845                    # genome subsets and role subsets. We do the role subsets first.
846                    my @subsetNames = $sub->get_subset_names();
847                    for my $subsetID (@subsetNames) {
848                        # Create the subset record.
849                        my $actualID = "$subsysID:$subsetID";
850                        $loadRoleSubset->Put($actualID);
851                        # Connect the subset to the subsystem.
852                        $loadHasRoleSubset->Put($subsysID, $actualID);
853                        # Connect the subset to its roles.
854                        my @roles = $sub->get_subsetC_roles($subsetID);
855                        for my $roleID (@roles) {
856                            $loadConsistsOfRoles->Put($actualID, $roleID);
857      }      }
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
858  }  }
859                    # Next the genome subsets.
860  =head3 LoadDiagramData                  @subsetNames = $sub->get_subset_namesR();
861                    for my $subsetID (@subsetNames) {
862  C<< my $stats = $spl->LoadDiagramData(); >>                      # Create the subset record.
863                        my $actualID = "$subsysID:$subsetID";
864  Load the diagram data from FIG into Sprout.                      $loadGenomeSubset->Put($actualID);
865                        # Connect the subset to the subsystem.
866  Diagrams are used to organize functional roles. The diagram shows the                      $loadHasGenomeSubset->Put($subsysID, $actualID);
867  connections between chemicals that interact with a subsystem.                      # Connect the subset to its genomes.
868                        my @genomes = $sub->get_subsetR($subsetID);
869  The following relations are loaded by this method.                      for my $genomeID (@genomes) {
870                            $loadConsistsOfGenomes->Put($actualID, $genomeID);
871      Diagram                      }
872      RoleOccursIn                  }
873                }
874  =over 4          }
875            # Now we loop through the diagrams. We need to create the diagram records
876  =item RETURNS          # and link each diagram to its roles. Note that only roles which occur
877            # in subsystems (and therefore appear in the %ecToRoles hash) are
878  Returns a statistics object for the loads.          # included.
879            for my $map (@maps) {
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadDiagramData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the map list.  
     my @maps = $fig->all_maps;  
     my $mapCount = @maps;  
     my $genomeCount = (keys %{$self->{genomes}});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);  
     my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);  
     Trace("Beginning diagram data load.") if T(2);  
     # Loop through the diagrams.  
     for my $map ($fig->all_maps) {  
880          Trace("Loading diagram $map.") if T(3);          Trace("Loading diagram $map.") if T(3);
881          # Get the diagram's descriptive name.          # Get the diagram's descriptive name.
882          my $name = $fig->map_name($map);          my $name = $fig->map_name($map);
# Line 739  Line 884 
884          # Now we need to link all the map's roles to it.          # Now we need to link all the map's roles to it.
885          # A hash is used to prevent duplicates.          # A hash is used to prevent duplicates.
886          my %roleHash = ();          my %roleHash = ();
887          for my $role ($fig->map_to_ecs($map)) {              for my $ec ($fig->map_to_ecs($map)) {
888                    if (exists $ecToRoles{$ec}) {
889                        for my $role (@{$ecToRoles{$ec}}) {
890              if (! $roleHash{$role}) {              if (! $roleHash{$role}) {
891                  $loadRoleOccursIn->Put($role, $map);                  $loadRoleOccursIn->Put($role, $map);
892                  $roleHash{$role} = 1;                  $roleHash{$role} = 1;
893              }              }
894          }          }
895      }      }
896                }
897            }
898            # Before we leave, we must create the Catalyzes table. We start with the reactions,
899            # then use the "ecToRoles" table to convert EC numbers to role IDs.
900            my @reactions = $fig->all_reactions();
901            for my $reactionID (@reactions) {
902                # Get this reaction's list of roles. The results will be EC numbers.
903                my @ecs = $fig->catalyzed_by($reactionID);
904                # Loop through the roles, creating catalyzation records.
905                for my $thisEC (@ecs) {
906                    if (exists $ecToRoles{$thisEC}) {
907                        for my $thisRole (@{$ecToRoles{$thisEC}}) {
908                            $loadCatalyzes->Put($thisRole, $reactionID);
909                        }
910                    }
911                }
912            }
913        }
914      # Finish the load.      # Finish the load.
915      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
916      return $retVal;      return $retVal;
# Line 753  Line 918 
918    
919  =head3 LoadPropertyData  =head3 LoadPropertyData
920    
921  C<< my $stats = $spl->LoadPropertyData(); >>      my $stats = $spl->LoadPropertyData();
922    
923  Load the attribute data from FIG into Sprout.  Load the attribute data from FIG into Sprout.
924    
# Line 787  Line 952 
952      my $fig = $self->{fig};      my $fig = $self->{fig};
953      # Get the genome hash.      # Get the genome hash.
954      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
955      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
956      my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500);      my $loadProperty = $self->_TableLoader('Property');
957      my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500);      my $loadHasProperty = $self->_TableLoader('HasProperty');
958      Trace("Beginning property data load.") if T(2);      if ($self->{options}->{loadOnly}) {
959            Trace("Loading from existing files.") if T(2);
960        } else {
961            Trace("Generating property data.") if T(2);
962      # Create a hash for storing property IDs.      # Create a hash for storing property IDs.
963      my %propertyKeys = ();      my %propertyKeys = ();
964      my $nextID = 1;      my $nextID = 1;
965            # Get the attributes we intend to store in the property table.
966            my $propKeys = $self->{propKeys};
967      # Loop through the genomes.      # Loop through the genomes.
968      for my $genomeID (keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
969          $loadProperty->Add("genomeIn");          $loadProperty->Add("genomeIn");
970          # Get the genome's features. The feature ID is the first field in the              Trace("Generating properties for $genomeID.") if T(3);
971          # tuples returned by "all_features_detailed". We use "all_features_detailed"              # Initialize a counter.
972          # rather than "all_features" because we want all features regardless of type.              my $propertyCount = 0;
973          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};              # Get the properties for this genome's features.
974          # Loop through the features, creating HasProperty records.              my @attributes = $fig->get_attributes("fig|$genomeID%", $propKeys);
975          for my $fid (@features) {              Trace("Property list built for $genomeID.") if T(3);
976              $loadProperty->Add("featureIn");              # Loop through the results, creating HasProperty records.
977              # Get all attributes for this feature. We do this one feature at a time              for my $attributeData (@attributes) {
978              # to insure we do not get any genome attributes.                  # Pull apart the attribute tuple.
979              my @attributeList = $fig->get_attributes($fid, '', '', '');                  my ($fid, $key, $value, $url) = @{$attributeData};
             # Loop through the attributes.  
             for my $tuple (@attributeList) {  
                 # Get this attribute value's data. Note that we throw away the FID,  
                 # since it will always be the same as the value if "$fid".  
                 my (undef, $key, $value, $url) = @{$tuple};  
980                  # Concatenate the key and value and check the "propertyKeys" hash to                  # Concatenate the key and value and check the "propertyKeys" hash to
981                  # see if we already have an ID for it. We use a tab for the separator                  # see if we already have an ID for it. We use a tab for the separator
982                  # character.                  # character.
# Line 830  Line 994 
994                  # Create the HasProperty entry for this feature/property association.                  # Create the HasProperty entry for this feature/property association.
995                  $loadHasProperty->Put($fid, $propertyID, $url);                  $loadHasProperty->Put($fid, $propertyID, $url);
996              }              }
997                # Update the statistics.
998                Trace("$propertyCount attributes processed.") if T(3);
999                $loadHasProperty->Add("propertiesIn", $propertyCount);
1000          }          }
1001      }      }
1002      # Finish the load.      # Finish the load.
# Line 839  Line 1006 
1006    
1007  =head3 LoadAnnotationData  =head3 LoadAnnotationData
1008    
1009  C<< my $stats = $spl->LoadAnnotationData(); >>      my $stats = $spl->LoadAnnotationData();
1010    
1011  Load the annotation data from FIG into Sprout.  Load the annotation data from FIG into Sprout.
1012    
# Line 871  Line 1038 
1038      my $fig = $self->{fig};      my $fig = $self->{fig};
1039      # Get the genome hash.      # Get the genome hash.
1040      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1041      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1042      my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000);      my $loadAnnotation = $self->_TableLoader('Annotation');
1043      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation');
1044      my $loadSproutUser = $self->_TableLoader('SproutUser', 100);      my $loadSproutUser = $self->_TableLoader('SproutUser');
1045      my $loadUserAccess = $self->_TableLoader('UserAccess', 1000);      my $loadUserAccess = $self->_TableLoader('UserAccess');
1046      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation');
1047      Trace("Beginning annotation data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1048            Trace("Loading from existing files.") if T(2);
1049        } else {
1050            Trace("Generating annotation data.") if T(2);
1051      # Create a hash of user names. We'll use this to prevent us from generating duplicate      # Create a hash of user names. We'll use this to prevent us from generating duplicate
1052      # user records.      # user records.
1053      my %users = ( FIG => 1, master => 1 );      my %users = ( FIG => 1, master => 1 );
# Line 892  Line 1061 
1061      # Loop through the genomes.      # Loop through the genomes.
1062      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
1063          Trace("Processing $genomeID.") if T(3);          Trace("Processing $genomeID.") if T(3);
         # Get the genome's PEGs.  
         my @pegs = $fig->pegs_of($genomeID);  
         for my $peg (@pegs) {  
             Trace("Processing $peg.") if T(4);  
1064              # Create a hash of timestamps. We use this to prevent duplicate time stamps              # Create a hash of timestamps. We use this to prevent duplicate time stamps
1065              # from showing up for a single PEG's annotations.              # from showing up for a single PEG's annotations.
1066              my %seenTimestamps = ();              my %seenTimestamps = ();
1067              # Check for a functional assignment.              # Get the genome's annotations.
1068              my $func = $fig->function_of($peg);              my @annotations = $fig->read_all_annotations($genomeID);
1069              if ($func) {              Trace("Processing annotations.") if T(2);
1070                  # If this is NOT a hypothetical assignment, we create an              for my $tuple (@annotations) {
1071                  # assignment annotation for it.                  # Get the annotation tuple.
1072                  if (! FIG::hypo($peg)) {                  my ($peg, $timestamp, $user, $text) = @{$tuple};
                     # Note that we double the slashes so that what goes into the database is  
                     # a new-line escape sequence rather than an actual new-line.  
                     $loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func");  
                     $loadIsTargetOfAnnotation->Put($peg, "$peg:$time");  
                     $loadMadeAnnotation->Put("FIG", "$peg:$time");  
                     # Denote we've seen this timestamp.  
                     $seenTimestamps{$time} = 1;  
                 }  
                 # Now loop through the real annotations.  
                 for my $tuple ($fig->feature_annotations($peg, "raw")) {  
                     my ($fid, $timestamp, $user, $text) = @{$tuple};  
1073                      # Here we fix up the annotation text. "\r" is removed,                      # Here we fix up the annotation text. "\r" is removed,
1074                      # and "\t" and "\n" are escaped. Note we use the "s"                  # and "\t" and "\n" are escaped. Note we use the "gs"
1075                      # modifier so that new-lines inside the text do not                      # modifier so that new-lines inside the text do not
1076                      # stop the substitution search.                      # stop the substitution search.
1077                      $text =~ s/\r//gs;                      $text =~ s/\r//gs;
# Line 927  Line 1081 
1081                      $text =~ s/Set master function/Set FIG function/s;                      $text =~ s/Set master function/Set FIG function/s;
1082                      # Insure the time stamp is valid.                      # Insure the time stamp is valid.
1083                      if ($timestamp =~ /^\d+$/) {                      if ($timestamp =~ /^\d+$/) {
1084                          # Here it's a number. We need to insure it's unique.                      # Here it's a number. We need to insure the one we use to form
1085                          while ($seenTimestamps{$timestamp}) {                      # the key is unique.
1086                              $timestamp++;                      my $keyStamp = $timestamp;
1087                        while ($seenTimestamps{"$peg:$keyStamp"}) {
1088                            $keyStamp++;
1089                          }                          }
1090                          $seenTimestamps{$timestamp} = 1;                      my $annotationID = "$peg:$keyStamp";
1091                          my $annotationID = "$peg:$timestamp";                      $seenTimestamps{$annotationID} = 1;
1092                          # Insure the user exists.                          # Insure the user exists.
1093                          if (! $users{$user}) {                          if (! $users{$user}) {
1094                              $loadSproutUser->Put($user, "SEED user");                              $loadSproutUser->Put($user, "SEED user");
# Line 940  Line 1096 
1096                              $users{$user} = 1;                              $users{$user} = 1;
1097                          }                          }
1098                          # Generate the annotation.                          # Generate the annotation.
1099                          $loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text");                      $loadAnnotation->Put($annotationID, $timestamp, $text);
1100                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);
1101                          $loadMadeAnnotation->Put($user, $annotationID);                          $loadMadeAnnotation->Put($user, $annotationID);
1102                      } else {                      } else {
# Line 950  Line 1106 
1106                  }                  }
1107              }              }
1108          }          }
     }  
1109      # Finish the load.      # Finish the load.
1110      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1111      return $retVal;      return $retVal;
# Line 958  Line 1113 
1113    
1114  =head3 LoadSourceData  =head3 LoadSourceData
1115    
1116  C<< my $stats = $spl->LoadSourceData(); >>      my $stats = $spl->LoadSourceData();
1117    
1118  Load the source data from FIG into Sprout.  Load the source data from FIG into Sprout.
1119    
# Line 991  Line 1146 
1146      my $fig = $self->{fig};      my $fig = $self->{fig};
1147      # Get the genome hash.      # Get the genome hash.
1148      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1149      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1150      my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4);      my $loadComesFrom = $self->_TableLoader('ComesFrom');
1151      my $loadSource = $self->_TableLoader('Source', $genomeCount * 4);      my $loadSource = $self->_TableLoader('Source');
1152      my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8);      my $loadSourceURL = $self->_TableLoader('SourceURL');
1153      Trace("Beginning source data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1154            Trace("Loading from existing files.") if T(2);
1155        } else {
1156            Trace("Generating annotation data.") if T(2);
1157      # Create hashes to collect the Source information.      # Create hashes to collect the Source information.
1158      my %sourceURL = ();      my %sourceURL = ();
1159      my %sourceDesc = ();      my %sourceDesc = ();
# Line 1010  Line 1167 
1167              chomp $line;              chomp $line;
1168              my($sourceID, $desc, $url) = split(/\t/,$line);              my($sourceID, $desc, $url) = split(/\t/,$line);
1169              $loadComesFrom->Put($genomeID, $sourceID);              $loadComesFrom->Put($genomeID, $sourceID);
1170              if ($url && ! exists $sourceURL{$genomeID}) {                  if ($url && ! exists $sourceURL{$sourceID}) {
1171                  $loadSourceURL->Put($sourceID, $url);                  $loadSourceURL->Put($sourceID, $url);
1172                  $sourceURL{$sourceID} = 1;                  $sourceURL{$sourceID} = 1;
1173              }              }
1174              if ($desc && ! exists $sourceDesc{$sourceID}) {                  if ($desc) {
1175                  $loadSource->Put($sourceID, $desc);                      $sourceDesc{$sourceID} = $desc;
1176                  $sourceDesc{$sourceID} = 1;                  } elsif (! exists $sourceDesc{$sourceID}) {
1177                        $sourceDesc{$sourceID} = $sourceID;
1178              }              }
1179          }          }
1180          close TMP;          close TMP;
1181      }      }
1182            # Write the source descriptions.
1183            for my $sourceID (keys %sourceDesc) {
1184                $loadSource->Put($sourceID, $sourceDesc{$sourceID});
1185            }
1186        }
1187      # Finish the load.      # Finish the load.
1188      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1189      return $retVal;      return $retVal;
# Line 1028  Line 1191 
1191    
1192  =head3 LoadExternalData  =head3 LoadExternalData
1193    
1194  C<< my $stats = $spl->LoadExternalData(); >>      my $stats = $spl->LoadExternalData();
1195    
1196  Load the external data from FIG into Sprout.  Load the external data from FIG into Sprout.
1197    
# Line 1060  Line 1223 
1223      my $fig = $self->{fig};      my $fig = $self->{fig};
1224      # Get the genome hash.      # Get the genome hash.
1225      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1226      # Convert the genome hash. We'll get the genus and species for each genome and make      # Convert the genome hash. We'll get the genus and species for each genome and make
1227      # it the key.      # it the key.
1228      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
1229      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1230      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000);      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
1231      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
1232      Trace("Beginning external data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1233            Trace("Loading from existing files.") if T(2);
1234        } else {
1235            Trace("Generating external data.") if T(2);
1236      # We loop through the files one at a time. First, the organism file.      # We loop through the files one at a time. First, the organism file.
1237      Open(\*ORGS, "<$FIG_Config::global/ext_org.table");          Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |");
1238      my $orgLine;      my $orgLine;
1239      while (defined($orgLine = <ORGS>)) {      while (defined($orgLine = <ORGS>)) {
1240          # Clean the input line.          # Clean the input line.
# Line 1081  Line 1246 
1246      close ORGS;      close ORGS;
1247      # Now the function file.      # Now the function file.
1248      my $funcLine;      my $funcLine;
1249      Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");          Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |");
1250      while (defined($funcLine = <FUNCS>)) {      while (defined($funcLine = <FUNCS>)) {
1251          # Clean the line ending.          # Clean the line ending.
1252          chomp $funcLine;          chomp $funcLine;
# Line 1097  Line 1262 
1262              $loadExternalAliasFunc->Put(@funcFields[0,1]);              $loadExternalAliasFunc->Put(@funcFields[0,1]);
1263          }          }
1264      }      }
1265        }
1266      # Finish the load.      # Finish the load.
1267      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1268      return $retVal;      return $retVal;
1269  }  }
1270    
 =head3 LoadGroupData  
1271    
1272  C<< my $stats = $spl->LoadGroupData(); >>  =head3 LoadReactionData
1273    
1274  Load the genome Groups into Sprout.      my $stats = $spl->LoadReactionData();
1275    
1276    Load the reaction data from FIG into Sprout.
1277    
1278    Reaction data connects reactions to the compounds that participate in them.
1279    
1280  The following relations are loaded by this method.  The following relations are loaded by this method.
1281    
1282      GenomeGroups      Reaction
1283        ReactionURL
1284        Compound
1285        CompoundName
1286        CompoundCAS
1287        IsIdentifiedByCAS
1288        HasCompoundName
1289        IsAComponentOf
1290    
1291  There is no direct support for genome groups in FIG, so we access the SEED  This method proceeds reaction by reaction rather than genome by genome.
 files directly.  
1292    
1293  =over 4  =over 4
1294    
# Line 1125  Line 1300 
1300    
1301  =cut  =cut
1302  #: Return Type $%;  #: Return Type $%;
sub LoadReactionData {
    # Load the reaction and compound tables from the FIG data store.
    #
    # Takes only the loader instance. When the "loadOnly" option is set, no
    # data is generated and the load proceeds from whatever files already
    # exist. Returns the value produced by _FinishAll (documented in the POD
    # above as a statistics object).
    my ($self) = @_;
    my $fig = $self->{fig};
    # Create one loader per target table. The creation order is kept exactly
    # as it was, in case _FinishAll is sensitive to it.
    my $reactionLoader   = $self->_TableLoader('Reaction');
    my $reactionUrlLoader = $self->_TableLoader('ReactionURL');
    my $compoundLoader   = $self->_TableLoader('Compound');
    my $nameLoader       = $self->_TableLoader('CompoundName');
    my $casLoader        = $self->_TableLoader('CompoundCAS');
    my $componentLoader  = $self->_TableLoader('IsAComponentOf');
    my $casLinkLoader    = $self->_TableLoader('IsIdentifiedByCAS');
    my $nameLinkLoader   = $self->_TableLoader('HasCompoundName');
    if (! $self->{options}->{loadOnly}) {
        Trace("Generating reaction data.") if T(2);
        # Names and CAS IDs can be shared between compounds, so we track the
        # ones already written to keep each entity record unique.
        my %seenNames = ();
        my %seenCAS = ();
        # Phase 1: the compounds.
        for my $compoundID ($fig->all_compounds()) {
            my @names = $fig->names_of_compound($compoundID);
            # Names are linked to the compound with a priority that starts at 1
            # and follows the order in which FIG returned them.
            my $priority = 1;
            for my $name (@names) {
                unless (exists $seenNames{$name}) {
                    $nameLoader->Put($name);
                    $seenNames{$name} = 1;
                }
                $nameLinkLoader->Put($compoundID, $name, $priority++);
            }
            # The first name serves as the compound's label; a nameless compound
            # is labelled with its own ID.
            my $label = (@names ? $names[0] : $compoundID);
            $compoundLoader->Put($compoundID, $label);
            # A CAS ID is optional. When present, link it and create its entity
            # record the first time it is seen.
            my $cas = $fig->cas($compoundID);
            next unless $cas;
            $casLinkLoader->Put($compoundID, $cas);
            unless (exists $seenCAS{$cas}) {
                $casLoader->Put($cas);
                $seenCAS{$cas} = 1;
            }
        }
        # Phase 2: the reactions. The discriminator is a running integer that
        # gives every compound/reaction link a distinct value, so a compound
        # appearing more than once in a reaction is not collapsed into one row.
        my $discrim = 0;
        for my $reactionID ($fig->all_reactions()) {
            # The reaction itself, with its reversibility flag.
            $reactionLoader->Put($reactionID, $fig->reversible($reactionID));
            # The reaction's hyperlink.
            my $link = HTML::reaction_link($reactionID);
            $reactionUrlLoader->Put($reactionID, $link);
            # The participating compounds: substrates (0) first, then products (1).
            for my $product (0 .. 1) {
                # Each entry is a 3-tuple [ID, stoichiometry, main-flag]. No
                # location data is available, so that field is an empty string.
                for my $component ($fig->reaction2comp($reactionID, $product)) {
                    my ($compoundID, $stoich, $main) = @{$component};
                    $componentLoader->Put($compoundID, $reactionID, $discrim++, "",
                                          $main, $product, $stoich);
                }
            }
        }
    } else {
        Trace("Loading from existing files.") if T(2);
    }
    # Finish the load.
    my $stats = $self->_FinishAll();
    return $stats;
}
1385    
1386    =head3 LoadSynonymData
1387    
1388        my $stats = $spl->LoadSynonymData();
1389    
1390    Load the synonym groups into Sprout.
1391    
1392    The following relations are loaded by this method.
1393    
1394        SynonymGroup
1395        IsSynonymGroupFor
1396    
1397    The source information for these relations is taken from the C<maps_to_id> method
1398    of the B<FIG> object. Unfortunately, to make this work, we need to use direct
1399    SQL against the FIG database.
1400    
1401    =over 4
1402    
1403    =item RETURNS
1404    
1405    Returns a statistics object for the loads.
1406    
1407    =back
1408    
1409    =cut
1410    #: Return Type $%;
sub LoadSynonymData {
    # Load the SynonymGroup and IsSynonymGroupFor tables from the FIG
    # "peg_synonyms" table, which is read with direct SQL.
    #
    # Takes only the loader instance. When the "loadOnly" option is set, no
    # data is generated and the load proceeds from whatever files already
    # exist. Returns the value produced by _FinishAll (documented in the POD
    # above as a statistics object).
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash. Only members belonging to these genomes are loaded.
    my $genomeHash = $self->{genomes};
    # Create load objects for the tables we're loading.
    my $loadSynonymGroup = $self->_TableLoader('SynonymGroup');
    my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating synonym group data.") if T(2);
        # Get the database handle.
        my $dbh = $fig->db_handle();
        # Ask for the synonyms. "maps_to" is the group name and "syn_id" is a
        # member of the group (a PEG ID or an alias). The ORDER BY keeps each
        # group's rows together, which the duplicate check below relies on.
        my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
        my $result = $sth->execute();
        if (! defined($result)) {
            Confess("Database error in Synonym load: " . $sth->errstr());
        } else {
            Trace("Processing synonym results.") if T(2);
            # Remember the last group written, so each group is output once.
            my $currentGroup = "";
            # Count the rows read and the features actually loaded.
            my $featureCount = 0;
            my $entryCount = 0;
            # Loop through the group/member pairs.
            while (my @row = $sth->fetchrow()) {
                # The columns arrive in SELECT order: group first, then member.
                my ($groupID, $memberID) = @row;
                # Count this row.
                $entryCount++;
                if ($entryCount % 1000 == 0) {
                    Trace("$entryCount rows processed.") if T(3);
                }
                # Keep the member only if it belongs to one of our genomes.
                # Members that are aliases rather than FIG feature IDs have no
                # genome (genome_of presumably yields undef for them -- confirm
                # against FIG.pm), so the defined test stops an undefined
                # value from being used as a hash key.
                my $genomeID = FIG::genome_of($memberID);
                if (defined $genomeID && exists $genomeHash->{$genomeID}) {
                    # A change of group ID means we have reached a new group.
                    if ($groupID ne $currentGroup) {
                        # It's new, so put it in the group table.
                        $loadSynonymGroup->Put($groupID);
                        $currentGroup = $groupID;
                    }
                    # Connect the group to the member.
                    $loadIsSynonymGroupFor->Put($groupID, $memberID);
                    # Count this feature.
                    $featureCount++;
                    if ($featureCount % 1000 == 0) {
                        Trace("$featureCount features processed.") if T(3);
                    }
                }
            }
            Trace("$entryCount rows produced $featureCount features.") if T(2);
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1473    
1474    =head3 LoadFamilyData
1475    
1476        my $stats = $spl->LoadFamilyData();
1477    
1478    Load the protein families into Sprout.
1479    
1480    The following relations are loaded by this method.
1481    
1482        Family
1483        IsFamilyForFeature
1484    
1485    The source information for these relations is taken from the C<families_for_protein>,
1486    C<family_function>, and C<sz_family> methods of the B<FIG> object.
1487    
1488    =over 4
1489    
1490    =item RETURNS
1491    
1492    Returns a statistics object for the loads.
1493    
1494    =back
1495    
1496    =cut
1497    #: Return Type $%;
sub LoadFamilyData {
    # Load the Family and IsFamilyForFeature tables, walking the PEGs of
    # every genome in the genome hash.
    #
    # Takes only the loader instance. When the "loadOnly" option is set, no
    # data is generated and the load proceeds from whatever files already
    # exist. Returns the value produced by _FinishAll (documented in the POD
    # above as a statistics object).
    my ($self) = @_;
    my $fig = $self->{fig};
    my $genomeHash = $self->{genomes};
    # One loader for the family entity and one for its link to features.
    my $familyLoader = $self->_TableLoader('Family');
    my $linkLoader = $self->_TableLoader('IsFamilyForFeature');
    if (! $self->{options}->{loadOnly}) {
        Trace("Generating family data.") if T(2);
        # Families already written, so each gets exactly one Family record.
        my %seenFamilies = ();
        # Genomes are processed in sorted ID order.
        for my $genomeID (sort keys %{$genomeHash}) {
            Trace("Processing features for $genomeID.") if T(2);
            for my $fid ($fig->all_features($genomeID, "peg")) {
                $linkLoader->Add("features", 1);
                # Connect the feature to each of its families.
                for my $family ($fig->families_for_protein($fid)) {
                    $linkLoader->Put($family, $fid);
                    # Only the first sighting of a family creates its record.
                    next if exists $seenFamilies{$family};
                    $seenFamilies{$family} = 1;
                    $familyLoader->Add("families", 1);
                    my $size = $fig->sz_family($family);
                    my $func = $fig->family_function($family);
                    $familyLoader->Put($family, $size, $func);
                }
            }
        }
    } else {
        Trace("Loading from existing files.") if T(2);
    }
    # Finish the load.
    my $stats = $self->_FinishAll();
    return $stats;
}
1541    
1542    =head3 LoadDrugData
1543    
1544        my $stats = $spl->LoadDrugData();
1545    
1546    Load the drug target data into Sprout.
1547    
1548    The following relations are loaded by this method.
1549    
1550        PDB
1551        DocksWith
1552        IsProteinForFeature
1553        Ligand
1554    
1555    The source information for these relations is taken from attributes. The
1556    C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>.
1557    The C<zinc_name> attribute describes the ligands. The C<docking_results>
1558    attribute contains the information for the B<DocksWith> relationship. It is
1559    expected that additional attributes and tables will be added in the future.
1560    
1561    =over 4
1562    
1563    =item RETURNS
1564    
1565    Returns a statistics object for the loads.
1566    
1567    =back
1568    
1569    =cut
1570    #: Return Type $%;
sub LoadDrugData {
    # Load the drug target tables (PDB, DocksWith, IsProteinForFeature, Ligand)
    # from the attribute data exposed by the FIG object, then finish all of
    # the table loaders. Returns the statistics object produced by _FinishAll.
    #
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    # Create load objects for the tables we're loading.
    my $loadPDB = $self->_TableLoader('PDB');
    my $loadLigand = $self->_TableLoader('Ligand');
    my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature');
    my $loadDocksWith = $self->_TableLoader('DocksWith');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating drug target data.") if T(2);
        # First comes the "DocksWith" relationship. This will give us a list of PDBs.
        # We can also encounter PDBs when we process "IsProteinForFeature". To manage
        # this process, PDB information is collected in a hash table and then
        # unspooled after both relationships are created. The hash maps each PDB ID
        # to the number of docking results written for it.
        my %pdbHash = ();
        Trace("Generating docking data.") if T(2);
        # Get all the docking data. This may cause problems if there are too many PDBs,
        # at which point we'll need another algorithm. The indicator that this is
        # happening will be a timeout error in the next statement.
        my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
                                              ['docking_results', $FIG_Config::dockLimit]);
        Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
        for my $dockData (@dockData) {
            # Get the docking data components.
            my ($pdbID, $docking_key, @valueData) = @{$dockData};
            # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
            $pdbID = lc $pdbID;
            # Strip off the object type.
            $pdbID =~ s/pdb://;
            # Extract the ZINC ID from the docking key. Note that there are two possible
            # formats. A failed match leaves $zinc_id undefined, so we test definedness
            # rather than truth; otherwise a parsed ID of "0" would be rejected.
            my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/;
            if (! defined $zinc_id) {
                Trace("Invalid docking result key $docking_key for $pdbID.") if T(0);
                $loadDocksWith->Add("errors");
            } else {
                # Get the pieces of the value and parse the energy.
                # Note that we don't care about the rank (the first piece), since
                # we can sort on the energy level itself in our database.
                my ($energy, $tool, $type) = @valueData;
                my (undef, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
                # Ignore predicted results.
                if ($type ne "Predicted") {
                    # Count this docking result.
                    $pdbHash{$pdbID}++;
                    # Write the result to the output.
                    $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool,
                                        $total, $vanderwaals);
                }
            }
        }
        Trace("Connecting features.") if T(2);
        # Loop through the genomes.
        for my $genome (sort keys %{$genomeHash}) {
            Trace("Generating PDBs for $genome.") if T(3);
            # Get all of the PDBs that BLAST against this genome's features.
            my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%');
            for my $pdbData (@attributeData) {
                # The PDB ID is coded as a subkey.
                if ($pdbData->[1] !~ /PDB::(.+)/i) {
                    Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0);
                    $loadPDB->Add("errors");
                } else {
                    # Normalize to lower case so that this ID lands on the same hash
                    # key as the (lower-cased) IDs collected from the docking data.
                    my $pdbID = lc $1;
                    # Ensure the PDB is in the hash.
                    if (! exists $pdbHash{$pdbID}) {
                        $pdbHash{$pdbID} = 0;
                    }
                    # The score and locations are coded in the attribute value.
                    if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
                        Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0);
                        $loadIsProteinForFeature->Add("errors");
                    } else {
                        my ($score, $locData) = ($1,$2);
                        # The location data may not be present, so we have to start with some
                        # defaults and then check. The captures are only used when the match
                        # succeeds; after a failed match $1 and $2 would still hold the score
                        # and raw location text from the match above.
                        my ($start, $end) = (1, 0);
                        if ($locData =~ /(\d+)-(\d+)/) {
                            ($start, $end) = ($1, $2);
                        }
                        # If we still don't have the end location, compute it from
                        # the feature length.
                        if (! $end) {
                            # Most features have one location, but we do a list iteration
                            # just in case.
                            my @locations = $fig->feature_location($pdbData->[0]);
                            $end = 0;
                            for my $loc (@locations) {
                                my $locObject = BasicLocation->new($loc);
                                $end += $locObject->Length;
                            }
                        }
                        # Decode the score.
                        my $realScore = FIGRules::DecodeScore($score);
                        # Connect the PDB to the feature.
                        $loadIsProteinForFeature->Put($pdbID, $pdbData->[0], $start, $realScore, $end);
                    }
                }
            }
        }
        # We've got all our PDBs now, so we unspool them from the hash.
        Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2);
        my $count = 0;
        for my $pdbID (sort keys %pdbHash) {
            $loadPDB->Put($pdbID, $pdbHash{$pdbID});
            $count++;
            Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0);
        }
        # Finally we create the ligand table. This information can be found in the
        # zinc_name attribute.
        Trace("Loading ligands.") if T(2);
        # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
        my $last_zinc_id = "";
        # Paging cursor: the object ID of the last row seen, valid or not. It must
        # advance on every row; if it only advanced on valid ZINC IDs, a batch
        # consisting entirely of invalid rows would be fetched again forever.
        my $lastObject = "ZINC:";
        my $done = 0;
        while (! $done) {
            # Get the next 10000 ligands. We insist that the object ID is greater than
            # the last ID we processed.
            Trace("Loading batch starting with $lastObject.") if T(3);
            my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
                                                       [$lastObject, "zinc_name"]);
            Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
            if (! @attributeData) {
                # Here there are no attributes left, so we quit the loop.
                $done = 1;
            } else {
                # Process the attribute data we've received.
                for my $zinc_data (@attributeData) {
                    my $object = $zinc_data->[0];
                    # Move the cursor past this row regardless of its validity.
                    $lastObject = $object if defined $object;
                    # The ZINC ID is found in the first return column, prefixed with the word ZINC.
                    if ($object =~ /^ZINC:(\d+)$/) {
                        my $zinc_id = $1;
                        # Check for a duplicate.
                        if ($zinc_id eq $last_zinc_id) {
                            $loadLigand->Add("duplicate");
                        } else {
                            # Here it's safe to output the ligand. The ligand name is the attribute value
                            # (third column in the row).
                            $loadLigand->Put($zinc_id, $zinc_data->[2]);
                            # Ensure we don't try to add this ID again.
                            $last_zinc_id = $zinc_id;
                        }
                    } else {
                        Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
                        $loadLigand->Add("errors");
                    }
                }
            }
        }
        Trace("Ligands loaded.") if T(2);
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1737    
1738    
1739  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1740    
1741    =head3 SpecialAttribute
1742    
1743        my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader);
1744    
1745    Look for special attributes of a given type. A special attribute is found by comparing one of
1746    the columns of the incoming attribute list to a search pattern. If a match is found, then
1747    a set of columns is put into an output table connected to the specified ID.
1748    
1749    For example, when processing features, the attribute list we look at has three columns: attribute
1750    name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
1751    begins with C<iedb_>. The call signature is therefore
1752    
1753        my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB);
1754    
1755    The pattern is matched against column 0, and if we have a match, then the values of columns 0
1756    and 2 are joined with a space and put to the output along with the specified feature ID.
1757    
1758    =over 4
1759    
1760    =item id
1761    
1762    ID of the object whose special attributes are being loaded. This forms the first column of the
1763    output.
1764    
1765    =item attributes
1766    
1767    Reference to a list of tuples.
1768    
1769    =item idxMatch
1770    
1771    Index in each tuple of the column to be matched against the pattern. If the match is
1772    successful, an output record will be generated.
1773    
1774    =item idxValues
1775    
1776    Reference to a list containing the indexes in each tuple of the columns whose values are
1777    joined with spaces to form the second column of the output.
1778    
1779    =item pattern
1780    
1781    Pattern to be matched against the specified column. The match will be case-insensitive.
1782    
1783    =item loader
1784    
1785    An object to which each output record will be put. Usually this is an B<ERDBLoad> object,
1786    but technically it could be anything with a C<Put> method.
1787    
1788    =item RETURN
1789    
1790    Returns a count of the matches found.
1791    
1792    =item
1793    
1794    =back
1795    
1796    =cut
1797    
sub SpecialAttribute {
    # Scan a list of attribute tuples and, for every tuple whose match column
    # fits the pattern (case-insensitively), put one record to the loader
    # consisting of the ID and the space-joined values of the chosen columns.
    # Returns the number of records put.
    my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_;
    # Running total of matching tuples.
    my $matches = 0;
    TUPLE: for my $tuple (@{$attributes}) {
        # Tuples that fail the pattern test produce no output.
        next TUPLE unless $tuple->[$idxMatch] =~ m/$pattern/i;
        # Pull the requested columns with an array slice and collapse them
        # into the single value column of the output record.
        my @columns = @{$tuple}[@{$idxValues}];
        $loader->Put($id, join(" ", @columns));
        $matches++;
    }
    # Only report when tracing is active and something was actually found.
    if (T(4) && $matches) {
        Trace("$matches special attributes found for $id and loader " . $loader->RelName() . ".");
    }
    return $matches;
}
1818    
1819  =head3 TableLoader  =head3 TableLoader
1820    
1821  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 1172  Line 1830 
1830    
1831  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
1832    
 =item rowCount (optional)  
   
 Estimated maximum number of rows in the table.  
   
1833  =item RETURN  =item RETURN
1834    
1835  Returns an ERDBLoad object for loading the specified table.  Returns an ERDBLoad object for loading the specified table.
# Line 1186  Line 1840 
1840    
1841  sub _TableLoader {  sub _TableLoader {
1842      # Get the parameters.      # Get the parameters.
1843      my ($self, $tableName, $rowCount) = @_;      my ($self, $tableName) = @_;
1844      # Create the load object.      # Create the load object.
1845      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount);      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly);
1846      # Cache it in the loader list.      # Cache it in the loader list.
1847      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
1848      # Return it to the caller.      # Return it to the caller.
# Line 1222  Line 1876 
1876      my $retVal = Stats->new();      my $retVal = Stats->new();
1877      # Get the loader list.      # Get the loader list.
1878      my $loadList = $self->{loaders};      my $loadList = $self->{loaders};
1879        # Create a hash to hold the statistics objects, keyed on relation name.
1880        my %loaderHash = ();
1881      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
1882      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads more restartable.
1883      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
1884            # Get the relation name.
1885            my $relName = $loader->RelName;
1886            # Check the ignore flag.
1887            if ($loader->Ignore) {
1888                Trace("Relation $relName not loaded.") if T(2);
1889            } else {
1890                # Here we really need to finish.
1891                Trace("Finishing $relName.") if T(2);
1892          my $stats = $loader->Finish();          my $stats = $loader->Finish();
1893                $loaderHash{$relName} = $stats;
1894            }
1895        }
1896        # Now we loop through again, actually loading the tables. We want to finish before
1897        # loading so that if something goes wrong at this point, all the load files are usable
1898        # and we don't have to redo all that work.
1899        for my $relName (sort keys %loaderHash) {
1900            # Get the statistics for this relation.
1901            my $stats = $loaderHash{$relName};
1902            # Check for a database load.
1903            if ($self->{options}->{dbLoad}) {
1904                # Here we want to use the load file just created to load the database.
1905                Trace("Loading relation $relName.") if T(2);
1906                my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]);
1907                # Accumulate the statistics from the DB load.
1908                $stats->Accumulate($newStats);
1909            }
1910          $retVal->Accumulate($stats);          $retVal->Accumulate($stats);
         my $relName = $loader->RelName;  
1911          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
1912      }      }
1913      # Return the load statistics.      # Return the load statistics.
1914      return $retVal;      return $retVal;
1915  }  }
1916    
1917    =head3 GetGenomeAttributes
1918    
1919        my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys);
1920    
1921    Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
1922    attributes for all the features of a genome in a single call, then organizes them into
1923    a hash.
1924    
1925    =over 4
1926    
1927    =item fig
1928    
1929    FIG-like object for accessing attributes.
1930    
1931    =item genomeID
1932    
1933    ID of the genome whose attributes are desired.
1934    
1935    =item fids
1936    
1937    Reference to a list of the feature IDs whose attributes are to be kept.
1938    
1939    =item propKeys
1940    
1941    A list of the keys to retrieve.
1942    
1943    =item RETURN
1944    
1945    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
1946    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
1947    the attribute key, and one or more attribute values.
1948    
1949    =back
1950    
1951    =cut
1952    
sub GetGenomeAttributes {
    # Fetch the requested attributes for a whole genome and bucket them by
    # feature ID. Returns a reference to a hash mapping every feature ID in
    # the incoming list to a (possibly empty) list of its attribute tuples.
    my ($fig, $genomeID, $fids, $propKeys) = @_;
    # Seed the result with an empty list for each feature of interest. The
    # keys double as the filter for which tuples are kept, and the caller is
    # guaranteed a list reference for every known fid.
    my %attributesOf = map { $_ => [] } @{$fids};
    # First attempt: ask for the entire genome in one request. The request is
    # wrapped in an eval because it can fail (e.g. with a timeout while
    # ev_code_cron is running).
    my @tuples;
    eval {
        @tuples = $fig->get_attributes("fig|$genomeID%", $propKeys);
        Trace(scalar(@tuples) . " attributes returned for genome $genomeID.") if T(3);
    };
    if ($@) {
        Trace("Retrying attributes for $genomeID due to error: $@") if T(1);
        # Fallback: request the features in chunks of 100. This is much
        # slower, but lets processing continue.
        my $lastIdx = $#{$fids};
        my $first = 0;
        while ($first <= $lastIdx) {
            # The chunk normally ends 99 past its start; the final chunk is
            # clipped to the end of the feature list.
            my $last = $first + 99;
            $last = $lastIdx if $last > $lastIdx;
            my @chunk = @{$fids}[$first .. $last];
            Trace("Retrieving attributes for fids $first to $last.") if T(3);
            my @found = $fig->get_attributes(\@chunk, $propKeys);
            Trace(scalar(@found) . " attributes returned for fids $first to $last.") if T(3);
            push @tuples, @found;
            $first += 100;
        }
    }
    # Distribute the tuples to their features, discarding any tuple whose
    # feature ID (first column) was not in the incoming list.
    for my $tuple (@tuples) {
        my $owner = $tuple->[0];
        push @{$attributesOf{$owner}}, $tuple if exists $attributesOf{$owner};
    }
    return \%attributesOf;
}
2001    
2002    
2003  1;  1;

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.91

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3