[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory | Revision Log | View Patch

revision 1.7, Tue Sep 13 19:05:20 2005 UTC revision 1.92, Sun Mar 23 16:33:15 2008 UTC
# Line 7  Line 7 
7      use PageBuilder;      use PageBuilder;
8      use ERDBLoad;      use ERDBLoad;
9      use FIG;      use FIG;
10        use FIGRules;
11      use Sprout;      use Sprout;
12      use Stats;      use Stats;
13      use BasicLocation;      use BasicLocation;
14        use HTML;
15        use AliasAnalysis;
16    
17  =head1 Sprout Load Methods  =head1 Sprout Load Methods
18    
# Line 29  Line 32 
32      $stats->Accumulate($spl->LoadFeatureData());      $stats->Accumulate($spl->LoadFeatureData());
33      print $stats->Show();      print $stats->Show();
34    
 This module makes use of the internal Sprout property C<_erdb>.  
   
35  It is worth noting that the FIG object does not need to be a real one. Any object  It is worth noting that the FIG object does not need to be a real one. Any object
36  that implements the FIG methods for data retrieval could be used. So, for example,  that implements the FIG methods for data retrieval could be used. So, for example,
37  this object could be used to copy data from one Sprout database to another, or  this object could be used to copy data from one Sprout database to another, or
# Line 51  Line 52 
52    
53  =head3 new  =head3 new
54    
55  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile); >>      my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options);
56    
57  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
58  the name of the files containing the list of genomes and subsystems to use.  the name of the files containing the list of genomes and subsystems to use.
# Line 79  Line 80 
80  =item subsysFile  =item subsysFile
81    
82  Either the name of the file containing the list of trusted subsystems or a reference  Either the name of the file containing the list of trusted subsystems or a reference
83  to a list of subsystem names. If nothing is specified, all known subsystems will be  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
84  considered trusted. Only subsystem data related to the trusted subsystems is loaded.  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
85    in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded.
86    
87    =item options
88    
89    Reference to a hash of command-line options.
90    
91  =back  =back
92    
# Line 88  Line 94 
94    
95  sub new {  sub new {
96      # Get the parameters.      # Get the parameters.
97      my ($class, $sprout, $fig, $genomeFile, $subsysFile) = @_;      my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
98      # Load the list of genomes into a hash.      # Create the genome hash.
99      my %genomes;      my %genomes = ();
100        # We only need it if load-only is NOT specified.
101        if (! $options->{loadOnly}) {
102      if (! defined($genomeFile) || $genomeFile eq '') {      if (! defined($genomeFile) || $genomeFile eq '') {
103          # Here we want all the complete genomes and an access code of 1.          # Here we want all the complete genomes and an access code of 1.
104          my @genomeList = $fig->genomes(1);          my @genomeList = $fig->genomes(1);
105          %genomes = map { $_ => 1 } @genomeList;          %genomes = map { $_ => 1 } @genomeList;
106                Trace(scalar(keys %genomes) . " genomes found.") if T(3);
107      } else {      } else {
108          my $type = ref $genomeFile;          my $type = ref $genomeFile;
109          Trace("Genome file parameter type is \"$type\".") if T(3);          Trace("Genome file parameter type is \"$type\".") if T(3);
# Line 114  Line 123 
123                  # an omitted access code can be defaulted to 1.                  # an omitted access code can be defaulted to 1.
124                  for my $genomeLine (@genomeList) {                  for my $genomeLine (@genomeList) {
125                      my ($genomeID, $accessCode) = split("\t", $genomeLine);                      my ($genomeID, $accessCode) = split("\t", $genomeLine);
126                      if (undef $accessCode) {                          if (! defined($accessCode)) {
127                          $accessCode = 1;                          $accessCode = 1;
128                      }                      }
129                      $genomes{$genomeID} = $accessCode;                      $genomes{$genomeID} = $accessCode;
# Line 124  Line 133 
133              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
134          }          }
135      }      }
136        }
137      # Load the list of trusted subsystems.      # Load the list of trusted subsystems.
138      my %subsystems = ();      my %subsystems = ();
139        # We only need it if load-only is NOT specified.
140        if (! $options->{loadOnly}) {
141      if (! defined $subsysFile || $subsysFile eq '') {      if (! defined $subsysFile || $subsysFile eq '') {
142          # Here we want all the subsystems.              # Here we want all the usable subsystems. First we get the whole list.
143          %subsystems = map { $_ => 1 } $fig->all_subsystems();              my @subs = $fig->all_subsystems();
144                # Loop through, checking for the NMPDR file.
145                for my $sub (@subs) {
146                    if ($fig->nmpdr_subsystem($sub)) {
147                        $subsystems{$sub} = 1;
148                    }
149                }
150      } else {      } else {
151          my $type = ref $subsysFile;          my $type = ref $subsysFile;
152          if ($type eq 'ARRAY') {          if ($type eq 'ARRAY') {
# Line 148  Line 166 
166              Confess("Invalid subsystem parameter in SproutLoad constructor.");              Confess("Invalid subsystem parameter in SproutLoad constructor.");
167          }          }
168      }      }
169            # Go through the subsys hash again, creating the keyword list for each subsystem.
170            for my $subsystem (keys %subsystems) {
171                my $name = $subsystem;
172                $name =~ s/_/ /g;
173                $subsystems{$subsystem} = $name;
174            }
175        }
176        # Get the list of NMPDR-oriented attribute keys.
177        my @propKeys = $fig->get_group_keys("NMPDR");
178      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
179      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
180      # Create the Sprout load object.      # Create the Sprout load object.
# Line 157  Line 184 
184                    subsystems => \%subsystems,                    subsystems => \%subsystems,
185                    sprout => $sprout,                    sprout => $sprout,
186                    loadDirectory => $directory,                    loadDirectory => $directory,
187                    erdb => $sprout->{_erdb},                    erdb => $sprout,
188                    loaders => []                    loaders => [],
189                      options => $options,
190                      propKeys => \@propKeys,
191                   };                   };
192      # Bless and return it.      # Bless and return it.
193      bless $retVal, $class;      bless $retVal, $class;
194      return $retVal;      return $retVal;
195  }  }
196    
197    =head3 LoadOnly
198    
199        my $flag = $spl->LoadOnly;
200    
201    Return TRUE if we are in load-only mode, else FALSE.
202    
203    =cut
204    
205    sub LoadOnly {
206        my ($self) = @_;
207        return $self->{options}->{loadOnly};
208    }
209    
210    
211  =head3 LoadGenomeData  =head3 LoadGenomeData
212    
213  C<< my $stats = $spl->LoadGenomeData(); >>      my $stats = $spl->LoadGenomeData();
214    
215  Load the Genome, Contig, and Sequence data from FIG into Sprout.  Load the Genome, Contig, and Sequence data from FIG into Sprout.
216    
# Line 192  Line 235 
235    
236  =back  =back
237    
 B<TO DO>  
   
 Real quality vectors instead of C<unknown> for everything.  
   
 GenomeGroup relation. (The original script took group information from the C<NMPDR> file  
 in each genome's main directory, but no such file exists anywhere in my version of the  
 data store.)  
   
238  =cut  =cut
239  #: Return Type $%;  #: Return Type $%;
240  sub LoadGenomeData {  sub LoadGenomeData {
# Line 210  Line 245 
245      # Get the genome count.      # Get the genome count.
246      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
247      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
     Trace("Beginning genome data load.") if T(2);  
248      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
249      my $loadGenome = $self->_TableLoader('Genome', $genomeCount);      my $loadGenome = $self->_TableLoader('Genome');
250      my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300);      my $loadHasContig = $self->_TableLoader('HasContig');
251      my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300);      my $loadContig = $self->_TableLoader('Contig');
252      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf');
253      my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000);      my $loadSequence = $self->_TableLoader('Sequence');
254        if ($self->{options}->{loadOnly}) {
255            Trace("Loading from existing files.") if T(2);
256        } else {
257            Trace("Generating genome data.") if T(2);
258            # Get the full info for the FIG genomes.
259            my %genomeInfo = map { $_->[0] => { gname => $_->[1], szdna => $_->[2], maindomain => $_->[3],
260                                                pegs => $_->[4], rnas => $_->[5], complete => $_->[6] } } @{$fig->genome_info()};
261      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
262      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
263          Trace("Loading data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
264          $loadGenome->Add("genomeIn");          $loadGenome->Add("genomeIn");
265          # The access code comes in via the genome hash.          # The access code comes in via the genome hash.
266          my $accessCode = $genomeHash->{$genomeID};          my $accessCode = $genomeHash->{$genomeID};
267          # Get the genus, species, and strain from the scientific name. Note that we append              # Get the genus, species, and strain from the scientific name.
         # the genome ID to the strain. In some cases this is the totality of the strain name.  
268          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);
269          my $extra = join " ", @extraData, "[$genomeID]";              my $extra = join " ", @extraData;
270          # Get the full taxonomy.          # Get the full taxonomy.
271          my $taxonomy = $fig->taxonomy_of($genomeID);          my $taxonomy = $fig->taxonomy_of($genomeID);
272                # Get the version. If no version is specified, we default to the genome ID by itself.
273                my $version = $fig->genome_version($genomeID);
274                if (! defined($version)) {
275                    $version = $genomeID;
276                }
277                # Get the DNA size.
278                my $dnaSize = $fig->genome_szdna($genomeID);
279                # Open the NMPDR group file for this genome.
280                my $group;
281                if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
282                    defined($group = <TMP>)) {
283                    # Clean the line ending.
284                    chomp $group;
285                } else {
286                    # No group, so use the default.
287                    $group = $FIG_Config::otherGroup;
288                }
289                close TMP;
290                # Get the contigs.
291                my @contigs = $fig->all_contigs($genomeID);
292                # Get this genome's info array.
293                my $info = $genomeInfo{$genomeID};
294          # Output the genome record.          # Output the genome record.
295          $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $info->{complete}, scalar(@contigs),
296                           $species, $extra, $taxonomy);                               $dnaSize, $genus, $info->{pegs}, $group, $info->{rnas}, $species, $extra, $version, $taxonomy);
297          # Now we loop through each of the genome's contigs.          # Now we loop through each of the genome's contigs.
         my @contigs = $fig->all_contigs($genomeID);  
298          for my $contigID (@contigs) {          for my $contigID (@contigs) {
299              Trace("Processing contig $contigID for $genomeID.") if T(4);              Trace("Processing contig $contigID for $genomeID.") if T(4);
300              $loadContig->Add("contigIn");              $loadContig->Add("contigIn");
# Line 262  Line 323 
323              }              }
324          }          }
325      }      }
326        }
327      # Finish the loads.      # Finish the loads.
328      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
329      # Return the result.      # Return the result.
330      return $retVal;      return $retVal;
331  }  }
332    
 =head3 LoadCouplingData  
   
 C<< my $stats = $spl->LoadCouplingData(); >>  
   
 Load the coupling and evidence data from FIG into Sprout.  
   
 The coupling data specifies which genome features are functionally coupled. The  
 evidence data explains why the coupling is functional.  
   
 The following relations are loaded by this method.  
   
     Coupling  
     IsEvidencedBy  
     PCH  
     ParticipatesInCoupling  
     UsesAsEvidence  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadCouplingData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the genome hash.  
     my $genomeFilter = $self->{genomes};  
     my $genomeCount = (keys %{$genomeFilter});  
     my $featureCount = $genomeCount * 4000;  
     # Start the loads.  
     my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount);  
     my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000);  
     my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000);  
     my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000);  
     my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000);  
     Trace("Beginning coupling data load.") if T(2);  
     # Loop through the genomes found.  
     for my $genome (sort keys %{$genomeFilter}) {  
         Trace("Generating coupling data for $genome.") if T(3);  
         $loadCoupling->Add("genomeIn");  
         # Create a hash table for holding coupled pairs. We use this to prevent  
         # duplicates. For example, if A is coupled to B, we don't want to also  
         # assert that B is coupled to A, because we already know it. Fortunately,  
         # all couplings occur within a genome, so we can keep the hash table  
         # size reasonably small.  
         my %dupHash = ();  
         # Get all of the genome's PEGs.  
         my @pegs = $fig->pegs_of($genome);  
         # Loop through the PEGs.  
         for my $peg1 (@pegs) {  
             $loadCoupling->Add("pegIn");  
             Trace("Processing PEG $peg1 for $genome.") if T(4);  
             # Get a list of the coupled PEGs.  
             my @couplings = $fig->coupled_to($peg1);  
             # For each coupled PEG, we need to verify that a coupling already  
             # exists. If not, we have to create one.  
             for my $coupleData (@couplings) {  
                 my ($peg2, $score) = @{$coupleData};  
                 # Compute the coupling ID.  
                 my $coupleID = Sprout::CouplingID($peg1, $peg2);  
                 if (! exists $dupHash{$coupleID}) {  
                     $loadCoupling->Add("couplingIn");  
                     # Here we have a new coupling to store in the load files.  
                     Trace("Storing coupling ($coupleID) with score $score.") if T(4);  
                     # Ensure we don't do this again.  
                     $dupHash{$coupleID} = $score;  
                     # Write the coupling record.  
                     $loadCoupling->Put($coupleID, $score);  
                     # Connect it to the coupled PEGs.  
                     $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);  
                     $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);  
                     # Get the evidence for this coupling.  
                     my @evidence = $fig->coupling_evidence($peg1, $peg2);  
                     # Organize the evidence into a hash table.  
                     my %evidenceMap = ();  
                     # Process each evidence item.  
                     for my $evidenceData (@evidence) {  
                         $loadPCH->Add("evidenceIn");  
                         my ($peg3, $peg4, $usage) = @{$evidenceData};  
                         # Only proceed if the evidence is from a Sprout  
                         # genome.  
                         if ($genomeFilter->{$fig->genome_of($peg3)}) {  
                             $loadUsesAsEvidence->Add("evidenceChosen");  
                             my $evidenceKey = "$coupleID $peg3 $peg4";  
                             # We store this evidence in the hash if the usage  
                             # is nonzero or no prior evidence has been found. This  
                             # insures that if there is duplicate evidence, we  
                             # at least keep the meaningful ones. Only evidence is  
                             # the hash makes it to the output.  
                             if ($usage || ! exists $evidenceMap{$evidenceKey}) {  
                                 $evidenceMap{$evidenceKey} = $evidenceData;  
                             }  
                         }  
                     }  
                     for my $evidenceID (keys %evidenceMap) {  
                         # Create the evidence record.  
                         my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};  
                         $loadPCH->Put($evidenceID, $usage);  
                         # Connect it to the coupling.  
                         $loadIsEvidencedBy->Put($coupleID, $evidenceID);  
                         # Connect it to the features.  
                         $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);  
                         $loadUsesAsEvidence->Put($evidenceID, $peg4, 1);  
                     }  
                 }  
             }  
         }  
     }  
     # All done. Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
333  =head3 LoadFeatureData  =head3 LoadFeatureData
334    
335  C<< my $stats = $spl->LoadFeatureData(); >>      my $stats = $spl->LoadFeatureData();
336    
337  Load the feature data from FIG into Sprout.  Load the feature data from FIG into Sprout.
338    
# Line 400  Line 342 
342    
343      Feature      Feature
344      FeatureAlias      FeatureAlias
345        IsAliasOf
346      FeatureLink      FeatureLink
347      FeatureTranslation      FeatureTranslation
348      FeatureUpstream      FeatureUpstream
349      IsLocatedIn      IsLocatedIn
350        HasFeature
351        HasRoleInSubsystem
352        FeatureEssential
353        FeatureVirulent
354        FeatureIEDB
355        CDD
356        IsPresentOnProteinOf
357    
358  =over 4  =over 4
359    
# Line 418  Line 368 
368  sub LoadFeatureData {  sub LoadFeatureData {
369      # Get this object instance.      # Get this object instance.
370      my ($self) = @_;      my ($self) = @_;
371      # Get the FIG object.      # Get the FIG and Sprout objects.
372      my $fig = $self->{fig};      my $fig = $self->{fig};
373        my $sprout = $self->{sprout};
374      # Get the table of genome IDs.      # Get the table of genome IDs.
375      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
376      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
377      my $loadFeature = $self->_TableLoader('Feature', $featureCount);      my $loadFeature = $self->_TableLoader('Feature');
378      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn');
379      my $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
380      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);      my $loadIsAliasOf = $self->_TableLoader('IsAliasOf');
381      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
382      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
383        my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
384        my $loadHasFeature = $self->_TableLoader('HasFeature');
385        my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem');
386        my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
387        my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
388        my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
389        my $loadCDD = $self->_TableLoader('CDD');
390        my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');
391        # Get the subsystem hash.
392        my $subHash = $self->{subsystems};
393        # Get the property keys.
394        my $propKeys = $self->{propKeys};
395        # Create a hashes to hold CDD and alias values.
396        my %CDD = ();
397        my %alias = ();
398      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
399      # locations.      # locations.
400      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
401      Trace("Beginning feature data load.") if T(2);      if ($self->{options}->{loadOnly}) {
402            Trace("Loading from existing files.") if T(2);
403        } else {
404            Trace("Generating feature data.") if T(2);
405      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
406      for my $genomeID (sort keys %{$genomeHash}) {          my @allGenomes = sort keys %{$genomeHash};
407            Trace(scalar(@allGenomes) . " genomes found in list.") if T(3);
408            for my $genomeID (@allGenomes) {
409          Trace("Loading features for genome $genomeID.") if T(3);          Trace("Loading features for genome $genomeID.") if T(3);
410          $loadFeature->Add("genomeIn");          $loadFeature->Add("genomeIn");
411          # Get the feature list for this genome.          # Get the feature list for this genome.
412          my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
413                # Sort and count the list.
414                my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
415                my $count = scalar @featureTuples;
416                my @fids = map { $_->[0] } @featureTuples;
417                Trace("$count features found for genome $genomeID.") if T(3);
418                # Get the attributes for this genome and put them in a hash by feature ID.
419                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys);
420                Trace("Looping through features for $genomeID.") if T(3);
421                # Set up for our duplicate-feature check.
422                my $oldFeatureID = "";
423          # Loop through the features.          # Loop through the features.
424          for my $featureData (@{$features}) {              for my $featureTuple (@featureTuples) {
             $loadFeature->Add("featureIn");  
425              # Split the tuple.              # Split the tuple.
426              my ($featureID, $locations, $aliases, $type) = @{$featureData};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
427              # Create the feature record.                  # Check for duplicates.
428              $loadFeature->Put($featureID, 1, $type);                  if ($featureID eq $oldFeatureID) {
429                        Trace("Duplicate feature $featureID found.") if T(1);
430                    } else {
431                        $oldFeatureID = $featureID;
432                        # Count this feature.
433                        $loadFeature->Add("featureIn");
434                        # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
435                        # Sprout database requires a single character.
436                        if (! defined($quality) || $quality eq "") {
437                            $quality = " ";
438                        }
439                        # Begin building the keywords. We start with the genome ID, the
440                        # feature ID, the taxonomy, and the organism name.
441                        my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
442                                        $fig->taxonomy_of($genomeID));
443              # Create the aliases.              # Create the aliases.
444              for my $alias (split /\s*,\s*/, $aliases) {                      for my $alias ($fig->feature_aliases($featureID)) {
445                  $loadFeatureAlias->Put($featureID, $alias);                          #Connect this alias to this feature.
446              }                          $loadIsAliasOf->Put($alias, $featureID);
447                            push @keywords, $alias;
448                            # If this is a locus tag, also add its natural form as a keyword.
449                            my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
450                            if ($naturalName) {
451                                push @keywords, $naturalName;
452                            }
453                            # If this is the first time for the specified alias, create its
454                            # alias record.
455                            if (! exists $alias{$alias}) {
456                                $loadFeatureAlias->Put($alias);
457                                $alias{$alias} = 1;
458                            }
459                        }
460                        # Add the corresponding IDs. Note we have to remove the FIG ID from the
461                        # return list. It's already among the keywords.
462                        my @corresponders = grep { $_ !~ /^fig/} $fig->get_corresponding_ids($featureID);
463                        push @keywords, @corresponders;
464                        Trace("Assignment for $featureID is: $assignment") if T(4);
465                        # Break the assignment into words and shove it onto the
466                        # keyword list.
467                        push @keywords, split(/\s+/, $assignment);
468                        # Link this feature to the parent genome.
469                        $loadHasFeature->Put($genomeID, $featureID, $type);
470              # Get the links.              # Get the links.
471              my @links = $fig->fid_links($featureID);              my @links = $fig->fid_links($featureID);
472              for my $link (@links) {              for my $link (@links) {
# Line 470  Line 485 
485                      $loadFeatureUpstream->Put($featureID, $upstream);                      $loadFeatureUpstream->Put($featureID, $upstream);
486                  }                  }
487              }              }
488                        # Now we need to find the subsystems this feature participates in.
489                        # We also add the subsystems to the keyword list. Before we do that,
490                        # we must convert underscores to spaces.
491                        my @subsystems = $fig->peg_to_subsystems($featureID);
492                        for my $subsystem (@subsystems) {
493                            # Only proceed if we like this subsystem.
494                            if (exists $subHash->{$subsystem}) {
495                                # Store the has-role link.
496                                $loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type);
497                                # Save the subsystem's keyword data.
498                                my $subKeywords = $subHash->{$subsystem};
499                                push @keywords, split /\s+/, $subKeywords;
500                                # Now we need to get this feature's role in the subsystem.
501                                my $subObject = $fig->get_subsystem($subsystem);
502                                my @roleColumns = $subObject->get_peg_roles($featureID);
503                                my @allRoles = $subObject->get_roles();
504                                for my $col (@roleColumns) {
505                                    my $role = $allRoles[$col];
506                                    push @keywords, split /\s+/, $role;
507                                    push @keywords, $subObject->get_role_abbr($col);
508                                }
509                            }
510                        }
511                        # There are three special attributes computed from property
512                        # data that we build next. If the special attribute is non-empty,
513                        # its name will be added to the keyword list. First, we get all
514                        # the attributes for this feature. They will come back as
515                        # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
516                        # [name, value, value with URL]. (We don't need the PEG, since
517                        # we already know it.)
518                        my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
519                                             @{$attributes->{$featureID}};
520                        # Now we process each of the special attributes.
521                        if (SpecialAttribute($featureID, \@attributes,
522                                             1, [0,2], '^(essential|potential_essential)$',
523                                             $loadFeatureEssential)) {
524                            push @keywords, 'essential';
525                            $loadFeature->Add('essential');
526                        }
527                        if (SpecialAttribute($featureID, \@attributes,
528                                             0, [2], '^virulen',
529                                             $loadFeatureVirulent)) {
530                            push @keywords, 'virulent';
531                            $loadFeature->Add('virulent');
532                        }
533                        if (SpecialAttribute($featureID, \@attributes,
534                                             0, [0,2], '^iedb_',
535                                             $loadFeatureIEDB)) {
536                            push @keywords, 'iedb';
537                            $loadFeature->Add('iedb');
538                        }
539                        # Now we have some other attributes we need to process. Currently,
540                        # this is CDD and CELLO, but we expect the number to increase.
541                        my %attributeHash = ();
542                        for my $attrRow (@{$attributes->{$featureID}}) {
543                            my (undef, $key, @values) = @{$attrRow};
544                            $key =~ /^([^:]+)::(.+)/;
545                            if (exists $attributeHash{$1}) {
546                                $attributeHash{$1}->{$2} = \@values;
547                            } else {
548                                $attributeHash{$1} = {$2 => \@values};
549                            }
550                        }
551                        my $celloValue = "unknown";
552                        # Pull in the CELLO attribute. There will never be more than one.
553                        # If we have one, it's a feature attribute AND a keyword.
554                        my @celloData = keys %{$attributeHash{CELLO}};
555                        if (@celloData) {
556                            $celloValue = $celloData[0];
557                            push @keywords, $celloValue;
558                        }
559                        # Now we handle CDD. This is a bit more complicated, because
560                        # there are multiple CDDs per protein.
561                        if (exists $attributeHash{CDD}) {
562                            # Get the hash of CDD IDs to scores for this feature. We
563                            # already know it exists because of the above IF.
564                            my $cddHash = $attributeHash{CDD};
565                            my @cddData = sort keys %{$cddHash};
566                            for my $cdd (@cddData) {
567                                # Extract the score for this CDD and decode it.
568                                my ($codeScore) = split(/\s*,\s*/, $cddHash->{$cdd}->[1]);
569                                my $realScore = FIGRules::DecodeScore($codeScore);
570                                # We can't afford to crash because of a bad attribute
571                                # value, hence the IF below.
572                                if (! defined($realScore)) {
573                                    # Bad score, so count it.
574                                    $loadFeature->Add('badCDDscore');
575                                } else {
576                                    # Create the connection.
577                                    $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
578                                    # If this CDD does not yet exist, create its record.
579                                    if (! exists $CDD{$cdd}) {
580                                        $CDD{$cdd} = 1;
581                                        $loadCDD->Put($cdd);
582                                    }
583                                }
584                            }
585                        }
586                        # Now we need to bust up hyphenated words in the keyword
587                        # list. We keep them separate and put them at the end so
588                        # the original word order is available.
589                        my $keywordString = "";
590                        my $bustedString = "";
591                        for my $keyword (@keywords) {
592                            if (length $keyword >= 3) {
593                                $keywordString .= " $keyword";
594                                if ($keyword =~ /-/) {
595                                    my @words = split /-/, $keyword;
596                                    $bustedString .= join(" ", "", @words);
597                                }
598                            }
599                        }
600                        $keywordString .= $bustedString;
601                        # Get rid of annoying punctuation.
602                        $keywordString =~ s/[();]//g;
603                        # Clean the keyword list.
604                        my $cleanWords = $sprout->CleanKeywords($keywordString);
605                        Trace("Keyword string for $featureID: $cleanWords") if T(4);
606                        # Now we need to process the feature's locations. First, we split them up.
607                        my @locationList = split /\s*,\s*/, $locations;
608                        # Next, we convert them to Sprout location objects.
609                        my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
610                        # Assemble them into a sprout location string for later.
611                        my $locationString = join(", ", map { $_->String } @locObjectList);
612              # This part is the roughest. We need to relate the features to contig              # This part is the roughest. We need to relate the features to contig
613              # locations, and the locations must be split so that none of them exceed              # locations, and the locations must be split so that none of them exceed
614              # the maximum segment size. This simplifies the genes_in_region processing              # the maximum segment size. This simplifies the genes_in_region processing
615              # for Sprout.                      # for Sprout. To start, we create the location position indicator.
616              my @locationList = split /\s*,\s*/, $locations;                      my $i = 1;
617              # Loop through the locations.              # Loop through the locations.
618              for my $location (@locationList) {                      for my $locObject (@locObjectList) {
619                  # Parse the location.                          # Split this location into a list of chunks.
                 my $locObject = BasicLocation->new($location);  
                 # Split it into a list of chunks.  
620                  my @locOList = ();                  my @locOList = ();
621                  while (my $peeling = $locObject->Peel($chunkSize)) {                  while (my $peeling = $locObject->Peel($chunkSize)) {
622                      $loadIsLocatedIn->Add("peeling");                      $loadIsLocatedIn->Add("peeling");
# Line 488  Line 625 
625                  push @locOList, $locObject;                  push @locOList, $locObject;
626                  # Loop through the chunks, creating IsLocatedIn records. The variable                  # Loop through the chunks, creating IsLocatedIn records. The variable
627                  # "$i" will be used to keep the location index.                  # "$i" will be used to keep the location index.
                 my $i = 1;  
628                  for my $locChunk (@locOList) {                  for my $locChunk (@locOList) {
629                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,
630                                            $locChunk->Dir, $locChunk->Length, $i);                                            $locChunk->Dir, $locChunk->Length, $i);
631                      $i++;                      $i++;
632                  }                  }
633              }              }
634                        # The Sprout location string was already assembled above, before the locations were peeled into chunks.
635                        # Create the feature record.
636                        $loadFeature->Put($featureID, 1, $user, $quality, $celloValue, $type, $assignment, $cleanWords, $locationString);
637          }          }
638      }      }
639      # Finish the loads.              Trace("Genome $genomeID processed.") if T(3);
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
 =head3 LoadBBHData  
   
 C<< my $stats = $spl->LoadBBHData(); >>  
   
 Load the bidirectional best hit data from FIG into Sprout.  
   
 Sprout does not store information on similarities. Instead, it has only the  
 bi-directional best hits. Even so, the BBH table is one of the largest in  
 the database.  
   
 The following relations are loaded by this method.  
   
     IsBidirectionalBestHitOf  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadBBHData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the table of genome IDs.  
     my $genomeHash = $self->{genomes};  
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf',  
                                                            $featureCount * $genomeCount);  
     Trace("Beginning BBH load.") if T(2);  
     # Now we loop through the genomes, generating the data for each one.  
     for my $genomeID (sort keys %{$genomeHash}) {  
         $loadIsBidirectionalBestHitOf->Add("genomeIn");  
         Trace("Processing features for genome $genomeID.") if T(3);  
         # Get the feature list for this genome.  
         my $features = $fig->all_features_detailed($genomeID);  
         # Loop through the features.  
         for my $featureData (@{$features}) {  
             # Split the tuple.  
             my ($featureID, $locations, $aliases, $type) = @{$featureData};  
             # Get the bi-directional best hits.  
             my @bbhList = $fig->bbhs($featureID);  
             for my $bbhEntry (@bbhList) {  
                 # Get the target feature ID and the score.  
                 my ($targetID, $score) = @{$bbhEntry};  
                 # Check the target feature's genome.  
                 my $targetGenomeID = $fig->genome_of($targetID);  
                 # Only proceed if it's one of our genomes.  
                 if ($genomeHash->{$targetGenomeID}) {  
                     $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,  
                                                        $score);  
                 }  
             }  
640          }          }
641      }      }
642      # Finish the loads.      # Finish the loads.
# Line 571  Line 646 
646    
647  =head3 LoadSubsystemData  =head3 LoadSubsystemData
648    
649  C<< my $stats = $spl->LoadSubsystemData(); >>      my $stats = $spl->LoadSubsystemData();
650    
651  Load the subsystem data from FIG into Sprout.  Load the subsystem data from FIG into Sprout.
652    
# Line 584  Line 659 
659  The following relations are loaded by this method.  The following relations are loaded by this method.
660    
661      Subsystem      Subsystem
662        SubsystemClass
663      Role      Role
664        RoleEC
665        IsIdentifiedByEC
666      SSCell      SSCell
667      ContainsFeature      ContainsFeature
668      IsGenomeOf      IsGenomeOf
# Line 592  Line 670 
670      OccursInSubsystem      OccursInSubsystem
671      ParticipatesIn      ParticipatesIn
672      HasSSCell      HasSSCell
673        ConsistsOfRoles
674        RoleSubset
675        HasRoleSubset
676        ConsistsOfGenomes
677        GenomeSubset
678        HasGenomeSubset
679        Catalyzes
680        Diagram
681        RoleOccursIn
682    
683  =over 4  =over 4
684    
# Line 601  Line 688 
688    
689  =back  =back
690    
 B<TO DO>  
   
 Generate RoleName table?  
   
691  =cut  =cut
692  #: Return Type $%;  #: Return Type $%;
693  sub LoadSubsystemData {  sub LoadSubsystemData {
# Line 618  Line 701 
701      # Get the subsystem hash. This lists the subsystems we'll process.      # Get the subsystem hash. This lists the subsystems we'll process.
702      my $subsysHash = $self->{subsystems};      my $subsysHash = $self->{subsystems};
703      my @subsysIDs = sort keys %{$subsysHash};      my @subsysIDs = sort keys %{$subsysHash};
704      my $subsysCount = @subsysIDs;      # Get the map list.
705      my $genomeCount = (keys %{$genomeHash});      my @maps = $fig->all_maps;
     my $featureCount = $genomeCount * 4000;  
706      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
707      my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);      my $loadDiagram = $self->_TableLoader('Diagram');
708      my $loadRole = $self->_TableLoader('Role', $featureCount * 6);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn');
709      my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);      my $loadSubsystem = $self->_TableLoader('Subsystem');
710      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);      my $loadRole = $self->_TableLoader('Role');
711      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);      my $loadRoleEC = $self->_TableLoader('RoleEC');
712      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount);      my $loadIsIdentifiedByEC = $self->_TableLoader('IsIdentifiedByEC');
713      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);      my $loadCatalyzes = $self->_TableLoader('Catalyzes');
714      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);      my $loadSSCell = $self->_TableLoader('SSCell');
715      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature');
716      Trace("Beginning subsystem data load.") if T(2);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf');
717        my $loadIsRoleOf = $self->_TableLoader('IsRoleOf');
718        my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem');
719        my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn');
720        my $loadHasSSCell = $self->_TableLoader('HasSSCell');
721        my $loadRoleSubset = $self->_TableLoader('RoleSubset');
722        my $loadGenomeSubset = $self->_TableLoader('GenomeSubset');
723        my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles');
724        my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes');
725        my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');
726        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');
727        my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');
728        if ($self->{options}->{loadOnly}) {
729            Trace("Loading from existing files.") if T(2);
730        } else {
731            Trace("Generating subsystem data.") if T(2);
732            # This hash will contain the roles for each EC. When we're done, this
733            # information will be used to generate the Catalyzes table.
734            my %ecToRoles = ();
735      # Loop through the subsystems. Our first task will be to create the      # Loop through the subsystems. Our first task will be to create the
736      # roles. We do this by looping through the subsystems and creating a      # roles. We do this by looping through the subsystems and creating a
737      # role hash. The hash tracks each role ID so that we don't create      # role hash. The hash tracks each role ID so that we don't create
738      # duplicates. As we move along, we'll connect the roles and subsystems.          # duplicates. As we move along, we'll connect the roles and subsystems
739            # and remember which roles belong to each EC number.
740            my ($genomeID, $roleID);
741      my %roleData = ();      my %roleData = ();
742      for my $subsysID (@subsysIDs) {      for my $subsysID (@subsysIDs) {
743                # Get the subsystem object.
744                my $sub = $fig->get_subsystem($subsysID);
745                # Only proceed if the subsystem has a spreadsheet.
746                if (defined($sub) && ! $sub->{empty_ss}) {
747          Trace("Creating subsystem $subsysID.") if T(3);          Trace("Creating subsystem $subsysID.") if T(3);
748          $loadSubsystem->Add("subsystemIn");          $loadSubsystem->Add("subsystemIn");
749          # Create the subsystem record.          # Create the subsystem record.
750          $loadSubsystem->Put($subsysID);                  my $curator = $sub->get_curator();
751          # Get the subsystem's roles.                  my $notes = $sub->get_notes();
752          my @roles = $fig->subsystem_to_roles($subsysID);                  my $description = $sub->get_description();
753          # Connect the roles to the subsystem. If a role is new, we create                  $loadSubsystem->Put($subsysID, $curator, $description, $notes);
754          # a role record for it.                  # Now for the classification string. This comes back as a list
755          for my $roleID (@roles) {                  # reference and we join its non-empty entries using $FIG_Config::splitter.
756                    my $classList = $fig->subsystem_classification($subsysID);
757                    my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
758                    $loadSubsystemClass->Put($subsysID, $classString);
759                    # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
760                    for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
761                        # Get the role's abbreviation.
762                        my $abbr = $sub->get_role_abbr($col);
763                        # Connect to this role.
764              $loadOccursInSubsystem->Add("roleIn");              $loadOccursInSubsystem->Add("roleIn");
765              $loadOccursInSubsystem->Put($roleID, $subsysID);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $col);
766                        # If it's a new role, add it to the role table.
767              if (! exists $roleData{$roleID}) {              if (! exists $roleData{$roleID}) {
768                            # Get the role's abbreviation.
769                            # Add the role.
770                  $loadRole->Put($roleID);                  $loadRole->Put($roleID);
771                  $roleData{$roleID} = 1;                  $roleData{$roleID} = 1;
772                            # Check for an EC number.
773                            if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
774                                my $ec = $1;
775                                $loadIsIdentifiedByEC->Put($roleID, $ec);
776                                # Check to see if this is our first encounter with this EC.
777                                if (exists $ecToRoles{$ec}) {
778                                    # No, so just add this role to the EC list.
779                                    push @{$ecToRoles{$ec}}, $roleID;
780                                } else {
781                                    # Output this EC.
782                                    $loadRoleEC->Put($ec);
783                                    # Create its role list.
784                                    $ecToRoles{$ec} = [$roleID];
785                                }
786              }              }
787          }          }
788          # Now all roles for this subsystem have been filled in. We create the                  }
789          # spreadsheet by matches roles to genomes. To do this, we need to                  # Now we create the spreadsheet for the subsystem by matching roles to
790          # get the genomes on the sheet.                  # genomes. Each genome is a row and each role is a column. We may need
791                    # to actually create the roles as we find them.
792          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
793          my @genomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)};                  for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) {
794          for my $genomeID (@genomes) {                      # Only proceed if this is one of our genomes.
             # Only process this genome if it's one of ours.  
795              if (exists $genomeHash->{$genomeID}) {              if (exists $genomeHash->{$genomeID}) {
796                  # Connect the genome to the subsystem.                          # Count the PEGs and cells found for verification purposes.
797                  $loadParticipatesIn->Put($genomeID, $subsysID);                          my $pegCount = 0;
798                            my $cellCount = 0;
799                            # Create a list for the PEGs we find. This list will be used
800                            # to generate cluster numbers.
801                            my @pegsFound = ();
802                            # Create a hash that maps spreadsheet IDs to PEGs. We will
803                            # use this to generate the ContainsFeature data after we have
804                            # the cluster numbers.
805                            my %cellPegs = ();
806                            # Get the genome's variant code for this subsystem.
807                            my $variantCode = $sub->get_variant_code($row);
808                  # Loop through the subsystem's roles. We use an index because it is                  # Loop through the subsystem's roles. We use an index because it is
809                  # part of the spreadsheet cell ID.                  # part of the spreadsheet cell ID.
810                  for (my $i = 0; $i <= $#roles; $i++) {                          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
                     my $role = $roles[$i];  
811                      # Get the features in the spreadsheet cell for this genome and role.                      # Get the features in the spreadsheet cell for this genome and role.
812                      my @pegs = $fig->pegs_in_subsystem_cell($subsysID, $genomeID, $i);                              my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col);
813                      # Only proceed if features exist.                      # Only proceed if features exist.
814                      if (@pegs > 0) {                      if (@pegs > 0) {
815                          # Create the spreadsheet cell.                          # Create the spreadsheet cell.
816                          my $cellID = "$subsysID:$genomeID:$i";                                  $cellCount++;
817                                    my $cellID = "$subsysID:$genomeID:$col";
818                          $loadSSCell->Put($cellID);                          $loadSSCell->Put($cellID);
819                          $loadIsGenomeOf->Put($genomeID, $cellID);                          $loadIsGenomeOf->Put($genomeID, $cellID);
820                          $loadIsRoleOf->Put($role, $cellID);                                  $loadIsRoleOf->Put($roleID, $cellID);
821                          $loadHasSSCell->Put($subsysID, $cellID);                          $loadHasSSCell->Put($subsysID, $cellID);
822                          # Attach the features to it.                                  # Remember its features.
823                          for my $pegID (@pegs) {                                  push @pegsFound, @pegs;
824                              $loadContainsFeature->Put($cellID, $pegID);                                  $cellPegs{$cellID} = \@pegs;
825                                    $pegCount += @pegs;
826                                }
827                            }
828                            # If we found some cells for this genome, we need to compute clusters and
829                            # denote it participates in the subsystem.
830                            if ($pegCount > 0) {
831                                Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
832                                $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
833                                # Create a hash mapping PEG IDs to cluster numbers.
834                                # We default to -1 for all of them.
835                                my %clusterOf = map { $_ => -1 } @pegsFound;
836                                # Partition the PEGs found into clusters.
837                                my @clusters = $fig->compute_clusters([keys %clusterOf], $sub);
838                                for (my $i = 0; $i <= $#clusters; $i++) {
839                                    my $subList = $clusters[$i];
840                                    for my $peg (@{$subList}) {
841                                        $clusterOf{$peg} = $i;
842                                    }
843                                }
844                                # Create the ContainsFeature data.
845                                for my $cellID (keys %cellPegs) {
846                                    my $cellList = $cellPegs{$cellID};
847                                    for my $cellPeg (@$cellList) {
848                                        $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg});
849                          }                          }
850                      }                      }
851                  }                  }
852              }              }
853          }          }
854                    # Now we need to generate the subsets. The subset names must be concatenated to
855                    # the subsystem name to make them unique keys. There are two types of subsets:
856                    # genome subsets and role subsets. We do the role subsets first.
857                    my @subsetNames = $sub->get_subset_names();
858                    for my $subsetID (@subsetNames) {
859                        # Create the subset record.
860                        my $actualID = "$subsysID:$subsetID";
861                        $loadRoleSubset->Put($actualID);
862                        # Connect the subset to the subsystem.
863                        $loadHasRoleSubset->Put($subsysID, $actualID);
864                        # Connect the subset to its roles.
865                        my @roles = $sub->get_subsetC_roles($subsetID);
866                        for my $roleID (@roles) {
867                            $loadConsistsOfRoles->Put($actualID, $roleID);
868      }      }
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
869  }  }
870                    # Next the genome subsets.
871  =head3 LoadDiagramData                  @subsetNames = $sub->get_subset_namesR();
872                    for my $subsetID (@subsetNames) {
873  C<< my $stats = $spl->LoadDiagramData(); >>                      # Create the subset record.
874                        my $actualID = "$subsysID:$subsetID";
875  Load the diagram data from FIG into Sprout.                      $loadGenomeSubset->Put($actualID);
876                        # Connect the subset to the subsystem.
877  Diagrams are used to organize functional roles. The diagram shows the                      $loadHasGenomeSubset->Put($subsysID, $actualID);
878  connections between chemicals that interact with a subsystem.                      # Connect the subset to its genomes.
879                        my @genomes = $sub->get_subsetR($subsetID);
880  The following relations are loaded by this method.                      for my $genomeID (@genomes) {
881                            $loadConsistsOfGenomes->Put($actualID, $genomeID);
882      Diagram                      }
883      RoleOccursIn                  }
884                }
885  =over 4          }
886            # Now we loop through the diagrams. We need to create the diagram records
887  =item RETURNS          # and link each diagram to its roles. Note that only roles which occur
888            # in subsystems (and therefore appear in the %ecToRoles hash) are
889  Returns a statistics object for the loads.          # included.
890            for my $map (@maps) {
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadDiagramData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the map list.  
     my @maps = $fig->all_maps;  
     my $mapCount = @maps;  
     my $genomeCount = (keys %{$self->{genomes}});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);  
     my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);  
     Trace("Beginning diagram data load.") if T(2);  
     # Loop through the diagrams.  
     for my $map ($fig->all_maps) {  
891          Trace("Loading diagram $map.") if T(3);          Trace("Loading diagram $map.") if T(3);
892          # Get the diagram's descriptive name.          # Get the diagram's descriptive name.
893          my $name = $fig->map_name($map);          my $name = $fig->map_name($map);
# Line 739  Line 895 
895          # Now we need to link all the map's roles to it.          # Now we need to link all the map's roles to it.
896          # A hash is used to prevent duplicates.          # A hash is used to prevent duplicates.
897          my %roleHash = ();          my %roleHash = ();
898          for my $role ($fig->map_to_ecs($map)) {              for my $ec ($fig->map_to_ecs($map)) {
899                    if (exists $ecToRoles{$ec}) {
900                        for my $role (@{$ecToRoles{$ec}}) {
901              if (! $roleHash{$role}) {              if (! $roleHash{$role}) {
902                  $loadRoleOccursIn->Put($role, $map);                  $loadRoleOccursIn->Put($role, $map);
903                  $roleHash{$role} = 1;                  $roleHash{$role} = 1;
904              }              }
905          }          }
906      }      }
907                }
908            }
909            # Before we leave, we must create the Catalyzes table. We start with the reactions,
910            # then use the "ecToRoles" table to convert EC numbers to role IDs.
911            my @reactions = $fig->all_reactions();
912            for my $reactionID (@reactions) {
913                # Get this reaction's list of roles. The results will be EC numbers.
914                my @ecs = $fig->catalyzed_by($reactionID);
915                # Loop through the roles, creating catalyzation records.
916                for my $thisEC (@ecs) {
917                    if (exists $ecToRoles{$thisEC}) {
918                        for my $thisRole (@{$ecToRoles{$thisEC}}) {
919                            $loadCatalyzes->Put($thisRole, $reactionID);
920                        }
921                    }
922                }
923            }
924        }
925      # Finish the load.      # Finish the load.
926      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
927      return $retVal;      return $retVal;
# Line 753  Line 929 
929    
930  =head3 LoadPropertyData  =head3 LoadPropertyData
931    
932  C<< my $stats = $spl->LoadPropertyData(); >>      my $stats = $spl->LoadPropertyData();
933    
934  Load the attribute data from FIG into Sprout.  Load the attribute data from FIG into Sprout.
935    
# Line 787  Line 963 
963      my $fig = $self->{fig};      my $fig = $self->{fig};
964      # Get the genome hash.      # Get the genome hash.
965      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
966      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
967      my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500);      my $loadProperty = $self->_TableLoader('Property');
968      my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500);      my $loadHasProperty = $self->_TableLoader('HasProperty');
969      Trace("Beginning property data load.") if T(2);      if ($self->{options}->{loadOnly}) {
970            Trace("Loading from existing files.") if T(2);
971        } else {
972            Trace("Generating property data.") if T(2);
973      # Create a hash for storing property IDs.      # Create a hash for storing property IDs.
974      my %propertyKeys = ();      my %propertyKeys = ();
975      my $nextID = 1;      my $nextID = 1;
976            # Get the attributes we intend to store in the property table.
977            my $propKeys = $self->{propKeys};
978      # Loop through the genomes.      # Loop through the genomes.
979      for my $genomeID (keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
980          $loadProperty->Add("genomeIn");          $loadProperty->Add("genomeIn");
981          # Get the genome's features. The feature ID is the first field in the              Trace("Generating properties for $genomeID.") if T(3);
982          # tuples returned by "all_features_detailed". We use "all_features_detailed"              # Initialize a counter.
983          # rather than "all_features" because we want all features regardless of type.              my $propertyCount = 0;
984          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};              # Get the properties for this genome's features.
985          # Loop through the features, creating HasProperty records.              my @attributes = $fig->get_attributes("fig|$genomeID%", $propKeys);
986          for my $fid (@features) {              Trace("Property list built for $genomeID.") if T(3);
987              $loadProperty->Add("featureIn");              # Loop through the results, creating HasProperty records.
988              # Get all attributes for this feature. We do this one feature at a time              for my $attributeData (@attributes) {
989              # to insure we do not get any genome attributes.                  # Pull apart the attribute tuple.
990              my @attributeList = $fig->get_attributes($fid, '', '', '');                  my ($fid, $key, $value, $url) = @{$attributeData};
             # Loop through the attributes.  
             for my $tuple (@attributeList) {  
                 # Get this attribute value's data. Note that we throw away the FID,  
                 # since it will always be the same as the value of "$fid".  
                 my (undef, $key, $value, $url) = @{$tuple};  
991                  # Concatenate the key and value and check the "propertyKeys" hash to                  # Concatenate the key and value and check the "propertyKeys" hash to
992                  # see if we already have an ID for it. We use a tab for the separator                  # see if we already have an ID for it. We use a tab for the separator
993                  # character.                  # character.
# Line 830  Line 1005 
1005                  # Create the HasProperty entry for this feature/property association.                  # Create the HasProperty entry for this feature/property association.
1006                  $loadHasProperty->Put($fid, $propertyID, $url);                  $loadHasProperty->Put($fid, $propertyID, $url);
1007              }              }
1008                # Update the statistics.
1009                Trace("$propertyCount attributes processed.") if T(3);
1010                $loadHasProperty->Add("propertiesIn", $propertyCount);
1011          }          }
1012      }      }
1013      # Finish the load.      # Finish the load.
# Line 839  Line 1017 
1017    
1018  =head3 LoadAnnotationData  =head3 LoadAnnotationData
1019    
1020  C<< my $stats = $spl->LoadAnnotationData(); >>      my $stats = $spl->LoadAnnotationData();
1021    
1022  Load the annotation data from FIG into Sprout.  Load the annotation data from FIG into Sprout.
1023    
# Line 871  Line 1049 
1049      my $fig = $self->{fig};      my $fig = $self->{fig};
1050      # Get the genome hash.      # Get the genome hash.
1051      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1052      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1053      my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000);      my $loadAnnotation = $self->_TableLoader('Annotation');
1054      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation');
1055      my $loadSproutUser = $self->_TableLoader('SproutUser', 100);      my $loadSproutUser = $self->_TableLoader('SproutUser');
1056      my $loadUserAccess = $self->_TableLoader('UserAccess', 1000);      my $loadUserAccess = $self->_TableLoader('UserAccess');
1057      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation');
1058      Trace("Beginning annotation data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1059            Trace("Loading from existing files.") if T(2);
1060        } else {
1061            Trace("Generating annotation data.") if T(2);
1062      # Create a hash of user names. We'll use this to prevent us from generating duplicate      # Create a hash of user names. We'll use this to prevent us from generating duplicate
1063      # user records.      # user records.
1064      my %users = ( FIG => 1, master => 1 );      my %users = ( FIG => 1, master => 1 );
# Line 892  Line 1072 
1072      # Loop through the genomes.      # Loop through the genomes.
1073      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
1074          Trace("Processing $genomeID.") if T(3);          Trace("Processing $genomeID.") if T(3);
         # Get the genome's PEGs.  
         my @pegs = $fig->pegs_of($genomeID);  
         for my $peg (@pegs) {  
             Trace("Processing $peg.") if T(4);  
1075              # Create a hash of timestamps. We use this to prevent duplicate time stamps              # Create a hash of timestamps. We use this to prevent duplicate time stamps
1076              # from showing up for a single PEG's annotations.              # from showing up for a single PEG's annotations.
1077              my %seenTimestamps = ();              my %seenTimestamps = ();
1078              # Check for a functional assignment.              # Get the genome's annotations.
1079              my $func = $fig->function_of($peg);              my @annotations = $fig->read_all_annotations($genomeID);
1080              if ($func) {              Trace("Processing annotations.") if T(2);
1081                  # If this is NOT a hypothetical assignment, we create an              for my $tuple (@annotations) {
1082                  # assignment annotation for it.                  # Get the annotation tuple.
1083                  if (! FIG::hypo($peg)) {                  my ($peg, $timestamp, $user, $text) = @{$tuple};
                     # Note that we double the slashes so that what goes into the database is  
                     # a new-line escape sequence rather than an actual new-line.  
                     $loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func");  
                     $loadIsTargetOfAnnotation->Put($peg, "$peg:$time");  
                     $loadMadeAnnotation->Put("FIG", "$peg:$time");  
                     # Denote we've seen this timestamp.  
                     $seenTimestamps{$time} = 1;  
                 }  
                 # Now loop through the real annotations.  
                 for my $tuple ($fig->feature_annotations($peg, "raw")) {  
                     my ($fid, $timestamp, $user, $text) = @{$tuple};  
1084                      # Here we fix up the annotation text. "\r" is removed,                      # Here we fix up the annotation text. "\r" is removed,
1085                      # and "\t" and "\n" are escaped. Note we use the "s"                  # and "\t" and "\n" are escaped. Note we use the "gs"
1086                      # modifier so that new-lines inside the text do not                      # modifier so that new-lines inside the text do not
1087                      # stop the substitution search.                      # stop the substitution search.
1088                      $text =~ s/\r//gs;                      $text =~ s/\r//gs;
# Line 927  Line 1092 
1092                      $text =~ s/Set master function/Set FIG function/s;                      $text =~ s/Set master function/Set FIG function/s;
1093                      # Insure the time stamp is valid.                      # Insure the time stamp is valid.
1094                      if ($timestamp =~ /^\d+$/) {                      if ($timestamp =~ /^\d+$/) {
1095                          # Here it's a number. We need to insure it's unique.                      # Here it's a number. We need to insure the one we use to form
1096                          while ($seenTimestamps{$timestamp}) {                      # the key is unique.
1097                              $timestamp++;                      my $keyStamp = $timestamp;
1098                        while ($seenTimestamps{"$peg:$keyStamp"}) {
1099                            $keyStamp++;
1100                          }                          }
1101                          $seenTimestamps{$timestamp} = 1;                      my $annotationID = "$peg:$keyStamp";
1102                          my $annotationID = "$peg:$timestamp";                      $seenTimestamps{$annotationID} = 1;
1103                          # Insure the user exists.                          # Insure the user exists.
1104                          if (! $users{$user}) {                          if (! $users{$user}) {
1105                              $loadSproutUser->Put($user, "SEED user");                              $loadSproutUser->Put($user, "SEED user");
# Line 940  Line 1107 
1107                              $users{$user} = 1;                              $users{$user} = 1;
1108                          }                          }
1109                          # Generate the annotation.                          # Generate the annotation.
1110                          $loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text");                      $loadAnnotation->Put($annotationID, $timestamp, $text);
1111                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);
1112                          $loadMadeAnnotation->Put($user, $annotationID);                          $loadMadeAnnotation->Put($user, $annotationID);
1113                      } else {                      } else {
# Line 950  Line 1117 
1117                  }                  }
1118              }              }
1119          }          }
     }  
1120      # Finish the load.      # Finish the load.
1121      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1122      return $retVal;      return $retVal;
# Line 958  Line 1124 
1124    
1125  =head3 LoadSourceData  =head3 LoadSourceData
1126    
1127  C<< my $stats = $spl->LoadSourceData(); >>      my $stats = $spl->LoadSourceData();
1128    
1129  Load the source data from FIG into Sprout.  Load the source data from FIG into Sprout.
1130    
# Line 991  Line 1157 
1157      my $fig = $self->{fig};      my $fig = $self->{fig};
1158      # Get the genome hash.      # Get the genome hash.
1159      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1160      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1161      my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4);      my $loadComesFrom = $self->_TableLoader('ComesFrom');
1162      my $loadSource = $self->_TableLoader('Source', $genomeCount * 4);      my $loadSource = $self->_TableLoader('Source');
1163      my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8);      my $loadSourceURL = $self->_TableLoader('SourceURL');
1164      Trace("Beginning source data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1165            Trace("Loading from existing files.") if T(2);
1166        } else {
1167            Trace("Generating annotation data.") if T(2);
1168      # Create hashes to collect the Source information.      # Create hashes to collect the Source information.
1169      my %sourceURL = ();      my %sourceURL = ();
1170      my %sourceDesc = ();      my %sourceDesc = ();
# Line 1010  Line 1178 
1178              chomp $line;              chomp $line;
1179              my($sourceID, $desc, $url) = split(/\t/,$line);              my($sourceID, $desc, $url) = split(/\t/,$line);
1180              $loadComesFrom->Put($genomeID, $sourceID);              $loadComesFrom->Put($genomeID, $sourceID);
1181              if ($url && ! exists $sourceURL{$genomeID}) {                  if ($url && ! exists $sourceURL{$sourceID}) {
1182                  $loadSourceURL->Put($sourceID, $url);                  $loadSourceURL->Put($sourceID, $url);
1183                  $sourceURL{$sourceID} = 1;                  $sourceURL{$sourceID} = 1;
1184              }              }
1185              if ($desc && ! exists $sourceDesc{$sourceID}) {                  if ($desc) {
1186                  $loadSource->Put($sourceID, $desc);                      $sourceDesc{$sourceID} = $desc;
1187                  $sourceDesc{$sourceID} = 1;                  } elsif (! exists $sourceDesc{$sourceID}) {
1188                        $sourceDesc{$sourceID} = $sourceID;
1189              }              }
1190          }          }
1191          close TMP;          close TMP;
1192      }      }
1193            # Write the source descriptions.
1194            for my $sourceID (keys %sourceDesc) {
1195                $loadSource->Put($sourceID, $sourceDesc{$sourceID});
1196            }
1197        }
1198      # Finish the load.      # Finish the load.
1199      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1200      return $retVal;      return $retVal;
# Line 1028  Line 1202 
1202    
1203  =head3 LoadExternalData  =head3 LoadExternalData
1204    
1205  C<< my $stats = $spl->LoadExternalData(); >>      my $stats = $spl->LoadExternalData();
1206    
1207  Load the external data from FIG into Sprout.  Load the external data from FIG into Sprout.
1208    
# Line 1060  Line 1234 
1234      my $fig = $self->{fig};      my $fig = $self->{fig};
1235      # Get the genome hash.      # Get the genome hash.
1236      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1237      # Convert the genome hash. We'll get the genus and species for each genome and make      # Convert the genome hash. We'll get the genus and species for each genome and make
1238      # it the key.      # it the key.
1239      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
1240      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1241      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000);      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
1242      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
1243      Trace("Beginning external data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1244            Trace("Loading from existing files.") if T(2);
1245        } else {
1246            Trace("Generating external data.") if T(2);
1247      # We loop through the files one at a time. First, the organism file.      # We loop through the files one at a time. First, the organism file.
1248      Open(\*ORGS, "<$FIG_Config::global/ext_org.table");          Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |");
1249      my $orgLine;      my $orgLine;
1250      while (defined($orgLine = <ORGS>)) {      while (defined($orgLine = <ORGS>)) {
1251          # Clean the input line.          # Clean the input line.
# Line 1081  Line 1257 
1257      close ORGS;      close ORGS;
1258      # Now the function file.      # Now the function file.
1259      my $funcLine;      my $funcLine;
1260      Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");          Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |");
1261      while (defined($funcLine = <FUNCS>)) {      while (defined($funcLine = <FUNCS>)) {
1262          # Clean the line ending.          # Clean the line ending.
1263          chomp $funcLine;          chomp $funcLine;
# Line 1097  Line 1273 
1273              $loadExternalAliasFunc->Put(@funcFields[0,1]);              $loadExternalAliasFunc->Put(@funcFields[0,1]);
1274          }          }
1275      }      }
1276        }
1277      # Finish the load.      # Finish the load.
1278      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1279      return $retVal;      return $retVal;
1280  }  }
1281    
 =head3 LoadGroupData  
1282    
1283  C<< my $stats = $spl->LoadGroupData(); >>  =head3 LoadReactionData
1284    
1285        my $stats = $spl->LoadReactionData();
1286    
1287    Load the reaction data from FIG into Sprout.
1288    
1289  Load the genome Groups into Sprout.  Reaction data connects reactions to the compounds that participate in them.
1290    
1291  The following relations are loaded by this method.  The following relations are loaded by this method.
1292    
1293      GenomeGroups      Reaction
1294        ReactionURL
1295        Compound
1296        CompoundName
1297        CompoundCAS
1298        IsIdentifiedByCAS
1299        HasCompoundName
1300        IsAComponentOf
1301    
1302  There is no direct support for genome groups in FIG, so we access the SEED  This method proceeds reaction by reaction rather than genome by genome.
 files directly.  
1303    
1304  =over 4  =over 4
1305    
# Line 1125  Line 1311 
1311    
1312  =cut  =cut
1313  #: Return Type $%;  #: Return Type $%;
1314  sub LoadGroupData {  sub LoadReactionData {
1315        # Get this object instance.
1316        my ($self) = @_;
1317        # Get the FIG object.
1318        my $fig = $self->{fig};
1319        # Create load objects for each of the tables we're loading.
1320        my $loadReaction = $self->_TableLoader('Reaction');
1321        my $loadReactionURL = $self->_TableLoader('ReactionURL');
1322        my $loadCompound = $self->_TableLoader('Compound');
1323        my $loadCompoundName = $self->_TableLoader('CompoundName');
1324        my $loadCompoundCAS = $self->_TableLoader('CompoundCAS');
1325        my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf');
1326        my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS');
1327        my $loadHasCompoundName = $self->_TableLoader('HasCompoundName');
1328        if ($self->{options}->{loadOnly}) {
1329            Trace("Loading from existing files.") if T(2);
1330        } else {
1331            Trace("Generating reaction data.") if T(2);
1332            # We need some hashes to prevent duplicates.
1333            my %compoundNames = ();
1334            my %compoundCASes = ();
1335            # First we create the compounds.
1336            my @compounds = $fig->all_compounds();
1337            for my $cid (@compounds) {
1338                # Check for names.
1339                my @names = $fig->names_of_compound($cid);
1340                # Each name will be given a priority number, starting with 1.
1341                my $prio = 1;
1342                for my $name (@names) {
1343                    if (! exists $compoundNames{$name}) {
1344                        $loadCompoundName->Put($name);
1345                        $compoundNames{$name} = 1;
1346                    }
1347                    $loadHasCompoundName->Put($cid, $name, $prio++);
1348                }
1349                # Create the main compound record. Note that the first name
1350                # becomes the label.
1351                my $label = (@names > 0 ? $names[0] : $cid);
1352                $loadCompound->Put($cid, $label);
1353                # Check for a CAS ID.
1354                my $cas = $fig->cas($cid);
1355                if ($cas) {
1356                    $loadIsIdentifiedByCAS->Put($cid, $cas);
1357                    if (! exists $compoundCASes{$cas}) {
1358                        $loadCompoundCAS->Put($cas);
1359                        $compoundCASes{$cas} = 1;
1360                    }
1361                }
1362            }
1363            # All the compounds are set up, so we need to loop through the reactions next. First,
1364            # we initialize the discriminator index. This is a single integer used to insure
1365            # duplicate elements in a reaction are not accidentally collapsed.
1366            my $discrim = 0;
1367            my @reactions = $fig->all_reactions();
1368            for my $reactionID (@reactions) {
1369                # Create the reaction record.
1370                $loadReaction->Put($reactionID, $fig->reversible($reactionID));
1371                # Compute the reaction's URL.
1372                my $url = HTML::reaction_link($reactionID);
1373                # Put it in the ReactionURL table.
1374                $loadReactionURL->Put($reactionID, $url);
1375                # Now we need all of the reaction's compounds. We get these in two phases,
1376                # substrates first and then products.
1377                for my $product (0, 1) {
1378                    # Get the compounds of the current type for the current reaction. FIG will
1379                    # give us 3-tuples: [ID, stoichiometry, main-flag]. At this time we do not
1380                    # have location data in SEED, so it defaults to the empty string.
1381                    my @compounds = $fig->reaction2comp($reactionID, $product);
1382                    for my $compData (@compounds) {
1383                        # Extract the compound data from the current tuple.
1384                        my ($cid, $stoich, $main) = @{$compData};
1385                        # Link the compound to the reaction.
1386                        $loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main,
1387                                                 $product, $stoich);
1388                    }
1389                }
1390            }
1391        }
1392        # Finish the load.
1393        my $retVal = $self->_FinishAll();
1394        return $retVal;
1395    }
1396    
1397    =head3 LoadSynonymData
1398    
1399        my $stats = $spl->LoadSynonymData();
1400    
1401    Load the synonym groups into Sprout.
1402    
1403    The following relations are loaded by this method.
1404    
1405        SynonymGroup
1406        IsSynonymGroupFor
1407    
1408    The source information for these relations is taken from the C<maps_to_id> method
1409    of the B<FIG> object. Unfortunately, to make this work, we need to use direct
1410    SQL against the FIG database.
1411    
1412    =over 4
1413    
1414    =item RETURNS
1415    
1416    Returns a statistics object for the loads.
1417    
1418    =back
1419    
1420    =cut
1421    #: Return Type $%;
1422    sub LoadSynonymData {
1423      # Get this object instance.      # Get this object instance.
1424      my ($self) = @_;      my ($self) = @_;
1425      # Get the FIG object.      # Get the FIG object.
1426      my $fig = $self->{fig};      my $fig = $self->{fig};
1427      # Get the genome hash.      # Get the genome hash.
1428      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1429      # Create a load object for the table we're loading.      # Create a load object for the table we're loading.
1430      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4);      my $loadSynonymGroup = $self->_TableLoader('SynonymGroup');
1431      Trace("Beginning group data load.") if T(2);      my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor');
1432        if ($self->{options}->{loadOnly}) {
1433            Trace("Loading from existing files.") if T(2);
1434        } else {
1435            Trace("Generating synonym group data.") if T(2);
1436            # Get the database handle.
1437            my $dbh = $fig->db_handle();
1438            # Ask for the synonyms. Note that "maps_to" is a group name, and "syn_id" is a PEG ID or alias.
1439            my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
1440            my $result = $sth->execute();
1441            if (! defined($result)) {
1442                Confess("Database error in Synonym load: " . $sth->errstr());
1443            } else {
1444                Trace("Processing synonym results.") if T(2);
1445                # Remember the current synonym.
1446                my $current_syn = "";
1447                # Count the features.
1448                my $featureCount = 0;
1449                my $entryCount = 0;
1450                # Loop through the synonym/peg pairs.
1451                while (my @row = $sth->fetchrow()) {
1452                    # Get the synonym group ID and feature ID.
1453                    my ($syn_id, $peg) = @row;
1454                    # Count this row.
1455                    $entryCount++;
1456                    if ($entryCount % 1000 == 0) {
1457                        Trace("$entryCount rows processed.") if T(3);
1458                    }
1459                    # Insure it's for one of our genomes.
1460                    my $genomeID = FIG::genome_of($peg);
1461                    if (exists $genomeHash->{$genomeID}) {
1462                        # Verify the synonym.
1463                        if ($syn_id ne $current_syn) {
1464                            # It's new, so put it in the group table.
1465                            $loadSynonymGroup->Put($syn_id);
1466                            $current_syn = $syn_id;
1467                        }
1468                        # Connect the synonym to the peg.
1469                        $loadIsSynonymGroupFor->Put($syn_id, $peg);
1470                        # Count this feature.
1471                        $featureCount++;
1472                        if ($featureCount % 1000 == 0) {
1473                            Trace("$featureCount features processed.") if T(3);
1474                        }
1475                    }
1476                }
1477                Trace("$entryCount rows produced $featureCount features.") if T(2);
1478            }
1479        }
1480        # Finish the load.
1481        my $retVal = $self->_FinishAll();
1482        return $retVal;
1483    }
1484    
1485    =head3 LoadFamilyData
1486    
1487        my $stats = $spl->LoadFamilyData();
1488    
1489    Load the protein families into Sprout.
1490    
1491    The following relations are loaded by this method.
1492    
1493        Family
1494        IsFamilyForFeature
1495    
1496    The source information for these relations is taken from the C<families_for_protein>,
1497    C<family_function>, and C<sz_family> methods of the B<FIG> object.
1498    
1499    =over 4
1500    
1501    =item RETURNS
1502    
1503    Returns a statistics object for the loads.
1504    
1505    =back
1506    
1507    =cut
1508    #: Return Type $%;
1509    sub LoadFamilyData {
1510        # Get this object instance.
1511        my ($self) = @_;
1512        # Get the FIG object.
1513        my $fig = $self->{fig};
1514        # Get the genome hash.
1515        my $genomeHash = $self->{genomes};
1516        # Create load objects for the tables we're loading.
1517        my $loadFamily = $self->_TableLoader('Family');
1518        my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature');
1519        if ($self->{options}->{loadOnly}) {
1520            Trace("Loading from existing files.") if T(2);
1521        } else {
1522            Trace("Generating family data.") if T(2);
1523            # Create a hash for the family IDs.
1524            my %familyHash = ();
1525      # Loop through the genomes.      # Loop through the genomes.
1526      my $line;          for my $genomeID (sort keys %{$genomeHash}) {
1527      for my $genomeID (keys %{$genomeHash}) {              Trace("Processing features for $genomeID.") if T(2);
1528          Trace("Processing $genomeID.") if T(3);              # Loop through this genome's PEGs.
1529          # Open the NMPDR group file for this genome.              for my $fid ($fig->all_features($genomeID, "peg")) {
1530          if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&                  $loadIsFamilyForFeature->Add("features", 1);
1531              defined($line = <TMP>)) {                  # Get this feature's families.
1532              # Clean the line ending.                  my @families = $fig->families_for_protein($fid);
1533              chomp $line;                  # Loop through the families, connecting them to the feature.
1534              # Add the group to the table. Note that there can only be one group                  for my $family (@families) {
1535              # per genome.                      $loadIsFamilyForFeature->Put($family, $fid);
1536              $loadGenomeGroups->Put($genomeID, $line);                      # If this is a new family, create a record for it.
1537                        if (! exists $familyHash{$family}) {
1538                            $familyHash{$family} = 1;
1539                            $loadFamily->Add("families", 1);
1540                            my $size = $fig->sz_family($family);
1541                            my $func = $fig->family_function($family);
1542                            $loadFamily->Put($family, $size, $func);
1543                        }
1544                    }
1545                }
1546            }
1547        }
1548        # Finish the load.
1549        my $retVal = $self->_FinishAll();
1550        return $retVal;
1551    }
1552    
1553    =head3 LoadDrugData
1554    
1555        my $stats = $spl->LoadDrugData();
1556    
1557    Load the drug target data into Sprout.
1558    
1559    The following relations are loaded by this method.
1560    
1561        PDB
1562        DocksWith
1563        IsProteinForFeature
1564        Ligand
1565    
1566    The source information for these relations is taken from attributes. The
1567    C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>.
1568    The C<zinc_name> attribute describes the ligands. The C<docking_results>
1569    attribute contains the information for the B<DocksWith> relationship. It is
1570    expected that additional attributes and tables will be added in the future.
1571    
1572    =over 4
1573    
1574    =item RETURNS
1575    
1576    Returns a statistics object for the loads.
1577    
1578    =back
1579    
1580    =cut
1581    #: Return Type $%;
sub LoadDrugData {
    # Build the load files for the drug target tables (PDB, Ligand,
    # IsProteinForFeature and DocksWith) from the attribute data served by the
    # FIG object, then finish all the loaders created here. The value returned
    # is whatever _FinishAll() hands back.
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    # Create load objects for the tables we're loading.
    my $loadPDB = $self->_TableLoader('PDB');
    my $loadLigand = $self->_TableLoader('Ligand');
    my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature');
    my $loadDocksWith = $self->_TableLoader('DocksWith');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating drug target data.") if T(2);
        # First comes the "DocksWith" relationship. This will give us a list of PDBs.
        # We can also encounter PDBs when we process "IsProteinForFeature". To manage
        # this process, PDB information is collected in a hash table and then
        # unspooled after both relationships are created. The hash maps each PDB ID
        # to the number of docking results written for it.
        my %pdbHash = ();
        Trace("Generating docking data.") if T(2);
        # Get all the docking data. This may cause problems if there are too many PDBs,
        # at which point we'll need another algorithm. The indicator that this is
        # happening will be a timeout error in the next statement.
        my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
                                              ['docking_results', $FIG_Config::dockLimit]);
        Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
        for my $dockData (@dockData) {
            # Get the docking data components.
            my ($pdbID, $docking_key, @valueData) = @{$dockData};
            # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
            $pdbID = lc $pdbID;
            # Strip off the object type.
            $pdbID =~ s/pdb://;
            # Extract the ZINC ID from the docking key. Note that there are two possible
            # formats: the numeric ID may or may not be prefixed with "ZINC".
            my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/;
            if (! $zinc_id) {
                Trace("Invalid docking result key $docking_key for $pdbID.") if T(0);
                $loadDocksWith->Add("errors");
            } else {
                # Get the pieces of the value and parse the energy.
                # Note that we don't care about the rank (the first energy field),
                # since we can sort on the energy level itself in our database.
                my ($energy, $tool, $type) = @valueData;
                my (undef, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
                # Ignore predicted results.
                if ($type ne "Predicted") {
                    # Count this docking result. Incrementing a missing entry
                    # creates it with a value of 1.
                    $pdbHash{$pdbID}++;
                    # Write the result to the output.
                    $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool,
                                        $total, $vanderwaals);
                }
            }
        }
        Trace("Connecting features.") if T(2);
        # Loop through the genomes.
        for my $genome (sort keys %{$genomeHash}) {
            Trace("Generating PDBs for $genome.") if T(3);
            # Get all of the PDBs that BLAST against this genome's features.
            my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%');
            for my $pdbData (@attributeData) {
                # The PDB ID is coded as a subkey.
                if ($pdbData->[1] !~ /PDB::(.+)/i) {
                    Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0);
                    $loadPDB->Add("errors");
                } else {
                    # NOTE(review): unlike the docking pass above, this ID is not
                    # lower-cased; confirm the attribute subkeys are always lower-case.
                    my $pdbID = $1;
                    # Insure the PDB is in the hash.
                    if (! exists $pdbHash{$pdbID}) {
                        $pdbHash{$pdbID} = 0;
                    }
                    # The score and locations are coded in the attribute value.
                    if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
                        Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0);
                        $loadIsProteinForFeature->Add("errors");
                    } else {
                        my ($score, $locData) = ($1,$2);
                        # The location data may not be present, so we have to start with some
                        # defaults and then check.
                        my ($start, $end) = (1, 0);
                        # Only use the captures when the range really matched. After a
                        # failed match $1 and $2 would still hold the score and the raw
                        # location text captured by the pattern above.
                        if ($locData && $locData =~ /(\d+)-(\d+)/) {
                            ($start, $end) = ($1, $2);
                        }
                        # If we still don't have the end location, compute it from
                        # the feature length.
                        if (! $end) {
                            # Most features have one location, but we do a list iteration
                            # just in case.
                            my @locations = $fig->feature_location($pdbData->[0]);
                            $end = 0;
                            for my $loc (@locations) {
                                my $locObject = BasicLocation->new($loc);
                                $end += $locObject->Length;
                            }
                        }
                        # Decode the score.
                        my $realScore = FIGRules::DecodeScore($score);
                        # Connect the PDB to the feature.
                        $loadIsProteinForFeature->Put($pdbID, $pdbData->[0], $start, $realScore, $end);
                    }
                }
            }
        }
        # We've got all our PDBs now, so we unspool them from the hash.
        Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2);
        my $count = 0;
        for my $pdbID (sort keys %pdbHash) {
            $loadPDB->Put($pdbID, $pdbHash{$pdbID});
            $count++;
            Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0);
        }
        # Finally we create the ligand table. This information can be found in the
        # zinc_name attribute.
        Trace("Loading ligands.") if T(2);
        # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
        my $last_zinc_id = "";
        my $zinc_id = "";
        my $done = 0;
        while (! $done) {
            # Get the next 10000 ligands. We insist that the object ID is greater than
            # the last ID we processed.
            # NOTE(review): $zinc_id only advances on a valid row, so a non-empty batch
            # with no valid ZINC ID would repeat the same query; confirm this cannot occur.
            Trace("Loading batch starting with ZINC:$zinc_id.") if T(3);
            my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
                                                       ["ZINC:$zinc_id", "zinc_name"]);
            Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
            if (! @attributeData) {
                # Here there are no attributes left, so we quit the loop.
                $done = 1;
            } else {
                # Process the attribute data we've received.
                for my $zinc_data (@attributeData) {
                    # The ZINC ID is found in the first return column, prefixed with the word ZINC.
                    if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
                        $zinc_id = $1;
                        # Check for a duplicate. The rows are ordered by object ID, so a
                        # duplicate always follows the row it repeats.
                        if ($zinc_id eq $last_zinc_id) {
                            $loadLigand->Add("duplicate");
                        } else {
                            # Here it's safe to output the ligand. The ligand name is the attribute value
                            # (third column in the row).
                            $loadLigand->Put($zinc_id, $zinc_data->[2]);
                            # Insure we don't try to add this ID again.
                            $last_zinc_id = $zinc_id;
                        }
                    } else {
                        Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
                        $loadLigand->Add("errors");
                    }
                }
            }
        }
        Trace("Ligands loaded.") if T(2);
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1748    
1749    
1750  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1751    
1752    =head3 SpecialAttribute
1753    
1754        my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader);
1755    
1756    Look for special attributes of a given type. A special attribute is found by comparing one of
1757    the columns of the incoming attribute list to a search pattern. If a match is found, then
1758    a set of columns is put into an output table connected to the specified ID.
1759    
1760    For example, when processing features, the attribute list we look at has three columns: attribute
1761    name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
1762    begins with C<iedb_>. The call signature is therefore
1763    
1764        my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB);
1765    
1766    The pattern is matched against column 0, and if we have a match, then the values of columns 0
1767    and 2 are joined with a space and put to the output along with the specified feature ID.
1768    
1769    =over 4
1770    
1771    =item id
1772    
1773    ID of the object whose special attributes are being loaded. This forms the first column of the
1774    output.
1775    
1776    =item attributes
1777    
1778    Reference to a list of tuples.
1779    
1780    =item idxMatch
1781    
1782    Index in each tuple of the column to be matched against the pattern. If the match is
1783    successful, an output record will be generated.
1784    
1785    =item idxValues
1786    
1787    Reference to a list containing the indexes in each tuple of the columns whose values
1788    are joined with spaces to form the second column of the output.
1789    
1790    =item pattern
1791    
1792    Pattern to be matched against the specified column. The match will be case-insensitive.
1793    
1794    =item loader
1795    
1796    An object to which each output record will be put. Usually this is an B<ERDBLoad> object,
1797    but technically it could be anything with a C<Put> method.
1798    
1799    =item RETURN
1800    
1801    Returns a count of the matches found.
1802    
1803    =item
1804    
1805    =back
1806    
1807    =cut
1808    
sub SpecialAttribute {
    # Scan a list of attribute tuples and, for every tuple whose match column
    # fits the pattern (case-insensitively), put one record to the loader.
    # Unpack the arguments.
    my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_;
    # Running count of the tuples that matched; this is the return value.
    my $retVal = 0;
    for my $tuple (@{$attributes}) {
        # Tuples that do not match produce no output.
        next unless $tuple->[$idxMatch] =~ m/$pattern/i;
        # Pull the requested columns with an array slice and glue them together
        # with spaces, since several input columns may feed the one output value.
        my @pieces = @{$tuple}[@{$idxValues}];
        $loader->Put($id, join(" ", @pieces));
        $retVal++;
    }
    # Trace the outcome, but only when something was actually found.
    if (T(4) && $retVal) {
        Trace("$retVal special attributes found for $id and loader " . $loader->RelName() . ".");
    }
    # Tell the caller how many matches there were.
    return $retVal;
}
1829    
1830  =head3 TableLoader  =head3 TableLoader
1831    
1832  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 1172  Line 1841 
1841    
1842  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
1843    
 =item rowCount (optional)  
   
 Estimated maximum number of rows in the table.  
   
1844  =item RETURN  =item RETURN
1845    
1846  Returns an ERDBLoad object for loading the specified table.  Returns an ERDBLoad object for loading the specified table.
# Line 1186  Line 1851 
1851    
1852  sub _TableLoader {  sub _TableLoader {
1853      # Get the parameters.      # Get the parameters.
1854      my ($self, $tableName, $rowCount) = @_;      my ($self, $tableName) = @_;
1855      # Create the load object.      # Create the load object.
1856      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount);      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly);
1857      # Cache it in the loader list.      # Cache it in the loader list.
1858      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
1859      # Return it to the caller.      # Return it to the caller.
# Line 1222  Line 1887 
1887      my $retVal = Stats->new();      my $retVal = Stats->new();
1888      # Get the loader list.      # Get the loader list.
1889      my $loadList = $self->{loaders};      my $loadList = $self->{loaders};
1890        # Create a hash to hold the statistics objects, keyed on relation name.
1891        my %loaderHash = ();
1892      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
1893      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads more restartable.
1894      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
1895            # Get the relation name.
1896            my $relName = $loader->RelName;
1897            # Check the ignore flag.
1898            if ($loader->Ignore) {
1899                Trace("Relation $relName not loaded.") if T(2);
1900            } else {
1901                # Here we really need to finish.
1902                Trace("Finishing $relName.") if T(2);
1903          my $stats = $loader->Finish();          my $stats = $loader->Finish();
1904                $loaderHash{$relName} = $stats;
1905            }
1906        }
1907        # Now we loop through again, actually loading the tables. We want to finish before
1908        # loading so that if something goes wrong at this point, all the load files are usable
1909        # and we don't have to redo all that work.
1910        for my $relName (sort keys %loaderHash) {
1911            # Get the statistics for this relation.
1912            my $stats = $loaderHash{$relName};
1913            # Check for a database load.
1914            if ($self->{options}->{dbLoad}) {
1915                # Here we want to use the load file just created to load the database.
1916                Trace("Loading relation $relName.") if T(2);
1917                my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]);
1918                # Accumulate the statistics from the DB load.
1919                $stats->Accumulate($newStats);
1920            }
1921          $retVal->Accumulate($stats);          $retVal->Accumulate($stats);
         my $relName = $loader->RelName;  
1922          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
1923      }      }
1924      # Return the load statistics.      # Return the load statistics.
1925      return $retVal;      return $retVal;
1926  }  }
1927    
1928    =head3 GetGenomeAttributes
1929    
1930        my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys);
1931    
1932    Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
1933    attributes for all the features of a genome in a single call, then organizes them into
1934    a hash.
1935    
1936    =over 4
1937    
1938    =item fig
1939    
1940    FIG-like object for accessing attributes.
1941    
1942    =item genomeID
1943    
1944    ID of the genome whose attributes are desired.
1945    
1946    =item fids
1947    
1948    Reference to a list of the feature IDs whose attributes are to be kept.
1949    
1950    =item propKeys
1951    
1952    A list of the keys to retrieve.
1953    
1954    =item RETURN
1955    
1956    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
1957    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
1958    the attribute key, and one or more attribute values.
1959    
1960    =back
1961    
1962    =cut
1963    
sub GetGenomeAttributes {
    # Collect the attribute tuples for the features of one genome and return
    # them as a hash reference keyed on feature ID.
    # Unpack the arguments.
    my ($fig, $genomeID, $fids, $propKeys) = @_;
    # Seed the result with an empty list for every requested feature. This marks
    # which feature IDs to keep and guarantees the caller a list reference for
    # each of them.
    my $retVal = { map { ($_ => []) } @{$fids} };
    # Try to fetch everything for the genome with one call. The call is wrapped
    # in an eval so that a failure (for instance a timeout) can be recovered from.
    my @tuples = ();
    eval {
        @tuples = $fig->get_attributes("fig|$genomeID%", $propKeys);
        Trace(scalar(@tuples) . " attributes returned for genome $genomeID.") if T(3);
    };
    if ($@) {
        Trace("Retrying attributes for $genomeID due to error: $@") if T(1);
        # The fallback is to ask for the attributes 100 features at a time. This
        # is much slower, but it lets the processing continue.
        my $nFids = scalar @{$fids};
        my $i = 0;
        while ($i < $nFids) {
            # Index of the last feature in this batch: normally $i + 99, clipped
            # to the final index when fewer than 100 features remain.
            my $end = $i + 99;
            $end = $nFids - 1 if $end >= $nFids;
            # Take the batch of feature IDs and fetch its attributes.
            my @batch = @{$fids}[$i .. $end];
            Trace("Retrieving attributes for fids $i to $end.") if T(3);
            my @aShort = $fig->get_attributes(\@batch, $propKeys);
            Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(3);
            push @tuples, @aShort;
            $i += 100;
        }
    }
    # File each tuple under its feature ID (the first column), dropping tuples
    # for features that were not requested.
    for my $tuple (grep { exists $retVal->{$_->[0]} } @tuples) {
        push @{$retVal->{$tuple->[0]}}, $tuple;
    }
    # Hand back the populated hash.
    return $retVal;
}
2012    
2013    
2014  1;  1;

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.92

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3