[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.20, Wed Nov 2 21:54:40 2005 UTC revision 1.90, Thu Dec 6 14:53:50 2007 UTC
# Line 7  Line 7 
7      use PageBuilder;      use PageBuilder;
8      use ERDBLoad;      use ERDBLoad;
9      use FIG;      use FIG;
10        use FIGRules;
11      use Sprout;      use Sprout;
12      use Stats;      use Stats;
13      use BasicLocation;      use BasicLocation;
14      use HTML;      use HTML;
15        use AliasAnalysis;
16    
17  =head1 Sprout Load Methods  =head1 Sprout Load Methods
18    
# Line 30  Line 32 
32      $stats->Accumulate($spl->LoadFeatureData());      $stats->Accumulate($spl->LoadFeatureData());
33      print $stats->Show();      print $stats->Show();
34    
 This module makes use of the internal Sprout property C<_erdb>.  
   
35  It is worth noting that the FIG object does not need to be a real one. Any object  It is worth noting that the FIG object does not need to be a real one. Any object
36  that implements the FIG methods for data retrieval could be used. So, for example,  that implements the FIG methods for data retrieval could be used. So, for example,
37  this object could be used to copy data from one Sprout database to another, or  this object could be used to copy data from one Sprout database to another, or
# Line 52  Line 52 
52    
53  =head3 new  =head3 new
54    
55  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>      my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options);
56    
57  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
58  the name of the files containing the list of genomes and subsystems to use.  the name of the files containing the list of genomes and subsystems to use.
# Line 80  Line 80 
80  =item subsysFile  =item subsysFile
81    
82  Either the name of the file containing the list of trusted subsystems or a reference  Either the name of the file containing the list of trusted subsystems or a reference
83  to a list of subsystem names. If nothing is specified, all known subsystems will be  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
84  considered trusted. Only subsystem data related to the trusted subsystems is loaded.  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
85    in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded.
86    
87  =item options  =item options
88    
# Line 94  Line 95 
95  sub new {  sub new {
96      # Get the parameters.      # Get the parameters.
97      my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;      my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
98      # Load the list of genomes into a hash.      # Create the genome hash.
99      my %genomes;      my %genomes = ();
100        # We only need it if load-only is NOT specified.
101        if (! $options->{loadOnly}) {
102      if (! defined($genomeFile) || $genomeFile eq '') {      if (! defined($genomeFile) || $genomeFile eq '') {
103          # Here we want all the complete genomes and an access code of 1.          # Here we want all the complete genomes and an access code of 1.
104          my @genomeList = $fig->genomes(1);          my @genomeList = $fig->genomes(1);
105          %genomes = map { $_ => 1 } @genomeList;          %genomes = map { $_ => 1 } @genomeList;
106                Trace(scalar(keys %genomes) . " genomes found.") if T(3);
107      } else {      } else {
108          my $type = ref $genomeFile;          my $type = ref $genomeFile;
109          Trace("Genome file parameter type is \"$type\".") if T(3);          Trace("Genome file parameter type is \"$type\".") if T(3);
# Line 119  Line 123 
123                  # an omitted access code can be defaulted to 1.                  # an omitted access code can be defaulted to 1.
124                  for my $genomeLine (@genomeList) {                  for my $genomeLine (@genomeList) {
125                      my ($genomeID, $accessCode) = split("\t", $genomeLine);                      my ($genomeID, $accessCode) = split("\t", $genomeLine);
126                      if (undef $accessCode) {                          if (! defined($accessCode)) {
127                          $accessCode = 1;                          $accessCode = 1;
128                      }                      }
129                      $genomes{$genomeID} = $accessCode;                      $genomes{$genomeID} = $accessCode;
# Line 129  Line 133 
133              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
134          }          }
135      }      }
136        }
137      # Load the list of trusted subsystems.      # Load the list of trusted subsystems.
138      my %subsystems = ();      my %subsystems = ();
139        # We only need it if load-only is NOT specified.
140        if (! $options->{loadOnly}) {
141      if (! defined $subsysFile || $subsysFile eq '') {      if (! defined $subsysFile || $subsysFile eq '') {
142          # Here we want all the subsystems.              # Here we want all the usable subsystems. First we get the whole list.
143          %subsystems = map { $_ => 1 } $fig->all_subsystems();              my @subs = $fig->all_subsystems();
144                # Loop through, checking for the NMPDR file.
145                for my $sub (@subs) {
146                    if ($fig->nmpdr_subsystem($sub)) {
147                        $subsystems{$sub} = 1;
148                    }
149                }
150      } else {      } else {
151          my $type = ref $subsysFile;          my $type = ref $subsysFile;
152          if ($type eq 'ARRAY') {          if ($type eq 'ARRAY') {
# Line 153  Line 166 
166              Confess("Invalid subsystem parameter in SproutLoad constructor.");              Confess("Invalid subsystem parameter in SproutLoad constructor.");
167          }          }
168      }      }
169            # Go through the subsys hash again, creating the keyword list for each subsystem.
170            for my $subsystem (keys %subsystems) {
171                my $name = $subsystem;
172                $name =~ s/_/ /g;
173    #            my $classes = $fig->subsystem_classification($subsystem);
174    #            $name .= " " . join(" ", @{$classes});
175                $subsystems{$subsystem} = $name;
176            }
177        }
178        # Get the list of NMPDR-oriented attribute keys.
179        my @propKeys = $fig->get_group_keys("NMPDR");
180      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
181      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
182      # Create the Sprout load object.      # Create the Sprout load object.
# Line 162  Line 186 
186                    subsystems => \%subsystems,                    subsystems => \%subsystems,
187                    sprout => $sprout,                    sprout => $sprout,
188                    loadDirectory => $directory,                    loadDirectory => $directory,
189                    erdb => $sprout->{_erdb},                    erdb => $sprout,
190                    loaders => [],                    loaders => [],
191                    options => $options                    options => $options,
192                      propKeys => \@propKeys,
193                   };                   };
194      # Bless and return it.      # Bless and return it.
195      bless $retVal, $class;      bless $retVal, $class;
196      return $retVal;      return $retVal;
197  }  }
198    
199    =head3 LoadOnly
200    
201        my $flag = $spl->LoadOnly;
202    
203    Return TRUE if we are in load-only mode, else FALSE.
204    
205    =cut
206    
207    sub LoadOnly {
208        my ($self) = @_;
209        return $self->{options}->{loadOnly};
210    }
211    
212    
213  =head3 LoadGenomeData  =head3 LoadGenomeData
214    
215  C<< my $stats = $spl->LoadGenomeData(); >>      my $stats = $spl->LoadGenomeData();
216    
217  Load the Genome, Contig, and Sequence data from FIG into Sprout.  Load the Genome, Contig, and Sequence data from FIG into Sprout.
218    
# Line 198  Line 237 
237    
238  =back  =back
239    
 B<TO DO>  
   
 Real quality vectors instead of C<unknown> for everything.  
   
 GenomeGroup relation. (The original script took group information from the C<NMPDR> file  
 in each genome's main directory, but no such file exists anywhere in my version of the  
 data store.)  
   
240  =cut  =cut
241  #: Return Type $%;  #: Return Type $%;
242  sub LoadGenomeData {  sub LoadGenomeData {
# Line 216  Line 247 
247      # Get the genome count.      # Get the genome count.
248      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
249      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
     Trace("Beginning genome data load.") if T(2);  
250      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
251      my $loadGenome = $self->_TableLoader('Genome', $genomeCount);      my $loadGenome = $self->_TableLoader('Genome');
252      my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300);      my $loadHasContig = $self->_TableLoader('HasContig');
253      my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300);      my $loadContig = $self->_TableLoader('Contig');
254      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf');
255      my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000);      my $loadSequence = $self->_TableLoader('Sequence');
256        if ($self->{options}->{loadOnly}) {
257            Trace("Loading from existing files.") if T(2);
258        } else {
259            Trace("Generating genome data.") if T(2);
260      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
261      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
262          Trace("Loading data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
263          $loadGenome->Add("genomeIn");          $loadGenome->Add("genomeIn");
264          # The access code comes in via the genome hash.          # The access code comes in via the genome hash.
265          my $accessCode = $genomeHash->{$genomeID};          my $accessCode = $genomeHash->{$genomeID};
266          # Get the genus, species, and strain from the scientific name. Note that we append              # Get the genus, species, and strain from the scientific name.
         # the genome ID to the strain. In some cases this is the totality of the strain name.  
267          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);
268          my $extra = join " ", @extraData, "[$genomeID]";              my $extra = join " ", @extraData;
269          # Get the full taxonomy.          # Get the full taxonomy.
270          my $taxonomy = $fig->taxonomy_of($genomeID);          my $taxonomy = $fig->taxonomy_of($genomeID);
271                # Get the version. If no version is specified, we default to the genome ID by itself.
272                my $version = $fig->genome_version($genomeID);
273                if (! defined($version)) {
274                    $version = $genomeID;
275                }
276                # Get the DNA size.
277                my $dnaSize = $fig->genome_szdna($genomeID);
278                # Open the NMPDR group file for this genome.
279                my $group;
280                if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
281                    defined($group = <TMP>)) {
282                    # Clean the line ending.
283                    chomp $group;
284                } else {
285                    # No group, so use the default.
286                    $group = $FIG_Config::otherGroup;
287                }
288                close TMP;
289          # Output the genome record.          # Output the genome record.
290          $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),
291                           $species, $extra, $taxonomy);                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);
292          # Now we loop through each of the genome's contigs.          # Now we loop through each of the genome's contigs.
293          my @contigs = $fig->all_contigs($genomeID);          my @contigs = $fig->all_contigs($genomeID);
294          for my $contigID (@contigs) {          for my $contigID (@contigs) {
# Line 268  Line 319 
319              }              }
320          }          }
321      }      }
322        }
323      # Finish the loads.      # Finish the loads.
324      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
325      # Return the result.      # Return the result.
326      return $retVal;      return $retVal;
327  }  }
328    
 =head3 LoadCouplingData  
   
 C<< my $stats = $spl->LoadCouplingData(); >>  
   
 Load the coupling and evidence data from FIG into Sprout.  
   
 The coupling data specifies which genome features are functionally coupled. The  
 evidence data explains why the coupling is functional.  
   
 The following relations are loaded by this method.  
   
     Coupling  
     IsEvidencedBy  
     PCH  
     ParticipatesInCoupling  
     UsesAsEvidence  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadCouplingData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the genome hash.  
     my $genomeFilter = $self->{genomes};  
     my $genomeCount = (keys %{$genomeFilter});  
     my $featureCount = $genomeCount * 4000;  
     # Start the loads.  
     my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount);  
     my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000);  
     my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000);  
     my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000);  
     my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000);  
     Trace("Beginning coupling data load.") if T(2);  
     # Loop through the genomes found.  
     for my $genome (sort keys %{$genomeFilter}) {  
         Trace("Generating coupling data for $genome.") if T(3);  
         $loadCoupling->Add("genomeIn");  
         # Create a hash table for holding coupled pairs. We use this to prevent  
         # duplicates. For example, if A is coupled to B, we don't want to also  
         # assert that B is coupled to A, because we already know it. Fortunately,  
         # all couplings occur within a genome, so we can keep the hash table  
         # size reasonably small.  
         my %dupHash = ();  
         # Get all of the genome's PEGs.  
         my @pegs = $fig->pegs_of($genome);  
         # Loop through the PEGs.  
         for my $peg1 (@pegs) {  
             $loadCoupling->Add("pegIn");  
             Trace("Processing PEG $peg1 for $genome.") if T(4);  
             # Get a list of the coupled PEGs.  
             my @couplings = $fig->coupled_to($peg1);  
             # For each coupled PEG, we need to verify that a coupling already  
             # exists. If not, we have to create one.  
             for my $coupleData (@couplings) {  
                 my ($peg2, $score) = @{$coupleData};  
                 # Compute the coupling ID.  
                 my $coupleID = Sprout::CouplingID($peg1, $peg2);  
                 if (! exists $dupHash{$coupleID}) {  
                     $loadCoupling->Add("couplingIn");  
                     # Here we have a new coupling to store in the load files.  
                     Trace("Storing coupling ($coupleID) with score $score.") if T(4);  
                     # Ensure we don't do this again.  
                     $dupHash{$coupleID} = $score;  
                     # Write the coupling record.  
                     $loadCoupling->Put($coupleID, $score);  
                     # Connect it to the coupled PEGs.  
                     $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);  
                     $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);  
                     # Get the evidence for this coupling.  
                     my @evidence = $fig->coupling_evidence($peg1, $peg2);  
                     # Organize the evidence into a hash table.  
                     my %evidenceMap = ();  
                     # Process each evidence item.  
                     for my $evidenceData (@evidence) {  
                         $loadPCH->Add("evidenceIn");  
                         my ($peg3, $peg4, $usage) = @{$evidenceData};  
                         # Only proceed if the evidence is from a Sprout  
                         # genome.  
                         if ($genomeFilter->{$fig->genome_of($peg3)}) {  
                             $loadUsesAsEvidence->Add("evidenceChosen");  
                             my $evidenceKey = "$coupleID $peg3 $peg4";  
                             # We store this evidence in the hash if the usage  
                             # is nonzero or no prior evidence has been found. This  
                             # insures that if there is duplicate evidence, we  
                             # at least keep the meaningful ones. Only evidence in  
                             # the hash makes it to the output.  
                             if ($usage || ! exists $evidenceMap{$evidenceKey}) {  
                                 $evidenceMap{$evidenceKey} = $evidenceData;  
                             }  
                         }  
                     }  
                     for my $evidenceID (keys %evidenceMap) {  
                         # Create the evidence record.  
                         my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};  
                         $loadPCH->Put($evidenceID, $usage);  
                         # Connect it to the coupling.  
                         $loadIsEvidencedBy->Put($coupleID, $evidenceID);  
                         # Connect it to the features.  
                         $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);  
                         $loadUsesAsEvidence->Put($evidenceID, $peg4, 2);  
                     }  
                 }  
             }  
         }  
     }  
     # All done. Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
329  =head3 LoadFeatureData  =head3 LoadFeatureData
330    
331  C<< my $stats = $spl->LoadFeatureData(); >>      my $stats = $spl->LoadFeatureData();
332    
333  Load the feature data from FIG into Sprout.  Load the feature data from FIG into Sprout.
334    
# Line 406  Line 338 
338    
339      Feature      Feature
340      FeatureAlias      FeatureAlias
341        IsAliasOf
342      FeatureLink      FeatureLink
343      FeatureTranslation      FeatureTranslation
344      FeatureUpstream      FeatureUpstream
345      IsLocatedIn      IsLocatedIn
346        HasFeature
347        HasRoleInSubsystem
348        FeatureEssential
349        FeatureVirulent
350        FeatureIEDB
351        CDD
352        IsPresentOnProteinOf
353    
354  =over 4  =over 4
355    
# Line 424  Line 364 
364  sub LoadFeatureData {  sub LoadFeatureData {
365      # Get this object instance.      # Get this object instance.
366      my ($self) = @_;      my ($self) = @_;
367      # Get the FIG object.      # Get the FIG and Sprout objects.
368      my $fig = $self->{fig};      my $fig = $self->{fig};
369      # Find out if this is a limited run.      my $sprout = $self->{sprout};
     my $limited = $self->{options}->{limitedFeatures};  
370      # Get the table of genome IDs.      # Get the table of genome IDs.
371      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
372      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
373      my $loadFeature = $self->_TableLoader('Feature', $featureCount);      my $loadFeature = $self->_TableLoader('Feature');
374      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn');
375      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
376      my ($loadFeatureLink, $loadFeatureTranslation, $loadFeatureUpstream);      my $loadIsAliasOf = $self->_TableLoader('IsAliasOf');
377      if (! $limited) {      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
378          $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
379          $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
380          $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);      my $loadHasFeature = $self->_TableLoader('HasFeature');
381      }      my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem');
382        my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
383        my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
384        my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
385        my $loadCDD = $self->_TableLoader('CDD');
386        my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');
387        # Get the subsystem hash.
388        my $subHash = $self->{subsystems};
389        # Get the property keys.
390        my $propKeys = $self->{propKeys};
391        # Create a hashes to hold CDD and alias values.
392        my %CDD = ();
393        my %alias = ();
394      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
395      # locations.      # locations.
396      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
397      Trace("Beginning feature data load.") if T(2);      if ($self->{options}->{loadOnly}) {
398            Trace("Loading from existing files.") if T(2);
399        } else {
400            Trace("Generating feature data.") if T(2);
401      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
402      for my $genomeID (sort keys %{$genomeHash}) {          my @allGenomes = sort keys %{$genomeHash};
403            Trace(scalar(@allGenomes) . " genomes found in list.") if T(3);
404            for my $genomeID (@allGenomes) {
405          Trace("Loading features for genome $genomeID.") if T(3);          Trace("Loading features for genome $genomeID.") if T(3);
406          $loadFeature->Add("genomeIn");          $loadFeature->Add("genomeIn");
407          # Get the feature list for this genome.          # Get the feature list for this genome.
408          my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
409                # Sort and count the list.
410                my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
411                my $count = scalar @featureTuples;
412                my @fids = map { $_->[0] } @featureTuples;
413                Trace("$count features found for genome $genomeID.") if T(3);
414                # Get the attributes for this genome and put them in a hash by feature ID.
415                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys);
416                Trace("Looping through features for $genomeID.") if T(3);
417                # Set up for our duplicate-feature check.
418                my $oldFeatureID = "";
419          # Loop through the features.          # Loop through the features.
420          for my $featureData (@{$features}) {              for my $featureTuple (@featureTuples) {
             $loadFeature->Add("featureIn");  
421              # Split the tuple.              # Split the tuple.
422              my ($featureID, $locations, undef, $type) = @{$featureData};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
423              # Create the feature record.                  # Check for duplicates.
424              $loadFeature->Put($featureID, 1, $type);                  if ($featureID eq $oldFeatureID) {
425                        Trace("Duplicate feature $featureID found.") if T(1);
426                    } else {
427                        $oldFeatureID = $featureID;
428                        # Count this feature.
429                        $loadFeature->Add("featureIn");
430                        # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
431                        # Sprout database requires a single character.
432                        if (! defined($quality) || $quality eq "") {
433                            $quality = " ";
434                        }
435                        # Begin building the keywords. We start with the genome ID, the
436                        # feature ID, the taxonomy, and the organism name.
437                        my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
438                                        $fig->taxonomy_of($genomeID));
439              # Create the aliases.              # Create the aliases.
440              for my $alias ($fig->feature_aliases($featureID)) {              for my $alias ($fig->feature_aliases($featureID)) {
441                  $loadFeatureAlias->Put($featureID, $alias);                          #Connect this alias to this feature.
442              }                          $loadIsAliasOf->Put($alias, $featureID);
443              # The next stuff is for a full load only.                          push @keywords, $alias;
444              if (! $limited) {                          # If this is a locus tag, also add its natural form as a keyword.
445                            my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
446                            if ($naturalName) {
447                                push @keywords, $naturalName;
448                            }
449                            # If this is the first time for the specified alias, create its
450                            # alias record.
451                            if (! exists $alias{$alias}) {
452                                $loadFeatureAlias->Put($alias);
453                                $alias{$alias} = 1;
454                            }
455                        }
456                        Trace("Assignment for $featureID is: $assignment") if T(4);
457                        # Break the assignment into words and shove it onto the
458                        # keyword list.
459                        push @keywords, split(/\s+/, $assignment);
460                        # Link this feature to the parent genome.
461                        $loadHasFeature->Put($genomeID, $featureID, $type);
462                  # Get the links.                  # Get the links.
463                  my @links = $fig->fid_links($featureID);                  my @links = $fig->fid_links($featureID);
464                  for my $link (@links) {                  for my $link (@links) {
# Line 483  Line 477 
477                          $loadFeatureUpstream->Put($featureID, $upstream);                          $loadFeatureUpstream->Put($featureID, $upstream);
478                      }                      }
479                  }                  }
480                        # Now we need to find the subsystems this feature participates in.
481                        # We also add the subsystems to the keyword list. Before we do that,
482                        # we must convert underscores to spaces.
483                        my @subsystems = $fig->peg_to_subsystems($featureID);
484                        for my $subsystem (@subsystems) {
485                            # Only proceed if we like this subsystem.
486                            if (exists $subHash->{$subsystem}) {
487                                # Store the has-role link.
488                                $loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type);
489                                # Save the subsystem's keyword data.
490                                my $subKeywords = $subHash->{$subsystem};
491                                push @keywords, split /\s+/, $subKeywords;
492                                # Now we need to get this feature's role in the subsystem.
493                                my $subObject = $fig->get_subsystem($subsystem);
494                                my @roleColumns = $subObject->get_peg_roles($featureID);
495                                my @allRoles = $subObject->get_roles();
496                                for my $col (@roleColumns) {
497                                    my $role = $allRoles[$col];
498                                    push @keywords, split /\s+/, $role;
499                                    push @keywords, $subObject->get_role_abbr($col);
500                                }
501                            }
502                        }
503                        # There are three special attributes computed from property
504                        # data that we build next. If the special attribute is non-empty,
505                        # its name will be added to the keyword list. First, we get all
506                        # the attributes for this feature. They will come back as
507                        # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
508                        # [name, value, value with URL]. (We don't need the PEG, since
509                        # we already know it.)
510                        my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
511                                             @{$attributes->{$featureID}};
512                        # Now we process each of the special attributes.
513                        if (SpecialAttribute($featureID, \@attributes,
514                                             1, [0,2], '^(essential|potential_essential)$',
515                                             $loadFeatureEssential)) {
516                            push @keywords, 'essential';
517                            $loadFeature->Add('essential');
518                        }
519                        if (SpecialAttribute($featureID, \@attributes,
520                                             0, [2], '^virulen',
521                                             $loadFeatureVirulent)) {
522                            push @keywords, 'virulent';
523                            $loadFeature->Add('virulent');
524                        }
525                        if (SpecialAttribute($featureID, \@attributes,
526                                             0, [0,2], '^iedb_',
527                                             $loadFeatureIEDB)) {
528                            push @keywords, 'iedb';
529                            $loadFeature->Add('iedb');
530                        }
531                        # Now we have some other attributes we need to process. Currently,
532                        # this is CDD and CELLO, but we expect the number to increase.
533                        my %attributeHash = ();
534                        for my $attrRow (@{$attributes->{$featureID}}) {
535                            my (undef, $key, @values) = @{$attrRow};
536                            $key =~ /^([^:]+)::(.+)/;
537                            if (exists $attributeHash{$1}) {
538                                $attributeHash{$1}->{$2} = \@values;
539                            } else {
540                                $attributeHash{$1} = {$2 => \@values};
541                            }
542                        }
543                        my $celloValue = "unknown";
544                        # Pull in the CELLO attribute. There will never be more than one.
545                        # If we have one, it's a feature attribute AND a keyword.
546                        my @celloData = keys %{$attributeHash{CELLO}};
547                        if (@celloData) {
548                            $celloValue = $celloData[0];
549                            push @keywords, $celloValue;
550                        }
551                        # Now we handle CDD. This is a bit more complicated, because
552                        # there are multiple CDDs per protein.
553                        if (exists $attributeHash{CDD}) {
554                            # Get the hash of CDD IDs to scores for this feature. We
555                            # already know it exists because of the above IF.
556                            my $cddHash = $attributeHash{CDD};
557                            my @cddData = sort keys %{$cddHash};
558                            for my $cdd (@cddData) {
559                                # Extract the score for this CDD and decode it.
560                                my ($codeScore) = split(/\s*,\s*/, $cddHash->{$cdd}->[1]);
561                                my $realScore = FIGRules::DecodeScore($codeScore);
562                                # We can't afford to crash because of a bad attribute
563                                # value, hence the IF below.
564                                if (! defined($realScore)) {
565                                    # Bad score, so count it.
566                                    $loadFeature->Add('badCDDscore');
567                                } else {
568                                    # Create the connection.
569                                    $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
570                                    # If this CDD does not yet exist, create its record.
571                                    if (! exists $CDD{$cdd}) {
572                                        $CDD{$cdd} = 1;
573                                        $loadCDD->Put($cdd);
574                                    }
575                                }
576              }              }
577                        }
578                        # Now we need to bust up hyphenated words in the keyword
579                        # list. We keep them separate and put them at the end so
580                        # the original word order is available.
581                        my $keywordString = "";
582                        my $bustedString = "";
583                        for my $keyword (@keywords) {
584                            if (length $keyword >= 3) {
585                                $keywordString .= " $keyword";
586                                if ($keyword =~ /-/) {
587                                    my @words = split /-/, $keyword;
588                                    $bustedString .= join(" ", "", @words);
589                                }
590                            }
591                        }
592                        $keywordString .= $bustedString;
593                        # Get rid of annoying punctuation.
594                        $keywordString =~ s/[();]//g;
595                        # Clean the keyword list.
596                        my $cleanWords = $sprout->CleanKeywords($keywordString);
597                        Trace("Keyword string for $featureID: $cleanWords") if T(4);
598                        # Now we need to process the feature's locations. First, we split them up.
599                        my @locationList = split /\s*,\s*/, $locations;
600                        # Next, we convert them to Sprout location objects.
601                        my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
602                        # Assemble them into a sprout location string for later.
603                        my $locationString = join(", ", map { $_->String } @locObjectList);
604              # This part is the roughest. We need to relate the features to contig              # This part is the roughest. We need to relate the features to contig
605              # locations, and the locations must be split so that none of them exceed              # locations, and the locations must be split so that none of them exceed
606              # the maximum segment size. This simplifies the genes_in_region processing              # the maximum segment size. This simplifies the genes_in_region processing
607              # for Sprout.                      # for Sprout. To start, we create the location position indicator.
             my @locationList = split /\s*,\s*/, $locations;  
             # Create the location position indicator.  
608              my $i = 1;              my $i = 1;
609              # Loop through the locations.              # Loop through the locations.
610              for my $location (@locationList) {                      for my $locObject (@locObjectList) {
611                  # Parse the location.                          # Split this location into a list of chunks.
                 my $locObject = BasicLocation->new("$genomeID:$location");  
                 # Split it into a list of chunks.  
612                  my @locOList = ();                  my @locOList = ();
613                  while (my $peeling = $locObject->Peel($chunkSize)) {                  while (my $peeling = $locObject->Peel($chunkSize)) {
614                      $loadIsLocatedIn->Add("peeling");                      $loadIsLocatedIn->Add("peeling");
# Line 510  Line 623 
623                      $i++;                      $i++;
624                  }                  }
625              }              }
626                        # Finally, reassemble the location objects into a list of Sprout location strings.
627                        # Create the feature record.
628                        $loadFeature->Put($featureID, 1, $user, $quality, $celloValue, $type, $assignment, $cleanWords, $locationString);
629          }          }
630      }      }
631      # Finish the loads.              Trace("Genome $genomeID processed.") if T(3);
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
 =head3 LoadBBHData  
   
 C<< my $stats = $spl->LoadBBHData(); >>  
   
 Load the bidirectional best hit data from FIG into Sprout.  
   
 Sprout does not store information on similarities. Instead, it has only the  
 bi-directional best hits. Even so, the BBH table is one of the largest in  
 the database.  
   
 The following relations are loaded by this method.  
   
     IsBidirectionalBestHitOf  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadBBHData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the table of genome IDs.  
     my $genomeHash = $self->{genomes};  
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf',  
                                                            $featureCount * $genomeCount);  
     Trace("Beginning BBH load.") if T(2);  
     # Now we loop through the genomes, generating the data for each one.  
     for my $genomeID (sort keys %{$genomeHash}) {  
         $loadIsBidirectionalBestHitOf->Add("genomeIn");  
         Trace("Processing features for genome $genomeID.") if T(3);  
         # Get the feature list for this genome.  
         my $features = $fig->all_features_detailed($genomeID);  
         # Loop through the features.  
         for my $featureData (@{$features}) {  
             # Split the tuple.  
             my ($featureID, $locations, $aliases, $type) = @{$featureData};  
             # Get the bi-directional best hits.  
             my @bbhList = $fig->bbhs($featureID);  
             for my $bbhEntry (@bbhList) {  
                 # Get the target feature ID and the score.  
                 my ($targetID, $score) = @{$bbhEntry};  
                 # Check the target feature's genome.  
                 my $targetGenomeID = $fig->genome_of($targetID);  
                 # Only proceed if it's one of our genomes.  
                 if ($genomeHash->{$targetGenomeID}) {  
                     $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,  
                                                        $score);  
                 }  
             }  
632          }          }
633      }      }
634      # Finish the loads.      # Finish the loads.
# Line 586  Line 638 
638    
639  =head3 LoadSubsystemData  =head3 LoadSubsystemData
640    
641  C<< my $stats = $spl->LoadSubsystemData(); >>      my $stats = $spl->LoadSubsystemData();
642    
643  Load the subsystem data from FIG into Sprout.  Load the subsystem data from FIG into Sprout.
644    
# Line 599  Line 651 
651  The following relations are loaded by this method.  The following relations are loaded by this method.
652    
653      Subsystem      Subsystem
654        SubsystemClass
655      Role      Role
656      RoleEC      RoleEC
657        IsIdentifiedByEC
658      SSCell      SSCell
659      ContainsFeature      ContainsFeature
660      IsGenomeOf      IsGenomeOf
# Line 615  Line 669 
669      GenomeSubset      GenomeSubset
670      HasGenomeSubset      HasGenomeSubset
671      Catalyzes      Catalyzes
672        Diagram
673        RoleOccursIn
674    
675  =over 4  =over 4
676    
# Line 637  Line 693 
693      # Get the subsystem hash. This lists the subsystems we'll process.      # Get the subsystem hash. This lists the subsystems we'll process.
694      my $subsysHash = $self->{subsystems};      my $subsysHash = $self->{subsystems};
695      my @subsysIDs = sort keys %{$subsysHash};      my @subsysIDs = sort keys %{$subsysHash};
696      my $subsysCount = @subsysIDs;      # Get the map list.
697      my $genomeCount = (keys %{$genomeHash});      my @maps = $fig->all_maps;
     my $featureCount = $genomeCount * 4000;  
698      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
699      my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);      my $loadDiagram = $self->_TableLoader('Diagram');
700      my $loadRole = $self->_TableLoader('Role', $featureCount * 6);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn');
701      my $loadRoleEC = $self->_TableLoader('RoleEC', $featureCount * 6);      my $loadSubsystem = $self->_TableLoader('Subsystem');
702      my $loadCatalyzes = $self->_TableLoader('Catalyzes', $genomeCount * $featureCount);      my $loadRole = $self->_TableLoader('Role');
703      my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);      my $loadRoleEC = $self->_TableLoader('RoleEC');
704      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);      my $loadIsIdentifiedByEC = $self->_TableLoader('IsIdentifiedByEC');
705      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);      my $loadCatalyzes = $self->_TableLoader('Catalyzes');
706      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount);      my $loadSSCell = $self->_TableLoader('SSCell');
707      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature');
708      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf');
709      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf');
710      my $loadRoleSubset = $self->_TableLoader('RoleSubset', $subsysCount * 50);      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem');
711      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $subsysCount * 50);      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn');
712      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $featureCount * $genomeCount);      my $loadHasSSCell = $self->_TableLoader('HasSSCell');
713      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $featureCount * $genomeCount);      my $loadRoleSubset = $self->_TableLoader('RoleSubset');
714      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $subsysCount * 50);      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset');
715      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $subsysCount * 50);      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles');
716      Trace("Beginning subsystem data load.") if T(2);      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes');
717      # This hash will contain the role for each EC. When we're done, this      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');
718        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');
719        my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');
720        if ($self->{options}->{loadOnly}) {
721            Trace("Loading from existing files.") if T(2);
722        } else {
723            Trace("Generating subsystem data.") if T(2);
724            # This hash will contain the roles for each EC. When we're done, this
725      # information will be used to generate the Catalyzes table.      # information will be used to generate the Catalyzes table.
726      my %ecToRoles = ();      my %ecToRoles = ();
727      # Loop through the subsystems. Our first task will be to create the      # Loop through the subsystems. Our first task will be to create the
# Line 670  Line 732 
732      my ($genomeID, $roleID);      my ($genomeID, $roleID);
733      my %roleData = ();      my %roleData = ();
734      for my $subsysID (@subsysIDs) {      for my $subsysID (@subsysIDs) {
         Trace("Creating subsystem $subsysID.") if T(3);  
         $loadSubsystem->Add("subsystemIn");  
735          # Get the subsystem object.          # Get the subsystem object.
736          my $sub = $fig->get_subsystem($subsysID);          my $sub = $fig->get_subsystem($subsysID);
737                # Only proceed if the subsystem has a spreadsheet.
738                if (defined($sub) && ! $sub->{empty_ss}) {
739                    Trace("Creating subsystem $subsysID.") if T(3);
740                    $loadSubsystem->Add("subsystemIn");
741          # Create the subsystem record.          # Create the subsystem record.
742          my $curator = $sub->get_curator();          my $curator = $sub->get_curator();
743          my $notes = $sub->get_notes();          my $notes = $sub->get_notes();
744          $loadSubsystem->Put($subsysID, $curator, $notes);          $loadSubsystem->Put($subsysID, $curator, $notes);
745                    # Now for the classification string. This comes back as a list
746                    # reference and we convert it to a space-delimited string.
747                    my $classList = $fig->subsystem_classification($subsysID);
748                    my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
749                    $loadSubsystemClass->Put($subsysID, $classString);
750          # Connect it to its roles. Each role is a column in the subsystem spreadsheet.          # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
751          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
752                        # Get the role's abbreviation.
753                        my $abbr = $sub->get_role_abbr($col);
754              # Connect to this role.              # Connect to this role.
755              $loadOccursInSubsystem->Add("roleIn");              $loadOccursInSubsystem->Add("roleIn");
756              $loadOccursInSubsystem->Put($roleID, $subsysID, $col);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $col);
757              # If it's a new role, add it to the role table.              # If it's a new role, add it to the role table.
758              if (! exists $roleData{$roleID}) {              if (! exists $roleData{$roleID}) {
759                  # Get the role's abbreviation.                  # Get the role's abbreviation.
                 my $abbr = $sub->get_role_abbr($col);  
760                  # Add the role.                  # Add the role.
761                  $loadRole->Put($roleID, $abbr);                          $loadRole->Put($roleID);
762                  $roleData{$roleID} = 1;                  $roleData{$roleID} = 1;
763                  # Check for an EC number.                  # Check for an EC number.
764                  if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) {                          if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
765                      my $ec = $1;                      my $ec = $1;
766                      $loadRoleEC->Put($roleID, $ec);                              $loadIsIdentifiedByEC->Put($roleID, $ec);
767                      $ecToRoles{$ec} = $roleID;                              # Check to see if this is our first encounter with this EC.
768                                if (exists $ecToRoles{$ec}) {
769                                    # No, so just add this role to the EC list.
770                                    push @{$ecToRoles{$ec}}, $roleID;
771                                } else {
772                                    # Output this EC.
773                                    $loadRoleEC->Put($ec);
774                                    # Create its role list.
775                                    $ecToRoles{$ec} = [$roleID];
776                                }
777                  }                  }
778              }              }
779          }          }
# Line 721  Line 800 
800                  # part of the spreadsheet cell ID.                  # part of the spreadsheet cell ID.
801                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
802                      # Get the features in the spreadsheet cell for this genome and role.                      # Get the features in the spreadsheet cell for this genome and role.
803                      my @pegs = $sub->get_pegs_from_cell($row, $col);                              my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col);
804                      # Only proceed if features exist.                      # Only proceed if features exist.
805                      if (@pegs > 0) {                      if (@pegs > 0) {
806                          # Create the spreadsheet cell.                          # Create the spreadsheet cell.
# Line 742  Line 821 
821                  if ($pegCount > 0) {                  if ($pegCount > 0) {
822                      Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);                      Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
823                      $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);                      $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
                     # Partition the PEGs found into clusters.  
                     my @clusters = $fig->compute_clusters(\@pegsFound, $sub);  
824                      # Create a hash mapping PEG IDs to cluster numbers.                      # Create a hash mapping PEG IDs to cluster numbers.
825                      # We default to -1 for all of them.                      # We default to -1 for all of them.
826                      my %clusterOf = map { $_ => -1 } @pegsFound;                      my %clusterOf = map { $_ => -1 } @pegsFound;
827                                # Partition the PEGs found into clusters.
828                                my @clusters = $fig->compute_clusters([keys %clusterOf], $sub);
829                      for (my $i = 0; $i <= $#clusters; $i++) {                      for (my $i = 0; $i <= $#clusters; $i++) {
830                          my $subList = $clusters[$i];                          my $subList = $clusters[$i];
831                          for my $peg (@{$subList}) {                          for my $peg (@{$subList}) {
# Line 774  Line 853 
853              # Connect the subset to the subsystem.              # Connect the subset to the subsystem.
854              $loadHasRoleSubset->Put($subsysID, $actualID);              $loadHasRoleSubset->Put($subsysID, $actualID);
855              # Connect the subset to its roles.              # Connect the subset to its roles.
856              my @roles = $sub->get_subset($subsetID);                      my @roles = $sub->get_subsetC_roles($subsetID);
857              for my $roleID (@roles) {              for my $roleID (@roles) {
858                  $loadConsistsOfRoles->Put($actualID, $roleID);                  $loadConsistsOfRoles->Put($actualID, $roleID);
859              }              }
# Line 794  Line 873 
873              }              }
874          }          }
875      }      }
     # Before we leave, we must create the Catalyzes table. We start with the reactions,  
     # then use the "ecToRoles" table to convert EC numbers to role IDs.  
     my @reactions = $fig->all_reactions();  
     for my $reactionID (@reactions) {  
         # Get this reaction's list of roles. The results will be EC numbers.  
         my @roles = $fig->catalyzed_by($reactionID);  
         # Loop through the roles, creating catalyzation records.  
         for my $thisRole (@roles) {  
             if (exists $ecToRoles{$thisRole}) {  
                 $loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID);  
             }  
         }  
     }  
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
876  }  }
877            # Now we loop through the diagrams. We need to create the diagram records
878  =head3 LoadDiagramData          # and link each diagram to its roles. Note that only roles which occur
879            # in subsystems (and therefore appear in the %ecToRoles hash) are
880  C<< my $stats = $spl->LoadDiagramData(); >>          # included.
881            for my $map (@maps) {
 Load the diagram data from FIG into Sprout.  
   
 Diagrams are used to organize functional roles. The diagram shows the  
 connections between chemicals that interact with a subsystem.  
   
 The following relations are loaded by this method.  
   
     Diagram  
     RoleOccursIn  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadDiagramData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the map list.  
     my @maps = $fig->all_maps;  
     my $mapCount = @maps;  
     my $genomeCount = (keys %{$self->{genomes}});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);  
     my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);  
     Trace("Beginning diagram data load.") if T(2);  
     # Loop through the diagrams.  
     for my $map ($fig->all_maps) {  
882          Trace("Loading diagram $map.") if T(3);          Trace("Loading diagram $map.") if T(3);
883          # Get the diagram's descriptive name.          # Get the diagram's descriptive name.
884          my $name = $fig->map_name($map);          my $name = $fig->map_name($map);
# Line 859  Line 886 
886          # Now we need to link all the map's roles to it.          # Now we need to link all the map's roles to it.
887          # A hash is used to prevent duplicates.          # A hash is used to prevent duplicates.
888          my %roleHash = ();          my %roleHash = ();
889          for my $role ($fig->map_to_ecs($map)) {              for my $ec ($fig->map_to_ecs($map)) {
890                    if (exists $ecToRoles{$ec}) {
891                        for my $role (@{$ecToRoles{$ec}}) {
892              if (! $roleHash{$role}) {              if (! $roleHash{$role}) {
893                  $loadRoleOccursIn->Put($role, $map);                  $loadRoleOccursIn->Put($role, $map);
894                  $roleHash{$role} = 1;                  $roleHash{$role} = 1;
895              }              }
896          }          }
897      }      }
898                }
899            }
900            # Before we leave, we must create the Catalyzes table. We start with the reactions,
901            # then use the "ecToRoles" table to convert EC numbers to role IDs.
902            my @reactions = $fig->all_reactions();
903            for my $reactionID (@reactions) {
904                # Get this reaction's list of roles. The results will be EC numbers.
905                my @ecs = $fig->catalyzed_by($reactionID);
906                # Loop through the roles, creating catalyzation records.
907                for my $thisEC (@ecs) {
908                    if (exists $ecToRoles{$thisEC}) {
909                        for my $thisRole (@{$ecToRoles{$thisEC}}) {
910                            $loadCatalyzes->Put($thisRole, $reactionID);
911                        }
912                    }
913                }
914            }
915        }
916      # Finish the load.      # Finish the load.
917      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
918      return $retVal;      return $retVal;
# Line 873  Line 920 
920    
921  =head3 LoadPropertyData  =head3 LoadPropertyData
922    
923  C<< my $stats = $spl->LoadPropertyData(); >>      my $stats = $spl->LoadPropertyData();
924    
925  Load the attribute data from FIG into Sprout.  Load the attribute data from FIG into Sprout.
926    
# Line 907  Line 954 
954      my $fig = $self->{fig};      my $fig = $self->{fig};
955      # Get the genome hash.      # Get the genome hash.
956      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
957      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
958      my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500);      my $loadProperty = $self->_TableLoader('Property');
959      my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500);      my $loadHasProperty = $self->_TableLoader('HasProperty');
960      Trace("Beginning property data load.") if T(2);      if ($self->{options}->{loadOnly}) {
961            Trace("Loading from existing files.") if T(2);
962        } else {
963            Trace("Generating property data.") if T(2);
964      # Create a hash for storing property IDs.      # Create a hash for storing property IDs.
965      my %propertyKeys = ();      my %propertyKeys = ();
966      my $nextID = 1;      my $nextID = 1;
967            # Get the attributes we intend to store in the property table.
968            my $propKeys = $self->{propKeys};
969      # Loop through the genomes.      # Loop through the genomes.
970      for my $genomeID (keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
971          $loadProperty->Add("genomeIn");          $loadProperty->Add("genomeIn");
972          # Get the genome's features. The feature ID is the first field in the              Trace("Generating properties for $genomeID.") if T(3);
973          # tuples returned by "all_features_detailed". We use "all_features_detailed"              # Initialize a counter.
974          # rather than "all_features" because we want all features regardless of type.              my $propertyCount = 0;
975          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};              # Get the properties for this genome's features.
976          # Loop through the features, creating HasProperty records.              my @attributes = $fig->get_attributes("fig|$genomeID%", $propKeys);
977          for my $fid (@features) {              Trace("Property list built for $genomeID.") if T(3);
978              $loadProperty->Add("featureIn");              # Loop through the results, creating HasProperty records.
979              # Get all attributes for this feature. We do this one feature at a time              for my $attributeData (@attributes) {
980              # to insure we do not get any genome attributes.                  # Pull apart the attribute tuple.
981              my @attributeList = $fig->get_attributes($fid, '', '', '');                  my ($fid, $key, $value, $url) = @{$attributeData};
             # Loop through the attributes.  
             for my $tuple (@attributeList) {  
                 # Get this attribute value's data. Note that we throw away the FID,  
                 # since it will always be the same as the value if "$fid".  
                 my (undef, $key, $value, $url) = @{$tuple};  
982                  # Concatenate the key and value and check the "propertyKeys" hash to                  # Concatenate the key and value and check the "propertyKeys" hash to
983                  # see if we already have an ID for it. We use a tab for the separator                  # see if we already have an ID for it. We use a tab for the separator
984                  # character.                  # character.
# Line 950  Line 996 
996                  # Create the HasProperty entry for this feature/property association.                  # Create the HasProperty entry for this feature/property association.
997                  $loadHasProperty->Put($fid, $propertyID, $url);                  $loadHasProperty->Put($fid, $propertyID, $url);
998              }              }
999                # Update the statistics.
1000                Trace("$propertyCount attributes processed.") if T(3);
1001                $loadHasProperty->Add("propertiesIn", $propertyCount);
1002          }          }
1003      }      }
1004      # Finish the load.      # Finish the load.
# Line 959  Line 1008 
1008    
1009  =head3 LoadAnnotationData  =head3 LoadAnnotationData
1010    
1011  C<< my $stats = $spl->LoadAnnotationData(); >>      my $stats = $spl->LoadAnnotationData();
1012    
1013  Load the annotation data from FIG into Sprout.  Load the annotation data from FIG into Sprout.
1014    
# Line 991  Line 1040 
1040      my $fig = $self->{fig};      my $fig = $self->{fig};
1041      # Get the genome hash.      # Get the genome hash.
1042      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1043      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1044      my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000);      my $loadAnnotation = $self->_TableLoader('Annotation');
1045      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation');
1046      my $loadSproutUser = $self->_TableLoader('SproutUser', 100);      my $loadSproutUser = $self->_TableLoader('SproutUser');
1047      my $loadUserAccess = $self->_TableLoader('UserAccess', 1000);      my $loadUserAccess = $self->_TableLoader('UserAccess');
1048      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation');
1049      Trace("Beginning annotation data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1050            Trace("Loading from existing files.") if T(2);
1051        } else {
1052            Trace("Generating annotation data.") if T(2);
1053      # Create a hash of user names. We'll use this to prevent us from generating duplicate      # Create a hash of user names. We'll use this to prevent us from generating duplicate
1054      # user records.      # user records.
1055      my %users = ( FIG => 1, master => 1 );      my %users = ( FIG => 1, master => 1 );
# Line 1012  Line 1063 
1063      # Loop through the genomes.      # Loop through the genomes.
1064      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
1065          Trace("Processing $genomeID.") if T(3);          Trace("Processing $genomeID.") if T(3);
         # Get the genome's PEGs.  
         my @pegs = $fig->pegs_of($genomeID);  
         for my $peg (@pegs) {  
             Trace("Processing $peg.") if T(4);  
1066              # Create a hash of timestamps. We use this to prevent duplicate time stamps              # Create a hash of timestamps. We use this to prevent duplicate time stamps
1067              # from showing up for a single PEG's annotations.              # from showing up for a single PEG's annotations.
1068              my %seenTimestamps = ();              my %seenTimestamps = ();
1069              # Check for a functional assignment.              # Get the genome's annotations.
1070              my $func = $fig->function_of($peg);              my @annotations = $fig->read_all_annotations($genomeID);
1071              if ($func) {              Trace("Processing annotations.") if T(2);
1072                  # If this is NOT a hypothetical assignment, we create an              for my $tuple (@annotations) {
1073                  # assignment annotation for it.                  # Get the annotation tuple.
1074                  if (! FIG::hypo($peg)) {                  my ($peg, $timestamp, $user, $text) = @{$tuple};
                     # Note that we double the slashes so that what goes into the database is  
                     # a new-line escape sequence rather than an actual new-line.  
                     $loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func");  
                     $loadIsTargetOfAnnotation->Put($peg, "$peg:$time");  
                     $loadMadeAnnotation->Put("FIG", "$peg:$time");  
                     # Denote we've seen this timestamp.  
                     $seenTimestamps{$time} = 1;  
                 }  
             }  
             # Now loop through the real annotations.  
             for my $tuple ($fig->feature_annotations($peg, "raw")) {  
                 my ($fid, $timestamp, $user, $text) = @{$tuple};  
1075                  # Here we fix up the annotation text. "\r" is removed,                  # Here we fix up the annotation text. "\r" is removed,
1076                  # and "\t" and "\n" are escaped. Note we use the "s"                  # and "\t" and "\n" are escaped. Note we use the "gs"
1077                  # modifier so that new-lines inside the text do not                  # modifier so that new-lines inside the text do not
1078                  # stop the substitution search.                  # stop the substitution search.
1079                  $text =~ s/\r//gs;                  $text =~ s/\r//gs;
# Line 1051  Line 1086 
1086                      # Here it's a number. We need to insure the one we use to form                      # Here it's a number. We need to insure the one we use to form
1087                      # the key is unique.                      # the key is unique.
1088                      my $keyStamp = $timestamp;                      my $keyStamp = $timestamp;
1089                      while ($seenTimestamps{$keyStamp}) {                      while ($seenTimestamps{"$peg:$keyStamp"}) {
1090                          $keyStamp++;                          $keyStamp++;
1091                      }                      }
                     $seenTimestamps{$keyStamp} = 1;  
1092                      my $annotationID = "$peg:$keyStamp";                      my $annotationID = "$peg:$keyStamp";
1093                        $seenTimestamps{$annotationID} = 1;
1094                      # Insure the user exists.                      # Insure the user exists.
1095                      if (! $users{$user}) {                      if (! $users{$user}) {
1096                          $loadSproutUser->Put($user, "SEED user");                          $loadSproutUser->Put($user, "SEED user");
# Line 1080  Line 1115 
1115    
1116  =head3 LoadSourceData  =head3 LoadSourceData
1117    
1118  C<< my $stats = $spl->LoadSourceData(); >>      my $stats = $spl->LoadSourceData();
1119    
1120  Load the source data from FIG into Sprout.  Load the source data from FIG into Sprout.
1121    
# Line 1113  Line 1148 
1148      my $fig = $self->{fig};      my $fig = $self->{fig};
1149      # Get the genome hash.      # Get the genome hash.
1150      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1151      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1152      my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4);      my $loadComesFrom = $self->_TableLoader('ComesFrom');
1153      my $loadSource = $self->_TableLoader('Source', $genomeCount * 4);      my $loadSource = $self->_TableLoader('Source');
1154      my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8);      my $loadSourceURL = $self->_TableLoader('SourceURL');
1155      Trace("Beginning source data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1156            Trace("Loading from existing files.") if T(2);
1157        } else {
1158            Trace("Generating annotation data.") if T(2);
1159      # Create hashes to collect the Source information.      # Create hashes to collect the Source information.
1160      my %sourceURL = ();      my %sourceURL = ();
1161      my %sourceDesc = ();      my %sourceDesc = ();
# Line 1148  Line 1185 
1185      for my $sourceID (keys %sourceDesc) {      for my $sourceID (keys %sourceDesc) {
1186          $loadSource->Put($sourceID, $sourceDesc{$sourceID});          $loadSource->Put($sourceID, $sourceDesc{$sourceID});
1187      }      }
1188        }
1189      # Finish the load.      # Finish the load.
1190      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1191      return $retVal;      return $retVal;
# Line 1155  Line 1193 
1193    
1194  =head3 LoadExternalData  =head3 LoadExternalData
1195    
1196  C<< my $stats = $spl->LoadExternalData(); >>      my $stats = $spl->LoadExternalData();
1197    
1198  Load the external data from FIG into Sprout.  Load the external data from FIG into Sprout.
1199    
# Line 1187  Line 1225 
1225      my $fig = $self->{fig};      my $fig = $self->{fig};
1226      # Get the genome hash.      # Get the genome hash.
1227      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1228      # Convert the genome hash. We'll get the genus and species for each genome and make      # Convert the genome hash. We'll get the genus and species for each genome and make
1229      # it the key.      # it the key.
1230      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
1231      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1232      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000);      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
1233      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
1234      Trace("Beginning external data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1235            Trace("Loading from existing files.") if T(2);
1236        } else {
1237            Trace("Generating external data.") if T(2);
1238      # We loop through the files one at a time. First, the organism file.      # We loop through the files one at a time. First, the organism file.
1239      Open(\*ORGS, "<$FIG_Config::global/ext_org.table");          Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |");
1240      my $orgLine;      my $orgLine;
1241      while (defined($orgLine = <ORGS>)) {      while (defined($orgLine = <ORGS>)) {
1242          # Clean the input line.          # Clean the input line.
# Line 1208  Line 1248 
1248      close ORGS;      close ORGS;
1249      # Now the function file.      # Now the function file.
1250      my $funcLine;      my $funcLine;
1251      Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");          Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |");
1252      while (defined($funcLine = <FUNCS>)) {      while (defined($funcLine = <FUNCS>)) {
1253          # Clean the line ending.          # Clean the line ending.
1254          chomp $funcLine;          chomp $funcLine;
# Line 1224  Line 1264 
1264              $loadExternalAliasFunc->Put(@funcFields[0,1]);              $loadExternalAliasFunc->Put(@funcFields[0,1]);
1265          }          }
1266      }      }
1267        }
1268      # Finish the load.      # Finish the load.
1269      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1270      return $retVal;      return $retVal;
# Line 1232  Line 1273 
1273    
1274  =head3 LoadReactionData  =head3 LoadReactionData
1275    
1276  C<< my $stats = $spl->LoadReactionData(); >>      my $stats = $spl->LoadReactionData();
1277    
1278  Load the reaction data from FIG into Sprout.  Load the reaction data from FIG into Sprout.
1279    
# Line 1245  Line 1286 
1286      Compound      Compound
1287      CompoundName      CompoundName
1288      CompoundCAS      CompoundCAS
1289        IsIdentifiedByCAS
1290        HasCompoundName
1291      IsAComponentOf      IsAComponentOf
1292    
1293  This method proceeds reaction by reaction rather than genome by genome.  This method proceeds reaction by reaction rather than genome by genome.
# Line 1264  Line 1307 
1307      my ($self) = @_;      my ($self) = @_;
1308      # Get the FIG object.      # Get the FIG object.
1309      my $fig = $self->{fig};      my $fig = $self->{fig};
     # Get the genome hash.  
     my $genomeHash = $self->{genomes};  
     my $genomeCount = (keys %{$genomeHash});  
1310      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1311      my $loadReaction = $self->_TableLoader('Reaction', $genomeCount * 4000);      my $loadReaction = $self->_TableLoader('Reaction');
1312      my $loadReactionURL = $self->_TableLoader('ReactionURL', $genomeCount * 4000);      my $loadReactionURL = $self->_TableLoader('ReactionURL');
1313      my $loadCompound = $self->_TableLoader('Compound', $genomeCount * 4000);      my $loadCompound = $self->_TableLoader('Compound');
1314      my $loadCompoundName = $self->_TableLoader('CompoundName', $genomeCount * 8000);      my $loadCompoundName = $self->_TableLoader('CompoundName');
1315      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $genomeCount * 4000);      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS');
1316      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $genomeCount * 12000);      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf');
1317      Trace("Beginning reaction/compound data load.") if T(2);      my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS');
1318        my $loadHasCompoundName = $self->_TableLoader('HasCompoundName');
1319        if ($self->{options}->{loadOnly}) {
1320            Trace("Loading from existing files.") if T(2);
1321        } else {
1322            Trace("Generating reaction data.") if T(2);
1323            # We need some hashes to prevent duplicates.
1324            my %compoundNames = ();
1325            my %compoundCASes = ();
1326      # First we create the compounds.      # First we create the compounds.
1327      my @compounds = $fig->all_compounds();      my @compounds = $fig->all_compounds();
1328      for my $cid (@compounds) {      for my $cid (@compounds) {
# Line 1283  Line 1331 
1331          # Each name will be given a priority number, starting with 1.          # Each name will be given a priority number, starting with 1.
1332          my $prio = 1;          my $prio = 1;
1333          for my $name (@names) {          for my $name (@names) {
1334              $loadCompoundName->Put($cid, $name, $prio++);                  if (! exists $compoundNames{$name}) {
1335                        $loadCompoundName->Put($name);
1336                        $compoundNames{$name} = 1;
1337                    }
1338                    $loadHasCompoundName->Put($cid, $name, $prio++);
1339          }          }
1340          # Create the main compound record. Note that the first name          # Create the main compound record. Note that the first name
1341          # becomes the label.          # becomes the label.
# Line 1292  Line 1344 
1344          # Check for a CAS ID.          # Check for a CAS ID.
1345          my $cas = $fig->cas($cid);          my $cas = $fig->cas($cid);
1346          if ($cas) {          if ($cas) {
1347              $loadCompoundCAS->Put($cid, $cas);                  $loadIsIdentifiedByCAS->Put($cid, $cas);
1348                    if (! exists $compoundCASes{$cas}) {
1349                        $loadCompoundCAS->Put($cas);
1350                        $compoundCASes{$cas} = 1;
1351                    }
1352          }          }
1353      }      }
1354      # All the compounds are set up, so we need to loop through the reactions next. First,      # All the compounds are set up, so we need to loop through the reactions next. First,
# Line 1323  Line 1379 
1379              }              }
1380          }          }
1381      }      }
1382        }
1383      # Finish the load.      # Finish the load.
1384      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1385      return $retVal;      return $retVal;
1386  }  }
1387    
1388  =head3 LoadGroupData  =head3 LoadSynonymData
1389    
1390  C<< my $stats = $spl->LoadGroupData(); >>      my $stats = $spl->LoadSynonymData();
1391    
1392  Load the genome Groups into Sprout.  Load the synonym groups into Sprout.
1393    
1394  The following relations are loaded by this method.  The following relations are loaded by this method.
1395    
1396      GenomeGroups      SynonymGroup
1397        IsSynonymGroupFor
1398    
1399  There is no direct support for genome groups in FIG, so we access the SEED  The source information for these relations is taken from the C<maps_to_id> method
1400  files directly.  of the B<FIG> object. Unfortunately, to make this work, we need to use direct
1401    SQL against the FIG database.
1402    
1403  =over 4  =over 4
1404    
# Line 1351  Line 1410 
1410    
1411  =cut  =cut
1412  #: Return Type $%;  #: Return Type $%;
1413  sub LoadGroupData {  sub LoadSynonymData {
1414      # Get this object instance.      # Get this object instance.
1415      my ($self) = @_;      my ($self) = @_;
1416      # Get the FIG object.      # Get the FIG object.
1417      my $fig = $self->{fig};      my $fig = $self->{fig};
1418      # Get the genome hash.      # Get the genome hash.
1419      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1420      # Create a load object for the table we're loading.      # Create a load object for the table we're loading.
1421      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4);      my $loadSynonymGroup = $self->_TableLoader('SynonymGroup');
1422      Trace("Beginning group data load.") if T(2);      my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor');
1423        if ($self->{options}->{loadOnly}) {
1424            Trace("Loading from existing files.") if T(2);
1425        } else {
1426            Trace("Generating synonym group data.") if T(2);
1427            # Get the database handle.
1428            my $dbh = $fig->db_handle();
1429            # Ask for the synonyms. Note that "maps_to" is a group name, and "syn_id" is a PEG ID or alias.
1430            my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
1431            my $result = $sth->execute();
1432            if (! defined($result)) {
1433                Confess("Database error in Synonym load: " . $sth->errstr());
1434            } else {
1435                Trace("Processing synonym results.") if T(2);
1436                # Remember the current synonym.
1437                my $current_syn = "";
1438                # Count the features.
1439                my $featureCount = 0;
1440                my $entryCount = 0;
1441                # Loop through the synonym/peg pairs.
1442                while (my @row = $sth->fetchrow()) {
1443                    # Get the synonym group ID and feature ID.
1444                    my ($syn_id, $peg) = @row;
1445                    # Count this row.
1446                    $entryCount++;
1447                    if ($entryCount % 1000 == 0) {
1448                        Trace("$entryCount rows processed.") if T(3);
1449                    }
1450                    # Insure it's for one of our genomes.
1451                    my $genomeID = FIG::genome_of($peg);
1452                    if (exists $genomeHash->{$genomeID}) {
1453                        # Verify the synonym.
1454                        if ($syn_id ne $current_syn) {
1455                            # It's new, so put it in the group table.
1456                            $loadSynonymGroup->Put($syn_id);
1457                            $current_syn = $syn_id;
1458                        }
1459                        # Connect the synonym to the peg.
1460                        $loadIsSynonymGroupFor->Put($syn_id, $peg);
1461                        # Count this feature.
1462                        $featureCount++;
1463                        if ($featureCount % 1000 == 0) {
1464                            Trace("$featureCount features processed.") if T(3);
1465                        }
1466                    }
1467                }
1468                Trace("$entryCount rows produced $featureCount features.") if T(2);
1469            }
1470        }
1471        # Finish the load.
1472        my $retVal = $self->_FinishAll();
1473        return $retVal;
1474    }
1475    
1476    =head3 LoadFamilyData
1477    
1478        my $stats = $spl->LoadFamilyData();
1479    
1480    Load the protein families into Sprout.
1481    
1482    The following relations are loaded by this method.
1483    
1484        Family
1485        IsFamilyForFeature
1486    
1487    The source information for these relations is taken from the C<families_for_protein>,
1488    C<family_function>, and C<sz_family> methods of the B<FIG> object.
1489    
1490    =over 4
1491    
1492    =item RETURNS
1493    
1494    Returns a statistics object for the loads.
1495    
1496    =back
1497    
1498    =cut
1499    #: Return Type $%;
1500    sub LoadFamilyData {
1501        # Get this object instance.
1502        my ($self) = @_;
1503        # Get the FIG object.
1504        my $fig = $self->{fig};
1505        # Get the genome hash.
1506        my $genomeHash = $self->{genomes};
1507        # Create load objects for the tables we're loading.
1508        my $loadFamily = $self->_TableLoader('Family');
1509        my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature');
1510        if ($self->{options}->{loadOnly}) {
1511            Trace("Loading from existing files.") if T(2);
1512        } else {
1513            Trace("Generating family data.") if T(2);
1514            # Create a hash for the family IDs.
1515            my %familyHash = ();
1516      # Loop through the genomes.      # Loop through the genomes.
1517      my $line;          for my $genomeID (sort keys %{$genomeHash}) {
1518      for my $genomeID (keys %{$genomeHash}) {              Trace("Processing features for $genomeID.") if T(2);
1519          Trace("Processing $genomeID.") if T(3);              # Loop through this genome's PEGs.
1520          # Open the NMPDR group file for this genome.              for my $fid ($fig->all_features($genomeID, "peg")) {
1521          if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&                  $loadIsFamilyForFeature->Add("features", 1);
1522              defined($line = <TMP>)) {                  # Get this feature's families.
1523              # Clean the line ending.                  my @families = $fig->families_for_protein($fid);
1524              chomp $line;                  # Loop through the families, connecting them to the feature.
1525              # Add the group to the table. Note that there can only be one group                  for my $family (@families) {
1526              # per genome.                      $loadIsFamilyForFeature->Put($family, $fid);
1527              $loadGenomeGroups->Put($genomeID, $line);                      # If this is a new family, create a record for it.
1528                        if (! exists $familyHash{$family}) {
1529                            $familyHash{$family} = 1;
1530                            $loadFamily->Add("families", 1);
1531                            my $size = $fig->sz_family($family);
1532                            my $func = $fig->family_function($family);
1533                            $loadFamily->Put($family, $size, $func);
1534                        }
1535                    }
1536                }
1537            }
1538        }
1539        # Finish the load.
1540        my $retVal = $self->_FinishAll();
1541        return $retVal;
1542    }
1543    
1544    =head3 LoadDrugData
1545    
1546        my $stats = $spl->LoadDrugData();
1547    
1548    Load the drug target data into Sprout.
1549    
1550    The following relations are loaded by this method.
1551    
1552        PDB
1553        DocksWith
1554        IsProteinForFeature
1555        Ligand
1556    
1557    The source information for these relations is taken from attributes. The
1558    C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>.
1559    The C<zinc_name> attribute describes the ligands. The C<docking_results>
1560    attribute contains the information for the B<DocksWith> relationship. It is
1561    expected that additional attributes and tables will be added in the future.
1562    
1563    =over 4
1564    
1565    =item RETURNS
1566    
1567    Returns a statistics object for the loads.
1568    
1569    =back
1570    
1571    =cut
1572    #: Return Type $%;
1573    sub LoadDrugData {
1574        # Get this object instance.
1575        my ($self) = @_;
1576        # Get the FIG object.
1577        my $fig = $self->{fig};
1578        # Get the genome hash.
1579        my $genomeHash = $self->{genomes};
1580        # Create load objects for the tables we're loading.
1581        my $loadPDB = $self->_TableLoader('PDB');
1582        my $loadLigand = $self->_TableLoader('Ligand');
1583        my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature');
1584        my $loadDocksWith = $self->_TableLoader('DocksWith');
1585        if ($self->{options}->{loadOnly}) {
1586            Trace("Loading from existing files.") if T(2);
1587        } else {
1588            Trace("Generating drug target data.") if T(2);
1589            # First comes the "DocksWith" relationship. This will give us a list of PDBs.
1590            # We can also encounter PDBs when we process "IsProteinForFeature". To manage
1591            # this process, PDB information is collected in a hash table and then
1592            # unspooled after both relationships are created.
1593            my %pdbHash = ();
1594            Trace("Generating docking data.") if T(2);
1595            # Get all the docking data. This may cause problems if there are too many PDBs,
1596            # at which point we'll need another algorithm. The indicator that this is
1597            # happening will be a timeout error in the next statement.
1598            my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
1599                                                  ['docking_results', $FIG_Config::dockLimit]);
1600            Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
1601            for my $dockData (@dockData) {
1602                # Get the docking data components.
1603                my ($pdbID, $docking_key, @valueData) = @{$dockData};
1604                # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
1605                $pdbID = lc $pdbID;
1606                # Strip off the object type.
1607                $pdbID =~ s/pdb://;
1608                # Extract the ZINC ID from the docking key. Note that there are two possible
1609                # formats.
1610                my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/;
1611                if (! $zinc_id) {
1612                    Trace("Invalid docking result key $docking_key for $pdbID.") if T(0);
1613                    $loadDocksWith->Add("errors");
1614                } else {
1615                    # Get the pieces of the value and parse the energy.
1616                    # Note that we don't care about the rank, since
1617                    # we can sort on the energy level itself in our database.
1618                    my ($energy, $tool, $type) = @valueData;
1619                    my ($rank, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
1620                    # Ignore predicted results.
1621                    if ($type ne "Predicted") {
1622                        # Count this docking result.
1623                        if (! exists $pdbHash{$pdbID}) {
1624                            $pdbHash{$pdbID} = 1;
1625                        } else {
1626                            $pdbHash{$pdbID}++;
1627                        }
1628                        # Write the result to the output.
1629                        $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool,
1630                                            $total, $vanderwaals);
1631                    }
1632                }
1633            }
1634            Trace("Connecting features.") if T(2);
1635            # Loop through the genomes.
1636            for my $genome (sort keys %{$genomeHash}) {
1637                Trace("Generating PDBs for $genome.") if T(3);
1638                # Get all of the PDBs that BLAST against this genome's features.
1639                my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%');
1640                for my $pdbData (@attributeData) {
1641                    # The PDB ID is coded as a subkey.
1642                    if ($pdbData->[1] !~ /PDB::(.+)/i) {
1643                        Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0);
1644                        $loadPDB->Add("errors");
1645                    } else {
1646                        my $pdbID = $1;
1647                        # Insure the PDB is in the hash.
1648                        if (! exists $pdbHash{$pdbID}) {
1649                            $pdbHash{$pdbID} = 0;
1650                        }
1651                        # The score and locations are coded in the attribute value.
1652                        if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
1653                            Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0);
1654                            $loadIsProteinForFeature->Add("errors");
1655                        } else {
1656                            my ($score, $locData) = ($1,$2);
1657                            # The location data may not be present, so we have to start with some
1658                            # defaults and then check.
1659                            my ($start, $end) = (1, 0);
1660                            if ($locData) {
1661                                $locData =~ /(\d+)-(\d+)/;
1662                                $start = $1;
1663                                $end = $2;
1664                            }
1665                            # If we still don't have the end location, compute it from
1666                            # the feature length.
1667                            if (! $end) {
1668                                # Most features have one location, but we do a list iteration
1669                                # just in case.
1670                                my @locations = $fig->feature_location($pdbData->[0]);
1671                                $end = 0;
1672                                for my $loc (@locations) {
1673                                    my $locObject = BasicLocation->new($loc);
1674                                    $end += $locObject->Length;
1675                                }
1676                            }
1677                            # Decode the score.
1678                            my $realScore = FIGRules::DecodeScore($score);
1679                            # Connect the PDB to the feature.
1680                            $loadIsProteinForFeature->Put($pdbID, $pdbData->[0], $start, $realScore, $end);
1681                        }
1682                    }
1683                }
1684            }
1685            # We've got all our PDBs now, so we unspool them from the hash.
1686            Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2);
1687            my $count = 0;
1688            for my $pdbID (sort keys %pdbHash) {
1689                $loadPDB->Put($pdbID, $pdbHash{$pdbID});
1690                $count++;
1691                Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0);
1692            }
1693            # Finally we create the ligand table. This information can be found in the
1694            # zinc_name attribute.
1695            Trace("Loading ligands.") if T(2);
1696            # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
1697            my $last_zinc_id = "";
1698            my $zinc_id = "";
1699            my $done = 0;
1700            while (! $done) {
1701                # Get the next 10000 ligands. We insist that the object ID is greater than
1702                # the last ID we processed.
1703                Trace("Loading batch starting with ZINC:$zinc_id.") if T(3);
1704                my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
1705                                                           ["ZINC:$zinc_id", "zinc_name"]);
1706                Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
1707                if (! @attributeData) {
1708                    # Here there are no attributes left, so we quit the loop.
1709                    $done = 1;
1710                } else {
1711                    # Process the attribute data we've received.
1712                    for my $zinc_data (@attributeData) {
1713                        # The ZINC ID is found in the first return column, prefixed with the word ZINC.
1714                        if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
1715                            $zinc_id = $1;
1716                            # Check for a duplicate.
1717                            if ($zinc_id eq $last_zinc_id) {
1718                                $loadLigand->Add("duplicate");
1719                            } else {
1720                                # Here it's safe to output the ligand. The ligand name is the attribute value
1721                                # (third column in the row).
1722                                $loadLigand->Put($zinc_id, $zinc_data->[2]);
1723                                # Insure we don't try to add this ID again.
1724                                $last_zinc_id = $zinc_id;
1725                            }
1726                        } else {
1727                            Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
1728                            $loadLigand->Add("errors");
1729                        }
1730                    }
1731          }          }
1732          close TMP;          }
1733            Trace("Ligands loaded.") if T(2);
1734      }      }
1735      # Finish the load.      # Finish the load.
1736      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1737      return $retVal;      return $retVal;
1738  }  }
1739    
1740    
1741  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1742    
1743    =head3 SpecialAttribute
1744    
1745        my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader);
1746    
1747    Look for special attributes of a given type. A special attribute is found by comparing one of
1748    the columns of the incoming attribute list to a search pattern. If a match is found, then
1749    a set of columns is put into an output table connected to the specified ID.
1750    
1751    For example, when processing features, the attribute list we look at has three columns: attribute
1752    name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
1753    begins with C<iedb_>. The call signature is therefore
1754    
1755        my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB);
1756    
1757    The pattern is matched against column 0, and if we have a match, then column 2's value is put
1758    to the output along with the specified feature ID.
1759    
1760    =over 4
1761    
1762    =item id
1763    
1764    ID of the object whose special attributes are being loaded. This forms the first column of the
1765    output.
1766    
1767    =item attributes
1768    
1769    Reference to a list of tuples.
1770    
1771    =item idxMatch
1772    
1773    Index in each tuple of the column to be matched against the pattern. If the match is
1774    successful, an output record will be generated.
1775    
1776    =item idxValues
1777    
1778    Reference to a list containing the indexes in each tuple of the columns to be put as
1779    the second column of the output.
1780    
1781    =item pattern
1782    
1783    Pattern to be matched against the specified column. The match will be case-insensitive.
1784    
1785    =item loader
1786    
1787    An object to which each output record will be put. Usually this is an B<ERDBLoad> object,
1788    but technically it could be anything with a C<Put> method.
1789    
1790    =item RETURN
1791    
1792    Returns a count of the matches found.
1793    
1794    =item
1795    
1796    =back
1797    
1798    =cut
1799    
1800    sub SpecialAttribute {
1801        # Get the parameters.
1802        my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_;
1803        # Declare the return variable.
1804        my $retVal = 0;
1805        # Loop through the attribute rows.
1806        for my $row (@{$attributes}) {
1807            # Check for a match.
1808            if ($row->[$idxMatch] =~ m/$pattern/i) {
1809                # We have a match, so output a row. This is a bit tricky, since we may
1810                # be putting out multiple columns of data from the input.
1811                my $value = join(" ", map { $row->[$_] } @{$idxValues});
1812                $loader->Put($id, $value);
1813                $retVal++;
1814            }
1815        }
1816        Trace("$retVal special attributes found for $id and loader " . $loader->RelName() . ".") if T(4) && $retVal;
1817        # Return the number of matches.
1818        return $retVal;
1819    }
1820    
1821  =head3 TableLoader  =head3 TableLoader
1822    
1823  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 1398  Line 1832 
1832    
1833  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
1834    
 =item rowCount (optional)  
   
 Estimated maximum number of rows in the table.  
   
1835  =item RETURN  =item RETURN
1836    
1837  Returns an ERDBLoad object for loading the specified table.  Returns an ERDBLoad object for loading the specified table.
# Line 1412  Line 1842 
1842    
1843  sub _TableLoader {  sub _TableLoader {
1844      # Get the parameters.      # Get the parameters.
1845      my ($self, $tableName, $rowCount) = @_;      my ($self, $tableName) = @_;
1846      # Create the load object.      # Create the load object.
1847      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount);      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly);
1848      # Cache it in the loader list.      # Cache it in the loader list.
1849      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
1850      # Return it to the caller.      # Return it to the caller.
# Line 1448  Line 1878 
1878      my $retVal = Stats->new();      my $retVal = Stats->new();
1879      # Get the loader list.      # Get the loader list.
1880      my $loadList = $self->{loaders};      my $loadList = $self->{loaders};
1881        # Create a hash to hold the statistics objects, keyed on relation name.
1882        my %loaderHash = ();
1883      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
1884      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads more restartable.
1885      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
1886          # Trace the fact that we're cleaning up.          # Get the relation name.
1887          my $relName = $loader->RelName;          my $relName = $loader->RelName;
1888          Trace("Finishing load for $relName.") if T(2);          # Check the ignore flag.
1889            if ($loader->Ignore) {
1890                Trace("Relation $relName not loaded.") if T(2);
1891            } else {
1892                # Here we really need to finish.
1893                Trace("Finishing $relName.") if T(2);
1894          my $stats = $loader->Finish();          my $stats = $loader->Finish();
1895                $loaderHash{$relName} = $stats;
1896            }
1897        }
1898        # Now we loop through again, actually loading the tables. We want to finish before
1899        # loading so that if something goes wrong at this point, all the load files are usable
1900        # and we don't have to redo all that work.
1901        for my $relName (sort keys %loaderHash) {
1902            # Get the statistics for this relation.
1903            my $stats = $loaderHash{$relName};
1904            # Check for a database load.
1905          if ($self->{options}->{dbLoad}) {          if ($self->{options}->{dbLoad}) {
1906              # Here we want to use the load file just created to load the database.              # Here we want to use the load file just created to load the database.
1907              Trace("Loading relation $relName.") if T(2);              Trace("Loading relation $relName.") if T(2);
# Line 1469  Line 1916 
1916      return $retVal;      return $retVal;
1917  }  }
1918    
1919    =head3 GetGenomeAttributes
1920    
1921        my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys);
1922    
1923    Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
1924    attributes for all the features of a genome in a single call, then organizes them into
1925    a hash.
1926    
1927    =over 4
1928    
1929    =item fig
1930    
1931    FIG-like object for accessing attributes.
1932    
1933    =item genomeID
1934    
1935    ID of the genome who's attributes are desired.
1936    
1937    =item fids
1938    
1939    Reference to a list of the feature IDs whose attributes are to be kept.
1940    
1941    =item propKeys
1942    
1943    A list of the keys to retrieve.
1944    
1945    =item RETURN
1946    
1947    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
1948    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
1949    the attribute key, and one or more attribute values.
1950    
1951    =back
1952    
1953    =cut
1954    
1955    sub GetGenomeAttributes {
1956        # Get the parameters.
1957        my ($fig, $genomeID, $fids, $propKeys) = @_;
1958        # Declare the return variable.
1959        my $retVal = {};
1960        # Initialize the hash. This not only enables us to easily determine which FIDs to
1961        # keep, it insures that the caller sees a list reference for every known fid,
1962        # simplifying the logic.
1963        for my $fid (@{$fids}) {
1964            $retVal->{$fid} = [];
1965        }
1966        # Get the attributes. If ev_code_cron is running, we may get a timeout error, so
1967        # an eval is used.
1968        my @aList = ();
1969        eval {
1970            @aList = $fig->get_attributes("fig|$genomeID%", $propKeys);
1971            Trace(scalar(@aList) . " attributes returned for genome $genomeID.") if T(3);
1972        };
1973        # Check for a problem.
1974        if ($@) {
1975            Trace("Retrying attributes for $genomeID due to error: $@") if T(1);
1976            # Our fallback plan is to process the attributes in blocks of 100. This is much slower,
1977            # but allows us to continue processing.
1978            my $nFids = scalar @{$fids};
1979            for (my $i = 0; $i < $nFids; $i += 100) {
1980                # Determine the index of the last feature ID we'll be specifying on this pass.
1981                # Normally it's $i + 99, but if we're close to the end it may be less.
1982                my $end = ($i + 100 > $nFids ? $nFids - 1 : $i + 99);
1983                # Get a slice of the fid list.
1984                my @slice = @{$fids}[$i .. $end];
1985                # Get the relevant attributes.
1986                Trace("Retrieving attributes for fids $i to $end.") if T(3);
1987                my @aShort = $fig->get_attributes(\@slice, $propKeys);
1988                Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(3);
1989                push @aList, @aShort;
1990            }
1991        }
1992        # Now we should have all the interesting attributes in @aList. Populate the hash with
1993        # them.
1994        for my $aListEntry (@aList) {
1995            my $fid = $aListEntry->[0];
1996            if (exists $retVal->{$fid}) {
1997                push @{$retVal->{$fid}}, $aListEntry;
1998            }
1999        }
2000        # Return the result.
2001        return $retVal;
2002    }
2003    
2004    
2005  1;  1;

Legend:
Removed from v.1.20  
changed lines
  Added in v.1.90

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3