[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.7, Tue Sep 13 19:05:20 2005 UTC revision 1.88, Mon Nov 5 22:52:06 2007 UTC
# Line 7  Line 7 
7      use PageBuilder;      use PageBuilder;
8      use ERDBLoad;      use ERDBLoad;
9      use FIG;      use FIG;
10        use FIGRules;
11      use Sprout;      use Sprout;
12      use Stats;      use Stats;
13      use BasicLocation;      use BasicLocation;
14        use HTML;
15        use AliasAnalysis;
16    
17  =head1 Sprout Load Methods  =head1 Sprout Load Methods
18    
# Line 29  Line 32 
32      $stats->Accumulate($spl->LoadFeatureData());      $stats->Accumulate($spl->LoadFeatureData());
33      print $stats->Show();      print $stats->Show();
34    
 This module makes use of the internal Sprout property C<_erdb>.  
   
35  It is worth noting that the FIG object does not need to be a real one. Any object  It is worth noting that the FIG object does not need to be a real one. Any object
36  that implements the FIG methods for data retrieval could be used. So, for example,  that implements the FIG methods for data retrieval could be used. So, for example,
37  this object could be used to copy data from one Sprout database to another, or  this object could be used to copy data from one Sprout database to another, or
# Line 51  Line 52 
52    
53  =head3 new  =head3 new
54    
55  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile); >>  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>
56    
57  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
58  the name of the files containing the list of genomes and subsystems to use.  the name of the files containing the list of genomes and subsystems to use.
# Line 79  Line 80 
80  =item subsysFile  =item subsysFile
81    
82  Either the name of the file containing the list of trusted subsystems or a reference  Either the name of the file containing the list of trusted subsystems or a reference
83  to a list of subsystem names. If nothing is specified, all known subsystems will be  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
84  considered trusted. Only subsystem data related to the trusted subsystems is loaded.  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
85    in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded.
86    
87    =item options
88    
89    Reference to a hash of command-line options.
90    
91  =back  =back
92    
# Line 88  Line 94 
94    
95  sub new {  sub new {
96      # Get the parameters.      # Get the parameters.
97      my ($class, $sprout, $fig, $genomeFile, $subsysFile) = @_;      my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
98      # Load the list of genomes into a hash.      # Create the genome hash.
99      my %genomes;      my %genomes = ();
100        # We only need it if load-only is NOT specified.
101        if (! $options->{loadOnly}) {
102      if (! defined($genomeFile) || $genomeFile eq '') {      if (! defined($genomeFile) || $genomeFile eq '') {
103          # Here we want all the complete genomes and an access code of 1.          # Here we want all the complete genomes and an access code of 1.
104          my @genomeList = $fig->genomes(1);          my @genomeList = $fig->genomes(1);
105          %genomes = map { $_ => 1 } @genomeList;          %genomes = map { $_ => 1 } @genomeList;
106                Trace(scalar(keys %genomes) . " genomes found.") if T(3);
107      } else {      } else {
108          my $type = ref $genomeFile;          my $type = ref $genomeFile;
109          Trace("Genome file parameter type is \"$type\".") if T(3);          Trace("Genome file parameter type is \"$type\".") if T(3);
# Line 114  Line 123 
123                  # an omitted access code can be defaulted to 1.                  # an omitted access code can be defaulted to 1.
124                  for my $genomeLine (@genomeList) {                  for my $genomeLine (@genomeList) {
125                      my ($genomeID, $accessCode) = split("\t", $genomeLine);                      my ($genomeID, $accessCode) = split("\t", $genomeLine);
126                      if (undef $accessCode) {                          if (! defined($accessCode)) {
127                          $accessCode = 1;                          $accessCode = 1;
128                      }                      }
129                      $genomes{$genomeID} = $accessCode;                      $genomes{$genomeID} = $accessCode;
# Line 124  Line 133 
133              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
134          }          }
135      }      }
136        }
137      # Load the list of trusted subsystems.      # Load the list of trusted subsystems.
138      my %subsystems = ();      my %subsystems = ();
139        # We only need it if load-only is NOT specified.
140        if (! $options->{loadOnly}) {
141      if (! defined $subsysFile || $subsysFile eq '') {      if (! defined $subsysFile || $subsysFile eq '') {
142          # Here we want all the subsystems.              # Here we want all the usable subsystems. First we get the whole list.
143          %subsystems = map { $_ => 1 } $fig->all_subsystems();              my @subs = $fig->all_subsystems();
144                # Loop through, checking for the NMPDR file.
145                for my $sub (@subs) {
146                    if ($fig->nmpdr_subsystem($sub)) {
147                        $subsystems{$sub} = 1;
148                    }
149                }
150      } else {      } else {
151          my $type = ref $subsysFile;          my $type = ref $subsysFile;
152          if ($type eq 'ARRAY') {          if ($type eq 'ARRAY') {
# Line 148  Line 166 
166              Confess("Invalid subsystem parameter in SproutLoad constructor.");              Confess("Invalid subsystem parameter in SproutLoad constructor.");
167          }          }
168      }      }
169            # Go through the subsys hash again, creating the keyword list for each subsystem.
170            for my $subsystem (keys %subsystems) {
171                my $name = $subsystem;
172                $name =~ s/_/ /g;
173    #            my $classes = $fig->subsystem_classification($subsystem);
174    #            $name .= " " . join(" ", @{$classes});
175                $subsystems{$subsystem} = $name;
176            }
177        }
178        # Get the list of NMPDR-oriented attribute keys.
179        my @propKeys = $fig->get_group_keys("NMPDR");
180      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
181      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
182      # Create the Sprout load object.      # Create the Sprout load object.
# Line 157  Line 186 
186                    subsystems => \%subsystems,                    subsystems => \%subsystems,
187                    sprout => $sprout,                    sprout => $sprout,
188                    loadDirectory => $directory,                    loadDirectory => $directory,
189                    erdb => $sprout->{_erdb},                    erdb => $sprout,
190                    loaders => []                    loaders => [],
191                      options => $options,
192                      propKeys => \@propKeys,
193                   };                   };
194      # Bless and return it.      # Bless and return it.
195      bless $retVal, $class;      bless $retVal, $class;
196      return $retVal;      return $retVal;
197  }  }
198    
199    =head3 LoadOnly
200    
201    C<< my $flag = $spl->LoadOnly; >>
202    
203    Return TRUE if we are in load-only mode, else FALSE.
204    
205    =cut
206    
207    sub LoadOnly {
208        my ($self) = @_;
209        return $self->{options}->{loadOnly};
210    }
211    
212    
213  =head3 LoadGenomeData  =head3 LoadGenomeData
214    
215  C<< my $stats = $spl->LoadGenomeData(); >>  C<< my $stats = $spl->LoadGenomeData(); >>
# Line 192  Line 237 
237    
238  =back  =back
239    
 B<TO DO>  
   
 Real quality vectors instead of C<unknown> for everything.  
   
 GenomeGroup relation. (The original script took group information from the C<NMPDR> file  
 in each genome's main directory, but no such file exists anywhere in my version of the  
 data store.)  
   
240  =cut  =cut
241  #: Return Type $%;  #: Return Type $%;
242  sub LoadGenomeData {  sub LoadGenomeData {
# Line 210  Line 247 
247      # Get the genome count.      # Get the genome count.
248      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
249      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
     Trace("Beginning genome data load.") if T(2);  
250      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
251      my $loadGenome = $self->_TableLoader('Genome', $genomeCount);      my $loadGenome = $self->_TableLoader('Genome');
252      my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300);      my $loadHasContig = $self->_TableLoader('HasContig');
253      my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300);      my $loadContig = $self->_TableLoader('Contig');
254      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf');
255      my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000);      my $loadSequence = $self->_TableLoader('Sequence');
256        if ($self->{options}->{loadOnly}) {
257            Trace("Loading from existing files.") if T(2);
258        } else {
259            Trace("Generating genome data.") if T(2);
260      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
261      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
262          Trace("Loading data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
263          $loadGenome->Add("genomeIn");          $loadGenome->Add("genomeIn");
264          # The access code comes in via the genome hash.          # The access code comes in via the genome hash.
265          my $accessCode = $genomeHash->{$genomeID};          my $accessCode = $genomeHash->{$genomeID};
266          # Get the genus, species, and strain from the scientific name. Note that we append              # Get the genus, species, and strain from the scientific name.
         # the genome ID to the strain. In some cases this is the totality of the strain name.  
267          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);
268          my $extra = join " ", @extraData, "[$genomeID]";              my $extra = join " ", @extraData;
269          # Get the full taxonomy.          # Get the full taxonomy.
270          my $taxonomy = $fig->taxonomy_of($genomeID);          my $taxonomy = $fig->taxonomy_of($genomeID);
271                # Get the version. If no version is specified, we default to the genome ID by itself.
272                my $version = $fig->genome_version($genomeID);
273                if (! defined($version)) {
274                    $version = $genomeID;
275                }
276                # Get the DNA size.
277                my $dnaSize = $fig->genome_szdna($genomeID);
278                # Open the NMPDR group file for this genome.
279                my $group;
280                if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
281                    defined($group = <TMP>)) {
282                    # Clean the line ending.
283                    chomp $group;
284                } else {
285                    # No group, so use the default.
286                    $group = $FIG_Config::otherGroup;
287                }
288                close TMP;
289          # Output the genome record.          # Output the genome record.
290          $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),
291                           $species, $extra, $taxonomy);                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);
292          # Now we loop through each of the genome's contigs.          # Now we loop through each of the genome's contigs.
293          my @contigs = $fig->all_contigs($genomeID);          my @contigs = $fig->all_contigs($genomeID);
294          for my $contigID (@contigs) {          for my $contigID (@contigs) {
# Line 262  Line 319 
319              }              }
320          }          }
321      }      }
322        }
323      # Finish the loads.      # Finish the loads.
324      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
325      # Return the result.      # Return the result.
326      return $retVal;      return $retVal;
327  }  }
328    
 =head3 LoadCouplingData  
   
 C<< my $stats = $spl->LoadCouplingData(); >>  
   
 Load the coupling and evidence data from FIG into Sprout.  
   
 The coupling data specifies which genome features are functionally coupled. The  
 evidence data explains why the coupling is functional.  
   
 The following relations are loaded by this method.  
   
     Coupling  
     IsEvidencedBy  
     PCH  
     ParticipatesInCoupling  
     UsesAsEvidence  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadCouplingData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the genome hash.  
     my $genomeFilter = $self->{genomes};  
     my $genomeCount = (keys %{$genomeFilter});  
     my $featureCount = $genomeCount * 4000;  
     # Start the loads.  
     my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount);  
     my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000);  
     my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000);  
     my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000);  
     my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000);  
     Trace("Beginning coupling data load.") if T(2);  
     # Loop through the genomes found.  
     for my $genome (sort keys %{$genomeFilter}) {  
         Trace("Generating coupling data for $genome.") if T(3);  
         $loadCoupling->Add("genomeIn");  
         # Create a hash table for holding coupled pairs. We use this to prevent  
         # duplicates. For example, if A is coupled to B, we don't want to also  
         # assert that B is coupled to A, because we already know it. Fortunately,  
         # all couplings occur within a genome, so we can keep the hash table  
         # size reasonably small.  
         my %dupHash = ();  
         # Get all of the genome's PEGs.  
         my @pegs = $fig->pegs_of($genome);  
         # Loop through the PEGs.  
         for my $peg1 (@pegs) {  
             $loadCoupling->Add("pegIn");  
             Trace("Processing PEG $peg1 for $genome.") if T(4);  
             # Get a list of the coupled PEGs.  
             my @couplings = $fig->coupled_to($peg1);  
             # For each coupled PEG, we need to verify that a coupling already  
             # exists. If not, we have to create one.  
             for my $coupleData (@couplings) {  
                 my ($peg2, $score) = @{$coupleData};  
                 # Compute the coupling ID.  
                 my $coupleID = Sprout::CouplingID($peg1, $peg2);  
                 if (! exists $dupHash{$coupleID}) {  
                     $loadCoupling->Add("couplingIn");  
                     # Here we have a new coupling to store in the load files.  
                     Trace("Storing coupling ($coupleID) with score $score.") if T(4);  
                     # Ensure we don't do this again.  
                     $dupHash{$coupleID} = $score;  
                     # Write the coupling record.  
                     $loadCoupling->Put($coupleID, $score);  
                     # Connect it to the coupled PEGs.  
                     $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);  
                     $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);  
                     # Get the evidence for this coupling.  
                     my @evidence = $fig->coupling_evidence($peg1, $peg2);  
                     # Organize the evidence into a hash table.  
                     my %evidenceMap = ();  
                     # Process each evidence item.  
                     for my $evidenceData (@evidence) {  
                         $loadPCH->Add("evidenceIn");  
                         my ($peg3, $peg4, $usage) = @{$evidenceData};  
                         # Only proceed if the evidence is from a Sprout  
                         # genome.  
                         if ($genomeFilter->{$fig->genome_of($peg3)}) {  
                             $loadUsesAsEvidence->Add("evidenceChosen");  
                             my $evidenceKey = "$coupleID $peg3 $peg4";  
                             # We store this evidence in the hash if the usage  
                             # is nonzero or no prior evidence has been found. This  
                             # insures that if there is duplicate evidence, we  
                             # at least keep the meaningful ones. Only evidence is  
                             # the hash makes it to the output.  
                             if ($usage || ! exists $evidenceMap{$evidenceKey}) {  
                                 $evidenceMap{$evidenceKey} = $evidenceData;  
                             }  
                         }  
                     }  
                     for my $evidenceID (keys %evidenceMap) {  
                         # Create the evidence record.  
                         my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};  
                         $loadPCH->Put($evidenceID, $usage);  
                         # Connect it to the coupling.  
                         $loadIsEvidencedBy->Put($coupleID, $evidenceID);  
                         # Connect it to the features.  
                         $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);  
                         $loadUsesAsEvidence->Put($evidenceID, $peg4, 1);  
                     }  
                 }  
             }  
         }  
     }  
     # All done. Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
329  =head3 LoadFeatureData  =head3 LoadFeatureData
330    
331  C<< my $stats = $spl->LoadFeatureData(); >>  C<< my $stats = $spl->LoadFeatureData(); >>
# Line 400  Line 338 
338    
339      Feature      Feature
340      FeatureAlias      FeatureAlias
341        IsAliasOf
342      FeatureLink      FeatureLink
343      FeatureTranslation      FeatureTranslation
344      FeatureUpstream      FeatureUpstream
345      IsLocatedIn      IsLocatedIn
346        HasFeature
347        HasRoleInSubsystem
348        FeatureEssential
349        FeatureVirulent
350        FeatureIEDB
351        CDD
352        IsPresentOnProteinOf
353    
354  =over 4  =over 4
355    
# Line 418  Line 364 
364  sub LoadFeatureData {  sub LoadFeatureData {
365      # Get this object instance.      # Get this object instance.
366      my ($self) = @_;      my ($self) = @_;
367      # Get the FIG object.      # Get the FIG and Sprout objects.
368      my $fig = $self->{fig};      my $fig = $self->{fig};
369        my $sprout = $self->{sprout};
370      # Get the table of genome IDs.      # Get the table of genome IDs.
371      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
372      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
373      my $loadFeature = $self->_TableLoader('Feature', $featureCount);      my $loadFeature = $self->_TableLoader('Feature');
374      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn');
375      my $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
376      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);      my $loadIsAliasOf = $self->_TableLoader('IsAliasOf');
377      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
378      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
379        my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
380        my $loadHasFeature = $self->_TableLoader('HasFeature');
381        my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem');
382        my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
383        my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
384        my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
385        my $loadCDD = $self->_TableLoader('CDD');
386        my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');
387        # Get the subsystem hash.
388        my $subHash = $self->{subsystems};
389        # Get the property keys.
390        my $propKeys = $self->{propKeys};
391        # Create a hashes to hold CDD and alias values.
392        my %CDD = ();
393        my %alias = ();
394      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
395      # locations.      # locations.
396      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
397      Trace("Beginning feature data load.") if T(2);      if ($self->{options}->{loadOnly}) {
398            Trace("Loading from existing files.") if T(2);
399        } else {
400            Trace("Generating feature data.") if T(2);
401      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
402      for my $genomeID (sort keys %{$genomeHash}) {          my @allGenomes = sort keys %{$genomeHash};
403            Trace(scalar(@allGenomes) . " genomes found in list.") if T(3);
404            for my $genomeID (@allGenomes) {
405          Trace("Loading features for genome $genomeID.") if T(3);          Trace("Loading features for genome $genomeID.") if T(3);
406          $loadFeature->Add("genomeIn");          $loadFeature->Add("genomeIn");
407          # Get the feature list for this genome.          # Get the feature list for this genome.
408          my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
409                # Sort and count the list.
410                my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
411                my $count = scalar @featureTuples;
412                my @fids = map { $_->[0] } @featureTuples;
413                Trace("$count features found for genome $genomeID.") if T(3);
414                # Get the attributes for this genome and put them in a hash by feature ID.
415                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys);
416                Trace("Looping through features for $genomeID.") if T(3);
417                # Set up for our duplicate-feature check.
418                my $oldFeatureID = "";
419          # Loop through the features.          # Loop through the features.
420          for my $featureData (@{$features}) {              for my $featureTuple (@featureTuples) {
             $loadFeature->Add("featureIn");  
421              # Split the tuple.              # Split the tuple.
422              my ($featureID, $locations, $aliases, $type) = @{$featureData};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
423              # Create the feature record.                  # Check for duplicates.
424              $loadFeature->Put($featureID, 1, $type);                  if ($featureID eq $oldFeatureID) {
425                        Trace("Duplicate feature $featureID found.") if T(1);
426                    } else {
427                        $oldFeatureID = $featureID;
428                        # Count this feature.
429                        $loadFeature->Add("featureIn");
430                        # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
431                        # Sprout database requires a single character.
432                        if (! defined($quality) || $quality eq "") {
433                            $quality = " ";
434                        }
435                        # Begin building the keywords. We start with the genome ID, the
436                        # feature ID, the taxonomy, and the organism name.
437                        my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
438                                        $fig->taxonomy_of($genomeID));
439              # Create the aliases.              # Create the aliases.
440              for my $alias (split /\s*,\s*/, $aliases) {                      for my $alias ($fig->feature_aliases($featureID)) {
441                  $loadFeatureAlias->Put($featureID, $alias);                          #Connect this alias to this feature.
442              }                          $loadIsAliasOf->Put($alias, $featureID);
443                            push @keywords, $alias;
444                            # If this is a locus tag, also add its natural form as a keyword.
445                            my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
446                            if ($naturalName) {
447                                push @keywords, $naturalName;
448                            }
449                            # If this is the first time for the specified alias, create its
450                            # alias record.
451                            if (! exists $alias{$alias}) {
452                                $loadFeatureAlias->Put($alias);
453                                $alias{$alias} = 1;
454                            }
455                        }
456                        Trace("Assignment for $featureID is: $assignment") if T(4);
457                        # Break the assignment into words and shove it onto the
458                        # keyword list.
459                        push @keywords, split(/\s+/, $assignment);
460                        # Link this feature to the parent genome.
461                        $loadHasFeature->Put($genomeID, $featureID, $type);
462              # Get the links.              # Get the links.
463              my @links = $fig->fid_links($featureID);              my @links = $fig->fid_links($featureID);
464              for my $link (@links) {              for my $link (@links) {
# Line 470  Line 477 
477                      $loadFeatureUpstream->Put($featureID, $upstream);                      $loadFeatureUpstream->Put($featureID, $upstream);
478                  }                  }
479              }              }
480                        # Now we need to find the subsystems this feature participates in.
481                        # We also add the subsystems to the keyword list. Before we do that,
482                        # we must convert underscores to spaces.
483                        my @subsystems = $fig->peg_to_subsystems($featureID);
484                        for my $subsystem (@subsystems) {
485                            # Only proceed if we like this subsystem.
486                            if (exists $subHash->{$subsystem}) {
487                                # Store the has-role link.
488                                $loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type);
489                                # Save the subsystem's keyword data.
490                                my $subKeywords = $subHash->{$subsystem};
491                                push @keywords, split /\s+/, $subKeywords;
492                                # Now we need to get this feature's role in the subsystem.
493                                my $subObject = $fig->get_subsystem($subsystem);
494                                my @roleColumns = $subObject->get_peg_roles($featureID);
495                                my @allRoles = $subObject->get_roles();
496                                for my $col (@roleColumns) {
497                                    my $role = $allRoles[$col];
498                                    push @keywords, split /\s+/, $role;
499                                    push @keywords, $subObject->get_role_abbr($col);
500                                }
501                            }
502                        }
503                        # There are three special attributes computed from property
504                        # data that we build next. If the special attribute is non-empty,
505                        # its name will be added to the keyword list. First, we get all
506                        # the attributes for this feature. They will come back as
507                        # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
508                        # [name, value, value with URL]. (We don't need the PEG, since
509                        # we already know it.)
510                        my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
511                                             @{$attributes->{$featureID}};
512                        # Now we process each of the special attributes.
513                        if (SpecialAttribute($featureID, \@attributes,
514                                             1, [0,2], '^(essential|potential_essential)$',
515                                             $loadFeatureEssential)) {
516                            push @keywords, 'essential';
517                            $loadFeature->Add('essential');
518                        }
519                        if (SpecialAttribute($featureID, \@attributes,
520                                             0, [2], '^virulen',
521                                             $loadFeatureVirulent)) {
522                            push @keywords, 'virulent';
523                            $loadFeature->Add('virulent');
524                        }
525                        if (SpecialAttribute($featureID, \@attributes,
526                                             0, [0,2], '^iedb_',
527                                             $loadFeatureIEDB)) {
528                            push @keywords, 'iedb';
529                            $loadFeature->Add('iedb');
530                        }
531                        # Now we have some other attributes we need to process. Currently,
532                        # this is CDD and CELLO, but we expect the number to increase.
533                        my %attributeHash = ();
534                        for my $attrRow (@{$attributes->{$featureID}}) {
535                            my (undef, $key, @values) = @{$attrRow};
536                            $key =~ /^([^:]+)::(.+)/;
537                            if (exists $attributeHash{$1}) {
538                                $attributeHash{$1}->{$2} = \@values;
539                            } else {
540                                $attributeHash{$1} = {$2 => \@values};
541                            }
542                        }
543                        my $celloValue = "unknown";
544                        # Pull in the CELLO attribute. There will never be more than one.
545                        # If we have one, it's a feature attribute AND a keyword.
546                        my @celloData = keys %{$attributeHash{CELLO}};
547                        if (@celloData) {
548                            $celloValue = $celloData[0];
549                            push @keywords, $celloValue;
550                        }
551                        # Now we handle CDD. This is a bit more complicated, because
552                        # there are multiple CDDs per protein.
553                        if (exists $attributeHash{CDD}) {
554                            # Get the hash of CDD IDs to scores for this feature. We
555                            # already know it exists because of the above IF.
556                            my $cddHash = $attributeHash{CDD};
557                            my @cddData = sort keys %{$cddHash};
558                            for my $cdd (@cddData) {
559                                # Extract the score for this CDD and decode it.
560                                my ($codeScore) = split(/\s*,\s*/, $cddHash->{$cdd}->[0]);
561                                my $realScore = FIGRules::DecodeScore($codeScore);
562                                # Create the connection.
563                                $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
564                                # If this CDD does not yet exist, create its record.
565                                if (! exists $CDD{$cdd}) {
566                                    $CDD{$cdd} = 1;
567                                    $loadCDD->Put($cdd);
568                                }
569                            }
570                        }
571                        # Now we need to bust up hyphenated words in the keyword
572                        # list. We keep them separate and put them at the end so
573                        # the original word order is available.
574                        my $keywordString = "";
575                        my $bustedString = "";
576                        for my $keyword (@keywords) {
577                            if (length $keyword >= 3) {
578                                $keywordString .= " $keyword";
579                                if ($keyword =~ /-/) {
580                                    my @words = split /-/, $keyword;
581                                    $bustedString .= join(" ", "", @words);
582                                }
583                            }
584                        }
585                        $keywordString .= $bustedString;
586                        # Get rid of annoying punctuation.
587                        $keywordString =~ s/[();]//g;
588                        # Clean the keyword list.
589                        my $cleanWords = $sprout->CleanKeywords($keywordString);
590                        Trace("Keyword string for $featureID: $cleanWords") if T(4);
591                        # Now we need to process the feature's locations. First, we split them up.
592                        my @locationList = split /\s*,\s*/, $locations;
593                        # Next, we convert them to Sprout location objects.
594                        my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
595              # This part is the roughest. We need to relate the features to contig              # This part is the roughest. We need to relate the features to contig
596              # locations, and the locations must be split so that none of them exceed              # locations, and the locations must be split so that none of them exceed
597              # the maximum segment size. This simplifies the genes_in_region processing              # the maximum segment size. This simplifies the genes_in_region processing
598              # for Sprout.                      # for Sprout. To start, we create the location position indicator.
599              my @locationList = split /\s*,\s*/, $locations;                      my $i = 1;
600              # Loop through the locations.              # Loop through the locations.
601              for my $location (@locationList) {                      for my $locObject (@locObjectList) {
602                  # Parse the location.                          # Split this location into a list of chunks.
                 my $locObject = BasicLocation->new($location);  
                 # Split it into a list of chunks.  
603                  my @locOList = ();                  my @locOList = ();
604                  while (my $peeling = $locObject->Peel($chunkSize)) {                  while (my $peeling = $locObject->Peel($chunkSize)) {
605                      $loadIsLocatedIn->Add("peeling");                      $loadIsLocatedIn->Add("peeling");
# Line 488  Line 608 
608                  push @locOList, $locObject;                  push @locOList, $locObject;
609                  # Loop through the chunks, creating IsLocatedIn records. The variable                  # Loop through the chunks, creating IsLocatedIn records. The variable
610                  # "$i" will be used to keep the location index.                  # "$i" will be used to keep the location index.
                 my $i = 1;  
611                  for my $locChunk (@locOList) {                  for my $locChunk (@locOList) {
612                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,
613                                            $locChunk->Dir, $locChunk->Length, $i);                                            $locChunk->Dir, $locChunk->Length, $i);
614                      $i++;                      $i++;
615                  }                  }
616              }              }
617                        # Finally, reassemble the location objects into a list of Sprout location strings.
618                        $locations = join(", ", map { $_->String } @locObjectList);
619                        # Create the feature record.
620                        $loadFeature->Put($featureID, 1, $user, $quality, $celloValue, $type, $assignment, $cleanWords, $locations);
621          }          }
622      }      }
623      # Finish the loads.              Trace("Genome $genomeID processed.") if T(3);
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
 =head3 LoadBBHData  
   
 C<< my $stats = $spl->LoadBBHData(); >>  
   
 Load the bidirectional best hit data from FIG into Sprout.  
   
 Sprout does not store information on similarities. Instead, it has only the  
 bi-directional best hits. Even so, the BBH table is one of the largest in  
 the database.  
   
 The following relations are loaded by this method.  
   
     IsBidirectionalBestHitOf  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadBBHData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the table of genome IDs.  
     my $genomeHash = $self->{genomes};  
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf',  
                                                            $featureCount * $genomeCount);  
     Trace("Beginning BBH load.") if T(2);  
     # Now we loop through the genomes, generating the data for each one.  
     for my $genomeID (sort keys %{$genomeHash}) {  
         $loadIsBidirectionalBestHitOf->Add("genomeIn");  
         Trace("Processing features for genome $genomeID.") if T(3);  
         # Get the feature list for this genome.  
         my $features = $fig->all_features_detailed($genomeID);  
         # Loop through the features.  
         for my $featureData (@{$features}) {  
             # Split the tuple.  
             my ($featureID, $locations, $aliases, $type) = @{$featureData};  
             # Get the bi-directional best hits.  
             my @bbhList = $fig->bbhs($featureID);  
             for my $bbhEntry (@bbhList) {  
                 # Get the target feature ID and the score.  
                 my ($targetID, $score) = @{$bbhEntry};  
                 # Check the target feature's genome.  
                 my $targetGenomeID = $fig->genome_of($targetID);  
                 # Only proceed if it's one of our genomes.  
                 if ($genomeHash->{$targetGenomeID}) {  
                     $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,  
                                                        $score);  
                 }  
             }  
624          }          }
625      }      }
626      # Finish the loads.      # Finish the loads.
# Line 584  Line 643 
643  The following relations are loaded by this method.  The following relations are loaded by this method.
644    
645      Subsystem      Subsystem
646        SubsystemClass
647      Role      Role
648        RoleEC
649        IsIdentifiedByEC
650      SSCell      SSCell
651      ContainsFeature      ContainsFeature
652      IsGenomeOf      IsGenomeOf
# Line 592  Line 654 
654      OccursInSubsystem      OccursInSubsystem
655      ParticipatesIn      ParticipatesIn
656      HasSSCell      HasSSCell
657        ConsistsOfRoles
658        RoleSubset
659        HasRoleSubset
660        ConsistsOfGenomes
661        GenomeSubset
662        HasGenomeSubset
663        Catalyzes
664        Diagram
665        RoleOccursIn
666    
667  =over 4  =over 4
668    
# Line 601  Line 672 
672    
673  =back  =back
674    
 B<TO DO>  
   
 Generate RoleName table?  
   
675  =cut  =cut
676  #: Return Type $%;  #: Return Type $%;
677  sub LoadSubsystemData {  sub LoadSubsystemData {
# Line 618  Line 685 
685      # Get the subsystem hash. This lists the subsystems we'll process.      # Get the subsystem hash. This lists the subsystems we'll process.
686      my $subsysHash = $self->{subsystems};      my $subsysHash = $self->{subsystems};
687      my @subsysIDs = sort keys %{$subsysHash};      my @subsysIDs = sort keys %{$subsysHash};
688      my $subsysCount = @subsysIDs;      # Get the map list.
689      my $genomeCount = (keys %{$genomeHash});      my @maps = $fig->all_maps;
     my $featureCount = $genomeCount * 4000;  
690      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
691      my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);      my $loadDiagram = $self->_TableLoader('Diagram');
692      my $loadRole = $self->_TableLoader('Role', $featureCount * 6);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn');
693      my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);      my $loadSubsystem = $self->_TableLoader('Subsystem');
694      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);      my $loadRole = $self->_TableLoader('Role');
695      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);      my $loadRoleEC = $self->_TableLoader('RoleEC');
696      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount);      my $loadIsIdentifiedByEC = $self->_TableLoader('IsIdentifiedByEC');
697      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);      my $loadCatalyzes = $self->_TableLoader('Catalyzes');
698      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);      my $loadSSCell = $self->_TableLoader('SSCell');
699      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature');
700      Trace("Beginning subsystem data load.") if T(2);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf');
701        my $loadIsRoleOf = $self->_TableLoader('IsRoleOf');
702        my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem');
703        my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn');
704        my $loadHasSSCell = $self->_TableLoader('HasSSCell');
705        my $loadRoleSubset = $self->_TableLoader('RoleSubset');
706        my $loadGenomeSubset = $self->_TableLoader('GenomeSubset');
707        my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles');
708        my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes');
709        my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');
710        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');
711        my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');
712        if ($self->{options}->{loadOnly}) {
713            Trace("Loading from existing files.") if T(2);
714        } else {
715            Trace("Generating subsystem data.") if T(2);
716            # This hash will contain the roles for each EC. When we're done, this
717            # information will be used to generate the Catalyzes table.
718            my %ecToRoles = ();
719      # Loop through the subsystems. Our first task will be to create the      # Loop through the subsystems. Our first task will be to create the
720      # roles. We do this by looping through the subsystems and creating a      # roles. We do this by looping through the subsystems and creating a
721      # role hash. The hash tracks each role ID so that we don't create      # role hash. The hash tracks each role ID so that we don't create
722      # duplicates. As we move along, we'll connect the roles and subsystems.          # duplicates. As we move along, we'll connect the roles and subsystems
723            # and memorize up the reactions.
724            my ($genomeID, $roleID);
725      my %roleData = ();      my %roleData = ();
726      for my $subsysID (@subsysIDs) {      for my $subsysID (@subsysIDs) {
727                # Get the subsystem object.
728                my $sub = $fig->get_subsystem($subsysID);
729                # Only proceed if the subsystem has a spreadsheet.
730                if (defined($sub) && ! $sub->{empty_ss}) {
731          Trace("Creating subsystem $subsysID.") if T(3);          Trace("Creating subsystem $subsysID.") if T(3);
732          $loadSubsystem->Add("subsystemIn");          $loadSubsystem->Add("subsystemIn");
733          # Create the subsystem record.          # Create the subsystem record.
734          $loadSubsystem->Put($subsysID);                  my $curator = $sub->get_curator();
735          # Get the subsystem's roles.                  my $notes = $sub->get_notes();
736          my @roles = $fig->subsystem_to_roles($subsysID);                  $loadSubsystem->Put($subsysID, $curator, $notes);
737          # Connect the roles to the subsystem. If a role is new, we create                  # Now for the classification string. This comes back as a list
738          # a role record for it.                  # reference and we convert it to a space-delimited string.
739          for my $roleID (@roles) {                  my $classList = $fig->subsystem_classification($subsysID);
740                    my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
741                    $loadSubsystemClass->Put($subsysID, $classString);
742                    # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
743                    for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
744                        # Get the role's abbreviation.
745                        my $abbr = $sub->get_role_abbr($col);
746                        # Connect to this role.
747              $loadOccursInSubsystem->Add("roleIn");              $loadOccursInSubsystem->Add("roleIn");
748              $loadOccursInSubsystem->Put($roleID, $subsysID);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $col);
749                        # If it's a new role, add it to the role table.
750              if (! exists $roleData{$roleID}) {              if (! exists $roleData{$roleID}) {
751                            # Get the role's abbreviation.
752                            # Add the role.
753                  $loadRole->Put($roleID);                  $loadRole->Put($roleID);
754                  $roleData{$roleID} = 1;                  $roleData{$roleID} = 1;
755                            # Check for an EC number.
756                            if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
757                                my $ec = $1;
758                                $loadIsIdentifiedByEC->Put($roleID, $ec);
759                                # Check to see if this is our first encounter with this EC.
760                                if (exists $ecToRoles{$ec}) {
761                                    # No, so just add this role to the EC list.
762                                    push @{$ecToRoles{$ec}}, $roleID;
763                                } else {
764                                    # Output this EC.
765                                    $loadRoleEC->Put($ec);
766                                    # Create its role list.
767                                    $ecToRoles{$ec} = [$roleID];
768                                }
769              }              }
770          }          }
771          # Now all roles for this subsystem have been filled in. We create the                  }
772          # spreadsheet by matches roles to genomes. To do this, we need to                  # Now we create the spreadsheet for the subsystem by matching roles to
773          # get the genomes on the sheet.                  # genomes. Each genome is a row and each role is a column. We may need
774                    # to actually create the roles as we find them.
775          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
776          my @genomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)};                  for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) {
777          for my $genomeID (@genomes) {                      # Only proceed if this is one of our genomes.
             # Only process this genome if it's one of ours.  
778              if (exists $genomeHash->{$genomeID}) {              if (exists $genomeHash->{$genomeID}) {
779                  # Connect the genome to the subsystem.                          # Count the PEGs and cells found for verification purposes.
780                  $loadParticipatesIn->Put($genomeID, $subsysID);                          my $pegCount = 0;
781                            my $cellCount = 0;
782                            # Create a list for the PEGs we find. This list will be used
783                            # to generate cluster numbers.
784                            my @pegsFound = ();
785                            # Create a hash that maps spreadsheet IDs to PEGs. We will
786                            # use this to generate the ContainsFeature data after we have
787                            # the cluster numbers.
788                            my %cellPegs = ();
789                            # Get the genome's variant code for this subsystem.
790                            my $variantCode = $sub->get_variant_code($row);
791                  # Loop through the subsystem's roles. We use an index because it is                  # Loop through the subsystem's roles. We use an index because it is
792                  # part of the spreadsheet cell ID.                  # part of the spreadsheet cell ID.
793                  for (my $i = 0; $i <= $#roles; $i++) {                          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
                     my $role = $roles[$i];  
794                      # Get the features in the spreadsheet cell for this genome and role.                      # Get the features in the spreadsheet cell for this genome and role.
795                      my @pegs = $fig->pegs_in_subsystem_cell($subsysID, $genomeID, $i);                              my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col);
796                      # Only proceed if features exist.                      # Only proceed if features exist.
797                      if (@pegs > 0) {                      if (@pegs > 0) {
798                          # Create the spreadsheet cell.                          # Create the spreadsheet cell.
799                          my $cellID = "$subsysID:$genomeID:$i";                                  $cellCount++;
800                                    my $cellID = "$subsysID:$genomeID:$col";
801                          $loadSSCell->Put($cellID);                          $loadSSCell->Put($cellID);
802                          $loadIsGenomeOf->Put($genomeID, $cellID);                          $loadIsGenomeOf->Put($genomeID, $cellID);
803                          $loadIsRoleOf->Put($role, $cellID);                                  $loadIsRoleOf->Put($roleID, $cellID);
804                          $loadHasSSCell->Put($subsysID, $cellID);                          $loadHasSSCell->Put($subsysID, $cellID);
805                          # Attach the features to it.                                  # Remember its features.
806                          for my $pegID (@pegs) {                                  push @pegsFound, @pegs;
807                              $loadContainsFeature->Put($cellID, $pegID);                                  $cellPegs{$cellID} = \@pegs;
808                                    $pegCount += @pegs;
809                                }
810                            }
811                            # If we found some cells for this genome, we need to compute clusters and
812                            # denote it participates in the subsystem.
813                            if ($pegCount > 0) {
814                                Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
815                                $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
816                                # Create a hash mapping PEG IDs to cluster numbers.
817                                # We default to -1 for all of them.
818                                my %clusterOf = map { $_ => -1 } @pegsFound;
819                                # Partition the PEGs found into clusters.
820                                my @clusters = $fig->compute_clusters([keys %clusterOf], $sub);
821                                for (my $i = 0; $i <= $#clusters; $i++) {
822                                    my $subList = $clusters[$i];
823                                    for my $peg (@{$subList}) {
824                                        $clusterOf{$peg} = $i;
825                                    }
826                                }
827                                # Create the ContainsFeature data.
828                                for my $cellID (keys %cellPegs) {
829                                    my $cellList = $cellPegs{$cellID};
830                                    for my $cellPeg (@$cellList) {
831                                        $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg});
832                          }                          }
833                      }                      }
834                  }                  }
835              }              }
836          }          }
837                    # Now we need to generate the subsets. The subset names must be concatenated to
838                    # the subsystem name to make them unique keys. There are two types of subsets:
839                    # genome subsets and role subsets. We do the role subsets first.
840                    my @subsetNames = $sub->get_subset_names();
841                    for my $subsetID (@subsetNames) {
842                        # Create the subset record.
843                        my $actualID = "$subsysID:$subsetID";
844                        $loadRoleSubset->Put($actualID);
845                        # Connect the subset to the subsystem.
846                        $loadHasRoleSubset->Put($subsysID, $actualID);
847                        # Connect the subset to its roles.
848                        my @roles = $sub->get_subsetC_roles($subsetID);
849                        for my $roleID (@roles) {
850                            $loadConsistsOfRoles->Put($actualID, $roleID);
851      }      }
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
852  }  }
853                    # Next the genome subsets.
854  =head3 LoadDiagramData                  @subsetNames = $sub->get_subset_namesR();
855                    for my $subsetID (@subsetNames) {
856  C<< my $stats = $spl->LoadDiagramData(); >>                      # Create the subset record.
857                        my $actualID = "$subsysID:$subsetID";
858  Load the diagram data from FIG into Sprout.                      $loadGenomeSubset->Put($actualID);
859                        # Connect the subset to the subsystem.
860  Diagrams are used to organize functional roles. The diagram shows the                      $loadHasGenomeSubset->Put($subsysID, $actualID);
861  connections between chemicals that interact with a subsystem.                      # Connect the subset to its genomes.
862                        my @genomes = $sub->get_subsetR($subsetID);
863  The following relations are loaded by this method.                      for my $genomeID (@genomes) {
864                            $loadConsistsOfGenomes->Put($actualID, $genomeID);
865      Diagram                      }
866      RoleOccursIn                  }
867                }
868  =over 4          }
869            # Now we loop through the diagrams. We need to create the diagram records
870  =item RETURNS          # and link each diagram to its roles. Note that only roles which occur
871            # in subsystems (and therefore appear in the %ecToRoles hash) are
872  Returns a statistics object for the loads.          # included.
873            for my $map (@maps) {
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadDiagramData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the map list.  
     my @maps = $fig->all_maps;  
     my $mapCount = @maps;  
     my $genomeCount = (keys %{$self->{genomes}});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);  
     my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);  
     Trace("Beginning diagram data load.") if T(2);  
     # Loop through the diagrams.  
     for my $map ($fig->all_maps) {  
874          Trace("Loading diagram $map.") if T(3);          Trace("Loading diagram $map.") if T(3);
875          # Get the diagram's descriptive name.          # Get the diagram's descriptive name.
876          my $name = $fig->map_name($map);          my $name = $fig->map_name($map);
# Line 739  Line 878 
878          # Now we need to link all the map's roles to it.          # Now we need to link all the map's roles to it.
879          # A hash is used to prevent duplicates.          # A hash is used to prevent duplicates.
880          my %roleHash = ();          my %roleHash = ();
881          for my $role ($fig->map_to_ecs($map)) {              for my $ec ($fig->map_to_ecs($map)) {
882                    if (exists $ecToRoles{$ec}) {
883                        for my $role (@{$ecToRoles{$ec}}) {
884              if (! $roleHash{$role}) {              if (! $roleHash{$role}) {
885                  $loadRoleOccursIn->Put($role, $map);                  $loadRoleOccursIn->Put($role, $map);
886                  $roleHash{$role} = 1;                  $roleHash{$role} = 1;
887              }              }
888          }          }
889      }      }
890                }
891            }
892            # Before we leave, we must create the Catalyzes table. We start with the reactions,
893            # then use the "ecToRoles" table to convert EC numbers to role IDs.
894            my @reactions = $fig->all_reactions();
895            for my $reactionID (@reactions) {
896                # Get this reaction's list of roles. The results will be EC numbers.
897                my @ecs = $fig->catalyzed_by($reactionID);
898                # Loop through the roles, creating catalyzation records.
899                for my $thisEC (@ecs) {
900                    if (exists $ecToRoles{$thisEC}) {
901                        for my $thisRole (@{$ecToRoles{$thisEC}}) {
902                            $loadCatalyzes->Put($thisRole, $reactionID);
903                        }
904                    }
905                }
906            }
907        }
908      # Finish the load.      # Finish the load.
909      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
910      return $retVal;      return $retVal;
# Line 787  Line 946 
946      my $fig = $self->{fig};      my $fig = $self->{fig};
947      # Get the genome hash.      # Get the genome hash.
948      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
949      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
950      my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500);      my $loadProperty = $self->_TableLoader('Property');
951      my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500);      my $loadHasProperty = $self->_TableLoader('HasProperty');
952      Trace("Beginning property data load.") if T(2);      if ($self->{options}->{loadOnly}) {
953            Trace("Loading from existing files.") if T(2);
954        } else {
955            Trace("Generating property data.") if T(2);
956      # Create a hash for storing property IDs.      # Create a hash for storing property IDs.
957      my %propertyKeys = ();      my %propertyKeys = ();
958      my $nextID = 1;      my $nextID = 1;
959            # Get the attributes we intend to store in the property table.
960            my $propKeys = $self->{propKeys};
961      # Loop through the genomes.      # Loop through the genomes.
962      for my $genomeID (keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
963          $loadProperty->Add("genomeIn");          $loadProperty->Add("genomeIn");
964          # Get the genome's features. The feature ID is the first field in the              Trace("Generating properties for $genomeID.") if T(3);
965          # tuples returned by "all_features_detailed". We use "all_features_detailed"              # Initialize a counter.
966          # rather than "all_features" because we want all features regardless of type.              my $propertyCount = 0;
967          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};              # Get the properties for this genome's features.
968          # Loop through the features, creating HasProperty records.              my @attributes = $fig->get_attributes("fig|$genomeID%", $propKeys);
969          for my $fid (@features) {              Trace("Property list built for $genomeID.") if T(3);
970              $loadProperty->Add("featureIn");              # Loop through the results, creating HasProperty records.
971              # Get all attributes for this feature. We do this one feature at a time              for my $attributeData (@attributes) {
972              # to insure we do not get any genome attributes.                  # Pull apart the attribute tuple.
973              my @attributeList = $fig->get_attributes($fid, '', '', '');                  my ($fid, $key, $value, $url) = @{$attributeData};
             # Loop through the attributes.  
             for my $tuple (@attributeList) {  
                 # Get this attribute value's data. Note that we throw away the FID,  
                 # since it will always be the same as the value if "$fid".  
                 my (undef, $key, $value, $url) = @{$tuple};  
974                  # Concatenate the key and value and check the "propertyKeys" hash to                  # Concatenate the key and value and check the "propertyKeys" hash to
975                  # see if we already have an ID for it. We use a tab for the separator                  # see if we already have an ID for it. We use a tab for the separator
976                  # character.                  # character.
# Line 830  Line 988 
988                  # Create the HasProperty entry for this feature/property association.                  # Create the HasProperty entry for this feature/property association.
989                  $loadHasProperty->Put($fid, $propertyID, $url);                  $loadHasProperty->Put($fid, $propertyID, $url);
990              }              }
991                # Update the statistics.
992                Trace("$propertyCount attributes processed.") if T(3);
993                $loadHasProperty->Add("propertiesIn", $propertyCount);
994          }          }
995      }      }
996      # Finish the load.      # Finish the load.
# Line 871  Line 1032 
1032      my $fig = $self->{fig};      my $fig = $self->{fig};
1033      # Get the genome hash.      # Get the genome hash.
1034      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1035      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1036      my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000);      my $loadAnnotation = $self->_TableLoader('Annotation');
1037      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation');
1038      my $loadSproutUser = $self->_TableLoader('SproutUser', 100);      my $loadSproutUser = $self->_TableLoader('SproutUser');
1039      my $loadUserAccess = $self->_TableLoader('UserAccess', 1000);      my $loadUserAccess = $self->_TableLoader('UserAccess');
1040      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation');
1041      Trace("Beginning annotation data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1042            Trace("Loading from existing files.") if T(2);
1043        } else {
1044            Trace("Generating annotation data.") if T(2);
1045      # Create a hash of user names. We'll use this to prevent us from generating duplicate      # Create a hash of user names. We'll use this to prevent us from generating duplicate
1046      # user records.      # user records.
1047      my %users = ( FIG => 1, master => 1 );      my %users = ( FIG => 1, master => 1 );
# Line 892  Line 1055 
1055      # Loop through the genomes.      # Loop through the genomes.
1056      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
1057          Trace("Processing $genomeID.") if T(3);          Trace("Processing $genomeID.") if T(3);
         # Get the genome's PEGs.  
         my @pegs = $fig->pegs_of($genomeID);  
         for my $peg (@pegs) {  
             Trace("Processing $peg.") if T(4);  
1058              # Create a hash of timestamps. We use this to prevent duplicate time stamps              # Create a hash of timestamps. We use this to prevent duplicate time stamps
1059              # from showing up for a single PEG's annotations.              # from showing up for a single PEG's annotations.
1060              my %seenTimestamps = ();              my %seenTimestamps = ();
1061              # Check for a functional assignment.              # Get the genome's annotations.
1062              my $func = $fig->function_of($peg);              my @annotations = $fig->read_all_annotations($genomeID);
1063              if ($func) {              Trace("Processing annotations.") if T(2);
1064                  # If this is NOT a hypothetical assignment, we create an              for my $tuple (@annotations) {
1065                  # assignment annotation for it.                  # Get the annotation tuple.
1066                  if (! FIG::hypo($peg)) {                  my ($peg, $timestamp, $user, $text) = @{$tuple};
                     # Note that we double the slashes so that what goes into the database is  
                     # a new-line escape sequence rather than an actual new-line.  
                     $loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func");  
                     $loadIsTargetOfAnnotation->Put($peg, "$peg:$time");  
                     $loadMadeAnnotation->Put("FIG", "$peg:$time");  
                     # Denote we've seen this timestamp.  
                     $seenTimestamps{$time} = 1;  
                 }  
                 # Now loop through the real annotations.  
                 for my $tuple ($fig->feature_annotations($peg, "raw")) {  
                     my ($fid, $timestamp, $user, $text) = @{$tuple};  
1067                      # Here we fix up the annotation text. "\r" is removed,                      # Here we fix up the annotation text. "\r" is removed,
1068                      # and "\t" and "\n" are escaped. Note we use the "s"                  # and "\t" and "\n" are escaped. Note we use the "gs"
1069                      # modifier so that new-lines inside the text do not                      # modifier so that new-lines inside the text do not
1070                      # stop the substitution search.                      # stop the substitution search.
1071                      $text =~ s/\r//gs;                      $text =~ s/\r//gs;
# Line 927  Line 1075 
1075                      $text =~ s/Set master function/Set FIG function/s;                      $text =~ s/Set master function/Set FIG function/s;
1076                      # Insure the time stamp is valid.                      # Insure the time stamp is valid.
1077                      if ($timestamp =~ /^\d+$/) {                      if ($timestamp =~ /^\d+$/) {
1078                          # Here it's a number. We need to insure it's unique.                      # Here it's a number. We need to insure the one we use to form
1079                          while ($seenTimestamps{$timestamp}) {                      # the key is unique.
1080                              $timestamp++;                      my $keyStamp = $timestamp;
1081                        while ($seenTimestamps{"$peg:$keyStamp"}) {
1082                            $keyStamp++;
1083                          }                          }
1084                          $seenTimestamps{$timestamp} = 1;                      my $annotationID = "$peg:$keyStamp";
1085                          my $annotationID = "$peg:$timestamp";                      $seenTimestamps{$annotationID} = 1;
1086                          # Insure the user exists.                          # Insure the user exists.
1087                          if (! $users{$user}) {                          if (! $users{$user}) {
1088                              $loadSproutUser->Put($user, "SEED user");                              $loadSproutUser->Put($user, "SEED user");
# Line 940  Line 1090 
1090                              $users{$user} = 1;                              $users{$user} = 1;
1091                          }                          }
1092                          # Generate the annotation.                          # Generate the annotation.
1093                          $loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text");                      $loadAnnotation->Put($annotationID, $timestamp, $text);
1094                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);
1095                          $loadMadeAnnotation->Put($user, $annotationID);                          $loadMadeAnnotation->Put($user, $annotationID);
1096                      } else {                      } else {
# Line 950  Line 1100 
1100                  }                  }
1101              }              }
1102          }          }
     }  
1103      # Finish the load.      # Finish the load.
1104      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1105      return $retVal;      return $retVal;
# Line 991  Line 1140 
1140      my $fig = $self->{fig};      my $fig = $self->{fig};
1141      # Get the genome hash.      # Get the genome hash.
1142      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1143      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1144      my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4);      my $loadComesFrom = $self->_TableLoader('ComesFrom');
1145      my $loadSource = $self->_TableLoader('Source', $genomeCount * 4);      my $loadSource = $self->_TableLoader('Source');
1146      my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8);      my $loadSourceURL = $self->_TableLoader('SourceURL');
1147      Trace("Beginning source data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1148            Trace("Loading from existing files.") if T(2);
1149        } else {
1150            Trace("Generating annotation data.") if T(2);
1151      # Create hashes to collect the Source information.      # Create hashes to collect the Source information.
1152      my %sourceURL = ();      my %sourceURL = ();
1153      my %sourceDesc = ();      my %sourceDesc = ();
# Line 1010  Line 1161 
1161              chomp $line;              chomp $line;
1162              my($sourceID, $desc, $url) = split(/\t/,$line);              my($sourceID, $desc, $url) = split(/\t/,$line);
1163              $loadComesFrom->Put($genomeID, $sourceID);              $loadComesFrom->Put($genomeID, $sourceID);
1164              if ($url && ! exists $sourceURL{$genomeID}) {                  if ($url && ! exists $sourceURL{$sourceID}) {
1165                  $loadSourceURL->Put($sourceID, $url);                  $loadSourceURL->Put($sourceID, $url);
1166                  $sourceURL{$sourceID} = 1;                  $sourceURL{$sourceID} = 1;
1167              }              }
1168              if ($desc && ! exists $sourceDesc{$sourceID}) {                  if ($desc) {
1169                  $loadSource->Put($sourceID, $desc);                      $sourceDesc{$sourceID} = $desc;
1170                  $sourceDesc{$sourceID} = 1;                  } elsif (! exists $sourceDesc{$sourceID}) {
1171                        $sourceDesc{$sourceID} = $sourceID;
1172              }              }
1173          }          }
1174          close TMP;          close TMP;
1175      }      }
1176            # Write the source descriptions.
1177            for my $sourceID (keys %sourceDesc) {
1178                $loadSource->Put($sourceID, $sourceDesc{$sourceID});
1179            }
1180        }
1181      # Finish the load.      # Finish the load.
1182      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1183      return $retVal;      return $retVal;
# Line 1060  Line 1217 
1217      my $fig = $self->{fig};      my $fig = $self->{fig};
1218      # Get the genome hash.      # Get the genome hash.
1219      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1220      # Convert the genome hash. We'll get the genus and species for each genome and make      # Convert the genome hash. We'll get the genus and species for each genome and make
1221      # it the key.      # it the key.
1222      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
1223      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1224      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000);      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
1225      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
1226      Trace("Beginning external data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1227            Trace("Loading from existing files.") if T(2);
1228        } else {
1229            Trace("Generating external data.") if T(2);
1230      # We loop through the files one at a time. First, the organism file.      # We loop through the files one at a time. First, the organism file.
1231      Open(\*ORGS, "<$FIG_Config::global/ext_org.table");          Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |");
1232      my $orgLine;      my $orgLine;
1233      while (defined($orgLine = <ORGS>)) {      while (defined($orgLine = <ORGS>)) {
1234          # Clean the input line.          # Clean the input line.
# Line 1081  Line 1240 
1240      close ORGS;      close ORGS;
1241      # Now the function file.      # Now the function file.
1242      my $funcLine;      my $funcLine;
1243      Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");          Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |");
1244      while (defined($funcLine = <FUNCS>)) {      while (defined($funcLine = <FUNCS>)) {
1245          # Clean the line ending.          # Clean the line ending.
1246          chomp $funcLine;          chomp $funcLine;
# Line 1097  Line 1256 
1256              $loadExternalAliasFunc->Put(@funcFields[0,1]);              $loadExternalAliasFunc->Put(@funcFields[0,1]);
1257          }          }
1258      }      }
1259        }
1260      # Finish the load.      # Finish the load.
1261      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1262      return $retVal;      return $retVal;
1263  }  }
1264    
 =head3 LoadGroupData  
1265    
1266  C<< my $stats = $spl->LoadGroupData(); >>  =head3 LoadReactionData
1267    
1268    C<< my $stats = $spl->LoadReactionData(); >>
1269    
1270  Load the genome Groups into Sprout.  Load the reaction data from FIG into Sprout.
1271    
1272    Reaction data connects reactions to the compounds that participate in them.
1273    
1274  The following relations are loaded by this method.  The following relations are loaded by this method.
1275    
1276      GenomeGroups      Reaction
1277        ReactionURL
1278        Compound
1279        CompoundName
1280        CompoundCAS
1281        IsIdentifiedByCAS
1282        HasCompoundName
1283        IsAComponentOf
1284    
1285  There is no direct support for genome groups in FIG, so we access the SEED  This method proceeds reaction by reaction rather than genome by genome.
 files directly.  
1286    
1287  =over 4  =over 4
1288    
# Line 1125  Line 1294 
1294    
1295  =cut  =cut
1296  #: Return Type $%;  #: Return Type $%;
1297  sub LoadGroupData {  sub LoadReactionData {
1298        # Get this object instance.
1299        my ($self) = @_;
1300        # Get the FIG object.
1301        my $fig = $self->{fig};
1302        # Create load objects for each of the tables we're loading.
1303        my $loadReaction = $self->_TableLoader('Reaction');
1304        my $loadReactionURL = $self->_TableLoader('ReactionURL');
1305        my $loadCompound = $self->_TableLoader('Compound');
1306        my $loadCompoundName = $self->_TableLoader('CompoundName');
1307        my $loadCompoundCAS = $self->_TableLoader('CompoundCAS');
1308        my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf');
1309        my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS');
1310        my $loadHasCompoundName = $self->_TableLoader('HasCompoundName');
1311        if ($self->{options}->{loadOnly}) {
1312            Trace("Loading from existing files.") if T(2);
1313        } else {
1314            Trace("Generating reaction data.") if T(2);
1315            # We need some hashes to prevent duplicates.
1316            my %compoundNames = ();
1317            my %compoundCASes = ();
1318            # First we create the compounds.
1319            my @compounds = $fig->all_compounds();
1320            for my $cid (@compounds) {
1321                # Check for names.
1322                my @names = $fig->names_of_compound($cid);
1323                # Each name will be given a priority number, starting with 1.
1324                my $prio = 1;
1325                for my $name (@names) {
1326                    if (! exists $compoundNames{$name}) {
1327                        $loadCompoundName->Put($name);
1328                        $compoundNames{$name} = 1;
1329                    }
1330                    $loadHasCompoundName->Put($cid, $name, $prio++);
1331                }
1332                # Create the main compound record. Note that the first name
1333                # becomes the label.
1334                my $label = (@names > 0 ? $names[0] : $cid);
1335                $loadCompound->Put($cid, $label);
1336                # Check for a CAS ID.
1337                my $cas = $fig->cas($cid);
1338                if ($cas) {
1339                    $loadIsIdentifiedByCAS->Put($cid, $cas);
1340                    if (! exists $compoundCASes{$cas}) {
1341                        $loadCompoundCAS->Put($cas);
1342                        $compoundCASes{$cas} = 1;
1343                    }
1344                }
1345            }
1346            # All the compounds are set up, so we need to loop through the reactions next. First,
1347            # we initialize the discriminator index. This is a single integer used to insure
1348            # duplicate elements in a reaction are not accidentally collapsed.
1349            my $discrim = 0;
1350            my @reactions = $fig->all_reactions();
1351            for my $reactionID (@reactions) {
1352                # Create the reaction record.
1353                $loadReaction->Put($reactionID, $fig->reversible($reactionID));
1354                # Compute the reaction's URL.
1355                my $url = HTML::reaction_link($reactionID);
1356                # Put it in the ReactionURL table.
1357                $loadReactionURL->Put($reactionID, $url);
1358                # Now we need all of the reaction's compounds. We get these in two phases,
1359                # substrates first and then products.
1360                for my $product (0, 1) {
1361                    # Get the compounds of the current type for the current reaction. FIG will
1362                    # give us 3-tuples: [ID, stoichiometry, main-flag]. At this time we do not
1363                    # have location data in SEED, so it defaults to the empty string.
1364                    my @compounds = $fig->reaction2comp($reactionID, $product);
1365                    for my $compData (@compounds) {
1366                        # Extract the compound data from the current tuple.
1367                        my ($cid, $stoich, $main) = @{$compData};
1368                        # Link the compound to the reaction.
1369                        $loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main,
1370                                                 $product, $stoich);
1371                    }
1372                }
1373            }
1374        }
1375        # Finish the load.
1376        my $retVal = $self->_FinishAll();
1377        return $retVal;
1378    }
1379    
1380    =head3 LoadSynonymData
1381    
1382    C<< my $stats = $spl->LoadSynonymData(); >>
1383    
1384    Load the synonym groups into Sprout.
1385    
1386    The following relations are loaded by this method.
1387    
1388        SynonymGroup
1389        IsSynonymGroupFor
1390    
1391    The source information for these relations is taken from the C<maps_to_id> method
1392    of the B<FIG> object. Unfortunately, to make this work, we need to use direct
1393    SQL against the FIG database.
1394    
1395    =over 4
1396    
1397    =item RETURNS
1398    
1399    Returns a statistics object for the loads.
1400    
1401    =back
1402    
1403    =cut
1404    #: Return Type $%;
1405    sub LoadSynonymData {
1406      # Get this object instance.      # Get this object instance.
1407      my ($self) = @_;      my ($self) = @_;
1408      # Get the FIG object.      # Get the FIG object.
1409      my $fig = $self->{fig};      my $fig = $self->{fig};
1410      # Get the genome hash.      # Get the genome hash.
1411      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1412      # Create a load object for the table we're loading.      # Create a load object for the table we're loading.
1413      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4);      my $loadSynonymGroup = $self->_TableLoader('SynonymGroup');
1414      Trace("Beginning group data load.") if T(2);      my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor');
1415        if ($self->{options}->{loadOnly}) {
1416            Trace("Loading from existing files.") if T(2);
1417        } else {
1418            Trace("Generating synonym group data.") if T(2);
1419            # Get the database handle.
1420            my $dbh = $fig->db_handle();
1421            # Ask for the synonyms. Note that "maps_to" is a group name, and "syn_id" is a PEG ID or alias.
1422            my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
1423            my $result = $sth->execute();
1424            if (! defined($result)) {
1425                Confess("Database error in Synonym load: " . $sth->errstr());
1426            } else {
1427                # Remember the current synonym.
1428                my $current_syn = "";
1429                # Count the features.
1430                my $featureCount = 0;
1431                # Loop through the synonym/peg pairs.
1432                while (my @row = $sth->fetchrow()) {
1433                    # Get the synonym group ID and feature ID.
1434                    my ($syn_id, $peg) = @row;
1435                    # Insure it's for one of our genomes.
1436                    my $genomeID = FIG::genome_of($peg);
1437                    if (exists $genomeHash->{$genomeID}) {
1438                        # Verify the synonym.
1439                        if ($syn_id ne $current_syn) {
1440                            # It's new, so put it in the group table.
1441                            $loadSynonymGroup->Put($syn_id);
1442                            $current_syn = $syn_id;
1443                        }
1444                        # Connect the synonym to the peg.
1445                        $loadIsSynonymGroupFor->Put($syn_id, $peg);
1446                        # Count this feature.
1447                        $featureCount++;
1448                        if ($featureCount % 1000 == 0) {
1449                            Trace("$featureCount features processed.") if T(3);
1450                        }
1451                    }
1452                }
1453            }
1454        }
1455        # Finish the load.
1456        my $retVal = $self->_FinishAll();
1457        return $retVal;
1458    }
1459    
1460    =head3 LoadFamilyData
1461    
1462    C<< my $stats = $spl->LoadFamilyData(); >>
1463    
1464    Load the protein families into Sprout.
1465    
1466    The following relations are loaded by this method.
1467    
1468        Family
1469        IsFamilyForFeature
1470    
1471    The source information for these relations is taken from the C<families_for_protein>,
1472    C<family_function>, and C<sz_family> methods of the B<FIG> object.
1473    
1474    =over 4
1475    
1476    =item RETURNS
1477    
1478    Returns a statistics object for the loads.
1479    
1480    =back
1481    
1482    =cut
1483    #: Return Type $%;
1484    sub LoadFamilyData {
1485        # Get this object instance.
1486        my ($self) = @_;
1487        # Get the FIG object.
1488        my $fig = $self->{fig};
1489        # Get the genome hash.
1490        my $genomeHash = $self->{genomes};
1491        # Create load objects for the tables we're loading.
1492        my $loadFamily = $self->_TableLoader('Family');
1493        my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature');
1494        if ($self->{options}->{loadOnly}) {
1495            Trace("Loading from existing files.") if T(2);
1496        } else {
1497            Trace("Generating family data.") if T(2);
1498            # Create a hash for the family IDs.
1499            my %familyHash = ();
1500      # Loop through the genomes.      # Loop through the genomes.
1501      my $line;          for my $genomeID (sort keys %{$genomeHash}) {
1502      for my $genomeID (keys %{$genomeHash}) {              Trace("Processing features for $genomeID.") if T(2);
1503          Trace("Processing $genomeID.") if T(3);              # Loop through this genome's PEGs.
1504          # Open the NMPDR group file for this genome.              for my $fid ($fig->all_features($genomeID, "peg")) {
1505          if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&                  $loadIsFamilyForFeature->Add("features", 1);
1506              defined($line = <TMP>)) {                  # Get this feature's families.
1507              # Clean the line ending.                  my @families = $fig->families_for_protein($fid);
1508              chomp $line;                  # Loop through the families, connecting them to the feature.
1509              # Add the group to the table. Note that there can only be one group                  for my $family (@families) {
1510              # per genome.                      $loadIsFamilyForFeature->Put($family, $fid);
1511              $loadGenomeGroups->Put($genomeID, $line);                      # If this is a new family, create a record for it.
1512                        if (! exists $familyHash{$family}) {
1513                            $familyHash{$family} = 1;
1514                            $loadFamily->Add("families", 1);
1515                            my $size = $fig->sz_family($family);
1516                            my $func = $fig->family_function($family);
1517                            $loadFamily->Put($family, $size, $func);
1518                        }
1519                    }
1520                }
1521          }          }
         close TMP;  
1522      }      }
1523      # Finish the load.      # Finish the load.
1524      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1525      return $retVal;      return $retVal;
1526  }  }
1527    
1528    =head3 LoadDrugData
1529    
1530    C<< my $stats = $spl->LoadDrugData(); >>
1531    
1532    Load the drug target data into Sprout.
1533    
1534    The following relations are loaded by this method.
1535    
1536        PDB
1537        DocksWith
1538        IsProteinForFeature
1539        Ligand
1540    
1541    The source information for these relations is taken from attributes. The
1542    C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>.
1543    The C<zinc_name> attribute describes the ligands. The C<docking_results>
1544    attribute contains the information for the B<DocksWith> relationship. It is
1545    expected that additional attributes and tables will be added in the future.
1546    
1547    =over 4
1548    
1549    =item RETURNS
1550    
1551    Returns a statistics object for the loads.
1552    
1553    =back
1554    
1555    =cut
1556    #: Return Type $%;
1557    sub LoadDrugData {
1558        # Get this object instance.
1559        my ($self) = @_;
1560        # Get the FIG object.
1561        my $fig = $self->{fig};
1562        # Get the genome hash.
1563        my $genomeHash = $self->{genomes};
1564        # Create load objects for the tables we're loading.
1565        my $loadPDB = $self->_TableLoader('PDB');
1566        my $loadLigand = $self->_TableLoader('Ligand');
1567        my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature');
1568        my $loadDocksWith = $self->_TableLoader('DocksWith');
1569        if ($self->{options}->{loadOnly}) {
1570            Trace("Loading from existing files.") if T(2);
1571        } else {
1572            Trace("Generating drug target data.") if T(2);
1573            # First comes the "DocksWith" relationship. This will give us a list of PDBs.
1574            # We can also encounter PDBs when we process "IsProteinForFeature". To manage
1575            # this process, PDB information is collected in a hash table and then
1576            # unspooled after both relationships are created.
1577            my %pdbHash = ();
1578            Trace("Generating docking data.") if T(2);
1579            # Get all the docking data. This may cause problems if there are too many PDBs,
1580            # at which point we'll need another algorithm. The indicator that this is
1581            # happening will be a timeout error in the next statement.
1582            my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
1583                                                  ['docking_results', $FIG_Config::dockLimit]);
1584            Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
1585            for my $dockData (@dockData) {
1586                # Get the docking data components.
1587                my ($pdbID, $docking_key, @valueData) = @{$dockData};
1588                # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
1589                $pdbID = lc $pdbID;
1590                # Strip off the object type.
1591                $pdbID =~ s/pdb://;
1592                # Extract the ZINC ID from the docking key. Note that there are two possible
1593                # formats.
1594                my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/;
1595                if (! $zinc_id) {
1596                    Trace("Invalid docking result key $docking_key for $pdbID.") if T(0);
1597                    $loadDocksWith->Add("errors");
1598                } else {
1599                    # Get the pieces of the value and parse the energy.
1600                    # Note that we don't care about the rank, since
1601                    # we can sort on the energy level itself in our database.
1602                    my ($energy, $tool, $type) = @valueData;
1603                    my ($rank, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
1604                    # Ignore predicted results.
1605                    if ($type ne "Predicted") {
1606                        # Count this docking result.
1607                        if (! exists $pdbHash{$pdbID}) {
1608                            $pdbHash{$pdbID} = 1;
1609                        } else {
1610                            $pdbHash{$pdbID}++;
1611                        }
1612                        # Write the result to the output.
1613                        $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool,
1614                                            $total, $vanderwaals);
1615                    }
1616                }
1617            }
1618            Trace("Connecting features.") if T(2);
1619            # Loop through the genomes.
1620            for my $genome (sort keys %{$genomeHash}) {
1621                Trace("Generating PDBs for $genome.") if T(3);
1622                # Get all of the PDBs that BLAST against this genome's features.
1623                my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%');
1624                for my $pdbData (@attributeData) {
1625                    # The PDB ID is coded as a subkey.
1626                    if ($pdbData->[1] !~ /PDB::(.+)/i) {
1627                        Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0);
1628                        $loadPDB->Add("errors");
1629                    } else {
1630                        my $pdbID = $1;
1631                        # Insure the PDB is in the hash.
1632                        if (! exists $pdbHash{$pdbID}) {
1633                            $pdbHash{$pdbID} = 0;
1634                        }
1635                        # The score and locations are coded in the attribute value.
1636                        if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
1637                            Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0);
1638                            $loadIsProteinForFeature->Add("errors");
1639                        } else {
1640                            my ($score, $locData) = ($1,$2);
1641                            # The location data may not be present, so we have to start with some
1642                            # defaults and then check.
1643                            my ($start, $end) = (1, 0);
1644                            if ($locData) {
1645                                $locData =~ /(\d+)-(\d+)/;
1646                                $start = $1;
1647                                $end = $2;
1648                            }
1649                            # If we still don't have the end location, compute it from
1650                            # the feature length.
1651                            if (! $end) {
1652                                # Most features have one location, but we do a list iteration
1653                                # just in case.
1654                                my @locations = $fig->feature_location($pdbData->[0]);
1655                                $end = 0;
1656                                for my $loc (@locations) {
1657                                    my $locObject = BasicLocation->new($loc);
1658                                    $end += $locObject->Length;
1659                                }
1660                            }
1661                            # Decode the score.
1662                            my $realScore = FIGRules::DecodeScore($score);
1663                            # Connect the PDB to the feature.
1664                            $loadIsProteinForFeature->Put($pdbData->[0], $pdbID, $start, $realScore, $end);
1665                        }
1666                    }
1667                }
1668            }
1669            # We've got all our PDBs now, so we unspool them from the hash.
1670            Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2);
1671            my $count = 0;
1672            for my $pdbID (sort keys %pdbHash) {
1673                $loadPDB->Put($pdbID, $pdbHash{$pdbID});
1674                $count++;
1675                Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0);
1676            }
1677            # Finally we create the ligand table. This information can be found in the
1678            # zinc_name attribute.
1679            Trace("Loading ligands.") if T(2);
1680            # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
1681            my $last_zinc_id = "";
1682            my $zinc_id = "";
1683            my $done = 0;
1684            while (! $done) {
1685                # Get the next 10000 ligands. We insist that the object ID is greater than
1686                # the last ID we processed.
1687                Trace("Loading batch starting with ZINC:$zinc_id.") if T(3);
1688                my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
1689                                                           ["ZINC:$zinc_id", "zinc_name"]);
1690                Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
1691                if (! @attributeData) {
1692                    # Here there are no attributes left, so we quit the loop.
1693                    $done = 1;
1694                } else {
1695                    # Process the attribute data we've received.
1696                    for my $zinc_data (@attributeData) {
1697                        # The ZINC ID is found in the first return column, prefixed with the word ZINC.
1698                        if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
1699                            $zinc_id = $1;
1700                            # Check for a duplicate.
1701                            if ($zinc_id eq $last_zinc_id) {
1702                                $loadLigand->Add("duplicate");
1703                            } else {
1704                                # Here it's safe to output the ligand. The ligand name is the attribute value
1705                                # (third column in the row).
1706                                $loadLigand->Put($zinc_id, $zinc_data->[2]);
1707                                # Insure we don't try to add this ID again.
1708                                $last_zinc_id = $zinc_id;
1709                            }
1710                        } else {
1711                            Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
1712                            $loadLigand->Add("errors");
1713                        }
1714                    }
1715                }
1716            }
1717            Trace("Ligands loaded.") if T(2);
1718        }
1719        # Finish the load.
1720        my $retVal = $self->_FinishAll();
1721        return $retVal;
1722    }
1723    
1724    
1725  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1726    
1727    =head3 SpecialAttribute
1728    
1729    C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >>
1730    
1731    Look for special attributes of a given type. A special attribute is found by comparing one of
1732    the columns of the incoming attribute list to a search pattern. If a match is found, then
1733    a set of columns is put into an output table connected to the specified ID.
1734    
1735    For example, when processing features, the attribute list we look at has three columns: attribute
1736    name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
1737    begins with C<iedb_>. The call signature is therefore
1738    
1739        my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB);
1740    
1741    The pattern is matched against column 0, and if we have a match, then column 2's value is put
1742    to the output along with the specified feature ID.
1743    
1744    =over 4
1745    
1746    =item id
1747    
1748    ID of the object whose special attributes are being loaded. This forms the first column of the
1749    output.
1750    
1751    =item attributes
1752    
1753    Reference to a list of tuples.
1754    
1755    =item idxMatch
1756    
1757    Index in each tuple of the column to be matched against the pattern. If the match is
1758    successful, an output record will be generated.
1759    
1760    =item idxValues
1761    
1762    Reference to a list containing the indexes in each tuple of the columns to be put as
1763    the second column of the output.
1764    
1765    =item pattern
1766    
1767    Pattern to be matched against the specified column. The match will be case-insensitive.
1768    
1769    =item loader
1770    
1771    An object to which each output record will be put. Usually this is an B<ERDBLoad> object,
1772    but technically it could be anything with a C<Put> method.
1773    
1774    =item RETURN
1775    
1776    Returns a count of the matches found.
1777    
1778    =item
1779    
1780    =back
1781    
1782    =cut
1783    
1784    sub SpecialAttribute {
1785        # Get the parameters.
1786        my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_;
1787        # Declare the return variable.
1788        my $retVal = 0;
1789        # Loop through the attribute rows.
1790        for my $row (@{$attributes}) {
1791            # Check for a match.
1792            if ($row->[$idxMatch] =~ m/$pattern/i) {
1793                # We have a match, so output a row. This is a bit tricky, since we may
1794                # be putting out multiple columns of data from the input.
1795                my $value = join(" ", map { $row->[$_] } @{$idxValues});
1796                $loader->Put($id, $value);
1797                $retVal++;
1798            }
1799        }
1800        Trace("$retVal special attributes found for $id and loader " . $loader->RelName() . ".") if T(4) && $retVal;
1801        # Return the number of matches.
1802        return $retVal;
1803    }
1804    
1805  =head3 TableLoader  =head3 TableLoader
1806    
1807  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 1172  Line 1816 
1816    
1817  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
1818    
 =item rowCount (optional)  
   
 Estimated maximum number of rows in the table.  
   
1819  =item RETURN  =item RETURN
1820    
1821  Returns an ERDBLoad object for loading the specified table.  Returns an ERDBLoad object for loading the specified table.
# Line 1186  Line 1826 
1826    
1827  sub _TableLoader {  sub _TableLoader {
1828      # Get the parameters.      # Get the parameters.
1829      my ($self, $tableName, $rowCount) = @_;      my ($self, $tableName) = @_;
1830      # Create the load object.      # Create the load object.
1831      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount);      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly);
1832      # Cache it in the loader list.      # Cache it in the loader list.
1833      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
1834      # Return it to the caller.      # Return it to the caller.
# Line 1222  Line 1862 
1862      my $retVal = Stats->new();      my $retVal = Stats->new();
1863      # Get the loader list.      # Get the loader list.
1864      my $loadList = $self->{loaders};      my $loadList = $self->{loaders};
1865        # Create a hash to hold the statistics objects, keyed on relation name.
1866        my %loaderHash = ();
1867      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
1868      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads more restartable.
1869      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
1870            # Get the relation name.
1871            my $relName = $loader->RelName;
1872            # Check the ignore flag.
1873            if ($loader->Ignore) {
1874                Trace("Relation $relName not loaded.") if T(2);
1875            } else {
1876                # Here we really need to finish.
1877                Trace("Finishing $relName.") if T(2);
1878          my $stats = $loader->Finish();          my $stats = $loader->Finish();
1879                $loaderHash{$relName} = $stats;
1880            }
1881        }
1882        # Now we loop through again, actually loading the tables. We want to finish before
1883        # loading so that if something goes wrong at this point, all the load files are usable
1884        # and we don't have to redo all that work.
1885        for my $relName (sort keys %loaderHash) {
1886            # Get the statistics for this relation.
1887            my $stats = $loaderHash{$relName};
1888            # Check for a database load.
1889            if ($self->{options}->{dbLoad}) {
1890                # Here we want to use the load file just created to load the database.
1891                Trace("Loading relation $relName.") if T(2);
1892                my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]);
1893                # Accumulate the statistics from the DB load.
1894                $stats->Accumulate($newStats);
1895            }
1896          $retVal->Accumulate($stats);          $retVal->Accumulate($stats);
         my $relName = $loader->RelName;  
1897          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
1898      }      }
1899      # Return the load statistics.      # Return the load statistics.
1900      return $retVal;      return $retVal;
1901  }  }
1902    
1903    =head3 GetGenomeAttributes
1904    
1905    C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys); >>
1906    
1907    Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
1908    attributes for all the features of a genome in a single call, then organizes them into
1909    a hash.
1910    
1911    =over 4
1912    
1913    =item fig
1914    
1915    FIG-like object for accessing attributes.
1916    
1917    =item genomeID
1918    
1919    ID of the genome who's attributes are desired.
1920    
1921    =item fids
1922    
1923    Reference to a list of the feature IDs whose attributes are to be kept.
1924    
1925    =item propKeys
1926    
1927    A list of the keys to retrieve.
1928    
1929    =item RETURN
1930    
1931    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
1932    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
1933    the attribute key, and one or more attribute values.
1934    
1935    =back
1936    
1937    =cut
1938    
1939    sub GetGenomeAttributes {
1940        # Get the parameters.
1941        my ($fig, $genomeID, $fids, $propKeys) = @_;
1942        # Declare the return variable.
1943        my $retVal = {};
1944        # Initialize the hash. This not only enables us to easily determine which FIDs to
1945        # keep, it insures that the caller sees a list reference for every known fid,
1946        # simplifying the logic.
1947        for my $fid (@{$fids}) {
1948            $retVal->{$fid} = [];
1949        }
1950        # Get the attributes. If ev_code_cron is running, we may get a timeout error, so
1951        # an eval is used.
1952        my @aList = ();
1953        eval {
1954            @aList = $fig->get_attributes("fig|$genomeID%", $propKeys);
1955            Trace(scalar(@aList) . " attributes returned for genome $genomeID.") if T(3);
1956        };
1957        # Check for a problem.
1958        if ($@) {
1959            Trace("Retrying attributes for $genomeID due to error: $@") if T(1);
1960            # Our fallback plan is to process the attributes in blocks of 100. This is much slower,
1961            # but allows us to continue processing.
1962            my $nFids = scalar @{$fids};
1963            for (my $i = 0; $i < $nFids; $i += 100) {
1964                # Determine the index of the last feature ID we'll be specifying on this pass.
1965                # Normally it's $i + 99, but if we're close to the end it may be less.
1966                my $end = ($i + 100 > $nFids ? $nFids - 1 : $i + 99);
1967                # Get a slice of the fid list.
1968                my @slice = @{$fids}[$i .. $end];
1969                # Get the relevant attributes.
1970                Trace("Retrieving attributes for fids $i to $end.") if T(3);
1971                my @aShort = $fig->get_attributes(\@slice, $propKeys);
1972                Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(3);
1973                push @aList, @aShort;
1974            }
1975        }
1976        # Now we should have all the interesting attributes in @aList. Populate the hash with
1977        # them.
1978        for my $aListEntry (@aList) {
1979            my $fid = $aListEntry->[0];
1980            if (exists $retVal->{$fid}) {
1981                push @{$retVal->{$fid}}, $aListEntry;
1982            }
1983        }
1984        # Return the result.
1985        return $retVal;
1986    }
1987    
1988    
1989  1;  1;

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.88

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3