[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.7, Tue Sep 13 19:05:20 2005 UTC revision 1.86, Mon Aug 20 23:30:35 2007 UTC
# Line 7  Line 7 
7      use PageBuilder;      use PageBuilder;
8      use ERDBLoad;      use ERDBLoad;
9      use FIG;      use FIG;
10        use FIGRules;
11      use Sprout;      use Sprout;
12      use Stats;      use Stats;
13      use BasicLocation;      use BasicLocation;
14        use HTML;
15    
16  =head1 Sprout Load Methods  =head1 Sprout Load Methods
17    
# Line 29  Line 31 
31      $stats->Accumulate($spl->LoadFeatureData());      $stats->Accumulate($spl->LoadFeatureData());
32      print $stats->Show();      print $stats->Show();
33    
 This module makes use of the internal Sprout property C<_erdb>.  
   
34  It is worth noting that the FIG object does not need to be a real one. Any object  It is worth noting that the FIG object does not need to be a real one. Any object
35  that implements the FIG methods for data retrieval could be used. So, for example,  that implements the FIG methods for data retrieval could be used. So, for example,
36  this object could be used to copy data from one Sprout database to another, or  this object could be used to copy data from one Sprout database to another, or
# Line 51  Line 51 
51    
52  =head3 new  =head3 new
53    
54  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile); >>  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>
55    
56  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
57  the name of the files containing the list of genomes and subsystems to use.  the name of the files containing the list of genomes and subsystems to use.
# Line 79  Line 79 
79  =item subsysFile  =item subsysFile
80    
81  Either the name of the file containing the list of trusted subsystems or a reference  Either the name of the file containing the list of trusted subsystems or a reference
82  to a list of subsystem names. If nothing is specified, all known subsystems will be  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
83  considered trusted. Only subsystem data related to the trusted subsystems is loaded.  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
84    in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded.
85    
86    =item options
87    
88    Reference to a hash of command-line options.
89    
90  =back  =back
91    
# Line 88  Line 93 
93    
94  sub new {  sub new {
95      # Get the parameters.      # Get the parameters.
96      my ($class, $sprout, $fig, $genomeFile, $subsysFile) = @_;      my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
97      # Load the list of genomes into a hash.      # Create the genome hash.
98      my %genomes;      my %genomes = ();
99        # We only need it if load-only is NOT specified.
100        if (! $options->{loadOnly}) {
101      if (! defined($genomeFile) || $genomeFile eq '') {      if (! defined($genomeFile) || $genomeFile eq '') {
102          # Here we want all the complete genomes and an access code of 1.          # Here we want all the complete genomes and an access code of 1.
103          my @genomeList = $fig->genomes(1);          my @genomeList = $fig->genomes(1);
# Line 114  Line 121 
121                  # an omitted access code can be defaulted to 1.                  # an omitted access code can be defaulted to 1.
122                  for my $genomeLine (@genomeList) {                  for my $genomeLine (@genomeList) {
123                      my ($genomeID, $accessCode) = split("\t", $genomeLine);                      my ($genomeID, $accessCode) = split("\t", $genomeLine);
124                      if (undef $accessCode) {                          if (! defined($accessCode)) {
125                          $accessCode = 1;                          $accessCode = 1;
126                      }                      }
127                      $genomes{$genomeID} = $accessCode;                      $genomes{$genomeID} = $accessCode;
# Line 124  Line 131 
131              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
132          }          }
133      }      }
134        }
135      # Load the list of trusted subsystems.      # Load the list of trusted subsystems.
136      my %subsystems = ();      my %subsystems = ();
137        # We only need it if load-only is NOT specified.
138        if (! $options->{loadOnly}) {
139      if (! defined $subsysFile || $subsysFile eq '') {      if (! defined $subsysFile || $subsysFile eq '') {
140          # Here we want all the subsystems.              # Here we want all the usable subsystems. First we get the whole list.
141          %subsystems = map { $_ => 1 } $fig->all_subsystems();              my @subs = $fig->all_subsystems();
142                # Loop through, checking for the NMPDR file.
143                for my $sub (@subs) {
144                    if ($fig->nmpdr_subsystem($sub)) {
145                        $subsystems{$sub} = 1;
146                    }
147                }
148      } else {      } else {
149          my $type = ref $subsysFile;          my $type = ref $subsysFile;
150          if ($type eq 'ARRAY') {          if ($type eq 'ARRAY') {
# Line 148  Line 164 
164              Confess("Invalid subsystem parameter in SproutLoad constructor.");              Confess("Invalid subsystem parameter in SproutLoad constructor.");
165          }          }
166      }      }
167            # Go through the subsys hash again, creating the keyword list for each subsystem.
168            for my $subsystem (keys %subsystems) {
169                my $name = $subsystem;
170                $name =~ s/_/ /g;
171    #            my $classes = $fig->subsystem_classification($subsystem);
172    #            $name .= " " . join(" ", @{$classes});
173                $subsystems{$subsystem} = $name;
174            }
175        }
176        # Get the list of NMPDR-oriented attribute keys.
177        my @propKeys = $fig->get_group_keys("NMPDR");
178      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
179      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
180      # Create the Sprout load object.      # Create the Sprout load object.
# Line 157  Line 184 
184                    subsystems => \%subsystems,                    subsystems => \%subsystems,
185                    sprout => $sprout,                    sprout => $sprout,
186                    loadDirectory => $directory,                    loadDirectory => $directory,
187                    erdb => $sprout->{_erdb},                    erdb => $sprout,
188                    loaders => []                    loaders => [],
189                      options => $options,
190                      propKeys => \@propKeys,
191                   };                   };
192      # Bless and return it.      # Bless and return it.
193      bless $retVal, $class;      bless $retVal, $class;
194      return $retVal;      return $retVal;
195  }  }
196    
197    =head3 LoadOnly
198    
199    C<< my $flag = $spl->LoadOnly; >>
200    
201    Return TRUE if we are in load-only mode, else FALSE.
202    
203    =cut
204    
205    sub LoadOnly {
206        my ($self) = @_;
207        return $self->{options}->{loadOnly};
208    }
209    
210    
211  =head3 LoadGenomeData  =head3 LoadGenomeData
212    
213  C<< my $stats = $spl->LoadGenomeData(); >>  C<< my $stats = $spl->LoadGenomeData(); >>
# Line 192  Line 235 
235    
236  =back  =back
237    
 B<TO DO>  
   
 Real quality vectors instead of C<unknown> for everything.  
   
 GenomeGroup relation. (The original script took group information from the C<NMPDR> file  
 in each genome's main directory, but no such file exists anywhere in my version of the  
 data store.)  
   
238  =cut  =cut
239  #: Return Type $%;  #: Return Type $%;
240  sub LoadGenomeData {  sub LoadGenomeData {
# Line 210  Line 245 
245      # Get the genome count.      # Get the genome count.
246      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
247      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
     Trace("Beginning genome data load.") if T(2);  
248      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
249      my $loadGenome = $self->_TableLoader('Genome', $genomeCount);      my $loadGenome = $self->_TableLoader('Genome');
250      my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300);      my $loadHasContig = $self->_TableLoader('HasContig');
251      my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300);      my $loadContig = $self->_TableLoader('Contig');
252      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf');
253      my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000);      my $loadSequence = $self->_TableLoader('Sequence');
254        if ($self->{options}->{loadOnly}) {
255            Trace("Loading from existing files.") if T(2);
256        } else {
257            Trace("Generating genome data.") if T(2);
258      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
259      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
260          Trace("Loading data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
261          $loadGenome->Add("genomeIn");          $loadGenome->Add("genomeIn");
262          # The access code comes in via the genome hash.          # The access code comes in via the genome hash.
263          my $accessCode = $genomeHash->{$genomeID};          my $accessCode = $genomeHash->{$genomeID};
264          # Get the genus, species, and strain from the scientific name. Note that we append              # Get the genus, species, and strain from the scientific name.
         # the genome ID to the strain. In some cases this is the totality of the strain name.  
265          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);
266          my $extra = join " ", @extraData, "[$genomeID]";              my $extra = join " ", @extraData;
267          # Get the full taxonomy.          # Get the full taxonomy.
268          my $taxonomy = $fig->taxonomy_of($genomeID);          my $taxonomy = $fig->taxonomy_of($genomeID);
269                # Get the version. If no version is specified, we default to the genome ID by itself.
270                my $version = $fig->genome_version($genomeID);
271                if (! defined($version)) {
272                    $version = $genomeID;
273                }
274                # Get the DNA size.
275                my $dnaSize = $fig->genome_szdna($genomeID);
276                # Open the NMPDR group file for this genome.
277                my $group;
278                if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
279                    defined($group = <TMP>)) {
280                    # Clean the line ending.
281                    chomp $group;
282                } else {
283                    # No group, so use the default.
284                    $group = $FIG_Config::otherGroup;
285                }
286                close TMP;
287          # Output the genome record.          # Output the genome record.
288          $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),
289                           $species, $extra, $taxonomy);                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);
290          # Now we loop through each of the genome's contigs.          # Now we loop through each of the genome's contigs.
291          my @contigs = $fig->all_contigs($genomeID);          my @contigs = $fig->all_contigs($genomeID);
292          for my $contigID (@contigs) {          for my $contigID (@contigs) {
# Line 262  Line 317 
317              }              }
318          }          }
319      }      }
320        }
321      # Finish the loads.      # Finish the loads.
322      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
323      # Return the result.      # Return the result.
324      return $retVal;      return $retVal;
325  }  }
326    
 =head3 LoadCouplingData  
   
 C<< my $stats = $spl->LoadCouplingData(); >>  
   
 Load the coupling and evidence data from FIG into Sprout.  
   
 The coupling data specifies which genome features are functionally coupled. The  
 evidence data explains why the coupling is functional.  
   
 The following relations are loaded by this method.  
   
     Coupling  
     IsEvidencedBy  
     PCH  
     ParticipatesInCoupling  
     UsesAsEvidence  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadCouplingData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the genome hash.  
     my $genomeFilter = $self->{genomes};  
     my $genomeCount = (keys %{$genomeFilter});  
     my $featureCount = $genomeCount * 4000;  
     # Start the loads.  
     my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount);  
     my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000);  
     my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000);  
     my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000);  
     my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000);  
     Trace("Beginning coupling data load.") if T(2);  
     # Loop through the genomes found.  
     for my $genome (sort keys %{$genomeFilter}) {  
         Trace("Generating coupling data for $genome.") if T(3);  
         $loadCoupling->Add("genomeIn");  
         # Create a hash table for holding coupled pairs. We use this to prevent  
         # duplicates. For example, if A is coupled to B, we don't want to also  
         # assert that B is coupled to A, because we already know it. Fortunately,  
         # all couplings occur within a genome, so we can keep the hash table  
         # size reasonably small.  
         my %dupHash = ();  
         # Get all of the genome's PEGs.  
         my @pegs = $fig->pegs_of($genome);  
         # Loop through the PEGs.  
         for my $peg1 (@pegs) {  
             $loadCoupling->Add("pegIn");  
             Trace("Processing PEG $peg1 for $genome.") if T(4);  
             # Get a list of the coupled PEGs.  
             my @couplings = $fig->coupled_to($peg1);  
             # For each coupled PEG, we need to verify that a coupling already  
             # exists. If not, we have to create one.  
             for my $coupleData (@couplings) {  
                 my ($peg2, $score) = @{$coupleData};  
                 # Compute the coupling ID.  
                 my $coupleID = Sprout::CouplingID($peg1, $peg2);  
                 if (! exists $dupHash{$coupleID}) {  
                     $loadCoupling->Add("couplingIn");  
                     # Here we have a new coupling to store in the load files.  
                     Trace("Storing coupling ($coupleID) with score $score.") if T(4);  
                     # Ensure we don't do this again.  
                     $dupHash{$coupleID} = $score;  
                     # Write the coupling record.  
                     $loadCoupling->Put($coupleID, $score);  
                     # Connect it to the coupled PEGs.  
                     $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);  
                     $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);  
                     # Get the evidence for this coupling.  
                     my @evidence = $fig->coupling_evidence($peg1, $peg2);  
                     # Organize the evidence into a hash table.  
                     my %evidenceMap = ();  
                     # Process each evidence item.  
                     for my $evidenceData (@evidence) {  
                         $loadPCH->Add("evidenceIn");  
                         my ($peg3, $peg4, $usage) = @{$evidenceData};  
                         # Only proceed if the evidence is from a Sprout  
                         # genome.  
                         if ($genomeFilter->{$fig->genome_of($peg3)}) {  
                             $loadUsesAsEvidence->Add("evidenceChosen");  
                             my $evidenceKey = "$coupleID $peg3 $peg4";  
                             # We store this evidence in the hash if the usage  
                             # is nonzero or no prior evidence has been found. This  
                             # insures that if there is duplicate evidence, we  
                             # at least keep the meaningful ones. Only evidence is  
                             # the hash makes it to the output.  
                             if ($usage || ! exists $evidenceMap{$evidenceKey}) {  
                                 $evidenceMap{$evidenceKey} = $evidenceData;  
                             }  
                         }  
                     }  
                     for my $evidenceID (keys %evidenceMap) {  
                         # Create the evidence record.  
                         my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};  
                         $loadPCH->Put($evidenceID, $usage);  
                         # Connect it to the coupling.  
                         $loadIsEvidencedBy->Put($coupleID, $evidenceID);  
                         # Connect it to the features.  
                         $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);  
                         $loadUsesAsEvidence->Put($evidenceID, $peg4, 1);  
                     }  
                 }  
             }  
         }  
     }  
     # All done. Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
327  =head3 LoadFeatureData  =head3 LoadFeatureData
328    
329  C<< my $stats = $spl->LoadFeatureData(); >>  C<< my $stats = $spl->LoadFeatureData(); >>
# Line 400  Line 336 
336    
337      Feature      Feature
338      FeatureAlias      FeatureAlias
339        IsAliasOf
340      FeatureLink      FeatureLink
341      FeatureTranslation      FeatureTranslation
342      FeatureUpstream      FeatureUpstream
343      IsLocatedIn      IsLocatedIn
344        HasFeature
345        HasRoleInSubsystem
346        FeatureEssential
347        FeatureVirulent
348        FeatureIEDB
349        CDD
350        IsPresentOnProteinOf
351    
352  =over 4  =over 4
353    
# Line 418  Line 362 
362  sub LoadFeatureData {  sub LoadFeatureData {
363      # Get this object instance.      # Get this object instance.
364      my ($self) = @_;      my ($self) = @_;
365      # Get the FIG object.      # Get the FIG and Sprout objects.
366      my $fig = $self->{fig};      my $fig = $self->{fig};
367        my $sprout = $self->{sprout};
368      # Get the table of genome IDs.      # Get the table of genome IDs.
369      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
370      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
371      my $loadFeature = $self->_TableLoader('Feature', $featureCount);      my $loadFeature = $self->_TableLoader('Feature');
372      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn');
373      my $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
374      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);      my $loadIsAliasOf = $self->_TableLoader('IsAliasOf');
375      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
376      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
377        my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
378        my $loadHasFeature = $self->_TableLoader('HasFeature');
379        my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem');
380        my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
381        my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
382        my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
383        my $loadCDD = $self->_TableLoader('CDD');
384        my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');
385        # Get the subsystem hash.
386        my $subHash = $self->{subsystems};
387        # Get the property keys.
388        my $propKeys = $self->{propKeys};
389        # Create a hashes to hold CDD and alias values.
390        my %CDD = ();
391        my %alias = ();
392      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
393      # locations.      # locations.
394      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
395      Trace("Beginning feature data load.") if T(2);      if ($self->{options}->{loadOnly}) {
396            Trace("Loading from existing files.") if T(2);
397        } else {
398            Trace("Generating feature data.") if T(2);
399      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
400      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
401          Trace("Loading features for genome $genomeID.") if T(3);          Trace("Loading features for genome $genomeID.") if T(3);
402          $loadFeature->Add("genomeIn");          $loadFeature->Add("genomeIn");
403          # Get the feature list for this genome.          # Get the feature list for this genome.
404          my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
405                # Sort and count the list.
406                my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
407                my $count = scalar @featureTuples;
408                my @fids = map { $_->[0] } @featureTuples;
409                Trace("$count features found for genome $genomeID.") if T(3);
410                # Get the attributes for this genome and put them in a hash by feature ID.
411                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys);
412                # Set up for our duplicate-feature check.
413                my $oldFeatureID = "";
414          # Loop through the features.          # Loop through the features.
415          for my $featureData (@{$features}) {              for my $featureTuple (@featureTuples) {
             $loadFeature->Add("featureIn");  
416              # Split the tuple.              # Split the tuple.
417              my ($featureID, $locations, $aliases, $type) = @{$featureData};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
418              # Create the feature record.                  # Check for duplicates.
419              $loadFeature->Put($featureID, 1, $type);                  if ($featureID eq $oldFeatureID) {
420                        Trace("Duplicate feature $featureID found.") if T(1);
421                    } else {
422                        $oldFeatureID = $featureID;
423                        # Count this feature.
424                        $loadFeature->Add("featureIn");
425                        # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
426                        # Sprout database requires a single character.
427                        if (! defined($quality) || $quality eq "") {
428                            $quality = " ";
429                        }
430                        # Begin building the keywords. We start with the genome ID, the
431                        # feature ID, the taxonomy, and the organism name.
432                        my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
433                                        $fig->taxonomy_of($genomeID));
434              # Create the aliases.              # Create the aliases.
435              for my $alias (split /\s*,\s*/, $aliases) {                      for my $alias ($fig->feature_aliases($featureID)) {
436                  $loadFeatureAlias->Put($featureID, $alias);                          #Connect this alias to this feature.
437              }                          $loadIsAliasOf->Put($alias, $featureID);
438                            push @keywords, $alias;
439                            # If this is a locus tag, also add its natural form as a keyword.
440                            my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
441                            if ($naturalName) {
442                                push @keywords, $naturalName;
443                            }
444                            # If this is the first time for the specified alias, create its
445                            # alias record.
446                            if (! exists $alias{$alias}) {
447                                $loadFeatureAlias->Put($alias);
448                                $alias{$alias} = 1;
449                            }
450                        }
451                        Trace("Assignment for $featureID is: $assignment") if T(4);
452                        # Break the assignment into words and shove it onto the
453                        # keyword list.
454                        push @keywords, split(/\s+/, $assignment);
455                        # Link this feature to the parent genome.
456                        $loadHasFeature->Put($genomeID, $featureID, $type);
457              # Get the links.              # Get the links.
458              my @links = $fig->fid_links($featureID);              my @links = $fig->fid_links($featureID);
459              for my $link (@links) {              for my $link (@links) {
# Line 470  Line 472 
472                      $loadFeatureUpstream->Put($featureID, $upstream);                      $loadFeatureUpstream->Put($featureID, $upstream);
473                  }                  }
474              }              }
475                        # Now we need to find the subsystems this feature participates in.
476                        # We also add the subsystems to the keyword list. Before we do that,
477                        # we must convert underscores to spaces.
478                        my @subsystems = $fig->peg_to_subsystems($featureID);
479                        for my $subsystem (@subsystems) {
480                            # Only proceed if we like this subsystem.
481                            if (exists $subHash->{$subsystem}) {
482                                # Store the has-role link.
483                                $loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type);
484                                # Save the subsystem's keyword data.
485                                my $subKeywords = $subHash->{$subsystem};
486                                push @keywords, split /\s+/, $subKeywords;
487                                # Now we need to get this feature's role in the subsystem.
488                                my $subObject = $fig->get_subsystem($subsystem);
489                                my @roleColumns = $subObject->get_peg_roles($featureID);
490                                my @allRoles = $subObject->get_roles();
491                                for my $col (@roleColumns) {
492                                    my $role = $allRoles[$col];
493                                    push @keywords, split /\s+/, $role;
494                                    push @keywords, $subObject->get_role_abbr($col);
495                                }
496                            }
497                        }
498                        # There are three special attributes computed from property
499                        # data that we build next. If the special attribute is non-empty,
500                        # its name will be added to the keyword list. First, we get all
501                        # the attributes for this feature. They will come back as
502                        # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
503                        # [name, value, value with URL]. (We don't need the PEG, since
504                        # we already know it.)
505                        my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
506                                             @{$attributes->{$featureID}};
507                        # Now we process each of the special attributes.
508                        if (SpecialAttribute($featureID, \@attributes,
509                                             1, [0,2], '^(essential|potential_essential)$',
510                                             $loadFeatureEssential)) {
511                            push @keywords, 'essential';
512                            $loadFeature->Add('essential');
513                        }
514                        if (SpecialAttribute($featureID, \@attributes,
515                                             0, [2], '^virulen',
516                                             $loadFeatureVirulent)) {
517                            push @keywords, 'virulent';
518                            $loadFeature->Add('virulent');
519                        }
520                        if (SpecialAttribute($featureID, \@attributes,
521                                             0, [0,2], '^iedb_',
522                                             $loadFeatureIEDB)) {
523                            push @keywords, 'iedb';
524                            $loadFeature->Add('iedb');
525                        }
526                        # Now we have some other attributes we need to process. Currently,
527                        # this is CDD and CELLO, but we expect the number to increase.
528                        my %attributeHash = ();
529                        for my $attrRow (@{$attributes->{$featureID}}) {
530                            my (undef, $key, @values) = @{$attrRow};
531                            $key =~ /^([^:]+)::(.+)/;
532                            if (exists $attributeHash{$1}) {
533                                $attributeHash{$1}->{$2} = \@values;
534                            } else {
535                                $attributeHash{$1} = {$2 => \@values};
536                            }
537                        }
538                        my $celloValue = "unknown";
539                        # Pull in the CELLO attribute. There will never be more than one.
540                        # If we have one, it's a feature attribute AND a keyword.
541                        my @celloData = keys %{$attributeHash{CELLO}};
542                        if (@celloData) {
543                            $celloValue = $celloData[0];
544                            push @keywords, $celloValue;
545                        }
546                        # Now we handle CDD. This is a bit more complicated, because
547                        # there are multiple CDDs per protein.
548                        if (exists $attributeHash{CDD}) {
549                            # Get the hash of CDD IDs to scores for this feature. We
550                            # already know it exists because of the above IF.
551                            my $cddHash = $attributeHash{CDD};
552                            my @cddData = sort keys %{$cddHash};
553                            for my $cdd (@cddData) {
554                                # Extract the score for this CDD and decode it.
555                                my ($codeScore) = split(/\s*,\s*/, $cddHash->{$cdd}->[0]);
556                                my $realScore = FIGRules::DecodeScore($codeScore);
557                                # Create the connection.
558                                $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
559                                # If this CDD does not yet exist, create its record.
560                                if (! exists $CDD{$cdd}) {
561                                    $CDD{$cdd} = 1;
562                                    $loadCDD->Put($cdd);
563                                }
564                            }
565                        }
566                        # Now we need to bust up hyphenated words in the keyword
567                        # list. We keep them separate and put them at the end so
568                        # the original word order is available.
569                        my $keywordString = "";
570                        my $bustedString = "";
571                        for my $keyword (@keywords) {
572                            if (length $keyword >= 3) {
573                                $keywordString .= " $keyword";
574                                if ($keyword =~ /-/) {
575                                    my @words = split /-/, $keyword;
576                                    $bustedString .= join(" ", "", @words);
577                                }
578                            }
579                        }
580                        $keywordString .= $bustedString;
581                        # Get rid of annoying punctuation.
582                        $keywordString =~ s/[();]//g;
583                        # Clean the keyword list.
584                        my $cleanWords = $sprout->CleanKeywords($keywordString);
585                        Trace("Keyword string for $featureID: $cleanWords") if T(4);
586                        # Now we need to process the feature's locations. First, we split them up.
587                        my @locationList = split /\s*,\s*/, $locations;
588                        # Next, we convert them to Sprout location objects.
589                        my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
590              # This part is the roughest. We need to relate the features to contig              # This part is the roughest. We need to relate the features to contig
591              # locations, and the locations must be split so that none of them exceed              # locations, and the locations must be split so that none of them exceed
592              # the maximum segment size. This simplifies the genes_in_region processing              # the maximum segment size. This simplifies the genes_in_region processing
593              # for Sprout.                      # for Sprout. To start, we create the location position indicator.
594              my @locationList = split /\s*,\s*/, $locations;                      my $i = 1;
595              # Loop through the locations.              # Loop through the locations.
596              for my $location (@locationList) {                      for my $locObject (@locObjectList) {
597                  # Parse the location.                          # Split this location into a list of chunks.
                 my $locObject = BasicLocation->new($location);  
                 # Split it into a list of chunks.  
598                  my @locOList = ();                  my @locOList = ();
599                  while (my $peeling = $locObject->Peel($chunkSize)) {                  while (my $peeling = $locObject->Peel($chunkSize)) {
600                      $loadIsLocatedIn->Add("peeling");                      $loadIsLocatedIn->Add("peeling");
# Line 488  Line 603 
603                  push @locOList, $locObject;                  push @locOList, $locObject;
604                  # Loop through the chunks, creating IsLocatedIn records. The variable                  # Loop through the chunks, creating IsLocatedIn records. The variable
605                  # "$i" will be used to keep the location index.                  # "$i" will be used to keep the location index.
                 my $i = 1;  
606                  for my $locChunk (@locOList) {                  for my $locChunk (@locOList) {
607                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,
608                                            $locChunk->Dir, $locChunk->Length, $i);                                            $locChunk->Dir, $locChunk->Length, $i);
609                      $i++;                      $i++;
610                  }                  }
611              }              }
612          }                      # Finally, reassemble the location objects into a list of Sprout location strings.
613      }                      $locations = join(", ", map { $_->String } @locObjectList);
614      # Finish the loads.                      # Create the feature record.
615      my $retVal = $self->_FinishAll();                      $loadFeature->Put($featureID, 1, $user, $quality, $celloValue, $type, $assignment, $cleanWords, $locations);
     return $retVal;  
 }  
   
 =head3 LoadBBHData  
   
 C<< my $stats = $spl->LoadBBHData(); >>  
   
 Load the bidirectional best hit data from FIG into Sprout.  
   
 Sprout does not store information on similarities. Instead, it has only the  
 bi-directional best hits. Even so, the BBH table is one of the largest in  
 the database.  
   
 The following relations are loaded by this method.  
   
     IsBidirectionalBestHitOf  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadBBHData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the table of genome IDs.  
     my $genomeHash = $self->{genomes};  
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf',  
                                                            $featureCount * $genomeCount);  
     Trace("Beginning BBH load.") if T(2);  
     # Now we loop through the genomes, generating the data for each one.  
     for my $genomeID (sort keys %{$genomeHash}) {  
         $loadIsBidirectionalBestHitOf->Add("genomeIn");  
         Trace("Processing features for genome $genomeID.") if T(3);  
         # Get the feature list for this genome.  
         my $features = $fig->all_features_detailed($genomeID);  
         # Loop through the features.  
         for my $featureData (@{$features}) {  
             # Split the tuple.  
             my ($featureID, $locations, $aliases, $type) = @{$featureData};  
             # Get the bi-directional best hits.  
             my @bbhList = $fig->bbhs($featureID);  
             for my $bbhEntry (@bbhList) {  
                 # Get the target feature ID and the score.  
                 my ($targetID, $score) = @{$bbhEntry};  
                 # Check the target feature's genome.  
                 my $targetGenomeID = $fig->genome_of($targetID);  
                 # Only proceed if it's one of our genomes.  
                 if ($genomeHash->{$targetGenomeID}) {  
                     $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,  
                                                        $score);  
616                  }                  }
617              }              }
618          }          }
# Line 584  Line 637 
637  The following relations are loaded by this method.  The following relations are loaded by this method.
638    
639      Subsystem      Subsystem
640        SubsystemClass
641      Role      Role
642        RoleEC
643        IsIdentifiedByEC
644      SSCell      SSCell
645      ContainsFeature      ContainsFeature
646      IsGenomeOf      IsGenomeOf
# Line 592  Line 648 
648      OccursInSubsystem      OccursInSubsystem
649      ParticipatesIn      ParticipatesIn
650      HasSSCell      HasSSCell
651        ConsistsOfRoles
652        RoleSubset
653        HasRoleSubset
654        ConsistsOfGenomes
655        GenomeSubset
656        HasGenomeSubset
657        Catalyzes
658        Diagram
659        RoleOccursIn
660    
661  =over 4  =over 4
662    
# Line 601  Line 666 
666    
667  =back  =back
668    
 B<TO DO>  
   
 Generate RoleName table?  
   
669  =cut  =cut
670  #: Return Type $%;  #: Return Type $%;
671  sub LoadSubsystemData {  sub LoadSubsystemData {
# Line 618  Line 679 
679      # Get the subsystem hash. This lists the subsystems we'll process.      # Get the subsystem hash. This lists the subsystems we'll process.
680      my $subsysHash = $self->{subsystems};      my $subsysHash = $self->{subsystems};
681      my @subsysIDs = sort keys %{$subsysHash};      my @subsysIDs = sort keys %{$subsysHash};
682      my $subsysCount = @subsysIDs;      # Get the map list.
683      my $genomeCount = (keys %{$genomeHash});      my @maps = $fig->all_maps;
     my $featureCount = $genomeCount * 4000;  
684      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
685      my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);      my $loadDiagram = $self->_TableLoader('Diagram');
686      my $loadRole = $self->_TableLoader('Role', $featureCount * 6);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn');
687      my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);      my $loadSubsystem = $self->_TableLoader('Subsystem');
688      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);      my $loadRole = $self->_TableLoader('Role');
689      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);      my $loadRoleEC = $self->_TableLoader('RoleEC');
690      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount);      my $loadIsIdentifiedByEC = $self->_TableLoader('IsIdentifiedByEC');
691      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);      my $loadCatalyzes = $self->_TableLoader('Catalyzes');
692      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);      my $loadSSCell = $self->_TableLoader('SSCell');
693      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature');
694      Trace("Beginning subsystem data load.") if T(2);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf');
695        my $loadIsRoleOf = $self->_TableLoader('IsRoleOf');
696        my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem');
697        my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn');
698        my $loadHasSSCell = $self->_TableLoader('HasSSCell');
699        my $loadRoleSubset = $self->_TableLoader('RoleSubset');
700        my $loadGenomeSubset = $self->_TableLoader('GenomeSubset');
701        my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles');
702        my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes');
703        my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');
704        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');
705        my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');
706        if ($self->{options}->{loadOnly}) {
707            Trace("Loading from existing files.") if T(2);
708        } else {
709            Trace("Generating subsystem data.") if T(2);
710            # This hash will contain the roles for each EC. When we're done, this
711            # information will be used to generate the Catalyzes table.
712            my %ecToRoles = ();
713      # Loop through the subsystems. Our first task will be to create the      # Loop through the subsystems. Our first task will be to create the
714      # roles. We do this by looping through the subsystems and creating a      # roles. We do this by looping through the subsystems and creating a
715      # role hash. The hash tracks each role ID so that we don't create      # role hash. The hash tracks each role ID so that we don't create
716      # duplicates. As we move along, we'll connect the roles and subsystems.          # duplicates. As we move along, we'll connect the roles and subsystems
717            # and memorize up the reactions.
718            my ($genomeID, $roleID);
719      my %roleData = ();      my %roleData = ();
720      for my $subsysID (@subsysIDs) {      for my $subsysID (@subsysIDs) {
721                # Get the subsystem object.
722                my $sub = $fig->get_subsystem($subsysID);
723                # Only proceed if the subsystem has a spreadsheet.
724                if (defined($sub) && ! $sub->{empty_ss}) {
725          Trace("Creating subsystem $subsysID.") if T(3);          Trace("Creating subsystem $subsysID.") if T(3);
726          $loadSubsystem->Add("subsystemIn");          $loadSubsystem->Add("subsystemIn");
727          # Create the subsystem record.          # Create the subsystem record.
728          $loadSubsystem->Put($subsysID);                  my $curator = $sub->get_curator();
729          # Get the subsystem's roles.                  my $notes = $sub->get_notes();
730          my @roles = $fig->subsystem_to_roles($subsysID);                  $loadSubsystem->Put($subsysID, $curator, $notes);
731          # Connect the roles to the subsystem. If a role is new, we create                  # Now for the classification string. This comes back as a list
732          # a role record for it.                  # reference and we convert it to a space-delimited string.
733          for my $roleID (@roles) {                  my $classList = $fig->subsystem_classification($subsysID);
734                    my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
735                    $loadSubsystemClass->Put($subsysID, $classString);
736                    # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
737                    for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
738                        # Get the role's abbreviation.
739                        my $abbr = $sub->get_role_abbr($col);
740                        # Connect to this role.
741              $loadOccursInSubsystem->Add("roleIn");              $loadOccursInSubsystem->Add("roleIn");
742              $loadOccursInSubsystem->Put($roleID, $subsysID);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $col);
743                        # If it's a new role, add it to the role table.
744              if (! exists $roleData{$roleID}) {              if (! exists $roleData{$roleID}) {
745                            # Get the role's abbreviation.
746                            # Add the role.
747                  $loadRole->Put($roleID);                  $loadRole->Put($roleID);
748                  $roleData{$roleID} = 1;                  $roleData{$roleID} = 1;
749                            # Check for an EC number.
750                            if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
751                                my $ec = $1;
752                                $loadIsIdentifiedByEC->Put($roleID, $ec);
753                                # Check to see if this is our first encounter with this EC.
754                                if (exists $ecToRoles{$ec}) {
755                                    # No, so just add this role to the EC list.
756                                    push @{$ecToRoles{$ec}}, $roleID;
757                                } else {
758                                    # Output this EC.
759                                    $loadRoleEC->Put($ec);
760                                    # Create its role list.
761                                    $ecToRoles{$ec} = [$roleID];
762              }              }
763          }          }
764          # Now all roles for this subsystem have been filled in. We create the                      }
765          # spreadsheet by matches roles to genomes. To do this, we need to                  }
766          # get the genomes on the sheet.                  # Now we create the spreadsheet for the subsystem by matching roles to
767                    # genomes. Each genome is a row and each role is a column. We may need
768                    # to actually create the roles as we find them.
769          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
770          my @genomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)};                  for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) {
771          for my $genomeID (@genomes) {                      # Only proceed if this is one of our genomes.
             # Only process this genome if it's one of ours.  
772              if (exists $genomeHash->{$genomeID}) {              if (exists $genomeHash->{$genomeID}) {
773                  # Connect the genome to the subsystem.                          # Count the PEGs and cells found for verification purposes.
774                  $loadParticipatesIn->Put($genomeID, $subsysID);                          my $pegCount = 0;
775                            my $cellCount = 0;
776                            # Create a list for the PEGs we find. This list will be used
777                            # to generate cluster numbers.
778                            my @pegsFound = ();
779                            # Create a hash that maps spreadsheet IDs to PEGs. We will
780                            # use this to generate the ContainsFeature data after we have
781                            # the cluster numbers.
782                            my %cellPegs = ();
783                            # Get the genome's variant code for this subsystem.
784                            my $variantCode = $sub->get_variant_code($row);
785                  # Loop through the subsystem's roles. We use an index because it is                  # Loop through the subsystem's roles. We use an index because it is
786                  # part of the spreadsheet cell ID.                  # part of the spreadsheet cell ID.
787                  for (my $i = 0; $i <= $#roles; $i++) {                          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
                     my $role = $roles[$i];  
788                      # Get the features in the spreadsheet cell for this genome and role.                      # Get the features in the spreadsheet cell for this genome and role.
789                      my @pegs = $fig->pegs_in_subsystem_cell($subsysID, $genomeID, $i);                              my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col);
790                      # Only proceed if features exist.                      # Only proceed if features exist.
791                      if (@pegs > 0) {                      if (@pegs > 0) {
792                          # Create the spreadsheet cell.                          # Create the spreadsheet cell.
793                          my $cellID = "$subsysID:$genomeID:$i";                                  $cellCount++;
794                                    my $cellID = "$subsysID:$genomeID:$col";
795                          $loadSSCell->Put($cellID);                          $loadSSCell->Put($cellID);
796                          $loadIsGenomeOf->Put($genomeID, $cellID);                          $loadIsGenomeOf->Put($genomeID, $cellID);
797                          $loadIsRoleOf->Put($role, $cellID);                                  $loadIsRoleOf->Put($roleID, $cellID);
798                          $loadHasSSCell->Put($subsysID, $cellID);                          $loadHasSSCell->Put($subsysID, $cellID);
799                          # Attach the features to it.                                  # Remember its features.
800                          for my $pegID (@pegs) {                                  push @pegsFound, @pegs;
801                              $loadContainsFeature->Put($cellID, $pegID);                                  $cellPegs{$cellID} = \@pegs;
802                                    $pegCount += @pegs;
803                                }
804                            }
805                            # If we found some cells for this genome, we need to compute clusters and
806                            # denote it participates in the subsystem.
807                            if ($pegCount > 0) {
808                                Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
809                                $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
810                                # Create a hash mapping PEG IDs to cluster numbers.
811                                # We default to -1 for all of them.
812                                my %clusterOf = map { $_ => -1 } @pegsFound;
813                                # Partition the PEGs found into clusters.
814                                my @clusters = $fig->compute_clusters([keys %clusterOf], $sub);
815                                for (my $i = 0; $i <= $#clusters; $i++) {
816                                    my $subList = $clusters[$i];
817                                    for my $peg (@{$subList}) {
818                                        $clusterOf{$peg} = $i;
819                                    }
820                                }
821                                # Create the ContainsFeature data.
822                                for my $cellID (keys %cellPegs) {
823                                    my $cellList = $cellPegs{$cellID};
824                                    for my $cellPeg (@$cellList) {
825                                        $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg});
826                          }                          }
827                      }                      }
828                  }                  }
829              }              }
830          }          }
831                    # Now we need to generate the subsets. The subset names must be concatenated to
832                    # the subsystem name to make them unique keys. There are two types of subsets:
833                    # genome subsets and role subsets. We do the role subsets first.
834                    my @subsetNames = $sub->get_subset_names();
835                    for my $subsetID (@subsetNames) {
836                        # Create the subset record.
837                        my $actualID = "$subsysID:$subsetID";
838                        $loadRoleSubset->Put($actualID);
839                        # Connect the subset to the subsystem.
840                        $loadHasRoleSubset->Put($subsysID, $actualID);
841                        # Connect the subset to its roles.
842                        my @roles = $sub->get_subsetC_roles($subsetID);
843                        for my $roleID (@roles) {
844                            $loadConsistsOfRoles->Put($actualID, $roleID);
845      }      }
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
846  }  }
847                    # Next the genome subsets.
848  =head3 LoadDiagramData                  @subsetNames = $sub->get_subset_namesR();
849                    for my $subsetID (@subsetNames) {
850  C<< my $stats = $spl->LoadDiagramData(); >>                      # Create the subset record.
851                        my $actualID = "$subsysID:$subsetID";
852  Load the diagram data from FIG into Sprout.                      $loadGenomeSubset->Put($actualID);
853                        # Connect the subset to the subsystem.
854  Diagrams are used to organize functional roles. The diagram shows the                      $loadHasGenomeSubset->Put($subsysID, $actualID);
855  connections between chemicals that interact with a subsystem.                      # Connect the subset to its genomes.
856                        my @genomes = $sub->get_subsetR($subsetID);
857  The following relations are loaded by this method.                      for my $genomeID (@genomes) {
858                            $loadConsistsOfGenomes->Put($actualID, $genomeID);
859      Diagram                      }
860      RoleOccursIn                  }
861                }
862  =over 4          }
863            # Now we loop through the diagrams. We need to create the diagram records
864  =item RETURNS          # and link each diagram to its roles. Note that only roles which occur
865            # in subsystems (and therefore appear in the %ecToRoles hash) are
866  Returns a statistics object for the loads.          # included.
867            for my $map (@maps) {
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadDiagramData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the map list.  
     my @maps = $fig->all_maps;  
     my $mapCount = @maps;  
     my $genomeCount = (keys %{$self->{genomes}});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);  
     my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);  
     Trace("Beginning diagram data load.") if T(2);  
     # Loop through the diagrams.  
     for my $map ($fig->all_maps) {  
868          Trace("Loading diagram $map.") if T(3);          Trace("Loading diagram $map.") if T(3);
869          # Get the diagram's descriptive name.          # Get the diagram's descriptive name.
870          my $name = $fig->map_name($map);          my $name = $fig->map_name($map);
# Line 740  Line 873 
873          # A hash is used to prevent duplicates.          # A hash is used to prevent duplicates.
874          my %roleHash = ();          my %roleHash = ();
875          for my $role ($fig->map_to_ecs($map)) {          for my $role ($fig->map_to_ecs($map)) {
876              if (! $roleHash{$role}) {                  if (exists $ecToRoles{$role} && ! $roleHash{$role}) {
877                  $loadRoleOccursIn->Put($role, $map);                      $loadRoleOccursIn->Put($ecToRoles{$role}, $map);
878                  $roleHash{$role} = 1;                  $roleHash{$role} = 1;
879              }              }
880          }          }
881      }      }
882            # Before we leave, we must create the Catalyzes table. We start with the reactions,
883            # then use the "ecToRoles" table to convert EC numbers to role IDs.
884            my @reactions = $fig->all_reactions();
885            for my $reactionID (@reactions) {
886                # Get this reaction's list of roles. The results will be EC numbers.
887                my @ecs = $fig->catalyzed_by($reactionID);
888                # Loop through the roles, creating catalyzation records.
889                for my $thisEC (@ecs) {
890                    if (exists $ecToRoles{$thisEC}) {
891                        for my $thisRole (@{$ecToRoles{$thisEC}}) {
892                            $loadCatalyzes->Put($thisRole, $reactionID);
893                        }
894                    }
895                }
896            }
897        }
898      # Finish the load.      # Finish the load.
899      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
900      return $retVal;      return $retVal;
# Line 787  Line 936 
936      my $fig = $self->{fig};      my $fig = $self->{fig};
937      # Get the genome hash.      # Get the genome hash.
938      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
939      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
940      my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500);      my $loadProperty = $self->_TableLoader('Property');
941      my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500);      my $loadHasProperty = $self->_TableLoader('HasProperty');
942      Trace("Beginning property data load.") if T(2);      if ($self->{options}->{loadOnly}) {
943            Trace("Loading from existing files.") if T(2);
944        } else {
945            Trace("Generating property data.") if T(2);
946      # Create a hash for storing property IDs.      # Create a hash for storing property IDs.
947      my %propertyKeys = ();      my %propertyKeys = ();
948      my $nextID = 1;      my $nextID = 1;
949            # Get the attributes we intend to store in the property table.
950            my $propKeys = $self->{propKeys};
951      # Loop through the genomes.      # Loop through the genomes.
952      for my $genomeID (keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
953          $loadProperty->Add("genomeIn");          $loadProperty->Add("genomeIn");
954          # Get the genome's features. The feature ID is the first field in the              Trace("Generating properties for $genomeID.") if T(3);
955          # tuples returned by "all_features_detailed". We use "all_features_detailed"              # Initialize a counter.
956          # rather than "all_features" because we want all features regardless of type.              my $propertyCount = 0;
957          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};              # Get the properties for this genome's features.
958          # Loop through the features, creating HasProperty records.              my @attributes = $fig->get_attributes("fig|$genomeID%", $propKeys);
959          for my $fid (@features) {              Trace("Property list built for $genomeID.") if T(3);
960              $loadProperty->Add("featureIn");              # Loop through the results, creating HasProperty records.
961              # Get all attributes for this feature. We do this one feature at a time              for my $attributeData (@attributes) {
962              # to insure we do not get any genome attributes.                  # Pull apart the attribute tuple.
963              my @attributeList = $fig->get_attributes($fid, '', '', '');                  my ($fid, $key, $value, $url) = @{$attributeData};
             # Loop through the attributes.  
             for my $tuple (@attributeList) {  
                 # Get this attribute value's data. Note that we throw away the FID,  
                 # since it will always be the same as the value if "$fid".  
                 my (undef, $key, $value, $url) = @{$tuple};  
964                  # Concatenate the key and value and check the "propertyKeys" hash to                  # Concatenate the key and value and check the "propertyKeys" hash to
965                  # see if we already have an ID for it. We use a tab for the separator                  # see if we already have an ID for it. We use a tab for the separator
966                  # character.                  # character.
# Line 830  Line 978 
978                  # Create the HasProperty entry for this feature/property association.                  # Create the HasProperty entry for this feature/property association.
979                  $loadHasProperty->Put($fid, $propertyID, $url);                  $loadHasProperty->Put($fid, $propertyID, $url);
980              }              }
981                # Update the statistics.
982                Trace("$propertyCount attributes processed.") if T(3);
983                $loadHasProperty->Add("propertiesIn", $propertyCount);
984          }          }
985      }      }
986      # Finish the load.      # Finish the load.
# Line 871  Line 1022 
1022      my $fig = $self->{fig};      my $fig = $self->{fig};
1023      # Get the genome hash.      # Get the genome hash.
1024      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1025      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1026      my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000);      my $loadAnnotation = $self->_TableLoader('Annotation');
1027      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation');
1028      my $loadSproutUser = $self->_TableLoader('SproutUser', 100);      my $loadSproutUser = $self->_TableLoader('SproutUser');
1029      my $loadUserAccess = $self->_TableLoader('UserAccess', 1000);      my $loadUserAccess = $self->_TableLoader('UserAccess');
1030      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation');
1031      Trace("Beginning annotation data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1032            Trace("Loading from existing files.") if T(2);
1033        } else {
1034            Trace("Generating annotation data.") if T(2);
1035      # Create a hash of user names. We'll use this to prevent us from generating duplicate      # Create a hash of user names. We'll use this to prevent us from generating duplicate
1036      # user records.      # user records.
1037      my %users = ( FIG => 1, master => 1 );      my %users = ( FIG => 1, master => 1 );
# Line 892  Line 1045 
1045      # Loop through the genomes.      # Loop through the genomes.
1046      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
1047          Trace("Processing $genomeID.") if T(3);          Trace("Processing $genomeID.") if T(3);
         # Get the genome's PEGs.  
         my @pegs = $fig->pegs_of($genomeID);  
         for my $peg (@pegs) {  
             Trace("Processing $peg.") if T(4);  
1048              # Create a hash of timestamps. We use this to prevent duplicate time stamps              # Create a hash of timestamps. We use this to prevent duplicate time stamps
1049              # from showing up for a single PEG's annotations.              # from showing up for a single PEG's annotations.
1050              my %seenTimestamps = ();              my %seenTimestamps = ();
1051              # Check for a functional assignment.              # Get the genome's annotations.
1052              my $func = $fig->function_of($peg);              my @annotations = $fig->read_all_annotations($genomeID);
1053              if ($func) {              Trace("Processing annotations.") if T(2);
1054                  # If this is NOT a hypothetical assignment, we create an              for my $tuple (@annotations) {
1055                  # assignment annotation for it.                  # Get the annotation tuple.
1056                  if (! FIG::hypo($peg)) {                  my ($peg, $timestamp, $user, $text) = @{$tuple};
                     # Note that we double the slashes so that what goes into the database is  
                     # a new-line escape sequence rather than an actual new-line.  
                     $loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func");  
                     $loadIsTargetOfAnnotation->Put($peg, "$peg:$time");  
                     $loadMadeAnnotation->Put("FIG", "$peg:$time");  
                     # Denote we've seen this timestamp.  
                     $seenTimestamps{$time} = 1;  
                 }  
                 # Now loop through the real annotations.  
                 for my $tuple ($fig->feature_annotations($peg, "raw")) {  
                     my ($fid, $timestamp, $user, $text) = @{$tuple};  
1057                      # Here we fix up the annotation text. "\r" is removed,                      # Here we fix up the annotation text. "\r" is removed,
1058                      # and "\t" and "\n" are escaped. Note we use the "s"                  # and "\t" and "\n" are escaped. Note we use the "gs"
1059                      # modifier so that new-lines inside the text do not                      # modifier so that new-lines inside the text do not
1060                      # stop the substitution search.                      # stop the substitution search.
1061                      $text =~ s/\r//gs;                      $text =~ s/\r//gs;
# Line 927  Line 1065 
1065                      $text =~ s/Set master function/Set FIG function/s;                      $text =~ s/Set master function/Set FIG function/s;
1066                      # Insure the time stamp is valid.                      # Insure the time stamp is valid.
1067                      if ($timestamp =~ /^\d+$/) {                      if ($timestamp =~ /^\d+$/) {
1068                          # Here it's a number. We need to insure it's unique.                      # Here it's a number. We need to insure the one we use to form
1069                          while ($seenTimestamps{$timestamp}) {                      # the key is unique.
1070                              $timestamp++;                      my $keyStamp = $timestamp;
1071                        while ($seenTimestamps{"$peg:$keyStamp"}) {
1072                            $keyStamp++;
1073                          }                          }
1074                          $seenTimestamps{$timestamp} = 1;                      my $annotationID = "$peg:$keyStamp";
1075                          my $annotationID = "$peg:$timestamp";                      $seenTimestamps{$annotationID} = 1;
1076                          # Insure the user exists.                          # Insure the user exists.
1077                          if (! $users{$user}) {                          if (! $users{$user}) {
1078                              $loadSproutUser->Put($user, "SEED user");                              $loadSproutUser->Put($user, "SEED user");
# Line 940  Line 1080 
1080                              $users{$user} = 1;                              $users{$user} = 1;
1081                          }                          }
1082                          # Generate the annotation.                          # Generate the annotation.
1083                          $loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text");                      $loadAnnotation->Put($annotationID, $timestamp, $text);
1084                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);
1085                          $loadMadeAnnotation->Put($user, $annotationID);                          $loadMadeAnnotation->Put($user, $annotationID);
1086                      } else {                      } else {
# Line 950  Line 1090 
1090                  }                  }
1091              }              }
1092          }          }
     }  
1093      # Finish the load.      # Finish the load.
1094      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1095      return $retVal;      return $retVal;
# Line 991  Line 1130 
1130      my $fig = $self->{fig};      my $fig = $self->{fig};
1131      # Get the genome hash.      # Get the genome hash.
1132      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1133      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1134      my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4);      my $loadComesFrom = $self->_TableLoader('ComesFrom');
1135      my $loadSource = $self->_TableLoader('Source', $genomeCount * 4);      my $loadSource = $self->_TableLoader('Source');
1136      my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8);      my $loadSourceURL = $self->_TableLoader('SourceURL');
1137      Trace("Beginning source data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1138            Trace("Loading from existing files.") if T(2);
1139        } else {
1140            Trace("Generating annotation data.") if T(2);
1141      # Create hashes to collect the Source information.      # Create hashes to collect the Source information.
1142      my %sourceURL = ();      my %sourceURL = ();
1143      my %sourceDesc = ();      my %sourceDesc = ();
# Line 1010  Line 1151 
1151              chomp $line;              chomp $line;
1152              my($sourceID, $desc, $url) = split(/\t/,$line);              my($sourceID, $desc, $url) = split(/\t/,$line);
1153              $loadComesFrom->Put($genomeID, $sourceID);              $loadComesFrom->Put($genomeID, $sourceID);
1154              if ($url && ! exists $sourceURL{$genomeID}) {                  if ($url && ! exists $sourceURL{$sourceID}) {
1155                  $loadSourceURL->Put($sourceID, $url);                  $loadSourceURL->Put($sourceID, $url);
1156                  $sourceURL{$sourceID} = 1;                  $sourceURL{$sourceID} = 1;
1157              }              }
1158              if ($desc && ! exists $sourceDesc{$sourceID}) {                  if ($desc) {
1159                  $loadSource->Put($sourceID, $desc);                      $sourceDesc{$sourceID} = $desc;
1160                  $sourceDesc{$sourceID} = 1;                  } elsif (! exists $sourceDesc{$sourceID}) {
1161                        $sourceDesc{$sourceID} = $sourceID;
1162              }              }
1163          }          }
1164          close TMP;          close TMP;
1165      }      }
1166            # Write the source descriptions.
1167            for my $sourceID (keys %sourceDesc) {
1168                $loadSource->Put($sourceID, $sourceDesc{$sourceID});
1169            }
1170        }
1171      # Finish the load.      # Finish the load.
1172      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1173      return $retVal;      return $retVal;
# Line 1060  Line 1207 
1207      my $fig = $self->{fig};      my $fig = $self->{fig};
1208      # Get the genome hash.      # Get the genome hash.
1209      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1210      # Convert the genome hash. We'll get the genus and species for each genome and make      # Convert the genome hash. We'll get the genus and species for each genome and make
1211      # it the key.      # it the key.
1212      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
1213      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1214      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000);      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
1215      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
1216      Trace("Beginning external data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1217            Trace("Loading from existing files.") if T(2);
1218        } else {
1219            Trace("Generating external data.") if T(2);
1220      # We loop through the files one at a time. First, the organism file.      # We loop through the files one at a time. First, the organism file.
1221      Open(\*ORGS, "<$FIG_Config::global/ext_org.table");          Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |");
1222      my $orgLine;      my $orgLine;
1223      while (defined($orgLine = <ORGS>)) {      while (defined($orgLine = <ORGS>)) {
1224          # Clean the input line.          # Clean the input line.
# Line 1081  Line 1230 
1230      close ORGS;      close ORGS;
1231      # Now the function file.      # Now the function file.
1232      my $funcLine;      my $funcLine;
1233      Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");          Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |");
1234      while (defined($funcLine = <FUNCS>)) {      while (defined($funcLine = <FUNCS>)) {
1235          # Clean the line ending.          # Clean the line ending.
1236          chomp $funcLine;          chomp $funcLine;
# Line 1097  Line 1246 
1246              $loadExternalAliasFunc->Put(@funcFields[0,1]);              $loadExternalAliasFunc->Put(@funcFields[0,1]);
1247          }          }
1248      }      }
1249        }
1250      # Finish the load.      # Finish the load.
1251      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1252      return $retVal;      return $retVal;
1253  }  }
1254    
 =head3 LoadGroupData  
1255    
1256  C<< my $stats = $spl->LoadGroupData(); >>  =head3 LoadReactionData
1257    
1258    C<< my $stats = $spl->LoadReactionData(); >>
1259    
1260    Load the reaction data from FIG into Sprout.
1261    
1262  Load the genome Groups into Sprout.  Reaction data connects reactions to the compounds that participate in them.
1263    
1264  The following relations are loaded by this method.  The following relations are loaded by this method.
1265    
1266      GenomeGroups      Reaction
1267        ReactionURL
1268        Compound
1269        CompoundName
1270        CompoundCAS
1271        IsIdentifiedByCAS
1272        HasCompoundName
1273        IsAComponentOf
1274    
1275  There is no direct support for genome groups in FIG, so we access the SEED  This method proceeds reaction by reaction rather than genome by genome.
1276  files directly.  
1277    =over 4
1278    
1279    =item RETURNS
1280    
1281    Returns a statistics object for the loads.
1282    
1283    =back
1284    
1285    =cut
1286    #: Return Type $%;
1287    sub LoadReactionData {
1288        # Get this object instance.
1289        my ($self) = @_;
1290        # Get the FIG object.
1291        my $fig = $self->{fig};
1292        # Create load objects for each of the tables we're loading.
1293        my $loadReaction = $self->_TableLoader('Reaction');
1294        my $loadReactionURL = $self->_TableLoader('ReactionURL');
1295        my $loadCompound = $self->_TableLoader('Compound');
1296        my $loadCompoundName = $self->_TableLoader('CompoundName');
1297        my $loadCompoundCAS = $self->_TableLoader('CompoundCAS');
1298        my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf');
1299        my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS');
1300        my $loadHasCompoundName = $self->_TableLoader('HasCompoundName');
1301        if ($self->{options}->{loadOnly}) {
1302            Trace("Loading from existing files.") if T(2);
1303        } else {
1304            Trace("Generating reaction data.") if T(2);
1305            # We need some hashes to prevent duplicates.
1306            my %compoundNames = ();
1307            my %compoundCASes = ();
1308            # First we create the compounds.
1309            my @compounds = $fig->all_compounds();
1310            for my $cid (@compounds) {
1311                # Check for names.
1312                my @names = $fig->names_of_compound($cid);
1313                # Each name will be given a priority number, starting with 1.
1314                my $prio = 1;
1315                for my $name (@names) {
1316                    if (! exists $compoundNames{$name}) {
1317                        $loadCompoundName->Put($name);
1318                        $compoundNames{$name} = 1;
1319                    }
1320                    $loadHasCompoundName->Put($cid, $name, $prio++);
1321                }
1322                # Create the main compound record. Note that the first name
1323                # becomes the label.
1324                my $label = (@names > 0 ? $names[0] : $cid);
1325                $loadCompound->Put($cid, $label);
1326                # Check for a CAS ID.
1327                my $cas = $fig->cas($cid);
1328                if ($cas) {
1329                    $loadIsIdentifiedByCAS->Put($cid, $cas);
1330                    if (! exists $compoundCASes{$cas}) {
1331                        $loadCompoundCAS->Put($cas);
1332                        $compoundCASes{$cas} = 1;
1333                    }
1334                }
1335            }
1336            # All the compounds are set up, so we need to loop through the reactions next. First,
1337            # we initialize the discriminator index. This is a single integer used to insure
1338            # duplicate elements in a reaction are not accidentally collapsed.
1339            my $discrim = 0;
1340            my @reactions = $fig->all_reactions();
1341            for my $reactionID (@reactions) {
1342                # Create the reaction record.
1343                $loadReaction->Put($reactionID, $fig->reversible($reactionID));
1344                # Compute the reaction's URL.
1345                my $url = HTML::reaction_link($reactionID);
1346                # Put it in the ReactionURL table.
1347                $loadReactionURL->Put($reactionID, $url);
1348                # Now we need all of the reaction's compounds. We get these in two phases,
1349                # substrates first and then products.
1350                for my $product (0, 1) {
1351                    # Get the compounds of the current type for the current reaction. FIG will
1352                    # give us 3-tuples: [ID, stoichiometry, main-flag]. At this time we do not
1353                    # have location data in SEED, so it defaults to the empty string.
1354                    my @compounds = $fig->reaction2comp($reactionID, $product);
1355                    for my $compData (@compounds) {
1356                        # Extract the compound data from the current tuple.
1357                        my ($cid, $stoich, $main) = @{$compData};
1358                        # Link the compound to the reaction.
1359                        $loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main,
1360                                                 $product, $stoich);
1361                    }
1362                }
1363            }
1364        }
1365        # Finish the load.
1366        my $retVal = $self->_FinishAll();
1367        return $retVal;
1368    }
1369    
1370    =head3 LoadSynonymData
1371    
1372    C<< my $stats = $spl->LoadSynonymData(); >>
1373    
1374    Load the synonym groups into Sprout.
1375    
1376    The following relations are loaded by this method.
1377    
1378        SynonymGroup
1379        IsSynonymGroupFor
1380    
1381    The source information for these relations is taken from the C<maps_to_id> method
1382    of the B<FIG> object. Unfortunately, to make this work, we need to use direct
1383    SQL against the FIG database.
1384    
1385  =over 4  =over 4
1386    
# Line 1125  Line 1392 
1392    
1393  =cut  =cut
1394  #: Return Type $%;  #: Return Type $%;
1395  sub LoadGroupData {  sub LoadSynonymData {
1396      # Get this object instance.      # Get this object instance.
1397      my ($self) = @_;      my ($self) = @_;
1398      # Get the FIG object.      # Get the FIG object.
1399      my $fig = $self->{fig};      my $fig = $self->{fig};
1400      # Get the genome hash.      # Get the genome hash.
1401      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1402      # Create a load object for the table we're loading.      # Create a load object for the table we're loading.
1403      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4);      my $loadSynonymGroup = $self->_TableLoader('SynonymGroup');
1404      Trace("Beginning group data load.") if T(2);      my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor');
1405        if ($self->{options}->{loadOnly}) {
1406            Trace("Loading from existing files.") if T(2);
1407        } else {
1408            Trace("Generating synonym group data.") if T(2);
1409            # Get the database handle.
1410            my $dbh = $fig->db_handle();
1411            # Ask for the synonyms. Note that "maps_to" is a group name, and "syn_id" is a PEG ID or alias.
1412            my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
1413            my $result = $sth->execute();
1414            if (! defined($result)) {
1415                Confess("Database error in Synonym load: " . $sth->errstr());
1416            } else {
1417                # Remember the current synonym.
1418                my $current_syn = "";
1419                # Count the features.
1420                my $featureCount = 0;
1421                # Loop through the synonym/peg pairs.
1422                while (my @row = $sth->fetchrow()) {
1423                    # Get the synonym group ID and feature ID.
1424                    my ($syn_id, $peg) = @row;
1425                    # Insure it's for one of our genomes.
1426                    my $genomeID = FIG::genome_of($peg);
1427                    if (exists $genomeHash->{$genomeID}) {
1428                        # Verify the synonym.
1429                        if ($syn_id ne $current_syn) {
1430                            # It's new, so put it in the group table.
1431                            $loadSynonymGroup->Put($syn_id);
1432                            $current_syn = $syn_id;
1433                        }
1434                        # Connect the synonym to the peg.
1435                        $loadIsSynonymGroupFor->Put($syn_id, $peg);
1436                        # Count this feature.
1437                        $featureCount++;
1438                        if ($featureCount % 1000 == 0) {
1439                            Trace("$featureCount features processed.") if T(3);
1440                        }
1441                    }
1442                }
1443            }
1444        }
1445        # Finish the load.
1446        my $retVal = $self->_FinishAll();
1447        return $retVal;
1448    }
1449    
1450    =head3 LoadFamilyData
1451    
1452    C<< my $stats = $spl->LoadFamilyData(); >>
1453    
1454    Load the protein families into Sprout.
1455    
1456    The following relations are loaded by this method.
1457    
1458        Family
1459        IsFamilyForFeature
1460    
1461    The source information for these relations is taken from the C<families_for_protein>,
1462    C<family_function>, and C<sz_family> methods of the B<FIG> object.
1463    
1464    =over 4
1465    
1466    =item RETURNS
1467    
1468    Returns a statistics object for the loads.
1469    
1470    =back
1471    
1472    =cut
1473    #: Return Type $%;
1474    sub LoadFamilyData {
1475        # Get this object instance.
1476        my ($self) = @_;
1477        # Get the FIG object.
1478        my $fig = $self->{fig};
1479        # Get the genome hash.
1480        my $genomeHash = $self->{genomes};
1481        # Create load objects for the tables we're loading.
1482        my $loadFamily = $self->_TableLoader('Family');
1483        my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature');
1484        if ($self->{options}->{loadOnly}) {
1485            Trace("Loading from existing files.") if T(2);
1486        } else {
1487            Trace("Generating family data.") if T(2);
1488            # Create a hash for the family IDs.
1489            my %familyHash = ();
1490      # Loop through the genomes.      # Loop through the genomes.
1491      my $line;          for my $genomeID (sort keys %{$genomeHash}) {
1492      for my $genomeID (keys %{$genomeHash}) {              Trace("Processing features for $genomeID.") if T(2);
1493          Trace("Processing $genomeID.") if T(3);              # Loop through this genome's PEGs.
1494          # Open the NMPDR group file for this genome.              for my $fid ($fig->all_features($genomeID, "peg")) {
1495          if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&                  $loadIsFamilyForFeature->Add("features", 1);
1496              defined($line = <TMP>)) {                  # Get this feature's families.
1497              # Clean the line ending.                  my @families = $fig->families_for_protein($fid);
1498              chomp $line;                  # Loop through the families, connecting them to the feature.
1499              # Add the group to the table. Note that there can only be one group                  for my $family (@families) {
1500              # per genome.                      $loadIsFamilyForFeature->Put($family, $fid);
1501              $loadGenomeGroups->Put($genomeID, $line);                      # If this is a new family, create a record for it.
1502                        if (! exists $familyHash{$family}) {
1503                            $familyHash{$family} = 1;
1504                            $loadFamily->Add("families", 1);
1505                            my $size = $fig->sz_family($family);
1506                            my $func = $fig->family_function($family);
1507                            $loadFamily->Put($family, $size, $func);
1508                        }
1509                    }
1510                }
1511            }
1512        }
1513        # Finish the load.
1514        my $retVal = $self->_FinishAll();
1515        return $retVal;
1516    }
1517    
1518    =head3 LoadDrugData
1519    
1520    C<< my $stats = $spl->LoadDrugData(); >>
1521    
1522    Load the drug target data into Sprout.
1523    
1524    The following relations are loaded by this method.
1525    
1526        PDB
1527        DocksWith
1528        IsProteinForFeature
1529        Ligand
1530    
1531    The source information for these relations is taken from attributes. The
1532    C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>.
1533    The C<zinc_name> attribute describes the ligands. The C<docking_results>
1534    attribute contains the information for the B<DocksWith> relationship. It is
1535    expected that additional attributes and tables will be added in the future.
1536    
1537    =over 4
1538    
1539    =item RETURNS
1540    
1541    Returns a statistics object for the loads.
1542    
1543    =back
1544    
1545    =cut
1546    #: Return Type $%;
1547    sub LoadDrugData {
1548        # Get this object instance.
1549        my ($self) = @_;
1550        # Get the FIG object.
1551        my $fig = $self->{fig};
1552        # Get the genome hash.
1553        my $genomeHash = $self->{genomes};
1554        # Create load objects for the tables we're loading.
1555        my $loadPDB = $self->_TableLoader('PDB');
1556        my $loadLigand = $self->_TableLoader('Ligand');
1557        my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature');
1558        my $loadDocksWith = $self->_TableLoader('DocksWith');
1559        if ($self->{options}->{loadOnly}) {
1560            Trace("Loading from existing files.") if T(2);
1561        } else {
1562            Trace("Generating drug target data.") if T(2);
1563            # First comes the "DocksWith" relationship. This will give us a list of PDBs.
1564            # We can also encounter PDBs when we process "IsProteinForFeature". To manage
1565            # this process, PDB information is collected in a hash table and then
1566            # unspooled after both relationships are created.
1567            my %pdbHash = ();
1568            Trace("Generating docking data.") if T(2);
1569            # Get all the docking data. This may cause problems if there are too many PDBs,
1570            # at which point we'll need another algorithm. The indicator that this is
1571            # happening will be a timeout error in the next statement.
1572            my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
1573                                                  ['docking_results', $FIG_Config::dockLimit]);
1574            Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
1575            for my $dockData (@dockData) {
1576                # Get the docking data components.
1577                my ($pdbID, $docking_key, @valueData) = @{$dockData};
1578                # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
1579                $pdbID = lc $pdbID;
1580                # Strip off the object type.
1581                $pdbID =~ s/pdb://;
1582                # Extract the ZINC ID from the docking key. Note that there are two possible
1583                # formats.
1584                my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/;
1585                if (! $zinc_id) {
1586                    Trace("Invalid docking result key $docking_key for $pdbID.") if T(0);
1587                    $loadDocksWith->Add("errors");
1588                } else {
1589                    # Get the pieces of the value and parse the energy.
1590                    # Note that we don't care about the rank, since
1591                    # we can sort on the energy level itself in our database.
1592                    my ($energy, $tool, $type) = @valueData;
1593                    my ($rank, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
1594                    # Ignore predicted results.
1595                    if ($type ne "Predicted") {
1596                        # Count this docking result.
1597                        if (! exists $pdbHash{$pdbID}) {
1598                            $pdbHash{$pdbID} = 1;
1599                        } else {
1600                            $pdbHash{$pdbID}++;
1601                        }
1602                        # Write the result to the output.
1603                        $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool,
1604                                            $total, $vanderwaals);
1605                    }
1606                }
1607            }
1608            Trace("Connecting features.") if T(2);
1609            # Loop through the genomes.
1610            for my $genome (sort keys %{$genomeHash}) {
1611                Trace("Generating PDBs for $genome.") if T(3);
1612                # Get all of the PDBs that BLAST against this genome's features.
1613                my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%');
1614                for my $pdbData (@attributeData) {
1615                    # The PDB ID is coded as a subkey.
1616                    if ($pdbData->[1] !~ /PDB::(.+)/i) {
1617                        Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0);
1618                        $loadPDB->Add("errors");
1619                    } else {
1620                        my $pdbID = $1;
1621                        # Insure the PDB is in the hash.
1622                        if (! exists $pdbHash{$pdbID}) {
1623                            $pdbHash{$pdbID} = 0;
1624                        }
1625                        # The score and locations are coded in the attribute value.
1626                        if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
1627                            Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0);
1628                            $loadIsProteinForFeature->Add("errors");
1629                        } else {
1630                            my ($score, $locData) = ($1,$2);
1631                            # The location data may not be present, so we have to start with some
1632                            # defaults and then check.
1633                            my ($start, $end) = (1, 0);
1634                            if ($locData) {
1635                                $locData =~ /(\d+)-(\d+)/;
1636                                $start = $1;
1637                                $end = $2;
1638                            }
1639                            # If we still don't have the end location, compute it from
1640                            # the feature length.
1641                            if (! $end) {
1642                                # Most features have one location, but we do a list iteration
1643                                # just in case.
1644                                my @locations = $fig->feature_location($pdbData->[0]);
1645                                $end = 0;
1646                                for my $loc (@locations) {
1647                                    my $locObject = BasicLocation->new($loc);
1648                                    $end += $locObject->Length;
1649                                }
1650                            }
1651                            # Decode the score.
1652                            my $realScore = FIGRules::DecodeScore($score);
1653                            # Connect the PDB to the feature.
1654                            $loadIsProteinForFeature->Put($pdbData->[0], $pdbID, $start, $realScore, $end);
1655                        }
1656                    }
1657                }
1658            }
1659            # We've got all our PDBs now, so we unspool them from the hash.
1660            Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2);
1661            my $count = 0;
1662            for my $pdbID (sort keys %pdbHash) {
1663                $loadPDB->Put($pdbID, $pdbHash{$pdbID});
1664                $count++;
1665                Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0);
1666            }
1667            # Finally we create the ligand table. This information can be found in the
1668            # zinc_name attribute.
1669            Trace("Loading ligands.") if T(2);
1670            # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
1671            my $last_zinc_id = "";
1672            my $zinc_id = "";
1673            my $done = 0;
1674            while (! $done) {
1675                # Get the next 10000 ligands. We insist that the object ID is greater than
1676                # the last ID we processed.
1677                Trace("Loading batch starting with ZINC:$zinc_id.") if T(3);
1678                my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
1679                                                           ["ZINC:$zinc_id", "zinc_name"]);
1680                Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
1681                if (! @attributeData) {
1682                    # Here there are no attributes left, so we quit the loop.
1683                    $done = 1;
1684                } else {
1685                    # Process the attribute data we've received.
1686                    for my $zinc_data (@attributeData) {
1687                        # The ZINC ID is found in the first return column, prefixed with the word ZINC.
1688                        if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
1689                            $zinc_id = $1;
1690                            # Check for a duplicate.
1691                            if ($zinc_id eq $last_zinc_id) {
1692                                $loadLigand->Add("duplicate");
1693                            } else {
1694                                # Here it's safe to output the ligand. The ligand name is the attribute value
1695                                # (third column in the row).
1696                                $loadLigand->Put($zinc_id, $zinc_data->[2]);
1697                                # Insure we don't try to add this ID again.
1698                                $last_zinc_id = $zinc_id;
1699                            }
1700                        } else {
1701                            Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
1702                            $loadLigand->Add("errors");
1703                        }
1704          }          }
1705          close TMP;              }
1706            }
1707            Trace("Ligands loaded.") if T(2);
1708      }      }
1709      # Finish the load.      # Finish the load.
1710      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1711      return $retVal;      return $retVal;
1712  }  }
1713    
1714    
1715  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1716    
1717    =head3 SpecialAttribute
1718    
1719    C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >>
1720    
1721    Look for special attributes of a given type. A special attribute is found by comparing one of
1722    the columns of the incoming attribute list to a search pattern. If a match is found, then
1723    a set of columns is put into an output table connected to the specified ID.
1724    
1725    For example, when processing features, the attribute list we look at has three columns: attribute
1726    name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
1727    begins with C<iedb_>. The call signature is therefore
1728    
1729        my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB);
1730    
1731    The pattern is matched against column 0, and if we have a match, then column 2's value is put
1732    to the output along with the specified feature ID.
1733    
1734    =over 4
1735    
1736    =item id
1737    
1738    ID of the object whose special attributes are being loaded. This forms the first column of the
1739    output.
1740    
1741    =item attributes
1742    
1743    Reference to a list of tuples.
1744    
1745    =item idxMatch
1746    
1747    Index in each tuple of the column to be matched against the pattern. If the match is
1748    successful, an output record will be generated.
1749    
1750    =item idxValues
1751    
1752    Reference to a list containing the indexes in each tuple of the columns to be put as
1753    the second column of the output.
1754    
1755    =item pattern
1756    
1757    Pattern to be matched against the specified column. The match will be case-insensitive.
1758    
1759    =item loader
1760    
1761    An object to which each output record will be put. Usually this is an B<ERDBLoad> object,
1762    but technically it could be anything with a C<Put> method.
1763    
1764    =item RETURN
1765    
1766    Returns a count of the matches found.
1767    
1768    =item
1769    
1770    =back
1771    
1772    =cut
1773    
1774    sub SpecialAttribute {
1775        # Get the parameters.
1776        my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_;
1777        # Declare the return variable.
1778        my $retVal = 0;
1779        # Loop through the attribute rows.
1780        for my $row (@{$attributes}) {
1781            # Check for a match.
1782            if ($row->[$idxMatch] =~ m/$pattern/i) {
1783                # We have a match, so output a row. This is a bit tricky, since we may
1784                # be putting out multiple columns of data from the input.
1785                my $value = join(" ", map { $row->[$_] } @{$idxValues});
1786                $loader->Put($id, $value);
1787                $retVal++;
1788            }
1789        }
1790        Trace("$retVal special attributes found for $id and loader " . $loader->RelName() . ".") if T(4) && $retVal;
1791        # Return the number of matches.
1792        return $retVal;
1793    }
1794    
1795  =head3 TableLoader  =head3 TableLoader
1796    
1797  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 1172  Line 1806 
1806    
1807  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
1808    
 =item rowCount (optional)  
   
 Estimated maximum number of rows in the table.  
   
1809  =item RETURN  =item RETURN
1810    
1811  Returns an ERDBLoad object for loading the specified table.  Returns an ERDBLoad object for loading the specified table.
# Line 1186  Line 1816 
1816    
1817  sub _TableLoader {  sub _TableLoader {
1818      # Get the parameters.      # Get the parameters.
1819      my ($self, $tableName, $rowCount) = @_;      my ($self, $tableName) = @_;
1820      # Create the load object.      # Create the load object.
1821      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount);      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly);
1822      # Cache it in the loader list.      # Cache it in the loader list.
1823      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
1824      # Return it to the caller.      # Return it to the caller.
# Line 1222  Line 1852 
1852      my $retVal = Stats->new();      my $retVal = Stats->new();
1853      # Get the loader list.      # Get the loader list.
1854      my $loadList = $self->{loaders};      my $loadList = $self->{loaders};
1855        # Create a hash to hold the statistics objects, keyed on relation name.
1856        my %loaderHash = ();
1857      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
1858      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads more restartable.
1859      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
1860            # Get the relation name.
1861            my $relName = $loader->RelName;
1862            # Check the ignore flag.
1863            if ($loader->Ignore) {
1864                Trace("Relation $relName not loaded.") if T(2);
1865            } else {
1866                # Here we really need to finish.
1867                Trace("Finishing $relName.") if T(2);
1868          my $stats = $loader->Finish();          my $stats = $loader->Finish();
1869                $loaderHash{$relName} = $stats;
1870            }
1871        }
1872        # Now we loop through again, actually loading the tables. We want to finish before
1873        # loading so that if something goes wrong at this point, all the load files are usable
1874        # and we don't have to redo all that work.
1875        for my $relName (sort keys %loaderHash) {
1876            # Get the statistics for this relation.
1877            my $stats = $loaderHash{$relName};
1878            # Check for a database load.
1879            if ($self->{options}->{dbLoad}) {
1880                # Here we want to use the load file just created to load the database.
1881                Trace("Loading relation $relName.") if T(2);
1882                my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]);
1883                # Accumulate the statistics from the DB load.
1884                $stats->Accumulate($newStats);
1885            }
1886          $retVal->Accumulate($stats);          $retVal->Accumulate($stats);
         my $relName = $loader->RelName;  
1887          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
1888      }      }
1889      # Return the load statistics.      # Return the load statistics.
1890      return $retVal;      return $retVal;
1891  }  }
1892    
1893    =head3 GetGenomeAttributes
1894    
1895    C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys); >>
1896    
1897    Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
1898    attributes for all the features of a genome in a single call, then organizes them into
1899    a hash.
1900    
1901    =over 4
1902    
1903    =item fig
1904    
1905    FIG-like object for accessing attributes.
1906    
1907    =item genomeID
1908    
1909    ID of the genome who's attributes are desired.
1910    
1911    =item fids
1912    
1913    Reference to a list of the feature IDs whose attributes are to be kept.
1914    
1915    =item propKeys
1916    
1917    A list of the keys to retrieve.
1918    
1919    =item RETURN
1920    
1921    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
1922    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
1923    the attribute key, and one or more attribute values.
1924    
1925    =back
1926    
1927    =cut
1928    
1929    sub GetGenomeAttributes {
1930        # Get the parameters.
1931        my ($fig, $genomeID, $fids, $propKeys) = @_;
1932        # Declare the return variable.
1933        my $retVal = {};
1934        # Initialize the hash. This not only enables us to easily determine which FIDs to
1935        # keep, it insures that the caller sees a list reference for every known fid,
1936        # simplifying the logic.
1937        for my $fid (@{$fids}) {
1938            $retVal->{$fid} = [];
1939        }
1940        # Get the attributes. If ev_code_cron is running, we may get a timeout error, so
1941        # an eval is used.
1942        my @aList = ();
1943        eval {
1944            @aList = $fig->get_attributes("fig|$genomeID%", $propKeys);
1945            Trace(scalar(@aList) . " attributes returned for genome $genomeID.") if T(3);
1946        };
1947        # Check for a problem.
1948        if ($@) {
1949            Trace("Retrying attributes for $genomeID due to error: $@") if T(1);
1950            # Our fallback plan is to process the attributes in blocks of 100. This is much slower,
1951            # but allows us to continue processing.
1952            my $nFids = scalar @{$fids};
1953            for (my $i = 0; $i < $nFids; $i += 100) {
1954                # Determine the index of the last feature ID we'll be specifying on this pass.
1955                # Normally it's $i + 99, but if we're close to the end it may be less.
1956                my $end = ($i + 100 > $nFids ? $nFids - 1 : $i + 99);
1957                # Get a slice of the fid list.
1958                my @slice = @{$fids}[$i .. $end];
1959                # Get the relevant attributes.
1960                Trace("Retrieving attributes for fids $i to $end.") if T(3);
1961                my @aShort = $fig->get_attributes(\@slice, $propKeys);
1962                Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(3);
1963                push @aList, @aShort;
1964            }
1965        }
1966        # Now we should have all the interesting attributes in @aList. Populate the hash with
1967        # them.
1968        for my $aListEntry (@aList) {
1969            my $fid = $aListEntry->[0];
1970            if (exists $retVal->{$fid}) {
1971                push @{$retVal->{$fid}}, $aListEntry;
1972            }
1973        }
1974        # Return the result.
1975        return $retVal;
1976    }
1977    
1978    
1979  1;  1;

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.86

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3