[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.7, Tue Sep 13 19:05:20 2005 UTC revision 1.95, Sat Sep 20 14:33:28 2008 UTC
# Line 7  Line 7 
7      use PageBuilder;      use PageBuilder;
8      use ERDBLoad;      use ERDBLoad;
9      use FIG;      use FIG;
10        use FIGRules;
11      use Sprout;      use Sprout;
12      use Stats;      use Stats;
13      use BasicLocation;      use BasicLocation;
14        use HTML;
15        use AliasAnalysis;
16        use BioWords;
17    
18  =head1 Sprout Load Methods  =head1 Sprout Load Methods
19    
# Line 29  Line 33 
33      $stats->Accumulate($spl->LoadFeatureData());      $stats->Accumulate($spl->LoadFeatureData());
34      print $stats->Show();      print $stats->Show();
35    
 This module makes use of the internal Sprout property C<_erdb>.  
   
36  It is worth noting that the FIG object does not need to be a real one. Any object  It is worth noting that the FIG object does not need to be a real one. Any object
37  that implements the FIG methods for data retrieval could be used. So, for example,  that implements the FIG methods for data retrieval could be used. So, for example,
38  this object could be used to copy data from one Sprout database to another, or  this object could be used to copy data from one Sprout database to another, or
# Line 51  Line 53 
53    
54  =head3 new  =head3 new
55    
56  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile); >>      my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options);
57    
58  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
59  the name of the files containing the list of genomes and subsystems to use.  the name of the files containing the list of genomes and subsystems to use.
# Line 79  Line 81 
81  =item subsysFile  =item subsysFile
82    
83  Either the name of the file containing the list of trusted subsystems or a reference  Either the name of the file containing the list of trusted subsystems or a reference
84  to a list of subsystem names. If nothing is specified, all known subsystems will be  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
85  considered trusted. Only subsystem data related to the trusted subsystems is loaded.  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
86    in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded.
87    
88    =item options
89    
90    Reference to a hash of command-line options.
91    
92  =back  =back
93    
# Line 88  Line 95 
95    
96  sub new {  sub new {
97      # Get the parameters.      # Get the parameters.
98      my ($class, $sprout, $fig, $genomeFile, $subsysFile) = @_;      my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
99      # Load the list of genomes into a hash.      # Create the genome hash.
100      my %genomes;      my %genomes = ();
101        # We only need it if load-only is NOT specified.
102        if (! $options->{loadOnly}) {
103      if (! defined($genomeFile) || $genomeFile eq '') {      if (! defined($genomeFile) || $genomeFile eq '') {
104          # Here we want all the complete genomes and an access code of 1.          # Here we want all the complete genomes and an access code of 1.
105          my @genomeList = $fig->genomes(1);          my @genomeList = $fig->genomes(1);
106          %genomes = map { $_ => 1 } @genomeList;          %genomes = map { $_ => 1 } @genomeList;
107                Trace(scalar(keys %genomes) . " genomes found.") if T(3);
108      } else {      } else {
109          my $type = ref $genomeFile;          my $type = ref $genomeFile;
110          Trace("Genome file parameter type is \"$type\".") if T(3);          Trace("Genome file parameter type is \"$type\".") if T(3);
# Line 114  Line 124 
124                  # an omitted access code can be defaulted to 1.                  # an omitted access code can be defaulted to 1.
125                  for my $genomeLine (@genomeList) {                  for my $genomeLine (@genomeList) {
126                      my ($genomeID, $accessCode) = split("\t", $genomeLine);                      my ($genomeID, $accessCode) = split("\t", $genomeLine);
127                      if (undef $accessCode) {                          if (! defined($accessCode)) {
128                          $accessCode = 1;                          $accessCode = 1;
129                      }                      }
130                      $genomes{$genomeID} = $accessCode;                      $genomes{$genomeID} = $accessCode;
# Line 124  Line 134 
134              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
135          }          }
136      }      }
137        }
138      # Load the list of trusted subsystems.      # Load the list of trusted subsystems.
139      my %subsystems = ();      my %subsystems = ();
140        # We only need it if load-only is NOT specified.
141        if (! $options->{loadOnly}) {
142      if (! defined $subsysFile || $subsysFile eq '') {      if (! defined $subsysFile || $subsysFile eq '') {
143          # Here we want all the subsystems.              # Here we want all the usable subsystems. First we get the whole list.
144          %subsystems = map { $_ => 1 } $fig->all_subsystems();              my @subs = $fig->all_subsystems();
145                # Loop through, checking for the NMPDR file.
146                for my $sub (@subs) {
147                    if ($fig->nmpdr_subsystem($sub)) {
148                        $subsystems{$sub} = 1;
149                    }
150                }
151      } else {      } else {
152          my $type = ref $subsysFile;          my $type = ref $subsysFile;
153          if ($type eq 'ARRAY') {          if ($type eq 'ARRAY') {
# Line 148  Line 167 
167              Confess("Invalid subsystem parameter in SproutLoad constructor.");              Confess("Invalid subsystem parameter in SproutLoad constructor.");
168          }          }
169      }      }
170            # Go through the subsys hash again, creating the keyword list for each subsystem.
171            for my $subsystem (keys %subsystems) {
172                my $name = $subsystem;
173                $name =~ s/_/ /g;
174                $subsystems{$subsystem} = $name;
175            }
176        }
177        # Get the list of NMPDR-oriented attribute keys.
178        my @propKeys = $fig->get_group_keys("NMPDR");
179      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
180      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
181      # Create the Sprout load object.      # Create the Sprout load object.
# Line 157  Line 185 
185                    subsystems => \%subsystems,                    subsystems => \%subsystems,
186                    sprout => $sprout,                    sprout => $sprout,
187                    loadDirectory => $directory,                    loadDirectory => $directory,
188                    erdb => $sprout->{_erdb},                    erdb => $sprout,
189                    loaders => []                    loaders => [],
190                      options => $options,
191                      propKeys => \@propKeys,
192                   };                   };
193      # Bless and return it.      # Bless and return it.
194      bless $retVal, $class;      bless $retVal, $class;
195      return $retVal;      return $retVal;
196  }  }
197    
198    =head3 LoadOnly
199    
200        my $flag = $spl->LoadOnly;
201    
202    Return TRUE if we are in load-only mode, else FALSE.
203    
204    =cut
205    
206    sub LoadOnly {
207        my ($self) = @_;
208        return $self->{options}->{loadOnly};
209    }
210    
211    
212  =head3 LoadGenomeData  =head3 LoadGenomeData
213    
214  C<< my $stats = $spl->LoadGenomeData(); >>      my $stats = $spl->LoadGenomeData();
215    
216  Load the Genome, Contig, and Sequence data from FIG into Sprout.  Load the Genome, Contig, and Sequence data from FIG into Sprout.
217    
# Line 192  Line 236 
236    
237  =back  =back
238    
 B<TO DO>  
   
 Real quality vectors instead of C<unknown> for everything.  
   
 GenomeGroup relation. (The original script took group information from the C<NMPDR> file  
 in each genome's main directory, but no such file exists anywhere in my version of the  
 data store.)  
   
239  =cut  =cut
240  #: Return Type $%;  #: Return Type $%;
241  sub LoadGenomeData {  sub LoadGenomeData {
# Line 210  Line 246 
246      # Get the genome count.      # Get the genome count.
247      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
248      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
     Trace("Beginning genome data load.") if T(2);  
249      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
250      my $loadGenome = $self->_TableLoader('Genome', $genomeCount);      my $loadGenome = $self->_TableLoader('Genome');
251      my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300);      my $loadHasContig = $self->_TableLoader('HasContig');
252      my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300);      my $loadContig = $self->_TableLoader('Contig');
253      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf');
254      my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000);      my $loadSequence = $self->_TableLoader('Sequence');
255        if ($self->{options}->{loadOnly}) {
256            Trace("Loading from existing files.") if T(2);
257        } else {
258            Trace("Generating genome data.") if T(2);
259            # Get the full info for the FIG genomes.
260            my %genomeInfo = map { $_->[0] => { gname => $_->[1], szdna => $_->[2], maindomain => $_->[3],
261                                                pegs => $_->[4], rnas => $_->[5], complete => $_->[6] } } @{$fig->genome_info()};
262      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
263      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
264          Trace("Loading data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
265          $loadGenome->Add("genomeIn");          $loadGenome->Add("genomeIn");
266          # The access code comes in via the genome hash.          # The access code comes in via the genome hash.
267          my $accessCode = $genomeHash->{$genomeID};          my $accessCode = $genomeHash->{$genomeID};
268          # Get the genus, species, and strain from the scientific name. Note that we append              # Get the genus, species, and strain from the scientific name.
         # the genome ID to the strain. In some cases this is the totality of the strain name.  
269          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);
270          my $extra = join " ", @extraData, "[$genomeID]";              my $extra = join " ", @extraData;
271          # Get the full taxonomy.          # Get the full taxonomy.
272          my $taxonomy = $fig->taxonomy_of($genomeID);          my $taxonomy = $fig->taxonomy_of($genomeID);
273                # Get the version. If no version is specified, we default to the genome ID by itself.
274                my $version = $fig->genome_version($genomeID);
275                if (! defined($version)) {
276                    $version = $genomeID;
277                }
278                # Get the DNA size.
279                my $dnaSize = $fig->genome_szdna($genomeID);
280                # Open the NMPDR group file for this genome.
281                my $group;
282                if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
283                    defined($group = <TMP>)) {
284                    # Clean the line ending.
285                    chomp $group;
286                } else {
287                    # No group, so use the default.
288                    $group = $FIG_Config::otherGroup;
289                }
290                close TMP;
291                # Get the contigs.
292                my @contigs = $fig->all_contigs($genomeID);
293                # Get this genome's info array.
294                my $info = $genomeInfo{$genomeID};
295          # Output the genome record.          # Output the genome record.
296          $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $info->{complete}, scalar(@contigs),
297                           $species, $extra, $taxonomy);                               $dnaSize, $genus, $info->{pegs}, $group, $info->{rnas}, $species, $extra, $version, $taxonomy);
298          # Now we loop through each of the genome's contigs.          # Now we loop through each of the genome's contigs.
         my @contigs = $fig->all_contigs($genomeID);  
299          for my $contigID (@contigs) {          for my $contigID (@contigs) {
300              Trace("Processing contig $contigID for $genomeID.") if T(4);              Trace("Processing contig $contigID for $genomeID.") if T(4);
301              $loadContig->Add("contigIn");              $loadContig->Add("contigIn");
# Line 262  Line 324 
324              }              }
325          }          }
326      }      }
327        }
328      # Finish the loads.      # Finish the loads.
329      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
330      # Return the result.      # Return the result.
331      return $retVal;      return $retVal;
332  }  }
333    
 =head3 LoadCouplingData  
   
 C<< my $stats = $spl->LoadCouplingData(); >>  
   
 Load the coupling and evidence data from FIG into Sprout.  
   
 The coupling data specifies which genome features are functionally coupled. The  
 evidence data explains why the coupling is functional.  
   
 The following relations are loaded by this method.  
   
     Coupling  
     IsEvidencedBy  
     PCH  
     ParticipatesInCoupling  
     UsesAsEvidence  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadCouplingData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the genome hash.  
     my $genomeFilter = $self->{genomes};  
     my $genomeCount = (keys %{$genomeFilter});  
     my $featureCount = $genomeCount * 4000;  
     # Start the loads.  
     my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount);  
     my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000);  
     my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000);  
     my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000);  
     my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000);  
     Trace("Beginning coupling data load.") if T(2);  
     # Loop through the genomes found.  
     for my $genome (sort keys %{$genomeFilter}) {  
         Trace("Generating coupling data for $genome.") if T(3);  
         $loadCoupling->Add("genomeIn");  
         # Create a hash table for holding coupled pairs. We use this to prevent  
         # duplicates. For example, if A is coupled to B, we don't want to also  
         # assert that B is coupled to A, because we already know it. Fortunately,  
         # all couplings occur within a genome, so we can keep the hash table  
         # size reasonably small.  
         my %dupHash = ();  
         # Get all of the genome's PEGs.  
         my @pegs = $fig->pegs_of($genome);  
         # Loop through the PEGs.  
         for my $peg1 (@pegs) {  
             $loadCoupling->Add("pegIn");  
             Trace("Processing PEG $peg1 for $genome.") if T(4);  
             # Get a list of the coupled PEGs.  
             my @couplings = $fig->coupled_to($peg1);  
             # For each coupled PEG, we need to verify that a coupling already  
             # exists. If not, we have to create one.  
             for my $coupleData (@couplings) {  
                 my ($peg2, $score) = @{$coupleData};  
                 # Compute the coupling ID.  
                 my $coupleID = Sprout::CouplingID($peg1, $peg2);  
                 if (! exists $dupHash{$coupleID}) {  
                     $loadCoupling->Add("couplingIn");  
                     # Here we have a new coupling to store in the load files.  
                     Trace("Storing coupling ($coupleID) with score $score.") if T(4);  
                     # Ensure we don't do this again.  
                     $dupHash{$coupleID} = $score;  
                     # Write the coupling record.  
                     $loadCoupling->Put($coupleID, $score);  
                     # Connect it to the coupled PEGs.  
                     $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);  
                     $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);  
                     # Get the evidence for this coupling.  
                     my @evidence = $fig->coupling_evidence($peg1, $peg2);  
                     # Organize the evidence into a hash table.  
                     my %evidenceMap = ();  
                     # Process each evidence item.  
                     for my $evidenceData (@evidence) {  
                         $loadPCH->Add("evidenceIn");  
                         my ($peg3, $peg4, $usage) = @{$evidenceData};  
                         # Only proceed if the evidence is from a Sprout  
                         # genome.  
                         if ($genomeFilter->{$fig->genome_of($peg3)}) {  
                             $loadUsesAsEvidence->Add("evidenceChosen");  
                             my $evidenceKey = "$coupleID $peg3 $peg4";  
                             # We store this evidence in the hash if the usage  
                             # is nonzero or no prior evidence has been found. This  
                             # insures that if there is duplicate evidence, we  
                             # at least keep the meaningful ones. Only evidence is  
                             # the hash makes it to the output.  
                             if ($usage || ! exists $evidenceMap{$evidenceKey}) {  
                                 $evidenceMap{$evidenceKey} = $evidenceData;  
                             }  
                         }  
                     }  
                     for my $evidenceID (keys %evidenceMap) {  
                         # Create the evidence record.  
                         my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};  
                         $loadPCH->Put($evidenceID, $usage);  
                         # Connect it to the coupling.  
                         $loadIsEvidencedBy->Put($coupleID, $evidenceID);  
                         # Connect it to the features.  
                         $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);  
                         $loadUsesAsEvidence->Put($evidenceID, $peg4, 1);  
                     }  
                 }  
             }  
         }  
     }  
     # All done. Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
334  =head3 LoadFeatureData  =head3 LoadFeatureData
335    
336  C<< my $stats = $spl->LoadFeatureData(); >>      my $stats = $spl->LoadFeatureData();
337    
338  Load the feature data from FIG into Sprout.  Load the feature data from FIG into Sprout.
339    
# Line 400  Line 343 
343    
344      Feature      Feature
345      FeatureAlias      FeatureAlias
346        IsAliasOf
347      FeatureLink      FeatureLink
348      FeatureTranslation      FeatureTranslation
349      FeatureUpstream      FeatureUpstream
350      IsLocatedIn      IsLocatedIn
351        HasFeature
352        HasRoleInSubsystem
353        FeatureEssential
354        FeatureVirulent
355        FeatureIEDB
356        CDD
357        IsPresentOnProteinOf
358        CellLocation
359        IsPossiblePlaceFor
360        ExternalDatabase
361        IsAlsoFoundIn
362        Keyword
363    
364  =over 4  =over 4
365    
# Line 418  Line 374 
374  sub LoadFeatureData {  sub LoadFeatureData {
375      # Get this object instance.      # Get this object instance.
376      my ($self) = @_;      my ($self) = @_;
377      # Get the FIG object.      # Get the FIG and Sprout objects.
378      my $fig = $self->{fig};      my $fig = $self->{fig};
379        my $sprout = $self->{sprout};
380      # Get the table of genome IDs.      # Get the table of genome IDs.
381      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
382      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
383      my $loadFeature = $self->_TableLoader('Feature', $featureCount);      my $loadFeature = $self->_TableLoader('Feature');
384      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn');
385      my $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
386      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);      my $loadIsAliasOf = $self->_TableLoader('IsAliasOf');
387      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
388      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
389        my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
390        my $loadHasFeature = $self->_TableLoader('HasFeature');
391        my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem');
392        my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
393        my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
394        my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
395        my $loadCDD = $self->_TableLoader('CDD');
396        my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');
397        my $loadCellLocation = $self->_TableLoader('CellLocation');
398        my $loadIsPossiblePlaceFor = $self->_TableLoader('IsPossiblePlaceFor');
399        my $loadIsAlsoFoundIn = $self->_TableLoader('IsAlsoFoundIn');
400        my $loadExternalDatabase = $self->_TableLoader('ExternalDatabase');
401        my $loadKeyword = $self->_TableLoader('Keyword');
402        # Get the subsystem hash.
403        my $subHash = $self->{subsystems};
404        # Get the property keys.
405        my $propKeys = $self->{propKeys};
406        # Create a hashes to hold CDD, Cell Location (PSORT), External Database, and alias values.
407        my %CDD = ();
408        my %alias = ();
409        my %cellLocation = ();
410        my %xdb = ();
411        # Create the bio-words object.
412        my $biowords = BioWords->new(exceptions => "$FIG_Config::sproutData/Exceptions.txt",
413                                     stops => "$FIG_Config::sproutData/StopWords.txt",
414                                     cache => 0);
415        # One of the things we have to do here is build the keyword table, and the keyword
416        # table needs to contain the originating text and feature count for each stem. Unfortunately,
417        # the number of distinct keywords is so large it causes PERL to hang if we try to
418        # keep them in memory. As a result, we need to track them using disk files.
419        # Our approach will be to use two sequential files. One will contain stems and phonexes.
420        # Each time a stem occurs in a feature, a record will be written to that file. The stem
421        # file can then be sorted and collated to determine the number of features for each
422        # stem. A separate file will contain keywords and stems. This last file
423        # will be subjected to a sort unique on stem/keyword. The file is then merged
424        # with the stem file to create the keyword table relation (keyword, stem, phonex, count).
425        my $stemFileName = "$FIG_Config::temp/stems$$.tbl";
426        my $keyFileName = "$FIG_Config::temp/keys$$.tbl";
427        my $stemh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -k1,1 >$stemFileName");
428        my $keyh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -u -k1,1 -k2,2 >$keyFileName");
429      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
430      # locations.      # locations.
431      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
432      Trace("Beginning feature data load.") if T(2);      if ($self->{options}->{loadOnly}) {
433            Trace("Loading from existing files.") if T(2);
434        } else {
435            Trace("Generating feature data.") if T(2);
436      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
437      for my $genomeID (sort keys %{$genomeHash}) {          my @allGenomes = sort keys %{$genomeHash};
438            Trace(scalar(@allGenomes) . " genomes found in list.") if T(3);
439            for my $genomeID (@allGenomes) {
440          Trace("Loading features for genome $genomeID.") if T(3);          Trace("Loading features for genome $genomeID.") if T(3);
441          $loadFeature->Add("genomeIn");          $loadFeature->Add("genomeIn");
442          # Get the feature list for this genome.          # Get the feature list for this genome.
443          my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
444                # Sort and count the list.
445                my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
446                my $count = scalar @featureTuples;
447                my @fids = map { $_->[0] } @featureTuples;
448                Trace("$count features found for genome $genomeID.") if T(3);
449                # Get the attributes for this genome and put them in a hash by feature ID.
450                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys);
451                Trace("Looping through features for $genomeID.") if T(3);
452                # Set up for our duplicate-feature check.
453                my $oldFeatureID = "";
454          # Loop through the features.          # Loop through the features.
455          for my $featureData (@{$features}) {              for my $featureTuple (@featureTuples) {
             $loadFeature->Add("featureIn");  
456              # Split the tuple.              # Split the tuple.
457              my ($featureID, $locations, $aliases, $type) = @{$featureData};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
458              # Create the feature record.                  # Check for duplicates.
459              $loadFeature->Put($featureID, 1, $type);                  if ($featureID eq $oldFeatureID) {
460                        Trace("Duplicate feature $featureID found.") if T(1);
461                    } else {
462                        $oldFeatureID = $featureID;
463                        # Count this feature.
464                        $loadFeature->Add("featureIn");
465                        # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
466                        # Sprout database requires a single character.
467                        if (! defined($quality) || $quality eq "") {
468                            $quality = " ";
469                        }
470                        # Begin building the keywords. We start with the genome ID, the
471                        # feature ID, the taxonomy, and the organism name.
472                        my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
473                                        $fig->taxonomy_of($genomeID));
474              # Create the aliases.              # Create the aliases.
475              for my $alias (split /\s*,\s*/, $aliases) {                      for my $alias ($fig->feature_aliases($featureID)) {
476                  $loadFeatureAlias->Put($featureID, $alias);                          #Connect this alias to this feature.
477                            $loadIsAliasOf->Put($alias, $featureID);
478                            push @keywords, $alias;
479                            # If this is a locus tag, also add its natural form as a keyword.
480                            my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
481                            if ($naturalName) {
482                                push @keywords, $naturalName;
483                            }
484                            # If this is the first time for the specified alias, create its
485                            # alias record.
486                            if (! exists $alias{$alias}) {
487                                $loadFeatureAlias->Put($alias);
488                                $alias{$alias} = 1;
489                            }
490                        }
491                        # Add the corresponding IDs. We ask for 2-tuples of the form (id, database).
492                        my @corresponders = $fig->get_corresponding_ids($featureID, 1);
493                        for my $tuple (@corresponders) {
494                            my ($id, $xdb) = @{$tuple};
495                            # Ignore SEED: that's us.
496                            if ($xdb ne 'SEED') {
497                                # Connect this ID to the feature.
498                                $loadIsAlsoFoundIn->Put($featureID, $xdb, $id);
499                                # Add it as a keyword.
500                                push @keywords, $id;
501                                # If this is a new database, create a record for it.
502                                if (! exists $xdb{$xdb}) {
503                                    $xdb{$xdb} = 1;
504                                    $loadExternalDatabase->Put($xdb);
505              }              }
506                            }
507                        }
508                        Trace("Assignment for $featureID is: $assignment") if T(4);
509                        # Break the assignment into words and shove it onto the
510                        # keyword list.
511                        push @keywords, split(/\s+/, $assignment);
512                        # Link this feature to the parent genome.
513                        $loadHasFeature->Put($genomeID, $featureID, $type);
514              # Get the links.              # Get the links.
515              my @links = $fig->fid_links($featureID);              my @links = $fig->fid_links($featureID);
516              for my $link (@links) {              for my $link (@links) {
# Line 470  Line 529 
529                      $loadFeatureUpstream->Put($featureID, $upstream);                      $loadFeatureUpstream->Put($featureID, $upstream);
530                  }                  }
531              }              }
532                        # Now we need to find the subsystems this feature participates in.
533                        # We also add the subsystems to the keyword list. Before we do that,
534                        # we must convert underscores to spaces.
535                        my @subsystems = $fig->peg_to_subsystems($featureID);
536                        for my $subsystem (@subsystems) {
537                            # Only proceed if we like this subsystem.
538                            if (exists $subHash->{$subsystem}) {
539                                # Store the has-role link.
540                                $loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type);
541                                # Save the subsystem's keyword data.
542                                my $subKeywords = $subHash->{$subsystem};
543                                push @keywords, split /\s+/, $subKeywords;
544                                # Now we need to get this feature's role in the subsystem.
545                                my $subObject = $fig->get_subsystem($subsystem);
546                                my @roleColumns = $subObject->get_peg_roles($featureID);
547                                my @allRoles = $subObject->get_roles();
548                                for my $col (@roleColumns) {
549                                    my $role = $allRoles[$col];
550                                    push @keywords, split /\s+/, $role;
551                                    push @keywords, $subObject->get_role_abbr($col);
552                                }
553                            }
554                        }
555                        # There are three special attributes computed from property
556                        # data that we build next. If the special attribute is non-empty,
557                        # its name will be added to the keyword list. First, we get all
558                        # the attributes for this feature. They will come back as
559                        # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
560                        # [name, value, value with URL]. (We don't need the PEG, since
561                        # we already know it.)
562                        my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
563                                             @{$attributes->{$featureID}};
564                        # Now we process each of the special attributes.
565                        if (SpecialAttribute($featureID, \@attributes,
566                                             1, [0,2], '^(essential|potential_essential)$',
567                                             $loadFeatureEssential)) {
568                            push @keywords, 'essential';
569                            $loadFeature->Add('essential');
570                        }
571                        if (SpecialAttribute($featureID, \@attributes,
572                                             0, [2], '^virulen',
573                                             $loadFeatureVirulent)) {
574                            push @keywords, 'virulent';
575                            $loadFeature->Add('virulent');
576                        }
577                        if (SpecialAttribute($featureID, \@attributes,
578                                             0, [0,2], '^iedb_',
579                                             $loadFeatureIEDB)) {
580                            push @keywords, 'iedb';
581                            $loadFeature->Add('iedb');
582                        }
583                        # Now we have some other attributes we need to process. To get
584                        # through them, we convert the attribute list for this feature
585                        # into a two-layer hash: key => subkey => value.
586                        my %attributeHash = ();
587                        for my $attrRow (@{$attributes->{$featureID}}) {
588                            my (undef, $key, @values) = @{$attrRow};
589                            my ($realKey, $subKey);
590                            if ($key =~ /^([^:]+)::(.+)/) {
591                                ($realKey, $subKey) = ($1, $2);
592                            } else {
593                                ($realKey, $subKey) = ($key, "");
594                            }
595                            if (exists $attributeHash{$1}) {
596                                $attributeHash{$1}->{$2} = \@values;
597                            } else {
598                                $attributeHash{$1} = {$2 => \@values};
599                            }
600                        }
601                        # First we handle CDD. This is a bit complicated, because
602                        # there are multiple CDDs per protein.
603                        if (exists $attributeHash{CDD}) {
604                            # Get the hash of CDD IDs to scores for this feature. We
605                            # already know it exists because of the above IF.
606                            my $cddHash = $attributeHash{CDD};
607                            my @cddData = sort keys %{$cddHash};
608                            for my $cdd (@cddData) {
609                                # Extract the score for this CDD and decode it.
610                                my ($codeScore) = split(/\s*[,;]\s*/, $cddHash->{$cdd}->[0]);
611                                my $realScore = FIGRules::DecodeScore($codeScore);
612                                # We can't afford to crash because of a bad attribute
613                                # value, hence the IF below.
614                                if (! defined($realScore)) {
615                                    # Bad score, so count it.
616                                    $loadFeature->Add('badCDDscore');
617                                    Trace("CDD score \"$codeScore\" for feature $featureID invalid.") if T(3);
618                                } else {
619                                    # Create the connection.
620                                    $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
621                                    # If this CDD does not yet exist, create its record.
622                                    if (! exists $CDD{$cdd}) {
623                                        $CDD{$cdd} = 1;
624                                        $loadCDD->Put($cdd);
625                                    }
626                                }
627                            }
628                        }
629                        # Next we do PSORT cell locations. here the confidence value
630                        # could have the value "unknown", which we translate to -1.
631                        if (exists $attributeHash{PSORT}) {
632                            # This will be a hash of cell locations to confidence
633                            # factors.
634                            my $psortHash = $attributeHash{PSORT};
635                            for my $psort (keys %{$psortHash}) {
636                                # Get the confidence, and convert it to a number if necessary.
637                                my $confidence = $psortHash->{$psort};
638                                if ($confidence eq 'unknown') {
639                                    $confidence = -1;
640                                }
641                                $loadIsPossiblePlaceFor->Put($psort, $featureID, $confidence);
642                                # If this cell location does not yet exist, create its record.
643                                if (! exists $cellLocation{$psort}) {
644                                    $cellLocation{$psort} = 1;
645                                    $loadCellLocation->Put($psort);
646                                }
647                                # If this is a significant location, add it as a keyword.
648                                if ($confidence > 2.5) {
649                                    push @keywords, $psort;
650                                }
651                            }
652                        }
653                        # Phobius data is next. This consists of the signal peptide location and
654                        # the transmembrane locations.
655                        my $signalList = "";
656                        my $transList = "";
657                        if (exists $attributeHash{Phobius}) {
658                            # This will be a hash of two keys (transmembrane and signal) to
659                            # location strings. If there's no value, we stuff in an empty string.
660                            $signalList = GetCommaList($attributeHash{Phobius}->{signal});
661                            $transList = GetCommaList($attributeHash{Phobius}->{transmembrane});
662                        }
663                        # Here are some more numbers: isoelectric point, molecular weight, and
664                        # the similar-to-human flag.
665                        my $isoelectric = 0;
666                        if (exists $attributeHash{isoelectric_point}) {
667                            $isoelectric = $attributeHash{isoelectric_point}->{""};
668                        }
669                        my $similarToHuman = 0;
670                        if (exists $attributeHash{similar_to_human} && $attributeHash{similar_to_human}->{""} eq 'yes') {
671                            $similarToHuman = 1;
672                        }
673                        my $molecularWeight = 0;
674                        if (exists $attributeHash{molecular_weight}) {
675                            $molecularWeight = $attributeHash{molecular_weight}->{""};
676                        }
677                        # Create the keyword string.
678                        my $keywordString = join(" ", @keywords);
679                        Trace("Real keyword string for $featureID: $keywordString.") if T(4);
680                        # Get rid of annoying punctuation.
681                        $keywordString =~ s/[();@#\/]/ /g;
682                        # Get the list of keywords in the keyword string.
683                        my @realKeywords = grep { $biowords->IsWord($_) } $biowords->Split($keywordString);
684                        # We need to do two things here: create the keyword string for the feature table
685                        # and write records to the keyword and stem files. The stuff we write to
686                        # the files will be taken from the following two hashes. The stuff used
687                        # to create the keyword string will be taken from the list.
688                        my (%keys, %stems, @realStems);
689                        for my $keyword (@realKeywords) {
690                            # Compute the stem and phonex for this keyword.
691                            my ($stem, $phonex) = $biowords->StemLookup($keyword);
692                            # Only proceed if a stem comes back. If no stem came back, it's a
693                            # stop word and we throw it away.
694                            if ($stem) {
695                                $keys{$keyword} = $stem;
696                                $stems{$stem} = $phonex;
697                                push @realStems, $stem;
698                            }
699                        }
700                        # Now create the keyword string.
701                        my $cleanWords = join(" ", @realStems);
702                        Trace("Keyword string for $featureID: $cleanWords") if T(4);
703                        # Write the stem and keyword records.
704                        for my $stem (keys %stems) {
705                            Tracer::PutLine($stemh, [$stem, $stems{$stem}]);
706                        }
707                        for my $key (keys %keys) {
708                            # The stem goes first in this file, because we want to sort
709                            # by stem and then keyword.
710                            Tracer::PutLine($keyh, [$keys{$key}, $key]);
711                        }
712                        # Now we need to process the feature's locations. First, we split them up.
713                        my @locationList = split /\s*,\s*/, $locations;
714                        # Next, we convert them to Sprout location objects.
715                        my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
716                        # Assemble them into a sprout location string for later.
717                        my $locationString = join(", ", map { $_->String } @locObjectList);
718                        # We'll store the sequence length in here.
719                        my $sequenceLength = 0;
720              # This part is the roughest. We need to relate the features to contig              # This part is the roughest. We need to relate the features to contig
721              # locations, and the locations must be split so that none of them exceed              # locations, and the locations must be split so that none of them exceed
722              # the maximum segment size. This simplifies the genes_in_region processing              # the maximum segment size. This simplifies the genes_in_region processing
723              # for Sprout.                      # for Sprout. To start, we create the location position indicator.
724              my @locationList = split /\s*,\s*/, $locations;                      my $i = 1;
725              # Loop through the locations.              # Loop through the locations.
726              for my $location (@locationList) {                      for my $locObject (@locObjectList) {
727                  # Parse the location.                          # Record the length.
728                  my $locObject = BasicLocation->new($location);                          $sequenceLength += $locObject->Length;
729                  # Split it into a list of chunks.                          # Split this location into a list of chunks.
730                  my @locOList = ();                  my @locOList = ();
731                  while (my $peeling = $locObject->Peel($chunkSize)) {                  while (my $peeling = $locObject->Peel($chunkSize)) {
732                      $loadIsLocatedIn->Add("peeling");                      $loadIsLocatedIn->Add("peeling");
# Line 488  Line 735 
735                  push @locOList, $locObject;                  push @locOList, $locObject;
736                  # Loop through the chunks, creating IsLocatedIn records. The variable                  # Loop through the chunks, creating IsLocatedIn records. The variable
737                  # "$i" will be used to keep the location index.                  # "$i" will be used to keep the location index.
                 my $i = 1;  
738                  for my $locChunk (@locOList) {                  for my $locChunk (@locOList) {
739                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,
740                                            $locChunk->Dir, $locChunk->Length, $i);                                            $locChunk->Dir, $locChunk->Length, $i);
741                      $i++;                      $i++;
742                  }                  }
743              }              }
744                        # Now we get some ancillary flags.
745                        my $locked = $fig->is_locked_fid($featureID);
746                        my $in_genbank = $fig->peg_in_gendb($featureID);
747                        # Create the feature record.
748                        $loadFeature->Put($featureID, 1, $user, $quality, $type, $in_genbank, $isoelectric, $locked, $molecularWeight,
749                                          $sequenceLength, $signalList, $similarToHuman, $assignment, $cleanWords, $locationString,
750                                          $transList);
751                    }
752                }
753                Trace("Genome $genomeID processed.") if T(3);
754            }
755        }
756        Trace("Sorting keywords.") if T(2);
757        # Now we need to load the keyword table from the key and stem files.
758        close $keyh;
759        close $stemh;
760        Trace("Loading keywords.") if T(2);
761        $keyh = Open(undef, "<$keyFileName");
762        $stemh = Open(undef, "<$stemFileName");
763        # We'll count the keywords in here, for tracing purposes.
764        my $count = 0;
765        # These variables track the current stem's data. When an incoming
766        # keyword's stem changes, these will be recomputed.
767        my ($currentStem, $currentPhonex, $currentCount);
768        # Prime the loop by reading the first stem in the stem file.
769        my ($nextStem, $nextPhonex) = Tracer::GetLine($stemh);
770        # Loop through the keyword file.
771        while (! eof $keyh) {
772            # Read this keyword.
773            my ($thisStem, $thisKey) = Tracer::GetLine($keyh);
774            # Check to see if it's the new stem yet.
775            if ($thisStem ne $currentStem) {
776                # Yes. It's a terrible error if it's not also the next stem.
777                if ($thisStem ne $nextStem) {
778                    Confess("Error in stem file. Expected \"$nextStem\", but found \"$thisStem\".");
779                } else {
780                    # Here we're okay.
781                    ($currentStem, $currentPhonex) = ($nextStem, $nextPhonex);
782                    # Count the number of features for this stem.
783                    $currentCount = 0;
784                    while ($nextStem eq $thisStem) {
785                        ($nextStem, $nextPhonex) = Tracer::GetLine($stemh);
786                        $currentCount++;
787          }          }
788      }      }
     # Finish the loads.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
 =head3 LoadBBHData  
   
 C<< my $stats = $spl->LoadBBHData(); >>  
   
 Load the bidirectional best hit data from FIG into Sprout.  
   
 Sprout does not store information on similarities. Instead, it has only the  
 bi-directional best hits. Even so, the BBH table is one of the largest in  
 the database.  
   
 The following relations are loaded by this method.  
   
     IsBidirectionalBestHitOf  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadBBHData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the table of genome IDs.  
     my $genomeHash = $self->{genomes};  
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf',  
                                                            $featureCount * $genomeCount);  
     Trace("Beginning BBH load.") if T(2);  
     # Now we loop through the genomes, generating the data for each one.  
     for my $genomeID (sort keys %{$genomeHash}) {  
         $loadIsBidirectionalBestHitOf->Add("genomeIn");  
         Trace("Processing features for genome $genomeID.") if T(3);  
         # Get the feature list for this genome.  
         my $features = $fig->all_features_detailed($genomeID);  
         # Loop through the features.  
         for my $featureData (@{$features}) {  
             # Split the tuple.  
             my ($featureID, $locations, $aliases, $type) = @{$featureData};  
             # Get the bi-directional best hits.  
             my @bbhList = $fig->bbhs($featureID);  
             for my $bbhEntry (@bbhList) {  
                 # Get the target feature ID and the score.  
                 my ($targetID, $score) = @{$bbhEntry};  
                 # Check the target feature's genome.  
                 my $targetGenomeID = $fig->genome_of($targetID);  
                 # Only proceed if it's one of our genomes.  
                 if ($genomeHash->{$targetGenomeID}) {  
                     $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,  
                                                        $score);  
                 }  
789              }              }
790            # Now $currentStem is the same as $thisStem, and the other $current-vars
791            # contain the stem's data (phonex and count).
792            $loadKeyword->Put($thisKey, $currentCount, $currentPhonex, $currentStem);
793            if (++$count % 1000 == 0 && T(3)) {
794                Trace("$count keywords loaded.");
795          }          }
796      }      }
797        Trace("$count keywords loaded into keyword table.") if T(2);
798      # Finish the loads.      # Finish the loads.
799      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
800      return $retVal;      return $retVal;
# Line 571  Line 802 
802    
803  =head3 LoadSubsystemData  =head3 LoadSubsystemData
804    
805  C<< my $stats = $spl->LoadSubsystemData(); >>      my $stats = $spl->LoadSubsystemData();
806    
807  Load the subsystem data from FIG into Sprout.  Load the subsystem data from FIG into Sprout.
808    
# Line 584  Line 815 
815  The following relations are loaded by this method.  The following relations are loaded by this method.
816    
817      Subsystem      Subsystem
818        SubsystemClass
819      Role      Role
820        RoleEC
821        IsIdentifiedByEC
822      SSCell      SSCell
823      ContainsFeature      ContainsFeature
824      IsGenomeOf      IsGenomeOf
# Line 592  Line 826 
826      OccursInSubsystem      OccursInSubsystem
827      ParticipatesIn      ParticipatesIn
828      HasSSCell      HasSSCell
829        ConsistsOfRoles
830        RoleSubset
831        HasRoleSubset
832        ConsistsOfGenomes
833        GenomeSubset
834        HasGenomeSubset
835        Diagram
836        RoleOccursIn
837        SubsystemHopeNotes
838    
839  =over 4  =over 4
840    
# Line 601  Line 844 
844    
845  =back  =back
846    
 B<TO DO>  
   
 Generate RoleName table?  
   
847  =cut  =cut
848  #: Return Type $%;  #: Return Type $%;
849  sub LoadSubsystemData {  sub LoadSubsystemData {
# Line 618  Line 857 
857      # Get the subsystem hash. This lists the subsystems we'll process.      # Get the subsystem hash. This lists the subsystems we'll process.
858      my $subsysHash = $self->{subsystems};      my $subsysHash = $self->{subsystems};
859      my @subsysIDs = sort keys %{$subsysHash};      my @subsysIDs = sort keys %{$subsysHash};
860      my $subsysCount = @subsysIDs;      # Get the map list.
861      my $genomeCount = (keys %{$genomeHash});      my @maps = $fig->all_maps;
     my $featureCount = $genomeCount * 4000;  
862      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
863      my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);      my $loadDiagram = $self->_TableLoader('Diagram');
864      my $loadRole = $self->_TableLoader('Role', $featureCount * 6);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn');
865      my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);      my $loadSubsystem = $self->_TableLoader('Subsystem');
866      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);      my $loadRole = $self->_TableLoader('Role');
867      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);      my $loadRoleEC = $self->_TableLoader('RoleEC');
868      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount);      my $loadIsIdentifiedByEC = $self->_TableLoader('IsIdentifiedByEC');
869      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);      my $loadCatalyzes = $self->_TableLoader('Catalyzes');
870      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);      my $loadSSCell = $self->_TableLoader('SSCell');
871      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature');
872      Trace("Beginning subsystem data load.") if T(2);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf');
873        my $loadIsRoleOf = $self->_TableLoader('IsRoleOf');
874        my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem');
875        my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn');
876        my $loadHasSSCell = $self->_TableLoader('HasSSCell');
877        my $loadRoleSubset = $self->_TableLoader('RoleSubset');
878        my $loadGenomeSubset = $self->_TableLoader('GenomeSubset');
879        my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles');
880        my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes');
881        my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');
882        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');
883        my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');
884        my $loadSubsystemHopeNotes = $self->_TableLoader('SubsystemHopeNotes');
885        if ($self->{options}->{loadOnly}) {
886            Trace("Loading from existing files.") if T(2);
887        } else {
888            Trace("Generating subsystem data.") if T(2);
889            # This hash will contain the roles for each EC. When we're done, this
890            # information will be used to generate the Catalyzes table.
891            my %ecToRoles = ();
892      # Loop through the subsystems. Our first task will be to create the      # Loop through the subsystems. Our first task will be to create the
893      # roles. We do this by looping through the subsystems and creating a      # roles. We do this by looping through the subsystems and creating a
894      # role hash. The hash tracks each role ID so that we don't create      # role hash. The hash tracks each role ID so that we don't create
895      # duplicates. As we move along, we'll connect the roles and subsystems.          # duplicates. As we move along, we'll connect the roles and subsystems
896            # and memorize up the reactions.
897            my ($genomeID, $roleID);
898      my %roleData = ();      my %roleData = ();
899      for my $subsysID (@subsysIDs) {      for my $subsysID (@subsysIDs) {
900                # Get the subsystem object.
901                my $sub = $fig->get_subsystem($subsysID);
902                # Only proceed if the subsystem has a spreadsheet.
903                if (defined($sub) && ! $sub->{empty_ss}) {
904          Trace("Creating subsystem $subsysID.") if T(3);          Trace("Creating subsystem $subsysID.") if T(3);
905          $loadSubsystem->Add("subsystemIn");          $loadSubsystem->Add("subsystemIn");
906          # Create the subsystem record.          # Create the subsystem record.
907          $loadSubsystem->Put($subsysID);                  my $curator = $sub->get_curator();
908          # Get the subsystem's roles.                  my $notes = $sub->get_notes();
909          my @roles = $fig->subsystem_to_roles($subsysID);                  my $version = $sub->get_version();
910          # Connect the roles to the subsystem. If a role is new, we create                  my $description = $sub->get_description();
911          # a role record for it.                  $loadSubsystem->Put($subsysID, $curator, $version, $description, $notes);
912          for my $roleID (@roles) {                  # Add the hope notes.
913                    my $hopeNotes = $sub->get_hope_curation_notes();
914                    if ($hopeNotes) {
915                        $loadSubsystemHopeNotes->Put($sub, $hopeNotes);
916                    }
917                    # Now for the classification string. This comes back as a list
918                    # reference and we convert it to a space-delimited string.
919                    my $classList = $fig->subsystem_classification($subsysID);
920                    my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
921                    $loadSubsystemClass->Put($subsysID, $classString);
922                    # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
923                    for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
924                        # Get the role's abbreviation.
925                        my $abbr = $sub->get_role_abbr($col);
926                        # Get its essentiality.
927                        my $aux = $fig->is_aux_role_in_subsystem($subsysID, $roleID);
928                        # Get its reaction note.
929                        my $hope_note = $sub->get_hope_reaction_notes($roleID) || "";
930                        # Connect to this role.
931              $loadOccursInSubsystem->Add("roleIn");              $loadOccursInSubsystem->Add("roleIn");
932              $loadOccursInSubsystem->Put($roleID, $subsysID);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $aux, $col, $hope_note);
933                        # If it's a new role, add it to the role table.
934              if (! exists $roleData{$roleID}) {              if (! exists $roleData{$roleID}) {
935                            # Get the role's abbreviation.
936                            # Add the role.
937                  $loadRole->Put($roleID);                  $loadRole->Put($roleID);
938                  $roleData{$roleID} = 1;                  $roleData{$roleID} = 1;
939                            # Check for an EC number.
940                            if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
941                                my $ec = $1;
942                                $loadIsIdentifiedByEC->Put($roleID, $ec);
943                                # Check to see if this is our first encounter with this EC.
944                                if (exists $ecToRoles{$ec}) {
945                                    # No, so just add this role to the EC list.
946                                    push @{$ecToRoles{$ec}}, $roleID;
947                                } else {
948                                    # Output this EC.
949                                    $loadRoleEC->Put($ec);
950                                    # Create its role list.
951                                    $ecToRoles{$ec} = [$roleID];
952                                }
953                            }
954              }              }
955          }          }
956          # Now all roles for this subsystem have been filled in. We create the                  # Now we create the spreadsheet for the subsystem by matching roles to
957          # spreadsheet by matches roles to genomes. To do this, we need to                  # genomes. Each genome is a row and each role is a column. We may need
958          # get the genomes on the sheet.                  # to actually create the roles as we find them.
959          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
960          my @genomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)};                  for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) {
961          for my $genomeID (@genomes) {                      # Only proceed if this is one of our genomes.
             # Only process this genome if it's one of ours.  
962              if (exists $genomeHash->{$genomeID}) {              if (exists $genomeHash->{$genomeID}) {
963                  # Connect the genome to the subsystem.                          # Count the PEGs and cells found for verification purposes.
964                  $loadParticipatesIn->Put($genomeID, $subsysID);                          my $pegCount = 0;
965                            my $cellCount = 0;
966                            # Create a list for the PEGs we find. This list will be used
967                            # to generate cluster numbers.
968                            my @pegsFound = ();
969                            # Create a hash that maps spreadsheet IDs to PEGs. We will
970                            # use this to generate the ContainsFeature data after we have
971                            # the cluster numbers.
972                            my %cellPegs = ();
973                            # Get the genome's variant code for this subsystem.
974                            my $variantCode = $sub->get_variant_code($row);
975                  # Loop through the subsystem's roles. We use an index because it is                  # Loop through the subsystem's roles. We use an index because it is
976                  # part of the spreadsheet cell ID.                  # part of the spreadsheet cell ID.
977                  for (my $i = 0; $i <= $#roles; $i++) {                          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
                     my $role = $roles[$i];  
978                      # Get the features in the spreadsheet cell for this genome and role.                      # Get the features in the spreadsheet cell for this genome and role.
979                      my @pegs = $fig->pegs_in_subsystem_cell($subsysID, $genomeID, $i);                              my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col);
980                      # Only proceed if features exist.                      # Only proceed if features exist.
981                      if (@pegs > 0) {                      if (@pegs > 0) {
982                          # Create the spreadsheet cell.                          # Create the spreadsheet cell.
983                          my $cellID = "$subsysID:$genomeID:$i";                                  $cellCount++;
984                                    my $cellID = "$subsysID:$genomeID:$col";
985                          $loadSSCell->Put($cellID);                          $loadSSCell->Put($cellID);
986                          $loadIsGenomeOf->Put($genomeID, $cellID);                          $loadIsGenomeOf->Put($genomeID, $cellID);
987                          $loadIsRoleOf->Put($role, $cellID);                                  $loadIsRoleOf->Put($roleID, $cellID);
988                          $loadHasSSCell->Put($subsysID, $cellID);                          $loadHasSSCell->Put($subsysID, $cellID);
989                          # Attach the features to it.                                  # Remember its features.
990                          for my $pegID (@pegs) {                                  push @pegsFound, @pegs;
991                              $loadContainsFeature->Put($cellID, $pegID);                                  $cellPegs{$cellID} = \@pegs;
992                                    $pegCount += @pegs;
993                                }
994                            }
995                            # If we found some cells for this genome, we need to compute clusters and
996                            # denote it participates in the subsystem.
997                            if ($pegCount > 0) {
998                                Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
999                                $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
1000                                # Create a hash mapping PEG IDs to cluster numbers.
1001                                # We default to -1 for all of them.
1002                                my %clusterOf = map { $_ => -1 } @pegsFound;
1003                                # Partition the PEGs found into clusters.
1004                                my @clusters = $fig->compute_clusters([keys %clusterOf], $sub);
1005                                for (my $i = 0; $i <= $#clusters; $i++) {
1006                                    my $subList = $clusters[$i];
1007                                    for my $peg (@{$subList}) {
1008                                        $clusterOf{$peg} = $i;
1009                                    }
1010                                }
1011                                # Create the ContainsFeature data.
1012                                for my $cellID (keys %cellPegs) {
1013                                    my $cellList = $cellPegs{$cellID};
1014                                    for my $cellPeg (@$cellList) {
1015                                        $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg});
1016                          }                          }
1017                      }                      }
1018                  }                  }
1019              }              }
1020          }          }
1021                    # Now we need to generate the subsets. The subset names must be concatenated to
1022                    # the subsystem name to make them unique keys. There are two types of subsets:
1023                    # genome subsets and role subsets. We do the role subsets first.
1024                    my @subsetNames = $sub->get_subset_names();
1025                    for my $subsetID (@subsetNames) {
1026                        # Create the subset record.
1027                        my $actualID = "$subsysID:$subsetID";
1028                        $loadRoleSubset->Put($actualID);
1029                        # Connect the subset to the subsystem.
1030                        $loadHasRoleSubset->Put($subsysID, $actualID);
1031                        # Connect the subset to its roles.
1032                        my @roles = $sub->get_subsetC_roles($subsetID);
1033                        for my $roleID (@roles) {
1034                            $loadConsistsOfRoles->Put($actualID, $roleID);
1035      }      }
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
1036  }  }
1037                    # Next the genome subsets.
1038  =head3 LoadDiagramData                  @subsetNames = $sub->get_subset_namesR();
1039                    for my $subsetID (@subsetNames) {
1040  C<< my $stats = $spl->LoadDiagramData(); >>                      # Create the subset record.
1041                        my $actualID = "$subsysID:$subsetID";
1042  Load the diagram data from FIG into Sprout.                      $loadGenomeSubset->Put($actualID);
1043                        # Connect the subset to the subsystem.
1044  Diagrams are used to organize functional roles. The diagram shows the                      $loadHasGenomeSubset->Put($subsysID, $actualID);
1045  connections between chemicals that interact with a subsystem.                      # Connect the subset to its genomes.
1046                        my @genomes = $sub->get_subsetR($subsetID);
1047  The following relations are loaded by this method.                      for my $genomeID (@genomes) {
1048                            $loadConsistsOfGenomes->Put($actualID, $genomeID);
1049      Diagram                      }
1050      RoleOccursIn                  }
1051                }
1052  =over 4          }
1053            # Now we loop through the diagrams. We need to create the diagram records
1054  =item RETURNS          # and link each diagram to its roles. Note that only roles which occur
1055            # in subsystems (and therefore appear in the %ecToRoles hash) are
1056  Returns a statistics object for the loads.          # included.
1057            for my $map (@maps) {
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadDiagramData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the map list.  
     my @maps = $fig->all_maps;  
     my $mapCount = @maps;  
     my $genomeCount = (keys %{$self->{genomes}});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);  
     my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);  
     Trace("Beginning diagram data load.") if T(2);  
     # Loop through the diagrams.  
     for my $map ($fig->all_maps) {  
1058          Trace("Loading diagram $map.") if T(3);          Trace("Loading diagram $map.") if T(3);
1059          # Get the diagram's descriptive name.          # Get the diagram's descriptive name.
1060          my $name = $fig->map_name($map);          my $name = $fig->map_name($map);
# Line 739  Line 1062 
1062          # Now we need to link all the map's roles to it.          # Now we need to link all the map's roles to it.
1063          # A hash is used to prevent duplicates.          # A hash is used to prevent duplicates.
1064          my %roleHash = ();          my %roleHash = ();
1065          for my $role ($fig->map_to_ecs($map)) {              for my $ec ($fig->map_to_ecs($map)) {
1066                    if (exists $ecToRoles{$ec}) {
1067                        for my $role (@{$ecToRoles{$ec}}) {
1068              if (! $roleHash{$role}) {              if (! $roleHash{$role}) {
1069                  $loadRoleOccursIn->Put($role, $map);                  $loadRoleOccursIn->Put($role, $map);
1070                  $roleHash{$role} = 1;                  $roleHash{$role} = 1;
1071              }              }
1072          }          }
1073      }      }
1074                }
1075            }
1076        }
1077      # Finish the load.      # Finish the load.
1078      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1079      return $retVal;      return $retVal;
# Line 753  Line 1081 
1081    
1082  =head3 LoadPropertyData  =head3 LoadPropertyData
1083    
1084  C<< my $stats = $spl->LoadPropertyData(); >>      my $stats = $spl->LoadPropertyData();
1085    
1086  Load the attribute data from FIG into Sprout.  Load the attribute data from FIG into Sprout.
1087    
# Line 787  Line 1115 
1115      my $fig = $self->{fig};      my $fig = $self->{fig};
1116      # Get the genome hash.      # Get the genome hash.
1117      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1118      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1119      my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500);      my $loadProperty = $self->_TableLoader('Property');
1120      my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500);      my $loadHasProperty = $self->_TableLoader('HasProperty');
1121      Trace("Beginning property data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1122            Trace("Loading from existing files.") if T(2);
1123        } else {
1124            Trace("Generating property data.") if T(2);
1125      # Create a hash for storing property IDs.      # Create a hash for storing property IDs.
1126      my %propertyKeys = ();      my %propertyKeys = ();
1127      my $nextID = 1;      my $nextID = 1;
1128            # Get the attributes we intend to store in the property table.
1129            my $propKeys = $self->{propKeys};
1130      # Loop through the genomes.      # Loop through the genomes.
1131      for my $genomeID (keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
1132          $loadProperty->Add("genomeIn");          $loadProperty->Add("genomeIn");
1133          # Get the genome's features. The feature ID is the first field in the              Trace("Generating properties for $genomeID.") if T(3);
1134          # tuples returned by "all_features_detailed". We use "all_features_detailed"              # Initialize a counter.
1135          # rather than "all_features" because we want all features regardless of type.              my $propertyCount = 0;
1136          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};              # Get the properties for this genome's features.
1137          # Loop through the features, creating HasProperty records.              my @attributes = $fig->get_attributes("fig|$genomeID%", $propKeys);
1138          for my $fid (@features) {              Trace("Property list built for $genomeID.") if T(3);
1139              $loadProperty->Add("featureIn");              # Loop through the results, creating HasProperty records.
1140              # Get all attributes for this feature. We do this one feature at a time              for my $attributeData (@attributes) {
1141              # to insure we do not get any genome attributes.                  # Pull apart the attribute tuple.
1142              my @attributeList = $fig->get_attributes($fid, '', '', '');                  my ($fid, $key, $value, $url) = @{$attributeData};
             # Loop through the attributes.  
             for my $tuple (@attributeList) {  
                 # Get this attribute value's data. Note that we throw away the FID,  
                 # since it will always be the same as the value if "$fid".  
                 my (undef, $key, $value, $url) = @{$tuple};  
1143                  # Concatenate the key and value and check the "propertyKeys" hash to                  # Concatenate the key and value and check the "propertyKeys" hash to
1144                  # see if we already have an ID for it. We use a tab for the separator                  # see if we already have an ID for it. We use a tab for the separator
1145                  # character.                  # character.
# Line 830  Line 1157 
1157                  # Create the HasProperty entry for this feature/property association.                  # Create the HasProperty entry for this feature/property association.
1158                  $loadHasProperty->Put($fid, $propertyID, $url);                  $loadHasProperty->Put($fid, $propertyID, $url);
1159              }              }
1160                # Update the statistics.
1161                Trace("$propertyCount attributes processed.") if T(3);
1162                $loadHasProperty->Add("propertiesIn", $propertyCount);
1163          }          }
1164      }      }
1165      # Finish the load.      # Finish the load.
# Line 839  Line 1169 
1169    
1170  =head3 LoadAnnotationData  =head3 LoadAnnotationData
1171    
1172  C<< my $stats = $spl->LoadAnnotationData(); >>      my $stats = $spl->LoadAnnotationData();
1173    
1174  Load the annotation data from FIG into Sprout.  Load the annotation data from FIG into Sprout.
1175    
# Line 871  Line 1201 
1201      my $fig = $self->{fig};      my $fig = $self->{fig};
1202      # Get the genome hash.      # Get the genome hash.
1203      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1204      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1205      my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000);      my $loadAnnotation = $self->_TableLoader('Annotation');
1206      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation');
1207      my $loadSproutUser = $self->_TableLoader('SproutUser', 100);      my $loadSproutUser = $self->_TableLoader('SproutUser');
1208      my $loadUserAccess = $self->_TableLoader('UserAccess', 1000);      my $loadUserAccess = $self->_TableLoader('UserAccess');
1209      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation');
1210      Trace("Beginning annotation data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1211            Trace("Loading from existing files.") if T(2);
1212        } else {
1213            Trace("Generating annotation data.") if T(2);
1214      # Create a hash of user names. We'll use this to prevent us from generating duplicate      # Create a hash of user names. We'll use this to prevent us from generating duplicate
1215      # user records.      # user records.
1216      my %users = ( FIG => 1, master => 1 );      my %users = ( FIG => 1, master => 1 );
# Line 892  Line 1224 
1224      # Loop through the genomes.      # Loop through the genomes.
1225      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
1226          Trace("Processing $genomeID.") if T(3);          Trace("Processing $genomeID.") if T(3);
         # Get the genome's PEGs.  
         my @pegs = $fig->pegs_of($genomeID);  
         for my $peg (@pegs) {  
             Trace("Processing $peg.") if T(4);  
1227              # Create a hash of timestamps. We use this to prevent duplicate time stamps              # Create a hash of timestamps. We use this to prevent duplicate time stamps
1228              # from showing up for a single PEG's annotations.              # from showing up for a single PEG's annotations.
1229              my %seenTimestamps = ();              my %seenTimestamps = ();
1230              # Check for a functional assignment.              # Get the genome's annotations.
1231              my $func = $fig->function_of($peg);              my @annotations = $fig->read_all_annotations($genomeID);
1232              if ($func) {              Trace("Processing annotations.") if T(2);
1233                  # If this is NOT a hypothetical assignment, we create an              for my $tuple (@annotations) {
1234                  # assignment annotation for it.                  # Get the annotation tuple.
1235                  if (! FIG::hypo($peg)) {                  my ($peg, $timestamp, $user, $text) = @{$tuple};
                     # Note that we double the slashes so that what goes into the database is  
                     # a new-line escape sequence rather than an actual new-line.  
                     $loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func");  
                     $loadIsTargetOfAnnotation->Put($peg, "$peg:$time");  
                     $loadMadeAnnotation->Put("FIG", "$peg:$time");  
                     # Denote we've seen this timestamp.  
                     $seenTimestamps{$time} = 1;  
                 }  
                 # Now loop through the real annotations.  
                 for my $tuple ($fig->feature_annotations($peg, "raw")) {  
                     my ($fid, $timestamp, $user, $text) = @{$tuple};  
1236                      # Here we fix up the annotation text. "\r" is removed,                      # Here we fix up the annotation text. "\r" is removed,
1237                      # and "\t" and "\n" are escaped. Note we use the "s"                  # and "\t" and "\n" are escaped. Note we use the "gs"
1238                      # modifier so that new-lines inside the text do not                      # modifier so that new-lines inside the text do not
1239                      # stop the substitution search.                      # stop the substitution search.
1240                      $text =~ s/\r//gs;                      $text =~ s/\r//gs;
# Line 927  Line 1244 
1244                      $text =~ s/Set master function/Set FIG function/s;                      $text =~ s/Set master function/Set FIG function/s;
1245                      # Insure the time stamp is valid.                      # Insure the time stamp is valid.
1246                      if ($timestamp =~ /^\d+$/) {                      if ($timestamp =~ /^\d+$/) {
1247                          # Here it's a number. We need to insure it's unique.                      # Here it's a number. We need to insure the one we use to form
1248                          while ($seenTimestamps{$timestamp}) {                      # the key is unique.
1249                              $timestamp++;                      my $keyStamp = $timestamp;
1250                        while ($seenTimestamps{"$peg:$keyStamp"}) {
1251                            $keyStamp++;
1252                          }                          }
1253                          $seenTimestamps{$timestamp} = 1;                      my $annotationID = "$peg:$keyStamp";
1254                          my $annotationID = "$peg:$timestamp";                      $seenTimestamps{$annotationID} = 1;
1255                          # Insure the user exists.                          # Insure the user exists.
1256                          if (! $users{$user}) {                          if (! $users{$user}) {
1257                              $loadSproutUser->Put($user, "SEED user");                              $loadSproutUser->Put($user, "SEED user");
# Line 940  Line 1259 
1259                              $users{$user} = 1;                              $users{$user} = 1;
1260                          }                          }
1261                          # Generate the annotation.                          # Generate the annotation.
1262                          $loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text");                      $loadAnnotation->Put($annotationID, $timestamp, $text);
1263                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);
1264                          $loadMadeAnnotation->Put($user, $annotationID);                          $loadMadeAnnotation->Put($user, $annotationID);
1265                      } else {                      } else {
# Line 950  Line 1269 
1269                  }                  }
1270              }              }
1271          }          }
     }  
1272      # Finish the load.      # Finish the load.
1273      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1274      return $retVal;      return $retVal;
# Line 958  Line 1276 
1276    
1277  =head3 LoadSourceData  =head3 LoadSourceData
1278    
1279  C<< my $stats = $spl->LoadSourceData(); >>      my $stats = $spl->LoadSourceData();
1280    
1281  Load the source data from FIG into Sprout.  Load the source data from FIG into Sprout.
1282    
# Line 991  Line 1309 
1309      my $fig = $self->{fig};      my $fig = $self->{fig};
1310      # Get the genome hash.      # Get the genome hash.
1311      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1312      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1313      my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4);      my $loadComesFrom = $self->_TableLoader('ComesFrom');
1314      my $loadSource = $self->_TableLoader('Source', $genomeCount * 4);      my $loadSource = $self->_TableLoader('Source');
1315      my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8);      my $loadSourceURL = $self->_TableLoader('SourceURL');
1316      Trace("Beginning source data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1317            Trace("Loading from existing files.") if T(2);
1318        } else {
1319            Trace("Generating annotation data.") if T(2);
1320      # Create hashes to collect the Source information.      # Create hashes to collect the Source information.
1321      my %sourceURL = ();      my %sourceURL = ();
1322      my %sourceDesc = ();      my %sourceDesc = ();
# Line 1010  Line 1330 
1330              chomp $line;              chomp $line;
1331              my($sourceID, $desc, $url) = split(/\t/,$line);              my($sourceID, $desc, $url) = split(/\t/,$line);
1332              $loadComesFrom->Put($genomeID, $sourceID);              $loadComesFrom->Put($genomeID, $sourceID);
1333              if ($url && ! exists $sourceURL{$genomeID}) {                  if ($url && ! exists $sourceURL{$sourceID}) {
1334                  $loadSourceURL->Put($sourceID, $url);                  $loadSourceURL->Put($sourceID, $url);
1335                  $sourceURL{$sourceID} = 1;                  $sourceURL{$sourceID} = 1;
1336              }              }
1337              if ($desc && ! exists $sourceDesc{$sourceID}) {                  if ($desc) {
1338                  $loadSource->Put($sourceID, $desc);                      $sourceDesc{$sourceID} = $desc;
1339                  $sourceDesc{$sourceID} = 1;                  } elsif (! exists $sourceDesc{$sourceID}) {
1340                        $sourceDesc{$sourceID} = $sourceID;
1341              }              }
1342          }          }
1343          close TMP;          close TMP;
1344      }      }
1345            # Write the source descriptions.
1346            for my $sourceID (keys %sourceDesc) {
1347                $loadSource->Put($sourceID, $sourceDesc{$sourceID});
1348            }
1349        }
1350      # Finish the load.      # Finish the load.
1351      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1352      return $retVal;      return $retVal;
# Line 1028  Line 1354 
1354    
1355  =head3 LoadExternalData  =head3 LoadExternalData
1356    
1357  C<< my $stats = $spl->LoadExternalData(); >>      my $stats = $spl->LoadExternalData();
1358    
1359  Load the external data from FIG into Sprout.  Load the external data from FIG into Sprout.
1360    
# Line 1060  Line 1386 
1386      my $fig = $self->{fig};      my $fig = $self->{fig};
1387      # Get the genome hash.      # Get the genome hash.
1388      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1389      # Convert the genome hash. We'll get the genus and species for each genome and make      # Convert the genome hash. We'll get the genus and species for each genome and make
1390      # it the key.      # it the key.
1391      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
1392      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1393      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000);      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
1394      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
1395      Trace("Beginning external data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1396            Trace("Loading from existing files.") if T(2);
1397        } else {
1398            Trace("Generating external data.") if T(2);
1399      # We loop through the files one at a time. First, the organism file.      # We loop through the files one at a time. First, the organism file.
1400      Open(\*ORGS, "<$FIG_Config::global/ext_org.table");          Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |");
1401      my $orgLine;      my $orgLine;
1402      while (defined($orgLine = <ORGS>)) {      while (defined($orgLine = <ORGS>)) {
1403          # Clean the input line.          # Clean the input line.
# Line 1081  Line 1409 
1409      close ORGS;      close ORGS;
1410      # Now the function file.      # Now the function file.
1411      my $funcLine;      my $funcLine;
1412      Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");          Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |");
1413      while (defined($funcLine = <FUNCS>)) {      while (defined($funcLine = <FUNCS>)) {
1414          # Clean the line ending.          # Clean the line ending.
1415          chomp $funcLine;          chomp $funcLine;
# Line 1097  Line 1425 
1425              $loadExternalAliasFunc->Put(@funcFields[0,1]);              $loadExternalAliasFunc->Put(@funcFields[0,1]);
1426          }          }
1427      }      }
1428        }
1429      # Finish the load.      # Finish the load.
1430      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1431      return $retVal;      return $retVal;
1432  }  }
1433    
 =head3 LoadGroupData  
1434    
1435  C<< my $stats = $spl->LoadGroupData(); >>  =head3 LoadReactionData
1436    
1437        my $stats = $spl->LoadReactionData();
1438    
1439    Load the reaction data from FIG into Sprout.
1440    
1441  Load the genome Groups into Sprout.  Reaction data connects reactions to the compounds that participate in them.
1442    
1443  The following relations are loaded by this method.  The following relations are loaded by this method.
1444    
1445      GenomeGroups      Reaction
1446        ReactionURL
1447        Compound
1448        CompoundName
1449        CompoundCAS
1450        IsIdentifiedByCAS
1451        HasCompoundName
1452        IsAComponentOf
1453        Scenario
1454        Catalyzes
1455        HasScenario
1456        IsInputFor
1457        IsOutputOf
1458        ExcludesReaction
1459        IncludesReaction
1460        IsOnDiagram
1461        IncludesReaction
1462    
1463  There is no direct support for genome groups in FIG, so we access the SEED  This method proceeds reaction by reaction rather than genome by genome.
1464  files directly.  
1465    =over 4
1466    
1467    =item RETURNS
1468    
1469    Returns a statistics object for the loads.
1470    
1471    =back
1472    
1473    =cut
1474    #: Return Type $%;
1475    sub LoadReactionData {
1476        # Get this object instance.
1477        my ($self) = @_;
1478        # Get the FIG object.
1479        my $fig = $self->{fig};
1480        # Create load objects for each of the tables we're loading.
1481        my $loadReaction = $self->_TableLoader('Reaction');
1482        my $loadReactionURL = $self->_TableLoader('ReactionURL');
1483        my $loadCompound = $self->_TableLoader('Compound');
1484        my $loadCompoundName = $self->_TableLoader('CompoundName');
1485        my $loadCompoundCAS = $self->_TableLoader('CompoundCAS');
1486        my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf');
1487        my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS');
1488        my $loadHasCompoundName = $self->_TableLoader('HasCompoundName');
1489        my $loadScenario = $self->_TableLoader('Scenario');
1490        my $loadHasScenario = $self->_TableLoader('HasScenario');
1491        my $loadIsInputFor = $self->_TableLoader('IsInputFor');
1492        my $loadIsOutputOf = $self->_TableLoader('IsOutputOf');
1493        my $loadIsOnDiagram = $self->_TableLoader('IsOnDiagram');
1494        my $loadIncludesReaction = $self->_TableLoader('IncludesReaction');
1495        my $loadExcludesReaction = $self->_TableLoader('ExcludesReaction');
1496        my $loadCatalyzes = $self->_TableLoader('Catalyzes');
1497        if ($self->{options}->{loadOnly}) {
1498            Trace("Loading from existing files.") if T(2);
1499        } else {
1500            Trace("Generating reaction data.") if T(2);
1501            # We need some hashes to prevent duplicates.
1502            my %compoundNames = ();
1503            my %compoundCASes = ();
1504            # First we create the compounds.
1505            my %compounds = map { $_ => 1 } $fig->all_compounds();
1506            for my $cid (keys %compounds) {
1507                # Check for names.
1508                my @names = $fig->names_of_compound($cid);
1509                # Each name will be given a priority number, starting with 1.
1510                my $prio = 1;
1511                for my $name (@names) {
1512                    if (! exists $compoundNames{$name}) {
1513                        $loadCompoundName->Put($name);
1514                        $compoundNames{$name} = 1;
1515                    }
1516                    $loadHasCompoundName->Put($cid, $name, $prio++);
1517                }
1518                # Create the main compound record. Note that the first name
1519                # becomes the label.
1520                my $label = (@names > 0 ? $names[0] : $cid);
1521                $loadCompound->Put($cid, $label);
1522                # Check for a CAS ID.
1523                my $cas = $fig->cas($cid);
1524                if ($cas) {
1525                    $loadIsIdentifiedByCAS->Put($cid, $cas);
1526                    if (! exists $compoundCASes{$cas}) {
1527                        $loadCompoundCAS->Put($cas);
1528                        $compoundCASes{$cas} = 1;
1529                    }
1530                }
1531            }
1532            # All the compounds are set up, so we need to loop through the reactions next. First,
1533            # we initialize the discriminator index. This is a single integer used to insure
1534            # duplicate elements in a reaction are not accidentally collapsed.
1535            my $discrim = 0;
1536            my %reactions = map { $_ => 1 } $fig->all_reactions();
1537            for my $reactionID (keys %reactions) {
1538                # Create the reaction record.
1539                $loadReaction->Put($reactionID, $fig->reversible($reactionID));
1540                # Compute the reaction's URL.
1541                my $url = HTML::reaction_link($reactionID);
1542                # Put it in the ReactionURL table.
1543                $loadReactionURL->Put($reactionID, $url);
1544                # Now we need all of the reaction's compounds. We get these in two phases,
1545                # substrates first and then products.
1546                for my $product (0, 1) {
1547                    # Get the compounds of the current type for the current reaction. FIG will
1548                    # give us 3-tuples: [ID, stoichiometry, main-flag]. At this time we do not
1549                    # have location data in SEED, so it defaults to the empty string.
1550                    my @compounds = $fig->reaction2comp($reactionID, $product);
1551                    for my $compData (@compounds) {
1552                        # Extract the compound data from the current tuple.
1553                        my ($cid, $stoich, $main) = @{$compData};
1554                        # Link the compound to the reaction.
1555                        $loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main,
1556                                                 $product, $stoich);
1557                    }
1558                }
1559            }
1560            # Now we run through the subsystems and roles, generating the scenarios
1561            # and connecting the reactions. We'll need some hashes to prevent
1562            # duplicates and a counter for compound group keys.
1563            my %roles = ();
1564            my %scenarios = ();
1565            my @subsystems = $fig->all_subsystems();
1566            for my $subName (@subsystems) {
1567                my $sub = $fig->get_subsystem($subName);
1568                Trace("Processing $subName reactions.") if T(3);
1569                # Get the subsystem's reactions.
1570                my %reactions = $sub->get_hope_reactions();
1571                # Loop through the roles, connecting them to the reactions.
1572                for my $role (keys %reactions) {
1573                    # Only process this role if it is new.
1574                    if (! $roles{$role}) {
1575                        $roles{$role} = 1;
1576                        my @reactions = @{$reactions{$role}};
1577                        for my $reaction (@reactions) {
1578                            $loadCatalyzes->Put($role, $reaction);
1579                        }
1580                    }
1581                }
1582                Trace("Processing $subName scenarios.") if T(3);
1583                # Get the subsystem's scenarios.
1584                my @scenarioNames = $sub->get_hope_scenario_names();
1585                # Loop through the scenarios, creating scenario data.
1586                for my $scenarioName (@scenarioNames) {
1587                    # Link this scenario to this subsystem.
1588                    $loadHasScenario->Put($subName, $scenarioName);
1589                    # If this scenario is new, we need to create it.
1590                    if (! $scenarios{$scenarioName}) {
1591                        Trace("Creating scenario $scenarioName.") if T(3);
1592                        $scenarios{$scenarioName} = 1;
1593                        # Create the scenario itself.
1594                        $loadScenario->Put($scenarioName);
1595                        # Attach the input compounds.
1596                        for my $input ($sub->get_hope_input_compounds($scenarioName)) {
1597                            $loadIsInputFor->Put($input, $scenarioName);
1598                        }
1599                        # Now we need to set up the output compounds. They come in two
1600                        # groups, which we mark 0 and 1.
1601                        my $outputGroup = 0;
1602                        # Set up the output compounds.
1603                        for my $outputGroup ($sub->get_hope_output_compounds($scenarioName)) {
1604                            # Attach the compounds.
1605                            for my $compound (@$outputGroup) {
1606                                $loadIsOutputOf->Put($scenarioName, $compound, $outputGroup);
1607                            }
1608                        }
1609                        # Create the reaction lists.
1610                        my @addReactions = $sub->get_hope_additional_reactions($scenarioName);
1611                        for my $reaction (@addReactions) {
1612                            $loadIncludesReaction->Put($scenarioName, $reaction);
1613                        }
1614                        my @notReactions = $sub->get_hope_ignore_reactions($scenarioName);
1615                        for my $reaction (@notReactions) {
1616                            $loadExcludesReaction->Put($scenarioName, $reaction);
1617                        }
1618                        # Link the maps.
1619                        my @maps = $sub->get_hope_map_ids($scenarioName);
1620                        for my $map (@maps) {
1621                            $loadIsOnDiagram->Put($scenarioName, "map$map");
1622                        }
1623                    }
1624                }
1625            }
1626        }
1627        # Finish the load.
1628        my $retVal = $self->_FinishAll();
1629        return $retVal;
1630    }
1631    
1632    =head3 LoadSynonymData
1633    
1634        my $stats = $spl->LoadSynonymData();
1635    
1636    Load the synonym groups into Sprout.
1637    
1638    The following relations are loaded by this method.
1639    
1640        SynonymGroup
1641        IsSynonymGroupFor
1642    
1643    The source information for these relations is taken from the C<maps_to_id> method
1644    of the B<FIG> object. Unfortunately, to make this work, we need to use direct
1645    SQL against the FIG database.
1646    
1647  =over 4  =over 4
1648    
# Line 1125  Line 1654 
1654    
1655  =cut  =cut
1656  #: Return Type $%;  #: Return Type $%;
1657  sub LoadGroupData {  sub LoadSynonymData {
1658      # Get this object instance.      # Get this object instance.
1659      my ($self) = @_;      my ($self) = @_;
1660      # Get the FIG object.      # Get the FIG object.
1661      my $fig = $self->{fig};      my $fig = $self->{fig};
1662      # Get the genome hash.      # Get the genome hash.
1663      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1664      # Create a load object for the table we're loading.      # Create a load object for the table we're loading.
1665      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4);      my $loadSynonymGroup = $self->_TableLoader('SynonymGroup');
1666      Trace("Beginning group data load.") if T(2);      my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor');
1667        if ($self->{options}->{loadOnly}) {
1668            Trace("Loading from existing files.") if T(2);
1669        } else {
1670            Trace("Generating synonym group data.") if T(2);
1671            # Get the database handle.
1672            my $dbh = $fig->db_handle();
1673            # Ask for the synonyms. Note that "maps_to" is a group name, and "syn_id" is a PEG ID or alias.
1674            my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
1675            my $result = $sth->execute();
1676            if (! defined($result)) {
1677                Confess("Database error in Synonym load: " . $sth->errstr());
1678            } else {
1679                Trace("Processing synonym results.") if T(2);
1680                # Remember the current synonym.
1681                my $current_syn = "";
1682                # Count the features.
1683                my $featureCount = 0;
1684                my $entryCount = 0;
1685                # Loop through the synonym/peg pairs.
1686                while (my @row = $sth->fetchrow()) {
1687                    # Get the synonym group ID and feature ID.
1688                    my ($syn_id, $peg) = @row;
1689                    # Count this row.
1690                    $entryCount++;
1691                    if ($entryCount % 1000 == 0) {
1692                        Trace("$entryCount rows processed.") if T(3);
1693                    }
1694                    # Insure it's for one of our genomes.
1695                    my $genomeID = FIG::genome_of($peg);
1696                    if (exists $genomeHash->{$genomeID}) {
1697                        # Verify the synonym.
1698                        if ($syn_id ne $current_syn) {
1699                            # It's new, so put it in the group table.
1700                            $loadSynonymGroup->Put($syn_id);
1701                            $current_syn = $syn_id;
1702                        }
1703                        # Connect the synonym to the peg.
1704                        $loadIsSynonymGroupFor->Put($syn_id, $peg);
1705                        # Count this feature.
1706                        $featureCount++;
1707                        if ($featureCount % 1000 == 0) {
1708                            Trace("$featureCount features processed.") if T(3);
1709                        }
1710                    }
1711                }
1712                Trace("$entryCount rows produced $featureCount features.") if T(2);
1713            }
1714        }
1715        # Finish the load.
1716        my $retVal = $self->_FinishAll();
1717        return $retVal;
1718    }
1719    
1720    =head3 LoadFamilyData
1721    
1722        my $stats = $spl->LoadFamilyData();
1723    
1724    Load the protein families into Sprout.
1725    
1726    The following relations are loaded by this method.
1727    
1728        Family
1729        IsFamilyForFeature
1730    
1731    The source information for these relations is taken from the C<families_for_protein>,
1732    C<family_function>, and C<sz_family> methods of the B<FIG> object.
1733    
1734    =over 4
1735    
1736    =item RETURNS
1737    
1738    Returns a statistics object for the loads.
1739    
1740    =back
1741    
1742    =cut
1743    #: Return Type $%;
1744    sub LoadFamilyData {
1745        # Get this object instance.
1746        my ($self) = @_;
1747        # Get the FIG object.
1748        my $fig = $self->{fig};
1749        # Get the genome hash.
1750        my $genomeHash = $self->{genomes};
1751        # Create load objects for the tables we're loading.
1752        my $loadFamily = $self->_TableLoader('Family');
1753        my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature');
1754        if ($self->{options}->{loadOnly}) {
1755            Trace("Loading from existing files.") if T(2);
1756        } else {
1757            Trace("Generating family data.") if T(2);
1758            # Create a hash for the family IDs.
1759            my %familyHash = ();
1760      # Loop through the genomes.      # Loop through the genomes.
1761      my $line;          for my $genomeID (sort keys %{$genomeHash}) {
1762      for my $genomeID (keys %{$genomeHash}) {              Trace("Processing features for $genomeID.") if T(2);
1763          Trace("Processing $genomeID.") if T(3);              # Loop through this genome's PEGs.
1764          # Open the NMPDR group file for this genome.              for my $fid ($fig->all_features($genomeID, "peg")) {
1765          if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&                  $loadIsFamilyForFeature->Add("features", 1);
1766              defined($line = <TMP>)) {                  # Get this feature's families.
1767              # Clean the line ending.                  my @families = $fig->families_for_protein($fid);
1768              chomp $line;                  # Loop through the families, connecting them to the feature.
1769              # Add the group to the table. Note that there can only be one group                  for my $family (@families) {
1770              # per genome.                      $loadIsFamilyForFeature->Put($family, $fid);
1771              $loadGenomeGroups->Put($genomeID, $line);                      # If this is a new family, create a record for it.
1772                        if (! exists $familyHash{$family}) {
1773                            $familyHash{$family} = 1;
1774                            $loadFamily->Add("families", 1);
1775                            my $size = $fig->sz_family($family);
1776                            my $func = $fig->family_function($family);
1777                            $loadFamily->Put($family, $size, $func);
1778                        }
1779                    }
1780                }
1781          }          }
         close TMP;  
1782      }      }
1783      # Finish the load.      # Finish the load.
1784      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1785      return $retVal;      return $retVal;
1786  }  }
1787    
1788    =head3 LoadDrugData
1789    
1790        my $stats = $spl->LoadDrugData();
1791    
1792    Load the drug target data into Sprout.
1793    
1794    The following relations are loaded by this method.
1795    
1796        PDB
1797        DocksWith
1798        IsProteinForFeature
1799        Ligand
1800    
1801    The source information for these relations is taken from attributes. The
1802    C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>.
1803    The C<zinc_name> attribute describes the ligands. The C<docking_results>
1804    attribute contains the information for the B<DocksWith> relationship. It is
1805    expected that additional attributes and tables will be added in the future.
1806    
1807    =over 4
1808    
1809    =item RETURNS
1810    
1811    Returns a statistics object for the loads.
1812    
1813    =back
1814    
1815    =cut
1816    #: Return Type $%;
1817    sub LoadDrugData {
1818        # Get this object instance.
1819        my ($self) = @_;
1820        # Get the FIG object.
1821        my $fig = $self->{fig};
1822        # Get the genome hash.
1823        my $genomeHash = $self->{genomes};
1824        # Create load objects for the tables we're loading.
1825        my $loadPDB = $self->_TableLoader('PDB');
1826        my $loadLigand = $self->_TableLoader('Ligand');
1827        my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature');
1828        my $loadDocksWith = $self->_TableLoader('DocksWith');
1829        if ($self->{options}->{loadOnly}) {
1830            Trace("Loading from existing files.") if T(2);
1831        } else {
1832            Trace("Generating drug target data.") if T(2);
1833            # First comes the "DocksWith" relationship. This will give us a list of PDBs.
1834            # We can also encounter PDBs when we process "IsProteinForFeature". To manage
1835            # this process, PDB information is collected in a hash table and then
1836            # unspooled after both relationships are created.
1837            my %pdbHash = ();
1838            Trace("Generating docking data.") if T(2);
1839            # Get all the docking data. This may cause problems if there are too many PDBs,
1840            # at which point we'll need another algorithm. The indicator that this is
1841            # happening will be a timeout error in the next statement.
1842            my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
1843                                                  ['docking_results', $FIG_Config::dockLimit]);
1844            Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
1845            for my $dockData (@dockData) {
1846                # Get the docking data components.
1847                my ($pdbID, $docking_key, @valueData) = @{$dockData};
1848                # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
1849                $pdbID = lc $pdbID;
1850                # Strip off the object type.
1851                $pdbID =~ s/pdb://;
1852                # Extract the ZINC ID from the docking key. Note that there are two possible
1853                # formats.
1854                my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/;
1855                if (! $zinc_id) {
1856                    Trace("Invalid docking result key $docking_key for $pdbID.") if T(0);
1857                    $loadDocksWith->Add("errors");
1858                } else {
1859                    # Get the pieces of the value and parse the energy.
1860                    # Note that we don't care about the rank, since
1861                    # we can sort on the energy level itself in our database.
1862                    my ($energy, $tool, $type) = @valueData;
1863                    my ($rank, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
1864                    # Ignore predicted results.
1865                    if ($type ne "Predicted") {
1866                        # Count this docking result.
1867                        if (! exists $pdbHash{$pdbID}) {
1868                            $pdbHash{$pdbID} = 1;
1869                        } else {
1870                            $pdbHash{$pdbID}++;
1871                        }
1872                        # Write the result to the output.
1873                        $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool,
1874                                            $total, $vanderwaals);
1875                    }
1876                }
1877            }
1878            Trace("Connecting features.") if T(2);
1879            # Loop through the genomes.
1880            for my $genome (sort keys %{$genomeHash}) {
1881                Trace("Generating PDBs for $genome.") if T(3);
1882                # Get all of the PDBs that BLAST against this genome's features.
1883                my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%');
1884                for my $pdbData (@attributeData) {
1885                    # The PDB ID is coded as a subkey.
1886                    if ($pdbData->[1] !~ /PDB::(.+)/i) {
1887                        Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0);
1888                        $loadPDB->Add("errors");
1889                    } else {
1890                        my $pdbID = $1;
1891                        # Insure the PDB is in the hash.
1892                        if (! exists $pdbHash{$pdbID}) {
1893                            $pdbHash{$pdbID} = 0;
1894                        }
1895                        # The score and locations are coded in the attribute value.
1896                        if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
1897                            Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0);
1898                            $loadIsProteinForFeature->Add("errors");
1899                        } else {
1900                            my ($score, $locData) = ($1,$2);
1901                            # The location data may not be present, so we have to start with some
1902                            # defaults and then check.
1903                            my ($start, $end) = (1, 0);
1904                            if ($locData) {
1905                                $locData =~ /(\d+)-(\d+)/;
1906                                $start = $1;
1907                                $end = $2;
1908                            }
1909                            # If we still don't have the end location, compute it from
1910                            # the feature length.
1911                            if (! $end) {
1912                                # Most features have one location, but we do a list iteration
1913                                # just in case.
1914                                my @locations = $fig->feature_location($pdbData->[0]);
1915                                $end = 0;
1916                                for my $loc (@locations) {
1917                                    my $locObject = BasicLocation->new($loc);
1918                                    $end += $locObject->Length;
1919                                }
1920                            }
1921                            # Decode the score.
1922                            my $realScore = FIGRules::DecodeScore($score);
1923                            # Connect the PDB to the feature.
1924                            $loadIsProteinForFeature->Put($pdbID, $pdbData->[0], $start, $realScore, $end);
1925                        }
1926                    }
1927                }
1928            }
1929            # We've got all our PDBs now, so we unspool them from the hash.
1930            Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2);
1931            my $count = 0;
1932            for my $pdbID (sort keys %pdbHash) {
1933                $loadPDB->Put($pdbID, $pdbHash{$pdbID});
1934                $count++;
1935                Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0);
1936            }
1937            # Finally we create the ligand table. This information can be found in the
1938            # zinc_name attribute.
1939            Trace("Loading ligands.") if T(2);
1940            # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
1941            my $last_zinc_id = "";
1942            my $zinc_id = "";
1943            my $done = 0;
1944            while (! $done) {
1945                # Get the next 10000 ligands. We insist that the object ID is greater than
1946                # the last ID we processed.
1947                Trace("Loading batch starting with ZINC:$zinc_id.") if T(3);
1948                my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
1949                                                           ["ZINC:$zinc_id", "zinc_name"]);
1950                Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
1951                if (! @attributeData) {
1952                    # Here there are no attributes left, so we quit the loop.
1953                    $done = 1;
1954                } else {
1955                    # Process the attribute data we've received.
1956                    for my $zinc_data (@attributeData) {
1957                        # The ZINC ID is found in the first return column, prefixed with the word ZINC.
1958                        if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
1959                            $zinc_id = $1;
1960                            # Check for a duplicate.
1961                            if ($zinc_id eq $last_zinc_id) {
1962                                $loadLigand->Add("duplicate");
1963                            } else {
1964                                # Here it's safe to output the ligand. The ligand name is the attribute value
1965                                # (third column in the row).
1966                                $loadLigand->Put($zinc_id, $zinc_data->[2]);
1967                                # Insure we don't try to add this ID again.
1968                                $last_zinc_id = $zinc_id;
1969                            }
1970                        } else {
1971                            Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
1972                            $loadLigand->Add("errors");
1973                        }
1974                    }
1975                }
1976            }
1977            Trace("Ligands loaded.") if T(2);
1978        }
1979        # Finish the load.
1980        my $retVal = $self->_FinishAll();
1981        return $retVal;
1982    }
1983    
1984    
1985  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1986    
1987    =head3 SpecialAttribute
1988    
1989        my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader);
1990    
1991    Look for special attributes of a given type. A special attribute is found by comparing one of
1992    the columns of the incoming attribute list to a search pattern. If a match is found, then
1993    a set of columns is put into an output table connected to the specified ID.
1994    
1995    For example, when processing features, the attribute list we look at has three columns: attribute
1996    name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
1997    begins with C<iedb_>. The call signature is therefore
1998    
1999        my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB);
2000    
2001    The pattern is matched against column 0, and if we have a match, then column 2's value is put
2002    to the output along with the specified feature ID.
2003    
2004    =over 4
2005    
2006    =item id
2007    
2008    ID of the object whose special attributes are being loaded. This forms the first column of the
2009    output.
2010    
2011    =item attributes
2012    
2013    Reference to a list of tuples.
2014    
2015    =item idxMatch
2016    
2017    Index in each tuple of the column to be matched against the pattern. If the match is
2018    successful, an output record will be generated.
2019    
2020    =item idxValues
2021    
2022    Reference to a list containing the indexes in each tuple of the columns to be put as
2023    the second column of the output.
2024    
2025    =item pattern
2026    
2027    Pattern to be matched against the specified column. The match will be case-insensitive.
2028    
2029    =item loader
2030    
2031    An object to which each output record will be put. Usually this is an B<ERDBLoad> object,
2032    but technically it could be anything with a C<Put> method.
2033    
2034    =item RETURN
2035    
2036    Returns a count of the matches found.
2037    
2038    =item
2039    
2040    =back
2041    
2042    =cut
2043    
2044    sub SpecialAttribute {
2045        # Get the parameters.
2046        my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_;
2047        # Declare the return variable.
2048        my $retVal = 0;
2049        # Loop through the attribute rows.
2050        for my $row (@{$attributes}) {
2051            # Check for a match.
2052            if ($row->[$idxMatch] =~ m/$pattern/i) {
2053                # We have a match, so output a row. This is a bit tricky, since we may
2054                # be putting out multiple columns of data from the input.
2055                my $value = join(" ", map { $row->[$_] } @{$idxValues});
2056                $loader->Put($id, $value);
2057                $retVal++;
2058            }
2059        }
2060        Trace("$retVal special attributes found for $id and loader " . $loader->RelName() . ".") if T(4) && $retVal;
2061        # Return the number of matches.
2062        return $retVal;
2063    }
2064    
2065  =head3 TableLoader  =head3 TableLoader
2066    
2067  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 1172  Line 2076 
2076    
2077  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
2078    
 =item rowCount (optional)  
   
 Estimated maximum number of rows in the table.  
   
2079  =item RETURN  =item RETURN
2080    
2081  Returns an ERDBLoad object for loading the specified table.  Returns an ERDBLoad object for loading the specified table.
# Line 1186  Line 2086 
2086    
2087  sub _TableLoader {  sub _TableLoader {
2088      # Get the parameters.      # Get the parameters.
2089      my ($self, $tableName, $rowCount) = @_;      my ($self, $tableName) = @_;
2090      # Create the load object.      # Create the load object.
2091      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount);      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly);
2092      # Cache it in the loader list.      # Cache it in the loader list.
2093      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
2094      # Return it to the caller.      # Return it to the caller.
# Line 1222  Line 2122 
2122      my $retVal = Stats->new();      my $retVal = Stats->new();
2123      # Get the loader list.      # Get the loader list.
2124      my $loadList = $self->{loaders};      my $loadList = $self->{loaders};
2125        # Create a hash to hold the statistics objects, keyed on relation name.
2126        my %loaderHash = ();
2127      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
2128      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads more restartable.
2129      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
2130            # Get the relation name.
2131            my $relName = $loader->RelName;
2132            # Check the ignore flag.
2133            if ($loader->Ignore) {
2134                Trace("Relation $relName not loaded.") if T(2);
2135            } else {
2136                # Here we really need to finish.
2137                Trace("Finishing $relName.") if T(2);
2138          my $stats = $loader->Finish();          my $stats = $loader->Finish();
2139                $loaderHash{$relName} = $stats;
2140            }
2141        }
2142        # Now we loop through again, actually loading the tables. We want to finish before
2143        # loading so that if something goes wrong at this point, all the load files are usable
2144        # and we don't have to redo all that work.
2145        for my $relName (sort keys %loaderHash) {
2146            # Get the statistics for this relation.
2147            my $stats = $loaderHash{$relName};
2148            # Check for a database load.
2149            if ($self->{options}->{dbLoad}) {
2150                # Here we want to use the load file just created to load the database.
2151                Trace("Loading relation $relName.") if T(2);
2152                my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]);
2153                # Accumulate the statistics from the DB load.
2154                $stats->Accumulate($newStats);
2155            }
2156          $retVal->Accumulate($stats);          $retVal->Accumulate($stats);
         my $relName = $loader->RelName;  
2157          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
2158      }      }
2159      # Return the load statistics.      # Return the load statistics.
2160      return $retVal;      return $retVal;
2161  }  }
2162    
2163    =head3 GetGenomeAttributes
2164    
2165        my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys);
2166    
2167    Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
2168    attributes for all the features of a genome in a single call, then organizes them into
2169    a hash.
2170    
2171    =over 4
2172    
2173    =item fig
2174    
2175    FIG-like object for accessing attributes.
2176    
2177    =item genomeID
2178    
2179    ID of the genome who's attributes are desired.
2180    
2181    =item fids
2182    
2183    Reference to a list of the feature IDs whose attributes are to be kept.
2184    
2185    =item propKeys
2186    
2187    A list of the keys to retrieve.
2188    
2189    =item RETURN
2190    
2191    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
2192    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
2193    the attribute key, and one or more attribute values.
2194    
2195    =back
2196    
2197    =cut
2198    
2199    sub GetGenomeAttributes {
2200        # Get the parameters.
2201        my ($fig, $genomeID, $fids, $propKeys) = @_;
2202        # Declare the return variable.
2203        my $retVal = {};
2204        # Initialize the hash. This not only enables us to easily determine which FIDs to
2205        # keep, it insures that the caller sees a list reference for every known fid,
2206        # simplifying the logic.
2207        for my $fid (@{$fids}) {
2208            $retVal->{$fid} = [];
2209        }
2210        # Get the attributes. If ev_code_cron is running, we may get a timeout error, so
2211        # an eval is used.
2212        my @aList = ();
2213        eval {
2214            @aList = $fig->get_attributes("fig|$genomeID%", $propKeys);
2215            Trace(scalar(@aList) . " attributes returned for genome $genomeID.") if T(3);
2216        };
2217        # Check for a problem.
2218        if ($@) {
2219            Trace("Retrying attributes for $genomeID due to error: $@") if T(1);
2220            # Our fallback plan is to process the attributes in blocks of 100. This is much slower,
2221            # but allows us to continue processing.
2222            my $nFids = scalar @{$fids};
2223            for (my $i = 0; $i < $nFids; $i += 100) {
2224                # Determine the index of the last feature ID we'll be specifying on this pass.
2225                # Normally it's $i + 99, but if we're close to the end it may be less.
2226                my $end = ($i + 100 > $nFids ? $nFids - 1 : $i + 99);
2227                # Get a slice of the fid list.
2228                my @slice = @{$fids}[$i .. $end];
2229                # Get the relevant attributes.
2230                Trace("Retrieving attributes for fids $i to $end.") if T(3);
2231                my @aShort = $fig->get_attributes(\@slice, $propKeys);
2232                Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(3);
2233                push @aList, @aShort;
2234            }
2235        }
2236        # Now we should have all the interesting attributes in @aList. Populate the hash with
2237        # them.
2238        for my $aListEntry (@aList) {
2239            my $fid = $aListEntry->[0];
2240            if (exists $retVal->{$fid}) {
2241                push @{$retVal->{$fid}}, $aListEntry;
2242            }
2243        }
2244        # Return the result.
2245        return $retVal;
2246    }
2247    
2248    =head3 GetCommaList
2249    
2250        my $string = GetCommaList($value);
2251    
2252    Create a comma-separated list of the values in a list reference. If the
2253    list reference is a scalar, it will be returned unchanged. If it is
2254    undefined, an empty string will be returned. The idea is that we may be
2255    looking at a string, a list, or nothing, but whatever comes out will be a
2256    string.
2257    
2258    =over 4
2259    
2260    =item value
2261    
2262    Reference to a list of values to be assembled into the return string.
2263    
2264    =item RETURN
2265    
2266    Returns a scalar string containing the content of the input value.
2267    
2268    =back
2269    
2270    =cut
2271    
2272    sub GetCommaList {
2273        # Get the parameters.
2274        my ($value) = @_;
2275        # Declare the return variable.
2276        my $retVal = "";
2277        # Only proceed if we have an input value.
2278        if (defined $value) {
2279            # Analyze the input value.
2280            if (ref $value eq 'ARRAY') {
2281                # Here it's a list reference.
2282                $retVal = join(", ", @$value);
2283            } else {
2284                # Here it's not. Flatten it to a scalar.
2285                $retVal = "$value";
2286            }
2287        }
2288        # Return the result.
2289        return $retVal;
2290    }
2291    
2292    
2293  1;  1;

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.95

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3