[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.78, Wed Nov 15 12:15:30 2006 UTC revision 1.91, Thu Feb 14 19:15:18 2008 UTC
# Line 7  Line 7 
7      use PageBuilder;      use PageBuilder;
8      use ERDBLoad;      use ERDBLoad;
9      use FIG;      use FIG;
10        use FIGRules;
11      use Sprout;      use Sprout;
12      use Stats;      use Stats;
13      use BasicLocation;      use BasicLocation;
14      use HTML;      use HTML;
15        use AliasAnalysis;
16    
17  =head1 Sprout Load Methods  =head1 Sprout Load Methods
18    
# Line 50  Line 52 
52    
53  =head3 new  =head3 new
54    
55  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>      my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options);
56    
57  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
58  the name of the files containing the list of genomes and subsystems to use.  the name of the files containing the list of genomes and subsystems to use.
# Line 101  Line 103 
103              # Here we want all the complete genomes and an access code of 1.              # Here we want all the complete genomes and an access code of 1.
104              my @genomeList = $fig->genomes(1);              my @genomeList = $fig->genomes(1);
105              %genomes = map { $_ => 1 } @genomeList;              %genomes = map { $_ => 1 } @genomeList;
106                Trace(scalar(keys %genomes) . " genomes found.") if T(3);
107          } else {          } else {
108              my $type = ref $genomeFile;              my $type = ref $genomeFile;
109              Trace("Genome file parameter type is \"$type\".") if T(3);              Trace("Genome file parameter type is \"$type\".") if T(3);
# Line 167  Line 170 
170          for my $subsystem (keys %subsystems) {          for my $subsystem (keys %subsystems) {
171              my $name = $subsystem;              my $name = $subsystem;
172              $name =~ s/_/ /g;              $name =~ s/_/ /g;
             my $classes = $fig->subsystem_classification($subsystem);  
             $name .= " " . join(" ", @{$classes});  
173              $subsystems{$subsystem} = $name;              $subsystems{$subsystem} = $name;
174          }          }
175      }      }
176        # Get the list of NMPDR-oriented attribute keys.
177        my @propKeys = $fig->get_group_keys("NMPDR");
178      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
179      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
180      # Create the Sprout load object.      # Create the Sprout load object.
# Line 183  Line 186 
186                    loadDirectory => $directory,                    loadDirectory => $directory,
187                    erdb => $sprout,                    erdb => $sprout,
188                    loaders => [],                    loaders => [],
189                    options => $options                    options => $options,
190                      propKeys => \@propKeys,
191                   };                   };
192      # Bless and return it.      # Bless and return it.
193      bless $retVal, $class;      bless $retVal, $class;
# Line 192  Line 196 
196    
197  =head3 LoadOnly  =head3 LoadOnly
198    
199  C<< my $flag = $spl->LoadOnly; >>      my $flag = $spl->LoadOnly;
200    
201  Return TRUE if we are in load-only mode, else FALSE.  Return TRUE if we are in load-only mode, else FALSE.
202    
# Line 203  Line 207 
207      return $self->{options}->{loadOnly};      return $self->{options}->{loadOnly};
208  }  }
209    
 =head3 PrimaryOnly  
   
 C<< my $flag = $spl->PrimaryOnly; >>  
   
 Return TRUE if only the main entity is to be loaded, else FALSE.  
   
 =cut  
   
 sub PrimaryOnly {  
     my ($self) = @_;  
     return $self->{options}->{primaryOnly};  
 }  
210    
211  =head3 LoadGenomeData  =head3 LoadGenomeData
212    
213  C<< my $stats = $spl->LoadGenomeData(); >>      my $stats = $spl->LoadGenomeData();
214    
215  Load the Genome, Contig, and Sequence data from FIG into Sprout.  Load the Genome, Contig, and Sequence data from FIG into Sprout.
216    
# Line 255  Line 247 
247      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
248      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
249      my $loadGenome = $self->_TableLoader('Genome');      my $loadGenome = $self->_TableLoader('Genome');
250      my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly);      my $loadHasContig = $self->_TableLoader('HasContig');
251      my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly);      my $loadContig = $self->_TableLoader('Contig');
252      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf');
253      my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly);      my $loadSequence = $self->_TableLoader('Sequence');
254      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
255          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
256      } else {      } else {
# Line 274  Line 266 
266              my $extra = join " ", @extraData;              my $extra = join " ", @extraData;
267              # Get the full taxonomy.              # Get the full taxonomy.
268              my $taxonomy = $fig->taxonomy_of($genomeID);              my $taxonomy = $fig->taxonomy_of($genomeID);
269                # Get the version. If no version is specified, we default to the genome ID by itself.
270                my $version = $fig->genome_version($genomeID);
271                if (! defined($version)) {
272                    $version = $genomeID;
273                }
274                # Get the DNA size.
275                my $dnaSize = $fig->genome_szdna($genomeID);
276              # Open the NMPDR group file for this genome.              # Open the NMPDR group file for this genome.
277              my $group;              my $group;
278              if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&              if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
# Line 286  Line 285 
285              }              }
286              close TMP;              close TMP;
287              # Output the genome record.              # Output the genome record.
288              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),
289                               $group, $species, $extra, $taxonomy);                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);
290              # Now we loop through each of the genome's contigs.              # Now we loop through each of the genome's contigs.
291              my @contigs = $fig->all_contigs($genomeID);              my @contigs = $fig->all_contigs($genomeID);
292              for my $contigID (@contigs) {              for my $contigID (@contigs) {
# Line 325  Line 324 
324      return $retVal;      return $retVal;
325  }  }
326    
 =head3 LoadCouplingData  
   
 C<< my $stats = $spl->LoadCouplingData(); >>  
   
 Load the coupling and evidence data from FIG into Sprout.  
   
 The coupling data specifies which genome features are functionally coupled. The  
 evidence data explains why the coupling is functional.  
   
 The following relations are loaded by this method.  
   
     Coupling  
     IsEvidencedBy  
     PCH  
     ParticipatesInCoupling  
     UsesAsEvidence  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadCouplingData {
     # Overview: create table loaders for Coupling, IsEvidencedBy, PCH,
     # ParticipatesInCoupling and UsesAsEvidence. Unless the loadOnly option
     # is set, walk every genome in $self->{genomes}, every PEG of each genome,
     # and every coupling of each PEG, writing one Coupling record per distinct
     # coupling ID plus its participant and evidence records. The value
     # returned is whatever $self->_FinishAll() returns.
     # Get this object instance.
     my ($self) = @_;
     # Get the FIG object.
     my $fig = $self->{fig};
     # Get the genome hash.
     my $genomeFilter = $self->{genomes};
     # Set up an ID counter for the PCHs. It is declared outside the genome loop,
     # so the numbering runs continuously across all genomes in this call.
     my $pchID = 0;
     # Start the loads.
     # NOTE(review): the second argument to _TableLoader is the PrimaryOnly option
     # value; its exact effect is defined in _TableLoader, which is not visible
     # here -- presumably it suppresses secondary tables. Confirm.
     my $loadCoupling = $self->_TableLoader('Coupling');
     my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);
     my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly);
     my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly);
     my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly);
     if ($self->{options}->{loadOnly}) {
         # Load-only mode: no data is generated in this method; we fall straight
         # through to _FinishAll below.
         Trace("Loading from existing files.") if T(2);
     } else {
         Trace("Generating coupling data.") if T(2);
         # Loop through the genomes found.
         for my $genome (sort keys %{$genomeFilter}) {
             Trace("Generating coupling data for $genome.") if T(3);
             $loadCoupling->Add("genomeIn");
             # Create a hash table for holding coupled pairs. We use this to prevent
             # duplicates. For example, if A is coupled to B, we don't want to also
             # assert that B is coupled to A, because we already know it. Fortunately,
             # all couplings occur within a genome, so we can keep the hash table
             # size reasonably small.
             my %dupHash = ();
             # Get all of the genome's PEGs.
             my @pegs = $fig->pegs_of($genome);
             # Loop through the PEGs.
             for my $peg1 (@pegs) {
                 $loadCoupling->Add("pegIn");
                 Trace("Processing PEG $peg1 for $genome.") if T(4);
                 # Get a list of the coupled PEGs.
                 my @couplings = $fig->coupled_to($peg1);
                 # For each coupled PEG, we need to verify that a coupling already
                 # exists. If not, we have to create one.
                 for my $coupleData (@couplings) {
                     # Each coupling entry is unpacked as a [peg, score] pair.
                     my ($peg2, $score) = @{$coupleData};
                     # Compute the coupling ID.
                     # NOTE(review): the duplicate check relies on CouplingID giving
                     # the same ID for (A,B) and (B,A) -- confirm in the ERDB object.
                     my $coupleID = $self->{erdb}->CouplingID($peg1, $peg2);
                     if (! exists $dupHash{$coupleID}) {
                         $loadCoupling->Add("couplingIn");
                         # Here we have a new coupling to store in the load files.
                         Trace("Storing coupling ($coupleID) with score $score.") if T(4);
                         # Ensure we don't do this again.
                         $dupHash{$coupleID} = $score;
                         # Write the coupling record.
                         $loadCoupling->Put($coupleID, $score);
                         # Connect it to the coupled PEGs. The third field records
                         # the PEG's position (1 or 2) in the pair.
                         $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);
                         $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);
                         # Get the evidence for this coupling.
                         my @evidence = $fig->coupling_evidence($peg1, $peg2);
                         # Organize the evidence into a hash table.
                         my %evidenceMap = ();
                         # Process each evidence item.
                         for my $evidenceData (@evidence) {
                             # Every evidence item is counted here, including those
                             # rejected by the genome filter below.
                             $loadPCH->Add("evidenceIn");
                             my ($peg3, $peg4, $usage) = @{$evidenceData};
                             # Only proceed if the evidence is from a Sprout
                             # genome. Note that only $peg3's genome is tested;
                             # $peg4 is not checked.
                             if ($genomeFilter->{$fig->genome_of($peg3)}) {
                                 $loadUsesAsEvidence->Add("evidenceChosen");
                                 my $evidenceKey = "$coupleID $peg3 $peg4";
                                 # We store this evidence in the hash if the usage
                                 # is nonzero or no prior evidence has been found. This
                                 # ensures that if there is duplicate evidence, we
                                 # at least keep the meaningful ones. Only evidence in
                                 # the hash makes it to the output.
                                 if ($usage || ! exists $evidenceMap{$evidenceKey}) {
                                     $evidenceMap{$evidenceKey} = $evidenceData;
                                 }
                             }
                         }
                         # Emit one PCH record per surviving evidence entry. The keys
                         # are not sorted, so the order in which PCH IDs are assigned
                         # within a coupling follows Perl's hash ordering.
                         for my $evidenceID (keys %evidenceMap) {
                             # Get the ID for this evidence.
                             $pchID++;
                             # Create the evidence record.
                             my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};
                             $loadPCH->Put($pchID, $usage);
                             # Connect it to the coupling.
                             $loadIsEvidencedBy->Put($coupleID, $pchID);
                             # Connect it to the features.
                             $loadUsesAsEvidence->Put($pchID, $peg3, 1);
                             $loadUsesAsEvidence->Put($pchID, $peg4, 2);
                         }
                     }
                 }
             }
         }
     }
     # All done. Finish the load.
     my $retVal = $self->_FinishAll();
     return $retVal;
 }
   
327  =head3 LoadFeatureData  =head3 LoadFeatureData
328    
329  C<< my $stats = $spl->LoadFeatureData(); >>      my $stats = $spl->LoadFeatureData();
330    
331  Load the feature data from FIG into Sprout.  Load the feature data from FIG into Sprout.
332    
# Line 463  Line 336 
336    
337      Feature      Feature
338      FeatureAlias      FeatureAlias
339        IsAliasOf
340      FeatureLink      FeatureLink
341      FeatureTranslation      FeatureTranslation
342      FeatureUpstream      FeatureUpstream
# Line 472  Line 346 
346      FeatureEssential      FeatureEssential
347      FeatureVirulent      FeatureVirulent
348      FeatureIEDB      FeatureIEDB
349        CDD
350        IsPresentOnProteinOf
351    
352  =over 4  =over 4
353    
# Line 493  Line 369 
369      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
370      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
371      my $loadFeature = $self->_TableLoader('Feature');      my $loadFeature = $self->_TableLoader('Feature');
372      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn');
373      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
374        my $loadIsAliasOf = $self->_TableLoader('IsAliasOf');
375      my $loadFeatureLink = $self->_TableLoader('FeatureLink');      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
376      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
377      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
378      my $loadHasFeature = $self->_TableLoader('HasFeature', $self->PrimaryOnly);      my $loadHasFeature = $self->_TableLoader('HasFeature');
379      my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem', $self->PrimaryOnly);      my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem');
380      my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');      my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
381      my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');      my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
382      my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');      my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
383        my $loadCDD = $self->_TableLoader('CDD');
384        my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');
385      # Get the subsystem hash.      # Get the subsystem hash.
386      my $subHash = $self->{subsystems};      my $subHash = $self->{subsystems};
387        # Get the property keys.
388        my $propKeys = $self->{propKeys};
389        # Create hashes to hold CDD and alias values.
390        my %CDD = ();
391        my %alias = ();
392      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
393      # locations.      # locations.
394      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
# Line 513  Line 397 
397      } else {      } else {
398          Trace("Generating feature data.") if T(2);          Trace("Generating feature data.") if T(2);
399          # Now we loop through the genomes, generating the data for each one.          # Now we loop through the genomes, generating the data for each one.
400          for my $genomeID (sort keys %{$genomeHash}) {          my @allGenomes = sort keys %{$genomeHash};
401            Trace(scalar(@allGenomes) . " genomes found in list.") if T(3);
402            for my $genomeID (@allGenomes) {
403              Trace("Loading features for genome $genomeID.") if T(3);              Trace("Loading features for genome $genomeID.") if T(3);
404              $loadFeature->Add("genomeIn");              $loadFeature->Add("genomeIn");
405              # Get the feature list for this genome.              # Get the feature list for this genome.
406              my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
407              # Sort and count the list.              # Sort and count the list.
408              my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};              my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
409              my $count = scalar @featureTuples;              my $count = scalar @featureTuples;
410                my @fids = map { $_->[0] } @featureTuples;
411              Trace("$count features found for genome $genomeID.") if T(3);              Trace("$count features found for genome $genomeID.") if T(3);
412                # Get the attributes for this genome and put them in a hash by feature ID.
413                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys);
414                Trace("Looping through features for $genomeID.") if T(3);
415              # Set up for our duplicate-feature check.              # Set up for our duplicate-feature check.
416              my $oldFeatureID = "";              my $oldFeatureID = "";
417              # Loop through the features.              # Loop through the features.
418              for my $featureTuple (@featureTuples) {              for my $featureTuple (@featureTuples) {
419                  # Split the tuple.                  # Split the tuple.
420                  my ($featureID, $locations, undef, $type) = @{$featureTuple};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
421                  # Check for duplicates.                  # Check for duplicates.
422                  if ($featureID eq $oldFeatureID) {                  if ($featureID eq $oldFeatureID) {
423                      Trace("Duplicate feature $featureID found.") if T(1);                      Trace("Duplicate feature $featureID found.") if T(1);
# Line 535  Line 425 
425                      $oldFeatureID = $featureID;                      $oldFeatureID = $featureID;
426                      # Count this feature.                      # Count this feature.
427                      $loadFeature->Add("featureIn");                      $loadFeature->Add("featureIn");
428                        # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
429                        # Sprout database requires a single character.
430                        if (! defined($quality) || $quality eq "") {
431                            $quality = " ";
432                        }
433                      # Begin building the keywords. We start with the genome ID, the                      # Begin building the keywords. We start with the genome ID, the
434                      # feature ID, and the organism name.                      # feature ID, the taxonomy, and the organism name.
435                      my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID));                      my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
436                      # Get the functional assignment and aliases. This                                      $fig->taxonomy_of($genomeID));
                     # depends on the feature type.  
                     my $assignment;  
                     if ($type eq "peg") {  
                         $assignment = $fig->function_of($featureID);  
437                          # Create the aliases.                          # Create the aliases.
438                          for my $alias ($fig->feature_aliases($featureID)) {                          for my $alias ($fig->feature_aliases($featureID)) {
439                              $loadFeatureAlias->Put($featureID, $alias);                          #Connect this alias to this feature.
440                            $loadIsAliasOf->Put($alias, $featureID);
441                              push @keywords, $alias;                              push @keywords, $alias;
442                            # If this is a locus tag, also add its natural form as a keyword.
443                            my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
444                            if ($naturalName) {
445                                push @keywords, $naturalName;
446                            }
447                            # If this is the first time for the specified alias, create its
448                            # alias record.
449                            if (! exists $alias{$alias}) {
450                                $loadFeatureAlias->Put($alias);
451                                $alias{$alias} = 1;
452                          }                          }
                     } else {  
                         # For other types, the assignment is the first (and ONLY) alias.  
                         ($assignment) = $fig->feature_aliases($featureID);  
453                      }                      }
454                      Trace("Assignment for $featureID is: $assignment") if T(4);                      Trace("Assignment for $featureID is: $assignment") if T(4);
455                      # Break the assignment into words and shove it onto the                      # Break the assignment into words and shove it onto the
# Line 578  Line 477 
477                      }                      }
478                      # Now we need to find the subsystems this feature participates in.                      # Now we need to find the subsystems this feature participates in.
479                      # We also add the subsystems to the keyword list. Before we do that,                      # We also add the subsystems to the keyword list. Before we do that,
480                      # we must convert underscores to spaces and tack on the classifications.                      # we must convert underscores to spaces.
481                      my @subsystems = $fig->peg_to_subsystems($featureID);                      my @subsystems = $fig->peg_to_subsystems($featureID);
482                      for my $subsystem (@subsystems) {                      for my $subsystem (@subsystems) {
483                          # Only proceed if we like this subsystem.                          # Only proceed if we like this subsystem.
# Line 607  Line 506 
506                      # [name, value, value with URL]. (We don't need the PEG, since                      # [name, value, value with URL]. (We don't need the PEG, since
507                      # we already know it.)                      # we already know it.)
508                      my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }                      my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
509                                           $fig->get_attributes($featureID);                                           @{$attributes->{$featureID}};
510                      # Now we process each of the special attributes.                      # Now we process each of the special attributes.
511                      if (SpecialAttribute($featureID, \@attributes,                      if (SpecialAttribute($featureID, \@attributes,
512                                           1, [0,2], '^(essential|potential_essential)$',                                           1, [0,2], '^(essential|potential_essential)$',
# Line 627  Line 526 
526                          push @keywords, 'iedb';                          push @keywords, 'iedb';
527                          $loadFeature->Add('iedb');                          $loadFeature->Add('iedb');
528                      }                      }
529                        # Now we have some other attributes we need to process. Currently,
530                        # this is CDD and CELLO, but we expect the number to increase.
531                        my %attributeHash = ();
532                        for my $attrRow (@{$attributes->{$featureID}}) {
533                            my (undef, $key, @values) = @{$attrRow};
534                            $key =~ /^([^:]+)::(.+)/;
535                            if (exists $attributeHash{$1}) {
536                                $attributeHash{$1}->{$2} = \@values;
537                            } else {
538                                $attributeHash{$1} = {$2 => \@values};
539                            }
540                        }
541                        my $celloValue = "unknown";
542                        # Pull in the CELLO attribute. There will never be more than one.
543                        # If we have one, it's a feature attribute AND a keyword.
544                        my @celloData = keys %{$attributeHash{CELLO}};
545                        if (@celloData) {
546                            $celloValue = $celloData[0];
547                            push @keywords, $celloValue;
548                        }
549                        # Now we handle CDD. This is a bit more complicated, because
550                        # there are multiple CDDs per protein.
551                        if (exists $attributeHash{CDD}) {
552                            # Get the hash of CDD IDs to scores for this feature. We
553                            # already know it exists because of the above IF.
554                            my $cddHash = $attributeHash{CDD};
555                            my @cddData = sort keys %{$cddHash};
556                            for my $cdd (@cddData) {
557                                # Extract the score for this CDD and decode it.
558                                my ($codeScore) = split(/\s*,\s*/, $cddHash->{$cdd}->[1]);
559                                my $realScore = FIGRules::DecodeScore($codeScore);
560                                # We can't afford to crash because of a bad attribute
561                                # value, hence the IF below.
562                                if (! defined($realScore)) {
563                                    # Bad score, so count it.
564                                    $loadFeature->Add('badCDDscore');
565                                } else {
566                                    # Create the connection.
567                                    $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
568                                    # If this CDD does not yet exist, create its record.
569                                    if (! exists $CDD{$cdd}) {
570                                        $CDD{$cdd} = 1;
571                                        $loadCDD->Put($cdd);
572                                    }
573                                }
574                            }
575                        }
576                      # Now we need to bust up hyphenated words in the keyword                      # Now we need to bust up hyphenated words in the keyword
577                      # list.                      # list. We keep them separate and put them at the end so
578                        # the original word order is available.
579                      my $keywordString = "";                      my $keywordString = "";
580                        my $bustedString = "";
581                      for my $keyword (@keywords) {                      for my $keyword (@keywords) {
582                          if (length $keyword >= 4) {                          if (length $keyword >= 3) {
583                              $keywordString .= " $keyword";                              $keywordString .= " $keyword";
584                              if ($keyword =~ /-/) {                              if ($keyword =~ /-/) {
585                                  my @words = grep { length($_) >= 4 } split /-/, $keyword;                                  my @words = split /-/, $keyword;
586                                  $keywordString .= join(" ", "", @words);                                  $bustedString .= join(" ", "", @words);
587                              }                              }
588                          }                          }
589                      }                      }
590                        $keywordString .= $bustedString;
591                        # Get rid of annoying punctuation.
592                        $keywordString =~ s/[();]//g;
593                      # Clean the keyword list.                      # Clean the keyword list.
594                      my $cleanWords = $sprout->CleanKeywords($keywordString);                      my $cleanWords = $sprout->CleanKeywords($keywordString);
595                      Trace("Keyword string for $featureID: $cleanWords") if T(4);                      Trace("Keyword string for $featureID: $cleanWords") if T(4);
596                      # Create the feature record.                      # Now we need to process the feature's locations. First, we split them up.
597                      $loadFeature->Put($featureID, 1, $type, $assignment, $cleanWords);                      my @locationList = split /\s*,\s*/, $locations;
598                        # Next, we convert them to Sprout location objects.
599                        my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
600                        # Assemble them into a sprout location string for later.
601                        my $locationString = join(", ", map { $_->String } @locObjectList);
602                      # This part is the roughest. We need to relate the features to contig                      # This part is the roughest. We need to relate the features to contig
603                      # locations, and the locations must be split so that none of them exceed                      # locations, and the locations must be split so that none of them exceed
604                      # the maximum segment size. This simplifies the genes_in_region processing                      # the maximum segment size. This simplifies the genes_in_region processing
605                      # for Sprout.                      # for Sprout. To start, we create the location position indicator.
                     my @locationList = split /\s*,\s*/, $locations;  
                     # Create the location position indicator.  
606                      my $i = 1;                      my $i = 1;
607                      # Loop through the locations.                      # Loop through the locations.
608                      for my $location (@locationList) {                      for my $locObject (@locObjectList) {
609                          # Parse the location.                          # Split this location into a list of chunks.
                         my $locObject = BasicLocation->new("$genomeID:$location");  
                         # Split it into a list of chunks.  
610                          my @locOList = ();                          my @locOList = ();
611                          while (my $peeling = $locObject->Peel($chunkSize)) {                          while (my $peeling = $locObject->Peel($chunkSize)) {
612                              $loadIsLocatedIn->Add("peeling");                              $loadIsLocatedIn->Add("peeling");
# Line 670  Line 621 
621                              $i++;                              $i++;
622                          }                          }
623                      }                      }
624                        # Finally, reassemble the location objects into a list of Sprout location strings.
625                        # Create the feature record.
626                        $loadFeature->Put($featureID, 1, $user, $quality, $celloValue, $type, $assignment, $cleanWords, $locationString);
627                  }                  }
628              }              }
629                Trace("Genome $genomeID processed.") if T(3);
630          }          }
631      }      }
632      # Finish the loads.      # Finish the loads.
# Line 681  Line 636 
636    
637  =head3 LoadSubsystemData  =head3 LoadSubsystemData
638    
639  C<< my $stats = $spl->LoadSubsystemData(); >>      my $stats = $spl->LoadSubsystemData();
640    
641  Load the subsystem data from FIG into Sprout.  Load the subsystem data from FIG into Sprout.
642    
# Line 697  Line 652 
652      SubsystemClass      SubsystemClass
653      Role      Role
654      RoleEC      RoleEC
655        IsIdentifiedByEC
656      SSCell      SSCell
657      ContainsFeature      ContainsFeature
658      IsGenomeOf      IsGenomeOf
# Line 738  Line 694 
694      # Get the map list.      # Get the map list.
695      my @maps = $fig->all_maps;      my @maps = $fig->all_maps;
696      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
697      my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly);      my $loadDiagram = $self->_TableLoader('Diagram');
698      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn');
699      my $loadSubsystem = $self->_TableLoader('Subsystem');      my $loadSubsystem = $self->_TableLoader('Subsystem');
700      my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly);      my $loadRole = $self->_TableLoader('Role');
701      my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly);      my $loadRoleEC = $self->_TableLoader('RoleEC');
702      my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly);      my $loadIsIdentifiedByEC = $self->_TableLoader('IsIdentifiedByEC');
703      my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly);      my $loadCatalyzes = $self->_TableLoader('Catalyzes');
704      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly);      my $loadSSCell = $self->_TableLoader('SSCell');
705      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature');
706      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf');
707      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly);      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf');
708      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly);      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem');
709      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly);      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn');
710      my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly);      my $loadHasSSCell = $self->_TableLoader('HasSSCell');
711      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly);      my $loadRoleSubset = $self->_TableLoader('RoleSubset');
712      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly);      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset');
713      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly);      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles');
714      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly);      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes');
715      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly);      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');
716      my $loadSubsystemClass = $self->_TableLoader('SubsystemClass', $self->PrimaryOnly);      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');
717        my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');
718      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
719          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
720      } else {      } else {
721          Trace("Generating subsystem data.") if T(2);          Trace("Generating subsystem data.") if T(2);
722          # This hash will contain the role for each EC. When we're done, this          # This hash will contain the roles for each EC. When we're done, this
723          # information will be used to generate the Catalyzes table.          # information will be used to generate the Catalyzes table.
724          my %ecToRoles = ();          my %ecToRoles = ();
725          # Loop through the subsystems. Our first task will be to create the          # Loop through the subsystems. Our first task will be to create the
# Line 776  Line 733 
733              # Get the subsystem object.              # Get the subsystem object.
734              my $sub = $fig->get_subsystem($subsysID);              my $sub = $fig->get_subsystem($subsysID);
735              # Only proceed if the subsystem has a spreadsheet.              # Only proceed if the subsystem has a spreadsheet.
736              if (! $sub->{empty_ss}) {              if (defined($sub) && ! $sub->{empty_ss}) {
737                  Trace("Creating subsystem $subsysID.") if T(3);                  Trace("Creating subsystem $subsysID.") if T(3);
738                  $loadSubsystem->Add("subsystemIn");                  $loadSubsystem->Add("subsystemIn");
739                  # Create the subsystem record.                  # Create the subsystem record.
# Line 790  Line 747 
747                  $loadSubsystemClass->Put($subsysID, $classString);                  $loadSubsystemClass->Put($subsysID, $classString);
748                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
749                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
750                        # Get the role's abbreviation.
751                        my $abbr = $sub->get_role_abbr($col);
752                      # Connect to this role.                      # Connect to this role.
753                      $loadOccursInSubsystem->Add("roleIn");                      $loadOccursInSubsystem->Add("roleIn");
754                      $loadOccursInSubsystem->Put($roleID, $subsysID, $col);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $col);
755                      # If it's a new role, add it to the role table.                      # If it's a new role, add it to the role table.
756                      if (! exists $roleData{$roleID}) {                      if (! exists $roleData{$roleID}) {
757                          # Get the role's abbreviation.                          # Get the role's abbreviation.
                         my $abbr = $sub->get_role_abbr($col);  
758                          # Add the role.                          # Add the role.
759                          $loadRole->Put($roleID, $abbr);                          $loadRole->Put($roleID);
760                          $roleData{$roleID} = 1;                          $roleData{$roleID} = 1;
761                          # Check for an EC number.                          # Check for an EC number.
762                          if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) {                          if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
763                              my $ec = $1;                              my $ec = $1;
764                              $loadRoleEC->Put($roleID, $ec);                              $loadIsIdentifiedByEC->Put($roleID, $ec);
765                              $ecToRoles{$ec} = $roleID;                              # Check to see if this is our first encounter with this EC.
766                                if (exists $ecToRoles{$ec}) {
767                                    # No, so just add this role to the EC list.
768                                    push @{$ecToRoles{$ec}}, $roleID;
769                                } else {
770                                    # Output this EC.
771                                    $loadRoleEC->Put($ec);
772                                    # Create its role list.
773                                    $ecToRoles{$ec} = [$roleID];
774                                }
775                          }                          }
776                      }                      }
777                  }                  }
# Line 917  Line 884 
884              # Now we need to link all the map's roles to it.              # Now we need to link all the map's roles to it.
885              # A hash is used to prevent duplicates.              # A hash is used to prevent duplicates.
886              my %roleHash = ();              my %roleHash = ();
887              for my $role ($fig->map_to_ecs($map)) {              for my $ec ($fig->map_to_ecs($map)) {
888                  if (exists $ecToRoles{$role} && ! $roleHash{$role}) {                  if (exists $ecToRoles{$ec}) {
889                      $loadRoleOccursIn->Put($ecToRoles{$role}, $map);                      for my $role (@{$ecToRoles{$ec}}) {
890                            if (! $roleHash{$role}) {
891                                $loadRoleOccursIn->Put($role, $map);
892                      $roleHash{$role} = 1;                      $roleHash{$role} = 1;
893                  }                  }
894              }              }
895          }          }
896                }
897            }
898          # Before we leave, we must create the Catalyzes table. We start with the reactions,          # Before we leave, we must create the Catalyzes table. We start with the reactions,
899          # then use the "ecToRoles" table to convert EC numbers to role IDs.          # then use the "ecToRoles" table to convert EC numbers to role IDs.
900          my @reactions = $fig->all_reactions();          my @reactions = $fig->all_reactions();
901          for my $reactionID (@reactions) {          for my $reactionID (@reactions) {
902              # Get this reaction's list of roles. The results will be EC numbers.              # Get this reaction's list of roles. The results will be EC numbers.
903              my @roles = $fig->catalyzed_by($reactionID);              my @ecs = $fig->catalyzed_by($reactionID);
904              # Loop through the roles, creating catalyzation records.              # Loop through the roles, creating catalyzation records.
905              for my $thisRole (@roles) {              for my $thisEC (@ecs) {
906                  if (exists $ecToRoles{$thisRole}) {                  if (exists $ecToRoles{$thisEC}) {
907                      $loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID);                      for my $thisRole (@{$ecToRoles{$thisEC}}) {
908                            $loadCatalyzes->Put($thisRole, $reactionID);
909                        }
910                  }                  }
911              }              }
912          }          }
# Line 945  Line 918 
918    
919  =head3 LoadPropertyData  =head3 LoadPropertyData
920    
921  C<< my $stats = $spl->LoadPropertyData(); >>      my $stats = $spl->LoadPropertyData();
922    
923  Load the attribute data from FIG into Sprout.  Load the attribute data from FIG into Sprout.
924    
# Line 981  Line 954 
954      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
955      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
956      my $loadProperty = $self->_TableLoader('Property');      my $loadProperty = $self->_TableLoader('Property');
957      my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly);      my $loadHasProperty = $self->_TableLoader('HasProperty');
958      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
959          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
960      } else {      } else {
# Line 989  Line 962 
962          # Create a hash for storing property IDs.          # Create a hash for storing property IDs.
963          my %propertyKeys = ();          my %propertyKeys = ();
964          my $nextID = 1;          my $nextID = 1;
965            # Get the attributes we intend to store in the property table.
966            my $propKeys = $self->{propKeys};
967          # Loop through the genomes.          # Loop through the genomes.
968          for my $genomeID (sort keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
969              $loadProperty->Add("genomeIn");              $loadProperty->Add("genomeIn");
970              Trace("Generating properties for $genomeID.") if T(3);              Trace("Generating properties for $genomeID.") if T(3);
971              # Get the genome's features. The feature ID is the first field in the              # Initialize a counter.
             # tuples returned by "all_features_detailed". We use "all_features_detailed"  
             # rather than "all_features" because we want all features regardless of type.  
             my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};  
             my $featureCount = 0;  
972              my $propertyCount = 0;              my $propertyCount = 0;
973              # Loop through the features, creating HasProperty records.              # Get the properties for this genome's features.
974              for my $fid (@features) {              my @attributes = $fig->get_attributes("fig|$genomeID%", $propKeys);
975                  # Get all attributes for this feature. We do this one feature at a time              Trace("Property list built for $genomeID.") if T(3);
976                  # to insure we do not get any genome attributes.              # Loop through the results, creating HasProperty records.
977                  my @attributeList = $fig->get_attributes($fid);              for my $attributeData (@attributes) {
978                  # Add essentiality and virulence attributes.                  # Pull apart the attribute tuple.
979                  if ($fig->essential($fid)) {                  my ($fid, $key, $value, $url) = @{$attributeData};
                     push @attributeList, [$fid, 'essential', 1, ''];  
                 }  
                 if ($fig->virulent($fid)) {  
                     push @attributeList, [$fid, 'virulent', 1, ''];  
                 }  
                 if (scalar @attributeList) {  
                     $featureCount++;  
                 }  
                 # Loop through the attributes.  
                 for my $tuple (@attributeList) {  
                     $propertyCount++;  
                     # Get this attribute value's data. Note that we throw away the FID,  
                     # since it will always be the same as the value if "$fid".  
                     my (undef, $key, $value, $url) = @{$tuple};  
980                      # Concatenate the key and value and check the "propertyKeys" hash to                      # Concatenate the key and value and check the "propertyKeys" hash to
981                      # see if we already have an ID for it. We use a tab for the separator                      # see if we already have an ID for it. We use a tab for the separator
982                      # character.                      # character.
# Line 1037  Line 994 
994                      # Create the HasProperty entry for this feature/property association.                      # Create the HasProperty entry for this feature/property association.
995                      $loadHasProperty->Put($fid, $propertyID, $url);                      $loadHasProperty->Put($fid, $propertyID, $url);
996                  }                  }
             }  
997              # Update the statistics.              # Update the statistics.
998              Trace("$propertyCount attributes processed for $featureCount features.") if T(3);              Trace("$propertyCount attributes processed.") if T(3);
             $loadHasProperty->Add("featuresIn", $featureCount);  
999              $loadHasProperty->Add("propertiesIn", $propertyCount);              $loadHasProperty->Add("propertiesIn", $propertyCount);
1000          }          }
1001      }      }
# Line 1051  Line 1006 
1006    
1007  =head3 LoadAnnotationData  =head3 LoadAnnotationData
1008    
1009  C<< my $stats = $spl->LoadAnnotationData(); >>      my $stats = $spl->LoadAnnotationData();
1010    
1011  Load the annotation data from FIG into Sprout.  Load the annotation data from FIG into Sprout.
1012    
# Line 1085  Line 1040 
1040      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1041      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1042      my $loadAnnotation = $self->_TableLoader('Annotation');      my $loadAnnotation = $self->_TableLoader('Annotation');
1043      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation');
1044      my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly);      my $loadSproutUser = $self->_TableLoader('SproutUser');
1045      my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly);      my $loadUserAccess = $self->_TableLoader('UserAccess');
1046      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation');
1047      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1048          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1049      } else {      } else {
# Line 1158  Line 1113 
1113    
1114  =head3 LoadSourceData  =head3 LoadSourceData
1115    
1116  C<< my $stats = $spl->LoadSourceData(); >>      my $stats = $spl->LoadSourceData();
1117    
1118  Load the source data from FIG into Sprout.  Load the source data from FIG into Sprout.
1119    
# Line 1192  Line 1147 
1147      # Get the genome hash.      # Get the genome hash.
1148      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1149      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1150      my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly);      my $loadComesFrom = $self->_TableLoader('ComesFrom');
1151      my $loadSource = $self->_TableLoader('Source');      my $loadSource = $self->_TableLoader('Source');
1152      my $loadSourceURL = $self->_TableLoader('SourceURL');      my $loadSourceURL = $self->_TableLoader('SourceURL');
1153      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
# Line 1236  Line 1191 
1191    
1192  =head3 LoadExternalData  =head3 LoadExternalData
1193    
1194  C<< my $stats = $spl->LoadExternalData(); >>      my $stats = $spl->LoadExternalData();
1195    
1196  Load the external data from FIG into Sprout.  Load the external data from FIG into Sprout.
1197    
# Line 1316  Line 1271 
1271    
1272  =head3 LoadReactionData  =head3 LoadReactionData
1273    
1274  C<< my $stats = $spl->LoadReactionData(); >>      my $stats = $spl->LoadReactionData();
1275    
1276  Load the reaction data from FIG into Sprout.  Load the reaction data from FIG into Sprout.
1277    
# Line 1329  Line 1284 
1284      Compound      Compound
1285      CompoundName      CompoundName
1286      CompoundCAS      CompoundCAS
1287        IsIdentifiedByCAS
1288        HasCompoundName
1289      IsAComponentOf      IsAComponentOf
1290    
1291  This method proceeds reaction by reaction rather than genome by genome.  This method proceeds reaction by reaction rather than genome by genome.
# Line 1350  Line 1307 
1307      my $fig = $self->{fig};      my $fig = $self->{fig};
1308      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1309      my $loadReaction = $self->_TableLoader('Reaction');      my $loadReaction = $self->_TableLoader('Reaction');
1310      my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly);      my $loadReactionURL = $self->_TableLoader('ReactionURL');
1311      my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly);      my $loadCompound = $self->_TableLoader('Compound');
1312      my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly);      my $loadCompoundName = $self->_TableLoader('CompoundName');
1313      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly);      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS');
1314      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly);      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf');
1315        my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS');
1316        my $loadHasCompoundName = $self->_TableLoader('HasCompoundName');
1317      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1318          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1319      } else {      } else {
1320          Trace("Generating annotation data.") if T(2);          Trace("Generating reaction data.") if T(2);
1321            # We need some hashes to prevent duplicates.
1322            my %compoundNames = ();
1323            my %compoundCASes = ();
1324          # First we create the compounds.          # First we create the compounds.
1325          my @compounds = $fig->all_compounds();          my @compounds = $fig->all_compounds();
1326          for my $cid (@compounds) {          for my $cid (@compounds) {
# Line 1367  Line 1329 
1329              # Each name will be given a priority number, starting with 1.              # Each name will be given a priority number, starting with 1.
1330              my $prio = 1;              my $prio = 1;
1331              for my $name (@names) {              for my $name (@names) {
1332                  $loadCompoundName->Put($cid, $name, $prio++);                  if (! exists $compoundNames{$name}) {
1333                        $loadCompoundName->Put($name);
1334                        $compoundNames{$name} = 1;
1335                    }
1336                    $loadHasCompoundName->Put($cid, $name, $prio++);
1337              }              }
1338              # Create the main compound record. Note that the first name              # Create the main compound record. Note that the first name
1339              # becomes the label.              # becomes the label.
# Line 1376  Line 1342 
1342              # Check for a CAS ID.              # Check for a CAS ID.
1343              my $cas = $fig->cas($cid);              my $cas = $fig->cas($cid);
1344              if ($cas) {              if ($cas) {
1345                  $loadCompoundCAS->Put($cid, $cas);                  $loadIsIdentifiedByCAS->Put($cid, $cas);
1346                    if (! exists $compoundCASes{$cas}) {
1347                        $loadCompoundCAS->Put($cas);
1348                        $compoundCASes{$cas} = 1;
1349                    }
1350              }              }
1351          }          }
1352          # All the compounds are set up, so we need to loop through the reactions next. First,          # All the compounds are set up, so we need to loop through the reactions next. First,
# Line 1413  Line 1383 
1383      return $retVal;      return $retVal;
1384  }  }
1385    
 =head3 LoadGroupData  
   
 C<< my $stats = $spl->LoadGroupData(); >>  
   
 Load the genome Groups into Sprout.  
   
 The following relations are loaded by this method.  
   
     GenomeGroups  
   
 Currently, we do not use groups. We used to use them for NMPDR groups,  
 but there is no direct support for genome groups in FIG, so we access the SEED  
 files directly.  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadGroupData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the genome hash.  
     my $genomeHash = $self->{genomes};  
     # Create a load object for the table we're loading.  
     my $loadGenomeGroups = $self->_TableLoader('GenomeGroups');  
     if ($self->{options}->{loadOnly}) {  
         Trace("Loading from existing files.") if T(2);  
     } else {  
         Trace("Generating group data.") if T(2);  
         # Currently there are no groups.  
     }  
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
1386  =head3 LoadSynonymData  =head3 LoadSynonymData
1387    
1388  C<< my $stats = $spl->LoadSynonymData(); >>      my $stats = $spl->LoadSynonymData();
1389    
1390  Load the synonym groups into Sprout.  Load the synonym groups into Sprout.
1391    
# Line 1498  Line 1424 
1424          Trace("Generating synonym group data.") if T(2);          Trace("Generating synonym group data.") if T(2);
1425          # Get the database handle.          # Get the database handle.
1426          my $dbh = $fig->db_handle();          my $dbh = $fig->db_handle();
1427          # Ask for the synonyms.          # Ask for the synonyms. Note that "maps_to" is a group name, and "syn_id" is a PEG ID or alias.
1428          my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");          my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
1429          my $result = $sth->execute();          my $result = $sth->execute();
1430          if (! defined($result)) {          if (! defined($result)) {
1431              Confess("Database error in Synonym load: " . $sth->errstr());              Confess("Database error in Synonym load: " . $sth->errstr());
1432          } else {          } else {
1433                Trace("Processing synonym results.") if T(2);
1434              # Remember the current synonym.              # Remember the current synonym.
1435              my $current_syn = "";              my $current_syn = "";
1436              # Count the features.              # Count the features.
1437              my $featureCount = 0;              my $featureCount = 0;
1438                my $entryCount = 0;
1439              # Loop through the synonym/peg pairs.              # Loop through the synonym/peg pairs.
1440              while (my @row = $sth->fetchrow()) {              while (my @row = $sth->fetchrow()) {
1441                  # Get the synonym ID and feature ID.                  # Get the synonym group ID and feature ID.
1442                  my ($syn_id, $peg) = @row;                  my ($syn_id, $peg) = @row;
1443                    # Count this row.
1444                    $entryCount++;
1445                    if ($entryCount % 1000 == 0) {
1446                        Trace("$entryCount rows processed.") if T(3);
1447                    }
1448                  # Ensure it's for one of our genomes.                  # Ensure it's for one of our genomes.
1449                  my $genomeID = FIG::genome_of($peg);                  my $genomeID = FIG::genome_of($peg);
1450                  if (exists $genomeHash->{$genomeID}) {                  if (exists $genomeHash->{$genomeID}) {
# Line 1530  Line 1463 
1463                      }                      }
1464                  }                  }
1465              }              }
1466                Trace("$entryCount rows produced $featureCount features.") if T(2);
1467          }          }
1468      }      }
1469      # Finish the load.      # Finish the load.
# Line 1539  Line 1473 
1473    
1474  =head3 LoadFamilyData  =head3 LoadFamilyData
1475    
1476  C<< my $stats = $spl->LoadFamilyData(); >>      my $stats = $spl->LoadFamilyData();
1477    
1478  Load the protein families into Sprout.  Load the protein families into Sprout.
1479    
# Line 1607  Line 1541 
1541    
1542  =head3 LoadDrugData  =head3 LoadDrugData
1543    
1544  C<< my $stats = $spl->LoadDrugData(); >>      my $stats = $spl->LoadDrugData();
1545    
1546  Load the drug target data into Sprout.  Load the drug target data into Sprout.
1547    
1548  The following relations are loaded by this method.  The following relations are loaded by this method.
1549    
     DrugProject  
     ContainsTopic  
     DrugTopic  
     ContainsAnalysisOf  
1550      PDB      PDB
1551      IncludesBound      DocksWith
1552      IsBoundIn      IsProteinForFeature
     BindsWith  
1553      Ligand      Ligand
     DescribesProteinForFeature  
     FeatureConservation  
1554    
1555  The source information for these relations is taken from flat files in the  The source information for these relations is taken from attributes. The
1556  C<$FIG_Config::drug_directory>. The file C<master_tables.list> contains  C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>.
1557  a list of drug project names paired with file names. The named file (in the  The C<zinc_name> attribute describes the ligands. The C<docking_results>
1558  same directory) contains all the data for the project.  attribute contains the information for the B<DocksWith> relationship. It is
1559    expected that additional attributes and tables will be added in the future.
1560    
1561  =over 4  =over 4
1562    
# Line 1648  Line 1576 
1576      # Get the genome hash.      # Get the genome hash.
1577      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1578      # Create load objects for the tables we're loading.      # Create load objects for the tables we're loading.
     my $loadDrugProject = $self->_TableLoader('DrugProject');  
     my $loadContainsTopic = $self->_TableLoader('ContainsTopic');  
     my $loadDrugTopic = $self->_TableLoader('DrugTopic');  
     my $loadContainsAnalysisOf = $self->_TableLoader('ContainsAnalysisOf');  
1579      my $loadPDB = $self->_TableLoader('PDB');      my $loadPDB = $self->_TableLoader('PDB');
     my $loadIncludesBound = $self->_TableLoader('IncludesBound');  
     my $loadIsBoundIn = $self->_TableLoader('IsBoundIn');  
     my $loadBindsWith = $self->_TableLoader('BindsWith');  
1580      my $loadLigand = $self->_TableLoader('Ligand');      my $loadLigand = $self->_TableLoader('Ligand');
1581      my $loadDescribesProteinForFeature = $self->_TableLoader('DescribesProteinForFeature');      my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature');
1582      my $loadFeatureConservation = $self->_TableLoader('FeatureConservation');      my $loadDocksWith = $self->_TableLoader('DocksWith');
1583      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1584          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1585      } else {      } else {
1586          Trace("Generating drug target data.") if T(2);          Trace("Generating drug target data.") if T(2);
1587          # Load the project list. The file comes in as a list of chomped lines,          # First comes the "DocksWith" relationship. This will give us a list of PDBs.
1588          # and we split them on the TAB character to make the project name the          # We can also encounter PDBs when we process "IsProteinForFeature". To manage
1589          # key and the file name the value of the resulting hash.          # this process, PDB information is collected in a hash table and then
1590          my %projects = map { split /\t/, $_ } Tracer::GetFile("$FIG_Config::drug_directory/master_tables.list");          # unspooled after both relationships are created.
1591          # Create hashes for the derived objects: PDBs, Features, and Ligands. These objects          my %pdbHash = ();
1592          # may occur multiple times in a single project file or even in multiple project          Trace("Generating docking data.") if T(2);
1593          # files.          # Get all the docking data. This may cause problems if there are too many PDBs,
1594          my %ligands = ();          # at which point we'll need another algorithm. The indicator that this is
1595          my %pdbs = ();          # happening will be a timeout error in the next statement.
1596          my %features = ();          my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
1597          my %bindings = ();                                                ['docking_results', $FIG_Config::dockLimit]);
1598          # Set up a counter for drug topics. This will be used as the key.          Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
1599          my $topicCounter = 0;          for my $dockData (@dockData) {
1600          # Loop through the projects. We sort the keys not because we need them sorted, but              # Get the docking data components.
1601          # because it makes it easier to infer our progress from trace messages.              my ($pdbID, $docking_key, @valueData) = @{$dockData};
1602          for my $project (sort keys %projects) {              # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
1603              Trace("Processing project $project.") if T(3);              $pdbID = lc $pdbID;
1604              # Only proceed if the download file exists.              # Strip off the object type.
1605              my $projectFile = "$FIG_Config::drug_directory/$projects{$project}";              $pdbID =~ s/pdb://;
1606              if (! -f $projectFile) {              # Extract the ZINC ID from the docking key. Note that there are two possible
1607                  Trace("Project file $projectFile not found.") if T(0);              # formats.
1608              } else {              my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/;
1609                  # Create the project record.              if (! $zinc_id) {
1610                  $loadDrugProject->Put($project);                  Trace("Invalid docking result key $docking_key for $pdbID.") if T(0);
1611                  # Create a hash for the topics. Each project has one or more topics. The                  $loadDocksWith->Add("errors");
1612                  # topic is identified by a URL, a category, and an identifier.              } else {
1613                  my %topics = ();                  # Get the pieces of the value and parse the energy.
1614                  # Now we can open the project file.                  # Note that we don't care about the rank, since
1615                  Trace("Reading project file $projectFile.") if T(3);                  # we can sort on the energy level itself in our database.
1616                  Open(\*PROJECT, "<$projectFile");                  my ($energy, $tool, $type) = @valueData;
1617                  # Get the first record, which is a list of column headers. We don't use this                  my ($rank, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
1618                  # for anything, but it may be useful for debugging.                  # Ignore predicted results.
1619                  my $headerLine = <PROJECT>;                  if ($type ne "Predicted") {
1620                  # Loop through the rest of the records.                      # Count this docking result.
1621                  while (! eof PROJECT) {                      if (! exists $pdbHash{$pdbID}) {
1622                      # Get the current line of data. Note that not all lines will have all                          $pdbHash{$pdbID} = 1;
1623                      # the fields. In particular, the CLIBE data is fairly rare.                      } else {
1624                      my ($authorOrganism, $category, $tag, $refURL, $peg, $conservation,                          $pdbHash{$pdbID}++;
1625                          $pdbBound, $pdbBoundEval, $pdbFree, $pdbFreeEval, $pdbFreeTitle,                      }
1626                          $protDistInfo, $passAspInfo, $passAspFile, $passWeightInfo,                      # Write the result to the output.
1627                          $passWeightFile, $clibeInfo, $clibeURL, $clibeTotalEnergy,                      $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool,
1628                          $clibeVanderwaals, $clibeHBonds, $clibeEI, $clibeSolvationE)                                          $total, $vanderwaals);
1629                         = Tracer::GetLine(\*PROJECT);                  }
1630                      # The tag contains an identifier for the current line of data followed              }
                     # by a text statement that generally matches a property name in the  
                     # main database. We split it up, since the identifier goes with  
                     # the PDB data and the text statement is part of the topic.  
                     my ($lineID, $topicTag) = split /\s*,\s*/, $tag;  
                     $loadDrugProject->Add("data line");  
                     # Check for a new topic.  
                     my $topicData = "$category\t$topicTag\t$refURL";  
                     if (! exists $topics{$topicData}) {  
                         # Here we have a new topic. Compute its ID.  
                         $topicCounter++;  
                         $topics{$topicData} = $topicCounter;  
                         # Create its database record.  
                         $loadDrugTopic->Put($topicCounter, $refURL, $category, $authorOrganism,  
                                             $topicTag);  
                         # Connect it to the project.  
                         $loadContainsTopic->Put($project, $topicCounter);  
                         $loadDrugTopic->Add("topic");  
                     }  
                     # Now we know the topic ID exists in the hash and the topic will  
                     # appear in the database, so we get this topic's ID.  
                     my $topicID = $topics{$topicData};  
                     # If the feature in this line is new, we need to save its conservation  
                     # number.  
                     if (! exists $features{$peg}) {  
                         $loadFeatureConservation->Put($peg, $conservation);  
                         $features{$peg} = 1;  
                     }  
                     # Now we have two PDBs to deal with-- a bound PDB and a free PDB.  
                     # The free PDB will have data about docking points; the bound PDB  
                     # will have data about docking. We store both types as PDBs, and  
                     # the special data comes from relationships. First we process the  
                     # bound PDB.  
                     if ($pdbBound) {  
                         $loadPDB->Add("bound line");  
                         # Insure this PDB is in the database.  
                         $self->CreatePDB($pdbBound, lc "$pdbFreeTitle (bound)", "bound", \%pdbs, $loadPDB);  
                         # Connect it to this topic.  
                         $loadIncludesBound->Put($topicID, $pdbBound);  
                         # Check for CLIBE data.  
                         if ($clibeInfo) {  
                             $loadLigand->Add("clibes");  
                             # We have CLIBE data, so we create a ligand and relate it to the PDB.  
                             if (! exists $ligands{$clibeInfo}) {  
                                 # This is a new ligand, so create its record.  
                                 $loadLigand->Put($clibeInfo);  
                                 $loadLigand->Add("ligand");  
                                 # Make sure we know this ligand already exists.  
                                 $ligands{$clibeInfo} = 1;  
                             }  
                             # Now connect the PDB to the ligand using the CLIBE data.  
                             $loadBindsWith->Put($pdbBound, $clibeInfo, $clibeURL, $clibeHBonds, $clibeEI,  
                                                 $clibeSolvationE, $clibeVanderwaals);  
                         }  
                         # Connect this PDB to the feature.  
                         $loadDescribesProteinForFeature->Put($pdbBound, $peg, $protDistInfo, $pdbBoundEval);  
                     }  
                     # Next is the free PDB.  
                     if ($pdbFree) {  
                         $loadPDB->Add("free line");  
                         # Insure this PDB is in the database.  
                         $self->CreatePDB($pdbFree, lc $pdbFreeTitle, "free", \%pdbs, $loadPDB);  
                         # Connect it to this topic.  
                         $loadContainsAnalysisOf->Put($topicID, $pdbFree, $passAspInfo,  
                                                      $passWeightFile, $passWeightInfo, $passAspFile);  
                         # Connect this PDB to the feature.  
                         $loadDescribesProteinForFeature->Put($pdbFree, $peg, $protDistInfo, $pdbFreeEval);  
                     }  
                     # If we have both PDBs, we may need to link them.  
                     if ($pdbFree && $pdbBound) {  
                         $loadIsBoundIn->Add("connection");  
                         # Insure we only link them once.  
                         my $bindingKey =  "$pdbFree\t$pdbBound";  
                         if (! exists $bindings{$bindingKey}) {  
                             $loadIsBoundIn->Add("newConnection");  
                             $loadIsBoundIn->Put($pdbFree, $pdbBound);  
                             $bindings{$bindingKey} = 1;  
1631                          }                          }
1632            Trace("Connecting features.") if T(2);
1633            # Loop through the genomes.
1634            for my $genome (sort keys %{$genomeHash}) {
1635                Trace("Generating PDBs for $genome.") if T(3);
1636                # Get all of the PDBs that BLAST against this genome's features.
1637                my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%');
1638                for my $pdbData (@attributeData) {
1639                    # The PDB ID is coded as a subkey.
1640                    if ($pdbData->[1] !~ /PDB::(.+)/i) {
1641                        Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0);
1642                        $loadPDB->Add("errors");
1643                    } else {
1644                        my $pdbID = $1;
1645                        # Insure the PDB is in the hash.
1646                        if (! exists $pdbHash{$pdbID}) {
1647                            $pdbHash{$pdbID} = 0;
1648                        }
1649                        # The score and locations are coded in the attribute value.
1650                        if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
1651                            Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0);
1652                            $loadIsProteinForFeature->Add("errors");
1653                        } else {
1654                            my ($score, $locData) = ($1,$2);
1655                            # The location data may not be present, so we have to start with some
1656                            # defaults and then check.
1657                            my ($start, $end) = (1, 0);
1658                            if ($locData) {
1659                                $locData =~ /(\d+)-(\d+)/;
1660                                $start = $1;
1661                                $end = $2;
1662                            }
1663                            # If we still don't have the end location, compute it from
1664                            # the feature length.
1665                            if (! $end) {
1666                                # Most features have one location, but we do a list iteration
1667                                # just in case.
1668                                my @locations = $fig->feature_location($pdbData->[0]);
1669                                $end = 0;
1670                                for my $loc (@locations) {
1671                                    my $locObject = BasicLocation->new($loc);
1672                                    $end += $locObject->Length;
1673                                }
1674                            }
1675                            # Decode the score.
1676                            my $realScore = FIGRules::DecodeScore($score);
1677                            # Connect the PDB to the feature.
1678                            $loadIsProteinForFeature->Put($pdbID, $pdbData->[0], $start, $realScore, $end);
1679                        }
1680                    }
1681                }
1682            }
1683            # We've got all our PDBs now, so we unspool them from the hash.
1684            Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2);
1685            my $count = 0;
1686            for my $pdbID (sort keys %pdbHash) {
1687                $loadPDB->Put($pdbID, $pdbHash{$pdbID});
1688                $count++;
1689                Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0);
1690            }
1691            # Finally we create the ligand table. This information can be found in the
1692            # zinc_name attribute.
1693            Trace("Loading ligands.") if T(2);
1694            # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
1695            my $last_zinc_id = "";
1696            my $zinc_id = "";
1697            my $done = 0;
1698            while (! $done) {
1699                # Get the next 10000 ligands. We insist that the object ID is greater than
1700                # the last ID we processed.
1701                Trace("Loading batch starting with ZINC:$zinc_id.") if T(3);
1702                my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
1703                                                           ["ZINC:$zinc_id", "zinc_name"]);
1704                Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
1705                if (! @attributeData) {
1706                    # Here there are no attributes left, so we quit the loop.
1707                    $done = 1;
1708                } else {
1709                    # Process the attribute data we've received.
1710                    for my $zinc_data (@attributeData) {
1711                        # The ZINC ID is found in the first return column, prefixed with the word ZINC.
1712                        if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
1713                            $zinc_id = $1;
1714                            # Check for a duplicate.
1715                            if ($zinc_id eq $last_zinc_id) {
1716                                $loadLigand->Add("duplicate");
1717                            } else {
1718                                # Here it's safe to output the ligand. The ligand name is the attribute value
1719                                # (third column in the row).
1720                                $loadLigand->Put($zinc_id, $zinc_data->[2]);
1721                                # Insure we don't try to add this ID again.
1722                                $last_zinc_id = $zinc_id;
1723                            }
1724                        } else {
1725                            Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
1726                            $loadLigand->Add("errors");
1727                      }                      }
1728                  }                  }
                 # Close off this project.  
                 close PROJECT;  
1729              }              }
1730          }          }
1731            Trace("Ligands loaded.") if T(2);
1732      }      }
1733      # Finish the load.      # Finish the load.
1734      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 1801  Line 1740 
1740    
1741  =head3 SpecialAttribute  =head3 SpecialAttribute
1742    
1743  C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >>      my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader);
1744    
1745  Look for special attributes of a given type. A special attribute is found by comparing one of  Look for special attributes of a given type. A special attribute is found by comparing one of
1746  the columns of the incoming attribute list to a search pattern. If a match is found, then  the columns of the incoming attribute list to a search pattern. If a match is found, then
# Line 1877  Line 1816 
1816      return $retVal;      return $retVal;
1817  }  }
1818    
 =head3 CreatePDB  
   
 C<< $loader->CreatePDB($pdbID, $title, $type, \%pdbHash); >>  
   
 Insure that a PDB record exists for the identified PDB. If one does not exist, it will be  
 created.  
   
 =over 4  
   
 =item pdbID  
   
 ID string (usually an unqualified file name) for the desired PDB.  
   
 =item title  
   
 Title to use if the PDB must be created.  
   
 =item type  
   
 Type of PDB: C<free> or C<bound>  
   
 =item pdbHash  
   
 Hash containing the IDs of PDBs that have already been created.  
   
 =item pdbLoader  
   
 Load object for the PDB table.  
   
 =back  
   
 =cut  
   
 sub CreatePDB {  
     # Get the parameters.  
     my ($self, $pdbID, $title, $type, $pdbHash, $pdbLoader) = @_;  
     $pdbLoader->Add("PDB check");  
     # Check to see if this is a new PDB.  
     if (! exists $pdbHash->{$pdbID}) {  
         # It is, so we create it.  
         $pdbLoader->Put($pdbID, $title, $type);  
         $pdbHash->{$pdbID} = 1;  
         # Count it.  
         $pdbLoader->Add("PDB-$type");  
     }  
 }  
   
1819  =head3 TableLoader  =head3 TableLoader
1820    
1821  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 1938  Line 1830 
1830    
1831  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
1832    
 =item ignore  
   
 TRUE if the table should be ignored entirely, else FALSE.  
   
1833  =item RETURN  =item RETURN
1834    
1835  Returns an ERDBLoad object for loading the specified table.  Returns an ERDBLoad object for loading the specified table.
# Line 1952  Line 1840 
1840    
1841  sub _TableLoader {  sub _TableLoader {
1842      # Get the parameters.      # Get the parameters.
1843      my ($self, $tableName, $ignore) = @_;      my ($self, $tableName) = @_;
1844      # Create the load object.      # Create the load object.
1845      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly,      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly);
                                $ignore);  
1846      # Cache it in the loader list.      # Cache it in the loader list.
1847      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
1848      # Return it to the caller.      # Return it to the caller.
# Line 2027  Line 1914 
1914      return $retVal;      return $retVal;
1915  }  }
1916    
1917    =head3 GetGenomeAttributes
1918    
1919        my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys);
1920    
1921    Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
1922    attributes for all the features of a genome in a single call, then organizes them into
1923    a hash.
1924    
1925    =over 4
1926    
1927    =item fig
1928    
1929    FIG-like object for accessing attributes.
1930    
1931    =item genomeID
1932    
1933    ID of the genome whose attributes are desired.
1934    
1935    =item fids
1936    
1937    Reference to a list of the feature IDs whose attributes are to be kept.
1938    
1939    =item propKeys
1940    
1941    Reference to a list of the attribute keys to retrieve.
1942    
1943    =item RETURN
1944    
1945    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
1946    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
1947    the attribute key, and one or more attribute values.
1948    
1949    =back
1950    
1951    =cut
1952    
1953    sub GetGenomeAttributes {
1954        # Get the parameters.
1955        my ($fig, $genomeID, $fids, $propKeys) = @_;
1956        # Declare the return variable.
1957        my $retVal = {};
1958        # Initialize the hash. This not only enables us to easily determine which FIDs to
1959        # keep, it insures that the caller sees a list reference for every known fid,
1960        # simplifying the logic.
1961        for my $fid (@{$fids}) {
1962            $retVal->{$fid} = [];
1963        }
1964        # Get the attributes. If ev_code_cron is running, we may get a timeout error, so
1965        # an eval is used.
1966        my @aList = ();
1967        eval {
1968            @aList = $fig->get_attributes("fig|$genomeID%", $propKeys);
1969            Trace(scalar(@aList) . " attributes returned for genome $genomeID.") if T(3);
1970        };
1971        # Check for a problem.
1972        if ($@) {
1973            Trace("Retrying attributes for $genomeID due to error: $@") if T(1);
1974            # Our fallback plan is to process the attributes in blocks of 100. This is much slower,
1975            # but allows us to continue processing.
1976            my $nFids = scalar @{$fids};
1977            for (my $i = 0; $i < $nFids; $i += 100) {
1978                # Determine the index of the last feature ID we'll be specifying on this pass.
1979                # Normally it's $i + 99, but if we're close to the end it may be less.
1980                my $end = ($i + 100 > $nFids ? $nFids - 1 : $i + 99);
1981                # Get a slice of the fid list.
1982                my @slice = @{$fids}[$i .. $end];
1983                # Get the relevant attributes.
1984                Trace("Retrieving attributes for fids $i to $end.") if T(3);
1985                my @aShort = $fig->get_attributes(\@slice, $propKeys);
1986                Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(3);
1987                push @aList, @aShort;
1988            }
1989        }
1990        # Now we should have all the interesting attributes in @aList. Populate the hash with
1991        # them.
1992        for my $aListEntry (@aList) {
1993            my $fid = $aListEntry->[0];
1994            if (exists $retVal->{$fid}) {
1995                push @{$retVal->{$fid}}, $aListEntry;
1996            }
1997        }
1998        # Return the result.
1999        return $retVal;
2000    }
2001    
2002    
2003  1;  1;

Legend:
Removed from v.1.78  
changed lines
  Added in v.1.91

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3