[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.61, Sun Jul 30 01:41:34 2006 UTC revision 1.87, Mon Sep 10 18:16:54 2007 UTC
# Line 7  Line 7 
7      use PageBuilder;      use PageBuilder;
8      use ERDBLoad;      use ERDBLoad;
9      use FIG;      use FIG;
10        use FIGRules;
11      use Sprout;      use Sprout;
12      use Stats;      use Stats;
13      use BasicLocation;      use BasicLocation;
# Line 80  Line 81 
81  Either the name of the file containing the list of trusted subsystems or a reference  Either the name of the file containing the list of trusted subsystems or a reference
82  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
83  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
84  in its data directory.) Only subsystem data related to the trusted subsystems is loaded.  in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded.
85    
86  =item options  =item options
87    
# Line 120  Line 121 
121                      # an omitted access code can be defaulted to 1.                      # an omitted access code can be defaulted to 1.
122                      for my $genomeLine (@genomeList) {                      for my $genomeLine (@genomeList) {
123                          my ($genomeID, $accessCode) = split("\t", $genomeLine);                          my ($genomeID, $accessCode) = split("\t", $genomeLine);
124                          if (undef $accessCode) {                          if (! defined($accessCode)) {
125                              $accessCode = 1;                              $accessCode = 1;
126                          }                          }
127                          $genomes{$genomeID} = $accessCode;                          $genomes{$genomeID} = $accessCode;
# Line 138  Line 139 
139          if (! defined $subsysFile || $subsysFile eq '') {          if (! defined $subsysFile || $subsysFile eq '') {
140              # Here we want all the usable subsystems. First we get the whole list.              # Here we want all the usable subsystems. First we get the whole list.
141              my @subs = $fig->all_subsystems();              my @subs = $fig->all_subsystems();
142              # Loop through, checking for usability.              # Loop through, checking for the NMPDR file.
143              for my $sub (@subs) {              for my $sub (@subs) {
144                  if ($fig->usable_subsystem($sub)) {                  if ($fig->nmpdr_subsystem($sub)) {
145                      $subsystems{$sub} = 1;                      $subsystems{$sub} = 1;
146                  }                  }
147              }              }
# Line 163  Line 164 
164                  Confess("Invalid subsystem parameter in SproutLoad constructor.");                  Confess("Invalid subsystem parameter in SproutLoad constructor.");
165              }              }
166          }          }
167            # Go through the subsys hash again, creating the keyword list for each subsystem.
168            for my $subsystem (keys %subsystems) {
169                my $name = $subsystem;
170                $name =~ s/_/ /g;
171    #            my $classes = $fig->subsystem_classification($subsystem);
172    #            $name .= " " . join(" ", @{$classes});
173                $subsystems{$subsystem} = $name;
174      }      }
175        }
176        # Get the list of NMPDR-oriented attribute keys.
177        my @propKeys = $fig->get_group_keys("NMPDR");
178      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
179      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
180      # Create the Sprout load object.      # Create the Sprout load object.
# Line 175  Line 186 
186                    loadDirectory => $directory,                    loadDirectory => $directory,
187                    erdb => $sprout,                    erdb => $sprout,
188                    loaders => [],                    loaders => [],
189                    options => $options                    options => $options,
190                      propKeys => \@propKeys,
191                   };                   };
192      # Bless and return it.      # Bless and return it.
193      bless $retVal, $class;      bless $retVal, $class;
# Line 195  Line 207 
207      return $self->{options}->{loadOnly};      return $self->{options}->{loadOnly};
208  }  }
209    
 =head3 PrimaryOnly  
   
 C<< my $flag = $spl->PrimaryOnly; >>  
   
 Return TRUE if only the main entity is to be loaded, else FALSE.  
   
 =cut  
   
 sub PrimaryOnly {  
     my ($self) = @_;  
     return $self->{options}->{primaryOnly};  
 }  
210    
211  =head3 LoadGenomeData  =head3 LoadGenomeData
212    
# Line 247  Line 247 
247      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
248      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
249      my $loadGenome = $self->_TableLoader('Genome');      my $loadGenome = $self->_TableLoader('Genome');
250      my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly);      my $loadHasContig = $self->_TableLoader('HasContig');
251      my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly);      my $loadContig = $self->_TableLoader('Contig');
252      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf');
253      my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly);      my $loadSequence = $self->_TableLoader('Sequence');
254      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
255          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
256      } else {      } else {
# Line 266  Line 266 
266              my $extra = join " ", @extraData;              my $extra = join " ", @extraData;
267              # Get the full taxonomy.              # Get the full taxonomy.
268              my $taxonomy = $fig->taxonomy_of($genomeID);              my $taxonomy = $fig->taxonomy_of($genomeID);
269                # Get the version. If no version is specified, we default to the genome ID by itself.
270                my $version = $fig->genome_version($genomeID);
271                if (! defined($version)) {
272                    $version = $genomeID;
273                }
274                # Get the DNA size.
275                my $dnaSize = $fig->genome_szdna($genomeID);
276                # Open the NMPDR group file for this genome.
277                my $group;
278                if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
279                    defined($group = <TMP>)) {
280                    # Clean the line ending.
281                    chomp $group;
282                } else {
283                    # No group, so use the default.
284                    $group = $FIG_Config::otherGroup;
285                }
286                close TMP;
287              # Output the genome record.              # Output the genome record.
288              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),
289                               $species, $extra, $taxonomy);                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);
290              # Now we loop through each of the genome's contigs.              # Now we loop through each of the genome's contigs.
291              my @contigs = $fig->all_contigs($genomeID);              my @contigs = $fig->all_contigs($genomeID);
292              for my $contigID (@contigs) {              for my $contigID (@contigs) {
# Line 306  Line 324 
324      return $retVal;      return $retVal;
325  }  }
326    
 =head3 LoadCouplingData  
   
 C<< my $stats = $spl->LoadCouplingData(); >>  
   
 Load the coupling and evidence data from FIG into Sprout.  
   
 The coupling data specifies which genome features are functionally coupled. The  
 evidence data explains why the coupling is functional.  
   
 The following relations are loaded by this method.  
   
     Coupling  
     IsEvidencedBy  
     PCH  
     ParticipatesInCoupling  
     UsesAsEvidence  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadCouplingData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the genome hash.  
     my $genomeFilter = $self->{genomes};  
     # Set up an ID counter for the PCHs.  
     my $pchID = 0;  
     # Start the loads.  
     my $loadCoupling = $self->_TableLoader('Coupling');  
     my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);  
     my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly);  
     my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly);  
     my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly);  
     if ($self->{options}->{loadOnly}) {  
         Trace("Loading from existing files.") if T(2);  
     } else {  
         Trace("Generating coupling data.") if T(2);  
         # Loop through the genomes found.  
         for my $genome (sort keys %{$genomeFilter}) {  
             Trace("Generating coupling data for $genome.") if T(3);  
             $loadCoupling->Add("genomeIn");  
             # Create a hash table for holding coupled pairs. We use this to prevent  
             # duplicates. For example, if A is coupled to B, we don't want to also  
             # assert that B is coupled to A, because we already know it. Fortunately,  
             # all couplings occur within a genome, so we can keep the hash table  
             # size reasonably small.  
             my %dupHash = ();  
             # Get all of the genome's PEGs.  
             my @pegs = $fig->pegs_of($genome);  
             # Loop through the PEGs.  
             for my $peg1 (@pegs) {  
                 $loadCoupling->Add("pegIn");  
                 Trace("Processing PEG $peg1 for $genome.") if T(4);  
                 # Get a list of the coupled PEGs.  
                 my @couplings = $fig->coupled_to($peg1);  
                 # For each coupled PEG, we need to verify that a coupling already  
                 # exists. If not, we have to create one.  
                 for my $coupleData (@couplings) {  
                     my ($peg2, $score) = @{$coupleData};  
                     # Compute the coupling ID.  
                     my $coupleID = $self->{erdb}->CouplingID($peg1, $peg2);  
                     if (! exists $dupHash{$coupleID}) {  
                         $loadCoupling->Add("couplingIn");  
                         # Here we have a new coupling to store in the load files.  
                         Trace("Storing coupling ($coupleID) with score $score.") if T(4);  
                         # Ensure we don't do this again.  
                         $dupHash{$coupleID} = $score;  
                         # Write the coupling record.  
                         $loadCoupling->Put($coupleID, $score);  
                         # Connect it to the coupled PEGs.  
                         $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);  
                         $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);  
                         # Get the evidence for this coupling.  
                         my @evidence = $fig->coupling_evidence($peg1, $peg2);  
                         # Organize the evidence into a hash table.  
                         my %evidenceMap = ();  
                         # Process each evidence item.  
                         for my $evidenceData (@evidence) {  
                             $loadPCH->Add("evidenceIn");  
                             my ($peg3, $peg4, $usage) = @{$evidenceData};  
                             # Only proceed if the evidence is from a Sprout  
                             # genome.  
                             if ($genomeFilter->{$fig->genome_of($peg3)}) {  
                                 $loadUsesAsEvidence->Add("evidenceChosen");  
                                 my $evidenceKey = "$coupleID $peg3 $peg4";  
                                 # We store this evidence in the hash if the usage  
                                 # is nonzero or no prior evidence has been found. This  
                                 # insures that if there is duplicate evidence, we  
                                 # at least keep the meaningful ones. Only evidence in  
                                 # the hash makes it to the output.  
                                 if ($usage || ! exists $evidenceMap{$evidenceKey}) {  
                                     $evidenceMap{$evidenceKey} = $evidenceData;  
                                 }  
                             }  
                         }  
                         for my $evidenceID (keys %evidenceMap) {  
                             # Get the ID for this evidence.  
                             $pchID++;  
                             # Create the evidence record.  
                             my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};  
                             $loadPCH->Put($pchID, $usage);  
                             # Connect it to the coupling.  
                             $loadIsEvidencedBy->Put($coupleID, $pchID);  
                             # Connect it to the features.  
                             $loadUsesAsEvidence->Put($pchID, $peg3, 1);  
                             $loadUsesAsEvidence->Put($pchID, $peg4, 2);  
                         }  
                     }  
                 }  
             }  
         }  
     }  
     # All done. Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
327  =head3 LoadFeatureData  =head3 LoadFeatureData
328    
329  C<< my $stats = $spl->LoadFeatureData(); >>  C<< my $stats = $spl->LoadFeatureData(); >>
# Line 444  Line 336 
336    
337      Feature      Feature
338      FeatureAlias      FeatureAlias
339        IsAliasOf
340      FeatureLink      FeatureLink
341      FeatureTranslation      FeatureTranslation
342      FeatureUpstream      FeatureUpstream
343      IsLocatedIn      IsLocatedIn
344      HasFeature      HasFeature
345        HasRoleInSubsystem
346        FeatureEssential
347        FeatureVirulent
348        FeatureIEDB
349        CDD
350        IsPresentOnProteinOf
351    
352  =over 4  =over 4
353    
# Line 463  Line 362 
362  sub LoadFeatureData {  sub LoadFeatureData {
363      # Get this object instance.      # Get this object instance.
364      my ($self) = @_;      my ($self) = @_;
365      # Get the FIG object.      # Get the FIG and Sprout objects.
366      my $fig = $self->{fig};      my $fig = $self->{fig};
367        my $sprout = $self->{sprout};
368      # Get the table of genome IDs.      # Get the table of genome IDs.
369      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
370      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
371      my $loadFeature = $self->_TableLoader('Feature');      my $loadFeature = $self->_TableLoader('Feature');
372      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn');
373      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
374        my $loadIsAliasOf = $self->_TableLoader('IsAliasOf');
375      my $loadFeatureLink = $self->_TableLoader('FeatureLink');      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
376      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
377      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
378      my $loadHasFeature = $self->_TableLoader('HasFeature');      my $loadHasFeature = $self->_TableLoader('HasFeature');
379        my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem');
380        my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
381        my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
382        my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
383        my $loadCDD = $self->_TableLoader('CDD');
384        my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');
385        # Get the subsystem hash.
386        my $subHash = $self->{subsystems};
387        # Get the property keys.
388        my $propKeys = $self->{propKeys};
389        # Create a hashes to hold CDD and alias values.
390        my %CDD = ();
391        my %alias = ();
392      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
393      # locations.      # locations.
394      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
# Line 487  Line 401 
401              Trace("Loading features for genome $genomeID.") if T(3);              Trace("Loading features for genome $genomeID.") if T(3);
402              $loadFeature->Add("genomeIn");              $loadFeature->Add("genomeIn");
403              # Get the feature list for this genome.              # Get the feature list for this genome.
404              my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
405              # Sort and count the list.              # Sort and count the list.
406              my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};              my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
407              my $count = scalar @featureTuples;              my $count = scalar @featureTuples;
408                my @fids = map { $_->[0] } @featureTuples;
409              Trace("$count features found for genome $genomeID.") if T(3);              Trace("$count features found for genome $genomeID.") if T(3);
410                # Get the attributes for this genome and put them in a hash by feature ID.
411                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys);
412              # Set up for our duplicate-feature check.              # Set up for our duplicate-feature check.
413              my $oldFeatureID = "";              my $oldFeatureID = "";
414              # Loop through the features.              # Loop through the features.
415              for my $featureTuple (@featureTuples) {              for my $featureTuple (@featureTuples) {
416                  # Split the tuple.                  # Split the tuple.
417                  my ($featureID, $locations, undef, $type) = @{$featureTuple};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
418                  # Check for duplicates.                  # Check for duplicates.
419                  if ($featureID eq $oldFeatureID) {                  if ($featureID eq $oldFeatureID) {
420                      Trace("Duplicate feature $featureID found.") if T(1);                      Trace("Duplicate feature $featureID found.") if T(1);
# Line 505  Line 422 
422                      $oldFeatureID = $featureID;                      $oldFeatureID = $featureID;
423                      # Count this feature.                      # Count this feature.
424                      $loadFeature->Add("featureIn");                      $loadFeature->Add("featureIn");
425                      # Create the feature record.                      # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
426                      $loadFeature->Put($featureID, 1, $type);                      # Sprout database requires a single character.
427                      # Link it to the parent genome.                      if (! defined($quality) || $quality eq "") {
428                      $loadHasFeature->Put($genomeID, $featureID, $type);                          $quality = " ";
429                        }
430                        # Begin building the keywords. We start with the genome ID, the
431                        # feature ID, the taxonomy, and the organism name.
432                        my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
433                                        $fig->taxonomy_of($genomeID));
434                      # Create the aliases.                      # Create the aliases.
435                      for my $alias ($fig->feature_aliases($featureID)) {                      for my $alias ($fig->feature_aliases($featureID)) {
436                          $loadFeatureAlias->Put($featureID, $alias);                          #Connect this alias to this feature.
437                      }                          $loadIsAliasOf->Put($alias, $featureID);
438                            push @keywords, $alias;
439                            # If this is a locus tag, also add its natural form as a keyword.
440                            my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
441                            if ($naturalName) {
442                                push @keywords, $naturalName;
443                            }
444                            # If this is the first time for the specified alias, create its
445                            # alias record.
446                            if (! exists $alias{$alias}) {
447                                $loadFeatureAlias->Put($alias);
448                                $alias{$alias} = 1;
449                            }
450                        }
451                        Trace("Assignment for $featureID is: $assignment") if T(4);
452                        # Break the assignment into words and shove it onto the
453                        # keyword list.
454                        push @keywords, split(/\s+/, $assignment);
455                        # Link this feature to the parent genome.
456                        $loadHasFeature->Put($genomeID, $featureID, $type);
457                      # Get the links.                      # Get the links.
458                      my @links = $fig->fid_links($featureID);                      my @links = $fig->fid_links($featureID);
459                      for my $link (@links) {                      for my $link (@links) {
# Line 531  Line 472 
472                              $loadFeatureUpstream->Put($featureID, $upstream);                              $loadFeatureUpstream->Put($featureID, $upstream);
473                          }                          }
474                      }                      }
475                        # Now we need to find the subsystems this feature participates in.
476                        # We also add the subsystems to the keyword list. Before we do that,
477                        # we must convert underscores to spaces.
478                        my @subsystems = $fig->peg_to_subsystems($featureID);
479                        for my $subsystem (@subsystems) {
480                            # Only proceed if we like this subsystem.
481                            if (exists $subHash->{$subsystem}) {
482                                # Store the has-role link.
483                                $loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type);
484                                # Save the subsystem's keyword data.
485                                my $subKeywords = $subHash->{$subsystem};
486                                push @keywords, split /\s+/, $subKeywords;
487                                # Now we need to get this feature's role in the subsystem.
488                                my $subObject = $fig->get_subsystem($subsystem);
489                                my @roleColumns = $subObject->get_peg_roles($featureID);
490                                my @allRoles = $subObject->get_roles();
491                                for my $col (@roleColumns) {
492                                    my $role = $allRoles[$col];
493                                    push @keywords, split /\s+/, $role;
494                                    push @keywords, $subObject->get_role_abbr($col);
495                                }
496                            }
497                        }
498                        # There are three special attributes computed from property
499                        # data that we build next. If the special attribute is non-empty,
500                        # its name will be added to the keyword list. First, we get all
501                        # the attributes for this feature. They will come back as
502                        # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
503                        # [name, value, value with URL]. (We don't need the PEG, since
504                        # we already know it.)
505                        my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
506                                             @{$attributes->{$featureID}};
507                        # Now we process each of the special attributes.
508                        if (SpecialAttribute($featureID, \@attributes,
509                                             1, [0,2], '^(essential|potential_essential)$',
510                                             $loadFeatureEssential)) {
511                            push @keywords, 'essential';
512                            $loadFeature->Add('essential');
513                        }
514                        if (SpecialAttribute($featureID, \@attributes,
515                                             0, [2], '^virulen',
516                                             $loadFeatureVirulent)) {
517                            push @keywords, 'virulent';
518                            $loadFeature->Add('virulent');
519                        }
520                        if (SpecialAttribute($featureID, \@attributes,
521                                             0, [0,2], '^iedb_',
522                                             $loadFeatureIEDB)) {
523                            push @keywords, 'iedb';
524                            $loadFeature->Add('iedb');
525                        }
526                        # Now we have some other attributes we need to process. Currently,
527                        # this is CDD and CELLO, but we expect the number to increase.
528                        my %attributeHash = ();
529                        for my $attrRow (@{$attributes->{$featureID}}) {
530                            my (undef, $key, @values) = @{$attrRow};
531                            $key =~ /^([^:]+)::(.+)/;
532                            if (exists $attributeHash{$1}) {
533                                $attributeHash{$1}->{$2} = \@values;
534                            } else {
535                                $attributeHash{$1} = {$2 => \@values};
536                            }
537                        }
538                        my $celloValue = "unknown";
539                        # Pull in the CELLO attribute. There will never be more than one.
540                        # If we have one, it's a feature attribute AND a keyword.
541                        my @celloData = keys %{$attributeHash{CELLO}};
542                        if (@celloData) {
543                            $celloValue = $celloData[0];
544                            push @keywords, $celloValue;
545                        }
546                        # Now we handle CDD. This is a bit more complicated, because
547                        # there are multiple CDDs per protein.
548                        if (exists $attributeHash{CDD}) {
549                            # Get the hash of CDD IDs to scores for this feature. We
550                            # already know it exists because of the above IF.
551                            my $cddHash = $attributeHash{CDD};
552                            my @cddData = sort keys %{$cddHash};
553                            for my $cdd (@cddData) {
554                                # Extract the score for this CDD and decode it.
555                                my ($codeScore) = split(/\s*,\s*/, $cddHash->{$cdd}->[0]);
556                                my $realScore = FIGRules::DecodeScore($codeScore);
557                                # Create the connection.
558                                $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
559                                # If this CDD does not yet exist, create its record.
560                                if (! exists $CDD{$cdd}) {
561                                    $CDD{$cdd} = 1;
562                                    $loadCDD->Put($cdd);
563                                }
564                            }
565                        }
566                        # Now we need to bust up hyphenated words in the keyword
567                        # list. We keep them separate and put them at the end so
568                        # the original word order is available.
569                        my $keywordString = "";
570                        my $bustedString = "";
571                        for my $keyword (@keywords) {
572                            if (length $keyword >= 3) {
573                                $keywordString .= " $keyword";
574                                if ($keyword =~ /-/) {
575                                    my @words = split /-/, $keyword;
576                                    $bustedString .= join(" ", "", @words);
577                                }
578                            }
579                        }
580                        $keywordString .= $bustedString;
581                        # Get rid of annoying punctuation.
582                        $keywordString =~ s/[();]//g;
583                        # Clean the keyword list.
584                        my $cleanWords = $sprout->CleanKeywords($keywordString);
585                        Trace("Keyword string for $featureID: $cleanWords") if T(4);
586                        # Now we need to process the feature's locations. First, we split them up.
587                        my @locationList = split /\s*,\s*/, $locations;
588                        # Next, we convert them to Sprout location objects.
589                        my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
590                      # This part is the roughest. We need to relate the features to contig                      # This part is the roughest. We need to relate the features to contig
591                      # locations, and the locations must be split so that none of them exceed                      # locations, and the locations must be split so that none of them exceed
592                      # the maximum segment size. This simplifies the genes_in_region processing                      # the maximum segment size. This simplifies the genes_in_region processing
593                      # for Sprout.                      # for Sprout. To start, we create the location position indicator.
                     my @locationList = split /\s*,\s*/, $locations;  
                     # Create the location position indicator.  
594                      my $i = 1;                      my $i = 1;
595                      # Loop through the locations.                      # Loop through the locations.
596                      for my $location (@locationList) {                      for my $locObject (@locObjectList) {
597                          # Parse the location.                          # Split this location into a list of chunks.
                         my $locObject = BasicLocation->new("$genomeID:$location");  
                         # Split it into a list of chunks.  
598                          my @locOList = ();                          my @locOList = ();
599                          while (my $peeling = $locObject->Peel($chunkSize)) {                          while (my $peeling = $locObject->Peel($chunkSize)) {
600                              $loadIsLocatedIn->Add("peeling");                              $loadIsLocatedIn->Add("peeling");
# Line 557  Line 609 
609                              $i++;                              $i++;
610                          }                          }
611                      }                      }
612                  }                      # Finally, reassemble the location objects into a list of Sprout location strings.
613              }                      $locations = join(", ", map { $_->String } @locObjectList);
614          }                      # Create the feature record.
615      }                      $loadFeature->Put($featureID, 1, $user, $quality, $celloValue, $type, $assignment, $cleanWords, $locations);
     # Finish the loads.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
 =head3 LoadBBHData  
   
 C<< my $stats = $spl->LoadBBHData(); >>  
   
 Load the bidirectional best hit data from FIG into Sprout.  
   
 Sprout does not store information on similarities. Instead, it has only the  
 bi-directional best hits. Even so, the BBH table is one of the largest in  
 the database.  
   
 The following relations are loaded by this method.  
   
     IsBidirectionalBestHitOf  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadBBHData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the table of genome IDs.  
     my $genomeHash = $self->{genomes};  
     # Create load objects for each of the tables we're loading.  
     my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf');  
     if ($self->{options}->{loadOnly}) {  
         Trace("Loading from existing files.") if T(2);  
     } else {  
         Trace("Generating BBH data.") if T(2);  
         # Now we loop through the genomes, generating the data for each one.  
         for my $genomeID (sort keys %{$genomeHash}) {  
             $loadIsBidirectionalBestHitOf->Add("genomeIn");  
             Trace("Processing features for genome $genomeID.") if T(3);  
             # Get the feature list for this genome.  
             my $features = $fig->all_features_detailed($genomeID);  
             # Loop through the features.  
             for my $featureData (@{$features}) {  
                 # Split the tuple.  
                 my ($featureID, $locations, $aliases, $type) = @{$featureData};  
                 # Get the bi-directional best hits.  
                 my @bbhList = $fig->bbhs($featureID);  
                 for my $bbhEntry (@bbhList) {  
                     # Get the target feature ID and the score.  
                     my ($targetID, $score) = @{$bbhEntry};  
                     # Check the target feature's genome.  
                     my $targetGenomeID = $fig->genome_of($targetID);  
                     # Only proceed if it's one of our genomes.  
                     if ($genomeHash->{$targetGenomeID}) {  
                         $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,  
                                                            $score);  
                     }  
616                  }                  }
617              }              }
618          }          }
# Line 652  Line 640 
640      SubsystemClass      SubsystemClass
641      Role      Role
642      RoleEC      RoleEC
643        IsIdentifiedByEC
644      SSCell      SSCell
645      ContainsFeature      ContainsFeature
646      IsGenomeOf      IsGenomeOf
# Line 693  Line 682 
682      # Get the map list.      # Get the map list.
683      my @maps = $fig->all_maps;      my @maps = $fig->all_maps;
684      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
685      my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly);      my $loadDiagram = $self->_TableLoader('Diagram');
686      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn');
687      my $loadSubsystem = $self->_TableLoader('Subsystem');      my $loadSubsystem = $self->_TableLoader('Subsystem');
688      my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly);      my $loadRole = $self->_TableLoader('Role');
689      my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly);      my $loadRoleEC = $self->_TableLoader('RoleEC');
690      my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly);      my $loadIsIdentifiedByEC = $self->_TableLoader('IsIdentifiedByEC');
691      my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly);      my $loadCatalyzes = $self->_TableLoader('Catalyzes');
692      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly);      my $loadSSCell = $self->_TableLoader('SSCell');
693      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature');
694      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf');
695      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly);      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf');
696      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly);      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem');
697      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly);      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn');
698      my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly);      my $loadHasSSCell = $self->_TableLoader('HasSSCell');
699      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly);      my $loadRoleSubset = $self->_TableLoader('RoleSubset');
700      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly);      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset');
701      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly);      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles');
702      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly);      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes');
703      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly);      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');
704      my $loadSubsystemClass = $self->_TableLoader('SubsystemClass', $self->PrimaryOnly);      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');
705        my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');
706      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
707          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
708      } else {      } else {
709          Trace("Generating subsystem data.") if T(2);          Trace("Generating subsystem data.") if T(2);
710          # This hash will contain the role for each EC. When we're done, this          # This hash will contain the roles for each EC. When we're done, this
711          # information will be used to generate the Catalyzes table.          # information will be used to generate the Catalyzes table.
712          my %ecToRoles = ();          my %ecToRoles = ();
713          # Loop through the subsystems. Our first task will be to create the          # Loop through the subsystems. Our first task will be to create the
# Line 731  Line 721 
721              # Get the subsystem object.              # Get the subsystem object.
722              my $sub = $fig->get_subsystem($subsysID);              my $sub = $fig->get_subsystem($subsysID);
723              # Only proceed if the subsystem has a spreadsheet.              # Only proceed if the subsystem has a spreadsheet.
724              if (! $sub->{empty_ss}) {              if (defined($sub) && ! $sub->{empty_ss}) {
725                  Trace("Creating subsystem $subsysID.") if T(3);                  Trace("Creating subsystem $subsysID.") if T(3);
726                  $loadSubsystem->Add("subsystemIn");                  $loadSubsystem->Add("subsystemIn");
727                  # Create the subsystem record.                  # Create the subsystem record.
728                  my $curator = $sub->get_curator();                  my $curator = $sub->get_curator();
729                  my $notes = $sub->get_notes();                  my $notes = $sub->get_notes();
730                  $loadSubsystem->Put($subsysID, $curator, $notes);                  $loadSubsystem->Put($subsysID, $curator, $notes);
731                  my $class = $fig->subsystem_classification($subsysID);                  # Now for the classification string. This comes back as a list
732                  if ($class) {                  # reference and we convert it to a space-delimited string.
733                      $loadSubsystemClass->Put($subsysID, $class);                  my $classList = $fig->subsystem_classification($subsysID);
734                  }                  my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
735                    $loadSubsystemClass->Put($subsysID, $classString);
736                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
737                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
738                        # Get the role's abbreviation.
739                        my $abbr = $sub->get_role_abbr($col);
740                      # Connect to this role.                      # Connect to this role.
741                      $loadOccursInSubsystem->Add("roleIn");                      $loadOccursInSubsystem->Add("roleIn");
742                      $loadOccursInSubsystem->Put($roleID, $subsysID, $col);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $col);
743                      # If it's a new role, add it to the role table.                      # If it's a new role, add it to the role table.
744                      if (! exists $roleData{$roleID}) {                      if (! exists $roleData{$roleID}) {
745                          # Get the role's abbreviation.                          # Get the role's abbreviation.
                         my $abbr = $sub->get_role_abbr($col);  
746                          # Add the role.                          # Add the role.
747                          $loadRole->Put($roleID, $abbr);                          $loadRole->Put($roleID);
748                          $roleData{$roleID} = 1;                          $roleData{$roleID} = 1;
749                          # Check for an EC number.                          # Check for an EC number.
750                          if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) {                          if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
751                              my $ec = $1;                              my $ec = $1;
752                              $loadRoleEC->Put($roleID, $ec);                              $loadIsIdentifiedByEC->Put($roleID, $ec);
753                              $ecToRoles{$ec} = $roleID;                              # Check to see if this is our first encounter with this EC.
754                                if (exists $ecToRoles{$ec}) {
755                                    # No, so just add this role to the EC list.
756                                    push @{$ecToRoles{$ec}}, $roleID;
757                                } else {
758                                    # Output this EC.
759                                    $loadRoleEC->Put($ec);
760                                    # Create its role list.
761                                    $ecToRoles{$ec} = [$roleID];
762                                }
763                          }                          }
764                      }                      }
765                  }                  }
# Line 871  Line 872 
872              # Now we need to link all the map's roles to it.              # Now we need to link all the map's roles to it.
873              # A hash is used to prevent duplicates.              # A hash is used to prevent duplicates.
874              my %roleHash = ();              my %roleHash = ();
875              for my $role ($fig->map_to_ecs($map)) {              for my $ec ($fig->map_to_ecs($map)) {
876                  if (exists $ecToRoles{$role} && ! $roleHash{$role}) {                  if (exists $ecToRoles{$ec}) {
877                      $loadRoleOccursIn->Put($ecToRoles{$role}, $map);                      for my $role (@{$ecToRoles{$ec}}) {
878                            if (! $roleHash{$role}) {
879                                $loadRoleOccursIn->Put($role, $map);
880                      $roleHash{$role} = 1;                      $roleHash{$role} = 1;
881                  }                  }
882              }              }
883          }          }
884                }
885            }
886          # Before we leave, we must create the Catalyzes table. We start with the reactions,          # Before we leave, we must create the Catalyzes table. We start with the reactions,
887          # then use the "ecToRoles" table to convert EC numbers to role IDs.          # then use the "ecToRoles" table to convert EC numbers to role IDs.
888          my @reactions = $fig->all_reactions();          my @reactions = $fig->all_reactions();
889          for my $reactionID (@reactions) {          for my $reactionID (@reactions) {
890              # Get this reaction's list of roles. The results will be EC numbers.              # Get this reaction's list of roles. The results will be EC numbers.
891              my @roles = $fig->catalyzed_by($reactionID);              my @ecs = $fig->catalyzed_by($reactionID);
892              # Loop through the roles, creating catalyzation records.              # Loop through the roles, creating catalyzation records.
893              for my $thisRole (@roles) {              for my $thisEC (@ecs) {
894                  if (exists $ecToRoles{$thisRole}) {                  if (exists $ecToRoles{$thisEC}) {
895                      $loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID);                      for my $thisRole (@{$ecToRoles{$thisEC}}) {
896                            $loadCatalyzes->Put($thisRole, $reactionID);
897                        }
898                  }                  }
899              }              }
900          }          }
# Line 935  Line 942 
942      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
943      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
944      my $loadProperty = $self->_TableLoader('Property');      my $loadProperty = $self->_TableLoader('Property');
945      my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly);      my $loadHasProperty = $self->_TableLoader('HasProperty');
946      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
947          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
948      } else {      } else {
# Line 943  Line 950 
950          # Create a hash for storing property IDs.          # Create a hash for storing property IDs.
951          my %propertyKeys = ();          my %propertyKeys = ();
952          my $nextID = 1;          my $nextID = 1;
953            # Get the attributes we intend to store in the property table.
954            my $propKeys = $self->{propKeys};
955          # Loop through the genomes.          # Loop through the genomes.
956          for my $genomeID (keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
957              $loadProperty->Add("genomeIn");              $loadProperty->Add("genomeIn");
958              Trace("Generating properties for $genomeID.") if T(3);              Trace("Generating properties for $genomeID.") if T(3);
959              # Get the genome's features. The feature ID is the first field in the              # Initialize a counter.
             # tuples returned by "all_features_detailed". We use "all_features_detailed"  
             # rather than "all_features" because we want all features regardless of type.  
             my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};  
             my $featureCount = 0;  
960              my $propertyCount = 0;              my $propertyCount = 0;
961              # Loop through the features, creating HasProperty records.              # Get the properties for this genome's features.
962              for my $fid (@features) {              my @attributes = $fig->get_attributes("fig|$genomeID%", $propKeys);
963                  # Get all attributes for this feature. We do this one feature at a time              Trace("Property list built for $genomeID.") if T(3);
964                  # to insure we do not get any genome attributes.              # Loop through the results, creating HasProperty records.
965                  my @attributeList = $fig->get_attributes($fid, '', '', '');              for my $attributeData (@attributes) {
966                  if (scalar @attributeList) {                  # Pull apart the attribute tuple.
967                      $featureCount++;                  my ($fid, $key, $value, $url) = @{$attributeData};
                 }  
                 # Loop through the attributes.  
                 for my $tuple (@attributeList) {  
                     $propertyCount++;  
                     # Get this attribute value's data. Note that we throw away the FID,  
                     # since it will always be the same as the value if "$fid".  
                     my (undef, $key, $value, $url) = @{$tuple};  
968                      # Concatenate the key and value and check the "propertyKeys" hash to                      # Concatenate the key and value and check the "propertyKeys" hash to
969                      # see if we already have an ID for it. We use a tab for the separator                      # see if we already have an ID for it. We use a tab for the separator
970                      # character.                      # character.
# Line 984  Line 982 
982                      # Create the HasProperty entry for this feature/property association.                      # Create the HasProperty entry for this feature/property association.
983                      $loadHasProperty->Put($fid, $propertyID, $url);                      $loadHasProperty->Put($fid, $propertyID, $url);
984                  }                  }
             }  
985              # Update the statistics.              # Update the statistics.
986              Trace("$propertyCount attributes processed for $featureCount features.") if T(3);              Trace("$propertyCount attributes processed.") if T(3);
             $loadHasProperty->Add("featuresIn", $featureCount);  
987              $loadHasProperty->Add("propertiesIn", $propertyCount);              $loadHasProperty->Add("propertiesIn", $propertyCount);
988          }          }
989      }      }
# Line 1032  Line 1028 
1028      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1029      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1030      my $loadAnnotation = $self->_TableLoader('Annotation');      my $loadAnnotation = $self->_TableLoader('Annotation');
1031      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation');
1032      my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly);      my $loadSproutUser = $self->_TableLoader('SproutUser');
1033      my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly);      my $loadUserAccess = $self->_TableLoader('UserAccess');
1034      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation');
1035      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1036          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1037      } else {      } else {
# Line 1139  Line 1135 
1135      # Get the genome hash.      # Get the genome hash.
1136      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1137      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1138      my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly);      my $loadComesFrom = $self->_TableLoader('ComesFrom');
1139      my $loadSource = $self->_TableLoader('Source');      my $loadSource = $self->_TableLoader('Source');
1140      my $loadSourceURL = $self->_TableLoader('SourceURL');      my $loadSourceURL = $self->_TableLoader('SourceURL');
1141      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
# Line 1276  Line 1272 
1272      Compound      Compound
1273      CompoundName      CompoundName
1274      CompoundCAS      CompoundCAS
1275        IsIdentifiedByCAS
1276        HasCompoundName
1277      IsAComponentOf      IsAComponentOf
1278    
1279  This method proceeds reaction by reaction rather than genome by genome.  This method proceeds reaction by reaction rather than genome by genome.
# Line 1297  Line 1295 
1295      my $fig = $self->{fig};      my $fig = $self->{fig};
1296      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1297      my $loadReaction = $self->_TableLoader('Reaction');      my $loadReaction = $self->_TableLoader('Reaction');
1298      my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly);      my $loadReactionURL = $self->_TableLoader('ReactionURL');
1299      my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly);      my $loadCompound = $self->_TableLoader('Compound');
1300      my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly);      my $loadCompoundName = $self->_TableLoader('CompoundName');
1301      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly);      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS');
1302      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly);      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf');
1303        my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS');
1304        my $loadHasCompoundName = $self->_TableLoader('HasCompoundName');
1305      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1306          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1307      } else {      } else {
1308          Trace("Generating annotation data.") if T(2);          Trace("Generating reaction data.") if T(2);
1309            # We need some hashes to prevent duplicates.
1310            my %compoundNames = ();
1311            my %compoundCASes = ();
1312          # First we create the compounds.          # First we create the compounds.
1313          my @compounds = $fig->all_compounds();          my @compounds = $fig->all_compounds();
1314          for my $cid (@compounds) {          for my $cid (@compounds) {
# Line 1314  Line 1317 
1317              # Each name will be given a priority number, starting with 1.              # Each name will be given a priority number, starting with 1.
1318              my $prio = 1;              my $prio = 1;
1319              for my $name (@names) {              for my $name (@names) {
1320                  $loadCompoundName->Put($cid, $name, $prio++);                  if (! exists $compoundNames{$name}) {
1321                        $loadCompoundName->Put($name);
1322                        $compoundNames{$name} = 1;
1323                    }
1324                    $loadHasCompoundName->Put($cid, $name, $prio++);
1325              }              }
1326              # Create the main compound record. Note that the first name              # Create the main compound record. Note that the first name
1327              # becomes the label.              # becomes the label.
# Line 1323  Line 1330 
1330              # Check for a CAS ID.              # Check for a CAS ID.
1331              my $cas = $fig->cas($cid);              my $cas = $fig->cas($cid);
1332              if ($cas) {              if ($cas) {
1333                  $loadCompoundCAS->Put($cid, $cas);                  $loadIsIdentifiedByCAS->Put($cid, $cas);
1334                    if (! exists $compoundCASes{$cas}) {
1335                        $loadCompoundCAS->Put($cas);
1336                        $compoundCASes{$cas} = 1;
1337                    }
1338              }              }
1339          }          }
1340          # All the compounds are set up, so we need to loop through the reactions next. First,          # All the compounds are set up, so we need to loop through the reactions next. First,
# Line 1360  Line 1371 
1371      return $retVal;      return $retVal;
1372  }  }
1373    
 =head3 LoadGroupData  
   
 C<< my $stats = $spl->LoadGroupData(); >>  
   
 Load the genome Groups into Sprout.  
   
 The following relations are loaded by this method.  
   
     GenomeGroups  
   
 There is no direct support for genome groups in FIG, so we access the SEED  
 files directly.  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadGroupData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the genome hash.  
     my $genomeHash = $self->{genomes};  
     # Create a load object for the table we're loading.  
     my $loadGenomeGroups = $self->_TableLoader('GenomeGroups');  
     if ($self->{options}->{loadOnly}) {  
         Trace("Loading from existing files.") if T(2);  
     } else {  
         Trace("Generating group data.") if T(2);  
         # Loop through the genomes.  
         my $line;  
         for my $genomeID (keys %{$genomeHash}) {  
             Trace("Processing $genomeID.") if T(3);  
             # Open the NMPDR group file for this genome.  
             if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&  
                 defined($line = <TMP>)) {  
                 # Clean the line ending.  
                 chomp $line;  
                 # Add the group to the table. Note that there can only be one group  
                 # per genome.  
                 $loadGenomeGroups->Put($genomeID, $line);  
             }  
             close TMP;  
         }  
     }  
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
1374  =head3 LoadSynonymData  =head3 LoadSynonymData
1375    
1376  C<< my $stats = $spl->LoadSynonymData(); >>  C<< my $stats = $spl->LoadSynonymData(); >>
# Line 1458  Line 1412 
1412          Trace("Generating synonym group data.") if T(2);          Trace("Generating synonym group data.") if T(2);
1413          # Get the database handle.          # Get the database handle.
1414          my $dbh = $fig->db_handle();          my $dbh = $fig->db_handle();
1415          # Ask for the synonyms.          # Ask for the synonyms. Note that "maps_to" is a group name, and "syn_id" is a PEG ID or alias.
1416          my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");          my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
1417          my $result = $sth->execute();          my $result = $sth->execute();
1418          if (! defined($result)) {          if (! defined($result)) {
# Line 1470  Line 1424 
1424              my $featureCount = 0;              my $featureCount = 0;
1425              # Loop through the synonym/peg pairs.              # Loop through the synonym/peg pairs.
1426              while (my @row = $sth->fetchrow()) {              while (my @row = $sth->fetchrow()) {
1427                  # Get the synonym ID and feature ID.                  # Get the synonym group ID and feature ID.
1428                  my ($syn_id, $peg) = @row;                  my ($syn_id, $peg) = @row;
1429                  # Insure it's for one of our genomes.                  # Insure it's for one of our genomes.
1430                  my $genomeID = FIG::genome_of($peg);                  my $genomeID = FIG::genome_of($peg);
# Line 1506  Line 1460 
1460  The following relations are loaded by this method.  The following relations are loaded by this method.
1461    
1462      Family      Family
1463      ContainsFeature      IsFamilyForFeature
1464    
1465  The source information for these relations is taken from the C<families_for_protein>,  The source information for these relations is taken from the C<families_for_protein>,
1466  C<family_function>, and C<sz_family> methods of the B<FIG> object.  C<family_function>, and C<sz_family> methods of the B<FIG> object.
# Line 1530  Line 1484 
1484      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1485      # Create load objects for the tables we're loading.      # Create load objects for the tables we're loading.
1486      my $loadFamily = $self->_TableLoader('Family');      my $loadFamily = $self->_TableLoader('Family');
1487      my $loadContainsFeature = $self->_TableLoader('ContainsFeature');      my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature');
1488      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1489          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1490      } else {      } else {
# Line 1542  Line 1496 
1496              Trace("Processing features for $genomeID.") if T(2);              Trace("Processing features for $genomeID.") if T(2);
1497              # Loop through this genome's PEGs.              # Loop through this genome's PEGs.
1498              for my $fid ($fig->all_features($genomeID, "peg")) {              for my $fid ($fig->all_features($genomeID, "peg")) {
1499                  $loadContainsFeature->Add("features", 1);                  $loadIsFamilyForFeature->Add("features", 1);
1500                  # Get this feature's families.                  # Get this feature's families.
1501                  my @families = $fig->families_for_protein($fid);                  my @families = $fig->families_for_protein($fid);
1502                  # Loop through the families, connecting them to the feature.                  # Loop through the families, connecting them to the feature.
1503                  for my $family (@families) {                  for my $family (@families) {
1504                      $loadContainsFeature->Put($family, $fid);                      $loadIsFamilyForFeature->Put($family, $fid);
1505                      # If this is a new family, create a record for it.                      # If this is a new family, create a record for it.
1506                      if (! exists $familyHash{$family}) {                      if (! exists $familyHash{$family}) {
1507                            $familyHash{$family} = 1;
1508                          $loadFamily->Add("families", 1);                          $loadFamily->Add("families", 1);
1509                          my $size = $fig->sz_family($family);                          my $size = $fig->sz_family($family);
1510                          my $func = $fig->family_function($family);                          my $func = $fig->family_function($family);
# Line 1564  Line 1519 
1519      return $retVal;      return $retVal;
1520  }  }
1521    
1522    =head3 LoadDrugData
1523    
1524    C<< my $stats = $spl->LoadDrugData(); >>
1525    
1526    Load the drug target data into Sprout.
1527    
1528    The following relations are loaded by this method.
1529    
1530        PDB
1531        DocksWith
1532        IsProteinForFeature
1533        Ligand
1534    
1535    The source information for these relations is taken from attributes. The
1536    C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>.
1537    The C<zinc_name> attribute describes the ligands. The C<docking_results>
1538    attribute contains the information for the B<DocksWith> relationship. It is
1539    expected that additional attributes and tables will be added in the future.
1540    
1541    =over 4
1542    
1543    =item RETURNS
1544    
1545    Returns a statistics object for the loads.
1546    
1547    =back
1548    
1549    =cut
1550    #: Return Type $%;
1551    sub LoadDrugData {
1552        # Get this object instance.
1553        my ($self) = @_;
1554        # Get the FIG object.
1555        my $fig = $self->{fig};
1556        # Get the genome hash.
1557        my $genomeHash = $self->{genomes};
1558        # Create load objects for the tables we're loading.
1559        my $loadPDB = $self->_TableLoader('PDB');
1560        my $loadLigand = $self->_TableLoader('Ligand');
1561        my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature');
1562        my $loadDocksWith = $self->_TableLoader('DocksWith');
1563        if ($self->{options}->{loadOnly}) {
1564            Trace("Loading from existing files.") if T(2);
1565        } else {
1566            Trace("Generating drug target data.") if T(2);
1567            # First comes the "DocksWith" relationship. This will give us a list of PDBs.
1568            # We can also encounter PDBs when we process "IsProteinForFeature". To manage
1569            # this process, PDB information is collected in a hash table and then
1570            # unspooled after both relationships are created.
1571            my %pdbHash = ();
1572            Trace("Generating docking data.") if T(2);
1573            # Get all the docking data. This may cause problems if there are too many PDBs,
1574            # at which point we'll need another algorithm. The indicator that this is
1575            # happening will be a timeout error in the next statement.
1576            my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
1577                                                  ['docking_results', $FIG_Config::dockLimit]);
1578            Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
1579            for my $dockData (@dockData) {
1580                # Get the docking data components.
1581                my ($pdbID, $docking_key, @valueData) = @{$dockData};
1582                # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
1583                $pdbID = lc $pdbID;
1584                # Strip off the object type.
1585                $pdbID =~ s/pdb://;
1586                # Extract the ZINC ID from the docking key. Note that there are two possible
1587                # formats.
1588                my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/;
1589                if (! $zinc_id) {
1590                    Trace("Invalid docking result key $docking_key for $pdbID.") if T(0);
1591                    $loadDocksWith->Add("errors");
1592                } else {
1593                    # Get the pieces of the value and parse the energy.
1594                    # Note that we don't care about the rank, since
1595                    # we can sort on the energy level itself in our database.
1596                    my ($energy, $tool, $type) = @valueData;
1597                    my ($rank, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
1598                    # Ignore predicted results.
1599                    if ($type ne "Predicted") {
1600                        # Count this docking result.
1601                        if (! exists $pdbHash{$pdbID}) {
1602                            $pdbHash{$pdbID} = 1;
1603                        } else {
1604                            $pdbHash{$pdbID}++;
1605                        }
1606                        # Write the result to the output.
1607                        $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool,
1608                                            $total, $vanderwaals);
1609                    }
1610                }
1611            }
1612            Trace("Connecting features.") if T(2);
1613            # Loop through the genomes.
1614            for my $genome (sort keys %{$genomeHash}) {
1615                Trace("Generating PDBs for $genome.") if T(3);
1616                # Get all of the PDBs that BLAST against this genome's features.
1617                my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%');
1618                for my $pdbData (@attributeData) {
1619                    # The PDB ID is coded as a subkey.
1620                    if ($pdbData->[1] !~ /PDB::(.+)/i) {
1621                        Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0);
1622                        $loadPDB->Add("errors");
1623                    } else {
1624                        my $pdbID = $1;
1625                        # Insure the PDB is in the hash.
1626                        if (! exists $pdbHash{$pdbID}) {
1627                            $pdbHash{$pdbID} = 0;
1628                        }
1629                        # The score and locations are coded in the attribute value.
1630                        if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
1631                            Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0);
1632                            $loadIsProteinForFeature->Add("errors");
1633                        } else {
1634                            my ($score, $locData) = ($1,$2);
1635                            # The location data may not be present, so we have to start with some
1636                            # defaults and then check.
1637                            my ($start, $end) = (1, 0);
1638                            if ($locData) {
1639                                $locData =~ /(\d+)-(\d+)/;
1640                                $start = $1;
1641                                $end = $2;
1642                            }
1643                            # If we still don't have the end location, compute it from
1644                            # the feature length.
1645                            if (! $end) {
1646                                # Most features have one location, but we do a list iteration
1647                                # just in case.
1648                                my @locations = $fig->feature_location($pdbData->[0]);
1649                                $end = 0;
1650                                for my $loc (@locations) {
1651                                    my $locObject = BasicLocation->new($loc);
1652                                    $end += $locObject->Length;
1653                                }
1654                            }
1655                            # Decode the score.
1656                            my $realScore = FIGRules::DecodeScore($score);
1657                            # Connect the PDB to the feature.
1658                            $loadIsProteinForFeature->Put($pdbData->[0], $pdbID, $start, $realScore, $end);
1659                        }
1660                    }
1661                }
1662            }
1663            # We've got all our PDBs now, so we unspool them from the hash.
1664            Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2);
1665            my $count = 0;
1666            for my $pdbID (sort keys %pdbHash) {
1667                $loadPDB->Put($pdbID, $pdbHash{$pdbID});
1668                $count++;
1669                Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0);
1670            }
1671            # Finally we create the ligand table. This information can be found in the
1672            # zinc_name attribute.
1673            Trace("Loading ligands.") if T(2);
1674            # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
1675            my $last_zinc_id = "";
1676            my $zinc_id = "";
1677            my $done = 0;
1678            while (! $done) {
1679                # Get the next 10000 ligands. We insist that the object ID is greater than
1680                # the last ID we processed.
1681                Trace("Loading batch starting with ZINC:$zinc_id.") if T(3);
1682                my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
1683                                                           ["ZINC:$zinc_id", "zinc_name"]);
1684                Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
1685                if (! @attributeData) {
1686                    # Here there are no attributes left, so we quit the loop.
1687                    $done = 1;
1688                } else {
1689                    # Process the attribute data we've received.
1690                    for my $zinc_data (@attributeData) {
1691                        # The ZINC ID is found in the first return column, prefixed with the word ZINC.
1692                        if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
1693                            $zinc_id = $1;
1694                            # Check for a duplicate.
1695                            if ($zinc_id eq $last_zinc_id) {
1696                                $loadLigand->Add("duplicate");
1697                            } else {
1698                                # Here it's safe to output the ligand. The ligand name is the attribute value
1699                                # (third column in the row).
1700                                $loadLigand->Put($zinc_id, $zinc_data->[2]);
1701                                # Insure we don't try to add this ID again.
1702                                $last_zinc_id = $zinc_id;
1703                            }
1704                        } else {
1705                            Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
1706                            $loadLigand->Add("errors");
1707                        }
1708                    }
1709                }
1710            }
1711            Trace("Ligands loaded.") if T(2);
1712        }
1713        # Finish the load.
1714        my $retVal = $self->_FinishAll();
1715        return $retVal;
1716    }
1717    
1718    
1719  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1720    
1721    =head3 SpecialAttribute
1722    
1723    C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >>
1724    
1725    Look for special attributes of a given type. A special attribute is found by comparing one of
1726    the columns of the incoming attribute list to a search pattern. If a match is found, then
1727    a set of columns is put into an output table connected to the specified ID.
1728    
1729    For example, when processing features, the attribute list we look at has three columns: attribute
1730    name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
1731    begins with C<iedb_>. The call signature is therefore
1732    
1733        my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB);
1734    
1735    The pattern is matched against column 0, and if we have a match, then column 2's value is put
1736    to the output along with the specified feature ID.
1737    
1738    =over 4
1739    
1740    =item id
1741    
1742    ID of the object whose special attributes are being loaded. This forms the first column of the
1743    output.
1744    
1745    =item attributes
1746    
1747    Reference to a list of tuples.
1748    
1749    =item idxMatch
1750    
1751    Index in each tuple of the column to be matched against the pattern. If the match is
1752    successful, an output record will be generated.
1753    
1754    =item idxValues
1755    
1756    Reference to a list containing the indexes in each tuple of the columns to be put as
1757    the second column of the output.
1758    
1759    =item pattern
1760    
1761    Pattern to be matched against the specified column. The match will be case-insensitive.
1762    
1763    =item loader
1764    
1765    An object to which each output record will be put. Usually this is an B<ERDBLoad> object,
1766    but technically it could be anything with a C<Put> method.
1767    
1768    =item RETURN
1769    
1770    Returns a count of the matches found.
1771    
1772    =item
1773    
1774    =back
1775    
1776    =cut
1777    
1778    sub SpecialAttribute {
1779        # Get the parameters.
1780        my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_;
1781        # Declare the return variable.
1782        my $retVal = 0;
1783        # Loop through the attribute rows.
1784        for my $row (@{$attributes}) {
1785            # Check for a match.
1786            if ($row->[$idxMatch] =~ m/$pattern/i) {
1787                # We have a match, so output a row. This is a bit tricky, since we may
1788                # be putting out multiple columns of data from the input.
1789                my $value = join(" ", map { $row->[$_] } @{$idxValues});
1790                $loader->Put($id, $value);
1791                $retVal++;
1792            }
1793        }
1794        Trace("$retVal special attributes found for $id and loader " . $loader->RelName() . ".") if T(4) && $retVal;
1795        # Return the number of matches.
1796        return $retVal;
1797    }
1798    
1799  =head3 TableLoader  =head3 TableLoader
1800    
1801  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 1580  Line 1810 
1810    
1811  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
1812    
 =item ignore  
   
 TRUE if the table should be ignored entirely, else FALSE.  
   
1813  =item RETURN  =item RETURN
1814    
1815  Returns an ERDBLoad object for loading the specified table.  Returns an ERDBLoad object for loading the specified table.
# Line 1594  Line 1820 
1820    
1821  sub _TableLoader {  sub _TableLoader {
1822      # Get the parameters.      # Get the parameters.
1823      my ($self, $tableName, $ignore) = @_;      my ($self, $tableName) = @_;
1824      # Create the load object.      # Create the load object.
1825      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly,      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly);
                                $ignore);  
1826      # Cache it in the loader list.      # Cache it in the loader list.
1827      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
1828      # Return it to the caller.      # Return it to the caller.
# Line 1669  Line 1894 
1894      return $retVal;      return $retVal;
1895  }  }
1896    
1897    =head3 GetGenomeAttributes
1898    
1899    C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys); >>
1900    
1901    Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
1902    attributes for all the features of a genome in a single call, then organizes them into
1903    a hash.
1904    
1905    =over 4
1906    
1907    =item fig
1908    
1909    FIG-like object for accessing attributes.
1910    
1911    =item genomeID
1912    
1913    ID of the genome who's attributes are desired.
1914    
1915    =item fids
1916    
1917    Reference to a list of the feature IDs whose attributes are to be kept.
1918    
1919    =item propKeys
1920    
1921    A list of the keys to retrieve.
1922    
1923    =item RETURN
1924    
1925    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
1926    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
1927    the attribute key, and one or more attribute values.
1928    
1929    =back
1930    
1931    =cut
1932    
1933    sub GetGenomeAttributes {
1934        # Get the parameters.
1935        my ($fig, $genomeID, $fids, $propKeys) = @_;
1936        # Declare the return variable.
1937        my $retVal = {};
1938        # Initialize the hash. This not only enables us to easily determine which FIDs to
1939        # keep, it insures that the caller sees a list reference for every known fid,
1940        # simplifying the logic.
1941        for my $fid (@{$fids}) {
1942            $retVal->{$fid} = [];
1943        }
1944        # Get the attributes. If ev_code_cron is running, we may get a timeout error, so
1945        # an eval is used.
1946        my @aList = ();
1947        eval {
1948            @aList = $fig->get_attributes("fig|$genomeID%", $propKeys);
1949            Trace(scalar(@aList) . " attributes returned for genome $genomeID.") if T(3);
1950        };
1951        # Check for a problem.
1952        if ($@) {
1953            Trace("Retrying attributes for $genomeID due to error: $@") if T(1);
1954            # Our fallback plan is to process the attributes in blocks of 100. This is much slower,
1955            # but allows us to continue processing.
1956            my $nFids = scalar @{$fids};
1957            for (my $i = 0; $i < $nFids; $i += 100) {
1958                # Determine the index of the last feature ID we'll be specifying on this pass.
1959                # Normally it's $i + 99, but if we're close to the end it may be less.
1960                my $end = ($i + 100 > $nFids ? $nFids - 1 : $i + 99);
1961                # Get a slice of the fid list.
1962                my @slice = @{$fids}[$i .. $end];
1963                # Get the relevant attributes.
1964                Trace("Retrieving attributes for fids $i to $end.") if T(3);
1965                my @aShort = $fig->get_attributes(\@slice, $propKeys);
1966                Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(3);
1967                push @aList, @aShort;
1968            }
1969        }
1970        # Now we should have all the interesting attributes in @aList. Populate the hash with
1971        # them.
1972        for my $aListEntry (@aList) {
1973            my $fid = $aListEntry->[0];
1974            if (exists $retVal->{$fid}) {
1975                push @{$retVal->{$fid}}, $aListEntry;
1976            }
1977        }
1978        # Return the result.
1979        return $retVal;
1980    }
1981    
1982    
1983  1;  1;

Legend:
Removed from v.1.61  
changed lines
  Added in v.1.87

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3