[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.62, Sun Jul 30 05:44:57 2006 UTC revision 1.83, Fri May 11 06:37:38 2007 UTC
# Line 80  Line 80 
80  Either the name of the file containing the list of trusted subsystems or a reference  Either the name of the file containing the list of trusted subsystems or a reference
81  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
82  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
83  in its data directory.) Only subsystem data related to the trusted subsystems is loaded.  in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded.
84    
85  =item options  =item options
86    
# Line 120  Line 120 
120                      # an omitted access code can be defaulted to 1.                      # an omitted access code can be defaulted to 1.
121                      for my $genomeLine (@genomeList) {                      for my $genomeLine (@genomeList) {
122                          my ($genomeID, $accessCode) = split("\t", $genomeLine);                          my ($genomeID, $accessCode) = split("\t", $genomeLine);
123                          if (undef $accessCode) {                          if (! defined($accessCode)) {
124                              $accessCode = 1;                              $accessCode = 1;
125                          }                          }
126                          $genomes{$genomeID} = $accessCode;                          $genomes{$genomeID} = $accessCode;
# Line 138  Line 138 
138          if (! defined $subsysFile || $subsysFile eq '') {          if (! defined $subsysFile || $subsysFile eq '') {
139              # Here we want all the usable subsystems. First we get the whole list.              # Here we want all the usable subsystems. First we get the whole list.
140              my @subs = $fig->all_subsystems();              my @subs = $fig->all_subsystems();
141              # Loop through, checking for usability.              # Loop through, checking for the NMPDR file.
142              for my $sub (@subs) {              for my $sub (@subs) {
143                  if ($fig->usable_subsystem($sub)) {                  if ($fig->nmpdr_subsystem($sub)) {
144                      $subsystems{$sub} = 1;                      $subsystems{$sub} = 1;
145                  }                  }
146              }              }
# Line 163  Line 163 
163                  Confess("Invalid subsystem parameter in SproutLoad constructor.");                  Confess("Invalid subsystem parameter in SproutLoad constructor.");
164              }              }
165          }          }
166            # Go through the subsys hash again, creating the keyword list for each subsystem.
167            for my $subsystem (keys %subsystems) {
168                my $name = $subsystem;
169                $name =~ s/_/ /g;
170                my $classes = $fig->subsystem_classification($subsystem);
171                $name .= " " . join(" ", @{$classes});
172                $subsystems{$subsystem} = $name;
173            }
174      }      }
175      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
176      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
# Line 266  Line 274 
274              my $extra = join " ", @extraData;              my $extra = join " ", @extraData;
275              # Get the full taxonomy.              # Get the full taxonomy.
276              my $taxonomy = $fig->taxonomy_of($genomeID);              my $taxonomy = $fig->taxonomy_of($genomeID);
277                # Get the version. If no version is specified, we default to the genome ID by itself.
278                my $version = $fig->genome_version($genomeID);
279                if (! defined($version)) {
280                    $version = $genomeID;
281                }
282                # Get the DNA size.
283                my $dnaSize = $fig->genome_szdna($genomeID);
284                # Open the NMPDR group file for this genome.
285                my $group;
286                if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
287                    defined($group = <TMP>)) {
288                    # Clean the line ending.
289                    chomp $group;
290                } else {
291                    # No group, so use the default.
292                    $group = $FIG_Config::otherGroup;
293                }
294                close TMP;
295              # Output the genome record.              # Output the genome record.
296              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),
297                               $species, $extra, $taxonomy);                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);
298              # Now we loop through each of the genome's contigs.              # Now we loop through each of the genome's contigs.
299              my @contigs = $fig->all_contigs($genomeID);              my @contigs = $fig->all_contigs($genomeID);
300              for my $contigID (@contigs) {              for my $contigID (@contigs) {
# Line 449  Line 475 
475      FeatureUpstream      FeatureUpstream
476      IsLocatedIn      IsLocatedIn
477      HasFeature      HasFeature
478        HasRoleInSubsystem
479        FeatureEssential
480        FeatureVirulent
481        FeatureIEDB
482    
483  =over 4  =over 4
484    
# Line 463  Line 493 
493  sub LoadFeatureData {  sub LoadFeatureData {
494      # Get this object instance.      # Get this object instance.
495      my ($self) = @_;      my ($self) = @_;
496      # Get the FIG object.      # Get the FIG and Sprout objects.
497      my $fig = $self->{fig};      my $fig = $self->{fig};
498        my $sprout = $self->{sprout};
499      # Get the table of genome IDs.      # Get the table of genome IDs.
500      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
501      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
# Line 474  Line 505 
505      my $loadFeatureLink = $self->_TableLoader('FeatureLink');      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
506      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
507      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
508      my $loadHasFeature = $self->_TableLoader('HasFeature');      my $loadHasFeature = $self->_TableLoader('HasFeature', $self->PrimaryOnly);
509        my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem', $self->PrimaryOnly);
510        my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
511        my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
512        my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
513        # Get the subsystem hash.
514        my $subHash = $self->{subsystems};
515      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
516      # locations.      # locations.
517      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
# Line 487  Line 524 
524              Trace("Loading features for genome $genomeID.") if T(3);              Trace("Loading features for genome $genomeID.") if T(3);
525              $loadFeature->Add("genomeIn");              $loadFeature->Add("genomeIn");
526              # Get the feature list for this genome.              # Get the feature list for this genome.
527              my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
528              # Sort and count the list.              # Sort and count the list.
529              my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};              my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
530              my $count = scalar @featureTuples;              my $count = scalar @featureTuples;
531                my @fids = map { $_->[0] } @featureTuples;
532              Trace("$count features found for genome $genomeID.") if T(3);              Trace("$count features found for genome $genomeID.") if T(3);
533                # Get the attributes for this genome and put them in a hash by feature ID.
534                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids);
535              # Set up for our duplicate-feature check.              # Set up for our duplicate-feature check.
536              my $oldFeatureID = "";              my $oldFeatureID = "";
537              # Loop through the features.              # Loop through the features.
538              for my $featureTuple (@featureTuples) {              for my $featureTuple (@featureTuples) {
539                  # Split the tuple.                  # Split the tuple.
540                  my ($featureID, $locations, undef, $type) = @{$featureTuple};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
541                  # Check for duplicates.                  # Check for duplicates.
542                  if ($featureID eq $oldFeatureID) {                  if ($featureID eq $oldFeatureID) {
543                      Trace("Duplicate feature $featureID found.") if T(1);                      Trace("Duplicate feature $featureID found.") if T(1);
# Line 505  Line 545 
545                      $oldFeatureID = $featureID;                      $oldFeatureID = $featureID;
546                      # Count this feature.                      # Count this feature.
547                      $loadFeature->Add("featureIn");                      $loadFeature->Add("featureIn");
548                      # Create the feature record.                      # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
549                      $loadFeature->Put($featureID, 1, $type);                      # Sprout database requires a single character.
550                      # Link it to the parent genome.                      if (! defined($quality) || $quality eq "") {
551                      $loadHasFeature->Put($genomeID, $featureID, $type);                          $quality = " ";
552                        }
553                        # Begin building the keywords. We start with the genome ID, the
554                        # feature ID, the taxonomy, and the organism name.
555                        my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
556                                        $fig->taxonomy_of($genomeID));
557                      # Create the aliases.                      # Create the aliases.
558                      for my $alias ($fig->feature_aliases($featureID)) {                      for my $alias ($fig->feature_aliases($featureID)) {
559                          $loadFeatureAlias->Put($featureID, $alias);                          $loadFeatureAlias->Put($featureID, $alias);
560                            push @keywords, $alias;
561                      }                      }
562                        Trace("Assignment for $featureID is: $assignment") if T(4);
563                        # Break the assignment into words and shove it onto the
564                        # keyword list.
565                        push @keywords, split(/\s+/, $assignment);
566                        # Link this feature to the parent genome.
567                        $loadHasFeature->Put($genomeID, $featureID, $type);
568                      # Get the links.                      # Get the links.
569                      my @links = $fig->fid_links($featureID);                      my @links = $fig->fid_links($featureID);
570                      for my $link (@links) {                      for my $link (@links) {
# Line 531  Line 583 
583                              $loadFeatureUpstream->Put($featureID, $upstream);                              $loadFeatureUpstream->Put($featureID, $upstream);
584                          }                          }
585                      }                      }
586                        # Now we need to find the subsystems this feature participates in.
587                        # We also add the subsystems to the keyword list. Before we do that,
588                        # we must convert underscores to spaces and tack on the classifications.
589                        my @subsystems = $fig->peg_to_subsystems($featureID);
590                        for my $subsystem (@subsystems) {
591                            # Only proceed if we like this subsystem.
592                            if (exists $subHash->{$subsystem}) {
593                                # Store the has-role link.
594                                $loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type);
595                                # Save the subsystem's keyword data.
596                                my $subKeywords = $subHash->{$subsystem};
597                                push @keywords, split /\s+/, $subKeywords;
598                                # Now we need to get this feature's role in the subsystem.
599                                my $subObject = $fig->get_subsystem($subsystem);
600                                my @roleColumns = $subObject->get_peg_roles($featureID);
601                                my @allRoles = $subObject->get_roles();
602                                for my $col (@roleColumns) {
603                                    my $role = $allRoles[$col];
604                                    push @keywords, split /\s+/, $role;
605                                    push @keywords, $subObject->get_role_abbr($col);
606                                }
607                            }
608                        }
609                        # There are three special attributes computed from property
610                        # data that we build next. If the special attribute is non-empty,
611                        # its name will be added to the keyword list. First, we get all
612                        # the attributes for this feature. They will come back as
613                        # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
614                        # [name, value, value with URL]. (We don't need the PEG, since
615                        # we already know it.)
616                        my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
617                                             @{$attributes->{$featureID}};
618                        # Now we process each of the special attributes.
619                        if (SpecialAttribute($featureID, \@attributes,
620                                             1, [0,2], '^(essential|potential_essential)$',
621                                             $loadFeatureEssential)) {
622                            push @keywords, 'essential';
623                            $loadFeature->Add('essential');
624                        }
625                        if (SpecialAttribute($featureID, \@attributes,
626                                             0, [2], '^virulen',
627                                             $loadFeatureVirulent)) {
628                            push @keywords, 'virulent';
629                            $loadFeature->Add('virulent');
630                        }
631                        if (SpecialAttribute($featureID, \@attributes,
632                                             0, [0,2], '^iedb_',
633                                             $loadFeatureIEDB)) {
634                            push @keywords, 'iedb';
635                            $loadFeature->Add('iedb');
636                        }
637                        # Now we need to bust up hyphenated words in the keyword
638                        # list. We keep them separate and put them at the end so
639                        # the original word order is available.
640                        my $keywordString = "";
641                        my $bustedString = "";
642                        for my $keyword (@keywords) {
643                            if (length $keyword >= 3) {
644                                $keywordString .= " $keyword";
645                                if ($keyword =~ /-/) {
646                                    my @words = split /-/, $keyword;
647                                    $bustedString .= join(" ", "", @words);
648                                }
649                            }
650                        }
651                        $keywordString .= $bustedString;
652                        # Get rid of annoying punctuation.
653                        $keywordString =~ s/[();]//g;
654                        # Clean the keyword list.
655                        my $cleanWords = $sprout->CleanKeywords($keywordString);
656                        Trace("Keyword string for $featureID: $cleanWords") if T(4);
657                        # Create the feature record.
658                        $loadFeature->Put($featureID, 1, $user, $quality, $type, $assignment, $cleanWords);
659                      # This part is the roughest. We need to relate the features to contig                      # This part is the roughest. We need to relate the features to contig
660                      # locations, and the locations must be split so that none of them exceed                      # locations, and the locations must be split so that none of them exceed
661                      # the maximum segment size. This simplifies the genes_in_region processing                      # the maximum segment size. This simplifies the genes_in_region processing
# Line 566  Line 691 
691      return $retVal;      return $retVal;
692  }  }
693    
 =head3 LoadBBHData  
   
 C<< my $stats = $spl->LoadBBHData(); >>  
   
 Load the bidirectional best hit data from FIG into Sprout.  
   
 Sprout does not store information on similarities. Instead, it has only the  
 bi-directional best hits. Even so, the BBH table is one of the largest in  
 the database.  
   
 The following relations are loaded by this method.  
   
     IsBidirectionalBestHitOf  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadBBHData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the table of genome IDs.  
     my $genomeHash = $self->{genomes};  
     # Create load objects for each of the tables we're loading.  
     my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf');  
     if ($self->{options}->{loadOnly}) {  
         Trace("Loading from existing files.") if T(2);  
     } else {  
         Trace("Generating BBH data.") if T(2);  
         # Now we loop through the genomes, generating the data for each one.  
         for my $genomeID (sort keys %{$genomeHash}) {  
             $loadIsBidirectionalBestHitOf->Add("genomeIn");  
             Trace("Processing features for genome $genomeID.") if T(3);  
             # Get the feature list for this genome.  
             my $features = $fig->all_features_detailed($genomeID);  
             # Loop through the features.  
             for my $featureData (@{$features}) {  
                 # Split the tuple.  
                 my ($featureID, $locations, $aliases, $type) = @{$featureData};  
                 # Get the bi-directional best hits.  
                 my @bbhList = $fig->bbhs($featureID);  
                 for my $bbhEntry (@bbhList) {  
                     # Get the target feature ID and the score.  
                     my ($targetID, $score) = @{$bbhEntry};  
                     # Check the target feature's genome.  
                     my $targetGenomeID = $fig->genome_of($targetID);  
                     # Only proceed if it's one of our genomes.  
                     if ($genomeHash->{$targetGenomeID}) {  
                         $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,  
                                                            $score);  
                     }  
                 }  
             }  
         }  
     }  
     # Finish the loads.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
694  =head3 LoadSubsystemData  =head3 LoadSubsystemData
695    
696  C<< my $stats = $spl->LoadSubsystemData(); >>  C<< my $stats = $spl->LoadSubsystemData(); >>
# Line 731  Line 788 
788              # Get the subsystem object.              # Get the subsystem object.
789              my $sub = $fig->get_subsystem($subsysID);              my $sub = $fig->get_subsystem($subsysID);
790              # Only proceed if the subsystem has a spreadsheet.              # Only proceed if the subsystem has a spreadsheet.
791              if (! $sub->{empty_ss}) {              if (defined($sub) && ! $sub->{empty_ss}) {
792                  Trace("Creating subsystem $subsysID.") if T(3);                  Trace("Creating subsystem $subsysID.") if T(3);
793                  $loadSubsystem->Add("subsystemIn");                  $loadSubsystem->Add("subsystemIn");
794                  # Create the subsystem record.                  # Create the subsystem record.
795                  my $curator = $sub->get_curator();                  my $curator = $sub->get_curator();
796                  my $notes = $sub->get_notes();                  my $notes = $sub->get_notes();
797                  $loadSubsystem->Put($subsysID, $curator, $notes);                  $loadSubsystem->Put($subsysID, $curator, $notes);
798                  my $class = $fig->subsystem_classification($subsysID);                  # Now for the classification string. This comes back as a list
799                  if ($class) {                  # reference and we convert it to a space-delimited string.
800                      $loadSubsystemClass->Put($subsysID, $class);                  my $classList = $fig->subsystem_classification($subsysID);
801                  }                  my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
802                    $loadSubsystemClass->Put($subsysID, $classString);
803                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
804                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
805                      # Connect to this role.                      # Connect to this role.
# Line 943  Line 1001 
1001          # Create a hash for storing property IDs.          # Create a hash for storing property IDs.
1002          my %propertyKeys = ();          my %propertyKeys = ();
1003          my $nextID = 1;          my $nextID = 1;
1004            # Get the attributes we intend to store in the property table.
1005            my @propKeys = $fig->get_group_keys("NMPDR");
1006          # Loop through the genomes.          # Loop through the genomes.
1007          for my $genomeID (keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
1008              $loadProperty->Add("genomeIn");              $loadProperty->Add("genomeIn");
1009              Trace("Generating properties for $genomeID.") if T(3);              Trace("Generating properties for $genomeID.") if T(3);
1010              # Get the genome's features. The feature ID is the first field in the              # Initialize a counter.
             # tuples returned by "all_features_detailed". We use "all_features_detailed"  
             # rather than "all_features" because we want all features regardless of type.  
             my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};  
             my $featureCount = 0;  
1011              my $propertyCount = 0;              my $propertyCount = 0;
1012              # Loop through the features, creating HasProperty records.              # Get the properties for this genome's features.
1013              for my $fid (@features) {              my @attributes = $fig->get_attributes("fig|$genomeID%", \@propKeys);
1014                  # Get all attributes for this feature. We do this one feature at a time              Trace("Property list built for $genomeID.") if T(3);
1015                  # to insure we do not get any genome attributes.              # Loop through the results, creating HasProperty records.
1016                  my @attributeList = $fig->get_attributes($fid, '', '', '');              for my $attributeData (@attributes) {
1017                  if (scalar @attributeList) {                  # Pull apart the attribute tuple.
1018                      $featureCount++;                  my ($fid, $key, $value, $url) = @{$attributeData};
                 }  
                 # Loop through the attributes.  
                 for my $tuple (@attributeList) {  
                     $propertyCount++;  
                     # Get this attribute value's data. Note that we throw away the FID,  
                     # since it will always be the same as the value if "$fid".  
                     my (undef, $key, $value, $url) = @{$tuple};  
1019                      # Concatenate the key and value and check the "propertyKeys" hash to                      # Concatenate the key and value and check the "propertyKeys" hash to
1020                      # see if we already have an ID for it. We use a tab for the separator                      # see if we already have an ID for it. We use a tab for the separator
1021                      # character.                      # character.
# Line 984  Line 1033 
1033                      # Create the HasProperty entry for this feature/property association.                      # Create the HasProperty entry for this feature/property association.
1034                      $loadHasProperty->Put($fid, $propertyID, $url);                      $loadHasProperty->Put($fid, $propertyID, $url);
1035                  }                  }
             }  
1036              # Update the statistics.              # Update the statistics.
1037              Trace("$propertyCount attributes processed for $featureCount features.") if T(3);              Trace("$propertyCount attributes processed.") if T(3);
             $loadHasProperty->Add("featuresIn", $featureCount);  
1038              $loadHasProperty->Add("propertiesIn", $propertyCount);              $loadHasProperty->Add("propertiesIn", $propertyCount);
1039          }          }
1040      }      }
# Line 1370  Line 1417 
1417    
1418      GenomeGroups      GenomeGroups
1419    
1420  There is no direct support for genome groups in FIG, so we access the SEED  Currently, we do not use groups. We used to use them for NMPDR groups,
1421    butThere is no direct support for genome groups in FIG, so we access the SEED
1422  files directly.  files directly.
1423    
1424  =over 4  =over 4
# Line 1396  Line 1444 
1444          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1445      } else {      } else {
1446          Trace("Generating group data.") if T(2);          Trace("Generating group data.") if T(2);
1447          # Loop through the genomes.          # Currently there are no groups.
         my $line;  
         for my $genomeID (keys %{$genomeHash}) {  
             Trace("Processing $genomeID.") if T(3);  
             # Open the NMPDR group file for this genome.  
             if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&  
                 defined($line = <TMP>)) {  
                 # Clean the line ending.  
                 chomp $line;  
                 # Add the group to the table. Note that there can only be one group  
                 # per genome.  
                 $loadGenomeGroups->Put($genomeID, $line);  
             }  
             close TMP;  
         }  
1448      }      }
1449      # Finish the load.      # Finish the load.
1450      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 1506  Line 1540 
1540  The following relations are loaded by this method.  The following relations are loaded by this method.
1541    
1542      Family      Family
1543      ContainsFeature      IsFamilyForFeature
1544    
1545  The source information for these relations is taken from the C<families_for_protein>,  The source information for these relations is taken from the C<families_for_protein>,
1546  C<family_function>, and C<sz_family> methods of the B<FIG> object.  C<family_function>, and C<sz_family> methods of the B<FIG> object.
# Line 1530  Line 1564 
1564      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1565      # Create load objects for the tables we're loading.      # Create load objects for the tables we're loading.
1566      my $loadFamily = $self->_TableLoader('Family');      my $loadFamily = $self->_TableLoader('Family');
1567      my $loadContainsFeature = $self->_TableLoader('ContainsFeature');      my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature');
1568      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1569          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1570      } else {      } else {
# Line 1542  Line 1576 
1576              Trace("Processing features for $genomeID.") if T(2);              Trace("Processing features for $genomeID.") if T(2);
1577              # Loop through this genome's PEGs.              # Loop through this genome's PEGs.
1578              for my $fid ($fig->all_features($genomeID, "peg")) {              for my $fid ($fig->all_features($genomeID, "peg")) {
1579                  $loadContainsFeature->Add("features", 1);                  $loadIsFamilyForFeature->Add("features", 1);
1580                  # Get this feature's families.                  # Get this feature's families.
1581                  my @families = $fig->families_for_protein($fid);                  my @families = $fig->families_for_protein($fid);
1582                  # Loop through the families, connecting them to the feature.                  # Loop through the families, connecting them to the feature.
1583                  for my $family (@families) {                  for my $family (@families) {
1584                      $loadContainsFeature->Put($family, $fid);                      $loadIsFamilyForFeature->Put($family, $fid);
1585                      # If this is a new family, create a record for it.                      # If this is a new family, create a record for it.
1586                      if (! exists $familyHash{$family}) {                      if (! exists $familyHash{$family}) {
1587                          $familyHash{$family} = 1;                          $familyHash{$family} = 1;
# Line 1565  Line 1599 
1599      return $retVal;      return $retVal;
1600  }  }
1601    
1602    =head3 LoadDrugData
1603    
1604    C<< my $stats = $spl->LoadDrugData(); >>
1605    
1606    Load the drug target data into Sprout.
1607    
1608    The following relations are loaded by this method.
1609    
1610        PDB
1611        DocksWith
1612        IsProteinForFeature
1613        Ligand
1614    
1615    The source information for these relations is taken from attributes. The
1616    C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>.
1617    The C<zinc_name> attribute describes the ligands. The C<docking_results>
1618    attribute contains the information for the B<DocksWith> relationship. It is
1619    expected that additional attributes and tables will be added in the future.
1620    
1621    =over 4
1622    
1623    =item RETURNS
1624    
1625    Returns a statistics object for the loads.
1626    
1627    =back
1628    
1629    =cut
1630    #: Return Type $%;
1631    sub LoadDrugData {
1632        # Get this object instance.
1633        my ($self) = @_;
1634        # Get the FIG object.
1635        my $fig = $self->{fig};
1636        # Get the genome hash.
1637        my $genomeHash = $self->{genomes};
1638        # Create load objects for the tables we're loading.
1639        my $loadPDB = $self->_TableLoader('PDB');
1640        my $loadLigand = $self->_TableLoader('Ligand');
1641        my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature');
1642        my $loadDocksWith = $self->_TableLoader('DocksWith');
1643        if ($self->{options}->{loadOnly}) {
1644            Trace("Loading from existing files.") if T(2);
1645        } else {
1646            Trace("Generating drug target data.") if T(2);
1647            # First comes the "DocksWith" relationship. This will give us a list of PDBs.
1648            # We can also encounter PDBs when we process "IsProteinForFeature". To manage
1649            # this process, PDB information is collected in a hash table and then
1650            # unspooled after both relationships are created.
1651            my %pdbHash = ();
1652            Trace("Generating docking data.") if T(2);
1653            # Get all the docking data. This may cause problems if there are too many PDBs,
1654            # at which point we'll need another algorithm. The indicator that this is
1655            # happening will be a timeout error in the next statement.
1656            my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
1657                                                  ['docking_results', $FIG_Config::dockLimit]);
1658            Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
1659            for my $dockData (@dockData) {
1660                # Get the docking data components.
1661                my ($pdbID, $docking_key, @valueData) = @{$dockData};
1662                # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
1663                $pdbID = lc $pdbID;
1664                # Extract the ZINC ID from the docking key. Note that there are two possible
1665                # formats.
1666                my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/;
1667                if (! $zinc_id) {
1668                    Trace("Invalid docking result key $docking_key for $pdbID.") if T(0);
1669                    $loadDocksWith->Add("errors");
1670                } else {
1671                    # Count this docking result.
1672                    if (! exists $pdbHash{$pdbID}) {
1673                        $pdbHash{$pdbID} = 1;
1674                    } else {
1675                        $pdbHash{$pdbID}++;
1676                    }
1677                    # Get the pieces of the value and parse the energy.
1678                    # Note that we don't care about the rank, since
1679                    # we can sort on the energy level itself in our database.
1680                    my ($energy, $tool, $type) = @valueData;
1681                    my ($rank, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
1682                    # Write the results to the output.
1683                    $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool,
1684                                        $total, $vanderwaals);
1685                }
1686            }
1687            Trace("Connecting features.") if T(2);
1688            # Loop through the genomes.
1689            for my $genome (sort keys %{$genomeHash}) {
1690                Trace("Generating PDBs for $genome.") if T(3);
1691                # Get all of the PDBs that BLAST against this genome's features.
1692                my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%');
1693                for my $pdbData (@attributeData) {
1694                    # The PDB ID is coded as a subkey.
1695                    if ($pdbData->[1] !~ /PDB::(.+)/) {
1696                        Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0);
1697                        $loadPDB->Add("errors");
1698                    } else {
1699                        my $pdbID = $1;
1700                        # Insure the PDB is in the hash.
1701                        if (! exists $pdbHash{$pdbID}) {
1702                            $pdbHash{$pdbID} = 0;
1703                        }
1704                        # The score and locations are coded in the attribute value.
1705                        if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
1706                            Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0);
1707                            $loadIsProteinForFeature->Add("errors");
1708                        } else {
1709                            my ($score, $locData) = ($1,$2);
1710                            # The location data may not be present, so we have to start with some
1711                            # defaults and then check.
1712                            my ($start, $end) = (1, 0);
1713                            if ($locData) {
1714                                $locData =~ /(\d+)-(\d+)/;
1715                                $start = $1;
1716                                $end = $2;
1717                            }
1718                            # If we still don't have the end location, compute it from
1719                            # the feature length.
1720                            if (! $end) {
1721                                # Most features have one location, but we do a list iteration
1722                                # just in case.
1723                                my @locations = $fig->feature_location($pdbData->[0]);
1724                                $end = 0;
1725                                for my $loc (@locations) {
1726                                    my $locObject = BasicLocation->new($loc);
1727                                    $end += $locObject->Length;
1728                                }
1729                            }
1730                            # Decode the score.
1731                            my $realScore = FIGRules::DecodeScore($score);
1732                            # Connect the PDB to the feature.
1733                            $loadIsProteinForFeature->Put($pdbData->[0], $pdbID, $start, $realScore, $end);
1734                        }
1735                    }
1736                }
1737            }
1738            # We've got all our PDBs now, so we unspool them from the hash.
1739            Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2);
1740            my $count = 0;
1741            for my $pdbID (sort keys %pdbHash) {
1742                $loadPDB->Put($pdbID, $pdbHash{$pdbID});
1743                $count++;
1744                Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0);
1745            }
1746            # Finally we create the ligand table. This information can be found in the
1747            # zinc_name attribute.
1748            Trace("Loading ligands.") if T(2);
1749            # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
1750            my $last_zinc_id = "";
1751            my $zinc_id = "";
1752            my $done = 0;
1753            while (! $done) {
1754                # Get the next 10000 ligands. We insist that the object ID is greater than
1755                # the last ID we processed.
1756                Trace("Loading batch starting with ZINC:$zinc_id.") if T(3);
1757                my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
1758                                                           ["ZINC:$zinc_id", "zinc_name"]);
1759                Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
1760                if (! @attributeData) {
1761                    # Here there are no attributes left, so we quit the loop.
1762                    $done = 1;
1763                } else {
1764                    # Process the attribute data we've received.
1765                    for my $zinc_data (@attributeData) {
1766                        # The ZINC ID is found in the first return column, prefixed with the word ZINC.
1767                        if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
1768                            $zinc_id = $1;
1769                            # Check for a duplicate.
1770                            if ($zinc_id eq $last_zinc_id) {
1771                                $loadLigand->Add("duplicate");
1772                            } else {
1773                                # Here it's safe to output the ligand. The ligand name is the attribute value
1774                                # (third column in the row).
1775                                $loadLigand->Put($zinc_id, $zinc_data->[2]);
1776                                # Insure we don't try to add this ID again.
1777                                $last_zinc_id = $zinc_id;
1778                            }
1779                        } else {
1780                            Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
1781                            $loadLigand->Add("errors");
1782                        }
1783                    }
1784                }
1785            }
1786            Trace("Ligands loaded.") if T(2);
1787        }
1788        # Finish the load.
1789        my $retVal = $self->_FinishAll();
1790        return $retVal;
1791    }
1792    
1793    
1794  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1795    
1796    =head3 SpecialAttribute
1797    
1798    C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >>
1799    
1800    Look for special attributes of a given type. A special attribute is found by comparing one of
1801    the columns of the incoming attribute list to a search pattern. If a match is found, then
1802    a set of columns is put into an output table connected to the specified ID.
1803    
1804    For example, when processing features, the attribute list we look at has three columns: attribute
1805    name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
1806    begins with C<iedb_>. The call signature is therefore
1807    
1808        my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB);
1809    
1810    The pattern is matched against column 0, and if we have a match, then column 2's value is put
1811    to the output along with the specified feature ID.
1812    
1813    =over 4
1814    
1815    =item id
1816    
1817    ID of the object whose special attributes are being loaded. This forms the first column of the
1818    output.
1819    
1820    =item attributes
1821    
1822    Reference to a list of tuples.
1823    
1824    =item idxMatch
1825    
1826    Index in each tuple of the column to be matched against the pattern. If the match is
1827    successful, an output record will be generated.
1828    
1829    =item idxValues
1830    
1831    Reference to a list containing the indexes in each tuple of the columns to be put as
1832    the second column of the output.
1833    
1834    =item pattern
1835    
1836    Pattern to be matched against the specified column. The match will be case-insensitive.
1837    
1838    =item loader
1839    
1840    An object to which each output record will be put. Usually this is an B<ERDBLoad> object,
1841    but technically it could be anything with a C<Put> method.
1842    
1843    =item RETURN
1844    
1845    Returns a count of the matches found.
1846    
1847    =item
1848    
1849    =back
1850    
1851    =cut
1852    
1853    sub SpecialAttribute {
1854        # Get the parameters.
1855        my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_;
1856        # Declare the return variable.
1857        my $retVal = 0;
1858        # Loop through the attribute rows.
1859        for my $row (@{$attributes}) {
1860            # Check for a match.
1861            if ($row->[$idxMatch] =~ m/$pattern/i) {
1862                # We have a match, so output a row. This is a bit tricky, since we may
1863                # be putting out multiple columns of data from the input.
1864                my $value = join(" ", map { $row->[$_] } @{$idxValues});
1865                $loader->Put($id, $value);
1866                $retVal++;
1867            }
1868        }
1869        Trace("$retVal special attributes found for $id and loader " . $loader->RelName() . ".") if T(4) && $retVal;
1870        # Return the number of matches.
1871        return $retVal;
1872    }
1873    
1874  =head3 TableLoader  =head3 TableLoader
1875    
1876  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 1670  Line 1974 
1974      return $retVal;      return $retVal;
1975  }  }
1976    
1977    =head3 GetGenomeAttributes
1978    
1979    C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids); >>
1980    
1981    Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
1982    attributes for all the features of a genome in a single call, then organizes them into
1983    a hash.
1984    
1985    =over 4
1986    
1987    =item fig
1988    
1989    FIG-like object for accessing attributes.
1990    
1991    =item genomeID
1992    
1993    ID of the genome who's attributes are desired.
1994    
1995    =item fids
1996    
1997    Reference to a list of the feature IDs whose attributes are to be kept.
1998    
1999    =item RETURN
2000    
2001    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
2002    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
2003    the attribute key, and one or more attribute values.
2004    
2005    =back
2006    
2007    =cut
2008    
2009    sub GetGenomeAttributes {
2010        # Get the parameters.
2011        my ($fig, $genomeID, $fids) = @_;
2012        # Declare the return variable.
2013        my $retVal = {};
2014        # Get a list of the attributes we care about.
2015        my @propKeys = $fig->get_group_keys("NMPDR");
2016        # Get the attributes.
2017        my @aList = $fig->get_attributes("fig|$genomeID%", \@propKeys);
2018        # Initialize the hash. This not only enables us to easily determine which FIDs to
2019        # keep, it insures that the caller sees a list reference for every known fid,
2020        # simplifying the logic.
2021        for my $fid (@{$fids}) {
2022            $retVal->{$fid} = [];
2023        }
2024        # Populate the hash.
2025        for my $aListEntry (@aList) {
2026            my $fid = $aListEntry->[0];
2027            if (exists $retVal->{$fid}) {
2028                push @{$retVal->{$fid}}, $aListEntry;
2029            }
2030        }
2031        # Return the result.
2032        return $retVal;
2033    }
2034    
2035  1;  1;

Legend:
Removed from v.1.62  
changed lines
  Added in v.1.83

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3