[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.76, Fri Nov 3 00:43:22 2006 UTC revision 1.84, Thu May 17 23:44:51 2007 UTC
# Line 274  Line 274 
274              my $extra = join " ", @extraData;              my $extra = join " ", @extraData;
275              # Get the full taxonomy.              # Get the full taxonomy.
276              my $taxonomy = $fig->taxonomy_of($genomeID);              my $taxonomy = $fig->taxonomy_of($genomeID);
277                # Get the version. If no version is specified, we default to the genome ID by itself.
278                my $version = $fig->genome_version($genomeID);
279                if (! defined($version)) {
280                    $version = $genomeID;
281                }
282                # Get the DNA size.
283                my $dnaSize = $fig->genome_szdna($genomeID);
284              # Open the NMPDR group file for this genome.              # Open the NMPDR group file for this genome.
285              my $group;              my $group;
286              if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&              if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
# Line 286  Line 293 
293              }              }
294              close TMP;              close TMP;
295              # Output the genome record.              # Output the genome record.
296              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),
297                               $group, $species, $extra, $taxonomy);                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);
298              # Now we loop through each of the genome's contigs.              # Now we loop through each of the genome's contigs.
299              my @contigs = $fig->all_contigs($genomeID);              my @contigs = $fig->all_contigs($genomeID);
300              for my $contigID (@contigs) {              for my $contigID (@contigs) {
# Line 517  Line 524 
524              Trace("Loading features for genome $genomeID.") if T(3);              Trace("Loading features for genome $genomeID.") if T(3);
525              $loadFeature->Add("genomeIn");              $loadFeature->Add("genomeIn");
526              # Get the feature list for this genome.              # Get the feature list for this genome.
527              my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
528              # Sort and count the list.              # Sort and count the list.
529              my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};              my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
530              my $count = scalar @featureTuples;              my $count = scalar @featureTuples;
531                my @fids = map { $_->[0] } @featureTuples;
532              Trace("$count features found for genome $genomeID.") if T(3);              Trace("$count features found for genome $genomeID.") if T(3);
533                # Get the attributes for this genome and put them in a hash by feature ID.
534                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids);
535              # Set up for our duplicate-feature check.              # Set up for our duplicate-feature check.
536              my $oldFeatureID = "";              my $oldFeatureID = "";
537              # Loop through the features.              # Loop through the features.
538              for my $featureTuple (@featureTuples) {              for my $featureTuple (@featureTuples) {
539                  # Split the tuple.                  # Split the tuple.
540                  my ($featureID, $locations, undef, $type) = @{$featureTuple};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
541                  # Check for duplicates.                  # Check for duplicates.
542                  if ($featureID eq $oldFeatureID) {                  if ($featureID eq $oldFeatureID) {
543                      Trace("Duplicate feature $featureID found.") if T(1);                      Trace("Duplicate feature $featureID found.") if T(1);
# Line 535  Line 545 
545                      $oldFeatureID = $featureID;                      $oldFeatureID = $featureID;
546                      # Count this feature.                      # Count this feature.
547                      $loadFeature->Add("featureIn");                      $loadFeature->Add("featureIn");
548                        # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
549                        # Sprout database requires a single character.
550                        if (! defined($quality) || $quality eq "") {
551                            $quality = " ";
552                        }
553                      # Begin building the keywords. We start with the genome ID, the                      # Begin building the keywords. We start with the genome ID, the
554                      # feature ID, and the organism name.                      # feature ID, the taxonomy, and the organism name.
555                      my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID));                      my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
556                      # Get the functional assignment and aliases. This                                      $fig->taxonomy_of($genomeID));
                     # depends on the feature type.  
                     my $assignment;  
                     if ($type eq "peg") {  
                         $assignment = $fig->function_of($featureID);  
557                          # Create the aliases.                          # Create the aliases.
558                          for my $alias ($fig->feature_aliases($featureID)) {                          for my $alias ($fig->feature_aliases($featureID)) {
559                              $loadFeatureAlias->Put($featureID, $alias);                              $loadFeatureAlias->Put($featureID, $alias);
560                              push @keywords, $alias;                              push @keywords, $alias;
561                          }                          }
                     } else {  
                         # For other types, the assignment is the first (and ONLY) alias.  
                         ($assignment) = $fig->feature_aliases($featureID);  
                     }  
562                      Trace("Assignment for $featureID is: $assignment") if T(4);                      Trace("Assignment for $featureID is: $assignment") if T(4);
563                      # Break the assignment into words and shove it onto the                      # Break the assignment into words and shove it onto the
564                      # keyword list.                      # keyword list.
# Line 607  Line 614 
614                      # [name, value, value with URL]. (We don't need the PEG, since                      # [name, value, value with URL]. (We don't need the PEG, since
615                      # we already know it.)                      # we already know it.)
616                      my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }                      my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
617                                           $fig->get_attributes($featureID);                                           @{$attributes->{$featureID}};
618                      # Now we process each of the special attributes.                      # Now we process each of the special attributes.
619                      if (SpecialAttribute($featureID, \@attributes,                      if (SpecialAttribute($featureID, \@attributes,
620                                           1, 2, '^(essential|potential_essential)$',                                           1, [0,2], '^(essential|potential_essential)$',
621                                           $loadFeatureEssential)) {                                           $loadFeatureEssential)) {
622                          push @keywords, 'essential';                          push @keywords, 'essential';
623                          $loadFeature->Add('essential');                          $loadFeature->Add('essential');
624                      }                      }
625                      if (SpecialAttribute($featureID, \@attributes,                      if (SpecialAttribute($featureID, \@attributes,
626                                           0, 2, '^virulen',                                           0, [2], '^virulen',
627                                           $loadFeatureVirulent)) {                                           $loadFeatureVirulent)) {
628                          push @keywords, 'virulent';                          push @keywords, 'virulent';
629                          $loadFeature->Add('virulent');                          $loadFeature->Add('virulent');
630                      }                      }
631                      if (SpecialAttribute($featureID, \@attributes,                      if (SpecialAttribute($featureID, \@attributes,
632                                           0, 2, '^iedb_',                                           0, [0,2], '^iedb_',
633                                           $loadFeatureIEDB)) {                                           $loadFeatureIEDB)) {
634                          push @keywords, 'iedb';                          push @keywords, 'iedb';
635                          $loadFeature->Add('iedb');                          $loadFeature->Add('iedb');
636                      }                      }
637                      # Now we need to bust up hyphenated words in the keyword                      # Now we need to bust up hyphenated words in the keyword
638                      # list.                      # list. We keep them separate and put them at the end so
639                        # the original word order is available.
640                      my $keywordString = "";                      my $keywordString = "";
641                        my $bustedString = "";
642                      for my $keyword (@keywords) {                      for my $keyword (@keywords) {
643                          if (length $keyword >= 4) {                          if (length $keyword >= 3) {
644                              $keywordString .= " $keyword";                              $keywordString .= " $keyword";
645                              if ($keyword =~ /-/) {                              if ($keyword =~ /-/) {
646                                  my @words = grep { length($_) >= 4 } split /-/, $keyword;                                  my @words = split /-/, $keyword;
647                                  $keywordString .= join(" ", "", @words);                                  $bustedString .= join(" ", "", @words);
648                              }                              }
649                          }                          }
650                      }                      }
651                        $keywordString .= $bustedString;
652                        # Get rid of annoying punctuation.
653                        $keywordString =~ s/[();]//g;
654                      # Clean the keyword list.                      # Clean the keyword list.
655                      my $cleanWords = $sprout->CleanKeywords($keywordString);                      my $cleanWords = $sprout->CleanKeywords($keywordString);
656                      Trace("Keyword string for $featureID: $cleanWords") if T(4);                      Trace("Keyword string for $featureID: $cleanWords") if T(4);
657                      # Create the feature record.                      # Create the feature record.
658                      $loadFeature->Put($featureID, 1, $type, $assignment, $cleanWords);                      $loadFeature->Put($featureID, 1, $user, $quality, $type, $assignment, $cleanWords);
659                      # This part is the roughest. We need to relate the features to contig                      # This part is the roughest. We need to relate the features to contig
660                      # locations, and the locations must be split so that none of them exceed                      # locations, and the locations must be split so that none of them exceed
661                      # the maximum segment size. This simplifies the genes_in_region processing                      # the maximum segment size. This simplifies the genes_in_region processing
# Line 776  Line 788 
788              # Get the subsystem object.              # Get the subsystem object.
789              my $sub = $fig->get_subsystem($subsysID);              my $sub = $fig->get_subsystem($subsysID);
790              # Only proceed if the subsystem has a spreadsheet.              # Only proceed if the subsystem has a spreadsheet.
791              if (! $sub->{empty_ss}) {              if (defined($sub) && ! $sub->{empty_ss}) {
792                  Trace("Creating subsystem $subsysID.") if T(3);                  Trace("Creating subsystem $subsysID.") if T(3);
793                  $loadSubsystem->Add("subsystemIn");                  $loadSubsystem->Add("subsystemIn");
794                  # Create the subsystem record.                  # Create the subsystem record.
# Line 786  Line 798 
798                  # Now for the classification string. This comes back as a list                  # Now for the classification string. This comes back as a list
799                  # reference and we convert it to a space-delimited string.                  # reference and we convert it to a space-delimited string.
800                  my $classList = $fig->subsystem_classification($subsysID);                  my $classList = $fig->subsystem_classification($subsysID);
801                  my $classString = join(" : ", grep { $_ } @$classList);                  my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
802                  $loadSubsystemClass->Put($subsysID, $classString);                  $loadSubsystemClass->Put($subsysID, $classString);
803                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
804                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
# Line 989  Line 1001 
1001          # Create a hash for storing property IDs.          # Create a hash for storing property IDs.
1002          my %propertyKeys = ();          my %propertyKeys = ();
1003          my $nextID = 1;          my $nextID = 1;
1004            # Get the attributes we intend to store in the property table.
1005            my @propKeys = $fig->get_group_keys("NMPDR");
1006          # Loop through the genomes.          # Loop through the genomes.
1007          for my $genomeID (sort keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
1008              $loadProperty->Add("genomeIn");              $loadProperty->Add("genomeIn");
1009              Trace("Generating properties for $genomeID.") if T(3);              Trace("Generating properties for $genomeID.") if T(3);
1010              # Get the genome's features. The feature ID is the first field in the              # Initialize a counter.
             # tuples returned by "all_features_detailed". We use "all_features_detailed"  
             # rather than "all_features" because we want all features regardless of type.  
             my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};  
             my $featureCount = 0;  
1011              my $propertyCount = 0;              my $propertyCount = 0;
1012              # Loop through the features, creating HasProperty records.              # Get the properties for this genome's features.
1013              for my $fid (@features) {              my @attributes = $fig->get_attributes("fig|$genomeID%", \@propKeys);
1014                  # Get all attributes for this feature. We do this one feature at a time              Trace("Property list built for $genomeID.") if T(3);
1015                  # to ensure we do not get any genome attributes.              # Loop through the results, creating HasProperty records.
1016                  my @attributeList = $fig->get_attributes($fid, '', '', '');              for my $attributeData (@attributes) {
1017                  # Add essentiality and virulence attributes.                  # Pull apart the attribute tuple.
1018                  if ($fig->essential($fid)) {                  my ($fid, $key, $value, $url) = @{$attributeData};
                     push @attributeList, [$fid, 'essential', 1, ''];  
                 }  
                 if ($fig->virulent($fid)) {  
                     push @attributeList, [$fid, 'virulent', 1, ''];  
                 }  
                 if (scalar @attributeList) {  
                     $featureCount++;  
                 }  
                 # Loop through the attributes.  
                 for my $tuple (@attributeList) {  
                     $propertyCount++;  
                     # Get this attribute value's data. Note that we throw away the FID,  
                     # since it will always be the same as the value of "$fid".  
                     my (undef, $key, $value, $url) = @{$tuple};  
1019                      # Concatenate the key and value and check the "propertyKeys" hash to                      # Concatenate the key and value and check the "propertyKeys" hash to
1020                      # see if we already have an ID for it. We use a tab for the separator                      # see if we already have an ID for it. We use a tab for the separator
1021                      # character.                      # character.
# Line 1037  Line 1033 
1033                      # Create the HasProperty entry for this feature/property association.                      # Create the HasProperty entry for this feature/property association.
1034                      $loadHasProperty->Put($fid, $propertyID, $url);                      $loadHasProperty->Put($fid, $propertyID, $url);
1035                  }                  }
             }  
1036              # Update the statistics.              # Update the statistics.
1037              Trace("$propertyCount attributes processed for $featureCount features.") if T(3);              Trace("$propertyCount attributes processed.") if T(3);
             $loadHasProperty->Add("featuresIn", $featureCount);  
1038              $loadHasProperty->Add("propertiesIn", $propertyCount);              $loadHasProperty->Add("propertiesIn", $propertyCount);
1039          }          }
1040      }      }
# Line 1613  Line 1607 
1607    
1608  The following relations are loaded by this method.  The following relations are loaded by this method.
1609    
     DrugProject  
     ContainsTopic  
     DrugTopic  
     ContainsAnalysisOf  
1610      PDB      PDB
1611      IncludesBound      DocksWith
1612      IsBoundIn      IsProteinForFeature
     BindsWith  
1613      Ligand      Ligand
     DescribesProteinForFeature  
     FeatureConservation  
1614    
1615  The source information for these relations is taken from flat files in the  The source information for these relations is taken from attributes. The
1616  C<$FIG_Config::drug_directory>. The file C<master_tables.list> contains  C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>.
1617  a list of drug project names paired with file names. The named file (in the  The C<zinc_name> attribute describes the ligands. The C<docking_results>
1618  same directory) contains all the data for the project.  attribute contains the information for the B<DocksWith> relationship. It is
1619    expected that additional attributes and tables will be added in the future.
1620    
1621  =over 4  =over 4
1622    
# Line 1648  Line 1636 
1636      # Get the genome hash.      # Get the genome hash.
1637      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1638      # Create load objects for the tables we're loading.      # Create load objects for the tables we're loading.
     my $loadDrugProject = $self->_TableLoader('DrugProject');  
     my $loadContainsTopic = $self->_TableLoader('ContainsTopic');  
     my $loadDrugTopic = $self->_TableLoader('DrugTopic');  
     my $loadContainsAnalysisOf = $self->_TableLoader('ContainsAnalysisOf');  
1639      my $loadPDB = $self->_TableLoader('PDB');      my $loadPDB = $self->_TableLoader('PDB');
     my $loadIncludesBound = $self->_TableLoader('IncludesBound');  
     my $loadIsBoundIn = $self->_TableLoader('IsBoundIn');  
     my $loadBindsWith = $self->_TableLoader('BindsWith');  
1640      my $loadLigand = $self->_TableLoader('Ligand');      my $loadLigand = $self->_TableLoader('Ligand');
1641      my $loadDescribesProteinForFeature = $self->_TableLoader('DescribesProteinForFeature');      my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature');
1642      my $loadFeatureConservation = $self->_TableLoader('FeatureConservation');      my $loadDocksWith = $self->_TableLoader('DocksWith');
1643      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1644          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1645      } else {      } else {
1646          Trace("Generating drug target data.") if T(2);          Trace("Generating drug target data.") if T(2);
1647          # Load the project list. The file comes in as a list of chomped lines,          # First comes the "DocksWith" relationship. This will give us a list of PDBs.
1648          # and we split them on the TAB character to make the project name the          # We can also encounter PDBs when we process "IsProteinForFeature". To manage
1649          # key and the file name the value of the resulting hash.          # this process, PDB information is collected in a hash table and then
1650          my %projects = map { split /\t/, $_ } Tracer::GetFile("$FIG_Config::drug_directory/master_tables.list");          # unspooled after both relationships are created.
1651          # Create hashes for the derived objects: PDBs, Features, and Ligands. These objects          my %pdbHash = ();
1652          # may occur multiple times in a single project file or even in multiple project          Trace("Generating docking data.") if T(2);
1653          # files.          # Get all the docking data. This may cause problems if there are too many PDBs,
1654          my %ligands = ();          # at which point we'll need another algorithm. The indicator that this is
1655          my %pdbs = ();          # happening will be a timeout error in the next statement.
1656          my %features = ();          my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
1657          my %bindings = ();                                                ['docking_results', $FIG_Config::dockLimit]);
1658          # Set up a counter for drug topics. This will be used as the key.          Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
1659          my $topicCounter = 0;          for my $dockData (@dockData) {
1660          # Loop through the projects. We sort the keys not because we need them sorted, but              # Get the docking data components.
1661          # because it makes it easier to infer our progress from trace messages.              my ($pdbID, $docking_key, @valueData) = @{$dockData};
1662          for my $project (sort keys %projects) {              # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
1663              Trace("Processing project $project.") if T(3);              $pdbID = lc $pdbID;
1664              # Only proceed if the download file exists.              # Strip off the object type.
1665              my $projectFile = "$FIG_Config::drug_directory/$projects{$project}";              $pdbID =~ s/pdb://;
1666              if (! -f $projectFile) {              # Extract the ZINC ID from the docking key. Note that there are two possible
1667                  Trace("Project file $projectFile not found.") if T(0);              # formats.
1668              } else {              my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/;
1669                  # Create the project record.              if (! $zinc_id) {
1670                  $loadDrugProject->Put($project);                  Trace("Invalid docking result key $docking_key for $pdbID.") if T(0);
1671                  # Create a hash for the topics. Each project has one or more topics. The                  $loadDocksWith->Add("errors");
1672                  # topic is identified by a URL, a category, and an identifier.              } else {
1673                  my %topics = ();                  # Get the pieces of the value and parse the energy.
1674                  # Now we can open the project file.                  # Note that we don't care about the rank, since
1675                  Trace("Reading project file $projectFile.") if T(3);                  # we can sort on the energy level itself in our database.
1676                  Open(\*PROJECT, "<$projectFile");                  my ($energy, $tool, $type) = @valueData;
1677                  # Get the first record, which is a list of column headers. We don't use this                  my ($rank, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
1678                  # for anything, but it may be useful for debugging.                  # Ignore predicted results.
1679                  my $headerLine = <PROJECT>;                  if ($type ne "Predicted") {
1680                  # Loop through the rest of the records.                      # Count this docking result.
1681                  while (! eof PROJECT) {                      if (! exists $pdbHash{$pdbID}) {
1682                      # Get the current line of data. Note that not all lines will have all                          $pdbHash{$pdbID} = 1;
1683                      # the fields. In particular, the CLIBE data is fairly rare.                      } else {
1684                      my ($authorOrganism, $category, $tag, $refURL, $peg, $conservation,                          $pdbHash{$pdbID}++;
1685                          $pdbBound, $pdbBoundEval, $pdbFree, $pdbFreeEval, $pdbFreeTitle,                      }
1686                          $protDistInfo, $passAspInfo, $passAspFile, $passWeightInfo,                      # Write the result to the output.
1687                          $passWeightFile, $clibeInfo, $clibeURL, $clibeTotalEnergy,                      $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool,
1688                          $clibeVanderwaals, $clibeHBonds, $clibeEI, $clibeSolvationE)                                          $total, $vanderwaals);
1689                         = Tracer::GetLine(\*PROJECT);                  }
1690                      # The tag contains an identifier for the current line of data followed              }
1691                      # by a text statement that generally matches a property name in the          }
1692                      # main database. We split it up, since the identifier goes with          Trace("Connecting features.") if T(2);
1693                      # the PDB data and the text statement is part of the topic.          # Loop through the genomes.
1694                      my ($lineID, $topicTag) = split /\s*,\s*/, $tag;          for my $genome (sort keys %{$genomeHash}) {
1695                      $loadDrugProject->Add("data line");              Trace("Generating PDBs for $genome.") if T(3);
1696                      # Check for a new topic.              # Get all of the PDBs that BLAST against this genome's features.
1697                      my $topicData = "$category\t$topicTag\t$refURL";              my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%');
1698                      if (! exists $topics{$topicData}) {              for my $pdbData (@attributeData) {
1699                          # Here we have a new topic. Compute its ID.                  # The PDB ID is coded as a subkey.
1700                          $topicCounter++;                  if ($pdbData->[1] !~ /PDB::(.+)/i) {
1701                          $topics{$topicData} = $topicCounter;                      Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0);
1702                          # Create its database record.                      $loadPDB->Add("errors");
1703                          $loadDrugTopic->Put($topicCounter, $refURL, $category, $authorOrganism,                  } else {
1704                                              $topicTag);                      my $pdbID = $1;
1705                          # Connect it to the project.                      # Ensure the PDB is in the hash.
1706                          $loadContainsTopic->Put($project, $topicCounter);                      if (! exists $pdbHash{$pdbID}) {
1707                          $loadDrugTopic->Add("topic");                          $pdbHash{$pdbID} = 0;
1708                      }                      }
1709                      # Now we know the topic ID exists in the hash and the topic will                      # The score and locations are coded in the attribute value.
1710                      # appear in the database, so we get this topic's ID.                      if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
1711                      my $topicID = $topics{$topicData};                          Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0);
1712                      # If the feature in this line is new, we need to save its conservation                          $loadIsProteinForFeature->Add("errors");
1713                      # number.                      } else {
1714                      if (! exists $features{$peg}) {                          my ($score, $locData) = ($1,$2);
1715                          $loadFeatureConservation->Put($peg, $conservation);                          # The location data may not be present, so we have to start with some
1716                          $features{$peg} = 1;                          # defaults and then check.
1717                      }                          my ($start, $end) = (1, 0);
1718                      # Now we have two PDBs to deal with-- a bound PDB and a free PDB.                          if ($locData) {
1719                      # The free PDB will have data about docking points; the bound PDB                              $locData =~ /(\d+)-(\d+)/;
1720                      # will have data about docking. We store both types as PDBs, and                              $start = $1;
1721                      # the special data comes from relationships. First we process the                              $end = $2;
1722                      # bound PDB.                          }
1723                      if ($pdbBound) {                          # If we still don't have the end location, compute it from
1724                          $loadPDB->Add("bound line");                          # the feature length.
1725                          # Ensure this PDB is in the database.                              # Most features have one location, but we do a list iteration
1726                          $self->CreatePDB($pdbBound, lc "$pdbFreeTitle (bound)", "bound", \%pdbs, $loadPDB);                              # Most features have one location, but we do a list iteration
1727                          # Connect it to this topic.                              # just in case.
1728                          $loadIncludesBound->Put($topicID, $pdbBound);                              my @locations = $fig->feature_location($pdbData->[0]);
1729                          # Check for CLIBE data.                              $end = 0;
1730                          if ($clibeInfo) {                              for my $loc (@locations) {
1731                              $loadLigand->Add("clibes");                                  my $locObject = BasicLocation->new($loc);
1732                              # We have CLIBE data, so we create a ligand and relate it to the PDB.                                  $end += $locObject->Length;
1733                              if (! exists $ligands{$clibeInfo}) {                              }
1734                                  # This is a new ligand, so create its record.                          }
1735                                  $loadLigand->Put($clibeInfo);                          # Decode the score.
1736                                  $loadLigand->Add("ligand");                          my $realScore = FIGRules::DecodeScore($score);
1737                                  # Make sure we know this ligand already exists.                          # Connect the PDB to the feature.
1738                                  $ligands{$clibeInfo} = 1;                          $loadIsProteinForFeature->Put($pdbData->[0], $pdbID, $start, $realScore, $end);
                             }  
                             # Now connect the PDB to the ligand using the CLIBE data.  
                             $loadBindsWith->Put($pdbBound, $clibeInfo, $clibeURL, $clibeHBonds, $clibeEI,  
                                                 $clibeSolvationE, $clibeVanderwaals);  
                         }  
                         # Connect this PDB to the feature.  
                         $loadDescribesProteinForFeature->Put($pdbBound, $peg, $protDistInfo, $pdbBoundEval);  
                     }  
                     # Next is the free PDB.  
                     if ($pdbFree) {  
                         $loadPDB->Add("free line");  
                         # Insure this PDB is in the database.  
                         $self->CreatePDB($pdbFree, lc $pdbFreeTitle, "free", \%pdbs, $loadPDB);  
                         # Connect it to this topic.  
                         $loadContainsAnalysisOf->Put($topicID, $pdbFree, $passAspInfo,  
                                                      $passWeightFile, $passWeightInfo, $passAspFile);  
                         # Connect this PDB to the feature.  
                         $loadDescribesProteinForFeature->Put($pdbFree, $peg, $protDistInfo, $pdbFreeEval);  
                     }  
                     # If we have both PDBs, we may need to link them.  
                     if ($pdbFree && $pdbBound) {  
                         $loadIsBoundIn->Add("connection");  
                         # Insure we only link them once.  
                         my $bindingKey =  "$pdbFree\t$pdbBound";  
                         if (! exists $bindings{$bindingKey}) {  
                             $loadIsBoundIn->Add("newConnection");  
                             $loadIsBoundIn->Put($pdbFree, $pdbBound);  
                             $bindings{$bindingKey} = 1;  
1739                          }                          }
1740                      }                      }
1741                  }                  }
1742                  # Close off this project.          }
1743                  close PROJECT;          # We've got all our PDBs now, so we unspool them from the hash.
1744            Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2);
1745            my $count = 0;
1746            for my $pdbID (sort keys %pdbHash) {
1747                $loadPDB->Put($pdbID, $pdbHash{$pdbID});
1748                $count++;
1749                Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0);
1750            }
1751            # Finally we create the ligand table. This information can be found in the
1752            # zinc_name attribute.
1753            Trace("Loading ligands.") if T(2);
1754            # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
1755            my $last_zinc_id = "";
1756            my $zinc_id = "";
1757            my $done = 0;
1758            while (! $done) {
1759                # Get the next 10000 ligands. We insist that the object ID is greater than
1760                # the last ID we processed.
1761                Trace("Loading batch starting with ZINC:$zinc_id.") if T(3);
1762                my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
1763                                                           ["ZINC:$zinc_id", "zinc_name"]);
1764                Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
1765                if (! @attributeData) {
1766                    # Here there are no attributes left, so we quit the loop.
1767                    $done = 1;
1768                } else {
1769                    # Process the attribute data we've received.
1770                    for my $zinc_data (@attributeData) {
1771                        # The ZINC ID is found in the first return column, prefixed with the word ZINC.
1772                        if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
1773                            $zinc_id = $1;
1774                            # Check for a duplicate.
1775                            if ($zinc_id eq $last_zinc_id) {
1776                                $loadLigand->Add("duplicate");
1777                            } else {
1778                                # Here it's safe to output the ligand. The ligand name is the attribute value
1779                                # (third column in the row).
1780                                $loadLigand->Put($zinc_id, $zinc_data->[2]);
1781                                # Insure we don't try to add this ID again.
1782                                $last_zinc_id = $zinc_id;
1783                            }
1784                        } else {
1785                            Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
1786                            $loadLigand->Add("errors");
1787              }              }
1788          }          }
1789      }      }
1790            }
1791            Trace("Ligands loaded.") if T(2);
1792        }
1793      # Finish the load.      # Finish the load.
1794      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1795      return $retVal;      return $retVal;
# Line 1801  Line 1800 
1800    
1801  =head3 SpecialAttribute  =head3 SpecialAttribute
1802    
1803  C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, $idxValue, $pattern, $loader); >>  C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >>
1804    
1805  Look for special attributes of a given type. A special attribute is found by comparing one of  Look for special attributes of a given type. A special attribute is found by comparing one of
1806  the columns of the incoming attribute list to a search pattern. If a match is found, then  the columns of the incoming attribute list to a search pattern. If a match is found, then
1807  another column is put into an output table connected to the specified ID.  a set of columns is put into an output table connected to the specified ID.
1808    
1809  For example, when processing features, the attribute list we look at has three columns: attribute  For example, when processing features, the attribute list we look at has three columns: attribute
1810  name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name  name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
1811  begins with C<iedb_>. The call signature is therefore  begins with C<iedb_>. The call signature is therefore
1812    
1813      my $found = SpecialAttribute($fid, \@attributeList, 0, 2, '^iedb_', $loadFeatureIEDB);      my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB);
1814    
1815  The pattern is matched against column 0, and if we have a match, then column 2's value is put  The pattern is matched against column 0, and if we have a match, then column 2's value is put
1816  to the output along with the specified feature ID.  to the output along with the specified feature ID.
# Line 1832  Line 1831 
1831  Index in each tuple of the column to be matched against the pattern. If the match is  Index in each tuple of the column to be matched against the pattern. If the match is
1832  successful, an output record will be generated.  successful, an output record will be generated.
1833    
1834  =item idxValue  =item idxValues
1835    
1836  Index in each tuple of the column to be put as the second column of the output.  Reference to a list containing the indexes in each tuple of the columns to be put as
1837    the second column of the output.
1838    
1839  =item pattern  =item pattern
1840    
# Line 1857  Line 1857 
1857    
1858  sub SpecialAttribute {  sub SpecialAttribute {
1859      # Get the parameters.      # Get the parameters.
1860      my ($id, $attributes, $idxMatch, $idxValue, $pattern, $loader) = @_;      my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_;
1861      # Declare the return variable.      # Declare the return variable.
1862      my $retVal = 0;      my $retVal = 0;
1863      # Loop through the attribute rows.      # Loop through the attribute rows.
1864      for my $row (@{$attributes}) {      for my $row (@{$attributes}) {
1865          # Check for a match.          # Check for a match.
1866          if ($row->[$idxMatch] =~ m/$pattern/i) {          if ($row->[$idxMatch] =~ m/$pattern/i) {
1867              # We have a match, so output a row.              # We have a match, so output a row. This is a bit tricky, since we may
1868              $loader->Put($id, $row->[$idxValue]);              # be putting out multiple columns of data from the input.
1869                my $value = join(" ", map { $row->[$_] } @{$idxValues});
1870                $loader->Put($id, $value);
1871              $retVal++;              $retVal++;
1872          }          }
1873      }      }
# Line 1874  Line 1876 
1876      return $retVal;      return $retVal;
1877  }  }
1878    
 =head3 CreatePDB  
   
 C<< $loader->CreatePDB($pdbID, $title, $type, \%pdbHash); >>  
   
 Insure that a PDB record exists for the identified PDB. If one does not exist, it will be  
 created.  
   
 =over 4  
   
 =item pdbID  
   
 ID string (usually an unqualified file name) for the desired PDB.  
   
 =item title  
   
 Title to use if the PDB must be created.  
   
 =item type  
   
 Type of PDB: C<free> or C<bound>  
   
 =item pdbHash  
   
 Hash containing the IDs of PDBs that have already been created.  
   
 =item pdbLoader  
   
 Load object for the PDB table.  
   
 =back  
   
 =cut  
   
 sub CreatePDB {  
     # Get the parameters.  
     my ($self, $pdbID, $title, $type, $pdbHash, $pdbLoader) = @_;  
     $pdbLoader->Add("PDB check");  
     # Check to see if this is a new PDB.  
     if (! exists $pdbHash->{$pdbID}) {  
         # It is, so we create it.  
         $pdbLoader->Put($pdbID, $title, $type);  
         $pdbHash->{$pdbID} = 1;  
         # Count it.  
         $pdbLoader->Add("PDB-$type");  
     }  
 }  
   
1879  =head3 TableLoader  =head3 TableLoader
1880    
1881  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 2024  Line 1979 
1979      return $retVal;      return $retVal;
1980  }  }
1981    
1982    =head3 GetGenomeAttributes
1983    
1984    C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids); >>
1985    
1986    Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
1987    attributes for all the features of a genome in a single call, then organizes them into
1988    a hash.
1989    
1990    =over 4
1991    
1992    =item fig
1993    
1994    FIG-like object for accessing attributes.
1995    
1996    =item genomeID
1997    
1998    ID of the genome whose attributes are desired.
1999    
2000    =item fids
2001    
2002    Reference to a list of the feature IDs whose attributes are to be kept.
2003    
2004    =item RETURN
2005    
2006    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
2007    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
2008    the attribute key, and one or more attribute values.
2009    
2010    =back
2011    
2012    =cut
2013    
2014    sub GetGenomeAttributes {
2015        # Get the parameters.
2016        my ($fig, $genomeID, $fids) = @_;
2017        # Declare the return variable.
2018        my $retVal = {};
2019        # Get a list of the attributes we care about.
2020        my @propKeys = $fig->get_group_keys("NMPDR");
2021        # Get the attributes.
2022        my @aList = $fig->get_attributes("fig|$genomeID%", \@propKeys);
2023        # Initialize the hash. This not only enables us to easily determine which FIDs to
2024        # keep, it insures that the caller sees a list reference for every known fid,
2025        # simplifying the logic.
2026        for my $fid (@{$fids}) {
2027            $retVal->{$fid} = [];
2028        }
2029        # Populate the hash.
2030        for my $aListEntry (@aList) {
2031            my $fid = $aListEntry->[0];
2032            if (exists $retVal->{$fid}) {
2033                push @{$retVal->{$fid}}, $aListEntry;
2034            }
2035        }
2036        # Return the result.
2037        return $retVal;
2038    }
2039    
2040  1;  1;

Legend:
Removed from v.1.76  
changed lines
  Added in v.1.84

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3