[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.82, Tue Apr 10 06:15:35 2007 UTC revision 1.83, Fri May 11 06:37:38 2007 UTC
# Line 788  Line 788 
788              # Get the subsystem object.              # Get the subsystem object.
789              my $sub = $fig->get_subsystem($subsysID);              my $sub = $fig->get_subsystem($subsysID);
790              # Only proceed if the subsystem has a spreadsheet.              # Only proceed if the subsystem has a spreadsheet.
791              if (! $sub->{empty_ss}) {              if (defined($sub) && ! $sub->{empty_ss}) {
792                  Trace("Creating subsystem $subsysID.") if T(3);                  Trace("Creating subsystem $subsysID.") if T(3);
793                  $loadSubsystem->Add("subsystemIn");                  $loadSubsystem->Add("subsystemIn");
794                  # Create the subsystem record.                  # Create the subsystem record.
# Line 1001  Line 1001 
1001          # Create a hash for storing property IDs.          # Create a hash for storing property IDs.
1002          my %propertyKeys = ();          my %propertyKeys = ();
1003          my $nextID = 1;          my $nextID = 1;
1004            # Get the attributes we intend to store in the property table.
1005            my @propKeys = $fig->get_group_keys("NMPDR");
1006          # Loop through the genomes.          # Loop through the genomes.
1007          for my $genomeID (sort keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
1008              $loadProperty->Add("genomeIn");              $loadProperty->Add("genomeIn");
1009              Trace("Generating properties for $genomeID.") if T(3);              Trace("Generating properties for $genomeID.") if T(3);
1010              # Get the genome's features. The feature ID is the first field in the              # Initialize a counter.
             # tuples returned by "all_features_detailed". We use "all_features_detailed"  
             # rather than "all_features" because we want all features regardless of type.  
             my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};  
             my $featureCount = 0;  
1011              my $propertyCount = 0;              my $propertyCount = 0;
1012              # Get the properties for this genome's features.              # Get the properties for this genome's features.
1013              my $attributes = GetGenomeAttributes($fig, $genomeID, \@features);              my @attributes = $fig->get_attributes("fig|$genomeID%", \@propKeys);
1014              Trace("Property hash built for $genomeID.") if T(3);              Trace("Property list built for $genomeID.") if T(3);
1015              # Loop through the features, creating HasProperty records.              # Loop through the results, creating HasProperty records.
1016              for my $fid (@features) {              for my $attributeData (@attributes) {
1017                  # Get all attributes for this feature. We do this one feature at a time                  # Pull apart the attribute tuple.
1018                  # to insure we do not get any genome attributes.                  my ($fid, $key, $value, $url) = @{$attributeData};
                 my @attributeList = @{$attributes->{$fid}};  
                 if (scalar @attributeList) {  
                     $featureCount++;  
                 }  
                 # Loop through the attributes.  
                 for my $tuple (@attributeList) {  
                     $propertyCount++;  
                     # Get this attribute value's data. Note that we throw away the FID,  
                     # since it will always be the same as the value if "$fid".  
                     my (undef, $key, $value, $url) = @{$tuple};  
1019                      # Concatenate the key and value and check the "propertyKeys" hash to                      # Concatenate the key and value and check the "propertyKeys" hash to
1020                      # see if we already have an ID for it. We use a tab for the separator                      # see if we already have an ID for it. We use a tab for the separator
1021                      # character.                      # character.
# Line 1045  Line 1033 
1033                      # Create the HasProperty entry for this feature/property association.                      # Create the HasProperty entry for this feature/property association.
1034                      $loadHasProperty->Put($fid, $propertyID, $url);                      $loadHasProperty->Put($fid, $propertyID, $url);
1035                  }                  }
             }  
1036              # Update the statistics.              # Update the statistics.
1037              Trace("$propertyCount attributes processed for $featureCount features.") if T(3);              Trace("$propertyCount attributes processed.") if T(3);
             $loadHasProperty->Add("featuresIn", $featureCount);  
1038              $loadHasProperty->Add("propertiesIn", $propertyCount);              $loadHasProperty->Add("propertiesIn", $propertyCount);
1039          }          }
1040      }      }
# Line 1621  Line 1607 
1607    
1608  The following relations are loaded by this method.  The following relations are loaded by this method.
1609    
     DrugProject  
     ContainsTopic  
     DrugTopic  
     ContainsAnalysisOf  
1610      PDB      PDB
1611      IncludesBound      DocksWith
1612      IsBoundIn      IsProteinForFeature
     BindsWith  
1613      Ligand      Ligand
     DescribesProteinForFeature  
     FeatureConservation  
1614    
1615  The source information for these relations is taken from flat files in the  The source information for these relations is taken from attributes. The
1616  C<$FIG_Config::drug_directory>. The file C<master_tables.list> contains  C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>.
1617  a list of drug project names paired with file names. The named file (in the  The C<zinc_name> attribute describes the ligands. The C<docking_results>
1618  same directory) contains all the data for the project.  attribute contains the information for the B<DocksWith> relationship. It is
1619    expected that additional attributes and tables will be added in the future.
1620    
1621  =over 4  =over 4
1622    
# Line 1656  Line 1636 
1636      # Get the genome hash.      # Get the genome hash.
1637      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1638      # Create load objects for the tables we're loading.      # Create load objects for the tables we're loading.
     my $loadDrugProject = $self->_TableLoader('DrugProject');  
     my $loadContainsTopic = $self->_TableLoader('ContainsTopic');  
     my $loadDrugTopic = $self->_TableLoader('DrugTopic');  
     my $loadContainsAnalysisOf = $self->_TableLoader('ContainsAnalysisOf');  
1639      my $loadPDB = $self->_TableLoader('PDB');      my $loadPDB = $self->_TableLoader('PDB');
     my $loadIncludesBound = $self->_TableLoader('IncludesBound');  
     my $loadIsBoundIn = $self->_TableLoader('IsBoundIn');  
     my $loadBindsWith = $self->_TableLoader('BindsWith');  
1640      my $loadLigand = $self->_TableLoader('Ligand');      my $loadLigand = $self->_TableLoader('Ligand');
1641      my $loadDescribesProteinForFeature = $self->_TableLoader('DescribesProteinForFeature');      my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature');
1642      my $loadFeatureConservation = $self->_TableLoader('FeatureConservation');      my $loadDocksWith = $self->_TableLoader('DocksWith');
1643      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1644          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1645      } else {      } else {
1646          Trace("Generating drug target data.") if T(2);          Trace("Generating drug target data.") if T(2);
1647          # Load the project list. The file comes in as a list of chomped lines,          # First comes the "DocksWith" relationship. This will give us a list of PDBs.
1648          # and we split them on the TAB character to make the project name the          # We can also encounter PDBs when we process "IsProteinForFeature". To manage
1649          # key and the file name the value of the resulting hash.          # this process, PDB information is collected in a hash table and then
1650          my %projects = map { split /\t/, $_ } Tracer::GetFile("$FIG_Config::drug_directory/master_tables.list");          # unspooled after both relationships are created.
1651          # Create hashes for the derived objects: PDBs, Features, and Ligands. These objects          my %pdbHash = ();
1652          # may occur multiple times in a single project file or even in multiple project          Trace("Generating docking data.") if T(2);
1653          # files.          # Get all the docking data. This may cause problems if there are too many PDBs,
1654          my %ligands = ();          # at which point we'll need another algorithm. The indicator that this is
1655          my %pdbs = ();          # happening will be a timeout error in the next statement.
1656          my %features = ();          my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
1657          my %bindings = ();                                                ['docking_results', $FIG_Config::dockLimit]);
1658          # Set up a counter for drug topics. This will be used as the key.          Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
1659          my $topicCounter = 0;          for my $dockData (@dockData) {
1660          # Loop through the projects. We sort the keys not because we need them sorted, but              # Get the docking data components.
1661          # because it makes it easier to infer our progress from trace messages.              my ($pdbID, $docking_key, @valueData) = @{$dockData};
1662          for my $project (sort keys %projects) {              # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
1663              Trace("Processing project $project.") if T(3);              $pdbID = lc $pdbID;
1664              # Only proceed if the download file exists.              # Extract the ZINC ID from the docking key. Note that there are two possible
1665              my $projectFile = "$FIG_Config::drug_directory/$projects{$project}";              # formats.
1666              if (! -f $projectFile) {              my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/;
1667                  Trace("Project file $projectFile not found.") if T(0);              if (! $zinc_id) {
1668                    Trace("Invalid docking result key $docking_key for $pdbID.") if T(0);
1669                    $loadDocksWith->Add("errors");
1670                } else {
1671                    # Count this docking result.
1672                    if (! exists $pdbHash{$pdbID}) {
1673                        $pdbHash{$pdbID} = 1;
1674              } else {              } else {
1675                  # Create the project record.                      $pdbHash{$pdbID}++;
                 $loadDrugProject->Put($project);  
                 # Create a hash for the topics. Each project has one or more topics. The  
                 # topic is identified by a URL, a category, and an identifier.  
                 my %topics = ();  
                 # Now we can open the project file.  
                 Trace("Reading project file $projectFile.") if T(3);  
                 Open(\*PROJECT, "<$projectFile");  
                 # Get the first record, which is a list of column headers. We don't use this  
                 # for anything, but it may be useful for debugging.  
                 my $headerLine = <PROJECT>;  
                 # Loop through the rest of the records.  
                 while (! eof PROJECT) {  
                     # Get the current line of data. Note that not all lines will have all  
                     # the fields. In particular, the CLIBE data is fairly rare.  
                     my ($authorOrganism, $category, $tag, $refURL, $peg, $conservation,  
                         $pdbBound, $pdbBoundEval, $pdbFree, $pdbFreeEval, $pdbFreeTitle,  
                         $protDistInfo, $passAspInfo, $passAspFile, $passWeightInfo,  
                         $passWeightFile, $clibeInfo, $clibeURL, $clibeTotalEnergy,  
                         $clibeVanderwaals, $clibeHBonds, $clibeEI, $clibeSolvationE)  
                        = Tracer::GetLine(\*PROJECT);  
                     # The tag contains an identifier for the current line of data followed  
                     # by a text statement that generally matches a property name in the  
                     # main database. We split it up, since the identifier goes with  
                     # the PDB data and the text statement is part of the topic.  
                     my ($lineID, $topicTag) = split /\s*,\s*/, $tag;  
                     $loadDrugProject->Add("data line");  
                     # Check for a new topic.  
                     my $topicData = "$category\t$topicTag\t$refURL";  
                     if (! exists $topics{$topicData}) {  
                         # Here we have a new topic. Compute its ID.  
                         $topicCounter++;  
                         $topics{$topicData} = $topicCounter;  
                         # Create its database record.  
                         $loadDrugTopic->Put($topicCounter, $refURL, $category, $authorOrganism,  
                                             $topicTag);  
                         # Connect it to the project.  
                         $loadContainsTopic->Put($project, $topicCounter);  
                         $loadDrugTopic->Add("topic");  
                     }  
                     # Now we know the topic ID exists in the hash and the topic will  
                     # appear in the database, so we get this topic's ID.  
                     my $topicID = $topics{$topicData};  
                     # If the feature in this line is new, we need to save its conservation  
                     # number.  
                     if (! exists $features{$peg}) {  
                         $loadFeatureConservation->Put($peg, $conservation);  
                         $features{$peg} = 1;  
                     }  
                     # Now we have two PDBs to deal with-- a bound PDB and a free PDB.  
                     # The free PDB will have data about docking points; the bound PDB  
                     # will have data about docking. We store both types as PDBs, and  
                     # the special data comes from relationships. First we process the  
                     # bound PDB.  
                     if ($pdbBound) {  
                         $loadPDB->Add("bound line");  
                         # Insure this PDB is in the database.  
                         $self->CreatePDB($pdbBound, lc "$pdbFreeTitle (bound)", "bound", \%pdbs, $loadPDB);  
                         # Connect it to this topic.  
                         $loadIncludesBound->Put($topicID, $pdbBound);  
                         # Check for CLIBE data.  
                         if ($clibeInfo) {  
                             $loadLigand->Add("clibes");  
                             # We have CLIBE data, so we create a ligand and relate it to the PDB.  
                             if (! exists $ligands{$clibeInfo}) {  
                                 # This is a new ligand, so create its record.  
                                 $loadLigand->Put($clibeInfo);  
                                 $loadLigand->Add("ligand");  
                                 # Make sure we know this ligand already exists.  
                                 $ligands{$clibeInfo} = 1;  
                             }  
                             # Now connect the PDB to the ligand using the CLIBE data.  
                             $loadBindsWith->Put($pdbBound, $clibeInfo, $clibeURL, $clibeHBonds, $clibeEI,  
                                                 $clibeSolvationE, $clibeVanderwaals);  
                         }  
                         # Connect this PDB to the feature.  
                         $loadDescribesProteinForFeature->Put($pdbBound, $peg, $protDistInfo, $pdbBoundEval);  
                     }  
                     # Next is the free PDB.  
                     if ($pdbFree) {  
                         $loadPDB->Add("free line");  
                         # Insure this PDB is in the database.  
                         $self->CreatePDB($pdbFree, lc $pdbFreeTitle, "free", \%pdbs, $loadPDB);  
                         # Connect it to this topic.  
                         $loadContainsAnalysisOf->Put($topicID, $pdbFree, $passAspInfo,  
                                                      $passWeightFile, $passWeightInfo, $passAspFile);  
                         # Connect this PDB to the feature.  
                         $loadDescribesProteinForFeature->Put($pdbFree, $peg, $protDistInfo, $pdbFreeEval);  
                     }  
                     # If we have both PDBs, we may need to link them.  
                     if ($pdbFree && $pdbBound) {  
                         $loadIsBoundIn->Add("connection");  
                         # Insure we only link them once.  
                         my $bindingKey =  "$pdbFree\t$pdbBound";  
                         if (! exists $bindings{$bindingKey}) {  
                             $loadIsBoundIn->Add("newConnection");  
                             $loadIsBoundIn->Put($pdbFree, $pdbBound);  
                             $bindings{$bindingKey} = 1;  
1676                          }                          }
1677                    # Get the pieces of the value and parse the energy.
1678                    # Note that we don't care about the rank, since
1679                    # we can sort on the energy level itself in our database.
1680                    my ($energy, $tool, $type) = @valueData;
1681                    my ($rank, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
1682                    # Write the results to the output.
1683                    $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool,
1684                                        $total, $vanderwaals);
1685                      }                      }
1686                  }                  }
1687                  # Close off this project.          Trace("Connecting features.") if T(2);
1688                  close PROJECT;          # Loop through the genomes.
1689            for my $genome (sort keys %{$genomeHash}) {
1690                Trace("Generating PDBs for $genome.") if T(3);
1691                # Get all of the PDBs that BLAST against this genome's features.
1692                my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%');
1693                for my $pdbData (@attributeData) {
1694                    # The PDB ID is coded as a subkey.
1695                    if ($pdbData->[1] !~ /PDB::(.+)/) {
1696                        Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0);
1697                        $loadPDB->Add("errors");
1698                    } else {
1699                        my $pdbID = $1;
1700                        # Insure the PDB is in the hash.
1701                        if (! exists $pdbHash{$pdbID}) {
1702                            $pdbHash{$pdbID} = 0;
1703                        }
1704                        # The score and locations are coded in the attribute value.
1705                        if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
1706                            Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0);
1707                            $loadIsProteinForFeature->Add("errors");
1708                        } else {
1709                            my ($score, $locData) = ($1,$2);
1710                            # The location data may not be present, so we have to start with some
1711                            # defaults and then check.
1712                            my ($start, $end) = (1, 0);
1713                            if ($locData) {
1714                                $locData =~ /(\d+)-(\d+)/;
1715                                $start = $1;
1716                                $end = $2;
1717                            }
1718                            # If we still don't have the end location, compute it from
1719                            # the feature length.
1720                            if (! $end) {
1721                                # Most features have one location, but we do a list iteration
1722                                # just in case.
1723                                my @locations = $fig->feature_location($pdbData->[0]);
1724                                $end = 0;
1725                                for my $loc (@locations) {
1726                                    my $locObject = BasicLocation->new($loc);
1727                                    $end += $locObject->Length;
1728                                }
1729                            }
1730                            # Decode the score.
1731                            my $realScore = FIGRules::DecodeScore($score);
1732                            # Connect the PDB to the feature.
1733                            $loadIsProteinForFeature->Put($pdbData->[0], $pdbID, $start, $realScore, $end);
1734                        }
1735              }              }
1736          }          }
1737      }      }
1738            # We've got all our PDBs now, so we unspool them from the hash.
1739            Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2);
1740            my $count = 0;
1741            for my $pdbID (sort keys %pdbHash) {
1742                $loadPDB->Put($pdbID, $pdbHash{$pdbID});
1743                $count++;
1744                Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0);
1745            }
1746            # Finally we create the ligand table. This information can be found in the
1747            # zinc_name attribute.
1748            Trace("Loading ligands.") if T(2);
1749            # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
1750            my $last_zinc_id = "";
1751            my $zinc_id = "";
1752            my $done = 0;
1753            while (! $done) {
1754                # Get the next 10000 ligands. We insist that the object ID is greater than
1755                # the last ID we processed.
1756                Trace("Loading batch starting with ZINC:$zinc_id.") if T(3);
1757                my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
1758                                                           ["ZINC:$zinc_id", "zinc_name"]);
1759                Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
1760                if (! @attributeData) {
1761                    # Here there are no attributes left, so we quit the loop.
1762                    $done = 1;
1763                } else {
1764                    # Process the attribute data we've received.
1765                    for my $zinc_data (@attributeData) {
1766                        # The ZINC ID is found in the first return column, prefixed with the word ZINC.
1767                        if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
1768                            $zinc_id = $1;
1769                            # Check for a duplicate.
1770                            if ($zinc_id eq $last_zinc_id) {
1771                                $loadLigand->Add("duplicate");
1772                            } else {
1773                                # Here it's safe to output the ligand. The ligand name is the attribute value
1774                                # (third column in the row).
1775                                $loadLigand->Put($zinc_id, $zinc_data->[2]);
1776                                # Insure we don't try to add this ID again.
1777                                $last_zinc_id = $zinc_id;
1778                            }
1779                        } else {
1780                            Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
1781                            $loadLigand->Add("errors");
1782                        }
1783                    }
1784                }
1785            }
1786            Trace("Ligands loaded.") if T(2);
1787        }
1788      # Finish the load.      # Finish the load.
1789      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1790      return $retVal;      return $retVal;
# Line 1885  Line 1871 
1871      return $retVal;      return $retVal;
1872  }  }
1873    
 =head3 CreatePDB  
   
 C<< $loader->CreatePDB($pdbID, $title, $type, \%pdbHash); >>  
   
 Insure that a PDB record exists for the identified PDB. If one does not exist, it will be  
 created.  
   
 =over 4  
   
 =item pdbID  
   
 ID string (usually an unqualified file name) for the desired PDB.  
   
 =item title  
   
 Title to use if the PDB must be created.  
   
 =item type  
   
 Type of PDB: C<free> or C<bound>  
   
 =item pdbHash  
   
 Hash containing the IDs of PDBs that have already been created.  
   
 =item pdbLoader  
   
 Load object for the PDB table.  
   
 =back  
   
 =cut  
   
 sub CreatePDB {  
     # Get the parameters.  
     my ($self, $pdbID, $title, $type, $pdbHash, $pdbLoader) = @_;  
     $pdbLoader->Add("PDB check");  
     # Check to see if this is a new PDB.  
     if (! exists $pdbHash->{$pdbID}) {  
         # It is, so we create it.  
         $pdbLoader->Put($pdbID, $title, $type);  
         $pdbHash->{$pdbID} = 1;  
         # Count it.  
         $pdbLoader->Add("PDB-$type");  
     }  
 }  
   
1874  =head3 TableLoader  =head3 TableLoader
1875    
1876  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 2034  Line 1973 
1973      # Return the load statistics.      # Return the load statistics.
1974      return $retVal;      return $retVal;
1975  }  }
1976    
1977  =head3 GetGenomeAttributes  =head3 GetGenomeAttributes
1978    
1979  C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids); >>  C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids); >>
1980    
1981  Return a hash of attributes keyed on feature ID. This method gets all the attributes  Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
1982  for all the features of a genome in a single call, then organizes them into a hash.  attributes for all the features of a genome in a single call, then organizes them into
1983    a hash.
1984    
1985  =over 4  =over 4
1986    
# Line 2070  Line 2011 
2011      my ($fig, $genomeID, $fids) = @_;      my ($fig, $genomeID, $fids) = @_;
2012      # Declare the return variable.      # Declare the return variable.
2013      my $retVal = {};      my $retVal = {};
2014        # Get a list of the attributes we care about.
2015        my @propKeys = $fig->get_group_keys("NMPDR");
2016      # Get the attributes.      # Get the attributes.
2017      my @aList = $fig->get_attributes("fig|$genomeID%");      my @aList = $fig->get_attributes("fig|$genomeID%", \@propKeys);
2018      # Initialize the hash. This not only enables us to easily determine which FIDs to      # Initialize the hash. This not only enables us to easily determine which FIDs to
2019      # keep, it insures that the caller sees a list reference for every known fid,      # keep, it insures that the caller sees a list reference for every known fid,
2020      # simplifying the logic.      # simplifying the logic.

Legend:
Removed from v.1.82  
changed lines
  Added in v.1.83

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3