[Bio] / Sprout / DrugSproutLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/DrugSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package DrugSproutLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use base 'BaseSproutLoader';
26 :    
27 :     =head1 Sprout Drug Load Group Class
28 :    
29 :     =head2 Introduction
30 :    
31 :     The Drug Load Group includes all of the major drug target tables.
32 :    
33 :     =head3 new
34 :    
35 :     my $sl = DrugSproutLoader->new($erdb, $source, $options);
36 :    
37 :     Construct a new DrugSproutLoader object.
38 :    
39 :     =over 4
40 :    
41 :     =item erdb
42 :    
43 :     [[SproutPm]] object for the database being loaded.
44 :    
45 :     =item source
46 :    
47 :     [[FigPm]] object used to access the source data. If this parameter is undefined,
48 :     it will be created the first time the L</source> method is called.
49 :    
50 :     =item options
51 :    
52 :     Reference to a hash of command-line options.
53 :    
54 :     =item tables
55 :    
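56 :     List of tables in this load group. The list is not supplied by the caller: the constructor builds it itself (PDB, Ligand, IsProteinForFeature, and DocksWith) and passes it to the base class.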
57 :    
58 :     =back
59 :    
60 :     =cut
61 :    
sub new {
    # Get the parameters. The table list is not a parameter; it is fixed
    # for this load group and built below.
    my ($class, $erdb, $source, $options) = @_;
    # Create the table list, sorted into string order.
    my @tables = sort qw(PDB Ligand IsProteinForFeature DocksWith);
    # Create the BaseSproutLoader object. The parent constructor is reached
    # through normal method resolution (this package declares
    # "use base 'BaseSproutLoader'"), so the parent's name is not repeated
    # here and an inherited constructor is found as well.
    my $retVal = $class->SUPER::new($erdb, $source, $options, @tables);
    # Return it.
    return $retVal;
}
72 :    
73 :     =head2 Public Methods
74 :    
75 :     =head3 Generate
76 :    
77 :     $sl->Generate();
78 :    
79 :     Generate the data for the drug target files.
80 :    
81 :     =cut
82 :    
sub Generate {
    # Get the parameters.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->source();
    # Is this the global section? All of the drug data is produced there;
    # for any other section this method generates nothing.
    if ($self->global()) {
        # Create the ligand table.
        $self->_LoadLigands($fig);
        # Create the "DocksWith" relationship. The returned hash maps each
        # PDB ID to its number of docking results.
        my $pdbHash = $self->_LoadDockingData($fig);
        # Connect the PDBs to features. This adds any PDB not already in
        # the hash with a docking count of 0.
        $self->_ConnectFeatures($fig, $pdbHash);
        # Output the PDBs found. This must happen exactly once, after ALL
        # the genomes have been processed: doing it inside the genome loop
        # would emit every PDB once per genome, and would emit nothing at
        # all if the genome list were empty.
        Trace("Unspooling PDBs") if T(2);
        for my $pdbID (sort keys %{$pdbHash}) {
            $self->PutE(PDB => $pdbID, 'docking-count' => $pdbHash->{$pdbID});
        }
    }
}

# Output one Ligand entity for each "zinc_name" attribute.
#
# self      this loader object
# fig       source object used to query the attribute data
#
# The attribute rows are read in batches of 10000, ordered by object ID.
sub _LoadLigands {
    my ($self, $fig) = @_;
    Trace("Loading ligands.") if T(2);
    # The ligand list is huge, so we have to get it in pieces. The cursor is
    # the object ID of the last row we received; each query asks for object
    # IDs greater than the cursor.
    my $cursor = "ZINC:";
    # We also have to check for duplicates. The rows arrive in object ID
    # order, so a duplicate always matches the last ID we output.
    my $last_zinc_id = "";
    while (1) {
        # Get the next 10000 ligands.
        Trace("Loading batch starting with $cursor.") if T(3);
        my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
                                                   [$cursor, "zinc_name"]);
        Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
        # Here there are no attributes left, so we quit the loop.
        last if ! @attributeData;
        # Process the attribute data we've received.
        for my $zinc_data (@attributeData) {
            # The ZINC ID is found in the first return column, prefixed with the word ZINC.
            if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
                my $zinc_id = $1;
                # Check for a duplicate.
                if ($zinc_id eq $last_zinc_id) {
                    $self->AddWarning('zinc-duplicate' => "Duplicate ligand $zinc_data->[0] found.");
                } else {
                    # Here it's safe to output the ligand. The entity ID is the
                    # ZINC ID, which is what the DocksWith relationship links to.
                    # The ligand name is the attribute value (third column in
                    # the row).
                    # NOTE(review): the field name "name" is assumed from the
                    # Sprout database definition -- confirm against the DBD.
                    $self->PutE(Ligand => $zinc_id, name => $zinc_data->[2]);
                    # Ensure we don't try to add this ID again.
                    $last_zinc_id = $zinc_id;
                }
            } else {
                $self->AddWarning('zinc-bad-id' => "Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
            }
        }
        # Advance the cursor past everything in this batch, including rows
        # with malformed IDs. If the cursor only moved on well-formed IDs, a
        # batch without any would be fetched again forever.
        $cursor = $attributeData[-1][0];
    }
    return;
}

# Output the DocksWith relationship from the "docking_results" attributes.
#
# self      this loader object
# fig       source object used to query the attribute data
#
# Returns a reference to a hash that maps each (lower-case) PDB ID to the
# number of docking results output for it.
sub _LoadDockingData {
    my ($self, $fig) = @_;
    Trace("Generating docking data.") if T(2);
    # This hash is used to compute the number of docking results, which is an
    # attribute of the PDB.
    my %pdbHash;
    # Get all the docking data. This may cause problems if there are too many PDBs,
    # at which point we'll need another algorithm. The indicator that this is
    # happening will be a timeout error in the next statement.
    my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
                                          ['docking_results', $FIG_Config::dockLimit]);
    Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
    for my $dockRow (@dockData) {
        # Get the docking data components.
        my ($pdbID, $docking_key, @valueData) = @{$dockRow};
        # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
        $pdbID = lc $pdbID;
        # Strip off the object type.
        $pdbID =~ s/pdb://;
        # Extract the ZINC ID from the docking key. Note that the "ZINC" string
        # does not always get put in correctly, so it's optional in the pattern.
        my ($zinc_id) = $docking_key =~ /^docking_results::(?:ZINC)?(\d+)$/i;
        # Test for definedness rather than truth, so that an ID of 0 is
        # not mistaken for a failed match.
        if (! defined $zinc_id) {
            $self->AddWarning('dockdata-errors' => "Invalid docking result key $docking_key for $pdbID.") if T(0);
        } else {
            # Get the pieces of the value and parse the energy.
            # Note that we don't care about the rank (the first piece of
            # the energy string), since we can sort on the energy level
            # itself in our database.
            my ($energy, $tool, $type) = @valueData;
            my (undef, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
            # Ignore predicted results.
            if ($type ne "Predicted") {
                # Count this docking result.
                $pdbHash{$pdbID}++;
                # Write the result to the output.
                $self->PutR(DocksWith => $pdbID, $zinc_id, electrostatic => $electrostatic,
                            reason => $type, tool => $tool, 'total-energy' => $total,
                            'vanderwaals-energy' => $vanderwaals);
            }
        }
    }
    return \%pdbHash;
}

# Output the IsProteinForFeature relationship for every genome section.
#
# self      this loader object
# fig       source object used to query the attribute data
# pdbHash   reference to the hash of docking counts keyed by PDB ID; any
#           PDB found here that is not yet in the hash is added with a
#           count of 0
sub _ConnectFeatures {
    my ($self, $fig, $pdbHash) = @_;
    # At the current time, we can't parallelize this part, even though
    # it's genome-based, because of the docking counts. We'll fix this
    # later.
    Trace("Connecting features.") if T(2);
    # Loop through the genomes. We get them from the section list, then
    # eliminate this section, which is the global.
    my $global = $self->section();
    my @genomes = grep { $_ ne $global } $self->GetSectionList();
    for my $genome (@genomes) {
        Trace("Generating PDB connections for $genome.") if T(3);
        # Get all of the PDBs that BLAST against this genome's features.
        # The period after the genome ID is part of the pattern so that
        # genome 83333.1 does not also pick up the features of 83333.10.
        my @attributeData = $fig->get_attributes("fig|${genome}.%", 'PDB::%');
        for my $pdbData (@attributeData) {
            my ($fid, $key, $value) = @{$pdbData};
            # The PDB ID is coded as a subkey.
            if ($key !~ /PDB::(.+)/i) {
                $self->AddWarning('pdb-key-error' => "Invalid PDB ID \"$key\" in attribute table.");
                next;
            }
            # Lower-case the ID so that it matches the keys created from the
            # docking data, which are also lower-cased.
            my $pdbID = lc $1;
            # Ensure the PDB is in the hash.
            if (! exists $pdbHash->{$pdbID}) {
                $pdbHash->{$pdbID} = 0;
            }
            # The score and locations are coded in the attribute value.
            if ($value !~ /^([^;]+)(.*)$/) {
                $self->AddWarning('pdb-data-error' => "Invalid PDB data for $pdbID and feature $fid.");
                next;
            }
            my ($score, $locData) = ($1, $2);
            # The location data may not be present, so we have to start with some
            # defaults and then check. The captures are only used when the match
            # succeeds; after a failed match $1 and $2 would still hold the
            # values from the pattern above.
            my ($start, $end) = (1, 0);
            if ($locData =~ /(\d+)-(\d+)/) {
                ($start, $end) = ($1, $2);
            }
            # If we still don't have the end location, compute it from
            # the feature length.
            if (! $end) {
                # Most features have one location, but we do a list iteration
                # just in case.
                my @locations = $fig->feature_location($fid);
                $end = 0;
                for my $loc (@locations) {
                    my $locObject = BasicLocation->new($loc);
                    $end += $locObject->Length;
                }
            }
            # Decode the score.
            my $realScore = FIGRules::DecodeScore($score);
            # Connect the PDB to the feature.
            $self->PutR(IsProteinForFeature => $pdbID, $fid,
                        'start-location' => $start, score => $realScore,
                        'end-location' => $end);
        }
    }
    return;
}
235 :    
236 :    
237 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3