[Bio] / Sprout / DrugSproutLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/DrugSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package DrugSproutLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use base 'BaseSproutLoader';
26 :    
27 :     =head1 Sprout Drug Load Group Class
28 :    
29 :     =head2 Introduction
30 :    
31 :     The Drug Load Group includes all of the major drug target tables.
32 :    
33 :     =head3 new
34 :    
35 :     my $sl = DrugSproutLoader->new($erdb, $source, $options, @tables);
36 :    
37 :     Construct a new DrugSproutLoader object.
38 :    
39 :     =over 4
40 :    
41 :     =item erdb
42 :    
43 :     [[SproutPm]] object for the database being loaded.
44 :    
45 :     =item options
46 :    
47 :     Reference to a hash of command-line options.
48 :    
49 :     =item tables
50 :    
51 :     List of tables in this load group.
52 :    
53 :     =back
54 :    
55 :     =cut
56 :    
57 :     sub new {
58 :     # Get the parameters.
59 : parrello 1.2 my ($class, $erdb, $options) = @_;
60 : parrello 1.1 # Create the table list.
61 :     my @tables = sort qw(PDB Ligand IsProteinForFeature DocksWith);
62 :     # Create the BaseSproutLoader object.
63 : parrello 1.2 my $retVal = BaseSproutLoader::new($class, $erdb, $options, @tables);
64 : parrello 1.1 # Return it.
65 :     return $retVal;
66 :     }
67 :    
68 :     =head2 Public Methods
69 :    
70 :     =head3 Generate
71 :    
72 :     $sl->Generate();
73 :    
74 :     Generate the data for the drug target files.
75 :    
76 :     =cut
77 :    
78 :     sub Generate {
79 :     # Get the parameters.
80 :     my ($self) = @_;
81 :     # Get the sprout object.
82 :     my $sprout = $self->db();
83 :     # Get the FIG object.
84 :     my $fig = $self->source();
85 :     # Is this the global section?
86 :     if ($self->global()) {
87 :     # Create the ligand table. This information can be found in the zinc_name attribute.
88 :     Trace("Loading ligands.") if T(2);
89 :     # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
90 :     my $last_zinc_id = "";
91 :     my $zinc_id = "";
92 :     my $done = 0;
93 :     while (! $done) {
94 :     # Get the next 10000 ligands. We insist that the object ID is greater than
95 :     # the last ID we processed.
96 :     Trace("Loading batch starting with ZINC:$zinc_id.") if T(3);
97 :     my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
98 :     ["ZINC:$zinc_id", "zinc_name"]);
99 :     Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
100 :     if (! @attributeData) {
101 :     # Here there are no attributes left, so we quit the loop.
102 :     $done = 1;
103 :     } else {
104 :     # Process the attribute data we've received.
105 :     for my $zinc_data (@attributeData) {
106 :     # The ZINC ID is found in the first return column, prefixed with the word ZINC.
107 :     if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
108 :     $zinc_id = $1;
109 : parrello 1.2 $self->Track(zincIDs => $zinc_id, 10000);
110 :     # Check for a duplicate. These are very, very common.
111 : parrello 1.1 if ($zinc_id eq $last_zinc_id) {
112 : parrello 1.2 $self->Add('duplicate-zinc' => 1);
113 : parrello 1.1 } else {
114 :     # Here it's safe to output the ligand. The ligand name is the attribute value
115 :     # (third column in the row).
116 : parrello 1.2 $self->PutE(Ligand => $zinc_id, name => $zinc_data->[2]);
117 : parrello 1.1 # Insure we don't try to add this ID again.
118 :     $last_zinc_id = $zinc_id;
119 :     }
120 :     } else {
121 :     $self->AddWarning('zinc-bad-id' => "Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
122 :     }
123 :     }
124 :     }
125 :     }
126 : parrello 1.2 } else {
127 :     # Here we're working with a genome. We need to find all the PDBs that connect
128 :     # to this genome's features.
129 :     Trace("Connecting features.") if T(2);
130 :     my $genome = $self->section();
131 :     Trace("Generating PDB connections for $genome.") if T(3);
132 :     # We'll keep track of the PDBs we find in here.
133 : parrello 1.1 my %pdbHash;
134 : parrello 1.2 # Get all of the PDBs that BLAST against this genome's features.
135 :     my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB');
136 :     for my $pdbData (@attributeData) {
137 :     # The PDB ID is coded as a subkey.
138 :     if ($pdbData->[1] !~ /PDB::(.+)/i) {
139 :     $self->AddWarning('pdb-key-error' => "Invalid PDB ID \"$pdbData->[1]\" in attribute table.");
140 : parrello 1.1 } else {
141 : parrello 1.2 my $pdbID = lc $1;
142 :     # Insure the PDB is in the hash.
143 :     if (! exists $pdbHash{$pdbID}) {
144 :     $pdbHash{$pdbID} = 0;
145 : parrello 1.1 }
146 : parrello 1.2 # The score and locations are coded in the attribute value.
147 :     if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
148 :     $self->AddWarning('pdb-data-error' => "Invalid PDB data for $pdbID and feature $pdbData->[0].");
149 : parrello 1.1 } else {
150 : parrello 1.2 my ($score, $locData) = ($1,$2);
151 :     # The location data may not be present, so we have to start with some
152 :     # defaults and then check.
153 :     my ($start, $end) = (1, 0);
154 :     if ($locData) {
155 :     $locData =~ /(\d+)-(\d+)/;
156 :     $start = $1;
157 :     $end = $2;
158 : parrello 1.1 }
159 : parrello 1.2 # If we still don't have the end location, compute it from
160 :     # the feature length.
161 :     if (! $end) {
162 :     # Most features have one location, but we do a list iteration
163 :     # just in case.
164 :     my @locations = $fig->feature_location($pdbData->[0]);
165 :     $end = 0;
166 :     for my $loc (@locations) {
167 :     my $locObject = BasicLocation->new($loc);
168 :     $end += $locObject->Length;
169 : parrello 1.1 }
170 :     }
171 : parrello 1.2 # Decode the score.
172 :     my $realScore = FIGRules::DecodeScore($score);
173 :     # Connect the PDB to the feature.
174 :     $self->PutR(IsProteinForFeature => $pdbID, $pdbData->[0],
175 :     'start-location' => $start, score => $realScore,
176 :     'end-location' => $end);
177 : parrello 1.1 }
178 :     }
179 : parrello 1.2 }
180 :     # Output the PDBs found.
181 :     Trace("Unspooling PDBs") if T(2);
182 :     for my $pdbID (sort keys %pdbHash) {
183 :     $self->Track(PDBs => $pdbID, 100);
184 :     # We need to find every ligand that docks with this PDB. Unfortunately, the
185 :     # uploaded PDB data has upper-case IDs, while we use lower-case so that we
186 :     # map to the IDs on the PDB web site. We fix this by asking for both.
187 :     my @dockData = $fig->query_attributes('($object = ? OR $object = ?) AND $key = ? AND $value < ?',
188 : parrello 1.3 ["PDB:" . uc $pdbID, "PDB:$pdbID",
189 : parrello 1.2 'docking_results', $FIG_Config::dockLimit]);
190 :     Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
191 :     # Count the docking data actually used.
192 :     my $docksUsed = 0;
193 :     # Loop through the docking data.
194 :     for my $dockData (@dockData) {
195 :     # Get the docking data components. We ignore the object ID, since we already
196 :     # know what it is.
197 :     my (undef, $docking_key, @valueData) = @{$dockData};
198 :     # Extract the ZINC ID from the docking key. Note that the "ZINC" string
199 :     # does not always get put in correctly, so it's optional in the pattern.
200 :     my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/i;
201 :     if (! $zinc_id) {
202 :     $self->AddWarning('dockdata-errors' => "Invalid docking result key $docking_key for $pdbID.") if T(0);
203 :     } else {
204 :     # Get the pieces of the value and parse the energy.
205 :     # Note that we don't care about the rank, since
206 :     # we can sort on the energy level itself in our database.
207 :     my ($energy, $tool, $type) = @valueData;
208 :     my ($rank, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
209 :     # Ignore predicted results.
210 :     if ($type ne "Predicted") {
211 :     # Write the result to the output.
212 :     $self->PutR(DocksWith => $pdbID, $zinc_id,
213 :     'electrostatic-energy' => $electrostatic,
214 :     reason => $type, tool => $tool,
215 :     'total-energy' => $total,
216 :     'vanderwaals-energy' => $vanderwaals);
217 :     # Count it.
218 :     $docksUsed++;
219 :     }
220 :     }
221 : parrello 1.1 }
222 : parrello 1.2 # Output the PDB record.
223 :     $self->PutE(PDB => $pdbID, 'docking-count' => $docksUsed);
224 : parrello 1.1 }
225 :     }
226 :     }
227 :    
228 :    
229 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3