[Bio] / Sprout / BaseSproutLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/BaseSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package BaseSproutLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 : parrello 1.4 use FIG;
26 : parrello 1.2 use Time::HiRes;
27 : parrello 1.1 use base 'ERDBLoadGroup';
28 :    
29 :     # Name of the global section
30 :     use constant GLOBAL => 'Globals';
31 :    
32 :     =head1 Sprout Load Group Base Class
33 :    
34 :     =head2 Introduction
35 :    
36 :     This is the base class for all the Sprout loaders. It performs common tasks
37 :     required by multiple load groups.
38 :    
39 :     =head3 new
40 :    
41 : parrello 1.4 my $sl = BaseSproutLoader->new($erdb, $options, @tables);
42 : parrello 1.1
43 :     Construct a new BaseSproutLoader object.
44 :    
45 :     =over 4
46 :    
47 :     =item erdb
48 :    
49 :     [[SproutPm]] object for the database being loaded.
50 :    
51 :     =item source
52 :    
53 : parrello 1.7 L<FIG> object used to access the source data.
54 : parrello 1.1
55 :     =item options
56 :    
57 :     Reference to a hash of command-line options.
58 :    
59 :     =item tables
60 :    
61 :     List of tables in this load group.
62 :    
63 :     =back
64 :    
65 :     =cut
66 :    
sub new {
    # Construct a loader object for one load group.
    #   $class    name of the class being instantiated
    #   $erdb     database object for the database being loaded
    #   $options  reference to a hash of command-line options
    #   @tables   list of the tables in this load group
    # Returns whatever the parent-class constructor returns.
    my ($class, $erdb, $options, @tables) = @_;
    # Reach the parent constructor through method resolution instead of a
    # hard-wired function call, so a constructor that the parent class itself
    # inherits is still found. SUPER:: is resolved relative to this package, so
    # the call stays correct when a subclass passes its own class name in.
    my $retVal = $class->SUPER::new($erdb, $options, @tables);
    # Return the new object.
    return $retVal;
}
75 :    
76 :    
77 :     =head2 Public Methods
78 :    
79 :     =head3 GetGenomeAttributes
80 :    
81 : parrello 1.2 my $aHashRef = $sl->GetGenomeAttributes($genomeID, \@fids);
82 : parrello 1.1
83 :     Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
84 :     attributes for all the features of a genome in a single call, then organizes them into
85 :     a hash.
86 :    
87 :     =over 4
88 :    
89 :     =item fig
90 :    
91 :     FIG-like object for accessing attributes.
92 :    
93 :     =item genomeID
94 :    
95 : parrello 1.4 ID of the genome whose attributes are desired.
96 : parrello 1.1
97 : parrello 1.4 =item fids (optional)
98 : parrello 1.1
99 : parrello 1.2 Reference to a list of feature IDs whose attributes are to be kept. If it is a list
100 :     of lists, the feature IDs will be taken from the first element in each sub-list.
101 : parrello 1.1
102 :     =item RETURN
103 :    
104 :     Returns a reference to a hash. The key of the hash is the feature ID. The value is the
105 :     reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
106 :     the attribute key, and one or more attribute values.
107 :    
108 :     =back
109 :    
110 :     =cut
111 :    
sub GetGenomeAttributes {
    # Return a hash of attribute tuples keyed on feature ID for one genome.
    #   $genomeID  ID of the genome whose attributes are desired
    #   $fids      (optional) reference to a list of the feature IDs to keep;
    #              an element may instead be a list reference whose first
    #              entry is the feature ID. When omitted, all features of the
    #              genome are used.
    # Returns a reference to a hash. Each key is a feature ID and each value
    # is a reference to a (possibly empty) list of the attribute tuples the
    # source returned for that feature.
    my ($self, $genomeID, $fids) = @_;
    # Get the source object.
    my $fig = $self->source();
    # Start a timer.
    # NOTE(review): the file loads Time::HiRes without importing time(), so
    # this is the built-in clock with one-second resolution -- confirm whether
    # sub-second timing was intended.
    my $start = time();
    # Initialize the FID list if we don't already have it.
    if (! defined $fids) {
        $fids = [ $fig->all_features($genomeID) ];
    }
    # Reduce the incoming list to plain feature IDs. An entry may be a
    # sub-list whose first element is the ID; the fallback query below must
    # be given the IDs themselves, not the sub-list references.
    my @fidList = map { ref($_) ? $_->[0] : $_ } @$fids;
    # Declare the return variable and initialize it with all the features.
    my %retVal = map { $_ => [] } @fidList;
    # Get the attributes. If ev_code_cron is running, we may get a timeout error, so
    # an eval is used.
    my @aList = ();
    eval {
        @aList = $fig->get_attributes("fig|$genomeID%");
        Trace(scalar(@aList) . " attributes returned for genome $genomeID.") if T(ERDBLoadGroup => 3);
    };
    # Check for a problem.
    if ($@) {
        Trace("Retrying attributes for $genomeID due to error: $@") if T(ERDBLoadGroup => 1);
        # Discard anything left over from the failed attempt so the retry
        # cannot produce duplicate tuples.
        @aList = ();
        # Our fallback plan is to process the attributes in blocks of 100. This is much slower,
        # but allows us to continue processing.
        my $nFids = scalar @fidList;
        for (my $i = 0; $i < $nFids; $i += 100) {
            # Determine the index of the last feature ID we'll be specifying on this pass.
            # Normally it's $i + 99, but if we're close to the end it may be less.
            my $end = ($i + 100 > $nFids ? $nFids - 1 : $i + 99);
            # Get a slice of the normalized fid list.
            my @slice = @fidList[$i .. $end];
            # Get the relevant attributes.
            Trace("Retrieving attributes for fids $i to $end.") if T(ERDBLoadGroup => 3);
            my @aShort = $fig->get_attributes(\@slice);
            Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(ERDBLoadGroup => 3);
            push @aList, @aShort;
        }
    }
    # Now we should have all the interesting attributes in @aList. Populate the hash with
    # them, ignoring tuples for features that were not requested.
    for my $aListEntry (@aList) {
        my $fid = $aListEntry->[0];
        if (exists $retVal{$fid}) {
            push @{$retVal{$fid}}, $aListEntry;
            $self->Add(attributes => 1);
        }
    }
    # Record the elapsed time in the loader statistics.
    $self->Add('attribute-time' => time() - $start);
    # Return the result.
    return \%retVal;
}
164 :    
165 :     =head3 GetSubsystems
166 :    
167 :     my $subsystems = $sl->GetSubsystems();
168 :    
169 :     Get a hash of the subsystems for this incarnation of the Sprout database.
170 :     The hash maps each subsystem ID to 1. The first time this method is called,
171 :     it creates a file listing the subsystems found. Subsequent calls read the
172 :     list from the file so that the selection of subsystems remains consistent.
173 :    
174 :     =cut
175 :    
sub GetSubsystems {
    # Return a reference to a hash that maps each selected subsystem ID to 1.
    # The selection is read from SubsystemList.dty in the load directory when
    # that file exists; otherwise it is computed from the source and written
    # to that file.
    my ($self) = @_;
    # Database object, used to locate the load directory.
    my $sprout = $self->db();
    # Source object, used to enumerate and classify subsystems.
    my $fig = $self->source();
    # Location of the saved subsystem list.
    my $subFileName = $sprout->LoadDirectory() . "/SubsystemList.dty";
    # The subsystem names go in here.
    my @names;
    if (! -f $subFileName) {
        # No saved list yet: keep every subsystem the source flags as an
        # NMPDR subsystem and does not flag as experimental.
        my @candidates = $fig->all_subsystems();
        foreach my $candidate (@candidates) {
            next if ! $fig->nmpdr_subsystem($candidate);
            next if $fig->is_experimental_subsystem($candidate);
            push @names, $candidate;
        }
        # Save the selection for later calls.
        Tracer::PutFile($subFileName, \@names);
    } else {
        # A saved list exists, so use it as-is.
        @names = Tracer::GetFile($subFileName);
    }
    Trace(scalar(@names) . " subsystems in list.") if T(ERDBLoadGroup => 3);
    # Convert the list of names into a lookup hash.
    my %retVal;
    foreach my $name (@names) {
        $retVal{$name} = 1;
    }
    return \%retVal;
}
206 :    
207 :    
208 :     =head3 GetSectionList
209 :    
210 : parrello 1.6 my @sections = BaseSproutLoader::GetSectionList($sprout, $fig, $directory);
211 : parrello 1.1
212 :     Return a list of the sections for a Sprout load. The section list is
213 :     normally determined by retrieving a list of all the complete genomes and
214 :     adding an extra global section at the end; however, the first time the
215 :     sections are determined, they are stored in a master file so that the
216 :     same list is used regardless of what may have changed in the source data.
217 :     (A similar trick is used for subsystems).
218 :    
219 :     =over 4
220 :    
221 :     =item sprout
222 :    
223 :     [[SproutPm]] object for the database being loaded.
224 :    
225 :     =item fig
226 :    
227 : parrello 1.7 L<FIG> object from which the data is being retrieved.
228 : parrello 1.1
229 : parrello 1.6 =item directory (optional)
230 :    
231 :     Directory from which the Sprout tables are being loaded.
232 :    
233 : parrello 1.1 =item RETURN
234 :    
235 :     Returns a list of section names.
236 :    
237 :     =back
238 :    
239 :     =cut
240 :    
sub GetSectionList {
    # Return the list of section names for a load. This is a plain function,
    # not a method.
    #   $sprout     database object; supplies the default load directory
    #   $fig        source object; supplies the list of genomes
    #   $directory  (optional) directory holding the section master file
    # The list is read from the section master file when it exists. Otherwise
    # it is built from the source's genome list plus the GLOBAL section name
    # and then written to that file.
    my ($sprout, $fig, $directory) = @_;
    # Ensure we have a data directory.
    if (! $directory) {
        $directory = $sprout->LoadDirectory();
    }
    # Work out where the section master file lives.
    my $masterName = ERDBGenerate::CreateFileName('section_master', undef, 'control');
    my $sectionFileName = "$directory/$masterName";
    # The section names go in here.
    my @retVal;
    if (! -f $sectionFileName) {
        # No master file yet. Take the genomes returned by genomes(1), but as
        # a safety feature keep only those with an organism directory.
        my @genomes;
        foreach my $genome ($fig->genomes(1)) {
            push @genomes, $genome if -d "$FIG_Config::organisms/$genome";
        }
        # Put the genomes in string order, with the GLOBAL tag at the end.
        my @sorted = sort @genomes;
        @retVal = (@sorted, GLOBAL);
        # Save the list so later calls see the same sections even if the
        # source data changes.
        Tracer::PutFile($sectionFileName, \@retVal);
    } else {
        # The master file exists, so take the list from it.
        @retVal = Tracer::GetFile($sectionFileName);
    }
    # Return the list.
    return @retVal;
}
268 :    
269 :     =head3 global
270 :    
271 :     my $flag = $sl->global();
272 :    
273 :     TRUE if this is the global section, else FALSE.
274 :    
275 :     =cut
276 :    
sub global {
    # TRUE if this loader's current section is the global section, else FALSE.
    my ($self) = @_;
    # Compare the section name stored in the object against the GLOBAL name.
    my $isGlobal = ($self->{section} eq GLOBAL);
    return $isGlobal;
}
283 :    
284 :     =head3 GetCommaList
285 :    
286 :     my $string = $sl->GetCommaList($value);
287 :    
288 :     Create a comma-separated list of the values in a list reference. If the
289 :     input value is a plain scalar, it will be returned unchanged. If it is
290 :     undefined, an empty string will be returned. The idea is that we may be
291 :     looking at a string, a list, or nothing, but whatever comes out will be a
292 :     string.
293 :    
294 :     =over 4
295 :    
296 :     =item value
297 :    
298 :     Reference to a list of values to be assembled into the return string.
299 :    
300 :     =item RETURN
301 :    
302 :     Returns a scalar string containing the content of the input value.
303 :    
304 :     =back
305 :    
306 :     =cut
307 :    
sub GetCommaList {
    # Convert a value that may be a list reference, a scalar, or undefined
    # into a single string.
    #   $value  list reference, scalar, or undef
    # Returns the list elements joined with ", ", the stringified scalar, or
    # an empty string when the value is undefined.
    my ($self, $value) = @_;
    # An undefined value becomes the empty string.
    return "" if ! defined $value;
    # A list reference is joined; anything else is flattened to a string.
    return (ref $value eq 'ARRAY' ? join(", ", @$value) : "$value");
}
327 :    
328 :    
329 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3