[Bio] / Sprout / BaseSproutLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/BaseSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package BaseSproutLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 : parrello 1.2 use Time::HiRes;
26 : parrello 1.1 use base 'ERDBLoadGroup';
27 :    
28 :     # Name of the global section
29 :     use constant GLOBAL => 'Globals';
30 :    
31 :     =head1 Sprout Load Group Base Class
32 :    
33 :     =head2 Introduction
34 :    
35 :     This is the base class for all the Sprout loaders. It performs common tasks
36 :     required by multiple load groups.
37 :    
38 :     =head3 new
39 :    
40 :     my $sl = BaseSproutLoader->new($erdb, $source, $options, @tables);
41 :    
42 :     Construct a new BaseSproutLoader object.
43 :    
44 :     =over 4
45 :    
46 :     =item erdb
47 :    
48 :     [[SproutPm]] object for the database being loaded.
49 :    
50 :     =item source
51 :    
52 :     [[FigPm]] object used to access the source data.
53 :    
54 :     =item options
55 :    
56 :     Reference to a hash of command-line options.
57 :    
58 :     =item tables
59 :    
60 :     List of tables in this load group.
61 :    
62 :     =back
63 :    
64 :     =cut
65 :    
sub new {
    # Constructor. The caller supplies the database object first and the
    # source object second; the base-class constructor is invoked with
    # those two arguments in the opposite order (source, then database).
    my $class = shift;
    my ($erdb, $source, $options, @tables) = @_;
    # Build the underlying load group object, blessed into the caller's class.
    my $loader = ERDBLoadGroup::new($class, $source, $erdb, $options, @tables);
    # Hand the new object back to the caller.
    return $loader;
}
74 :    
75 :    
76 :     =head2 Public Methods
77 :    
78 :     =head3 GetGenomeAttributes
79 :    
80 : parrello 1.2 my $aHashRef = $sl->GetGenomeAttributes($genomeID, \@fids);
81 : parrello 1.1
82 :     Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
83 :     attributes for all the features of a genome in a single call, then organizes them into
84 :     a hash.
85 :    
86 :     =over 4
87 :    
88 :     =item fig
89 :    
90 :     FIG-like object for accessing attributes. This is not passed as a parameter; the method obtains it from the loader's C<source> method.
91 :    
92 :     =item genomeID
93 :    
94 :     ID of the genome whose attributes are desired.
95 :    
96 :     =item fids
97 :    
98 : parrello 1.2 Reference to a list of feature IDs whose attributes are to be kept. If it is a list
99 :     of lists, the feature IDs will be taken from the first element in each sub-list.
100 : parrello 1.1
101 :     =item RETURN
102 :    
103 :     Returns a reference to a hash. The key of the hash is the feature ID. The value is the
104 :     reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
105 :     the attribute key, and one or more attribute values.
106 :    
107 :     =back
108 :    
109 :     =cut
110 :    
sub GetGenomeAttributes {
    # Return a reference to a hash mapping each requested feature ID to a
    # list of its attribute tuples (as returned by the source object's
    # get_attributes method).
    #
    # Parameters:
    #   genomeID  ID of the genome whose attributes are desired.
    #   fids      Reference to a list of feature IDs to keep. Each element
    #             may be a plain ID or a list reference whose first element
    #             is the ID.
    #
    # Side effects: adds to the "attributes" and "attribute-time" statistics
    # via $self->Add.
    my ($self, $genomeID, $fids) = @_;
    # Start a timer. The file loads Time::HiRes without importing anything,
    # so the sub-second clock has to be called by its full name; a bare
    # time() would be the whole-second built-in.
    my $start = Time::HiRes::time();
    # Reduce the incoming list to plain feature IDs. This single list is used
    # both to seed the result hash and to drive the fallback queries, so a
    # list-of-lists input never leaks sub-list references into a query.
    my @fidList = map { ref $_ ? $_->[0] : $_ } @$fids;
    # Every requested feature gets an entry, even if it has no attributes.
    my %retVal = map { ($_ => []) } @fidList;
    # Get the source object.
    my $fig = $self->source();
    # Try to get all the genome's attributes in a single call. If ev_code_cron
    # is running, we may get a timeout error, so the call is protected by an
    # eval. The block ends with a true value so that failure is detected from
    # the eval's result rather than from the state of $@ alone.
    my @aList;
    my $ok = eval {
        @aList = $fig->get_attributes("fig|$genomeID%");
        Trace(scalar(@aList) . " attributes returned for genome $genomeID.") if T(ERDBLoadGroup => 3);
        1;
    };
    if (! $ok) {
        my $error = $@;
        Trace("Retrying attributes for $genomeID due to error: $error") if T(ERDBLoadGroup => 1);
        # Discard anything left over from the failed attempt.
        @aList = ();
        # Our fallback plan is to process the attributes in blocks of 100
        # features. This is much slower, but allows us to continue processing.
        my $last = $#fidList;
        for (my $i = 0; $i <= $last; $i += 100) {
            # Normally a block ends at $i + 99, but the final block may be shorter.
            my $end = $i + 99;
            $end = $last if $end > $last;
            my @slice = @fidList[$i .. $end];
            # Get the relevant attributes.
            Trace("Retrieving attributes for fids $i to $end.") if T(ERDBLoadGroup => 3);
            my @aShort = $fig->get_attributes(\@slice);
            Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(ERDBLoadGroup => 3);
            push @aList, @aShort;
        }
    }
    # Now we should have all the interesting attributes in @aList. File each
    # tuple under its feature ID (the tuple's first element), ignoring
    # features that were not requested.
    for my $aListEntry (@aList) {
        my $fid = $aListEntry->[0];
        if (exists $retVal{$fid}) {
            push @{$retVal{$fid}}, $aListEntry;
            $self->Add(attributes => 1);
        }
    }
    # Record the elapsed time in (fractional) seconds.
    $self->Add('attribute-time' => Time::HiRes::time() - $start);
    # Return the result.
    return \%retVal;
}
159 :    
160 :     =head3 GetSubsystems
161 :    
162 :     my $subsystems = $sl->GetSubsystems();
163 :    
164 :     Get a hash of the subsystems for this incarnation of the Sprout database.
165 :     The hash maps each subsystem ID to 1. The first time this method is called,
166 :     it creates a file listing the subsystems found. Subsequent calls read the
167 :     list from the file so that the selection of subsystems remains consistent.
168 :    
169 :     =cut
170 :    
sub GetSubsystems {
    # Return a reference to a hash whose keys are the subsystem names for
    # this load and whose values are all 1. The names are cached in a file
    # in the load directory: if the file exists it is read, otherwise the
    # list is computed from the source object and the file is written.
    my ($self) = @_;
    # Get the database and source objects.
    my $sprout = $self->db();
    my $fig = $self->source();
    # Compute the name of the cache file.
    my $loadDir = $sprout->LoadDirectory();
    my $subFileName = "$loadDir/SubsystemList.dty";
    # The subsystem names will be accumulated in here.
    my @names = ();
    if (! -f $subFileName) {
        # There is no cache file yet. Keep every subsystem that the source
        # object flags via nmpdr_subsystem, then save the list.
        my @candidates = $fig->all_subsystems();
        for my $candidate (@candidates) {
            push @names, $candidate if $fig->nmpdr_subsystem($candidate);
        }
        Tracer::PutFile($subFileName, \@names);
    } else {
        # The cache file exists, so the list comes from it.
        @names = Tracer::GetFile($subFileName);
    }
    # Convert the list of names into a lookup hash.
    my %retVal;
    $retVal{$_} = 1 for @names;
    return \%retVal;
}
199 :    
200 :    
201 :     =head3 GetSectionList
202 :    
203 :     my @sections = BaseSproutLoader::GetSectionList($sprout, $fig);
204 :    
205 :     Return a list of the sections for a Sprout load. The section list is
206 :     normally determined by retrieving a list of all the complete genomes and
207 :     adding an extra global section at the end; however, the first time the
208 :     sections are determined, they are stored in a master file so that the
209 :     same list is used regardless of what may have changed in the source data.
210 :     (A similar trick is used for subsystems).
211 :    
212 :     =over 4
213 :    
214 :     =item sprout
215 :    
216 :     [[SproutPm]] object for the database being loaded.
217 :    
218 :     =item fig
219 :    
220 :     [[FigPm]] object from which the data is being retrieved.
221 :    
222 :     =item RETURN
223 :    
224 :     Returns a list of section names.
225 :    
226 :     =back
227 :    
228 :     =cut
229 :    
sub GetSectionList {
    # Return the list of section names for a load. This is a plain function
    # (no invocant): it takes the database object and the source object.
    # The list is cached in a master file in the load directory so that the
    # same sections are used even if the source data changes later.
    my ($sprout, $fig) = @_;
    # Compute the name of the master file.
    my $loadDir = $sprout->LoadDirectory();
    my $masterName = ERDBGenerate::CreateFileName('section_master', undef, 'control');
    my $sectionFileName = "$loadDir/$masterName";
    # The section names will be put in here.
    my @sections;
    if (! -f $sectionFileName) {
        # No master file yet. The sections are the genomes returned by the
        # source object, sorted in lexical order, followed by the global
        # section marker.
        my @genomes = $fig->genomes(1);
        @sections = ((sort { $a cmp $b } @genomes), GLOBAL);
        # Save the list so that later calls see a consistent set of sections.
        Tracer::PutFile($sectionFileName, \@sections);
    } else {
        # The master file exists, so read the list from it.
        @sections = Tracer::GetFile($sectionFileName);
    }
    # Return the list.
    return @sections;
}
253 :    
254 :     =head3 global
255 :    
256 :     my $flag = $sl->global();
257 :    
258 :     TRUE if this is the global section, else FALSE.
259 :    
260 :     =cut
261 :    
sub global {
    # Report whether this loader's current section is the global section.
    my $self = shift;
    # Compare the section name stored in the object to the global marker.
    my $section = $self->{section};
    return ($section eq GLOBAL);
}
268 :    
269 :     =head3 GetCommaList
270 :    
271 :     my $string = $sl->GetCommaList($value);
272 :    
273 :     Create a comma-separated list of the values in a list reference. If the
274 :     list reference is a scalar, it will be returned unchanged. If it is
275 :     undefined, an empty string will be returned. The idea is that we may be
276 :     looking at a string, a list, or nothing, but whatever comes out will be a
277 :     string.
278 :    
279 :     =over 4
280 :    
281 :     =item value
282 :    
283 :     Reference to a list of values to be assembled into the return string.
284 :    
285 :     =item RETURN
286 :    
287 :     Returns a scalar string containing the content of the input value.
288 :    
289 :     =back
290 :    
291 :     =cut
292 :    
sub GetCommaList {
    # Convert a value that may be undefined, a scalar, or a list reference
    # into a string. The invocant is not used.
    my ($self, $value) = @_;
    # Nothing at all becomes an empty string.
    return "" unless defined $value;
    # A list reference becomes its elements joined by a comma and a space.
    return join(", ", @$value) if ref($value) eq 'ARRAY';
    # Anything else is simply stringified.
    return "$value";
}
312 :    
313 :    
314 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3