[Bio] / Sprout / BaseSproutLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/BaseSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package BaseSproutLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use base 'ERDBLoadGroup';
26 :    
27 :     # Name of the global section
28 :     use constant GLOBAL => 'Globals';
29 :    
30 :     =head1 Sprout Load Group Base Class
31 :    
32 :     =head2 Introduction
33 :    
34 :     This is the base class for all the Sprout loaders. It performs common tasks
35 :     required by multiple load groups.
36 :    
37 :     =head3 new
38 :    
39 :     my $sl = BaseSproutLoader->new($erdb, $source, $options, @tables);
40 :    
41 :     Construct a new BaseSproutLoader object.
42 :    
43 :     =over 4
44 :    
45 :     =item erdb
46 :    
47 :     [[SproutPm]] object for the database being loaded.
48 :    
49 :     =item source
50 :    
51 :     [[FigPm]] object used to access the source data.
52 :    
53 :     =item options
54 :    
55 :     Reference to a hash of command-line options.
56 :    
57 :     =item tables
58 :    
59 :     List of tables in this load group.
60 :    
61 :     =back
62 :    
63 :     =cut
64 :    
sub new {
    # Unpack the class name, the database being loaded, the source data
    # object, the command-line option hash, and the tables in the load group.
    my ($class, $erdb, $source, $options, @tables) = @_;
    # Delegate construction to the base class. Note that the base-class
    # constructor takes the source object BEFORE the database object, which
    # is the reverse of this method's own parameter order. The load
    # directory is obtained from the database object.
    my $loader = ERDBLoadGroup::new($class, $source, $erdb,
                                    $erdb->LoadDirectory(),
                                    $options, @tables);
    # Bless the new object into the requested class and hand it back.
    return bless $loader, $class;
}
75 :    
76 :    
77 :     =head2 Public Methods
78 :    
79 :     =head3 GetGenomeAttributes
80 :    
81 :     my $aHashRef = $sl->GetGenomeAttributes($genomeID);
82 :    
83 :     Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
84 :     attributes for all the features of a genome in a single call, then organizes them into
85 :     a hash.
86 :    
87 :     =over 4
88 :    
89 :     =item fig
90 :    
91 :     FIG-like object for accessing attributes. This is not passed by the caller; the method obtains it from the loader's source object.
92 :    
93 :     =item genomeID
94 :    
95 :     ID of the genome whose attributes are desired.
96 :    
97 :     =item fids
98 :    
99 :     Reference to a list of the feature IDs whose attributes are to be kept. This is not passed by the caller; the method uses all the features of the specified genome.
100 :    
101 :     =item propKeys
102 :    
103 :     A list of the keys to retrieve. This is not passed by the caller; the method uses the keys in the NMPDR attribute group.
104 :    
105 :     =item RETURN
106 :    
107 :     Returns a reference to a hash. The key of the hash is the feature ID. The value is the
108 :     reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
109 :     the attribute key, and one or more attribute values.
110 :    
111 :     =back
112 :    
113 :     =cut
114 :    
sub GetGenomeAttributes {
    # Build a hash of the NMPDR-related attributes for every feature of a
    # genome.
    #
    # Parameters:
    #   $self      this loader object; its source() method supplies the
    #              FIG-like object used for all data access
    #   $genomeID  ID of the genome whose feature attributes are wanted
    #
    # Returns a reference to a hash keyed on feature ID. Every feature
    # reported for the genome has an entry; the value is a reference to a
    # (possibly empty) list of attribute tuples, each of which has the
    # feature ID in its first position.
    my ($self, $genomeID) = @_;
    # Get the source object.
    my $fig = $self->source();
    # Get the features for this genome.
    my @fids = $fig->all_features($genomeID);
    # Pre-load the result hash with an empty list for every feature. This
    # not only tells us later which feature IDs to keep, it ensures that the
    # caller sees a list reference for every known feature.
    my %attributesOf = map { $_ => [] } @fids;
    # Get the list of NMPDR-related attribute keys.
    my @propKeys = $fig->get_group_keys("NMPDR");
    # Try to get all the attributes in one call. If ev_code_cron is running,
    # we may get a timeout error, so the call is protected by an eval. The
    # eval block ends with a true value so that failure is detected from the
    # eval's own result instead of relying solely on $@, which can be reset
    # by cleanup code that runs while the failed call unwinds.
    # NOTE(review): the trailing "%" presumably acts as a wildcard, in which
    # case features of other genomes whose IDs merely begin with $genomeID
    # could also come back; the filter at the bottom discards them.
    my @aList;
    my $succeeded = eval {
        @aList = $fig->get_attributes("fig|$genomeID%", \@propKeys);
        Trace(scalar(@aList) . " attributes returned for genome $genomeID.") if T(ERDBLoadGroup => 3);
        1;
    };
    if (! $succeeded) {
        # Capture the error text immediately, before anything can change it.
        my $error = $@ || "unknown error";
        Trace("Retrying attributes for $genomeID due to error: $error") if T(ERDBLoadGroup => 1);
        # Discard anything left over from the failed attempt so that no
        # attribute can be recorded twice.
        @aList = ();
        # Our fallback plan is to process the attributes in blocks of 100
        # features. This is much slower, but allows us to continue.
        my $lastIndex = $#fids;
        for (my $i = 0; $i <= $lastIndex; $i += 100) {
            # Determine the index of the last feature ID for this pass.
            # Normally it's $i + 99, but the final block may be shorter.
            my $end = $i + 99;
            $end = $lastIndex if $end > $lastIndex;
            # Get a slice of the feature list.
            my @slice = @fids[$i .. $end];
            # Get the relevant attributes.
            Trace("Retrieving attributes for fids $i to $end.") if T(ERDBLoadGroup => 3);
            my @aShort = $fig->get_attributes(\@slice, \@propKeys);
            Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(ERDBLoadGroup => 3);
            push @aList, @aShort;
        }
    }
    # Now we have all the interesting attributes in @aList. File each tuple
    # under its feature ID, ignoring tuples for features we don't know.
    for my $tuple (@aList) {
        my $fid = $tuple->[0];
        if (exists $attributesOf{$fid}) {
            push @{$attributesOf{$fid}}, $tuple;
        }
    }
    # Return the result.
    return \%attributesOf;
}
169 :    
170 :     =head3 GetSubsystems
171 :    
172 :     my $subsystems = $sl->GetSubsystems();
173 :    
174 :     Get a hash of the subsystems for this incarnation of the Sprout database.
175 :     The hash maps each subsystem ID to 1. The first time this method is called,
176 :     it creates a file listing the subsystems found. Subsequent calls read the
177 :     list from the file so that the selection of subsystems remains consistent.
178 :    
179 :     =cut
180 :    
sub GetSubsystems {
    # Return a reference to a hash that maps each selected subsystem ID to 1.
    # The selection is cached in a file in the load directory: if the file
    # exists it is read, otherwise the list is computed and the file is
    # created.
    my ($self) = @_;
    # The database object knows the load directory; the source object knows
    # the subsystems.
    my $sprout = $self->db();
    my $fig = $self->source();
    # Compute the name of the file that holds the subsystem list.
    my $subFileName = $sprout->LoadDirectory() . "/SubsystemList.dty";
    # The subsystem names will be accumulated in here.
    my @names;
    if (! -f $subFileName) {
        # There is no file yet. Keep the subsystems that the source object
        # accepts via nmpdr_subsystem.
        my @candidates = $fig->all_subsystems();
        for my $candidate (@candidates) {
            next unless $fig->nmpdr_subsystem($candidate);
            push @names, $candidate;
        }
        # Save the list so that later calls make the same selection.
        Tracer::PutFile($subFileName, \@names);
    } else {
        # The file is already there, so take the list from it.
        @names = Tracer::GetFile($subFileName);
    }
    # Convert the list of names into a hash and return it.
    my %retVal;
    $retVal{$_} = 1 for @names;
    return \%retVal;
}
209 :    
210 :    
211 :     =head3 GetSectionList
212 :    
213 :     my @sections = BaseSproutLoader::GetSectionList($sprout, $fig);
214 :    
215 :     Return a list of the sections for a Sprout load. The section list is
216 :     normally determined by retrieving a list of all the complete genomes and
217 :     adding an extra global section at the end; however, the first time the
218 :     sections are determined, they are stored in a master file so that the
219 :     same list is used regardless of what may have changed in the source data.
220 :     (A similar trick is used for subsystems).
221 :    
222 :     =over 4
223 :    
224 :     =item sprout
225 :    
226 :     [[SproutPm]] object for the database being loaded.
227 :    
228 :     =item fig
229 :    
230 :     [[FigPm]] object from which the data is being retrieved.
231 :    
232 :     =item RETURN
233 :    
234 :     Returns a list of section names.
235 :    
236 :     =back
237 :    
238 :     =cut
239 :    
sub GetSectionList {
    # Return the list of section names for a load. This is a plain function,
    # not a method: it takes the database object and the source object.
    my ($sprout, $fig) = @_;
    # Work out where the section master file lives in the load directory.
    my $loadDirectory = $sprout->LoadDirectory();
    my $masterName = ERDBGenerate::CreateFileName('section_master', undef, 'control');
    my $sectionFileName = "$loadDirectory/$masterName";
    # The section names will be put in here.
    my @sections;
    if (! -f $sectionFileName) {
        # There is no master file yet, so build the list: the genomes
        # returned by the source object, sorted in lexical order, followed
        # by the global section indicator.
        my @genomes = $fig->genomes(1);
        @sections = ((sort { $a cmp $b } @genomes), GLOBAL);
        # Save the list for future use. This ensures that the section list
        # stays the same even if the source data changes.
        Tracer::PutFile($sectionFileName, \@sections);
    } else {
        # The master file exists, so the list comes from it.
        @sections = Tracer::GetFile($sectionFileName);
    }
    # Return the list.
    return @sections;
}
263 :    
264 :     =head3 global
265 :    
266 :     my $flag = $sl->global();
267 :    
268 :     TRUE if this is the global section, else FALSE.
269 :    
270 :     =cut
271 :    
sub global {
    # Report whether this loader is working on the global section.
    my ($self) = @_;
    # The section name is stored in the object itself; it is the global
    # section if it matches the GLOBAL constant.
    my $isGlobal = ($self->{section} eq GLOBAL);
    return $isGlobal;
}
278 :    
279 :     =head3 GetCommaList
280 :    
281 :     my $string = $sl->GetCommaList($value);
282 :    
283 :     Create a comma-separated list of the values in a list reference. If the
284 :     input value is a scalar instead of a list reference, it will be returned
285 :     as a string. If it is undefined, an empty string will be returned. The idea
286 :     is that we may be looking at a string, a list, or nothing, but whatever
287 :     comes out will be a string.
288 :    
289 :     =over 4
290 :    
291 :     =item value
292 :    
293 :     Reference to a list of values to be assembled into the return string.
294 :    
295 :     =item RETURN
296 :    
297 :     Returns a scalar string containing the content of the input value.
298 :    
299 :     =back
300 :    
301 :     =cut
302 :    
sub GetCommaList {
    # Convert a value that may be a list reference, a scalar, or undefined
    # into a string. The object itself ($self) is not used.
    my ($self, $value) = @_;
    # Nothing at all comes back as an empty string.
    return "" if ! defined $value;
    # The elements of a list reference are joined with a comma and a space.
    return join(", ", @$value) if ref($value) eq 'ARRAY';
    # Anything else is flattened to a string.
    return "$value";
}
322 :    
323 :    
324 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3