[Bio] / Sprout / ERDBLoad.pm Repository:
ViewVC logotype

Annotation of /Sprout/ERDBLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.24 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package ERDBLoad;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use PageBuilder;
8 :     use ERDB;
9 :     use Stats;
10 :    
11 :     =head1 ERDB Table Load Utility Object
12 :    
13 :     =head2 Introduction
14 :    
15 : parrello 1.3 This object is designed to assist with creating the load file for an ERDB
16 :     data relation. The user constructs the object by specifying an ERDB object
17 :     and a relation name. This creates the load file for the relevant relation. The client
18 :     then passes in data lines which are written to a file, and calls
19 :     L</Finish> to close the file and get the statistics.
20 : parrello 1.1
21 : parrello 1.3 This module makes use of the internal ERDB method C<_IsPrimary>.
22 : parrello 1.1
23 :     =cut
24 :    
25 :     #
26 :    
27 :     =head2 Public Methods
28 :    
29 :     =head3 new
30 :    
31 : parrello 1.20 my $erload = ERDBLoad->new($erdb, $relationName, $directory, $loadOnly, $ignore);
32 : parrello 1.1
33 :     Begin loading an ERDB relation.
34 :    
35 :     =over 4
36 :    
37 :     =item erdb
38 :    
39 :     ERDB object representing the target database.
40 :    
41 :     =item relationName
42 :    
43 :     Name of the relation being loaded.
44 :    
45 :     =item directory
46 :    
47 :     Name of the directory to use for the load files, WITHOUT a trailing slash.
48 :    
49 : parrello 1.8 =item loadOnly
50 : parrello 1.2
51 : parrello 1.8 TRUE if the data is to be loaded from an existing file, FALSE if a file is
52 :     to be created.
53 : parrello 1.2
54 : parrello 1.10 =item ignore
55 :    
56 :     TRUE if the data is to be discarded. This is used to save time when only
57 :     a subset of the tables need to be loaded: the data for the ignored tables
58 :     is simply discarded.
59 :    
60 : parrello 1.1 =back
61 :    
62 :     =cut
63 :    
sub new {
    # Get the parameters.
    my ($class, $erdb, $relationName, $directory, $loadOnly, $ignore) = @_;
    # The load directory must already exist; it is never created here.
    if (! -d $directory) {
        Confess("Load directory \"$directory\" not found.");
    }
    # Determine the name for this relation's load file.
    my $fileName = "$directory/$relationName.dtx";
    # Ask the database object for this relation's sort specification and pull
    # out the key specifiers (the "-k..." tokens).
    my $sortString = $erdb->SortNeeded($relationName);
    my @specs = grep { $_ =~ /-k\S+/ } split /\s+/, $sortString;
    # We are pre-sortable if the key is a single, non-numeric field at the
    # beginning. For such a relation, Put checks each incoming key, and Finish
    # skips the sort step if the keys arrived in the correct order.
    my $preSortable = (scalar(@specs) == 1 && $specs[0] eq "-k1,1");
    # Only the generate-and-load mode owns an output handle. In the ignore and
    # load-only modes the handle stays an empty (false) string.
    my $fileHandle = "";
    if ($ignore) {
        Trace("Relation $relationName will be ignored.") if T(2);
    } elsif ($loadOnly) {
        Trace("Relation $relationName will be loaded from $fileName.") if T(2);
    } else {
        # Compute the file name for this relation. We build a temporary file
        # on disk; Finish turns it into the real load file when we're done.
        my $fileString = ">$fileName.tmp";
        # Open the output file and remember its handle.
        $fileHandle = Open(undef, $fileString);
        Trace("Relation $relationName load file created.") if T(2);
    }
    # Create the $erload object. Each key appears exactly once.
    my $retVal = {
        dbh        => $erdb,
        fh         => $fileHandle,
        fileName   => $fileName,
        relName    => $relationName,
        fileSize   => 0,
        lineCount  => 0,
        stats      => Stats->new(),
        ignore     => ($ignore ? 1 : 0),
        sortString => $sortString,
        presorted  => $preSortable,
        lastKey    => ""
    };
    # Bless and return it.
    bless $retVal, $class;
    return $retVal;
}
119 :    
120 : parrello 1.10 =head3 Ignore
121 :    
122 : parrello 1.20 my $flag = $erload->Ignore;
123 : parrello 1.10
124 :     Return TRUE if we are ignoring this table, else FALSE.
125 :    
126 :     =cut
127 :     #: Return Type $;
sub Ignore {
    # The only argument is the loader object itself.
    my $self = shift;
    # Hand back the stored ignore flag unchanged.
    return $self->{ignore};
}
134 :    
135 : parrello 1.1 =head3 Put
136 :    
137 : parrello 1.20 $erload->Put($field1, $field2, ..., $fieldN);
138 : parrello 1.1
139 :     Write a line of data to the load file. This may also cause the load file to be closed
140 :     and data read into the table.
141 :    
142 :     =over 4
143 :    
144 :     =item field1, field2, ..., fieldN
145 :    
146 :     List of field values to be put into the data line. The field values must be in the
147 :     order shown in the documentation for the table. Internal tabs and
148 :     new-lines will automatically be escaped before the data line is formatted.
149 :    
150 :     =back
151 :    
152 :     =cut
153 :     #: Return Type ;
sub Put {
    # Get the ERDBLoad instance and the field list.
    my ($self, @rawFields) = @_;
    # Only proceed if we're not ignoring.
    if (! $self->{ignore}) {
        # A loader without an output handle (the constructor stores an empty
        # string in load-only mode) has nothing to write to, so fail with an
        # explicit message.
        my $fh = $self->{fh};
        if (! $fh) {
            Confess("Cannot write to relation $self->{relName}: no load file is open.");
        }
        # Convert the hash-string fields to their digested value.
        $self->{dbh}->DigestFields($self->{relName}, \@rawFields);
        # Insure the field values are okay. The result is recorded below as
        # the "truncated" statistic when it is positive.
        my $truncates = $self->{dbh}->VerifyFields($self->{relName}, \@rawFields);
        # Run through the list of field values, escaping them.
        my @fields = map { Tracer::Escape($_) } @rawFields;
        # Form a data line from the fields.
        my $line = join("\t", @fields) . "\n";
        # Write the new record to the load file, reporting any write failure.
        print {$fh} $line
            or Confess("Error writing to load file for $self->{relName}: $!");
        # Determine how long this will make the load file.
        my $lineLength = length $line;
        # Check to see if we're still pre-sorted.
        if ($self->{presorted}) {
            # An empty field list has no key; treat it as the empty string.
            my $key = (defined $fields[0] ? $fields[0] : "");
            if ($key lt $self->{lastKey}) {
                # This key is out of order, so we're not pre-sorted any more.
                $self->{presorted} = 0;
            } else {
                # We're still pre-sorted, so save this key.
                $self->{lastKey} = $key;
            }
        }
        # Update the statistics.
        $self->{fileSize} += $lineLength;
        $self->{lineCount} ++;
        $self->Add("lineOut");
        if ($truncates > 0) {
            $self->Add("truncated", $truncates);
        }
    }
    # This method has no meaningful return value.
    return;
}
191 :    
192 :     =head3 Add
193 :    
194 : parrello 1.20 $erload->Add($statName, $value);
195 : parrello 1.4
196 :     Increment the specified statistic.
197 :    
198 :     =over 4
199 :    
200 :     =item statName
201 :    
202 :     Name of the statistic to increment.
203 :    
204 : parrello 1.9 =item value (optional)
205 :    
206 :     Value by which to increment it. If omitted, C<1> is assumed.
207 :    
208 : parrello 1.4 =back
209 :    
210 :     =cut
211 :     #: Return Type ;
sub Add {
    # Unpack the loader, the statistic name, and the optional increment.
    my ($self, $statName, $value) = @_;
    # A missing increment means "count one".
    $value = 1 unless defined $value;
    # Delegate to the statistics object held by this loader.
    return $self->{stats}->Add($statName, $value);
}
222 :    
223 :     =head3 Finish
224 :    
225 : parrello 1.20 my $stats = $erload->Finish();
226 : parrello 1.1
227 : parrello 1.23 Finish loading the table. This closes and sorts the load file.
228 : parrello 1.1
229 :     =over 4
230 :    
231 :     =item RETURN
232 :    
233 :     Returns a statistics object describing what happened during the load and containing any
234 :     error messages.
235 :    
236 :     =back
237 :    
238 :     =cut
239 :    
sub Finish {
    # Get this object instance.
    my ($self) = @_;
    # Only a loader with an open output handle has a file to finish.
    my $fh = $self->{fh};
    if ($fh) {
        # Forget the handle before doing anything else, so that a second call
        # (for example Finish followed by FinishAndLoad) is a harmless no-op
        # instead of unlinking the finished load file.
        $self->{fh} = "";
        # Get the output file name and the name of the temporary file.
        my $fileName = $self->{fileName};
        my $tempName = "$fileName.tmp";
        # Close the load file. Buffered write errors surface here.
        close($fh)
            or Confess("Error closing load file $tempName: $!");
        # Get the ERDB object.
        my $erdb = $self->{dbh};
        # Do we need a sort?
        if ($self->{presorted}) {
            # No, so just rename the file.
            Trace("$fileName is pre-sorted.") if T(3);
            unlink $fileName;
            rename($tempName, $fileName)
                or Confess("Could not rename $tempName to $fileName: $!");
        } else {
            # Get the sort command for this relation.
            my $sortCommand = $erdb->SortNeeded($self->{relName});
            Trace("Sorting into $fileName with command: $sortCommand") if T(3);
            # Set up a timer.
            my $start = time();
            # Execute the sort command and save the error output. The file
            # names are interpolated into a shell command line unquoted.
            my @messages = `$sortCommand 2>&1 1>$fileName <$fileName.tmp`;
            # Capture the exit status before anything else can change it.
            my $status = $?;
            # Record the time spent.
            $self->{stats}->Add(sortTime => (time() - $start));
            if (scalar(@messages)) {
                # Here there was an error.
                Confess("Error messages from $sortCommand:\n" . join("\n", @messages));
            } elsif ($status != 0) {
                # The command failed without printing anything. Keep the
                # temporary file so the data is not lost.
                Confess("Sort command $sortCommand failed with status $status.");
            } else {
                # The sort worked, so the temporary file is no longer needed.
                unlink $tempName;
            }
        }
        # Tell the user we're done.
        Trace("Load file $fileName created.") if T(3);
    }
    # Return the statistics object.
    return $self->{stats};
}
280 :    
281 : parrello 1.19 =head3 FinishAndLoad
282 :    
283 : parrello 1.20 my $stats = $erload->FinishAndLoad();
284 : parrello 1.19
285 :     Finish the load and load the table, returning the statistics.
286 :    
287 :     =cut
288 :    
sub FinishAndLoad {
    # The only argument is the loader object itself.
    my $self = shift;
    # Finish the load file first; this yields the loader's statistics object.
    my $stats = $self->Finish();
    # Load the table, then fold the load statistics into the first object.
    my $loadStats = $self->LoadTable();
    $stats->Accumulate($loadStats);
    # The combined statistics are the result.
    return $stats;
}
301 :    
302 : parrello 1.1 =head3 RelName
303 :    
304 : parrello 1.20 my $name = $erload->RelName;
305 : parrello 1.1
306 :     Name of the relation being loaded by this object.
307 :    
308 :     =cut
309 :    
sub RelName {
    # The only argument is the loader object itself.
    my $self = shift;
    # Hand back the relation name stored at construction time.
    return $self->{relName};
}
316 :    
317 : parrello 1.19 =head3 LoadTable
318 :    
319 : parrello 1.20 my $stats = $erload->LoadTable();
320 : parrello 1.19
321 :     Load the database table from the load file and return a statistics object.
322 :    
323 :     =cut
324 :    
sub LoadTable {
    # The only argument is the loader object itself.
    my $self = shift;
    # Pull the database object, load file name, and relation name out of the
    # loader in one step.
    my ($erdb, $fileName, $relName) = @{$self}{qw(dbh fileName relName)};
    # Ask the database object to load the table from the file with the
    # truncate option switched on.
    my $loadStats = $erdb->LoadTable($fileName, $relName, truncate => 1);
    # Pass the database object's result straight back.
    return $loadStats;
}
337 :    
338 : parrello 1.1 1;
339 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3