[Bio] / Sprout / ERDBLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/ERDBLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.5, Wed Oct 12 02:58:20 2005 UTC revision 1.24, Sat Sep 20 14:30:21 2008 UTC
# Line 28  Line 28 
28    
29  =head3 new  =head3 new
30    
31  C<< my $erload = ERDBLoad->new($erdb, $relationName, $directory); >>      my $erload = ERDBLoad->new($erdb, $relationName, $directory, $loadOnly, $ignore);
32    
33  Begin loading an ERDB relation.  Begin loading an ERDB relation.
34    
# Line 46  Line 46 
46    
47  Name of the directory to use for the load files, WITHOUT a trailing slash.  Name of the directory to use for the load files, WITHOUT a trailing slash.
48    
49  =item estimatedRows (optional)  =item loadOnly
50    
51  Estimated maximum number of table rows. If omitted, the table will be created in  TRUE if the data is to be loaded from an existing file, FALSE if a file is
52  a format that permits an essentially unlimited number of rows.  to be created.
53    
54    =item ignore
55    
56    TRUE if the data is to be discarded. This is used to save time when only
57    a subset of the tables need to be loaded: the data for the ignored tables
58    is simply discarded.
59    
60  =back  =back
61    
# Line 57  Line 63 
63    
64  sub new {  sub new {
65      # Get the parameters.      # Get the parameters.
66      my ($class, $erdb, $relationName, $directory, $estimatedRows) = @_;      my ($class, $erdb, $relationName, $directory, $loadOnly, $ignore) = @_;
67      # Validate the directory name.      # Validate the directory name.
68      if (! -d $directory) {      if (! -d $directory) {
69          Confess("Load directory \"$directory\" not found.");          Confess("Load directory \"$directory\" not found.");
70      }      }
71      # Determine the name for this relation's load file.      # Determine the name for this relation's load file.
72      my $fileName = "$directory/$relationName.dtx";      my $fileName = "$directory/$relationName.dtx";
73      # If this is a primary entity relation, sort the output to remove      # Declare the file handle variable.
74      # duplicate keys.      my $fileHandle;
75      my $fileString = ($erdb->IsEntity($relationName) ?      # Determine whether or not this is a simply keyed relation. For a simply keyed
76                          "| sort +0 -1 -u -t \\t >$fileName" :      # relation, we can determine at run time if it is pre-sorted, and if so, skip
77                          ">$fileName");      # the sort step.
78        my $sortString = $erdb->SortNeeded($relationName);
79        # Get all of the key specifiers in the sort string.
80        my @specs = grep { $_ =~ /-k\S+/ } split /\s+/, $sortString;
81        # We are pre-sortable if the key is a single, non-numeric field at the beginning. If
82        # we are pre-sortable, we'll check each incoming key and skip the sort step if the
83        # keys are already in the correct order.
84        my $preSortable = (scalar(@specs) == 1 && $specs[0] eq "-k1,1");
85        # Check to see if this is a load-only, ignore, or a generate-and-load.
86        if ($ignore) {
87            Trace("Relation $relationName will be ignored.") if T(2);
88            $fileHandle = "";
89        } elsif ($loadOnly) {
90            Trace("Relation $relationName will be loaded from $fileName.") if T(2);
91            $fileHandle = "";
92        } else {
93            # Compute the file name for this relation. We will build a file on
94            # disk and then sort it into the real file when we're done.
95            my $fileString = ">$fileName.tmp";
96      # Open the output file and remember its handle.      # Open the output file and remember its handle.
97      my $fileHandle = Open(undef, $fileString);          $fileHandle = Open(undef, $fileString);
98            Trace("Relation $relationName load file created.") if T(2);
99        }
100      # Create the $erload object.      # Create the $erload object.
101      my $retVal = {      my $retVal = {
102                    dbh => $erdb,                    dbh => $erdb,
# Line 80  Line 106 
106                    fileSize => 0,                    fileSize => 0,
107                    lineCount => 0,                    lineCount => 0,
108                    stats => Stats->new(),                    stats => Stats->new(),
109                    primary => $erdb->_IsPrimary($relationName)                    presorted => $preSortable,
110                      ignore => ($ignore ? 1 : 0),
111                      sortString => $sortString,
112                      presorted => $preSortable,
113                      lastKey => ""
114                   };                   };
115      # Bless and return it.      # Bless and return it.
116      bless $retVal, $class;      bless $retVal, $class;
117      return $retVal;      return $retVal;
118  }  }
119    
120    =head3 Ignore
121    
122        my $flag = $erload->Ignore;
123    
124    Return TRUE if we are ignoring this table, else FALSE.
125    
126    =cut
127    #: Return Type $;
128    sub Ignore {
129        # Get the parameters.
130        my ($self) = @_;
131        # Return the result.
132        return $self->{ignore};
133    }
134    
135  =head3 Put  =head3 Put
136    
137  C<< my  = $erload->Put($field1, $field2, ..., $fieldN); >>      my  = $erload->Put($field1, $field2, ..., $fieldN);
138    
139  Write a line of data to the load file. This may also cause the load file to be closed  Write a line of data to the load file. This may also cause the load file to be closed
140  and data read into the table.  and data read into the table.
# Line 107  Line 152 
152  =cut  =cut
153  #: Return Type ;  #: Return Type ;
154  sub Put {  sub Put {
155      # Get the ERDBLoad instance.      # Get the ERDBLoad instance and the field list.
156      my $self = shift @_;      my ($self, @rawFields) = @_;
157        # Only proceed if we're not ignoring.
158        if (! $self->{ignore}) {
159            # Convert the hash-string fields to their digested value.
160            $self->{dbh}->DigestFields($self->{relName}, \@rawFields);
161            # Ensure the field values are okay.
162            my $truncates = $self->{dbh}->VerifyFields($self->{relName}, \@rawFields);
163      # Run through the list of field values, escaping them.      # Run through the list of field values, escaping them.
164      my @fields = map { Tracer::Escape($_) } @_;          my @fields = map { Tracer::Escape($_) } @rawFields;
     # If this is a primary relation, append the new-record field.  
     if ($self->{primary}) {  
         push @fields, '0';  
     }  
165      # Form a data line from the fields.      # Form a data line from the fields.
166      my $line = join("\t", @fields) . "\n";      my $line = join("\t", @fields) . "\n";
167      # Write the new record to the load file.      # Write the new record to the load file.
# Line 122  Line 169 
169      print $fh $line;      print $fh $line;
170      # Determine how long this will make the load file.      # Determine how long this will make the load file.
171      my $lineLength = length $line;      my $lineLength = length $line;
172            # Check to see if we're still pre-sorted.
173            if ($self->{presorted}) {
174                if ($fields[0] lt $self->{lastKey}) {
175                    # This key is out of order, so we're not pre-sorted any more.
176                    $self->{presorted} = 0;
177                } else {
178                    # We're still pre-sorted, so save this key.
179                    $self->{lastKey} = $fields[0];
180                }
181            }
182      # Update the statistics.      # Update the statistics.
183      $self->{fileSize} += $lineLength;      $self->{fileSize} += $lineLength;
184      $self->{lineCount} ++;      $self->{lineCount} ++;
185      $self->Add("lineOut");      $self->Add("lineOut");
186            if ($truncates > 0) {
187                $self->Add("truncated", $truncates);
188            }
189        }
190  }  }
191    
192  =head3 Add  =head3 Add
193    
194  C<< my  = $stats->Add($statName); >>      my  = $stats->Add($statName, $value);
195    
196  Increment the specified statistic.  Increment the specified statistic.
197    
# Line 140  Line 201 
201    
202  Name of the statistic to increment.  Name of the statistic to increment.
203    
204    =item value (optional)
205    
206    Value by which to increment it. If omitted, C<1> is assumed.
207    
208  =back  =back
209    
210  =cut  =cut
211  #: Return Type ;  #: Return Type ;
212  sub Add {  sub Add {
213      # Get the parameters.      # Get the parameters.
214      my ($self, $statName) = @_;      my ($self, $statName, $value) = @_;
215        # Fix the value.
216        if (! defined $value) {
217            $value = 1;
218        }
219      # Increment the statistic.      # Increment the statistic.
220      $self->{stats}->Add($statName);      $self->{stats}->Add($statName, $value);
221  }  }
222    
223  =head3 Finish  =head3 Finish
224    
225  C<< my $stats = $erload->Finish(); >>      my $stats = $erload->Finish();
226    
227  Finish loading the table. This closes the load file and loads its contents into the database.  Finish loading the table. This closes and sorts the load file.
 It also creates the indexes if the DBMS uses post-indexing.  
228    
229  =over 4  =over 4
230    
# Line 172  Line 240 
240  sub Finish {  sub Finish {
241      # Get this object instance.      # Get this object instance.
242      my ($self) = @_;      my ($self) = @_;
243        if ($self->{fh}) {
244      # Close the load file.      # Close the load file.
245      close $self->{fh};      close $self->{fh};
246            # Get the ERDB object.
247            my $erdb = $self->{dbh};
248            # Get the output file name.
249            my $fileName = $self->{fileName};
250            # Do we need a sort?
251            if ($self->{presorted}) {
252                # No, so just rename the file.
253                Trace("$fileName is pre-sorted.") if T(3);
254                unlink $fileName;
255                rename "$fileName.tmp", $fileName;
256            } else {
257                # Get the sort command for this relation.
258                my $sortCommand = $erdb->SortNeeded($self->{relName});
259                Trace("Sorting into $fileName with command: $sortCommand") if T(3);
260                # Set up a timer.
261                my $start = time();
262                # Execute the sort command and save the error output.
263                my @messages = `$sortCommand 2>&1 1>$fileName <$fileName.tmp`;
264                # Record the time spent
265                $self->{stats}->Add(sortTime => (time() - $start));
266                # If there was no error, delete the temp file.
267                if (! scalar(@messages)) {
268                    unlink "$fileName.tmp";
269                } else {
270                    # Here there was an error.
271                    Confess("Error messages from $sortCommand:\n" . join("\n", @messages));
272                }
273            }
274            # Tell the user we're done.
275            Trace("Load file $fileName created.") if T(3);
276        }
277      # Return the statistics object.      # Return the statistics object.
278      return $self->{stats};      return $self->{stats};
279  }  }
280    
281    =head3 FinishAndLoad
282    
283        my $stats = $erload->FinishAndLoad();
284    
285    Finish the load and load the table, returning the statistics.
286    
287    =cut
288    
289    sub FinishAndLoad {
290        # Get the parameters.
291        my ($self) = @_;
292        # Finish the load file.
293        my $retVal = $self->Finish();
294        # Load the table.
295        my $newStats = $self->LoadTable();
296        # Accumulate the stats.
297        $retVal->Accumulate($newStats);
298        # Return the result.
299        return $retVal;
300    }
301    
302  =head3 RelName  =head3 RelName
303    
304  C<< my $name = $erload->RelName; >>      my $name = $erload->RelName;
305    
306  Name of the relation being loaded by this object.  Name of the relation being loaded by this object.
307    
# Line 193  Line 314 
314      return $self->{relName};      return $self->{relName};
315  }  }
316    
317    =head3 LoadTable
318    
319        my $stats = $erload->LoadTable();
320    
321    Load the database table from the load file and return a statistics object.
322    
323    =cut
324    
325    sub LoadTable {
326        # Get the parameters.
327        my ($self) = @_;
328        # Get the database object, the file name, and the relation name.
329        my $erdb = $self->{dbh};
330        my $fileName = $self->{fileName};
331        my $relName = $self->{relName};
332        # Load the table. The third parameter indicates this is a drop and reload.
333        my $retVal = $erdb->LoadTable($fileName, $relName, truncate => 1);
334        # Return the result.
335        return $retVal;
336    }
337    
338  1;  1;
339    

Legend:
Removed from v.1.5  
changed lines
  Added in v.1.24

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3