[Bio] / Sprout / ERDBLoadGroup.pm Repository:
ViewVC logotype

Diff of /Sprout/ERDBLoadGroup.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Thu Oct 2 16:32:42 2008 UTC revision 1.7, Thu May 28 18:06:58 2009 UTC
# Line 77  Line 77 
77    
78  name of this load group  name of this load group
79    
80    =item label
81    
82    name of this worker process
83    
84  =item lastKey  =item lastKey
85    
86  ID of the last major object processed  ID of the last major object processed
# Line 111  Line 115 
115    
116  =head3 new  =head3 new
117    
118      my $edbl = ERDBLoadGroup->new($source, $db, $directory, $options, @tables);      my $edbl = ERDBLoadGroup->new($db, $directory, $options, @tables);
119    
120  Construct a new ERDBLoadGroup object. The following parameters are expected:  Construct a new ERDBLoadGroup object. The following parameters are expected:
121    
122  =over 4  =over 4
123    
 =item source  
   
 The object to be used by the subclass to access the source data. If this parameter  
 is undefined, the source object will be retrieved from the database object as soon  
 as the client calls the L</source> method.  
   
124  =item db  =item db
125    
126  The [[ErdbPm]] object for the database being loaded.  The [[ErdbPm]] object for the database being loaded.
# Line 142  Line 140 
140    
141  sub new {  sub new {
142      # Get the parameters.      # Get the parameters.
143      my ($class, $source, $db, $options, @tables) = @_;      my ($class, $db, $options, @tables) = @_;
144      # Create a statistics object      # Create a statistics object
145      my $stats = Stats->new();      my $stats = Stats->new();
146      # Compute the group name from the class name. It is the first word in      # Compute the group name from the class name. It is the first word in
# Line 160  Line 158 
158                      directory => $directory,                      directory => $directory,
159                      group => $group,                      group => $group,
160                      stats => $stats,                      stats => $stats,
161                      source => $source,                      source => undef,
162                        label => ($options->{label} || $$),
163                      lastKey => undef,                      lastKey => undef,
164                      loaders => {},                      loaders => {},
165                      tables => \@tables,                      tables => \@tables,
# Line 172  Line 171 
171      return $retVal;      return $retVal;
172  }  }
173    
174    =head3 TRAILER
175    
176    This is a string constant that always compares high against real data.
177    
178    =cut
179    
180    use constant TRAILER => "\xFF";
181    
182  =head2 Subclass Methods  =head2 Subclass Methods
183    
184  =head3 Put  =head3 Put
# Line 214  Line 221 
221      }      }
222  }  }
223    
224    =head3 PutE
225    
226        $edbl->PutE($table => $id, %fields);
227    
228    Place an entity-based table record in a load file. The first field
229    specified after the table name is the ID.
230    
231    =over 4
232    
233    =item table
234    
235    Name of the relevant table.
236    
237    =item id
238    
239    ID of the relevant entity.
240    
241    =item fields
242    
243    Hash mapping field names to values.
244    
245    =back
246    
247    =cut
248    
249    sub PutE {
250        # Get the parameters.
251        my ($self, $table, $id, %fields) = @_;
252        # Put the record.
253        $self->Put($table, id => $id, %fields);
254        # Record that we've done a putE.
255        $self->Add(putE => 1);
256    }
257    
258    =head3 PutR
259    
260        $edbl->PutR($table => $from, $to, %fields);
261    
262    Place a relationship record in a load file. The first two fields
263    specified after the table name are the from-link and the to-link,
264    respectively.
265    
266    =over 4
267    
268    =item table
269    
270    Name of the relevant relationship.
271    
272    =item from
273    
274    ID of the from-entity.
275    
276    =item to
277    
278    ID of the to-entity.
279    
280    =item fields
281    
282    Hash mapping field names to field values.
283    
284    =back
285    
286    =cut
287    
288    sub PutR {
289        # Get the parameters.
290        my ($self, $table, $from, $to, %fields) = @_;
291        # Put the record.
292        $self->Put($table, 'from-link' => $from, 'to-link' => $to, %fields);
293        # Record that we've done a PutR.
294        $self->Add(putR => 1);
295    }
296    
297    
298  =head3 Add  =head3 Add
299    
300      $edbl->Add($statName => $count);      $edbl->Add($statName => $count);
# Line 243  Line 324 
324      $self->{stats}->Add($statName => $count);      $self->{stats}->Add($statName => $count);
325  }  }
326    
327    =head3 AddWarning
328    
329        $edbl->AddWarning($errorType => $message);
330    
331    Record a warning. Warnings indicate possible errors in the incoming data.
332    The first warning of a specified type is added as a message to the load
333    statistic. All warnings are also traced at level 3.
334    
335    =over 4
336    
337    =item errorType
338    
339    Type of error indicated by the warning. This is used as the label when the
340    warning is counted in the statistics object.
341    
342    =item message
343    
344    Message describing the reason for the warning.
345    
346    =back
347    
348    =cut
349    
350    sub AddWarning {
351        # Get the parameters.
352        my ($self, $errorType, $message) = @_;
353        # Count the warning.
354        my $count = $self->Add($errorType);
355        # Is this the first one of this type?
356        if ($count == 1) {
357            # Yes, add it to the messages for the end.
358            $self->{stats}->AddMessage($errorType);
359        } else {
360            # No, just trace it.
361            Trace("Data warning: $message") if T(3);
362        }
363    }
364    
365  =head3 Track  =head3 Track
366    
367      $edbl->Track($statName => $key, $period);      $edbl->Track($statName => $key, $period);
# Line 263  Line 382 
382  =item statName  =item statName
383    
384  Name of the statistic to be incremented. This should be a plural noun  Name of the statistic to be incremented. This should be a plural noun
385  describing the object whose kep is coming in.  describing the object whose key is coming in.
386    
387  =item key  =item key
388    
# Line 288  Line 407 
407      # Do we need to output a progress message?      # Do we need to output a progress message?
408      if ($period && T(3) && ($newValue % $period == 0)) {      if ($period && T(3) && ($newValue % $period == 0)) {
409          # Yes.          # Yes.
410          Trace("$newValue $statName processed for $self->{group} group.");          MemTrace("$newValue $statName processed by $self->{label} for $self->{group} group.");
411      }      }
412  }  }
413    
# Line 342  Line 461 
461      return $self->{db};      return $self->{db};
462  }  }
463    
464    =head3 FilterRelationship
465    
466        my $stats = $edbl->FilterRelationship($type => $relationshipName);
467    
468    This method will compare a relationship's load file to a target entity
469    file and remove rows for which no target entity exists. This is useful
470    when a relationship and entity are created by different load groups, so
471    there is no opportunity in the generator to verify that the relationship
472    records are relevant to this database. Typically, this method is called
473    during post-processing, between generation by [[ErdbGeneratorPl]] and the
474    actual database table loads.
475    
476    =over 4
477    
478    =item type
479    
480    Relevant relationship direction-- C<from> or C<to>.
481    
482    =item relationshipName
483    
484    Name of the relationship whose load file is to be filtered.
485    
486    =item RETURN
487    
488    
489    
490    =back
491    
492    =cut
493    
494    sub FilterRelationship {
495        # Get the parameters.
496        my ($self, $type, $relationshipName) = @_;
497        # Declare the return variable.
498        my $retVal = Stats->new();
499        # Get the database object.
500        my $erdb = $self->db();
501        # Get the relationship's descriptor. We need this to find the relevant entity.
502        my $relData = $erdb->FindRelationship($relationshipName);
503        if (! defined $relData) {
504            Confess("Relationship $relationshipName not found in this database.");
505        } else {
506            # We have the relationship, so get the name of the target entity.
507            my $entityName = $relData->{$type};
508            # We need to find where the entity's ID will be in the relationship's
509            # load file. FROM is always first, then TO.
510            my $fieldPos = ($type eq 'from' ? 1 : 2);
511            Trace("Filtering relationship $relationshipName against $entityName using field $type($fieldPos).") if T(3);
512            # We will be reading from the entity and relationship load files in
513            # parallel, with both sorted by the entity ID. The output will be
514            # sort-piped to a temporary file.
515            my $relationshipFileName = ERDBGenerate::CreateFileName($relationshipName,
516                                                                    undef, 'data');
517            my $relationshipTempName = ERDBGenerate::CreateFileName($relationshipName,
518                                                                    undef, 'temp');
519            my $entityFileName =       ERDBGenerate::CreateFileName($entityName,
520                                                                    undef, 'data');
521            # Get the desired sort for the relationship file. We use this for
522            # the relationship output.
523            my $sortOut = $erdb->SortNeeded($relationshipName);
524            # Now we can open our files.
525            my $rih = Open(undef, "sort -k$fieldPos,$fieldPos <$relationshipFileName |");
526            my $eih = Open(undef, "sort -k1,1 <$entityFileName |");
527            my $roh = Open(undef, "| $sortOut >$relationshipTempName");
528            # Convert the field position from 1-based (for the sort) to 0-based (for PERL).
529            $fieldPos--;
530            # Get the first record in each file.
531            my ($rKey, $relRecord) = GetRecord($rih, $fieldPos);
532            my ($eKey) = GetRecord($eih, 0);
533            # Loop until we run out of records in the relationship file.
534            while ($rKey lt TRAILER) {
535                # Roll the entity file forward until we find the spot for
536                # this relationship.
537                while ($rKey gt $eKey) {
538                    ($eKey) = GetRecord($eih, 0);
539                }
540                # If we have a match, we output the relationship record.
541                # At this point eKey could be TRAILER, but rKey cannot, because
542                # it hasn't changed since the while condition was evaluated.
543                if ($eKey eq $rKey) {
544                    Tracer::PutLine($roh, $relRecord);
545                    $retVal->Add("kept-$relationshipName" => 1);
546                } else {
547                    $retVal->Add("rejected-$relationshipName" => 1);
548                }
549                # Get the next relationship record.
550                ($rKey, $relRecord) = GetRecord($rih, $fieldPos);
551            }
552            # Now we close everything and move the temp file over the top of the
553            # real relationship file.
554            Trace("Closing files.") if T(3);
555            close $rih;
556            close $eih;
557            close $roh;
558            Trace("Renaming filtered relationship file for $relationshipName.") if T(3);
559            unlink $relationshipFileName;
560            rename $relationshipTempName, $relationshipFileName;
561        }
562        # Return the result.
563        return $retVal;
564    }
565    
566    =head3 GetTables
567    
568        my @tables = ERDBLoadGroup::GetTables($erdb, $group);
569    
570    or
571    
572        my @tables = $edbl->GetTables();
573    
574    Return the list of tables belonging to the specified load group.
575    
576    =over 4
577    
578    =item erdb
579    
580    [[ErdbPm]] subclass object for the relevant database.
581    
582    =item group
583    
584    Name of the relevant group.
585    
586    =item RETURN
587    
588    Returns a list of a tables loaded by the specified group.
589    
590    =back
591    
592    =cut
593    
594    sub GetTables {
595        # Get the parameters.
596        my ($self, $group) = @_;
597        # We need a loader. If the caller gave us an ERDB object instead, we need to
598        # convert it.
599        if (! $self->isa(__PACKAGE__)) {
600            $self = $self->Loader($group, undef, {});
601        }
602        # Extract the list of tables.
603        my @retVal = @{$self->{tables}};
604        # Return the result.
605        return @retVal;
606    }
607    
608  =head2 Internal Methods  =head2 Internal Methods
609    
610  =head3 ProcessSection  =head3 ProcessSection
# Line 375  Line 638 
638      $self->{section} = $section;      $self->{section} = $section;
639      # Get the database object.      # Get the database object.
640      my $db = $self->db();      my $db = $self->db();
     # Start a timer and protect ourselves from errors.  
     my $startTime = time();  
     eval {  
641          # Get the list of tables for this group.          # Get the list of tables for this group.
642          my @tables = @{$self->{tables}};          my @tables = @{$self->{tables}};
643        # Should we skip this section?
644        if ($self->SkipIndicated($section, \@tables)) {
645            Trace("Resume mode: section $section skipped for group $self->{group}.") if T(3);
646            $self->Add("section-skips" => 1);
647        } else {
648            # Not skipping. Start a timer and protect ourselves from errors.
649            my $startTime = time();
650            eval {
651          # Get the loader hash.          # Get the loader hash.
652          my $loaderHash = $self->{loaders};          my $loaderHash = $self->{loaders};
653          # Initialize the loaders for the necessary tables.          # Initialize the loaders for the necessary tables.
# Line 388  Line 656 
656              my $loader = $loaderHash->{$table};              my $loader = $loaderHash->{$table};
657              # If it doesn't exist yet, create it.              # If it doesn't exist yet, create it.
658              if (! defined $loader) {              if (! defined $loader) {
659                  $loader = ERDBGenerate->new($db, $self->{directory}, $table);                      $loader = ERDBGenerate->new($db, $self->{directory}, $table, $self->{stats});
660                  # Save it for future use.                  # Save it for future use.
661                  $loaderHash->{$table} = $loader;                  $loaderHash->{$table} = $loader;
662                  # Count it.                  # Count it.
# Line 397  Line 665 
665              $loader->Start($section);              $loader->Start($section);
666          }          }
667          # Generate the data to put in the newly-created load files.          # Generate the data to put in the newly-created load files.
         Trace("Calling generator.") if T(3);  
668          $self->Generate();          $self->Generate();
669                # Release our hold on the source object. This allows the database object to
670                # decide whether or not we need a new one.
671                delete $self->{source};
672                # Clean up the database object.
673                $db->Cleanup();
674      };      };
675      # Did it work?      # Did it work?
676      if ($@) {      if ($@) {
677          # No, so emit an error message and abort all the loaders.              # No, so we need to emit an error message and abort all the loaders.
678          $self->{stats}->AddMessage("Error loading section $section: $@");              # First, we need to clean the new-line from the message (if any).
679                my $msg = $@;
680                chomp $msg;
681                # Figure out what we were doing at the time of the error.
682                my $place = "Error in section $section";
683          if (defined $self->{lastKey}) {          if (defined $self->{lastKey}) {
684              $self->{stats}->AddMessage("Error occurred while processing \"$self->{lastKey}\".");                  $place .= "($self->{lastKey})";
685          }          }
686                # Format the message and denote we have a section failure.
687                $self->{stats}->AddMessage("$place: $msg");
688          $self->Add("section-errors" => 1);          $self->Add("section-errors" => 1);
689                # Abort the loaders.
690          for my $loader (values %{$self->{loaders}}) {          for my $loader (values %{$self->{loaders}}) {
691              $loader->Abort();              $loader->Abort();
692          }          }
693      } else {      } else {
694          # Yes! Finish all the loaders.              # It did work! Finish all the loaders.
695          for my $loader (values %{$self->{loaders}}) {          for my $loader (values %{$self->{loaders}}) {
696              $loader->Finish();              $loader->Finish();
697          }          }
698          # Update the load count and the timer.              # Update the load count.
699          $self->Add("section-loads" => 1);          $self->Add("section-loads" => 1);
700            }
701            # Update the timer.
702          $self->Add(duration => (time() - $startTime));          $self->Add(duration => (time() - $startTime));
703      }      }
704  }  }
# Line 437  Line 718 
718      return $self->{stats}->Show();      return $self->{stats}->Show();
719  }  }
720    
721    =head3 AccumulateStats
722    
723        $edbl->AccumulateStats($stats);
724    
725    Add this load's statistics into the caller-specified statistics object.
726    
727    =over 4
728    
729    =item stats
730    
731    [[StatsPm]] object into which this load's statistics will be accumulated.
732    
733    =back
734    
735    =cut
736    
737    sub AccumulateStats {
738        # Get the parameters.
739        my ($self, $stats) = @_;
740        # Roll up our statistics in the caller's object.
741        $stats->Accumulate($self->{stats});
742    }
743    
744    
745  =head3 GetGroupHash  =head3 GetGroupHash
746    
747      my $groupHash = ERDBLoadGroup::GetGroupHash($erdb);      my $groupHash = ERDBLoadGroup::GetGroupHash($erdb);
# Line 468  Line 773 
773      # Loop through the list of load groups.      # Loop through the list of load groups.
774      for my $group ($erdb->LoadGroupList()) {      for my $group ($erdb->LoadGroupList()) {
775          # Stash the loader's tables in the output hash.          # Stash the loader's tables in the output hash.
776          $retVal->{$group} = GetTables($erdb, $group);          $retVal->{$group} = [ GetTables($erdb, $group) ];
777      }      }
778      # Return the result.      # Return the result.
779      return $retVal;      return $retVal;
780  }  }
781    
 =head3 GetTables  
   
     my @tables = ERDBLoadGroup::GetTables($group);  
   
 Return the list of tables belonging to the specified load group.  
   
 =over 4  
   
 =item erdb  
   
 Return the list of tables for the specified load group.  
   
 =item group  
   
 Name of relevant group.  
   
 =item RETURN  
   
 Returns a list of a tables loaded by the specified group.  
   
 =back  
   
 =cut  
   
 sub GetTables {  
     # Get the parameters.  
     my ($erdb, $group) = @_;  
     # Create a loader for the specified group.  
     my $loader = $erdb->Loader($group, undef, {});  
     # Extract the list of tables.  
     my @retVal = @{$loader->{tables}};  
     # Return the result.  
     return @retVal;  
 }  
   
   
782  =head3 ComputeGroups  =head3 ComputeGroups
783    
784      my @groupList = ERDBLoadGroup::ComputeGroups($erdb, \@groups);      my @groupList = ERDBLoadGroup::ComputeGroups($erdb, \@groups);
# Line 574  Line 843 
843          }          }
844      }      }
845      # Normalize the group names and return them.      # Normalize the group names and return them.
846      return map { ucfirst $_ } @retVal;      @retVal = map { ucfirst $_ } @retVal;
847        Trace("Final group list is " . join(" ", @retVal) . ".") if T(2);
848        return @retVal;
849  }  }
850    
851  =head3 KillFileName  =head3 KillFileName
# Line 616  Line 887 
887      return $retVal;      return $retVal;
888  }  }
889    
890    =head3 SkipIndicated
891    
892        my $flag = $edbl->SkipIndicated($section, \@tables);
893    
894    Return FALSE if the current group should be run for the current section.
895    If the C<resume> option is not set, this method always returns FALSE;
896    otherwise, it will look at the files currently in the load directory and
897    if enough of them are present, it will return TRUE, indicating there's
898    no point in generating data for the indicated tables with respect to the
899    current section. In other words, it will return TRUE if, for every table,
900    there is either a load file for that table or a load file for the
901    specified section of that table.
902    
903    =over 4
904    
905    =item section
906    
907    ID of the relevant section.
908    
909    =item tables
910    
911    List of tables to check.
912    
913    =item RETURN
914    
915    Returns TRUE if load files are already generated for the specified section, else FALSE.
916    
917    =back
918    
919    =cut
920    
921    sub SkipIndicated {
922        # Get the parameters.
923        my ($self, $section, $tables) = @_;
924        # Declare the return variable. It's FALSE if there's no resume parameter.
925        my $retVal = $self->{options}->{resume};
926        # Loop through the table names while $retval is TRUE.
927        for my $table (@$tables) { last if ! $retVal;
928            # Compute the file names.
929            my @files = map { ERDBGenerate::CreateFileName($table, $_, data => $self->{directory}) }
930                (undef, $section);
931            # If neither is present, we can't skip. So, if the grep below returns an empty
932            # list, we set $retVal FALSE, which stops the loop.
933            if (scalar(grep { -f $_ } @files) == 0) {
934                $retVal = 0;
935                Trace("Section $section not found for $table in $self->{group}. Regeneration required.") if T(3);
936            }
937        }
938        # Return the result.
939        return $retVal;
940    }
941    
942    =head3 GetRecord
943    
944        my ($key, $record) = ERDBLoadGroup::GetRelRecord($ih, $fieldPos);
945    
946    Read the next record from a tab-delimited file, returning the key field
947    in the specified position and a reference to a list of all the fields. If
948    end-of-file has been reached, the value TRAILER and an empty list
949    reference will be returned.
950    
951    =over 4
952    
953    =item ih
954    
955    Open handle of the input file containing the records.
956    
957    =item fieldPos
958    
959    Ordinal position in the record of the desired key field. This should be
960    C<0> for the first field, C<1> for the second, and so forth.
961    
962    =item RETURN
963    
964    Returns a two-element list, the first of which contains the indicated key
965    field and the second of which is a reference to a list of all fields in the
966    record (including the key). If end-of-file is reached, the returned key will
967    be TRAILER and the returned list will be empty.
968    
969    =back
970    
971    =cut
972    
973    sub GetRecord {
974        # Get the parameters.
975        my ($ih, $fieldPos) = @_;
976        # Declare the return variables.
977        my ($key, $record) = (TRAILER, []);
978        # Only proceed if we're NOT at end of file.
979        if (! eof $ih) {
980            # Read the record.
981            my @fields = Tracer::GetLine($ih);
982            # Extract the key and form the list.
983            $key = $fields[$fieldPos];
984            $record = \@fields;
985        }
986        # Return the results.
987        return ($key, $record);
988    }
989    
990  =head2 Virtual Methods  =head2 Virtual Methods
991    
# Line 633  Line 1003 
1003      Confess("Pure virtual method Generate called.");      Confess("Pure virtual method Generate called.");
1004  }  }
1005    
1006    =head3 PostProcess
1007    
1008        my $stats = $edbl->PostProcess();
1009    
1010    Post-process the load files for this group. This method is called after all
1011    of the load files have been assembled, but before anything is actually loaded.
1012    It allows a final pass through the data to do filtering between groups or to
1013    accumulate totals and counters. The default is to do nothing.
1014    
1015    This method returns a statistics object describing the post-processing activity,
1016    or an undefined value if nothing happened.
1017    
1018    =cut
1019    
1020    sub PostProcess { }
1021    
1022  1;  1;

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.7

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3