[Bio] / Sprout / CustomAttributes.pm Repository:
ViewVC logotype

Diff of /Sprout/CustomAttributes.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.31, Thu Dec 6 14:58:03 2007 UTC revision 1.32, Fri Jan 25 19:00:58 2008 UTC
# Line 385  Line 385 
385    
386  =head3 LoadAttributesFrom  =head3 LoadAttributesFrom
387    
388      my $stats = $attrDB->LoadAttributesFrom($fileName, %options);  C<< my $stats = $attrDB->LoadAttributesFrom($fileName, %options); >>
389  s  
390  Load attributes from the specified tab-delimited file. Each line of the file must  Load attributes from the specified tab-delimited file. Each line of the file must
391  contain an object ID in the first column, an attribute key name in the second  contain an object ID in the first column, an attribute key name in the second
392  column, and attribute values in the remaining columns. The attribute values will  column, and attribute values in the remaining columns. The attribute values will
# Line 423  Line 423 
423    
424  =item archive  =item archive
425    
426  If specified, the name of a file into which the incoming data file should be saved.  If specified, the name of a file into which the incoming data should be saved.
427    If I<resume> is also specified, only the lines actually loaded will be put
428    into this file.
429    
430  =item objectType  =item objectType
431    
# Line 432  Line 434 
434  =item resume  =item resume
435    
436  If specified, key-value pairs already in the database will not be reinserted.  If specified, key-value pairs already in the database will not be reinserted.
437    Specify a number to start checking after the specified number of lines and
438    then admit everything after the first line not yet loaded. Specify C<careful>
439    to check every single line. Specify C<none> to ignore this option. The default
440    is C<none>. So, if you believe that a previous load failed somewhere after 50000
441    lines, a resume value of C<50000> would skip 50000 lines in the file, then
442    check each line after that until it finds one not already in the database. The
443    first such line found and all lines after that will be loaded. On the other
444    hand, if you have a file of 100000 records, and some have been loaded and some
445    not, you would use the word C<careful>, so that every line would be checked before
446    it is inserted. A resume of C<0> will start checking the first line of the
447    input file and then begin loading once it finds a line not in the database.
448    
449    =item chunkSize
450    
451    Number of lines to load in each burst. The default is 10,000.
452    
453  =back  =back
454    
# Line 441  Line 458 
458      # Get the parameters.      # Get the parameters.
459      my ($self, $fileName, %options) = @_;      my ($self, $fileName, %options) = @_;
460      # Declare the return variable.      # Declare the return variable.
461      my $retVal = Stats->new('keys', 'values');      my $retVal = Stats->new('keys', 'values', 'linesOut');
462      # Initialize the timers.      # Initialize the timers.
463      my ($insertTime, $eraseTime, $archiveTime, $checkTime) = (0, 0, 0, 0);      my ($insertTime, $eraseTime, $archiveTime, $checkTime) = (0, 0, 0, 0);
464      # Check for append mode.      # Check for append mode.
465      my $append = ($options{append} ? 1 : 0);      my $append = ($options{append} ? 1 : 0);
466      # Check for resume mode.      # Check for resume mode.
467      my $resume = ($options{resume} ? 1 : 0);      my $resume = (defined($options{resume}) ? $options{resume} : 'none');
468      # Create a hash of key names found.      # Create a hash of key names found.
469      my %keyHash = ();      my %keyHash = ();
470        # Compute the chunk size.
471        my $chunkSize = ($options{chunkSize} ? $options{chunkSize} : 10000);
472      # Open the file for input. Note we must anticipate the possibility of an      # Open the file for input. Note we must anticipate the possibility of an
473      # open filehandle being passed in.      # open filehandle being passed in.
474      my $fh;      my $fh;
# Line 460  Line 479 
479          Trace("Attributes will be loaded from $fileName.") if T(3);          Trace("Attributes will be loaded from $fileName.") if T(3);
480          $fh = Open(undef, "<$fileName");          $fh = Open(undef, "<$fileName");
481      }      }
482        # Trace the mode.
483        Trace("Mode is $options{mode}.") if $options{mode} && T(3);
484        Trace("No mode specified.") if T(3) && ! $options{mode};
485      # Now check to see if we need to archive.      # Now check to see if we need to archive.
486      my $ah;      my $ah;
487      if ($options{archive}) {      if (exists $options{archive}) {
488          $ah = Open(undef, ">$options{archive}");          my $ah = Open(undef, ">$options{archive}");
489          Trace("Load file will be archived to $options{archive}.") if T(3);          Trace("Load file will be archived to $options{archive}.") if T(3);
490      }      }
491        # This next file is used to cache the attribute data before loading it.
492        # To avoid problems, we use a series of small files instead of one
493        # big one.
494        my $tempFileName = "$FIG_Config::temp/attributeLoadFile$$.tbl";
495      # Ensure we recover from errors.      # Ensure we recover from errors.
496      eval {      eval {
497            # Open the temporary file and start a counter.
498            my $th = Tracer::Open(undef, ">$tempFileName");
499            my $chunkLinesLeft = $chunkSize;
500            # If we have a resume number, process it here.
501            if ($resume =~ /\d+/) {
502                Trace("Skipping $resume lines.") if T(2);
503                my $startTime = time();
504                # Skip the specified number of lines.
505                for (my $skipped = 0; ! eof($fh) && $skipped < $resume; $skipped++) {
506                    my $line = <$fh>;
507                    $retVal->Add(skipped => 1);
508                }
509                $checkTime += time() - $startTime;
510            }
511          # Loop through the file.          # Loop through the file.
512            Trace("Starting load.") if T(2);
513          while (! eof $fh) {          while (! eof $fh) {
514              # Read the current line.              # Read the current line.
515              my ($id, $key, @values) = Tracer::GetLine($fh);              my ($id, $key, @values) = Tracer::GetLine($fh);
516              $retVal->Add(linesIn => 1);              $retVal->Add(linesIn => 1);
             # Check to see if we need to fix up the object ID.  
             if ($options{objectType}) {  
                 $id = "$options{objectType}:$id";  
             }  
             # Archive the line (if necessary).  
             if (defined $ah) {  
                 my $startTime = time();  
                 Tracer::PutLine($ah, [$id, $key, @values]);  
                 $archiveTime += time() - $startTime;  
             }  
517              # Do some validation.              # Do some validation.
518              if (! $id) {              if (! $id) {
519                  # We ignore blank lines.                  # We ignore blank lines.
# Line 500  Line 531 
531                  Trace("Line $lines for key $key has no attribute values.") if T(1);                  Trace("Line $lines for key $key has no attribute values.") if T(1);
532                  $retVal->Add(skipped => 1);                  $retVal->Add(skipped => 1);
533              } else {              } else {
534                    # Check to see if we need to fix up the object ID.
535                    if ($options{objectType}) {
536                        $id = "$options{objectType}:$id";
537                    }
538                  # The key contains a real part and an optional sub-part. We need the real part.                  # The key contains a real part and an optional sub-part. We need the real part.
539                  my ($realKey, $subKey) = $self->SplitKey($key);                  my ($realKey, $subKey) = $self->SplitKey($key);
540                  # Now we need to check for a new key.                  # Now we need to check for a new key.
541                  if (! exists $keyHash{$realKey}) {                  if (! exists $keyHash{$realKey}) {
542                      if (! $self->Exists('AttributeKey', $realKey)) {                      my $keyObject = $self->GetEntity(AttributeKey => $realKey);
543                        if (! defined($keyObject)) {
544                            # Here the specified key does not exist, which is an error.
545                          my $line = $retVal->Ask('linesIn');                          my $line = $retVal->Ask('linesIn');
546                          Confess("Attribute \"$realKey\" on line $line of $fileName not found in database.");                          Confess("Attribute \"$realKey\" on line $line of $fileName not found in database.");
547                      } else {                      } else {
548                          # Make sure we know this is no longer a new key.                          # Make sure we know this is no longer a new key.
549                          $keyHash{$realKey} = 1;                          $keyHash{$realKey} = 1;
550                          $retVal->Add(keys => 1);                          $retVal->Add(keys => 1);
551                          # If this is NOT append mode, erase the key.                          # If this is NOT append mode, erase the key. This does not delete the key
552                            # itself; it just clears out all the values.
553                          if (! $append) {                          if (! $append) {
554                              my $startTime = time();                              my $startTime = time();
555                              $self->EraseAttribute($realKey);                              $self->EraseAttribute($realKey);
# Line 523  Line 561 
561                  }                  }
562                  # If we're in resume mode, check to see if this insert is redundant.                  # If we're in resume mode, check to see if this insert is redundant.
563                  my $ok = 1;                  my $ok = 1;
564                  if ($resume) {                  if ($resume ne 'none') {
565                      my $startTime = time();                      my $startTime = time();
566                      my $count = $self->GetAttributes($id, $key, @values);                      my $count = $self->GetAttributes($id, $key, @values);
567                      $ok = ! $count;                      if ($count) {
568                            # Here the record is found, so we skip it.
569                            $ok = 0;
570                            $retVal->Add(skipped => 1);
571                        } else {
572                            # Here the record is not found. If we're in non-careful mode, we
573                            # stop resume checking at this point.
574                            if ($resume ne 'careful') {
575                                $resume = 'none';
576                            }
577                        }
578                      $checkTime += time() - $startTime;                      $checkTime += time() - $startTime;
579                  }                  }
580                  if ($ok) {                  if ($ok) {
581                      # Everything is all set up, so add the value.                      # We're in business. First, archive this row.
582                        if (defined $ah) {
583                            my $startTime = time();
584                            Tracer::PutLine($ah, [$id, $key, @values]);
585                            $archiveTime += time() - $startTime;
586                        }
587                        # We need to format the attribute data so it will work
588                        # as if it were a load file. This means we join the
589                        # values.
590                        my $valueString = join('::', @values);
591                        # Everything is all set up, so put the value in the temporary file and
592                        # count it.
593                      my $startTime = time();                      my $startTime = time();
594                      $self->AddAttribute($id, $key, @values);                      Tracer::PutLine($th, [$realKey, $id, $subKey, $valueString]);
595                      $insertTime += time() - $startTime;                      $archiveTime += time() - $startTime;
596                      # Turn off resume mode.                      $retVal->Add(linesOut => 1);
597                      $resume = 0;                      # Check to see if it's time to output a chunk.
598                        $chunkLinesLeft--;
599                        if ($chunkLinesLeft <= 0) {
600                            close $th;
601                            # Now we load the table from the file. Note that we don't do an analyze.
602                            # The analyze is done only after loading the residual.
603                            my $startTime = time();
604                            Trace("Loading attributes from $tempFileName: " . (-s $tempFileName) .
605                                  " characters.") if T(3);
606                            my $loadStats = $self->LoadTable($tempFileName, 'HasValueFor',
607                                                             mode => $options{mode}, partial => 1);
608                            $retVal->Add(insertTime => time() - $startTime);
609                            # Re-open the file and restart the counter.
610                            $th = Tracer::Open(undef, ">$tempFileName");
611                            $chunkLinesLeft = $chunkSize;
612                            $retVal->Add(chunks => 1);
613                        }
614                  } else {                  } else {
615                      # Here we skipped because of resume mode.                      # Here we skipped because of resume mode.
616                      $retVal->Add(resumeSkip => 1);                      $retVal->Add(resumeSkip => 1);
617                  }                  }
   
618                  my $progress = $retVal->Add(values => 1);                  my $progress = $retVal->Add(values => 1);
619                  Trace("$progress values loaded.") if T(3) && ($progress % 1000 == 0);                  Trace("$progress values processed.") if T(3) && ($progress % 1000 == 0);
620              }              }
621          }          }
622          $retVal->Add(eraseTime   => $eraseTime);          # Now we close the archive file. Note we undefine the handle so the error methods know
623          $retVal->Add(insertTime  => $insertTime);          # not to worry.
624          $retVal->Add(archiveTime => $archiveTime);          if (defined $ah) {
625          $retVal->Add(checkTime   => $checkTime);              close $ah;
626                undef $ah;
627            }
628            # Now we load the residual from the temporary file (if any). This time we'll do an
629            # analyze as well.
630            close $th;
631            my $startTime = time();
632            Trace("Loading residual attributes from $tempFileName: " . (-s $tempFileName) .
633                  " characters.") if T(3);
634            my $loadStats = $self->LoadTable($tempFileName, 'HasValueFor', mode => $options{mode}, partial => 1);
635            $retVal->Add(insertTime => time() - $startTime);
636            $retVal->Add(chunks => 1);
637            Trace("Attribute load successful.") if T(2);
638      };      };
639      # Check for an error.      # Check for an error.
640      if ($@) {      if ($@) {
# Line 556  Line 642 
642          my $message = $@;          my $message = $@;
643          Trace("Error during attribute load: $message") if T(0);          Trace("Error during attribute load: $message") if T(0);
644          $retVal->AddMessage($message);          $retVal->AddMessage($message);
645      }          # Close the archive file if it's open. The archive file can sometimes provide
646      # Close the archive file, if any.          # clues as to what happened.
647      if (defined $ah) {      if (defined $ah) {
         Trace("Closing archive file $options{archive}.") if T(2);  
648          close $ah;          close $ah;
649      }      }
650        }
651        # Store the timers.
652        $retVal->Add(eraseTime   => $eraseTime);
653        $retVal->Add(insertTime  => $insertTime);
654        $retVal->Add(archiveTime => $archiveTime);
655        $retVal->Add(checkTime   => $checkTime);
656      # Return the result.      # Return the result.
657      return $retVal;      return $retVal;
658  }  }

Legend:
Removed from v.1.31  
changed lines
  Added in v.1.32

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3