[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.4, Fri Jan 28 22:51:42 2011 UTC revision 1.6, Sat Feb 26 19:05:32 2011 UTC
# Line 177  Line 177 
177    
178  L<Stats> object for tracking statistical information about the load.  L<Stats> object for tracking statistical information about the load.
179    
180    =item timestamps
181    
182    A hash of hashes, keyed by feature ID. The sub-hashes are keyed by annotation timestamp,
183    and used to prevent duplicate timestamps.
184    
185  =back  =back
186    
187  =cut  =cut
# Line 189  Line 194 
194      # Add our specialized data.      # Add our specialized data.
195      $retVal->{genome} = $genome;      $retVal->{genome} = $genome;
196      $retVal->{directory} = $directory;      $retVal->{directory} = $directory;
197        $retVal->{timestamps} = {};
198      # Return the result.      # Return the result.
199      return $retVal;      return $retVal;
200  }  }
# Line 372  Line 378 
378      if (-f "$featureDir/peg/fasta") {      if (-f "$featureDir/peg/fasta") {
379          $self->LoadProteinData("$featureDir/peg/fasta");          $self->LoadProteinData("$featureDir/peg/fasta");
380      }      }
381        # Check for annotation history. If we have it, load the history records into the
382        # database.
383        if (-f "$featureDir/annotations") {
384            $self->LoadAnnotations("$featureDir/annotations");
385        }
386  }  }
387    
388  =head3 LoadFeatureData  =head3 LoadFeatureData
# Line 408  Line 419 
419      my $assignHash = $self->ReadAssignments();      my $assignHash = $self->ReadAssignments();
420      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
421      # time, it overwrites the original.      # time, it overwrites the original.
422      my %fids;      my $fidHash = $self->{timestamps};
423        # This is also the timestamp hash: the initial feature population gives each feature an empty sub-hash of timestamps.
424      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
425      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
426      if (-f $fileName) {      if (-f $fileName) {
# Line 427  Line 439 
439              # Only proceed if the feature is NOT deleted.              # Only proceed if the feature is NOT deleted.
440              if (! exists $deletedFids{$fid}) {              if (! exists $deletedFids{$fid}) {
441                  # If the feature already exists, delete it. (This should be extremely rare.)                  # If the feature already exists, delete it. (This should be extremely rare.)
442                  if (exists $fids{$fid}) {                  if (exists $fidHash->{$fid}) {
443                      $sap->Delete(Feature => $fid);                      $sap->Delete(Feature => $fid);
444                      $stats->Add(duplicateFid => 1);                      $stats->Add(duplicateFid => 1);
445                  }                  }
# Line 461  Line 473 
473                  $sap->InsertObject('Feature', id => $fid, feature_type => $type,                  $sap->InsertObject('Feature', id => $fid, feature_type => $type,
474                                     function => $assignHash->{$fid}, locked => 0,                                     function => $assignHash->{$fid}, locked => 0,
475                                     sequence_length => $length);                                     sequence_length => $length);
476                  $fids{$fid} = 1;                  $fidHash->{$fid} = {};
477                  $stats->Add($type => 1);                  $stats->Add($type => 1);
478                  # The next step is to connect the feature record to its locations. This                  # The next step is to connect the feature record to its locations. This
479                  # involves dividing the location into segments. The following variable                  # involves dividing the location into segments. The following variable
# Line 618  Line 630 
630      }      }
631  }  }
632    
633    
634    =head3 LoadAnnotations
635    
636        $loaderObject->LoadAnnotations($fileName);
637    
638    Read in the annotation history information and use it to create annotation records.
639    
640    =over 4
641    
642    =item fileName
643    
644    Name of the annotation history file. This file is formatted with four fields per
645    record. Each field starts on a new line, and a line containing only a double slash
646    (C<//>) terminates the record. The fields, in order, are (0) the feature ID, (1) the
647    timestamp (formatted as an integer), (2) the user name, and (3) the annotation text, which may span multiple lines.
648    
649    =back
650    
651    =cut
652    
sub LoadAnnotations {
    # Get the parameters.
    my ($self, $fileName) = @_;
    # Get the timestamp hash. It is keyed by feature ID; each sub-hash records the
    # timestamp keys already used for that feature's annotations.
    my $timeHash = $self->{timestamps};
    # Get the Sapling database.
    my $sap = $self->{sap};
    # Get the statistics object.
    my $stats = $self->{stats};
    # Open the input file.
    my $ih = Tracer::Open(undef, "<$fileName");
    # Loop through the input, one annotation record per iteration.
    while (! eof $ih) {
        # Read in the feature ID, timestamp, user ID, and annotation text.
        my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
        if (! defined $fid || ! exists $timeHash->{$fid}) {
            # Only proceed if the feature exists. A record with no feature ID
            # (truncated input) is counted as skipped as well, without using an
            # undefined value as a hash key.
            $stats->Add(skippedAnnotation => 1);
        } elsif (! defined $timestamp || $timestamp !~ /^\d+$/) {
            # Here we have an invalid (or missing) time stamp. A missing one is
            # shown as an empty string so the message text is unchanged.
            my $shownStamp = (defined $timestamp ? $timestamp : "");
            Trace("Invalid time stamp \"$shownStamp\" in annotations for $fid.") if T(1);
        } else {
            # Change assignments by the master user to FIG assignments.
            $text =~ s/Set master function/Set FIG function/s;
            # The time stamp is a number. We need to insure the one we use to form
            # the key is unique for this feature, so we bump it past any value
            # that has already been used.
            my $keyStamp = $timestamp;
            while ($timeHash->{$fid}{$keyStamp}) {
                $keyStamp++;
                $stats->Add(skippedStamp => 1);
            }
            # Form the annotation ID from the feature ID and the complement of the
            # key stamp. NOTE(review): Tracer::Pad presumably zero-pads the number
            # on the left to 10 characters -- confirm against Tracer.
            my $annotationID = "$fid:" . Tracer::Pad(9999999999 - $keyStamp, 10,
                                                     1, "0");
            $timeHash->{$fid}{$keyStamp} = 1;
            # Generate the annotation. Note the stored annotation time is the
            # original time stamp, not the adjusted key stamp.
            $sap->InsertObject('IsAnnotatedBy', from_link => $fid, to_link => $annotationID);
            $sap->InsertObject('Annotation', id => $annotationID, annotation_time => $timestamp,
                        comment => $text, annotator => $user);
        }
    }
    # We are done with the input file.
    close $ih;
    return;
}
698    
699    
700  =head3 WriteProtein  =head3 WriteProtein
701    
702      $loaderObject->WriteProtein($fid, $sequence);      $loaderObject->WriteProtein($fid, $sequence);
# Line 952  Line 1031 
1031      $self->{stats}->Add(segment => 1);      $self->{stats}->Add(segment => 1);
1032  }  }
1033    
 =head2 Internal Utility Methods  
   
1034  =head3 CreateIdentifier  =head3 CreateIdentifier
1035    
1036      $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);      $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);
# Line 996  Line 1073 
1073      # Insure the identifier exists in the database.      # Insure the identifier exists in the database.
1074      $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);      $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);
1075      # Connect the identifier to the feature.      # Connect the identifier to the feature.
1076      $sap->InsertObject('Identifies', from_link => $alias, to_link => $fid, conf => $conf);      $sap->InsertObject('IsIdentifiedBy', to_link => $alias, from_link => $fid, conf => $conf);
1077    }
1078    
1079    =head2 Internal Utility Methods
1080    
1081    =head3 ReadAnnotation
1082    
1083        my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1084    
1085    Read the next record from an annotation file. The next record must exist (that is, an
1086    end-of-file check should have been performed before calling this method).
1087    
1088    =over 4
1089    
1090    =item ih
1091    
1092    Open file handle for the annotation file.
1093    
1094    =item RETURN
1095    
1096    Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1097    timestamp, (2) the user ID, and (3) the annotation text.
1098    
1099    =back
1100    
1101    =cut
1102    
sub ReadAnnotation {
    # Get the parameter.
    my ($ih) = @_;
    # Read the three fixed fields, one per line. If the file ends early, the
    # missing fields are returned undefined rather than being chomped.
    my @fixed;
    for my $i (1 .. 3) {
        my $field = <$ih>;
        chomp $field if defined $field;
        push @fixed, $field;
    }
    my ($fid, $timestamp, $user) = @fixed;
    # Loop through the lines of the text field. The record ends at a line
    # containing only a double slash. We also stop at end-of-file, and we accept
    # a final double slash with no trailing new-line; otherwise a truncated file
    # would keep us looping forever on undefined reads.
    my $text = "";
    while (defined(my $line = <$ih>)) {
        last if $line eq "//\n" || $line eq "//";
        $text .= $line;
    }
    # Remove the trailing new-line from the text.
    chomp $text;
    # Return the fields.
    return ($fid, $timestamp, $user, $text);
}
1122    
1123  1;  1;

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.6

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3