[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.9, Thu Apr 21 19:58:18 2011 UTC revision 1.13, Fri Jul 22 19:20:11 2011 UTC
# Line 57  Line 57 
57    
58  Name of the directory containing the genome information.  Name of the directory containing the genome information.
59    
60    =item assignHash
61    
62    Hash of feature IDs to functional assignments. Deleted features are removed, which
63    means only features listed in this hash can be legally inserted into the database.
64    
65  =back  =back
66    
67  =cut  =cut
# Line 67  Line 72 
72      # Create the loader object.      # Create the loader object.
73      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
74      # Load the contigs.      # Load the contigs.
75      Trace("Loading contigs for $genome.") if T(2);      Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
76      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
77      # Load the features.      # Load the features.
78      Trace("Loading features for $genome.") if T(2);      Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
79      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
80      # Check for annotation history. If we have it, load the history records into the      # Check for annotation history. If we have it, load the history records into the
81      # database.      # database.
82      if (-f "$directory/annotations") {      if (-f "$directory/annotations") {
83          Trace("Processing annotations.") if T(3);          Trace("Processing annotations.") if T(SaplingDataLoader => 3);
84          $loaderObject->LoadAnnotations("$directory/annotations");          $loaderObject->LoadAnnotations("$directory/annotations");
85      }      }
86      # Load the subsystem bindings.      # Load the subsystem bindings.
87      Trace("Loading subsystems for $genome.") if T(2);      Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
88      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
89      # Create the Genome record and taxonomy information.      # Create the Genome record and taxonomy information.
90      Trace("Creating root for $genome.") if T(2);      Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
91      $loaderObject->CreateGenome();      $loaderObject->CreateGenome();
92      # Return the statistics.      # Return the statistics.
93      return $loaderObject->{stats};      return $loaderObject->{stats};
# Line 130  Line 135 
135      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
136      # Delete the molecular machines.      # Delete the molecular machines.
137      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
     # Delete the annotations.  
     SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsAnnotatedBy', 'Annotation');  
138      # Delete the genome itself.      # Delete the genome itself.
139      my $subStats = $sap->Delete(Genome => $genome);      my $subStats = $sap->Delete(Genome => $genome);
140      # Accumulate the statistics from the delete.      # Accumulate the statistics from the delete.
# Line 233  Line 236 
236    
237  L<Stats> object for tracking statistical information about the load.  L<Stats> object for tracking statistical information about the load.
238    
 =item timestamps  
   
 A hash of hashes, keyed by feature ID. The sub-hashes are keyed by annotation timestamp,  
 and used to prevent duplicate timestamps.  
   
239  =back  =back
240    
241  =cut  =cut
# Line 250  Line 248 
248      # Add our specialized data.      # Add our specialized data.
249      $retVal->{genome} = $genome;      $retVal->{genome} = $genome;
250      $retVal->{directory} = $directory;      $retVal->{directory} = $directory;
251      $retVal->{timestamps} = {};      # Leave the assignment hash undefined until we populate it.
252        $retVal->{assignHash} = undef;
253      # Return the result.      # Return the result.
254      return $retVal;      return $retVal;
255  }  }
# Line 420  Line 419 
419      # Get the parameters.      # Get the parameters.
420      my ($self) = @_;      my ($self) = @_;
421      # Read in the functional assignments.      # Read in the functional assignments.
422      Trace("Reading functional assignments.") if T(3);      Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
423      my $assignHash = $self->ReadAssignments();      my $assignHash = $self->ReadAssignments();
424        $self->{assignHash} = $assignHash;
425      # Get the directory of feature types.      # Get the directory of feature types.
426      my $featureDir = "$self->{directory}/Features";      my $featureDir = "$self->{directory}/Features";
427      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
428        # Check for protein sequences. If we have some, load them into a hash.
429        my $protHash = {};
430        if (-f "$featureDir/peg/fasta") {
431            Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
432            $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
433        }
434      # Create the feature records for the types found.      # Create the feature records for the types found.
435      for my $type (@types) {      for my $type (@types) {
436          # Insure this is a genuine feature directory.          # Insure this is a genuine feature directory.
437          if (-f "$featureDir/$type/tbl") {          if (-f "$featureDir/$type/tbl") {
438              # Yes, load the feature data.              # Yes. Read in the evidence codes (if any).
439              $self->LoadFeatureData($featureDir, $type, $assignHash);              my $evHash = {};
440                my $tranFile = "$featureDir/$type/Attributes/transaction_log";
441                if (-f $tranFile) {
442                    $evHash = $self->LoadEvidenceCodes($tranFile);
443          }          }
444                # Now load the feature data.
445                $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
446      }      }
     # Check for protein sequences. If we have some, load them into the database.  
     if (-f "$featureDir/peg/fasta") {  
         Trace("Processing protein sequences.") if T(3);  
         $self->LoadProteinData("$featureDir/peg/fasta");  
447      }      }
448      # Now loop through the features, connecting them to their roles. Note that deleted  }
449      # features will not be in the assignment hash.  
450      Trace("Connecting features to roles.") if T(3);  =head3 LoadEvidenceCodes
451      for my $fid (keys %$assignHash) {  
452          $self->ConnectFunctionRoles($fid, $assignHash->{$fid});      my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
453    
454    Load the evidence codes from the specified attribute transaction log file into a
455    hash. The log file is in tab-delimited format. The first column contains the
456    transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
457    the third an attribute name (we'll ignore everything but C<evidence_code>), and
458    the fourth the attribute value.
459    
460    =over 4
461    
462    =item attributeFile
463    
464    Name of the attribute transaction log file.
465    
466    =item RETURN
467    
468    Returns a reference to a hash mapping each feature ID to a comma-delimited list
469    of its evidence codes.
470    
471    =back
472    
473    =cut
474    
475    sub LoadEvidenceCodes {
476        # Get the parameters.
477        my ($self, $attributeFile) = @_;
478        # Get the Sapling database.
479        my $sap = $self->{sap};
480        # Get the statistics object.
481        my $stats = $self->{stats};
482        # Get the assignment hash: we use this to filter the feature IDs.
483        my $assignHash = $self->{assignHash};
484        # Open the attribute log file for input.
485        my $ih = Open(undef, "<$attributeFile");
486        # This two-dimensional hash will hold the evidence codes for each feature.
487        my %retVal;
488        # Loop through the attribute log file.
489        while (! eof $ih) {
490            # Get the current attribute record.
491            my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
492            $stats->Add(attributeLine => 1);
493            # Insure we have all the pieces we need.
494            if (! $command || ! $fid || $key ne 'evidence_code') {
495                $stats->Add(attributeLineSkipped => 1);
496            } elsif (! $assignHash->{$fid}) {
497                # Here the attribute is for a deleted feature.
498                $stats->Add(attributeFidSkipped => 1);
499            } else {
500                # Get the sub-hash for this feature.
501                if (! exists $retVal{$fid}) {
502                    $retVal{$fid} = {};
503                }
504                my $featureSubHash = $retVal{$fid};
505                # Process according to the command.
506                if ($command eq 'ADD') {
507                    # Here we are adding a new evidence code.
508                    $featureSubHash->{$value} = 1;
509                    $stats->Add(attributeAdd => 1);
510                } elsif ($command eq 'DELETE') {
511                    # Here we are deleting an evidence code.
512                    delete $featureSubHash->{$value};
513                    $stats->Add(attributeDelete => 1);
514                } else {
515                    # here we have an unrecognized command.
516                    $stats->Add(attributeCommandSkip => 1);
517                }
518      }      }
519  }  }
520        # Loop through the hash, converting each sub-hash to a comma-delimited list of
521        # evidence codes.
522        for my $fid (keys %retVal) {
523            $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
524        }
525        # Return the evidence hash.
526        return \%retVal;
527    }
528    
529    
530  =head3 LoadFeatureData  =head3 LoadFeatureData
531    
532      $loaderObject->LoadFeatureData($featureDir, $type, $assignHash);      $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
533    
534  Load the basic data for each feature into the database. The number of features of  Load the basic data for each feature into the database. The number of features of
535  the type found will be recorded in the statistics object.  the type found will be recorded in the statistics object.
# Line 463  Line 544 
544    
545  Type of feature to load.  Type of feature to load.
546    
547  =item assignHash  =item protHash
548    
549    Reference to a hash mapping each feature ID for a protein-encoding gene to
550    its protein sequence.
551    
552  Reference to a hash mapping each feature ID to its functional assignment.  =item evHash
553    
554    Reference to a hash mapping each feature ID to a comma-delimited list of
555    its evidence codes (if any).
556    
557  =back  =back
558    
# Line 473  Line 560 
560    
561  sub LoadFeatureData {  sub LoadFeatureData {
562      # Get the parameters.      # Get the parameters.
563      my ($self, $featureDir, $type, $assignHash) = @_;      my ($self, $featureDir, $type, $protHash, $evHash) = @_;
564      # Get the sapling database.      # Get the sapling database.
565      my $sap = $self->{sap};      my $sap = $self->{sap};
     # Get the maximum location  segment length. We'll need this later.  
     my $maxLength = $sap->TuningParameter('maxLocationLength');  
566      # Get the statistics object.      # Get the statistics object.
567      my $stats = $self->{stats};      my $stats = $self->{stats};
568        # Get the assignment hash. This tells us our functional assignments. This method is
569        # also where we remove the deleted features from it.
570        my $assignHash = $self->{assignHash};
571      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
572      # time, it overwrites the original.      # time, it overwrites the original.
573      my $fidHash = $self->{timestamps};      my %fidHash;
574      # Finally, we need the timestamp hash. The initial feature population      # Finally, we need the timestamp hash. The initial feature population
575      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
576      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
577      if (-f $fileName) {      if (-f $fileName) {
578          # We have one, so we can read through it. First, however, we need to get the list          # We have one, so we can read through it. First, however, we need to get the list
579          # of deleted features.          # of deleted features and remove them from the assignment hash. This insures
580          my %deletedFids;          # that they are not used by subsequent methods.
581          my $deleteFile = "$featureDir/$type/deleted.features";          my $deleteFile = "$featureDir/$type/deleted.features";
582          if (-f $deleteFile) {          if (-f $deleteFile) {
583              %deletedFids = map { $_ => 1 } Tracer::GetFile($deleteFile);              my $dh = Open(undef, "<$deleteFile");
584                while (! eof $dh) {
585                    my ($deletedFid) = Tracer::GetLine($dh);
586                    if (exists $assignHash->{$deletedFid}) {
587                        delete $assignHash->{$deletedFid};
588                        $stats->Add(deletedFid => 1);
589                    }
590                }
591          }          }
592          # Open the main file for input.          # Open the main file for input.
593          Trace("Reading features from $fileName.") if T(3);          Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
594          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
595          while (! eof $ih) {          while (! eof $ih) {
596              # Read this feature's information.              # Read this feature's information.
597              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
598              # Only proceed if the feature is NOT deleted.              # Only proceed if the feature is NOT deleted.
599              if (! exists $deletedFids{$fid}) {              if (exists $assignHash->{$fid}) {
600                  # If the feature already exists, delete it. (This should be extremely rare.)                  # If the feature already exists, delete it. (This should be extremely rare.)
601                  if (exists $fidHash->{$fid}) {                  if ($fidHash{$fid}) {
602                      $sap->Delete(Feature => $fid);                      $sap->Delete(Feature => $fid);
603                      $stats->Add(duplicateFid => 1);                      $stats->Add(duplicateFid => 1);
604                  }                  }
605                  # Otherwise connect this feature to the genome.                  # If this is RNA, the alias list is always empty. Sometimes, the functional
606                  $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);                  # assignment is found there.
607                  # Now we must parse the locations. This will contain a list of the location                  if ($type eq 'rna') {
608                  # data 4-tuples (contig, start, dir, len).                      if (! $assignHash->{$fid}) {
609                  my @locData;                          $assignHash->{$fid} = $aliases[0];
610                  # This will contain the total sequence length.                      }
611                  my $length = 0;                      @aliases = ();
612                  # Loop through the locations.                  }
613                  for my $loc (split /\s*,\s*/, $locations) {                  # Add the feature to the database.
614                      # Get this location's components.                  $self->AddFeature($fid, $assignHash->{$fid}, $locations, \@aliases,
615                      my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);                                    $protHash->{$fid}, $evHash->{$fid});
616                      # Normalize the contig.                  # Denote we've added this feature, so that if a duplicate occurs we're ready.
617                      $self->FixContig(\$contig);                  $fidHash{$fid} = 1;
                     # Compute the direction.  
                     if ($start <= $stop) {  
                         # Here we have the forward strand.  
                         my $len = $stop + 1 - $start;  
                         push @locData, [$contig, $start, '+', $len];  
                         $length += $len;  
                     } else {  
                         # Here we have the reverse strand.  
                         my $len = $start + 1 - $stop;  
                         push @locData, [$contig, $stop, '-', $len];  
                         $length += $len;  
                     }  
                 }  
                 # Now we can create the feature record.  
                 $sap->InsertObject('Feature', id => $fid, feature_type => $type,  
                                    function => $assignHash->{$fid}, locked => 0,  
                                    sequence_length => $length);  
                 $fidHash->{$fid} = {};  
                 $stats->Add($type => 1);  
                 # The next step is to connect the feature record to its locations. This  
                 # involves dividing the location into segments. The following variable  
                 # will count the total number of segments.  
                 my $segment = 0;  
                 # Loop through the locations.  
                 for my $loc (@locData) {  
                     # Get the current location's information.  
                     my ($contig, $left, $dir, $len) = @$loc;  
                     # Peel off any segments.  
                     while ($len > $maxLength) {  
                         # Process according to the direction.  
                         if ($dir eq '+') {  
                             # Forward strand: peel from the beginning.  
                             $self->ConnectLocation($fid, $contig, $segment, $left, $dir,  
                                                    $maxLength);  
                             $left += $maxLength;  
                         } else {  
                             # Reverse strand: peel from the end.  
                             $self->ConnectLocation($fid, $contig, $segment,  
                                                   $left + $len - $maxLength, $dir,  
                                                   $maxLength);  
                         }  
                         # Denote we've used up a segment.  
                         $len -= $maxLength;  
                         $segment++;  
                     }  
                     # Output the last segment.  
                     $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);  
                 }  
                 # Now we process the aliases and create the identifiers. We don't do this  
                 # for RNA, because the RNA function is stored in the aliases.  
                 if ($type ne 'rna') {  
                     for my $alias (@aliases) {  
                         my $normalized;  
                         # Determine the type.  
                         my $aliasType = AliasAnalysis::TypeOf($alias);  
                         $stats->Add(aliasAll => 1);  
                         # Is this a recognized type?  
                         if ($aliasType) {  
                             $stats->Add(aliasNormal => 1);  
                             # Yes. Write it normally.  
                             $self->CreateIdentifier($alias, B => $aliasType, $fid);  
                         } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {  
                             # No, but this is a specially-marked locus tag.  
                             $normalized = $1;  
                             $stats->Add(aliasLocus => 1);  
                             $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                         } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {  
                             # No, but this is a natural locus tag.  
                             $stats->Add(aliasLocus => 1);  
                             $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                         } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {  
                             # No, but this is a natural gene name.  
                             $stats->Add(aliasGene => 1);  
                             $self->CreateIdentifier($normalized, B => 'GENE', $fid);  
                         } elsif ($alias =~ /^\d+$/) {  
                             # Here it's a naked number, which means it's a GI number  
                             # of some sort.  
                             $stats->Add(aliasGI => 1);  
                             $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);  
                         } elsif ($alias =~ /^protein_id\|(.+)/) {  
                             # Here we have a REFSEQ protein ID. Right now we don't have a way to  
                             # handle that, because we don't know the feature's protein ID here.  
                             $stats->Add(aliasProtein => 1);  
                         } elsif ($alias =~ /[:|]/) {  
                             # Here it's an alias of an unknown type, so we skip it.  
                             $stats->Add(aliasUnknown => 1);  
                         } else {  
                             # Here it's a miscellaneous type.  
                             $stats->Add(aliasMisc => 1);  
                             $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);  
                         }  
                     }  
                 }  
618              }              }
619          }          }
620      }      }
621  }  }
622    
623    
624  =head3 LoadProteinData  =head3 LoadProteinData
625    
626      $self->LoadProteinData($fileName);      my $protHash = $self->LoadProteinData($fileName);
627    
628  Load the protein sequences from the named FASTA file. The sequences will be stored  Load the protein sequences from the named FASTA file. The sequences will be stored
629  in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature  in a hash by FIG feature ID.
 records themselves will not be created (it is presumed they are already there).  
630    
631  =over 4  =over 4
632    
# Line 632  Line 634 
634    
635  Name of the FASTA file containing the protein sequences for this genome.  Name of the FASTA file containing the protein sequences for this genome.
636    
637    =item RETURN
638    
639    Returns a hash mapping feature IDs to protein sequences.
640    
641  =back  =back
642    
643  =cut  =cut
# Line 641  Line 647 
647      my ($self, $fileName) = @_;      my ($self, $fileName) = @_;
648      # Open the FASTA file for input.      # Open the FASTA file for input.
649      my $ih = Open(undef, "<$fileName");      my $ih = Open(undef, "<$fileName");
650        # Create the return hash.
651        my $retVal = {};
652      # We'll track the current protein in here.      # We'll track the current protein in here.
653      my $fid;      my $fid;
654      my $sequence = "";      my $sequence = "";
# Line 654  Line 662 
662              my $newFid = $1;              my $newFid = $1;
663              # Do we have an existing protein?              # Do we have an existing protein?
664              if (defined $fid) {              if (defined $fid) {
665                  # Yes. Write it out.                  # Yes. Store it in the hash.
666                  $self->WriteProtein($fid, $sequence);                  $retVal->{$fid} = $sequence;
667              }              }
668              # Initialize for the next protein.              # Initialize for the next protein.
669              $fid = $newFid;              $fid = $newFid;
# Line 667  Line 675 
675      }      }
676      # Do we have a residual protein.      # Do we have a residual protein.
677      if (defined $fid) {      if (defined $fid) {
678          # Yes. Write it out.          # Yes. Store it in the hash.
679          $self->WriteProtein($fid, $sequence);          $retVal->{$fid} = $sequence;
680      }      }
681        # Return the hash.
682        return $retVal;
683  }  }
684    
685    
# Line 695  Line 705 
705  sub LoadAnnotations {  sub LoadAnnotations {
706      # Get the parameters.      # Get the parameters.
707      my ($self, $fileName) = @_;      my ($self, $fileName) = @_;
708      # Get the timestamp hash.      # Get the assignment Hash. We use this to filter out deleted features.
709      my $timeHash = $self->{timestamps};      my $assignHash = $self->{assignHash};
710      # Get the Sapling database.      # Get the Sapling database.
711      my $sap = $self->{sap};      my $sap = $self->{sap};
712      # Get the statistics object.      # Get the statistics object.
# Line 707  Line 717 
717      while (! eof $ih) {      while (! eof $ih) {
718          # Read in the peg, timestamp, and user ID.          # Read in the peg, timestamp, and user ID.
719          my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);          my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
720          # Only proceed if the feature exists.          # Only proceed if the feature is not deleted.
721          if (! exists $timeHash->{$fid}) {          if ($assignHash->{$fid}) {
722              $stats->Add(skippedAnnotation => 1);              # Add the annotation to this feature.
723          } else {              $self->MakeAnnotation($fid, $text, $user, $timestamp);
             # Change assignments by the master user to FIG assignments.  
             $text =~ s/Set master function/Set FIG function/s;  
             # Insure the time stamp is valid.  
             if ($timestamp =~ /^\d+$/) {  
                 # Here it's a number. We need to insure the one we use to form  
                 # the key is unique.  
                 my $keyStamp = $timestamp;  
                 while ($timeHash->{$fid}{$keyStamp}) {  
                     $keyStamp++;  
                     $stats->Add(skippedStamp => 1);  
                 }  
                 # Form the annotation ID.  
                 my $annotationID = SaplingDataLoader::ComputeAnnotationID($fid, $keyStamp);  
                 $timeHash->{$fid}{$keyStamp} = 1;  
                 # Generate the annotation.  
                 $sap->InsertObject('IsAnnotatedBy', from_link => $fid, to_link => $annotationID);  
                 $sap->InsertObject('Annotation', id => $annotationID, annotation_time => $timestamp,  
                             comment => $text, annotator => $user);  
             } else {  
                 # Here we have an invalid time stamp.  
                 Trace("Invalid time stamp \"$timestamp\" in annotations for $fid.") if T(1);  
             }  
724          }          }
725      }      }
726  }  }
# Line 809  Line 797 
797      while (! eof $ih) {      while (! eof $ih) {
798          # Get this subsystem.          # Get this subsystem.
799          my ($subsystem, $variant) = Tracer::GetLine($ih);          my ($subsystem, $variant) = Tracer::GetLine($ih);
800            Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
801          # Normalize the subsystem name.          # Normalize the subsystem name.
802          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
803          # Compute this subsystem's MD5.          # Compute this subsystem's MD5.
# Line 827  Line 816 
816                  my $roleH = $roleList->[0];                  my $roleH = $roleList->[0];
817                  my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);                  my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
818                  $sap->InsertObject('Subsystem', %subFields);                  $sap->InsertObject('Subsystem', %subFields);
819                    $stats->Add(subsystems => 1);
820                  # Now loop through the roles. The Includes records are always inserted, but the                  # Now loop through the roles. The Includes records are always inserted, but the
821                  # roles are only inserted if they don't already exist.                  # roles are only inserted if they don't already exist.
822                  for $roleH (@$roleList) {                  for $roleH (@$roleList) {
# Line 841  Line 831 
831                      # Compute the machine-role ID.                      # Compute the machine-role ID.
832                      my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});                      my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
833                      $machineRoles{$subsystem}{$roleID} = $machineRoleID;                      $machineRoles{$subsystem}{$roleID} = $machineRoleID;
834                        $stats->Add(subsystemRoles => 1);
835                  }                  }
836              }              }
837          } else {          } else {
# Line 862  Line 853 
853          $variantCode =~ s/\*+$//;          $variantCode =~ s/\*+$//;
854          # Compute the variant key.          # Compute the variant key.
855          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
856            # Insure we have it in the database.
857            if (! $sap->Exists(Variant => $variantKey)) {
858          # Get the variant from the sapling server.          # Get the variant from the sapling server.
859          my $variantH = $sapO->get(-objects => "Variant",          my $variantH = $sapO->get(-objects => "Variant",
860                                    -filter => {"Variant(id)" => ["=", $variantKey]},                                    -filter => {"Variant(id)" => ["=", $variantKey]},
# Line 870  Line 863 
863                                                "Variant(type)" => "type",                                                "Variant(type)" => "type",
864                                                "Variant(role-rule)" => "role-rule"},                                                "Variant(role-rule)" => "role-rule"},
865                                    -firstOnly => 1);                                    -firstOnly => 1);
         # Insure we have it in the database.  
866          $self->InsureEntity('Variant', $variantKey, %$variantH);          $self->InsureEntity('Variant', $variantKey, %$variantH);
         $stats->Add(subsystems => 1);  
867          # Connect it to the subsystem.          # Connect it to the subsystem.
868          $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);          $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
869                $stats->Add(subsystemVariants => 1);
870            }
871          # Now we create the molecular machine connecting this genome to the subsystem          # Now we create the molecular machine connecting this genome to the subsystem
872          # variant.          # variant.
873          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
# Line 1072  Line 1065 
1065      $self->{stats}->Add(segment => 1);      $self->{stats}->Add(segment => 1);
1066  }  }
1067    
 =head3 CreateIdentifier  
   
     $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);  
   
 Link an identifier to a feature. The identifier is presented in prefixed form and is of the  
 specified type and the specified confidence level.  
   
 =over 4  
   
 =item alias  
   
 Identifier to connect to the feature.  
   
 =item conf  
   
 Confidence level (C<A> curated, C<B> normal, C<C> protein only).  
   
 =item aliasType  
   
 Type of alias (e.g. C<NCBI>, C<LocusTag>).  
   
 =item fid  
   
 ID of the relevant feature.  
   
 =back  
   
 =cut  
   
 sub CreateIdentifier {  
     # Get the parameters.  
     my ($self, $alias, $conf, $aliasType, $fid) = @_;  
     # Get the Sapling object.  
     my $sap = $self->{sap};  
     # Compute the identifier's natural form.  
     my $natural = $alias;  
     if ($natural =~ /[:|](.+)/) {  
         $natural = $1;  
     }  
     # Insure the identifier exists in the database.  
     $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);  
     # Connect the identifier to the feature.  
     $sap->InsertObject('IsIdentifiedBy', to_link => $alias, from_link => $fid, conf => $conf);  
 }  
   
1068  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1069    
1070  =head3 ReadAnnotation  =head3 ReadAnnotation
# Line 1151  Line 1099 
1099      # Loop through the lines of the text field.      # Loop through the lines of the text field.
1100      my $text = "";      my $text = "";
1101      my $line = <$ih>;      my $line = <$ih>;
1102      while ($line ne "//\n") {      while (defined($line) && $line ne "//\n") {
1103          $text .= $line;          $text .= $line;
1104          $line = <$ih>;          $line = <$ih>;
1105      }      }

Legend:
Removed from v.1.9  
changed lines
  Added in v.1.13

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3