[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.4, Fri Jan 28 22:51:42 2011 UTC revision 1.11, Thu May 12 20:00:43 2011 UTC
# Line 57  Line 57 
57    
58  Name of the directory containing the genome information.  Name of the directory containing the genome information.
59    
60    =item assignHash
61    
62    Hash of feature IDs to functional assignments. Deleted features are removed, which
63    means only features listed in this hash can be legally inserted into the database.
64    
65  =back  =back
66    
67  =cut  =cut
# Line 72  Line 77 
77      # Load the features.      # Load the features.
78      Trace("Loading features for $genome.") if T(2);      Trace("Loading features for $genome.") if T(2);
79      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
80        # Check for annotation history. If we have it, load the history records into the
81        # database.
82        if (-f "$directory/annotations") {
83            Trace("Processing annotations.") if T(3);
84            $loaderObject->LoadAnnotations("$directory/annotations");
85        }
86      # Load the subsystem bindings.      # Load the subsystem bindings.
87      Trace("Loading subsystems for $genome.") if T(2);      Trace("Loading subsystems for $genome.") if T(2);
88      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
# Line 112  Line 123 
123      my ($sap, $genome) = @_;      my ($sap, $genome) = @_;
124      # Create the statistics object.      # Create the statistics object.
125      my $stats = Stats->new();      my $stats = Stats->new();
126      # Delete the DNA.      # Delete the DNA sequences.
127      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');      my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
128        for my $seq (@seqs) {
129            my $delStats = $sap->Delete(DNASequence => $seq);
130            $stats->Accumulate($delStats);
131        }
132      # Delete the contigs.      # Delete the contigs.
133      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
134      # Delete the features.      # Delete the features.
# Line 128  Line 143 
143      return $stats;      return $stats;
144  }  }
145    
146    
147    =head3 Process
148    
149        my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
150    
151    Load genome data from the specified directory. If the genome data already
152    exists in the database, it will be deleted first.
153    
154    =over 4
155    
156    =item sap
157    
158    L</Sapling> object for accessing the database.
159    
160    =item genome
161    
162    ID of the genome whose  data is being loaded.
163    
164    =item directory
165    
166    Name of the directory containing the genome data files.
167    
168    =item RETURN
169    
170    Returns a statistics object describing the activity during the reload.
171    
172    =back
173    
174    =cut
175    
176    sub Process {
177        # Get the parameters.
178        my ($sap, $genome, $directory) = @_;
179        # Clear the existing data for the specified genome.
180        my $stats = ClearGenome($sap, $genome);
181        # Load the new expression data from the specified directory.
182        my $newStats = Load($sap, $genome, $directory);
183        # Merge the statistics.
184        $stats->Accumulate($newStats);
185        # Return the result.
186        return $stats;
187    }
188    
189    
190  =head2 Loader Object Methods  =head2 Loader Object Methods
191    
192  =head3 new  =head3 new
# Line 189  Line 248 
248      # Add our specialized data.      # Add our specialized data.
249      $retVal->{genome} = $genome;      $retVal->{genome} = $genome;
250      $retVal->{directory} = $directory;      $retVal->{directory} = $directory;
251        # Leave the assignment hash undefined until we populate it.
252        $retVal->{assignHash} = undef;
253      # Return the result.      # Return the result.
254      return $retVal;      return $retVal;
255  }  }
# Line 357  Line 418 
418  sub LoadFeatures {  sub LoadFeatures {
419      # Get the parameters.      # Get the parameters.
420      my ($self) = @_;      my ($self) = @_;
421        # Read in the functional assignments.
422        Trace("Reading functional assignments.") if T(3);
423        my $assignHash = $self->ReadAssignments();
424        $self->{assignHash} = $assignHash;
425      # Get the directory of feature types.      # Get the directory of feature types.
426      my $featureDir = "$self->{directory}/Features";      my $featureDir = "$self->{directory}/Features";
427      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
428        # Check for protein sequences. If we have some, load them into a hash.
429        my $protHash = {};
430        if (-f "$featureDir/peg/fasta") {
431            Trace("Processing protein sequences.") if T(3);
432            $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
433        }
434      # Create the feature records for the types found.      # Create the feature records for the types found.
435      for my $type (@types) {      for my $type (@types) {
436          # Insure this is a genuine feature directory.          # Insure this is a genuine feature directory.
437          if (-f "$featureDir/$type/tbl") {          if (-f "$featureDir/$type/tbl") {
438              # Yes, load the feature data.              # Yes. Read in the evidence codes (if any).
439              $self->LoadFeatureData($featureDir, $type);              my $evHash = {};
440                my $tranFile = "$featureDir/$type/Attributes/transaction_log";
441                if (-f $tranFile) {
442                    $evHash = $self->LoadEvidenceCodes($tranFile);
443          }          }
444                # Now load the feature data.
445                $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
446      }      }
     # Check for protein sequences. If we have some, load them into the database.  
     if (-f "$featureDir/peg/fasta") {  
         $self->LoadProteinData("$featureDir/peg/fasta");  
447      }      }
448  }  }
449    
450    =head3 LoadEvidenceCodes
451    
452        my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
453    
454    Load the evidence codes from the specified attribute transaction log file into a
455    hash. The log file is in tab-delimited format. The first column contains the
456    transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
457    the third an attribute name (we'll ignore everything but C<evidence_code>), and
458    the fourth the attribute value.
459    
460    =over 4
461    
462    =item attributeFile
463    
464    Name of the attribute transaction log file.
465    
466    =item RETURN
467    
468    Returns a reference to a hash mapping each feature ID to a comma-delimited list
469    of its evidence codes.
470    
471    =back
472    
473    =cut
474    
475    sub LoadEvidenceCodes {
476        # Get the parameters.
477        my ($self, $attributeFile) = @_;
478        # Get the Sapling database.
479        my $sap = $self->{sap};
480        # Get the statistics object.
481        my $stats = $self->{stats};
482        # Get the assignment hash: we use this to filter the feature IDs.
483        my $assignHash = $self->{assignHash};
484        # Open the attribute log file for input.
485        my $ih = Open(undef, "<$attributeFile");
486        # This two-dimensional hash will hold the evidence codes for each feature.
487        my %retVal;
488        # Loop through the attribute log file.
489        while (! eof $ih) {
490            # Get the current attribute record.
491            my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
492            $stats->Add(attributeLine => 1);
493            # Insure we have all the pieces we need.
494            if (! $command || ! $fid || $key ne 'evidence_code') {
495                $stats->Add(attributeLineSkipped => 1);
496            } elsif (! $assignHash->{$fid}) {
497                # Here the attribute is for a deleted feature.
498                $stats->Add(attributeFidSkipped => 1);
499            } else {
500                # Get the sub-hash for this feature.
501                if (! exists $retVal{$fid}) {
502                    $retVal{$fid} = {};
503                }
504                my $featureSubHash = $retVal{$fid};
505                # Process according to the command.
506                if ($command eq 'ADD') {
507                    # Here we are adding a new evidence code.
508                    $featureSubHash->{$value} = 1;
509                    $stats->Add(attributeAdd => 1);
510                } elsif ($command eq 'DELETE') {
511                    # Here we are deleting an evidence code.
512                    delete $featureSubHash->{$value};
513                    $stats->Add(attributeDelete => 1);
514                } else {
515                    # here we have an unrecognized command.
516                    $stats->Add(attributeCommandSkip => 1);
517                }
518            }
519        }
520        # Loop through the hash, converting each sub-hash to a comma-delimited list of
521        # evidence codes.
522        for my $fid (keys %retVal) {
523            $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
524        }
525        # Return the evidence hash.
526        return \%retVal;
527    }
528    
529    
530  =head3 LoadFeatureData  =head3 LoadFeatureData
531    
532      $loaderObject->LoadFeatureData($featureDir, $type);      $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
533    
534  Load the basic data for each feature into the database. The number of features of  Load the basic data for each feature into the database. The number of features of
535  the type found will be recorded in the statistics object.  the type found will be recorded in the statistics object.
# Line 391  Line 544 
544    
545  Type of feature to load.  Type of feature to load.
546    
547    =item protHash
548    
549    Reference to a hash mapping each feature ID for a protein-encoding gene to
550    its protein sequence.
551    
552    =item evHash
553    
554    Reference to a hash mapping each feature ID to a comma-delimited list of
555    its evidence codes (if any).
556    
557  =back  =back
558    
559  =cut  =cut
560    
561  sub LoadFeatureData {  sub LoadFeatureData {
562      # Get the parameters.      # Get the parameters.
563      my ($self, $featureDir, $type) = @_;      my ($self, $featureDir, $type, $protHash, $evHash) = @_;
564      # Get the sapling database.      # Get the sapling database.
565      my $sap = $self->{sap};      my $sap = $self->{sap};
     # Get the maximum location  segment length. We'll need this later.  
     my $maxLength = $sap->TuningParameter('maxLocationLength');  
566      # Get the statistics object.      # Get the statistics object.
567      my $stats = $self->{stats};      my $stats = $self->{stats};
568      # Read in the functional assignments.      # Get the assignment hash. This tells us our functional assignments. This method is
569      my $assignHash = $self->ReadAssignments();      # also where we remove the deleted features from it.
570        my $assignHash = $self->{assignHash};
571      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
572      # time, it overwrites the original.      # time, it overwrites the original.
573      my %fids;      my %fidHash;
574        # Finally, we need the timestamp hash. The initial feature population
575      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
576      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
577      if (-f $fileName) {      if (-f $fileName) {
578          # We have one, so we can read through it. First, however, we need to get the list          # We have one, so we can read through it. First, however, we need to get the list
579          # of deleted features.          # of deleted features and remove them from the assignment hash. This insures
580          my %deletedFids;          # that they are not used by subsequent methods.
581          my $deleteFile = "$featureDir/$type/deleted.features";          my $deleteFile = "$featureDir/$type/deleted.features";
582          if (-f $deleteFile) {          if (-f $deleteFile) {
583              %deletedFids = map { $_ => 1 } Tracer::GetFile($deleteFile);              my $dh = Open(undef, "<$deleteFile");
584                while (! eof $dh) {
585                    my ($deletedFid) = Tracer::GetLine($dh);
586                    if (exists $assignHash->{$deletedFid}) {
587                        delete $assignHash->{$deletedFid};
588                        $stats->Add(deletedFid => 1);
589                    }
590                }
591          }          }
592          # Open the main file for input.          # Open the main file for input.
593            Trace("Reading features from $fileName.") if T(3);
594          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
595          while (! eof $ih) {          while (! eof $ih) {
596              # Read this feature's information.              # Read this feature's information.
597              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
598              # Only proceed if the feature is NOT deleted.              # Only proceed if the feature is NOT deleted.
599              if (! exists $deletedFids{$fid}) {              if (exists $assignHash->{$fid}) {
600                  # If the feature already exists, delete it. (This should be extremely rare.)                  # If the feature already exists, delete it. (This should be extremely rare.)
601                  if (exists $fids{$fid}) {                  if ($fidHash{$fid}) {
602                      $sap->Delete(Feature => $fid);                      $sap->Delete(Feature => $fid);
603                      $stats->Add(duplicateFid => 1);                      $stats->Add(duplicateFid => 1);
604                  }                  }
605                  # Otherwise connect this feature to the genome.                  # If this is RNA, the alias list is always empty. Sometimes, the functional
606                  $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);                  # assignment is found there.
607                  # Now we must parse the locations. This will contain a list of the location                  if ($type eq 'rna') {
608                  # data 4-tuples (contig, start, dir, len).                      if (! $assignHash->{$fid}) {
609                  my @locData;                          $assignHash->{$fid} = $aliases[0];
610                  # This will contain the total sequence length.                      }
611                  my $length = 0;                      @aliases = ();
612                  # Loop through the locations.                  }
613                  for my $loc (split /\s*,\s*/, $locations) {                  # Add the feature to the database.
614                      # Get this location's components.                  $self->AddFeature($fid, $assignHash->{$fid}, $locations, \@aliases,
615                      my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);                                    $protHash->{$fid}, $evHash->{$fid});
616                      # Normalize the contig.                  # Denote we've added this feature, so that if a duplicate occurs we're ready.
617                      $self->FixContig(\$contig);                  $fidHash{$fid} = 1;
                     # Compute the direction.  
                     if ($start <= $stop) {  
                         # Here we have the forward strand.  
                         my $len = $stop + 1 - $start;  
                         push @locData, [$contig, $start, '+', $len];  
                         $length += $len;  
                     } else {  
                         # Here we have the reverse strand.  
                         my $len = $start + 1 - $stop;  
                         push @locData, [$contig, $stop, '-', $len];  
                         $length += $len;  
                     }  
                 }  
                 # Now we can create the feature record.  
                 $sap->InsertObject('Feature', id => $fid, feature_type => $type,  
                                    function => $assignHash->{$fid}, locked => 0,  
                                    sequence_length => $length);  
                 $fids{$fid} = 1;  
                 $stats->Add($type => 1);  
                 # The next step is to connect the feature record to its locations. This  
                 # involves dividing the location into segments. The following variable  
                 # will count the total number of segments.  
                 my $segment = 0;  
                 # Loop through the locations.  
                 for my $loc (@locData) {  
                     # Get the current location's information.  
                     my ($contig, $left, $dir, $len) = @$loc;  
                     # Peel off any segments.  
                     while ($len > $maxLength) {  
                         # Process according to the direction.  
                         if ($dir eq '+') {  
                             # Forward strand: peel from the beginning.  
                             $self->ConnectLocation($fid, $contig, $segment, $left, $dir,  
                                                    $maxLength);  
                             $left += $maxLength;  
                         } else {  
                             # Reverse strand: peel from the end.  
                             $self->ConnectLocation($fid, $contig, $segment,  
                                                   $left + $len - $maxLength, $dir,  
                                                   $maxLength);  
                         }  
                         # Denote we've used up a segment.  
                         $len -= $maxLength;  
                         $segment++;  
                     }  
                     # Output the last segment.  
                     $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);  
                 }  
                 # Now we process the aliases and create the identifiers. We don't do this  
                 # for RNA, because the RNA function is stored in the aliases.  
                 if ($type ne 'rna') {  
                     for my $alias (@aliases) {  
                         my $normalized;  
                         # Determine the type.  
                         my $aliasType = AliasAnalysis::TypeOf($alias);  
                         $stats->Add(aliasAll => 1);  
                         # Is this a recognized type?  
                         if ($aliasType) {  
                             $stats->Add(aliasNormal => 1);  
                             # Yes. Write it normally.  
                             $self->CreateIdentifier($alias, B => $aliasType, $fid);  
                         } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {  
                             # No, but this is a specially-marked locus tag.  
                             $normalized = $1;  
                             $stats->Add(aliasLocus => 1);  
                             $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                         } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {  
                             # No, but this is a natural locus tag.  
                             $stats->Add(aliasLocus => 1);  
                             $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                         } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {  
                             # No, but this is a natural gene name.  
                             $stats->Add(aliasGene => 1);  
                             $self->CreateIdentifier($normalized, B => 'GENE', $fid);  
                         } elsif ($alias =~ /^\d+$/) {  
                             # Here it's a naked number, which means it's a GI number  
                             # of some sort.  
                             $stats->Add(aliasGI => 1);  
                             $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);  
                         } elsif ($alias =~ /^protein_id\|(.+)/) {  
                             # Here we have a REFSEQ protein ID. Right now we don't have a way to  
                             # handle that, because we don't know the feature's protein ID here.  
                             $stats->Add(aliasProtein => 1);  
                         } elsif ($alias =~ /[:|]/) {  
                             # Here it's an alias of an unknown type, so we skip it.  
                             $stats->Add(aliasUnknown => 1);  
                         } else {  
                             # Here it's a miscellaneous type.  
                             $stats->Add(aliasMisc => 1);  
                             $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);  
                         }  
                     }  
                 }  
             }  
         }  
     }  
     # Now loop through the features, connecting them to their roles. Note that deleted  
     # features will not be in the assignment hash.  
     for my $fid (keys %$assignHash) {  
         # Get the roles and the error count.  
         my ($roles, $errors) = SeedUtils::roles_for_loading($assignHash->{$fid});  
         # Accumulate the errors in the stats object.  
         $stats->Add(roleErrors => $errors);  
         # Is this a suspicious function?  
         if (! defined $roles) {  
             # Yes, so track it.  
             $stats->Add(badFunction => 1);  
         } else {  
             # No, connect the roles.  
             for my $role (@$roles) {  
                 # Insure this role exists.  
                 my $hypo = hypo($role);  
                 $self->InsureEntity(Role => $role, hypothetical => $hypo);  
                 # Connect it to the feature.  
                 $sap->InsertObject('IsFunctionalIn', from_link => $role, to_link => $fid);  
618              }              }
619          }          }
620      }      }
621  }  }
622    
623    
624  =head3 LoadProteinData  =head3 LoadProteinData
625    
626      $self->LoadProteinData($fileName);      my $protHash = $self->LoadProteinData($fileName);
627    
628  Load the protein sequences from the named FASTA file. The sequences will be stored  Load the protein sequences from the named FASTA file. The sequences will be stored
629  in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature  in a hash by FIG feature ID.
 records themselves will not be created (it is presumed they are already there).  
630    
631  =over 4  =over 4
632    
# Line 578  Line 634 
634    
635  Name of the FASTA file containing the protein sequences for this genome.  Name of the FASTA file containing the protein sequences for this genome.
636    
637    =item RETURN
638    
639    Returns a hash mapping feature IDs to protein sequences.
640    
641  =back  =back
642    
643  =cut  =cut
# Line 587  Line 647 
647      my ($self, $fileName) = @_;      my ($self, $fileName) = @_;
648      # Open the FASTA file for input.      # Open the FASTA file for input.
649      my $ih = Open(undef, "<$fileName");      my $ih = Open(undef, "<$fileName");
650        # Create the return hash.
651        my $retVal = {};
652      # We'll track the current protein in here.      # We'll track the current protein in here.
653      my $fid;      my $fid;
654      my $sequence = "";      my $sequence = "";
# Line 600  Line 662 
662              my $newFid = $1;              my $newFid = $1;
663              # Do we have an existing protein?              # Do we have an existing protein?
664              if (defined $fid) {              if (defined $fid) {
665                  # Yes. Write it out.                  # Yes. Store it in the hash.
666                  $self->WriteProtein($fid, $sequence);                  $retVal->{$fid} = $sequence;
667              }              }
668              # Initialize for the next protein.              # Initialize for the next protein.
669              $fid = $newFid;              $fid = $newFid;
# Line 613  Line 675 
675      }      }
676      # Do we have a residual protein.      # Do we have a residual protein.
677      if (defined $fid) {      if (defined $fid) {
678          # Yes. Write it out.          # Yes. Store it in the hash.
679          $self->WriteProtein($fid, $sequence);          $retVal->{$fid} = $sequence;
680        }
681        # Return the hash.
682        return $retVal;
683    }
684    
685    
686    =head3 LoadAnnotations
687    
688        $loaderObject->LoadAnnotations($fileName);
689    
690    Read in the annotation history information and use it to create annotation records.
691    
692    =over 4
693    
694    =item fileName
695    
696    Name of the annotation history file. This file is formatted with four fields per
697    record. Each field is on a separate line, with a double slash (C<//>) used as the
698    line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
699    (formatted as an integer), (2) the user name, and (3) the annotation text.
700    
701    =back
702    
703    =cut
704    
705    sub LoadAnnotations {
706        # Get the parameters.
707        my ($self, $fileName) = @_;
708        # Get the assignment Hash. We use this to filter out deleted features.
709        my $assignHash = $self->{assignHash};
710        # Get the Sapling database.
711        my $sap = $self->{sap};
712        # Get the statistics object.
713        my $stats = $self->{stats};
714        # Open the input file.
715        my $ih = Tracer::Open(undef, "<$fileName");
716        # Loop through the input.
717        while (! eof $ih) {
718            # Read in the peg, timestamp, and user ID.
719            my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
720            # Only proceed if the feature is not deleted.
721            if ($assignHash->{$fid}) {
722                # Add the annotation to this feature.
723                $self->MakeAnnotation($fid, $text, $user, $timestamp);
724      }      }
725  }  }
726    }
727    
728    
729  =head3 WriteProtein  =head3 WriteProtein
730    
# Line 643  Line 751 
751      # Get the parameters.      # Get the parameters.
752      my ($self, $fid, $sequence) = @_;      my ($self, $fid, $sequence) = @_;
753      # Compute the key of the protein sequence.      # Compute the key of the protein sequence.
754      my $protID = ERDB::DigestKey($sequence);      my $protID = $self->{sap}->ProteinID($sequence);
755      # Insure the protein exists.      # Insure the protein exists.
756      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
757      # Connect the feature to it.      # Connect the feature to it.
# Line 689  Line 797 
797      while (! eof $ih) {      while (! eof $ih) {
798          # Get this subsystem.          # Get this subsystem.
799          my ($subsystem, $variant) = Tracer::GetLine($ih);          my ($subsystem, $variant) = Tracer::GetLine($ih);
800            Trace("Processing subsystem $subsystem variant $variant.") if T(3);
801          # Normalize the subsystem name.          # Normalize the subsystem name.
802          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
803          # Compute this subsystem's MD5.          # Compute this subsystem's MD5.
# Line 707  Line 816 
816                  my $roleH = $roleList->[0];                  my $roleH = $roleList->[0];
817                  my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);                  my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
818                  $sap->InsertObject('Subsystem', %subFields);                  $sap->InsertObject('Subsystem', %subFields);
819                    $stats->Add(subsystems => 1);
820                  # Now loop through the roles. The Includes records are always inserted, but the                  # Now loop through the roles. The Includes records are always inserted, but the
821                  # roles are only inserted if they don't already exist.                  # roles are only inserted if they don't already exist.
822                  for $roleH (@$roleList) {                  for $roleH (@$roleList) {
# Line 721  Line 831 
831                      # Compute the machine-role ID.                      # Compute the machine-role ID.
832                      my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});                      my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
833                      $machineRoles{$subsystem}{$roleID} = $machineRoleID;                      $machineRoles{$subsystem}{$roleID} = $machineRoleID;
834                        $stats->Add(subsystemRoles => 1);
835                  }                  }
836              }              }
837          } else {          } else {
# Line 742  Line 853 
853          $variantCode =~ s/\*+$//;          $variantCode =~ s/\*+$//;
854          # Compute the variant key.          # Compute the variant key.
855          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
856            # Insure we have it in the database.
857            if (! $sap->Exists(Variant => $variantKey)) {
858          # Get the variant from the sapling server.          # Get the variant from the sapling server.
859          my $variantH = $sapO->get(-objects => "Variant",          my $variantH = $sapO->get(-objects => "Variant",
860                                    -filter => {"Variant(id)" => ["=", $variantKey]},                                    -filter => {"Variant(id)" => ["=", $variantKey]},
# Line 750  Line 863 
863                                                "Variant(type)" => "type",                                                "Variant(type)" => "type",
864                                                "Variant(role-rule)" => "role-rule"},                                                "Variant(role-rule)" => "role-rule"},
865                                    -firstOnly => 1);                                    -firstOnly => 1);
         # Insure we have it in the database.  
866          $self->InsureEntity('Variant', $variantKey, %$variantH);          $self->InsureEntity('Variant', $variantKey, %$variantH);
         $stats->Add(subsystems => 1);  
867          # Connect it to the subsystem.          # Connect it to the subsystem.
868          $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);          $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
869                $stats->Add(subsystemVariants => 1);
870            }
871          # Now we create the molecular machine connecting this genome to the subsystem          # Now we create the molecular machine connecting this genome to the subsystem
872          # variant.          # variant.
873          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
# Line 954  Line 1067 
1067    
1068  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1069    
1070  =head3 CreateIdentifier  =head3 ReadAnnotation
1071    
1072      $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);      my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1073    
1074  Link an identifier to a feature. The identifier is presented in prefixed form and is of the  Read the next record from an annotation file. The next record must exist (that is, an
1075  specified type and the specified confidence level.  end-of-file check should have been performed before calling this method).
1076    
1077  =over 4  =over 4
1078    
1079  =item alias  =item ih
   
 Identifier to connect to the feature.  
1080    
1081  =item conf  Open file handle for the annotation file.
1082    
1083  Confidence level (C<A> curated, C<B> normal, C<C> protein only).  =item RETURN
   
 =item aliasType  
   
 Type of alias (e.g. C<NCBI>, C<LocusTag>).  
   
 =item fid  
1084    
1085  ID of the relevant feature.  Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1086    timestamp, (2) the user ID, and (3) the annotation text.
1087    
1088  =back  =back
1089    
1090  =cut  =cut
1091    
1092  sub CreateIdentifier {  sub ReadAnnotation {
1093      # Get the parameters.      # Get the parameter.
1094      my ($self, $alias, $conf, $aliasType, $fid) = @_;      my ($ih) = @_;
1095      # Get the Sapling object.      # Read the three fixed fields.
1096      my $sap = $self->{sap};      my $fid = <$ih>; chomp $fid;
1097      # Compute the identifier's natural form.      my $timestamp = <$ih>; chomp $timestamp;
1098      my $natural = $alias;      my $user = <$ih>; chomp $user;
1099      if ($natural =~ /[:|](.+)/) {      # Loop through the lines of the text field.
1100          $natural = $1;      my $text = "";
1101      }      my $line = <$ih>;
1102      # Insure the identifier exists in the database.      while ($line ne "//\n") {
1103      $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);          $text .= $line;
1104      # Connect the identifier to the feature.          $line = <$ih>;
1105      $sap->InsertObject('Identifies', from_link => $alias, to_link => $fid, conf => $conf);      }
1106        # Remove the trailing new-line from the text.
1107        chomp $text;
1108        # Return the fields.
1109        return ($fid, $timestamp, $user, $text);
1110  }  }
1111    
1112  1;  1;

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.11

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3