[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Tue Jan 11 15:04:03 2011 UTC revision 1.12, Tue Jul 12 18:41:53 2011 UTC
# Line 26  Line 26 
26      use SAPserver;      use SAPserver;
27      use Sapling;      use Sapling;
28      use AliasAnalysis;      use AliasAnalysis;
29        use base qw(SaplingDataLoader);
30    
31  =head1 Sapling Genome Loader  =head1 Sapling Genome Loader
32    
# Line 56  Line 57 
57    
58  Name of the directory containing the genome information.  Name of the directory containing the genome information.
59    
60    =item assignHash
61    
62    Hash of feature IDs to functional assignments. Deleted features are removed, which
63    means only features listed in this hash can be legally inserted into the database.
64    
65  =back  =back
66    
67  =cut  =cut
# Line 66  Line 72 
72      # Create the loader object.      # Create the loader object.
73      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
74      # Load the contigs.      # Load the contigs.
75      Trace("Loading contigs for $genome.") if T(2);      Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
76      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
77      # Load the features.      # Load the features.
78      Trace("Loading features for $genome.") if T(2);      Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
79      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
80        # Check for annotation history. If we have it, load the history records into the
81        # database.
82        if (-f "$directory/annotations") {
83            Trace("Processing annotations.") if T(SaplingDataLoader => 3);
84            $loaderObject->LoadAnnotations("$directory/annotations");
85        }
86      # Load the subsystem bindings.      # Load the subsystem bindings.
87      Trace("Loading subsystems for $genome.") if T(2);      Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
88      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
89      # Create the Genome record and taxonomy information.      # Create the Genome record and taxonomy information.
90      Trace("Creating root for $genome.") if T(2);      Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
91      $loaderObject->CreateGenome();      $loaderObject->CreateGenome();
92      # Return the statistics.      # Return the statistics.
93      return $loaderObject->{stats};      return $loaderObject->{stats};
# Line 111  Line 123 
123      my ($sap, $genome) = @_;      my ($sap, $genome) = @_;
124      # Create the statistics object.      # Create the statistics object.
125      my $stats = Stats->new();      my $stats = Stats->new();
126      # Delete the DNA.      # Delete the DNA sequences.
127      DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');      my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
128        for my $seq (@seqs) {
129            my $delStats = $sap->Delete(DNASequence => $seq);
130            $stats->Accumulate($delStats);
131        }
132      # Delete the contigs.      # Delete the contigs.
133      DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
134      # Delete the features.      # Delete the features.
135      DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
136      # Delete the molecular machines.      # Delete the molecular machines.
137      DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
138      # Delete the genome itself.      # Delete the genome itself.
139      my $subStats = $sap->Delete(Genome => $genome);      my $subStats = $sap->Delete(Genome => $genome);
140      # Accumulate the statistics from the delete.      # Accumulate the statistics from the delete.
# Line 127  Line 143 
143      return $stats;      return $stats;
144  }  }
145    
146    
147    =head3 Process
148    
149        my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
150    
151    Load genome data from the specified directory. If the genome data already
152    exists in the database, it will be deleted first.
153    
154    =over 4
155    
156    =item sap
157    
158    L</Sapling> object for accessing the database.
159    
160    =item genome
161    
162    ID of the genome whose  data is being loaded.
163    
164    =item directory
165    
166    Name of the directory containing the genome data files.
167    
168    =item RETURN
169    
170    Returns a statistics object describing the activity during the reload.
171    
172    =back
173    
174    =cut
175    
176    sub Process {
177        # Get the parameters.
178        my ($sap, $genome, $directory) = @_;
179        # Clear the existing data for the specified genome.
180        my $stats = ClearGenome($sap, $genome);
181        # Load the new expression data from the specified directory.
182        my $newStats = Load($sap, $genome, $directory);
183        # Merge the statistics.
184        $stats->Accumulate($newStats);
185        # Return the result.
186        return $stats;
187    }
188    
189    
190  =head2 Loader Object Methods  =head2 Loader Object Methods
191    
192  =head3 new  =head3 new
# Line 184  Line 244 
244      # Get the parameters.      # Get the parameters.
245      my ($class, $sap, $genome, $directory) = @_;      my ($class, $sap, $genome, $directory) = @_;
246      # Create the object.      # Create the object.
247      my $retVal = {      my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
248          sap => $sap,      # Add our specialized data.
249          genome => $genome,      $retVal->{genome} = $genome;
250          directory => $directory,      $retVal->{directory} = $directory;
251          stats => Stats->new(qw(contigs dna pegs rnas)),      # Leave the assignment hash undefined until we populate it.
252          supportRecords => {}      $retVal->{assignHash} = undef;
253      };      # Return the result.
     # Bless and return it.  
     bless $retVal, $class;  
254      return $retVal;      return $retVal;
255  }  }
256    
# Line 360  Line 418 
418  sub LoadFeatures {  sub LoadFeatures {
419      # Get the parameters.      # Get the parameters.
420      my ($self) = @_;      my ($self) = @_;
421        # Read in the functional assignments.
422        Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
423        my $assignHash = $self->ReadAssignments();
424        $self->{assignHash} = $assignHash;
425      # Get the directory of feature types.      # Get the directory of feature types.
426      my $featureDir = "$self->{directory}/Features";      my $featureDir = "$self->{directory}/Features";
427      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
428        # Check for protein sequences. If we have some, load them into a hash.
429        my $protHash = {};
430        if (-f "$featureDir/peg/fasta") {
431            Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
432            $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
433        }
434      # Create the feature records for the types found.      # Create the feature records for the types found.
435      for my $type (@types) {      for my $type (@types) {
436          # Insure this is a genuine feature directory.          # Insure this is a genuine feature directory.
437          if (-f "$featureDir/$type/tbl") {          if (-f "$featureDir/$type/tbl") {
438              # Yes, load the feature data.              # Yes. Read in the evidence codes (if any).
439              $self->LoadFeatureData($featureDir, $type);              my $evHash = {};
440                my $tranFile = "$featureDir/$type/Attributes/transaction_log";
441                if (-f $tranFile) {
442                    $evHash = $self->LoadEvidenceCodes($tranFile);
443          }          }
444                # Now load the feature data.
445                $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
446      }      }
     # Check for protein sequences. If we have some, load them into the database.  
     if (-f "$featureDir/peg/fasta") {  
         $self->LoadProteinData("$featureDir/peg/fasta");  
447      }      }
448  }  }
449    
450    =head3 LoadEvidenceCodes
451    
452        my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
453    
454    Load the evidence codes from the specified attribute transaction log file into a
455    hash. The log file is in tab-delimited format. The first column contains the
456    transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
457    the third an attribute name (we'll ignore everything but C<evidence_code>), and
458    the fourth the attribute value.
459    
460    =over 4
461    
462    =item attributeFile
463    
464    Name of the attribute transaction log file.
465    
466    =item RETURN
467    
468    Returns a reference to a hash mapping each feature ID to a comma-delimited list
469    of its evidence codes.
470    
471    =back
472    
473    =cut
474    
475    sub LoadEvidenceCodes {
476        # Get the parameters.
477        my ($self, $attributeFile) = @_;
478        # Get the Sapling database.
479        my $sap = $self->{sap};
480        # Get the statistics object.
481        my $stats = $self->{stats};
482        # Get the assignment hash: we use this to filter the feature IDs.
483        my $assignHash = $self->{assignHash};
484        # Open the attribute log file for input.
485        my $ih = Open(undef, "<$attributeFile");
486        # This two-dimensional hash will hold the evidence codes for each feature.
487        my %retVal;
488        # Loop through the attribute log file.
489        while (! eof $ih) {
490            # Get the current attribute record.
491            my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
492            $stats->Add(attributeLine => 1);
493            # Insure we have all the pieces we need.
494            if (! $command || ! $fid || $key ne 'evidence_code') {
495                $stats->Add(attributeLineSkipped => 1);
496            } elsif (! $assignHash->{$fid}) {
497                # Here the attribute is for a deleted feature.
498                $stats->Add(attributeFidSkipped => 1);
499            } else {
500                # Get the sub-hash for this feature.
501                if (! exists $retVal{$fid}) {
502                    $retVal{$fid} = {};
503                }
504                my $featureSubHash = $retVal{$fid};
505                # Process according to the command.
506                if ($command eq 'ADD') {
507                    # Here we are adding a new evidence code.
508                    $featureSubHash->{$value} = 1;
509                    $stats->Add(attributeAdd => 1);
510                } elsif ($command eq 'DELETE') {
511                    # Here we are deleting an evidence code.
512                    delete $featureSubHash->{$value};
513                    $stats->Add(attributeDelete => 1);
514                } else {
515                    # here we have an unrecognized command.
516                    $stats->Add(attributeCommandSkip => 1);
517                }
518            }
519        }
520        # Loop through the hash, converting each sub-hash to a comma-delimited list of
521        # evidence codes.
522        for my $fid (keys %retVal) {
523            $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
524        }
525        # Return the evidence hash.
526        return \%retVal;
527    }
528    
529    
530  =head3 LoadFeatureData  =head3 LoadFeatureData
531    
532      $self->LoadFeatureData($featureDir, $type);      $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
533    
534  Load the basic data for each feature into the database. The number of features of  Load the basic data for each feature into the database. The number of features of
535  the type found will be recorded in the statistics object.  the type found will be recorded in the statistics object.
# Line 394  Line 544 
544    
545  Type of feature to load.  Type of feature to load.
546    
547    =item protHash
548    
549    Reference to a hash mapping each feature ID for a protein-encoding gene to
550    its protein sequence.
551    
552    =item evHash
553    
554    Reference to a hash mapping each feature ID to a comma-delimited list of
555    its evidence codes (if any).
556    
557  =back  =back
558    
559  =cut  =cut
560    
561  sub LoadFeatureData {  sub LoadFeatureData {
562      # Get the parameters.      # Get the parameters.
563      my ($self, $featureDir, $type) = @_;      my ($self, $featureDir, $type, $protHash, $evHash) = @_;
564      # Get the sapling database.      # Get the sapling database.
565      my $sap = $self->{sap};      my $sap = $self->{sap};
     # Get the maximum location  segment length. We'll need this later.  
     my $maxLength = $sap->TuningParameter('maxLocationLength');  
566      # Get the statistics object.      # Get the statistics object.
567      my $stats = $self->{stats};      my $stats = $self->{stats};
568      # Read in the functional assignments.      # Get the assignment hash. This tells us our functional assignments. This method is
569      my $assignHash = $self->ReadAssignments();      # also where we remove the deleted features from it.
570        my $assignHash = $self->{assignHash};
571      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
572      # time, it overwrites the original.      # time, it overwrites the original.
573      my %fids;      my %fidHash;
574        # Finally, we need the timestamp hash. The initial feature population
575      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
576      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
577      if (-f $fileName) {      if (-f $fileName) {
578          # We have one, so open it for input.          # We have one, so we can read through it. First, however, we need to get the list
579            # of deleted features and remove them from the assignment hash. This insures
580            # that they are not used by subsequent methods.
581            my $deleteFile = "$featureDir/$type/deleted.features";
582            if (-f $deleteFile) {
583                my $dh = Open(undef, "<$deleteFile");
584                while (! eof $dh) {
585                    my ($deletedFid) = Tracer::GetLine($dh);
586                    if (exists $assignHash->{$deletedFid}) {
587                        delete $assignHash->{$deletedFid};
588                        $stats->Add(deletedFid => 1);
589                    }
590                }
591            }
592            # Open the main file for input.
593            Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
594          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
595          while (! eof $ih) {          while (! eof $ih) {
596              # Read this feature's information.              # Read this feature's information.
597              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
598              # If the feature already exists, delete it.              # Only proceed if the feature is NOT deleted.
599              if (exists $fids{$fid}) {              if (exists $assignHash->{$fid}) {
600                    # If the feature already exists, delete it. (This should be extremely rare.)
601                    if ($fidHash{$fid}) {
602                  $sap->Delete(Feature => $fid);                  $sap->Delete(Feature => $fid);
603                  $stats->Add(duplicateFid => 1);                  $stats->Add(duplicateFid => 1);
604              }              }
605              # Otherwise connect this feature to the genome.                  # If this is RNA, the alias list is always empty. Sometimes, the functional
606              $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);                  # assignment is found there.
607              # Now we must parse the locations. This will contain a list of the location                  if ($type eq 'rna') {
608              # data 4-tuples (contig, start, dir, len).                      if (! $assignHash->{$fid}) {
609              my @locData;                          $assignHash->{$fid} = $aliases[0];
610              # This will contain the total sequence length.                      }
611              my $length = 0;                      @aliases = ();
612              # Loop through the locations.                  }
613              for my $loc (split /\s*,\s*/, $locations) {                  # Add the feature to the database.
614                  # Get this location's components.                  $self->AddFeature($fid, $assignHash->{$fid}, $locations, \@aliases,
615                  my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);                                    $protHash->{$fid}, $evHash->{$fid});
616                  # Normalize the contig.                  # Denote we've added this feature, so that if a duplicate occurs we're ready.
617                  $self->FixContig(\$contig);                  $fidHash{$fid} = 1;
                 # Compute the direction.  
                 if ($start <= $stop) {  
                     # Here we have the forward strand.  
                     my $len = $stop + 1 - $start;  
                     push @locData, [$contig, $start, '+', $len];  
                     $length += $len;  
                 } else {  
                     # Here we have the reverse strand.  
                     my $len = $start + 1 - $stop;  
                     push @locData, [$contig, $stop, '-', $len];  
                     $length += $len;  
                 }  
             }  
             # Now we can create the feature record.  
             $sap->InsertObject('Feature', id => $fid, feature_type => $type,  
                                function => $assignHash->{$fid}, locked => 0,  
                                sequence_length => $length);  
             $fids{$fid} = 1;  
             $stats->Add($type => 1);  
             # The next step is to connect the feature record to its locations. This  
             # involves dividing the location into segments. The following variable  
             # will count the total number of segments.  
             my $segment = 0;  
             # Loop through the locations.  
             for my $loc (@locData) {  
                 # Get the current location's information.  
                 my ($contig, $left, $dir, $len) = @$loc;  
                 # Peel off any segments.  
                 while ($len > $maxLength) {  
                     # Process according to the direction.  
                     if ($dir eq '+') {  
                         # Forward strand: peel from the beginning.  
                         $self->ConnectLocation($fid, $contig, $segment, $left, $dir,  
                                                $maxLength);  
                         $left += $maxLength;  
                     } else {  
                         # Reverse strand: peel from the end.  
                         $self->ConnectLocation($fid, $contig, $segment,  
                                               $left + $len - $maxLength, $dir,  
                                               $maxLength);  
                     }  
                     # Denote we've used up a segment.  
                     $len -= $maxLength;  
                     $segment++;  
                 }  
                 # Output the last segment.  
                 $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);  
             }  
             # Now we process the aliases and create the identifiers. We don't do this  
             # for RNA, because the RNA function is stored in the aliases.  
             if ($type ne 'rna') {  
                 for my $alias (@aliases) {  
                     my $normalized;  
                     # Determine the type.  
                     my $aliasType = AliasAnalysis::TypeOf($alias);  
                     $stats->Add(aliasAll => 1);  
                     # Is this a recognized type?  
                     if ($aliasType) {  
                         $stats->Add(aliasNormal => 1);  
                         # Yes. Write it normally.  
                         $self->CreateIdentifier($alias, B => $aliasType, $fid);  
                     } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {  
                         # No, but this is a specially-marked locus tag.  
                         $normalized = $1;  
                         $stats->Add(aliasLocus => 1);  
                         $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                     } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {  
                         # No, but this is a natural locus tag.  
                         $stats->Add(aliasLocus => 1);  
                         $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                     } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {  
                         # No, but this is a natural gene name.  
                         $stats->Add(aliasGene => 1);  
                         $self->CreateIdentifier($normalized, B => 'GENE', $fid);  
                     } elsif ($alias =~ /^\d+$/) {  
                         # Here it's a naked number, which means it's a GI number  
                         # of some sort.  
                         $stats->Add(aliasGI => 1);  
                         $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);  
                     } elsif ($alias =~ /^protein_id\|(.+)/) {  
                         # Here we have a REFSEQ protein ID. Right now we don't have a way to  
                         # handle that, because we don't know the feature's protein ID here.  
                         $stats->Add(aliasProtein => 1);  
                     } elsif ($alias =~ /[:|]/) {  
                         # Here it's an alias of an unknown type, so we skip it.  
                         $stats->Add(aliasUnknown => 1);  
                     } else {  
                         # Here it's a miscellaneous type.  
                         $stats->Add(aliasMisc => 1);  
                         $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);  
                     }  
                 }  
618              }              }
619          }          }
620      }      }
621  }  }
622    
623    
624  =head3 LoadProteinData  =head3 LoadProteinData
625    
626      $self->LoadProteinData($fileName);      my $protHash = $self->LoadProteinData($fileName);
627    
628  Load the protein sequences from the named FASTA file. The sequences will be stored  Load the protein sequences from the named FASTA file. The sequences will be stored
629  in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature  in a hash by FIG feature ID.
 records themselves will not be created (it is presumed they are already there).  
630    
631  =over 4  =over 4
632    
# Line 549  Line 634 
634    
635  Name of the FASTA file containing the protein sequences for this genome.  Name of the FASTA file containing the protein sequences for this genome.
636    
637    =item RETURN
638    
639    Returns a hash mapping feature IDs to protein sequences.
640    
641  =back  =back
642    
643  =cut  =cut
# Line 558  Line 647 
647      my ($self, $fileName) = @_;      my ($self, $fileName) = @_;
648      # Open the FASTA file for input.      # Open the FASTA file for input.
649      my $ih = Open(undef, "<$fileName");      my $ih = Open(undef, "<$fileName");
650        # Create the return hash.
651        my $retVal = {};
652      # We'll track the current protein in here.      # We'll track the current protein in here.
653      my $fid;      my $fid;
654      my $sequence = "";      my $sequence = "";
# Line 571  Line 662 
662              my $newFid = $1;              my $newFid = $1;
663              # Do we have an existing protein?              # Do we have an existing protein?
664              if (defined $fid) {              if (defined $fid) {
665                  # Yes. Write it out.                  # Yes. Store it in the hash.
666                  $self->WriteProtein($fid, $sequence);                  $retVal->{$fid} = $sequence;
667              }              }
668              # Initialize for the next protein.              # Initialize for the next protein.
669              $fid = $newFid;              $fid = $newFid;
# Line 584  Line 675 
675      }      }
676      # Do we have a residual protein.      # Do we have a residual protein.
677      if (defined $fid) {      if (defined $fid) {
678          # Yes. Write it out.          # Yes. Store it in the hash.
679          $self->WriteProtein($fid, $sequence);          $retVal->{$fid} = $sequence;
680      }      }
681        # Return the hash.
682        return $retVal;
683  }  }
684    
685    
686    =head3 LoadAnnotations
687    
688        $loaderObject->LoadAnnotations($fileName);
689    
690    Read in the annotation history information and use it to create annotation records.
691    
692    =over 4
693    
694    =item fileName
695    
696    Name of the annotation history file. This file is formatted with four fields per
697    record. Each field is on a separate line, with a double slash (C<//>) used as the
698    line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
699    (formatted as an integer), (2) the user name, and (3) the annotation text.
700    
701    =back
702    
703    =cut
704    
705    sub LoadAnnotations {
706        # Get the parameters.
707        my ($self, $fileName) = @_;
708        # Get the assignment Hash. We use this to filter out deleted features.
709        my $assignHash = $self->{assignHash};
710        # Get the Sapling database.
711        my $sap = $self->{sap};
712        # Get the statistics object.
713        my $stats = $self->{stats};
714        # Open the input file.
715        my $ih = Tracer::Open(undef, "<$fileName");
716        # Loop through the input.
717        while (! eof $ih) {
718            # Read in the peg, timestamp, and user ID.
719            my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
720            # Only proceed if the feature is not deleted.
721            if ($assignHash->{$fid}) {
722                # Add the annotation to this feature.
723                $self->MakeAnnotation($fid, $text, $user, $timestamp);
724            }
725        }
726    }
727    
728    
729  =head3 WriteProtein  =head3 WriteProtein
730    
731      $loaderObject->WriteProtein($fid, $sequence);      $loaderObject->WriteProtein($fid, $sequence);
# Line 614  Line 751 
751      # Get the parameters.      # Get the parameters.
752      my ($self, $fid, $sequence) = @_;      my ($self, $fid, $sequence) = @_;
753      # Compute the key of the protein sequence.      # Compute the key of the protein sequence.
754      my $protID = ERDB::DigestKey($sequence);      my $protID = $self->{sap}->ProteinID($sequence);
755      # Insure the protein exists.      # Insure the protein exists.
756      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
757      # Connect the feature to it.      # Connect the feature to it.
758      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
759  }  }
760    
 =head3 InsureEntity  
   
     my $createdFlag = $loaderObject->InsureEntity($entityType => $id, %fields);  
   
 Insure that the specified record exists in the database. If no record is found of the  
 specified type with the specified ID, one will be created with the indicated fields.  
   
 =over 4  
   
 =item $entityType  
   
 Type of entity to check.  
   
 =item id  
   
 ID of the entity instance in question.  
   
 =item fields  
   
 Hash mapping field names to values for all the fields in the desired entity record except  
 for the ID.  
   
 =item RETURN  
   
 Returns TRUE if a new object was created, FALSE if it already existed.  
   
 =back  
   
 =cut  
   
 sub InsureEntity {  
     # Get the parameters.  
     my ($self, $entityType, $id, %fields) = @_;  
     # Get the database.  
     my $sap = $self->{sap};  
     # Get the support record ID hash.  
     my $supportHash = $self->{supportRecords};  
     # Denote we haven't created a new record.  
     my $retVal = 0;  
     # Get the sub-hash for this entity type.  
     my $entityHash = $supportHash->{$entityType};  
     if (! defined $entityHash) {  
         $entityHash = {};  
         $supportHash->{$entityType} = $entityHash;  
     }  
     # Check for this instance.  
     if (! $entityHash->{$id}) {  
         # It's not found. Check the database.  
         if (! $sap->Exists($entityType => $id)) {  
             # It's not in the database either, so create it.  
             $sap->InsertObject($entityType, id => $id, %fields);  
             $self->{stats}->Add(insertSupport => 1);  
             $retVal = 1;  
         }  
         # Mark the record in the hash so we know we have it.  
         $entityHash->{$id} = 1;  
     }  
     # Return the insertion indicator.  
     return $retVal;  
 }  
   
761  =head3 LoadSubsystems  =head3 LoadSubsystems
762    
763      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
# Line 721  Line 797 
797      while (! eof $ih) {      while (! eof $ih) {
798          # Get this subsystem.          # Get this subsystem.
799          my ($subsystem, $variant) = Tracer::GetLine($ih);          my ($subsystem, $variant) = Tracer::GetLine($ih);
800            Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
801          # Normalize the subsystem name.          # Normalize the subsystem name.
802          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
803          # Compute this subsystem's MD5.          # Compute this subsystem's MD5.
# Line 737  Line 814 
814              if (@$roleList > 0) {              if (@$roleList > 0) {
815                  # Get the subsystem information from the first role and create the subsystem.                  # Get the subsystem information from the first role and create the subsystem.
816                  my $roleH = $roleList->[0];                  my $roleH = $roleList->[0];
817                  my %subFields = ExtractFields(Subsystem => $roleH);                  my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
818                  $sap->InsertObject('Subsystem', %subFields);                  $sap->InsertObject('Subsystem', %subFields);
819                    $stats->Add(subsystems => 1);
820                  # Now loop through the roles. The Includes records are always inserted, but the                  # Now loop through the roles. The Includes records are always inserted, but the
821                  # roles are only inserted if they don't already exist.                  # roles are only inserted if they don't already exist.
822                  for $roleH (@$roleList) {                  for $roleH (@$roleList) {
823                      # Create the Includes record.                      # Create the Includes record.
824                      my %incFields = ExtractFields(Includes => $roleH);                      my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);
825                      $sap->InsertObject('Includes', %incFields);                      $sap->InsertObject('Includes', %incFields);
826                      # Insure we have the role in place.                      # Insure we have the role in place.
827                      my %roleFields = ExtractFields(Role => $roleH);                      my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);
828                      my $roleID = $roleFields{id};                      my $roleID = $roleFields{id};
829                      delete $roleFields{id};                      delete $roleFields{id};
830                      $self->InsureEntity('Role', $roleID, %roleFields);                      $self->InsureEntity('Role', $roleID, %roleFields);
831                      # Compute the machine-role ID.                      # Compute the machine-role ID.
832                      my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});                      my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
833                      $machineRoles{$subsystem}{$roleID} = $machineRoleID;                      $machineRoles{$subsystem}{$roleID} = $machineRoleID;
834                        $stats->Add(subsystemRoles => 1);
835                  }                  }
836              }              }
837          } else {          } else {
# Line 774  Line 853 
853          $variantCode =~ s/\*+$//;          $variantCode =~ s/\*+$//;
854          # Compute the variant key.          # Compute the variant key.
855          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
856            # Insure we have it in the database.
857            if (! $sap->Exists(Variant => $variantKey)) {
858          # Get the variant from the sapling server.          # Get the variant from the sapling server.
859          my $variantH = $sapO->get(-objects => "Variant",          my $variantH = $sapO->get(-objects => "Variant",
860                                    -filter => {"Variant(id)" => ["=", $variantKey]},                                    -filter => {"Variant(id)" => ["=", $variantKey]},
# Line 782  Line 863 
863                                                "Variant(type)" => "type",                                                "Variant(type)" => "type",
864                                                "Variant(role-rule)" => "role-rule"},                                                "Variant(role-rule)" => "role-rule"},
865                                    -firstOnly => 1);                                    -firstOnly => 1);
         # Insure we have it in the database.  
866          $self->InsureEntity('Variant', $variantKey, %$variantH);          $self->InsureEntity('Variant', $variantKey, %$variantH);
         $stats->Add(subsystems => 1);  
867          # Connect it to the subsystem.          # Connect it to the subsystem.
868          $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);          $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
869                $stats->Add(subsystemVariants => 1);
870            }
871          # Now we create the molecular machine connecting this genome to the subsystem          # Now we create the molecular machine connecting this genome to the subsystem
872          # variant.          # variant.
873          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
# Line 986  Line 1067 
1067    
1068  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1069    
1070  =head3 DeleteRelatedRecords  =head3 ReadAnnotation
   
     DeleteRelatedRecords($sap, $genome, $stats, $relName, $entityName);  
   
 Delete all the records in the named entity and relationship relating to the  
 specified genome and roll up the statistics in the specified statistics object.  
   
 =over 4  
   
 =item sap  
   
 L<Sapling> object for accessing the database.  
1071    
1072  =item genome      my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
   
 ID of the relevant genome.  
   
 =item stats  
   
 L<Stats> object for tracking the delete activity.  
   
 =item relName  
   
 Name of a relationship from the B<Genome> table.  
1073    
1074  =item entityName  Read the next record from an annotation file. The next record must exist (that is, an
1075    end-of-file check should have been performed before calling this method).
 Name of the entity on the other side of the relationship.  
   
 =back  
   
 =cut  
   
 sub DeleteRelatedRecords {  
     # Get the parameters.  
     my ($sap, $genome, $stats, $relName, $entityName) = @_;  
     # Get all the relationship records.  
     my (@targets) = $sap->GetFlat($relName, "$relName(from-link) = ?", [$genome],  
                                   "to-link");  
     # Loop through the relationship records, deleting them and the target entity  
     # records.  
     for my $target (@targets) {  
         # Delete the relationship instance.  
         $sap->DeleteRow($relName, $genome, $target);  
         $stats->Add($relName => 1);  
         # Delete the entity instance.  
         my $subStats = $sap->Delete($entityName, $target);  
         # Roll up the statistics.  
         $stats->Accumulate($subStats);  
     }  
 }  
   
 =head3 ExtractFields  
   
     my %fieldHash = SaplingGenomeLoader::ExtractFields($tableName, $dataHash);  
   
 Extract from the incoming hash the field names and values from the specified table.  
1076    
1077  =over 4  =over 4
1078    
1079  =item tableName  =item ih
   
 Name of the table whose field names and values are desired.  
   
 =item dataHash  
1080    
1081  Reference to a hash mapping fully-qualified ERDB field names to values.  Open file handle for the annotation file.
1082    
1083  =item RETURN  =item RETURN
1084    
1085  Returns a hash containing only the fields from the specified table and their values.  Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1086    timestamp, (2) the user ID, and (3) the annotation text.
1087    
1088  =back  =back
1089    
1090  =cut  =cut
1091    
1092  sub ExtractFields {  sub ReadAnnotation {
1093      # Get the parameters.      # Get the parameter.
1094      my ($tableName, $dataHash) = @_;      my ($ih) = @_;
1095      # Declare the return variable.      # Read the three fixed fields.
1096      my %retVal;      my $fid = <$ih>; chomp $fid;
1097      # Extract the desired fields.      my $timestamp = <$ih>; chomp $timestamp;
1098      for my $field (keys %$dataHash) {      my $user = <$ih>; chomp $user;
1099          # Is this a field for the specified table?      # Loop through the lines of the text field.
1100          if ($field =~ /^$tableName\(([^)]+)/) {      my $text = "";
1101              # Yes, put it in the output hash.      my $line = <$ih>;
1102              $retVal{$1} = $dataHash->{$field};      while ($line ne "//\n") {
1103          }          $text .= $line;
1104      }          $line = <$ih>;
1105      # Return the computed hash.      }
1106      return %retVal;      # Remove the trailing new-line from the text.
1107  }      chomp $text;
1108        # Return the fields.
1109  =head3 CreateIdentifier      return ($fid, $timestamp, $user, $text);
   
     $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);  
   
 Link an identifier to a feature. The identifier is presented in prefixed form and is of the  
 specified type and the specified confidence level.  
   
 =over 4  
   
 =item alias  
   
 Identifier to connect to the feature.  
   
 =item conf  
   
 Confidence level (C<A> curated, C<B> normal, C<C> protein only).  
   
 =item aliasType  
   
 Type of alias (e.g. C<NCBI>, C<LocusTag>).  
   
 =item fid  
   
 ID of the relevant feature.  
   
 =back  
   
 =cut  
   
 sub CreateIdentifier {  
     # Get the parameters.  
     my ($self, $alias, $conf, $aliasType, $fid) = @_;  
     # Get the Sapling object.  
     my $sap = $self->{sap};  
     # Compute the identifier's natural form.  
     my $natural = $alias;  
     if ($natural =~ /[:|](.+)/) {  
         $natural = $1;  
     }  
     # Insure the identifier exists in the database.  
     $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);  
     # Connect the identifier to the feature.  
     $sap->InsertObject('Identifies', from_link => $alias, to_link => $fid, conf => $conf);  
1110  }  }
1111    
1112  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.12

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3