[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Tue Dec 14 19:48:38 2010 UTC revision 1.16, Tue Dec 6 21:44:46 2011 UTC
# Line 25  Line 25 
25      use SeedUtils;      use SeedUtils;
26      use SAPserver;      use SAPserver;
27      use Sapling;      use Sapling;
28        use AliasAnalysis;
29        use base qw(SaplingDataLoader);
30    
31  =head1 Sapling Genome Loader  =head1 Sapling Genome Loader
32    
# Line 55  Line 57 
57    
58  Name of the directory containing the genome information.  Name of the directory containing the genome information.
59    
60    =item disconnected
61    
62    True if the application is disconnected from the network - do not
63    attempt to contact a SAP server for more data.
64    
65    =item assignHash
66    
67    Hash of feature IDs to functional assignments. Deleted features are removed, which
68    means only features listed in this hash can be legally inserted into the database.
69    
70  =back  =back
71    
72  =cut  =cut
73    
74  sub Load {  sub Load {
75      # Get the parameters.      # Get the parameters.
76      my ($sap, $genome, $directory) = @_;      my ($sap, $genome, $directory, $disconnected) = @_;
77      # Create the loader object.      # Create the loader object.
78      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory, $disconnected);
79      # Load the contigs.      # Load the contigs.
80        Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
81      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
82      # Load the features.      # Load the features.
83        Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
84      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
85        # Check for annotation history. If we have it, load the history records into the
86        # database.
87        if (-f "$directory/annotations") {
88            Trace("Processing annotations.") if T(SaplingDataLoader => 3);
89            $loaderObject->LoadAnnotations("$directory/annotations");
90        }
91      # Load the subsystem bindings.      # Load the subsystem bindings.
92        Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
93      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
94      # Create the Genome record and taxonomy information.      # Create the Genome record and taxonomy information.
95        Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
96      $loaderObject->CreateGenome();      $loaderObject->CreateGenome();
97      # Return the statistics.      # Return the statistics.
98      return $loaderObject->{stats};      return $loaderObject->{stats};
# Line 106  Line 128 
128      my ($sap, $genome) = @_;      my ($sap, $genome) = @_;
129      # Create the statistics object.      # Create the statistics object.
130      my $stats = Stats->new();      my $stats = Stats->new();
131      # Delete the DNA.      # Delete the DNA sequences.
132      DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');      my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
133        for my $seq (@seqs) {
134            my $delStats = $sap->Delete(DNASequence => $seq);
135            $stats->Accumulate($delStats);
136        }
137      # Delete the contigs.      # Delete the contigs.
138      DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
139      # Delete the features.      # Delete the features.
140      DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
141      # Delete the molecular machines.      # Delete the molecular machines.
142      DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
143      # Delete the genome itself.      # Delete the genome itself.
144      my $subStats = $sap->Delete(Genome => $genome);      my $subStats = $sap->Delete(Genome => $genome);
145      # Accumulate the statistics from the delete.      # Accumulate the statistics from the delete.
# Line 122  Line 148 
148      return $stats;      return $stats;
149  }  }
150    
151    
152    =head3 Process
153    
154        my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
155    
156    Load genome data from the specified directory. If the genome data already
157    exists in the database, it will be deleted first.
158    
159    =over 4
160    
161    =item sap
162    
163    L</Sapling> object for accessing the database.
164    
165    =item genome
166    
167    ID of the genome whose  data is being loaded.
168    
169    =item directory
170    
171    Name of the directory containing the genome data files.
172    
173    =item disconnected
174    
175    True if the application is disconnected from the network - do not
176    attempt to contact a SAP server for more data.
177    
178    =item RETURN
179    
180    Returns a statistics object describing the activity during the reload.
181    
182    =back
183    
184    =cut
185    
186    sub Process {
187        # Get the parameters.
188        my ($sap, $genome, $directory, $disconnected) = @_;
189        # Clear the existing data for the specified genome.
190        my $stats = ClearGenome($sap, $genome);
191        # Load the new expression data from the specified directory.
192        my $newStats = Load($sap, $genome, $directory, $disconnected);
193        # Merge the statistics.
194        $stats->Accumulate($newStats);
195        # Return the result.
196        return $stats;
197    }
198    
199    
200  =head2 Loader Object Methods  =head2 Loader Object Methods
201    
202  =head3 new  =head3 new
203    
204      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory, $disconnected);
205    
206  Create a loader object that can be used to facilitate loading genome data from a  Create a loader object that can be used to facilitate loading genome data from a
207  directory.  directory.
# Line 145  Line 220 
220    
221  Name of the directory containing the genome information.  Name of the directory containing the genome information.
222    
223    =item disconnected
224    
225    Set to a true value if the application should be considered to be disconnected
226    from the network - that is, do not attempt to connect to a Sapling server
227    to load subsystem data.
228    
229  =back  =back
230    
231  The object created contains the following fields.  The object created contains the following fields.
# Line 177  Line 258 
258    
259  sub new {  sub new {
260      # Get the parameters.      # Get the parameters.
261      my ($class, $sap, $genome, $directory) = @_;      my ($class, $sap, $genome, $directory, $disconnected) = @_;
262      # Create the object.      # Create the object.
263      my $retVal = {      my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
264          sap => $sap,      # Add our specialized data.
265          genome => $genome,      $retVal->{genome} = $genome;
266          directory => $directory,      $retVal->{directory} = $directory;
267          stats => Stats->new(qw(contigs dna pegs rnas)),      # Leave the assignment hash undefined until we populate it.
268          supportRecords => {}      $retVal->{assignHash} = undef;
269      };      $retVal->{disconnected} = defined($disconnected) ? 1 : 0;
270      # Bless and return it.      # Return the result.
     bless $retVal, $class;  
271      return $retVal;      return $retVal;
272  }  }
273    
# Line 196  Line 276 
276      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
277    
278  Load the contig information into the database. This includes the contigs themselves and  Load the contig information into the database. This includes the contigs themselves and
279  the DNA. The number of contigs will be recorded as the C<contigs> statistic and the  the DNA. The number of contigs will be recorded as the C<contigs> statistic, the
280  number of base pairs as the C<dna> statistic.  number of base pairs as the C<dna> statistic, and the number of GC instances as the
281    C<gc_content> statistic.
282    
283  =cut  =cut
284    
# Line 300  Line 381 
381      # Compute the chunk ID.      # Compute the chunk ID.
382      my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');      my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
383      # Connect this sequence to the contig.      # Connect this sequence to the contig.
384      $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunk);      $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
385      # Create the DNA sequence.      # Create the DNA sequence.
386      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
387      # Record the chunk.      # Record the chunk.
388      $self->{stats}->Add(chunks => 1);      $self->{stats}->Add(chunks => 1);
389        # Update the GC count.
390        $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//));
391  }  }
392    
393  =head3 OutputContig  =head3 OutputContig
# Line 355  Line 438 
438  sub LoadFeatures {  sub LoadFeatures {
439      # Get the parameters.      # Get the parameters.
440      my ($self) = @_;      my ($self) = @_;
441        # Read in the functional assignments.
442        Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
443        my $assignHash = $self->ReadAssignments();
444        $self->{assignHash} = $assignHash;
445      # Get the directory of feature types.      # Get the directory of feature types.
446      my $featureDir = "$self->{directory}/Features";      my $featureDir = "$self->{directory}/Features";
447      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
448        # Check for protein sequences. If we have some, load them into a hash.
449        my $protHash = {};
450        if (-f "$featureDir/peg/fasta") {
451            Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
452            $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
453        }
454      # Create the feature records for the types found.      # Create the feature records for the types found.
455      for my $type (@types) {      for my $type (@types) {
456          # Insure this is a genuine feature directory.          # Insure this is a genuine feature directory.
457          if (-f "$featureDir/$type/tbl") {          if (-f "$featureDir/$type/tbl") {
458              # Yes, load the feature data.              # Yes. Read in the evidence codes (if any).
459              $self->LoadFeatureData($featureDir, $type);              my $evHash = {};
460                my $tranFile = "$featureDir/$type/Attributes/transaction_log";
461                if (-f $tranFile) {
462                    $evHash = $self->LoadEvidenceCodes($tranFile);
463          }          }
464                # Now load the feature data.
465                $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
466      }      }
     # Check for protein sequences. If we have some, load them into the database.  
     if (-f "$featureDir/peg/fasta") {  
         $self->LoadProteinData("$featureDir/peg/fasta");  
467      }      }
468  }  }
469    
470    =head3 LoadEvidenceCodes
471    
472        my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
473    
474    Load the evidence codes from the specified attribute transaction log file into a
475    hash. The log file is in tab-delimited format. The first column contains the
476    transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
477    the third an attribute name (we'll ignore everything but C<evidence_code>), and
478    the fourth the attribute value.
479    
480    =over 4
481    
482    =item attributeFile
483    
484    Name of the attribute transaction log file.
485    
486    =item RETURN
487    
488    Returns a reference to a hash mapping each feature ID to a comma-delimited list
489    of its evidence codes.
490    
491    =back
492    
493    =cut
494    
495    sub LoadEvidenceCodes {
496        # Get the parameters.
497        my ($self, $attributeFile) = @_;
498        # Get the Sapling database.
499        my $sap = $self->{sap};
500        # Get the statistics object.
501        my $stats = $self->{stats};
502        # Get the assignment hash: we use this to filter the feature IDs.
503        my $assignHash = $self->{assignHash};
504        # Open the attribute log file for input.
505        my $ih = Open(undef, "<$attributeFile");
506        # This two-dimensional hash will hold the evidence codes for each feature.
507        my %retVal;
508        # Loop through the attribute log file.
509        while (! eof $ih) {
510            # Get the current attribute record.
511            my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
512            $stats->Add(attributeLine => 1);
513            # Insure we have all the pieces we need.
514            if (! $command || ! $fid || $key ne 'evidence_code') {
515                $stats->Add(attributeLineSkipped => 1);
516            } elsif (! $assignHash->{$fid}) {
517                # Here the attribute is for a deleted feature.
518                $stats->Add(attributeFidSkipped => 1);
519            } else {
520                # Get the sub-hash for this feature.
521                if (! exists $retVal{$fid}) {
522                    $retVal{$fid} = {};
523                }
524                my $featureSubHash = $retVal{$fid};
525                # Process according to the command.
526                if ($command eq 'ADD') {
527                    # Here we are adding a new evidence code.
528                    $featureSubHash->{$value} = 1;
529                    $stats->Add(attributeAdd => 1);
530                } elsif ($command eq 'DELETE') {
531                    # Here we are deleting an evidence code.
532                    delete $featureSubHash->{$value};
533                    $stats->Add(attributeDelete => 1);
534                } else {
535                    # here we have an unrecognized command.
536                    $stats->Add(attributeCommandSkip => 1);
537                }
538            }
539        }
540        # Loop through the hash, converting each sub-hash to a comma-delimited list of
541        # evidence codes.
542        for my $fid (keys %retVal) {
543            $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
544        }
545        # Return the evidence hash.
546        return \%retVal;
547    }
548    
549    
550  =head3 LoadFeatureData  =head3 LoadFeatureData
551    
552      $self->LoadFeatureData($featureDir, $type);      $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
553    
554  Load the basic data for each feature into the database. The number of features of  Load the basic data for each feature into the database. The number of features of
555  the type found will be recorded in the statistics object.  the type found will be recorded in the statistics object.
# Line 389  Line 564 
564    
565  Type of feature to load.  Type of feature to load.
566    
567    =item protHash
568    
569    Reference to a hash mapping each feature ID for a protein-encoding gene to
570    its protein sequence.
571    
572    =item evHash
573    
574    Reference to a hash mapping each feature ID to a comma-delimited list of
575    its evidence codes (if any).
576    
577  =back  =back
578    
579  =cut  =cut
580    
581  sub LoadFeatureData {  sub LoadFeatureData {
582      # Get the parameters.      # Get the parameters.
583      my ($self, $featureDir, $type) = @_;      my ($self, $featureDir, $type, $protHash, $evHash) = @_;
584      # Get the sapling database.      # Get the sapling database.
585      my $sap = $self->{sap};      my $sap = $self->{sap};
     # Get the maximum location  segment length. We'll need this later.  
     my $maxLength = $sap->TuningParameter('maxLocationLength');  
586      # Get the statistics object.      # Get the statistics object.
587      my $stats = $self->{stats};      my $stats = $self->{stats};
588      # Read in the functional assignments.      # Get the assignment hash. This tells us our functional assignments. This method is
589      my $assignHash = $self->ReadAssignments();      # also where we remove the deleted features from it.
590        my $assignHash = $self->{assignHash};
591      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
592      # time, it overwrites the original.      # time, it overwrites the original.
593      my %fids;      my %fidHash;
594        # Finally, we need the timestamp hash. The initial feature population
595      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
596      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
597        my %deleted_features;
598      if (-f $fileName) {      if (-f $fileName) {
599          # We have one, so open it for input.          # We have one, so we can read through it. First, however, we need to get the list
600            # of deleted features and remove them from the assignment hash. This insures
601            # that they are not used by subsequent methods.
602            my $deleteFile = "$featureDir/$type/deleted.features";
603            if (-f $deleteFile) {
604                my $dh = Open(undef, "<$deleteFile");
605                while (! eof $dh) {
606                    my ($deletedFid) = Tracer::GetLine($dh);
607                    if (exists $assignHash->{$deletedFid}) {
608                        delete $assignHash->{$deletedFid};
609                        $stats->Add(deletedFid => 1);
610                        $deleted_features{$deletedFid} = 1;
611                    }
612                }
613            }
614            # Open the main file for input.
615            Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
616          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
617          while (! eof $ih) {          while (! eof $ih) {
618              # Read this feature's information.              # Read this feature's information.
619              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
620              # If the feature already exists, delete it.              # Only proceed if the feature is NOT deleted.
621              if (exists $fids{$fid}) {              if (!$deleted_features{$fid}) {
622                    # If the feature already exists, delete it. (This should be extremely rare.)
623                    if ($fidHash{$fid}) {
624                  $sap->Delete(Feature => $fid);                  $sap->Delete(Feature => $fid);
625                  $stats->Add(duplicateFid => 1);                  $stats->Add(duplicateFid => 1);
             } else {  
                 # Otherwise connect it to the genome.  
                 $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);  
             }  
             # Now we must parse the locations. This will contain a list of the location  
             # data 4-tuples (contig, start, dir, len).  
             my @locData;  
             # This will contain the total sequence length.  
             my $length = 0;  
             # Loop through the locations.  
             for my $loc (split /\s*,\s*/, $locations) {  
                 # Get this location's components.  
                 my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);  
                 # Normalize the contig.  
                 $self->FixContig(\$contig);  
                 # Compute the direction.  
                 if ($start <= $stop) {  
                     # Here we have the forward strand.  
                     my $len = $stop + 1 - $start;  
                     push @locData, [$contig, $start, '+', $len];  
                     $length += $len;  
                 } else {  
                     # Here we have the reverse strand.  
                     my $len = $start + 1 - $stop;  
                     push @locData, [$contig, $stop, '-', $len];  
                     $length += $len;  
                 }  
             }  
             # Now we can create the feature record.  
             $sap->InsertObject('Feature', id => $fid, feature_type => $type,  
                                function => $assignHash->{$fid}, locked => 0,  
                                sequence_length => $length);  
             $fids{$fid} = 1;  
             $stats->Add($type => 1);  
             # The next step is to connect the feature record to its locations. This  
             # involves dividing the location into segments. The following variable  
             # will count the total number of segments.  
             my $segment = 0;  
             # Loop through the locations.  
             for my $loc (@locData) {  
                 # Get the current location's information.  
                 my ($contig, $left, $dir, $len) = @$loc;  
                 # Peel off any segments.  
                 while ($len > $maxLength) {  
                     # Process according to the direction.  
                     if ($dir eq '+') {  
                         # Forward strand: peel from the beginning.  
                         $self->ConnectLocation($fid, $contig, $segment, $left, $dir,  
                                                $maxLength);  
                         $left += $maxLength;  
                     } else {  
                         # Reverse strand: peel from the end.  
                         $self->ConnectLocation($fid, $contig, $segment,  
                                               $left + $len - $maxLength, $dir,  
                                               $maxLength);  
                     }  
                     # Denote we've used up a segment.  
                     $len -= $maxLength;  
                     $segment++;  
626                  }                  }
627                  # Output the last segment.                  # If this is RNA, the alias list is always empty. Sometimes, the functional
628                  $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);                  # assignment is found there.
629                    if ($type eq 'rna') {
630                        if (! $assignHash->{$fid}) {
631                            $assignHash->{$fid} = $aliases[0];
632                        }
633                        @aliases = ();
634                    }
635                    # Add the feature to the database.
636                    my $function = $assignHash->{$fid} // "";
637                    $self->AddFeature($fid, $function, $locations, \@aliases,
638                                      $protHash->{$fid}, $evHash->{$fid});
639                    # Denote we've added this feature, so that if a duplicate occurs we're ready.
640                    $fidHash{$fid} = 1;
641              }              }
642          }          }
643      }      }
644  }  }
645    
646    
647  =head3 LoadProteinData  =head3 LoadProteinData
648    
649      $self->LoadProteinData($fileName);      my $protHash = $self->LoadProteinData($fileName);
650    
651  Load the protein sequences from the named FASTA file. The sequences will be stored  Load the protein sequences from the named FASTA file. The sequences will be stored
652  in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature  in a hash by FIG feature ID.
 records themselves will not be created (it is presumed they are already there).  
653    
654  =over 4  =over 4
655    
# Line 500  Line 657 
657    
658  Name of the FASTA file containing the protein sequences for this genome.  Name of the FASTA file containing the protein sequences for this genome.
659    
660    =item RETURN
661    
662    Returns a hash mapping feature IDs to protein sequences.
663    
664  =back  =back
665    
666  =cut  =cut
# Line 509  Line 670 
670      my ($self, $fileName) = @_;      my ($self, $fileName) = @_;
671      # Open the FASTA file for input.      # Open the FASTA file for input.
672      my $ih = Open(undef, "<$fileName");      my $ih = Open(undef, "<$fileName");
673        # Create the return hash.
674        my $retVal = {};
675      # We'll track the current protein in here.      # We'll track the current protein in here.
676      my $fid;      my $fid;
677      my $sequence = "";      my $sequence = "";
# Line 522  Line 685 
685              my $newFid = $1;              my $newFid = $1;
686              # Do we have an existing protein?              # Do we have an existing protein?
687              if (defined $fid) {              if (defined $fid) {
688                  # Yes. Write it out.                  # Yes. Store it in the hash.
689                  $self->WriteProtein($fid, $sequence);                  $retVal->{$fid} = $sequence;
690              }              }
691              # Initialize for the next protein.              # Initialize for the next protein.
692              $fid = $newFid;              $fid = $newFid;
# Line 535  Line 698 
698      }      }
699      # Do we have a residual protein.      # Do we have a residual protein.
700      if (defined $fid) {      if (defined $fid) {
701          # Yes. Write it out.          # Yes. Store it in the hash.
702          $self->WriteProtein($fid, $sequence);          $retVal->{$fid} = $sequence;
703      }      }
704        # Return the hash.
705        return $retVal;
706  }  }
707    
708    
709    =head3 LoadAnnotations
710    
711        $loaderObject->LoadAnnotations($fileName);
712    
713    Read in the annotation history information and use it to create annotation records.
714    
715    =over 4
716    
717    =item fileName
718    
719    Name of the annotation history file. This file is formatted with four fields per
720    record. Each field is on a separate line, with a double slash (C<//>) used as the
721    line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
722    (formatted as an integer), (2) the user name, and (3) the annotation text.
723    
724    =back
725    
726    =cut
727    
728    sub LoadAnnotations {
729        # Get the parameters.
730        my ($self, $fileName) = @_;
731        # Get the assignment Hash. We use this to filter out deleted features.
732        my $assignHash = $self->{assignHash};
733        # Get the Sapling database.
734        my $sap = $self->{sap};
735        # Get the statistics object.
736        my $stats = $self->{stats};
737        # Open the input file.
738        my $ih = Tracer::Open(undef, "<$fileName");
739        # Loop through the input.
740        while (! eof $ih) {
741            # Read in the peg, timestamp, and user ID.
742            my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
743            # Only proceed if the feature is not deleted.
744            if ($assignHash->{$fid}) {
745                # Add the annotation to this feature.
746                $self->MakeAnnotation($fid, $text, $user, $timestamp);
747            }
748        }
749    }
750    
751    
752  =head3 WriteProtein  =head3 WriteProtein
753    
754      $loaderObject->WriteProtein($fid, $sequence);      $loaderObject->WriteProtein($fid, $sequence);
# Line 565  Line 774 
774      # Get the parameters.      # Get the parameters.
775      my ($self, $fid, $sequence) = @_;      my ($self, $fid, $sequence) = @_;
776      # Compute the key of the protein sequence.      # Compute the key of the protein sequence.
777      my $protID = ERDB::DigestKey($sequence);      my $protID = $self->{sap}->ProteinID($sequence);
778      # Insure the protein exists.      # Insure the protein exists.
779      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
780      # Connect the feature to it.      # Connect the feature to it.
781      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
782  }  }
783    
 =head3 InsureEntity  
   
     my $createdFlag = $loaderObject->InsureEntity($entityType => $id, %fields);  
   
 Insure that the specified record exists in the database. If no record is found of the  
 specified type with the specified ID, one will be created with the indicated fields.  
   
 =over 4  
   
 =item $entityType  
   
 Type of entity to check.  
   
 =item id  
   
 ID of the entity instance in question.  
   
 =item fields  
   
 Hash mapping field names to values for all the fields in the desired entity record except  
 for the ID.  
   
 =item RETURN  
   
 Returns TRUE if a new object was created, FALSE if it already existed.  
   
 =back  
   
 =cut  
   
 sub InsureEntity {  
     # Get the parameters.  
     my ($self, $entityType, $id, %fields) = @_;  
     # Get the database.  
     my $sap = $self->{sap};  
     # Get the support record ID hash.  
     my $supportHash = $self->{supportRecords};  
     # Denote we haven't created a new record.  
     my $retVal = 0;  
     # Get the sub-hash for this entity type.  
     my $entityHash = $supportHash->{$entityType};  
     if (! defined $entityHash) {  
         $entityHash = {};  
         $supportHash->{$entityType} = $entityHash;  
     }  
     # Check for this instance.  
     if (! $entityHash->{$id}) {  
         # It's not found. Check the database.  
         if (! $sap->Exists($entityType => $id)) {  
             # It's not in the database either, so create it.  
             $sap->InsertObject($entityType, id => $id, %fields);  
             $self->{stats}->Add(insertSupport => 1);  
             $retVal = 1;  
         }  
         # Mark the record in the hash so we know we have it.  
         $entityHash->{$id} = 1;  
     }  
     # Return the insertion indicator.  
     return $retVal;  
 }  
   
784  =head3 LoadSubsystems  =head3 LoadSubsystems
785    
786      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
# Line 645  Line 793 
793  sub LoadSubsystems {  sub LoadSubsystems {
794      # Get the parameters.      # Get the parameters.
795      my ($self) = @_;      my ($self) = @_;
796    
797        #
798        # If we are running in disconnected mode, do not actually load subsystems.
799        # They rely too much on information from the external sapling.
800        #
801        if ($self->{disconnected})
802        {
803            return;
804        }
805    
806      # Get the sapling object.      # Get the sapling object.
807      my $sap = $self->{sap};      my $sap = $self->{sap};
808      # Get the statistics object.      # Get the statistics object.
# Line 672  Line 830 
830      while (! eof $ih) {      while (! eof $ih) {
831          # Get this subsystem.          # Get this subsystem.
832          my ($subsystem, $variant) = Tracer::GetLine($ih);          my ($subsystem, $variant) = Tracer::GetLine($ih);
833            Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
834          # Normalize the subsystem name.          # Normalize the subsystem name.
835          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
836          # Compute this subsystem's MD5.          # Compute this subsystem's MD5.
# Line 688  Line 847 
847              if (@$roleList > 0) {              if (@$roleList > 0) {
848                  # Get the subsystem information from the first role and create the subsystem.                  # Get the subsystem information from the first role and create the subsystem.
849                  my $roleH = $roleList->[0];                  my $roleH = $roleList->[0];
850                  my %subFields = ExtractFields(Subsystem => $roleH);                  my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
851                  $sap->InsertObject('Subsystem', %subFields);                  $sap->InsertObject('Subsystem', %subFields);
852                    $stats->Add(subsystems => 1);
853                  # Now loop through the roles. The Includes records are always inserted, but the                  # Now loop through the roles. The Includes records are always inserted, but the
854                  # roles are only inserted if they don't already exist.                  # roles are only inserted if they don't already exist.
855                  for $roleH (@$roleList) {                  for $roleH (@$roleList) {
856                      # Create the Includes record.                      # Create the Includes record.
857                      my %incFields = ExtractFields(Includes => $roleH);                      my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);
858                      $sap->InsertObject('Includes', %incFields);                      $sap->InsertObject('Includes', %incFields);
859                      # Insure we have the role in place.                      # Insure we have the role in place.
860                      my %roleFields = ExtractFields(Role => $roleH);                      my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);
861                      my $roleID = $roleFields{id};                      my $roleID = $roleFields{id};
862                      delete $roleFields{id};                      delete $roleFields{id};
863                      $self->InsureEntity('Role', $roleID, %roleFields);                      $self->InsureEntity('Role', $roleID, %roleFields);
864                      # Compute the machine-role ID.                      # Compute the machine-role ID.
865                      my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});                      my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
866                      $machineRoles{$subsystem}{$roleID} = $machineRoleID;                      $machineRoles{$subsystem}{$roleID} = $machineRoleID;
867                        $stats->Add(subsystemRoles => 1);
868                  }                  }
869              }              }
870          } else {          } else {
# Line 725  Line 886 
886          $variantCode =~ s/\*+$//;          $variantCode =~ s/\*+$//;
887          # Compute the variant key.          # Compute the variant key.
888          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
889            # Insure we have it in the database.
890            if (! $sap->Exists(Variant => $variantKey)) {
891          # Get the variant from the sapling server.          # Get the variant from the sapling server.
892          my $variantH = $sapO->get(-objects => "Variant",          my $variantH = $sapO->get(-objects => "Variant",
893                                    -filter => {"Variant(id)" => ["=", $variantKey]},                                    -filter => {"Variant(id)" => ["=", $variantKey]},
# Line 733  Line 896 
896                                                "Variant(type)" => "type",                                                "Variant(type)" => "type",
897                                                "Variant(role-rule)" => "role-rule"},                                                "Variant(role-rule)" => "role-rule"},
898                                    -firstOnly => 1);                                    -firstOnly => 1);
         # Insure we have it in the database.  
899          $self->InsureEntity('Variant', $variantKey, %$variantH);          $self->InsureEntity('Variant', $variantKey, %$variantH);
         $stats->Add(subsystems => 1);  
900          # Connect it to the subsystem.          # Connect it to the subsystem.
901          $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);          $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
902                $stats->Add(subsystemVariants => 1);
903            }
904          # Now we create the molecular machine connecting this genome to the subsystem          # Now we create the molecular machine connecting this genome to the subsystem
905          # variant.          # variant.
906          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
# Line 828  Line 991 
991      $fields{'dna-size'} = $stats->Ask('dna');      $fields{'dna-size'} = $stats->Ask('dna');
992      $fields{pegs} = $stats->Ask('peg');      $fields{pegs} = $stats->Ask('peg');
993      $fields{rnas} = $stats->Ask('rna');      $fields{rnas} = $stats->Ask('rna');
994        $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna');
995      # Get the genetic code. The default is 11.      # Get the genetic code. The default is 11.
996      $fields{'genetic-code'} = 11;      $fields{'genetic-code'} = 11;
997      my $geneticCodeFile = "$dir/GENETIC_CODE";      my $geneticCodeFile = "$dir/GENETIC_CODE";
# Line 937  Line 1101 
1101    
1102  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1103    
1104  =head3 DeleteRelatedRecords  =head3 ReadAnnotation
1105    
1106      DeleteRelatedRecords($sap, $genome, $stats, $relName, $entityName);      my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1107    
1108  Delete all the records in the named entity and relationship relating to the  Read the next record from an annotation file. The next record must exist (that is, an
1109  specified genome and roll up the statistics in the specified statistics object.  end-of-file check should have been performed before calling this method).
1110    
1111  =over 4  =over 4
1112    
1113  =item sap  =item ih
   
 L<Sapling> object for accessing the database.  
   
 =item genome  
   
 ID of the relevant genome.  
   
 =item stats  
   
 L<Stats> object for tracking the delete activity.  
   
 =item relName  
   
 Name of a relationship from the B<Genome> table.  
   
 =item entityName  
1114    
1115  Name of the entity on the other side of the relationship.  Open file handle for the annotation file.
   
 =back  
   
 =cut  
   
 sub DeleteRelatedRecords {  
     # Get the parameters.  
     my ($sap, $genome, $stats, $relName, $entityName) = @_;  
     # Get all the relationship records.  
     my (@targets) = $sap->GetFlat($relName, "$relName(from-link) = ?", [$genome],  
                                   "to-link");  
     # Loop through the relationship records, deleting them and the target entity  
     # records.  
     for my $target (@targets) {  
         # Delete the relationship instance.  
         $sap->DeleteRow($relName, $genome, $target);  
         $stats->Add($relName => 1);  
         # Delete the entity instance.  
         my $subStats = $sap->Delete($entityName, $target);  
         # Roll up the statistics.  
         $stats->Accumulate($subStats);  
     }  
 }  
   
 =head3 ExtractFields  
   
     my %fieldHash = SaplingGenomeLoader::ExtractFields($tableName, $dataHash);  
   
 Extract from the incoming hash the field names and values from the specified table.  
   
 =over 4  
   
 =item tableName  
   
 Name of the table whose field names and values are desired.  
   
 =item dataHash  
   
 Reference to a hash mapping fully-qualified ERDB field names to values.  
1116    
1117  =item RETURN  =item RETURN
1118    
1119  Returns a hash containing only the fields from the specified table and their values.  Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1120    timestamp, (2) the user ID, and (3) the annotation text.
1121    
1122  =back  =back
1123    
1124  =cut  =cut
1125    
1126  sub ExtractFields {  sub ReadAnnotation {
1127      # Get the parameters.      # Get the parameter.
1128      my ($tableName, $dataHash) = @_;      my ($ih) = @_;
1129      # Declare the return variable.      # Read the three fixed fields.
1130      my %retVal;      my $fid = <$ih>; chomp $fid;
1131      # Extract the desired fields.      my $timestamp = <$ih>; chomp $timestamp;
1132      for my $field (keys %$dataHash) {      my $user = <$ih>; chomp $user;
1133          # Is this a field for the specified table?      # Loop through the lines of the text field.
1134          if ($field =~ /^$tableName\(([^)]+)/) {      my $text = "";
1135              # Yes, put it in the output hash.      my $line = <$ih>;
1136              $retVal{$1} = $dataHash->{$field};      while (defined($line) && $line ne "//\n") {
1137          }          $text .= $line;
1138      }          $line = <$ih>;
1139      # Return the computed hash.      }
1140      return %retVal;      # Remove the trailing new-line from the text.
1141        chomp $text;
1142        # Return the fields.
1143        return ($fid, $timestamp, $user, $text);
1144  }  }
1145    
1146  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.16

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3