[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Tue Jan 11 15:04:03 2011 UTC revision 1.15, Mon Dec 5 22:05:15 2011 UTC
# Line 26  Line 26 
26      use SAPserver;      use SAPserver;
27      use Sapling;      use Sapling;
28      use AliasAnalysis;      use AliasAnalysis;
29        use base qw(SaplingDataLoader);
30    
31  =head1 Sapling Genome Loader  =head1 Sapling Genome Loader
32    
# Line 56  Line 57 
57    
58  Name of the directory containing the genome information.  Name of the directory containing the genome information.
59    
60    =item disconnected
61    
62    True if the application is disconnected from the network - do not
63    attempt to contact a SAP server for more data.
64    
65    =item assignHash
66    
67    Hash of feature IDs to functional assignments. Deleted features are removed, which
68    means only features listed in this hash can be legally inserted into the database.
69    
70  =back  =back
71    
72  =cut  =cut
73    
74  sub Load {  sub Load {
75      # Get the parameters.      # Get the parameters.
76      my ($sap, $genome, $directory) = @_;      my ($sap, $genome, $directory, $disconnected) = @_;
77      # Create the loader object.      # Create the loader object.
78      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory, $disconnected);
79      # Load the contigs.      # Load the contigs.
80      Trace("Loading contigs for $genome.") if T(2);      Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
81      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
82      # Load the features.      # Load the features.
83      Trace("Loading features for $genome.") if T(2);      Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
84      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
85        # Check for annotation history. If we have it, load the history records into the
86        # database.
87        if (-f "$directory/annotations") {
88            Trace("Processing annotations.") if T(SaplingDataLoader => 3);
89            $loaderObject->LoadAnnotations("$directory/annotations");
90        }
91      # Load the subsystem bindings.      # Load the subsystem bindings.
92      Trace("Loading subsystems for $genome.") if T(2);      Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
93      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
94      # Create the Genome record and taxonomy information.      # Create the Genome record and taxonomy information.
95      Trace("Creating root for $genome.") if T(2);      Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
96      $loaderObject->CreateGenome();      $loaderObject->CreateGenome();
97      # Return the statistics.      # Return the statistics.
98      return $loaderObject->{stats};      return $loaderObject->{stats};
# Line 111  Line 128 
128      my ($sap, $genome) = @_;      my ($sap, $genome) = @_;
129      # Create the statistics object.      # Create the statistics object.
130      my $stats = Stats->new();      my $stats = Stats->new();
131      # Delete the DNA.      # Delete the DNA sequences.
132      DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');      my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
133        for my $seq (@seqs) {
134            my $delStats = $sap->Delete(DNASequence => $seq);
135            $stats->Accumulate($delStats);
136        }
137      # Delete the contigs.      # Delete the contigs.
138      DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
139      # Delete the features.      # Delete the features.
140      DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
141      # Delete the molecular machines.      # Delete the molecular machines.
142      DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
143      # Delete the genome itself.      # Delete the genome itself.
144      my $subStats = $sap->Delete(Genome => $genome);      my $subStats = $sap->Delete(Genome => $genome);
145      # Accumulate the statistics from the delete.      # Accumulate the statistics from the delete.
# Line 127  Line 148 
148      return $stats;      return $stats;
149  }  }
150    
151    
152    =head3 Process
153    
154        my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
155    
156    Load genome data from the specified directory. If the genome data already
157    exists in the database, it will be deleted first.
158    
159    =over 4
160    
161    =item sap
162    
163    L</Sapling> object for accessing the database.
164    
165    =item genome
166    
167    ID of the genome whose  data is being loaded.
168    
169    =item directory
170    
171    Name of the directory containing the genome data files.
172    
173    =item RETURN
174    
175    Returns a statistics object describing the activity during the reload.
176    
177    =back
178    
179    =cut
180    
181    sub Process {
182        # Get the parameters.
183        my ($sap, $genome, $directory) = @_;
184        # Clear the existing data for the specified genome.
185        my $stats = ClearGenome($sap, $genome);
186        # Load the new expression data from the specified directory.
187        my $newStats = Load($sap, $genome, $directory);
188        # Merge the statistics.
189        $stats->Accumulate($newStats);
190        # Return the result.
191        return $stats;
192    }
193    
194    
195  =head2 Loader Object Methods  =head2 Loader Object Methods
196    
197  =head3 new  =head3 new
198    
199      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory, $disconnected);
200    
201  Create a loader object that can be used to facilitate loading genome data from a  Create a loader object that can be used to facilitate loading genome data from a
202  directory.  directory.
# Line 150  Line 215 
215    
216  Name of the directory containing the genome information.  Name of the directory containing the genome information.
217    
218    =item disconnected
219    
220    Set to a true value if the application should be considered to be disconnected
221    from the network - that is, do not attempt to connect to a Sapling server
222    to load subsystem data.
223    
224  =back  =back
225    
226  The object created contains the following fields.  The object created contains the following fields.
# Line 182  Line 253 
253    
254  sub new {  sub new {
255      # Get the parameters.      # Get the parameters.
256      my ($class, $sap, $genome, $directory) = @_;      my ($class, $sap, $genome, $directory, $disconnected) = @_;
257      # Create the object.      # Create the object.
258      my $retVal = {      my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
259          sap => $sap,      # Add our specialized data.
260          genome => $genome,      $retVal->{genome} = $genome;
261          directory => $directory,      $retVal->{directory} = $directory;
262          stats => Stats->new(qw(contigs dna pegs rnas)),      # Leave the assignment hash undefined until we populate it.
263          supportRecords => {}      $retVal->{assignHash} = undef;
264      };      $retVal->{disconnected} = defined($disconnected) ? 1 : 0;
265      # Bless and return it.      # Return the result.
     bless $retVal, $class;  
266      return $retVal;      return $retVal;
267  }  }
268    
# Line 201  Line 271 
271      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
272    
273  Load the contig information into the database. This includes the contigs themselves and  Load the contig information into the database. This includes the contigs themselves and
274  the DNA. The number of contigs will be recorded as the C<contigs> statistic and the  the DNA. The number of contigs will be recorded as the C<contigs> statistic, the
275  number of base pairs as the C<dna> statistic.  number of base pairs as the C<dna> statistic, and the number of GC instances as the
276    C<gc_content> statistic.
277    
278  =cut  =cut
279    
# Line 310  Line 381 
381      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
382      # Record the chunk.      # Record the chunk.
383      $self->{stats}->Add(chunks => 1);      $self->{stats}->Add(chunks => 1);
384        # Update the GC count.
385        $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//));
386  }  }
387    
388  =head3 OutputContig  =head3 OutputContig
# Line 360  Line 433 
433  sub LoadFeatures {  sub LoadFeatures {
434      # Get the parameters.      # Get the parameters.
435      my ($self) = @_;      my ($self) = @_;
436        # Read in the functional assignments.
437        Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
438        my $assignHash = $self->ReadAssignments();
439        $self->{assignHash} = $assignHash;
440      # Get the directory of feature types.      # Get the directory of feature types.
441      my $featureDir = "$self->{directory}/Features";      my $featureDir = "$self->{directory}/Features";
442      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
443        # Check for protein sequences. If we have some, load them into a hash.
444        my $protHash = {};
445        if (-f "$featureDir/peg/fasta") {
446            Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
447            $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
448        }
449      # Create the feature records for the types found.      # Create the feature records for the types found.
450      for my $type (@types) {      for my $type (@types) {
451          # Insure this is a genuine feature directory.          # Insure this is a genuine feature directory.
452          if (-f "$featureDir/$type/tbl") {          if (-f "$featureDir/$type/tbl") {
453              # Yes, load the feature data.              # Yes. Read in the evidence codes (if any).
454              $self->LoadFeatureData($featureDir, $type);              my $evHash = {};
455                my $tranFile = "$featureDir/$type/Attributes/transaction_log";
456                if (-f $tranFile) {
457                    $evHash = $self->LoadEvidenceCodes($tranFile);
458          }          }
459                # Now load the feature data.
460                $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
461      }      }
     # Check for protein sequences. If we have some, load them into the database.  
     if (-f "$featureDir/peg/fasta") {  
         $self->LoadProteinData("$featureDir/peg/fasta");  
462      }      }
463  }  }
464    
465    =head3 LoadEvidenceCodes
466    
467        my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
468    
469    Load the evidence codes from the specified attribute transaction log file into a
470    hash. The log file is in tab-delimited format. The first column contains the
471    transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
472    the third an attribute name (we'll ignore everything but C<evidence_code>), and
473    the fourth the attribute value.
474    
475    =over 4
476    
477    =item attributeFile
478    
479    Name of the attribute transaction log file.
480    
481    =item RETURN
482    
483    Returns a reference to a hash mapping each feature ID to a comma-delimited list
484    of its evidence codes.
485    
486    =back
487    
488    =cut
489    
490    sub LoadEvidenceCodes {
491        # Get the parameters.
492        my ($self, $attributeFile) = @_;
493        # Get the Sapling database.
494        my $sap = $self->{sap};
495        # Get the statistics object.
496        my $stats = $self->{stats};
497        # Get the assignment hash: we use this to filter the feature IDs.
498        my $assignHash = $self->{assignHash};
499        # Open the attribute log file for input.
500        my $ih = Open(undef, "<$attributeFile");
501        # This two-dimensional hash will hold the evidence codes for each feature.
502        my %retVal;
503        # Loop through the attribute log file.
504        while (! eof $ih) {
505            # Get the current attribute record.
506            my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
507            $stats->Add(attributeLine => 1);
508            # Insure we have all the pieces we need.
509            if (! $command || ! $fid || $key ne 'evidence_code') {
510                $stats->Add(attributeLineSkipped => 1);
511            } elsif (! $assignHash->{$fid}) {
512                # Here the attribute is for a deleted feature.
513                $stats->Add(attributeFidSkipped => 1);
514            } else {
515                # Get the sub-hash for this feature.
516                if (! exists $retVal{$fid}) {
517                    $retVal{$fid} = {};
518                }
519                my $featureSubHash = $retVal{$fid};
520                # Process according to the command.
521                if ($command eq 'ADD') {
522                    # Here we are adding a new evidence code.
523                    $featureSubHash->{$value} = 1;
524                    $stats->Add(attributeAdd => 1);
525                } elsif ($command eq 'DELETE') {
526                    # Here we are deleting an evidence code.
527                    delete $featureSubHash->{$value};
528                    $stats->Add(attributeDelete => 1);
529                } else {
530                    # here we have an unrecognized command.
531                    $stats->Add(attributeCommandSkip => 1);
532                }
533            }
534        }
535        # Loop through the hash, converting each sub-hash to a comma-delimited list of
536        # evidence codes.
537        for my $fid (keys %retVal) {
538            $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
539        }
540        # Return the evidence hash.
541        return \%retVal;
542    }
543    
544    
545  =head3 LoadFeatureData  =head3 LoadFeatureData
546    
547      $self->LoadFeatureData($featureDir, $type);      $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
548    
549  Load the basic data for each feature into the database. The number of features of  Load the basic data for each feature into the database. The number of features of
550  the type found will be recorded in the statistics object.  the type found will be recorded in the statistics object.
# Line 394  Line 559 
559    
560  Type of feature to load.  Type of feature to load.
561    
562    =item protHash
563    
564    Reference to a hash mapping each feature ID for a protein-encoding gene to
565    its protein sequence.
566    
567    =item evHash
568    
569    Reference to a hash mapping each feature ID to a comma-delimited list of
570    its evidence codes (if any).
571    
572  =back  =back
573    
574  =cut  =cut
575    
576  sub LoadFeatureData {  sub LoadFeatureData {
577      # Get the parameters.      # Get the parameters.
578      my ($self, $featureDir, $type) = @_;      my ($self, $featureDir, $type, $protHash, $evHash) = @_;
579      # Get the sapling database.      # Get the sapling database.
580      my $sap = $self->{sap};      my $sap = $self->{sap};
     # Get the maximum location  segment length. We'll need this later.  
     my $maxLength = $sap->TuningParameter('maxLocationLength');  
581      # Get the statistics object.      # Get the statistics object.
582      my $stats = $self->{stats};      my $stats = $self->{stats};
583      # Read in the functional assignments.      # Get the assignment hash. This tells us our functional assignments. This method is
584      my $assignHash = $self->ReadAssignments();      # also where we remove the deleted features from it.
585        my $assignHash = $self->{assignHash};
586      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
587      # time, it overwrites the original.      # time, it overwrites the original.
588      my %fids;      my %fidHash;
589        # Finally, we need the timestamp hash. The initial feature population
590      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
591      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
592        my %deleted_features;
593      if (-f $fileName) {      if (-f $fileName) {
594          # We have one, so open it for input.          # We have one, so we can read through it. First, however, we need to get the list
595            # of deleted features and remove them from the assignment hash. This insures
596            # that they are not used by subsequent methods.
597            my $deleteFile = "$featureDir/$type/deleted.features";
598            if (-f $deleteFile) {
599                my $dh = Open(undef, "<$deleteFile");
600                while (! eof $dh) {
601                    my ($deletedFid) = Tracer::GetLine($dh);
602                    if (exists $assignHash->{$deletedFid}) {
603                        delete $assignHash->{$deletedFid};
604                        $stats->Add(deletedFid => 1);
605                        $deleted_features{$deletedFid} = 1;
606                    }
607                }
608            }
609            # Open the main file for input.
610            Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
611          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
612          while (! eof $ih) {          while (! eof $ih) {
613              # Read this feature's information.              # Read this feature's information.
614              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
615              # If the feature already exists, delete it.              # Only proceed if the feature is NOT deleted.
616              if (exists $fids{$fid}) {              if (!$deleted_features{$fid}) {
617                    # If the feature already exists, delete it. (This should be extremely rare.)
618                    if ($fidHash{$fid}) {
619                  $sap->Delete(Feature => $fid);                  $sap->Delete(Feature => $fid);
620                  $stats->Add(duplicateFid => 1);                  $stats->Add(duplicateFid => 1);
621              }              }
622              # Otherwise connect this feature to the genome.                  # If this is RNA, the alias list is always empty. Sometimes, the functional
623              $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);                  # assignment is found there.
624              # Now we must parse the locations. This will contain a list of the location                  if ($type eq 'rna') {
625              # data 4-tuples (contig, start, dir, len).                      if (! $assignHash->{$fid}) {
626              my @locData;                          $assignHash->{$fid} = $aliases[0];
627              # This will contain the total sequence length.                      }
628              my $length = 0;                      @aliases = ();
629              # Loop through the locations.                  }
630              for my $loc (split /\s*,\s*/, $locations) {                  # Add the feature to the database.
631                  # Get this location's components.                  $self->AddFeature($fid, $assignHash->{$fid}, $locations, \@aliases,
632                  my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);                                    $protHash->{$fid}, $evHash->{$fid});
633                  # Normalize the contig.                  # Denote we've added this feature, so that if a duplicate occurs we're ready.
634                  $self->FixContig(\$contig);                  $fidHash{$fid} = 1;
                 # Compute the direction.  
                 if ($start <= $stop) {  
                     # Here we have the forward strand.  
                     my $len = $stop + 1 - $start;  
                     push @locData, [$contig, $start, '+', $len];  
                     $length += $len;  
                 } else {  
                     # Here we have the reverse strand.  
                     my $len = $start + 1 - $stop;  
                     push @locData, [$contig, $stop, '-', $len];  
                     $length += $len;  
                 }  
             }  
             # Now we can create the feature record.  
             $sap->InsertObject('Feature', id => $fid, feature_type => $type,  
                                function => $assignHash->{$fid}, locked => 0,  
                                sequence_length => $length);  
             $fids{$fid} = 1;  
             $stats->Add($type => 1);  
             # The next step is to connect the feature record to its locations. This  
             # involves dividing the location into segments. The following variable  
             # will count the total number of segments.  
             my $segment = 0;  
             # Loop through the locations.  
             for my $loc (@locData) {  
                 # Get the current location's information.  
                 my ($contig, $left, $dir, $len) = @$loc;  
                 # Peel off any segments.  
                 while ($len > $maxLength) {  
                     # Process according to the direction.  
                     if ($dir eq '+') {  
                         # Forward strand: peel from the beginning.  
                         $self->ConnectLocation($fid, $contig, $segment, $left, $dir,  
                                                $maxLength);  
                         $left += $maxLength;  
                     } else {  
                         # Reverse strand: peel from the end.  
                         $self->ConnectLocation($fid, $contig, $segment,  
                                               $left + $len - $maxLength, $dir,  
                                               $maxLength);  
                     }  
                     # Denote we've used up a segment.  
                     $len -= $maxLength;  
                     $segment++;  
                 }  
                 # Output the last segment.  
                 $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);  
             }  
             # Now we process the aliases and create the identifiers. We don't do this  
             # for RNA, because the RNA function is stored in the aliases.  
             if ($type ne 'rna') {  
                 for my $alias (@aliases) {  
                     my $normalized;  
                     # Determine the type.  
                     my $aliasType = AliasAnalysis::TypeOf($alias);  
                     $stats->Add(aliasAll => 1);  
                     # Is this a recognized type?  
                     if ($aliasType) {  
                         $stats->Add(aliasNormal => 1);  
                         # Yes. Write it normally.  
                         $self->CreateIdentifier($alias, B => $aliasType, $fid);  
                     } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {  
                         # No, but this is a specially-marked locus tag.  
                         $normalized = $1;  
                         $stats->Add(aliasLocus => 1);  
                         $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                     } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {  
                         # No, but this is a natural locus tag.  
                         $stats->Add(aliasLocus => 1);  
                         $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                     } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {  
                         # No, but this is a natural gene name.  
                         $stats->Add(aliasGene => 1);  
                         $self->CreateIdentifier($normalized, B => 'GENE', $fid);  
                     } elsif ($alias =~ /^\d+$/) {  
                         # Here it's a naked number, which means it's a GI number  
                         # of some sort.  
                         $stats->Add(aliasGI => 1);  
                         $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);  
                     } elsif ($alias =~ /^protein_id\|(.+)/) {  
                         # Here we have a REFSEQ protein ID. Right now we don't have a way to  
                         # handle that, because we don't know the feature's protein ID here.  
                         $stats->Add(aliasProtein => 1);  
                     } elsif ($alias =~ /[:|]/) {  
                         # Here it's an alias of an unknown type, so we skip it.  
                         $stats->Add(aliasUnknown => 1);  
                     } else {  
                         # Here it's a miscellaneous type.  
                         $stats->Add(aliasMisc => 1);  
                         $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);  
                     }  
                 }  
635              }              }
636          }          }
637      }      }
638  }  }
639    
640    
641  =head3 LoadProteinData  =head3 LoadProteinData
642    
643      $self->LoadProteinData($fileName);      my $protHash = $self->LoadProteinData($fileName);
644    
645  Load the protein sequences from the named FASTA file. The sequences will be stored  Load the protein sequences from the named FASTA file. The sequences will be stored
646  in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature  in a hash by FIG feature ID.
 records themselves will not be created (it is presumed they are already there).  
647    
648  =over 4  =over 4
649    
# Line 549  Line 651 
651    
652  Name of the FASTA file containing the protein sequences for this genome.  Name of the FASTA file containing the protein sequences for this genome.
653    
654    =item RETURN
655    
656    Returns a hash mapping feature IDs to protein sequences.
657    
658  =back  =back
659    
660  =cut  =cut
# Line 558  Line 664 
664      my ($self, $fileName) = @_;      my ($self, $fileName) = @_;
665      # Open the FASTA file for input.      # Open the FASTA file for input.
666      my $ih = Open(undef, "<$fileName");      my $ih = Open(undef, "<$fileName");
667        # Create the return hash.
668        my $retVal = {};
669      # We'll track the current protein in here.      # We'll track the current protein in here.
670      my $fid;      my $fid;
671      my $sequence = "";      my $sequence = "";
# Line 571  Line 679 
679              my $newFid = $1;              my $newFid = $1;
680              # Do we have an existing protein?              # Do we have an existing protein?
681              if (defined $fid) {              if (defined $fid) {
682                  # Yes. Write it out.                  # Yes. Store it in the hash.
683                  $self->WriteProtein($fid, $sequence);                  $retVal->{$fid} = $sequence;
684              }              }
685              # Initialize for the next protein.              # Initialize for the next protein.
686              $fid = $newFid;              $fid = $newFid;
# Line 584  Line 692 
692      }      }
693      # Do we have a residual protein.      # Do we have a residual protein.
694      if (defined $fid) {      if (defined $fid) {
695          # Yes. Write it out.          # Yes. Store it in the hash.
696          $self->WriteProtein($fid, $sequence);          $retVal->{$fid} = $sequence;
697      }      }
698        # Return the hash.
699        return $retVal;
700  }  }
701    
702    
703    =head3 LoadAnnotations
704    
705        $loaderObject->LoadAnnotations($fileName);
706    
707    Read in the annotation history information and use it to create annotation records.
708    
709    =over 4
710    
711    =item fileName
712    
713    Name of the annotation history file. This file is formatted with four fields per
714    record. Each field is on a separate line, with a double slash (C<//>) used as the
715    line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
716    (formatted as an integer), (2) the user name, and (3) the annotation text.
717    
718    =back
719    
720    =cut
721    
722    sub LoadAnnotations {
723        # Get the parameters.
724        my ($self, $fileName) = @_;
725        # Get the assignment Hash. We use this to filter out deleted features.
726        my $assignHash = $self->{assignHash};
727        # Get the Sapling database.
728        my $sap = $self->{sap};
729        # Get the statistics object.
730        my $stats = $self->{stats};
731        # Open the input file.
732        my $ih = Tracer::Open(undef, "<$fileName");
733        # Loop through the input.
734        while (! eof $ih) {
735            # Read in the peg, timestamp, and user ID.
736            my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
737            # Only proceed if the feature is not deleted.
738            if ($assignHash->{$fid}) {
739                # Add the annotation to this feature.
740                $self->MakeAnnotation($fid, $text, $user, $timestamp);
741            }
742        }
743    }
744    
745    
746  =head3 WriteProtein  =head3 WriteProtein
747    
748      $loaderObject->WriteProtein($fid, $sequence);      $loaderObject->WriteProtein($fid, $sequence);
# Line 614  Line 768 
768      # Get the parameters.      # Get the parameters.
769      my ($self, $fid, $sequence) = @_;      my ($self, $fid, $sequence) = @_;
770      # Compute the key of the protein sequence.      # Compute the key of the protein sequence.
771      my $protID = ERDB::DigestKey($sequence);      my $protID = $self->{sap}->ProteinID($sequence);
772      # Insure the protein exists.      # Insure the protein exists.
773      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
774      # Connect the feature to it.      # Connect the feature to it.
775      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
776  }  }
777    
 =head3 InsureEntity  
   
     my $createdFlag = $loaderObject->InsureEntity($entityType => $id, %fields);  
   
 Insure that the specified record exists in the database. If no record is found of the  
 specified type with the specified ID, one will be created with the indicated fields.  
   
 =over 4  
   
 =item $entityType  
   
 Type of entity to check.  
   
 =item id  
   
 ID of the entity instance in question.  
   
 =item fields  
   
 Hash mapping field names to values for all the fields in the desired entity record except  
 for the ID.  
   
 =item RETURN  
   
 Returns TRUE if a new object was created, FALSE if it already existed.  
   
 =back  
   
 =cut  
   
 sub InsureEntity {  
     # Get the parameters.  
     my ($self, $entityType, $id, %fields) = @_;  
     # Get the database.  
     my $sap = $self->{sap};  
     # Get the support record ID hash.  
     my $supportHash = $self->{supportRecords};  
     # Denote we haven't created a new record.  
     my $retVal = 0;  
     # Get the sub-hash for this entity type.  
     my $entityHash = $supportHash->{$entityType};  
     if (! defined $entityHash) {  
         $entityHash = {};  
         $supportHash->{$entityType} = $entityHash;  
     }  
     # Check for this instance.  
     if (! $entityHash->{$id}) {  
         # It's not found. Check the database.  
         if (! $sap->Exists($entityType => $id)) {  
             # It's not in the database either, so create it.  
             $sap->InsertObject($entityType, id => $id, %fields);  
             $self->{stats}->Add(insertSupport => 1);  
             $retVal = 1;  
         }  
         # Mark the record in the hash so we know we have it.  
         $entityHash->{$id} = 1;  
     }  
     # Return the insertion indicator.  
     return $retVal;  
 }  
   
778  =head3 LoadSubsystems  =head3 LoadSubsystems
779    
780      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
# Line 694  Line 787 
787  sub LoadSubsystems {  sub LoadSubsystems {
788      # Get the parameters.      # Get the parameters.
789      my ($self) = @_;      my ($self) = @_;
790    
791        #
792        # If we are running in disconnected mode, do not actually load subsystems.
793        # They rely too much on information from the external sapling.
794        #
795        if ($self->{disconnected})
796        {
797            return;
798        }
799    
800      # Get the sapling object.      # Get the sapling object.
801      my $sap = $self->{sap};      my $sap = $self->{sap};
802      # Get the statistics object.      # Get the statistics object.
# Line 721  Line 824 
824      while (! eof $ih) {      while (! eof $ih) {
825          # Get this subsystem.          # Get this subsystem.
826          my ($subsystem, $variant) = Tracer::GetLine($ih);          my ($subsystem, $variant) = Tracer::GetLine($ih);
827            Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
828          # Normalize the subsystem name.          # Normalize the subsystem name.
829          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
830          # Compute this subsystem's MD5.          # Compute this subsystem's MD5.
# Line 737  Line 841 
841              if (@$roleList > 0) {              if (@$roleList > 0) {
842                  # Get the subsystem information from the first role and create the subsystem.                  # Get the subsystem information from the first role and create the subsystem.
843                  my $roleH = $roleList->[0];                  my $roleH = $roleList->[0];
844                  my %subFields = ExtractFields(Subsystem => $roleH);                  my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
845                  $sap->InsertObject('Subsystem', %subFields);                  $sap->InsertObject('Subsystem', %subFields);
846                    $stats->Add(subsystems => 1);
847                  # Now loop through the roles. The Includes records are always inserted, but the                  # Now loop through the roles. The Includes records are always inserted, but the
848                  # roles are only inserted if they don't already exist.                  # roles are only inserted if they don't already exist.
849                  for $roleH (@$roleList) {                  for $roleH (@$roleList) {
850                      # Create the Includes record.                      # Create the Includes record.
851                      my %incFields = ExtractFields(Includes => $roleH);                      my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);
852                      $sap->InsertObject('Includes', %incFields);                      $sap->InsertObject('Includes', %incFields);
853                      # Insure we have the role in place.                      # Insure we have the role in place.
854                      my %roleFields = ExtractFields(Role => $roleH);                      my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);
855                      my $roleID = $roleFields{id};                      my $roleID = $roleFields{id};
856                      delete $roleFields{id};                      delete $roleFields{id};
857                      $self->InsureEntity('Role', $roleID, %roleFields);                      $self->InsureEntity('Role', $roleID, %roleFields);
858                      # Compute the machine-role ID.                      # Compute the machine-role ID.
859                      my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});                      my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
860                      $machineRoles{$subsystem}{$roleID} = $machineRoleID;                      $machineRoles{$subsystem}{$roleID} = $machineRoleID;
861                        $stats->Add(subsystemRoles => 1);
862                  }                  }
863              }              }
864          } else {          } else {
# Line 774  Line 880 
880          $variantCode =~ s/\*+$//;          $variantCode =~ s/\*+$//;
881          # Compute the variant key.          # Compute the variant key.
882          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
883            # Insure we have it in the database.
884            if (! $sap->Exists(Variant => $variantKey)) {
885          # Get the variant from the sapling server.          # Get the variant from the sapling server.
886          my $variantH = $sapO->get(-objects => "Variant",          my $variantH = $sapO->get(-objects => "Variant",
887                                    -filter => {"Variant(id)" => ["=", $variantKey]},                                    -filter => {"Variant(id)" => ["=", $variantKey]},
# Line 782  Line 890 
890                                                "Variant(type)" => "type",                                                "Variant(type)" => "type",
891                                                "Variant(role-rule)" => "role-rule"},                                                "Variant(role-rule)" => "role-rule"},
892                                    -firstOnly => 1);                                    -firstOnly => 1);
         # Insure we have it in the database.  
893          $self->InsureEntity('Variant', $variantKey, %$variantH);          $self->InsureEntity('Variant', $variantKey, %$variantH);
         $stats->Add(subsystems => 1);  
894          # Connect it to the subsystem.          # Connect it to the subsystem.
895          $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);          $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
896                $stats->Add(subsystemVariants => 1);
897            }
898          # Now we create the molecular machine connecting this genome to the subsystem          # Now we create the molecular machine connecting this genome to the subsystem
899          # variant.          # variant.
900          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
# Line 877  Line 985 
985      $fields{'dna-size'} = $stats->Ask('dna');      $fields{'dna-size'} = $stats->Ask('dna');
986      $fields{pegs} = $stats->Ask('peg');      $fields{pegs} = $stats->Ask('peg');
987      $fields{rnas} = $stats->Ask('rna');      $fields{rnas} = $stats->Ask('rna');
988        $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna');
989      # Get the genetic code. The default is 11.      # Get the genetic code. The default is 11.
990      $fields{'genetic-code'} = 11;      $fields{'genetic-code'} = 11;
991      my $geneticCodeFile = "$dir/GENETIC_CODE";      my $geneticCodeFile = "$dir/GENETIC_CODE";
# Line 986  Line 1095 
1095    
1096  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1097    
1098  =head3 DeleteRelatedRecords  =head3 ReadAnnotation
   
     DeleteRelatedRecords($sap, $genome, $stats, $relName, $entityName);  
   
 Delete all the records in the named entity and relationship relating to the  
 specified genome and roll up the statistics in the specified statistics object.  
   
 =over 4  
   
 =item sap  
   
 L<Sapling> object for accessing the database.  
   
 =item genome  
   
 ID of the relevant genome.  
   
 =item stats  
   
 L<Stats> object for tracking the delete activity.  
   
 =item relName  
   
 Name of a relationship from the B<Genome> table.  
   
 =item entityName  
   
 Name of the entity on the other side of the relationship.  
   
 =back  
   
 =cut  
1099    
1100  sub DeleteRelatedRecords {      my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
     # Get the parameters.  
     my ($sap, $genome, $stats, $relName, $entityName) = @_;  
     # Get all the relationship records.  
     my (@targets) = $sap->GetFlat($relName, "$relName(from-link) = ?", [$genome],  
                                   "to-link");  
     # Loop through the relationship records, deleting them and the target entity  
     # records.  
     for my $target (@targets) {  
         # Delete the relationship instance.  
         $sap->DeleteRow($relName, $genome, $target);  
         $stats->Add($relName => 1);  
         # Delete the entity instance.  
         my $subStats = $sap->Delete($entityName, $target);  
         # Roll up the statistics.  
         $stats->Accumulate($subStats);  
     }  
 }  
1101    
1102  =head3 ExtractFields  Read the next record from an annotation file. The next record must exist (that is, an
1103    end-of-file check should have been performed before calling this method).
     my %fieldHash = SaplingGenomeLoader::ExtractFields($tableName, $dataHash);  
   
 Extract from the incoming hash the field names and values from the specified table.  
1104    
1105  =over 4  =over 4
1106    
1107  =item tableName  =item ih
   
 Name of the table whose field names and values are desired.  
1108    
1109  =item dataHash  Open file handle for the annotation file.
   
 Reference to a hash mapping fully-qualified ERDB field names to values.  
1110    
1111  =item RETURN  =item RETURN
1112    
1113  Returns a hash containing only the fields from the specified table and their values.  Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1114    timestamp, (2) the user ID, and (3) the annotation text.
1115    
1116  =back  =back
1117    
1118  =cut  =cut
1119    
1120  sub ExtractFields {  sub ReadAnnotation {
1121      # Get the parameters.      # Get the parameter.
1122      my ($tableName, $dataHash) = @_;      my ($ih) = @_;
1123      # Declare the return variable.      # Read the three fixed fields.
1124      my %retVal;      my $fid = <$ih>; chomp $fid;
1125      # Extract the desired fields.      my $timestamp = <$ih>; chomp $timestamp;
1126      for my $field (keys %$dataHash) {      my $user = <$ih>; chomp $user;
1127          # Is this a field for the specified table?      # Loop through the lines of the text field.
1128          if ($field =~ /^$tableName\(([^)]+)/) {      my $text = "";
1129              # Yes, put it in the output hash.      my $line = <$ih>;
1130              $retVal{$1} = $dataHash->{$field};      while (defined($line) && $line ne "//\n") {
1131          }          $text .= $line;
1132      }          $line = <$ih>;
1133      # Return the computed hash.      }
1134      return %retVal;      # Remove the trailing new-line from the text.
1135  }      chomp $text;
1136        # Return the fields.
1137  =head3 CreateIdentifier      return ($fid, $timestamp, $user, $text);
   
     $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);  
   
 Link an identifier to a feature. The identifier is presented in prefixed form and is of the  
 specified type and the specified confidence level.  
   
 =over 4  
   
 =item alias  
   
 Identifier to connect to the feature.  
   
 =item conf  
   
 Confidence level (C<A> curated, C<B> normal, C<C> protein only).  
   
 =item aliasType  
   
 Type of alias (e.g. C<NCBI>, C<LocusTag>).  
   
 =item fid  
   
 ID of the relevant feature.  
   
 =back  
   
 =cut  
   
 sub CreateIdentifier {  
     # Get the parameters.  
     my ($self, $alias, $conf, $aliasType, $fid) = @_;  
     # Get the Sapling object.  
     my $sap = $self->{sap};  
     # Compute the identifier's natural form.  
     my $natural = $alias;  
     if ($natural =~ /[:|](.+)/) {  
         $natural = $1;  
     }  
     # Insure the identifier exists in the database.  
     $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);  
     # Connect the identifier to the feature.  
     $sap->InsertObject('Identifies', from_link => $alias, to_link => $fid, conf => $conf);  
1138  }  }
1139    
1140  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.15

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3