[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Tue Dec 14 19:48:38 2010 UTC revision 1.17, Tue Mar 26 02:10:46 2013 UTC
# Line 25  Line 25 
25      use SeedUtils;      use SeedUtils;
26      use SAPserver;      use SAPserver;
27      use Sapling;      use Sapling;
28        use AliasAnalysis;
29        use base qw(SaplingDataLoader);
30    
31  =head1 Sapling Genome Loader  =head1 Sapling Genome Loader
32    
# Line 65  Line 67 
67      # Create the loader object.      # Create the loader object.
68      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
69      # Load the contigs.      # Load the contigs.
70        Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
71      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
72      # Load the features.      # Load the features.
73        Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
74      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
75        # Check for annotation history. If we have it, load the history records into the
76        # database.
77        if (-f "$directory/annotations") {
78            Trace("Processing annotations.") if T(SaplingDataLoader => 3);
79            $loaderObject->LoadAnnotations("$directory/annotations");
80        }
81      # Load the subsystem bindings.      # Load the subsystem bindings.
82        Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
83      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
84      # Create the Genome record and taxonomy information.      # Create the Genome record and taxonomy information.
85        Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
86      $loaderObject->CreateGenome();      $loaderObject->CreateGenome();
87      # Return the statistics.      # Return the statistics.
88      return $loaderObject->{stats};      return $loaderObject->{stats};
# Line 106  Line 118 
118      my ($sap, $genome) = @_;      my ($sap, $genome) = @_;
119      # Create the statistics object.      # Create the statistics object.
120      my $stats = Stats->new();      my $stats = Stats->new();
121      # Delete the DNA.      # Delete the DNA sequences.
122      DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');      my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
123        for my $seq (@seqs) {
124            my $delStats = $sap->Delete(DNASequence => $seq);
125            $stats->Accumulate($delStats);
126        }
127      # Delete the contigs.      # Delete the contigs.
128      DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
129      # Delete the features.      # Delete the features.
130      DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
131      # Delete the molecular machines.      # Delete the molecular machines.
132      DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
133      # Delete the genome itself.      # Delete the genome itself.
134      my $subStats = $sap->Delete(Genome => $genome);      my $subStats = $sap->Delete(Genome => $genome);
135      # Accumulate the statistics from the delete.      # Accumulate the statistics from the delete.
# Line 122  Line 138 
138      return $stats;      return $stats;
139  }  }
140    
141    
142    =head3 Process
143    
144        my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
145    
146    Load genome data from the specified directory. If the genome data already
147    exists in the database, it will be deleted first.
148    
149    =over 4
150    
151    =item sap
152    
153    L</Sapling> object for accessing the database.
154    
155    =item genome
156    
157    ID of the genome whose  data is being loaded.
158    
159    =item directory
160    
161    Name of the directory containing the genome data files. If omitted, the
162    genome will be deleted from the database.
163    
164    =item RETURN
165    
166    Returns a statistics object describing the activity during the reload.
167    
168    =back
169    
170    =cut
171    
172    sub Process {
173        # Get the parameters.
174        my ($sap, $genome, $directory) = @_;
175        # Clear the existing data for the specified genome.
176        my $stats = ClearGenome($sap, $genome);
177        # Load the new expression data from the specified directory (if one is
178        # specified).
179        if ($directory) {
180            my $newStats = Load($sap, $genome, $directory);
181            # Merge the statistics.
182            $stats->Accumulate($newStats);
183        }
184        # Return the result.
185        return $stats;
186    }
187    
188    
189  =head2 Loader Object Methods  =head2 Loader Object Methods
190    
191  =head3 new  =head3 new
# Line 179  Line 243 
243      # Get the parameters.      # Get the parameters.
244      my ($class, $sap, $genome, $directory) = @_;      my ($class, $sap, $genome, $directory) = @_;
245      # Create the object.      # Create the object.
246      my $retVal = {      my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
247          sap => $sap,      # Add our specialized data.
248          genome => $genome,      $retVal->{genome} = $genome;
249          directory => $directory,      $retVal->{directory} = $directory;
250          stats => Stats->new(qw(contigs dna pegs rnas)),      # Leave the assignment hash undefined until we populate it.
251          supportRecords => {}      $retVal->{assignHash} = undef;
252      };      # Return the result.
     # Bless and return it.  
     bless $retVal, $class;  
253      return $retVal;      return $retVal;
254  }  }
255    
# Line 196  Line 258 
258      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
259    
260  Load the contig information into the database. This includes the contigs themselves and  Load the contig information into the database. This includes the contigs themselves and
261  the DNA. The number of contigs will be recorded as the C<contigs> statistic and the  the DNA. The number of contigs will be recorded as the C<contigs> statistic, the
262  number of base pairs as the C<dna> statistic.  number of base pairs as the C<dna> statistic, and the number of GC instances as the
263    C<gc_content> statistic.
264    
265  =cut  =cut
266    
# Line 300  Line 363 
363      # Compute the chunk ID.      # Compute the chunk ID.
364      my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');      my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
365      # Connect this sequence to the contig.      # Connect this sequence to the contig.
366      $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunk);      $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
367      # Create the DNA sequence.      # Create the DNA sequence.
368      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
369      # Record the chunk.      # Record the chunk.
370      $self->{stats}->Add(chunks => 1);      $self->{stats}->Add(chunks => 1);
371        # Update the GC count.
372        $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//));
373  }  }
374    
375  =head3 OutputContig  =head3 OutputContig
# Line 355  Line 420 
420  sub LoadFeatures {  sub LoadFeatures {
421      # Get the parameters.      # Get the parameters.
422      my ($self) = @_;      my ($self) = @_;
423        # Read in the functional assignments.
424        Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
425        my $assignHash = $self->ReadAssignments();
426        $self->{assignHash} = $assignHash;
427      # Get the directory of feature types.      # Get the directory of feature types.
428      my $featureDir = "$self->{directory}/Features";      my $featureDir = "$self->{directory}/Features";
429      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
430        # Check for protein sequences. If we have some, load them into a hash.
431        my $protHash = {};
432        if (-f "$featureDir/peg/fasta") {
433            Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
434            $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
435        }
436      # Create the feature records for the types found.      # Create the feature records for the types found.
437      for my $type (@types) {      for my $type (@types) {
438          # Insure this is a genuine feature directory.          # Insure this is a genuine feature directory.
439          if (-f "$featureDir/$type/tbl") {          if (-f "$featureDir/$type/tbl") {
440              # Yes, load the feature data.              # Yes. Read in the evidence codes (if any).
441              $self->LoadFeatureData($featureDir, $type);              my $evHash = {};
442                my $tranFile = "$featureDir/$type/Attributes/transaction_log";
443                if (-f $tranFile) {
444                    $evHash = $self->LoadEvidenceCodes($tranFile);
445          }          }
446                # Now load the feature data.
447                $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
448      }      }
     # Check for protein sequences. If we have some, load them into the database.  
     if (-f "$featureDir/peg/fasta") {  
         $self->LoadProteinData("$featureDir/peg/fasta");  
449      }      }
450  }  }
451    
452    =head3 LoadEvidenceCodes
453    
454        my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
455    
456    Load the evidence codes from the specified attribute transaction log file into a
457    hash. The log file is in tab-delimited format. The first column contains the
458    transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
459    the third an attribute name (we'll ignore everything but C<evidence_code>), and
460    the fourth the attribute value.
461    
462    =over 4
463    
464    =item attributeFile
465    
466    Name of the attribute transaction log file.
467    
468    =item RETURN
469    
470    Returns a reference to a hash mapping each feature ID to a comma-delimited list
471    of its evidence codes.
472    
473    =back
474    
475    =cut
476    
477    sub LoadEvidenceCodes {
478        # Get the parameters.
479        my ($self, $attributeFile) = @_;
480        # Get the Sapling database.
481        my $sap = $self->{sap};
482        # Get the statistics object.
483        my $stats = $self->{stats};
484        # Get the assignment hash: we use this to filter the feature IDs.
485        my $assignHash = $self->{assignHash};
486        # Open the attribute log file for input.
487        my $ih = Open(undef, "<$attributeFile");
488        # This two-dimensional hash will hold the evidence codes for each feature.
489        my %retVal;
490        # Loop through the attribute log file.
491        while (! eof $ih) {
492            # Get the current attribute record.
493            my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
494            $stats->Add(attributeLine => 1);
495            # Insure we have all the pieces we need.
496            if (! $command || ! $fid || $key ne 'evidence_code') {
497                $stats->Add(attributeLineSkipped => 1);
498            } elsif (! $assignHash->{$fid}) {
499                # Here the attribute is for a deleted feature.
500                $stats->Add(attributeFidSkipped => 1);
501            } else {
502                # Get the sub-hash for this feature.
503                if (! exists $retVal{$fid}) {
504                    $retVal{$fid} = {};
505                }
506                my $featureSubHash = $retVal{$fid};
507                # Process according to the command.
508                if ($command eq 'ADD') {
509                    # Here we are adding a new evidence code.
510                    $featureSubHash->{$value} = 1;
511                    $stats->Add(attributeAdd => 1);
512                } elsif ($command eq 'DELETE') {
513                    # Here we are deleting an evidence code.
514                    delete $featureSubHash->{$value};
515                    $stats->Add(attributeDelete => 1);
516                } else {
517                    # here we have an unrecognized command.
518                    $stats->Add(attributeCommandSkip => 1);
519                }
520            }
521        }
522        # Loop through the hash, converting each sub-hash to a comma-delimited list of
523        # evidence codes.
524        for my $fid (keys %retVal) {
525            $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
526        }
527        # Return the evidence hash.
528        return \%retVal;
529    }
530    
531    
532  =head3 LoadFeatureData  =head3 LoadFeatureData
533    
534      $self->LoadFeatureData($featureDir, $type);      $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
535    
536  Load the basic data for each feature into the database. The number of features of  Load the basic data for each feature into the database. The number of features of
537  the type found will be recorded in the statistics object.  the type found will be recorded in the statistics object.
# Line 389  Line 546 
546    
547  Type of feature to load.  Type of feature to load.
548    
549    =item protHash
550    
551    Reference to a hash mapping each feature ID for a protein-encoding gene to
552    its protein sequence.
553    
554    =item evHash
555    
556    Reference to a hash mapping each feature ID to a comma-delimited list of
557    its evidence codes (if any).
558    
559  =back  =back
560    
561  =cut  =cut
562    
563  sub LoadFeatureData {  sub LoadFeatureData {
564      # Get the parameters.      # Get the parameters.
565      my ($self, $featureDir, $type) = @_;      my ($self, $featureDir, $type, $protHash, $evHash) = @_;
566      # Get the sapling database.      # Get the sapling database.
567      my $sap = $self->{sap};      my $sap = $self->{sap};
     # Get the maximum location  segment length. We'll need this later.  
     my $maxLength = $sap->TuningParameter('maxLocationLength');  
568      # Get the statistics object.      # Get the statistics object.
569      my $stats = $self->{stats};      my $stats = $self->{stats};
570      # Read in the functional assignments.      # Get the assignment hash. This tells us our functional assignments. This method is
571      my $assignHash = $self->ReadAssignments();      # also where we remove the deleted features from it.
572        my $assignHash = $self->{assignHash};
573      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
574      # time, it overwrites the original.      # time, it overwrites the original.
575      my %fids;      my %fidHash;
576        # This hash tracks the deleted features. We don't want to update these.
577        my %deleted_features;
578      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
579      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
580      if (-f $fileName) {      if (-f $fileName) {
581          # We have one, so open it for input.          # We have one, so we can read through it. First, however, we need to get the list
582            # of deleted features and remove them from the assignment hash. This insures
583            # that they are not used by subsequent methods.
584            my $deleteFile = "$featureDir/$type/deleted.features";
585            if (-f $deleteFile) {
586                my $dh = Open(undef, "<$deleteFile");
587                while (! eof $dh) {
588                    my ($deletedFid) = Tracer::GetLine($dh);
589                    if (exists $assignHash->{$deletedFid}) {
590                        delete $assignHash->{$deletedFid};
591                        $stats->Add(deletedFid => 1);
592                        $deleted_features{$deletedFid} = 1;
593                    }
594                }
595            }
596            # Open the main file for input.
597            Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
598          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
599          while (! eof $ih) {          while (! eof $ih) {
600              # Read this feature's information.              # Read this feature's information.
601              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
602              # If the feature already exists, delete it.              # Only proceed if the feature is NOT deleted.
603              if (exists $fids{$fid}) {              if (!$deleted_features{$fid}) {
604                    # If the feature already exists, delete it. (This should be extremely rare.)
605                    if ($fidHash{$fid}) {
606                  $sap->Delete(Feature => $fid);                  $sap->Delete(Feature => $fid);
607                  $stats->Add(duplicateFid => 1);                  $stats->Add(duplicateFid => 1);
             } else {  
                 # Otherwise connect it to the genome.  
                 $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);  
608              }              }
609              # Now we must parse the locations. This will contain a list of the location                  # If this is RNA, the alias list is always empty. Sometimes, the functional
610              # data 4-tuples (contig, start, dir, len).                  # assignment is found there.
611              my @locData;                  if ($type eq 'rna') {
612              # This will contain the total sequence length.                      if (! $assignHash->{$fid}) {
613              my $length = 0;                          $assignHash->{$fid} = $aliases[0];
614              # Loop through the locations.                      }
615              for my $loc (split /\s*,\s*/, $locations) {                      @aliases = ();
616                  # Get this location's components.                  }
617                  my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);                  # Add the feature to the database.
618                  # Normalize the contig.                  my $function = $assignHash->{$fid} || "";
619                  $self->FixContig(\$contig);                  $self->AddFeature($fid, $function, $locations, \@aliases,
620                  # Compute the direction.                                    $protHash->{$fid}, $evHash->{$fid});
621                  if ($start <= $stop) {                  # Denote we've added this feature, so that if a duplicate occurs we're ready.
622                      # Here we have the forward strand.                  $fidHash{$fid} = 1;
                     my $len = $stop + 1 - $start;  
                     push @locData, [$contig, $start, '+', $len];  
                     $length += $len;  
                 } else {  
                     # Here we have the reverse strand.  
                     my $len = $start + 1 - $stop;  
                     push @locData, [$contig, $stop, '-', $len];  
                     $length += $len;  
                 }  
             }  
             # Now we can create the feature record.  
             $sap->InsertObject('Feature', id => $fid, feature_type => $type,  
                                function => $assignHash->{$fid}, locked => 0,  
                                sequence_length => $length);  
             $fids{$fid} = 1;  
             $stats->Add($type => 1);  
             # The next step is to connect the feature record to its locations. This  
             # involves dividing the location into segments. The following variable  
             # will count the total number of segments.  
             my $segment = 0;  
             # Loop through the locations.  
             for my $loc (@locData) {  
                 # Get the current location's information.  
                 my ($contig, $left, $dir, $len) = @$loc;  
                 # Peel off any segments.  
                 while ($len > $maxLength) {  
                     # Process according to the direction.  
                     if ($dir eq '+') {  
                         # Forward strand: peel from the beginning.  
                         $self->ConnectLocation($fid, $contig, $segment, $left, $dir,  
                                                $maxLength);  
                         $left += $maxLength;  
                     } else {  
                         # Reverse strand: peel from the end.  
                         $self->ConnectLocation($fid, $contig, $segment,  
                                               $left + $len - $maxLength, $dir,  
                                               $maxLength);  
                     }  
                     # Denote we've used up a segment.  
                     $len -= $maxLength;  
                     $segment++;  
                 }  
                 # Output the last segment.  
                 $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);  
623              }              }
624          }          }
625      }      }
626  }  }
627    
628    
629  =head3 LoadProteinData  =head3 LoadProteinData
630    
631      $self->LoadProteinData($fileName);      my $protHash = $self->LoadProteinData($fileName);
632    
633  Load the protein sequences from the named FASTA file. The sequences will be stored  Load the protein sequences from the named FASTA file. The sequences will be stored
634  in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature  in a hash by FIG feature ID.
 records themselves will not be created (it is presumed they are already there).  
635    
636  =over 4  =over 4
637    
# Line 500  Line 639 
639    
640  Name of the FASTA file containing the protein sequences for this genome.  Name of the FASTA file containing the protein sequences for this genome.
641    
642    =item RETURN
643    
644    Returns a hash mapping feature IDs to protein sequences.
645    
646  =back  =back
647    
648  =cut  =cut
# Line 509  Line 652 
652      my ($self, $fileName) = @_;      my ($self, $fileName) = @_;
653      # Open the FASTA file for input.      # Open the FASTA file for input.
654      my $ih = Open(undef, "<$fileName");      my $ih = Open(undef, "<$fileName");
655        # Create the return hash.
656        my $retVal = {};
657      # We'll track the current protein in here.      # We'll track the current protein in here.
658      my $fid;      my $fid;
659      my $sequence = "";      my $sequence = "";
# Line 522  Line 667 
667              my $newFid = $1;              my $newFid = $1;
668              # Do we have an existing protein?              # Do we have an existing protein?
669              if (defined $fid) {              if (defined $fid) {
670                  # Yes. Write it out.                  # Yes. Store it in the hash.
671                  $self->WriteProtein($fid, $sequence);                  $retVal->{$fid} = $sequence;
672              }              }
673              # Initialize for the next protein.              # Initialize for the next protein.
674              $fid = $newFid;              $fid = $newFid;
# Line 535  Line 680 
680      }      }
681      # Do we have a residual protein.      # Do we have a residual protein.
682      if (defined $fid) {      if (defined $fid) {
683          # Yes. Write it out.          # Yes. Store it in the hash.
684          $self->WriteProtein($fid, $sequence);          $retVal->{$fid} = $sequence;
685        }
686        # Return the hash.
687        return $retVal;
688    }
689    
690    
691    =head3 LoadAnnotations
692    
693        $loaderObject->LoadAnnotations($fileName);
694    
695    Read in the annotation history information and use it to create annotation records.
696    
697    =over 4
698    
699    =item fileName
700    
701    Name of the annotation history file. This file is formatted with four fields per
702    record. Each field is on a separate line, with a double slash (C<//>) used as the
703    line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
704    (formatted as an integer), (2) the user name, and (3) the annotation text.
705    
706    =back
707    
708    =cut
709    
710    sub LoadAnnotations {
711        # Get the parameters.
712        my ($self, $fileName) = @_;
713        # Get the assignment Hash. We use this to filter out deleted features.
714        my $assignHash = $self->{assignHash};
715        # Get the Sapling database.
716        my $sap = $self->{sap};
717        # Get the statistics object.
718        my $stats = $self->{stats};
719        # Open the input file.
720        my $ih = Tracer::Open(undef, "<$fileName");
721        # Loop through the input.
722        while (! eof $ih) {
723            # Read in the peg, timestamp, and user ID.
724            my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
725            # Only proceed if the feature is not deleted.
726            if ($assignHash->{$fid}) {
727                # Add the annotation to this feature.
728                $self->MakeAnnotation($fid, $text, $user, $timestamp);
729      }      }
730  }  }
731    }
732    
733    
734  =head3 WriteProtein  =head3 WriteProtein
735    
# Line 565  Line 756 
756      # Get the parameters.      # Get the parameters.
757      my ($self, $fid, $sequence) = @_;      my ($self, $fid, $sequence) = @_;
758      # Compute the key of the protein sequence.      # Compute the key of the protein sequence.
759      my $protID = ERDB::DigestKey($sequence);      my $protID = $self->{sap}->ProteinID($sequence);
760      # Insure the protein exists.      # Insure the protein exists.
761      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
762      # Connect the feature to it.      # Connect the feature to it.
763      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
764  }  }
765    
766  =head3 InsureEntity  =head3 LoadSubsystems
767    
768      my $createdFlag = $loaderObject->InsureEntity($entityType => $id, %fields);      $loaderObject->LoadSubsystems($subsysList);
769    
770  Insure that the specified record exists in the database. If no record is found of the  Load the subsystem data into the database. This requires looking through the
771  specified type with the specified ID, one will be created with the indicated fields.  bindings and using them to connect to the subsystems that already exist in the
772    database. If the subsystem does not exist, its bindings will not be loaded.
773    
774  =over 4  =over 4
775    
776  =item $entityType  =item subsysList
   
 Type of entity to check.  
   
 =item id  
777    
778  ID of the entity instance in question.  Reference to a list of subsystem IDs. If specified, only the subsystems in the
779    list will be processed. This is useful when a particular set of subsystems
780  =item fields  has been replaced.
   
 Hash mapping field names to values for all the fields in the desired entity record except  
 for the ID.  
   
 =item RETURN  
   
 Returns TRUE if a new object was created, FALSE if it already existed.  
781    
782  =back  =back
783    
784  =cut  =cut
785    
 sub InsureEntity {  
     # Get the parameters.  
     my ($self, $entityType, $id, %fields) = @_;  
     # Get the database.  
     my $sap = $self->{sap};  
     # Get the support record ID hash.  
     my $supportHash = $self->{supportRecords};  
     # Denote we haven't created a new record.  
     my $retVal = 0;  
     # Get the sub-hash for this entity type.  
     my $entityHash = $supportHash->{$entityType};  
     if (! defined $entityHash) {  
         $entityHash = {};  
         $supportHash->{$entityType} = $entityHash;  
     }  
     # Check for this instance.  
     if (! $entityHash->{$id}) {  
         # It's not found. Check the database.  
         if (! $sap->Exists($entityType => $id)) {  
             # It's not in the database either, so create it.  
             $sap->InsertObject($entityType, id => $id, %fields);  
             $self->{stats}->Add(insertSupport => 1);  
             $retVal = 1;  
         }  
         # Mark the record in the hash so we know we have it.  
         $entityHash->{$id} = 1;  
     }  
     # Return the insertion indicator.  
     return $retVal;  
 }  
   
 =head3 LoadSubsystems  
   
     $loaderObject->LoadSubsystems();  
   
 Load the subsystem data into the database. This includes all of the subsystems,  
 their categories, roles, and the bindings that connect them to this genome's features.  
   
 =cut  
   
786  sub LoadSubsystems {  sub LoadSubsystems {
787      # Get the parameters.      # Get the parameters.
788      my ($self) = @_;      my ($self, $subsysList) = @_;
789      # Get the sapling object.      # Get the sapling object.
790      my $sap = $self->{sap};      my $sap = $self->{sap};
791      # Get the statistics object.      # Get the statistics object.
792      my $stats = $self->{stats};      my $stats = $self->{stats};
793      # Get the genome ID.      # Get the genome ID.
794      my $genome = $self->{genome};      my $genome = $self->{genome};
     # Connect to the Sapling server so we can get the subsystem variant and role information.  
     my $sapO = SAPserver->new();  
795      # This hash maps subsystem IDs to molecular machine IDs.      # This hash maps subsystem IDs to molecular machine IDs.
796      my %machines;      my %machines;
797      # This hash maps subsystem/role pairs to machine role IDs.      # This hash maps subsystem/role pairs to machine role IDs.
798      my %machineRoles;      my %machineRoles;
799      # The first step is to create the subsystems themselves and connect them to the      # This hash will contain the list of subsystems found in the database.
800      # genome. A list is given in the subsystems file of the Subsystems directory.      my %subsystems;
801        # We loop through the subsystems, looking for the ones already in the
802        # database. The list is given in the subsystems file of the Subsystems
803        # directory.
804      my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");      my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");
     # Create the field hash for the subsystem query.  
     my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)  
                     Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)  
                     Subsystem(version)  
                     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)  
                     Includes(sequence)  
                     Role(id) Role(hypothetical) Role(role-index));  
     my %fields = map { $_ => $_ } @fields;  
805      # Loop through the subsystems in the file, insuring we have them in the database.      # Loop through the subsystems in the file, insuring we have them in the database.
806      while (! eof $ih) {      while (! eof $ih) {
807          # Get this subsystem.          # Get this subsystem.
808          my ($subsystem, $variant) = Tracer::GetLine($ih);          my ($subsystem, $variant) = Tracer::GetLine($ih);
809            Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
810          # Normalize the subsystem name.          # Normalize the subsystem name.
811          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
812            # Insure the subsystem is in the database and is one we're interested in.
813            if ((! $subsysList || (grep { $_ eq $subsystem } @$subsysList)) &&
814                    $sap->Exists(Subsystem => $subsystem)) {
815                # We have the subsystem in the database. We need to compute the machine
816                # role IDs and create the molecular machine. First, we need to remember
817                # the subsystem.
818                $subsystems{$subsystem} = 1;
819          # Compute this subsystem's MD5.          # Compute this subsystem's MD5.
820          my $subsystemMD5 = ERDB::DigestKey($subsystem);          my $subsystemMD5 = ERDB::DigestKey($subsystem);
         # Insure the subsystem is in the database.  
         if (! $sap->Exists(Subsystem => $subsystem)) {  
             # The subsystem isn't present, so we need to read it. We get the subsystem,  
             # its roles, and its classification information. The variant will be added  
             # later if we need it.  
             my $roleList = $sapO->get(-objects => "Subsystem Includes Role",  
                                       -filter => {"Subsystem(id)" => ["=", $subsystem] },  
                                       -fields => \%fields);  
             # Only proceed if we found some roles.  
             if (@$roleList > 0) {  
                 # Get the subsystem information from the first role and create the subsystem.  
                 my $roleH = $roleList->[0];  
                 my %subFields = ExtractFields(Subsystem => $roleH);  
                 $sap->InsertObject('Subsystem', %subFields);  
                 # Now loop through the roles. The Includes records are always inserted, but the  
                 # roles are only inserted if they don't already exist.  
                 for $roleH (@$roleList) {  
                     # Create the Includes record.  
                     my %incFields = ExtractFields(Includes => $roleH);  
                     $sap->InsertObject('Includes', %incFields);  
                     # Insure we have the role in place.  
                     my %roleFields = ExtractFields(Role => $roleH);  
                     my $roleID = $roleFields{id};  
                     delete $roleFields{id};  
                     $self->InsureEntity('Role', $roleID, %roleFields);  
                     # Compute the machine-role ID.  
                     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});  
                     $machineRoles{$subsystem}{$roleID} = $machineRoleID;  
                 }  
             }  
         } else {  
             # We already have the subsystem in the database, but we still need to compute the  
             # machine role IDs. We need information from the sapling server for this.  
             my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);  
821              # Loop through the roles.              # Loop through the roles.
822              my $roleList = $subHash->{$subsystem};              my @roleList = $sap->GetAll('Includes', 'Includes(from-link) = ?',
823              for my $roleTuple (@$roleList) {                      [$subsystem], "to-link abbreviation");
824                for my $roleTuple (@roleList) {
825                  my ($roleID, $abbr) = @$roleTuple;                  my ($roleID, $abbr) = @$roleTuple;
826                  my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);                  my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);
827                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;
828              }              }
829          }              # Next we need the variant code and key.
830          # Now we need to connect the variant to the subsystem. First, we compute the real              my $variantCode = BaseSaplingLoader::Starless($variant);
         # variant code by removing the asterisks.  
         my $variantCode = $variant;  
         $variantCode =~ s/^\*+//;  
         $variantCode =~ s/\*+$//;  
         # Compute the variant key.  
831          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
832          # Get the variant from the sapling server.              # Now we create the molecular machine connecting this genome to the
833          my $variantH = $sapO->get(-objects => "Variant",              # subsystem variant.
                                   -filter => {"Variant(id)" => ["=", $variantKey]},  
                                   -fields => {"Variant(code)" => "code",  
                                               "Variant(comment)" => "comment",  
                                               "Variant(type)" => "type",  
                                               "Variant(role-rule)" => "role-rule"},  
                                   -firstOnly => 1);  
         # Insure we have it in the database.  
         $self->InsureEntity('Variant', $variantKey, %$variantH);  
         $stats->Add(subsystems => 1);  
         # Connect it to the subsystem.  
         $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);  
         # Now we create the molecular machine connecting this genome to the subsystem  
         # variant.  
834          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
835          $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);          $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
836          $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');          $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
# Line 747  Line 838 
838          # Remember the machine ID.          # Remember the machine ID.
839          $machines{$subsystem} = $machineID;          $machines{$subsystem} = $machineID;
840      }      }
841      # Now we go through the bindings file. This file connects the subsystem roles to      }
842      # the molecular machines.      # Now we go through the bindings file. This file connects the subsystem
843        # roles to the molecular machines.
844      $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");      $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");
845      # Loop through the bindings.      # Loop through the bindings.
846      while (! eof $ih) {      while (! eof $ih) {
# Line 756  Line 848 
848          my ($subsystem, $role, $fid) = Tracer::GetLine($ih);          my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
849          # Normalize the subsystem name.          # Normalize the subsystem name.
850          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
851            # Insure the subsystem is in the database.
852            if ($subsystems{$subsystem}) {
853          # Compute the machine role.          # Compute the machine role.
854          my $machineRoleID = $machineRoles{$subsystem}{$role};          my $machineRoleID = $machineRoles{$subsystem}{$role};
855          # Insure it exists.          # Insure it exists.
# Line 771  Line 865 
865          $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);          $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
866      }      }
867  }  }
868    }
869    
870  =head3 CreateGenome  =head3 CreateGenome
871    
# Line 828  Line 923 
923      $fields{'dna-size'} = $stats->Ask('dna');      $fields{'dna-size'} = $stats->Ask('dna');
924      $fields{pegs} = $stats->Ask('peg');      $fields{pegs} = $stats->Ask('peg');
925      $fields{rnas} = $stats->Ask('rna');      $fields{rnas} = $stats->Ask('rna');
926        $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna');
927      # Get the genetic code. The default is 11.      # Get the genetic code. The default is 11.
928      $fields{'genetic-code'} = 11;      $fields{'genetic-code'} = 11;
929      my $geneticCodeFile = "$dir/GENETIC_CODE";      my $geneticCodeFile = "$dir/GENETIC_CODE";
# Line 937  Line 1033 
1033    
1034  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1035    
1036  =head3 DeleteRelatedRecords  =head3 ReadAnnotation
1037    
1038      DeleteRelatedRecords($sap, $genome, $stats, $relName, $entityName);      my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1039    
1040  Delete all the records in the named entity and relationship relating to the  Read the next record from an annotation file. The next record must exist (that is, an
1041  specified genome and roll up the statistics in the specified statistics object.  end-of-file check should have been performed before calling this method).
1042    
1043  =over 4  =over 4
1044    
1045  =item sap  =item ih
   
 L<Sapling> object for accessing the database.  
   
 =item genome  
   
 ID of the relevant genome.  
   
 =item stats  
1046    
1047  L<Stats> object for tracking the delete activity.  Open file handle for the annotation file.
   
 =item relName  
   
 Name of a relationship from the B<Genome> table.  
   
 =item entityName  
   
 Name of the entity on the other side of the relationship.  
   
 =back  
   
 =cut  
   
 sub DeleteRelatedRecords {  
     # Get the parameters.  
     my ($sap, $genome, $stats, $relName, $entityName) = @_;  
     # Get all the relationship records.  
     my (@targets) = $sap->GetFlat($relName, "$relName(from-link) = ?", [$genome],  
                                   "to-link");  
     # Loop through the relationship records, deleting them and the target entity  
     # records.  
     for my $target (@targets) {  
         # Delete the relationship instance.  
         $sap->DeleteRow($relName, $genome, $target);  
         $stats->Add($relName => 1);  
         # Delete the entity instance.  
         my $subStats = $sap->Delete($entityName, $target);  
         # Roll up the statistics.  
         $stats->Accumulate($subStats);  
     }  
 }  
   
 =head3 ExtractFields  
   
     my %fieldHash = SaplingGenomeLoader::ExtractFields($tableName, $dataHash);  
   
 Extract from the incoming hash the field names and values from the specified table.  
   
 =over 4  
   
 =item tableName  
   
 Name of the table whose field names and values are desired.  
   
 =item dataHash  
   
 Reference to a hash mapping fully-qualified ERDB field names to values.  
1048    
1049  =item RETURN  =item RETURN
1050    
1051  Returns a hash containing only the fields from the specified table and their values.  Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1052    timestamp, (2) the user ID, and (3) the annotation text.
1053    
1054  =back  =back
1055    
1056  =cut  =cut
1057    
1058  sub ExtractFields {  sub ReadAnnotation {
1059      # Get the parameters.      # Get the parameter.
1060      my ($tableName, $dataHash) = @_;      my ($ih) = @_;
1061      # Declare the return variable.      # Read the three fixed fields.
1062      my %retVal;      my $fid = <$ih>; chomp $fid;
1063      # Extract the desired fields.      my $timestamp = <$ih>; chomp $timestamp;
1064      for my $field (keys %$dataHash) {      my $user = <$ih>; chomp $user;
1065          # Is this a field for the specified table?      # Loop through the lines of the text field.
1066          if ($field =~ /^$tableName\(([^)]+)/) {      my $text = "";
1067              # Yes, put it in the output hash.      my $line = <$ih>;
1068              $retVal{$1} = $dataHash->{$field};      while (defined($line) && $line ne "//\n") {
1069          }          $text .= $line;
1070      }          $line = <$ih>;
1071      # Return the computed hash.      }
1072      return %retVal;      # Remove the trailing new-line from the text.
1073        chomp $text;
1074        # Return the fields.
1075        return ($fid, $timestamp, $user, $text);
1076  }  }
1077    
1078  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.17

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3