[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.5, Sun Feb 13 13:02:30 2011 UTC revision 1.17, Tue Mar 26 02:10:46 2013 UTC
# Line 67  Line 67 
67      # Create the loader object.      # Create the loader object.
68      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
69      # Load the contigs.      # Load the contigs.
70      Trace("Loading contigs for $genome.") if T(2);      Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
71      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
72      # Load the features.      # Load the features.
73      Trace("Loading features for $genome.") if T(2);      Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
74      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
75        # Check for annotation history. If we have it, load the history records into the
76        # database.
77        if (-f "$directory/annotations") {
78            Trace("Processing annotations.") if T(SaplingDataLoader => 3);
79            $loaderObject->LoadAnnotations("$directory/annotations");
80        }
81      # Load the subsystem bindings.      # Load the subsystem bindings.
82      Trace("Loading subsystems for $genome.") if T(2);      Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
83      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
84      # Create the Genome record and taxonomy information.      # Create the Genome record and taxonomy information.
85      Trace("Creating root for $genome.") if T(2);      Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
86      $loaderObject->CreateGenome();      $loaderObject->CreateGenome();
87      # Return the statistics.      # Return the statistics.
88      return $loaderObject->{stats};      return $loaderObject->{stats};
# Line 112  Line 118 
118      my ($sap, $genome) = @_;      my ($sap, $genome) = @_;
119      # Create the statistics object.      # Create the statistics object.
120      my $stats = Stats->new();      my $stats = Stats->new();
121      # Delete the DNA.      # Delete the DNA sequences.
122      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');      my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
123        for my $seq (@seqs) {
124            my $delStats = $sap->Delete(DNASequence => $seq);
125            $stats->Accumulate($delStats);
126        }
127      # Delete the contigs.      # Delete the contigs.
128      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
129      # Delete the features.      # Delete the features.
# Line 128  Line 138 
138      return $stats;      return $stats;
139  }  }
140    
141    
142    =head3 Process
143    
144        my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
145    
146    Load genome data from the specified directory. If the genome data already
147    exists in the database, it will be deleted first.
148    
149    =over 4
150    
151    =item sap
152    
153    L</Sapling> object for accessing the database.
154    
155    =item genome
156    
157    ID of the genome whose  data is being loaded.
158    
159    =item directory
160    
161    Name of the directory containing the genome data files. If omitted, the
162    genome will be deleted from the database.
163    
164    =item RETURN
165    
166    Returns a statistics object describing the activity during the reload.
167    
168    =back
169    
170    =cut
171    
172    sub Process {
173        # Get the parameters.
174        my ($sap, $genome, $directory) = @_;
175        # Clear the existing data for the specified genome.
176        my $stats = ClearGenome($sap, $genome);
177        # Load the new expression data from the specified directory (if one is
178        # specified).
179        if ($directory) {
180            my $newStats = Load($sap, $genome, $directory);
181            # Merge the statistics.
182            $stats->Accumulate($newStats);
183        }
184        # Return the result.
185        return $stats;
186    }
187    
188    
189  =head2 Loader Object Methods  =head2 Loader Object Methods
190    
191  =head3 new  =head3 new
# Line 189  Line 247 
247      # Add our specialized data.      # Add our specialized data.
248      $retVal->{genome} = $genome;      $retVal->{genome} = $genome;
249      $retVal->{directory} = $directory;      $retVal->{directory} = $directory;
250        # Leave the assignment hash undefined until we populate it.
251        $retVal->{assignHash} = undef;
252      # Return the result.      # Return the result.
253      return $retVal;      return $retVal;
254  }  }
# Line 198  Line 258 
258      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
259    
260  Load the contig information into the database. This includes the contigs themselves and  Load the contig information into the database. This includes the contigs themselves and
261  the DNA. The number of contigs will be recorded as the C<contigs> statistic and the  the DNA. The number of contigs will be recorded as the C<contigs> statistic, the
262  number of base pairs as the C<dna> statistic.  number of base pairs as the C<dna> statistic, and the number of GC instances as the
263    C<gc_content> statistic.
264    
265  =cut  =cut
266    
# Line 307  Line 368 
368      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
369      # Record the chunk.      # Record the chunk.
370      $self->{stats}->Add(chunks => 1);      $self->{stats}->Add(chunks => 1);
371        # Update the GC count.
372        $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//));
373  }  }
374    
375  =head3 OutputContig  =head3 OutputContig
# Line 357  Line 420 
420  sub LoadFeatures {  sub LoadFeatures {
421      # Get the parameters.      # Get the parameters.
422      my ($self) = @_;      my ($self) = @_;
423        # Read in the functional assignments.
424        Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
425        my $assignHash = $self->ReadAssignments();
426        $self->{assignHash} = $assignHash;
427      # Get the directory of feature types.      # Get the directory of feature types.
428      my $featureDir = "$self->{directory}/Features";      my $featureDir = "$self->{directory}/Features";
429      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
430        # Check for protein sequences. If we have some, load them into a hash.
431        my $protHash = {};
432        if (-f "$featureDir/peg/fasta") {
433            Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
434            $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
435        }
436      # Create the feature records for the types found.      # Create the feature records for the types found.
437      for my $type (@types) {      for my $type (@types) {
438          # Insure this is a genuine feature directory.          # Insure this is a genuine feature directory.
439          if (-f "$featureDir/$type/tbl") {          if (-f "$featureDir/$type/tbl") {
440              # Yes, load the feature data.              # Yes. Read in the evidence codes (if any).
441              $self->LoadFeatureData($featureDir, $type);              my $evHash = {};
442                my $tranFile = "$featureDir/$type/Attributes/transaction_log";
443                if (-f $tranFile) {
444                    $evHash = $self->LoadEvidenceCodes($tranFile);
445          }          }
446                # Now load the feature data.
447                $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
448      }      }
     # Check for protein sequences. If we have some, load them into the database.  
     if (-f "$featureDir/peg/fasta") {  
         $self->LoadProteinData("$featureDir/peg/fasta");  
449      }      }
450  }  }
451    
452    =head3 LoadEvidenceCodes
453    
454        my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
455    
456    Load the evidence codes from the specified attribute transaction log file into a
457    hash. The log file is in tab-delimited format. The first column contains the
458    transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
459    the third an attribute name (we'll ignore everything but C<evidence_code>), and
460    the fourth the attribute value.
461    
462    =over 4
463    
464    =item attributeFile
465    
466    Name of the attribute transaction log file.
467    
468    =item RETURN
469    
470    Returns a reference to a hash mapping each feature ID to a comma-delimited list
471    of its evidence codes.
472    
473    =back
474    
475    =cut
476    
477    sub LoadEvidenceCodes {
478        # Get the parameters.
479        my ($self, $attributeFile) = @_;
480        # Get the Sapling database.
481        my $sap = $self->{sap};
482        # Get the statistics object.
483        my $stats = $self->{stats};
484        # Get the assignment hash: we use this to filter the feature IDs.
485        my $assignHash = $self->{assignHash};
486        # Open the attribute log file for input.
487        my $ih = Open(undef, "<$attributeFile");
488        # This two-dimensional hash will hold the evidence codes for each feature.
489        my %retVal;
490        # Loop through the attribute log file.
491        while (! eof $ih) {
492            # Get the current attribute record.
493            my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
494            $stats->Add(attributeLine => 1);
495            # Insure we have all the pieces we need.
496            if (! $command || ! $fid || $key ne 'evidence_code') {
497                $stats->Add(attributeLineSkipped => 1);
498            } elsif (! $assignHash->{$fid}) {
499                # Here the attribute is for a deleted feature.
500                $stats->Add(attributeFidSkipped => 1);
501            } else {
502                # Get the sub-hash for this feature.
503                if (! exists $retVal{$fid}) {
504                    $retVal{$fid} = {};
505                }
506                my $featureSubHash = $retVal{$fid};
507                # Process according to the command.
508                if ($command eq 'ADD') {
509                    # Here we are adding a new evidence code.
510                    $featureSubHash->{$value} = 1;
511                    $stats->Add(attributeAdd => 1);
512                } elsif ($command eq 'DELETE') {
513                    # Here we are deleting an evidence code.
514                    delete $featureSubHash->{$value};
515                    $stats->Add(attributeDelete => 1);
516                } else {
517                    # here we have an unrecognized command.
518                    $stats->Add(attributeCommandSkip => 1);
519                }
520            }
521        }
522        # Loop through the hash, converting each sub-hash to a comma-delimited list of
523        # evidence codes.
524        for my $fid (keys %retVal) {
525            $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
526        }
527        # Return the evidence hash.
528        return \%retVal;
529    }
530    
531    
532  =head3 LoadFeatureData  =head3 LoadFeatureData
533    
534      $loaderObject->LoadFeatureData($featureDir, $type);      $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
535    
536  Load the basic data for each feature into the database. The number of features of  Load the basic data for each feature into the database. The number of features of
537  the type found will be recorded in the statistics object.  the type found will be recorded in the statistics object.
# Line 391  Line 546 
546    
547  Type of feature to load.  Type of feature to load.
548    
549    =item protHash
550    
551    Reference to a hash mapping each feature ID for a protein-encoding gene to
552    its protein sequence.
553    
554    =item evHash
555    
556    Reference to a hash mapping each feature ID to a comma-delimited list of
557    its evidence codes (if any).
558    
559  =back  =back
560    
561  =cut  =cut
562    
563  sub LoadFeatureData {  sub LoadFeatureData {
564      # Get the parameters.      # Get the parameters.
565      my ($self, $featureDir, $type) = @_;      my ($self, $featureDir, $type, $protHash, $evHash) = @_;
566      # Get the sapling database.      # Get the sapling database.
567      my $sap = $self->{sap};      my $sap = $self->{sap};
     # Get the maximum location  segment length. We'll need this later.  
     my $maxLength = $sap->TuningParameter('maxLocationLength');  
568      # Get the statistics object.      # Get the statistics object.
569      my $stats = $self->{stats};      my $stats = $self->{stats};
570      # Read in the functional assignments.      # Get the assignment hash. This tells us our functional assignments. This method is
571      my $assignHash = $self->ReadAssignments();      # also where we remove the deleted features from it.
572        my $assignHash = $self->{assignHash};
573      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
574      # time, it overwrites the original.      # time, it overwrites the original.
575      my %fids;      my %fidHash;
576        # This hash tracks the deleted features. We don't want to update these.
577        my %deleted_features;
578      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
579      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
580      if (-f $fileName) {      if (-f $fileName) {
581          # We have one, so we can read through it. First, however, we need to get the list          # We have one, so we can read through it. First, however, we need to get the list
582          # of deleted features.          # of deleted features and remove them from the assignment hash. This insures
583          my %deletedFids;          # that they are not used by subsequent methods.
584          my $deleteFile = "$featureDir/$type/deleted.features";          my $deleteFile = "$featureDir/$type/deleted.features";
585          if (-f $deleteFile) {          if (-f $deleteFile) {
586              %deletedFids = map { $_ => 1 } Tracer::GetFile($deleteFile);              my $dh = Open(undef, "<$deleteFile");
587                while (! eof $dh) {
588                    my ($deletedFid) = Tracer::GetLine($dh);
589                    if (exists $assignHash->{$deletedFid}) {
590                        delete $assignHash->{$deletedFid};
591                        $stats->Add(deletedFid => 1);
592                        $deleted_features{$deletedFid} = 1;
593                    }
594                }
595          }          }
596          # Open the main file for input.          # Open the main file for input.
597            Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
598          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
599          while (! eof $ih) {          while (! eof $ih) {
600              # Read this feature's information.              # Read this feature's information.
601              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
602              # Only proceed if the feature is NOT deleted.              # Only proceed if the feature is NOT deleted.
603              if (! exists $deletedFids{$fid}) {              if (!$deleted_features{$fid}) {
604                  # If the feature already exists, delete it. (This should be extremely rare.)                  # If the feature already exists, delete it. (This should be extremely rare.)
605                  if (exists $fids{$fid}) {                  if ($fidHash{$fid}) {
606                      $sap->Delete(Feature => $fid);                      $sap->Delete(Feature => $fid);
607                      $stats->Add(duplicateFid => 1);                      $stats->Add(duplicateFid => 1);
608                  }                  }
609                  # Otherwise connect this feature to the genome.                  # If this is RNA, the alias list is always empty. Sometimes, the functional
610                  $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);                  # assignment is found there.
611                  # Now we must parse the locations. This will contain a list of the location                  if ($type eq 'rna') {
612                  # data 4-tuples (contig, start, dir, len).                      if (! $assignHash->{$fid}) {
613                  my @locData;                          $assignHash->{$fid} = $aliases[0];
614                  # This will contain the total sequence length.                      }
615                  my $length = 0;                      @aliases = ();
616                  # Loop through the locations.                  }
617                  for my $loc (split /\s*,\s*/, $locations) {                  # Add the feature to the database.
618                      # Get this location's components.                  my $function = $assignHash->{$fid} || "";
619                      my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);                  $self->AddFeature($fid, $function, $locations, \@aliases,
620                      # Normalize the contig.                                    $protHash->{$fid}, $evHash->{$fid});
621                      $self->FixContig(\$contig);                  # Denote we've added this feature, so that if a duplicate occurs we're ready.
622                      # Compute the direction.                  $fidHash{$fid} = 1;
                     if ($start <= $stop) {  
                         # Here we have the forward strand.  
                         my $len = $stop + 1 - $start;  
                         push @locData, [$contig, $start, '+', $len];  
                         $length += $len;  
                     } else {  
                         # Here we have the reverse strand.  
                         my $len = $start + 1 - $stop;  
                         push @locData, [$contig, $stop, '-', $len];  
                         $length += $len;  
                     }  
                 }  
                 # Now we can create the feature record.  
                 $sap->InsertObject('Feature', id => $fid, feature_type => $type,  
                                    function => $assignHash->{$fid}, locked => 0,  
                                    sequence_length => $length);  
                 $fids{$fid} = 1;  
                 $stats->Add($type => 1);  
                 # The next step is to connect the feature record to its locations. This  
                 # involves dividing the location into segments. The following variable  
                 # will count the total number of segments.  
                 my $segment = 0;  
                 # Loop through the locations.  
                 for my $loc (@locData) {  
                     # Get the current location's information.  
                     my ($contig, $left, $dir, $len) = @$loc;  
                     # Peel off any segments.  
                     while ($len > $maxLength) {  
                         # Process according to the direction.  
                         if ($dir eq '+') {  
                             # Forward strand: peel from the beginning.  
                             $self->ConnectLocation($fid, $contig, $segment, $left, $dir,  
                                                    $maxLength);  
                             $left += $maxLength;  
                         } else {  
                             # Reverse strand: peel from the end.  
                             $self->ConnectLocation($fid, $contig, $segment,  
                                                   $left + $len - $maxLength, $dir,  
                                                   $maxLength);  
                         }  
                         # Denote we've used up a segment.  
                         $len -= $maxLength;  
                         $segment++;  
                     }  
                     # Output the last segment.  
                     $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);  
                 }  
                 # Now we process the aliases and create the identifiers. We don't do this  
                 # for RNA, because the RNA function is stored in the aliases.  
                 if ($type ne 'rna') {  
                     for my $alias (@aliases) {  
                         my $normalized;  
                         # Determine the type.  
                         my $aliasType = AliasAnalysis::TypeOf($alias);  
                         $stats->Add(aliasAll => 1);  
                         # Is this a recognized type?  
                         if ($aliasType) {  
                             $stats->Add(aliasNormal => 1);  
                             # Yes. Write it normally.  
                             $self->CreateIdentifier($alias, B => $aliasType, $fid);  
                         } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {  
                             # No, but this is a specially-marked locus tag.  
                             $normalized = $1;  
                             $stats->Add(aliasLocus => 1);  
                             $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                         } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {  
                             # No, but this is a natural locus tag.  
                             $stats->Add(aliasLocus => 1);  
                             $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                         } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {  
                             # No, but this is a natural gene name.  
                             $stats->Add(aliasGene => 1);  
                             $self->CreateIdentifier($normalized, B => 'GENE', $fid);  
                         } elsif ($alias =~ /^\d+$/) {  
                             # Here it's a naked number, which means it's a GI number  
                             # of some sort.  
                             $stats->Add(aliasGI => 1);  
                             $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);  
                         } elsif ($alias =~ /^protein_id\|(.+)/) {  
                             # Here we have a REFSEQ protein ID. Right now we don't have a way to  
                             # handle that, because we don't know the feature's protein ID here.  
                             $stats->Add(aliasProtein => 1);  
                         } elsif ($alias =~ /[:|]/) {  
                             # Here it's an alias of an unknown type, so we skip it.  
                             $stats->Add(aliasUnknown => 1);  
                         } else {  
                             # Here it's a miscellaneous type.  
                             $stats->Add(aliasMisc => 1);  
                             $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);  
                         }  
                     }  
                 }  
             }  
         }  
     }  
     # Now loop through the features, connecting them to their roles. Note that deleted  
     # features will not be in the assignment hash.  
     for my $fid (keys %$assignHash) {  
         # Get the roles and the error count.  
         my ($roles, $errors) = SeedUtils::roles_for_loading($assignHash->{$fid});  
         # Accumulate the errors in the stats object.  
         $stats->Add(roleErrors => $errors);  
         # Is this a suspicious function?  
         if (! defined $roles) {  
             # Yes, so track it.  
             $stats->Add(badFunction => 1);  
         } else {  
             # No, connect the roles.  
             for my $role (@$roles) {  
                 # Insure this role exists.  
                 my $hypo = hypo($role);  
                 $self->InsureEntity(Role => $role, hypothetical => $hypo);  
                 # Connect it to the feature.  
                 $sap->InsertObject('IsFunctionalIn', from_link => $role, to_link => $fid);  
623              }              }
624          }          }
625      }      }
626  }  }
627    
628    
629  =head3 LoadProteinData  =head3 LoadProteinData
630    
631      $self->LoadProteinData($fileName);      my $protHash = $self->LoadProteinData($fileName);
632    
633  Load the protein sequences from the named FASTA file. The sequences will be stored  Load the protein sequences from the named FASTA file. The sequences will be stored
634  in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature  in a hash by FIG feature ID.
 records themselves will not be created (it is presumed they are already there).  
635    
636  =over 4  =over 4
637    
# Line 578  Line 639 
639    
640  Name of the FASTA file containing the protein sequences for this genome.  Name of the FASTA file containing the protein sequences for this genome.
641    
642    =item RETURN
643    
644    Returns a hash mapping feature IDs to protein sequences.
645    
646  =back  =back
647    
648  =cut  =cut
# Line 587  Line 652 
652      my ($self, $fileName) = @_;      my ($self, $fileName) = @_;
653      # Open the FASTA file for input.      # Open the FASTA file for input.
654      my $ih = Open(undef, "<$fileName");      my $ih = Open(undef, "<$fileName");
655        # Create the return hash.
656        my $retVal = {};
657      # We'll track the current protein in here.      # We'll track the current protein in here.
658      my $fid;      my $fid;
659      my $sequence = "";      my $sequence = "";
# Line 600  Line 667 
667              my $newFid = $1;              my $newFid = $1;
668              # Do we have an existing protein?              # Do we have an existing protein?
669              if (defined $fid) {              if (defined $fid) {
670                  # Yes. Write it out.                  # Yes. Store it in the hash.
671                  $self->WriteProtein($fid, $sequence);                  $retVal->{$fid} = $sequence;
672              }              }
673              # Initialize for the next protein.              # Initialize for the next protein.
674              $fid = $newFid;              $fid = $newFid;
# Line 613  Line 680 
680      }      }
681      # Do we have a residual protein.      # Do we have a residual protein.
682      if (defined $fid) {      if (defined $fid) {
683          # Yes. Write it out.          # Yes. Store it in the hash.
684          $self->WriteProtein($fid, $sequence);          $retVal->{$fid} = $sequence;
685      }      }
686        # Return the hash.
687        return $retVal;
688  }  }
689    
690    
691    =head3 LoadAnnotations
692    
693        $loaderObject->LoadAnnotations($fileName);
694    
695    Read in the annotation history information and use it to create annotation records.
696    
697    =over 4
698    
699    =item fileName
700    
701    Name of the annotation history file. This file is formatted with four fields per
702    record. Each field is on a separate line, with a double slash (C<//>) used as the
703    line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
704    (formatted as an integer), (2) the user name, and (3) the annotation text.
705    
706    =back
707    
708    =cut
709    
710    sub LoadAnnotations {
711        # Get the parameters.
712        my ($self, $fileName) = @_;
713        # Get the assignment Hash. We use this to filter out deleted features.
714        my $assignHash = $self->{assignHash};
715        # Get the Sapling database.
716        my $sap = $self->{sap};
717        # Get the statistics object.
718        my $stats = $self->{stats};
719        # Open the input file.
720        my $ih = Tracer::Open(undef, "<$fileName");
721        # Loop through the input.
722        while (! eof $ih) {
723            # Read in the peg, timestamp, and user ID.
724            my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
725            # Only proceed if the feature is not deleted.
726            if ($assignHash->{$fid}) {
727                # Add the annotation to this feature.
728                $self->MakeAnnotation($fid, $text, $user, $timestamp);
729            }
730        }
731    }
732    
733    
734  =head3 WriteProtein  =head3 WriteProtein
735    
736      $loaderObject->WriteProtein($fid, $sequence);      $loaderObject->WriteProtein($fid, $sequence);
# Line 643  Line 756 
756      # Get the parameters.      # Get the parameters.
757      my ($self, $fid, $sequence) = @_;      my ($self, $fid, $sequence) = @_;
758      # Compute the key of the protein sequence.      # Compute the key of the protein sequence.
759      my $protID = ERDB::DigestKey($sequence);      my $protID = $self->{sap}->ProteinID($sequence);
760      # Insure the protein exists.      # Insure the protein exists.
761      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
762      # Connect the feature to it.      # Connect the feature to it.
# Line 652  Line 765 
765    
766  =head3 LoadSubsystems  =head3 LoadSubsystems
767    
768      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems($subsysList);
769    
770  Load the subsystem data into the database. This includes all of the subsystems,  Load the subsystem data into the database. This requires looking through the
771  their categories, roles, and the bindings that connect them to this genome's features.  bindings and using them to connect to the subsystems that already exist in the
772    database. If the subsystem does not exist, its bindings will not be loaded.
773    
774    =over 4
775    
776    =item subsysList
777    
778    Reference to a list of subsystem IDs. If specified, only the subsystems in the
779    list will be processed. This is useful when a particular set of subsystems
780    has been replaced.
781    
782    =back
783    
784  =cut  =cut
785    
786  sub LoadSubsystems {  sub LoadSubsystems {
787      # Get the parameters.      # Get the parameters.
788      my ($self) = @_;      my ($self, $subsysList) = @_;
789      # Get the sapling object.      # Get the sapling object.
790      my $sap = $self->{sap};      my $sap = $self->{sap};
791      # Get the statistics object.      # Get the statistics object.
792      my $stats = $self->{stats};      my $stats = $self->{stats};
793      # Get the genome ID.      # Get the genome ID.
794      my $genome = $self->{genome};      my $genome = $self->{genome};
     # Connect to the Sapling server so we can get the subsystem variant and role information.  
     my $sapO = SAPserver->new();  
795      # This hash maps subsystem IDs to molecular machine IDs.      # This hash maps subsystem IDs to molecular machine IDs.
796      my %machines;      my %machines;
797      # This hash maps subsystem/role pairs to machine role IDs.      # This hash maps subsystem/role pairs to machine role IDs.
798      my %machineRoles;      my %machineRoles;
799      # The first step is to create the subsystems themselves and connect them to the      # This hash will contain the list of subsystems found in the database.
800      # genome. A list is given in the subsystems file of the Subsystems directory.      my %subsystems;
801        # We loop through the subsystems, looking for the ones already in the
802        # database. The list is given in the subsystems file of the Subsystems
803        # directory.
804      my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");      my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");
     # Create the field hash for the subsystem query.  
     my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)  
                     Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)  
                     Subsystem(version)  
                     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)  
                     Includes(sequence)  
                     Role(id) Role(hypothetical) Role(role-index));  
     my %fields = map { $_ => $_ } @fields;  
805      # Loop through the subsystems in the file, insuring we have them in the database.      # Loop through the subsystems in the file, insuring we have them in the database.
806      while (! eof $ih) {      while (! eof $ih) {
807          # Get this subsystem.          # Get this subsystem.
808          my ($subsystem, $variant) = Tracer::GetLine($ih);          my ($subsystem, $variant) = Tracer::GetLine($ih);
809            Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
810          # Normalize the subsystem name.          # Normalize the subsystem name.
811          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
812            # Insure the subsystem is in the database and is one we're interested in.
813            if ((! $subsysList || (grep { $_ eq $subsystem } @$subsysList)) &&
814                    $sap->Exists(Subsystem => $subsystem)) {
815                # We have the subsystem in the database. We need to compute the machine
816                # role IDs and create the molecular machine. First, we need to remember
817                # the subsystem.
818                $subsystems{$subsystem} = 1;
819          # Compute this subsystem's MD5.          # Compute this subsystem's MD5.
820          my $subsystemMD5 = ERDB::DigestKey($subsystem);          my $subsystemMD5 = ERDB::DigestKey($subsystem);
         # Insure the subsystem is in the database.  
         if (! $sap->Exists(Subsystem => $subsystem)) {  
             # The subsystem isn't present, so we need to read it. We get the subsystem,  
             # its roles, and its classification information. The variant will be added  
             # later if we need it.  
             my $roleList = $sapO->get(-objects => "Subsystem Includes Role",  
                                       -filter => {"Subsystem(id)" => ["=", $subsystem] },  
                                       -fields => \%fields);  
             # Only proceed if we found some roles.  
             if (@$roleList > 0) {  
                 # Get the subsystem information from the first role and create the subsystem.  
                 my $roleH = $roleList->[0];  
                 my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);  
                 $sap->InsertObject('Subsystem', %subFields);  
                 # Now loop through the roles. The Includes records are always inserted, but the  
                 # roles are only inserted if they don't already exist.  
                 for $roleH (@$roleList) {  
                     # Create the Includes record.  
                     my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);  
                     $sap->InsertObject('Includes', %incFields);  
                     # Insure we have the role in place.  
                     my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);  
                     my $roleID = $roleFields{id};  
                     delete $roleFields{id};  
                     $self->InsureEntity('Role', $roleID, %roleFields);  
                     # Compute the machine-role ID.  
                     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});  
                     $machineRoles{$subsystem}{$roleID} = $machineRoleID;  
                 }  
             }  
         } else {  
             # We already have the subsystem in the database, but we still need to compute the  
             # machine role IDs. We need information from the sapling server for this.  
             my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);  
821              # Loop through the roles.              # Loop through the roles.
822              my $roleList = $subHash->{$subsystem};              my @roleList = $sap->GetAll('Includes', 'Includes(from-link) = ?',
823              for my $roleTuple (@$roleList) {                      [$subsystem], "to-link abbreviation");
824                for my $roleTuple (@roleList) {
825                  my ($roleID, $abbr) = @$roleTuple;                  my ($roleID, $abbr) = @$roleTuple;
826                  my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);                  my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);
827                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;
828              }              }
829          }              # Next we need the variant code and key.
830          # Now we need to connect the variant to the subsystem. First, we compute the real              my $variantCode = BaseSaplingLoader::Starless($variant);
         # variant code by removing the asterisks.  
         my $variantCode = $variant;  
         $variantCode =~ s/^\*+//;  
         $variantCode =~ s/\*+$//;  
         # Compute the variant key.  
831          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
832          # Get the variant from the sapling server.              # Now we create the molecular machine connecting this genome to the
833          my $variantH = $sapO->get(-objects => "Variant",              # subsystem variant.
                                   -filter => {"Variant(id)" => ["=", $variantKey]},  
                                   -fields => {"Variant(code)" => "code",  
                                               "Variant(comment)" => "comment",  
                                               "Variant(type)" => "type",  
                                               "Variant(role-rule)" => "role-rule"},  
                                   -firstOnly => 1);  
         # Insure we have it in the database.  
         $self->InsureEntity('Variant', $variantKey, %$variantH);  
         $stats->Add(subsystems => 1);  
         # Connect it to the subsystem.  
         $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);  
         # Now we create the molecular machine connecting this genome to the subsystem  
         # variant.  
834          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
835          $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);          $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
836          $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');          $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
# Line 764  Line 838 
838          # Remember the machine ID.          # Remember the machine ID.
839          $machines{$subsystem} = $machineID;          $machines{$subsystem} = $machineID;
840      }      }
841      # Now we go through the bindings file. This file connects the subsystem roles to      }
842      # the molecular machines.      # Now we go through the bindings file. This file connects the subsystem
843        # roles to the molecular machines.
844      $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");      $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");
845      # Loop through the bindings.      # Loop through the bindings.
846      while (! eof $ih) {      while (! eof $ih) {
# Line 773  Line 848 
848          my ($subsystem, $role, $fid) = Tracer::GetLine($ih);          my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
849          # Normalize the subsystem name.          # Normalize the subsystem name.
850          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
851            # Insure the subsystem is in the database.
852            if ($subsystems{$subsystem}) {
853          # Compute the machine role.          # Compute the machine role.
854          my $machineRoleID = $machineRoles{$subsystem}{$role};          my $machineRoleID = $machineRoles{$subsystem}{$role};
855          # Insure it exists.          # Insure it exists.
# Line 788  Line 865 
865          $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);          $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
866      }      }
867  }  }
868    }
869    
870  =head3 CreateGenome  =head3 CreateGenome
871    
# Line 845  Line 923 
923      $fields{'dna-size'} = $stats->Ask('dna');      $fields{'dna-size'} = $stats->Ask('dna');
924      $fields{pegs} = $stats->Ask('peg');      $fields{pegs} = $stats->Ask('peg');
925      $fields{rnas} = $stats->Ask('rna');      $fields{rnas} = $stats->Ask('rna');
926        $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna');
927      # Get the genetic code. The default is 11.      # Get the genetic code. The default is 11.
928      $fields{'genetic-code'} = 11;      $fields{'genetic-code'} = 11;
929      my $geneticCodeFile = "$dir/GENETIC_CODE";      my $geneticCodeFile = "$dir/GENETIC_CODE";
# Line 954  Line 1033 
1033    
1034  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1035    
1036  =head3 CreateIdentifier  =head3 ReadAnnotation
1037    
1038      $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);      my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1039    
1040  Link an identifier to a feature. The identifier is presented in prefixed form and is of the  Read the next record from an annotation file. The next record must exist (that is, an
1041  specified type and the specified confidence level.  end-of-file check should have been performed before calling this method).
1042    
1043  =over 4  =over 4
1044    
1045  =item alias  =item ih
   
 Identifier to connect to the feature.  
1046    
1047  =item conf  Open file handle for the annotation file.
1048    
1049  Confidence level (C<A> curated, C<B> normal, C<C> protein only).  =item RETURN
   
 =item aliasType  
   
 Type of alias (e.g. C<NCBI>, C<LocusTag>).  
   
 =item fid  
1050    
1051  ID of the relevant feature.  Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1052    timestamp, (2) the user ID, and (3) the annotation text.
1053    
1054  =back  =back
1055    
1056  =cut  =cut
1057    
1058  sub CreateIdentifier {  sub ReadAnnotation {
1059      # Get the parameters.      # Get the parameter.
1060      my ($self, $alias, $conf, $aliasType, $fid) = @_;      my ($ih) = @_;
1061      # Get the Sapling object.      # Read the three fixed fields.
1062      my $sap = $self->{sap};      my $fid = <$ih>; chomp $fid;
1063      # Compute the identifier's natural form.      my $timestamp = <$ih>; chomp $timestamp;
1064      my $natural = $alias;      my $user = <$ih>; chomp $user;
1065      if ($natural =~ /[:|](.+)/) {      # Loop through the lines of the text field.
1066          $natural = $1;      my $text = "";
1067      }      my $line = <$ih>;
1068      # Insure the identifier exists in the database.      while (defined($line) && $line ne "//\n") {
1069      $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);          $text .= $line;
1070      # Connect the identifier to the feature.          $line = <$ih>;
1071      $sap->InsertObject('IsIdentifiedBy', to_link => $alias, from_link => $fid, conf => $conf);      }
1072        # Remove the trailing new-line from the text.
1073        chomp $text;
1074        # Return the fields.
1075        return ($fid, $timestamp, $user, $text);
1076  }  }
1077    
1078  1;  1;

Legend:
Removed from v.1.5  
changed lines
  Added in v.1.17

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3