[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.5, Sun Feb 13 13:02:30 2011 UTC revision 1.14, Wed Aug 24 21:11:36 2011 UTC
# Line 57  Line 57 
57    
58  Name of the directory containing the genome information.  Name of the directory containing the genome information.
59    
60    =item assignHash
61    
62    Hash of feature IDs to functional assignments. Deleted features are removed, which
63    means only features listed in this hash can be legally inserted into the database.
64    
65  =back  =back
66    
67  =cut  =cut
# Line 67  Line 72 
72      # Create the loader object.      # Create the loader object.
73      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
74      # Load the contigs.      # Load the contigs.
75      Trace("Loading contigs for $genome.") if T(2);      Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
76      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
77      # Load the features.      # Load the features.
78      Trace("Loading features for $genome.") if T(2);      Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
79      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
80        # Check for annotation history. If we have it, load the history records into the
81        # database.
82        if (-f "$directory/annotations") {
83            Trace("Processing annotations.") if T(SaplingDataLoader => 3);
84            $loaderObject->LoadAnnotations("$directory/annotations");
85        }
86      # Load the subsystem bindings.      # Load the subsystem bindings.
87      Trace("Loading subsystems for $genome.") if T(2);      Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
88      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
89      # Create the Genome record and taxonomy information.      # Create the Genome record and taxonomy information.
90      Trace("Creating root for $genome.") if T(2);      Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
91      $loaderObject->CreateGenome();      $loaderObject->CreateGenome();
92      # Return the statistics.      # Return the statistics.
93      return $loaderObject->{stats};      return $loaderObject->{stats};
# Line 112  Line 123 
123      my ($sap, $genome) = @_;      my ($sap, $genome) = @_;
124      # Create the statistics object.      # Create the statistics object.
125      my $stats = Stats->new();      my $stats = Stats->new();
126      # Delete the DNA.      # Delete the DNA sequences.
127      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');      my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
128        for my $seq (@seqs) {
129            my $delStats = $sap->Delete(DNASequence => $seq);
130            $stats->Accumulate($delStats);
131        }
132      # Delete the contigs.      # Delete the contigs.
133      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
134      # Delete the features.      # Delete the features.
# Line 128  Line 143 
143      return $stats;      return $stats;
144  }  }
145    
146    
147    =head3 Process
148    
149        my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
150    
151    Load genome data from the specified directory. If the genome data already
152    exists in the database, it will be deleted first.
153    
154    =over 4
155    
156    =item sap
157    
158    L</Sapling> object for accessing the database.
159    
160    =item genome
161    
162    ID of the genome whose  data is being loaded.
163    
164    =item directory
165    
166    Name of the directory containing the genome data files.
167    
168    =item RETURN
169    
170    Returns a statistics object describing the activity during the reload.
171    
172    =back
173    
174    =cut
175    
176    sub Process {
177        # Get the parameters.
178        my ($sap, $genome, $directory) = @_;
179        # Clear the existing data for the specified genome.
180        my $stats = ClearGenome($sap, $genome);
181        # Load the new expression data from the specified directory.
182        my $newStats = Load($sap, $genome, $directory);
183        # Merge the statistics.
184        $stats->Accumulate($newStats);
185        # Return the result.
186        return $stats;
187    }
188    
189    
190  =head2 Loader Object Methods  =head2 Loader Object Methods
191    
192  =head3 new  =head3 new
# Line 189  Line 248 
248      # Add our specialized data.      # Add our specialized data.
249      $retVal->{genome} = $genome;      $retVal->{genome} = $genome;
250      $retVal->{directory} = $directory;      $retVal->{directory} = $directory;
251        # Leave the assignment hash undefined until we populate it.
252        $retVal->{assignHash} = undef;
253      # Return the result.      # Return the result.
254      return $retVal;      return $retVal;
255  }  }
# Line 198  Line 259 
259      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
260    
261  Load the contig information into the database. This includes the contigs themselves and  Load the contig information into the database. This includes the contigs themselves and
262  the DNA. The number of contigs will be recorded as the C<contigs> statistic and the  the DNA. The number of contigs will be recorded as the C<contigs> statistic, the
263  number of base pairs as the C<dna> statistic.  number of base pairs as the C<dna> statistic, and the number of GC instances as the
264    C<gc_content> statistic.
265    
266  =cut  =cut
267    
# Line 307  Line 369 
369      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
370      # Record the chunk.      # Record the chunk.
371      $self->{stats}->Add(chunks => 1);      $self->{stats}->Add(chunks => 1);
372        # Update the GC count.
373        $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//));
374  }  }
375    
376  =head3 OutputContig  =head3 OutputContig
# Line 357  Line 421 
421  sub LoadFeatures {  sub LoadFeatures {
422      # Get the parameters.      # Get the parameters.
423      my ($self) = @_;      my ($self) = @_;
424        # Read in the functional assignments.
425        Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
426        my $assignHash = $self->ReadAssignments();
427        $self->{assignHash} = $assignHash;
428      # Get the directory of feature types.      # Get the directory of feature types.
429      my $featureDir = "$self->{directory}/Features";      my $featureDir = "$self->{directory}/Features";
430      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
431        # Check for protein sequences. If we have some, load them into a hash.
432        my $protHash = {};
433        if (-f "$featureDir/peg/fasta") {
434            Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
435            $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
436        }
437      # Create the feature records for the types found.      # Create the feature records for the types found.
438      for my $type (@types) {      for my $type (@types) {
439          # Insure this is a genuine feature directory.          # Insure this is a genuine feature directory.
440          if (-f "$featureDir/$type/tbl") {          if (-f "$featureDir/$type/tbl") {
441              # Yes, load the feature data.              # Yes. Read in the evidence codes (if any).
442              $self->LoadFeatureData($featureDir, $type);              my $evHash = {};
443                my $tranFile = "$featureDir/$type/Attributes/transaction_log";
444                if (-f $tranFile) {
445                    $evHash = $self->LoadEvidenceCodes($tranFile);
446          }          }
447                # Now load the feature data.
448                $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
449      }      }
     # Check for protein sequences. If we have some, load them into the database.  
     if (-f "$featureDir/peg/fasta") {  
         $self->LoadProteinData("$featureDir/peg/fasta");  
450      }      }
451  }  }
452    
453    =head3 LoadEvidenceCodes
454    
455        my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
456    
457    Load the evidence codes from the specified attribute transaction log file into a
458    hash. The log file is in tab-delimited format. The first column contains the
459    transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
460    the third an attribute name (we'll ignore everything but C<evidence_code>), and
461    the fourth the attribute value.
462    
463    =over 4
464    
465    =item attributeFile
466    
467    Name of the attribute transaction log file.
468    
469    =item RETURN
470    
471    Returns a reference to a hash mapping each feature ID to a comma-delimited list
472    of its evidence codes.
473    
474    =back
475    
476    =cut
477    
478    sub LoadEvidenceCodes {
479        # Get the parameters.
480        my ($self, $attributeFile) = @_;
481        # Get the Sapling database.
482        my $sap = $self->{sap};
483        # Get the statistics object.
484        my $stats = $self->{stats};
485        # Get the assignment hash: we use this to filter the feature IDs.
486        my $assignHash = $self->{assignHash};
487        # Open the attribute log file for input.
488        my $ih = Open(undef, "<$attributeFile");
489        # This two-dimensional hash will hold the evidence codes for each feature.
490        my %retVal;
491        # Loop through the attribute log file.
492        while (! eof $ih) {
493            # Get the current attribute record.
494            my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
495            $stats->Add(attributeLine => 1);
496            # Insure we have all the pieces we need.
497            if (! $command || ! $fid || $key ne 'evidence_code') {
498                $stats->Add(attributeLineSkipped => 1);
499            } elsif (! $assignHash->{$fid}) {
500                # Here the attribute is for a deleted feature.
501                $stats->Add(attributeFidSkipped => 1);
502            } else {
503                # Get the sub-hash for this feature.
504                if (! exists $retVal{$fid}) {
505                    $retVal{$fid} = {};
506                }
507                my $featureSubHash = $retVal{$fid};
508                # Process according to the command.
509                if ($command eq 'ADD') {
510                    # Here we are adding a new evidence code.
511                    $featureSubHash->{$value} = 1;
512                    $stats->Add(attributeAdd => 1);
513                } elsif ($command eq 'DELETE') {
514                    # Here we are deleting an evidence code.
515                    delete $featureSubHash->{$value};
516                    $stats->Add(attributeDelete => 1);
517                } else {
518                    # here we have an unrecognized command.
519                    $stats->Add(attributeCommandSkip => 1);
520                }
521            }
522        }
523        # Loop through the hash, converting each sub-hash to a comma-delimited list of
524        # evidence codes.
525        for my $fid (keys %retVal) {
526            $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
527        }
528        # Return the evidence hash.
529        return \%retVal;
530    }
531    
532    
533  =head3 LoadFeatureData  =head3 LoadFeatureData
534    
535      $loaderObject->LoadFeatureData($featureDir, $type);      $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
536    
537  Load the basic data for each feature into the database. The number of features of  Load the basic data for each feature into the database. The number of features of
538  the type found will be recorded in the statistics object.  the type found will be recorded in the statistics object.
# Line 391  Line 547 
547    
548  Type of feature to load.  Type of feature to load.
549    
550    =item protHash
551    
552    Reference to a hash mapping each feature ID for a protein-encoding gene to
553    its protein sequence.
554    
555    =item evHash
556    
557    Reference to a hash mapping each feature ID to a comma-delimited list of
558    its evidence codes (if any).
559    
560  =back  =back
561    
562  =cut  =cut
563    
564  sub LoadFeatureData {  sub LoadFeatureData {
565      # Get the parameters.      # Get the parameters.
566      my ($self, $featureDir, $type) = @_;      my ($self, $featureDir, $type, $protHash, $evHash) = @_;
567      # Get the sapling database.      # Get the sapling database.
568      my $sap = $self->{sap};      my $sap = $self->{sap};
     # Get the maximum location  segment length. We'll need this later.  
     my $maxLength = $sap->TuningParameter('maxLocationLength');  
569      # Get the statistics object.      # Get the statistics object.
570      my $stats = $self->{stats};      my $stats = $self->{stats};
571      # Read in the functional assignments.      # Get the assignment hash. This tells us our functional assignments. This method is
572      my $assignHash = $self->ReadAssignments();      # also where we remove the deleted features from it.
573        my $assignHash = $self->{assignHash};
574      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
575      # time, it overwrites the original.      # time, it overwrites the original.
576      my %fids;      my %fidHash;
577        # Finally, we need the timestamp hash. The initial feature population
578      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
579      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
580      if (-f $fileName) {      if (-f $fileName) {
581          # We have one, so we can read through it. First, however, we need to get the list          # We have one, so we can read through it. First, however, we need to get the list
582          # of deleted features.          # of deleted features and remove them from the assignment hash. This insures
583          my %deletedFids;          # that they are not used by subsequent methods.
584          my $deleteFile = "$featureDir/$type/deleted.features";          my $deleteFile = "$featureDir/$type/deleted.features";
585          if (-f $deleteFile) {          if (-f $deleteFile) {
586              %deletedFids = map { $_ => 1 } Tracer::GetFile($deleteFile);              my $dh = Open(undef, "<$deleteFile");
587                while (! eof $dh) {
588                    my ($deletedFid) = Tracer::GetLine($dh);
589                    if (exists $assignHash->{$deletedFid}) {
590                        delete $assignHash->{$deletedFid};
591                        $stats->Add(deletedFid => 1);
592                    }
593                }
594          }          }
595          # Open the main file for input.          # Open the main file for input.
596            Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
597          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
598          while (! eof $ih) {          while (! eof $ih) {
599              # Read this feature's information.              # Read this feature's information.
600              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
601              # Only proceed if the feature is NOT deleted.              # Only proceed if the feature is NOT deleted.
602              if (! exists $deletedFids{$fid}) {              if (exists $assignHash->{$fid}) {
603                  # If the feature already exists, delete it. (This should be extremely rare.)                  # If the feature already exists, delete it. (This should be extremely rare.)
604                  if (exists $fids{$fid}) {                  if ($fidHash{$fid}) {
605                      $sap->Delete(Feature => $fid);                      $sap->Delete(Feature => $fid);
606                      $stats->Add(duplicateFid => 1);                      $stats->Add(duplicateFid => 1);
607                  }                  }
608                  # Otherwise connect this feature to the genome.                  # If this is RNA, the alias list is always empty. Sometimes, the functional
609                  $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);                  # assignment is found there.
610                  # Now we must parse the locations. This will contain a list of the location                  if ($type eq 'rna') {
611                  # data 4-tuples (contig, start, dir, len).                      if (! $assignHash->{$fid}) {
612                  my @locData;                          $assignHash->{$fid} = $aliases[0];
613                  # This will contain the total sequence length.                      }
614                  my $length = 0;                      @aliases = ();
615                  # Loop through the locations.                  }
616                  for my $loc (split /\s*,\s*/, $locations) {                  # Add the feature to the database.
617                      # Get this location's components.                  $self->AddFeature($fid, $assignHash->{$fid}, $locations, \@aliases,
618                      my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);                                    $protHash->{$fid}, $evHash->{$fid});
619                      # Normalize the contig.                  # Denote we've added this feature, so that if a duplicate occurs we're ready.
620                      $self->FixContig(\$contig);                  $fidHash{$fid} = 1;
                     # Compute the direction.  
                     if ($start <= $stop) {  
                         # Here we have the forward strand.  
                         my $len = $stop + 1 - $start;  
                         push @locData, [$contig, $start, '+', $len];  
                         $length += $len;  
                     } else {  
                         # Here we have the reverse strand.  
                         my $len = $start + 1 - $stop;  
                         push @locData, [$contig, $stop, '-', $len];  
                         $length += $len;  
                     }  
                 }  
                 # Now we can create the feature record.  
                 $sap->InsertObject('Feature', id => $fid, feature_type => $type,  
                                    function => $assignHash->{$fid}, locked => 0,  
                                    sequence_length => $length);  
                 $fids{$fid} = 1;  
                 $stats->Add($type => 1);  
                 # The next step is to connect the feature record to its locations. This  
                 # involves dividing the location into segments. The following variable  
                 # will count the total number of segments.  
                 my $segment = 0;  
                 # Loop through the locations.  
                 for my $loc (@locData) {  
                     # Get the current location's information.  
                     my ($contig, $left, $dir, $len) = @$loc;  
                     # Peel off any segments.  
                     while ($len > $maxLength) {  
                         # Process according to the direction.  
                         if ($dir eq '+') {  
                             # Forward strand: peel from the beginning.  
                             $self->ConnectLocation($fid, $contig, $segment, $left, $dir,  
                                                    $maxLength);  
                             $left += $maxLength;  
                         } else {  
                             # Reverse strand: peel from the end.  
                             $self->ConnectLocation($fid, $contig, $segment,  
                                                   $left + $len - $maxLength, $dir,  
                                                   $maxLength);  
                         }  
                         # Denote we've used up a segment.  
                         $len -= $maxLength;  
                         $segment++;  
                     }  
                     # Output the last segment.  
                     $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);  
                 }  
                 # Now we process the aliases and create the identifiers. We don't do this  
                 # for RNA, because the RNA function is stored in the aliases.  
                 if ($type ne 'rna') {  
                     for my $alias (@aliases) {  
                         my $normalized;  
                         # Determine the type.  
                         my $aliasType = AliasAnalysis::TypeOf($alias);  
                         $stats->Add(aliasAll => 1);  
                         # Is this a recognized type?  
                         if ($aliasType) {  
                             $stats->Add(aliasNormal => 1);  
                             # Yes. Write it normally.  
                             $self->CreateIdentifier($alias, B => $aliasType, $fid);  
                         } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {  
                             # No, but this is a specially-marked locus tag.  
                             $normalized = $1;  
                             $stats->Add(aliasLocus => 1);  
                             $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                         } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {  
                             # No, but this is a natural locus tag.  
                             $stats->Add(aliasLocus => 1);  
                             $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                         } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {  
                             # No, but this is a natural gene name.  
                             $stats->Add(aliasGene => 1);  
                             $self->CreateIdentifier($normalized, B => 'GENE', $fid);  
                         } elsif ($alias =~ /^\d+$/) {  
                             # Here it's a naked number, which means it's a GI number  
                             # of some sort.  
                             $stats->Add(aliasGI => 1);  
                             $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);  
                         } elsif ($alias =~ /^protein_id\|(.+)/) {  
                             # Here we have a REFSEQ protein ID. Right now we don't have a way to  
                             # handle that, because we don't know the feature's protein ID here.  
                             $stats->Add(aliasProtein => 1);  
                         } elsif ($alias =~ /[:|]/) {  
                             # Here it's an alias of an unknown type, so we skip it.  
                             $stats->Add(aliasUnknown => 1);  
                         } else {  
                             # Here it's a miscellaneous type.  
                             $stats->Add(aliasMisc => 1);  
                             $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);  
                         }  
                     }  
                 }  
             }  
         }  
     }  
     # Now loop through the features, connecting them to their roles. Note that deleted  
     # features will not be in the assignment hash.  
     for my $fid (keys %$assignHash) {  
         # Get the roles and the error count.  
         my ($roles, $errors) = SeedUtils::roles_for_loading($assignHash->{$fid});  
         # Accumulate the errors in the stats object.  
         $stats->Add(roleErrors => $errors);  
         # Is this a suspicious function?  
         if (! defined $roles) {  
             # Yes, so track it.  
             $stats->Add(badFunction => 1);  
         } else {  
             # No, connect the roles.  
             for my $role (@$roles) {  
                 # Insure this role exists.  
                 my $hypo = hypo($role);  
                 $self->InsureEntity(Role => $role, hypothetical => $hypo);  
                 # Connect it to the feature.  
                 $sap->InsertObject('IsFunctionalIn', from_link => $role, to_link => $fid);  
621              }              }
622          }          }
623      }      }
624  }  }
625    
626    
627  =head3 LoadProteinData  =head3 LoadProteinData
628    
629      $self->LoadProteinData($fileName);      my $protHash = $self->LoadProteinData($fileName);
630    
631  Load the protein sequences from the named FASTA file. The sequences will be stored  Load the protein sequences from the named FASTA file. The sequences will be stored
632  in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature  in a hash by FIG feature ID.
 records themselves will not be created (it is presumed they are already there).  
633    
634  =over 4  =over 4
635    
# Line 578  Line 637 
637    
638  Name of the FASTA file containing the protein sequences for this genome.  Name of the FASTA file containing the protein sequences for this genome.
639    
640    =item RETURN
641    
642    Returns a hash mapping feature IDs to protein sequences.
643    
644  =back  =back
645    
646  =cut  =cut
# Line 587  Line 650 
650      my ($self, $fileName) = @_;      my ($self, $fileName) = @_;
651      # Open the FASTA file for input.      # Open the FASTA file for input.
652      my $ih = Open(undef, "<$fileName");      my $ih = Open(undef, "<$fileName");
653        # Create the return hash.
654        my $retVal = {};
655      # We'll track the current protein in here.      # We'll track the current protein in here.
656      my $fid;      my $fid;
657      my $sequence = "";      my $sequence = "";
# Line 600  Line 665 
665              my $newFid = $1;              my $newFid = $1;
666              # Do we have an existing protein?              # Do we have an existing protein?
667              if (defined $fid) {              if (defined $fid) {
668                  # Yes. Write it out.                  # Yes. Store it in the hash.
669                  $self->WriteProtein($fid, $sequence);                  $retVal->{$fid} = $sequence;
670              }              }
671              # Initialize for the next protein.              # Initialize for the next protein.
672              $fid = $newFid;              $fid = $newFid;
# Line 613  Line 678 
678      }      }
679      # Do we have a residual protein.      # Do we have a residual protein.
680      if (defined $fid) {      if (defined $fid) {
681          # Yes. Write it out.          # Yes. Store it in the hash.
682          $self->WriteProtein($fid, $sequence);          $retVal->{$fid} = $sequence;
683        }
684        # Return the hash.
685        return $retVal;
686    }
687    
688    
689    =head3 LoadAnnotations
690    
691        $loaderObject->LoadAnnotations($fileName);
692    
693    Read in the annotation history information and use it to create annotation records.
694    
695    =over 4
696    
697    =item fileName
698    
699    Name of the annotation history file. This file is formatted with four fields per
700    record. Each field is on a separate line, with a double slash (C<//>) used as the
701    line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
702    (formatted as an integer), (2) the user name, and (3) the annotation text.
703    
704    =back
705    
706    =cut
707    
708    sub LoadAnnotations {
709        # Get the parameters.
710        my ($self, $fileName) = @_;
711        # Get the assignment Hash. We use this to filter out deleted features.
712        my $assignHash = $self->{assignHash};
713        # Get the Sapling database.
714        my $sap = $self->{sap};
715        # Get the statistics object.
716        my $stats = $self->{stats};
717        # Open the input file.
718        my $ih = Tracer::Open(undef, "<$fileName");
719        # Loop through the input.
720        while (! eof $ih) {
721            # Read in the peg, timestamp, and user ID.
722            my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
723            # Only proceed if the feature is not deleted.
724            if ($assignHash->{$fid}) {
725                # Add the annotation to this feature.
726                $self->MakeAnnotation($fid, $text, $user, $timestamp);
727      }      }
728  }  }
729    }
730    
731    
732  =head3 WriteProtein  =head3 WriteProtein
733    
# Line 643  Line 754 
754      # Get the parameters.      # Get the parameters.
755      my ($self, $fid, $sequence) = @_;      my ($self, $fid, $sequence) = @_;
756      # Compute the key of the protein sequence.      # Compute the key of the protein sequence.
757      my $protID = ERDB::DigestKey($sequence);      my $protID = $self->{sap}->ProteinID($sequence);
758      # Insure the protein exists.      # Insure the protein exists.
759      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
760      # Connect the feature to it.      # Connect the feature to it.
# Line 689  Line 800 
800      while (! eof $ih) {      while (! eof $ih) {
801          # Get this subsystem.          # Get this subsystem.
802          my ($subsystem, $variant) = Tracer::GetLine($ih);          my ($subsystem, $variant) = Tracer::GetLine($ih);
803            Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
804          # Normalize the subsystem name.          # Normalize the subsystem name.
805          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
806          # Compute this subsystem's MD5.          # Compute this subsystem's MD5.
# Line 707  Line 819 
819                  my $roleH = $roleList->[0];                  my $roleH = $roleList->[0];
820                  my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);                  my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
821                  $sap->InsertObject('Subsystem', %subFields);                  $sap->InsertObject('Subsystem', %subFields);
822                    $stats->Add(subsystems => 1);
823                  # Now loop through the roles. The Includes records are always inserted, but the                  # Now loop through the roles. The Includes records are always inserted, but the
824                  # roles are only inserted if they don't already exist.                  # roles are only inserted if they don't already exist.
825                  for $roleH (@$roleList) {                  for $roleH (@$roleList) {
# Line 721  Line 834 
834                      # Compute the machine-role ID.                      # Compute the machine-role ID.
835                      my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});                      my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
836                      $machineRoles{$subsystem}{$roleID} = $machineRoleID;                      $machineRoles{$subsystem}{$roleID} = $machineRoleID;
837                        $stats->Add(subsystemRoles => 1);
838                  }                  }
839              }              }
840          } else {          } else {
# Line 742  Line 856 
856          $variantCode =~ s/\*+$//;          $variantCode =~ s/\*+$//;
857          # Compute the variant key.          # Compute the variant key.
858          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
859            # Insure we have it in the database.
860            if (! $sap->Exists(Variant => $variantKey)) {
861          # Get the variant from the sapling server.          # Get the variant from the sapling server.
862          my $variantH = $sapO->get(-objects => "Variant",          my $variantH = $sapO->get(-objects => "Variant",
863                                    -filter => {"Variant(id)" => ["=", $variantKey]},                                    -filter => {"Variant(id)" => ["=", $variantKey]},
# Line 750  Line 866 
866                                                "Variant(type)" => "type",                                                "Variant(type)" => "type",
867                                                "Variant(role-rule)" => "role-rule"},                                                "Variant(role-rule)" => "role-rule"},
868                                    -firstOnly => 1);                                    -firstOnly => 1);
         # Insure we have it in the database.  
869          $self->InsureEntity('Variant', $variantKey, %$variantH);          $self->InsureEntity('Variant', $variantKey, %$variantH);
         $stats->Add(subsystems => 1);  
870          # Connect it to the subsystem.          # Connect it to the subsystem.
871          $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);          $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
872                $stats->Add(subsystemVariants => 1);
873            }
874          # Now we create the molecular machine connecting this genome to the subsystem          # Now we create the molecular machine connecting this genome to the subsystem
875          # variant.          # variant.
876          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
# Line 845  Line 961 
961      $fields{'dna-size'} = $stats->Ask('dna');      $fields{'dna-size'} = $stats->Ask('dna');
962      $fields{pegs} = $stats->Ask('peg');      $fields{pegs} = $stats->Ask('peg');
963      $fields{rnas} = $stats->Ask('rna');      $fields{rnas} = $stats->Ask('rna');
964        $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna');
965      # Get the genetic code. The default is 11.      # Get the genetic code. The default is 11.
966      $fields{'genetic-code'} = 11;      $fields{'genetic-code'} = 11;
967      my $geneticCodeFile = "$dir/GENETIC_CODE";      my $geneticCodeFile = "$dir/GENETIC_CODE";
# Line 954  Line 1071 
1071    
1072  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1073    
1074  =head3 CreateIdentifier  =head3 ReadAnnotation
1075    
1076      $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);      my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1077    
1078  Link an identifier to a feature. The identifier is presented in prefixed form and is of the  Read the next record from an annotation file. The next record must exist (that is, an
1079  specified type and the specified confidence level.  end-of-file check should have been performed before calling this method).
1080    
1081  =over 4  =over 4
1082    
1083  =item alias  =item ih
   
 Identifier to connect to the feature.  
1084    
1085  =item conf  Open file handle for the annotation file.
1086    
1087  Confidence level (C<A> curated, C<B> normal, C<C> protein only).  =item RETURN
   
 =item aliasType  
   
 Type of alias (e.g. C<NCBI>, C<LocusTag>).  
   
 =item fid  
1088    
1089  ID of the relevant feature.  Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1090    timestamp, (2) the user ID, and (3) the annotation text.
1091    
1092  =back  =back
1093    
1094  =cut  =cut
1095    
1096  sub CreateIdentifier {  sub ReadAnnotation {
1097      # Get the parameters.      # Get the parameter.
1098      my ($self, $alias, $conf, $aliasType, $fid) = @_;      my ($ih) = @_;
1099      # Get the Sapling object.      # Read the three fixed fields.
1100      my $sap = $self->{sap};      my $fid = <$ih>; chomp $fid;
1101      # Compute the identifier's natural form.      my $timestamp = <$ih>; chomp $timestamp;
1102      my $natural = $alias;      my $user = <$ih>; chomp $user;
1103      if ($natural =~ /[:|](.+)/) {      # Loop through the lines of the text field.
1104          $natural = $1;      my $text = "";
1105      }      my $line = <$ih>;
1106      # Insure the identifier exists in the database.      while (defined($line) && $line ne "//\n") {
1107      $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);          $text .= $line;
1108      # Connect the identifier to the feature.          $line = <$ih>;
1109      $sap->InsertObject('IsIdentifiedBy', to_link => $alias, from_link => $fid, conf => $conf);      }
1110        # Remove the trailing new-line from the text.
1111        chomp $text;
1112        # Return the fields.
1113        return ($fid, $timestamp, $user, $text);
1114  }  }
1115    
1116  1;  1;

Legend:
Removed from v.1.5  
changed lines
  Added in v.1.14

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3