[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.8, Fri Apr 1 20:48:01 2011 UTC revision 1.15, Mon Dec 5 22:05:15 2011 UTC
# Line 57  Line 57 
57    
58  Name of the directory containing the genome information.  Name of the directory containing the genome information.
59    
60    =item disconnected
61    
62    True if the application is disconnected from the network - do not
63    attempt to contact a SAP server for more data.
64    
65    =item assignHash
66    
67    Hash of feature IDs to functional assignments. Deleted features are removed, which
68    means only features listed in this hash can be legally inserted into the database.
69    
70  =back  =back
71    
72  =cut  =cut
73    
74  sub Load {  sub Load {
75      # Get the parameters.      # Get the parameters.
76      my ($sap, $genome, $directory) = @_;      my ($sap, $genome, $directory, $disconnected) = @_;
77      # Create the loader object.      # Create the loader object.
78      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory, $disconnected);
79      # Load the contigs.      # Load the contigs.
80      Trace("Loading contigs for $genome.") if T(2);      Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
81      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
82      # Load the features.      # Load the features.
83      Trace("Loading features for $genome.") if T(2);      Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
84      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
85      # Check for annotation history. If we have it, load the history records into the      # Check for annotation history. If we have it, load the history records into the
86      # database.      # database.
87      if (-f "$directory/annotations") {      if (-f "$directory/annotations") {
88          Trace("Processing annotations.") if T(3);          Trace("Processing annotations.") if T(SaplingDataLoader => 3);
89          $loaderObject->LoadAnnotations("$directory/annotations");          $loaderObject->LoadAnnotations("$directory/annotations");
90      }      }
91      # Load the subsystem bindings.      # Load the subsystem bindings.
92      Trace("Loading subsystems for $genome.") if T(2);      Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
93      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
94      # Create the Genome record and taxonomy information.      # Create the Genome record and taxonomy information.
95      Trace("Creating root for $genome.") if T(2);      Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
96      $loaderObject->CreateGenome();      $loaderObject->CreateGenome();
97      # Return the statistics.      # Return the statistics.
98      return $loaderObject->{stats};      return $loaderObject->{stats};
# Line 118  Line 128 
128      my ($sap, $genome) = @_;      my ($sap, $genome) = @_;
129      # Create the statistics object.      # Create the statistics object.
130      my $stats = Stats->new();      my $stats = Stats->new();
131      # Delete the DNA.      # Delete the DNA sequences.
132      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');      my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
133        for my $seq (@seqs) {
134            my $delStats = $sap->Delete(DNASequence => $seq);
135            $stats->Accumulate($delStats);
136        }
137      # Delete the contigs.      # Delete the contigs.
138      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
139      # Delete the features.      # Delete the features.
# Line 182  Line 196 
196    
197  =head3 new  =head3 new
198    
199      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory, $disconnected);
200    
201  Create a loader object that can be used to facilitate loading genome data from a  Create a loader object that can be used to facilitate loading genome data from a
202  directory.  directory.
# Line 201  Line 215 
215    
216  Name of the directory containing the genome information.  Name of the directory containing the genome information.
217    
218    =item disconnected
219    
220    Set to a true value if the application should be considered to be disconnected
221    from the network - that is, do not attempt to connect to a Sapling server
222    to load subsystem data.
223    
224  =back  =back
225    
226  The object created contains the following fields.  The object created contains the following fields.
# Line 227  Line 247 
247    
248  L<Stats> object for tracking statistical information about the load.  L<Stats> object for tracking statistical information about the load.
249    
 =item timestamps  
   
 A hash of hashes, keyed by feature ID. The sub-hashes are keyed by annotation timestamp,  
 and used to prevent duplicate timestamps.  
   
250  =back  =back
251    
252  =cut  =cut
253    
254  sub new {  sub new {
255      # Get the parameters.      # Get the parameters.
256      my ($class, $sap, $genome, $directory) = @_;      my ($class, $sap, $genome, $directory, $disconnected) = @_;
257      # Create the object.      # Create the object.
258      my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));      my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
259      # Add our specialized data.      # Add our specialized data.
260      $retVal->{genome} = $genome;      $retVal->{genome} = $genome;
261      $retVal->{directory} = $directory;      $retVal->{directory} = $directory;
262      $retVal->{timestamps} = {};      # Leave the assignment hash undefined until we populate it.
263        $retVal->{assignHash} = undef;
264        $retVal->{disconnected} = defined($disconnected) ? 1 : 0;
265      # Return the result.      # Return the result.
266      return $retVal;      return $retVal;
267  }  }
# Line 254  Line 271 
271      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
272    
273  Load the contig information into the database. This includes the contigs themselves and  Load the contig information into the database. This includes the contigs themselves and
274  the DNA. The number of contigs will be recorded as the C<contigs> statistic and the  the DNA. The number of contigs will be recorded as the C<contigs> statistic, the
275  number of base pairs as the C<dna> statistic.  number of base pairs as the C<dna> statistic, and the number of GC instances as the
276    C<gc_content> statistic.
277    
278  =cut  =cut
279    
# Line 363  Line 381 
381      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
382      # Record the chunk.      # Record the chunk.
383      $self->{stats}->Add(chunks => 1);      $self->{stats}->Add(chunks => 1);
384        # Update the GC count.
385        $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//));
386  }  }
387    
388  =head3 OutputContig  =head3 OutputContig
# Line 414  Line 434 
434      # Get the parameters.      # Get the parameters.
435      my ($self) = @_;      my ($self) = @_;
436      # Read in the functional assignments.      # Read in the functional assignments.
437      Trace("Reading functional assignments.") if T(3);      Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
438      my $assignHash = $self->ReadAssignments();      my $assignHash = $self->ReadAssignments();
439        $self->{assignHash} = $assignHash;
440      # Get the directory of feature types.      # Get the directory of feature types.
441      my $featureDir = "$self->{directory}/Features";      my $featureDir = "$self->{directory}/Features";
442      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
443        # Check for protein sequences. If we have some, load them into a hash.
444        my $protHash = {};
445        if (-f "$featureDir/peg/fasta") {
446            Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
447            $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
448        }
449      # Create the feature records for the types found.      # Create the feature records for the types found.
450      for my $type (@types) {      for my $type (@types) {
451          # Insure this is a genuine feature directory.          # Insure this is a genuine feature directory.
452          if (-f "$featureDir/$type/tbl") {          if (-f "$featureDir/$type/tbl") {
453              # Yes, load the feature data.              # Yes. Read in the evidence codes (if any).
454              $self->LoadFeatureData($featureDir, $type, $assignHash);              my $evHash = {};
455                my $tranFile = "$featureDir/$type/Attributes/transaction_log";
456                if (-f $tranFile) {
457                    $evHash = $self->LoadEvidenceCodes($tranFile);
458          }          }
459                # Now load the feature data.
460                $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
461      }      }
     # Check for protein sequences. If we have some, load them into the database.  
     if (-f "$featureDir/peg/fasta") {  
         Trace("Processing protein sequences.") if T(3);  
         $self->LoadProteinData("$featureDir/peg/fasta");  
462      }      }
463      # Now loop through the features, connecting them to their roles. Note that deleted  }
464      # features will not be in the assignment hash.  
465      Trace("Connecting features to roles.") if T(3);  =head3 LoadEvidenceCodes
466      for my $fid (keys %$assignHash) {  
467          $self->ConnectFunctionRoles($fid, $assignHash->{$fid});      my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
468    
469    Load the evidence codes from the specified attribute transaction log file into a
470    hash. The log file is in tab-delimited format. The first column contains the
471    transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
472    the third an attribute name (we'll ignore everything but C<evidence_code>), and
473    the fourth the attribute value.
474    
475    =over 4
476    
477    =item attributeFile
478    
479    Name of the attribute transaction log file.
480    
481    =item RETURN
482    
483    Returns a reference to a hash mapping each feature ID to a comma-delimited list
484    of its evidence codes.
485    
486    =back
487    
488    =cut
489    
490    sub LoadEvidenceCodes {
491        # Get the parameters.
492        my ($self, $attributeFile) = @_;
493        # Get the Sapling database.
494        my $sap = $self->{sap};
495        # Get the statistics object.
496        my $stats = $self->{stats};
497        # Get the assignment hash: we use this to filter the feature IDs.
498        my $assignHash = $self->{assignHash};
499        # Open the attribute log file for input.
500        my $ih = Open(undef, "<$attributeFile");
501        # This two-dimensional hash will hold the evidence codes for each feature.
502        my %retVal;
503        # Loop through the attribute log file.
504        while (! eof $ih) {
505            # Get the current attribute record.
506            my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
507            $stats->Add(attributeLine => 1);
508            # Insure we have all the pieces we need.
509            if (! $command || ! $fid || $key ne 'evidence_code') {
510                $stats->Add(attributeLineSkipped => 1);
511            } elsif (! $assignHash->{$fid}) {
512                # Here the attribute is for a deleted feature.
513                $stats->Add(attributeFidSkipped => 1);
514            } else {
515                # Get the sub-hash for this feature.
516                if (! exists $retVal{$fid}) {
517                    $retVal{$fid} = {};
518                }
519                my $featureSubHash = $retVal{$fid};
520                # Process according to the command.
521                if ($command eq 'ADD') {
522                    # Here we are adding a new evidence code.
523                    $featureSubHash->{$value} = 1;
524                    $stats->Add(attributeAdd => 1);
525                } elsif ($command eq 'DELETE') {
526                    # Here we are deleting an evidence code.
527                    delete $featureSubHash->{$value};
528                    $stats->Add(attributeDelete => 1);
529                } else {
530                    # here we have an unrecognized command.
531                    $stats->Add(attributeCommandSkip => 1);
532                }
533      }      }
534  }  }
535        # Loop through the hash, converting each sub-hash to a comma-delimited list of
536        # evidence codes.
537        for my $fid (keys %retVal) {
538            $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
539        }
540        # Return the evidence hash.
541        return \%retVal;
542    }
543    
544    
545  =head3 LoadFeatureData  =head3 LoadFeatureData
546    
547      $loaderObject->LoadFeatureData($featureDir, $type, $assignHash);      $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
548    
549  Load the basic data for each feature into the database. The number of features of  Load the basic data for each feature into the database. The number of features of
550  the type found will be recorded in the statistics object.  the type found will be recorded in the statistics object.
# Line 457  Line 559 
559    
560  Type of feature to load.  Type of feature to load.
561    
562  =item assignHash  =item protHash
563    
564    Reference to a hash mapping each feature ID for a protein-encoding gene to
565    its protein sequence.
566    
567    =item evHash
568    
569  Reference to a hash mapping each feature ID to its functional assignment.  Reference to a hash mapping each feature ID to a comma-delimited list of
570    its evidence codes (if any).
571    
572  =back  =back
573    
# Line 467  Line 575 
575    
576  sub LoadFeatureData {  sub LoadFeatureData {
577      # Get the parameters.      # Get the parameters.
578      my ($self, $featureDir, $type, $assignHash) = @_;      my ($self, $featureDir, $type, $protHash, $evHash) = @_;
579      # Get the sapling database.      # Get the sapling database.
580      my $sap = $self->{sap};      my $sap = $self->{sap};
     # Get the maximum location  segment length. We'll need this later.  
     my $maxLength = $sap->TuningParameter('maxLocationLength');  
581      # Get the statistics object.      # Get the statistics object.
582      my $stats = $self->{stats};      my $stats = $self->{stats};
583        # Get the assignment hash. This tells us our functional assignments. This method is
584        # also where we remove the deleted features from it.
585        my $assignHash = $self->{assignHash};
586      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
587      # time, it overwrites the original.      # time, it overwrites the original.
588      my $fidHash = $self->{timestamps};      my %fidHash;
589      # Finally, we need the timestamp hash. The initial feature population      # Finally, we need the timestamp hash. The initial feature population
590      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
591      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
592        my %deleted_features;
593      if (-f $fileName) {      if (-f $fileName) {
594          # We have one, so we can read through it. First, however, we need to get the list          # We have one, so we can read through it. First, however, we need to get the list
595          # of deleted features.          # of deleted features and remove them from the assignment hash. This insures
596          my %deletedFids;          # that they are not used by subsequent methods.
597          my $deleteFile = "$featureDir/$type/deleted.features";          my $deleteFile = "$featureDir/$type/deleted.features";
598          if (-f $deleteFile) {          if (-f $deleteFile) {
599              %deletedFids = map { $_ => 1 } Tracer::GetFile($deleteFile);              my $dh = Open(undef, "<$deleteFile");
600                while (! eof $dh) {
601                    my ($deletedFid) = Tracer::GetLine($dh);
602                    if (exists $assignHash->{$deletedFid}) {
603                        delete $assignHash->{$deletedFid};
604                        $stats->Add(deletedFid => 1);
605                        $deleted_features{$deletedFid} = 1;
606                    }
607                }
608          }          }
609          # Open the main file for input.          # Open the main file for input.
610          Trace("Reading features from $fileName.") if T(3);          Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
611          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
612          while (! eof $ih) {          while (! eof $ih) {
613              # Read this feature's information.              # Read this feature's information.
614              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
615              # Only proceed if the feature is NOT deleted.              # Only proceed if the feature is NOT deleted.
616              if (! exists $deletedFids{$fid}) {              if (!$deleted_features{$fid}) {
617                  # If the feature already exists, delete it. (This should be extremely rare.)                  # If the feature already exists, delete it. (This should be extremely rare.)
618                  if (exists $fidHash->{$fid}) {                  if ($fidHash{$fid}) {
619                      $sap->Delete(Feature => $fid);                      $sap->Delete(Feature => $fid);
620                      $stats->Add(duplicateFid => 1);                      $stats->Add(duplicateFid => 1);
621                  }                  }
622                  # Otherwise connect this feature to the genome.                  # If this is RNA, the alias list is always empty. Sometimes, the functional
623                  $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);                  # assignment is found there.
624                  # Now we must parse the locations. This will contain a list of the location                  if ($type eq 'rna') {
625                  # data 4-tuples (contig, start, dir, len).                      if (! $assignHash->{$fid}) {
626                  my @locData;                          $assignHash->{$fid} = $aliases[0];
627                  # This will contain the total sequence length.                      }
628                  my $length = 0;                      @aliases = ();
629                  # Loop through the locations.                  }
630                  for my $loc (split /\s*,\s*/, $locations) {                  # Add the feature to the database.
631                      # Get this location's components.                  $self->AddFeature($fid, $assignHash->{$fid}, $locations, \@aliases,
632                      my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);                                    $protHash->{$fid}, $evHash->{$fid});
633                      # Normalize the contig.                  # Denote we've added this feature, so that if a duplicate occurs we're ready.
634                      $self->FixContig(\$contig);                  $fidHash{$fid} = 1;
                     # Compute the direction.  
                     if ($start <= $stop) {  
                         # Here we have the forward strand.  
                         my $len = $stop + 1 - $start;  
                         push @locData, [$contig, $start, '+', $len];  
                         $length += $len;  
                     } else {  
                         # Here we have the reverse strand.  
                         my $len = $start + 1 - $stop;  
                         push @locData, [$contig, $stop, '-', $len];  
                         $length += $len;  
                     }  
                 }  
                 # Now we can create the feature record.  
                 $sap->InsertObject('Feature', id => $fid, feature_type => $type,  
                                    function => $assignHash->{$fid}, locked => 0,  
                                    sequence_length => $length);  
                 $fidHash->{$fid} = {};  
                 $stats->Add($type => 1);  
                 # The next step is to connect the feature record to its locations. This  
                 # involves dividing the location into segments. The following variable  
                 # will count the total number of segments.  
                 my $segment = 0;  
                 # Loop through the locations.  
                 for my $loc (@locData) {  
                     # Get the current location's information.  
                     my ($contig, $left, $dir, $len) = @$loc;  
                     # Peel off any segments.  
                     while ($len > $maxLength) {  
                         # Process according to the direction.  
                         if ($dir eq '+') {  
                             # Forward strand: peel from the beginning.  
                             $self->ConnectLocation($fid, $contig, $segment, $left, $dir,  
                                                    $maxLength);  
                             $left += $maxLength;  
                         } else {  
                             # Reverse strand: peel from the end.  
                             $self->ConnectLocation($fid, $contig, $segment,  
                                                   $left + $len - $maxLength, $dir,  
                                                   $maxLength);  
                         }  
                         # Denote we've used up a segment.  
                         $len -= $maxLength;  
                         $segment++;  
                     }  
                     # Output the last segment.  
                     $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);  
                 }  
                 # Now we process the aliases and create the identifiers. We don't do this  
                 # for RNA, because the RNA function is stored in the aliases.  
                 if ($type ne 'rna') {  
                     for my $alias (@aliases) {  
                         my $normalized;  
                         # Determine the type.  
                         my $aliasType = AliasAnalysis::TypeOf($alias);  
                         $stats->Add(aliasAll => 1);  
                         # Is this a recognized type?  
                         if ($aliasType) {  
                             $stats->Add(aliasNormal => 1);  
                             # Yes. Write it normally.  
                             $self->CreateIdentifier($alias, B => $aliasType, $fid);  
                         } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {  
                             # No, but this is a specially-marked locus tag.  
                             $normalized = $1;  
                             $stats->Add(aliasLocus => 1);  
                             $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                         } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {  
                             # No, but this is a natural locus tag.  
                             $stats->Add(aliasLocus => 1);  
                             $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                         } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {  
                             # No, but this is a natural gene name.  
                             $stats->Add(aliasGene => 1);  
                             $self->CreateIdentifier($normalized, B => 'GENE', $fid);  
                         } elsif ($alias =~ /^\d+$/) {  
                             # Here it's a naked number, which means it's a GI number  
                             # of some sort.  
                             $stats->Add(aliasGI => 1);  
                             $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);  
                         } elsif ($alias =~ /^protein_id\|(.+)/) {  
                             # Here we have a REFSEQ protein ID. Right now we don't have a way to  
                             # handle that, because we don't know the feature's protein ID here.  
                             $stats->Add(aliasProtein => 1);  
                         } elsif ($alias =~ /[:|]/) {  
                             # Here it's an alias of an unknown type, so we skip it.  
                             $stats->Add(aliasUnknown => 1);  
                         } else {  
                             # Here it's a miscellaneous type.  
                             $stats->Add(aliasMisc => 1);  
                             $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);  
                         }  
                     }  
                 }  
635              }              }
636          }          }
637      }      }
638  }  }
639    
640    
641  =head3 LoadProteinData  =head3 LoadProteinData
642    
643      $self->LoadProteinData($fileName);      my $protHash = $self->LoadProteinData($fileName);
644    
645  Load the protein sequences from the named FASTA file. The sequences will be stored  Load the protein sequences from the named FASTA file. The sequences will be stored
646  in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature  in a hash by FIG feature ID.
 records themselves will not be created (it is presumed they are already there).  
647    
648  =over 4  =over 4
649    
# Line 626  Line 651 
651    
652  Name of the FASTA file containing the protein sequences for this genome.  Name of the FASTA file containing the protein sequences for this genome.
653    
654    =item RETURN
655    
656    Returns a hash mapping feature IDs to protein sequences.
657    
658  =back  =back
659    
660  =cut  =cut
# Line 635  Line 664 
664      my ($self, $fileName) = @_;      my ($self, $fileName) = @_;
665      # Open the FASTA file for input.      # Open the FASTA file for input.
666      my $ih = Open(undef, "<$fileName");      my $ih = Open(undef, "<$fileName");
667        # Create the return hash.
668        my $retVal = {};
669      # We'll track the current protein in here.      # We'll track the current protein in here.
670      my $fid;      my $fid;
671      my $sequence = "";      my $sequence = "";
# Line 648  Line 679 
679              my $newFid = $1;              my $newFid = $1;
680              # Do we have an existing protein?              # Do we have an existing protein?
681              if (defined $fid) {              if (defined $fid) {
682                  # Yes. Write it out.                  # Yes. Store it in the hash.
683                  $self->WriteProtein($fid, $sequence);                  $retVal->{$fid} = $sequence;
684              }              }
685              # Initialize for the next protein.              # Initialize for the next protein.
686              $fid = $newFid;              $fid = $newFid;
# Line 661  Line 692 
692      }      }
693      # Do we have a residual protein.      # Do we have a residual protein.
694      if (defined $fid) {      if (defined $fid) {
695          # Yes. Write it out.          # Yes. Store it in the hash.
696          $self->WriteProtein($fid, $sequence);          $retVal->{$fid} = $sequence;
697      }      }
698        # Return the hash.
699        return $retVal;
700  }  }
701    
702    
# Line 689  Line 722 
722  sub LoadAnnotations {  sub LoadAnnotations {
723      # Get the parameters.      # Get the parameters.
724      my ($self, $fileName) = @_;      my ($self, $fileName) = @_;
725      # Get the timestamp hash.      # Get the assignment Hash. We use this to filter out deleted features.
726      my $timeHash = $self->{timestamps};      my $assignHash = $self->{assignHash};
727      # Get the Sapling database.      # Get the Sapling database.
728      my $sap = $self->{sap};      my $sap = $self->{sap};
729      # Get the statistics object.      # Get the statistics object.
# Line 701  Line 734 
734      while (! eof $ih) {      while (! eof $ih) {
735          # Read in the peg, timestamp, and user ID.          # Read in the peg, timestamp, and user ID.
736          my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);          my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
737          # Only proceed if the feature exists.          # Only proceed if the feature is not deleted.
738          if (! exists $timeHash->{$fid}) {          if ($assignHash->{$fid}) {
739              $stats->Add(skippedAnnotation => 1);              # Add the annotation to this feature.
740          } else {              $self->MakeAnnotation($fid, $text, $user, $timestamp);
             # Change assignments by the master user to FIG assignments.  
             $text =~ s/Set master function/Set FIG function/s;  
             # Insure the time stamp is valid.  
             if ($timestamp =~ /^\d+$/) {  
                 # Here it's a number. We need to insure the one we use to form  
                 # the key is unique.  
                 my $keyStamp = $timestamp;  
                 while ($timeHash->{$fid}{$keyStamp}) {  
                     $keyStamp++;  
                     $stats->Add(skippedStamp => 1);  
                 }  
                 # Form the annotation ID.  
                 my $annotationID = SaplingDataLoader::ComputeAnnotationID($fid, $keyStamp);  
                 $timeHash->{$fid}{$keyStamp} = 1;  
                 # Generate the annotation.  
                 $sap->InsertObject('IsAnnotatedBy', from_link => $fid, to_link => $annotationID);  
                 $sap->InsertObject('Annotation', id => $annotationID, annotation_time => $timestamp,  
                             comment => $text, annotator => $user);  
             } else {  
                 # Here we have an invalid time stamp.  
                 Trace("Invalid time stamp \"$timestamp\" in annotations for $fid.") if T(1);  
             }  
741          }          }
742      }      }
743  }  }
# Line 776  Line 787 
787  sub LoadSubsystems {  sub LoadSubsystems {
788      # Get the parameters.      # Get the parameters.
789      my ($self) = @_;      my ($self) = @_;
790    
791        #
792        # If we are running in disconnected mode, do not actually load subsystems.
793        # They rely too much on information from the external sapling.
794        #
795        if ($self->{disconnected})
796        {
797            return;
798        }
799    
800      # Get the sapling object.      # Get the sapling object.
801      my $sap = $self->{sap};      my $sap = $self->{sap};
802      # Get the statistics object.      # Get the statistics object.
# Line 803  Line 824 
824      while (! eof $ih) {      while (! eof $ih) {
825          # Get this subsystem.          # Get this subsystem.
826          my ($subsystem, $variant) = Tracer::GetLine($ih);          my ($subsystem, $variant) = Tracer::GetLine($ih);
827            Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
828          # Normalize the subsystem name.          # Normalize the subsystem name.
829          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
830          # Compute this subsystem's MD5.          # Compute this subsystem's MD5.
# Line 821  Line 843 
843                  my $roleH = $roleList->[0];                  my $roleH = $roleList->[0];
844                  my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);                  my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
845                  $sap->InsertObject('Subsystem', %subFields);                  $sap->InsertObject('Subsystem', %subFields);
846                    $stats->Add(subsystems => 1);
847                  # Now loop through the roles. The Includes records are always inserted, but the                  # Now loop through the roles. The Includes records are always inserted, but the
848                  # roles are only inserted if they don't already exist.                  # roles are only inserted if they don't already exist.
849                  for $roleH (@$roleList) {                  for $roleH (@$roleList) {
# Line 835  Line 858 
858                      # Compute the machine-role ID.                      # Compute the machine-role ID.
859                      my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});                      my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
860                      $machineRoles{$subsystem}{$roleID} = $machineRoleID;                      $machineRoles{$subsystem}{$roleID} = $machineRoleID;
861                        $stats->Add(subsystemRoles => 1);
862                  }                  }
863              }              }
864          } else {          } else {
# Line 856  Line 880 
880          $variantCode =~ s/\*+$//;          $variantCode =~ s/\*+$//;
881          # Compute the variant key.          # Compute the variant key.
882          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
883            # Insure we have it in the database.
884            if (! $sap->Exists(Variant => $variantKey)) {
885          # Get the variant from the sapling server.          # Get the variant from the sapling server.
886          my $variantH = $sapO->get(-objects => "Variant",          my $variantH = $sapO->get(-objects => "Variant",
887                                    -filter => {"Variant(id)" => ["=", $variantKey]},                                    -filter => {"Variant(id)" => ["=", $variantKey]},
# Line 864  Line 890 
890                                                "Variant(type)" => "type",                                                "Variant(type)" => "type",
891                                                "Variant(role-rule)" => "role-rule"},                                                "Variant(role-rule)" => "role-rule"},
892                                    -firstOnly => 1);                                    -firstOnly => 1);
         # Insure we have it in the database.  
893          $self->InsureEntity('Variant', $variantKey, %$variantH);          $self->InsureEntity('Variant', $variantKey, %$variantH);
         $stats->Add(subsystems => 1);  
894          # Connect it to the subsystem.          # Connect it to the subsystem.
895          $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);          $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
896                $stats->Add(subsystemVariants => 1);
897            }
898          # Now we create the molecular machine connecting this genome to the subsystem          # Now we create the molecular machine connecting this genome to the subsystem
899          # variant.          # variant.
900          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
# Line 959  Line 985 
985      $fields{'dna-size'} = $stats->Ask('dna');      $fields{'dna-size'} = $stats->Ask('dna');
986      $fields{pegs} = $stats->Ask('peg');      $fields{pegs} = $stats->Ask('peg');
987      $fields{rnas} = $stats->Ask('rna');      $fields{rnas} = $stats->Ask('rna');
988        $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna');
989      # Get the genetic code. The default is 11.      # Get the genetic code. The default is 11.
990      $fields{'genetic-code'} = 11;      $fields{'genetic-code'} = 11;
991      my $geneticCodeFile = "$dir/GENETIC_CODE";      my $geneticCodeFile = "$dir/GENETIC_CODE";
# Line 1066  Line 1093 
1093      $self->{stats}->Add(segment => 1);      $self->{stats}->Add(segment => 1);
1094  }  }
1095    
 =head3 CreateIdentifier  
   
     $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);  
   
 Link an identifier to a feature. The identifier is presented in prefixed form and is of the  
 specified type and the specified confidence level.  
   
 =over 4  
   
 =item alias  
   
 Identifier to connect to the feature.  
   
 =item conf  
   
 Confidence level (C<A> curated, C<B> normal, C<C> protein only).  
   
 =item aliasType  
   
 Type of alias (e.g. C<NCBI>, C<LocusTag>).  
   
 =item fid  
   
 ID of the relevant feature.  
   
 =back  
   
 =cut  
   
 sub CreateIdentifier {  
     # Get the parameters.  
     my ($self, $alias, $conf, $aliasType, $fid) = @_;  
     # Get the Sapling object.  
     my $sap = $self->{sap};  
     # Compute the identifier's natural form.  
     my $natural = $alias;  
     if ($natural =~ /[:|](.+)/) {  
         $natural = $1;  
     }  
     # Insure the identifier exists in the database.  
     $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);  
     # Connect the identifier to the feature.  
     $sap->InsertObject('IsIdentifiedBy', to_link => $alias, from_link => $fid, conf => $conf);  
 }  
   
1096  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1097    
1098  =head3 ReadAnnotation  =head3 ReadAnnotation
# Line 1145  Line 1127 
1127      # Loop through the lines of the text field.      # Loop through the lines of the text field.
1128      my $text = "";      my $text = "";
1129      my $line = <$ih>;      my $line = <$ih>;
1130      while ($line ne "//\n") {      while (defined($line) && $line ne "//\n") {
1131          $text .= $line;          $text .= $line;
1132          $line = <$ih>;          $line = <$ih>;
1133      }      }

Legend:
Removed from v.1.8  
changed lines
  Added in v.1.15

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3