[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Sun Aug 14 23:47:16 2005 UTC revision 1.40, Thu Jun 8 15:37:32 2006 UTC
# Line 10  Line 10 
10      use Sprout;      use Sprout;
11      use Stats;      use Stats;
12      use BasicLocation;      use BasicLocation;
13        use HTML;
14    
15  =head1 Sprout Load Methods  =head1 Sprout Load Methods
16    
# Line 29  Line 30 
30      $stats->Accumulate($spl->LoadFeatureData());      $stats->Accumulate($spl->LoadFeatureData());
31      print $stats->Show();      print $stats->Show();
32    
 This module makes use of the internal Sprout property C<_erdb>.  
   
33  It is worth noting that the FIG object does not need to be a real one. Any object  It is worth noting that the FIG object does not need to be a real one. Any object
34  that implements the FIG methods for data retrieval could be used. So, for example,  that implements the FIG methods for data retrieval could be used. So, for example,
35  this object could be used to copy data from one Sprout database to another, or  this object could be used to copy data from one Sprout database to another, or
# Line 40  Line 39 
39  a variable called C<$fig>. This makes it fairly straightforward to determine which  a variable called C<$fig>. This makes it fairly straightforward to determine which
40  FIG methods are required to load the Sprout database.  FIG methods are required to load the Sprout database.
41    
42    This object creates the load files; however, the tables are not created until it
43    is time to actually do the load from the files into the target database.
44    
45  =cut  =cut
46    
47  #: Constructor SproutLoad->new();  #: Constructor SproutLoad->new();
# Line 48  Line 50 
50    
51  =head3 new  =head3 new
52    
53  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile); >>  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>
54    
55  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
56  the names of the files containing the list of genomes and subsystems to use.  the names of the files containing the list of genomes and subsystems to use.
# Line 76  Line 78 
78  =item subsysFile  =item subsysFile
79    
80  Either the name of the file containing the list of trusted subsystems or a reference  Either the name of the file containing the list of trusted subsystems or a reference
81  to a list of subsystem names. If nothing is specified, all known subsystems will be  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
82  considered trusted. Only subsystem data related to the trusted subsystems is loaded.  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
83    in its data directory.) Only subsystem data related to the trusted subsystems is loaded.
84    
85    =item options
86    
87    Reference to a hash of command-line options.
88    
89  =back  =back
90    
# Line 85  Line 92 
92    
93  sub new {  sub new {
94      # Get the parameters.      # Get the parameters.
95      my ($class, $sprout, $fig, $genomeFile, $subsysFile) = @_;      my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
96      # Load the list of genomes into a hash.      # Create the genome hash.
97      my %genomes;      my %genomes = ();
98        # We only need it if load-only is NOT specified.
99        if (! $options->{loadOnly}) {
100      if (! defined($genomeFile) || $genomeFile eq '') {      if (! defined($genomeFile) || $genomeFile eq '') {
101          # Here we want all the complete genomes and an access code of 1.          # Here we want all the complete genomes and an access code of 1.
102          my @genomeList = $fig->genomes(1);          my @genomeList = $fig->genomes(1);
103          %genomes = map { $_ => 1 } @genomeList;          %genomes = map { $_ => 1 } @genomeList;
104      } elsif (ref $genomeFile eq 'HASH') {          } else {
105                my $type = ref $genomeFile;
106                Trace("Genome file parameter type is \"$type\".") if T(3);
107                if ($type eq 'HASH') {
108          # Here the user specified a hash of genome IDs to access codes, which is          # Here the user specified a hash of genome IDs to access codes, which is
109          # exactly what we want.          # exactly what we want.
110          %genomes = %{$genomeFile};          %genomes = %{$genomeFile};
111      } elsif (ref $genomeFile eq 'SCALAR') {              } elsif (! $type || $type eq 'SCALAR' ) {
112          # The caller specified a file, so read the genomes from the file.                  # The caller specified a file, so read the genomes from the file. (Note
113                    # that ref returns an empty string, not SCALAR, for a plain file-name string.)
114          my @genomeList = Tracer::GetFile($genomeFile);          my @genomeList = Tracer::GetFile($genomeFile);
115          if (! @genomeList) {          if (! @genomeList) {
116              # It's an error if the genome file is empty or not found.              # It's an error if the genome file is empty or not found.
# Line 114  Line 127 
127              }              }
128          }          }
129      } else {      } else {
         my $type = ref $genomeFile;  
130          Confess("Invalid genome parameter ($type) in SproutLoad constructor.");          Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
131      }      }
132            }
133        }
134      # Load the list of trusted subsystems.      # Load the list of trusted subsystems.
135      my %subsystems = ();      my %subsystems = ();
136        # We only need it if load-only is NOT specified.
137        if (! $options->{loadOnly}) {
138      if (! defined $subsysFile || $subsysFile eq '') {      if (! defined $subsysFile || $subsysFile eq '') {
139          # Here we want all the subsystems.              # Here we want all the NMPDR subsystems. First we get the whole list.
140          %subsystems = map { $_ => 1 } $fig->all_subsystems();              my @subs = $fig->all_subsystems();
141      } elsif (ref $subsysFile eq 'ARRAY') {              # Loop through, checking for the NMPDR file.
142                for my $sub (@subs) {
143                    if (-e "$FIG_Config::data/Subsystems/$sub/NMPDR") {
144                        $subsystems{$sub} = 1;
145                    }
146                }
147            } else {
148                my $type = ref $subsysFile;
149                if ($type eq 'ARRAY') {
150          # Here the user passed in a list of subsystems.          # Here the user passed in a list of subsystems.
151          %subsystems = map { $_ => 1 } @{$subsysFile};          %subsystems = map { $_ => 1 } @{$subsysFile};
152      } elsif (ref $subsysFile eq 'SCALAR') {              } elsif (! $type || $type eq 'SCALAR') {
153          # Here the list of subsystems is in a file.          # Here the list of subsystems is in a file.
154          if (! -e $subsysFile) {          if (! -e $subsysFile) {
155              # It's an error if the file does not exist.              # It's an error if the file does not exist.
# Line 138  Line 162 
162      } else {      } else {
163          Confess("Invalid subsystem parameter in SproutLoad constructor.");          Confess("Invalid subsystem parameter in SproutLoad constructor.");
164      }      }
165            }
166        }
167      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
168      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
169      # Create the Sprout load object.      # Create the Sprout load object.
# Line 147  Line 173 
173                    subsystems => \%subsystems,                    subsystems => \%subsystems,
174                    sprout => $sprout,                    sprout => $sprout,
175                    loadDirectory => $directory,                    loadDirectory => $directory,
176                    erdb => $sprout->{_erdb},                    erdb => $sprout,
177                    loaders => []                    loaders => [],
178                      options => $options
179                   };                   };
180      # Bless and return it.      # Bless and return it.
181      bless $retVal, $class;      bless $retVal, $class;
182      return $retVal;      return $retVal;
183  }  }
184    
185    =head3 LoadOnly
186    
187    C<< my $flag = $spl->LoadOnly; >>
188    
189    Return TRUE if we are in load-only mode, else FALSE.
190    
191    =cut
192    
193    sub LoadOnly {
194        my ($self) = @_;
195        return $self->{options}->{loadOnly};
196    }
197    
198    =head3 PrimaryOnly
199    
200    C<< my $flag = $spl->PrimaryOnly; >>
201    
202    Return TRUE if only the main entity is to be loaded, else FALSE.
203    
204    =cut
205    
206    sub PrimaryOnly {
207        my ($self) = @_;
208        return $self->{options}->{primaryOnly};
209    }
210    
211  =head3 LoadGenomeData  =head3 LoadGenomeData
212    
213  C<< my $stats = $spl->LoadGenomeData(); >>  C<< my $stats = $spl->LoadGenomeData(); >>
# Line 182  Line 235 
235    
236  =back  =back
237    
 B<TO DO>  
   
 Real quality vectors instead of C<unknown> for everything.  
   
 GenomeGroup relation. (The original script took group information from the C<NMPDR> file  
 in each genome's main directory, but no such file exists anywhere in my version of the  
 data store.)  
   
238  =cut  =cut
239  #: Return Type $%;  #: Return Type $%;
240  sub LoadGenomeData {  sub LoadGenomeData {
# Line 200  Line 245 
245      # Get the genome count.      # Get the genome count.
246      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
247      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
     Trace("Beginning genome data load.") if T(2);  
248      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
249      my $loadGenome = $self->_TableLoader('Genome', $genomeCount);      my $loadGenome = $self->_TableLoader('Genome');
250      my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300);      my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly);
251      my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300);      my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly);
252      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly);
253      my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000);      my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly);
254        if ($self->{options}->{loadOnly}) {
255            Trace("Loading from existing files.") if T(2);
256        } else {
257            Trace("Generating genome data.") if T(2);
258      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
259      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
260          Trace("Loading data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
261                $loadGenome->Add("genomeIn");
262          # The access code comes in via the genome hash.          # The access code comes in via the genome hash.
263          my $accessCode = $genomeHash->{$genomeID};          my $accessCode = $genomeHash->{$genomeID};
264          # Get the genus, species, and strain from the scientific name. Note that we append              # Get the genus, species, and strain from the scientific name.
         # the genome ID to the strain. In some cases this is the totality of the strain name.  
265          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);
266          my $extra = join " ", @extraData, "[genomeID]";              my $extra = join " ", @extraData;
267          # Get the full taxonomy.          # Get the full taxonomy.
268          my $taxonomy = $fig->taxonomy_of($genomeID);          my $taxonomy = $fig->taxonomy_of($genomeID);
269          # Output the genome record.          # Output the genome record.
# Line 225  Line 273 
273          my @contigs = $fig->all_contigs($genomeID);          my @contigs = $fig->all_contigs($genomeID);
274          for my $contigID (@contigs) {          for my $contigID (@contigs) {
275              Trace("Processing contig $contigID for $genomeID.") if T(4);              Trace("Processing contig $contigID for $genomeID.") if T(4);
276                    $loadContig->Add("contigIn");
277                    $loadSequence->Add("contigIn");
278              # Create the contig ID.              # Create the contig ID.
279              my $sproutContigID = "$genomeID:$contigID";              my $sproutContigID = "$genomeID:$contigID";
280              # Create the contig record and relate it to the genome.              # Create the contig record and relate it to the genome.
# Line 236  Line 286 
286              # Now we get the sequence a chunk at a time.              # Now we get the sequence a chunk at a time.
287              my $contigLen = $fig->contig_ln($genomeID, $contigID);              my $contigLen = $fig->contig_ln($genomeID, $contigID);
288              for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {              for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {
289                        $loadSequence->Add("chunkIn");
290                  # Compute the endpoint of this chunk.                  # Compute the endpoint of this chunk.
291                  my $end = FIG::min($i + $chunkSize - 1, $contigLen);                  my $end = FIG::min($i + $chunkSize - 1, $contigLen);
292                  # Get the actual DNA.                  # Get the actual DNA.
# Line 248  Line 299 
299              }              }
300          }          }
301      }      }
302        }
303      # Finish the loads.      # Finish the loads.
304      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
305      # Return the result.      # Return the result.
# Line 291  Line 343 
343      my $genomeCount = (keys %{$genomeFilter});      my $genomeCount = (keys %{$genomeFilter});
344      my $featureCount = $genomeCount * 4000;      my $featureCount = $genomeCount * 4000;
345      # Start the loads.      # Start the loads.
346      my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount);      my $loadCoupling = $self->_TableLoader('Coupling');
347      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000);      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);
348      my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000);      my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly);
349      my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000);      my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly);
350      my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000);      my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly);
351      Trace("Beginning coupling data load.") if T(2);      if ($self->{options}->{loadOnly}) {
352            Trace("Loading from existing files.") if T(2);
353        } else {
354            Trace("Generating coupling data.") if T(2);
355      # Loop through the genomes found.      # Loop through the genomes found.
356      for my $genome (sort keys %{$genomeFilter}) {      for my $genome (sort keys %{$genomeFilter}) {
357          Trace("Generating coupling data for $genome.") if T(3);          Trace("Generating coupling data for $genome.") if T(3);
358                $loadCoupling->Add("genomeIn");
359          # Create a hash table for holding coupled pairs. We use this to prevent          # Create a hash table for holding coupled pairs. We use this to prevent
360          # duplicates. For example, if A is coupled to B, we don't want to also          # duplicates. For example, if A is coupled to B, we don't want to also
361          # assert that B is coupled to A, because we already know it. Fortunately,          # assert that B is coupled to A, because we already know it. Fortunately,
# Line 310  Line 366 
366          my @pegs = $fig->pegs_of($genome);          my @pegs = $fig->pegs_of($genome);
367          # Loop through the PEGs.          # Loop through the PEGs.
368          for my $peg1 (@pegs) {          for my $peg1 (@pegs) {
369                    $loadCoupling->Add("pegIn");
370              Trace("Processing PEG $peg1 for $genome.") if T(4);              Trace("Processing PEG $peg1 for $genome.") if T(4);
371              # Get a list of the coupled PEGs.              # Get a list of the coupled PEGs.
372              my @couplings = $fig->coupled_to($peg1);              my @couplings = $fig->coupled_to($peg1);
# Line 320  Line 377 
377                  # Compute the coupling ID.                  # Compute the coupling ID.
378                  my $coupleID = Sprout::CouplingID($peg1, $peg2);                  my $coupleID = Sprout::CouplingID($peg1, $peg2);
379                  if (! exists $dupHash{$coupleID}) {                  if (! exists $dupHash{$coupleID}) {
380                            $loadCoupling->Add("couplingIn");
381                      # Here we have a new coupling to store in the load files.                      # Here we have a new coupling to store in the load files.
382                      Trace("Storing coupling ($coupleID) with score $score.") if T(4);                      Trace("Storing coupling ($coupleID) with score $score.") if T(4);
383                      # Ensure we don't do this again.                      # Ensure we don't do this again.
# Line 335  Line 393 
393                      my %evidenceMap = ();                      my %evidenceMap = ();
394                      # Process each evidence item.                      # Process each evidence item.
395                      for my $evidenceData (@evidence) {                      for my $evidenceData (@evidence) {
396                                $loadPCH->Add("evidenceIn");
397                          my ($peg3, $peg4, $usage) = @{$evidenceData};                          my ($peg3, $peg4, $usage) = @{$evidenceData};
398                          # Only proceed if the evidence is from a Sprout                          # Only proceed if the evidence is from a Sprout
399                          # genome.                          # genome.
400                          if ($genomeFilter->{$fig->genome_of($peg3)}) {                          if ($genomeFilter->{$fig->genome_of($peg3)}) {
401                                    $loadUsesAsEvidence->Add("evidenceChosen");
402                              my $evidenceKey = "$coupleID $peg3 $peg4";                              my $evidenceKey = "$coupleID $peg3 $peg4";
403                              # We store this evidence in the hash if the usage                              # We store this evidence in the hash if the usage
404                              # is nonzero or no prior evidence has been found. This                              # is nonzero or no prior evidence has been found. This
405                              # ensures that if there is duplicate evidence, we                              # ensures that if there is duplicate evidence, we
406                              # at least keep the meaningful ones. Only evidence is                                  # at least keep the meaningful ones. Only evidence in
407                              # the hash makes it to the output.                              # the hash makes it to the output.
408                              if ($usage || ! exists $evidenceMap{$evidenceKey}) {                              if ($usage || ! exists $evidenceMap{$evidenceKey}) {
409                                  $evidenceMap{$evidenceKey} = $evidenceData;                                  $evidenceMap{$evidenceKey} = $evidenceData;
# Line 358  Line 418 
418                          $loadIsEvidencedBy->Put($coupleID, $evidenceID);                          $loadIsEvidencedBy->Put($coupleID, $evidenceID);
419                          # Connect it to the features.                          # Connect it to the features.
420                          $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);                          $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);
421                          $loadUsesAsEvidence->Put($evidenceID, $peg4, 1);                              $loadUsesAsEvidence->Put($evidenceID, $peg4, 2);
422                            }
423                      }                      }
424                  }                  }
425              }              }
# Line 385  Line 446 
446      FeatureTranslation      FeatureTranslation
447      FeatureUpstream      FeatureUpstream
448      IsLocatedIn      IsLocatedIn
449        HasFeature
450    
451  =over 4  =over 4
452    
# Line 403  Line 465 
465      my $fig = $self->{fig};      my $fig = $self->{fig};
466      # Get the table of genome IDs.      # Get the table of genome IDs.
467      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
468      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
469      my $loadFeature = $self->_TableLoader('Feature', $featureCount);      my $loadFeature = $self->_TableLoader('Feature');
470      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly);
471      my $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
472      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
473      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
474      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
475        my $loadHasFeature = $self->_TableLoader('HasFeature');
476      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
477      # locations.      # locations.
478      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
479      Trace("Beginning feature data load.") if T(2);      if ($self->{options}->{loadOnly}) {
480            Trace("Loading from existing files.") if T(2);
481        } else {
482            Trace("Generating feature data.") if T(2);
483      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
484      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
485          Trace("Loading features for genome $genomeID.") if T(3);          Trace("Loading features for genome $genomeID.") if T(3);
486                $loadFeature->Add("genomeIn");
487          # Get the feature list for this genome.          # Get the feature list for this genome.
488          my $features = $fig->all_features_detailed($genomeID);          my $features = $fig->all_features_detailed($genomeID);
489          # Loop through the features.          # Loop through the features.
490          for my $featureData (@{$features}) {          for my $featureData (@{$features}) {
491                    $loadFeature->Add("featureIn");
492              # Split the tuple.              # Split the tuple.
493              my ($featureID, $locations, $aliases, $type) = @{$featureData};                  my ($featureID, $locations, undef, $type) = @{$featureData};
494              # Create the feature record.              # Create the feature record.
495              $loadFeature->Put($featureID, 1, $type);              $loadFeature->Put($featureID, 1, $type);
496                    # Link it to the parent genome.
497                    $loadHasFeature->Put($genomeID, $featureID, $type);
498              # Create the aliases.              # Create the aliases.
499              for my $alias (split /\s*,\s*/, $aliases) {                  for my $alias ($fig->feature_aliases($featureID)) {
500                  $loadFeatureAlias->Put($featureID, $alias);                  $loadFeatureAlias->Put($featureID, $alias);
501              }              }
502              # Get the links.              # Get the links.
# Line 438  Line 506 
506              }              }
507              # If this is a peg, generate the translation and the upstream.              # If this is a peg, generate the translation and the upstream.
508              if ($type eq 'peg') {              if ($type eq 'peg') {
509                        $loadFeatureTranslation->Add("pegIn");
510                  my $translation = $fig->get_translation($featureID);                  my $translation = $fig->get_translation($featureID);
511                  if ($translation) {                  if ($translation) {
512                      $loadFeatureTranslation->Put($featureID, $translation);                      $loadFeatureTranslation->Put($featureID, $translation);
# Line 453  Line 522 
522              # the maximum segment size. This simplifies the genes_in_region processing              # the maximum segment size. This simplifies the genes_in_region processing
523              # for Sprout.              # for Sprout.
524              my @locationList = split /\s*,\s*/, $locations;              my @locationList = split /\s*,\s*/, $locations;
525                    # Create the location position indicator.
526                    my $i = 1;
527              # Loop through the locations.              # Loop through the locations.
528              for my $location (@locationList) {              for my $location (@locationList) {
529                  # Parse the location.                  # Parse the location.
530                  my $locObject = BasicLocation->new($location);                      my $locObject = BasicLocation->new("$genomeID:$location");
531                  # Split it into a list of chunks.                  # Split it into a list of chunks.
532                  my @locOList = ();                  my @locOList = ();
533                  while (my $peeling = $locObject->Peel($chunkSize)) {                  while (my $peeling = $locObject->Peel($chunkSize)) {
534                            $loadIsLocatedIn->Add("peeling");
535                      push @locOList, $peeling;                      push @locOList, $peeling;
536                  }                  }
537                  push @locOList, $locObject;                  push @locOList, $locObject;
538                  # Loop through the chunks, creating IsLocatedIn records. The variable                  # Loop through the chunks, creating IsLocatedIn records. The variable
539                  # "$i" will be used to keep the location index.                  # "$i" will be used to keep the location index.
                 my $i = 1;  
540                  for my $locChunk (@locOList) {                  for my $locChunk (@locOList) {
541                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,
542                                            $locChunk->Dir, $locChunk->Length, $i);                                            $locChunk->Dir, $locChunk->Length, $i);
# Line 474  Line 545 
545              }              }
546          }          }
547      }      }
548        }
549      # Finish the loads.      # Finish the loads.
550      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
551      return $retVal;      return $retVal;
# Line 510  Line 582 
582      my $fig = $self->{fig};      my $fig = $self->{fig};
583      # Get the table of genome IDs.      # Get the table of genome IDs.
584      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
585      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
586      my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf',      my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf');
587                                                             $featureCount * $genomeCount);      if ($self->{options}->{loadOnly}) {
588      Trace("Beginning BBH load.") if T(2);          Trace("Loading from existing files.") if T(2);
589        } else {
590            Trace("Generating BBH data.") if T(2);
591      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
592      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
593                $loadIsBidirectionalBestHitOf->Add("genomeIn");
594          Trace("Processing features for genome $genomeID.") if T(3);          Trace("Processing features for genome $genomeID.") if T(3);
595          # Get the feature list for this genome.          # Get the feature list for this genome.
596          my $features = $fig->all_features_detailed($genomeID);          my $features = $fig->all_features_detailed($genomeID);
# Line 540  Line 613 
613              }              }
614          }          }
615      }      }
616        }
617      # Finish the loads.      # Finish the loads.
618      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
619      return $retVal;      return $retVal;
# Line 561  Line 635 
635    
636      Subsystem      Subsystem
637      Role      Role
638        RoleEC
639      SSCell      SSCell
640      ContainsFeature      ContainsFeature
641      IsGenomeOf      IsGenomeOf
# Line 568  Line 643 
643      OccursInSubsystem      OccursInSubsystem
644      ParticipatesIn      ParticipatesIn
645      HasSSCell      HasSSCell
646        ConsistsOfRoles
647        RoleSubset
648        HasRoleSubset
649        ConsistsOfGenomes
650        GenomeSubset
651        HasGenomeSubset
652        Catalyzes
653        Diagram
654        RoleOccursIn
655    
656  =over 4  =over 4
657    
# Line 577  Line 661 
661    
662  =back  =back
663    
 B<TO DO>  
   
 Generate RoleName table?  
   
664  =cut  =cut
665  #: Return Type $%;  #: Return Type $%;
666  sub LoadSubsystemData {  sub LoadSubsystemData {
# Line 594  Line 674 
674      # Get the subsystem hash. This lists the subsystems we'll process.      # Get the subsystem hash. This lists the subsystems we'll process.
675      my $subsysHash = $self->{subsystems};      my $subsysHash = $self->{subsystems};
676      my @subsysIDs = sort keys %{$subsysHash};      my @subsysIDs = sort keys %{$subsysHash};
677      my $subsysCount = @subsysIDs;      # Get the map list.
678      my $genomeCount = (keys %{$genomeHash});      my @maps = $fig->all_maps;
     my $featureCount = $genomeCount * 4000;  
679      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
680      my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);      my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly);
681      my $loadRole = $self->_TableLoader('Role', $featureCount * 6);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly);
682      my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);      my $loadSubsystem = $self->_TableLoader('Subsystem');
683      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);      my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly);
684      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);      my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly);
685      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount);      my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly);
686      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);      my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly);
687      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly);
688      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly);
689      Trace("Beginning subsystem data load.") if T(2);      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly);
690        my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly);
691        my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly);
692        my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly);
693        my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly);
694        my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly);
695        my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly);
696        my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly);
697        my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly);
698        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly);
699        if ($self->{options}->{loadOnly}) {
700            Trace("Loading from existing files.") if T(2);
701        } else {
702            Trace("Generating subsystem data.") if T(2);
703            # This hash will contain the role for each EC. When we're done, this
704            # information will be used to generate the Catalyzes table.
705            my %ecToRoles = ();
706      # Loop through the subsystems. Our first task will be to create the      # Loop through the subsystems. Our first task will be to create the
707      # roles. We do this by looping through the subsystems and creating a      # roles. We do this by looping through the subsystems and creating a
708      # role hash. The hash tracks each role ID so that we don't create      # role hash. The hash tracks each role ID so that we don't create
709      # duplicates. As we move along, we'll connect the roles and subsystems.          # duplicates. As we move along, we'll connect the roles and subsystems
710            # and memorize up the reactions.
711            my ($genomeID, $roleID);
712      my %roleData = ();      my %roleData = ();
713      for my $subsysID (@subsysIDs) {      for my $subsysID (@subsysIDs) {
714                # Get the subsystem object.
715                my $sub = $fig->get_subsystem($subsysID);
716                # Only proceed if the subsystem has a spreadsheet.
717                if (! $sub->{empty_ss}) {
718          Trace("Creating subsystem $subsysID.") if T(3);          Trace("Creating subsystem $subsysID.") if T(3);
719                    $loadSubsystem->Add("subsystemIn");
720          # Create the subsystem record.          # Create the subsystem record.
721          $loadSubsystem->Put($subsysID);                  my $curator = $sub->get_curator();
722          # Get the subsystem's roles.                  my $notes = $sub->get_notes();
723          my @roles = $fig->subsys_to_roles($subsysID);                  $loadSubsystem->Put($subsysID, $curator, $notes);
724          # Connect the roles to the subsystem. If a role is new, we create                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
725          # a role record for it.                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
726          for my $roleID (@roles) {                      # Connect to this role.
727              $loadOccursInSubsystem->Put($roleID, $subsysID);                      $loadOccursInSubsystem->Add("roleIn");
728                        $loadOccursInSubsystem->Put($roleID, $subsysID, $col);
729                        # If it's a new role, add it to the role table.
730              if (! exists $roleData{$roleID}) {              if (! exists $roleData{$roleID}) {
731                  $loadRole->Put($roleID);                          # Get the role's abbreviation.
732                            my $abbr = $sub->get_role_abbr($col);
733                            # Add the role.
734                            $loadRole->Put($roleID, $abbr);
735                  $roleData{$roleID} = 1;                  $roleData{$roleID} = 1;
736                            # Check for an EC number.
737                            if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) {
738                                my $ec = $1;
739                                $loadRoleEC->Put($roleID, $ec);
740                                $ecToRoles{$ec} = $roleID;
741              }              }
742          }          }
743          # Now all roles for this subsystem have been filled in. We create the                  }
744          # spreadsheet by matches roles to genomes. To do this, we need to                  # Now we create the spreadsheet for the subsystem by matching roles to
745          # get the genomes on the sheet.                  # genomes. Each genome is a row and each role is a column. We may need
746                    # to actually create the roles as we find them.
747          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
748          my @genomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)};                  for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) {
749          for my $genomeID (@genomes) {                      # Only proceed if this is one of our genomes.
             # Only process this genome if it's one of ours.  
750              if (exists $genomeHash->{$genomeID}) {              if (exists $genomeHash->{$genomeID}) {
751                  # Connect the genome to the subsystem.                          # Count the PEGs and cells found for verification purposes.
752                  $loadParticipatesIn->Put($genomeID, $subsysID);                          my $pegCount = 0;
753                            my $cellCount = 0;
754                            # Create a list for the PEGs we find. This list will be used
755                            # to generate cluster numbers.
756                            my @pegsFound = ();
757                            # Create a hash that maps spreadsheet IDs to PEGs. We will
758                            # use this to generate the ContainsFeature data after we have
759                            # the cluster numbers.
760                            my %cellPegs = ();
761                            # Get the genome's variant code for this subsystem.
762                            my $variantCode = $sub->get_variant_code($row);
763                  # Loop through the subsystem's roles. We use an index because it is                  # Loop through the subsystem's roles. We use an index because it is
764                  # part of the spreadsheet cell ID.                  # part of the spreadsheet cell ID.
765                  for (my $i = 0; $i <= $#roles; $i++) {                          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
                     my $role = $roles[$i];  
766                      # Get the features in the spreadsheet cell for this genome and role.                      # Get the features in the spreadsheet cell for this genome and role.
767                      my @pegs = $fig->pegs_in_subsystem_coll($subsysID, $genomeID, $i);                              my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col);
768                      # Only proceed if features exist.                      # Only proceed if features exist.
769                      if (@pegs > 0) {                      if (@pegs > 0) {
770                          # Create the spreadsheet cell.                          # Create the spreadsheet cell.
771                          my $cellID = "$subsysID:$genomeID:$i";                                  $cellCount++;
772                                    my $cellID = "$subsysID:$genomeID:$col";
773                          $loadSSCell->Put($cellID);                          $loadSSCell->Put($cellID);
774                          $loadIsGenomeOf->Put($genomeID, $cellID);                          $loadIsGenomeOf->Put($genomeID, $cellID);
775                          $loadIsRoleOf->Put($role, $cellID);                                  $loadIsRoleOf->Put($roleID, $cellID);
776                          $loadHasSSCell->Put($subsysID, $cellID);                          $loadHasSSCell->Put($subsysID, $cellID);
777                          # Attach the features to it.                                  # Remember its features.
778                          for my $pegID (@pegs) {                                  push @pegsFound, @pegs;
779                              $loadContainsFeature->Put($cellID, $pegID);                                  $cellPegs{$cellID} = \@pegs;
780                                    $pegCount += @pegs;
781                                }
782                            }
783                            # If we found some cells for this genome, we need to compute clusters and
784                            # denote it participates in the subsystem.
785                            if ($pegCount > 0) {
786                                Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
787                                $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
788                                # Partition the PEGs found into clusters.
789                                my @clusters = $fig->compute_clusters(\@pegsFound, $sub);
790                                # Create a hash mapping PEG IDs to cluster numbers.
791                                # We default to -1 for all of them.
792                                my %clusterOf = map { $_ => -1 } @pegsFound;
793                                for (my $i = 0; $i <= $#clusters; $i++) {
794                                    my $subList = $clusters[$i];
795                                    for my $peg (@{$subList}) {
796                                        $clusterOf{$peg} = $i;
797                                    }
798                                }
799                                # Create the ContainsFeature data.
800                                for my $cellID (keys %cellPegs) {
801                                    my $cellList = $cellPegs{$cellID};
802                                    for my $cellPeg (@$cellList) {
803                                        $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg});
804                          }                          }
805                      }                      }
806                  }                  }
807              }              }
808          }          }
809                    # Now we need to generate the subsets. The subset names must be concatenated to
810                    # the subsystem name to make them unique keys. There are two types of subsets:
811                    # genome subsets and role subsets. We do the role subsets first.
812                    my @subsetNames = $sub->get_subset_names();
813                    for my $subsetID (@subsetNames) {
814                        # Create the subset record.
815                        my $actualID = "$subsysID:$subsetID";
816                        $loadRoleSubset->Put($actualID);
817                        # Connect the subset to the subsystem.
818                        $loadHasRoleSubset->Put($subsysID, $actualID);
819                        # Connect the subset to its roles.
820                        my @roles = $sub->get_subsetC_roles($subsetID);
821                        for my $roleID (@roles) {
822                            $loadConsistsOfRoles->Put($actualID, $roleID);
823      }      }
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
824  }  }
825                    # Next the genome subsets.
826  =head3 LoadDiagramData                  @subsetNames = $sub->get_subset_namesR();
827                    for my $subsetID (@subsetNames) {
828  C<< my $stats = $spl->LoadDiagramData(); >>                      # Create the subset record.
829                        my $actualID = "$subsysID:$subsetID";
830  Load the diagram data from FIG into Sprout.                      $loadGenomeSubset->Put($actualID);
831                        # Connect the subset to the subsystem.
832  Diagrams are used to organize functional roles. The diagram shows the                      $loadHasGenomeSubset->Put($subsysID, $actualID);
833  connections between chemicals that interact with a subsystem.                      # Connect the subset to its genomes.
834                        my @genomes = $sub->get_subsetR($subsetID);
835  The following relations are loaded by this method.                      for my $genomeID (@genomes) {
836                            $loadConsistsOfGenomes->Put($actualID, $genomeID);
837      Diagram                      }
838      RoleOccursIn                  }
839                }
840  =over 4              # Now we loop through the diagrams. We need to create the diagram records
841                # and link each diagram to its roles. Note that only roles which occur
842  =item RETURNS              # in subsystems (and therefore appear in the %ecToRoles hash) are
843                # included.
844  Returns a statistics object for the loads.              for my $map (@maps) {
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadDiagramData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the map list.  
     my @maps = $fig->all_maps;  
     my $mapCount = @maps;  
     my $genomeCount = (keys %{$self->{genomes}});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);  
     my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);  
     Trace("Beginning diagram data load.") if T(2);  
     # Loop through the diagrams.  
     for my $map ($fig->all_maps) {  
845          Trace("Loading diagram $map.") if T(3);          Trace("Loading diagram $map.") if T(3);
846          # Get the diagram's descriptive name.          # Get the diagram's descriptive name.
847          my $name = $fig->map_name($map);          my $name = $fig->map_name($map);
# Line 714  Line 850 
850          # A hash is used to prevent duplicates.          # A hash is used to prevent duplicates.
851          my %roleHash = ();          my %roleHash = ();
852          for my $role ($fig->map_to_ecs($map)) {          for my $role ($fig->map_to_ecs($map)) {
853              if (! $roleHash{$role}) {                      if (exists $ecToRoles{$role} && ! $roleHash{$role}) {
854                  $loadRoleOccursIn->Put($role, $map);                          $loadRoleOccursIn->Put($ecToRoles{$role}, $map);
855                  $roleHash{$role} = 1;                  $roleHash{$role} = 1;
856              }              }
857          }          }
858      }      }
859                # Before we leave, we must create the Catalyzes table. We start with the reactions,
860                # then use the "ecToRoles" table to convert EC numbers to role IDs.
861                my @reactions = $fig->all_reactions();
862                for my $reactionID (@reactions) {
863                    # Get this reaction's list of roles. The results will be EC numbers.
864                    my @roles = $fig->catalyzed_by($reactionID);
865                    # Loop through the roles, creating catalyzation records.
866                    for my $thisRole (@roles) {
867                        if (exists $ecToRoles{$thisRole}) {
868                            $loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID);
869                        }
870                    }
871                }
872            }
873        }
874      # Finish the load.      # Finish the load.
875      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
876      return $retVal;      return $retVal;
# Line 761  Line 912 
912      my $fig = $self->{fig};      my $fig = $self->{fig};
913      # Get the genome hash.      # Get the genome hash.
914      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
915      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
916      my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500);      my $loadProperty = $self->_TableLoader('Property');
917      my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500);      my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly);
918      Trace("Beginning property data load.") if T(2);      if ($self->{options}->{loadOnly}) {
919            Trace("Loading from existing files.") if T(2);
920        } else {
921            Trace("Generating property data.") if T(2);
922      # Create a hash for storing property IDs.      # Create a hash for storing property IDs.
923      my %propertyKeys = ();      my %propertyKeys = ();
924      my $nextID = 1;      my $nextID = 1;
925      # Loop through the genomes.      # Loop through the genomes.
926      for my $genomeID (keys %{$genomeHash}) {      for my $genomeID (keys %{$genomeHash}) {
927                $loadProperty->Add("genomeIn");
928                Trace("Generating properties for $genomeID.") if T(3);
929          # Get the genome's features. The feature ID is the first field in the          # Get the genome's features. The feature ID is the first field in the
930          # tuples returned by "all_features_detailed". We use "all_features_detailed"          # tuples returned by "all_features_detailed". We use "all_features_detailed"
931          # rather than "all_features" because we want all features regardless of type.          # rather than "all_features" because we want all features regardless of type.
932          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};
933                my $featureCount = 0;
934                my $propertyCount = 0;
935          # Loop through the features, creating HasProperty records.          # Loop through the features, creating HasProperty records.
936          for my $fid (@features) {          for my $fid (@features) {
937              # Get all attributes for this feature. We do this one feature at a time              # Get all attributes for this feature. We do this one feature at a time
938              # to insure we do not get any genome attributes.              # to insure we do not get any genome attributes.
939              my @attributeList = $fig->get_attributes($fid, '', '', '');              my @attributeList = $fig->get_attributes($fid, '', '', '');
940                    if (scalar @attributeList) {
941                        $featureCount++;
942                    }
943              # Loop through the attributes.              # Loop through the attributes.
944              for my $tuple (@attributeList) {              for my $tuple (@attributeList) {
945                        $propertyCount++;
946                  # Get this attribute value's data. Note that we throw away the FID,                  # Get this attribute value's data. Note that we throw away the FID,
947                  # since it will always be the same as the value if "$fid".                  # since it will always be the same as the value if "$fid".
948                  my (undef, $key, $value, $url) = @{$tuple};                  my (undef, $key, $value, $url) = @{$tuple};
# Line 803  Line 964 
964                  $loadHasProperty->Put($fid, $propertyID, $url);                  $loadHasProperty->Put($fid, $propertyID, $url);
965              }              }
966          }          }
967                # Update the statistics.
968                Trace("$propertyCount attributes processed for $featureCount features.") if T(3);
969                $loadHasProperty->Add("featuresIn", $featureCount);
970                $loadHasProperty->Add("propertiesIn", $propertyCount);
971            }
972      }      }
973      # Finish the load.      # Finish the load.
974      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 843  Line 1009 
1009      my $fig = $self->{fig};      my $fig = $self->{fig};
1010      # Get the genome hash.      # Get the genome hash.
1011      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1012      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1013      my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000);      my $loadAnnotation = $self->_TableLoader('Annotation');
1014      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly);
1015      my $loadSproutUser = $self->_TableLoader('SproutUser', 100);      my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly);
1016      my $loadUserAccess = $self->_TableLoader('UserAccess', 1000);      my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly);
1017      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly);
1018      Trace("Beginning annotation data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1019            Trace("Loading from existing files.") if T(2);
1020        } else {
1021            Trace("Generating annotation data.") if T(2);
1022      # Create a hash of user names. We'll use this to prevent us from generating duplicate      # Create a hash of user names. We'll use this to prevent us from generating duplicate
1023      # user records.      # user records.
1024      my %users = ( FIG => 1, master => 1 );      my %users = ( FIG => 1, master => 1 );
# Line 862  Line 1030 
1030      # Get the current time.      # Get the current time.
1031      my $time = time();      my $time = time();
1032      # Loop through the genomes.      # Loop through the genomes.
1033      for my $genomeID (%{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
1034          Trace("Processing $genomeID.") if T(3);          Trace("Processing $genomeID.") if T(3);
         # Get the genome's PEGs.  
         my @pegs = $fig->pegs_of($genomeID);  
         for my $peg (@pegs) {  
             Trace("Processing $peg.") if T(4);  
1035              # Create a hash of timestamps. We use this to prevent duplicate time stamps              # Create a hash of timestamps. We use this to prevent duplicate time stamps
1036              # from showing up for a single PEG's annotations.              # from showing up for a single PEG's annotations.
1037              my %seenTimestamps = ();              my %seenTimestamps = ();
1038              # Check for a functional assignment.              # Get the genome's annotations.
1039              my $func = $fig->function_of($peg);              my @annotations = $fig->read_all_annotations($genomeID);
1040              if ($func) {              Trace("Processing annotations.") if T(2);
1041                  # If this is NOT a hypothetical assignment, we create an              for my $tuple (@annotations) {
1042                  # assignment annotation for it.                  # Get the annotation tuple.
1043                  if (! FIG::hypo($peg)) {                  my ($peg, $timestamp, $user, $text) = @{$tuple};
                     # Note that we double the slashes so that what goes into the database is  
                     # a new-line escape sequence rather than an actual new-line.  
                     $loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func");  
                     $loadIsTargetOfAnnotation->Put($peg, "$peg:$time");  
                     $loadMadeAnnotation->Put("FIG", "$peg:$time");  
                     # Denote we've seen this timestamp.  
                     $seenTimestamps{$time} = 1;  
                 }  
                 # Now loop through the real annotations.  
                 for my $tuple ($fig->feature_annotations($peg, "raw")) {  
                     my ($fid, $timestamp, $user, $text) = $tuple;  
1044                      # Here we fix up the annotation text. "\r" is removed,                      # Here we fix up the annotation text. "\r" is removed,
1045                      # and "\t" and "\n" are escaped. Note we use the "s"                      # and "\t" and "\n" are escaped. Note we use the "s"
1046                      # modifier so that new-lines inside the text do not                      # modifier so that new-lines inside the text do not
# Line 899  Line 1052 
1052                      $text =~ s/Set master function/Set FIG function/s;                      $text =~ s/Set master function/Set FIG function/s;
1053                      # Insure the time stamp is valid.                      # Insure the time stamp is valid.
1054                      if ($timestamp =~ /^\d+$/) {                      if ($timestamp =~ /^\d+$/) {
1055                          # Here it's a number. We need to insure it's unique.                      # Here it's a number. We need to insure the one we use to form
1056                          while ($seenTimestamps{$timestamp}) {                      # the key is unique.
1057                              $timestamp++;                      my $keyStamp = $timestamp;
1058                        while ($seenTimestamps{"$peg:$keyStamp"}) {
1059                            $keyStamp++;
1060                          }                          }
1061                          $seenTimestamps{$timestamp} = 1;                      my $annotationID = "$peg:$keyStamp";
1062                          my $annotationID = "$peg:$timestamp";                      $seenTimestamps{$annotationID} = 1;
1063                          # Insure the user exists.                          # Insure the user exists.
1064                          if (! $users{$user}) {                          if (! $users{$user}) {
1065                              $loadSproutUser->Put($user, "SEED user");                              $loadSproutUser->Put($user, "SEED user");
# Line 912  Line 1067 
1067                              $users{$user} = 1;                              $users{$user} = 1;
1068                          }                          }
1069                          # Generate the annotation.                          # Generate the annotation.
1070                          $loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text");                      $loadAnnotation->Put($annotationID, $timestamp, $text);
1071                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);
1072                          $loadMadeAnnotation->Put($user, $annotationID);                          $loadMadeAnnotation->Put($user, $annotationID);
1073                      } else {                      } else {
# Line 922  Line 1077 
1077                  }                  }
1078              }              }
1079          }          }
1080        # Finish the load.
1081        my $retVal = $self->_FinishAll();
1082        return $retVal;
1083    }
1084    
1085    =head3 LoadSourceData
1086    
1087    C<< my $stats = $spl->LoadSourceData(); >>
1088    
1089    Load the source data from FIG into Sprout.
1090    
1091    Source data links genomes to information about the organizations that
1092    mapped them.
1093    
1094    The following relations are loaded by this method.
1095    
1096        ComesFrom
1097        Source
1098        SourceURL
1099    
1100    There is no direct support for source attribution in FIG, so we access the SEED
1101    files directly.
1102    
1103    =over 4
1104    
1105    =item RETURNS
1106    
1107    Returns a statistics object for the loads.
1108    
1109    =back
1110    
1111    =cut
1112    #: Return Type $%;
sub LoadSourceData {
    # Load the source-attribution tables (ComesFrom, Source, SourceURL).
    #
    # For each genome in the genome hash, the first line of the file
    # "$FIG_Config::organisms/<genomeID>/PROJECT" is read and split on tabs
    # into a source ID, a description, and a URL. Genomes whose PROJECT file
    # cannot be opened, is empty, or has no source ID are skipped.
    #
    # Returns the value produced by _FinishAll (documented in the POD for
    # this method as a statistics object for the loads).

    # Get this object instance.
    my ($self) = @_;
    # Get the genome hash. Its keys are the IDs of the genomes to process.
    my $genomeHash = $self->{genomes};
    # Create load objects for each of the tables we're loading.
    my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly);
    my $loadSource = $self->_TableLoader('Source');
    my $loadSourceURL = $self->_TableLoader('SourceURL');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating source data.") if T(2);
        # Create hashes to collect the Source information. %sourceURL tracks
        # the sources whose URL has already been written, so each source gets
        # at most one SourceURL record. %sourceDesc maps each source ID to the
        # description that will be written once all genomes are processed.
        my %sourceURL = ();
        my %sourceDesc = ();
        # Loop through the genomes. The keys are sorted so the output order
        # is repeatable from run to run.
        for my $genomeID (sort keys %{$genomeHash}) {
            Trace("Processing $genomeID.") if T(3);
            # Open the project file. A lexical handle is used so that a failed
            # open never leaves us reading from or closing a stale global handle.
            my $projectFile = "$FIG_Config::organisms/$genomeID/PROJECT";
            my $projectHandle;
            if (! open($projectHandle, '<', $projectFile)) {
                # Not every genome has source information, so this is not fatal.
                Trace("No readable PROJECT file for $genomeID: $!") if T(3);
                next;
            }
            # Only the first line of the file is used.
            my $line = <$projectHandle>;
            close $projectHandle;
            next if (! defined($line));
            chomp $line;
            my ($sourceID, $desc, $url) = split(/\t/, $line);
            # A blank first line produces no source ID. Skip it rather than
            # writing records keyed on an empty value.
            next if (! defined($sourceID) || $sourceID eq '');
            $loadComesFrom->Put($genomeID, $sourceID);
            # Record the URL the first time one is seen for this source.
            if ($url && ! exists $sourceURL{$sourceID}) {
                $loadSourceURL->Put($sourceID, $url);
                $sourceURL{$sourceID} = 1;
            }
            # A real description always wins (the last one seen is kept). If
            # none has been seen yet, the source ID stands in for it.
            if ($desc) {
                $sourceDesc{$sourceID} = $desc;
            } elsif (! exists $sourceDesc{$sourceID}) {
                $sourceDesc{$sourceID} = $sourceID;
            }
        }
        # Write the source descriptions.
        for my $sourceID (keys %sourceDesc) {
            $loadSource->Put($sourceID, $sourceDesc{$sourceID});
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1162    
1163    =head3 LoadExternalData
1164    
1165    C<< my $stats = $spl->LoadExternalData(); >>
1166    
1167    Load the external data from FIG into Sprout.
1168    
1169    External data contains information about external feature IDs.
1170    
1171    The following relations are loaded by this method.
1172    
1173        ExternalAliasFunc
1174        ExternalAliasOrg
1175    
1176    The support for external IDs in FIG is hidden beneath layers of other data, so
1177    we access the SEED files directly to create these tables. This is also one of
1178    the few load methods that does not proceed genome by genome.
1179    
1180    =over 4
1181    
1182    =item RETURNS
1183    
1184    Returns a statistics object for the loads.
1185    
1186    =back
1187    
1188    =cut
1189    #: Return Type $%;
sub LoadExternalData {
    # Purpose: build the load files for the ExternalAliasOrg and
    # ExternalAliasFunc relations from the global ext_org.table and
    # ext_func.table files, then finish all pending loads and return the
    # accumulated statistics.
    #
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    # Convert the genome hash. We'll get the genus and species for each genome and make
    # it the key.
    # NOTE(review): %speciesHash is not referenced again in this method; it is
    # kept only because the effects of genus_species cannot be confirmed here.
    my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
    # Create load objects for each of the tables we're loading.
    my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
    my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating external data.") if T(2);
        # We loop through the files one at a time. First, the organism file.
        Open(\*ORGS, "<$FIG_Config::global/ext_org.table");
        my $orgLine;
        while (defined($orgLine = <ORGS>)) {
            # Clean the input line.
            chomp $orgLine;
            # Parse the organism name. Fields are tab-delimited, and white
            # space around each tab is discarded.
            my ($protID, $name) = split /\s*\t\s*/, $orgLine;
            $loadExternalAliasOrg->Put($protID, $name);
        }
        close ORGS;
        # Now the function file.
        my $funcLine;
        Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");
        while (defined($funcLine = <FUNCS>)) {
            # Clean the line ending.
            chomp $funcLine;
            # Only proceed if the line is non-blank.
            if ($funcLine) {
                # Split it into fields.
                my @funcFields = split /\s*\t\s*/, $funcLine;
                # If there's an EC number, append it to the description. The
                # capture stops at the last non-space character, so trailing
                # white space in the third field is dropped.
                if ($#funcFields >= 2 && $funcFields[2] =~ /^(EC .*\S)/) {
                    $funcFields[1] .= " $1";
                }
                # Output the function line (ID and description only).
                $loadExternalAliasFunc->Put(@funcFields[0,1]);
            }
        }
        # Release the function file, mirroring the handling of ORGS above.
        close FUNCS;
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1241    
1242    
1243    =head3 LoadReactionData
1244    
1245    C<< my $stats = $spl->LoadReactionData(); >>
1246    
1247    Load the reaction data from FIG into Sprout.
1248    
1249    Reaction data connects reactions to the compounds that participate in them.
1250    
1251    The following relations are loaded by this method.
1252    
1253        Reaction
1254        ReactionURL
1255        Compound
1256        CompoundName
1257        CompoundCAS
1258        IsAComponentOf
1259    
1260    This method proceeds reaction by reaction rather than genome by genome.
1261    
1262    =over 4
1263    
1264    =item RETURNS
1265    
1266    Returns a statistics object for the loads.
1267    
1268    =back
1269    
1270    =cut
1271    #: Return Type $%;
sub LoadReactionData {
    # Purpose: build the load files for the Reaction, ReactionURL, Compound,
    # CompoundName, CompoundCAS, and IsAComponentOf relations from the FIG
    # object's compound and reaction data, then finish all pending loads and
    # return the accumulated statistics.
    #
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Create load objects for each of the tables we're loading. The optional
    # second argument to _TableLoader is its "ignore" flag.
    my $loadReaction = $self->_TableLoader('Reaction');
    my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly);
    my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly);
    my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly);
    my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly);
    my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly);
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating reaction data.") if T(2);
        # First we create the compounds.
        my @compounds = $fig->all_compounds();
        for my $cid (@compounds) {
            # Check for names.
            my @names = $fig->names_of_compound($cid);
            # Each name will be given a priority number, starting with 1.
            my $prio = 1;
            for my $name (@names) {
                $loadCompoundName->Put($cid, $name, $prio++);
            }
            # Create the main compound record. Note that the first name
            # becomes the label; a compound with no names is labelled with
            # its own ID.
            my $label = (@names > 0 ? $names[0] : $cid);
            $loadCompound->Put($cid, $label);
            # Check for a CAS ID.
            my $cas = $fig->cas($cid);
            if ($cas) {
                $loadCompoundCAS->Put($cid, $cas);
            }
        }
        # All the compounds are set up, so we need to loop through the reactions next. First,
        # we initialize the discriminator index. This is a single integer used to ensure
        # duplicate elements in a reaction are not accidentally collapsed. It is never
        # reset, so each IsAComponentOf record written by this call gets a distinct value.
        my $discrim = 0;
        my @reactions = $fig->all_reactions();
        for my $reactionID (@reactions) {
            # Create the reaction record.
            $loadReaction->Put($reactionID, $fig->reversible($reactionID));
            # Compute the reaction's URL.
            my $url = HTML::reaction_link($reactionID);
            # Put it in the ReactionURL table.
            $loadReactionURL->Put($reactionID, $url);
            # Now we need all of the reaction's compounds. We get these in two phases,
            # substrates first (0) and then products (1).
            for my $product (0, 1) {
                # Get the compounds of the current type for the current reaction. FIG will
                # give us 3-tuples: [ID, stoichiometry, main-flag]. At this time we do not
                # have location data in SEED, so it defaults to the empty string.
                my @components = $fig->reaction2comp($reactionID, $product);
                for my $compData (@components) {
                    # Extract the compound data from the current tuple.
                    my ($cid, $stoich, $main) = @{$compData};
                    # Link the compound to the reaction.
                    $loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main,
                                             $product, $stoich);
                }
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1341    
1342    =head3 LoadGroupData
1343    
1344    C<< my $stats = $spl->LoadGroupData(); >>
1345    
1346    Load the genome Groups into Sprout.
1347    
1348    The following relations are loaded by this method.
1349    
1350        GenomeGroups
1351    
1352    There is no direct support for genome groups in FIG, so we access the SEED
1353    files directly.
1354    
1355    =over 4
1356    
1357    =item RETURNS
1358    
1359    Returns a statistics object for the loads.
1360    
1361    =back
1362    
1363    =cut
1364    #: Return Type $%;
1365    sub LoadGroupData {
1366        # Get this object instance.
1367        my ($self) = @_;
1368        # Get the FIG object.
1369        my $fig = $self->{fig};
1370        # Get the genome hash.
1371        my $genomeHash = $self->{genomes};
1372        # Create a load object for the table we're loading.
1373        my $loadGenomeGroups = $self->_TableLoader('GenomeGroups');
1374        if ($self->{options}->{loadOnly}) {
1375            Trace("Loading from existing files.") if T(2);
1376        } else {
1377            Trace("Generating group data.") if T(2);
1378            # Loop through the genomes.
1379            my $line;
1380            for my $genomeID (keys %{$genomeHash}) {
1381                Trace("Processing $genomeID.") if T(3);
1382                # Open the NMPDR group file for this genome.
1383                if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
1384                    defined($line = <TMP>)) {
1385                    # Clean the line ending.
1386                    chomp $line;
1387                    # Add the group to the table. Note that there can only be one group
1388                    # per genome.
1389                    $loadGenomeGroups->Put($genomeID, $line);
1390                }
1391                close TMP;
1392            }
1393      }      }
1394      # Finish the load.      # Finish the load.
1395      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 944  Line 1412 
1412    
1413  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
1414    
1415  =item rowCount (optional)  =item ignore
1416    
1417  Estimated maximum number of rows in the table.  TRUE if the table should be ignored entirely, else FALSE.
1418    
1419  =item RETURN  =item RETURN
1420    
# Line 958  Line 1426 
1426    
1427  sub _TableLoader {  sub _TableLoader {
1428      # Get the parameters.      # Get the parameters.
1429      my ($self, $tableName, $rowCount) = @_;      my ($self, $tableName, $ignore) = @_;
1430      # Create the load object.      # Create the load object.
1431      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount);      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly,
1432                                   $ignore);
1433      # Cache it in the loader list.      # Cache it in the loader list.
1434      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
1435      # Return it to the caller.      # Return it to the caller.
# Line 997  Line 1466 
1466      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
1467      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads restartable.
1468      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
1469            # Get the relation name.
1470            my $relName = $loader->RelName;
1471            # Check the ignore flag.
1472            if ($loader->Ignore) {
1473                Trace("Relation $relName not loaded.") if T(2);
1474            } else {
1475                # Here we really need to finish.
1476                Trace("Finishing $relName.") if T(2);
1477          my $stats = $loader->Finish();          my $stats = $loader->Finish();
1478                if ($self->{options}->{dbLoad}) {
1479                    # Here we want to use the load file just created to load the database.
1480                    Trace("Loading relation $relName.") if T(2);
1481                    my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]);
1482                    # Accumulate the statistics from the DB load.
1483                    $stats->Accumulate($newStats);
1484                }
1485          $retVal->Accumulate($stats);          $retVal->Accumulate($stats);
         my $relName = $loader->RelName;  
1486          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
1487      }      }
1488        }
1489      # Return the load statistics.      # Return the load statistics.
1490      return $retVal;      return $retVal;
1491  }  }

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.40

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3