[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory | Revision Log | View Patch

revision 1.1, Sun Aug 14 23:32:09 2005 UTC | revision 1.21, Sat Nov 12 03:42:48 2005 UTC
# Line 10  Line 10 
10      use Sprout;      use Sprout;
11      use Stats;      use Stats;
12      use BasicLocation;      use BasicLocation;
13        use HTML;
14    
15  =head1 Sprout Load Methods  =head1 Sprout Load Methods
16    
# Line 40  Line 41 
41  a variable called C<$fig>. This makes it fairly straightforward to determine which  a variable called C<$fig>. This makes it fairly straightforward to determine which
42  FIG methods are required to load the Sprout database.  FIG methods are required to load the Sprout database.
43    
44    This object creates the load files; however, the tables are not created until it
45    is time to actually do the load from the files into the target database.
46    
47  =cut  =cut
48    
49  #: Constructor SproutLoad->new();  #: Constructor SproutLoad->new();
# Line 48  Line 52 
52    
53  =head3 new  =head3 new
54    
55  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile); >>  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>
56    
57  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
58  the name of the files containing the list of genomes and subsystems to use.  the name of the files containing the list of genomes and subsystems to use.
# Line 79  Line 83 
83  to a list of subsystem names. If nothing is specified, all known subsystems will be  to a list of subsystem names. If nothing is specified, all known subsystems will be
84  considered trusted. Only subsystem data related to the trusted subsystems is loaded.  considered trusted. Only subsystem data related to the trusted subsystems is loaded.
85    
86    =item options
87    
88    Reference to a hash of command-line options.
89    
90  =back  =back
91    
92  =cut  =cut
93    
94  sub new {  sub new {
95      # Get the parameters.      # Get the parameters.
96      my ($class, $sprout, $fig, $genomeFile, $subsysFile) = @_;      my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
97      # Load the list of genomes into a hash.      # Load the list of genomes into a hash.
98      my %genomes;      my %genomes;
99      if (! defined($genomeFile) || $genomeFile eq '') {      if (! defined($genomeFile) || $genomeFile eq '') {
100          # Here we want all the complete genomes and an access code of 1.          # Here we want all the complete genomes and an access code of 1.
101          my @genomeList = $fig->genomes(1);          my @genomeList = $fig->genomes(1);
102          %genomes = map { $_ => 1 } @genomeList;          %genomes = map { $_ => 1 } @genomeList;
103      } elsif (ref $genomeFile eq 'HASH') {      } else {
104            my $type = ref $genomeFile;
105            Trace("Genome file parameter type is \"$type\".") if T(3);
106            if ($type eq 'HASH') {
107          # Here the user specified a hash of genome IDs to access codes, which is          # Here the user specified a hash of genome IDs to access codes, which is
108          # exactly what we want.          # exactly what we want.
109          %genomes = %{$genomeFile};          %genomes = %{$genomeFile};
110      } elsif (ref $genomeFile eq 'SCALAR') {          } elsif (! $type || $type eq 'SCALAR' ) {
111          # The caller specified a file, so read the genomes from the file.              # The caller specified a file, so read the genomes from the file. (Note
112                # that some PERLs return an empty string rather than SCALAR.)
113          my @genomeList = Tracer::GetFile($genomeFile);          my @genomeList = Tracer::GetFile($genomeFile);
114          if (! @genomeList) {          if (! @genomeList) {
115              # It's an error if the genome file is empty or not found.              # It's an error if the genome file is empty or not found.
# Line 114  Line 126 
126              }              }
127          }          }
128      } else {      } else {
129          Confess("Invalid genome parameter in SproutLoad constructor.");              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
130            }
131      }      }
132      # Load the list of trusted subsystems.      # Load the list of trusted subsystems.
133      my %subsystems = ();      my %subsystems = ();
134      if (! defined $subsysFile || $subsysFile eq '') {      if (! defined $subsysFile || $subsysFile eq '') {
135          # Here we want all the subsystems.          # Here we want all the subsystems.
136          %subsystems = map { $_ => 1 } $fig->all_subsystems();          %subsystems = map { $_ => 1 } $fig->all_subsystems();
137      } elsif (ref $subsysFile eq 'ARRAY') {      } else {
138            my $type = ref $subsysFile;
139            if ($type eq 'ARRAY') {
140          # Here the user passed in a list of subsystems.          # Here the user passed in a list of subsystems.
141          %subsystems = map { $_ => 1 } @{$subsysFile};          %subsystems = map { $_ => 1 } @{$subsysFile};
142      } elsif (ref $subsysFile eq 'SCALAR') {          } elsif (! $type || $type eq 'SCALAR') {
143          # Here the list of subsystems is in a file.          # Here the list of subsystems is in a file.
144          if (! -e $subsysFile) {          if (! -e $subsysFile) {
145              # It's an error if the file does not exist.              # It's an error if the file does not exist.
# Line 137  Line 152 
152      } else {      } else {
153          Confess("Invalid subsystem parameter in SproutLoad constructor.");          Confess("Invalid subsystem parameter in SproutLoad constructor.");
154      }      }
155        }
156      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
157      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
158      # Create the Sprout load object.      # Create the Sprout load object.
# Line 147  Line 163 
163                    sprout => $sprout,                    sprout => $sprout,
164                    loadDirectory => $directory,                    loadDirectory => $directory,
165                    erdb => $sprout->{_erdb},                    erdb => $sprout->{_erdb},
166                    loaders => []                    loaders => [],
167                      options => $options
168                   };                   };
169      # Bless and return it.      # Bless and return it.
170      bless $retVal, $class;      bless $retVal, $class;
# Line 209  Line 226 
226      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
227      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
228          Trace("Loading data for genome $genomeID.") if T(3);          Trace("Loading data for genome $genomeID.") if T(3);
229            $loadGenome->Add("genomeIn");
230          # The access code comes in via the genome hash.          # The access code comes in via the genome hash.
231          my $accessCode = $genomeHash->{$genomeID};          my $accessCode = $genomeHash->{$genomeID};
232          # Get the genus, species, and strain from the scientific name. Note that we append          # Get the genus, species, and strain from the scientific name. Note that we append
233          # the genome ID to the strain. In some cases this is the totality of the strain name.          # the genome ID to the strain. In some cases this is the totality of the strain name.
234          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);
235          my $extra = join " ", @extraData, "[genomeID]";          my $extra = join " ", @extraData, "[$genomeID]";
236          # Get the full taxonomy.          # Get the full taxonomy.
237          my $taxonomy = $fig->taxonomy_of($genomeID);          my $taxonomy = $fig->taxonomy_of($genomeID);
238          # Output the genome record.          # Output the genome record.
# Line 224  Line 242 
242          my @contigs = $fig->all_contigs($genomeID);          my @contigs = $fig->all_contigs($genomeID);
243          for my $contigID (@contigs) {          for my $contigID (@contigs) {
244              Trace("Processing contig $contigID for $genomeID.") if T(4);              Trace("Processing contig $contigID for $genomeID.") if T(4);
245                $loadContig->Add("contigIn");
246                $loadSequence->Add("contigIn");
247              # Create the contig ID.              # Create the contig ID.
248              my $sproutContigID = "$genomeID:$contigID";              my $sproutContigID = "$genomeID:$contigID";
249              # Create the contig record and relate it to the genome.              # Create the contig record and relate it to the genome.
# Line 235  Line 255 
255              # Now we get the sequence a chunk at a time.              # Now we get the sequence a chunk at a time.
256              my $contigLen = $fig->contig_ln($genomeID, $contigID);              my $contigLen = $fig->contig_ln($genomeID, $contigID);
257              for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {              for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {
258                    $loadSequence->Add("chunkIn");
259                  # Compute the endpoint of this chunk.                  # Compute the endpoint of this chunk.
260                  my $end = FIG::min($i + $chunkSize - 1, $contigLen);                  my $end = FIG::min($i + $chunkSize - 1, $contigLen);
261                  # Get the actual DNA.                  # Get the actual DNA.
# Line 299  Line 320 
320      # Loop through the genomes found.      # Loop through the genomes found.
321      for my $genome (sort keys %{$genomeFilter}) {      for my $genome (sort keys %{$genomeFilter}) {
322          Trace("Generating coupling data for $genome.") if T(3);          Trace("Generating coupling data for $genome.") if T(3);
323            $loadCoupling->Add("genomeIn");
324          # Create a hash table for holding coupled pairs. We use this to prevent          # Create a hash table for holding coupled pairs. We use this to prevent
325          # duplicates. For example, if A is coupled to B, we don't want to also          # duplicates. For example, if A is coupled to B, we don't want to also
326          # assert that B is coupled to A, because we already know it. Fortunately,          # assert that B is coupled to A, because we already know it. Fortunately,
# Line 309  Line 331 
331          my @pegs = $fig->pegs_of($genome);          my @pegs = $fig->pegs_of($genome);
332          # Loop through the PEGs.          # Loop through the PEGs.
333          for my $peg1 (@pegs) {          for my $peg1 (@pegs) {
334                $loadCoupling->Add("pegIn");
335              Trace("Processing PEG $peg1 for $genome.") if T(4);              Trace("Processing PEG $peg1 for $genome.") if T(4);
336              # Get a list of the coupled PEGs.              # Get a list of the coupled PEGs.
337              my @couplings = $fig->coupled_to($peg1);              my @couplings = $fig->coupled_to($peg1);
# Line 319  Line 342 
342                  # Compute the coupling ID.                  # Compute the coupling ID.
343                  my $coupleID = Sprout::CouplingID($peg1, $peg2);                  my $coupleID = Sprout::CouplingID($peg1, $peg2);
344                  if (! exists $dupHash{$coupleID}) {                  if (! exists $dupHash{$coupleID}) {
345                        $loadCoupling->Add("couplingIn");
346                      # Here we have a new coupling to store in the load files.                      # Here we have a new coupling to store in the load files.
347                      Trace("Storing coupling ($coupleID) with score $score.") if T(4);                      Trace("Storing coupling ($coupleID) with score $score.") if T(4);
348                      # Ensure we don't do this again.                      # Ensure we don't do this again.
# Line 334  Line 358 
358                      my %evidenceMap = ();                      my %evidenceMap = ();
359                      # Process each evidence item.                      # Process each evidence item.
360                      for my $evidenceData (@evidence) {                      for my $evidenceData (@evidence) {
361                            $loadPCH->Add("evidenceIn");
362                          my ($peg3, $peg4, $usage) = @{$evidenceData};                          my ($peg3, $peg4, $usage) = @{$evidenceData};
363                          # Only proceed if the evidence is from a Sprout                          # Only proceed if the evidence is from a Sprout
364                          # genome.                          # genome.
365                          if ($genomeFilter->{$fig->genome_of($peg3)}) {                          if ($genomeFilter->{$fig->genome_of($peg3)}) {
366                                $loadUsesAsEvidence->Add("evidenceChosen");
367                              my $evidenceKey = "$coupleID $peg3 $peg4";                              my $evidenceKey = "$coupleID $peg3 $peg4";
368                              # We store this evidence in the hash if the usage                              # We store this evidence in the hash if the usage
369                              # is nonzero or no prior evidence has been found. This                              # is nonzero or no prior evidence has been found. This
370                              # insures that if there is duplicate evidence, we                              # insures that if there is duplicate evidence, we
371                              # at least keep the meaningful ones. Only evidence is                              # at least keep the meaningful ones. Only evidence in
372                              # the hash makes it to the output.                              # the hash makes it to the output.
373                              if ($usage || ! exists $evidenceMap{$evidenceKey}) {                              if ($usage || ! exists $evidenceMap{$evidenceKey}) {
374                                  $evidenceMap{$evidenceKey} = $evidenceData;                                  $evidenceMap{$evidenceKey} = $evidenceData;
# Line 357  Line 383 
383                          $loadIsEvidencedBy->Put($coupleID, $evidenceID);                          $loadIsEvidencedBy->Put($coupleID, $evidenceID);
384                          # Connect it to the features.                          # Connect it to the features.
385                          $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);                          $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);
386                          $loadUsesAsEvidence->Put($evidenceID, $peg4, 1);                          $loadUsesAsEvidence->Put($evidenceID, $peg4, 2);
387                      }                      }
388                  }                  }
389              }              }
# Line 400  Line 426 
426      my ($self) = @_;      my ($self) = @_;
427      # Get the FIG object.      # Get the FIG object.
428      my $fig = $self->{fig};      my $fig = $self->{fig};
429        # Find out if this is a limited run.
430        my $limited = $self->{options}->{limitedFeatures};
431      # Get the table of genome IDs.      # Get the table of genome IDs.
432      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
433      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
434      my $featureCount = $genomeCount * 4000;      my $featureCount = $genomeCount * 4000;
435      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
436      my $loadFeature = $self->_TableLoader('Feature', $featureCount);      my $loadFeature = $self->_TableLoader('Feature', $featureCount);
     my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);  
     my $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);  
     my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);  
     my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);  
437      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);
438        my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);
439        my ($loadFeatureLink, $loadFeatureTranslation, $loadFeatureUpstream);
440        if (! $limited) {
441            $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);
442            $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);
443            $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);
444        }
445      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
446      # locations.      # locations.
447      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
# Line 418  Line 449 
449      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
450      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
451          Trace("Loading features for genome $genomeID.") if T(3);          Trace("Loading features for genome $genomeID.") if T(3);
452            $loadFeature->Add("genomeIn");
453          # Get the feature list for this genome.          # Get the feature list for this genome.
454          my $features = $fig->all_features_detailed($genomeID);          my $features = $fig->all_features_detailed($genomeID);
455          # Loop through the features.          # Loop through the features.
456          for my $featureData (@{$features}) {          for my $featureData (@{$features}) {
457                $loadFeature->Add("featureIn");
458              # Split the tuple.              # Split the tuple.
459              my ($featureID, $locations, $aliases, $type) = @{$featureData};              my ($featureID, $locations, undef, $type) = @{$featureData};
460              # Create the feature record.              # Create the feature record.
461              $loadFeature->Put($featureID, 1, $type);              $loadFeature->Put($featureID, 1, $type);
462              # Create the aliases.              # Create the aliases.
463              for my $alias (split /\s*,\s*/, $aliases) {              for my $alias ($fig->feature_aliases($featureID)) {
464                  $loadFeatureAlias->Put($featureID, $alias);                  $loadFeatureAlias->Put($featureID, $alias);
465              }              }
466                # The next stuff is for a full load only.
467                if (! $limited) {
468              # Get the links.              # Get the links.
469              my @links = $fig->fid_links($featureID);              my @links = $fig->fid_links($featureID);
470              for my $link (@links) {              for my $link (@links) {
# Line 437  Line 472 
472              }              }
473              # If this is a peg, generate the translation and the upstream.              # If this is a peg, generate the translation and the upstream.
474              if ($type eq 'peg') {              if ($type eq 'peg') {
475                        $loadFeatureTranslation->Add("pegIn");
476                  my $translation = $fig->get_translation($featureID);                  my $translation = $fig->get_translation($featureID);
477                  if ($translation) {                  if ($translation) {
478                      $loadFeatureTranslation->Put($featureID, $translation);                      $loadFeatureTranslation->Put($featureID, $translation);
# Line 447  Line 483 
483                      $loadFeatureUpstream->Put($featureID, $upstream);                      $loadFeatureUpstream->Put($featureID, $upstream);
484                  }                  }
485              }              }
486                }
487              # This part is the roughest. We need to relate the features to contig              # This part is the roughest. We need to relate the features to contig
488              # locations, and the locations must be split so that none of them exceed              # locations, and the locations must be split so that none of them exceed
489              # the maximum segment size. This simplifies the genes_in_region processing              # the maximum segment size. This simplifies the genes_in_region processing
490              # for Sprout.              # for Sprout.
491              my @locationList = split /\s*,\s*/, $locations;              my @locationList = split /\s*,\s*/, $locations;
492                # Create the location position indicator.
493                my $i = 1;
494              # Loop through the locations.              # Loop through the locations.
495              for my $location (@locationList) {              for my $location (@locationList) {
496                  # Parse the location.                  # Parse the location.
497                  my $locObject = BasicLocation->new($location);                  my $locObject = BasicLocation->new("$genomeID:$location");
498                  # Split it into a list of chunks.                  # Split it into a list of chunks.
499                  my @locOList = ();                  my @locOList = ();
500                  while (my $peeling = $locObject->Peel($chunkSize)) {                  while (my $peeling = $locObject->Peel($chunkSize)) {
501                        $loadIsLocatedIn->Add("peeling");
502                      push @locOList, $peeling;                      push @locOList, $peeling;
503                  }                  }
504                  push @locOList, $locObject;                  push @locOList, $locObject;
505                  # Loop through the chunks, creating IsLocatedIn records. The variable                  # Loop through the chunks, creating IsLocatedIn records. The variable
506                  # "$i" will be used to keep the location index.                  # "$i" will be used to keep the location index.
                 my $i = 1;  
507                  for my $locChunk (@locOList) {                  for my $locChunk (@locOList) {
508                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,
509                                            $locChunk->Dir, $locChunk->Length, $i);                                            $locChunk->Dir, $locChunk->Length, $i);
# Line 502  Line 541 
541    
542  =cut  =cut
543  #: Return Type $%;  #: Return Type $%;
544  sub LoadFeatureData {  sub LoadBBHData {
545      # Get this object instance.      # Get this object instance.
546      my ($self) = @_;      my ($self) = @_;
547      # Get the FIG object.      # Get the FIG object.
# Line 517  Line 556 
556      Trace("Beginning BBH load.") if T(2);      Trace("Beginning BBH load.") if T(2);
557      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
558      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
559            $loadIsBidirectionalBestHitOf->Add("genomeIn");
560          Trace("Processing features for genome $genomeID.") if T(3);          Trace("Processing features for genome $genomeID.") if T(3);
561          # Get the feature list for this genome.          # Get the feature list for this genome.
562          my $features = $fig->all_features_detailed($genomeID);          my $features = $fig->all_features_detailed($genomeID);
# Line 560  Line 600 
600    
601      Subsystem      Subsystem
602      Role      Role
603        RoleEC
604      SSCell      SSCell
605      ContainsFeature      ContainsFeature
606      IsGenomeOf      IsGenomeOf
# Line 567  Line 608 
608      OccursInSubsystem      OccursInSubsystem
609      ParticipatesIn      ParticipatesIn
610      HasSSCell      HasSSCell
611        ConsistsOfRoles
612        RoleSubset
613        HasRoleSubset
614        ConsistsOfGenomes
615        GenomeSubset
616        HasGenomeSubset
617        Catalyzes
618        Diagram
619        RoleOccursIn
620    
621  =over 4  =over 4
622    
# Line 576  Line 626 
626    
627  =back  =back
628    
 B<TO DO>  
   
 Generate RoleName table?  
   
629  =cut  =cut
630  #: Return Type $%;  #: Return Type $%;
631  sub LoadSubsystemData {  sub LoadSubsystemData {
# Line 596  Line 642 
642      my $subsysCount = @subsysIDs;      my $subsysCount = @subsysIDs;
643      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
644      my $featureCount = $genomeCount * 4000;      my $featureCount = $genomeCount * 4000;
645        # Get the map list.
646        my @maps = $fig->all_maps;
647        my $mapCount = @maps;
648      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
649        my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);
650        my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);
651      my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);      my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);
652      my $loadRole = $self->_TableLoader('Role', $featureCount * 6);      my $loadRole = $self->_TableLoader('Role', $featureCount * 6);
653        my $loadRoleEC = $self->_TableLoader('RoleEC', $featureCount * 6);
654        my $loadCatalyzes = $self->_TableLoader('Catalyzes', $genomeCount * $featureCount);
655      my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);      my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);
656      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);
657      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);
# Line 606  Line 659 
659      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);
660      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);
661      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);
662        my $loadRoleSubset = $self->_TableLoader('RoleSubset', $subsysCount * 50);
663        my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $subsysCount * 50);
664        my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $featureCount * $genomeCount);
665        my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $featureCount * $genomeCount);
666        my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $subsysCount * 50);
667        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $subsysCount * 50);
668        # Create load objects for each of the tables we're loading.
669      Trace("Beginning subsystem data load.") if T(2);      Trace("Beginning subsystem data load.") if T(2);
670        # This hash will contain the role for each EC. When we're done, this
671        # information will be used to generate the Catalyzes table.
672        my %ecToRoles = ();
673      # Loop through the subsystems. Our first task will be to create the      # Loop through the subsystems. Our first task will be to create the
674      # roles. We do this by looping through the subsystems and creating a      # roles. We do this by looping through the subsystems and creating a
675      # role hash. The hash tracks each role ID so that we don't create      # role hash. The hash tracks each role ID so that we don't create
676      # duplicates. As we move along, we'll connect the roles and subsystems.      # duplicates. As we move along, we'll connect the roles and subsystems
677        # and memorize up the reactions.
678        my ($genomeID, $roleID);
679      my %roleData = ();      my %roleData = ();
680      for my $subsysID (@subsysIDs) {      for my $subsysID (@subsysIDs) {
681          Trace("Creating subsystem $subsysID.") if T(3);          Trace("Creating subsystem $subsysID.") if T(3);
682            $loadSubsystem->Add("subsystemIn");
683            # Get the subsystem object.
684            my $sub = $fig->get_subsystem($subsysID);
685          # Create the subsystem record.          # Create the subsystem record.
686          $loadSubsystem->Put($subsysID);          my $curator = $sub->get_curator();
687          # Get the subsystem's roles.          my $notes = $sub->get_notes();
688          my @roles = $fig->subsys_to_roles($subsysID);          $loadSubsystem->Put($subsysID, $curator, $notes);
689          # Connect the roles to the subsystem. If a role is new, we create          # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
690          # a role record for it.          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
691          for my $roleID (@roles) {              # Connect to this role.
692              $loadOccursInSubsystem->Put($roleID, $subsysID);              $loadOccursInSubsystem->Add("roleIn");
693                $loadOccursInSubsystem->Put($roleID, $subsysID, $col);
694                # If it's a new role, add it to the role table.
695              if (! exists $roleData{$roleID}) {              if (! exists $roleData{$roleID}) {
696                  $loadRole->Put($roleID);                  # Get the role's abbreviation.
697                    my $abbr = $sub->get_role_abbr($col);
698                    # Add the role.
699                    $loadRole->Put($roleID, $abbr);
700                  $roleData{$roleID} = 1;                  $roleData{$roleID} = 1;
701                    # Check for an EC number.
702                    if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) {
703                        my $ec = $1;
704                        $loadRoleEC->Put($roleID, $ec);
705                        $ecToRoles{$ec} = $roleID;
706              }              }
707          }          }
708          # Now all roles for this subsystem have been filled in. We create the          }
709          # spreadsheet by matches roles to genomes. To do this, we need to          # Now we create the spreadsheet for the subsystem by matching roles to
710          # get the genomes on the sheet.          # genomes. Each genome is a row and each role is a column. We may need
711            # to actually create the roles as we find them.
712          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
713          my @genomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)};          for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) {
714          for my $genomeID (@genomes) {              # Only proceed if this is one of our genomes.
             # Only process this genome if it's one of ours.  
715              if (exists $genomeHash->{$genomeID}) {              if (exists $genomeHash->{$genomeID}) {
716                  # Connect the genome to the subsystem.                  # Count the PEGs and cells found for verification purposes.
717                  $loadParticipatesIn->Put($genomeID, $subsysID);                  my $pegCount = 0;
718                    my $cellCount = 0;
719                    # Create a list for the PEGs we find. This list will be used
720                    # to generate cluster numbers.
721                    my @pegsFound = ();
722                    # Create a hash that maps spreadsheet IDs to PEGs. We will
723                    # use this to generate the ContainsFeature data after we have
724                    # the cluster numbers.
725                    my %cellPegs = ();
726                    # Get the genome's variant code for this subsystem.
727                    my $variantCode = $sub->get_variant_code($row);
728                  # Loop through the subsystem's roles. We use an index because it is                  # Loop through the subsystem's roles. We use an index because it is
729                  # part of the spreadsheet cell ID.                  # part of the spreadsheet cell ID.
730                  for (my $i = 0; $i <= $#roles; $i++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
                     my $role = $roles[$i];  
731                      # Get the features in the spreadsheet cell for this genome and role.                      # Get the features in the spreadsheet cell for this genome and role.
732                      my @pegs = $fig->pegs_in_subsystem_coll($subsysID, $genomeID, $i);                      my @pegs = $sub->get_pegs_from_cell($row, $col);
733                      # Only proceed if features exist.                      # Only proceed if features exist.
734                      if (@pegs > 0) {                      if (@pegs > 0) {
735                          # Create the spreadsheet cell.                          # Create the spreadsheet cell.
736                          my $cellID = "$subsysID:$genomeID:$i";                          $cellCount++;
737                            my $cellID = "$subsysID:$genomeID:$col";
738                          $loadSSCell->Put($cellID);                          $loadSSCell->Put($cellID);
739                          $loadIsGenomeOf->Put($genomeID, $cellID);                          $loadIsGenomeOf->Put($genomeID, $cellID);
740                          $loadIsRoleOf->Put($role, $cellID);                          $loadIsRoleOf->Put($roleID, $cellID);
741                          $loadHasSSCell->Put($subsysID, $cellID);                          $loadHasSSCell->Put($subsysID, $cellID);
742                          # Attach the features to it.                          # Remember its features.
743                          for my $pegID (@pegs) {                          push @pegsFound, @pegs;
744                              $loadContainsFeature->Put($cellID, $pegID);                          $cellPegs{$cellID} = \@pegs;
745                            $pegCount += @pegs;
746                        }
747                    }
748                    # If we found some cells for this genome, we need to compute clusters and
749                    # denote it participates in the subsystem.
750                    if ($pegCount > 0) {
751                        Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
752                        $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
753                        # Partition the PEGs found into clusters.
754                        my @clusters = $fig->compute_clusters(\@pegsFound, $sub);
755                        # Create a hash mapping PEG IDs to cluster numbers.
756                        # We default to -1 for all of them.
757                        my %clusterOf = map { $_ => -1 } @pegsFound;
758                        for (my $i = 0; $i <= $#clusters; $i++) {
759                            my $subList = $clusters[$i];
760                            for my $peg (@{$subList}) {
761                                $clusterOf{$peg} = $i;
762                            }
763                        }
764                        # Create the ContainsFeature data.
765                        for my $cellID (keys %cellPegs) {
766                            my $cellList = $cellPegs{$cellID};
767                            for my $cellPeg (@$cellList) {
768                                $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg});
769                          }                          }
770                      }                      }
771                  }                  }
772              }              }
773          }          }
774            # Now we need to generate the subsets. The subset names must be concatenated to
775            # the subsystem name to make them unique keys. There are two types of subsets:
776            # genome subsets and role subsets. We do the role subsets first.
777            my @subsetNames = $sub->get_subset_names();
778            for my $subsetID (@subsetNames) {
779                # Create the subset record.
780                my $actualID = "$subsysID:$subsetID";
781                $loadRoleSubset->Put($actualID);
782                # Connect the subset to the subsystem.
783                $loadHasRoleSubset->Put($subsysID, $actualID);
784                # Connect the subset to its roles.
785                my @roles = $sub->get_subset($subsetID);
786                for my $roleID (@roles) {
787                    $loadConsistsOfRoles->Put($actualID, $roleID);
788      }      }
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
789  }  }
790            # Next the genome subsets.
791  =head3 LoadDiagramData          @subsetNames = $sub->get_subset_namesR();
792            for my $subsetID (@subsetNames) {
793  C<< my $stats = $spl->LoadDiagramData(); >>              # Create the subset record.
794                my $actualID = "$subsysID:$subsetID";
795  Load the diagram data from FIG into Sprout.              $loadGenomeSubset->Put($actualID);
796                # Connect the subset to the subsystem.
797  Diagrams are used to organize functional roles. The diagram shows the              $loadHasGenomeSubset->Put($subsysID, $actualID);
798  connections between chemicals that interact with a subsystem.              # Connect the subset to its genomes.
799                my @genomes = $sub->get_subsetR($subsetID);
800  The following relations are loaded by this method.              for my $genomeID (@genomes) {
801                    $loadConsistsOfGenomes->Put($actualID, $genomeID);
802      Diagram              }
803      RoleOccursIn          }
804        }
805  =over 4      # Now we loop through the diagrams. We need to create the diagram records
806        # and link each diagram to its roles. Note that only roles which occur
807  =item RETURNS      # in subsystems (and therefore appear in the %ecToRoles hash) are
808        # included.
809  Returns a statistics object for the loads.      for my $map (@maps) {
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadDiagramData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the map list.  
     my @maps = $fig->all_maps;  
     my $mapCount = @maps;  
     my $genomeCount = (keys %{$self->{genomes}});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);  
     my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);  
     Trace("Beginning diagram data load.") if T(2);  
     # Loop through the diagrams.  
     for my $map ($fig->all_maps) {  
810          Trace("Loading diagram $map.") if T(3);          Trace("Loading diagram $map.") if T(3);
811          # Get the diagram's descriptive name.          # Get the diagram's descriptive name.
812          my $name = $fig->map_name($map);          my $name = $fig->map_name($map);
# Line 713  Line 815 
815          # A hash is used to prevent duplicates.          # A hash is used to prevent duplicates.
816          my %roleHash = ();          my %roleHash = ();
817          for my $role ($fig->map_to_ecs($map)) {          for my $role ($fig->map_to_ecs($map)) {
818              if (! $roleHash{$role}) {              if (exists $ecToRoles{$role} && ! $roleHash{$role}) {
819                  $loadRoleOccursIn->Put($role, $map);                  $loadRoleOccursIn->Put($ecToRoles{$role}, $map);
820                  $roleHash{$role} = 1;                  $roleHash{$role} = 1;
821              }              }
822          }          }
823      }      }
824        # Before we leave, we must create the Catalyzes table. We start with the reactions,
825        # then use the "ecToRoles" table to convert EC numbers to role IDs.
826        my @reactions = $fig->all_reactions();
827        for my $reactionID (@reactions) {
828            # Get this reaction's list of roles. The results will be EC numbers.
829            my @roles = $fig->catalyzed_by($reactionID);
830            # Loop through the roles, creating catalyzation records.
831            for my $thisRole (@roles) {
832                if (exists $ecToRoles{$thisRole}) {
833                    $loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID);
834                }
835            }
836        }
837      # Finish the load.      # Finish the load.
838      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
839      return $retVal;      return $retVal;
# Line 770  Line 885 
885      my $nextID = 1;      my $nextID = 1;
886      # Loop through the genomes.      # Loop through the genomes.
887      for my $genomeID (keys %{$genomeHash}) {      for my $genomeID (keys %{$genomeHash}) {
888            $loadProperty->Add("genomeIn");
889          # Get the genome's features. The feature ID is the first field in the          # Get the genome's features. The feature ID is the first field in the
890          # tuples returned by "all_features_detailed". We use "all_features_detailed"          # tuples returned by "all_features_detailed". We use "all_features_detailed"
891          # rather than "all_features" because we want all features regardless of type.          # rather than "all_features" because we want all features regardless of type.
892          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};
893          # Loop through the features, creating HasProperty records.          # Loop through the features, creating HasProperty records.
894          for my $fid (@features) {          for my $fid (@features) {
895                $loadProperty->Add("featureIn");
896              # Get all attributes for this feature. We do this one feature at a time              # Get all attributes for this feature. We do this one feature at a time
897              # to insure we do not get any genome attributes.              # to insure we do not get any genome attributes.
898              my @attributeList = $fig->get_attributes($fid, '', '', '');              my @attributeList = $fig->get_attributes($fid, '', '', '');
# Line 861  Line 978 
978      # Get the current time.      # Get the current time.
979      my $time = time();      my $time = time();
980      # Loop through the genomes.      # Loop through the genomes.
981      for my $genomeID (%{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
982          Trace("Processing $genomeID.") if T(3);          Trace("Processing $genomeID.") if T(3);
983          # Get the genome's PEGs.          # Get the genome's PEGs.
984          my @pegs = $fig->pegs_of($genomeID);          my @pegs = $fig->pegs_of($genomeID);
# Line 884  Line 1001 
1001                      # Denote we've seen this timestamp.                      # Denote we've seen this timestamp.
1002                      $seenTimestamps{$time} = 1;                      $seenTimestamps{$time} = 1;
1003                  }                  }
1004                }
1005                  # Now loop through the real annotations.                  # Now loop through the real annotations.
1006                  for my $tuple ($fig->feature_annotations($peg, "raw")) {                  for my $tuple ($fig->feature_annotations($peg, "raw")) {
1007                      my ($fid, $timestamp, $user, $text) = $tuple;                  my ($fid, $timestamp, $user, $text) = @{$tuple};
1008                      # Here we fix up the annotation text. "\r" is removed,                      # Here we fix up the annotation text. "\r" is removed,
1009                      # and "\t" and "\n" are escaped. Note we use the "s"                      # and "\t" and "\n" are escaped. Note we use the "s"
1010                      # modifier so that new-lines inside the text do not                      # modifier so that new-lines inside the text do not
# Line 898  Line 1016 
1016                      $text =~ s/Set master function/Set FIG function/s;                      $text =~ s/Set master function/Set FIG function/s;
1017                      # Insure the time stamp is valid.                      # Insure the time stamp is valid.
1018                      if ($timestamp =~ /^\d+$/) {                      if ($timestamp =~ /^\d+$/) {
1019                          # Here it's a number. We need to insure it's unique.                      # Here it's a number. We need to insure the one we use to form
1020                          while ($seenTimestamps{$timestamp}) {                      # the key is unique.
1021                              $timestamp++;                      my $keyStamp = $timestamp;
1022                        while ($seenTimestamps{$keyStamp}) {
1023                            $keyStamp++;
1024                          }                          }
1025                          $seenTimestamps{$timestamp} = 1;                      $seenTimestamps{$keyStamp} = 1;
1026                          my $annotationID = "$peg:$timestamp";                      my $annotationID = "$peg:$keyStamp";
1027                          # Insure the user exists.                          # Insure the user exists.
1028                          if (! $users{$user}) {                          if (! $users{$user}) {
1029                              $loadSproutUser->Put($user, "SEED user");                              $loadSproutUser->Put($user, "SEED user");
# Line 911  Line 1031 
1031                              $users{$user} = 1;                              $users{$user} = 1;
1032                          }                          }
1033                          # Generate the annotation.                          # Generate the annotation.
1034                          $loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text");                      $loadAnnotation->Put($annotationID, $timestamp, $text);
1035                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);
1036                          $loadMadeAnnotation->Put($user, $annotationID);                          $loadMadeAnnotation->Put($user, $annotationID);
1037                      } else {                      } else {
# Line 921  Line 1041 
1041                  }                  }
1042              }              }
1043          }          }
1044        # Finish the load.
1045        my $retVal = $self->_FinishAll();
1046        return $retVal;
1047    }
1048    
1049    =head3 LoadSourceData
1050    
1051    C<< my $stats = $spl->LoadSourceData(); >>
1052    
1053    Load the source data from FIG into Sprout.
1054    
1055    Source data links genomes to information about the organizations that
1056    mapped it.
1057    
1058    The following relations are loaded by this method.
1059    
1060        ComesFrom
1061        Source
1062        SourceURL
1063    
1064    There is no direct support for source attribution in FIG, so we access the SEED
1065    files directly.
1066    
1067    =over 4
1068    
1069    =item RETURNS
1070    
1071    Returns a statistics object for the loads.
1072    
1073    =back
1074    
1075    =cut
1076    #: Return Type $%;
1077    sub LoadSourceData {
1078        # Get this object instance.
1079        my ($self) = @_;
1080        # Get the FIG object.
1081        my $fig = $self->{fig};
1082        # Get the genome hash.
1083        my $genomeHash = $self->{genomes};
1084        my $genomeCount = (keys %{$genomeHash});
1085        # Create load objects for each of the tables we're loading.
1086        my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4);
1087        my $loadSource = $self->_TableLoader('Source', $genomeCount * 4);
1088        my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8);
1089        Trace("Beginning source data load.") if T(2);
1090        # Create hashes to collect the Source information.
1091        my %sourceURL = ();
1092        my %sourceDesc = ();
1093        # Loop through the genomes.
1094        my $line;
1095        for my $genomeID (sort keys %{$genomeHash}) {
1096            Trace("Processing $genomeID.") if T(3);
1097            # Open the project file.
1098            if ((open(TMP, "<$FIG_Config::organisms/$genomeID/PROJECT")) &&
1099                defined($line = <TMP>)) {
1100                chomp $line;
1101                my($sourceID, $desc, $url) = split(/\t/,$line);
1102                $loadComesFrom->Put($genomeID, $sourceID);
1103                if ($url && ! exists $sourceURL{$sourceID}) {
1104                    $loadSourceURL->Put($sourceID, $url);
1105                    $sourceURL{$sourceID} = 1;
1106                }
1107                if ($desc) {
1108                    $sourceDesc{$sourceID} = $desc;
1109                } elsif (! exists $sourceDesc{$sourceID}) {
1110                    $sourceDesc{$sourceID} = $sourceID;
1111                }
1112            }
1113            close TMP;
1114        }
1115        # Write the source descriptions.
1116        for my $sourceID (keys %sourceDesc) {
1117            $loadSource->Put($sourceID, $sourceDesc{$sourceID});
1118        }
1119        # Finish the load.
1120        my $retVal = $self->_FinishAll();
1121        return $retVal;
1122    }
1123    
1124    =head3 LoadExternalData
1125    
1126    C<< my $stats = $spl->LoadExternalData(); >>
1127    
1128    Load the external data from FIG into Sprout.
1129    
1130    External data contains information about external feature IDs.
1131    
1132    The following relations are loaded by this method.
1133    
1134        ExternalAliasFunc
1135        ExternalAliasOrg
1136    
1137    The support for external IDs in FIG is hidden beneath layers of other data, so
1138    we access the SEED files directly to create these tables. This is also one of
1139    the few load methods that does not proceed genome by genome.
1140    
1141    =over 4
1142    
1143    =item RETURNS
1144    
1145    Returns a statistics object for the loads.
1146    
1147    =back
1148    
1149    =cut
1150    #: Return Type $%;
1151    sub LoadExternalData {
1152        # Get this object instance.
1153        my ($self) = @_;
1154        # Get the FIG object.
1155        my $fig = $self->{fig};
1156        # Get the genome hash.
1157        my $genomeHash = $self->{genomes};
1158        my $genomeCount = (keys %{$genomeHash});
1159        # Convert the genome hash. We'll get the genus and species for each genome and make
1160        # it the key.
1161        my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
1162        # Create load objects for each of the tables we're loading.
1163        my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000);
1164        my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);
1165        Trace("Beginning external data load.") if T(2);
1166        # We loop through the files one at a time. First, the organism file.
1167        Open(\*ORGS, "<$FIG_Config::global/ext_org.table");
1168        my $orgLine;
1169        while (defined($orgLine = <ORGS>)) {
1170            # Clean the input line.
1171            chomp $orgLine;
1172            # Parse the organism name.
1173            my ($protID, $name) = split /\s*\t\s*/, $orgLine;
1174            $loadExternalAliasOrg->Put($protID, $name);
1175        }
1176        close ORGS;
1177        # Now the function file.
1178        my $funcLine;
1179        Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");
1180        while (defined($funcLine = <FUNCS>)) {
1181            # Clean the line ending.
1182            chomp $funcLine;
1183            # Only proceed if the line is non-blank.
1184            if ($funcLine) {
1185                # Split it into fields.
1186                my @funcFields = split /\s*\t\s*/, $funcLine;
1187                # If there's an EC number, append it to the description.
1188                if ($#funcFields >= 2 && $funcFields[2] =~ /^(EC .*\S)/) {
1189                    $funcFields[1] .= " $1";
1190                }
1191                # Output the function line.
1192                $loadExternalAliasFunc->Put(@funcFields[0,1]);
1193            }
1194        }
1195        # Finish the load.
1196        my $retVal = $self->_FinishAll();
1197        return $retVal;
1198    }
1199    
1200    
1201    =head3 LoadReactionData
1202    
1203    C<< my $stats = $spl->LoadReactionData(); >>
1204    
1205    Load the reaction data from FIG into Sprout.
1206    
1207    Reaction data connects reactions to the compounds that participate in them.
1208    
1209    The following relations are loaded by this method.
1210    
1211        Reaction
1212        ReactionURL
1213        Compound
1214        CompoundName
1215        CompoundCAS
1216        IsAComponentOf
1217    
1218    This method proceeds reaction by reaction rather than genome by genome.
1219    
1220    =over 4
1221    
1222    =item RETURNS
1223    
1224    Returns a statistics object for the loads.
1225    
1226    =back
1227    
1228    =cut
1229    #: Return Type $%;
1230    sub LoadReactionData {
1231        # Get this object instance.
1232        my ($self) = @_;
1233        # Get the FIG object.
1234        my $fig = $self->{fig};
1235        # Get the genome hash.
1236        my $genomeHash = $self->{genomes};
1237        my $genomeCount = (keys %{$genomeHash});
1238        # Create load objects for each of the tables we're loading.
1239        my $loadReaction = $self->_TableLoader('Reaction', $genomeCount * 4000);
1240        my $loadReactionURL = $self->_TableLoader('ReactionURL', $genomeCount * 4000);
1241        my $loadCompound = $self->_TableLoader('Compound', $genomeCount * 4000);
1242        my $loadCompoundName = $self->_TableLoader('CompoundName', $genomeCount * 8000);
1243        my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $genomeCount * 4000);
1244        my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $genomeCount * 12000);
1245        Trace("Beginning reaction/compound data load.") if T(2);
1246        # First we create the compounds.
1247        my @compounds = $fig->all_compounds();
1248        for my $cid (@compounds) {
1249            # Check for names.
1250            my @names = $fig->names_of_compound($cid);
1251            # Each name will be given a priority number, starting with 1.
1252            my $prio = 1;
1253            for my $name (@names) {
1254                $loadCompoundName->Put($cid, $name, $prio++);
1255            }
1256            # Create the main compound record. Note that the first name
1257            # becomes the label.
1258            my $label = (@names > 0 ? $names[0] : $cid);
1259            $loadCompound->Put($cid, $label);
1260            # Check for a CAS ID.
1261            my $cas = $fig->cas($cid);
1262            if ($cas) {
1263                $loadCompoundCAS->Put($cid, $cas);
1264            }
1265        }
1266        # All the compounds are set up, so we need to loop through the reactions next. First,
1267        # we initialize the discriminator index. This is a single integer used to insure
1268        # duplicate elements in a reaction are not accidentally collapsed.
1269        my $discrim = 0;
1270        my @reactions = $fig->all_reactions();
1271        for my $reactionID (@reactions) {
1272            # Create the reaction record.
1273            $loadReaction->Put($reactionID, $fig->reversible($reactionID));
1274            # Compute the reaction's URL.
1275            my $url = HTML::reaction_link($reactionID);
1276            # Put it in the ReactionURL table.
1277            $loadReactionURL->Put($reactionID, $url);
1278            # Now we need all of the reaction's compounds. We get these in two phases,
1279            # substrates first and then products.
1280            for my $product (0, 1) {
1281                # Get the compounds of the current type for the current reaction. FIG will
1282                # give us 3-tuples: [ID, stoichiometry, main-flag]. At this time we do not
1283                # have location data in SEED, so it defaults to the empty string.
1284                my @compounds = $fig->reaction2comp($reactionID, $product);
1285                for my $compData (@compounds) {
1286                    # Extract the compound data from the current tuple.
1287                    my ($cid, $stoich, $main) = @{$compData};
1288                    # Link the compound to the reaction.
1289                    $loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main,
1290                                             $product, $stoich);
1291                }
1292            }
1293        }
1294        # Finish the load.
1295        my $retVal = $self->_FinishAll();
1296        return $retVal;
1297    }
1298    
1299    =head3 LoadGroupData
1300    
1301    C<< my $stats = $spl->LoadGroupData(); >>
1302    
1303    Load the genome Groups into Sprout.
1304    
1305    The following relations are loaded by this method.
1306    
1307        GenomeGroups
1308    
1309    There is no direct support for genome groups in FIG, so we access the SEED
1310    files directly.
1311    
1312    =over 4
1313    
1314    =item RETURNS
1315    
1316    Returns a statistics object for the loads.
1317    
1318    =back
1319    
1320    =cut
1321    #: Return Type $%;
1322    sub LoadGroupData {
1323        # Get this object instance.
1324        my ($self) = @_;
1325        # Get the FIG object.
1326        my $fig = $self->{fig};
1327        # Get the genome hash.
1328        my $genomeHash = $self->{genomes};
1329        my $genomeCount = (keys %{$genomeHash});
1330        # Create a load object for the table we're loading.
1331        my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4);
1332        Trace("Beginning group data load.") if T(2);
1333        # Loop through the genomes.
1334        my $line;
1335        for my $genomeID (keys %{$genomeHash}) {
1336            Trace("Processing $genomeID.") if T(3);
1337            # Open the NMPDR group file for this genome.
1338            if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
1339                defined($line = <TMP>)) {
1340                # Clean the line ending.
1341                chomp $line;
1342                # Add the group to the table. Note that there can only be one group
1343                # per genome.
1344                $loadGenomeGroups->Put($genomeID, $line);
1345            }
1346            close TMP;
1347      }      }
1348      # Finish the load.      # Finish the load.
1349      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 996  Line 1419 
1419      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
1420      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads restartable.
1421      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
1422            # Trace the fact that we're cleaning up.
1423            my $relName = $loader->RelName;
1424            Trace("Finishing load for $relName.") if T(2);
1425          my $stats = $loader->Finish();          my $stats = $loader->Finish();
1426            if ($self->{options}->{dbLoad}) {
1427                # Here we want to use the load file just created to load the database.
1428                Trace("Loading relation $relName.") if T(2);
1429                my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]);
1430                # Accumulate the statistics from the DB load.
1431                $stats->Accumulate($newStats);
1432            }
1433          $retVal->Accumulate($stats);          $retVal->Accumulate($stats);
         my $relName = $loader->RelName;  
1434          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
1435      }      }
1436      # Return the load statistics.      # Return the load statistics.

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.21

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3