[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory | Revision Log | View Patch

revision 1.9, Wed Sep 14 11:21:24 2005 UTC | revision 1.34, Sat May 13 04:13:24 2006 UTC
# Line 10  Line 10 
10      use Sprout;      use Sprout;
11      use Stats;      use Stats;
12      use BasicLocation;      use BasicLocation;
13        use HTML;
14    
15  =head1 Sprout Load Methods  =head1 Sprout Load Methods
16    
# Line 79  Line 80 
80  =item subsysFile  =item subsysFile
81    
82  Either the name of the file containing the list of trusted subsystems or a reference  Either the name of the file containing the list of trusted subsystems or a reference
83  to a list of subsystem names. If nothing is specified, all known subsystems will be  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
84  considered trusted. Only subsystem data related to the trusted subsystems is loaded.  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
85    in its data directory.) Only subsystem data related to the trusted subsystems is loaded.
86    
87  =item options  =item options
88    
# Line 131  Line 133 
133      # Load the list of trusted subsystems.      # Load the list of trusted subsystems.
134      my %subsystems = ();      my %subsystems = ();
135      if (! defined $subsysFile || $subsysFile eq '') {      if (! defined $subsysFile || $subsysFile eq '') {
136          # Here we want all the subsystems.          # Here we want all the NMPDR subsystems. First we get the whole list.
137          %subsystems = map { $_ => 1 } $fig->all_subsystems();          my @subs = $fig->all_subsystems();
138            # Loop through, checking for the NMPDR file.
139            for my $sub (@subs) {
140                if (-e "$FIG_Config::data/Subsystems/$sub/NMPDR") {
141                    $subsystems{$sub} = 1;
142                }
143            }
144      } else {      } else {
145          my $type = ref $subsysFile;          my $type = ref $subsysFile;
146          if ($type eq 'ARRAY') {          if ($type eq 'ARRAY') {
# Line 170  Line 178 
178      return $retVal;      return $retVal;
179  }  }
180    
181    =head3 LoadOnly
182    
183    C<< my $flag = $spl->LoadOnly; >>
184    
185    Return TRUE if we are in load-only mode, else FALSE.
186    
187    =cut
188    
189    sub LoadOnly {
190        my ($self) = @_;
191        return $self->{options}->{loadOnly};
192    }
193    
194    =head3 PrimaryOnly
195    
196    C<< my $flag = $spl->PrimaryOnly; >>
197    
198    Return TRUE if only the main entity is to be loaded, else FALSE.
199    
200    =cut
201    
202    sub PrimaryOnly {
203        my ($self) = @_;
204        return $self->{options}->{primaryOnly};
205    }
206    
207  =head3 LoadGenomeData  =head3 LoadGenomeData
208    
209  C<< my $stats = $spl->LoadGenomeData(); >>  C<< my $stats = $spl->LoadGenomeData(); >>
# Line 197  Line 231 
231    
232  =back  =back
233    
 B<TO DO>  
   
 Real quality vectors instead of C<unknown> for everything.  
   
 GenomeGroup relation. (The original script took group information from the C<NMPDR> file  
 in each genome's main directory, but no such file exists anywhere in my version of the  
 data store.)  
   
234  =cut  =cut
235  #: Return Type $%;  #: Return Type $%;
236  sub LoadGenomeData {  sub LoadGenomeData {
# Line 215  Line 241 
241      # Get the genome count.      # Get the genome count.
242      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
243      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
     Trace("Beginning genome data load.") if T(2);  
244      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
245      my $loadGenome = $self->_TableLoader('Genome', $genomeCount);      my $loadGenome = $self->_TableLoader('Genome');
246      my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300);      my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly);
247      my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300);      my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly);
248      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly);
249      my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000);      my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly);
250        if ($self->{options}->{loadOnly}) {
251            Trace("Loading from existing files.") if T(2);
252        } else {
253            Trace("Generating genome data.") if T(2);
254      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
255      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
256          Trace("Loading data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
257          $loadGenome->Add("genomeIn");          $loadGenome->Add("genomeIn");
258          # The access code comes in via the genome hash.          # The access code comes in via the genome hash.
259          my $accessCode = $genomeHash->{$genomeID};          my $accessCode = $genomeHash->{$genomeID};
260          # Get the genus, species, and strain from the scientific name. Note that we append              # Get the genus, species, and strain from the scientific name.
         # the genome ID to the strain. In some cases this is the totality of the strain name.  
261          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);
262          my $extra = join " ", @extraData, "[$genomeID]";              my $extra = join " ", @extraData;
263          # Get the full taxonomy.          # Get the full taxonomy.
264          my $taxonomy = $fig->taxonomy_of($genomeID);          my $taxonomy = $fig->taxonomy_of($genomeID);
265          # Output the genome record.          # Output the genome record.
# Line 267  Line 295 
295              }              }
296          }          }
297      }      }
298        }
299      # Finish the loads.      # Finish the loads.
300      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
301      # Return the result.      # Return the result.
# Line 310  Line 339 
339      my $genomeCount = (keys %{$genomeFilter});      my $genomeCount = (keys %{$genomeFilter});
340      my $featureCount = $genomeCount * 4000;      my $featureCount = $genomeCount * 4000;
341      # Start the loads.      # Start the loads.
342      my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount);      my $loadCoupling = $self->_TableLoader('Coupling');
343      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000);      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);
344      my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000);      my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly);
345      my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000);      my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly);
346      my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000);      my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly);
347      Trace("Beginning coupling data load.") if T(2);      if ($self->{options}->{loadOnly}) {
348            Trace("Loading from existing files.") if T(2);
349        } else {
350            Trace("Generating coupling data.") if T(2);
351      # Loop through the genomes found.      # Loop through the genomes found.
352      for my $genome (sort keys %{$genomeFilter}) {      for my $genome (sort keys %{$genomeFilter}) {
353          Trace("Generating coupling data for $genome.") if T(3);          Trace("Generating coupling data for $genome.") if T(3);
# Line 367  Line 399 
399                              # We store this evidence in the hash if the usage                              # We store this evidence in the hash if the usage
400                              # is nonzero or no prior evidence has been found. This                              # is nonzero or no prior evidence has been found. This
401                              # ensures that if there is duplicate evidence, we                              # ensures that if there is duplicate evidence, we
402                              # at least keep the meaningful ones. Only evidence is                                  # at least keep the meaningful ones. Only evidence in
403                              # the hash makes it to the output.                              # the hash makes it to the output.
404                              if ($usage || ! exists $evidenceMap{$evidenceKey}) {                              if ($usage || ! exists $evidenceMap{$evidenceKey}) {
405                                  $evidenceMap{$evidenceKey} = $evidenceData;                                  $evidenceMap{$evidenceKey} = $evidenceData;
# Line 382  Line 414 
414                          $loadIsEvidencedBy->Put($coupleID, $evidenceID);                          $loadIsEvidencedBy->Put($coupleID, $evidenceID);
415                          # Connect it to the features.                          # Connect it to the features.
416                          $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);                          $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);
417                          $loadUsesAsEvidence->Put($evidenceID, $peg4, 1);                              $loadUsesAsEvidence->Put($evidenceID, $peg4, 2);
418                            }
419                      }                      }
420                  }                  }
421              }              }
# Line 409  Line 442 
442      FeatureTranslation      FeatureTranslation
443      FeatureUpstream      FeatureUpstream
444      IsLocatedIn      IsLocatedIn
445        HasFeature
446    
447  =over 4  =over 4
448    
# Line 425  Line 459 
459      my ($self) = @_;      my ($self) = @_;
460      # Get the FIG object.      # Get the FIG object.
461      my $fig = $self->{fig};      my $fig = $self->{fig};
     # Find out if this is a limited run.  
     my $limited = $self->{options}->{limitedFeatures};  
462      # Get the table of genome IDs.      # Get the table of genome IDs.
463      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
464      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
465      my $loadFeature = $self->_TableLoader('Feature', $featureCount);      my $loadFeature = $self->_TableLoader('Feature');
466      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly);
467      my ($loadFeatureAlias, $loadFeatureLink, $loadFeatureTranslation, $loadFeatureUpstream);      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
468      if (! $limited) {      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
469          $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
470          $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
471          $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);      my $loadHasFeature = $self->_TableLoader('HasFeature');
         $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);  
     }  
472      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
473      # locations.      # locations.
474      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
475      Trace("Beginning feature data load.") if T(2);      if ($self->{options}->{loadOnly}) {
476            Trace("Loading from existing files.") if T(2);
477        } else {
478            Trace("Generating feature data.") if T(2);
479      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
480      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
481          Trace("Loading features for genome $genomeID.") if T(3);          Trace("Loading features for genome $genomeID.") if T(3);
# Line 455  Line 486 
486          for my $featureData (@{$features}) {          for my $featureData (@{$features}) {
487              $loadFeature->Add("featureIn");              $loadFeature->Add("featureIn");
488              # Split the tuple.              # Split the tuple.
489              my ($featureID, $locations, $aliases, $type) = @{$featureData};                  my ($featureID, $locations, undef, $type) = @{$featureData};
490              # Create the feature record.              # Create the feature record.
491              $loadFeature->Put($featureID, 1, $type);              $loadFeature->Put($featureID, 1, $type);
492              # The next stuff is for a full load only.                  # Link it to the parent genome.
493              if (! $limited) {                  $loadHasFeature->Put($genomeID, $featureID, $type);
494                  # Create the aliases.                  # Create the aliases.
495                  for my $alias (split /\s*,\s*/, $aliases) {                  for my $alias ($fig->feature_aliases($featureID)) {
496                      $loadFeatureAlias->Put($featureID, $alias);                      $loadFeatureAlias->Put($featureID, $alias);
497                  }                  }
498                  # Get the links.                  # Get the links.
# Line 482  Line 513 
513                          $loadFeatureUpstream->Put($featureID, $upstream);                          $loadFeatureUpstream->Put($featureID, $upstream);
514                      }                      }
515                  }                  }
             }  
516              # This part is the roughest. We need to relate the features to contig              # This part is the roughest. We need to relate the features to contig
517              # locations, and the locations must be split so that none of them exceed              # locations, and the locations must be split so that none of them exceed
518              # the maximum segment size. This simplifies the genes_in_region processing              # the maximum segment size. This simplifies the genes_in_region processing
519              # for Sprout.              # for Sprout.
520              my @locationList = map { "$genomeID:$_" } split /\s*,\s*/, $locations;                  my @locationList = split /\s*,\s*/, $locations;
521              # Create the location position indicator.              # Create the location position indicator.
522              my $i = 1;              my $i = 1;
523              # Loop through the locations.              # Loop through the locations.
524              for my $location (@locationList) {              for my $location (@locationList) {
525                  # Parse the location.                  # Parse the location.
526                  my $locObject = BasicLocation->new($location);                      my $locObject = BasicLocation->new("$genomeID:$location");
527                  # Split it into a list of chunks.                  # Split it into a list of chunks.
528                  my @locOList = ();                  my @locOList = ();
529                  while (my $peeling = $locObject->Peel($chunkSize)) {                  while (my $peeling = $locObject->Peel($chunkSize)) {
# Line 511  Line 541 
541              }              }
542          }          }
543      }      }
544        }
545      # Finish the loads.      # Finish the loads.
546      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
547      return $retVal;      return $retVal;
# Line 547  Line 578 
578      my $fig = $self->{fig};      my $fig = $self->{fig};
579      # Get the table of genome IDs.      # Get the table of genome IDs.
580      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
581      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
582      my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf',      my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf');
583                                                             $featureCount * $genomeCount);      if ($self->{options}->{loadOnly}) {
584      Trace("Beginning BBH load.") if T(2);          Trace("Loading from existing files.") if T(2);
585        } else {
586            Trace("Generating BBH data.") if T(2);
587      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
588      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
589          $loadIsBidirectionalBestHitOf->Add("genomeIn");          $loadIsBidirectionalBestHitOf->Add("genomeIn");
# Line 578  Line 609 
609              }              }
610          }          }
611      }      }
612        }
613      # Finish the loads.      # Finish the loads.
614      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
615      return $retVal;      return $retVal;
# Line 599  Line 631 
631    
632      Subsystem      Subsystem
633      Role      Role
634        RoleEC
635      SSCell      SSCell
636      ContainsFeature      ContainsFeature
637      IsGenomeOf      IsGenomeOf
# Line 606  Line 639 
639      OccursInSubsystem      OccursInSubsystem
640      ParticipatesIn      ParticipatesIn
641      HasSSCell      HasSSCell
642        ConsistsOfRoles
643        RoleSubset
644        HasRoleSubset
645        ConsistsOfGenomes
646        GenomeSubset
647        HasGenomeSubset
648        Catalyzes
649        Diagram
650        RoleOccursIn
651    
652  =over 4  =over 4
653    
# Line 615  Line 657 
657    
658  =back  =back
659    
 B<TO DO>  
   
 Generate RoleName table?  
   
660  =cut  =cut
661  #: Return Type $%;  #: Return Type $%;
662  sub LoadSubsystemData {  sub LoadSubsystemData {
# Line 632  Line 670 
670      # Get the subsystem hash. This lists the subsystems we'll process.      # Get the subsystem hash. This lists the subsystems we'll process.
671      my $subsysHash = $self->{subsystems};      my $subsysHash = $self->{subsystems};
672      my @subsysIDs = sort keys %{$subsysHash};      my @subsysIDs = sort keys %{$subsysHash};
673      my $subsysCount = @subsysIDs;      # Get the map list.
674      my $genomeCount = (keys %{$genomeHash});      my @maps = $fig->all_maps;
     my $featureCount = $genomeCount * 4000;  
675      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
676      my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);      my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly);
677      my $loadRole = $self->_TableLoader('Role', $featureCount * 6);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly);
678      my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);      my $loadSubsystem = $self->_TableLoader('Subsystem');
679      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);      my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly);
680      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);      my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly);
681      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount);      my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly);
682      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);      my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly);
683      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly);
684      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly);
685      Trace("Beginning subsystem data load.") if T(2);      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly);
686        my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly);
687        my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly);
688        my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly);
689        my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly);
690        my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly);
691        my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly);
692        my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly);
693        my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly);
694        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly);
695        if ($self->{options}->{loadOnly}) {
696            Trace("Loading from existing files.") if T(2);
697        } else {
698            Trace("Generating subsystem data.") if T(2);
699            # This hash will contain the role for each EC. When we're done, this
700            # information will be used to generate the Catalyzes table.
701            my %ecToRoles = ();
702      # Loop through the subsystems. Our first task will be to create the      # Loop through the subsystems. Our first task will be to create the
703      # roles. We do this by looping through the subsystems and creating a      # roles. We do this by looping through the subsystems and creating a
704      # role hash. The hash tracks each role ID so that we don't create      # role hash. The hash tracks each role ID so that we don't create
705      # duplicates. As we move along, we'll connect the roles and subsystems.          # duplicates. As we move along, we'll connect the roles and subsystems
706            # and memorize the reactions.
707            my ($genomeID, $roleID);
708      my %roleData = ();      my %roleData = ();
709      for my $subsysID (@subsysIDs) {      for my $subsysID (@subsysIDs) {
710                # Get the subsystem object.
711                my $sub = $fig->get_subsystem($subsysID);
712                # Only proceed if the subsystem has a spreadsheet.
713                if (! $sub->{empty_ss}) {
714          Trace("Creating subsystem $subsysID.") if T(3);          Trace("Creating subsystem $subsysID.") if T(3);
715          $loadSubsystem->Add("subsystemIn");          $loadSubsystem->Add("subsystemIn");
716          # Create the subsystem record.          # Create the subsystem record.
717          $loadSubsystem->Put($subsysID);                  my $curator = $sub->get_curator();
718          # Get the subsystem's roles.                  my $notes = $sub->get_notes();
719          my @roles = $fig->subsystem_to_roles($subsysID);                  $loadSubsystem->Put($subsysID, $curator, $notes);
720          # Connect the roles to the subsystem. If a role is new, we create                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
721          # a role record for it.                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
722          for my $roleID (@roles) {                      # Connect to this role.
723              $loadOccursInSubsystem->Add("roleIn");              $loadOccursInSubsystem->Add("roleIn");
724              $loadOccursInSubsystem->Put($roleID, $subsysID);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $col);
725                        # If it's a new role, add it to the role table.
726              if (! exists $roleData{$roleID}) {              if (! exists $roleData{$roleID}) {
727                  $loadRole->Put($roleID);                          # Get the role's abbreviation.
728                            my $abbr = $sub->get_role_abbr($col);
729                            # Add the role.
730                            $loadRole->Put($roleID, $abbr);
731                  $roleData{$roleID} = 1;                  $roleData{$roleID} = 1;
732                            # Check for an EC number.
733                            if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) {
734                                my $ec = $1;
735                                $loadRoleEC->Put($roleID, $ec);
736                                $ecToRoles{$ec} = $roleID;
737              }              }
738          }          }
739          # Now all roles for this subsystem have been filled in. We create the                  }
740          # spreadsheet by matches roles to genomes. To do this, we need to                  # Now we create the spreadsheet for the subsystem by matching roles to
741          # get the genomes on the sheet.                  # genomes. Each genome is a row and each role is a column. We may need
742                    # to actually create the roles as we find them.
743          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
744          my @genomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)};                  for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) {
745          for my $genomeID (@genomes) {                      # Only proceed if this is one of our genomes.
             # Only process this genome if it's one of ours.  
746              if (exists $genomeHash->{$genomeID}) {              if (exists $genomeHash->{$genomeID}) {
747                  # Connect the genome to the subsystem.                          # Count the PEGs and cells found for verification purposes.
748                  $loadParticipatesIn->Put($genomeID, $subsysID);                          my $pegCount = 0;
749                            my $cellCount = 0;
750                            # Create a list for the PEGs we find. This list will be used
751                            # to generate cluster numbers.
752                            my @pegsFound = ();
753                            # Create a hash that maps spreadsheet IDs to PEGs. We will
754                            # use this to generate the ContainsFeature data after we have
755                            # the cluster numbers.
756                            my %cellPegs = ();
757                            # Get the genome's variant code for this subsystem.
758                            my $variantCode = $sub->get_variant_code($row);
759                  # Loop through the subsystem's roles. We use an index because it is                  # Loop through the subsystem's roles. We use an index because it is
760                  # part of the spreadsheet cell ID.                  # part of the spreadsheet cell ID.
761                  for (my $i = 0; $i <= $#roles; $i++) {                          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
                     my $role = $roles[$i];  
762                      # Get the features in the spreadsheet cell for this genome and role.                      # Get the features in the spreadsheet cell for this genome and role.
763                      my @pegs = $fig->pegs_in_subsystem_cell($subsysID, $genomeID, $i);                              my @pegs = $sub->get_pegs_from_cell($row, $col);
764                      # Only proceed if features exist.                      # Only proceed if features exist.
765                      if (@pegs > 0) {                      if (@pegs > 0) {
766                          # Create the spreadsheet cell.                          # Create the spreadsheet cell.
767                          my $cellID = "$subsysID:$genomeID:$i";                                  $cellCount++;
768                                    my $cellID = "$subsysID:$genomeID:$col";
769                          $loadSSCell->Put($cellID);                          $loadSSCell->Put($cellID);
770                          $loadIsGenomeOf->Put($genomeID, $cellID);                          $loadIsGenomeOf->Put($genomeID, $cellID);
771                          $loadIsRoleOf->Put($role, $cellID);                                  $loadIsRoleOf->Put($roleID, $cellID);
772                          $loadHasSSCell->Put($subsysID, $cellID);                          $loadHasSSCell->Put($subsysID, $cellID);
773                          # Attach the features to it.                                  # Remember its features.
774                          for my $pegID (@pegs) {                                  push @pegsFound, @pegs;
775                              $loadContainsFeature->Put($cellID, $pegID);                                  $cellPegs{$cellID} = \@pegs;
776                                    $pegCount += @pegs;
777                                }
778                            }
779                            # If we found some cells for this genome, we need to compute clusters and
780                            # denote it participates in the subsystem.
781                            if ($pegCount > 0) {
782                                Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
783                                $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
784                                # Partition the PEGs found into clusters.
785                                my @clusters = $fig->compute_clusters(\@pegsFound, $sub);
786                                # Create a hash mapping PEG IDs to cluster numbers.
787                                # We default to -1 for all of them.
788                                my %clusterOf = map { $_ => -1 } @pegsFound;
789                                for (my $i = 0; $i <= $#clusters; $i++) {
790                                    my $subList = $clusters[$i];
791                                    for my $peg (@{$subList}) {
792                                        $clusterOf{$peg} = $i;
793                                    }
794                                }
795                                # Create the ContainsFeature data.
796                                for my $cellID (keys %cellPegs) {
797                                    my $cellList = $cellPegs{$cellID};
798                                    for my $cellPeg (@$cellList) {
799                                        $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg});
800                          }                          }
801                      }                      }
802                  }                  }
803              }              }
804          }          }
805                    # Now we need to generate the subsets. The subset names must be concatenated to
806                    # the subsystem name to make them unique keys. There are two types of subsets:
807                    # genome subsets and role subsets. We do the role subsets first.
808                    my @subsetNames = $sub->get_subset_names();
809                    for my $subsetID (@subsetNames) {
810                        # Create the subset record.
811                        my $actualID = "$subsysID:$subsetID";
812                        $loadRoleSubset->Put($actualID);
813                        # Connect the subset to the subsystem.
814                        $loadHasRoleSubset->Put($subsysID, $actualID);
815                        # Connect the subset to its roles.
816                        my @roles = $sub->get_subsetC_roles($subsetID);
817                        for my $roleID (@roles) {
818                            $loadConsistsOfRoles->Put($actualID, $roleID);
819      }      }
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
820  }  }
821                    # Next the genome subsets.
822  =head3 LoadDiagramData                  @subsetNames = $sub->get_subset_namesR();
823                    for my $subsetID (@subsetNames) {
824  C<< my $stats = $spl->LoadDiagramData(); >>                      # Create the subset record.
825                        my $actualID = "$subsysID:$subsetID";
826  Load the diagram data from FIG into Sprout.                      $loadGenomeSubset->Put($actualID);
827                        # Connect the subset to the subsystem.
828  Diagrams are used to organize functional roles. The diagram shows the                      $loadHasGenomeSubset->Put($subsysID, $actualID);
829  connections between chemicals that interact with a subsystem.                      # Connect the subset to its genomes.
830                        my @genomes = $sub->get_subsetR($subsetID);
831  The following relations are loaded by this method.                      for my $genomeID (@genomes) {
832                            $loadConsistsOfGenomes->Put($actualID, $genomeID);
833      Diagram                      }
834      RoleOccursIn                  }
835                }
836  =over 4              # Now we loop through the diagrams. We need to create the diagram records
837                # and link each diagram to its roles. Note that only roles which occur
838  =item RETURNS              # in subsystems (and therefore appear in the %ecToRoles hash) are
839                # included.
840  Returns a statistics object for the loads.              for my $map (@maps) {
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadDiagramData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the map list.  
     my @maps = $fig->all_maps;  
     my $mapCount = @maps;  
     my $genomeCount = (keys %{$self->{genomes}});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);  
     my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);  
     Trace("Beginning diagram data load.") if T(2);  
     # Loop through the diagrams.  
     for my $map ($fig->all_maps) {  
841          Trace("Loading diagram $map.") if T(3);          Trace("Loading diagram $map.") if T(3);
842          # Get the diagram's descriptive name.          # Get the diagram's descriptive name.
843          my $name = $fig->map_name($map);          my $name = $fig->map_name($map);
# Line 754  Line 846 
846          # A hash is used to prevent duplicates.          # A hash is used to prevent duplicates.
847          my %roleHash = ();          my %roleHash = ();
848          for my $role ($fig->map_to_ecs($map)) {          for my $role ($fig->map_to_ecs($map)) {
849              if (! $roleHash{$role}) {                      if (exists $ecToRoles{$role} && ! $roleHash{$role}) {
850                  $loadRoleOccursIn->Put($role, $map);                          $loadRoleOccursIn->Put($ecToRoles{$role}, $map);
851                  $roleHash{$role} = 1;                  $roleHash{$role} = 1;
852              }              }
853          }          }
854      }      }
855                # Before we leave, we must create the Catalyzes table. We start with the reactions,
856                # then use the "ecToRoles" table to convert EC numbers to role IDs.
857                my @reactions = $fig->all_reactions();
858                for my $reactionID (@reactions) {
859                    # Get this reaction's list of roles. The results will be EC numbers.
860                    my @roles = $fig->catalyzed_by($reactionID);
861                    # Loop through the roles, creating catalyzation records.
862                    for my $thisRole (@roles) {
863                        if (exists $ecToRoles{$thisRole}) {
864                            $loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID);
865                        }
866                    }
867                }
868            }
869        }
870      # Finish the load.      # Finish the load.
871      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
872      return $retVal;      return $retVal;
# Line 801  Line 908 
908      my $fig = $self->{fig};      my $fig = $self->{fig};
909      # Get the genome hash.      # Get the genome hash.
910      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
911      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
912      my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500);      my $loadProperty = $self->_TableLoader('Property');
913      my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500);      my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly);
914      Trace("Beginning property data load.") if T(2);      if ($self->{options}->{loadOnly}) {
915            Trace("Loading from existing files.") if T(2);
916        } else {
917            Trace("Generating property data.") if T(2);
918      # Create a hash for storing property IDs.      # Create a hash for storing property IDs.
919      my %propertyKeys = ();      my %propertyKeys = ();
920      my $nextID = 1;      my $nextID = 1;
921      # Loop through the genomes.      # Loop through the genomes.
922      for my $genomeID (keys %{$genomeHash}) {      for my $genomeID (keys %{$genomeHash}) {
923          $loadProperty->Add("genomeIn");          $loadProperty->Add("genomeIn");
924                Trace("Generating properties for $genomeID.") if T(3);
925          # Get the genome's features. The feature ID is the first field in the          # Get the genome's features. The feature ID is the first field in the
926          # tuples returned by "all_features_detailed". We use "all_features_detailed"          # tuples returned by "all_features_detailed". We use "all_features_detailed"
927          # rather than "all_features" because we want all features regardless of type.          # rather than "all_features" because we want all features regardless of type.
928          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};
929                my $featureCount = 0;
930                my $propertyCount = 0;
931          # Loop through the features, creating HasProperty records.          # Loop through the features, creating HasProperty records.
932          for my $fid (@features) {          for my $fid (@features) {
             $loadProperty->Add("featureIn");  
933              # Get all attributes for this feature. We do this one feature at a time              # Get all attributes for this feature. We do this one feature at a time
934              # to ensure we do not get any genome attributes.              # to ensure we do not get any genome attributes.
935              my @attributeList = $fig->get_attributes($fid, '', '', '');              my @attributeList = $fig->get_attributes($fid, '', '', '');
936                    if (scalar @attributeList) {
937                        $featureCount++;
938                    }
939              # Loop through the attributes.              # Loop through the attributes.
940              for my $tuple (@attributeList) {              for my $tuple (@attributeList) {
941                        $propertyCount++;
942                  # Get this attribute value's data. Note that we throw away the FID,                  # Get this attribute value's data. Note that we throw away the FID,
943                  # since it will always be the same as the value of "$fid".                  # since it will always be the same as the value of "$fid".
944                  my (undef, $key, $value, $url) = @{$tuple};                  my (undef, $key, $value, $url) = @{$tuple};
# Line 845  Line 960 
960                  $loadHasProperty->Put($fid, $propertyID, $url);                  $loadHasProperty->Put($fid, $propertyID, $url);
961              }              }
962          }          }
963                # Update the statistics.
964                Trace("$propertyCount attributes processed for $featureCount features.") if T(3);
965                $loadHasProperty->Add("featuresIn", $featureCount);
966                $loadHasProperty->Add("propertiesIn", $propertyCount);
967            }
968      }      }
969      # Finish the load.      # Finish the load.
970      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 885  Line 1005 
1005      my $fig = $self->{fig};      my $fig = $self->{fig};
1006      # Get the genome hash.      # Get the genome hash.
1007      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1008      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1009      my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000);      my $loadAnnotation = $self->_TableLoader('Annotation');
1010      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly);
1011      my $loadSproutUser = $self->_TableLoader('SproutUser', 100);      my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly);
1012      my $loadUserAccess = $self->_TableLoader('UserAccess', 1000);      my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly);
1013      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly);
1014      Trace("Beginning annotation data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1015            Trace("Loading from existing files.") if T(2);
1016        } else {
1017            Trace("Generating annotation data.") if T(2);
1018      # Create a hash of user names. We'll use this to prevent us from generating duplicate      # Create a hash of user names. We'll use this to prevent us from generating duplicate
1019      # user records.      # user records.
1020      my %users = ( FIG => 1, master => 1 );      my %users = ( FIG => 1, master => 1 );
# Line 913  Line 1035 
1035              # Create a hash of timestamps. We use this to prevent duplicate time stamps              # Create a hash of timestamps. We use this to prevent duplicate time stamps
1036              # from showing up for a single PEG's annotations.              # from showing up for a single PEG's annotations.
1037              my %seenTimestamps = ();              my %seenTimestamps = ();
1038              # Check for a functional assignment.                  # Loop through the annotations.
             my $func = $fig->function_of($peg);  
             if ($func) {  
                 # If this is NOT a hypothetical assignment, we create an  
                 # assignment annotation for it.  
                 if (! FIG::hypo($peg)) {  
                     # Note that we double the slashes so that what goes into the database is  
                     # a new-line escape sequence rather than an actual new-line.  
                     $loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func");  
                     $loadIsTargetOfAnnotation->Put($peg, "$peg:$time");  
                     $loadMadeAnnotation->Put("FIG", "$peg:$time");  
                     # Denote we've seen this timestamp.  
                     $seenTimestamps{$time} = 1;  
                 }  
                 # Now loop through the real annotations.  
1039                  for my $tuple ($fig->feature_annotations($peg, "raw")) {                  for my $tuple ($fig->feature_annotations($peg, "raw")) {
1040                      my ($fid, $timestamp, $user, $text) = @{$tuple};                      my ($fid, $timestamp, $user, $text) = @{$tuple};
1041                      # Here we fix up the annotation text. "\r" is removed,                      # Here we fix up the annotation text. "\r" is removed,
# Line 941  Line 1049 
1049                      $text =~ s/Set master function/Set FIG function/s;                      $text =~ s/Set master function/Set FIG function/s;
1050                      # Insure the time stamp is valid.                      # Insure the time stamp is valid.
1051                      if ($timestamp =~ /^\d+$/) {                      if ($timestamp =~ /^\d+$/) {
1052                          # Here it's a number. We need to insure it's unique.                          # Here it's a number. We need to insure the one we use to form
1053                          while ($seenTimestamps{$timestamp}) {                          # the key is unique.
1054                              $timestamp++;                          my $keyStamp = $timestamp;
1055                            while ($seenTimestamps{$keyStamp}) {
1056                                $keyStamp++;
1057                          }                          }
1058                          $seenTimestamps{$timestamp} = 1;                          $seenTimestamps{$keyStamp} = 1;
1059                          my $annotationID = "$peg:$timestamp";                          my $annotationID = "$peg:$keyStamp";
1060                          # Insure the user exists.                          # Insure the user exists.
1061                          if (! $users{$user}) {                          if (! $users{$user}) {
1062                              $loadSproutUser->Put($user, "SEED user");                              $loadSproutUser->Put($user, "SEED user");
# Line 954  Line 1064 
1064                              $users{$user} = 1;                              $users{$user} = 1;
1065                          }                          }
1066                          # Generate the annotation.                          # Generate the annotation.
1067                          $loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text");                          $loadAnnotation->Put($annotationID, $timestamp, $text);
1068                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);
1069                          $loadMadeAnnotation->Put($user, $annotationID);                          $loadMadeAnnotation->Put($user, $annotationID);
1070                      } else {                      } else {
# Line 1005  Line 1115 
1115      my $fig = $self->{fig};      my $fig = $self->{fig};
1116      # Get the genome hash.      # Get the genome hash.
1117      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1118      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1119      my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4);      my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly);
1120      my $loadSource = $self->_TableLoader('Source', $genomeCount * 4);      my $loadSource = $self->_TableLoader('Source');
1121      my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8);      my $loadSourceURL = $self->_TableLoader('SourceURL');
1122      Trace("Beginning source data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1123            Trace("Loading from existing files.") if T(2);
1124        } else {
1125            Trace("Generating annotation data.") if T(2);
1126      # Create hashes to collect the Source information.      # Create hashes to collect the Source information.
1127      my %sourceURL = ();      my %sourceURL = ();
1128      my %sourceDesc = ();      my %sourceDesc = ();
# Line 1024  Line 1136 
1136              chomp $line;              chomp $line;
1137              my($sourceID, $desc, $url) = split(/\t/,$line);              my($sourceID, $desc, $url) = split(/\t/,$line);
1138              $loadComesFrom->Put($genomeID, $sourceID);              $loadComesFrom->Put($genomeID, $sourceID);
1139              if ($url && ! exists $sourceURL{$genomeID}) {                  if ($url && ! exists $sourceURL{$sourceID}) {
1140                  $loadSourceURL->Put($sourceID, $url);                  $loadSourceURL->Put($sourceID, $url);
1141                  $sourceURL{$sourceID} = 1;                  $sourceURL{$sourceID} = 1;
1142              }              }
1143              if ($desc && ! exists $sourceDesc{$sourceID}) {                  if ($desc) {
1144                  $loadSource->Put($sourceID, $desc);                      $sourceDesc{$sourceID} = $desc;
1145                  $sourceDesc{$sourceID} = 1;                  } elsif (! exists $sourceDesc{$sourceID}) {
1146                        $sourceDesc{$sourceID} = $sourceID;
1147              }              }
1148          }          }
1149          close TMP;          close TMP;
1150      }      }
1151            # Write the source descriptions.
1152            for my $sourceID (keys %sourceDesc) {
1153                $loadSource->Put($sourceID, $sourceDesc{$sourceID});
1154            }
1155        }
1156      # Finish the load.      # Finish the load.
1157      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1158      return $retVal;      return $retVal;
# Line 1074  Line 1192 
1192      my $fig = $self->{fig};      my $fig = $self->{fig};
1193      # Get the genome hash.      # Get the genome hash.
1194      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1195      # Convert the genome hash. We'll get the genus and species for each genome and make      # Convert the genome hash. We'll get the genus and species for each genome and make
1196      # it the key.      # it the key.
1197      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
1198      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1199      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000);      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
1200      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
1201      Trace("Beginning external data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1202            Trace("Loading from existing files.") if T(2);
1203        } else {
1204            Trace("Generating external data.") if T(2);
1205      # We loop through the files one at a time. First, the organism file.      # We loop through the files one at a time. First, the organism file.
1206      Open(\*ORGS, "<$FIG_Config::global/ext_org.table");      Open(\*ORGS, "<$FIG_Config::global/ext_org.table");
1207      my $orgLine;      my $orgLine;
# Line 1111  Line 1231 
1231              $loadExternalAliasFunc->Put(@funcFields[0,1]);              $loadExternalAliasFunc->Put(@funcFields[0,1]);
1232          }          }
1233      }      }
1234        }
1235        # Finish the load.
1236        my $retVal = $self->_FinishAll();
1237        return $retVal;
1238    }
1239    
1240    
1241    =head3 LoadReactionData
1242    
1243    C<< my $stats = $spl->LoadReactionData(); >>
1244    
1245    Load the reaction data from FIG into Sprout.
1246    
1247    Reaction data connects reactions to the compounds that participate in them.
1248    
1249    The following relations are loaded by this method.
1250    
1251        Reaction
1252        ReactionURL
1253        Compound
1254        CompoundName
1255        CompoundCAS
1256        IsAComponentOf
1257    
1258    This method proceeds reaction by reaction rather than genome by genome.
1259    
1260    =over 4
1261    
1262    =item RETURNS
1263    
1264    Returns a statistics object for the loads.
1265    
1266    =back
1267    
1268    =cut
1269    #: Return Type $%;
1270    sub LoadReactionData {
1271        # Get this object instance.
1272        my ($self) = @_;
1273        # Get the FIG object.
1274        my $fig = $self->{fig};
1275        # Create load objects for each of the tables we're loading.
1276        my $loadReaction = $self->_TableLoader('Reaction');
1277        my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly);
1278        my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly);
1279        my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly);
1280        my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly);
1281        my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly);
1282        if ($self->{options}->{loadOnly}) {
1283            Trace("Loading from existing files.") if T(2);
1284        } else {
1285            Trace("Generating annotation data.") if T(2);
1286            # First we create the compounds.
1287            my @compounds = $fig->all_compounds();
1288            for my $cid (@compounds) {
1289                # Check for names.
1290                my @names = $fig->names_of_compound($cid);
1291                # Each name will be given a priority number, starting with 1.
1292                my $prio = 1;
1293                for my $name (@names) {
1294                    $loadCompoundName->Put($cid, $name, $prio++);
1295                }
1296                # Create the main compound record. Note that the first name
1297                # becomes the label.
1298                my $label = (@names > 0 ? $names[0] : $cid);
1299                $loadCompound->Put($cid, $label);
1300                # Check for a CAS ID.
1301                my $cas = $fig->cas($cid);
1302                if ($cas) {
1303                    $loadCompoundCAS->Put($cid, $cas);
1304                }
1305            }
1306            # All the compounds are set up, so we need to loop through the reactions next. First,
1307            # we initialize the discriminator index. This is a single integer used to ensure
1308            # duplicate elements in a reaction are not accidentally collapsed.
1309            my $discrim = 0;
1310            my @reactions = $fig->all_reactions();
1311            for my $reactionID (@reactions) {
1312                # Create the reaction record.
1313                $loadReaction->Put($reactionID, $fig->reversible($reactionID));
1314                # Compute the reaction's URL.
1315                my $url = HTML::reaction_link($reactionID);
1316                # Put it in the ReactionURL table.
1317                $loadReactionURL->Put($reactionID, $url);
1318                # Now we need all of the reaction's compounds. We get these in two phases,
1319                # substrates first and then products.
1320                for my $product (0, 1) {
1321                    # Get the compounds of the current type for the current reaction. FIG will
1322                    # give us 3-tuples: [ID, stoichiometry, main-flag]. At this time we do not
1323                    # have location data in SEED, so it defaults to the empty string.
1324                    my @compounds = $fig->reaction2comp($reactionID, $product);
1325                    for my $compData (@compounds) {
1326                        # Extract the compound data from the current tuple.
1327                        my ($cid, $stoich, $main) = @{$compData};
1328                        # Link the compound to the reaction.
1329                        $loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main,
1330                                                 $product, $stoich);
1331                    }
1332                }
1333            }
1334        }
1335      # Finish the load.      # Finish the load.
1336      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1337      return $retVal;      return $retVal;
# Line 1146  Line 1367 
1367      my $fig = $self->{fig};      my $fig = $self->{fig};
1368      # Get the genome hash.      # Get the genome hash.
1369      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1370      # Create a load object for the table we're loading.      # Create a load object for the table we're loading.
1371      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4);      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups');
1372      Trace("Beginning group data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1373            Trace("Loading from existing files.") if T(2);
1374        } else {
1375            Trace("Generating group data.") if T(2);
1376      # Loop through the genomes.      # Loop through the genomes.
1377      my $line;      my $line;
1378      for my $genomeID (keys %{$genomeHash}) {      for my $genomeID (keys %{$genomeHash}) {
# Line 1165  Line 1388 
1388          }          }
1389          close TMP;          close TMP;
1390      }      }
1391        }
1392      # Finish the load.      # Finish the load.
1393      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1394      return $retVal;      return $retVal;
# Line 1186  Line 1410 
1410    
1411  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
1412    
1413  =item rowCount (optional)  =item ignore
1414    
1415  Estimated maximum number of rows in the table.  TRUE if the table should be ignored entirely, else FALSE.
1416    
1417  =item RETURN  =item RETURN
1418    
# Line 1200  Line 1424 
1424    
1425  sub _TableLoader {  sub _TableLoader {
1426      # Get the parameters.      # Get the parameters.
1427      my ($self, $tableName, $rowCount) = @_;      my ($self, $tableName, $ignore) = @_;
1428      # Create the load object.      # Create the load object.
1429      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount);      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly,
1430                                   $ignore);
1431      # Cache it in the loader list.      # Cache it in the loader list.
1432      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
1433      # Return it to the caller.      # Return it to the caller.
# Line 1239  Line 1464 
1464      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
1465      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads restartable.
1466      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
1467            # Get the relation name.
1468            my $relName = $loader->RelName;
1469            # Check the ignore flag.
1470            if ($loader->Ignore) {
1471                Trace("Relation $relName not loaded.") if T(2);
1472            } else {
1473                # Here we really need to finish.
1474                Trace("Finishing $relName.") if T(2);
1475          my $stats = $loader->Finish();          my $stats = $loader->Finish();
1476                if ($self->{options}->{dbLoad}) {
1477                    # Here we want to use the load file just created to load the database.
1478                    Trace("Loading relation $relName.") if T(2);
1479                    my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]);
1480                    # Accumulate the statistics from the DB load.
1481                    $stats->Accumulate($newStats);
1482                }
1483          $retVal->Accumulate($stats);          $retVal->Accumulate($stats);
         my $relName = $loader->RelName;  
1484          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
1485      }      }
1486        }
1487      # Return the load statistics.      # Return the load statistics.
1488      return $retVal;      return $retVal;
1489  }  }

Legend:
Removed from v.1.9  
changed lines
  Added in v.1.34

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3