[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.4, Tue Aug 16 20:35:03 2005 UTC revision 1.19, Thu Oct 20 09:34:09 2005 UTC
# Line 10  Line 10 
10      use Sprout;      use Sprout;
11      use Stats;      use Stats;
12      use BasicLocation;      use BasicLocation;
13        use HTML;
14    
15  =head1 Sprout Load Methods  =head1 Sprout Load Methods
16    
# Line 40  Line 41 
41  a variable called C<$fig>. This makes it fairly straightforward to determine which  a variable called C<$fig>. This makes it fairly straightforward to determine which
42  FIG methods are required to load the Sprout database.  FIG methods are required to load the Sprout database.
43    
44    This object creates the load files; however, the tables are not created until it
45    is time to actually do the load from the files into the target database.
46    
47  =cut  =cut
48    
49  #: Constructor SproutLoad->new();  #: Constructor SproutLoad->new();
# Line 48  Line 52 
52    
53  =head3 new  =head3 new
54    
55  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile); >>  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>
56    
57  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
58  the name of the files containing the list of genomes and subsystems to use.  the name of the files containing the list of genomes and subsystems to use.
# Line 79  Line 83 
83  to a list of subsystem names. If nothing is specified, all known subsystems will be  to a list of subsystem names. If nothing is specified, all known subsystems will be
84  considered trusted. Only subsystem data related to the trusted subsystems is loaded.  considered trusted. Only subsystem data related to the trusted subsystems is loaded.
85    
86    =item options
87    
88    Reference to a hash of command-line options.
89    
90  =back  =back
91    
92  =cut  =cut
93    
94  sub new {  sub new {
95      # Get the parameters.      # Get the parameters.
96      my ($class, $sprout, $fig, $genomeFile, $subsysFile) = @_;      my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
97      # Load the list of genomes into a hash.      # Load the list of genomes into a hash.
98      my %genomes;      my %genomes;
99      if (! defined($genomeFile) || $genomeFile eq '') {      if (! defined($genomeFile) || $genomeFile eq '') {
# Line 155  Line 163 
163                    sprout => $sprout,                    sprout => $sprout,
164                    loadDirectory => $directory,                    loadDirectory => $directory,
165                    erdb => $sprout->{_erdb},                    erdb => $sprout->{_erdb},
166                    loaders => []                    loaders => [],
167                      options => $options
168                   };                   };
169      # Bless and return it.      # Bless and return it.
170      bless $retVal, $class;      bless $retVal, $class;
# Line 217  Line 226 
226      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
227      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
228          Trace("Loading data for genome $genomeID.") if T(3);          Trace("Loading data for genome $genomeID.") if T(3);
229            $loadGenome->Add("genomeIn");
230          # The access code comes in via the genome hash.          # The access code comes in via the genome hash.
231          my $accessCode = $genomeHash->{$genomeID};          my $accessCode = $genomeHash->{$genomeID};
232          # Get the genus, species, and strain from the scientific name. Note that we append          # Get the genus, species, and strain from the scientific name. Note that we append
# Line 232  Line 242 
242          my @contigs = $fig->all_contigs($genomeID);          my @contigs = $fig->all_contigs($genomeID);
243          for my $contigID (@contigs) {          for my $contigID (@contigs) {
244              Trace("Processing contig $contigID for $genomeID.") if T(4);              Trace("Processing contig $contigID for $genomeID.") if T(4);
245                $loadContig->Add("contigIn");
246                $loadSequence->Add("contigIn");
247              # Create the contig ID.              # Create the contig ID.
248              my $sproutContigID = "$genomeID:$contigID";              my $sproutContigID = "$genomeID:$contigID";
249              # Create the contig record and relate it to the genome.              # Create the contig record and relate it to the genome.
# Line 243  Line 255 
255              # Now we get the sequence a chunk at a time.              # Now we get the sequence a chunk at a time.
256              my $contigLen = $fig->contig_ln($genomeID, $contigID);              my $contigLen = $fig->contig_ln($genomeID, $contigID);
257              for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {              for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {
258                    $loadSequence->Add("chunkIn");
259                  # Compute the endpoint of this chunk.                  # Compute the endpoint of this chunk.
260                  my $end = FIG::min($i + $chunkSize - 1, $contigLen);                  my $end = FIG::min($i + $chunkSize - 1, $contigLen);
261                  # Get the actual DNA.                  # Get the actual DNA.
# Line 307  Line 320 
320      # Loop through the genomes found.      # Loop through the genomes found.
321      for my $genome (sort keys %{$genomeFilter}) {      for my $genome (sort keys %{$genomeFilter}) {
322          Trace("Generating coupling data for $genome.") if T(3);          Trace("Generating coupling data for $genome.") if T(3);
323            $loadCoupling->Add("genomeIn");
324          # Create a hash table for holding coupled pairs. We use this to prevent          # Create a hash table for holding coupled pairs. We use this to prevent
325          # duplicates. For example, if A is coupled to B, we don't want to also          # duplicates. For example, if A is coupled to B, we don't want to also
326          # assert that B is coupled to A, because we already know it. Fortunately,          # assert that B is coupled to A, because we already know it. Fortunately,
# Line 317  Line 331 
331          my @pegs = $fig->pegs_of($genome);          my @pegs = $fig->pegs_of($genome);
332          # Loop through the PEGs.          # Loop through the PEGs.
333          for my $peg1 (@pegs) {          for my $peg1 (@pegs) {
334                $loadCoupling->Add("pegIn");
335              Trace("Processing PEG $peg1 for $genome.") if T(4);              Trace("Processing PEG $peg1 for $genome.") if T(4);
336              # Get a list of the coupled PEGs.              # Get a list of the coupled PEGs.
337              my @couplings = $fig->coupled_to($peg1);              my @couplings = $fig->coupled_to($peg1);
# Line 327  Line 342 
342                  # Compute the coupling ID.                  # Compute the coupling ID.
343                  my $coupleID = Sprout::CouplingID($peg1, $peg2);                  my $coupleID = Sprout::CouplingID($peg1, $peg2);
344                  if (! exists $dupHash{$coupleID}) {                  if (! exists $dupHash{$coupleID}) {
345                        $loadCoupling->Add("couplingIn");
346                      # Here we have a new coupling to store in the load files.                      # Here we have a new coupling to store in the load files.
347                      Trace("Storing coupling ($coupleID) with score $score.") if T(4);                      Trace("Storing coupling ($coupleID) with score $score.") if T(4);
348                      # Ensure we don't do this again.                      # Ensure we don't do this again.
# Line 342  Line 358 
358                      my %evidenceMap = ();                      my %evidenceMap = ();
359                      # Process each evidence item.                      # Process each evidence item.
360                      for my $evidenceData (@evidence) {                      for my $evidenceData (@evidence) {
361                            $loadPCH->Add("evidenceIn");
362                          my ($peg3, $peg4, $usage) = @{$evidenceData};                          my ($peg3, $peg4, $usage) = @{$evidenceData};
363                          # Only proceed if the evidence is from a Sprout                          # Only proceed if the evidence is from a Sprout
364                          # genome.                          # genome.
365                          if ($genomeFilter->{$fig->genome_of($peg3)}) {                          if ($genomeFilter->{$fig->genome_of($peg3)}) {
366                                $loadUsesAsEvidence->Add("evidenceChosen");
367                              my $evidenceKey = "$coupleID $peg3 $peg4";                              my $evidenceKey = "$coupleID $peg3 $peg4";
368                              # We store this evidence in the hash if the usage                              # We store this evidence in the hash if the usage
369                              # is nonzero or no prior evidence has been found. This                              # is nonzero or no prior evidence has been found. This
370                              # insures that if there is duplicate evidence, we                              # insures that if there is duplicate evidence, we
371                              # at least keep the meaningful ones. Only evidence is                              # at least keep the meaningful ones. Only evidence in
372                              # the hash makes it to the output.                              # the hash makes it to the output.
373                              if ($usage || ! exists $evidenceMap{$evidenceKey}) {                              if ($usage || ! exists $evidenceMap{$evidenceKey}) {
374                                  $evidenceMap{$evidenceKey} = $evidenceData;                                  $evidenceMap{$evidenceKey} = $evidenceData;
# Line 365  Line 383 
383                          $loadIsEvidencedBy->Put($coupleID, $evidenceID);                          $loadIsEvidencedBy->Put($coupleID, $evidenceID);
384                          # Connect it to the features.                          # Connect it to the features.
385                          $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);                          $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);
386                          $loadUsesAsEvidence->Put($evidenceID, $peg4, 1);                          $loadUsesAsEvidence->Put($evidenceID, $peg4, 2);
387                      }                      }
388                  }                  }
389              }              }
# Line 408  Line 426 
426      my ($self) = @_;      my ($self) = @_;
427      # Get the FIG object.      # Get the FIG object.
428      my $fig = $self->{fig};      my $fig = $self->{fig};
429        # Find out if this is a limited run.
430        my $limited = $self->{options}->{limitedFeatures};
431      # Get the table of genome IDs.      # Get the table of genome IDs.
432      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
433      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
434      my $featureCount = $genomeCount * 4000;      my $featureCount = $genomeCount * 4000;
435      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
436      my $loadFeature = $self->_TableLoader('Feature', $featureCount);      my $loadFeature = $self->_TableLoader('Feature', $featureCount);
     my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);  
     my $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);  
     my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);  
     my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);  
437      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);
438        my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);
439        my ($loadFeatureLink, $loadFeatureTranslation, $loadFeatureUpstream);
440        if (! $limited) {
441            $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);
442            $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);
443            $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);
444        }
445      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
446      # locations.      # locations.
447      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
# Line 426  Line 449 
449      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
450      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
451          Trace("Loading features for genome $genomeID.") if T(3);          Trace("Loading features for genome $genomeID.") if T(3);
452            $loadFeature->Add("genomeIn");
453          # Get the feature list for this genome.          # Get the feature list for this genome.
454          my $features = $fig->all_features_detailed($genomeID);          my $features = $fig->all_features_detailed($genomeID);
455          # Loop through the features.          # Loop through the features.
456          for my $featureData (@{$features}) {          for my $featureData (@{$features}) {
457                $loadFeature->Add("featureIn");
458              # Split the tuple.              # Split the tuple.
459              my ($featureID, $locations, $aliases, $type) = @{$featureData};              my ($featureID, $locations, undef, $type) = @{$featureData};
460              # Create the feature record.              # Create the feature record.
461              $loadFeature->Put("$genomeID:$featureID", 1, $type);              $loadFeature->Put($featureID, 1, $type);
462              # Create the aliases.              # Create the aliases.
463              for my $alias (split /\s*,\s*/, $aliases) {              for my $alias ($fig->feature_aliases($featureID)) {
464                  $loadFeatureAlias->Put($featureID, $alias);                  $loadFeatureAlias->Put($featureID, $alias);
465              }              }
466                # The next stuff is for a full load only.
467                if (! $limited) {
468              # Get the links.              # Get the links.
469              my @links = $fig->fid_links($featureID);              my @links = $fig->fid_links($featureID);
470              for my $link (@links) {              for my $link (@links) {
# Line 445  Line 472 
472              }              }
473              # If this is a peg, generate the translation and the upstream.              # If this is a peg, generate the translation and the upstream.
474              if ($type eq 'peg') {              if ($type eq 'peg') {
475                        $loadFeatureTranslation->Add("pegIn");
476                  my $translation = $fig->get_translation($featureID);                  my $translation = $fig->get_translation($featureID);
477                  if ($translation) {                  if ($translation) {
478                      $loadFeatureTranslation->Put($featureID, $translation);                      $loadFeatureTranslation->Put($featureID, $translation);
# Line 455  Line 483 
483                      $loadFeatureUpstream->Put($featureID, $upstream);                      $loadFeatureUpstream->Put($featureID, $upstream);
484                  }                  }
485              }              }
486                }
487              # This part is the roughest. We need to relate the features to contig              # This part is the roughest. We need to relate the features to contig
488              # locations, and the locations must be split so that none of them exceed              # locations, and the locations must be split so that none of them exceed
489              # the maximum segment size. This simplifies the genes_in_region processing              # the maximum segment size. This simplifies the genes_in_region processing
490              # for Sprout.              # for Sprout.
491              my @locationList = split /\s*,\s*/, $locations;              my @locationList = split /\s*,\s*/, $locations;
492                # Create the location position indicator.
493                my $i = 1;
494              # Loop through the locations.              # Loop through the locations.
495              for my $location (@locationList) {              for my $location (@locationList) {
496                  # Parse the location.                  # Parse the location.
497                  my $locObject = BasicLocation->new($location);                  my $locObject = BasicLocation->new("$genomeID:$location");
498                  # Split it into a list of chunks.                  # Split it into a list of chunks.
499                  my @locOList = ();                  my @locOList = ();
500                  while (my $peeling = $locObject->Peel($chunkSize)) {                  while (my $peeling = $locObject->Peel($chunkSize)) {
501                        $loadIsLocatedIn->Add("peeling");
502                      push @locOList, $peeling;                      push @locOList, $peeling;
503                  }                  }
504                  push @locOList, $locObject;                  push @locOList, $locObject;
505                  # Loop through the chunks, creating IsLocatedIn records. The variable                  # Loop through the chunks, creating IsLocatedIn records. The variable
506                  # "$i" will be used to keep the location index.                  # "$i" will be used to keep the location index.
                 my $i = 1;  
507                  for my $locChunk (@locOList) {                  for my $locChunk (@locOList) {
508                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,
509                                            $locChunk->Dir, $locChunk->Length, $i);                                            $locChunk->Dir, $locChunk->Length, $i);
# Line 525  Line 556 
556      Trace("Beginning BBH load.") if T(2);      Trace("Beginning BBH load.") if T(2);
557      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
558      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
559            $loadIsBidirectionalBestHitOf->Add("genomeIn");
560          Trace("Processing features for genome $genomeID.") if T(3);          Trace("Processing features for genome $genomeID.") if T(3);
561          # Get the feature list for this genome.          # Get the feature list for this genome.
562          my $features = $fig->all_features_detailed($genomeID);          my $features = $fig->all_features_detailed($genomeID);
# Line 568  Line 600 
600    
601      Subsystem      Subsystem
602      Role      Role
603        RoleEC
604      SSCell      SSCell
605      ContainsFeature      ContainsFeature
606      IsGenomeOf      IsGenomeOf
# Line 575  Line 608 
608      OccursInSubsystem      OccursInSubsystem
609      ParticipatesIn      ParticipatesIn
610      HasSSCell      HasSSCell
611        Catalyzes
612        Reaction
613        ConsistsOfRoles
614        RoleSubset
615        HasRoleSubset
616        ConsistsOfGenomes
617        GenomeSubset
618        HasGenomeSubset
619    
620  =over 4  =over 4
621    
# Line 584  Line 625 
625    
626  =back  =back
627    
 B<TO DO>  
   
 Generate RoleName table?  
   
628  =cut  =cut
629  #: Return Type $%;  #: Return Type $%;
630  sub LoadSubsystemData {  sub LoadSubsystemData {
# Line 607  Line 644 
644      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
645      my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);      my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);
646      my $loadRole = $self->_TableLoader('Role', $featureCount * 6);      my $loadRole = $self->_TableLoader('Role', $featureCount * 6);
647        my $loadRoleEC = $self->_TableLoader('RoleEC', $featureCount * 6);
648      my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);      my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);
649      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);
650      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);
# Line 614  Line 652 
652      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);
653      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);
654      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);
655        my $loadReaction = $self->_TableLoader('Reaction', $featureCount * $genomeCount);
656        my $loadCatalyzes = $self->_TableLoader('Catalyzes', $featureCount * $genomeCount);
657        my $loadRoleSubset = $self->_TableLoader('RoleSubset', $subsysCount * 50);
658        my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $subsysCount * 50);
659        my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $featureCount * $genomeCount);
660        my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $featureCount * $genomeCount);
661        my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $subsysCount * 50);
662        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $subsysCount * 50);
663      Trace("Beginning subsystem data load.") if T(2);      Trace("Beginning subsystem data load.") if T(2);
664        # The reaction hash will contain a list of reactions for each role. When we're done,
665        # a complicated sort and merge will be used to generate the Reaction and Catalyzes
666        # tables.
667        my %reactionsToRoles = ();
668      # Loop through the subsystems. Our first task will be to create the      # Loop through the subsystems. Our first task will be to create the
669      # roles. We do this by looping through the subsystems and creating a      # roles. We do this by looping through the subsystems and creating a
670      # role hash. The hash tracks each role ID so that we don't create      # role hash. The hash tracks each role ID so that we don't create
671      # duplicates. As we move along, we'll connect the roles and subsystems.      # duplicates. As we move along, we'll connect the roles and subsystems
672        # and memorize up the reactions.
673        my ($genomeID, $roleID);
674      my %roleData = ();      my %roleData = ();
675      for my $subsysID (@subsysIDs) {      for my $subsysID (@subsysIDs) {
676          Trace("Creating subsystem $subsysID.") if T(3);          Trace("Creating subsystem $subsysID.") if T(3);
677            $loadSubsystem->Add("subsystemIn");
678            # Get the subsystem object.
679            my $sub = $fig->get_subsystem($subsysID);
680            # Get its reaction hash.
681            my $reactionHash = $sub->get_reactions();
682          # Create the subsystem record.          # Create the subsystem record.
683          $loadSubsystem->Put($subsysID);          my $curator = $sub->get_curator();
684          # Get the subsystem's roles.          my $notes = $sub->get_notes();
685          my @roles = $fig->subsys_to_roles($subsysID);          $loadSubsystem->Put($subsysID, $curator, $notes);
686          # Connect the roles to the subsystem. If a role is new, we create          # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
687          # a role record for it.          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
688          for my $roleID (@roles) {              # Connect to this role.
689              $loadOccursInSubsystem->Put($roleID, $subsysID);              $loadOccursInSubsystem->Add("roleIn");
690                $loadOccursInSubsystem->Put($roleID, $subsysID, $col);
691                # If it's a new role, add it to the role table.
692              if (! exists $roleData{$roleID}) {              if (! exists $roleData{$roleID}) {
693                  $loadRole->Put($roleID);                  # Get the role's abbreviation.
694                    my $abbr = $sub->get_role_abbr($col);
695                    # Add the role.
696                    $loadRole->Put($roleID, $abbr);
697                  $roleData{$roleID} = 1;                  $roleData{$roleID} = 1;
698                    # Check for an EC number.
699                    if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) {
700                        $loadRoleEC->Put($roleID, $1);
701                    }
702                    # Add the role's reactions.
703                    my $reactions = $reactionHash->{$roleID};
704                    for my $reactionID (@{$reactions}) {
705                        if (! exists $reactionsToRoles{$reactionID}) {
706                            # Here the reaction is brand-new, so we create its reaction
707                            # record.
708                            $loadReaction->Put($reactionID, $fig->reversible($reactionID));
709                            # We also create a blank list for it in the reaction hash.
710                            $reactionsToRoles{$reactionID} = [];
711                        }
712                        # Add the role to the reaction's role list.
713                        push @{$reactionsToRoles{$reactionID}}, $roleID;
714              }              }
715          }          }
716          # Now all roles for this subsystem have been filled in. We create the          }
717          # spreadsheet by matches roles to genomes. To do this, we need to          # Now we create the spreadsheet for the subsystem by matching roles to
718          # get the genomes on the sheet.          # genomes. Each genome is a row and each role is a column. We may need
719            # to actually create the roles as we find them.
720          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
721          my @genomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)};          for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) {
722          for my $genomeID (@genomes) {              # Only proceed if this is one of our genomes.
             # Only process this genome if it's one of ours.  
723              if (exists $genomeHash->{$genomeID}) {              if (exists $genomeHash->{$genomeID}) {
724                  # Connect the genome to the subsystem.                  # Count the PEGs and cells found for verification purposes.
725                  $loadParticipatesIn->Put($genomeID, $subsysID);                  my $pegCount = 0;
726                    my $cellCount = 0;
727                    # Create a list for the PEGs we find. This list will be used
728                    # to generate cluster numbers.
729                    my @pegsFound = ();
730                    # Create a hash that maps spreadsheet IDs to PEGs. We will
731                    # use this to generate the ContainsFeature data after we have
732                    # the cluster numbers.
733                    my %cellPegs = ();
734                    # Get the genome's variant code for this subsystem.
735                    my $variantCode = $sub->get_variant_code($row);
736                  # Loop through the subsystem's roles. We use an index because it is                  # Loop through the subsystem's roles. We use an index because it is
737                  # part of the spreadsheet cell ID.                  # part of the spreadsheet cell ID.
738                  for (my $i = 0; $i <= $#roles; $i++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
                     my $role = $roles[$i];  
739                      # Get the features in the spreadsheet cell for this genome and role.                      # Get the features in the spreadsheet cell for this genome and role.
740                      my @pegs = $fig->pegs_in_subsystem_coll($subsysID, $genomeID, $i);                      my @pegs = $sub->get_pegs_from_cell($row, $col);
741                      # Only proceed if features exist.                      # Only proceed if features exist.
742                      if (@pegs > 0) {                      if (@pegs > 0) {
743                          # Create the spreadsheet cell.                          # Create the spreadsheet cell.
744                          my $cellID = "$subsysID:$genomeID:$i";                          $cellCount++;
745                            my $cellID = "$subsysID:$genomeID:$col";
746                          $loadSSCell->Put($cellID);                          $loadSSCell->Put($cellID);
747                          $loadIsGenomeOf->Put($genomeID, $cellID);                          $loadIsGenomeOf->Put($genomeID, $cellID);
748                          $loadIsRoleOf->Put($role, $cellID);                          $loadIsRoleOf->Put($roleID, $cellID);
749                          $loadHasSSCell->Put($subsysID, $cellID);                          $loadHasSSCell->Put($subsysID, $cellID);
750                          # Attach the features to it.                          # Remember its features.
751                          for my $pegID (@pegs) {                          push @pegsFound, @pegs;
752                              $loadContainsFeature->Put($cellID, $pegID);                          $cellPegs{$cellID} = \@pegs;
753                            $pegCount += @pegs;
754                        }
755                    }
756                    # If we found some cells for this genome, we need to compute clusters and
757                    # denote it participates in the subsystem.
758                    if ($pegCount > 0) {
759                        Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
760                        $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
761                        # Partition the PEGs found into clusters.
762                        my @clusters = $fig->compute_clusters(\@pegsFound, $sub);
763                        # Create a hash mapping PEG IDs to cluster numbers.
764                        # We default to -1 for all of them.
765                        my %clusterOf = map { $_ => -1 } @pegsFound;
766                        for (my $i = 0; $i <= $#clusters; $i++) {
767                            my $subList = $clusters[$i];
768                            for my $peg (@{$subList}) {
769                                $clusterOf{$peg} = $i;
770                            }
771                        }
772                        # Create the ContainsFeature data.
773                        for my $cellID (keys %cellPegs) {
774                            my $cellList = $cellPegs{$cellID};
775                            for my $cellPeg (@$cellList) {
776                                $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg});
777                            }
778                          }                          }
779                      }                      }
780                  }                  }
781              }              }
782            # Now we need to generate the subsets. The subset names must be concatenated to
783            # the subsystem name to make them unique keys. There are two types of subsets:
784            # genome subsets and role subsets. We do the role subsets first.
785            my @subsetNames = $sub->get_subset_names();
786            for my $subsetID (@subsetNames) {
787                # Create the subset record.
788                my $actualID = "$subsysID:$subsetID";
789                $loadRoleSubset->Put($actualID);
790                # Connect the subset to the subsystem.
791                $loadHasRoleSubset->Put($subsysID, $actualID);
792                # Connect the subset to its roles.
793                my @roles = $sub->get_subset($subsetID);
794                for my $roleID (@roles) {
795                    $loadConsistsOfRoles->Put($actualID, $roleID);
796                }
797            }
798            # Next the genome subsets.
799            @subsetNames = $sub->get_subset_namesR();
800            for my $subsetID (@subsetNames) {
801                # Create the subset record.
802                my $actualID = "$subsysID:$subsetID";
803                $loadGenomeSubset->Put($actualID);
804                # Connect the subset to the subsystem.
805                $loadHasGenomeSubset->Put($subsysID, $actualID);
806                # Connect the subset to its genomes.
807                my @genomes = $sub->get_subsetR($subsetID);
808                for my $genomeID (@genomes) {
809                    $loadConsistsOfGenomes->Put($actualID, $genomeID);
810                }
811            }
812        }
813        # Before we leave, we must create the Catalyzes table. The data is all stored in
814        # "reactionToRoles" hash.
815        for my $reactionID (keys %reactionsToRoles) {
816            # Get this reaction's list of roles. We sort it so we can merge out duplicates.
817            my @roles = sort @{$reactionsToRoles{$reactionID}};
818            my $lastRole = "";
819            # Loop through the roles, creating catalyzation records.
820            for my $thisRole (@roles) {
821                if ($thisRole ne $lastRole) {
822                    $loadCatalyzes->Put($thisRole, $reactionID);
823                }
824          }          }
825      }      }
826      # Finish the load.      # Finish the load.
# Line 778  Line 933 
933      my $nextID = 1;      my $nextID = 1;
934      # Loop through the genomes.      # Loop through the genomes.
935      for my $genomeID (keys %{$genomeHash}) {      for my $genomeID (keys %{$genomeHash}) {
936            $loadProperty->Add("genomeIn");
937          # Get the genome's features. The feature ID is the first field in the          # Get the genome's features. The feature ID is the first field in the
938          # tuples returned by "all_features_detailed". We use "all_features_detailed"          # tuples returned by "all_features_detailed". We use "all_features_detailed"
939          # rather than "all_features" because we want all features regardless of type.          # rather than "all_features" because we want all features regardless of type.
940          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};
941          # Loop through the features, creating HasProperty records.          # Loop through the features, creating HasProperty records.
942          for my $fid (@features) {          for my $fid (@features) {
943                $loadProperty->Add("featureIn");
944              # Get all attributes for this feature. We do this one feature at a time              # Get all attributes for this feature. We do this one feature at a time
945              # to insure we do not get any genome attributes.              # to insure we do not get any genome attributes.
946              my @attributeList = $fig->get_attributes($fid, '', '', '');              my @attributeList = $fig->get_attributes($fid, '', '', '');
# Line 869  Line 1026 
1026      # Get the current time.      # Get the current time.
1027      my $time = time();      my $time = time();
1028      # Loop through the genomes.      # Loop through the genomes.
1029      for my $genomeID (%{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
1030          Trace("Processing $genomeID.") if T(3);          Trace("Processing $genomeID.") if T(3);
1031          # Get the genome's PEGs.          # Get the genome's PEGs.
1032          my @pegs = $fig->pegs_of($genomeID);          my @pegs = $fig->pegs_of($genomeID);
# Line 892  Line 1049 
1049                      # Denote we've seen this timestamp.                      # Denote we've seen this timestamp.
1050                      $seenTimestamps{$time} = 1;                      $seenTimestamps{$time} = 1;
1051                  }                  }
1052                }
1053                  # Now loop through the real annotations.                  # Now loop through the real annotations.
1054                  for my $tuple ($fig->feature_annotations($peg, "raw")) {                  for my $tuple ($fig->feature_annotations($peg, "raw")) {
1055                      my ($fid, $timestamp, $user, $text) = $tuple;                  my ($fid, $timestamp, $user, $text) = @{$tuple};
1056                      # Here we fix up the annotation text. "\r" is removed,                      # Here we fix up the annotation text. "\r" is removed,
1057                      # and "\t" and "\n" are escaped. Note we use the "s"                      # and "\t" and "\n" are escaped. Note we use the "s"
1058                      # modifier so that new-lines inside the text do not                      # modifier so that new-lines inside the text do not
# Line 906  Line 1064 
1064                      $text =~ s/Set master function/Set FIG function/s;                      $text =~ s/Set master function/Set FIG function/s;
1065                      # Insure the time stamp is valid.                      # Insure the time stamp is valid.
1066                      if ($timestamp =~ /^\d+$/) {                      if ($timestamp =~ /^\d+$/) {
1067                          # Here it's a number. We need to insure it's unique.                      # Here it's a number. We need to insure the one we use to form
1068                          while ($seenTimestamps{$timestamp}) {                      # the key is unique.
1069                              $timestamp++;                      my $keyStamp = $timestamp;
1070                        while ($seenTimestamps{$keyStamp}) {
1071                            $keyStamp++;
1072                          }                          }
1073                          $seenTimestamps{$timestamp} = 1;                      $seenTimestamps{$keyStamp} = 1;
1074                          my $annotationID = "$peg:$timestamp";                      my $annotationID = "$peg:$keyStamp";
1075                          # Insure the user exists.                          # Insure the user exists.
1076                          if (! $users{$user}) {                          if (! $users{$user}) {
1077                              $loadSproutUser->Put($user, "SEED user");                              $loadSproutUser->Put($user, "SEED user");
# Line 919  Line 1079 
1079                              $users{$user} = 1;                              $users{$user} = 1;
1080                          }                          }
1081                          # Generate the annotation.                          # Generate the annotation.
1082                          $loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text");                      $loadAnnotation->Put($annotationID, $timestamp, $text);
1083                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);
1084                          $loadMadeAnnotation->Put($user, $annotationID);                          $loadMadeAnnotation->Put($user, $annotationID);
1085                      } else {                      } else {
# Line 929  Line 1089 
1089                  }                  }
1090              }              }
1091          }          }
1092        # Finish the load.
1093        my $retVal = $self->_FinishAll();
1094        return $retVal;
1095    }
1096    
1097    =head3 LoadSourceData
1098    
1099    C<< my $stats = $spl->LoadSourceData(); >>
1100    
1101    Load the source data from FIG into Sprout.
1102    
1103    Source data links genomes to information about the organizations that
1104    mapped it.
1105    
1106    The following relations are loaded by this method.
1107    
1108        ComesFrom
1109        Source
1110        SourceURL
1111    
1112    There is no direct support for source attribution in FIG, so we access the SEED
1113    files directly.
1114    
1115    =over 4
1116    
1117    =item RETURNS
1118    
1119    Returns a statistics object for the loads.
1120    
1121    =back
1122    
1123    =cut
1124    #: Return Type $%;
1125    sub LoadSourceData {
1126        # Get this object instance.
1127        my ($self) = @_;
1128        # Get the FIG object.
1129        my $fig = $self->{fig};
1130        # Get the genome hash.
1131        my $genomeHash = $self->{genomes};
1132        my $genomeCount = (keys %{$genomeHash});
1133        # Create load objects for each of the tables we're loading.
1134        my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4);
1135        my $loadSource = $self->_TableLoader('Source', $genomeCount * 4);
1136        my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8);
1137        Trace("Beginning source data load.") if T(2);
1138        # Create hashes to collect the Source information.
1139        my %sourceURL = ();
1140        my %sourceDesc = ();
1141        # Loop through the genomes.
1142        my $line;
1143        for my $genomeID (sort keys %{$genomeHash}) {
1144            Trace("Processing $genomeID.") if T(3);
1145            # Open the project file.
1146            if ((open(TMP, "<$FIG_Config::organisms/$genomeID/PROJECT")) &&
1147                defined($line = <TMP>)) {
1148                chomp $line;
1149                my($sourceID, $desc, $url) = split(/\t/,$line);
1150                $loadComesFrom->Put($genomeID, $sourceID);
1151                if ($url && ! exists $sourceURL{$sourceID}) {
1152                    $loadSourceURL->Put($sourceID, $url);
1153                    $sourceURL{$sourceID} = 1;
1154                }
1155                if ($desc) {
1156                    $sourceDesc{$sourceID} = $desc;
1157                } elsif (! exists $sourceDesc{$sourceID}) {
1158                    $sourceDesc{$sourceID} = $sourceID;
1159                }
1160            }
1161            close TMP;
1162        }
1163        # Write the source descriptions.
1164        for my $sourceID (keys %sourceDesc) {
1165            $loadSource->Put($sourceID, $sourceDesc{$sourceID});
1166        }
1167        # Finish the load.
1168        my $retVal = $self->_FinishAll();
1169        return $retVal;
1170    }
1171    
1172    =head3 LoadExternalData
1173    
1174    C<< my $stats = $spl->LoadExternalData(); >>
1175    
1176    Load the external data from FIG into Sprout.
1177    
1178    External data contains information about external feature IDs.
1179    
1180    The following relations are loaded by this method.
1181    
1182        ExternalAliasFunc
1183        ExternalAliasOrg
1184    
1185    The support for external IDs in FIG is hidden beneath layers of other data, so
1186    we access the SEED files directly to create these tables. This is also one of
1187    the few load methods that does not proceed genome by genome.
1188    
1189    =over 4
1190    
1191    =item RETURNS
1192    
1193    Returns a statistics object for the loads.
1194    
1195    =back
1196    
1197    =cut
1198    #: Return Type $%;
1199    sub LoadExternalData {
1200        # Get this object instance.
1201        my ($self) = @_;
1202        # Get the FIG object.
1203        my $fig = $self->{fig};
1204        # Get the genome hash.
1205        my $genomeHash = $self->{genomes};
1206        my $genomeCount = (keys %{$genomeHash});
1207        # Convert the genome hash. We'll get the genus and species for each genome and make
1208        # it the key.
1209        my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
1210        # Create load objects for each of the tables we're loading.
1211        my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000);
1212        my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);
1213        Trace("Beginning external data load.") if T(2);
1214        # We loop through the files one at a time. First, the organism file.
1215        Open(\*ORGS, "<$FIG_Config::global/ext_org.table");
1216        my $orgLine;
1217        while (defined($orgLine = <ORGS>)) {
1218            # Clean the input line.
1219            chomp $orgLine;
1220            # Parse the organism name.
1221            my ($protID, $name) = split /\s*\t\s*/, $orgLine;
1222            $loadExternalAliasOrg->Put($protID, $name);
1223        }
1224        close ORGS;
1225        # Now the function file.
1226        my $funcLine;
1227        Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");
1228        while (defined($funcLine = <FUNCS>)) {
1229            # Clean the line ending.
1230            chomp $funcLine;
1231            # Only proceed if the line is non-blank.
1232            if ($funcLine) {
1233                # Split it into fields.
1234                my @funcFields = split /\s*\t\s*/, $funcLine;
1235                # If there's an EC number, append it to the description.
1236                if ($#funcFields >= 2 && $funcFields[2] =~ /^(EC .*\S)/) {
1237                    $funcFields[1] .= " $1";
1238                }
1239                # Output the function line.
1240                $loadExternalAliasFunc->Put(@funcFields[0,1]);
1241            }
1242        }
1243        # Finish the load.
1244        my $retVal = $self->_FinishAll();
1245        return $retVal;
1246    }
1247    
1248    
1249    =head3 LoadReactionData
1250    
1251    C<< my $stats = $spl->LoadReactionData(); >>
1252    
1253    Load the reaction data from FIG into Sprout.
1254    
1255    Reaction data connects reactions to the compounds that participate in them.
1256    
1257    The following relations are loaded by this method.
1258    
1259        ReactionURL
1260        Compound
1261        CompoundName
1262        CompoundCAS
1263        IsAComponentOf
1264    
1265    This method proceeds reaction by reaction rather than genome by genome.
1266    
1267    =over 4
1268    
1269    =item RETURNS
1270    
1271    Returns a statistics object for the loads.
1272    
1273    =back
1274    
1275    =cut
1276    #: Return Type $%;
1277    sub LoadReactionData {
1278        # Get this object instance.
1279        my ($self) = @_;
1280        # Get the FIG object.
1281        my $fig = $self->{fig};
1282        # Get the genome hash.
1283        my $genomeHash = $self->{genomes};
1284        my $genomeCount = (keys %{$genomeHash});
1285        # Create load objects for each of the tables we're loading.
1286        my $loadReactionURL = $self->_TableLoader('ReactionURL', $genomeCount * 4000);
1287        my $loadCompound = $self->_TableLoader('Compound', $genomeCount * 4000);
1288        my $loadCompoundName = $self->_TableLoader('CompoundName', $genomeCount * 8000);
1289        my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $genomeCount * 4000);
1290        my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $genomeCount * 12000);
1291        Trace("Beginning reaction/compound data load.") if T(2);
1292        # Create a hash to remember the compounds we've generated in the compound table.
1293        my %compoundHash = ();
1294        # Loop through the reactions.
1295        my @reactions = $fig->all_reactions();
1296        for my $reactionID (@reactions) {
1297            # Compute the reaction's URL.
1298            my $url = HTML::reaction_link($reactionID);
1299            # Put it in the ReactionURL table.
1300            $loadReactionURL->Put($reactionID, $url);
1301            # Now we need all of the reaction's compounds. We get these in two phases,
1302            # substrates first and then products.
1303            for my $product (0, 1) {
1304                # Get the compounds of the current type for the current reaction. FIG will
1305                # give us 3-tuples: [ID, Stoichometry, main-flag]. At this time we do not
1306                # have location data in SEED, so it defaults to the empty string.
1307                my @compounds = $fig->reaction2comp($reactionID, $product);
1308                for my $compData (@compounds) {
1309                    # Extract the compound data from the current tuple.
1310                    my ($cid, $stoich, $main) = @{$compData};
1311                    # Link the compound to the reaction.
1312                    $loadIsAComponentOf->Put($cid, $reactionID, "", $main, $product, $stoich);
1313                    # If this is a new compound, we need to create its table entries.
1314                    if (! exists $compoundHash{$cid}) {
1315                        $compoundHash{$cid} = 1;
1316                        # Create the main compound record and denote we've done it.
1317                        $loadCompound->Put($cid);
1318                        # Check for a CAS ID.
1319                        my $cas = $fig->cas($cid);
1320                        if ($cas) {
1321                            $loadCompoundCAS->Put($cid, $cas);
1322                        }
1323                        # Check for names.
1324                        my @names = $fig->names_of_compound($cid);
1325                        # Each name will be given a priority number, starting with 1.
1326                        my $prio = 0;
1327                        for my $name (@names) {
1328                            $loadCompoundName->Put($cid, $name, $prio++);
1329                        }
1330                    }
1331                }
1332            }
1333        }
1334        # Finish the load.
1335        my $retVal = $self->_FinishAll();
1336        return $retVal;
1337    }
1338    
1339    =head3 LoadGroupData
1340    
1341    C<< my $stats = $spl->LoadGroupData(); >>
1342    
1343    Load the genome Groups into Sprout.
1344    
1345    The following relations are loaded by this method.
1346    
1347        GenomeGroups
1348    
1349    There is no direct support for genome groups in FIG, so we access the SEED
1350    files directly.
1351    
1352    =over 4
1353    
1354    =item RETURNS
1355    
1356    Returns a statistics object for the loads.
1357    
1358    =back
1359    
1360    =cut
1361    #: Return Type $%;
1362    sub LoadGroupData {
1363        # Get this object instance.
1364        my ($self) = @_;
1365        # Get the FIG object.
1366        my $fig = $self->{fig};
1367        # Get the genome hash.
1368        my $genomeHash = $self->{genomes};
1369        my $genomeCount = (keys %{$genomeHash});
1370        # Create a load object for the table we're loading.
1371        my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4);
1372        Trace("Beginning group data load.") if T(2);
1373        # Loop through the genomes.
1374        my $line;
1375        for my $genomeID (keys %{$genomeHash}) {
1376            Trace("Processing $genomeID.") if T(3);
1377            # Open the NMPDR group file for this genome.
1378            if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
1379                defined($line = <TMP>)) {
1380                # Clean the line ending.
1381                chomp $line;
1382                # Add the group to the table. Note that there can only be one group
1383                # per genome.
1384                $loadGenomeGroups->Put($genomeID, $line);
1385            }
1386            close TMP;
1387      }      }
1388      # Finish the load.      # Finish the load.
1389      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 1004  Line 1459 
1459      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
1460      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads restartable.
1461      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
1462            # Trace the fact that we're cleaning up.
1463            my $relName = $loader->RelName;
1464            Trace("Finishing load for $relName.") if T(2);
1465          my $stats = $loader->Finish();          my $stats = $loader->Finish();
1466            if ($self->{options}->{dbLoad}) {
1467                # Here we want to use the load file just created to load the database.
1468                Trace("Loading relation $relName.") if T(2);
1469                my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]);
1470                # Accumulate the statistics from the DB load.
1471                $stats->Accumulate($newStats);
1472            }
1473          $retVal->Accumulate($stats);          $retVal->Accumulate($stats);
         my $relName = $loader->RelName;  
1474          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
1475      }      }
1476      # Return the load statistics.      # Return the load statistics.

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.19

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3