[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory | Revision Log | View Patch

revision 1.21, Sat Nov 12 03:42:48 2005 UTC revision 1.33, Sat May 13 04:11:23 2006 UTC
# Line 132  Line 132 
132      # Load the list of trusted subsystems.      # Load the list of trusted subsystems.
133      my %subsystems = ();      my %subsystems = ();
134      if (! defined $subsysFile || $subsysFile eq '') {      if (! defined $subsysFile || $subsysFile eq '') {
135          # Here we want all the subsystems.          # Here we want all the NMPDR subsystems. First we get the whole list.
136          %subsystems = map { $_ => 1 } $fig->all_subsystems();          my @subs = $fig->all_subsystems();
137            # Loop through, checking for the NMPDR file.
138            for my $sub (@subs) {
139                if (-e "$FIG_Config::data/Subsystems/$sub/NMPDR") {
140                    $subsystems{$sub} = 1;
141                }
142            }
143      } else {      } else {
144          my $type = ref $subsysFile;          my $type = ref $subsysFile;
145          if ($type eq 'ARRAY') {          if ($type eq 'ARRAY') {
# Line 171  Line 177 
177      return $retVal;      return $retVal;
178  }  }
179    
180    =head3 LoadOnly
181    
182    C<< my $flag = $spl->LoadOnly; >>
183    
184    Return TRUE if we are in load-only mode, else FALSE.
185    
186    =cut
187    
188    sub LoadOnly {
189        my ($self) = @_;
190        return $self->{options}->{loadOnly};
191    }
192    
193    =head3 PrimaryOnly
194    
195    C<< my $flag = $spl->PrimaryOnly; >>
196    
197    Return TRUE if only the main entity is to be loaded, else FALSE.
198    
199    =cut
200    
201    sub PrimaryOnly {
202        my ($self) = @_;
203        return $self->{options}->{primaryOnly};
204    }
205    
206  =head3 LoadGenomeData  =head3 LoadGenomeData
207    
208  C<< my $stats = $spl->LoadGenomeData(); >>  C<< my $stats = $spl->LoadGenomeData(); >>
# Line 198  Line 230 
230    
231  =back  =back
232    
 B<TO DO>  
   
 Real quality vectors instead of C<unknown> for everything.  
   
 GenomeGroup relation. (The original script took group information from the C<NMPDR> file  
 in each genome's main directory, but no such file exists anywhere in my version of the  
 data store.)  
   
233  =cut  =cut
234  #: Return Type $%;  #: Return Type $%;
235  sub LoadGenomeData {  sub LoadGenomeData {
# Line 216  Line 240 
240      # Get the genome count.      # Get the genome count.
241      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
242      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
     Trace("Beginning genome data load.") if T(2);  
243      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
244      my $loadGenome = $self->_TableLoader('Genome', $genomeCount);      my $loadGenome = $self->_TableLoader('Genome');
245      my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300);      my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly);
246      my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300);      my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly);
247      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly);
248      my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000);      my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly);
249        if ($self->{options}->{loadOnly}) {
250            Trace("Loading from existing files.") if T(2);
251        } else {
252            Trace("Generating genome data.") if T(2);
253      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
254      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
255          Trace("Loading data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
256          $loadGenome->Add("genomeIn");          $loadGenome->Add("genomeIn");
257          # The access code comes in via the genome hash.          # The access code comes in via the genome hash.
258          my $accessCode = $genomeHash->{$genomeID};          my $accessCode = $genomeHash->{$genomeID};
259          # Get the genus, species, and strain from the scientific name. Note that we append              # Get the genus, species, and strain from the scientific name.
         # the genome ID to the strain. In some cases this is the totality of the strain name.  
260          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);
261          my $extra = join " ", @extraData, "[$genomeID]";              my $extra = join " ", @extraData;
262          # Get the full taxonomy.          # Get the full taxonomy.
263          my $taxonomy = $fig->taxonomy_of($genomeID);          my $taxonomy = $fig->taxonomy_of($genomeID);
264          # Output the genome record.          # Output the genome record.
# Line 268  Line 294 
294              }              }
295          }          }
296      }      }
297        }
298      # Finish the loads.      # Finish the loads.
299      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
300      # Return the result.      # Return the result.
# Line 311  Line 338 
338      my $genomeCount = (keys %{$genomeFilter});      my $genomeCount = (keys %{$genomeFilter});
339      my $featureCount = $genomeCount * 4000;      my $featureCount = $genomeCount * 4000;
340      # Start the loads.      # Start the loads.
341      my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount);      my $loadCoupling = $self->_TableLoader('Coupling');
342      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000);      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);
343      my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000);      my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly);
344      my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000);      my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly);
345      my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000);      my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly);
346      Trace("Beginning coupling data load.") if T(2);      if ($self->{options}->{loadOnly}) {
347            Trace("Loading from existing files.") if T(2);
348        } else {
349            Trace("Generating coupling data.") if T(2);
350      # Loop through the genomes found.      # Loop through the genomes found.
351      for my $genome (sort keys %{$genomeFilter}) {      for my $genome (sort keys %{$genomeFilter}) {
352          Trace("Generating coupling data for $genome.") if T(3);          Trace("Generating coupling data for $genome.") if T(3);
# Line 389  Line 419 
419              }              }
420          }          }
421      }      }
422        }
423      # All done. Finish the load.      # All done. Finish the load.
424      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
425      return $retVal;      return $retVal;
# Line 410  Line 441 
441      FeatureTranslation      FeatureTranslation
442      FeatureUpstream      FeatureUpstream
443      IsLocatedIn      IsLocatedIn
444        HasFeature
445    
446  =over 4  =over 4
447    
# Line 426  Line 458 
458      my ($self) = @_;      my ($self) = @_;
459      # Get the FIG object.      # Get the FIG object.
460      my $fig = $self->{fig};      my $fig = $self->{fig};
     # Find out if this is a limited run.  
     my $limited = $self->{options}->{limitedFeatures};  
461      # Get the table of genome IDs.      # Get the table of genome IDs.
462      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
463      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
464      my $loadFeature = $self->_TableLoader('Feature', $featureCount);      my $loadFeature = $self->_TableLoader('Feature');
465      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly);
466      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
467      my ($loadFeatureLink, $loadFeatureTranslation, $loadFeatureUpstream);      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
468      if (! $limited) {      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
469          $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
470          $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);      my $loadHasFeature = $self->_TableLoader('HasFeature');
         $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);  
     }  
471      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
472      # locations.      # locations.
473      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
474      Trace("Beginning feature data load.") if T(2);      if ($self->{options}->{loadOnly}) {
475            Trace("Loading from existing files.") if T(2);
476        } else {
477            Trace("Generating feature data.") if T(2);
478      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
479      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
480          Trace("Loading features for genome $genomeID.") if T(3);          Trace("Loading features for genome $genomeID.") if T(3);
# Line 459  Line 488 
488              my ($featureID, $locations, undef, $type) = @{$featureData};              my ($featureID, $locations, undef, $type) = @{$featureData};
489              # Create the feature record.              # Create the feature record.
490              $loadFeature->Put($featureID, 1, $type);              $loadFeature->Put($featureID, 1, $type);
491                    # Link it to the parent genome.
492                    $loadHasFeature->Put($genomeID, $featureID, $type);
493              # Create the aliases.              # Create the aliases.
494              for my $alias ($fig->feature_aliases($featureID)) {              for my $alias ($fig->feature_aliases($featureID)) {
495                  $loadFeatureAlias->Put($featureID, $alias);                  $loadFeatureAlias->Put($featureID, $alias);
496              }              }
             # The next stuff is for a full load only.  
             if (! $limited) {  
497                  # Get the links.                  # Get the links.
498                  my @links = $fig->fid_links($featureID);                  my @links = $fig->fid_links($featureID);
499                  for my $link (@links) {                  for my $link (@links) {
# Line 483  Line 512 
512                          $loadFeatureUpstream->Put($featureID, $upstream);                          $loadFeatureUpstream->Put($featureID, $upstream);
513                      }                      }
514                  }                  }
             }  
515              # This part is the roughest. We need to relate the features to contig              # This part is the roughest. We need to relate the features to contig
516              # locations, and the locations must be split so that none of them exceed              # locations, and the locations must be split so that none of them exceed
517              # the maximum segment size. This simplifies the genes_in_region processing              # the maximum segment size. This simplifies the genes_in_region processing
# Line 512  Line 540 
540              }              }
541          }          }
542      }      }
543        }
544      # Finish the loads.      # Finish the loads.
545      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
546      return $retVal;      return $retVal;
# Line 548  Line 577 
577      my $fig = $self->{fig};      my $fig = $self->{fig};
578      # Get the table of genome IDs.      # Get the table of genome IDs.
579      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
580      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
581      my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf',      my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf');
582                                                             $featureCount * $genomeCount);      if ($self->{options}->{loadOnly}) {
583      Trace("Beginning BBH load.") if T(2);          Trace("Loading from existing files.") if T(2);
584        } else {
585            Trace("Generating BBH data.") if T(2);
586      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
587      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
588          $loadIsBidirectionalBestHitOf->Add("genomeIn");          $loadIsBidirectionalBestHitOf->Add("genomeIn");
# Line 579  Line 608 
608              }              }
609          }          }
610      }      }
611        }
612      # Finish the loads.      # Finish the loads.
613      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
614      return $retVal;      return $retVal;
# Line 639  Line 669 
669      # Get the subsystem hash. This lists the subsystems we'll process.      # Get the subsystem hash. This lists the subsystems we'll process.
670      my $subsysHash = $self->{subsystems};      my $subsysHash = $self->{subsystems};
671      my @subsysIDs = sort keys %{$subsysHash};      my @subsysIDs = sort keys %{$subsysHash};
     my $subsysCount = @subsysIDs;  
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
672      # Get the map list.      # Get the map list.
673      my @maps = $fig->all_maps;      my @maps = $fig->all_maps;
     my $mapCount = @maps;  
     # Create load objects for each of the tables we're loading.  
     my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);  
     my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);  
     my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);  
     my $loadRole = $self->_TableLoader('Role', $featureCount * 6);  
     my $loadRoleEC = $self->_TableLoader('RoleEC', $featureCount * 6);  
     my $loadCatalyzes = $self->_TableLoader('Catalyzes', $genomeCount * $featureCount);  
     my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);  
     my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);  
     my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);  
     my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount);  
     my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);  
     my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);  
     my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);  
     my $loadRoleSubset = $self->_TableLoader('RoleSubset', $subsysCount * 50);  
     my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $subsysCount * 50);  
     my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $featureCount * $genomeCount);  
     my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $featureCount * $genomeCount);  
     my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $subsysCount * 50);  
     my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $subsysCount * 50);  
674      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
675      Trace("Beginning subsystem data load.") if T(2);      my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly);
676        my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly);
677        my $loadSubsystem = $self->_TableLoader('Subsystem');
678        my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly);
679        my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly);
680        my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly);
681        my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly);
682        my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly);
683        my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly);
684        my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly);
685        my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly);
686        my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly);
687        my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly);
688        my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly);
689        my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly);
690        my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly);
691        my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly);
692        my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly);
693        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly);
694        if ($self->{options}->{loadOnly}) {
695            Trace("Loading from existing files.") if T(2);
696        } else {
697            Trace("Generating subsystem data.") if T(2);
698      # This hash will contain the role for each EC. When we're done, this      # This hash will contain the role for each EC. When we're done, this
699      # information will be used to generate the Catalyzes table.      # information will be used to generate the Catalyzes table.
700      my %ecToRoles = ();      my %ecToRoles = ();
# Line 678  Line 706 
706      my ($genomeID, $roleID);      my ($genomeID, $roleID);
707      my %roleData = ();      my %roleData = ();
708      for my $subsysID (@subsysIDs) {      for my $subsysID (@subsysIDs) {
         Trace("Creating subsystem $subsysID.") if T(3);  
         $loadSubsystem->Add("subsystemIn");  
709          # Get the subsystem object.          # Get the subsystem object.
710          my $sub = $fig->get_subsystem($subsysID);          my $sub = $fig->get_subsystem($subsysID);
711                # Only proceed if the subsystem has a spreadsheet.
712                if (! $sub->{empty_ss}) {
713                    Trace("Creating subsystem $subsysID.") if T(3);
714                    $loadSubsystem->Add("subsystemIn");
715          # Create the subsystem record.          # Create the subsystem record.
716          my $curator = $sub->get_curator();          my $curator = $sub->get_curator();
717          my $notes = $sub->get_notes();          my $notes = $sub->get_notes();
# Line 782  Line 812 
812              # Connect the subset to the subsystem.              # Connect the subset to the subsystem.
813              $loadHasRoleSubset->Put($subsysID, $actualID);              $loadHasRoleSubset->Put($subsysID, $actualID);
814              # Connect the subset to its roles.              # Connect the subset to its roles.
815              my @roles = $sub->get_subset($subsetID);                      my @roles = $sub->get_subsetC_roles($subsetID);
816              for my $roleID (@roles) {              for my $roleID (@roles) {
817                  $loadConsistsOfRoles->Put($actualID, $roleID);                  $loadConsistsOfRoles->Put($actualID, $roleID);
818              }              }
# Line 834  Line 864 
864              }              }
865          }          }
866      }      }
867            }
868        }
869      # Finish the load.      # Finish the load.
870      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
871      return $retVal;      return $retVal;
# Line 875  Line 907 
907      my $fig = $self->{fig};      my $fig = $self->{fig};
908      # Get the genome hash.      # Get the genome hash.
909      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
910      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
911      my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500);      my $loadProperty = $self->_TableLoader('Property');
912      my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500);      my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly);
913      Trace("Beginning property data load.") if T(2);      if ($self->{options}->{loadOnly}) {
914            Trace("Loading from existing files.") if T(2);
915        } else {
916            Trace("Generating property data.") if T(2);
917      # Create a hash for storing property IDs.      # Create a hash for storing property IDs.
918      my %propertyKeys = ();      my %propertyKeys = ();
919      my $nextID = 1;      my $nextID = 1;
920      # Loop through the genomes.      # Loop through the genomes.
921      for my $genomeID (keys %{$genomeHash}) {      for my $genomeID (keys %{$genomeHash}) {
922          $loadProperty->Add("genomeIn");          $loadProperty->Add("genomeIn");
923                Trace("Generating properties for $genomeID.") if T(3);
924          # Get the genome's features. The feature ID is the first field in the          # Get the genome's features. The feature ID is the first field in the
925          # tuples returned by "all_features_detailed". We use "all_features_detailed"          # tuples returned by "all_features_detailed". We use "all_features_detailed"
926          # rather than "all_features" because we want all features regardless of type.          # rather than "all_features" because we want all features regardless of type.
927          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};
928                my $featureCount = 0;
929                my $propertyCount = 0;
930          # Loop through the features, creating HasProperty records.          # Loop through the features, creating HasProperty records.
931          for my $fid (@features) {          for my $fid (@features) {
             $loadProperty->Add("featureIn");  
932              # Get all attributes for this feature. We do this one feature at a time              # Get all attributes for this feature. We do this one feature at a time
933              # to ensure we do not get any genome attributes.              # to ensure we do not get any genome attributes.
934              my @attributeList = $fig->get_attributes($fid, '', '', '');              my @attributeList = $fig->get_attributes($fid, '', '', '');
935                    if (scalar @attributeList) {
936                        $featureCount++;
937                    }
938              # Loop through the attributes.              # Loop through the attributes.
939              for my $tuple (@attributeList) {              for my $tuple (@attributeList) {
940                        $propertyCount++;
941                  # Get this attribute value's data. Note that we throw away the FID,                  # Get this attribute value's data. Note that we throw away the FID,
942                  # since it will always be the same as the value of "$fid".                  # since it will always be the same as the value of "$fid".
943                  my (undef, $key, $value, $url) = @{$tuple};                  my (undef, $key, $value, $url) = @{$tuple};
# Line 919  Line 959 
959                  $loadHasProperty->Put($fid, $propertyID, $url);                  $loadHasProperty->Put($fid, $propertyID, $url);
960              }              }
961          }          }
962                # Update the statistics.
963                Trace("$propertyCount attributes processed for $featureCount features.") if T(3);
964                $loadHasProperty->Add("featuresIn", $featureCount);
965                $loadHasProperty->Add("propertiesIn", $propertyCount);
966            }
967      }      }
968      # Finish the load.      # Finish the load.
969      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 959  Line 1004 
1004      my $fig = $self->{fig};      my $fig = $self->{fig};
1005      # Get the genome hash.      # Get the genome hash.
1006      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1007      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1008      my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000);      my $loadAnnotation = $self->_TableLoader('Annotation');
1009      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly);
1010      my $loadSproutUser = $self->_TableLoader('SproutUser', 100);      my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly);
1011      my $loadUserAccess = $self->_TableLoader('UserAccess', 1000);      my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly);
1012      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly);
1013      Trace("Beginning annotation data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1014            Trace("Loading from existing files.") if T(2);
1015        } else {
1016            Trace("Generating annotation data.") if T(2);
1017      # Create a hash of user names. We'll use this to prevent us from generating duplicate      # Create a hash of user names. We'll use this to prevent us from generating duplicate
1018      # user records.      # user records.
1019      my %users = ( FIG => 1, master => 1 );      my %users = ( FIG => 1, master => 1 );
# Line 987  Line 1034 
1034              # Create a hash of timestamps. We use this to prevent duplicate time stamps              # Create a hash of timestamps. We use this to prevent duplicate time stamps
1035              # from showing up for a single PEG's annotations.              # from showing up for a single PEG's annotations.
1036              my %seenTimestamps = ();              my %seenTimestamps = ();
1037              # Check for a functional assignment.                  # Loop through the annotations.
             my $func = $fig->function_of($peg);  
             if ($func) {  
                 # If this is NOT a hypothetical assignment, we create an  
                 # assignment annotation for it.  
                 if (! FIG::hypo($peg)) {  
                     # Note that we double the slashes so that what goes into the database is  
                     # a new-line escape sequence rather than an actual new-line.  
                     $loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func");  
                     $loadIsTargetOfAnnotation->Put($peg, "$peg:$time");  
                     $loadMadeAnnotation->Put("FIG", "$peg:$time");  
                     # Denote we've seen this timestamp.  
                     $seenTimestamps{$time} = 1;  
                 }  
             }  
             # Now loop through the real annotations.  
1038              for my $tuple ($fig->feature_annotations($peg, "raw")) {              for my $tuple ($fig->feature_annotations($peg, "raw")) {
1039                  my ($fid, $timestamp, $user, $text) = @{$tuple};                  my ($fid, $timestamp, $user, $text) = @{$tuple};
1040                  # Here we fix up the annotation text. "\r" is removed,                  # Here we fix up the annotation text. "\r" is removed,
# Line 1041  Line 1073 
1073              }              }
1074          }          }
1075      }      }
1076        }
1077      # Finish the load.      # Finish the load.
1078      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1079      return $retVal;      return $retVal;
# Line 1081  Line 1114 
1114      my $fig = $self->{fig};      my $fig = $self->{fig};
1115      # Get the genome hash.      # Get the genome hash.
1116      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1117      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1118      my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4);      my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly);
1119      my $loadSource = $self->_TableLoader('Source', $genomeCount * 4);      my $loadSource = $self->_TableLoader('Source');
1120      my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8);      my $loadSourceURL = $self->_TableLoader('SourceURL');
1121      Trace("Beginning source data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1122            Trace("Loading from existing files.") if T(2);
1123        } else {
1124            Trace("Generating annotation data.") if T(2);
1125      # Create hashes to collect the Source information.      # Create hashes to collect the Source information.
1126      my %sourceURL = ();      my %sourceURL = ();
1127      my %sourceDesc = ();      my %sourceDesc = ();
# Line 1116  Line 1151 
1151      for my $sourceID (keys %sourceDesc) {      for my $sourceID (keys %sourceDesc) {
1152          $loadSource->Put($sourceID, $sourceDesc{$sourceID});          $loadSource->Put($sourceID, $sourceDesc{$sourceID});
1153      }      }
1154        }
1155      # Finish the load.      # Finish the load.
1156      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1157      return $retVal;      return $retVal;
# Line 1155  Line 1191 
1191      my $fig = $self->{fig};      my $fig = $self->{fig};
1192      # Get the genome hash.      # Get the genome hash.
1193      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1194      # Convert the genome hash. We'll get the genus and species for each genome and make      # Convert the genome hash. We'll get the genus and species for each genome and make
1195      # it the key.      # it the key.
1196      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
1197      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1198      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000);      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
1199      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
1200      Trace("Beginning external data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1201            Trace("Loading from existing files.") if T(2);
1202        } else {
1203            Trace("Generating external data.") if T(2);
1204      # We loop through the files one at a time. First, the organism file.      # We loop through the files one at a time. First, the organism file.
1205      Open(\*ORGS, "<$FIG_Config::global/ext_org.table");      Open(\*ORGS, "<$FIG_Config::global/ext_org.table");
1206      my $orgLine;      my $orgLine;
# Line 1192  Line 1230 
1230              $loadExternalAliasFunc->Put(@funcFields[0,1]);              $loadExternalAliasFunc->Put(@funcFields[0,1]);
1231          }          }
1232      }      }
1233        }
1234      # Finish the load.      # Finish the load.
1235      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1236      return $retVal;      return $retVal;
# Line 1232  Line 1271 
1271      my ($self) = @_;      my ($self) = @_;
1272      # Get the FIG object.      # Get the FIG object.
1273      my $fig = $self->{fig};      my $fig = $self->{fig};
     # Get the genome hash.  
     my $genomeHash = $self->{genomes};  
     my $genomeCount = (keys %{$genomeHash});  
1274      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1275      my $loadReaction = $self->_TableLoader('Reaction', $genomeCount * 4000);      my $loadReaction = $self->_TableLoader('Reaction');
1276      my $loadReactionURL = $self->_TableLoader('ReactionURL', $genomeCount * 4000);      my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly);
1277      my $loadCompound = $self->_TableLoader('Compound', $genomeCount * 4000);      my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly);
1278      my $loadCompoundName = $self->_TableLoader('CompoundName', $genomeCount * 8000);      my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly);
1279      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $genomeCount * 4000);      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly);
1280      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $genomeCount * 12000);      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly);
1281      Trace("Beginning reaction/compound data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1282            Trace("Loading from existing files.") if T(2);
1283        } else {
1284            Trace("Generating annotation data.") if T(2);
1285      # First we create the compounds.      # First we create the compounds.
1286      my @compounds = $fig->all_compounds();      my @compounds = $fig->all_compounds();
1287      for my $cid (@compounds) {      for my $cid (@compounds) {
# Line 1291  Line 1330 
1330              }              }
1331          }          }
1332      }      }
1333        }
1334      # Finish the load.      # Finish the load.
1335      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1336      return $retVal;      return $retVal;
# Line 1326  Line 1366 
1366      my $fig = $self->{fig};      my $fig = $self->{fig};
1367      # Get the genome hash.      # Get the genome hash.
1368      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1369      # Create a load object for the table we're loading.      # Create a load object for the table we're loading.
1370      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4);      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups');
1371      Trace("Beginning group data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1372            Trace("Loading from existing files.") if T(2);
1373        } else {
1374            Trace("Generating group data.") if T(2);
1375      # Loop through the genomes.      # Loop through the genomes.
1376      my $line;      my $line;
1377      for my $genomeID (keys %{$genomeHash}) {      for my $genomeID (keys %{$genomeHash}) {
# Line 1345  Line 1387 
1387          }          }
1388          close TMP;          close TMP;
1389      }      }
1390        }
1391      # Finish the load.      # Finish the load.
1392      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1393      return $retVal;      return $retVal;
# Line 1366  Line 1409 
1409    
1410  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
1411    
1412  =item rowCount (optional)  =item ignore
1413    
1414  Estimated maximum number of rows in the table.  TRUE if the table should be ignored entirely, else FALSE.
1415    
1416  =item RETURN  =item RETURN
1417    
# Line 1380  Line 1423 
1423    
1424  sub _TableLoader {  sub _TableLoader {
1425      # Get the parameters.      # Get the parameters.
1426      my ($self, $tableName, $rowCount) = @_;      my ($self, $tableName, $ignore) = @_;
1427      # Create the load object.      # Create the load object.
1428      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount);      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly,
1429                                   $ignore);
1430      # Cache it in the loader list.      # Cache it in the loader list.
1431      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
1432      # Return it to the caller.      # Return it to the caller.
# Line 1419  Line 1463 
1463      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
1464      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads restartable.
1465      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
1466          # Trace the fact that we're cleaning up.          # Get the relation name.
1467          my $relName = $loader->RelName;          my $relName = $loader->RelName;
1468          Trace("Finishing load for $relName.") if T(2);          # Check the ignore flag.
1469            if ($loader->Ignore) {
1470                Trace("Relation $relName not loaded.") if T(2);
1471            } else {
1472                # Here we really need to finish.
1473                Trace("Finishing $relName.") if T(2);
1474          my $stats = $loader->Finish();          my $stats = $loader->Finish();
1475          if ($self->{options}->{dbLoad}) {          if ($self->{options}->{dbLoad}) {
1476              # Here we want to use the load file just created to load the database.              # Here we want to use the load file just created to load the database.
# Line 1433  Line 1482 
1482          $retVal->Accumulate($stats);          $retVal->Accumulate($stats);
1483          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
1484      }      }
1485        }
1486      # Return the load statistics.      # Return the load statistics.
1487      return $retVal;      return $retVal;
1488  }  }

Legend:
Removed from v.1.21  
changed lines
  Added in v.1.33

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3