[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory | Revision Log | View Patch

revision 1.20, Wed Nov 2 21:54:40 2005 UTC revision 1.26, Mon Jan 30 21:57:02 2006 UTC
# Line 171  Line 171 
171      return $retVal;      return $retVal;
172  }  }
173    
174    =head3 LoadOnly
175    
176    C<< my $flag = $spl->LoadOnly; >>
177    
178    Return TRUE if we are in load-only mode, else FALSE.
179    
180    =cut
181    
182    sub LoadOnly {
183        my ($self) = @_;
184        return $self->{options}->{loadOnly};
185    }
186    
187    =head3 PrimaryOnly
188    
189    C<< my $flag = $spl->PrimaryOnly; >>
190    
191    Return TRUE if only the main entity is to be loaded, else FALSE.
192    
193    =cut
194    
195    sub PrimaryOnly {
196        my ($self) = @_;
197        return $self->{options}->{primaryOnly};
198    }
199    
200  =head3 LoadGenomeData  =head3 LoadGenomeData
201    
202  C<< my $stats = $spl->LoadGenomeData(); >>  C<< my $stats = $spl->LoadGenomeData(); >>
# Line 198  Line 224 
224    
225  =back  =back
226    
 B<TO DO>  
   
 Real quality vectors instead of C<unknown> for everything.  
   
 GenomeGroup relation. (The original script took group information from the C<NMPDR> file  
 in each genome's main directory, but no such file exists anywhere in my version of the  
 data store.)  
   
227  =cut  =cut
228  #: Return Type $%;  #: Return Type $%;
229  sub LoadGenomeData {  sub LoadGenomeData {
# Line 216  Line 234 
234      # Get the genome count.      # Get the genome count.
235      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
236      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
     Trace("Beginning genome data load.") if T(2);  
237      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
238      my $loadGenome = $self->_TableLoader('Genome', $genomeCount);      my $loadGenome = $self->_TableLoader('Genome');
239      my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300);      my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly);
240      my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300);      my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly);
241      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly);
242      my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000);      my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly);
243        if ($self->{options}->{loadOnly}) {
244            Trace("Loading from existing files.") if T(2);
245        } else {
246            Trace("Generating genome data.") if T(2);
247      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
248      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
249          Trace("Loading data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
250          $loadGenome->Add("genomeIn");          $loadGenome->Add("genomeIn");
251          # The access code comes in via the genome hash.          # The access code comes in via the genome hash.
252          my $accessCode = $genomeHash->{$genomeID};          my $accessCode = $genomeHash->{$genomeID};
# Line 268  Line 289 
289              }              }
290          }          }
291      }      }
292        }
293      # Finish the loads.      # Finish the loads.
294      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
295      # Return the result.      # Return the result.
# Line 311  Line 333 
333      my $genomeCount = (keys %{$genomeFilter});      my $genomeCount = (keys %{$genomeFilter});
334      my $featureCount = $genomeCount * 4000;      my $featureCount = $genomeCount * 4000;
335      # Start the loads.      # Start the loads.
336      my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount);      my $loadCoupling = $self->_TableLoader('Coupling');
337      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000);      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);
338      my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000);      my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly);
339      my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000);      my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly);
340      my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000);      my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly);
341      Trace("Beginning coupling data load.") if T(2);      if ($self->{options}->{loadOnly}) {
342            Trace("Loading from existing files.") if T(2);
343        } else {
344            Trace("Generating coupling data.") if T(2);
345      # Loop through the genomes found.      # Loop through the genomes found.
346      for my $genome (sort keys %{$genomeFilter}) {      for my $genome (sort keys %{$genomeFilter}) {
347          Trace("Generating coupling data for $genome.") if T(3);          Trace("Generating coupling data for $genome.") if T(3);
# Line 389  Line 414 
414              }              }
415          }          }
416      }      }
417        }
418      # All done. Finish the load.      # All done. Finish the load.
419      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
420      return $retVal;      return $retVal;
# Line 426  Line 452 
452      my ($self) = @_;      my ($self) = @_;
453      # Get the FIG object.      # Get the FIG object.
454      my $fig = $self->{fig};      my $fig = $self->{fig};
     # Find out if this is a limited run.  
     my $limited = $self->{options}->{limitedFeatures};  
455      # Get the table of genome IDs.      # Get the table of genome IDs.
456      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
457      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
458      my $loadFeature = $self->_TableLoader('Feature', $featureCount);      my $loadFeature = $self->_TableLoader('Feature');
459      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly);
460      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
461      my ($loadFeatureLink, $loadFeatureTranslation, $loadFeatureUpstream);      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
462      if (! $limited) {      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
463          $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
         $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);  
         $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);  
     }  
464      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
465      # locations.      # locations.
466      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
467      Trace("Beginning feature data load.") if T(2);      if ($self->{options}->{loadOnly}) {
468            Trace("Loading from existing files.") if T(2);
469        } else {
470            Trace("Generating feature data.") if T(2);
471      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
472      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
473          Trace("Loading features for genome $genomeID.") if T(3);          Trace("Loading features for genome $genomeID.") if T(3);
# Line 463  Line 485 
485              for my $alias ($fig->feature_aliases($featureID)) {              for my $alias ($fig->feature_aliases($featureID)) {
486                  $loadFeatureAlias->Put($featureID, $alias);                  $loadFeatureAlias->Put($featureID, $alias);
487              }              }
             # The next stuff is for a full load only.  
             if (! $limited) {  
488                  # Get the links.                  # Get the links.
489                  my @links = $fig->fid_links($featureID);                  my @links = $fig->fid_links($featureID);
490                  for my $link (@links) {                  for my $link (@links) {
# Line 483  Line 503 
503                          $loadFeatureUpstream->Put($featureID, $upstream);                          $loadFeatureUpstream->Put($featureID, $upstream);
504                      }                      }
505                  }                  }
             }  
506              # This part is the roughest. We need to relate the features to contig              # This part is the roughest. We need to relate the features to contig
507              # locations, and the locations must be split so that none of them exceed              # locations, and the locations must be split so that none of them exceed
508              # the maximum segment size. This simplifies the genes_in_region processing              # the maximum segment size. This simplifies the genes_in_region processing
# Line 512  Line 531 
531              }              }
532          }          }
533      }      }
534        }
535      # Finish the loads.      # Finish the loads.
536      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
537      return $retVal;      return $retVal;
# Line 548  Line 568 
568      my $fig = $self->{fig};      my $fig = $self->{fig};
569      # Get the table of genome IDs.      # Get the table of genome IDs.
570      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
571      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
572      my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf',      my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf');
573                                                             $featureCount * $genomeCount);      if ($self->{options}->{loadOnly}) {
574      Trace("Beginning BBH load.") if T(2);          Trace("Loading from existing files.") if T(2);
575        } else {
576            Trace("Generating BBH data.") if T(2);
577      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
578      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
579          $loadIsBidirectionalBestHitOf->Add("genomeIn");          $loadIsBidirectionalBestHitOf->Add("genomeIn");
# Line 579  Line 599 
599              }              }
600          }          }
601      }      }
602        }
603      # Finish the loads.      # Finish the loads.
604      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
605      return $retVal;      return $retVal;
# Line 615  Line 636 
636      GenomeSubset      GenomeSubset
637      HasGenomeSubset      HasGenomeSubset
638      Catalyzes      Catalyzes
639        Diagram
640        RoleOccursIn
641    
642  =over 4  =over 4
643    
# Line 637  Line 660 
660      # Get the subsystem hash. This lists the subsystems we'll process.      # Get the subsystem hash. This lists the subsystems we'll process.
661      my $subsysHash = $self->{subsystems};      my $subsysHash = $self->{subsystems};
662      my @subsysIDs = sort keys %{$subsysHash};      my @subsysIDs = sort keys %{$subsysHash};
663      my $subsysCount = @subsysIDs;      # Get the map list.
664      my $genomeCount = (keys %{$genomeHash});      my @maps = $fig->all_maps;
     my $featureCount = $genomeCount * 4000;  
665      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
666      my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);      my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly);
667      my $loadRole = $self->_TableLoader('Role', $featureCount * 6);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly);
668      my $loadRoleEC = $self->_TableLoader('RoleEC', $featureCount * 6);      my $loadSubsystem = $self->_TableLoader('Subsystem');
669      my $loadCatalyzes = $self->_TableLoader('Catalyzes', $genomeCount * $featureCount);      my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly);
670      my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);      my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly);
671      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);      my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly);
672      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);      my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly);
673      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly);
674      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly);
675      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly);
676      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly);
677      my $loadRoleSubset = $self->_TableLoader('RoleSubset', $subsysCount * 50);      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly);
678      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $subsysCount * 50);      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly);
679      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $featureCount * $genomeCount);      my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly);
680      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $featureCount * $genomeCount);      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly);
681      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $subsysCount * 50);      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly);
682      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $subsysCount * 50);      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly);
683      Trace("Beginning subsystem data load.") if T(2);      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly);
684        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly);
685        if ($self->{options}->{loadOnly}) {
686            Trace("Loading from existing files.") if T(2);
687        } else {
688            Trace("Generating subsystem data.") if T(2);
689      # This hash will contain the role for each EC. When we're done, this      # This hash will contain the role for each EC. When we're done, this
690      # information will be used to generate the Catalyzes table.      # information will be used to generate the Catalyzes table.
691      my %ecToRoles = ();      my %ecToRoles = ();
# Line 794  Line 821 
821              }              }
822          }          }
823      }      }
824            # Now we loop through the diagrams. We need to create the diagram records
825            # and link each diagram to its roles. Note that only roles which occur
826            # in subsystems (and therefore appear in the %ecToRoles hash) are
827            # included.
828            for my $map (@maps) {
829                Trace("Loading diagram $map.") if T(3);
830                # Get the diagram's descriptive name.
831                my $name = $fig->map_name($map);
832                $loadDiagram->Put($map, $name);
833                # Now we need to link all the map's roles to it.
834                # A hash is used to prevent duplicates.
835                my %roleHash = ();
836                for my $role ($fig->map_to_ecs($map)) {
837                    if (exists $ecToRoles{$role} && ! $roleHash{$role}) {
838                        $loadRoleOccursIn->Put($ecToRoles{$role}, $map);
839                        $roleHash{$role} = 1;
840                    }
841                }
842            }
843      # Before we leave, we must create the Catalyzes table. We start with the reactions,      # Before we leave, we must create the Catalyzes table. We start with the reactions,
844      # then use the "ecToRoles" table to convert EC numbers to role IDs.      # then use the "ecToRoles" table to convert EC numbers to role IDs.
845      my @reactions = $fig->all_reactions();      my @reactions = $fig->all_reactions();
# Line 807  Line 853 
853              }              }
854          }          }
855      }      }
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
 =head3 LoadDiagramData  
   
 C<< my $stats = $spl->LoadDiagramData(); >>  
   
 Load the diagram data from FIG into Sprout.  
   
 Diagrams are used to organize functional roles. The diagram shows the  
 connections between chemicals that interact with a subsystem.  
   
 The following relations are loaded by this method.  
   
     Diagram  
     RoleOccursIn  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadDiagramData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the map list.  
     my @maps = $fig->all_maps;  
     my $mapCount = @maps;  
     my $genomeCount = (keys %{$self->{genomes}});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);  
     my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);  
     Trace("Beginning diagram data load.") if T(2);  
     # Loop through the diagrams.  
     for my $map ($fig->all_maps) {  
         Trace("Loading diagram $map.") if T(3);  
         # Get the diagram's descriptive name.  
         my $name = $fig->map_name($map);  
         $loadDiagram->Put($map, $name);  
         # Now we need to link all the map's roles to it.  
         # A hash is used to prevent duplicates.  
         my %roleHash = ();  
         for my $role ($fig->map_to_ecs($map)) {  
             if (! $roleHash{$role}) {  
                 $loadRoleOccursIn->Put($role, $map);  
                 $roleHash{$role} = 1;  
             }  
         }  
856      }      }
857      # Finish the load.      # Finish the load.
858      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 907  Line 895 
895      my $fig = $self->{fig};      my $fig = $self->{fig};
896      # Get the genome hash.      # Get the genome hash.
897      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
898      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
899      my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500);      my $loadProperty = $self->_TableLoader('Property');
900      my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500);      my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly);
901      Trace("Beginning property data load.") if T(2);      if ($self->{options}->{loadOnly}) {
902            Trace("Loading from existing files.") if T(2);
903        } else {
904            Trace("Generating property data.") if T(2);
905      # Create a hash for storing property IDs.      # Create a hash for storing property IDs.
906      my %propertyKeys = ();      my %propertyKeys = ();
907      my $nextID = 1;      my $nextID = 1;
908      # Loop through the genomes.      # Loop through the genomes.
909      for my $genomeID (keys %{$genomeHash}) {      for my $genomeID (keys %{$genomeHash}) {
910          $loadProperty->Add("genomeIn");          $loadProperty->Add("genomeIn");
911                Trace("Generating properties for $genomeID.") if T(3);
912          # Get the genome's features. The feature ID is the first field in the          # Get the genome's features. The feature ID is the first field in the
913          # tuples returned by "all_features_detailed". We use "all_features_detailed"          # tuples returned by "all_features_detailed". We use "all_features_detailed"
914          # rather than "all_features" because we want all features regardless of type.          # rather than "all_features" because we want all features regardless of type.
915          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};
916                my $featureCount = 0;
917                my $propertyCount = 0;
918          # Loop through the features, creating HasProperty records.          # Loop through the features, creating HasProperty records.
919          for my $fid (@features) {          for my $fid (@features) {
             $loadProperty->Add("featureIn");  
920              # Get all attributes for this feature. We do this one feature at a time              # Get all attributes for this feature. We do this one feature at a time
921              # to ensure we do not get any genome attributes.              # to ensure we do not get any genome attributes.
922              my @attributeList = $fig->get_attributes($fid, '', '', '');              my @attributeList = $fig->get_attributes($fid, '', '', '');
923                    if (scalar @attributeList) {
924                        $featureCount++;
925                    }
926              # Loop through the attributes.              # Loop through the attributes.
927              for my $tuple (@attributeList) {              for my $tuple (@attributeList) {
928                        $propertyCount++;
929                  # Get this attribute value's data. Note that we throw away the FID,                  # Get this attribute value's data. Note that we throw away the FID,
930                  # since it will always be the same as the value of "$fid".                  # since it will always be the same as the value of "$fid".
931                  my (undef, $key, $value, $url) = @{$tuple};                  my (undef, $key, $value, $url) = @{$tuple};
# Line 951  Line 947 
947                  $loadHasProperty->Put($fid, $propertyID, $url);                  $loadHasProperty->Put($fid, $propertyID, $url);
948              }              }
949          }          }
950                # Update the statistics.
951                Trace("$propertyCount attributes processed for $featureCount features.") if T(3);
952                $loadHasProperty->Add("featuresIn", $featureCount);
953                $loadHasProperty->Add("propertiesIn", $propertyCount);
954            }
955      }      }
956      # Finish the load.      # Finish the load.
957      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 991  Line 992 
992      my $fig = $self->{fig};      my $fig = $self->{fig};
993      # Get the genome hash.      # Get the genome hash.
994      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
995      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
996      my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000);      my $loadAnnotation = $self->_TableLoader('Annotation');
997      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly);
998      my $loadSproutUser = $self->_TableLoader('SproutUser', 100);      my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly);
999      my $loadUserAccess = $self->_TableLoader('UserAccess', 1000);      my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly);
1000      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly);
1001      Trace("Beginning annotation data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1002            Trace("Loading from existing files.") if T(2);
1003        } else {
1004            Trace("Generating annotation data.") if T(2);
1005      # Create a hash of user names. We'll use this to prevent us from generating duplicate      # Create a hash of user names. We'll use this to prevent us from generating duplicate
1006      # user records.      # user records.
1007      my %users = ( FIG => 1, master => 1 );      my %users = ( FIG => 1, master => 1 );
# Line 1019  Line 1022 
1022              # Create a hash of timestamps. We use this to prevent duplicate time stamps              # Create a hash of timestamps. We use this to prevent duplicate time stamps
1023              # from showing up for a single PEG's annotations.              # from showing up for a single PEG's annotations.
1024              my %seenTimestamps = ();              my %seenTimestamps = ();
1025              # Check for a functional assignment.                  # Loop through the annotations.
             my $func = $fig->function_of($peg);  
             if ($func) {  
                 # If this is NOT a hypothetical assignment, we create an  
                 # assignment annotation for it.  
                 if (! FIG::hypo($peg)) {  
                     # Note that we double the slashes so that what goes into the database is  
                     # a new-line escape sequence rather than an actual new-line.  
                     $loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func");  
                     $loadIsTargetOfAnnotation->Put($peg, "$peg:$time");  
                     $loadMadeAnnotation->Put("FIG", "$peg:$time");  
                     # Denote we've seen this timestamp.  
                     $seenTimestamps{$time} = 1;  
                 }  
             }  
             # Now loop through the real annotations.  
1026              for my $tuple ($fig->feature_annotations($peg, "raw")) {              for my $tuple ($fig->feature_annotations($peg, "raw")) {
1027                  my ($fid, $timestamp, $user, $text) = @{$tuple};                  my ($fid, $timestamp, $user, $text) = @{$tuple};
1028                  # Here we fix up the annotation text. "\r" is removed,                  # Here we fix up the annotation text. "\r" is removed,
# Line 1073  Line 1061 
1061              }              }
1062          }          }
1063      }      }
1064        }
1065      # Finish the load.      # Finish the load.
1066      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1067      return $retVal;      return $retVal;
# Line 1113  Line 1102 
1102      my $fig = $self->{fig};      my $fig = $self->{fig};
1103      # Get the genome hash.      # Get the genome hash.
1104      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1105      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1106      my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4);      my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly);
1107      my $loadSource = $self->_TableLoader('Source', $genomeCount * 4);      my $loadSource = $self->_TableLoader('Source');
1108      my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8);      my $loadSourceURL = $self->_TableLoader('SourceURL');
1109      Trace("Beginning source data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1110            Trace("Loading from existing files.") if T(2);
1111        } else {
1112            Trace("Generating annotation data.") if T(2);
1113      # Create hashes to collect the Source information.      # Create hashes to collect the Source information.
1114      my %sourceURL = ();      my %sourceURL = ();
1115      my %sourceDesc = ();      my %sourceDesc = ();
# Line 1148  Line 1139 
1139      for my $sourceID (keys %sourceDesc) {      for my $sourceID (keys %sourceDesc) {
1140          $loadSource->Put($sourceID, $sourceDesc{$sourceID});          $loadSource->Put($sourceID, $sourceDesc{$sourceID});
1141      }      }
1142        }
1143      # Finish the load.      # Finish the load.
1144      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1145      return $retVal;      return $retVal;
# Line 1187  Line 1179 
1179      my $fig = $self->{fig};      my $fig = $self->{fig};
1180      # Get the genome hash.      # Get the genome hash.
1181      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1182      # Convert the genome hash. We'll get the genus and species for each genome and make      # Convert the genome hash. We'll get the genus and species for each genome and make
1183      # it the key.      # it the key.
1184      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
1185      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1186      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000);      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
1187      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
1188      Trace("Beginning external data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1189            Trace("Loading from existing files.") if T(2);
1190        } else {
1191            Trace("Generating external data.") if T(2);
1192      # We loop through the files one at a time. First, the organism file.      # We loop through the files one at a time. First, the organism file.
1193      Open(\*ORGS, "<$FIG_Config::global/ext_org.table");      Open(\*ORGS, "<$FIG_Config::global/ext_org.table");
1194      my $orgLine;      my $orgLine;
# Line 1224  Line 1218 
1218              $loadExternalAliasFunc->Put(@funcFields[0,1]);              $loadExternalAliasFunc->Put(@funcFields[0,1]);
1219          }          }
1220      }      }
1221        }
1222      # Finish the load.      # Finish the load.
1223      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1224      return $retVal;      return $retVal;
# Line 1264  Line 1259 
1259      my ($self) = @_;      my ($self) = @_;
1260      # Get the FIG object.      # Get the FIG object.
1261      my $fig = $self->{fig};      my $fig = $self->{fig};
     # Get the genome hash.  
     my $genomeHash = $self->{genomes};  
     my $genomeCount = (keys %{$genomeHash});  
1262      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1263      my $loadReaction = $self->_TableLoader('Reaction', $genomeCount * 4000);      my $loadReaction = $self->_TableLoader('Reaction');
1264      my $loadReactionURL = $self->_TableLoader('ReactionURL', $genomeCount * 4000);      my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly);
1265      my $loadCompound = $self->_TableLoader('Compound', $genomeCount * 4000);      my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly);
1266      my $loadCompoundName = $self->_TableLoader('CompoundName', $genomeCount * 8000);      my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly);
1267      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $genomeCount * 4000);      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly);
1268      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $genomeCount * 12000);      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly);
1269      Trace("Beginning reaction/compound data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1270            Trace("Loading from existing files.") if T(2);
1271        } else {
1272            Trace("Generating annotation data.") if T(2);
1273      # First we create the compounds.      # First we create the compounds.
1274      my @compounds = $fig->all_compounds();      my @compounds = $fig->all_compounds();
1275      for my $cid (@compounds) {      for my $cid (@compounds) {
# Line 1323  Line 1318 
1318              }              }
1319          }          }
1320      }      }
1321        }
1322      # Finish the load.      # Finish the load.
1323      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1324      return $retVal;      return $retVal;
# Line 1358  Line 1354 
1354      my $fig = $self->{fig};      my $fig = $self->{fig};
1355      # Get the genome hash.      # Get the genome hash.
1356      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1357      # Create a load object for the table we're loading.      # Create a load object for the table we're loading.
1358      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4);      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups');
1359      Trace("Beginning group data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1360            Trace("Loading from existing files.") if T(2);
1361        } else {
1362            Trace("Generating group data.") if T(2);
1363      # Loop through the genomes.      # Loop through the genomes.
1364      my $line;      my $line;
1365      for my $genomeID (keys %{$genomeHash}) {      for my $genomeID (keys %{$genomeHash}) {
# Line 1377  Line 1375 
1375          }          }
1376          close TMP;          close TMP;
1377      }      }
1378        }
1379      # Finish the load.      # Finish the load.
1380      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1381      return $retVal;      return $retVal;
# Line 1398  Line 1397 
1397    
1398  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
1399    
1400  =item rowCount (optional)  =item ignore
1401    
1402  Estimated maximum number of rows in the table.  TRUE if the table should be ignored entirely, else FALSE.
1403    
1404  =item RETURN  =item RETURN
1405    
# Line 1412  Line 1411 
1411    
1412  sub _TableLoader {  sub _TableLoader {
1413      # Get the parameters.      # Get the parameters.
1414      my ($self, $tableName, $rowCount) = @_;      my ($self, $tableName, $ignore) = @_;
1415      # Create the load object.      # Create the load object.
1416      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount);      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly,
1417                                   $ignore);
1418      # Cache it in the loader list.      # Cache it in the loader list.
1419      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
1420      # Return it to the caller.      # Return it to the caller.
# Line 1451  Line 1451 
1451      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
1452      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads restartable.
1453      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
1454          # Trace the fact that we're cleaning up.          # Get the relation name.
1455          my $relName = $loader->RelName;          my $relName = $loader->RelName;
1456          Trace("Finishing load for $relName.") if T(2);          # Check the ignore flag.
1457            if ($loader->Ignore) {
1458                Trace("Relation $relName not loaded.") if T(2);
1459            } else {
1460                # Here we really need to finish.
1461                Trace("Finishing $relName.") if T(2);
1462          my $stats = $loader->Finish();          my $stats = $loader->Finish();
1463          if ($self->{options}->{dbLoad}) {              if ($self->{options}->{dbLoad} && ! $loader->Ignore) {
1464              # Here we want to use the load file just created to load the database.              # Here we want to use the load file just created to load the database.
1465              Trace("Loading relation $relName.") if T(2);              Trace("Loading relation $relName.") if T(2);
1466              my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]);              my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]);
# Line 1465  Line 1470 
1470          $retVal->Accumulate($stats);          $retVal->Accumulate($stats);
1471          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
1472      }      }
1473        }
1474      # Return the load statistics.      # Return the load statistics.
1475      return $retVal;      return $retVal;
1476  }  }

Legend:
Removed from v.1.20  
changed lines
  Added in v.1.26

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3