[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.37, Sat May 27 02:01:24 2006 UTC revision 1.51, Mon Jul 10 21:12:51 2006 UTC
# Line 30  Line 30 
30      $stats->Accumulate($spl->LoadFeatureData());      $stats->Accumulate($spl->LoadFeatureData());
31      print $stats->Show();      print $stats->Show();
32    
 This module makes use of the internal Sprout property C<_erdb>.  
   
33  It is worth noting that the FIG object does not need to be a real one. Any object  It is worth noting that the FIG object does not need to be a real one. Any object
34  that implements the FIG methods for data retrieval could be used. So, for example,  that implements the FIG methods for data retrieval could be used. So, for example,
35  this object could be used to copy data from one Sprout database to another, or  this object could be used to copy data from one Sprout database to another, or
# Line 175  Line 173 
173                    subsystems => \%subsystems,                    subsystems => \%subsystems,
174                    sprout => $sprout,                    sprout => $sprout,
175                    loadDirectory => $directory,                    loadDirectory => $directory,
176                    erdb => $sprout->{_erdb},                    erdb => $sprout,
177                    loaders => [],                    loaders => [],
178                    options => $options                    options => $options
179                   };                   };
# Line 342  Line 340 
340      my $fig = $self->{fig};      my $fig = $self->{fig};
341      # Get the genome hash.      # Get the genome hash.
342      my $genomeFilter = $self->{genomes};      my $genomeFilter = $self->{genomes};
343      my $genomeCount = (keys %{$genomeFilter});      # Set up an ID counter for the PCHs.
344      my $featureCount = $genomeCount * 4000;      my $pchID = 0;
345      # Start the loads.      # Start the loads.
346      my $loadCoupling = $self->_TableLoader('Coupling');      my $loadCoupling = $self->_TableLoader('Coupling');
347      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);
# Line 372  Line 370 
370                  Trace("Processing PEG $peg1 for $genome.") if T(4);                  Trace("Processing PEG $peg1 for $genome.") if T(4);
371                  # Get a list of the coupled PEGs.                  # Get a list of the coupled PEGs.
372                  my @couplings = $fig->coupled_to($peg1);                  my @couplings = $fig->coupled_to($peg1);
373                    Trace(scalar(@couplings) . " couplings found for $peg1.") if T(4);
374                  # For each coupled PEG, we need to verify that a coupling already                  # For each coupled PEG, we need to verify that a coupling already
375                  # exists. If not, we have to create one.                  # exists. If not, we have to create one.
376                  for my $coupleData (@couplings) {                  for my $coupleData (@couplings) {
377                      my ($peg2, $score) = @{$coupleData};                      my ($peg2, $score) = @{$coupleData};
378                      # Compute the coupling ID.                      # Compute the coupling ID.
379                      my $coupleID = Sprout::CouplingID($peg1, $peg2);                      my $coupleID = $self->{erdb}->CouplingID($peg1, $peg2);
380                      if (! exists $dupHash{$coupleID}) {                      if (! exists $dupHash{$coupleID}) {
381                          $loadCoupling->Add("couplingIn");                          $loadCoupling->Add("couplingIn");
382                          # Here we have a new coupling to store in the load files.                          # Here we have a new coupling to store in the load files.
# Line 413  Line 412 
412                              }                              }
413                          }                          }
414                          for my $evidenceID (keys %evidenceMap) {                          for my $evidenceID (keys %evidenceMap) {
415                                # Get the ID for this evidence.
416                                $pchID++;
417                              # Create the evidence record.                              # Create the evidence record.
418                              my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};                              my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};
419                              $loadPCH->Put($evidenceID, $usage);                              $loadPCH->Put($pchID, $usage);
420                              # Connect it to the coupling.                              # Connect it to the coupling.
421                              $loadIsEvidencedBy->Put($coupleID, $evidenceID);                              $loadIsEvidencedBy->Put($coupleID, $pchID);
422                              # Connect it to the features.                              # Connect it to the features.
423                              $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);                              $loadUsesAsEvidence->Put($pchID, $peg3, 1);
424                              $loadUsesAsEvidence->Put($evidenceID, $peg4, 2);                              $loadUsesAsEvidence->Put($pchID, $peg4, 2);
425                          }                          }
426                      }                      }
427                  }                  }
# Line 636  Line 637 
637  The following relations are loaded by this method.  The following relations are loaded by this method.
638    
639      Subsystem      Subsystem
640        SubsystemClass
641      Role      Role
642      RoleEC      RoleEC
643      SSCell      SSCell
# Line 698  Line 700 
700      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly);      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly);
701      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly);      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly);
702      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly);      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly);
703        my $loadSubsystemClass = $self->_TableLoader('SubsystemClass', $self->PrimaryOnly);
704      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
705          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
706      } else {      } else {
# Line 723  Line 726 
726                  my $curator = $sub->get_curator();                  my $curator = $sub->get_curator();
727                  my $notes = $sub->get_notes();                  my $notes = $sub->get_notes();
728                  $loadSubsystem->Put($subsysID, $curator, $notes);                  $loadSubsystem->Put($subsysID, $curator, $notes);
729                    my $class = $fig->subsystem_classification($subsysID);
730                    if ($class) {
731                        $loadSubsystemClass->Put($subsysID, $class);
732                    }
733                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
734                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
735                      # Connect to this role.                      # Connect to this role.
# Line 787  Line 794 
794                          if ($pegCount > 0) {                          if ($pegCount > 0) {
795                              Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);                              Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
796                              $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);                              $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
                             # Partition the PEGs found into clusters.  
                             my @clusters = $fig->compute_clusters(\@pegsFound, $sub);  
797                              # Create a hash mapping PEG IDs to cluster numbers.                              # Create a hash mapping PEG IDs to cluster numbers.
798                              # We default to -1 for all of them.                              # We default to -1 for all of them.
799                              my %clusterOf = map { $_ => -1 } @pegsFound;                              my %clusterOf = map { $_ => -1 } @pegsFound;
800                                # Partition the PEGs found into clusters.
801                                my @clusters = $fig->compute_clusters([keys %clusterOf], $sub);
802                              for (my $i = 0; $i <= $#clusters; $i++) {                              for (my $i = 0; $i <= $#clusters; $i++) {
803                                  my $subList = $clusters[$i];                                  my $subList = $clusters[$i];
804                                  for my $peg (@{$subList}) {                                  for my $peg (@{$subList}) {
# Line 1034  Line 1041 
1041          # Loop through the genomes.          # Loop through the genomes.
1042          for my $genomeID (sort keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
1043              Trace("Processing $genomeID.") if T(3);              Trace("Processing $genomeID.") if T(3);
1044                # Create a hash of timestamps. We use this to prevent duplicate time stamps
1045                # from showing up for a single PEG's annotations.
1046                my %seenTimestamps = ();
1047              # Get the genome's annotations.              # Get the genome's annotations.
1048              my @annotations = $fig->read_all_annotations($genomeID);              my @annotations = $fig->read_all_annotations($genomeID);
1049              Trace("Processing annotations.") if T(2);              Trace("Processing annotations.") if T(2);
1050              for my $tuple (@annotations) {              for my $tuple (@annotations) {
                 # Create a hash of timestamps. We use this to prevent duplicate time stamps  
                 # from showing up for a single PEG's annotations.  
                 my %seenTimestamps = ();  
1051                  # Get the annotation tuple.                  # Get the annotation tuple.
1052                  my ($peg, $timestamp, $user, $text) = @{$tuple};                  my ($peg, $timestamp, $user, $text) = @{$tuple};
1053                  # Here we fix up the annotation text. "\r" is removed,                  # Here we fix up the annotation text. "\r" is removed,
1054                  # and "\t" and "\n" are escaped. Note we use the "s"                  # and "\t" and "\n" are escaped. Note we use the "gs"
1055                  # modifier so that new-lines inside the text do not                  # modifier so that new-lines inside the text do not
1056                  # stop the substitution search.                  # stop the substitution search.
1057                  $text =~ s/\r//gs;                  $text =~ s/\r//gs;
# Line 1398  Line 1405 
1405      return $retVal;      return $retVal;
1406  }  }
1407    
1408    =head3 LoadSynonymData
1409    
1410    C<< my $stats = $spl->LoadSynonymData(); >>
1411    
1412    Load the synonym groups into Sprout.
1413    
1414    The following relations are loaded by this method.
1415    
1416        SynonymGroup
1417        IsSynonymGroupFor
1418    
1419    The source information for these relations is taken from the C<maps_to_id> method
1420    of the B<FIG> object. The process starts from the features, so it is possible
1421    that there will be duplicates in the SynonymGroup load file, since the relationship
1422    is one-to-many toward the features. The automatic sort on primary entity relations
1423    will fix this for us.
1424    
1425    =over 4
1426    
1427    =item RETURNS
1428    
1429    Returns a statistics object for the loads.
1430    
1431    =back
1432    
1433    =cut
1434    #: Return Type $%;
1435    sub LoadSynonymData {
1436        # Get this object instance.
1437        my ($self) = @_;
1438        # Get the FIG object.
1439        my $fig = $self->{fig};
1440        # Get the genome hash.
1441        my $genomeHash = $self->{genomes};
1442        # Create load objects for the two tables we're loading.
1443        my $loadSynonymGroup = $self->_TableLoader('SynonymGroup');
1444        my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor');
1445        if ($self->{options}->{loadOnly}) {
1446            Trace("Loading from existing files.") if T(2);
1447        } else {
1448            Trace("Generating synonym group data.") if T(2);
1449            # Loop through the genomes.
1450            for my $genomeID (sort keys %{$genomeHash}) {
1451                Trace("Processing $genomeID.") if T(3);
1452                # Get all of the features for this genome. The only method that does this is
1453                # all_features_detailed, which returns extra baggage that we discard.
1454                my $featureData = $fig->all_features_detailed($genomeID);
1455                my @fids = map { $_->[0] } @{$featureData};
1456                Trace(scalar(@fids) . " features found for genome $genomeID.") if T(3);
1457                # Loop through the feature IDs.
1458                for my $fid (@fids) {
1459                    # Get the group for this feature.
1460                    my $synonym = $fig->maps_to_id($fid);
1461                    # Only proceed if the synonym is a real group.
1462                    if ($synonym ne $fid) {
1463                        $loadSynonymGroup->Put($synonym);
1464                        $loadIsSynonymGroupFor->Put($synonym, $fid);
1465                    }
1466                }
1467            }
1468        }
1469        # Finish the load.
1470        my $retVal = $self->_FinishAll();
1471        return $retVal;
1472    }
1473    
1474    
1475  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1476    
1477  =head3 TableLoader  =head3 TableLoader
# Line 1465  Line 1539 
1539      my $retVal = Stats->new();      my $retVal = Stats->new();
1540      # Get the loader list.      # Get the loader list.
1541      my $loadList = $self->{loaders};      my $loadList = $self->{loaders};
1542        # Create a hash to hold the statistics objects, keyed on relation name.
1543        my %loaderHash = ();
1544      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
1545      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads more restartable.
1546      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
1547          # Get the relation name.          # Get the relation name.
1548          my $relName = $loader->RelName;          my $relName = $loader->RelName;
# Line 1477  Line 1553 
1553              # Here we really need to finish.              # Here we really need to finish.
1554              Trace("Finishing $relName.") if T(2);              Trace("Finishing $relName.") if T(2);
1555              my $stats = $loader->Finish();              my $stats = $loader->Finish();
1556                $loaderHash{$relName} = $stats;
1557            }
1558        }
1559        # Now we loop through again, actually loading the tables. We want to finish before
1560        # loading so that if something goes wrong at this point, all the load files are usable
1561        # and we don't have to redo all that work.
1562        for my $relName (sort keys %loaderHash) {
1563            # Get the statistics for this relation.
1564            my $stats = $loaderHash{$relName};
1565            # Check for a database load.
1566              if ($self->{options}->{dbLoad}) {              if ($self->{options}->{dbLoad}) {
1567                  # Here we want to use the load file just created to load the database.                  # Here we want to use the load file just created to load the database.
1568                  Trace("Loading relation $relName.") if T(2);                  Trace("Loading relation $relName.") if T(2);
# Line 1487  Line 1573 
1573              $retVal->Accumulate($stats);              $retVal->Accumulate($stats);
1574              Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);              Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
1575          }          }
     }  
1576      # Return the load statistics.      # Return the load statistics.
1577      return $retVal;      return $retVal;
1578  }  }

Legend:
Removed from v.1.37  
changed lines
  Added in v.1.51

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3