[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.17, Mon Sep 19 01:46:51 2005 UTC revision 1.18, Wed Oct 12 03:17:58 2005 UTC
# Line 10  Line 10 
10      use Sprout;      use Sprout;
11      use Stats;      use Stats;
12      use BasicLocation;      use BasicLocation;
13        use HTML;
14    
15  =head1 Sprout Load Methods  =head1 Sprout Load Methods
16    
# Line 367  Line 368 
368                              # We store this evidence in the hash if the usage                              # We store this evidence in the hash if the usage
369                              # is nonzero or no prior evidence has been found. This                              # is nonzero or no prior evidence has been found. This
370                              # insures that if there is duplicate evidence, we                              # insures that if there is duplicate evidence, we
371                              # at least keep the meaningful ones. Only evidence is                              # at least keep the meaningful ones. Only evidence in
372                              # the hash makes it to the output.                              # the hash makes it to the output.
373                              if ($usage || ! exists $evidenceMap{$evidenceKey}) {                              if ($usage || ! exists $evidenceMap{$evidenceKey}) {
374                                  $evidenceMap{$evidenceKey} = $evidenceData;                                  $evidenceMap{$evidenceKey} = $evidenceData;
# Line 606  Line 607 
607      OccursInSubsystem      OccursInSubsystem
608      ParticipatesIn      ParticipatesIn
609      HasSSCell      HasSSCell
610        Catalyzes
611        Reaction
612        ConsistsOfRoles
613        RoleSubset
614        HasRoleSubset
615        ConsistsOfGenomes
616        GenomeSubset
617        HasGenomeSubset
618    
619  =over 4  =over 4
620    
# Line 645  Line 654 
654      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);
655      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);
656      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);
657        my $loadReaction = $self->_TableLoader('Reaction', $featureCount * $genomeCount);
658        my $loadCatalyzes = $self->_TableLoader('Catalyzes', $featureCount * $genomeCount);
659        my $loadRoleSubset = $self->_TableLoader('RoleSubset', $subsysCount * 50);
660        my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $subsysCount * 50);
661        my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $featureCount * $genomeCount);
662        my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $featureCount * $genomeCount);
663        my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $subsysCount * 50);
664        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $subsysCount * 50);
665      Trace("Beginning subsystem data load.") if T(2);      Trace("Beginning subsystem data load.") if T(2);
666        # The reaction hash will contain a list of roles for each reaction. When we're done,
667        # a complicated sort and merge will be used to generate the Reaction and Catalyzes
668        # tables.
669        my %reactionsToRoles = ();
670      # Loop through the subsystems. Our first task will be to create the      # Loop through the subsystems. Our first task will be to create the
671      # roles. We do this by looping through the subsystems and creating a      # roles. We do this by looping through the subsystems and creating a
672      # role hash. The hash tracks each role ID so that we don't create      # role hash. The hash tracks each role ID so that we don't create
673      # duplicates. As we move along, we'll connect the roles and subsystems.      # duplicates. As we move along, we'll connect the roles and subsystems
674        # and remember the reactions.
675      my ($genomeID, $roleID);      my ($genomeID, $roleID);
676      my %roleData = ();      my %roleData = ();
677      for my $subsysID (@subsysIDs) {      for my $subsysID (@subsysIDs) {
678          Trace("Creating subsystem $subsysID.") if T(3);          Trace("Creating subsystem $subsysID.") if T(3);
679          $loadSubsystem->Add("subsystemIn");          $loadSubsystem->Add("subsystemIn");
         # Create the subsystem record.  
         $loadSubsystem->Put($subsysID);  
680          # Get the subsystem object.          # Get the subsystem object.
681          my $sub = $fig->get_subsystem($subsysID);          my $sub = $fig->get_subsystem($subsysID);
682          # Connect it to its roles.          # Get its reaction hash.
683            my $reactionHash = $sub->get_reactions();
684            # Create the subsystem record.
685            my $curator = $sub->get_curator();
686            my $notes = $sub->get_notes();
687            $loadSubsystem->Put($subsysID, $curator, $notes);
688            # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
689          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
690                # Connect to this role.
691              $loadOccursInSubsystem->Add("roleIn");              $loadOccursInSubsystem->Add("roleIn");
692              $loadOccursInSubsystem->Put($roleID, $subsysID);              $loadOccursInSubsystem->Put($roleID, $subsysID, $col);
693                # If it's a new role, add it to the role table.
694              if (! exists $roleData{$roleID}) {              if (! exists $roleData{$roleID}) {
695                  $loadRole->Put($roleID);                  # Get the role's abbreviation.
696                    my $abbr = $sub->get_role_abbr($col);
697                    # Add the role.
698                    $loadRole->Put($roleID, $abbr);
699                  $roleData{$roleID} = 1;                  $roleData{$roleID} = 1;
700                    # Add the role's reactions.
701                    my $reactions = $reactionHash->{$roleID};
702                    for my $reactionID (@{$reactions}) {
703                        if (! exists $reactionsToRoles{$reactionID}) {
704                            # Here the reaction is brand-new, so we create its reaction
705                            # record.
706                            $loadReaction->Put($reactionID, $fig->reversible($reactionID));
707                            # We also create a blank list for it in the reaction hash.
708                            $reactionsToRoles{$reactionID} = [];
709                        }
710                        # Add the role to the reaction's role list.
711                        push @{$reactionsToRoles{$reactionID}}, $roleID;
712                    }
713              }              }
714          }          }
715          # Now we create the spreadsheet for the subsystem by matching roles to          # Now we create the spreadsheet for the subsystem by matching roles to
# Line 678  Line 722 
722                  # Count the PEGs and cells found for verification purposes.                  # Count the PEGs and cells found for verification purposes.
723                  my $pegCount = 0;                  my $pegCount = 0;
724                  my $cellCount = 0;                  my $cellCount = 0;
725                    # Create a list for the PEGs we find. This list will be used
726                    # to generate cluster numbers.
727                    my @pegsFound = ();
728                    # Create a hash that maps spreadsheet IDs to PEGs. We will
729                    # use this to generate the ContainsFeature data after we have
730                    # the cluster numbers.
731                    my %cellPegs = ();
732                    # Get the genome's variant code for this subsystem.
733                    my $variantCode = $sub->get_variant_code($row);
734                  # Loop through the subsystem's roles. We use an index because it is                  # Loop through the subsystem's roles. We use an index because it is
735                  # part of the spreadsheet cell ID.                  # part of the spreadsheet cell ID.
736                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
# Line 692  Line 745 
745                          $loadIsGenomeOf->Put($genomeID, $cellID);                          $loadIsGenomeOf->Put($genomeID, $cellID);
746                          $loadIsRoleOf->Put($roleID, $cellID);                          $loadIsRoleOf->Put($roleID, $cellID);
747                          $loadHasSSCell->Put($subsysID, $cellID);                          $loadHasSSCell->Put($subsysID, $cellID);
748                          # Attach the features to it.                          # Remember its features.
749                          for my $pegID (@pegs) {                          push @pegsFound, @pegs;
750                              $loadContainsFeature->Put($cellID, $pegID);                          $cellPegs{$cellID} = \@pegs;
751                              $pegCount++;                          $pegCount += @pegs;
752                          }                          }
753                      }                      }
754                  }                  # If we found some cells for this genome, we need to compute clusters and
755                  # If we found some cells for this genome, denote it participates in the                  # denote it participates in the subsystem.
                 # subsystem.  
756                  if ($pegCount > 0) {                  if ($pegCount > 0) {
757                      Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);                      Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
758                      $loadParticipatesIn->Put($genomeID, $subsysID);                      $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
759                        # Partition the PEGs found into clusters.
760                        my @clusters = $fig->compute_clusters(\@pegsFound, $sub);
761                        # Create a hash mapping PEG IDs to cluster numbers.
762                        # We default to -1 for all of them.
763                        my %clusterOf = map { $_ => -1 } @pegsFound;
764                        for (my $i = 0; $i <= $#clusters; $i++) {
765                            my $subList = $clusters[$i];
766                            for my $peg (@{$subList}) {
767                                $clusterOf{$peg} = $i;
768                            }
769                        }
770                        # Create the ContainsFeature data.
771                        for my $cellID (keys %cellPegs) {
772                            my $cellList = $cellPegs{$cellID};
773                            for my $cellPeg (@$cellList) {
774                                $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg});
775                            }
776                        }
777                  }                  }
778              }              }
779          }          }
780            # Now we need to generate the subsets. The subset names must be concatenated to
781            # the subsystem name to make them unique keys. There are two types of subsets:
782            # genome subsets and role subsets. We do the role subsets first.
783            my @subsetNames = $sub->get_subset_names();
784            for my $subsetID (@subsetNames) {
785                # Create the subset record.
786                my $actualID = "$subsysID:$subsetID";
787                $loadRoleSubset->Put($actualID);
788                # Connect the subset to the subsystem.
789                $loadHasRoleSubset->Put($subsysID, $actualID);
790                # Connect the subset to its roles.
791                my @roles = $sub->get_subset($subsetID);
792                for my $roleID (@roles) {
793                    $loadConsistsOfRoles->Put($actualID, $roleID);
794                }
795            }
796            # Next the genome subsets.
797            @subsetNames = $sub->get_subset_namesR();
798            for my $subsetID (@subsetNames) {
799                # Create the subset record.
800                my $actualID = "$subsysID:$subsetID";
801                $loadGenomeSubset->Put($actualID);
802                # Connect the subset to the subsystem.
803                $loadHasGenomeSubset->Put($subsysID, $actualID);
804                # Connect the subset to its genomes.
805                my @genomes = $sub->get_subsetR($subsetID);
806                for my $genomeID (@genomes) {
807                    $loadConsistsOfGenomes->Put($actualID, $genomeID);
808                }
809            }
810        }
811        # Before we leave, we must create the Catalyzes table. The data is all stored in
812        # the "reactionsToRoles" hash.
813        for my $reactionID (keys %reactionsToRoles) {
814            # Get this reaction's list of roles. We sort it so we can merge out duplicates.
815            my @roles = sort @{$reactionsToRoles{$reactionID}};
816            my $lastRole = "";
817            # Loop through the roles, creating catalyzation records.
818            for my $thisRole (@roles) {
819                if ($thisRole ne $lastRole) {
820                    $loadCatalyzes->Put($thisRole, $reactionID);
821                }
822            }
823      }      }
824      # Finish the load.      # Finish the load.
825      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 934  Line 1047 
1047                      # Denote we've seen this timestamp.                      # Denote we've seen this timestamp.
1048                      $seenTimestamps{$time} = 1;                      $seenTimestamps{$time} = 1;
1049                  }                  }
1050                }
1051                  # Now loop through the real annotations.                  # Now loop through the real annotations.
1052                  for my $tuple ($fig->feature_annotations($peg, "raw")) {                  for my $tuple ($fig->feature_annotations($peg, "raw")) {
1053                      my ($fid, $timestamp, $user, $text) = @{$tuple};                      my ($fid, $timestamp, $user, $text) = @{$tuple};
# Line 948  Line 1062 
1062                      $text =~ s/Set master function/Set FIG function/s;                      $text =~ s/Set master function/Set FIG function/s;
1063                      # Insure the time stamp is valid.                      # Insure the time stamp is valid.
1064                      if ($timestamp =~ /^\d+$/) {                      if ($timestamp =~ /^\d+$/) {
1065                          # Here it's a number. We need to insure it's unique.                      # Here it's a number. We need to insure the one we use to form
1066                          while ($seenTimestamps{$timestamp}) {                      # the key is unique.
1067                              $timestamp++;                      my $keyStamp = $timestamp;
1068                        while ($seenTimestamps{$keyStamp}) {
1069                            $keyStamp++;
1070                          }                          }
1071                          $seenTimestamps{$timestamp} = 1;                      $seenTimestamps{$keyStamp} = 1;
1072                          my $annotationID = "$peg:$timestamp";                      my $annotationID = "$peg:$keyStamp";
1073                          # Insure the user exists.                          # Insure the user exists.
1074                          if (! $users{$user}) {                          if (! $users{$user}) {
1075                              $loadSproutUser->Put($user, "SEED user");                              $loadSproutUser->Put($user, "SEED user");
# Line 961  Line 1077 
1077                              $users{$user} = 1;                              $users{$user} = 1;
1078                          }                          }
1079                          # Generate the annotation.                          # Generate the annotation.
1080                          $loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text");                      $loadAnnotation->Put($annotationID, $timestamp, $text);
1081                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);
1082                          $loadMadeAnnotation->Put($user, $annotationID);                          $loadMadeAnnotation->Put($user, $annotationID);
1083                      } else {                      } else {
# Line 971  Line 1087 
1087                  }                  }
1088              }              }
1089          }          }
     }  
1090      # Finish the load.      # Finish the load.
1091      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1092      return $retVal;      return $retVal;
# Line 1128  Line 1243 
1243      return $retVal;      return $retVal;
1244  }  }
1245    
1246    
1247    =head3 LoadReactionData
1248    
1249    C<< my $stats = $spl->LoadReactionData(); >>
1250    
1251    Load the reaction data from FIG into Sprout.
1252    
1253    Reaction data connects reactions to the compounds that participate in them.
1254    
1255    The following relations are loaded by this method.
1256    
1257        ReactionURL
1258        Compound
1259        CompoundName
1260        CompoundCAS
1261        IsAComponentOf
1262    
1263    This method proceeds reaction by reaction rather than genome by genome.
1264    
1265    =over 4
1266    
1267    =item RETURNS
1268    
1269    Returns a statistics object for the loads.
1270    
1271    =back
1272    
1273    =cut
1274    #: Return Type $%;
sub LoadReactionData {
    # Purpose: walk every reaction known to the FIG object and emit rows for the
    # ReactionURL, Compound, CompoundName, CompoundCAS and IsAComponentOf tables.
    # Takes only the invocant and returns whatever _FinishAll() returns (the POD
    # for this method describes it as a statistics object for the loads).

    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash. Only its key count is used here, to scale the
    # numbers handed to the table loaders.
    my $genomeHash = $self->{genomes};
    my $genomeCount = scalar keys %{$genomeHash};
    # Create load objects for each of the tables we're loading.
    # NOTE(review): the second argument looks like a row-count estimate;
    # confirm against _TableLoader.
    my $loadReactionURL = $self->_TableLoader('ReactionURL', $genomeCount * 4000);
    my $loadCompound = $self->_TableLoader('Compound', $genomeCount * 4000);
    my $loadCompoundName = $self->_TableLoader('CompoundName', $genomeCount * 8000);
    my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $genomeCount * 4000);
    my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $genomeCount * 12000);
    Trace("Beginning reaction/compound data load.") if T(2);
    # Create a hash to remember the compounds we've generated in the compound table,
    # so each compound's own records are written only once.
    my %compoundHash = ();
    # Loop through the reactions.
    my @reactions = $fig->all_reactions();
    for my $reactionID (@reactions) {
        # Compute the reaction's URL.
        my $url = HTML::reaction_link($reactionID);
        # Put it in the ReactionURL table.
        $loadReactionURL->Put($reactionID, $url);
        # Now we need all of the reaction's compounds. We get these in two phases,
        # substrates first (flag 0) and then products (flag 1).
        for my $product (0, 1) {
            # Get the compounds of the current type for the current reaction. FIG will
            # give us 3-tuples: [ID, Stoichiometry, main-flag]. At this time we do not
            # have location data in SEED, so it defaults to the empty string.
            my @compounds = $fig->reaction2comp($reactionID, $product);
            for my $compData (@compounds) {
                # Extract the compound data from the current tuple.
                my ($cid, $stoich, $main) = @{$compData};
                # Link the compound to the reaction.
                $loadIsAComponentOf->Put($cid, $reactionID, "", $main, $product, $stoich);
                # If this is a new compound, we need to create its table entries.
                if (! exists $compoundHash{$cid}) {
                    # Denote we've handled this compound.
                    $compoundHash{$cid} = 1;
                    # Create the main compound record.
                    $loadCompound->Put($cid);
                    # Check for a CAS ID.
                    my $cas = $fig->cas($cid);
                    if ($cas) {
                        $loadCompoundCAS->Put($cid, $cas);
                    }
                    # Check for names.
                    my @names = $fig->names_of_compound($cid);
                    # Each name will be given a priority number, starting with 1.
                    # The counter is initialised to 1 (not 0) so the first name
                    # really receives priority 1.
                    my $prio = 1;
                    for my $name (@names) {
                        $loadCompoundName->Put($cid, $name, $prio++);
                    }
                }
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1336    
1337  =head3 LoadGroupData  =head3 LoadGroupData
1338    
1339  C<< my $stats = $spl->LoadGroupData(); >>  C<< my $stats = $spl->LoadGroupData(); >>

Legend:
Removed from v.1.17  
changed lines
  Added in v.1.18

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3