[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory | Revision Log | View Patch

revision 1.84, Thu May 17 23:44:51 2007 UTC revision 1.85, Mon Jul 16 19:59:33 2007 UTC
# Line 7  Line 7 
7      use PageBuilder;      use PageBuilder;
8      use ERDBLoad;      use ERDBLoad;
9      use FIG;      use FIG;
10        use FIGRules;
11      use Sprout;      use Sprout;
12      use Stats;      use Stats;
13      use BasicLocation;      use BasicLocation;
# Line 172  Line 173 
173              $subsystems{$subsystem} = $name;              $subsystems{$subsystem} = $name;
174          }          }
175      }      }
176        # Get the list of NMPDR-oriented attribute keys.
177        my @propKeys = $fig->get_group_keys("NMPDR");
178      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
179      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
180      # Create the Sprout load object.      # Create the Sprout load object.
# Line 183  Line 186 
186                    loadDirectory => $directory,                    loadDirectory => $directory,
187                    erdb => $sprout,                    erdb => $sprout,
188                    loaders => [],                    loaders => [],
189                    options => $options                    options => $options,
190                      propKeys => \@propKeys,
191                   };                   };
192      # Bless and return it.      # Bless and return it.
193      bless $retVal, $class;      bless $retVal, $class;
# Line 203  Line 207 
207      return $self->{options}->{loadOnly};      return $self->{options}->{loadOnly};
208  }  }
209    
 =head3 PrimaryOnly  
   
 C<< my $flag = $spl->PrimaryOnly; >>  
   
 Return TRUE if only the main entity is to be loaded, else FALSE.  
   
 =cut  
   
 sub PrimaryOnly {  
     my ($self) = @_;  
     return $self->{options}->{primaryOnly};  
 }  
210    
211  =head3 LoadGenomeData  =head3 LoadGenomeData
212    
# Line 255  Line 247 
247      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
248      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
249      my $loadGenome = $self->_TableLoader('Genome');      my $loadGenome = $self->_TableLoader('Genome');
250      my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly);      my $loadHasContig = $self->_TableLoader('HasContig');
251      my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly);      my $loadContig = $self->_TableLoader('Contig');
252      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf');
253      my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly);      my $loadSequence = $self->_TableLoader('Sequence');
254      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
255          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
256      } else {      } else {
# Line 332  Line 324 
324      return $retVal;      return $retVal;
325  }  }
326    
 =head3 LoadCouplingData  
   
 C<< my $stats = $spl->LoadCouplingData(); >>  
   
 Load the coupling and evidence data from FIG into Sprout.  
   
 The coupling data specifies which genome features are functionally coupled. The  
 evidence data explains why the coupling is functional.  
   
 The following relations are loaded by this method.  
   
     Coupling  
     IsEvidencedBy  
     PCH  
     ParticipatesInCoupling  
     UsesAsEvidence  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadCouplingData {
     # Generate the load files for Coupling, IsEvidencedBy, PCH,
     # ParticipatesInCoupling and UsesAsEvidence, then finish all the loads
     # and return the result of _FinishAll(). When the loadOnly option is
     # set, no data is generated and only the finish step runs.
     # Get this object instance.
     my ($self) = @_;
     # Get the FIG object.
     my $fig = $self->{fig};
     # Get the genome hash.
     my $genomeFilter = $self->{genomes};
     # Set up an ID counter for the PCHs. The counter is never reset, so the
     # PCH IDs are unique across every genome processed in this call.
     my $pchID = 0;
     # Start the loads.
     my $loadCoupling = $self->_TableLoader('Coupling');
     my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);
     my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly);
     my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly);
     my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly);
     if ($self->{options}->{loadOnly}) {
         Trace("Loading from existing files.") if T(2);
     } else {
         Trace("Generating coupling data.") if T(2);
         # Loop through the genomes found.
         for my $genome (sort keys %{$genomeFilter}) {
             Trace("Generating coupling data for $genome.") if T(3);
             $loadCoupling->Add("genomeIn");
             # Create a hash table for holding coupled pairs. We use this to prevent
             # duplicates. For example, if A is coupled to B, we don't want to also
             # assert that B is coupled to A, because we already know it. Fortunately,
             # all couplings occur within a genome, so we can keep the hash table
             # size reasonably small.
             # NOTE(review): this relies on CouplingID returning the same ID for
             # (A, B) and (B, A) -- confirm against the Sprout implementation.
             my %dupHash = ();
             # Get all of the genome's PEGs.
             my @pegs = $fig->pegs_of($genome);
             # Loop through the PEGs.
             for my $peg1 (@pegs) {
                 $loadCoupling->Add("pegIn");
                 Trace("Processing PEG $peg1 for $genome.") if T(4);
                 # Get a list of the coupled PEGs.
                 my @couplings = $fig->coupled_to($peg1);
                 # For each coupled PEG, we need to verify that a coupling already
                 # exists. If not, we have to create one.
                 for my $coupleData (@couplings) {
                     # Each entry is unpacked as a (coupled PEG, score) pair.
                     my ($peg2, $score) = @{$coupleData};
                     # Compute the coupling ID.
                     my $coupleID = $self->{erdb}->CouplingID($peg1, $peg2);
                     if (! exists $dupHash{$coupleID}) {
                         $loadCoupling->Add("couplingIn");
                         # Here we have a new coupling to store in the load files.
                         Trace("Storing coupling ($coupleID) with score $score.") if T(4);
                         # Ensure we don't do this again.
                         $dupHash{$coupleID} = $score;
                         # Write the coupling record.
                         $loadCoupling->Put($coupleID, $score);
                         # Connect it to the coupled PEGs. The third field records
                         # the position (1 or 2) of the PEG within the pair.
                         $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);
                         $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);
                         # Get the evidence for this coupling.
                         my @evidence = $fig->coupling_evidence($peg1, $peg2);
                         # Organize the evidence into a hash table.
                         my %evidenceMap = ();
                         # Process each evidence item.
                         for my $evidenceData (@evidence) {
                             $loadPCH->Add("evidenceIn");
                             my ($peg3, $peg4, $usage) = @{$evidenceData};
                             # Only proceed if the evidence is from a Sprout
                             # genome. Note that only the genome of the first
                             # evidence PEG is tested here; the second is not.
                             if ($genomeFilter->{$fig->genome_of($peg3)}) {
                                 $loadUsesAsEvidence->Add("evidenceChosen");
                                 my $evidenceKey = "$coupleID $peg3 $peg4";
                                 # We store this evidence in the hash if the usage
                                 # is nonzero or no prior evidence has been found. This
                                 # ensures that if there is duplicate evidence, we
                                 # at least keep the meaningful ones. Only evidence in
                                 # the hash makes it to the output.
                                 if ($usage || ! exists $evidenceMap{$evidenceKey}) {
                                     $evidenceMap{$evidenceKey} = $evidenceData;
                                 }
                             }
                         }
                         # Emit one PCH record per surviving evidence item. The keys
                         # are not sorted, so which PCH ID a given item receives
                         # follows Perl's hash ordering.
                         for my $evidenceID (keys %evidenceMap) {
                             # Get the ID for this evidence.
                             $pchID++;
                             # Create the evidence record.
                             my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};
                             $loadPCH->Put($pchID, $usage);
                             # Connect it to the coupling.
                             $loadIsEvidencedBy->Put($coupleID, $pchID);
                             # Connect it to the features.
                             $loadUsesAsEvidence->Put($pchID, $peg3, 1);
                             $loadUsesAsEvidence->Put($pchID, $peg4, 2);
                         }
                     }
                 }
             }
         }
     }
     # All done. Finish the load.
     my $retVal = $self->_FinishAll();
     return $retVal;
 }
   
327  =head3 LoadFeatureData  =head3 LoadFeatureData
328    
329  C<< my $stats = $spl->LoadFeatureData(); >>  C<< my $stats = $spl->LoadFeatureData(); >>
# Line 470  Line 336 
336    
337      Feature      Feature
338      FeatureAlias      FeatureAlias
339        IsAliasOf
340      FeatureLink      FeatureLink
341      FeatureTranslation      FeatureTranslation
342      FeatureUpstream      FeatureUpstream
# Line 479  Line 346 
346      FeatureEssential      FeatureEssential
347      FeatureVirulent      FeatureVirulent
348      FeatureIEDB      FeatureIEDB
349        CDD
350        IsPresentOnProteinOf
351    
352  =over 4  =over 4
353    
# Line 500  Line 369 
369      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
370      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
371      my $loadFeature = $self->_TableLoader('Feature');      my $loadFeature = $self->_TableLoader('Feature');
372      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn');
373      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
374        my $loadIsAliasOf = $self->_TableLoader('IsAliasOf');
375      my $loadFeatureLink = $self->_TableLoader('FeatureLink');      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
376      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
377      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
378      my $loadHasFeature = $self->_TableLoader('HasFeature', $self->PrimaryOnly);      my $loadHasFeature = $self->_TableLoader('HasFeature');
379      my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem', $self->PrimaryOnly);      my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem');
380      my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');      my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
381      my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');      my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
382      my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');      my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
383        my $loadCDD = $self->_TableLoader('CDD');
384        my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');
385      # Get the subsystem hash.      # Get the subsystem hash.
386      my $subHash = $self->{subsystems};      my $subHash = $self->{subsystems};
387        # Get the property keys.
388        my $propKeys = $self->{propKeys};
389        # Create hashes to hold CDD and alias values.
390        my %CDD = ();
391        my %alias = ();
392      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
393      # locations.      # locations.
394      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
# Line 531  Line 408 
408              my @fids = map { $_->[0] } @featureTuples;              my @fids = map { $_->[0] } @featureTuples;
409              Trace("$count features found for genome $genomeID.") if T(3);              Trace("$count features found for genome $genomeID.") if T(3);
410              # Get the attributes for this genome and put them in a hash by feature ID.              # Get the attributes for this genome and put them in a hash by feature ID.
411              my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids);              my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys);
412              # Set up for our duplicate-feature check.              # Set up for our duplicate-feature check.
413              my $oldFeatureID = "";              my $oldFeatureID = "";
414              # Loop through the features.              # Loop through the features.
# Line 556  Line 433 
433                                      $fig->taxonomy_of($genomeID));                                      $fig->taxonomy_of($genomeID));
434                      # Create the aliases.                      # Create the aliases.
435                      for my $alias ($fig->feature_aliases($featureID)) {                      for my $alias ($fig->feature_aliases($featureID)) {
436                          $loadFeatureAlias->Put($featureID, $alias);                          #Connect this alias to this feature.
437                            $loadIsAliasOf->Put($alias, $featureID);
438                          push @keywords, $alias;                          push @keywords, $alias;
439                            # If this is a locus tag, also add its natural form as a keyword.
440                            my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
441                            if ($naturalName) {
442                                push @keywords, $naturalName;
443                            }
444                            # If this is the first time for the specified alias, create its
445                            # alias record.
446                            if (! exists $alias{$alias}) {
447                                $loadFeatureAlias->Put($alias);
448                                $alias{$alias} = 1;
449                            }
450                      }                      }
451                      Trace("Assignment for $featureID is: $assignment") if T(4);                      Trace("Assignment for $featureID is: $assignment") if T(4);
452                      # Break the assignment into words and shove it onto the                      # Break the assignment into words and shove it onto the
# Line 634  Line 523 
523                          push @keywords, 'iedb';                          push @keywords, 'iedb';
524                          $loadFeature->Add('iedb');                          $loadFeature->Add('iedb');
525                      }                      }
526                        # Now we have some other attributes we need to process. Currently,
527                        # this is CDD and CELLO, but we expect the number to increase.
528                        my %attributeHash = ();
529                        for my $attrRow (@{$attributes->{$featureID}}) {
530                            my (undef, $key, @values) = @{$attrRow};
531                            $key =~ /^([^:]+)::(.+)/;
532                            if (exists $attributeHash{$1}) {
533                                $attributeHash{$1}->{$2} = \@values;
534                            } else {
535                                $attributeHash{$1} = {$2 => \@values};
536                            }
537                        }
538                        my $celloValue = "unknown";
539                        # Pull in the CELLO attribute. There will never be more than one.
540                        # If we have one, it's a feature attribute AND a keyword.
541                        my @celloData = keys %{$attributeHash{CELLO}};
542                        if (@celloData) {
543                            $celloValue = $celloData[0];
544                            push @keywords, $celloValue;
545                        }
546                        # Now we handle CDD. This is a bit more complicated, because
547                        # there are multiple CDDs per protein.
548                        if (exists $attributeHash{CDD}) {
549                            # Get the hash of CDD IDs to scores for this feature. We
550                            # already know it exists because of the above IF.
551                            my $cddHash = $attributeHash{CDD};
552                            my @cddData = sort keys %{$cddHash};
553                            for my $cdd (@cddData) {
554                                # Extract the score for this CDD and decode it.
555                                my ($codeScore) = split(/\s*,\s*/, $cddHash->{$cdd}->[0]);
556                                my $realScore = FIGRules::DecodeScore($codeScore);
557                                # Create the connection.
558                                $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
559                                # If this CDD does not yet exist, create its record.
560                                if (! exists $CDD{$cdd}) {
561                                    $CDD{$cdd} = 1;
562                                    $loadCDD->Put($cdd);
563                                }
564                            }
565                        }
566                      # Now we need to bust up hyphenated words in the keyword                      # Now we need to bust up hyphenated words in the keyword
567                      # list. We keep them separate and put them at the end so                      # list. We keep them separate and put them at the end so
568                      # the original word order is available.                      # the original word order is available.
# Line 654  Line 583 
583                      # Clean the keyword list.                      # Clean the keyword list.
584                      my $cleanWords = $sprout->CleanKeywords($keywordString);                      my $cleanWords = $sprout->CleanKeywords($keywordString);
585                      Trace("Keyword string for $featureID: $cleanWords") if T(4);                      Trace("Keyword string for $featureID: $cleanWords") if T(4);
586                      # Create the feature record.                      # Now we need to process the feature's locations. First, we split them up.
587                      $loadFeature->Put($featureID, 1, $user, $quality, $type, $assignment, $cleanWords);                      my @locationList = split /\s*,\s*/, $locations;
588                        # Next, we convert them to Sprout location objects.
589                        my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
590                      # This part is the roughest. We need to relate the features to contig                      # This part is the roughest. We need to relate the features to contig
591                      # locations, and the locations must be split so that none of them exceed                      # locations, and the locations must be split so that none of them exceed
592                      # the maximum segment size. This simplifies the genes_in_region processing                      # the maximum segment size. This simplifies the genes_in_region processing
593                      # for Sprout.                      # for Sprout. To start, we create the location position indicator.
                     my @locationList = split /\s*,\s*/, $locations;  
                     # Create the location position indicator.  
594                      my $i = 1;                      my $i = 1;
595                      # Loop through the locations.                      # Loop through the locations.
596                      for my $location (@locationList) {                      for my $locObject (@locObjectList) {
597                          # Parse the location.                          # Split this location into a list of chunks.
                         my $locObject = BasicLocation->new("$genomeID:$location");  
                         # Split it into a list of chunks.  
598                          my @locOList = ();                          my @locOList = ();
599                          while (my $peeling = $locObject->Peel($chunkSize)) {                          while (my $peeling = $locObject->Peel($chunkSize)) {
600                              $loadIsLocatedIn->Add("peeling");                              $loadIsLocatedIn->Add("peeling");
# Line 682  Line 609 
609                              $i++;                              $i++;
610                          }                          }
611                      }                      }
612                        # Finally, reassemble the location objects into a list of Sprout location strings.
613                        $locations = join(", ", map { $_->String } @locObjectList);
614                        # Create the feature record.
615                        $loadFeature->Put($featureID, 1, $user, $quality, $celloValue, $type, $assignment, $cleanWords, $locations);
616                  }                  }
617              }              }
618          }          }
# Line 709  Line 640 
640      SubsystemClass      SubsystemClass
641      Role      Role
642      RoleEC      RoleEC
643        IsIdentifiedByEC
644      SSCell      SSCell
645      ContainsFeature      ContainsFeature
646      IsGenomeOf      IsGenomeOf
# Line 750  Line 682 
682      # Get the map list.      # Get the map list.
683      my @maps = $fig->all_maps;      my @maps = $fig->all_maps;
684      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
685      my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly);      my $loadDiagram = $self->_TableLoader('Diagram');
686      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn');
687      my $loadSubsystem = $self->_TableLoader('Subsystem');      my $loadSubsystem = $self->_TableLoader('Subsystem');
688      my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly);      my $loadRole = $self->_TableLoader('Role');
689      my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly);      my $loadRoleEC = $self->_TableLoader('RoleEC');
690      my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly);      my $loadIsIdentifiedByEC = $self->_TableLoader('IsIdentifiedByEC');
691      my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly);      my $loadCatalyzes = $self->_TableLoader('Catalyzes');
692      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly);      my $loadSSCell = $self->_TableLoader('SSCell');
693      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature');
694      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf');
695      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly);      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf');
696      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly);      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem');
697      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly);      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn');
698      my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly);      my $loadHasSSCell = $self->_TableLoader('HasSSCell');
699      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly);      my $loadRoleSubset = $self->_TableLoader('RoleSubset');
700      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly);      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset');
701      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly);      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles');
702      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly);      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes');
703      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly);      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');
704      my $loadSubsystemClass = $self->_TableLoader('SubsystemClass', $self->PrimaryOnly);      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');
705        my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');
706      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
707          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
708      } else {      } else {
709          Trace("Generating subsystem data.") if T(2);          Trace("Generating subsystem data.") if T(2);
710          # This hash will contain the role for each EC. When we're done, this          # This hash will contain the roles for each EC. When we're done, this
711          # information will be used to generate the Catalyzes table.          # information will be used to generate the Catalyzes table.
712          my %ecToRoles = ();          my %ecToRoles = ();
713          # Loop through the subsystems. Our first task will be to create the          # Loop through the subsystems. Our first task will be to create the
# Line 802  Line 735 
735                  $loadSubsystemClass->Put($subsysID, $classString);                  $loadSubsystemClass->Put($subsysID, $classString);
736                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
737                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
738                        # Get the role's abbreviation.
739                        my $abbr = $sub->get_role_abbr($col);
740                      # Connect to this role.                      # Connect to this role.
741                      $loadOccursInSubsystem->Add("roleIn");                      $loadOccursInSubsystem->Add("roleIn");
742                      $loadOccursInSubsystem->Put($roleID, $subsysID, $col);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $col);
743                      # If it's a new role, add it to the role table.                      # If it's a new role, add it to the role table.
744                      if (! exists $roleData{$roleID}) {                      if (! exists $roleData{$roleID}) {
745                          # Get the role's abbreviation.                          # Get the role's abbreviation.
                         my $abbr = $sub->get_role_abbr($col);  
746                          # Add the role.                          # Add the role.
747                          $loadRole->Put($roleID, $abbr);                          $loadRole->Put($roleID);
748                          $roleData{$roleID} = 1;                          $roleData{$roleID} = 1;
749                          # Check for an EC number.                          # Check for an EC number.
750                          if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) {                          if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
751                              my $ec = $1;                              my $ec = $1;
752                              $loadRoleEC->Put($roleID, $ec);                              $loadIsIdentifiedByEC->Put($roleID, $ec);
753                              $ecToRoles{$ec} = $roleID;                              # Check to see if this is our first encounter with this EC.
754                                if (exists $ecToRoles{$ec}) {
755                                    # No, so just add this role to the EC list.
756                                    push @{$ecToRoles{$ec}}, $roleID;
757                                } else {
758                                    # Output this EC.
759                                    $loadRoleEC->Put($ec);
760                                    # Create its role list.
761                                    $ecToRoles{$ec} = [$roleID];
762                                }
763                          }                          }
764                      }                      }
765                  }                  }
# Line 941  Line 884 
884          my @reactions = $fig->all_reactions();          my @reactions = $fig->all_reactions();
885          for my $reactionID (@reactions) {          for my $reactionID (@reactions) {
886              # Get this reaction's list of roles. The results will be EC numbers.              # Get this reaction's list of roles. The results will be EC numbers.
887              my @roles = $fig->catalyzed_by($reactionID);              my @ecs = $fig->catalyzed_by($reactionID);
888              # Loop through the roles, creating catalyzation records.              # Loop through the roles, creating catalyzation records.
889              for my $thisRole (@roles) {              for my $thisEC (@ecs) {
890                  if (exists $ecToRoles{$thisRole}) {                  if (exists $ecToRoles{$thisEC}) {
891                      $loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID);                      for my $thisRole (@{$ecToRoles{$thisEC}}) {
892                            $loadCatalyzes->Put($thisRole, $reactionID);
893                        }
894                  }                  }
895              }              }
896          }          }
# Line 993  Line 938 
938      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
939      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
940      my $loadProperty = $self->_TableLoader('Property');      my $loadProperty = $self->_TableLoader('Property');
941      my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly);      my $loadHasProperty = $self->_TableLoader('HasProperty');
942      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
943          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
944      } else {      } else {
# Line 1002  Line 947 
947          my %propertyKeys = ();          my %propertyKeys = ();
948          my $nextID = 1;          my $nextID = 1;
949          # Get the attributes we intend to store in the property table.          # Get the attributes we intend to store in the property table.
950          my @propKeys = $fig->get_group_keys("NMPDR");          my $propKeys = $self->{propKeys};
951          # Loop through the genomes.          # Loop through the genomes.
952          for my $genomeID (sort keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
953              $loadProperty->Add("genomeIn");              $loadProperty->Add("genomeIn");
# Line 1010  Line 955 
955              # Initialize a counter.              # Initialize a counter.
956              my $propertyCount = 0;              my $propertyCount = 0;
957              # Get the properties for this genome's features.              # Get the properties for this genome's features.
958              my @attributes = $fig->get_attributes("fig|$genomeID%", \@propKeys);              my @attributes = $fig->get_attributes("fig|$genomeID%", $propKeys);
959              Trace("Property list built for $genomeID.") if T(3);              Trace("Property list built for $genomeID.") if T(3);
960              # Loop through the results, creating HasProperty records.              # Loop through the results, creating HasProperty records.
961              for my $attributeData (@attributes) {              for my $attributeData (@attributes) {
# Line 1079  Line 1024 
1024      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1025      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1026      my $loadAnnotation = $self->_TableLoader('Annotation');      my $loadAnnotation = $self->_TableLoader('Annotation');
1027      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation');
1028      my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly);      my $loadSproutUser = $self->_TableLoader('SproutUser');
1029      my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly);      my $loadUserAccess = $self->_TableLoader('UserAccess');
1030      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation');
1031      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1032          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1033      } else {      } else {
# Line 1186  Line 1131 
1131      # Get the genome hash.      # Get the genome hash.
1132      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1133      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1134      my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly);      my $loadComesFrom = $self->_TableLoader('ComesFrom');
1135      my $loadSource = $self->_TableLoader('Source');      my $loadSource = $self->_TableLoader('Source');
1136      my $loadSourceURL = $self->_TableLoader('SourceURL');      my $loadSourceURL = $self->_TableLoader('SourceURL');
1137      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
# Line 1323  Line 1268 
1268      Compound      Compound
1269      CompoundName      CompoundName
1270      CompoundCAS      CompoundCAS
1271        IsIdentifiedByCAS
1272        HasCompoundName
1273      IsAComponentOf      IsAComponentOf
1274    
1275  This method proceeds reaction by reaction rather than genome by genome.  This method proceeds reaction by reaction rather than genome by genome.
# Line 1344  Line 1291 
1291      my $fig = $self->{fig};      my $fig = $self->{fig};
1292      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1293      my $loadReaction = $self->_TableLoader('Reaction');      my $loadReaction = $self->_TableLoader('Reaction');
1294      my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly);      my $loadReactionURL = $self->_TableLoader('ReactionURL');
1295      my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly);      my $loadCompound = $self->_TableLoader('Compound');
1296      my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly);      my $loadCompoundName = $self->_TableLoader('CompoundName');
1297      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly);      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS');
1298      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly);      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf');
1299        my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS');
1300        my $loadHasCompoundName = $self->_TableLoader('HasCompoundName');
1301      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1302          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1303      } else {      } else {
1304          Trace("Generating annotation data.") if T(2);          Trace("Generating reaction data.") if T(2);
1305            # We need some hashes to prevent duplicates.
1306            my %compoundNames = ();
1307            my %compoundCASes = ();
1308          # First we create the compounds.          # First we create the compounds.
1309          my @compounds = $fig->all_compounds();          my @compounds = $fig->all_compounds();
1310          for my $cid (@compounds) {          for my $cid (@compounds) {
# Line 1361  Line 1313 
1313              # Each name will be given a priority number, starting with 1.              # Each name will be given a priority number, starting with 1.
1314              my $prio = 1;              my $prio = 1;
1315              for my $name (@names) {              for my $name (@names) {
1316                  $loadCompoundName->Put($cid, $name, $prio++);                  if (! exists $compoundNames{$name}) {
1317                        $loadCompoundName->Put($name);
1318                        $compoundNames{$name} = 1;
1319                    }
1320                    $loadHasCompoundName->Put($cid, $name, $prio++);
1321              }              }
1322              # Create the main compound record. Note that the first name              # Create the main compound record. Note that the first name
1323              # becomes the label.              # becomes the label.
# Line 1370  Line 1326 
1326              # Check for a CAS ID.              # Check for a CAS ID.
1327              my $cas = $fig->cas($cid);              my $cas = $fig->cas($cid);
1328              if ($cas) {              if ($cas) {
1329                  $loadCompoundCAS->Put($cid, $cas);                  $loadIsIdentifiedByCAS->Put($cid, $cas);
1330                    if (! exists $compoundCASes{$cas}) {
1331                        $loadCompoundCAS->Put($cas);
1332                        $compoundCASes{$cas} = 1;
1333                    }
1334              }              }
1335          }          }
1336          # All the compounds are set up, so we need to loop through the reactions next. First,          # All the compounds are set up, so we need to loop through the reactions next. First,
# Line 1407  Line 1367 
1367      return $retVal;      return $retVal;
1368  }  }
1369    
 =head3 LoadGroupData  
   
 C<< my $stats = $spl->LoadGroupData(); >>  
   
 Load the genome Groups into Sprout.  
   
 The following relations are loaded by this method.  
   
     GenomeGroups  
   
 Currently, we do not use groups. We used to use them for NMPDR groups,  
 but there is no direct support for genome groups in FIG, so we access the SEED  
 files directly.  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadGroupData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the genome hash.  
     my $genomeHash = $self->{genomes};  
     # Create a load object for the table we're loading.  
     my $loadGenomeGroups = $self->_TableLoader('GenomeGroups');  
     if ($self->{options}->{loadOnly}) {  
         Trace("Loading from existing files.") if T(2);  
     } else {  
         Trace("Generating group data.") if T(2);  
         # Currently there are no groups.  
     }  
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
1370  =head3 LoadSynonymData  =head3 LoadSynonymData
1371    
1372  C<< my $stats = $spl->LoadSynonymData(); >>  C<< my $stats = $spl->LoadSynonymData(); >>
# Line 1492  Line 1408 
1408          Trace("Generating synonym group data.") if T(2);          Trace("Generating synonym group data.") if T(2);
1409          # Get the database handle.          # Get the database handle.
1410          my $dbh = $fig->db_handle();          my $dbh = $fig->db_handle();
1411          # Ask for the synonyms.          # Ask for the synonyms. Note that "maps_to" is a group name, and "syn_id" is a PEG ID or alias.
1412          my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");          my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
1413          my $result = $sth->execute();          my $result = $sth->execute();
1414          if (! defined($result)) {          if (! defined($result)) {
# Line 1504  Line 1420 
1420              my $featureCount = 0;              my $featureCount = 0;
1421              # Loop through the synonym/peg pairs.              # Loop through the synonym/peg pairs.
1422              while (my @row = $sth->fetchrow()) {              while (my @row = $sth->fetchrow()) {
1423                  # Get the synonym ID and feature ID.                  # Get the synonym group ID and feature ID.
1424                  my ($syn_id, $peg) = @row;                  my ($syn_id, $peg) = @row;
1425                  # Insure it's for one of our genomes.                  # Insure it's for one of our genomes.
1426                  my $genomeID = FIG::genome_of($peg);                  my $genomeID = FIG::genome_of($peg);
# Line 1890  Line 1806 
1806    
1807  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
1808    
 =item ignore  
   
 TRUE if the table should be ignored entirely, else FALSE.  
   
1809  =item RETURN  =item RETURN
1810    
1811  Returns an ERDBLoad object for loading the specified table.  Returns an ERDBLoad object for loading the specified table.
# Line 1904  Line 1816 
1816    
1817  sub _TableLoader {  sub _TableLoader {
1818      # Get the parameters.      # Get the parameters.
1819      my ($self, $tableName, $ignore) = @_;      my ($self, $tableName) = @_;
1820      # Create the load object.      # Create the load object.
1821      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly,      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly);
                                $ignore);  
1822      # Cache it in the loader list.      # Cache it in the loader list.
1823      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
1824      # Return it to the caller.      # Return it to the caller.
# Line 1981  Line 1892 
1892    
1893  =head3 GetGenomeAttributes  =head3 GetGenomeAttributes
1894    
1895  C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids); >>  C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys); >>
1896    
1897  Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related  Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
1898  attributes for all the features of a genome in a single call, then organizes them into  attributes for all the features of a genome in a single call, then organizes them into
# Line 2001  Line 1912 
1912    
1913  Reference to a list of the feature IDs whose attributes are to be kept.  Reference to a list of the feature IDs whose attributes are to be kept.
1914    
1915    =item propKeys
1916    
1917    Reference to a list of the keys to retrieve.
1918    
1919  =item RETURN  =item RETURN
1920    
1921  Returns a reference to a hash. The key of the hash is the feature ID. The value is the  Returns a reference to a hash. The key of the hash is the feature ID. The value is the
# Line 2013  Line 1928 
1928    
1929  sub GetGenomeAttributes {  sub GetGenomeAttributes {
1930      # Get the parameters.      # Get the parameters.
1931      my ($fig, $genomeID, $fids) = @_;      my ($fig, $genomeID, $fids, $propKeys) = @_;
1932      # Declare the return variable.      # Declare the return variable.
1933      my $retVal = {};      my $retVal = {};
     # Get a list of the attributes we care about.  
     my @propKeys = $fig->get_group_keys("NMPDR");  
     # Get the attributes.  
     my @aList = $fig->get_attributes("fig|$genomeID%", \@propKeys);  
1934      # Initialize the hash. This not only enables us to easily determine which FIDs to      # Initialize the hash. This not only enables us to easily determine which FIDs to
1935      # keep, it insures that the caller sees a list reference for every known fid,      # keep, it insures that the caller sees a list reference for every known fid,
1936      # simplifying the logic.      # simplifying the logic.
1937      for my $fid (@{$fids}) {      for my $fid (@{$fids}) {
1938          $retVal->{$fid} = [];          $retVal->{$fid} = [];
1939      }      }
1940      # Populate the hash.      # Get the attributes. If ev_code_cron is running, we may get a timeout error, so
1941        # an eval is used.
1942        my @aList = ();
1943        eval {
1944            @aList = $fig->get_attributes("fig|$genomeID%", $propKeys);
1945            Trace(scalar(@aList) . " attributes returned for genome $genomeID.") if T(3);
1946        };
1947        # Check for a problem.
1948        if ($@) {
1949            Trace("Retrying attributes for $genomeID due to error: $@") if T(1);
1950            # Our fallback plan is to process the attributes in blocks of 100. This is much slower,
1951            # but allows us to continue processing.
1952            my $nFids = scalar @{$fids};
1953            for (my $i = 0; $i < $nFids; $i += 100) {
1954                # Determine the index of the last feature ID we'll be specifying on this pass.
1955                # Normally it's $i + 99, but if we're close to the end it may be less.
1956                my $end = ($i + 100 > $nFids ? $nFids - 1 : $i + 99);
1957                # Get a slice of the fid list.
1958                my @slice = @{$fids}[$i .. $end];
1959                # Get the relevant attributes.
1960                Trace("Retrieving attributes for fids $i to $end.") if T(3);
1961                my @aShort = $fig->get_attributes(\@slice, $propKeys);
1962                Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(3);
1963                push @aList, @aShort;
1964            }
1965        }
1966        # Now we should have all the interesting attributes in @aList. Populate the hash with
1967        # them.
1968      for my $aListEntry (@aList) {      for my $aListEntry (@aList) {
1969          my $fid = $aListEntry->[0];          my $fid = $aListEntry->[0];
1970          if (exists $retVal->{$fid}) {          if (exists $retVal->{$fid}) {
# Line 2037  Line 1975 
1975      return $retVal;      return $retVal;
1976  }  }
1977    
1978    
1979  1;  1;

Legend:
Removed from v.1.84  
changed lines
  Added in v.1.85

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3