[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.88, Mon Nov 5 22:52:06 2007 UTC revision 1.96, Thu Sep 25 23:28:28 2008 UTC
# Line 13  Line 13 
13      use BasicLocation;      use BasicLocation;
14      use HTML;      use HTML;
15      use AliasAnalysis;      use AliasAnalysis;
16        use BioWords;
17    
18  =head1 Sprout Load Methods  =head1 Sprout Load Methods
19    
# Line 52  Line 53 
53    
54  =head3 new  =head3 new
55    
56  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>      my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options);
57    
58  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
59  the name of the files containing the list of genomes and subsystems to use.  the name of the files containing the list of genomes and subsystems to use.
# Line 170  Line 171 
171          for my $subsystem (keys %subsystems) {          for my $subsystem (keys %subsystems) {
172              my $name = $subsystem;              my $name = $subsystem;
173              $name =~ s/_/ /g;              $name =~ s/_/ /g;
 #            my $classes = $fig->subsystem_classification($subsystem);  
 #            $name .= " " . join(" ", @{$classes});  
174              $subsystems{$subsystem} = $name;              $subsystems{$subsystem} = $name;
175          }          }
176      }      }
# Line 198  Line 197 
197    
198  =head3 LoadOnly  =head3 LoadOnly
199    
200  C<< my $flag = $spl->LoadOnly; >>      my $flag = $spl->LoadOnly;
201    
202  Return TRUE if we are in load-only mode, else FALSE.  Return TRUE if we are in load-only mode, else FALSE.
203    
# Line 212  Line 211 
211    
212  =head3 LoadGenomeData  =head3 LoadGenomeData
213    
214  C<< my $stats = $spl->LoadGenomeData(); >>      my $stats = $spl->LoadGenomeData();
215    
216  Load the Genome, Contig, and Sequence data from FIG into Sprout.  Load the Genome, Contig, and Sequence data from FIG into Sprout.
217    
# Line 257  Line 256 
256          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
257      } else {      } else {
258          Trace("Generating genome data.") if T(2);          Trace("Generating genome data.") if T(2);
259            # Get the full info for the FIG genomes.
260            my %genomeInfo = map { $_->[0] => { gname => $_->[1], szdna => $_->[2], maindomain => $_->[3],
261                                                pegs => $_->[4], rnas => $_->[5], complete => $_->[6] } } @{$fig->genome_info()};
262          # Now we loop through the genomes, generating the data for each one.          # Now we loop through the genomes, generating the data for each one.
263          for my $genomeID (sort keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
264              Trace("Generating data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
# Line 286  Line 288 
288                  $group = $FIG_Config::otherGroup;                  $group = $FIG_Config::otherGroup;
289              }              }
290              close TMP;              close TMP;
291                # Get the contigs.
292                my @contigs = $fig->all_contigs($genomeID);
293                # Get this genome's info array.
294                my $info = $genomeInfo{$genomeID};
295              # Output the genome record.              # Output the genome record.
296              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),              $loadGenome->Put($genomeID, $accessCode, $info->{complete}, scalar(@contigs),
297                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);                               $dnaSize, $genus, $info->{pegs}, $group, $info->{rnas}, $species, $extra, $version, $taxonomy);
298              # Now we loop through each of the genome's contigs.              # Now we loop through each of the genome's contigs.
             my @contigs = $fig->all_contigs($genomeID);  
299              for my $contigID (@contigs) {              for my $contigID (@contigs) {
300                  Trace("Processing contig $contigID for $genomeID.") if T(4);                  Trace("Processing contig $contigID for $genomeID.") if T(4);
301                  $loadContig->Add("contigIn");                  $loadContig->Add("contigIn");
# Line 328  Line 333 
333    
334  =head3 LoadFeatureData  =head3 LoadFeatureData
335    
336  C<< my $stats = $spl->LoadFeatureData(); >>      my $stats = $spl->LoadFeatureData();
337    
338  Load the feature data from FIG into Sprout.  Load the feature data from FIG into Sprout.
339    
# Line 350  Line 355 
355      FeatureIEDB      FeatureIEDB
356      CDD      CDD
357      IsPresentOnProteinOf      IsPresentOnProteinOf
358        CellLocation
359        IsPossiblePlaceFor
360        ExternalDatabase
361        IsAlsoFoundIn
362        Keyword
363    
364  =over 4  =over 4
365    
# Line 384  Line 394 
394      my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');      my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
395      my $loadCDD = $self->_TableLoader('CDD');      my $loadCDD = $self->_TableLoader('CDD');
396      my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');      my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');
397        my $loadCellLocation = $self->_TableLoader('CellLocation');
398        my $loadIsPossiblePlaceFor = $self->_TableLoader('IsPossiblePlaceFor');
399        my $loadIsAlsoFoundIn = $self->_TableLoader('IsAlsoFoundIn');
400        my $loadExternalDatabase = $self->_TableLoader('ExternalDatabase');
401        my $loadKeyword = $self->_TableLoader('Keyword');
402      # Get the subsystem hash.      # Get the subsystem hash.
403      my $subHash = $self->{subsystems};      my $subHash = $self->{subsystems};
404      # Get the property keys.      # Get the property keys.
405      my $propKeys = $self->{propKeys};      my $propKeys = $self->{propKeys};
406      # Create hashes to hold CDD and alias values.      # Create hashes to hold CDD, Cell Location (PSORT), External Database, and alias values.
407      my %CDD = ();      my %CDD = ();
408      my %alias = ();      my %alias = ();
409        my %cellLocation = ();
410        my %xdb = ();
411        # Create the bio-words object.
412        my $biowords = BioWords->new(exceptions => "$FIG_Config::sproutData/Exceptions.txt",
413                                     stops => "$FIG_Config::sproutData/StopWords.txt",
414                                     cache => 0);
415        # One of the things we have to do here is build the keyword table, and the keyword
416        # table needs to contain the originating text and feature count for each stem. Unfortunately,
417        # the number of distinct keywords is so large it causes Perl to hang if we try to
418        # keep them in memory. As a result, we need to track them using disk files.
419        # Our approach will be to use two sequential files. One will contain stems and phonexes.
420        # Each time a stem occurs in a feature, a record will be written to that file. The stem
421        # file can then be sorted and collated to determine the number of features for each
422        # stem. A separate file will contain keywords and stems. This last file
423        # will be subjected to a sort unique on stem/keyword. The file is then merged
424        # with the stem file to create the keyword table relation (keyword, stem, phonex, count).
425        my $stemFileName = "$FIG_Config::temp/stems$$.tbl";
426        my $keyFileName = "$FIG_Config::temp/keys$$.tbl";
427        my $stemh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -k1,1 >$stemFileName");
428        my $keyh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -u -k1,1 -k2,2 >$keyFileName");
429      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
430      # locations.      # locations.
431      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
# Line 453  Line 488 
488                              $alias{$alias} = 1;                              $alias{$alias} = 1;
489                          }                          }
490                      }                      }
491                        # Add the corresponding IDs. We ask for 2-tuples of the form (id, database).
492                        my @corresponders = $fig->get_corresponding_ids($featureID, 1);
493                        for my $tuple (@corresponders) {
494                            my ($id, $xdb) = @{$tuple};
495                            # Ignore SEED: that's us.
496                            if ($xdb ne 'SEED') {
497                                # Connect this ID to the feature.
498                                $loadIsAlsoFoundIn->Put($featureID, $xdb, $id);
499                                # Add it as a keyword.
500                                push @keywords, $id;
501                                # If this is a new database, create a record for it.
502                                if (! exists $xdb{$xdb}) {
503                                    $xdb{$xdb} = 1;
504                                    $loadExternalDatabase->Put($xdb);
505                                }
506                            }
507                        }
508                      Trace("Assignment for $featureID is: $assignment") if T(4);                      Trace("Assignment for $featureID is: $assignment") if T(4);
509                      # Break the assignment into words and shove it onto the                      # Break the assignment into words and shove it onto the
510                      # keyword list.                      # keyword list.
# Line 528  Line 580 
580                          push @keywords, 'iedb';                          push @keywords, 'iedb';
581                          $loadFeature->Add('iedb');                          $loadFeature->Add('iedb');
582                      }                      }
583                      # Now we have some other attributes we need to process. Currently,                      # Now we have some other attributes we need to process. To get
584                      # this is CDD and CELLO, but we expect the number to increase.                      # through them, we convert the attribute list for this feature
585                        # into a two-layer hash: key => subkey => value.
586                      my %attributeHash = ();                      my %attributeHash = ();
587                      for my $attrRow (@{$attributes->{$featureID}}) {                      for my $attrRow (@{$attributes->{$featureID}}) {
588                          my (undef, $key, @values) = @{$attrRow};                          my (undef, $key, @values) = @{$attrRow};
589                          $key =~ /^([^:]+)::(.+)/;                          my ($realKey, $subKey);
590                            if ($key =~ /^([^:]+)::(.+)/) {
591                                ($realKey, $subKey) = ($1, $2);
592                            } else {
593                                ($realKey, $subKey) = ($key, "");
594                            }
595                          if (exists $attributeHash{$1}) {                          if (exists $attributeHash{$1}) {
596                              $attributeHash{$1}->{$2} = \@values;                              $attributeHash{$1}->{$2} = \@values;
597                          } else {                          } else {
598                              $attributeHash{$1} = {$2 => \@values};                              $attributeHash{$1} = {$2 => \@values};
599                          }                          }
600                      }                      }
601                      my $celloValue = "unknown";                      # First we handle CDD. This is a bit complicated, because
                     # Pull in the CELLO attribute. There will never be more than one.  
                     # If we have one, it's a feature attribute AND a keyword.  
                     my @celloData = keys %{$attributeHash{CELLO}};  
                     if (@celloData) {  
                         $celloValue = $celloData[0];  
                         push @keywords, $celloValue;  
                     }  
                     # Now we handle CDD. This is a bit more complicated, because  
602                      # there are multiple CDDs per protein.                      # there are multiple CDDs per protein.
603                      if (exists $attributeHash{CDD}) {                      if (exists $attributeHash{CDD}) {
604                          # Get the hash of CDD IDs to scores for this feature. We                          # Get the hash of CDD IDs to scores for this feature. We
# Line 557  Line 607 
607                          my @cddData = sort keys %{$cddHash};                          my @cddData = sort keys %{$cddHash};
608                          for my $cdd (@cddData) {                          for my $cdd (@cddData) {
609                              # Extract the score for this CDD and decode it.                              # Extract the score for this CDD and decode it.
610                              my ($codeScore) = split(/\s*,\s*/, $cddHash->{$cdd}->[0]);                              my ($codeScore) = split(/\s*[,;]\s*/, $cddHash->{$cdd}->[0]);
611                              my $realScore = FIGRules::DecodeScore($codeScore);                              my $realScore = FIGRules::DecodeScore($codeScore);
612                                # We can't afford to crash because of a bad attribute
613                                # value, hence the IF below.
614                                if (! defined($realScore)) {
615                                    # Bad score, so count it.
616                                    $loadFeature->Add('badCDDscore');
617                                    Trace("CDD score \"$codeScore\" for feature $featureID invalid.") if T(3);
618                                } else {
619                              # Create the connection.                              # Create the connection.
620                              $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);                              $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
621                              # If this CDD does not yet exist, create its record.                              # If this CDD does not yet exist, create its record.
# Line 568  Line 625 
625                              }                              }
626                          }                          }
627                      }                      }
                     # Now we need to bust up hyphenated words in the keyword  
                     # list. We keep them separate and put them at the end so  
                     # the original word order is available.  
                     my $keywordString = "";  
                     my $bustedString = "";  
                     for my $keyword (@keywords) {  
                         if (length $keyword >= 3) {  
                             $keywordString .= " $keyword";  
                             if ($keyword =~ /-/) {  
                                 my @words = split /-/, $keyword;  
                                 $bustedString .= join(" ", "", @words);  
628                              }                              }
629                        # Next we do PSORT cell locations. Here the confidence value
630                        # could have the value "unknown", which we translate to -1.
631                        if (exists $attributeHash{PSORT}) {
632                            # This will be a hash of cell locations to confidence
633                            # factors.
634                            my $psortHash = $attributeHash{PSORT};
635                            for my $psort (keys %{$psortHash}) {
636                                # Get the confidence, and convert it to a number if necessary.
637                                my $confidence = $psortHash->{$psort};
638                                if ($confidence eq 'unknown') {
639                                    $confidence = -1;
640                                }
641                                $loadIsPossiblePlaceFor->Put($psort, $featureID, $confidence);
642                                # If this cell location does not yet exist, create its record.
643                                if (! exists $cellLocation{$psort}) {
644                                    $cellLocation{$psort} = 1;
645                                    $loadCellLocation->Put($psort);
646                                }
647                                # If this is a significant location, add it as a keyword.
648                                if ($confidence > 2.5) {
649                                    push @keywords, $psort;
650                          }                          }
651                      }                      }
652                      $keywordString .= $bustedString;                      }
653                        # Phobius data is next. This consists of the signal peptide location and
654                        # the transmembrane locations.
655                        my $signalList = "";
656                        my $transList = "";
657                        if (exists $attributeHash{Phobius}) {
658                            # This will be a hash of two keys (transmembrane and signal) to
659                            # location strings. If there's no value, we stuff in an empty string.
660                            $signalList = GetCommaList($attributeHash{Phobius}->{signal});
661                            $transList = GetCommaList($attributeHash{Phobius}->{transmembrane});
662                        }
663                        # Here are some more numbers: isoelectric point, molecular weight, and
664                        # the similar-to-human flag.
665                        my $isoelectric = 0;
666                        if (exists $attributeHash{isoelectric_point}) {
667                            $isoelectric = $attributeHash{isoelectric_point}->{""};
668                        }
669                        my $similarToHuman = 0;
670                        if (exists $attributeHash{similar_to_human} && $attributeHash{similar_to_human}->{""} eq 'yes') {
671                            $similarToHuman = 1;
672                        }
673                        my $molecularWeight = 0;
674                        if (exists $attributeHash{molecular_weight}) {
675                            $molecularWeight = $attributeHash{molecular_weight}->{""};
676                        }
677                        # Create the keyword string.
678                        my $keywordString = join(" ", @keywords);
679                        Trace("Real keyword string for $featureID: $keywordString.") if T(4);
680                      # Get rid of annoying punctuation.                      # Get rid of annoying punctuation.
681                      $keywordString =~ s/[();]//g;                      $keywordString =~ s/[();@#\/]/ /g;
682                      # Clean the keyword list.                      # Get the list of keywords in the keyword string.
683                      my $cleanWords = $sprout->CleanKeywords($keywordString);                      my @realKeywords = grep { $biowords->IsWord($_) } $biowords->Split($keywordString);
684                        # We need to do two things here: create the keyword string for the feature table
685                        # and write records to the keyword and stem files. The stuff we write to
686                        # the files will be taken from the following two hashes. The stuff used
687                        # to create the keyword string will be taken from the list.
688                        my (%keys, %stems, @realStems);
689                        for my $keyword (@realKeywords) {
690                            # Compute the stem and phonex for this keyword.
691                            my ($stem, $phonex) = $biowords->StemLookup($keyword);
692                            # Only proceed if a stem comes back. If no stem came back, it's a
693                            # stop word and we throw it away.
694                            if ($stem) {
695                                $keys{$keyword} = $stem;
696                                $stems{$stem} = $phonex;
697                                push @realStems, $stem;
698                            }
699                        }
700                        # Now create the keyword string.
701                        my $cleanWords = join(" ", @realStems);
702                      Trace("Keyword string for $featureID: $cleanWords") if T(4);                      Trace("Keyword string for $featureID: $cleanWords") if T(4);
703                        # Write the stem and keyword records.
704                        for my $stem (keys %stems) {
705                            Tracer::PutLine($stemh, [$stem, $stems{$stem}]);
706                        }
707                        for my $key (keys %keys) {
708                            # The stem goes first in this file, because we want to sort
709                            # by stem and then keyword.
710                            Tracer::PutLine($keyh, [$keys{$key}, $key]);
711                        }
712                      # Now we need to process the feature's locations. First, we split them up.                      # Now we need to process the feature's locations. First, we split them up.
713                      my @locationList = split /\s*,\s*/, $locations;                      my @locationList = split /\s*,\s*/, $locations;
714                      # Next, we convert them to Sprout location objects.                      # Next, we convert them to Sprout location objects.
715                      my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;                      my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
716                        # Assemble them into a sprout location string for later.
717                        my $locationString = join(", ", map { $_->String } @locObjectList);
718                        # We'll store the sequence length in here.
719                        my $sequenceLength = 0;
720                      # This part is the roughest. We need to relate the features to contig                      # This part is the roughest. We need to relate the features to contig
721                      # locations, and the locations must be split so that none of them exceed                      # locations, and the locations must be split so that none of them exceed
722                      # the maximum segment size. This simplifies the genes_in_region processing                      # the maximum segment size. This simplifies the genes_in_region processing
# Line 599  Line 724 
724                      my $i = 1;                      my $i = 1;
725                      # Loop through the locations.                      # Loop through the locations.
726                      for my $locObject (@locObjectList) {                      for my $locObject (@locObjectList) {
727                            # Record the length.
728                            $sequenceLength += $locObject->Length;
729                          # Split this location into a list of chunks.                          # Split this location into a list of chunks.
730                          my @locOList = ();                          my @locOList = ();
731                          while (my $peeling = $locObject->Peel($chunkSize)) {                          while (my $peeling = $locObject->Peel($chunkSize)) {
# Line 614  Line 741 
741                              $i++;                              $i++;
742                          }                          }
743                      }                      }
744                      # Finally, reassemble the location objects into a list of Sprout location strings.                      # Now we get some ancillary flags.
745                      $locations = join(", ", map { $_->String } @locObjectList);                      my $locked = $fig->is_locked_fid($featureID);
746                        my $in_genbank = $fig->peg_in_gendb($featureID);
747                      # Create the feature record.                      # Create the feature record.
748                      $loadFeature->Put($featureID, 1, $user, $quality, $celloValue, $type, $assignment, $cleanWords, $locations);                      $loadFeature->Put($featureID, 1, $user, $quality, $type, $in_genbank, $isoelectric, $locked, $molecularWeight,
749                                          $sequenceLength, $signalList, $similarToHuman, $assignment, $cleanWords, $locationString,
750                                          $transList);
751                  }                  }
752              }              }
753              Trace("Genome $genomeID processed.") if T(3);              Trace("Genome $genomeID processed.") if T(3);
754          }          }
755      }      }
756        Trace("Sorting keywords.") if T(2);
757        # Now we need to load the keyword table from the key and stem files.
758        close $keyh;
759        close $stemh;
760        Trace("Loading keywords.") if T(2);
761        $keyh = Open(undef, "<$keyFileName");
762        $stemh = Open(undef, "<$stemFileName");
763        # We'll count the keywords in here, for tracing purposes.
764        my $count = 0;
765        # These variables track the current stem's data. When an incoming
766        # keyword's stem changes, these will be recomputed.
767        my ($currentStem, $currentPhonex, $currentCount);
768        # Prime the loop by reading the first stem in the stem file.
769        my ($nextStem, $nextPhonex) = Tracer::GetLine($stemh);
770        # Loop through the keyword file.
771        while (! eof $keyh) {
772            # Read this keyword.
773            my ($thisStem, $thisKey) = Tracer::GetLine($keyh);
774            # Check to see if it's the new stem yet.
775            if ($thisStem ne $currentStem) {
776                # Yes. It's a terrible error if it's not also the next stem.
777                if ($thisStem ne $nextStem) {
778                    Confess("Error in stem file. Expected \"$nextStem\", but found \"$thisStem\".");
779                } else {
780                    # Here we're okay.
781                    ($currentStem, $currentPhonex) = ($nextStem, $nextPhonex);
782                    # Count the number of features for this stem.
783                    $currentCount = 0;
784                    while ($nextStem eq $thisStem) {
785                        ($nextStem, $nextPhonex) = Tracer::GetLine($stemh);
786                        $currentCount++;
787                    }
788                }
789            }
790            # Now $currentStem is the same as $thisStem, and the other $current-vars
791            # contain the stem's data (phonex and count).
792            $loadKeyword->Put($thisKey, $currentCount, $currentPhonex, $currentStem);
793            if (++$count % 1000 == 0 && T(3)) {
794                Trace("$count keywords loaded.");
795            }
796        }
797        Trace("$count keywords loaded into keyword table.") if T(2);
798      # Finish the loads.      # Finish the loads.
799      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
800      return $retVal;      return $retVal;
# Line 630  Line 802 
802    
803  =head3 LoadSubsystemData  =head3 LoadSubsystemData
804    
805  C<< my $stats = $spl->LoadSubsystemData(); >>      my $stats = $spl->LoadSubsystemData();
806    
807  Load the subsystem data from FIG into Sprout.  Load the subsystem data from FIG into Sprout.
808    
# Line 660  Line 832 
832      ConsistsOfGenomes      ConsistsOfGenomes
833      GenomeSubset      GenomeSubset
834      HasGenomeSubset      HasGenomeSubset
     Catalyzes  
835      Diagram      Diagram
836      RoleOccursIn      RoleOccursIn
837        SubsystemHopeNotes
838    
839  =over 4  =over 4
840    
# Line 709  Line 881 
881      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');
882      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');
883      my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');      my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');
884        my $loadSubsystemHopeNotes = $self->_TableLoader('SubsystemHopeNotes');
885      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
886          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
887      } else {      } else {
# Line 733  Line 906 
906                  # Create the subsystem record.                  # Create the subsystem record.
907                  my $curator = $sub->get_curator();                  my $curator = $sub->get_curator();
908                  my $notes = $sub->get_notes();                  my $notes = $sub->get_notes();
909                  $loadSubsystem->Put($subsysID, $curator, $notes);                  my $version = $sub->get_version();
910                    my $description = $sub->get_description();
911                    $loadSubsystem->Put($subsysID, $curator, $version, $description, $notes);
912                    # Add the hope notes.
913                    my $hopeNotes = $sub->get_hope_curation_notes();
914                    if ($hopeNotes) {
915                        $loadSubsystemHopeNotes->Put($sub, $hopeNotes);
916                    }
917                  # Now for the classification string. This comes back as a list                  # Now for the classification string. This comes back as a list
918                  # reference and we convert it to a space-delimited string.                  # reference and we convert it to a space-delimited string.
919                  my $classList = $fig->subsystem_classification($subsysID);                  my $classList = $fig->subsystem_classification($subsysID);
# Line 743  Line 923 
923                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
924                      # Get the role's abbreviation.                      # Get the role's abbreviation.
925                      my $abbr = $sub->get_role_abbr($col);                      my $abbr = $sub->get_role_abbr($col);
926                        # Get its essentiality.
927                        my $aux = $fig->is_aux_role_in_subsystem($subsysID, $roleID);
928                        # Get its reaction note.
929                        my $hope_note = $sub->get_hope_reaction_notes($roleID) || "";
930                      # Connect to this role.                      # Connect to this role.
931                      $loadOccursInSubsystem->Add("roleIn");                      $loadOccursInSubsystem->Add("roleIn");
932                      $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $col);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $aux, $col, $hope_note);
933                      # If it's a new role, add it to the role table.                      # If it's a new role, add it to the role table.
934                      if (! exists $roleData{$roleID}) {                      if (! exists $roleData{$roleID}) {
935                          # Get the role's abbreviation.                          # Get the role's abbreviation.
# Line 889  Line 1073 
1073                  }                  }
1074              }              }
1075          }          }
         # Before we leave, we must create the Catalyzes table. We start with the reactions,  
         # then use the "ecToRoles" table to convert EC numbers to role IDs.  
         my @reactions = $fig->all_reactions();  
         for my $reactionID (@reactions) {  
             # Get this reaction's list of roles. The results will be EC numbers.  
             my @ecs = $fig->catalyzed_by($reactionID);  
             # Loop through the roles, creating catalyzation records.  
             for my $thisEC (@ecs) {  
                 if (exists $ecToRoles{$thisEC}) {  
                     for my $thisRole (@{$ecToRoles{$thisEC}}) {  
                         $loadCatalyzes->Put($thisRole, $reactionID);  
                     }  
                 }  
             }  
         }  
1076      }      }
1077      # Finish the load.      # Finish the load.
1078      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 912  Line 1081 
1081    
1082  =head3 LoadPropertyData  =head3 LoadPropertyData
1083    
1084  C<< my $stats = $spl->LoadPropertyData(); >>      my $stats = $spl->LoadPropertyData();
1085    
1086  Load the attribute data from FIG into Sprout.  Load the attribute data from FIG into Sprout.
1087    
# Line 987  Line 1156 
1156                  }                  }
1157                  # Create the HasProperty entry for this feature/property association.                  # Create the HasProperty entry for this feature/property association.
1158                  $loadHasProperty->Put($fid, $propertyID, $url);                  $loadHasProperty->Put($fid, $propertyID, $url);
1159                    $propertyCount++;
1160              }              }
1161              # Update the statistics.              # Update the statistics.
1162              Trace("$propertyCount attributes processed.") if T(3);              Trace("$propertyCount attributes processed.") if T(3);
# Line 1000  Line 1170 
1170    
1171  =head3 LoadAnnotationData  =head3 LoadAnnotationData
1172    
1173  C<< my $stats = $spl->LoadAnnotationData(); >>      my $stats = $spl->LoadAnnotationData();
1174    
1175  Load the annotation data from FIG into Sprout.  Load the annotation data from FIG into Sprout.
1176    
# Line 1107  Line 1277 
1277    
1278  =head3 LoadSourceData  =head3 LoadSourceData
1279    
1280  C<< my $stats = $spl->LoadSourceData(); >>      my $stats = $spl->LoadSourceData();
1281    
1282  Load the source data from FIG into Sprout.  Load the source data from FIG into Sprout.
1283    
# Line 1185  Line 1355 
1355    
1356  =head3 LoadExternalData  =head3 LoadExternalData
1357    
1358  C<< my $stats = $spl->LoadExternalData(); >>      my $stats = $spl->LoadExternalData();
1359    
1360  Load the external data from FIG into Sprout.  Load the external data from FIG into Sprout.
1361    
# Line 1265  Line 1435 
1435    
1436  =head3 LoadReactionData  =head3 LoadReactionData
1437    
1438  C<< my $stats = $spl->LoadReactionData(); >>      my $stats = $spl->LoadReactionData();
1439    
1440  Load the reaction data from FIG into Sprout.  Load the reaction data from FIG into Sprout.
1441    
# Line 1281  Line 1451 
1451      IsIdentifiedByCAS      IsIdentifiedByCAS
1452      HasCompoundName      HasCompoundName
1453      IsAComponentOf      IsAComponentOf
1454        Scenario
1455        Catalyzes
1456        HasScenario
1457        IsInputFor
1458        IsOutputOf
1459        ExcludesReaction
1460        IncludesReaction
1461        IsOnDiagram
1463    
1464  This method proceeds reaction by reaction rather than genome by genome.  This method proceeds reaction by reaction rather than genome by genome.
1465    
# Line 1308  Line 1487 
1487      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf');      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf');
1488      my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS');      my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS');
1489      my $loadHasCompoundName = $self->_TableLoader('HasCompoundName');      my $loadHasCompoundName = $self->_TableLoader('HasCompoundName');
1490        my $loadScenario = $self->_TableLoader('Scenario');
1491        my $loadHasScenario = $self->_TableLoader('HasScenario');
1492        my $loadIsInputFor = $self->_TableLoader('IsInputFor');
1493        my $loadIsOutputOf = $self->_TableLoader('IsOutputOf');
1494        my $loadIsOnDiagram = $self->_TableLoader('IsOnDiagram');
1495        my $loadIncludesReaction = $self->_TableLoader('IncludesReaction');
1496        my $loadExcludesReaction = $self->_TableLoader('ExcludesReaction');
1497        my $loadCatalyzes = $self->_TableLoader('Catalyzes');
1498      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1499          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1500      } else {      } else {
# Line 1316  Line 1503 
1503          my %compoundNames = ();          my %compoundNames = ();
1504          my %compoundCASes = ();          my %compoundCASes = ();
1505          # First we create the compounds.          # First we create the compounds.
1506          my @compounds = $fig->all_compounds();          my %compounds = map { $_ => 1 } $fig->all_compounds();
1507          for my $cid (@compounds) {          for my $cid (keys %compounds) {
1508              # Check for names.              # Check for names.
1509              my @names = $fig->names_of_compound($cid);              my @names = $fig->names_of_compound($cid);
1510              # Each name will be given a priority number, starting with 1.              # Each name will be given a priority number, starting with 1.
# Line 1347  Line 1534 
1534          # we initialize the discriminator index. This is a single integer used to insure          # we initialize the discriminator index. This is a single integer used to insure
1535          # duplicate elements in a reaction are not accidentally collapsed.          # duplicate elements in a reaction are not accidentally collapsed.
1536          my $discrim = 0;          my $discrim = 0;
1537          my @reactions = $fig->all_reactions();          my %reactions = map { $_ => 1 } $fig->all_reactions();
1538          for my $reactionID (@reactions) {          for my $reactionID (keys %reactions) {
1539              # Create the reaction record.              # Create the reaction record.
1540              $loadReaction->Put($reactionID, $fig->reversible($reactionID));              $loadReaction->Put($reactionID, $fig->reversible($reactionID));
1541              # Compute the reaction's URL.              # Compute the reaction's URL.
# Line 1371  Line 1558 
1558                  }                  }
1559              }              }
1560          }          }
1561            # Now we run through the subsystems and roles, generating the scenarios
1562            # and connecting the reactions. We'll need some hashes to prevent
1563            # duplicates and a counter for compound group keys.
1564            my %roles = ();
1565            my %scenarios = ();
1566            my @subsystems = $fig->all_subsystems();
1567            for my $subName (@subsystems) {
1568                my $sub = $fig->get_subsystem($subName);
1569                Trace("Processing $subName reactions.") if T(3);
1570                # Get the subsystem's reactions.
1571                my %reactions = $sub->get_hope_reactions();
1572                # Loop through the roles, connecting them to the reactions.
1573                for my $role (keys %reactions) {
1574                    # Only process this role if it is new.
1575                    if (! $roles{$role}) {
1576                        $roles{$role} = 1;
1577                        my @reactions = @{$reactions{$role}};
1578                        for my $reaction (@reactions) {
1579                            $loadCatalyzes->Put($role, $reaction);
1580                        }
1581                    }
1582                }
1583                Trace("Processing $subName scenarios.") if T(3);
1584                # Get the subsystem's scenarios.
1585                my @scenarioNames = $sub->get_hope_scenario_names();
1586                # Loop through the scenarios, creating scenario data.
1587                for my $scenarioName (@scenarioNames) {
1588                    # Link this scenario to this subsystem.
1589                    $loadHasScenario->Put($subName, $scenarioName);
1590                    # If this scenario is new, we need to create it.
1591                    if (! $scenarios{$scenarioName}) {
1592                        Trace("Creating scenario $scenarioName.") if T(3);
1593                        $scenarios{$scenarioName} = 1;
1594                        # Create the scenario itself.
1595                        $loadScenario->Put($scenarioName);
1596                        # Attach the input compounds.
1597                        for my $input ($sub->get_hope_input_compounds($scenarioName)) {
1598                            $loadIsInputFor->Put($input, $scenarioName);
1599                        }
1600                        # Now we need to set up the output compounds. They come in two
1601                        # groups, which we mark 0 and 1.
1602                        my $outputGroup = 0;
1603                        # Set up the output compounds.
1604                        for my $outputGroup ($sub->get_hope_output_compounds($scenarioName)) {
1605                            # Attach the compounds.
1606                            for my $compound (@$outputGroup) {
1607                                $loadIsOutputOf->Put($scenarioName, $compound, $outputGroup);
1608                            }
1609                        }
1610                        # Create the reaction lists.
1611                        my @addReactions = $sub->get_hope_additional_reactions($scenarioName);
1612                        for my $reaction (@addReactions) {
1613                            $loadIncludesReaction->Put($scenarioName, $reaction);
1614                        }
1615                        my @notReactions = $sub->get_hope_ignore_reactions($scenarioName);
1616                        for my $reaction (@notReactions) {
1617                            $loadExcludesReaction->Put($scenarioName, $reaction);
1618                        }
1619                        # Link the maps.
1620                        my @maps = $sub->get_hope_map_ids($scenarioName);
1621                        for my $map (@maps) {
1622                            $loadIsOnDiagram->Put($scenarioName, "map$map");
1623                        }
1624                    }
1625                }
1626            }
1627      }      }
1628      # Finish the load.      # Finish the load.
1629      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 1379  Line 1632 
1632    
1633  =head3 LoadSynonymData  =head3 LoadSynonymData
1634    
1635  C<< my $stats = $spl->LoadSynonymData(); >>      my $stats = $spl->LoadSynonymData();
1636    
1637  Load the synonym groups into Sprout.  Load the synonym groups into Sprout.
1638    
# Line 1424  Line 1677 
1677          if (! defined($result)) {          if (! defined($result)) {
1678              Confess("Database error in Synonym load: " . $sth->errstr());              Confess("Database error in Synonym load: " . $sth->errstr());
1679          } else {          } else {
1680                Trace("Processing synonym results.") if T(2);
1681              # Remember the current synonym.              # Remember the current synonym.
1682              my $current_syn = "";              my $current_syn = "";
1683              # Count the features.              # Count the features.
1684              my $featureCount = 0;              my $featureCount = 0;
1685                my $entryCount = 0;
1686              # Loop through the synonym/peg pairs.              # Loop through the synonym/peg pairs.
1687              while (my @row = $sth->fetchrow()) {              while (my @row = $sth->fetchrow()) {
1688                  # Get the synonym group ID and feature ID.                  # Get the synonym group ID and feature ID.
1689                  my ($syn_id, $peg) = @row;                  my ($syn_id, $peg) = @row;
1690                    # Count this row.
1691                    $entryCount++;
1692                    if ($entryCount % 1000 == 0) {
1693                        Trace("$entryCount rows processed.") if T(3);
1694                    }
1695                  # Insure it's for one of our genomes.                  # Insure it's for one of our genomes.
1696                  my $genomeID = FIG::genome_of($peg);                  my $genomeID = FIG::genome_of($peg);
1697                  if (exists $genomeHash->{$genomeID}) {                  if (exists $genomeHash->{$genomeID}) {
# Line 1450  Line 1710 
1710                      }                      }
1711                  }                  }
1712              }              }
1713                Trace("$entryCount rows produced $featureCount features.") if T(2);
1714          }          }
1715      }      }
1716      # Finish the load.      # Finish the load.
# Line 1459  Line 1720 
1720    
1721  =head3 LoadFamilyData  =head3 LoadFamilyData
1722    
1723  C<< my $stats = $spl->LoadFamilyData(); >>      my $stats = $spl->LoadFamilyData();
1724    
1725  Load the protein families into Sprout.  Load the protein families into Sprout.
1726    
# Line 1527  Line 1788 
1788    
1789  =head3 LoadDrugData  =head3 LoadDrugData
1790    
1791  C<< my $stats = $spl->LoadDrugData(); >>      my $stats = $spl->LoadDrugData();
1792    
1793  Load the drug target data into Sprout.  Load the drug target data into Sprout.
1794    
# Line 1661  Line 1922 
1922                          # Decode the score.                          # Decode the score.
1923                          my $realScore = FIGRules::DecodeScore($score);                          my $realScore = FIGRules::DecodeScore($score);
1924                          # Connect the PDB to the feature.                          # Connect the PDB to the feature.
1925                          $loadIsProteinForFeature->Put($pdbData->[0], $pdbID, $start, $realScore, $end);                          $loadIsProteinForFeature->Put($pdbID, $pdbData->[0], $start, $realScore, $end);
1926                      }                      }
1927                  }                  }
1928              }              }
# Line 1726  Line 1987 
1987    
1988  =head3 SpecialAttribute  =head3 SpecialAttribute
1989    
1990  C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >>      my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader);
1991    
1992  Look for special attributes of a given type. A special attribute is found by comparing one of  Look for special attributes of a given type. A special attribute is found by comparing one of
1993  the columns of the incoming attribute list to a search pattern. If a match is found, then  the columns of the incoming attribute list to a search pattern. If a match is found, then
# Line 1902  Line 2163 
2163    
2164  =head3 GetGenomeAttributes  =head3 GetGenomeAttributes
2165    
2166  C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys); >>      my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys);
2167    
2168  Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related  Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
2169  attributes for all the features of a genome in a single call, then organizes them into  attributes for all the features of a genome in a single call, then organizes them into
# Line 1985  Line 2246 
2246      return $retVal;      return $retVal;
2247  }  }
2248    
=head3 GetCommaList

    my $string = GetCommaList($value);

Convert a value that may be a list reference, a plain scalar, or undefined
into a string. The elements of a list reference are joined with a comma and
a space. Any other defined value is converted to its string form. An
undefined value produces an empty string. Whatever goes in, the caller
always gets a string back.

=over 4

=item value

Reference to a list of values to be joined, or a scalar, or an undefined
value.

=item RETURN

Returns a scalar string: the comma-delimited list elements, the string form
of the scalar, or an empty string if nothing was passed in.

=back

=cut

sub GetCommaList {
    # Get the parameters.
    my ($value) = @_;
    # With no input value, the result is the empty string.
    return "" if ! defined $value;
    # A list reference has its elements strung together.
    return join(", ", @{$value}) if ref($value) eq 'ARRAY';
    # Anything else is flattened to a scalar string.
    return "$value";
}
2292    
2293    
2294  1;  1;

Legend:
Removed from v.1.88  
changed lines
  Added in v.1.96

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3