[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.79, Sat Nov 18 20:38:45 2006 UTC revision 1.93, Tue Sep 9 21:02:10 2008 UTC
# Line 7  Line 7 
7      use PageBuilder;      use PageBuilder;
8      use ERDBLoad;      use ERDBLoad;
9      use FIG;      use FIG;
10        use FIGRules;
11      use Sprout;      use Sprout;
12      use Stats;      use Stats;
13      use BasicLocation;      use BasicLocation;
14      use HTML;      use HTML;
15        use AliasAnalysis;
16        use BioWords;
17    
18  =head1 Sprout Load Methods  =head1 Sprout Load Methods
19    
# Line 50  Line 53 
53    
54  =head3 new  =head3 new
55    
56  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>      my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options);
57    
58  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
59  the name of the files containing the list of genomes and subsystems to use.  the name of the files containing the list of genomes and subsystems to use.
# Line 101  Line 104 
104              # Here we want all the complete genomes and an access code of 1.              # Here we want all the complete genomes and an access code of 1.
105              my @genomeList = $fig->genomes(1);              my @genomeList = $fig->genomes(1);
106              %genomes = map { $_ => 1 } @genomeList;              %genomes = map { $_ => 1 } @genomeList;
107                Trace(scalar(keys %genomes) . " genomes found.") if T(3);
108          } else {          } else {
109              my $type = ref $genomeFile;              my $type = ref $genomeFile;
110              Trace("Genome file parameter type is \"$type\".") if T(3);              Trace("Genome file parameter type is \"$type\".") if T(3);
# Line 167  Line 171 
171          for my $subsystem (keys %subsystems) {          for my $subsystem (keys %subsystems) {
172              my $name = $subsystem;              my $name = $subsystem;
173              $name =~ s/_/ /g;              $name =~ s/_/ /g;
             my $classes = $fig->subsystem_classification($subsystem);  
             $name .= " " . join(" ", @{$classes});  
174              $subsystems{$subsystem} = $name;              $subsystems{$subsystem} = $name;
175          }          }
176      }      }
177        # Get the list of NMPDR-oriented attribute keys.
178        my @propKeys = $fig->get_group_keys("NMPDR");
179      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
180      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
181      # Create the Sprout load object.      # Create the Sprout load object.
# Line 183  Line 187 
187                    loadDirectory => $directory,                    loadDirectory => $directory,
188                    erdb => $sprout,                    erdb => $sprout,
189                    loaders => [],                    loaders => [],
190                    options => $options                    options => $options,
191                      propKeys => \@propKeys,
192                   };                   };
193      # Bless and return it.      # Bless and return it.
194      bless $retVal, $class;      bless $retVal, $class;
# Line 192  Line 197 
197    
198  =head3 LoadOnly  =head3 LoadOnly
199    
200  C<< my $flag = $spl->LoadOnly; >>      my $flag = $spl->LoadOnly;
201    
202  Return TRUE if we are in load-only mode, else FALSE.  Return TRUE if we are in load-only mode, else FALSE.
203    
# Line 203  Line 208 
208      return $self->{options}->{loadOnly};      return $self->{options}->{loadOnly};
209  }  }
210    
 =head3 PrimaryOnly  
   
 C<< my $flag = $spl->PrimaryOnly; >>  
   
 Return TRUE if only the main entity is to be loaded, else FALSE.  
   
 =cut  
   
 sub PrimaryOnly {  
     my ($self) = @_;  
     return $self->{options}->{primaryOnly};  
 }  
211    
212  =head3 LoadGenomeData  =head3 LoadGenomeData
213    
214  C<< my $stats = $spl->LoadGenomeData(); >>      my $stats = $spl->LoadGenomeData();
215    
216  Load the Genome, Contig, and Sequence data from FIG into Sprout.  Load the Genome, Contig, and Sequence data from FIG into Sprout.
217    
# Line 255  Line 248 
248      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
249      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
250      my $loadGenome = $self->_TableLoader('Genome');      my $loadGenome = $self->_TableLoader('Genome');
251      my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly);      my $loadHasContig = $self->_TableLoader('HasContig');
252      my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly);      my $loadContig = $self->_TableLoader('Contig');
253      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf');
254      my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly);      my $loadSequence = $self->_TableLoader('Sequence');
255      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
256          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
257      } else {      } else {
258          Trace("Generating genome data.") if T(2);          Trace("Generating genome data.") if T(2);
259            # Get the full info for the FIG genomes.
260            my %genomeInfo = map { $_->[0] => { gname => $_->[1], szdna => $_->[2], maindomain => $_->[3],
261                                                pegs => $_->[4], rnas => $_->[5], complete => $_->[6] } } @{$fig->genome_info()};
262          # Now we loop through the genomes, generating the data for each one.          # Now we loop through the genomes, generating the data for each one.
263          for my $genomeID (sort keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
264              Trace("Generating data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
# Line 274  Line 270 
270              my $extra = join " ", @extraData;              my $extra = join " ", @extraData;
271              # Get the full taxonomy.              # Get the full taxonomy.
272              my $taxonomy = $fig->taxonomy_of($genomeID);              my $taxonomy = $fig->taxonomy_of($genomeID);
273                # Get the version. If no version is specified, we default to the genome ID by itself.
274                my $version = $fig->genome_version($genomeID);
275                if (! defined($version)) {
276                    $version = $genomeID;
277                }
278                # Get the DNA size.
279                my $dnaSize = $fig->genome_szdna($genomeID);
280              # Open the NMPDR group file for this genome.              # Open the NMPDR group file for this genome.
281              my $group;              my $group;
282              if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&              if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
# Line 285  Line 288 
288                  $group = $FIG_Config::otherGroup;                  $group = $FIG_Config::otherGroup;
289              }              }
290              close TMP;              close TMP;
291                # Get the contigs.
292                my @contigs = $fig->all_contigs($genomeID);
293                # Get this genome's info array.
294                my $info = $genomeInfo{$genomeID};
295              # Output the genome record.              # Output the genome record.
296              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $info->{complete}, scalar(@contigs),
297                               $group, $species, $extra, $taxonomy);                               $dnaSize, $genus, $info->{pegs}, $group, $info->{rnas}, $species, $extra, $version, $taxonomy);
298              # Now we loop through each of the genome's contigs.              # Now we loop through each of the genome's contigs.
             my @contigs = $fig->all_contigs($genomeID);  
299              for my $contigID (@contigs) {              for my $contigID (@contigs) {
300                  Trace("Processing contig $contigID for $genomeID.") if T(4);                  Trace("Processing contig $contigID for $genomeID.") if T(4);
301                  $loadContig->Add("contigIn");                  $loadContig->Add("contigIn");
# Line 325  Line 331 
331      return $retVal;      return $retVal;
332  }  }
333    
 =head3 LoadCouplingData  
   
 C<< my $stats = $spl->LoadCouplingData(); >>  
   
 Load the coupling and evidence data from FIG into Sprout.  
   
 The coupling data specifies which genome features are functionally coupled. The  
 evidence data explains why the coupling is functional.  
   
 The following relations are loaded by this method.  
   
     Coupling  
     IsEvidencedBy  
     PCH  
     ParticipatesInCoupling  
     UsesAsEvidence  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadCouplingData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the genome hash.  
     my $genomeFilter = $self->{genomes};  
     # Set up an ID counter for the PCHs.  
     my $pchID = 0;  
     # Start the loads.  
     my $loadCoupling = $self->_TableLoader('Coupling');  
     my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);  
     my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly);  
     my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly);  
     my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly);  
     if ($self->{options}->{loadOnly}) {  
         Trace("Loading from existing files.") if T(2);  
     } else {  
         Trace("Generating coupling data.") if T(2);  
         # Loop through the genomes found.  
         for my $genome (sort keys %{$genomeFilter}) {  
             Trace("Generating coupling data for $genome.") if T(3);  
             $loadCoupling->Add("genomeIn");  
             # Create a hash table for holding coupled pairs. We use this to prevent  
             # duplicates. For example, if A is coupled to B, we don't want to also  
             # assert that B is coupled to A, because we already know it. Fortunately,  
             # all couplings occur within a genome, so we can keep the hash table  
             # size reasonably small.  
             my %dupHash = ();  
             # Get all of the genome's PEGs.  
             my @pegs = $fig->pegs_of($genome);  
             # Loop through the PEGs.  
             for my $peg1 (@pegs) {  
                 $loadCoupling->Add("pegIn");  
                 Trace("Processing PEG $peg1 for $genome.") if T(4);  
                 # Get a list of the coupled PEGs.  
                 my @couplings = $fig->coupled_to($peg1);  
                 # For each coupled PEG, we need to verify that a coupling already  
                 # exists. If not, we have to create one.  
                 for my $coupleData (@couplings) {  
                     my ($peg2, $score) = @{$coupleData};  
                     # Compute the coupling ID.  
                     my $coupleID = $self->{erdb}->CouplingID($peg1, $peg2);  
                     if (! exists $dupHash{$coupleID}) {  
                         $loadCoupling->Add("couplingIn");  
                         # Here we have a new coupling to store in the load files.  
                         Trace("Storing coupling ($coupleID) with score $score.") if T(4);  
                         # Ensure we don't do this again.  
                         $dupHash{$coupleID} = $score;  
                         # Write the coupling record.  
                         $loadCoupling->Put($coupleID, $score);  
                         # Connect it to the coupled PEGs.  
                         $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);  
                         $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);  
                         # Get the evidence for this coupling.  
                         my @evidence = $fig->coupling_evidence($peg1, $peg2);  
                         # Organize the evidence into a hash table.  
                         my %evidenceMap = ();  
                         # Process each evidence item.  
                         for my $evidenceData (@evidence) {  
                             $loadPCH->Add("evidenceIn");  
                             my ($peg3, $peg4, $usage) = @{$evidenceData};  
                             # Only proceed if the evidence is from a Sprout  
                             # genome.  
                             if ($genomeFilter->{$fig->genome_of($peg3)}) {  
                                 $loadUsesAsEvidence->Add("evidenceChosen");  
                                 my $evidenceKey = "$coupleID $peg3 $peg4";  
                                 # We store this evidence in the hash if the usage  
                                 # is nonzero or no prior evidence has been found. This  
                                 # insures that if there is duplicate evidence, we  
                                 # at least keep the meaningful ones. Only evidence in  
                                 # the hash makes it to the output.  
                                 if ($usage || ! exists $evidenceMap{$evidenceKey}) {  
                                     $evidenceMap{$evidenceKey} = $evidenceData;  
                                 }  
                             }  
                         }  
                         for my $evidenceID (keys %evidenceMap) {  
                             # Get the ID for this evidence.  
                             $pchID++;  
                             # Create the evidence record.  
                             my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};  
                             $loadPCH->Put($pchID, $usage);  
                             # Connect it to the coupling.  
                             $loadIsEvidencedBy->Put($coupleID, $pchID);  
                             # Connect it to the features.  
                             $loadUsesAsEvidence->Put($pchID, $peg3, 1);  
                             $loadUsesAsEvidence->Put($pchID, $peg4, 2);  
                         }  
                     }  
                 }  
             }  
         }  
     }  
     # All done. Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
334  =head3 LoadFeatureData  =head3 LoadFeatureData
335    
336  C<< my $stats = $spl->LoadFeatureData(); >>      my $stats = $spl->LoadFeatureData();
337    
338  Load the feature data from FIG into Sprout.  Load the feature data from FIG into Sprout.
339    
# Line 463  Line 343 
343    
344      Feature      Feature
345      FeatureAlias      FeatureAlias
346        IsAliasOf
347      FeatureLink      FeatureLink
348      FeatureTranslation      FeatureTranslation
349      FeatureUpstream      FeatureUpstream
# Line 472  Line 353 
353      FeatureEssential      FeatureEssential
354      FeatureVirulent      FeatureVirulent
355      FeatureIEDB      FeatureIEDB
356        CDD
357        IsPresentOnProteinOf
358        CellLocation
359        IsPossiblePlaceFor
360        ExternalDatabase
361        IsAlsoFoundIn
362        Keyword
363    
364  =over 4  =over 4
365    
# Line 493  Line 381 
381      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
382      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
383      my $loadFeature = $self->_TableLoader('Feature');      my $loadFeature = $self->_TableLoader('Feature');
384      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn');
385      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
386        my $loadIsAliasOf = $self->_TableLoader('IsAliasOf');
387      my $loadFeatureLink = $self->_TableLoader('FeatureLink');      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
388      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
389      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
390      my $loadHasFeature = $self->_TableLoader('HasFeature', $self->PrimaryOnly);      my $loadHasFeature = $self->_TableLoader('HasFeature');
391      my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem', $self->PrimaryOnly);      my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem');
392      my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');      my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
393      my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');      my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
394      my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');      my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
395        my $loadCDD = $self->_TableLoader('CDD');
396        my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');
397        my $loadCellLocation = $self->_TableLoader('CellLocation');
398        my $loadIsPossiblePlaceFor = $self->_TableLoader('IsPossiblePlaceFor');
399        my $loadIsAlsoFoundIn = $self->_TableLoader('IsAlsoFoundIn');
400        my $loadExternalDatabase = $self->_TableLoader('ExternalDatabase');
401        my $loadKeyword = $self->_TableLoader('Keyword');
402      # Get the subsystem hash.      # Get the subsystem hash.
403      my $subHash = $self->{subsystems};      my $subHash = $self->{subsystems};
404        # Get the property keys.
405        my $propKeys = $self->{propKeys};
406        # Create a hashes to hold CDD, Cell Location (PSORT), External Database, and alias values.
407        my %CDD = ();
408        my %alias = ();
409        my %cellLocation = ();
410        my %xdb = ();
411        # Create the bio-words object.
412        my $biowords = BioWords->new(cache => 0);
413        # One of the things we have to do here is build the keyword table, and the keyword
414        # table needs to contain the originating text and feature count for each stem. Unfortunately,
415        # the number of distinct keywords is so large it causes PERL to hang if we try to
416        # keep them in memory. As a result, we need to track them using disk files.
417        # Our approach will be to use two sequential files. One will contain stems and phonexes.
418        # Each time a stem occurs in a feature, a record will be written to that file. The stem
419        # file can then be sorted and collated to determine the number of features for each
420        # stem. A separate file will contain keywords and stems. This last file
421        # will be subjected to a sort unique on stem/keyword. The file is then merged
422        # with the stem file to create the keyword table relation (keyword, stem, phonex, count).
423        my $stemFileName = "$FIG_Config::temp/stems$$.tbl";
424        my $keyFileName = "$FIG_Config::temp/keys$$.tbl";
425        my $stemh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -k1,1 >$stemFileName");
426        my $keyh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -u -k1,1 -k2,2 >$keyFileName");
427      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
428      # locations.      # locations.
429      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
# Line 513  Line 432 
432      } else {      } else {
433          Trace("Generating feature data.") if T(2);          Trace("Generating feature data.") if T(2);
434          # Now we loop through the genomes, generating the data for each one.          # Now we loop through the genomes, generating the data for each one.
435          for my $genomeID (sort keys %{$genomeHash}) {          my @allGenomes = sort keys %{$genomeHash};
436            Trace(scalar(@allGenomes) . " genomes found in list.") if T(3);
437            for my $genomeID (@allGenomes) {
438              Trace("Loading features for genome $genomeID.") if T(3);              Trace("Loading features for genome $genomeID.") if T(3);
439              $loadFeature->Add("genomeIn");              $loadFeature->Add("genomeIn");
440              # Get the feature list for this genome.              # Get the feature list for this genome.
441              my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
442              # Sort and count the list.              # Sort and count the list.
443              my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};              my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
444              my $count = scalar @featureTuples;              my $count = scalar @featureTuples;
445                my @fids = map { $_->[0] } @featureTuples;
446              Trace("$count features found for genome $genomeID.") if T(3);              Trace("$count features found for genome $genomeID.") if T(3);
447                # Get the attributes for this genome and put them in a hash by feature ID.
448                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys);
449                Trace("Looping through features for $genomeID.") if T(3);
450              # Set up for our duplicate-feature check.              # Set up for our duplicate-feature check.
451              my $oldFeatureID = "";              my $oldFeatureID = "";
452              # Loop through the features.              # Loop through the features.
453              for my $featureTuple (@featureTuples) {              for my $featureTuple (@featureTuples) {
454                  # Split the tuple.                  # Split the tuple.
455                  my ($featureID, $locations, undef, $type) = @{$featureTuple};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
456                  # Check for duplicates.                  # Check for duplicates.
457                  if ($featureID eq $oldFeatureID) {                  if ($featureID eq $oldFeatureID) {
458                      Trace("Duplicate feature $featureID found.") if T(1);                      Trace("Duplicate feature $featureID found.") if T(1);
# Line 535  Line 460 
460                      $oldFeatureID = $featureID;                      $oldFeatureID = $featureID;
461                      # Count this feature.                      # Count this feature.
462                      $loadFeature->Add("featureIn");                      $loadFeature->Add("featureIn");
463                        # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
464                        # Sprout database requires a single character.
465                        if (! defined($quality) || $quality eq "") {
466                            $quality = " ";
467                        }
468                      # Begin building the keywords. We start with the genome ID, the                      # Begin building the keywords. We start with the genome ID, the
469                      # feature ID, the taxonomy, and the organism name.                      # feature ID, the taxonomy, and the organism name.
470                      my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),                      my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
471                                      $fig->taxonomy_of($genomeID));                                      $fig->taxonomy_of($genomeID));
                     # Get the functional assignment and aliases. This  
                     # depends on the feature type.  
                     my $assignment;  
                     if ($type eq "peg") {  
                         $assignment = $fig->function_of($featureID);  
472                          # Create the aliases.                          # Create the aliases.
473                          for my $alias ($fig->feature_aliases($featureID)) {                          for my $alias ($fig->feature_aliases($featureID)) {
474                              $loadFeatureAlias->Put($featureID, $alias);                          #Connect this alias to this feature.
475                            $loadIsAliasOf->Put($alias, $featureID);
476                              push @keywords, $alias;                              push @keywords, $alias;
477                            # If this is a locus tag, also add its natural form as a keyword.
478                            my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
479                            if ($naturalName) {
480                                push @keywords, $naturalName;
481                            }
482                            # If this is the first time for the specified alias, create its
483                            # alias record.
484                            if (! exists $alias{$alias}) {
485                                $loadFeatureAlias->Put($alias);
486                                $alias{$alias} = 1;
487                            }
488                        }
489                        # Add the corresponding IDs. We ask for 2-tuples of the form (id, database).
490                        my @corresponders = $fig->get_corresponding_ids($featureID, 1);
491                        for my $tuple (@corresponders) {
492                            my ($id, $xdb) = @{$tuple};
493                            # Ignore SEED: that's us.
494                            if ($xdb ne 'SEED') {
495                                # Connect this ID to the feature.
496                                $loadIsAlsoFoundIn->Put($featureID, $xdb, $id);
497                                # Add it as a keyword.
498                                push @keywords, $id;
499                                # If this is a new database, create a record for it.
500                                if (! exists $xdb{$xdb}) {
501                                    $xdb{$xdb} = 1;
502                                    $loadExternalDatabase->Put($xdb);
503                                }
504                          }                          }
                     } else {  
                         # For other types, the assignment is the first (and ONLY) alias.  
                         ($assignment) = $fig->feature_aliases($featureID);  
505                      }                      }
506                      Trace("Assignment for $featureID is: $assignment") if T(4);                      Trace("Assignment for $featureID is: $assignment") if T(4);
507                      # Break the assignment into words and shove it onto the                      # Break the assignment into words and shove it onto the
# Line 579  Line 529 
529                      }                      }
530                      # Now we need to find the subsystems this feature participates in.                      # Now we need to find the subsystems this feature participates in.
531                      # We also add the subsystems to the keyword list. Before we do that,                      # We also add the subsystems to the keyword list. Before we do that,
532                      # we must convert underscores to spaces and tack on the classifications.                      # we must convert underscores to spaces.
533                      my @subsystems = $fig->peg_to_subsystems($featureID);                      my @subsystems = $fig->peg_to_subsystems($featureID);
534                      for my $subsystem (@subsystems) {                      for my $subsystem (@subsystems) {
535                          # Only proceed if we like this subsystem.                          # Only proceed if we like this subsystem.
# Line 608  Line 558 
558                      # [name, value, value with URL]. (We don't need the PEG, since                      # [name, value, value with URL]. (We don't need the PEG, since
559                      # we already know it.)                      # we already know it.)
560                      my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }                      my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
561                                           $fig->get_attributes($featureID);                                           @{$attributes->{$featureID}};
562                      # Now we process each of the special attributes.                      # Now we process each of the special attributes.
563                      if (SpecialAttribute($featureID, \@attributes,                      if (SpecialAttribute($featureID, \@attributes,
564                                           1, [0,2], '^(essential|potential_essential)$',                                           1, [0,2], '^(essential|potential_essential)$',
# Line 628  Line 578 
578                          push @keywords, 'iedb';                          push @keywords, 'iedb';
579                          $loadFeature->Add('iedb');                          $loadFeature->Add('iedb');
580                      }                      }
581                      # Now we need to bust up hyphenated words in the keyword                      # Now we have some other attributes we need to process. To get
582                      # list. We keep them separate and put them at the end so                      # through them, we convert the attribute list for this feature
583                      # the original word order is available.                      # into a two-layer hash: key => subkey => value.
584                      my $keywordString = "";                      my %attributeHash = ();
585                      my $bustedString = "";                      for my $attrRow (@{$attributes->{$featureID}}) {
586                      for my $keyword (@keywords) {                          my (undef, $key, @values) = @{$attrRow};
587                          if (length $keyword >= 4) {                          my ($realKey, $subKey);
588                              $keywordString .= " $keyword";                          if ($key =~ /^([^:]+)::(.+)/) {
589                              if ($keyword =~ /-/) {                              ($realKey, $subKey) = ($1, $2);
590                                  my @words = grep { length($_) >= 4 } split /-/, $keyword;                          } else {
591                                  $bustedString .= join(" ", "", @words);                              ($realKey, $subKey) = ($key, "");
592                              }                          }
593                            if (exists $attributeHash{$1}) {
594                                $attributeHash{$1}->{$2} = \@values;
595                            } else {
596                                $attributeHash{$1} = {$2 => \@values};
597                            }
598                        }
599                        # First we handle CDD. This is a bit complicated, because
600                        # there are multiple CDDs per protein.
601                        if (exists $attributeHash{CDD}) {
602                            # Get the hash of CDD IDs to scores for this feature. We
603                            # already know it exists because of the above IF.
604                            my $cddHash = $attributeHash{CDD};
605                            my @cddData = sort keys %{$cddHash};
606                            for my $cdd (@cddData) {
607                                # Extract the score for this CDD and decode it.
608                                my ($codeScore) = split(/\s*[,;]\s*/, $cddHash->{$cdd}->[0]);
609                                my $realScore = FIGRules::DecodeScore($codeScore);
610                                # We can't afford to crash because of a bad attribute
611                                # value, hence the IF below.
612                                if (! defined($realScore)) {
613                                    # Bad score, so count it.
614                                    $loadFeature->Add('badCDDscore');
615                                    Trace("CDD score \"$codeScore\" for feature $featureID invalid.") if T(3);
616                                } else {
617                                    # Create the connection.
618                                    $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
619                                    # If this CDD does not yet exist, create its record.
620                                    if (! exists $CDD{$cdd}) {
621                                        $CDD{$cdd} = 1;
622                                        $loadCDD->Put($cdd);
623                                    }
624                                }
625                            }
626                        }
627                        # Next we do PSORT cell locations. here the confidence value
628                        # could have the value "unknown", which we translate to -1.
629                        if (exists $attributeHash{PSORT}) {
630                            # This will be a hash of cell locations to confidence
631                            # factors.
632                            my $psortHash = $attributeHash{PSORT};
633                            for my $psort (keys %{$psortHash}) {
634                                # Get the confidence, and convert it to a number if necessary.
635                                my $confidence = $psortHash->{$psort};
636                                if ($confidence eq 'unknown') {
637                                    $confidence = -1;
638                                }
639                                $loadIsPossiblePlaceFor->Put($psort, $featureID, $confidence);
640                                # If this cell location does not yet exist, create its record.
641                                if (! exists $cellLocation{$psort}) {
642                                    $cellLocation{$psort} = 1;
643                                    $loadCellLocation->Put($psort);
644                                }
645                                # If this is a significant location, add it as a keyword.
646                                if ($confidence > 2.5) {
647                                    push @keywords, $psort;
648                                }
649                            }
650                        }
651                        # Phobius data is next. This consists of the signal peptide location and
652                        # the transmembrane locations.
653                        my $signalList = "";
654                        my $transList = "";
655                        if (exists $attributeHash{Phobius}) {
656                            # This will be a hash of two keys (transmembrane and signal) to
657                            # location strings. If there's no value, we stuff in an empty string.
658                            $signalList = ($attributeHash{Phobius}->{signal} || "");
659                            $transList = ($attributeHash{Phobius}->{transmembrane} || "");
660                        }
661                        # Here are some more numbers: isoelectric point, molecular weight, and
662                        # the similar-to-human flag.
663                        my $isoelectric = 0;
664                        if (exists $attributeHash{isoelectric_point}) {
665                            $isoelectric = $attributeHash{isoelectric_point}->{""};
666                        }
667                        my $similarToHuman = 0;
668                        if (exists $attributeHash{similar_to_human} && $attributeHash{similar_to_human}->{""} eq 'yes') {
669                            $similarToHuman = 1;
670                        }
671                        my $molecularWeight = 0;
672                        if (exists $attributeHash{molecular_weight}) {
673                            $molecularWeight = $attributeHash{molecular_weight}->{""};
674                        }
675                        # Create the keyword string.
676                        my $keywordString = join(" ", @keywords);
677                        Trace("Real keyword string for $featureID: $keywordString.") if T(4);
678                        # Get rid of annoying punctuation.
679                        $keywordString =~ s/[();@#\/]/ /g;
680                        # Get the list of keywords in the keyword string.
681                        my @realKeywords = grep { $biowords->IsWord($_) } $biowords->Split($keywordString);
682                        # We need to do two things here: create the keyword string for the feature table
683                        # and write records to the keyword and stem files. The stuff we write to
684                        # the files will be taken from the following two hashes. The stuff used
685                        # to create the keyword string will be taken from the list.
686                        my (%keys, %stems, @realStems);
687                        for my $keyword (@realKeywords) {
688                            # Compute the stem and phonex for this keyword.
689                            my ($stem, $phonex) = $biowords->StemLookup($keyword);
690                            # Only proceed if a stem comes back. If no stem came back, it's a
691                            # stop word and we throw it away.
692                            if ($stem) {
693                                $keys{$keyword} = $stem;
694                                $stems{$stem} = $phonex;
695                                push @realStems, $stem;
696                          }                          }
697                      }                      }
698                      $keywordString .= $bustedString;                      # Now create the keyword string.
699                      # Get rid of annoying punctuation.                      my $cleanWords = join(" ", @realStems);
                     $keywordString =~ s/[();]//g;  
                     # Clean the keyword list.  
                     my $cleanWords = $sprout->CleanKeywords($keywordString);  
700                      Trace("Keyword string for $featureID: $cleanWords") if T(4);                      Trace("Keyword string for $featureID: $cleanWords") if T(4);
701                      # Create the feature record.                      # Write the stem and keyword records.
702                      $loadFeature->Put($featureID, 1, $type, $assignment, $cleanWords);                      for my $stem (keys %stems) {
703                            Tracer::PutLine($stemh, [$stem, $stems{$stem}]);
704                        }
705                        for my $key (keys %keys) {
706                            # The stem goes first in this file, because we want to sort
707                            # by stem and then keyword.
708                            Tracer::PutLine($keyh, [$keys{$key}, $key]);
709                        }
710                        # Now we need to process the feature's locations. First, we split them up.
711                        my @locationList = split /\s*,\s*/, $locations;
712                        # Next, we convert them to Sprout location objects.
713                        my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
714                        # Assemble them into a sprout location string for later.
715                        my $locationString = join(", ", map { $_->String } @locObjectList);
716                        # We'll store the sequence length in here.
717                        my $sequenceLength = 0;
718                      # This part is the roughest. We need to relate the features to contig                      # This part is the roughest. We need to relate the features to contig
719                      # locations, and the locations must be split so that none of them exceed                      # locations, and the locations must be split so that none of them exceed
720                      # the maximum segment size. This simplifies the genes_in_region processing                      # the maximum segment size. This simplifies the genes_in_region processing
721                      # for Sprout.                      # for Sprout. To start, we create the location position indicator.
                     my @locationList = split /\s*,\s*/, $locations;  
                     # Create the location position indicator.  
722                      my $i = 1;                      my $i = 1;
723                      # Loop through the locations.                      # Loop through the locations.
724                      for my $location (@locationList) {                      for my $locObject (@locObjectList) {
725                          # Parse the location.                          # Record the length.
726                          my $locObject = BasicLocation->new("$genomeID:$location");                          $sequenceLength += $locObject->Length;
727                          # Split it into a list of chunks.                          # Split this location into a list of chunks.
728                          my @locOList = ();                          my @locOList = ();
729                          while (my $peeling = $locObject->Peel($chunkSize)) {                          while (my $peeling = $locObject->Peel($chunkSize)) {
730                              $loadIsLocatedIn->Add("peeling");                              $loadIsLocatedIn->Add("peeling");
# Line 676  Line 739 
739                              $i++;                              $i++;
740                          }                          }
741                      }                      }
742                        # Now we get some ancillary flags.
743                        my $locked = $fig->is_locked_fid($featureID);
744                        my $in_genbank = $fig->peg_in_gendb($featureID);
745                        # Create the feature record.
746                        $loadFeature->Put($featureID, 1, $user, $quality, $type, $in_genbank, $isoelectric, $locked, $molecularWeight,
747                                          $sequenceLength, $signalList, $similarToHuman, $assignment, $cleanWords, $locationString,
748                                          $transList);
749                    }
750                }
751                Trace("Genome $genomeID processed.") if T(3);
752            }
753        }
754        Trace("Sorting keywords.") if T(2);
755        # Now we need to load the keyword table from the key and stem files.
756        close $keyh;
757        close $stemh;
758        Trace("Loading keywords.") if T(2);
759        $keyh = Open(undef, "<$keyFileName");
760        $stemh = Open(undef, "<$stemFileName");
761        # We'll count the keywords in here, for tracing purposes.
762        my $count = 0;
763        # These variables track the current stem's data. When an incoming
764        # keyword's stem changes, these will be recomputed.
765        my ($currentStem, $currentPhonex, $currentCount);
766        # Prime the loop by reading the first stem in the stem file.
767        my ($nextStem, $nextPhonex) = Tracer::GetLine($stemh);
768        # Loop through the keyword file.
769        while (! eof $keyh) {
770            # Read this keyword.
771            my ($thisStem, $thisKey) = Tracer::GetLine($keyh);
772            # Check to see if it's the new stem yet.
773            if ($thisStem ne $currentStem) {
774                # Yes. It's a terrible error if it's not also the next stem.
775                if ($thisStem ne $nextStem) {
776                    Confess("Error in stem file. Expected \"$nextStem\", but found \"$thisStem\".");
777                } else {
778                    # Here we're okay.
779                    ($currentStem, $currentPhonex) = ($nextStem, $nextPhonex);
780                    # Count the number of features for this stem.
781                    $currentCount = 0;
782                    while ($nextStem eq $thisStem) {
783                        ($nextStem, $nextPhonex) = Tracer::GetLine($stemh);
784                        $currentCount++;
785                    }
786                  }                  }
787              }              }
788            # Now $currentStem is the same as $thisStem, and the other $current-vars
789            # contain the stem's data (phonex and count).
790            $loadKeyword->Put($thisKey, $currentCount, $currentPhonex, $currentStem);
791            if (++$count % 1000 == 0 && T(3)) {
792                Trace("$count keywords loaded.");
793          }          }
794      }      }
795        Trace("$count keywords loaded into keyword table.") if T(2);
796      # Finish the loads.      # Finish the loads.
797      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
798      return $retVal;      return $retVal;
# Line 687  Line 800 
800    
801  =head3 LoadSubsystemData  =head3 LoadSubsystemData
802    
803  C<< my $stats = $spl->LoadSubsystemData(); >>      my $stats = $spl->LoadSubsystemData();
804    
805  Load the subsystem data from FIG into Sprout.  Load the subsystem data from FIG into Sprout.
806    
# Line 703  Line 816 
816      SubsystemClass      SubsystemClass
817      Role      Role
818      RoleEC      RoleEC
819        IsIdentifiedByEC
820      SSCell      SSCell
821      ContainsFeature      ContainsFeature
822      IsGenomeOf      IsGenomeOf
# Line 716  Line 830 
830      ConsistsOfGenomes      ConsistsOfGenomes
831      GenomeSubset      GenomeSubset
832      HasGenomeSubset      HasGenomeSubset
     Catalyzes  
833      Diagram      Diagram
834      RoleOccursIn      RoleOccursIn
835        SubsystemHopeNotes
836    
837  =over 4  =over 4
838    
# Line 744  Line 858 
858      # Get the map list.      # Get the map list.
859      my @maps = $fig->all_maps;      my @maps = $fig->all_maps;
860      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
861      my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly);      my $loadDiagram = $self->_TableLoader('Diagram');
862      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn');
863      my $loadSubsystem = $self->_TableLoader('Subsystem');      my $loadSubsystem = $self->_TableLoader('Subsystem');
864      my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly);      my $loadRole = $self->_TableLoader('Role');
865      my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly);      my $loadRoleEC = $self->_TableLoader('RoleEC');
866      my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly);      my $loadIsIdentifiedByEC = $self->_TableLoader('IsIdentifiedByEC');
867      my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly);      my $loadCatalyzes = $self->_TableLoader('Catalyzes');
868      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly);      my $loadSSCell = $self->_TableLoader('SSCell');
869      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature');
870      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf');
871      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly);      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf');
872      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly);      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem');
873      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly);      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn');
874      my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly);      my $loadHasSSCell = $self->_TableLoader('HasSSCell');
875      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly);      my $loadRoleSubset = $self->_TableLoader('RoleSubset');
876      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly);      my $loadGenomeSubset = $self->_TableLoader('GenomeSubset');
877      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly);      my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles');
878      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly);      my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes');
879      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly);      my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');
880      my $loadSubsystemClass = $self->_TableLoader('SubsystemClass', $self->PrimaryOnly);      my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');
881        my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');
882        my $loadSubsystemHopeNotes = $self->_TableLoader('SubsystemHopeNotes');
883      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
884          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
885      } else {      } else {
886          Trace("Generating subsystem data.") if T(2);          Trace("Generating subsystem data.") if T(2);
887          # This hash will contain the role for each EC. When we're done, this          # This hash will contain the roles for each EC. When we're done, this
888          # information will be used to generate the Catalyzes table.          # information will be used to generate the Catalyzes table.
889          my %ecToRoles = ();          my %ecToRoles = ();
890          # Loop through the subsystems. Our first task will be to create the          # Loop through the subsystems. Our first task will be to create the
# Line 782  Line 898 
898              # Get the subsystem object.              # Get the subsystem object.
899              my $sub = $fig->get_subsystem($subsysID);              my $sub = $fig->get_subsystem($subsysID);
900              # Only proceed if the subsystem has a spreadsheet.              # Only proceed if the subsystem has a spreadsheet.
901              if (! $sub->{empty_ss}) {              if (defined($sub) && ! $sub->{empty_ss}) {
902                  Trace("Creating subsystem $subsysID.") if T(3);                  Trace("Creating subsystem $subsysID.") if T(3);
903                  $loadSubsystem->Add("subsystemIn");                  $loadSubsystem->Add("subsystemIn");
904                  # Create the subsystem record.                  # Create the subsystem record.
905                  my $curator = $sub->get_curator();                  my $curator = $sub->get_curator();
906                  my $notes = $sub->get_notes();                  my $notes = $sub->get_notes();
907                  $loadSubsystem->Put($subsysID, $curator, $notes);                  my $version = $sub->get_version();
908                    my $description = $sub->get_description();
909                    $loadSubsystem->Put($subsysID, $curator, $version, $description, $notes);
910                    # Add the hope notes.
911                    my $hopeNotes = $sub->get_hope_curation_notes();
912                    if ($hopeNotes) {
913                        $loadSubsystemHopeNotes->Put($sub, $hopeNotes);
914                    }
915                  # Now for the classification string. This comes back as a list                  # Now for the classification string. This comes back as a list
916                  # reference and we convert it to a space-delimited string.                  # reference and we convert it to a space-delimited string.
917                  my $classList = $fig->subsystem_classification($subsysID);                  my $classList = $fig->subsystem_classification($subsysID);
# Line 796  Line 919 
919                  $loadSubsystemClass->Put($subsysID, $classString);                  $loadSubsystemClass->Put($subsysID, $classString);
920                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
921                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
922                        # Get the role's abbreviation.
923                        my $abbr = $sub->get_role_abbr($col);
924                        # Get its essentiality.
925                        my $aux = $fig->is_aux_role_in_subsystem($subsysID, $roleID);
926                        # Get its reaction note.
927                        my $hope_note = $sub->get_hope_reaction_notes($roleID) || "";
928                      # Connect to this role.                      # Connect to this role.
929                      $loadOccursInSubsystem->Add("roleIn");                      $loadOccursInSubsystem->Add("roleIn");
930                      $loadOccursInSubsystem->Put($roleID, $subsysID, $col);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $aux, $col, $hope_note);
931                      # If it's a new role, add it to the role table.                      # If it's a new role, add it to the role table.
932                      if (! exists $roleData{$roleID}) {                      if (! exists $roleData{$roleID}) {
933                          # Get the role's abbreviation.                          # Get the role's abbreviation.
                         my $abbr = $sub->get_role_abbr($col);  
934                          # Add the role.                          # Add the role.
935                          $loadRole->Put($roleID, $abbr);                          $loadRole->Put($roleID);
936                          $roleData{$roleID} = 1;                          $roleData{$roleID} = 1;
937                          # Check for an EC number.                          # Check for an EC number.
938                          if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) {                          if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
939                              my $ec = $1;                              my $ec = $1;
940                              $loadRoleEC->Put($roleID, $ec);                              $loadIsIdentifiedByEC->Put($roleID, $ec);
941                              $ecToRoles{$ec} = $roleID;                              # Check to see if this is our first encounter with this EC.
942                                if (exists $ecToRoles{$ec}) {
943                                    # No, so just add this role to the EC list.
944                                    push @{$ecToRoles{$ec}}, $roleID;
945                                } else {
946                                    # Output this EC.
947                                    $loadRoleEC->Put($ec);
948                                    # Create its role list.
949                                    $ecToRoles{$ec} = [$roleID];
950                                }
951                          }                          }
952                      }                      }
953                  }                  }
# Line 923  Line 1060 
1060              # Now we need to link all the map's roles to it.              # Now we need to link all the map's roles to it.
1061              # A hash is used to prevent duplicates.              # A hash is used to prevent duplicates.
1062              my %roleHash = ();              my %roleHash = ();
1063              for my $role ($fig->map_to_ecs($map)) {              for my $ec ($fig->map_to_ecs($map)) {
1064                  if (exists $ecToRoles{$role} && ! $roleHash{$role}) {                  if (exists $ecToRoles{$ec}) {
1065                      $loadRoleOccursIn->Put($ecToRoles{$role}, $map);                      for my $role (@{$ecToRoles{$ec}}) {
1066                            if (! $roleHash{$role}) {
1067                                $loadRoleOccursIn->Put($role, $map);
1068                      $roleHash{$role} = 1;                      $roleHash{$role} = 1;
1069                  }                  }
1070              }              }
1071          }          }
         # Before we leave, we must create the Catalyzes table. We start with the reactions,  
         # then use the "ecToRoles" table to convert EC numbers to role IDs.  
         my @reactions = $fig->all_reactions();  
         for my $reactionID (@reactions) {  
             # Get this reaction's list of roles. The results will be EC numbers.  
             my @roles = $fig->catalyzed_by($reactionID);  
             # Loop through the roles, creating catalyzation records.  
             for my $thisRole (@roles) {  
                 if (exists $ecToRoles{$thisRole}) {  
                     $loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID);  
                 }  
1072              }              }
1073          }          }
1074      }      }
# Line 951  Line 1079 
1079    
1080  =head3 LoadPropertyData  =head3 LoadPropertyData
1081    
1082  C<< my $stats = $spl->LoadPropertyData(); >>      my $stats = $spl->LoadPropertyData();
1083    
1084  Load the attribute data from FIG into Sprout.  Load the attribute data from FIG into Sprout.
1085    
# Line 987  Line 1115 
1115      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1116      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1117      my $loadProperty = $self->_TableLoader('Property');      my $loadProperty = $self->_TableLoader('Property');
1118      my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly);      my $loadHasProperty = $self->_TableLoader('HasProperty');
1119      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1120          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1121      } else {      } else {
# Line 995  Line 1123 
1123          # Create a hash for storing property IDs.          # Create a hash for storing property IDs.
1124          my %propertyKeys = ();          my %propertyKeys = ();
1125          my $nextID = 1;          my $nextID = 1;
1126            # Get the attributes we intend to store in the property table.
1127            my $propKeys = $self->{propKeys};
1128          # Loop through the genomes.          # Loop through the genomes.
1129          for my $genomeID (sort keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
1130              $loadProperty->Add("genomeIn");              $loadProperty->Add("genomeIn");
1131              Trace("Generating properties for $genomeID.") if T(3);              Trace("Generating properties for $genomeID.") if T(3);
1132              # Get the genome's features. The feature ID is the first field in the              # Initialize a counter.
             # tuples returned by "all_features_detailed". We use "all_features_detailed"  
             # rather than "all_features" because we want all features regardless of type.  
             my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};  
             my $featureCount = 0;  
1133              my $propertyCount = 0;              my $propertyCount = 0;
1134              # Loop through the features, creating HasProperty records.              # Get the properties for this genome's features.
1135              for my $fid (@features) {              my @attributes = $fig->get_attributes("fig|$genomeID%", $propKeys);
1136                  # Get all attributes for this feature. We do this one feature at a time              Trace("Property list built for $genomeID.") if T(3);
1137                  # to insure we do not get any genome attributes.              # Loop through the results, creating HasProperty records.
1138                  my @attributeList = $fig->get_attributes($fid);              for my $attributeData (@attributes) {
1139                  # Add essentiality and virulence attributes.                  # Pull apart the attribute tuple.
1140                  if ($fig->essential($fid)) {                  my ($fid, $key, $value, $url) = @{$attributeData};
                     push @attributeList, [$fid, 'essential', 1, ''];  
                 }  
                 if ($fig->virulent($fid)) {  
                     push @attributeList, [$fid, 'virulent', 1, ''];  
                 }  
                 if (scalar @attributeList) {  
                     $featureCount++;  
                 }  
                 # Loop through the attributes.  
                 for my $tuple (@attributeList) {  
                     $propertyCount++;  
                     # Get this attribute value's data. Note that we throw away the FID,  
                     # since it will always be the same as the value if "$fid".  
                     my (undef, $key, $value, $url) = @{$tuple};  
1141                      # Concatenate the key and value and check the "propertyKeys" hash to                      # Concatenate the key and value and check the "propertyKeys" hash to
1142                      # see if we already have an ID for it. We use a tab for the separator                      # see if we already have an ID for it. We use a tab for the separator
1143                      # character.                      # character.
# Line 1043  Line 1155 
1155                      # Create the HasProperty entry for this feature/property association.                      # Create the HasProperty entry for this feature/property association.
1156                      $loadHasProperty->Put($fid, $propertyID, $url);                      $loadHasProperty->Put($fid, $propertyID, $url);
1157                  }                  }
             }  
1158              # Update the statistics.              # Update the statistics.
1159              Trace("$propertyCount attributes processed for $featureCount features.") if T(3);              Trace("$propertyCount attributes processed.") if T(3);
             $loadHasProperty->Add("featuresIn", $featureCount);  
1160              $loadHasProperty->Add("propertiesIn", $propertyCount);              $loadHasProperty->Add("propertiesIn", $propertyCount);
1161          }          }
1162      }      }
# Line 1057  Line 1167 
1167    
1168  =head3 LoadAnnotationData  =head3 LoadAnnotationData
1169    
1170  C<< my $stats = $spl->LoadAnnotationData(); >>      my $stats = $spl->LoadAnnotationData();
1171    
1172  Load the annotation data from FIG into Sprout.  Load the annotation data from FIG into Sprout.
1173    
# Line 1091  Line 1201 
1201      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1202      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1203      my $loadAnnotation = $self->_TableLoader('Annotation');      my $loadAnnotation = $self->_TableLoader('Annotation');
1204      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation');
1205      my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly);      my $loadSproutUser = $self->_TableLoader('SproutUser');
1206      my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly);      my $loadUserAccess = $self->_TableLoader('UserAccess');
1207      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation');
1208      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1209          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1210      } else {      } else {
# Line 1164  Line 1274 
1274    
1275  =head3 LoadSourceData  =head3 LoadSourceData
1276    
1277  C<< my $stats = $spl->LoadSourceData(); >>      my $stats = $spl->LoadSourceData();
1278    
1279  Load the source data from FIG into Sprout.  Load the source data from FIG into Sprout.
1280    
# Line 1198  Line 1308 
1308      # Get the genome hash.      # Get the genome hash.
1309      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1310      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1311      my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly);      my $loadComesFrom = $self->_TableLoader('ComesFrom');
1312      my $loadSource = $self->_TableLoader('Source');      my $loadSource = $self->_TableLoader('Source');
1313      my $loadSourceURL = $self->_TableLoader('SourceURL');      my $loadSourceURL = $self->_TableLoader('SourceURL');
1314      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
# Line 1242  Line 1352 
1352    
1353  =head3 LoadExternalData  =head3 LoadExternalData
1354    
1355  C<< my $stats = $spl->LoadExternalData(); >>      my $stats = $spl->LoadExternalData();
1356    
1357  Load the external data from FIG into Sprout.  Load the external data from FIG into Sprout.
1358    
# Line 1322  Line 1432 
1432    
1433  =head3 LoadReactionData  =head3 LoadReactionData
1434    
1435  C<< my $stats = $spl->LoadReactionData(); >>      my $stats = $spl->LoadReactionData();
1436    
1437  Load the reaction data from FIG into Sprout.  Load the reaction data from FIG into Sprout.
1438    
# Line 1335  Line 1445 
1445      Compound      Compound
1446      CompoundName      CompoundName
1447      CompoundCAS      CompoundCAS
1448        IsIdentifiedByCAS
1449        HasCompoundName
1450      IsAComponentOf      IsAComponentOf
1451        Scenario
1452        Catalyzes
1453        HasScenario
1454        IsInputFor
1455        IsOutputOf
1456        ExcludesReaction
1457        IncludesReaction
1458        IsOnDiagram
1459        IncludesReaction
1460    
1461  This method proceeds reaction by reaction rather than genome by genome.  This method proceeds reaction by reaction rather than genome by genome.
1462    
# Line 1356  Line 1477 
1477      my $fig = $self->{fig};      my $fig = $self->{fig};
1478      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1479      my $loadReaction = $self->_TableLoader('Reaction');      my $loadReaction = $self->_TableLoader('Reaction');
1480      my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly);      my $loadReactionURL = $self->_TableLoader('ReactionURL');
1481      my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly);      my $loadCompound = $self->_TableLoader('Compound');
1482      my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly);      my $loadCompoundName = $self->_TableLoader('CompoundName');
1483      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly);      my $loadCompoundCAS = $self->_TableLoader('CompoundCAS');
1484      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly);      my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf');
1485        my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS');
1486        my $loadHasCompoundName = $self->_TableLoader('HasCompoundName');
1487        my $loadScenario = $self->_TableLoader('Scenario');
1488        my $loadHasScenario = $self->_TableLoader('HasScenario');
1489        my $loadIsInputFor = $self->_TableLoader('IsInputFor');
1490        my $loadIsOutputOf = $self->_TableLoader('IsOutputOf');
1491        my $loadIsOnDiagram = $self->_TableLoader('IsOnDiagram');
1492        my $loadIncludesReaction = $self->_TableLoader('IncludesReaction');
1493        my $loadExcludesReaction = $self->_TableLoader('ExcludesReaction');
1494        my $loadCatalyzes = $self->_TableLoader('Catalyzes');
1495      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1496          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1497      } else {      } else {
1498          Trace("Generating annotation data.") if T(2);          Trace("Generating reaction data.") if T(2);
1499            # We need some hashes to prevent duplicates.
1500            my %compoundNames = ();
1501            my %compoundCASes = ();
1502          # First we create the compounds.          # First we create the compounds.
1503          my @compounds = $fig->all_compounds();          my %compounds = map { $_ => 1 } $fig->all_compounds();
1504          for my $cid (@compounds) {          for my $cid (keys %compounds) {
1505              # Check for names.              # Check for names.
1506              my @names = $fig->names_of_compound($cid);              my @names = $fig->names_of_compound($cid);
1507              # Each name will be given a priority number, starting with 1.              # Each name will be given a priority number, starting with 1.
1508              my $prio = 1;              my $prio = 1;
1509              for my $name (@names) {              for my $name (@names) {
1510                  $loadCompoundName->Put($cid, $name, $prio++);                  if (! exists $compoundNames{$name}) {
1511                        $loadCompoundName->Put($name);
1512                        $compoundNames{$name} = 1;
1513                    }
1514                    $loadHasCompoundName->Put($cid, $name, $prio++);
1515              }              }
1516              # Create the main compound record. Note that the first name              # Create the main compound record. Note that the first name
1517              # becomes the label.              # becomes the label.
# Line 1382  Line 1520 
1520              # Check for a CAS ID.              # Check for a CAS ID.
1521              my $cas = $fig->cas($cid);              my $cas = $fig->cas($cid);
1522              if ($cas) {              if ($cas) {
1523                  $loadCompoundCAS->Put($cid, $cas);                  $loadIsIdentifiedByCAS->Put($cid, $cas);
1524                    if (! exists $compoundCASes{$cas}) {
1525                        $loadCompoundCAS->Put($cas);
1526                        $compoundCASes{$cas} = 1;
1527                    }
1528              }              }
1529          }          }
1530          # All the compounds are set up, so we need to loop through the reactions next. First,          # All the compounds are set up, so we need to loop through the reactions next. First,
1531          # we initialize the discriminator index. This is a single integer used to insure          # we initialize the discriminator index. This is a single integer used to insure
1532          # duplicate elements in a reaction are not accidentally collapsed.          # duplicate elements in a reaction are not accidentally collapsed.
1533          my $discrim = 0;          my $discrim = 0;
1534          my @reactions = $fig->all_reactions();          my %reactions = map { $_ => 1 } $fig->all_reactions();
1535          for my $reactionID (@reactions) {          for my $reactionID (keys %reactions) {
1536              # Create the reaction record.              # Create the reaction record.
1537              $loadReaction->Put($reactionID, $fig->reversible($reactionID));              $loadReaction->Put($reactionID, $fig->reversible($reactionID));
1538              # Compute the reaction's URL.              # Compute the reaction's URL.
# Line 1413  Line 1555 
1555                  }                  }
1556              }              }
1557          }          }
1558            # Now we run through the subsystems and roles, generating the scenarios
1559            # and connecting the reactions. We'll need some hashes to prevent
1560            # duplicates and a counter for compound group keys.
1561            my %roles = ();
1562            my %scenarios = ();
1563            my @subsystems = $fig->all_subsystems();
1564            for my $subName (@subsystems) {
1565                my $sub = $fig->get_subsystem($subName);
1566                Trace("Processing $subName reactions.") if T(3);
1567                # Get the subsystem's reactions.
1568                my %reactions = $sub->get_hope_reactions();
1569                # Loop through the roles, connecting them to the reactions.
1570                for my $role (keys %reactions) {
1571                    # Only process this role if it is new.
1572                    if (! $roles{$role}) {
1573                        $roles{$role} = 1;
1574                        my @reactions = @{$reactions{$role}};
1575                        for my $reaction (@reactions) {
1576                            $loadCatalyzes->Put($role, $reaction);
1577                        }
1578                    }
1579                }
1580                Trace("Processing $subName scenarios.") if T(3);
1581                # Get the subsystem's scenarios.
1582                my @scenarioNames = $sub->get_hope_scenario_names();
1583                # Loop through the scenarios, creating scenario data.
1584                for my $scenarioName (@scenarioNames) {
1585                    # Link this scenario to this subsystem.
1586                    $loadHasScenario->Put($subName, $scenarioName);
1587                    # If this scenario is new, we need to create it.
1588                    if (! $scenarios{$scenarioName}) {
1589                        Trace("Creating scenario $scenarioName.") if T(3);
1590                        $scenarios{$scenarioName} = 1;
1591                        # Create the scenario itself.
1592                        $loadScenario->Put($scenarioName);
1593                        # Attach the input compounds.
1594                        for my $input ($sub->get_hope_input_compounds($scenarioName)) {
1595                            $loadIsInputFor->Put($input, $scenarioName);
1596                        }
1597                        # Now we need to set up the output compounds. They come in two
1598                        # groups, which we mark 0 and 1.
1599                        my $outputGroup = 0;
1600                        # Set up the output compounds.
1601                        for my $outputGroup ($sub->get_hope_output_compounds($scenarioName)) {
1602                            # Attach the compounds.
1603                            for my $compound (@$outputGroup) {
1604                                $loadIsOutputOf->Put($scenarioName, $compound, $outputGroup);
1605                            }
1606                        }
1607                        # Create the reaction lists.
1608                        my @addReactions = $sub->get_hope_additional_reactions($scenarioName);
1609                        for my $reaction (@addReactions) {
1610                            $loadIncludesReaction->Put($scenarioName, $reaction);
1611                        }
1612                        my @notReactions = $sub->get_hope_ignore_reactions($scenarioName);
1613                        for my $reaction (@notReactions) {
1614                            $loadExcludesReaction->Put($scenarioName, $reaction);
1615                        }
1616                        # Link the maps.
1617                        my @maps = $sub->get_hope_map_ids($scenarioName);
1618                        for my $map (@maps) {
1619                            $loadIsOnDiagram->Put($scenarioName, "map$map");
1620                        }
1621                    }
1622      }      }
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
1623  }  }
   
 =head3 LoadGroupData  
   
 C<< my $stats = $spl->LoadGroupData(); >>  
   
 Load the genome Groups into Sprout.  
   
 The following relations are loaded by this method.  
   
     GenomeGroups  
   
 Currently, we do not use groups. We used to use them for NMPDR groups,  
 butThere is no direct support for genome groups in FIG, so we access the SEED  
 files directly.  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadGroupData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the genome hash.  
     my $genomeHash = $self->{genomes};  
     # Create a load object for the table we're loading.  
     my $loadGenomeGroups = $self->_TableLoader('GenomeGroups');  
     if ($self->{options}->{loadOnly}) {  
         Trace("Loading from existing files.") if T(2);  
     } else {  
         Trace("Generating group data.") if T(2);  
         # Currently there are no groups.  
1624      }      }
1625      # Finish the load.      # Finish the load.
1626      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 1465  Line 1629 
1629    
1630  =head3 LoadSynonymData  =head3 LoadSynonymData
1631    
1632  C<< my $stats = $spl->LoadSynonymData(); >>      my $stats = $spl->LoadSynonymData();
1633    
1634  Load the synonym groups into Sprout.  Load the synonym groups into Sprout.
1635    
# Line 1504  Line 1668 
1668          Trace("Generating synonym group data.") if T(2);          Trace("Generating synonym group data.") if T(2);
1669          # Get the database handle.          # Get the database handle.
1670          my $dbh = $fig->db_handle();          my $dbh = $fig->db_handle();
1671          # Ask for the synonyms.          # Ask for the synonyms. Note that "maps_to" is a group name, and "syn_id" is a PEG ID or alias.
1672          my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");          my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
1673          my $result = $sth->execute();          my $result = $sth->execute();
1674          if (! defined($result)) {          if (! defined($result)) {
1675              Confess("Database error in Synonym load: " . $sth->errstr());              Confess("Database error in Synonym load: " . $sth->errstr());
1676          } else {          } else {
1677                Trace("Processing synonym results.") if T(2);
1678              # Remember the current synonym.              # Remember the current synonym.
1679              my $current_syn = "";              my $current_syn = "";
1680              # Count the features.              # Count the features.
1681              my $featureCount = 0;              my $featureCount = 0;
1682                my $entryCount = 0;
1683              # Loop through the synonym/peg pairs.              # Loop through the synonym/peg pairs.
1684              while (my @row = $sth->fetchrow()) {              while (my @row = $sth->fetchrow()) {
1685                  # Get the synonym ID and feature ID.                  # Get the synonym group ID and feature ID.
1686                  my ($syn_id, $peg) = @row;                  my ($syn_id, $peg) = @row;
1687                    # Count this row.
1688                    $entryCount++;
1689                    if ($entryCount % 1000 == 0) {
1690                        Trace("$entryCount rows processed.") if T(3);
1691                    }
1692                  # Insure it's for one of our genomes.                  # Insure it's for one of our genomes.
1693                  my $genomeID = FIG::genome_of($peg);                  my $genomeID = FIG::genome_of($peg);
1694                  if (exists $genomeHash->{$genomeID}) {                  if (exists $genomeHash->{$genomeID}) {
# Line 1536  Line 1707 
1707                      }                      }
1708                  }                  }
1709              }              }
1710                Trace("$entryCount rows produced $featureCount features.") if T(2);
1711          }          }
1712      }      }
1713      # Finish the load.      # Finish the load.
# Line 1545  Line 1717 
1717    
1718  =head3 LoadFamilyData  =head3 LoadFamilyData
1719    
1720  C<< my $stats = $spl->LoadFamilyData(); >>      my $stats = $spl->LoadFamilyData();
1721    
1722  Load the protein families into Sprout.  Load the protein families into Sprout.
1723    
# Line 1613  Line 1785 
1785    
1786  =head3 LoadDrugData  =head3 LoadDrugData
1787    
1788  C<< my $stats = $spl->LoadDrugData(); >>      my $stats = $spl->LoadDrugData();
1789    
1790  Load the drug target data into Sprout.  Load the drug target data into Sprout.
1791    
1792  The following relations are loaded by this method.  The following relations are loaded by this method.
1793    
     DrugProject  
     ContainsTopic  
     DrugTopic  
     ContainsAnalysisOf  
1794      PDB      PDB
1795      IncludesBound      DocksWith
1796      IsBoundIn      IsProteinForFeature
     BindsWith  
1797      Ligand      Ligand
     DescribesProteinForFeature  
     FeatureConservation  
1798    
1799  The source information for these relations is taken from flat files in the  The source information for these relations is taken from attributes. The
1800  C<$FIG_Config::drug_directory>. The file C<master_tables.list> contains  C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>.
1801  a list of drug project names paired with file names. The named file (in the  The C<zinc_name> attribute describes the ligands. The C<docking_results>
1802  same directory) contains all the data for the project.  attribute contains the information for the B<DocksWith> relationship. It is
1803    expected that additional attributes and tables will be added in the future.
1804    
1805  =over 4  =over 4
1806    
# Line 1654  Line 1820 
1820      # Get the genome hash.      # Get the genome hash.
1821      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1822      # Create load objects for the tables we're loading.      # Create load objects for the tables we're loading.
     my $loadDrugProject = $self->_TableLoader('DrugProject');  
     my $loadContainsTopic = $self->_TableLoader('ContainsTopic');  
     my $loadDrugTopic = $self->_TableLoader('DrugTopic');  
     my $loadContainsAnalysisOf = $self->_TableLoader('ContainsAnalysisOf');  
1823      my $loadPDB = $self->_TableLoader('PDB');      my $loadPDB = $self->_TableLoader('PDB');
     my $loadIncludesBound = $self->_TableLoader('IncludesBound');  
     my $loadIsBoundIn = $self->_TableLoader('IsBoundIn');  
     my $loadBindsWith = $self->_TableLoader('BindsWith');  
1824      my $loadLigand = $self->_TableLoader('Ligand');      my $loadLigand = $self->_TableLoader('Ligand');
1825      my $loadDescribesProteinForFeature = $self->_TableLoader('DescribesProteinForFeature');      my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature');
1826      my $loadFeatureConservation = $self->_TableLoader('FeatureConservation');      my $loadDocksWith = $self->_TableLoader('DocksWith');
1827      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1828          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1829      } else {      } else {
1830          Trace("Generating drug target data.") if T(2);          Trace("Generating drug target data.") if T(2);
1831          # Load the project list. The file comes in as a list of chomped lines,          # First comes the "DocksWith" relationship. This will give us a list of PDBs.
1832          # and we split them on the TAB character to make the project name the          # We can also encounter PDBs when we process "IsProteinForFeature". To manage
1833          # key and the file name the value of the resulting hash.          # this process, PDB information is collected in a hash table and then
1834          my %projects = map { split /\t/, $_ } Tracer::GetFile("$FIG_Config::drug_directory/master_tables.list");          # unspooled after both relationships are created.
1835          # Create hashes for the derived objects: PDBs, Features, and Ligands. These objects          my %pdbHash = ();
1836          # may occur multiple times in a single project file or even in multiple project          Trace("Generating docking data.") if T(2);
1837          # files.          # Get all the docking data. This may cause problems if there are too many PDBs,
1838          my %ligands = ();          # at which point we'll need another algorithm. The indicator that this is
1839          my %pdbs = ();          # happening will be a timeout error in the next statement.
1840          my %features = ();          my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
1841          my %bindings = ();                                                ['docking_results', $FIG_Config::dockLimit]);
1842          # Set up a counter for drug topics. This will be used as the key.          Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
1843          my $topicCounter = 0;          for my $dockData (@dockData) {
1844          # Loop through the projects. We sort the keys not because we need them sorted, but              # Get the docking data components.
1845          # because it makes it easier to infer our progress from trace messages.              my ($pdbID, $docking_key, @valueData) = @{$dockData};
1846          for my $project (sort keys %projects) {              # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
1847              Trace("Processing project $project.") if T(3);              $pdbID = lc $pdbID;
1848              # Only proceed if the download file exists.              # Strip off the object type.
1849              my $projectFile = "$FIG_Config::drug_directory/$projects{$project}";              $pdbID =~ s/pdb://;
1850              if (! -f $projectFile) {              # Extract the ZINC ID from the docking key. Note that there are two possible
1851                  Trace("Project file $projectFile not found.") if T(0);              # formats.
1852              } else {              my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/;
1853                  # Create the project record.              if (! $zinc_id) {
1854                  $loadDrugProject->Put($project);                  Trace("Invalid docking result key $docking_key for $pdbID.") if T(0);
1855                  # Create a hash for the topics. Each project has one or more topics. The                  $loadDocksWith->Add("errors");
1856                  # topic is identified by a URL, a category, and an identifier.              } else {
1857                  my %topics = ();                  # Get the pieces of the value and parse the energy.
1858                  # Now we can open the project file.                  # Note that we don't care about the rank, since
1859                  Trace("Reading project file $projectFile.") if T(3);                  # we can sort on the energy level itself in our database.
1860                  Open(\*PROJECT, "<$projectFile");                  my ($energy, $tool, $type) = @valueData;
1861                  # Get the first record, which is a list of column headers. We don't use this                  my ($rank, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
1862                  # for anything, but it may be useful for debugging.                  # Ignore predicted results.
1863                  my $headerLine = <PROJECT>;                  if ($type ne "Predicted") {
1864                  # Loop through the rest of the records.                      # Count this docking result.
1865                  while (! eof PROJECT) {                      if (! exists $pdbHash{$pdbID}) {
1866                      # Get the current line of data. Note that not all lines will have all                          $pdbHash{$pdbID} = 1;
1867                      # the fields. In particular, the CLIBE data is fairly rare.                      } else {
1868                      my ($authorOrganism, $category, $tag, $refURL, $peg, $conservation,                          $pdbHash{$pdbID}++;
1869                          $pdbBound, $pdbBoundEval, $pdbFree, $pdbFreeEval, $pdbFreeTitle,                      }
1870                          $protDistInfo, $passAspInfo, $passAspFile, $passWeightInfo,                      # Write the result to the output.
1871                          $passWeightFile, $clibeInfo, $clibeURL, $clibeTotalEnergy,                      $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool,
1872                          $clibeVanderwaals, $clibeHBonds, $clibeEI, $clibeSolvationE)                                          $total, $vanderwaals);
                        = Tracer::GetLine(\*PROJECT);  
                     # The tag contains an identifier for the current line of data followed  
                     # by a text statement that generally matches a property name in the  
                     # main database. We split it up, since the identifier goes with  
                     # the PDB data and the text statement is part of the topic.  
                     my ($lineID, $topicTag) = split /\s*,\s*/, $tag;  
                     $loadDrugProject->Add("data line");  
                     # Check for a new topic.  
                     my $topicData = "$category\t$topicTag\t$refURL";  
                     if (! exists $topics{$topicData}) {  
                         # Here we have a new topic. Compute its ID.  
                         $topicCounter++;  
                         $topics{$topicData} = $topicCounter;  
                         # Create its database record.  
                         $loadDrugTopic->Put($topicCounter, $refURL, $category, $authorOrganism,  
                                             $topicTag);  
                         # Connect it to the project.  
                         $loadContainsTopic->Put($project, $topicCounter);  
                         $loadDrugTopic->Add("topic");  
                     }  
                     # Now we know the topic ID exists in the hash and the topic will  
                     # appear in the database, so we get this topic's ID.  
                     my $topicID = $topics{$topicData};  
                     # If the feature in this line is new, we need to save its conservation  
                     # number.  
                     if (! exists $features{$peg}) {  
                         $loadFeatureConservation->Put($peg, $conservation);  
                         $features{$peg} = 1;  
                     }  
                     # Now we have two PDBs to deal with-- a bound PDB and a free PDB.  
                     # The free PDB will have data about docking points; the bound PDB  
                     # will have data about docking. We store both types as PDBs, and  
                     # the special data comes from relationships. First we process the  
                     # bound PDB.  
                     if ($pdbBound) {  
                         $loadPDB->Add("bound line");  
                         # Insure this PDB is in the database.  
                         $self->CreatePDB($pdbBound, lc "$pdbFreeTitle (bound)", "bound", \%pdbs, $loadPDB);  
                         # Connect it to this topic.  
                         $loadIncludesBound->Put($topicID, $pdbBound);  
                         # Check for CLIBE data.  
                         if ($clibeInfo) {  
                             $loadLigand->Add("clibes");  
                             # We have CLIBE data, so we create a ligand and relate it to the PDB.  
                             if (! exists $ligands{$clibeInfo}) {  
                                 # This is a new ligand, so create its record.  
                                 $loadLigand->Put($clibeInfo);  
                                 $loadLigand->Add("ligand");  
                                 # Make sure we know this ligand already exists.  
                                 $ligands{$clibeInfo} = 1;  
                             }  
                             # Now connect the PDB to the ligand using the CLIBE data.  
                             $loadBindsWith->Put($pdbBound, $clibeInfo, $clibeURL, $clibeHBonds, $clibeEI,  
                                                 $clibeSolvationE, $clibeVanderwaals);  
                         }  
                         # Connect this PDB to the feature.  
                         $loadDescribesProteinForFeature->Put($pdbBound, $peg, $protDistInfo, $pdbBoundEval);  
                     }  
                     # Next is the free PDB.  
                     if ($pdbFree) {  
                         $loadPDB->Add("free line");  
                         # Insure this PDB is in the database.  
                         $self->CreatePDB($pdbFree, lc $pdbFreeTitle, "free", \%pdbs, $loadPDB);  
                         # Connect it to this topic.  
                         $loadContainsAnalysisOf->Put($topicID, $pdbFree, $passAspInfo,  
                                                      $passWeightFile, $passWeightInfo, $passAspFile);  
                         # Connect this PDB to the feature.  
                         $loadDescribesProteinForFeature->Put($pdbFree, $peg, $protDistInfo, $pdbFreeEval);  
                     }  
                     # If we have both PDBs, we may need to link them.  
                     if ($pdbFree && $pdbBound) {  
                         $loadIsBoundIn->Add("connection");  
                         # Insure we only link them once.  
                         my $bindingKey =  "$pdbFree\t$pdbBound";  
                         if (! exists $bindings{$bindingKey}) {  
                             $loadIsBoundIn->Add("newConnection");  
                             $loadIsBoundIn->Put($pdbFree, $pdbBound);  
                             $bindings{$bindingKey} = 1;  
1873                          }                          }
1874                      }                      }
1875                  }                  }
1876                  # Close off this project.          Trace("Connecting features.") if T(2);
1877                  close PROJECT;          # Loop through the genomes.
1878            for my $genome (sort keys %{$genomeHash}) {
1879                Trace("Generating PDBs for $genome.") if T(3);
1880                # Get all of the PDBs that BLAST against this genome's features.
1881                my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%');
1882                for my $pdbData (@attributeData) {
1883                    # The PDB ID is coded as a subkey.
1884                    if ($pdbData->[1] !~ /PDB::(.+)/i) {
1885                        Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0);
1886                        $loadPDB->Add("errors");
1887                    } else {
1888                        my $pdbID = $1;
1889                        # Insure the PDB is in the hash.
1890                        if (! exists $pdbHash{$pdbID}) {
1891                            $pdbHash{$pdbID} = 0;
1892                        }
1893                        # The score and locations are coded in the attribute value.
1894                        if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
1895                            Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0);
1896                            $loadIsProteinForFeature->Add("errors");
1897                        } else {
1898                            my ($score, $locData) = ($1,$2);
1899                            # The location data may not be present, so we have to start with some
1900                            # defaults and then check.
1901                            my ($start, $end) = (1, 0);
1902                            if ($locData) {
1903                                $locData =~ /(\d+)-(\d+)/;
1904                                $start = $1;
1905                                $end = $2;
1906                            }
1907                            # If we still don't have the end location, compute it from
1908                            # the feature length.
1909                            if (! $end) {
1910                                # Most features have one location, but we do a list iteration
1911                                # just in case.
1912                                my @locations = $fig->feature_location($pdbData->[0]);
1913                                $end = 0;
1914                                for my $loc (@locations) {
1915                                    my $locObject = BasicLocation->new($loc);
1916                                    $end += $locObject->Length;
1917                                }
1918                            }
1919                            # Decode the score.
1920                            my $realScore = FIGRules::DecodeScore($score);
1921                            # Connect the PDB to the feature.
1922                            $loadIsProteinForFeature->Put($pdbID, $pdbData->[0], $start, $realScore, $end);
1923                        }
1924                    }
1925                }
1926            }
1927            # We've got all our PDBs now, so we unspool them from the hash.
1928            Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2);
1929            my $count = 0;
1930            for my $pdbID (sort keys %pdbHash) {
1931                $loadPDB->Put($pdbID, $pdbHash{$pdbID});
1932                $count++;
1933                Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0);
1934            }
1935            # Finally we create the ligand table. This information can be found in the
1936            # zinc_name attribute.
1937            Trace("Loading ligands.") if T(2);
1938            # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
1939            my $last_zinc_id = "";
1940            my $zinc_id = "";
1941            my $done = 0;
1942            while (! $done) {
1943                # Get the next 10000 ligands. We insist that the object ID is greater than
1944                # the last ID we processed.
1945                Trace("Loading batch starting with ZINC:$zinc_id.") if T(3);
1946                my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
1947                                                           ["ZINC:$zinc_id", "zinc_name"]);
1948                Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
1949                if (! @attributeData) {
1950                    # Here there are no attributes left, so we quit the loop.
1951                    $done = 1;
1952                } else {
1953                    # Process the attribute data we've received.
1954                    for my $zinc_data (@attributeData) {
1955                        # The ZINC ID is found in the first return column, prefixed with the word ZINC.
1956                        if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
1957                            $zinc_id = $1;
1958                            # Check for a duplicate.
1959                            if ($zinc_id eq $last_zinc_id) {
1960                                $loadLigand->Add("duplicate");
1961                            } else {
1962                                # Here it's safe to output the ligand. The ligand name is the attribute value
1963                                # (third column in the row).
1964                                $loadLigand->Put($zinc_id, $zinc_data->[2]);
1965                                # Insure we don't try to add this ID again.
1966                                $last_zinc_id = $zinc_id;
1967                            }
1968                        } else {
1969                            Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
1970                            $loadLigand->Add("errors");
1971                        }
1972              }              }
1973          }          }
1974      }      }
1975            Trace("Ligands loaded.") if T(2);
1976        }
1977      # Finish the load.      # Finish the load.
1978      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1979      return $retVal;      return $retVal;
# Line 1807  Line 1984 
1984    
1985  =head3 SpecialAttribute  =head3 SpecialAttribute
1986    
1987  C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >>      my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader);
1988    
1989  Look for special attributes of a given type. A special attribute is found by comparing one of  Look for special attributes of a given type. A special attribute is found by comparing one of
1990  the columns of the incoming attribute list to a search pattern. If a match is found, then  the columns of the incoming attribute list to a search pattern. If a match is found, then
# Line 1883  Line 2060 
2060      return $retVal;      return $retVal;
2061  }  }
2062    
 =head3 CreatePDB  
   
 C<< $loader->CreatePDB($pdbID, $title, $type, \%pdbHash); >>  
   
 Insure that a PDB record exists for the identified PDB. If one does not exist, it will be  
 created.  
   
 =over 4  
   
 =item pdbID  
   
 ID string (usually an unqualified file name) for the desired PDB.  
   
 =item title  
   
 Title to use if the PDB must be created.  
   
 =item type  
   
 Type of PDB: C<free> or C<bound>  
   
 =item pdbHash  
   
 Hash containing the IDs of PDBs that have already been created.  
   
 =item pdbLoader  
   
 Load object for the PDB table.  
   
 =back  
   
 =cut  
   
 sub CreatePDB {  
     # Get the parameters.  
     my ($self, $pdbID, $title, $type, $pdbHash, $pdbLoader) = @_;  
     $pdbLoader->Add("PDB check");  
     # Check to see if this is a new PDB.  
     if (! exists $pdbHash->{$pdbID}) {  
         # It is, so we create it.  
         $pdbLoader->Put($pdbID, $title, $type);  
         $pdbHash->{$pdbID} = 1;  
         # Count it.  
         $pdbLoader->Add("PDB-$type");  
     }  
 }  
   
2063  =head3 TableLoader  =head3 TableLoader
2064    
2065  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 1944  Line 2074 
2074    
2075  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
2076    
 =item ignore  
   
 TRUE if the table should be ignored entirely, else FALSE.  
   
2077  =item RETURN  =item RETURN
2078    
2079  Returns an ERDBLoad object for loading the specified table.  Returns an ERDBLoad object for loading the specified table.
# Line 1958  Line 2084 
2084    
2085  sub _TableLoader {  sub _TableLoader {
2086      # Get the parameters.      # Get the parameters.
2087      my ($self, $tableName, $ignore) = @_;      my ($self, $tableName) = @_;
2088      # Create the load object.      # Create the load object.
2089      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly,      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly);
                                $ignore);  
2090      # Cache it in the loader list.      # Cache it in the loader list.
2091      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
2092      # Return it to the caller.      # Return it to the caller.
# Line 2033  Line 2158 
2158      return $retVal;      return $retVal;
2159  }  }
2160    
2161    =head3 GetGenomeAttributes
2162    
2163        my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys);
2164    
2165    Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
2166    attributes for all the features of a genome in a single call, then organizes them into
2167    a hash.
2168    
2169    =over 4
2170    
2171    =item fig
2172    
2173    FIG-like object for accessing attributes.
2174    
2175    =item genomeID
2176    
2177    ID of the genome who's attributes are desired.
2178    
2179    =item fids
2180    
2181    Reference to a list of the feature IDs whose attributes are to be kept.
2182    
2183    =item propKeys
2184    
2185    A list of the keys to retrieve.
2186    
2187    =item RETURN
2188    
2189    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
2190    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
2191    the attribute key, and one or more attribute values.
2192    
2193    =back
2194    
2195    =cut
2196    
2197    sub GetGenomeAttributes {
2198        # Get the parameters.
2199        my ($fig, $genomeID, $fids, $propKeys) = @_;
2200        # Declare the return variable.
2201        my $retVal = {};
2202        # Initialize the hash. This not only enables us to easily determine which FIDs to
2203        # keep, it insures that the caller sees a list reference for every known fid,
2204        # simplifying the logic.
2205        for my $fid (@{$fids}) {
2206            $retVal->{$fid} = [];
2207        }
2208        # Get the attributes. If ev_code_cron is running, we may get a timeout error, so
2209        # an eval is used.
2210        my @aList = ();
2211        eval {
2212            @aList = $fig->get_attributes("fig|$genomeID%", $propKeys);
2213            Trace(scalar(@aList) . " attributes returned for genome $genomeID.") if T(3);
2214        };
2215        # Check for a problem.
2216        if ($@) {
2217            Trace("Retrying attributes for $genomeID due to error: $@") if T(1);
2218            # Our fallback plan is to process the attributes in blocks of 100. This is much slower,
2219            # but allows us to continue processing.
2220            my $nFids = scalar @{$fids};
2221            for (my $i = 0; $i < $nFids; $i += 100) {
2222                # Determine the index of the last feature ID we'll be specifying on this pass.
2223                # Normally it's $i + 99, but if we're close to the end it may be less.
2224                my $end = ($i + 100 > $nFids ? $nFids - 1 : $i + 99);
2225                # Get a slice of the fid list.
2226                my @slice = @{$fids}[$i .. $end];
2227                # Get the relevant attributes.
2228                Trace("Retrieving attributes for fids $i to $end.") if T(3);
2229                my @aShort = $fig->get_attributes(\@slice, $propKeys);
2230                Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(3);
2231                push @aList, @aShort;
2232            }
2233        }
2234        # Now we should have all the interesting attributes in @aList. Populate the hash with
2235        # them.
2236        for my $aListEntry (@aList) {
2237            my $fid = $aListEntry->[0];
2238            if (exists $retVal->{$fid}) {
2239                push @{$retVal->{$fid}}, $aListEntry;
2240            }
2241        }
2242        # Return the result.
2243        return $retVal;
2244    }
2245    
2246    
2247  1;  1;

Legend:
Removed from v.1.79  
changed lines
  Added in v.1.93

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3