[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.12, Fri Sep 16 02:09:47 2005 UTC revision 1.84, Thu May 17 23:44:51 2007 UTC
# Line 10  Line 10 
10      use Sprout;      use Sprout;
11      use Stats;      use Stats;
12      use BasicLocation;      use BasicLocation;
13        use HTML;
14    
15  =head1 Sprout Load Methods  =head1 Sprout Load Methods
16    
# Line 29  Line 30 
30      $stats->Accumulate($spl->LoadFeatureData());      $stats->Accumulate($spl->LoadFeatureData());
31      print $stats->Show();      print $stats->Show();
32    
 This module makes use of the internal Sprout property C<_erdb>.  
   
33  It is worth noting that the FIG object does not need to be a real one. Any object  It is worth noting that the FIG object does not need to be a real one. Any object
34  that implements the FIG methods for data retrieval could be used. So, for example,  that implements the FIG methods for data retrieval could be used. So, for example,
35  this object could be used to copy data from one Sprout database to another, or  this object could be used to copy data from one Sprout database to another, or
# Line 79  Line 78 
78  =item subsysFile  =item subsysFile
79    
80  Either the name of the file containing the list of trusted subsystems or a reference  Either the name of the file containing the list of trusted subsystems or a reference
81  to a list of subsystem names. If nothing is specified, all known subsystems will be  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
82  considered trusted. Only subsystem data related to the trusted subsystems is loaded.  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
83    in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded.
84    
85  =item options  =item options
86    
# Line 93  Line 93 
93  sub new {  sub new {
94      # Get the parameters.      # Get the parameters.
95      my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;      my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
96      # Load the list of genomes into a hash.      # Create the genome hash.
97      my %genomes;      my %genomes = ();
98        # We only need it if load-only is NOT specified.
99        if (! $options->{loadOnly}) {
100      if (! defined($genomeFile) || $genomeFile eq '') {      if (! defined($genomeFile) || $genomeFile eq '') {
101          # Here we want all the complete genomes and an access code of 1.          # Here we want all the complete genomes and an access code of 1.
102          my @genomeList = $fig->genomes(1);          my @genomeList = $fig->genomes(1);
# Line 118  Line 120 
120                  # an omitted access code can be defaulted to 1.                  # an omitted access code can be defaulted to 1.
121                  for my $genomeLine (@genomeList) {                  for my $genomeLine (@genomeList) {
122                      my ($genomeID, $accessCode) = split("\t", $genomeLine);                      my ($genomeID, $accessCode) = split("\t", $genomeLine);
123                      if (undef $accessCode) {                          if (! defined($accessCode)) {
124                          $accessCode = 1;                          $accessCode = 1;
125                      }                      }
126                      $genomes{$genomeID} = $accessCode;                      $genomes{$genomeID} = $accessCode;
# Line 128  Line 130 
130              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
131          }          }
132      }      }
133        }
134      # Load the list of trusted subsystems.      # Load the list of trusted subsystems.
135      my %subsystems = ();      my %subsystems = ();
136        # We only need it if load-only is NOT specified.
137        if (! $options->{loadOnly}) {
138      if (! defined $subsysFile || $subsysFile eq '') {      if (! defined $subsysFile || $subsysFile eq '') {
139          # Here we want all the subsystems.              # Here we want all the usable subsystems. First we get the whole list.
140          %subsystems = map { $_ => 1 } $fig->all_subsystems();              my @subs = $fig->all_subsystems();
141                # Loop through, checking for the NMPDR file.
142                for my $sub (@subs) {
143                    if ($fig->nmpdr_subsystem($sub)) {
144                        $subsystems{$sub} = 1;
145                    }
146                }
147      } else {      } else {
148          my $type = ref $subsysFile;          my $type = ref $subsysFile;
149          if ($type eq 'ARRAY') {          if ($type eq 'ARRAY') {
# Line 152  Line 163 
163              Confess("Invalid subsystem parameter in SproutLoad constructor.");              Confess("Invalid subsystem parameter in SproutLoad constructor.");
164          }          }
165      }      }
166            # Go through the subsys hash again, creating the keyword list for each subsystem.
167            for my $subsystem (keys %subsystems) {
168                my $name = $subsystem;
169                $name =~ s/_/ /g;
170                my $classes = $fig->subsystem_classification($subsystem);
171                $name .= " " . join(" ", @{$classes});
172                $subsystems{$subsystem} = $name;
173            }
174        }
175      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
176      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
177      # Create the Sprout load object.      # Create the Sprout load object.
# Line 161  Line 181 
181                    subsystems => \%subsystems,                    subsystems => \%subsystems,
182                    sprout => $sprout,                    sprout => $sprout,
183                    loadDirectory => $directory,                    loadDirectory => $directory,
184                    erdb => $sprout->{_erdb},                    erdb => $sprout,
185                    loaders => [],                    loaders => [],
186                    options => $options                    options => $options
187                   };                   };
# Line 170  Line 190 
190      return $retVal;      return $retVal;
191  }  }
192    
193    =head3 LoadOnly
194    
195    C<< my $flag = $spl->LoadOnly; >>
196    
197    Return TRUE if we are in load-only mode, else FALSE.
198    
199    =cut
200    
201    sub LoadOnly {
202        my ($self) = @_;
203        return $self->{options}->{loadOnly};
204    }
205    
206    =head3 PrimaryOnly
207    
208    C<< my $flag = $spl->PrimaryOnly; >>
209    
210    Return TRUE if only the main entity is to be loaded, else FALSE.
211    
212    =cut
213    
214    sub PrimaryOnly {
215        my ($self) = @_;
216        return $self->{options}->{primaryOnly};
217    }
218    
219  =head3 LoadGenomeData  =head3 LoadGenomeData
220    
221  C<< my $stats = $spl->LoadGenomeData(); >>  C<< my $stats = $spl->LoadGenomeData(); >>
# Line 197  Line 243 
243    
244  =back  =back
245    
 B<TO DO>  
   
 Real quality vectors instead of C<unknown> for everything.  
   
 GenomeGroup relation. (The original script took group information from the C<NMPDR> file  
 in each genome's main directory, but no such file exists anywhere in my version of the  
 data store.)  
   
246  =cut  =cut
247  #: Return Type $%;  #: Return Type $%;
248  sub LoadGenomeData {  sub LoadGenomeData {
# Line 215  Line 253 
253      # Get the genome count.      # Get the genome count.
254      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
255      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
     Trace("Beginning genome data load.") if T(2);  
256      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
257      my $loadGenome = $self->_TableLoader('Genome', $genomeCount);      my $loadGenome = $self->_TableLoader('Genome');
258      my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300);      my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly);
259      my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300);      my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly);
260      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly);
261      my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000);      my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly);
262        if ($self->{options}->{loadOnly}) {
263            Trace("Loading from existing files.") if T(2);
264        } else {
265            Trace("Generating genome data.") if T(2);
266      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
267      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
268          Trace("Loading data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
269          $loadGenome->Add("genomeIn");          $loadGenome->Add("genomeIn");
270          # The access code comes in via the genome hash.          # The access code comes in via the genome hash.
271          my $accessCode = $genomeHash->{$genomeID};          my $accessCode = $genomeHash->{$genomeID};
272          # Get the genus, species, and strain from the scientific name. Note that we append              # Get the genus, species, and strain from the scientific name.
         # the genome ID to the strain. In some cases this is the totality of the strain name.  
273          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);
274          my $extra = join " ", @extraData, "[$genomeID]";              my $extra = join " ", @extraData;
275          # Get the full taxonomy.          # Get the full taxonomy.
276          my $taxonomy = $fig->taxonomy_of($genomeID);          my $taxonomy = $fig->taxonomy_of($genomeID);
277                # Get the version. If no version is specified, we default to the genome ID by itself.
278                my $version = $fig->genome_version($genomeID);
279                if (! defined($version)) {
280                    $version = $genomeID;
281                }
282                # Get the DNA size.
283                my $dnaSize = $fig->genome_szdna($genomeID);
284                # Open the NMPDR group file for this genome.
285                my $group;
286                if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
287                    defined($group = <TMP>)) {
288                    # Clean the line ending.
289                    chomp $group;
290                } else {
291                    # No group, so use the default.
292                    $group = $FIG_Config::otherGroup;
293                }
294                close TMP;
295          # Output the genome record.          # Output the genome record.
296          $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),
297                           $species, $extra, $taxonomy);                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);
298          # Now we loop through each of the genome's contigs.          # Now we loop through each of the genome's contigs.
299          my @contigs = $fig->all_contigs($genomeID);          my @contigs = $fig->all_contigs($genomeID);
300          for my $contigID (@contigs) {          for my $contigID (@contigs) {
# Line 267  Line 325 
325              }              }
326          }          }
327      }      }
328        }
329      # Finish the loads.      # Finish the loads.
330      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
331      # Return the result.      # Return the result.
# Line 307  Line 366 
366      my $fig = $self->{fig};      my $fig = $self->{fig};
367      # Get the genome hash.      # Get the genome hash.
368      my $genomeFilter = $self->{genomes};      my $genomeFilter = $self->{genomes};
369      my $genomeCount = (keys %{$genomeFilter});      # Set up an ID counter for the PCHs.
370      my $featureCount = $genomeCount * 4000;      my $pchID = 0;
371      # Start the loads.      # Start the loads.
372      my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount);      my $loadCoupling = $self->_TableLoader('Coupling');
373      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000);      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);
374      my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000);      my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly);
375      my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000);      my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly);
376      my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000);      my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly);
377      Trace("Beginning coupling data load.") if T(2);      if ($self->{options}->{loadOnly}) {
378            Trace("Loading from existing files.") if T(2);
379        } else {
380            Trace("Generating coupling data.") if T(2);
381      # Loop through the genomes found.      # Loop through the genomes found.
382      for my $genome (sort keys %{$genomeFilter}) {      for my $genome (sort keys %{$genomeFilter}) {
383          Trace("Generating coupling data for $genome.") if T(3);          Trace("Generating coupling data for $genome.") if T(3);
# Line 339  Line 401 
401              for my $coupleData (@couplings) {              for my $coupleData (@couplings) {
402                  my ($peg2, $score) = @{$coupleData};                  my ($peg2, $score) = @{$coupleData};
403                  # Compute the coupling ID.                  # Compute the coupling ID.
404                  my $coupleID = Sprout::CouplingID($peg1, $peg2);                      my $coupleID = $self->{erdb}->CouplingID($peg1, $peg2);
405                  if (! exists $dupHash{$coupleID}) {                  if (! exists $dupHash{$coupleID}) {
406                      $loadCoupling->Add("couplingIn");                      $loadCoupling->Add("couplingIn");
407                      # Here we have a new coupling to store in the load files.                      # Here we have a new coupling to store in the load files.
# Line 367  Line 429 
429                              # We store this evidence in the hash if the usage                              # We store this evidence in the hash if the usage
430                              # is nonzero or no prior evidence has been found. This                              # is nonzero or no prior evidence has been found. This
431                              # insures that if there is duplicate evidence, we                              # insures that if there is duplicate evidence, we
432                              # at least keep the meaningful ones. Only evidence is                                  # at least keep the meaningful ones. Only evidence in
433                              # the hash makes it to the output.                              # the hash makes it to the output.
434                              if ($usage || ! exists $evidenceMap{$evidenceKey}) {                              if ($usage || ! exists $evidenceMap{$evidenceKey}) {
435                                  $evidenceMap{$evidenceKey} = $evidenceData;                                  $evidenceMap{$evidenceKey} = $evidenceData;
# Line 375  Line 437 
437                          }                          }
438                      }                      }
439                      for my $evidenceID (keys %evidenceMap) {                      for my $evidenceID (keys %evidenceMap) {
440                                # Get the ID for this evidence.
441                                $pchID++;
442                          # Create the evidence record.                          # Create the evidence record.
443                          my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};                          my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};
444                          $loadPCH->Put($evidenceID, $usage);                              $loadPCH->Put($pchID, $usage);
445                          # Connect it to the coupling.                          # Connect it to the coupling.
446                          $loadIsEvidencedBy->Put($coupleID, $evidenceID);                              $loadIsEvidencedBy->Put($coupleID, $pchID);
447                          # Connect it to the features.                          # Connect it to the features.
448                          $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);                              $loadUsesAsEvidence->Put($pchID, $peg3, 1);
449                          $loadUsesAsEvidence->Put($evidenceID, $peg4, 1);                              $loadUsesAsEvidence->Put($pchID, $peg4, 2);
450                            }
451                      }                      }
452                  }                  }
453              }              }
# Line 409  Line 474 
474      FeatureTranslation      FeatureTranslation
475      FeatureUpstream      FeatureUpstream
476      IsLocatedIn      IsLocatedIn
477        HasFeature
478        HasRoleInSubsystem
479        FeatureEssential
480        FeatureVirulent
481        FeatureIEDB
482    
483  =over 4  =over 4
484    
# Line 423  Line 493 
493  sub LoadFeatureData {  sub LoadFeatureData {
494      # Get this object instance.      # Get this object instance.
495      my ($self) = @_;      my ($self) = @_;
496      # Get the FIG object.      # Get the FIG and Sprout objects.
497      my $fig = $self->{fig};      my $fig = $self->{fig};
498      # Find out if this is a limited run.      my $sprout = $self->{sprout};
     my $limited = $self->{options}->{limitedFeatures};  
499      # Get the table of genome IDs.      # Get the table of genome IDs.
500      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
501      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
502      my $loadFeature = $self->_TableLoader('Feature', $featureCount);      my $loadFeature = $self->_TableLoader('Feature');
503      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly);
504      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
505      my ($loadFeatureLink, $loadFeatureTranslation, $loadFeatureUpstream);      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
506      if (! $limited) {      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
507          $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
508          $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);      my $loadHasFeature = $self->_TableLoader('HasFeature', $self->PrimaryOnly);
509          $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);      my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem', $self->PrimaryOnly);
510      }      my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
511        my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
512        my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
513        # Get the subsystem hash.
514        my $subHash = $self->{subsystems};
515      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
516      # locations.      # locations.
517      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
518      Trace("Beginning feature data load.") if T(2);      if ($self->{options}->{loadOnly}) {
519            Trace("Loading from existing files.") if T(2);
520        } else {
521            Trace("Generating feature data.") if T(2);
522      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
523      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
524          Trace("Loading features for genome $genomeID.") if T(3);          Trace("Loading features for genome $genomeID.") if T(3);
525          $loadFeature->Add("genomeIn");          $loadFeature->Add("genomeIn");
526          # Get the feature list for this genome.          # Get the feature list for this genome.
527          my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
528                # Sort and count the list.
529                my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
530                my $count = scalar @featureTuples;
531                my @fids = map { $_->[0] } @featureTuples;
532                Trace("$count features found for genome $genomeID.") if T(3);
533                # Get the attributes for this genome and put them in a hash by feature ID.
534                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids);
535                # Set up for our duplicate-feature check.
536                my $oldFeatureID = "";
537          # Loop through the features.          # Loop through the features.
538          for my $featureData (@{$features}) {              for my $featureTuple (@featureTuples) {
             $loadFeature->Add("featureIn");  
539              # Split the tuple.              # Split the tuple.
540              my ($featureID, $locations, undef, $type) = @{$featureData};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
541              # Create the feature record.                  # Check for duplicates.
542              $loadFeature->Put($featureID, 1, $type);                  if ($featureID eq $oldFeatureID) {
543                        Trace("Duplicate feature $featureID found.") if T(1);
544                    } else {
545                        $oldFeatureID = $featureID;
546                        # Count this feature.
547                        $loadFeature->Add("featureIn");
548                        # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
549                        # Sprout database requires a single character.
550                        if (! defined($quality) || $quality eq "") {
551                            $quality = " ";
552                        }
553                        # Begin building the keywords. We start with the genome ID, the
554                        # feature ID, the taxonomy, and the organism name.
555                        my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
556                                        $fig->taxonomy_of($genomeID));
557              # Create the aliases.              # Create the aliases.
558              for my $alias ($fig->feature_aliases($featureID)) {              for my $alias ($fig->feature_aliases($featureID)) {
559                  $loadFeatureAlias->Put($featureID, $alias);                  $loadFeatureAlias->Put($featureID, $alias);
560                            push @keywords, $alias;
561              }              }
562              # The next stuff is for a full load only.                      Trace("Assignment for $featureID is: $assignment") if T(4);
563              if (! $limited) {                      # Break the assignment into words and shove it onto the
564                        # keyword list.
565                        push @keywords, split(/\s+/, $assignment);
566                        # Link this feature to the parent genome.
567                        $loadHasFeature->Put($genomeID, $featureID, $type);
568                  # Get the links.                  # Get the links.
569                  my @links = $fig->fid_links($featureID);                  my @links = $fig->fid_links($featureID);
570                  for my $link (@links) {                  for my $link (@links) {
# Line 482  Line 583 
583                          $loadFeatureUpstream->Put($featureID, $upstream);                          $loadFeatureUpstream->Put($featureID, $upstream);
584                      }                      }
585                  }                  }
586              }                      # Now we need to find the subsystems this feature participates in.
587                        # We also add the subsystems to the keyword list. Before we do that,
588                        # we must convert underscores to spaces and tack on the classifications.
589                        my @subsystems = $fig->peg_to_subsystems($featureID);
590                        for my $subsystem (@subsystems) {
591                            # Only proceed if we like this subsystem.
592                            if (exists $subHash->{$subsystem}) {
593                                # Store the has-role link.
594                                $loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type);
595                                # Save the subsystem's keyword data.
596                                my $subKeywords = $subHash->{$subsystem};
597                                push @keywords, split /\s+/, $subKeywords;
598                                # Now we need to get this feature's role in the subsystem.
599                                my $subObject = $fig->get_subsystem($subsystem);
600                                my @roleColumns = $subObject->get_peg_roles($featureID);
601                                my @allRoles = $subObject->get_roles();
602                                for my $col (@roleColumns) {
603                                    my $role = $allRoles[$col];
604                                    push @keywords, split /\s+/, $role;
605                                    push @keywords, $subObject->get_role_abbr($col);
606                                }
607                            }
608                        }
609                        # There are three special attributes computed from property
610                        # data that we build next. If the special attribute is non-empty,
611                        # its name will be added to the keyword list. First, we get all
612                        # the attributes for this feature. They will come back as
613                        # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
614                        # [name, value, value with URL]. (We don't need the PEG, since
615                        # we already know it.)
616                        my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
617                                             @{$attributes->{$featureID}};
618                        # Now we process each of the special attributes.
619                        if (SpecialAttribute($featureID, \@attributes,
620                                             1, [0,2], '^(essential|potential_essential)$',
621                                             $loadFeatureEssential)) {
622                            push @keywords, 'essential';
623                            $loadFeature->Add('essential');
624                        }
625                        if (SpecialAttribute($featureID, \@attributes,
626                                             0, [2], '^virulen',
627                                             $loadFeatureVirulent)) {
628                            push @keywords, 'virulent';
629                            $loadFeature->Add('virulent');
630                        }
631                        if (SpecialAttribute($featureID, \@attributes,
632                                             0, [0,2], '^iedb_',
633                                             $loadFeatureIEDB)) {
634                            push @keywords, 'iedb';
635                            $loadFeature->Add('iedb');
636                        }
637                        # Now we need to bust up hyphenated words in the keyword
638                        # list. We keep them separate and put them at the end so
639                        # the original word order is available.
640                        my $keywordString = "";
641                        my $bustedString = "";
642                        for my $keyword (@keywords) {
643                            if (length $keyword >= 3) {
644                                $keywordString .= " $keyword";
645                                if ($keyword =~ /-/) {
646                                    my @words = split /-/, $keyword;
647                                    $bustedString .= join(" ", "", @words);
648                                }
649                            }
650                        }
651                        $keywordString .= $bustedString;
652                        # Get rid of annoying punctuation.
653                        $keywordString =~ s/[();]//g;
654                        # Clean the keyword list.
655                        my $cleanWords = $sprout->CleanKeywords($keywordString);
656                        Trace("Keyword string for $featureID: $cleanWords") if T(4);
657                        # Create the feature record.
658                        $loadFeature->Put($featureID, 1, $user, $quality, $type, $assignment, $cleanWords);
659              # This part is the roughest. We need to relate the features to contig              # This part is the roughest. We need to relate the features to contig
660              # locations, and the locations must be split so that none of them exceed              # locations, and the locations must be split so that none of them exceed
661              # the maximum segment size. This simplifies the genes_in_region processing              # the maximum segment size. This simplifies the genes_in_region processing
# Line 511  Line 684 
684              }              }
685          }          }
686      }      }
     # Finish the loads.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
 =head3 LoadBBHData  
   
 C<< my $stats = $spl->LoadBBHData(); >>  
   
 Load the bidirectional best hit data from FIG into Sprout.  
   
 Sprout does not store information on similarities. Instead, it has only the  
 bi-directional best hits. Even so, the BBH table is one of the largest in  
 the database.  
   
 The following relations are loaded by this method.  
   
     IsBidirectionalBestHitOf  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadBBHData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the table of genome IDs.  
     my $genomeHash = $self->{genomes};  
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf',  
                                                            $featureCount * $genomeCount);  
     Trace("Beginning BBH load.") if T(2);  
     # Now we loop through the genomes, generating the data for each one.  
     for my $genomeID (sort keys %{$genomeHash}) {  
         $loadIsBidirectionalBestHitOf->Add("genomeIn");  
         Trace("Processing features for genome $genomeID.") if T(3);  
         # Get the feature list for this genome.  
         my $features = $fig->all_features_detailed($genomeID);  
         # Loop through the features.  
         for my $featureData (@{$features}) {  
             # Split the tuple.  
             my ($featureID, $locations, $aliases, $type) = @{$featureData};  
             # Get the bi-directional best hits.  
             my @bbhList = $fig->bbhs($featureID);  
             for my $bbhEntry (@bbhList) {  
                 # Get the target feature ID and the score.  
                 my ($targetID, $score) = @{$bbhEntry};  
                 # Check the target feature's genome.  
                 my $targetGenomeID = $fig->genome_of($targetID);  
                 # Only proceed if it's one of our genomes.  
                 if ($genomeHash->{$targetGenomeID}) {  
                     $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,  
                                                        $score);  
                 }  
             }  
687          }          }
688      }      }
689      # Finish the loads.      # Finish the loads.
# Line 598  Line 706 
706  The following relations are loaded by this method.  The following relations are loaded by this method.
707    
708      Subsystem      Subsystem
709        SubsystemClass
710      Role      Role
711        RoleEC
712      SSCell      SSCell
713      ContainsFeature      ContainsFeature
714      IsGenomeOf      IsGenomeOf
# Line 606  Line 716 
716      OccursInSubsystem      OccursInSubsystem
717      ParticipatesIn      ParticipatesIn
718      HasSSCell      HasSSCell
719        ConsistsOfRoles
720        RoleSubset
721        HasRoleSubset
722        ConsistsOfGenomes
723        GenomeSubset
724        HasGenomeSubset
725        Catalyzes
726        Diagram
727        RoleOccursIn
728    
729  =over 4  =over 4
730    
# Line 615  Line 734 
734    
735  =back  =back
736    
 B<TO DO>  
   
 Generate RoleName table?  
   
737  =cut  =cut
738  #: Return Type $%;  #: Return Type $%;
739  sub LoadSubsystemData {  sub LoadSubsystemData {
# Line 632  Line 747 
747      # Get the subsystem hash. This lists the subsystems we'll process.      # Get the subsystem hash. This lists the subsystems we'll process.
748      my $subsysHash = $self->{subsystems};      my $subsysHash = $self->{subsystems};
749      my @subsysIDs = sort keys %{$subsysHash};      my @subsysIDs = sort keys %{$subsysHash};
750      my $subsysCount = @subsysIDs;      # Get the map list.
751      my $genomeCount = (keys %{$genomeHash});      my @maps = $fig->all_maps;
     my $featureCount = $genomeCount * 4000;  
752      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
753      my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);      my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly);
754      my $loadRole = $self->_TableLoader('Role', $featureCount * 6);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly);
755      my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);      my $loadSubsystem = $self->_TableLoader('Subsystem');
756      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);      my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly);
757      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);      my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly);
758      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount);      my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly);
759      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);      my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly);
760      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly);
761      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly);
762      Trace("Beginning subsystem data load.") if T(2);      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly);
763        my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly);
764        my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly);
765        my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly);
766        my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly);
767        my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly);
768        my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly);
769        my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly);
770        my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly);
771        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly);
772        my $loadSubsystemClass = $self->_TableLoader('SubsystemClass', $self->PrimaryOnly);
773        if ($self->{options}->{loadOnly}) {
774            Trace("Loading from existing files.") if T(2);
775        } else {
776            Trace("Generating subsystem data.") if T(2);
777            # This hash will contain the role for each EC. When we're done, this
778            # information will be used to generate the Catalyzes table.
779            my %ecToRoles = ();
780      # Loop through the subsystems. Our first task will be to create the      # Loop through the subsystems. Our first task will be to create the
781      # roles. We do this by looping through the subsystems and creating a      # roles. We do this by looping through the subsystems and creating a
782      # role hash. The hash tracks each role ID so that we don't create      # role hash. The hash tracks each role ID so that we don't create
783      # duplicates. As we move along, we'll connect the roles and subsystems.          # duplicates. As we move along, we'll connect the roles and subsystems
784            # and memorize up the reactions.
785            my ($genomeID, $roleID);
786      my %roleData = ();      my %roleData = ();
787      for my $subsysID (@subsysIDs) {      for my $subsysID (@subsysIDs) {
788                # Get the subsystem object.
789                my $sub = $fig->get_subsystem($subsysID);
790                # Only proceed if the subsystem has a spreadsheet.
791                if (defined($sub) && ! $sub->{empty_ss}) {
792          Trace("Creating subsystem $subsysID.") if T(3);          Trace("Creating subsystem $subsysID.") if T(3);
793          $loadSubsystem->Add("subsystemIn");          $loadSubsystem->Add("subsystemIn");
794          # Create the subsystem record.          # Create the subsystem record.
795          $loadSubsystem->Put($subsysID);                  my $curator = $sub->get_curator();
796          # Get the subsystem's roles.                  my $notes = $sub->get_notes();
797          my @roles = $fig->subsystem_to_roles($subsysID);                  $loadSubsystem->Put($subsysID, $curator, $notes);
798          # Connect the roles to the subsystem. If a role is new, we create                  # Now for the classification string. This comes back as a list
799          # a role record for it.                  # reference and we convert it to a space-delimited string.
800          for my $roleID (@roles) {                  my $classList = $fig->subsystem_classification($subsysID);
801                    my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
802                    $loadSubsystemClass->Put($subsysID, $classString);
803                    # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
804                    for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
805                        # Connect to this role.
806              $loadOccursInSubsystem->Add("roleIn");              $loadOccursInSubsystem->Add("roleIn");
807              $loadOccursInSubsystem->Put($roleID, $subsysID);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $col);
808                        # If it's a new role, add it to the role table.
809              if (! exists $roleData{$roleID}) {              if (! exists $roleData{$roleID}) {
810                  $loadRole->Put($roleID);                          # Get the role's abbreviation.
811                            my $abbr = $sub->get_role_abbr($col);
812                            # Add the role.
813                            $loadRole->Put($roleID, $abbr);
814                  $roleData{$roleID} = 1;                  $roleData{$roleID} = 1;
815                            # Check for an EC number.
816                            if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) {
817                                my $ec = $1;
818                                $loadRoleEC->Put($roleID, $ec);
819                                $ecToRoles{$ec} = $roleID;
820                            }
821              }              }
822          }          }
823          # Now all roles for this subsystem have been filled in. We create the                  # Now we create the spreadsheet for the subsystem by matching roles to
824          # spreadsheet by matches roles to genomes. To do this, we need to                  # genomes. Each genome is a row and each role is a column. We may need
825          # get the genomes on the sheet.                  # to actually create the roles as we find them.
826          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
827          my @genomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)};                  for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) {
828          for my $genomeID (@genomes) {                      # Only proceed if this is one of our genomes.
             # Only process this genome if it's one of ours.  
829              if (exists $genomeHash->{$genomeID}) {              if (exists $genomeHash->{$genomeID}) {
                 Trace("Processing genome $genomeID for subsystem $subsysID.");  
                 # Connect the genome to the subsystem.  
                 $loadParticipatesIn->Put($genomeID, $subsysID);  
830                  # Count the PEGs and cells found for verification purposes.                  # Count the PEGs and cells found for verification purposes.
831                  my $pegCount = 0;                  my $pegCount = 0;
832                  my $cellCount = 0;                  my $cellCount = 0;
833                            # Create a list for the PEGs we find. This list will be used
834                            # to generate cluster numbers.
835                            my @pegsFound = ();
836                            # Create a hash that maps spreadsheet IDs to PEGs. We will
837                            # use this to generate the ContainsFeature data after we have
838                            # the cluster numbers.
839                            my %cellPegs = ();
840                            # Get the genome's variant code for this subsystem.
841                            my $variantCode = $sub->get_variant_code($row);
842                  # Loop through the subsystem's roles. We use an index because it is                  # Loop through the subsystem's roles. We use an index because it is
843                  # part of the spreadsheet cell ID.                  # part of the spreadsheet cell ID.
844                  for (my $i = 0; $i <= $#roles; $i++) {                          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
                     my $role = $roles[$i];  
845                      # Get the features in the spreadsheet cell for this genome and role.                      # Get the features in the spreadsheet cell for this genome and role.
846                      my @pegs = $fig->pegs_in_subsystem_cell($subsysID, $genomeID, $i);                              my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col);
847                      # Only proceed if features exist.                      # Only proceed if features exist.
848                      if (@pegs > 0) {                      if (@pegs > 0) {
849                          # Create the spreadsheet cell.                          # Create the spreadsheet cell.
850                          $cellCount++;                          $cellCount++;
851                          my $cellID = "$subsysID:$genomeID:$i";                                  my $cellID = "$subsysID:$genomeID:$col";
852                          $loadSSCell->Put($cellID);                          $loadSSCell->Put($cellID);
853                          $loadIsGenomeOf->Put($genomeID, $cellID);                          $loadIsGenomeOf->Put($genomeID, $cellID);
854                          $loadIsRoleOf->Put($role, $cellID);                                  $loadIsRoleOf->Put($roleID, $cellID);
855                          $loadHasSSCell->Put($subsysID, $cellID);                          $loadHasSSCell->Put($subsysID, $cellID);
856                          # Attach the features to it.                                  # Remember its features.
857                          for my $pegID (@pegs) {                                  push @pegsFound, @pegs;
858                              $loadContainsFeature->Put($cellID, $pegID);                                  $cellPegs{$cellID} = \@pegs;
859                              $pegCount++;                                  $pegCount += @pegs;
860                                }
861                            }
862                            # If we found some cells for this genome, we need to compute clusters and
863                            # denote it participates in the subsystem.
864                            if ($pegCount > 0) {
865                                Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
866                                $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
867                                # Create a hash mapping PEG IDs to cluster numbers.
868                                # We default to -1 for all of them.
869                                my %clusterOf = map { $_ => -1 } @pegsFound;
870                                # Partition the PEGs found into clusters.
871                                my @clusters = $fig->compute_clusters([keys %clusterOf], $sub);
872                                for (my $i = 0; $i <= $#clusters; $i++) {
873                                    my $subList = $clusters[$i];
874                                    for my $peg (@{$subList}) {
875                                        $clusterOf{$peg} = $i;
876                                    }
877                                }
878                                # Create the ContainsFeature data.
879                                for my $cellID (keys %cellPegs) {
880                                    my $cellList = $cellPegs{$cellID};
881                                    for my $cellPeg (@$cellList) {
882                                        $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg});
883                                    }
884                          }                          }
885                      }                      }
886                  }                  }
                 Trace("$pegCount PEGs found in $cellCount cells.") if T(3);  
887              }              }
888                    # Now we need to generate the subsets. The subset names must be concatenated to
889                    # the subsystem name to make them unique keys. There are two types of subsets:
890                    # genome subsets and role subsets. We do the role subsets first.
891                    my @subsetNames = $sub->get_subset_names();
892                    for my $subsetID (@subsetNames) {
893                        # Create the subset record.
894                        my $actualID = "$subsysID:$subsetID";
895                        $loadRoleSubset->Put($actualID);
896                        # Connect the subset to the subsystem.
897                        $loadHasRoleSubset->Put($subsysID, $actualID);
898                        # Connect the subset to its roles.
899                        my @roles = $sub->get_subsetC_roles($subsetID);
900                        for my $roleID (@roles) {
901                            $loadConsistsOfRoles->Put($actualID, $roleID);
902          }          }
903      }      }
904      # Finish the load.                  # Next the genome subsets.
905      my $retVal = $self->_FinishAll();                  @subsetNames = $sub->get_subset_namesR();
906      return $retVal;                  for my $subsetID (@subsetNames) {
907                        # Create the subset record.
908                        my $actualID = "$subsysID:$subsetID";
909                        $loadGenomeSubset->Put($actualID);
910                        # Connect the subset to the subsystem.
911                        $loadHasGenomeSubset->Put($subsysID, $actualID);
912                        # Connect the subset to its genomes.
913                        my @genomes = $sub->get_subsetR($subsetID);
914                        for my $genomeID (@genomes) {
915                            $loadConsistsOfGenomes->Put($actualID, $genomeID);
916  }  }
917                    }
918  =head3 LoadDiagramData              }
919            }
920  C<< my $stats = $spl->LoadDiagramData(); >>          # Now we loop through the diagrams. We need to create the diagram records
921            # and link each diagram to its roles. Note that only roles which occur
922  Load the diagram data from FIG into Sprout.          # in subsystems (and therefore appear in the %ecToRoles hash) are
923            # included.
924  Diagrams are used to organize functional roles. The diagram shows the          for my $map (@maps) {
 connections between chemicals that interact with a subsystem.  
   
 The following relations are loaded by this method.  
   
     Diagram  
     RoleOccursIn  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadDiagramData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the map list.  
     my @maps = $fig->all_maps;  
     my $mapCount = @maps;  
     my $genomeCount = (keys %{$self->{genomes}});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);  
     my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);  
     Trace("Beginning diagram data load.") if T(2);  
     # Loop through the diagrams.  
     for my $map ($fig->all_maps) {  
925          Trace("Loading diagram $map.") if T(3);          Trace("Loading diagram $map.") if T(3);
926          # Get the diagram's descriptive name.          # Get the diagram's descriptive name.
927          my $name = $fig->map_name($map);          my $name = $fig->map_name($map);
# Line 761  Line 930 
930          # A hash is used to prevent duplicates.          # A hash is used to prevent duplicates.
931          my %roleHash = ();          my %roleHash = ();
932          for my $role ($fig->map_to_ecs($map)) {          for my $role ($fig->map_to_ecs($map)) {
933              if (! $roleHash{$role}) {                  if (exists $ecToRoles{$role} && ! $roleHash{$role}) {
934                  $loadRoleOccursIn->Put($role, $map);                      $loadRoleOccursIn->Put($ecToRoles{$role}, $map);
935                  $roleHash{$role} = 1;                  $roleHash{$role} = 1;
936              }              }
937          }          }
938      }      }
939            # Before we leave, we must create the Catalyzes table. We start with the reactions,
940            # then use the "ecToRoles" table to convert EC numbers to role IDs.
941            my @reactions = $fig->all_reactions();
942            for my $reactionID (@reactions) {
943                # Get this reaction's list of roles. The results will be EC numbers.
944                my @roles = $fig->catalyzed_by($reactionID);
945                # Loop through the roles, creating catalyzation records.
946                for my $thisRole (@roles) {
947                    if (exists $ecToRoles{$thisRole}) {
948                        $loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID);
949                    }
950                }
951            }
952        }
953      # Finish the load.      # Finish the load.
954      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
955      return $retVal;      return $retVal;
# Line 808  Line 991 
991      my $fig = $self->{fig};      my $fig = $self->{fig};
992      # Get the genome hash.      # Get the genome hash.
993      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
994      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
995      my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500);      my $loadProperty = $self->_TableLoader('Property');
996      my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500);      my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly);
997      Trace("Beginning property data load.") if T(2);      if ($self->{options}->{loadOnly}) {
998            Trace("Loading from existing files.") if T(2);
999        } else {
1000            Trace("Generating property data.") if T(2);
1001      # Create a hash for storing property IDs.      # Create a hash for storing property IDs.
1002      my %propertyKeys = ();      my %propertyKeys = ();
1003      my $nextID = 1;      my $nextID = 1;
1004            # Get the attributes we intend to store in the property table.
1005            my @propKeys = $fig->get_group_keys("NMPDR");
1006      # Loop through the genomes.      # Loop through the genomes.
1007      for my $genomeID (keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
1008          $loadProperty->Add("genomeIn");          $loadProperty->Add("genomeIn");
1009          # Get the genome's features. The feature ID is the first field in the              Trace("Generating properties for $genomeID.") if T(3);
1010          # tuples returned by "all_features_detailed". We use "all_features_detailed"              # Initialize a counter.
1011          # rather than "all_features" because we want all features regardless of type.              my $propertyCount = 0;
1012          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};              # Get the properties for this genome's features.
1013          # Loop through the features, creating HasProperty records.              my @attributes = $fig->get_attributes("fig|$genomeID%", \@propKeys);
1014          for my $fid (@features) {              Trace("Property list built for $genomeID.") if T(3);
1015              $loadProperty->Add("featureIn");              # Loop through the results, creating HasProperty records.
1016              # Get all attributes for this feature. We do this one feature at a time              for my $attributeData (@attributes) {
1017              # to insure we do not get any genome attributes.                  # Pull apart the attribute tuple.
1018              my @attributeList = $fig->get_attributes($fid, '', '', '');                  my ($fid, $key, $value, $url) = @{$attributeData};
             # Loop through the attributes.  
             for my $tuple (@attributeList) {  
                 # Get this attribute value's data. Note that we throw away the FID,  
                 # since it will always be the same as the value if "$fid".  
                 my (undef, $key, $value, $url) = @{$tuple};  
1019                  # Concatenate the key and value and check the "propertyKeys" hash to                  # Concatenate the key and value and check the "propertyKeys" hash to
1020                  # see if we already have an ID for it. We use a tab for the separator                  # see if we already have an ID for it. We use a tab for the separator
1021                  # character.                  # character.
# Line 851  Line 1033 
1033                  # Create the HasProperty entry for this feature/property association.                  # Create the HasProperty entry for this feature/property association.
1034                  $loadHasProperty->Put($fid, $propertyID, $url);                  $loadHasProperty->Put($fid, $propertyID, $url);
1035              }              }
1036                # Update the statistics.
1037                Trace("$propertyCount attributes processed.") if T(3);
1038                $loadHasProperty->Add("propertiesIn", $propertyCount);
1039          }          }
1040      }      }
1041      # Finish the load.      # Finish the load.
# Line 892  Line 1077 
1077      my $fig = $self->{fig};      my $fig = $self->{fig};
1078      # Get the genome hash.      # Get the genome hash.
1079      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1080      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1081      my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000);      my $loadAnnotation = $self->_TableLoader('Annotation');
1082      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly);
1083      my $loadSproutUser = $self->_TableLoader('SproutUser', 100);      my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly);
1084      my $loadUserAccess = $self->_TableLoader('UserAccess', 1000);      my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly);
1085      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly);
1086      Trace("Beginning annotation data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1087            Trace("Loading from existing files.") if T(2);
1088        } else {
1089            Trace("Generating annotation data.") if T(2);
1090      # Create a hash of user names. We'll use this to prevent us from generating duplicate      # Create a hash of user names. We'll use this to prevent us from generating duplicate
1091      # user records.      # user records.
1092      my %users = ( FIG => 1, master => 1 );      my %users = ( FIG => 1, master => 1 );
# Line 913  Line 1100 
1100      # Loop through the genomes.      # Loop through the genomes.
1101      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
1102          Trace("Processing $genomeID.") if T(3);          Trace("Processing $genomeID.") if T(3);
         # Get the genome's PEGs.  
         my @pegs = $fig->pegs_of($genomeID);  
         for my $peg (@pegs) {  
             Trace("Processing $peg.") if T(4);  
1103              # Create a hash of timestamps. We use this to prevent duplicate time stamps              # Create a hash of timestamps. We use this to prevent duplicate time stamps
1104              # from showing up for a single PEG's annotations.              # from showing up for a single PEG's annotations.
1105              my %seenTimestamps = ();              my %seenTimestamps = ();
1106              # Check for a functional assignment.              # Get the genome's annotations.
1107              my $func = $fig->function_of($peg);              my @annotations = $fig->read_all_annotations($genomeID);
1108              if ($func) {              Trace("Processing annotations.") if T(2);
1109                  # If this is NOT a hypothetical assignment, we create an              for my $tuple (@annotations) {
1110                  # assignment annotation for it.                  # Get the annotation tuple.
1111                  if (! FIG::hypo($peg)) {                  my ($peg, $timestamp, $user, $text) = @{$tuple};
                     # Note that we double the slashes so that what goes into the database is  
                     # a new-line escape sequence rather than an actual new-line.  
                     $loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func");  
                     $loadIsTargetOfAnnotation->Put($peg, "$peg:$time");  
                     $loadMadeAnnotation->Put("FIG", "$peg:$time");  
                     # Denote we've seen this timestamp.  
                     $seenTimestamps{$time} = 1;  
                 }  
                 # Now loop through the real annotations.  
                 for my $tuple ($fig->feature_annotations($peg, "raw")) {  
                     my ($fid, $timestamp, $user, $text) = @{$tuple};  
1112                      # Here we fix up the annotation text. "\r" is removed,                      # Here we fix up the annotation text. "\r" is removed,
1113                      # and "\t" and "\n" are escaped. Note we use the "s"                  # and "\t" and "\n" are escaped. Note we use the "gs"
1114                      # modifier so that new-lines inside the text do not                      # modifier so that new-lines inside the text do not
1115                      # stop the substitution search.                      # stop the substitution search.
1116                      $text =~ s/\r//gs;                      $text =~ s/\r//gs;
# Line 948  Line 1120 
1120                      $text =~ s/Set master function/Set FIG function/s;                      $text =~ s/Set master function/Set FIG function/s;
1121                      # Insure the time stamp is valid.                      # Insure the time stamp is valid.
1122                      if ($timestamp =~ /^\d+$/) {                      if ($timestamp =~ /^\d+$/) {
1123                          # Here it's a number. We need to insure it's unique.                      # Here it's a number. We need to insure the one we use to form
1124                          while ($seenTimestamps{$timestamp}) {                      # the key is unique.
1125                              $timestamp++;                      my $keyStamp = $timestamp;
1126                        while ($seenTimestamps{"$peg:$keyStamp"}) {
1127                            $keyStamp++;
1128                          }                          }
1129                          $seenTimestamps{$timestamp} = 1;                      my $annotationID = "$peg:$keyStamp";
1130                          my $annotationID = "$peg:$timestamp";                      $seenTimestamps{$annotationID} = 1;
1131                          # Insure the user exists.                          # Insure the user exists.
1132                          if (! $users{$user}) {                          if (! $users{$user}) {
1133                              $loadSproutUser->Put($user, "SEED user");                              $loadSproutUser->Put($user, "SEED user");
# Line 961  Line 1135 
1135                              $users{$user} = 1;                              $users{$user} = 1;
1136                          }                          }
1137                          # Generate the annotation.                          # Generate the annotation.
1138                          $loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text");                      $loadAnnotation->Put($annotationID, $timestamp, $text);
1139                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);
1140                          $loadMadeAnnotation->Put($user, $annotationID);                          $loadMadeAnnotation->Put($user, $annotationID);
1141                      } else {                      } else {
# Line 971  Line 1145 
1145                  }                  }
1146              }              }
1147          }          }
     }  
1148      # Finish the load.      # Finish the load.
1149      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1150      return $retVal;      return $retVal;
# Line 1012  Line 1185 
1185      my $fig = $self->{fig};      my $fig = $self->{fig};
1186      # Get the genome hash.      # Get the genome hash.
1187      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1188      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1189      my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4);      my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly);
1190      my $loadSource = $self->_TableLoader('Source', $genomeCount * 4);      my $loadSource = $self->_TableLoader('Source');
1191      my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8);      my $loadSourceURL = $self->_TableLoader('SourceURL');
1192      Trace("Beginning source data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1193            Trace("Loading from existing files.") if T(2);
1194        } else {
1195            Trace("Generating annotation data.") if T(2);
1196      # Create hashes to collect the Source information.      # Create hashes to collect the Source information.
1197      my %sourceURL = ();      my %sourceURL = ();
1198      my %sourceDesc = ();      my %sourceDesc = ();
# Line 1031  Line 1206 
1206              chomp $line;              chomp $line;
1207              my($sourceID, $desc, $url) = split(/\t/,$line);              my($sourceID, $desc, $url) = split(/\t/,$line);
1208              $loadComesFrom->Put($genomeID, $sourceID);              $loadComesFrom->Put($genomeID, $sourceID);
1209              if ($url && ! exists $sourceURL{$genomeID}) {                  if ($url && ! exists $sourceURL{$sourceID}) {
1210                  $loadSourceURL->Put($sourceID, $url);                  $loadSourceURL->Put($sourceID, $url);
1211                  $sourceURL{$sourceID} = 1;                  $sourceURL{$sourceID} = 1;
1212              }              }
1213              if ($desc && ! exists $sourceDesc{$sourceID}) {                  if ($desc) {
1214                  $loadSource->Put($sourceID, $desc);                      $sourceDesc{$sourceID} = $desc;
1215                  $sourceDesc{$sourceID} = 1;                  } elsif (! exists $sourceDesc{$sourceID}) {
1216                        $sourceDesc{$sourceID} = $sourceID;
1217              }              }
1218          }          }
1219          close TMP;          close TMP;
1220      }      }
1221            # Write the source descriptions.
1222            for my $sourceID (keys %sourceDesc) {
1223                $loadSource->Put($sourceID, $sourceDesc{$sourceID});
1224            }
1225        }
1226      # Finish the load.      # Finish the load.
1227      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1228      return $retVal;      return $retVal;
# Line 1081  Line 1262 
1262      my $fig = $self->{fig};      my $fig = $self->{fig};
1263      # Get the genome hash.      # Get the genome hash.
1264      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1265      # Convert the genome hash. We'll get the genus and species for each genome and make      # Convert the genome hash. We'll get the genus and species for each genome and make
1266      # it the key.      # it the key.
1267      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
1268      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1269      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000);      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
1270      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
1271      Trace("Beginning external data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1272            Trace("Loading from existing files.") if T(2);
1273        } else {
1274            Trace("Generating external data.") if T(2);
1275      # We loop through the files one at a time. First, the organism file.      # We loop through the files one at a time. First, the organism file.
1276      Open(\*ORGS, "<$FIG_Config::global/ext_org.table");          Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |");
1277      my $orgLine;      my $orgLine;
1278      while (defined($orgLine = <ORGS>)) {      while (defined($orgLine = <ORGS>)) {
1279          # Clean the input line.          # Clean the input line.
# Line 1102  Line 1285 
1285      close ORGS;      close ORGS;
1286      # Now the function file.      # Now the function file.
1287      my $funcLine;      my $funcLine;
1288      Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");          Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |");
1289      while (defined($funcLine = <FUNCS>)) {      while (defined($funcLine = <FUNCS>)) {
1290          # Clean the line ending.          # Clean the line ending.
1291          chomp $funcLine;          chomp $funcLine;
# Line 1118  Line 1301 
1301              $loadExternalAliasFunc->Put(@funcFields[0,1]);              $loadExternalAliasFunc->Put(@funcFields[0,1]);
1302          }          }
1303      }      }
1304        }
1305        # Finish the load.
1306        my $retVal = $self->_FinishAll();
1307        return $retVal;
1308    }
1309    
1310    
1311    =head3 LoadReactionData
1312    
1313    C<< my $stats = $spl->LoadReactionData(); >>
1314    
1315    Load the reaction data from FIG into Sprout.
1316    
1317    Reaction data connects reactions to the compounds that participate in them.
1318    
1319    The following relations are loaded by this method.
1320    
1321        Reaction
1322        ReactionURL
1323        Compound
1324        CompoundName
1325        CompoundCAS
1326        IsAComponentOf
1327    
1328    This method proceeds reaction by reaction rather than genome by genome.
1329    
1330    =over 4
1331    
1332    =item RETURNS
1333    
1334    Returns a statistics object for the loads.
1335    
1336    =back
1337    
1338    =cut
1339    #: Return Type $%;
1340    sub LoadReactionData {
1341        # Get this object instance.
1342        my ($self) = @_;
1343        # Get the FIG object.
1344        my $fig = $self->{fig};
1345        # Create load objects for each of the tables we're loading.
1346        my $loadReaction = $self->_TableLoader('Reaction');
1347        my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly);
1348        my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly);
1349        my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly);
1350        my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly);
1351        my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly);
1352        if ($self->{options}->{loadOnly}) {
1353            Trace("Loading from existing files.") if T(2);
1354        } else {
1355            Trace("Generating annotation data.") if T(2);
1356            # First we create the compounds.
1357            my @compounds = $fig->all_compounds();
1358            for my $cid (@compounds) {
1359                # Check for names.
1360                my @names = $fig->names_of_compound($cid);
1361                # Each name will be given a priority number, starting with 1.
1362                my $prio = 1;
1363                for my $name (@names) {
1364                    $loadCompoundName->Put($cid, $name, $prio++);
1365                }
1366                # Create the main compound record. Note that the first name
1367                # becomes the label.
1368                my $label = (@names > 0 ? $names[0] : $cid);
1369                $loadCompound->Put($cid, $label);
1370                # Check for a CAS ID.
1371                my $cas = $fig->cas($cid);
1372                if ($cas) {
1373                    $loadCompoundCAS->Put($cid, $cas);
1374                }
1375            }
1376            # All the compounds are set up, so we need to loop through the reactions next. First,
1377            # we initialize the discriminator index. This is a single integer used to insure
1378            # duplicate elements in a reaction are not accidentally collapsed.
1379            my $discrim = 0;
1380            my @reactions = $fig->all_reactions();
1381            for my $reactionID (@reactions) {
1382                # Create the reaction record.
1383                $loadReaction->Put($reactionID, $fig->reversible($reactionID));
1384                # Compute the reaction's URL.
1385                my $url = HTML::reaction_link($reactionID);
1386                # Put it in the ReactionURL table.
1387                $loadReactionURL->Put($reactionID, $url);
1388                # Now we need all of the reaction's compounds. We get these in two phases,
1389                # substrates first and then products.
1390                for my $product (0, 1) {
1391                    # Get the compounds of the current type for the current reaction. FIG will
1392                    # give us 3-tuples: [ID, stoichiometry, main-flag]. At this time we do not
1393                    # have location data in SEED, so it defaults to the empty string.
1394                    my @compounds = $fig->reaction2comp($reactionID, $product);
1395                    for my $compData (@compounds) {
1396                        # Extract the compound data from the current tuple.
1397                        my ($cid, $stoich, $main) = @{$compData};
1398                        # Link the compound to the reaction.
1399                        $loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main,
1400                                                 $product, $stoich);
1401                    }
1402                }
1403            }
1404        }
1405      # Finish the load.      # Finish the load.
1406      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1407      return $retVal;      return $retVal;
# Line 1133  Line 1417 
1417    
1418      GenomeGroups      GenomeGroups
1419    
1420  There is no direct support for genome groups in FIG, so we access the SEED  Currently, we do not use groups. We used to use them for NMPDR groups,
1421    butThere is no direct support for genome groups in FIG, so we access the SEED
1422  files directly.  files directly.
1423    
1424  =over 4  =over 4
# Line 1153  Line 1438 
1438      my $fig = $self->{fig};      my $fig = $self->{fig};
1439      # Get the genome hash.      # Get the genome hash.
1440      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1441      # Create a load object for the table we're loading.      # Create a load object for the table we're loading.
1442      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4);      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups');
1443      Trace("Beginning group data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1444            Trace("Loading from existing files.") if T(2);
1445        } else {
1446            Trace("Generating group data.") if T(2);
1447            # Currently there are no groups.
1448        }
1449        # Finish the load.
1450        my $retVal = $self->_FinishAll();
1451        return $retVal;
1452    }
1453    
1454    =head3 LoadSynonymData
1455    
1456    C<< my $stats = $spl->LoadSynonymData(); >>
1457    
1458    Load the synonym groups into Sprout.
1459    
1460    The following relations are loaded by this method.
1461    
1462        SynonymGroup
1463        IsSynonymGroupFor
1464    
1465    The source information for these relations is taken from the C<maps_to_id> method
1466    of the B<FIG> object. Unfortunately, to make this work, we need to use direct
1467    SQL against the FIG database.
1468    
1469    =over 4
1470    
1471    =item RETURNS
1472    
1473    Returns a statistics object for the loads.
1474    
1475    =back
1476    
1477    =cut
1478    #: Return Type $%;
1479    sub LoadSynonymData {
1480        # Get this object instance.
1481        my ($self) = @_;
1482        # Get the FIG object.
1483        my $fig = $self->{fig};
1484        # Get the genome hash.
1485        my $genomeHash = $self->{genomes};
1486        # Create a load object for the table we're loading.
1487        my $loadSynonymGroup = $self->_TableLoader('SynonymGroup');
1488        my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor');
1489        if ($self->{options}->{loadOnly}) {
1490            Trace("Loading from existing files.") if T(2);
1491        } else {
1492            Trace("Generating synonym group data.") if T(2);
1493            # Get the database handle.
1494            my $dbh = $fig->db_handle();
1495            # Ask for the synonyms.
1496            my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
1497            my $result = $sth->execute();
1498            if (! defined($result)) {
1499                Confess("Database error in Synonym load: " . $sth->errstr());
1500            } else {
1501                # Remember the current synonym.
1502                my $current_syn = "";
1503                # Count the features.
1504                my $featureCount = 0;
1505                # Loop through the synonym/peg pairs.
1506                while (my @row = $sth->fetchrow()) {
1507                    # Get the synonym ID and feature ID.
1508                    my ($syn_id, $peg) = @row;
1509                    # Insure it's for one of our genomes.
1510                    my $genomeID = FIG::genome_of($peg);
1511                    if (exists $genomeHash->{$genomeID}) {
1512                        # Verify the synonym.
1513                        if ($syn_id ne $current_syn) {
1514                            # It's new, so put it in the group table.
1515                            $loadSynonymGroup->Put($syn_id);
1516                            $current_syn = $syn_id;
1517                        }
1518                        # Connect the synonym to the peg.
1519                        $loadIsSynonymGroupFor->Put($syn_id, $peg);
1520                        # Count this feature.
1521                        $featureCount++;
1522                        if ($featureCount % 1000 == 0) {
1523                            Trace("$featureCount features processed.") if T(3);
1524                        }
1525                    }
1526                }
1527            }
1528        }
1529        # Finish the load.
1530        my $retVal = $self->_FinishAll();
1531        return $retVal;
1532    }
1533    
1534    =head3 LoadFamilyData
1535    
1536    C<< my $stats = $spl->LoadFamilyData(); >>
1537    
1538    Load the protein families into Sprout.
1539    
1540    The following relations are loaded by this method.
1541    
1542        Family
1543        IsFamilyForFeature
1544    
1545    The source information for these relations is taken from the C<families_for_protein>,
1546    C<family_function>, and C<sz_family> methods of the B<FIG> object.
1547    
1548    =over 4
1549    
1550    =item RETURNS
1551    
1552    Returns a statistics object for the loads.
1553    
1554    =back
1555    
1556    =cut
1557    #: Return Type $%;
1558    sub LoadFamilyData {
1559        # Get this object instance.
1560        my ($self) = @_;
1561        # Get the FIG object.
1562        my $fig = $self->{fig};
1563        # Get the genome hash.
1564        my $genomeHash = $self->{genomes};
1565        # Create load objects for the tables we're loading.
1566        my $loadFamily = $self->_TableLoader('Family');
1567        my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature');
1568        if ($self->{options}->{loadOnly}) {
1569            Trace("Loading from existing files.") if T(2);
1570        } else {
1571            Trace("Generating family data.") if T(2);
1572            # Create a hash for the family IDs.
1573            my %familyHash = ();
1574      # Loop through the genomes.      # Loop through the genomes.
1575      my $line;          for my $genomeID (sort keys %{$genomeHash}) {
1576      for my $genomeID (keys %{$genomeHash}) {              Trace("Processing features for $genomeID.") if T(2);
1577          Trace("Processing $genomeID.") if T(3);              # Loop through this genome's PEGs.
1578          # Open the NMPDR group file for this genome.              for my $fid ($fig->all_features($genomeID, "peg")) {
1579          if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&                  $loadIsFamilyForFeature->Add("features", 1);
1580              defined($line = <TMP>)) {                  # Get this feature's families.
1581              # Clean the line ending.                  my @families = $fig->families_for_protein($fid);
1582              chomp $line;                  # Loop through the families, connecting them to the feature.
1583              # Add the group to the table. Note that there can only be one group                  for my $family (@families) {
1584              # per genome.                      $loadIsFamilyForFeature->Put($family, $fid);
1585              $loadGenomeGroups->Put($genomeID, $line);                      # If this is a new family, create a record for it.
1586                        if (! exists $familyHash{$family}) {
1587                            $familyHash{$family} = 1;
1588                            $loadFamily->Add("families", 1);
1589                            my $size = $fig->sz_family($family);
1590                            my $func = $fig->family_function($family);
1591                            $loadFamily->Put($family, $size, $func);
1592                        }
1593                    }
1594                }
1595          }          }
         close TMP;  
1596      }      }
1597      # Finish the load.      # Finish the load.
1598      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1599      return $retVal;      return $retVal;
1600  }  }
1601    
1602    =head3 LoadDrugData
1603    
1604    C<< my $stats = $spl->LoadDrugData(); >>
1605    
1606    Load the drug target data into Sprout.
1607    
1608    The following relations are loaded by this method.
1609    
1610        PDB
1611        DocksWith
1612        IsProteinForFeature
1613        Ligand
1614    
1615    The source information for these relations is taken from attributes. The
1616    C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>.
1617    The C<zinc_name> attribute describes the ligands. The C<docking_results>
1618    attribute contains the information for the B<DocksWith> relationship. It is
1619    expected that additional attributes and tables will be added in the future.
1620    
1621    =over 4
1622    
1623    =item RETURNS
1624    
1625    Returns a statistics object for the loads.
1626    
1627    =back
1628    
1629    =cut
1630    #: Return Type $%;
1631    sub LoadDrugData {
1632        # Get this object instance.
1633        my ($self) = @_;
1634        # Get the FIG object.
1635        my $fig = $self->{fig};
1636        # Get the genome hash.
1637        my $genomeHash = $self->{genomes};
1638        # Create load objects for the tables we're loading.
1639        my $loadPDB = $self->_TableLoader('PDB');
1640        my $loadLigand = $self->_TableLoader('Ligand');
1641        my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature');
1642        my $loadDocksWith = $self->_TableLoader('DocksWith');
1643        if ($self->{options}->{loadOnly}) {
1644            Trace("Loading from existing files.") if T(2);
1645        } else {
1646            Trace("Generating drug target data.") if T(2);
1647            # First comes the "DocksWith" relationship. This will give us a list of PDBs.
1648            # We can also encounter PDBs when we process "IsProteinForFeature". To manage
1649            # this process, PDB information is collected in a hash table and then
1650            # unspooled after both relationships are created.
1651            my %pdbHash = ();
1652            Trace("Generating docking data.") if T(2);
1653            # Get all the docking data. This may cause problems if there are too many PDBs,
1654            # at which point we'll need another algorithm. The indicator that this is
1655            # happening will be a timeout error in the next statement.
1656            my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
1657                                                  ['docking_results', $FIG_Config::dockLimit]);
1658            Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
1659            for my $dockData (@dockData) {
1660                # Get the docking data components.
1661                my ($pdbID, $docking_key, @valueData) = @{$dockData};
1662                # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
1663                $pdbID = lc $pdbID;
1664                # Strip off the object type.
1665                $pdbID =~ s/pdb://;
1666                # Extract the ZINC ID from the docking key. Note that there are two possible
1667                # formats.
1668                my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/;
1669                if (! $zinc_id) {
1670                    Trace("Invalid docking result key $docking_key for $pdbID.") if T(0);
1671                    $loadDocksWith->Add("errors");
1672                } else {
1673                    # Get the pieces of the value and parse the energy.
1674                    # Note that we don't care about the rank, since
1675                    # we can sort on the energy level itself in our database.
1676                    my ($energy, $tool, $type) = @valueData;
1677                    my ($rank, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
1678                    # Ignore predicted results.
1679                    if ($type ne "Predicted") {
1680                        # Count this docking result.
1681                        if (! exists $pdbHash{$pdbID}) {
1682                            $pdbHash{$pdbID} = 1;
1683                        } else {
1684                            $pdbHash{$pdbID}++;
1685                        }
1686                        # Write the result to the output.
1687                        $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool,
1688                                            $total, $vanderwaals);
1689                    }
1690                }
1691            }
1692            Trace("Connecting features.") if T(2);
1693            # Loop through the genomes.
1694            for my $genome (sort keys %{$genomeHash}) {
1695                Trace("Generating PDBs for $genome.") if T(3);
1696                # Get all of the PDBs that BLAST against this genome's features.
1697                my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%');
1698                for my $pdbData (@attributeData) {
1699                    # The PDB ID is coded as a subkey.
1700                    if ($pdbData->[1] !~ /PDB::(.+)/i) {
1701                        Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0);
1702                        $loadPDB->Add("errors");
1703                    } else {
1704                        my $pdbID = $1;
1705                        # Insure the PDB is in the hash.
1706                        if (! exists $pdbHash{$pdbID}) {
1707                            $pdbHash{$pdbID} = 0;
1708                        }
1709                        # The score and locations are coded in the attribute value.
1710                        if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
1711                            Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0);
1712                            $loadIsProteinForFeature->Add("errors");
1713                        } else {
1714                            my ($score, $locData) = ($1,$2);
1715                            # The location data may not be present, so we have to start with some
1716                            # defaults and then check.
1717                            my ($start, $end) = (1, 0);
1718                            if ($locData) {
1719                                $locData =~ /(\d+)-(\d+)/;
1720                                $start = $1;
1721                                $end = $2;
1722                            }
1723                            # If we still don't have the end location, compute it from
1724                            # the feature length.
1725                            if (! $end) {
1726                                # Most features have one location, but we do a list iteration
1727                                # just in case.
1728                                my @locations = $fig->feature_location($pdbData->[0]);
1729                                $end = 0;
1730                                for my $loc (@locations) {
1731                                    my $locObject = BasicLocation->new($loc);
1732                                    $end += $locObject->Length;
1733                                }
1734                            }
1735                            # Decode the score.
1736                            my $realScore = FIGRules::DecodeScore($score);
1737                            # Connect the PDB to the feature.
1738                            $loadIsProteinForFeature->Put($pdbData->[0], $pdbID, $start, $realScore, $end);
1739                        }
1740                    }
1741                }
1742            }
1743            # We've got all our PDBs now, so we unspool them from the hash.
1744            Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2);
1745            my $count = 0;
1746            for my $pdbID (sort keys %pdbHash) {
1747                $loadPDB->Put($pdbID, $pdbHash{$pdbID});
1748                $count++;
1749                Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0);
1750            }
1751            # Finally we create the ligand table. This information can be found in the
1752            # zinc_name attribute.
1753            Trace("Loading ligands.") if T(2);
1754            # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
1755            my $last_zinc_id = "";
1756            my $zinc_id = "";
1757            my $done = 0;
1758            while (! $done) {
1759                # Get the next 10000 ligands. We insist that the object ID is greater than
1760                # the last ID we processed.
1761                Trace("Loading batch starting with ZINC:$zinc_id.") if T(3);
1762                my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
1763                                                           ["ZINC:$zinc_id", "zinc_name"]);
1764                Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
1765                if (! @attributeData) {
1766                    # Here there are no attributes left, so we quit the loop.
1767                    $done = 1;
1768                } else {
1769                    # Process the attribute data we've received.
1770                    for my $zinc_data (@attributeData) {
1771                        # The ZINC ID is found in the first return column, prefixed with the word ZINC.
1772                        if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
1773                            $zinc_id = $1;
1774                            # Check for a duplicate.
1775                            if ($zinc_id eq $last_zinc_id) {
1776                                $loadLigand->Add("duplicate");
1777                            } else {
1778                                # Here it's safe to output the ligand. The ligand name is the attribute value
1779                                # (third column in the row).
1780                                $loadLigand->Put($zinc_id, $zinc_data->[2]);
1781                                # Insure we don't try to add this ID again.
1782                                $last_zinc_id = $zinc_id;
1783                            }
1784                        } else {
1785                            Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
1786                            $loadLigand->Add("errors");
1787                        }
1788                    }
1789                }
1790            }
1791            Trace("Ligands loaded.") if T(2);
1792        }
1793        # Finish the load.
1794        my $retVal = $self->_FinishAll();
1795        return $retVal;
1796    }
1797    
1798    
1799  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1800    
1801    =head3 SpecialAttribute
1802    
1803    C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >>
1804    
1805    Look for special attributes of a given type. A special attribute is found by comparing one of
1806    the columns of the incoming attribute list to a search pattern. If a match is found, then
1807    a set of columns is put into an output table connected to the specified ID.
1808    
1809    For example, when processing features, the attribute list we look at has three columns: attribute
1810    name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
1811    begins with C<iedb_>. The call signature is therefore
1812    
1813        my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB);
1814    
1815    The pattern is matched against column 0, and if we have a match, then column 2's value is put
1816    to the output along with the specified feature ID.
1817    
1818    =over 4
1819    
1820    =item id
1821    
1822    ID of the object whose special attributes are being loaded. This forms the first column of the
1823    output.
1824    
1825    =item attributes
1826    
1827    Reference to a list of tuples.
1828    
1829    =item idxMatch
1830    
1831    Index in each tuple of the column to be matched against the pattern. If the match is
1832    successful, an output record will be generated.
1833    
1834    =item idxValues
1835    
1836    Reference to a list containing the indexes in each tuple of the columns to be put as
1837    the second column of the output.
1838    
1839    =item pattern
1840    
1841    Pattern to be matched against the specified column. The match will be case-insensitive.
1842    
1843    =item loader
1844    
1845    An object to which each output record will be put. Usually this is an B<ERDBLoad> object,
1846    but technically it could be anything with a C<Put> method.
1847    
1848    =item RETURN
1849    
1850    Returns a count of the matches found.
1851    
1852    =item
1853    
1854    =back
1855    
1856    =cut
1857    
1858    sub SpecialAttribute {
1859        # Get the parameters.
1860        my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_;
1861        # Declare the return variable.
1862        my $retVal = 0;
1863        # Loop through the attribute rows.
1864        for my $row (@{$attributes}) {
1865            # Check for a match.
1866            if ($row->[$idxMatch] =~ m/$pattern/i) {
1867                # We have a match, so output a row. This is a bit tricky, since we may
1868                # be putting out multiple columns of data from the input.
1869                my $value = join(" ", map { $row->[$_] } @{$idxValues});
1870                $loader->Put($id, $value);
1871                $retVal++;
1872            }
1873        }
1874        Trace("$retVal special attributes found for $id and loader " . $loader->RelName() . ".") if T(4) && $retVal;
1875        # Return the number of matches.
1876        return $retVal;
1877    }
1878    
1879  =head3 TableLoader  =head3 TableLoader
1880    
1881  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 1193  Line 1890 
1890    
1891  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
1892    
1893  =item rowCount (optional)  =item ignore
1894    
1895  Estimated maximum number of rows in the table.  TRUE if the table should be ignored entirely, else FALSE.
1896    
1897  =item RETURN  =item RETURN
1898    
# Line 1207  Line 1904 
1904    
1905  sub _TableLoader {  sub _TableLoader {
1906      # Get the parameters.      # Get the parameters.
1907      my ($self, $tableName, $rowCount) = @_;      my ($self, $tableName, $ignore) = @_;
1908      # Create the load object.      # Create the load object.
1909      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount);      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly,
1910                                   $ignore);
1911      # Cache it in the loader list.      # Cache it in the loader list.
1912      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
1913      # Return it to the caller.      # Return it to the caller.
# Line 1243  Line 1941 
1941      my $retVal = Stats->new();      my $retVal = Stats->new();
1942      # Get the loader list.      # Get the loader list.
1943      my $loadList = $self->{loaders};      my $loadList = $self->{loaders};
1944        # Create a hash to hold the statistics objects, keyed on relation name.
1945        my %loaderHash = ();
1946      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
1947      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads more restartable.
1948      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
1949            # Get the relation name.
1950            my $relName = $loader->RelName;
1951            # Check the ignore flag.
1952            if ($loader->Ignore) {
1953                Trace("Relation $relName not loaded.") if T(2);
1954            } else {
1955                # Here we really need to finish.
1956                Trace("Finishing $relName.") if T(2);
1957          my $stats = $loader->Finish();          my $stats = $loader->Finish();
1958                $loaderHash{$relName} = $stats;
1959            }
1960        }
1961        # Now we loop through again, actually loading the tables. We want to finish before
1962        # loading so that if something goes wrong at this point, all the load files are usable
1963        # and we don't have to redo all that work.
1964        for my $relName (sort keys %loaderHash) {
1965            # Get the statistics for this relation.
1966            my $stats = $loaderHash{$relName};
1967            # Check for a database load.
1968            if ($self->{options}->{dbLoad}) {
1969                # Here we want to use the load file just created to load the database.
1970                Trace("Loading relation $relName.") if T(2);
1971                my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]);
1972                # Accumulate the statistics from the DB load.
1973                $stats->Accumulate($newStats);
1974            }
1975          $retVal->Accumulate($stats);          $retVal->Accumulate($stats);
         my $relName = $loader->RelName;  
1976          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
1977      }      }
1978      # Return the load statistics.      # Return the load statistics.
1979      return $retVal;      return $retVal;
1980  }  }
1981    
1982    =head3 GetGenomeAttributes
1983    
1984    C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids); >>
1985    
1986    Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
1987    attributes for all the features of a genome in a single call, then organizes them into
1988    a hash.
1989    
1990    =over 4
1991    
1992    =item fig
1993    
1994    FIG-like object for accessing attributes.
1995    
1996    =item genomeID
1997    
1998    ID of the genome who's attributes are desired.
1999    
2000    =item fids
2001    
2002    Reference to a list of the feature IDs whose attributes are to be kept.
2003    
2004    =item RETURN
2005    
2006    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
2007    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
2008    the attribute key, and one or more attribute values.
2009    
2010    =back
2011    
2012    =cut
2013    
2014    sub GetGenomeAttributes {
2015        # Get the parameters.
2016        my ($fig, $genomeID, $fids) = @_;
2017        # Declare the return variable.
2018        my $retVal = {};
2019        # Get a list of the attributes we care about.
2020        my @propKeys = $fig->get_group_keys("NMPDR");
2021        # Get the attributes.
2022        my @aList = $fig->get_attributes("fig|$genomeID%", \@propKeys);
2023        # Initialize the hash. This not only enables us to easily determine which FIDs to
2024        # keep, it insures that the caller sees a list reference for every known fid,
2025        # simplifying the logic.
2026        for my $fid (@{$fids}) {
2027            $retVal->{$fid} = [];
2028        }
2029        # Populate the hash.
2030        for my $aListEntry (@aList) {
2031            my $fid = $aListEntry->[0];
2032            if (exists $retVal->{$fid}) {
2033                push @{$retVal->{$fid}}, $aListEntry;
2034            }
2035        }
2036        # Return the result.
2037        return $retVal;
2038    }
2039    
2040  1;  1;

Legend:
Removed from v.1.12  
changed lines
  Added in v.1.84

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3