[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.7, Tue Sep 13 19:05:20 2005 UTC revision 1.82, Tue Apr 10 06:15:35 2007 UTC
# Line 10  Line 10 
10      use Sprout;      use Sprout;
11      use Stats;      use Stats;
12      use BasicLocation;      use BasicLocation;
13        use HTML;
14    
15  =head1 Sprout Load Methods  =head1 Sprout Load Methods
16    
# Line 29  Line 30 
30      $stats->Accumulate($spl->LoadFeatureData());      $stats->Accumulate($spl->LoadFeatureData());
31      print $stats->Show();      print $stats->Show();
32    
 This module makes use of the internal Sprout property C<_erdb>.  
   
33  It is worth noting that the FIG object does not need to be a real one. Any object  It is worth noting that the FIG object does not need to be a real one. Any object
34  that implements the FIG methods for data retrieval could be used. So, for example,  that implements the FIG methods for data retrieval could be used. So, for example,
35  this object could be used to copy data from one Sprout database to another, or  this object could be used to copy data from one Sprout database to another, or
# Line 51  Line 50 
50    
51  =head3 new  =head3 new
52    
53  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile); >>  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>
54    
55  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
56  the name of the files containing the list of genomes and subsystems to use.  the name of the files containing the list of genomes and subsystems to use.
# Line 79  Line 78 
78  =item subsysFile  =item subsysFile
79    
80  Either the name of the file containing the list of trusted subsystems or a reference  Either the name of the file containing the list of trusted subsystems or a reference
81  to a list of subsystem names. If nothing is specified, all known subsystems will be  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
82  considered trusted. Only subsystem data related to the trusted subsystems is loaded.  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
83    in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded.
84    
85    =item options
86    
87    Reference to a hash of command-line options.
88    
89  =back  =back
90    
# Line 88  Line 92 
92    
93  sub new {  sub new {
94      # Get the parameters.      # Get the parameters.
95      my ($class, $sprout, $fig, $genomeFile, $subsysFile) = @_;      my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
96      # Load the list of genomes into a hash.      # Create the genome hash.
97      my %genomes;      my %genomes = ();
98        # We only need it if load-only is NOT specified.
99        if (! $options->{loadOnly}) {
100      if (! defined($genomeFile) || $genomeFile eq '') {      if (! defined($genomeFile) || $genomeFile eq '') {
101          # Here we want all the complete genomes and an access code of 1.          # Here we want all the complete genomes and an access code of 1.
102          my @genomeList = $fig->genomes(1);          my @genomeList = $fig->genomes(1);
# Line 114  Line 120 
120                  # an omitted access code can be defaulted to 1.                  # an omitted access code can be defaulted to 1.
121                  for my $genomeLine (@genomeList) {                  for my $genomeLine (@genomeList) {
122                      my ($genomeID, $accessCode) = split("\t", $genomeLine);                      my ($genomeID, $accessCode) = split("\t", $genomeLine);
123                      if (undef $accessCode) {                          if (! defined($accessCode)) {
124                          $accessCode = 1;                          $accessCode = 1;
125                      }                      }
126                      $genomes{$genomeID} = $accessCode;                      $genomes{$genomeID} = $accessCode;
# Line 124  Line 130 
130              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");              Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
131          }          }
132      }      }
133        }
134      # Load the list of trusted subsystems.      # Load the list of trusted subsystems.
135      my %subsystems = ();      my %subsystems = ();
136        # We only need it if load-only is NOT specified.
137        if (! $options->{loadOnly}) {
138      if (! defined $subsysFile || $subsysFile eq '') {      if (! defined $subsysFile || $subsysFile eq '') {
139          # Here we want all the subsystems.              # Here we want all the usable subsystems. First we get the whole list.
140          %subsystems = map { $_ => 1 } $fig->all_subsystems();              my @subs = $fig->all_subsystems();
141                # Loop through, checking for the NMPDR file.
142                for my $sub (@subs) {
143                    if ($fig->nmpdr_subsystem($sub)) {
144                        $subsystems{$sub} = 1;
145                    }
146                }
147      } else {      } else {
148          my $type = ref $subsysFile;          my $type = ref $subsysFile;
149          if ($type eq 'ARRAY') {          if ($type eq 'ARRAY') {
# Line 148  Line 163 
163              Confess("Invalid subsystem parameter in SproutLoad constructor.");              Confess("Invalid subsystem parameter in SproutLoad constructor.");
164          }          }
165      }      }
166            # Go through the subsys hash again, creating the keyword list for each subsystem.
167            for my $subsystem (keys %subsystems) {
168                my $name = $subsystem;
169                $name =~ s/_/ /g;
170                my $classes = $fig->subsystem_classification($subsystem);
171                $name .= " " . join(" ", @{$classes});
172                $subsystems{$subsystem} = $name;
173            }
174        }
175      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
176      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
177      # Create the Sprout load object.      # Create the Sprout load object.
# Line 157  Line 181 
181                    subsystems => \%subsystems,                    subsystems => \%subsystems,
182                    sprout => $sprout,                    sprout => $sprout,
183                    loadDirectory => $directory,                    loadDirectory => $directory,
184                    erdb => $sprout->{_erdb},                    erdb => $sprout,
185                    loaders => []                    loaders => [],
186                      options => $options
187                   };                   };
188      # Bless and return it.      # Bless and return it.
189      bless $retVal, $class;      bless $retVal, $class;
190      return $retVal;      return $retVal;
191  }  }
192    
193    =head3 LoadOnly
194    
195    C<< my $flag = $spl->LoadOnly; >>
196    
197    Return TRUE if we are in load-only mode, else FALSE.
198    
199    =cut
200    
201    sub LoadOnly {
202        my ($self) = @_;
203        return $self->{options}->{loadOnly};
204    }
205    
206    =head3 PrimaryOnly
207    
208    C<< my $flag = $spl->PrimaryOnly; >>
209    
210    Return TRUE if only the main entity is to be loaded, else FALSE.
211    
212    =cut
213    
214    sub PrimaryOnly {
215        my ($self) = @_;
216        return $self->{options}->{primaryOnly};
217    }
218    
219  =head3 LoadGenomeData  =head3 LoadGenomeData
220    
221  C<< my $stats = $spl->LoadGenomeData(); >>  C<< my $stats = $spl->LoadGenomeData(); >>
# Line 192  Line 243 
243    
244  =back  =back
245    
 B<TO DO>  
   
 Real quality vectors instead of C<unknown> for everything.  
   
 GenomeGroup relation. (The original script took group information from the C<NMPDR> file  
 in each genome's main directory, but no such file exists anywhere in my version of the  
 data store.)  
   
246  =cut  =cut
247  #: Return Type $%;  #: Return Type $%;
248  sub LoadGenomeData {  sub LoadGenomeData {
# Line 210  Line 253 
253      # Get the genome count.      # Get the genome count.
254      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
255      my $genomeCount = (keys %{$genomeHash});      my $genomeCount = (keys %{$genomeHash});
     Trace("Beginning genome data load.") if T(2);  
256      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
257      my $loadGenome = $self->_TableLoader('Genome', $genomeCount);      my $loadGenome = $self->_TableLoader('Genome');
258      my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300);      my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly);
259      my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300);      my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly);
260      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000);      my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly);
261      my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000);      my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly);
262        if ($self->{options}->{loadOnly}) {
263            Trace("Loading from existing files.") if T(2);
264        } else {
265            Trace("Generating genome data.") if T(2);
266      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
267      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
268          Trace("Loading data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
269          $loadGenome->Add("genomeIn");          $loadGenome->Add("genomeIn");
270          # The access code comes in via the genome hash.          # The access code comes in via the genome hash.
271          my $accessCode = $genomeHash->{$genomeID};          my $accessCode = $genomeHash->{$genomeID};
272          # Get the genus, species, and strain from the scientific name. Note that we append              # Get the genus, species, and strain from the scientific name.
         # the genome ID to the strain. In some cases this is the totality of the strain name.  
273          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);          my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);
274          my $extra = join " ", @extraData, "[$genomeID]";              my $extra = join " ", @extraData;
275          # Get the full taxonomy.          # Get the full taxonomy.
276          my $taxonomy = $fig->taxonomy_of($genomeID);          my $taxonomy = $fig->taxonomy_of($genomeID);
277                # Get the version. If no version is specified, we default to the genome ID by itself.
278                my $version = $fig->genome_version($genomeID);
279                if (! defined($version)) {
280                    $version = $genomeID;
281                }
282                # Get the DNA size.
283                my $dnaSize = $fig->genome_szdna($genomeID);
284                # Open the NMPDR group file for this genome.
285                my $group;
286                if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
287                    defined($group = <TMP>)) {
288                    # Clean the line ending.
289                    chomp $group;
290                } else {
291                    # No group, so use the default.
292                    $group = $FIG_Config::otherGroup;
293                }
294                close TMP;
295          # Output the genome record.          # Output the genome record.
296          $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),
297                           $species, $extra, $taxonomy);                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);
298          # Now we loop through each of the genome's contigs.          # Now we loop through each of the genome's contigs.
299          my @contigs = $fig->all_contigs($genomeID);          my @contigs = $fig->all_contigs($genomeID);
300          for my $contigID (@contigs) {          for my $contigID (@contigs) {
# Line 262  Line 325 
325              }              }
326          }          }
327      }      }
328        }
329      # Finish the loads.      # Finish the loads.
330      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
331      # Return the result.      # Return the result.
# Line 302  Line 366 
366      my $fig = $self->{fig};      my $fig = $self->{fig};
367      # Get the genome hash.      # Get the genome hash.
368      my $genomeFilter = $self->{genomes};      my $genomeFilter = $self->{genomes};
369      my $genomeCount = (keys %{$genomeFilter});      # Set up an ID counter for the PCHs.
370      my $featureCount = $genomeCount * 4000;      my $pchID = 0;
371      # Start the loads.      # Start the loads.
372      my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount);      my $loadCoupling = $self->_TableLoader('Coupling');
373      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000);      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);
374      my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000);      my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly);
375      my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000);      my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly);
376      my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000);      my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly);
377      Trace("Beginning coupling data load.") if T(2);      if ($self->{options}->{loadOnly}) {
378            Trace("Loading from existing files.") if T(2);
379        } else {
380            Trace("Generating coupling data.") if T(2);
381      # Loop through the genomes found.      # Loop through the genomes found.
382      for my $genome (sort keys %{$genomeFilter}) {      for my $genome (sort keys %{$genomeFilter}) {
383          Trace("Generating coupling data for $genome.") if T(3);          Trace("Generating coupling data for $genome.") if T(3);
# Line 334  Line 401 
401              for my $coupleData (@couplings) {              for my $coupleData (@couplings) {
402                  my ($peg2, $score) = @{$coupleData};                  my ($peg2, $score) = @{$coupleData};
403                  # Compute the coupling ID.                  # Compute the coupling ID.
404                  my $coupleID = Sprout::CouplingID($peg1, $peg2);                      my $coupleID = $self->{erdb}->CouplingID($peg1, $peg2);
405                  if (! exists $dupHash{$coupleID}) {                  if (! exists $dupHash{$coupleID}) {
406                      $loadCoupling->Add("couplingIn");                      $loadCoupling->Add("couplingIn");
407                      # Here we have a new coupling to store in the load files.                      # Here we have a new coupling to store in the load files.
# Line 362  Line 429 
429                              # We store this evidence in the hash if the usage                              # We store this evidence in the hash if the usage
430                              # is nonzero or no prior evidence has been found. This                              # is nonzero or no prior evidence has been found. This
431                              # insures that if there is duplicate evidence, we                              # insures that if there is duplicate evidence, we
432                              # at least keep the meaningful ones. Only evidence is                                  # at least keep the meaningful ones. Only evidence in
433                              # the hash makes it to the output.                              # the hash makes it to the output.
434                              if ($usage || ! exists $evidenceMap{$evidenceKey}) {                              if ($usage || ! exists $evidenceMap{$evidenceKey}) {
435                                  $evidenceMap{$evidenceKey} = $evidenceData;                                  $evidenceMap{$evidenceKey} = $evidenceData;
# Line 370  Line 437 
437                          }                          }
438                      }                      }
439                      for my $evidenceID (keys %evidenceMap) {                      for my $evidenceID (keys %evidenceMap) {
440                                # Get the ID for this evidence.
441                                $pchID++;
442                          # Create the evidence record.                          # Create the evidence record.
443                          my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};                          my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};
444                          $loadPCH->Put($evidenceID, $usage);                              $loadPCH->Put($pchID, $usage);
445                          # Connect it to the coupling.                          # Connect it to the coupling.
446                          $loadIsEvidencedBy->Put($coupleID, $evidenceID);                              $loadIsEvidencedBy->Put($coupleID, $pchID);
447                          # Connect it to the features.                          # Connect it to the features.
448                          $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);                              $loadUsesAsEvidence->Put($pchID, $peg3, 1);
449                          $loadUsesAsEvidence->Put($evidenceID, $peg4, 1);                              $loadUsesAsEvidence->Put($pchID, $peg4, 2);
450                            }
451                      }                      }
452                  }                  }
453              }              }
# Line 404  Line 474 
474      FeatureTranslation      FeatureTranslation
475      FeatureUpstream      FeatureUpstream
476      IsLocatedIn      IsLocatedIn
477        HasFeature
478        HasRoleInSubsystem
479        FeatureEssential
480        FeatureVirulent
481        FeatureIEDB
482    
483  =over 4  =over 4
484    
# Line 418  Line 493 
493  sub LoadFeatureData {  sub LoadFeatureData {
494      # Get this object instance.      # Get this object instance.
495      my ($self) = @_;      my ($self) = @_;
496      # Get the FIG object.      # Get the FIG and Sprout objects.
497      my $fig = $self->{fig};      my $fig = $self->{fig};
498        my $sprout = $self->{sprout};
499      # Get the table of genome IDs.      # Get the table of genome IDs.
500      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
501      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
502      my $loadFeature = $self->_TableLoader('Feature', $featureCount);      my $loadFeature = $self->_TableLoader('Feature');
503      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly);
504      my $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);      my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
505      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
506      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
507      my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
508        my $loadHasFeature = $self->_TableLoader('HasFeature', $self->PrimaryOnly);
509        my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem', $self->PrimaryOnly);
510        my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
511        my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
512        my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
513        # Get the subsystem hash.
514        my $subHash = $self->{subsystems};
515      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
516      # locations.      # locations.
517      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
518      Trace("Beginning feature data load.") if T(2);      if ($self->{options}->{loadOnly}) {
519            Trace("Loading from existing files.") if T(2);
520        } else {
521            Trace("Generating feature data.") if T(2);
522      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
523      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
524          Trace("Loading features for genome $genomeID.") if T(3);          Trace("Loading features for genome $genomeID.") if T(3);
525          $loadFeature->Add("genomeIn");          $loadFeature->Add("genomeIn");
526          # Get the feature list for this genome.          # Get the feature list for this genome.
527          my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
528                # Sort and count the list.
529                my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
530                my $count = scalar @featureTuples;
531                my @fids = map { $_->[0] } @featureTuples;
532                Trace("$count features found for genome $genomeID.") if T(3);
533                # Get the attributes for this genome and put them in a hash by feature ID.
534                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids);
535                # Set up for our duplicate-feature check.
536                my $oldFeatureID = "";
537          # Loop through the features.          # Loop through the features.
538          for my $featureData (@{$features}) {              for my $featureTuple (@featureTuples) {
             $loadFeature->Add("featureIn");  
539              # Split the tuple.              # Split the tuple.
540              my ($featureID, $locations, $aliases, $type) = @{$featureData};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
541              # Create the feature record.                  # Check for duplicates.
542              $loadFeature->Put($featureID, 1, $type);                  if ($featureID eq $oldFeatureID) {
543                        Trace("Duplicate feature $featureID found.") if T(1);
544                    } else {
545                        $oldFeatureID = $featureID;
546                        # Count this feature.
547                        $loadFeature->Add("featureIn");
548                        # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
549                        # Sprout database requires a single character.
550                        if (! defined($quality) || $quality eq "") {
551                            $quality = " ";
552                        }
553                        # Begin building the keywords. We start with the genome ID, the
554                        # feature ID, the taxonomy, and the organism name.
555                        my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
556                                        $fig->taxonomy_of($genomeID));
557              # Create the aliases.              # Create the aliases.
558              for my $alias (split /\s*,\s*/, $aliases) {                      for my $alias ($fig->feature_aliases($featureID)) {
559                  $loadFeatureAlias->Put($featureID, $alias);                  $loadFeatureAlias->Put($featureID, $alias);
560                            push @keywords, $alias;
561              }              }
562                        Trace("Assignment for $featureID is: $assignment") if T(4);
563                        # Break the assignment into words and shove it onto the
564                        # keyword list.
565                        push @keywords, split(/\s+/, $assignment);
566                        # Link this feature to the parent genome.
567                        $loadHasFeature->Put($genomeID, $featureID, $type);
568              # Get the links.              # Get the links.
569              my @links = $fig->fid_links($featureID);              my @links = $fig->fid_links($featureID);
570              for my $link (@links) {              for my $link (@links) {
# Line 470  Line 583 
583                      $loadFeatureUpstream->Put($featureID, $upstream);                      $loadFeatureUpstream->Put($featureID, $upstream);
584                  }                  }
585              }              }
586                        # Now we need to find the subsystems this feature participates in.
587                        # We also add the subsystems to the keyword list. Before we do that,
588                        # we must convert underscores to spaces and tack on the classifications.
589                        my @subsystems = $fig->peg_to_subsystems($featureID);
590                        for my $subsystem (@subsystems) {
591                            # Only proceed if we like this subsystem.
592                            if (exists $subHash->{$subsystem}) {
593                                # Store the has-role link.
594                                $loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type);
595                                # Save the subsystem's keyword data.
596                                my $subKeywords = $subHash->{$subsystem};
597                                push @keywords, split /\s+/, $subKeywords;
598                                # Now we need to get this feature's role in the subsystem.
599                                my $subObject = $fig->get_subsystem($subsystem);
600                                my @roleColumns = $subObject->get_peg_roles($featureID);
601                                my @allRoles = $subObject->get_roles();
602                                for my $col (@roleColumns) {
603                                    my $role = $allRoles[$col];
604                                    push @keywords, split /\s+/, $role;
605                                    push @keywords, $subObject->get_role_abbr($col);
606                                }
607                            }
608                        }
609                        # There are three special attributes computed from property
610                        # data that we build next. If the special attribute is non-empty,
611                        # its name will be added to the keyword list. First, we get all
612                        # the attributes for this feature. They will come back as
613                        # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
614                        # [name, value, value with URL]. (We don't need the PEG, since
615                        # we already know it.)
616                        my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
617                                             @{$attributes->{$featureID}};
618                        # Now we process each of the special attributes.
619                        if (SpecialAttribute($featureID, \@attributes,
620                                             1, [0,2], '^(essential|potential_essential)$',
621                                             $loadFeatureEssential)) {
622                            push @keywords, 'essential';
623                            $loadFeature->Add('essential');
624                        }
625                        if (SpecialAttribute($featureID, \@attributes,
626                                             0, [2], '^virulen',
627                                             $loadFeatureVirulent)) {
628                            push @keywords, 'virulent';
629                            $loadFeature->Add('virulent');
630                        }
631                        if (SpecialAttribute($featureID, \@attributes,
632                                             0, [0,2], '^iedb_',
633                                             $loadFeatureIEDB)) {
634                            push @keywords, 'iedb';
635                            $loadFeature->Add('iedb');
636                        }
637                        # Now we need to bust up hyphenated words in the keyword
638                        # list. We keep them separate and put them at the end so
639                        # the original word order is available.
640                        my $keywordString = "";
641                        my $bustedString = "";
642                        for my $keyword (@keywords) {
643                            if (length $keyword >= 3) {
644                                $keywordString .= " $keyword";
645                                if ($keyword =~ /-/) {
646                                    my @words = split /-/, $keyword;
647                                    $bustedString .= join(" ", "", @words);
648                                }
649                            }
650                        }
651                        $keywordString .= $bustedString;
652                        # Get rid of annoying punctuation.
653                        $keywordString =~ s/[();]//g;
654                        # Clean the keyword list.
655                        my $cleanWords = $sprout->CleanKeywords($keywordString);
656                        Trace("Keyword string for $featureID: $cleanWords") if T(4);
657                        # Create the feature record.
658                        $loadFeature->Put($featureID, 1, $user, $quality, $type, $assignment, $cleanWords);
659              # This part is the roughest. We need to relate the features to contig              # This part is the roughest. We need to relate the features to contig
660              # locations, and the locations must be split so that none of them exceed              # locations, and the locations must be split so that none of them exceed
661              # the maximum segment size. This simplifies the genes_in_region processing              # the maximum segment size. This simplifies the genes_in_region processing
662              # for Sprout.              # for Sprout.
663              my @locationList = split /\s*,\s*/, $locations;              my @locationList = split /\s*,\s*/, $locations;
664                        # Create the location position indicator.
665                        my $i = 1;
666              # Loop through the locations.              # Loop through the locations.
667              for my $location (@locationList) {              for my $location (@locationList) {
668                  # Parse the location.                  # Parse the location.
669                  my $locObject = BasicLocation->new($location);                          my $locObject = BasicLocation->new("$genomeID:$location");
670                  # Split it into a list of chunks.                  # Split it into a list of chunks.
671                  my @locOList = ();                  my @locOList = ();
672                  while (my $peeling = $locObject->Peel($chunkSize)) {                  while (my $peeling = $locObject->Peel($chunkSize)) {
# Line 488  Line 676 
676                  push @locOList, $locObject;                  push @locOList, $locObject;
677                  # Loop through the chunks, creating IsLocatedIn records. The variable                  # Loop through the chunks, creating IsLocatedIn records. The variable
678                  # "$i" will be used to keep the location index.                  # "$i" will be used to keep the location index.
                 my $i = 1;  
679                  for my $locChunk (@locOList) {                  for my $locChunk (@locOList) {
680                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,                      $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,
681                                            $locChunk->Dir, $locChunk->Length, $i);                                            $locChunk->Dir, $locChunk->Length, $i);
# Line 497  Line 684 
684              }              }
685          }          }
686      }      }
     # Finish the loads.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
 =head3 LoadBBHData  
   
 C<< my $stats = $spl->LoadBBHData(); >>  
   
 Load the bidirectional best hit data from FIG into Sprout.  
   
 Sprout does not store information on similarities. Instead, it has only the  
 bi-directional best hits. Even so, the BBH table is one of the largest in  
 the database.  
   
 The following relations are loaded by this method.  
   
     IsBidirectionalBestHitOf  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadBBHData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the table of genome IDs.  
     my $genomeHash = $self->{genomes};  
     my $genomeCount = (keys %{$genomeHash});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf',  
                                                            $featureCount * $genomeCount);  
     Trace("Beginning BBH load.") if T(2);  
     # Now we loop through the genomes, generating the data for each one.  
     for my $genomeID (sort keys %{$genomeHash}) {  
         $loadIsBidirectionalBestHitOf->Add("genomeIn");  
         Trace("Processing features for genome $genomeID.") if T(3);  
         # Get the feature list for this genome.  
         my $features = $fig->all_features_detailed($genomeID);  
         # Loop through the features.  
         for my $featureData (@{$features}) {  
             # Split the tuple.  
             my ($featureID, $locations, $aliases, $type) = @{$featureData};  
             # Get the bi-directional best hits.  
             my @bbhList = $fig->bbhs($featureID);  
             for my $bbhEntry (@bbhList) {  
                 # Get the target feature ID and the score.  
                 my ($targetID, $score) = @{$bbhEntry};  
                 # Check the target feature's genome.  
                 my $targetGenomeID = $fig->genome_of($targetID);  
                 # Only proceed if it's one of our genomes.  
                 if ($genomeHash->{$targetGenomeID}) {  
                     $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,  
                                                        $score);  
                 }  
             }  
687          }          }
688      }      }
689      # Finish the loads.      # Finish the loads.
# Line 584  Line 706 
706  The following relations are loaded by this method.  The following relations are loaded by this method.
707    
708      Subsystem      Subsystem
709        SubsystemClass
710      Role      Role
711        RoleEC
712      SSCell      SSCell
713      ContainsFeature      ContainsFeature
714      IsGenomeOf      IsGenomeOf
# Line 592  Line 716 
716      OccursInSubsystem      OccursInSubsystem
717      ParticipatesIn      ParticipatesIn
718      HasSSCell      HasSSCell
719        ConsistsOfRoles
720        RoleSubset
721        HasRoleSubset
722        ConsistsOfGenomes
723        GenomeSubset
724        HasGenomeSubset
725        Catalyzes
726        Diagram
727        RoleOccursIn
728    
729  =over 4  =over 4
730    
# Line 601  Line 734 
734    
735  =back  =back
736    
 B<TO DO>  
   
 Generate RoleName table?  
   
737  =cut  =cut
738  #: Return Type $%;  #: Return Type $%;
739  sub LoadSubsystemData {  sub LoadSubsystemData {
# Line 618  Line 747 
747      # Get the subsystem hash. This lists the subsystems we'll process.      # Get the subsystem hash. This lists the subsystems we'll process.
748      my $subsysHash = $self->{subsystems};      my $subsysHash = $self->{subsystems};
749      my @subsysIDs = sort keys %{$subsysHash};      my @subsysIDs = sort keys %{$subsysHash};
750      my $subsysCount = @subsysIDs;      # Get the map list.
751      my $genomeCount = (keys %{$genomeHash});      my @maps = $fig->all_maps;
     my $featureCount = $genomeCount * 4000;  
752      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
753      my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);      my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly);
754      my $loadRole = $self->_TableLoader('Role', $featureCount * 6);      my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly);
755      my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);      my $loadSubsystem = $self->_TableLoader('Subsystem');
756      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);      my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly);
757      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);      my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly);
758      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount);      my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly);
759      my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);      my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly);
760      my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);      my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly);
761      my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);      my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly);
762      Trace("Beginning subsystem data load.") if T(2);      my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly);
763        my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly);
764        my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly);
765        my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly);
766        my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly);
767        my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly);
768        my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly);
769        my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly);
770        my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly);
771        my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly);
772        my $loadSubsystemClass = $self->_TableLoader('SubsystemClass', $self->PrimaryOnly);
773        if ($self->{options}->{loadOnly}) {
774            Trace("Loading from existing files.") if T(2);
775        } else {
776            Trace("Generating subsystem data.") if T(2);
777            # This hash will contain the role for each EC. When we're done, this
778            # information will be used to generate the Catalyzes table.
779            my %ecToRoles = ();
780      # Loop through the subsystems. Our first task will be to create the      # Loop through the subsystems. Our first task will be to create the
781      # roles. We do this by looping through the subsystems and creating a      # roles. We do this by looping through the subsystems and creating a
782      # role hash. The hash tracks each role ID so that we don't create      # role hash. The hash tracks each role ID so that we don't create
783      # duplicates. As we move along, we'll connect the roles and subsystems.          # duplicates. As we move along, we'll connect the roles and subsystems
784            # and memorize up the reactions.
785            my ($genomeID, $roleID);
786      my %roleData = ();      my %roleData = ();
787      for my $subsysID (@subsysIDs) {      for my $subsysID (@subsysIDs) {
788                # Get the subsystem object.
789                my $sub = $fig->get_subsystem($subsysID);
790                # Only proceed if the subsystem has a spreadsheet.
791                if (! $sub->{empty_ss}) {
792          Trace("Creating subsystem $subsysID.") if T(3);          Trace("Creating subsystem $subsysID.") if T(3);
793          $loadSubsystem->Add("subsystemIn");          $loadSubsystem->Add("subsystemIn");
794          # Create the subsystem record.          # Create the subsystem record.
795          $loadSubsystem->Put($subsysID);                  my $curator = $sub->get_curator();
796          # Get the subsystem's roles.                  my $notes = $sub->get_notes();
797          my @roles = $fig->subsystem_to_roles($subsysID);                  $loadSubsystem->Put($subsysID, $curator, $notes);
798          # Connect the roles to the subsystem. If a role is new, we create                  # Now for the classification string. This comes back as a list
799          # a role record for it.                  # reference and we convert it to a space-delimited string.
800          for my $roleID (@roles) {                  my $classList = $fig->subsystem_classification($subsysID);
801                    my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
802                    $loadSubsystemClass->Put($subsysID, $classString);
803                    # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
804                    for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
805                        # Connect to this role.
806              $loadOccursInSubsystem->Add("roleIn");              $loadOccursInSubsystem->Add("roleIn");
807              $loadOccursInSubsystem->Put($roleID, $subsysID);                      $loadOccursInSubsystem->Put($roleID, $subsysID, $col);
808                        # If it's a new role, add it to the role table.
809              if (! exists $roleData{$roleID}) {              if (! exists $roleData{$roleID}) {
810                  $loadRole->Put($roleID);                          # Get the role's abbreviation.
811                            my $abbr = $sub->get_role_abbr($col);
812                            # Add the role.
813                            $loadRole->Put($roleID, $abbr);
814                  $roleData{$roleID} = 1;                  $roleData{$roleID} = 1;
815                            # Check for an EC number.
816                            if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) {
817                                my $ec = $1;
818                                $loadRoleEC->Put($roleID, $ec);
819                                $ecToRoles{$ec} = $roleID;
820                            }
821              }              }
822          }          }
823          # Now all roles for this subsystem have been filled in. We create the                  # Now we create the spreadsheet for the subsystem by matching roles to
824          # spreadsheet by matches roles to genomes. To do this, we need to                  # genomes. Each genome is a row and each role is a column. We may need
825          # get the genomes on the sheet.                  # to actually create the roles as we find them.
826          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);          Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
827          my @genomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)};                  for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) {
828          for my $genomeID (@genomes) {                      # Only proceed if this is one of our genomes.
             # Only process this genome if it's one of ours.  
829              if (exists $genomeHash->{$genomeID}) {              if (exists $genomeHash->{$genomeID}) {
830                  # Connect the genome to the subsystem.                          # Count the PEGs and cells found for verification purposes.
831                  $loadParticipatesIn->Put($genomeID, $subsysID);                          my $pegCount = 0;
832                            my $cellCount = 0;
833                            # Create a list for the PEGs we find. This list will be used
834                            # to generate cluster numbers.
835                            my @pegsFound = ();
836                            # Create a hash that maps spreadsheet IDs to PEGs. We will
837                            # use this to generate the ContainsFeature data after we have
838                            # the cluster numbers.
839                            my %cellPegs = ();
840                            # Get the genome's variant code for this subsystem.
841                            my $variantCode = $sub->get_variant_code($row);
842                  # Loop through the subsystem's roles. We use an index because it is                  # Loop through the subsystem's roles. We use an index because it is
843                  # part of the spreadsheet cell ID.                  # part of the spreadsheet cell ID.
844                  for (my $i = 0; $i <= $#roles; $i++) {                          for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
                     my $role = $roles[$i];  
845                      # Get the features in the spreadsheet cell for this genome and role.                      # Get the features in the spreadsheet cell for this genome and role.
846                      my @pegs = $fig->pegs_in_subsystem_cell($subsysID, $genomeID, $i);                              my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col);
847                      # Only proceed if features exist.                      # Only proceed if features exist.
848                      if (@pegs > 0) {                      if (@pegs > 0) {
849                          # Create the spreadsheet cell.                          # Create the spreadsheet cell.
850                          my $cellID = "$subsysID:$genomeID:$i";                                  $cellCount++;
851                                    my $cellID = "$subsysID:$genomeID:$col";
852                          $loadSSCell->Put($cellID);                          $loadSSCell->Put($cellID);
853                          $loadIsGenomeOf->Put($genomeID, $cellID);                          $loadIsGenomeOf->Put($genomeID, $cellID);
854                          $loadIsRoleOf->Put($role, $cellID);                                  $loadIsRoleOf->Put($roleID, $cellID);
855                          $loadHasSSCell->Put($subsysID, $cellID);                          $loadHasSSCell->Put($subsysID, $cellID);
856                          # Attach the features to it.                                  # Remember its features.
857                          for my $pegID (@pegs) {                                  push @pegsFound, @pegs;
858                              $loadContainsFeature->Put($cellID, $pegID);                                  $cellPegs{$cellID} = \@pegs;
859                                    $pegCount += @pegs;
860                                }
861                            }
862                            # If we found some cells for this genome, we need to compute clusters and
863                            # denote it participates in the subsystem.
864                            if ($pegCount > 0) {
865                                Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
866                                $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
867                                # Create a hash mapping PEG IDs to cluster numbers.
868                                # We default to -1 for all of them.
869                                my %clusterOf = map { $_ => -1 } @pegsFound;
870                                # Partition the PEGs found into clusters.
871                                my @clusters = $fig->compute_clusters([keys %clusterOf], $sub);
872                                for (my $i = 0; $i <= $#clusters; $i++) {
873                                    my $subList = $clusters[$i];
874                                    for my $peg (@{$subList}) {
875                                        $clusterOf{$peg} = $i;
876                                    }
877                                }
878                                # Create the ContainsFeature data.
879                                for my $cellID (keys %cellPegs) {
880                                    my $cellList = $cellPegs{$cellID};
881                                    for my $cellPeg (@$cellList) {
882                                        $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg});
883                          }                          }
884                      }                      }
885                  }                  }
886              }              }
887          }          }
888                    # Now we need to generate the subsets. The subset names must be concatenated to
889                    # the subsystem name to make them unique keys. There are two types of subsets:
890                    # genome subsets and role subsets. We do the role subsets first.
891                    my @subsetNames = $sub->get_subset_names();
892                    for my $subsetID (@subsetNames) {
893                        # Create the subset record.
894                        my $actualID = "$subsysID:$subsetID";
895                        $loadRoleSubset->Put($actualID);
896                        # Connect the subset to the subsystem.
897                        $loadHasRoleSubset->Put($subsysID, $actualID);
898                        # Connect the subset to its roles.
899                        my @roles = $sub->get_subsetC_roles($subsetID);
900                        for my $roleID (@roles) {
901                            $loadConsistsOfRoles->Put($actualID, $roleID);
902      }      }
     # Finish the load.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
903  }  }
904                    # Next the genome subsets.
905  =head3 LoadDiagramData                  @subsetNames = $sub->get_subset_namesR();
906                    for my $subsetID (@subsetNames) {
907  C<< my $stats = $spl->LoadDiagramData(); >>                      # Create the subset record.
908                        my $actualID = "$subsysID:$subsetID";
909  Load the diagram data from FIG into Sprout.                      $loadGenomeSubset->Put($actualID);
910                        # Connect the subset to the subsystem.
911  Diagrams are used to organize functional roles. The diagram shows the                      $loadHasGenomeSubset->Put($subsysID, $actualID);
912  connections between chemicals that interact with a subsystem.                      # Connect the subset to its genomes.
913                        my @genomes = $sub->get_subsetR($subsetID);
914  The following relations are loaded by this method.                      for my $genomeID (@genomes) {
915                            $loadConsistsOfGenomes->Put($actualID, $genomeID);
916      Diagram                      }
917      RoleOccursIn                  }
918                }
919  =over 4          }
920            # Now we loop through the diagrams. We need to create the diagram records
921  =item RETURNS          # and link each diagram to its roles. Note that only roles which occur
922            # in subsystems (and therefore appear in the %ecToRoles hash) are
923  Returns a statistics object for the loads.          # included.
924            for my $map (@maps) {
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadDiagramData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the map list.  
     my @maps = $fig->all_maps;  
     my $mapCount = @maps;  
     my $genomeCount = (keys %{$self->{genomes}});  
     my $featureCount = $genomeCount * 4000;  
     # Create load objects for each of the tables we're loading.  
     my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);  
     my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);  
     Trace("Beginning diagram data load.") if T(2);  
     # Loop through the diagrams.  
     for my $map ($fig->all_maps) {  
925          Trace("Loading diagram $map.") if T(3);          Trace("Loading diagram $map.") if T(3);
926          # Get the diagram's descriptive name.          # Get the diagram's descriptive name.
927          my $name = $fig->map_name($map);          my $name = $fig->map_name($map);
# Line 740  Line 930 
930          # A hash is used to prevent duplicates.          # A hash is used to prevent duplicates.
931          my %roleHash = ();          my %roleHash = ();
932          for my $role ($fig->map_to_ecs($map)) {          for my $role ($fig->map_to_ecs($map)) {
933              if (! $roleHash{$role}) {                  if (exists $ecToRoles{$role} && ! $roleHash{$role}) {
934                  $loadRoleOccursIn->Put($role, $map);                      $loadRoleOccursIn->Put($ecToRoles{$role}, $map);
935                  $roleHash{$role} = 1;                  $roleHash{$role} = 1;
936              }              }
937          }          }
938      }      }
939            # Before we leave, we must create the Catalyzes table. We start with the reactions,
940            # then use the "ecToRoles" table to convert EC numbers to role IDs.
941            my @reactions = $fig->all_reactions();
942            for my $reactionID (@reactions) {
943                # Get this reaction's list of roles. The results will be EC numbers.
944                my @roles = $fig->catalyzed_by($reactionID);
945                # Loop through the roles, creating catalyzation records.
946                for my $thisRole (@roles) {
947                    if (exists $ecToRoles{$thisRole}) {
948                        $loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID);
949                    }
950                }
951            }
952        }
953      # Finish the load.      # Finish the load.
954      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
955      return $retVal;      return $retVal;
# Line 787  Line 991 
991      my $fig = $self->{fig};      my $fig = $self->{fig};
992      # Get the genome hash.      # Get the genome hash.
993      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
994      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
995      my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500);      my $loadProperty = $self->_TableLoader('Property');
996      my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500);      my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly);
997      Trace("Beginning property data load.") if T(2);      if ($self->{options}->{loadOnly}) {
998            Trace("Loading from existing files.") if T(2);
999        } else {
1000            Trace("Generating property data.") if T(2);
1001      # Create a hash for storing property IDs.      # Create a hash for storing property IDs.
1002      my %propertyKeys = ();      my %propertyKeys = ();
1003      my $nextID = 1;      my $nextID = 1;
1004      # Loop through the genomes.      # Loop through the genomes.
1005      for my $genomeID (keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
1006          $loadProperty->Add("genomeIn");          $loadProperty->Add("genomeIn");
1007                Trace("Generating properties for $genomeID.") if T(3);
1008          # Get the genome's features. The feature ID is the first field in the          # Get the genome's features. The feature ID is the first field in the
1009          # tuples returned by "all_features_detailed". We use "all_features_detailed"          # tuples returned by "all_features_detailed". We use "all_features_detailed"
1010          # rather than "all_features" because we want all features regardless of type.          # rather than "all_features" because we want all features regardless of type.
1011          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};
1012                my $featureCount = 0;
1013                my $propertyCount = 0;
1014                # Get the properties for this genome's features.
1015                my $attributes = GetGenomeAttributes($fig, $genomeID, \@features);
1016                Trace("Property hash built for $genomeID.") if T(3);
1017          # Loop through the features, creating HasProperty records.          # Loop through the features, creating HasProperty records.
1018          for my $fid (@features) {          for my $fid (@features) {
             $loadProperty->Add("featureIn");  
1019              # Get all attributes for this feature. We do this one feature at a time              # Get all attributes for this feature. We do this one feature at a time
1020              # to insure we do not get any genome attributes.              # to insure we do not get any genome attributes.
1021              my @attributeList = $fig->get_attributes($fid, '', '', '');                  my @attributeList = @{$attributes->{$fid}};
1022                    if (scalar @attributeList) {
1023                        $featureCount++;
1024                    }
1025              # Loop through the attributes.              # Loop through the attributes.
1026              for my $tuple (@attributeList) {              for my $tuple (@attributeList) {
1027                        $propertyCount++;
1028                  # Get this attribute value's data. Note that we throw away the FID,                  # Get this attribute value's data. Note that we throw away the FID,
1029                  # since it will always be the same as the value if "$fid".                  # since it will always be the same as the value if "$fid".
1030                  my (undef, $key, $value, $url) = @{$tuple};                  my (undef, $key, $value, $url) = @{$tuple};
# Line 831  Line 1046 
1046                  $loadHasProperty->Put($fid, $propertyID, $url);                  $loadHasProperty->Put($fid, $propertyID, $url);
1047              }              }
1048          }          }
1049                # Update the statistics.
1050                Trace("$propertyCount attributes processed for $featureCount features.") if T(3);
1051                $loadHasProperty->Add("featuresIn", $featureCount);
1052                $loadHasProperty->Add("propertiesIn", $propertyCount);
1053            }
1054      }      }
1055      # Finish the load.      # Finish the load.
1056      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 871  Line 1091 
1091      my $fig = $self->{fig};      my $fig = $self->{fig};
1092      # Get the genome hash.      # Get the genome hash.
1093      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1094      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1095      my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000);      my $loadAnnotation = $self->_TableLoader('Annotation');
1096      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000);      my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly);
1097      my $loadSproutUser = $self->_TableLoader('SproutUser', 100);      my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly);
1098      my $loadUserAccess = $self->_TableLoader('UserAccess', 1000);      my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly);
1099      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000);      my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly);
1100      Trace("Beginning annotation data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1101            Trace("Loading from existing files.") if T(2);
1102        } else {
1103            Trace("Generating annotation data.") if T(2);
1104      # Create a hash of user names. We'll use this to prevent us from generating duplicate      # Create a hash of user names. We'll use this to prevent us from generating duplicate
1105      # user records.      # user records.
1106      my %users = ( FIG => 1, master => 1 );      my %users = ( FIG => 1, master => 1 );
# Line 892  Line 1114 
1114      # Loop through the genomes.      # Loop through the genomes.
1115      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
1116          Trace("Processing $genomeID.") if T(3);          Trace("Processing $genomeID.") if T(3);
         # Get the genome's PEGs.  
         my @pegs = $fig->pegs_of($genomeID);  
         for my $peg (@pegs) {  
             Trace("Processing $peg.") if T(4);  
1117              # Create a hash of timestamps. We use this to prevent duplicate time stamps              # Create a hash of timestamps. We use this to prevent duplicate time stamps
1118              # from showing up for a single PEG's annotations.              # from showing up for a single PEG's annotations.
1119              my %seenTimestamps = ();              my %seenTimestamps = ();
1120              # Check for a functional assignment.              # Get the genome's annotations.
1121              my $func = $fig->function_of($peg);              my @annotations = $fig->read_all_annotations($genomeID);
1122              if ($func) {              Trace("Processing annotations.") if T(2);
1123                  # If this is NOT a hypothetical assignment, we create an              for my $tuple (@annotations) {
1124                  # assignment annotation for it.                  # Get the annotation tuple.
1125                  if (! FIG::hypo($peg)) {                  my ($peg, $timestamp, $user, $text) = @{$tuple};
                     # Note that we double the slashes so that what goes into the database is  
                     # a new-line escape sequence rather than an actual new-line.  
                     $loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func");  
                     $loadIsTargetOfAnnotation->Put($peg, "$peg:$time");  
                     $loadMadeAnnotation->Put("FIG", "$peg:$time");  
                     # Denote we've seen this timestamp.  
                     $seenTimestamps{$time} = 1;  
                 }  
                 # Now loop through the real annotations.  
                 for my $tuple ($fig->feature_annotations($peg, "raw")) {  
                     my ($fid, $timestamp, $user, $text) = @{$tuple};  
1126                      # Here we fix up the annotation text. "\r" is removed,                      # Here we fix up the annotation text. "\r" is removed,
1127                      # and "\t" and "\n" are escaped. Note we use the "s"                  # and "\t" and "\n" are escaped. Note we use the "gs"
1128                      # modifier so that new-lines inside the text do not                      # modifier so that new-lines inside the text do not
1129                      # stop the substitution search.                      # stop the substitution search.
1130                      $text =~ s/\r//gs;                      $text =~ s/\r//gs;
# Line 927  Line 1134 
1134                      $text =~ s/Set master function/Set FIG function/s;                      $text =~ s/Set master function/Set FIG function/s;
1135                      # Insure the time stamp is valid.                      # Insure the time stamp is valid.
1136                      if ($timestamp =~ /^\d+$/) {                      if ($timestamp =~ /^\d+$/) {
1137                          # Here it's a number. We need to insure it's unique.                      # Here it's a number. We need to insure the one we use to form
1138                          while ($seenTimestamps{$timestamp}) {                      # the key is unique.
1139                              $timestamp++;                      my $keyStamp = $timestamp;
1140                        while ($seenTimestamps{"$peg:$keyStamp"}) {
1141                            $keyStamp++;
1142                          }                          }
1143                          $seenTimestamps{$timestamp} = 1;                      my $annotationID = "$peg:$keyStamp";
1144                          my $annotationID = "$peg:$timestamp";                      $seenTimestamps{$annotationID} = 1;
1145                          # Insure the user exists.                          # Insure the user exists.
1146                          if (! $users{$user}) {                          if (! $users{$user}) {
1147                              $loadSproutUser->Put($user, "SEED user");                              $loadSproutUser->Put($user, "SEED user");
# Line 940  Line 1149 
1149                              $users{$user} = 1;                              $users{$user} = 1;
1150                          }                          }
1151                          # Generate the annotation.                          # Generate the annotation.
1152                          $loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text");                      $loadAnnotation->Put($annotationID, $timestamp, $text);
1153                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);                          $loadIsTargetOfAnnotation->Put($peg, $annotationID);
1154                          $loadMadeAnnotation->Put($user, $annotationID);                          $loadMadeAnnotation->Put($user, $annotationID);
1155                      } else {                      } else {
# Line 950  Line 1159 
1159                  }                  }
1160              }              }
1161          }          }
     }  
1162      # Finish the load.      # Finish the load.
1163      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1164      return $retVal;      return $retVal;
# Line 991  Line 1199 
1199      my $fig = $self->{fig};      my $fig = $self->{fig};
1200      # Get the genome hash.      # Get the genome hash.
1201      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1202      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1203      my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4);      my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly);
1204      my $loadSource = $self->_TableLoader('Source', $genomeCount * 4);      my $loadSource = $self->_TableLoader('Source');
1205      my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8);      my $loadSourceURL = $self->_TableLoader('SourceURL');
1206      Trace("Beginning source data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1207            Trace("Loading from existing files.") if T(2);
1208        } else {
1209            Trace("Generating annotation data.") if T(2);
1210      # Create hashes to collect the Source information.      # Create hashes to collect the Source information.
1211      my %sourceURL = ();      my %sourceURL = ();
1212      my %sourceDesc = ();      my %sourceDesc = ();
# Line 1010  Line 1220 
1220              chomp $line;              chomp $line;
1221              my($sourceID, $desc, $url) = split(/\t/,$line);              my($sourceID, $desc, $url) = split(/\t/,$line);
1222              $loadComesFrom->Put($genomeID, $sourceID);              $loadComesFrom->Put($genomeID, $sourceID);
1223              if ($url && ! exists $sourceURL{$genomeID}) {                  if ($url && ! exists $sourceURL{$sourceID}) {
1224                  $loadSourceURL->Put($sourceID, $url);                  $loadSourceURL->Put($sourceID, $url);
1225                  $sourceURL{$sourceID} = 1;                  $sourceURL{$sourceID} = 1;
1226              }              }
1227              if ($desc && ! exists $sourceDesc{$sourceID}) {                  if ($desc) {
1228                  $loadSource->Put($sourceID, $desc);                      $sourceDesc{$sourceID} = $desc;
1229                  $sourceDesc{$sourceID} = 1;                  } elsif (! exists $sourceDesc{$sourceID}) {
1230                        $sourceDesc{$sourceID} = $sourceID;
1231              }              }
1232          }          }
1233          close TMP;          close TMP;
1234      }      }
1235            # Write the source descriptions.
1236            for my $sourceID (keys %sourceDesc) {
1237                $loadSource->Put($sourceID, $sourceDesc{$sourceID});
1238            }
1239        }
1240      # Finish the load.      # Finish the load.
1241      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1242      return $retVal;      return $retVal;
# Line 1060  Line 1276 
1276      my $fig = $self->{fig};      my $fig = $self->{fig};
1277      # Get the genome hash.      # Get the genome hash.
1278      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1279      # Convert the genome hash. We'll get the genus and species for each genome and make      # Convert the genome hash. We'll get the genus and species for each genome and make
1280      # it the key.      # it the key.
1281      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});      my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
1282      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
1283      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000);      my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
1284      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);      my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
1285      Trace("Beginning external data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1286            Trace("Loading from existing files.") if T(2);
1287        } else {
1288            Trace("Generating external data.") if T(2);
1289      # We loop through the files one at a time. First, the organism file.      # We loop through the files one at a time. First, the organism file.
1290      Open(\*ORGS, "<$FIG_Config::global/ext_org.table");          Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |");
1291      my $orgLine;      my $orgLine;
1292      while (defined($orgLine = <ORGS>)) {      while (defined($orgLine = <ORGS>)) {
1293          # Clean the input line.          # Clean the input line.
# Line 1081  Line 1299 
1299      close ORGS;      close ORGS;
1300      # Now the function file.      # Now the function file.
1301      my $funcLine;      my $funcLine;
1302      Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");          Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |");
1303      while (defined($funcLine = <FUNCS>)) {      while (defined($funcLine = <FUNCS>)) {
1304          # Clean the line ending.          # Clean the line ending.
1305          chomp $funcLine;          chomp $funcLine;
# Line 1097  Line 1315 
1315              $loadExternalAliasFunc->Put(@funcFields[0,1]);              $loadExternalAliasFunc->Put(@funcFields[0,1]);
1316          }          }
1317      }      }
1318        }
1319        # Finish the load.
1320        my $retVal = $self->_FinishAll();
1321        return $retVal;
1322    }
1323    
1324    
1325    =head3 LoadReactionData
1326    
1327    C<< my $stats = $spl->LoadReactionData(); >>
1328    
1329    Load the reaction data from FIG into Sprout.
1330    
1331    Reaction data connects reactions to the compounds that participate in them.
1332    
1333    The following relations are loaded by this method.
1334    
1335        Reaction
1336        ReactionURL
1337        Compound
1338        CompoundName
1339        CompoundCAS
1340        IsAComponentOf
1341    
1342    This method proceeds reaction by reaction rather than genome by genome.
1343    
1344    =over 4
1345    
1346    =item RETURNS
1347    
1348    Returns a statistics object for the loads.
1349    
1350    =back
1351    
1352    =cut
1353    #: Return Type $%;
1354    sub LoadReactionData {
1355        # Get this object instance.
1356        my ($self) = @_;
1357        # Get the FIG object.
1358        my $fig = $self->{fig};
1359        # Create load objects for each of the tables we're loading.
1360        my $loadReaction = $self->_TableLoader('Reaction');
1361        my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly);
1362        my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly);
1363        my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly);
1364        my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly);
1365        my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly);
1366        if ($self->{options}->{loadOnly}) {
1367            Trace("Loading from existing files.") if T(2);
1368        } else {
1369            Trace("Generating annotation data.") if T(2);
1370            # First we create the compounds.
1371            my @compounds = $fig->all_compounds();
1372            for my $cid (@compounds) {
1373                # Check for names.
1374                my @names = $fig->names_of_compound($cid);
1375                # Each name will be given a priority number, starting with 1.
1376                my $prio = 1;
1377                for my $name (@names) {
1378                    $loadCompoundName->Put($cid, $name, $prio++);
1379                }
1380                # Create the main compound record. Note that the first name
1381                # becomes the label.
1382                my $label = (@names > 0 ? $names[0] : $cid);
1383                $loadCompound->Put($cid, $label);
1384                # Check for a CAS ID.
1385                my $cas = $fig->cas($cid);
1386                if ($cas) {
1387                    $loadCompoundCAS->Put($cid, $cas);
1388                }
1389            }
1390            # All the compounds are set up, so we need to loop through the reactions next. First,
1391            # we initialize the discriminator index. This is a single integer used to insure
1392            # duplicate elements in a reaction are not accidentally collapsed.
1393            my $discrim = 0;
1394            my @reactions = $fig->all_reactions();
1395            for my $reactionID (@reactions) {
1396                # Create the reaction record.
1397                $loadReaction->Put($reactionID, $fig->reversible($reactionID));
1398                # Compute the reaction's URL.
1399                my $url = HTML::reaction_link($reactionID);
1400                # Put it in the ReactionURL table.
1401                $loadReactionURL->Put($reactionID, $url);
1402                # Now we need all of the reaction's compounds. We get these in two phases,
1403                # substrates first and then products.
1404                for my $product (0, 1) {
1405                    # Get the compounds of the current type for the current reaction. FIG will
1406                    # give us 3-tuples: [ID, stoichiometry, main-flag]. At this time we do not
1407                    # have location data in SEED, so it defaults to the empty string.
1408                    my @compounds = $fig->reaction2comp($reactionID, $product);
1409                    for my $compData (@compounds) {
1410                        # Extract the compound data from the current tuple.
1411                        my ($cid, $stoich, $main) = @{$compData};
1412                        # Link the compound to the reaction.
1413                        $loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main,
1414                                                 $product, $stoich);
1415                    }
1416                }
1417            }
1418        }
1419      # Finish the load.      # Finish the load.
1420      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1421      return $retVal;      return $retVal;
# Line 1112  Line 1431 
1431    
1432      GenomeGroups      GenomeGroups
1433    
1434  There is no direct support for genome groups in FIG, so we access the SEED  Currently, we do not use groups. We used to use them for NMPDR groups,
1435    butThere is no direct support for genome groups in FIG, so we access the SEED
1436  files directly.  files directly.
1437    
1438  =over 4  =over 4
# Line 1132  Line 1452 
1452      my $fig = $self->{fig};      my $fig = $self->{fig};
1453      # Get the genome hash.      # Get the genome hash.
1454      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
     my $genomeCount = (keys %{$genomeHash});  
1455      # Create a load object for the table we're loading.      # Create a load object for the table we're loading.
1456      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4);      my $loadGenomeGroups = $self->_TableLoader('GenomeGroups');
1457      Trace("Beginning group data load.") if T(2);      if ($self->{options}->{loadOnly}) {
1458            Trace("Loading from existing files.") if T(2);
1459        } else {
1460            Trace("Generating group data.") if T(2);
1461            # Currently there are no groups.
1462        }
1463        # Finish the load.
1464        my $retVal = $self->_FinishAll();
1465        return $retVal;
1466    }
1467    
1468    =head3 LoadSynonymData
1469    
1470    C<< my $stats = $spl->LoadSynonymData(); >>
1471    
1472    Load the synonym groups into Sprout.
1473    
1474    The following relations are loaded by this method.
1475    
1476        SynonymGroup
1477        IsSynonymGroupFor
1478    
1479    The source information for these relations is taken from the C<maps_to_id> method
1480    of the B<FIG> object. Unfortunately, to make this work, we need to use direct
1481    SQL against the FIG database.
1482    
1483    =over 4
1484    
1485    =item RETURNS
1486    
1487    Returns a statistics object for the loads.
1488    
1489    =back
1490    
1491    =cut
1492    #: Return Type $%;
1493    sub LoadSynonymData {
1494        # Get this object instance.
1495        my ($self) = @_;
1496        # Get the FIG object.
1497        my $fig = $self->{fig};
1498        # Get the genome hash.
1499        my $genomeHash = $self->{genomes};
1500        # Create a load object for the table we're loading.
1501        my $loadSynonymGroup = $self->_TableLoader('SynonymGroup');
1502        my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor');
1503        if ($self->{options}->{loadOnly}) {
1504            Trace("Loading from existing files.") if T(2);
1505        } else {
1506            Trace("Generating synonym group data.") if T(2);
1507            # Get the database handle.
1508            my $dbh = $fig->db_handle();
1509            # Ask for the synonyms.
1510            my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
1511            my $result = $sth->execute();
1512            if (! defined($result)) {
1513                Confess("Database error in Synonym load: " . $sth->errstr());
1514            } else {
1515                # Remember the current synonym.
1516                my $current_syn = "";
1517                # Count the features.
1518                my $featureCount = 0;
1519                # Loop through the synonym/peg pairs.
1520                while (my @row = $sth->fetchrow()) {
1521                    # Get the synonym ID and feature ID.
1522                    my ($syn_id, $peg) = @row;
1523                    # Insure it's for one of our genomes.
1524                    my $genomeID = FIG::genome_of($peg);
1525                    if (exists $genomeHash->{$genomeID}) {
1526                        # Verify the synonym.
1527                        if ($syn_id ne $current_syn) {
1528                            # It's new, so put it in the group table.
1529                            $loadSynonymGroup->Put($syn_id);
1530                            $current_syn = $syn_id;
1531                        }
1532                        # Connect the synonym to the peg.
1533                        $loadIsSynonymGroupFor->Put($syn_id, $peg);
1534                        # Count this feature.
1535                        $featureCount++;
1536                        if ($featureCount % 1000 == 0) {
1537                            Trace("$featureCount features processed.") if T(3);
1538                        }
1539                    }
1540                }
1541            }
1542        }
1543        # Finish the load.
1544        my $retVal = $self->_FinishAll();
1545        return $retVal;
1546    }
1547    
1548    =head3 LoadFamilyData
1549    
1550    C<< my $stats = $spl->LoadFamilyData(); >>
1551    
1552    Load the protein families into Sprout.
1553    
1554    The following relations are loaded by this method.
1555    
1556        Family
1557        IsFamilyForFeature
1558    
1559    The source information for these relations is taken from the C<families_for_protein>,
1560    C<family_function>, and C<sz_family> methods of the B<FIG> object.
1561    
1562    =over 4
1563    
1564    =item RETURNS
1565    
1566    Returns a statistics object for the loads.
1567    
1568    =back
1569    
1570    =cut
1571    #: Return Type $%;
1572    sub LoadFamilyData {
1573        # Get this object instance.
1574        my ($self) = @_;
1575        # Get the FIG object.
1576        my $fig = $self->{fig};
1577        # Get the genome hash.
1578        my $genomeHash = $self->{genomes};
1579        # Create load objects for the tables we're loading.
1580        my $loadFamily = $self->_TableLoader('Family');
1581        my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature');
1582        if ($self->{options}->{loadOnly}) {
1583            Trace("Loading from existing files.") if T(2);
1584        } else {
1585            Trace("Generating family data.") if T(2);
1586            # Create a hash for the family IDs.
1587            my %familyHash = ();
1588      # Loop through the genomes.      # Loop through the genomes.
1589      my $line;          for my $genomeID (sort keys %{$genomeHash}) {
1590      for my $genomeID (keys %{$genomeHash}) {              Trace("Processing features for $genomeID.") if T(2);
1591          Trace("Processing $genomeID.") if T(3);              # Loop through this genome's PEGs.
1592          # Open the NMPDR group file for this genome.              for my $fid ($fig->all_features($genomeID, "peg")) {
1593          if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&                  $loadIsFamilyForFeature->Add("features", 1);
1594              defined($line = <TMP>)) {                  # Get this feature's families.
1595              # Clean the line ending.                  my @families = $fig->families_for_protein($fid);
1596              chomp $line;                  # Loop through the families, connecting them to the feature.
1597              # Add the group to the table. Note that there can only be one group                  for my $family (@families) {
1598              # per genome.                      $loadIsFamilyForFeature->Put($family, $fid);
1599              $loadGenomeGroups->Put($genomeID, $line);                      # If this is a new family, create a record for it.
1600                        if (! exists $familyHash{$family}) {
1601                            $familyHash{$family} = 1;
1602                            $loadFamily->Add("families", 1);
1603                            my $size = $fig->sz_family($family);
1604                            my $func = $fig->family_function($family);
1605                            $loadFamily->Put($family, $size, $func);
1606                        }
1607                    }
1608                }
1609          }          }
         close TMP;  
1610      }      }
1611      # Finish the load.      # Finish the load.
1612      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
1613      return $retVal;      return $retVal;
1614  }  }
1615    
1616    =head3 LoadDrugData
1617    
1618    C<< my $stats = $spl->LoadDrugData(); >>
1619    
1620    Load the drug target data into Sprout.
1621    
1622    The following relations are loaded by this method.
1623    
1624        DrugProject
1625        ContainsTopic
1626        DrugTopic
1627        ContainsAnalysisOf
1628        PDB
1629        IncludesBound
1630        IsBoundIn
1631        BindsWith
1632        Ligand
1633        DescribesProteinForFeature
1634        FeatureConservation
1635    
1636    The source information for these relations is taken from flat files in the
1637    C<$FIG_Config::drug_directory>. The file C<master_tables.list> contains
1638    a list of drug project names paired with file names. The named file (in the
1639    same directory) contains all the data for the project.
1640    
1641    =over 4
1642    
1643    =item RETURNS
1644    
1645    Returns a statistics object for the loads.
1646    
1647    =back
1648    
1649    =cut
1650    #: Return Type $%;
1651    sub LoadDrugData {
1652        # Get this object instance.
1653        my ($self) = @_;
1654        # Get the FIG object.
1655        my $fig = $self->{fig};
1656        # Get the genome hash.
1657        my $genomeHash = $self->{genomes};
1658        # Create load objects for the tables we're loading.
1659        my $loadDrugProject = $self->_TableLoader('DrugProject');
1660        my $loadContainsTopic = $self->_TableLoader('ContainsTopic');
1661        my $loadDrugTopic = $self->_TableLoader('DrugTopic');
1662        my $loadContainsAnalysisOf = $self->_TableLoader('ContainsAnalysisOf');
1663        my $loadPDB = $self->_TableLoader('PDB');
1664        my $loadIncludesBound = $self->_TableLoader('IncludesBound');
1665        my $loadIsBoundIn = $self->_TableLoader('IsBoundIn');
1666        my $loadBindsWith = $self->_TableLoader('BindsWith');
1667        my $loadLigand = $self->_TableLoader('Ligand');
1668        my $loadDescribesProteinForFeature = $self->_TableLoader('DescribesProteinForFeature');
1669        my $loadFeatureConservation = $self->_TableLoader('FeatureConservation');
1670        if ($self->{options}->{loadOnly}) {
1671            Trace("Loading from existing files.") if T(2);
1672        } else {
1673            Trace("Generating drug target data.") if T(2);
1674            # Load the project list. The file comes in as a list of chomped lines,
1675            # and we split them on the TAB character to make the project name the
1676            # key and the file name the value of the resulting hash.
1677            my %projects = map { split /\t/, $_ } Tracer::GetFile("$FIG_Config::drug_directory/master_tables.list");
1678            # Create hashes for the derived objects: PDBs, Features, and Ligands. These objects
1679            # may occur multiple times in a single project file or even in multiple project
1680            # files.
1681            my %ligands = ();
1682            my %pdbs = ();
1683            my %features = ();
1684            my %bindings = ();
1685            # Set up a counter for drug topics. This will be used as the key.
1686            my $topicCounter = 0;
1687            # Loop through the projects. We sort the keys not because we need them sorted, but
1688            # because it makes it easier to infer our progress from trace messages.
1689            for my $project (sort keys %projects) {
1690                Trace("Processing project $project.") if T(3);
1691                # Only proceed if the download file exists.
1692                my $projectFile = "$FIG_Config::drug_directory/$projects{$project}";
1693                if (! -f $projectFile) {
1694                    Trace("Project file $projectFile not found.") if T(0);
1695                } else {
1696                    # Create the project record.
1697                    $loadDrugProject->Put($project);
1698                    # Create a hash for the topics. Each project has one or more topics. The
1699                    # topic is identified by a URL, a category, and an identifier.
1700                    my %topics = ();
1701                    # Now we can open the project file.
1702                    Trace("Reading project file $projectFile.") if T(3);
1703                    Open(\*PROJECT, "<$projectFile");
1704                    # Get the first record, which is a list of column headers. We don't use this
1705                    # for anything, but it may be useful for debugging.
1706                    my $headerLine = <PROJECT>;
1707                    # Loop through the rest of the records.
1708                    while (! eof PROJECT) {
1709                        # Get the current line of data. Note that not all lines will have all
1710                        # the fields. In particular, the CLIBE data is fairly rare.
1711                        my ($authorOrganism, $category, $tag, $refURL, $peg, $conservation,
1712                            $pdbBound, $pdbBoundEval, $pdbFree, $pdbFreeEval, $pdbFreeTitle,
1713                            $protDistInfo, $passAspInfo, $passAspFile, $passWeightInfo,
1714                            $passWeightFile, $clibeInfo, $clibeURL, $clibeTotalEnergy,
1715                            $clibeVanderwaals, $clibeHBonds, $clibeEI, $clibeSolvationE)
1716                           = Tracer::GetLine(\*PROJECT);
1717                        # The tag contains an identifier for the current line of data followed
1718                        # by a text statement that generally matches a property name in the
1719                        # main database. We split it up, since the identifier goes with
1720                        # the PDB data and the text statement is part of the topic.
1721                        my ($lineID, $topicTag) = split /\s*,\s*/, $tag;
1722                        $loadDrugProject->Add("data line");
1723                        # Check for a new topic.
1724                        my $topicData = "$category\t$topicTag\t$refURL";
1725                        if (! exists $topics{$topicData}) {
1726                            # Here we have a new topic. Compute its ID.
1727                            $topicCounter++;
1728                            $topics{$topicData} = $topicCounter;
1729                            # Create its database record.
1730                            $loadDrugTopic->Put($topicCounter, $refURL, $category, $authorOrganism,
1731                                                $topicTag);
1732                            # Connect it to the project.
1733                            $loadContainsTopic->Put($project, $topicCounter);
1734                            $loadDrugTopic->Add("topic");
1735                        }
1736                        # Now we know the topic ID exists in the hash and the topic will
1737                        # appear in the database, so we get this topic's ID.
1738                        my $topicID = $topics{$topicData};
1739                        # If the feature in this line is new, we need to save its conservation
1740                        # number.
1741                        if (! exists $features{$peg}) {
1742                            $loadFeatureConservation->Put($peg, $conservation);
1743                            $features{$peg} = 1;
1744                        }
1745                        # Now we have two PDBs to deal with-- a bound PDB and a free PDB.
1746                        # The free PDB will have data about docking points; the bound PDB
1747                        # will have data about docking. We store both types as PDBs, and
1748                        # the special data comes from relationships. First we process the
1749                        # bound PDB.
1750                        if ($pdbBound) {
1751                            $loadPDB->Add("bound line");
1752                            # Insure this PDB is in the database.
1753                            $self->CreatePDB($pdbBound, lc "$pdbFreeTitle (bound)", "bound", \%pdbs, $loadPDB);
1754                            # Connect it to this topic.
1755                            $loadIncludesBound->Put($topicID, $pdbBound);
1756                            # Check for CLIBE data.
1757                            if ($clibeInfo) {
1758                                $loadLigand->Add("clibes");
1759                                # We have CLIBE data, so we create a ligand and relate it to the PDB.
1760                                if (! exists $ligands{$clibeInfo}) {
1761                                    # This is a new ligand, so create its record.
1762                                    $loadLigand->Put($clibeInfo);
1763                                    $loadLigand->Add("ligand");
1764                                    # Make sure we know this ligand already exists.
1765                                    $ligands{$clibeInfo} = 1;
1766                                }
1767                                # Now connect the PDB to the ligand using the CLIBE data.
1768                                $loadBindsWith->Put($pdbBound, $clibeInfo, $clibeURL, $clibeHBonds, $clibeEI,
1769                                                    $clibeSolvationE, $clibeVanderwaals);
1770                            }
1771                            # Connect this PDB to the feature.
1772                            $loadDescribesProteinForFeature->Put($pdbBound, $peg, $protDistInfo, $pdbBoundEval);
1773                        }
1774                        # Next is the free PDB.
1775                        if ($pdbFree) {
1776                            $loadPDB->Add("free line");
1777                            # Insure this PDB is in the database.
1778                            $self->CreatePDB($pdbFree, lc $pdbFreeTitle, "free", \%pdbs, $loadPDB);
1779                            # Connect it to this topic.
1780                            $loadContainsAnalysisOf->Put($topicID, $pdbFree, $passAspInfo,
1781                                                         $passWeightFile, $passWeightInfo, $passAspFile);
1782                            # Connect this PDB to the feature.
1783                            $loadDescribesProteinForFeature->Put($pdbFree, $peg, $protDistInfo, $pdbFreeEval);
1784                        }
1785                        # If we have both PDBs, we may need to link them.
1786                        if ($pdbFree && $pdbBound) {
1787                            $loadIsBoundIn->Add("connection");
1788                            # Insure we only link them once.
1789                            my $bindingKey =  "$pdbFree\t$pdbBound";
1790                            if (! exists $bindings{$bindingKey}) {
1791                                $loadIsBoundIn->Add("newConnection");
1792                                $loadIsBoundIn->Put($pdbFree, $pdbBound);
1793                                $bindings{$bindingKey} = 1;
1794                            }
1795                        }
1796                    }
1797                    # Close off this project.
1798                    close PROJECT;
1799                }
1800            }
1801        }
1802        # Finish the load.
1803        my $retVal = $self->_FinishAll();
1804        return $retVal;
1805    }
1806    
1807    
1808  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1809    
1810    =head3 SpecialAttribute
1811    
1812    C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >>
1813    
1814    Look for special attributes of a given type. A special attribute is found by comparing one of
1815    the columns of the incoming attribute list to a search pattern. If a match is found, then
1816    a set of columns is put into an output table connected to the specified ID.
1817    
1818    For example, when processing features, the attribute list we look at has three columns: attribute
1819    name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
1820    begins with C<iedb_>. The call signature is therefore
1821    
1822        my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB);
1823    
1824    The pattern is matched against column 0, and if we have a match, then column 2's value is put
1825    to the output along with the specified feature ID.
1826    
1827    =over 4
1828    
1829    =item id
1830    
1831    ID of the object whose special attributes are being loaded. This forms the first column of the
1832    output.
1833    
1834    =item attributes
1835    
1836    Reference to a list of tuples.
1837    
1838    =item idxMatch
1839    
1840    Index in each tuple of the column to be matched against the pattern. If the match is
1841    successful, an output record will be generated.
1842    
1843    =item idxValues
1844    
1845    Reference to a list containing the indexes in each tuple of the columns to be put as
1846    the second column of the output.
1847    
1848    =item pattern
1849    
1850    Pattern to be matched against the specified column. The match will be case-insensitive.
1851    
1852    =item loader
1853    
1854    An object to which each output record will be put. Usually this is an B<ERDBLoad> object,
1855    but technically it could be anything with a C<Put> method.
1856    
1857    =item RETURN
1858    
1859    Returns a count of the matches found.
1860    
1861    =item
1862    
1863    =back
1864    
1865    =cut
1866    
1867    sub SpecialAttribute {
1868        # Get the parameters.
1869        my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_;
1870        # Declare the return variable.
1871        my $retVal = 0;
1872        # Loop through the attribute rows.
1873        for my $row (@{$attributes}) {
1874            # Check for a match.
1875            if ($row->[$idxMatch] =~ m/$pattern/i) {
1876                # We have a match, so output a row. This is a bit tricky, since we may
1877                # be putting out multiple columns of data from the input.
1878                my $value = join(" ", map { $row->[$_] } @{$idxValues});
1879                $loader->Put($id, $value);
1880                $retVal++;
1881            }
1882        }
1883        Trace("$retVal special attributes found for $id and loader " . $loader->RelName() . ".") if T(4) && $retVal;
1884        # Return the number of matches.
1885        return $retVal;
1886    }
1887    
1888    =head3 CreatePDB
1889    
1890    C<< $loader->CreatePDB($pdbID, $title, $type, \%pdbHash); >>
1891    
1892    Insure that a PDB record exists for the identified PDB. If one does not exist, it will be
1893    created.
1894    
1895    =over 4
1896    
1897    =item pdbID
1898    
1899    ID string (usually an unqualified file name) for the desired PDB.
1900    
1901    =item title
1902    
1903    Title to use if the PDB must be created.
1904    
1905    =item type
1906    
1907    Type of PDB: C<free> or C<bound>
1908    
1909    =item pdbHash
1910    
1911    Hash containing the IDs of PDBs that have already been created.
1912    
1913    =item pdbLoader
1914    
1915    Load object for the PDB table.
1916    
1917    =back
1918    
1919    =cut
1920    
1921    sub CreatePDB {
1922        # Get the parameters.
1923        my ($self, $pdbID, $title, $type, $pdbHash, $pdbLoader) = @_;
1924        $pdbLoader->Add("PDB check");
1925        # Check to see if this is a new PDB.
1926        if (! exists $pdbHash->{$pdbID}) {
1927            # It is, so we create it.
1928            $pdbLoader->Put($pdbID, $title, $type);
1929            $pdbHash->{$pdbID} = 1;
1930            # Count it.
1931            $pdbLoader->Add("PDB-$type");
1932        }
1933    }
1934    
1935  =head3 TableLoader  =head3 TableLoader
1936    
1937  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 1172  Line 1946 
1946    
1947  Name of the table (relation) being loaded.  Name of the table (relation) being loaded.
1948    
1949  =item rowCount (optional)  =item ignore
1950    
1951  Estimated maximum number of rows in the table.  TRUE if the table should be ignored entirely, else FALSE.
1952    
1953  =item RETURN  =item RETURN
1954    
# Line 1186  Line 1960 
1960    
1961  sub _TableLoader {  sub _TableLoader {
1962      # Get the parameters.      # Get the parameters.
1963      my ($self, $tableName, $rowCount) = @_;      my ($self, $tableName, $ignore) = @_;
1964      # Create the load object.      # Create the load object.
1965      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount);      my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly,
1966                                   $ignore);
1967      # Cache it in the loader list.      # Cache it in the loader list.
1968      push @{$self->{loaders}}, $retVal;      push @{$self->{loaders}}, $retVal;
1969      # Return it to the caller.      # Return it to the caller.
# Line 1222  Line 1997 
1997      my $retVal = Stats->new();      my $retVal = Stats->new();
1998      # Get the loader list.      # Get the loader list.
1999      my $loadList = $self->{loaders};      my $loadList = $self->{loaders};
2000        # Create a hash to hold the statistics objects, keyed on relation name.
2001        my %loaderHash = ();
2002      # Loop through the list, finishing the loads. Note that if the finish fails, we die      # Loop through the list, finishing the loads. Note that if the finish fails, we die
2003      # ignominiously. At some future point, we want to make the loads restartable.      # ignominiously. At some future point, we want to make the loads more restartable.
2004      while (my $loader = pop @{$loadList}) {      while (my $loader = pop @{$loadList}) {
2005            # Get the relation name.
2006            my $relName = $loader->RelName;
2007            # Check the ignore flag.
2008            if ($loader->Ignore) {
2009                Trace("Relation $relName not loaded.") if T(2);
2010            } else {
2011                # Here we really need to finish.
2012                Trace("Finishing $relName.") if T(2);
2013          my $stats = $loader->Finish();          my $stats = $loader->Finish();
2014                $loaderHash{$relName} = $stats;
2015            }
2016        }
2017        # Now we loop through again, actually loading the tables. We want to finish before
2018        # loading so that if something goes wrong at this point, all the load files are usable
2019        # and we don't have to redo all that work.
2020        for my $relName (sort keys %loaderHash) {
2021            # Get the statistics for this relation.
2022            my $stats = $loaderHash{$relName};
2023            # Check for a database load.
2024            if ($self->{options}->{dbLoad}) {
2025                # Here we want to use the load file just created to load the database.
2026                Trace("Loading relation $relName.") if T(2);
2027                my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]);
2028                # Accumulate the statistics from the DB load.
2029                $stats->Accumulate($newStats);
2030            }
2031          $retVal->Accumulate($stats);          $retVal->Accumulate($stats);
         my $relName = $loader->RelName;  
2032          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
2033      }      }
2034      # Return the load statistics.      # Return the load statistics.
2035      return $retVal;      return $retVal;
2036  }  }
2037    =head3 GetGenomeAttributes
2038    
2039    C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids); >>
2040    
2041    Return a hash of attributes keyed on feature ID. This method gets all the attributes
2042    for all the features of a genome in a single call, then organizes them into a hash.
2043    
2044    =over 4
2045    
2046    =item fig
2047    
2048    FIG-like object for accessing attributes.
2049    
2050    =item genomeID
2051    
2052    ID of the genome who's attributes are desired.
2053    
2054    =item fids
2055    
2056    Reference to a list of the feature IDs whose attributes are to be kept.
2057    
2058    =item RETURN
2059    
2060    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
2061    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
2062    the attribute key, and one or more attribute values.
2063    
2064    =back
2065    
2066    =cut
2067    
2068    sub GetGenomeAttributes {
2069        # Get the parameters.
2070        my ($fig, $genomeID, $fids) = @_;
2071        # Declare the return variable.
2072        my $retVal = {};
2073        # Get the attributes.
2074        my @aList = $fig->get_attributes("fig|$genomeID%");
2075        # Initialize the hash. This not only enables us to easily determine which FIDs to
2076        # keep, it insures that the caller sees a list reference for every known fid,
2077        # simplifying the logic.
2078        for my $fid (@{$fids}) {
2079            $retVal->{$fid} = [];
2080        }
2081        # Populate the hash.
2082        for my $aListEntry (@aList) {
2083            my $fid = $aListEntry->[0];
2084            if (exists $retVal->{$fid}) {
2085                push @{$retVal->{$fid}}, $aListEntry;
2086            }
2087        }
2088        # Return the result.
2089        return $retVal;
2090    }
2091    
2092  1;  1;

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.82

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3