[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.48, Fri Jul 7 00:24:16 2006 UTC revision 1.82, Tue Apr 10 06:15:35 2007 UTC
# Line 80  Line 80 
80  Either the name of the file containing the list of trusted subsystems or a reference  Either the name of the file containing the list of trusted subsystems or a reference
81  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be  to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
82  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>  considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
83  in its data directory.) Only subsystem data related to the trusted subsystems is loaded.  in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded.
84    
85  =item options  =item options
86    
# Line 120  Line 120 
120                      # an omitted access code can be defaulted to 1.                      # an omitted access code can be defaulted to 1.
121                      for my $genomeLine (@genomeList) {                      for my $genomeLine (@genomeList) {
122                          my ($genomeID, $accessCode) = split("\t", $genomeLine);                          my ($genomeID, $accessCode) = split("\t", $genomeLine);
123                          if (undef $accessCode) {                          if (! defined($accessCode)) {
124                              $accessCode = 1;                              $accessCode = 1;
125                          }                          }
126                          $genomes{$genomeID} = $accessCode;                          $genomes{$genomeID} = $accessCode;
# Line 136  Line 136 
136      # We only need it if load-only is NOT specified.      # We only need it if load-only is NOT specified.
137      if (! $options->{loadOnly}) {      if (! $options->{loadOnly}) {
138          if (! defined $subsysFile || $subsysFile eq '') {          if (! defined $subsysFile || $subsysFile eq '') {
139              # Here we want all the NMPDR subsystems. First we get the whole list.              # Here we want all the usable subsystems. First we get the whole list.
140              my @subs = $fig->all_subsystems();              my @subs = $fig->all_subsystems();
141              # Loop through, checking for the NMPDR file.              # Loop through, checking for the NMPDR file.
142              for my $sub (@subs) {              for my $sub (@subs) {
143                  if (-e "$FIG_Config::data/Subsystems/$sub/NMPDR") {                  if ($fig->nmpdr_subsystem($sub)) {
144                      $subsystems{$sub} = 1;                      $subsystems{$sub} = 1;
145                  }                  }
146              }              }
# Line 163  Line 163 
163                  Confess("Invalid subsystem parameter in SproutLoad constructor.");                  Confess("Invalid subsystem parameter in SproutLoad constructor.");
164              }              }
165          }          }
166            # Go through the subsys hash again, creating the keyword list for each subsystem.
167            for my $subsystem (keys %subsystems) {
168                my $name = $subsystem;
169                $name =~ s/_/ /g;
170                my $classes = $fig->subsystem_classification($subsystem);
171                $name .= " " . join(" ", @{$classes});
172                $subsystems{$subsystem} = $name;
173            }
174      }      }
175      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
176      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
# Line 266  Line 274 
274              my $extra = join " ", @extraData;              my $extra = join " ", @extraData;
275              # Get the full taxonomy.              # Get the full taxonomy.
276              my $taxonomy = $fig->taxonomy_of($genomeID);              my $taxonomy = $fig->taxonomy_of($genomeID);
277                # Get the version. If no version is specified, we default to the genome ID by itself.
278                my $version = $fig->genome_version($genomeID);
279                if (! defined($version)) {
280                    $version = $genomeID;
281                }
282                # Get the DNA size.
283                my $dnaSize = $fig->genome_szdna($genomeID);
284                # Open the NMPDR group file for this genome.
285                my $group;
286                if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
287                    defined($group = <TMP>)) {
288                    # Clean the line ending.
289                    chomp $group;
290                } else {
291                    # No group, so use the default.
292                    $group = $FIG_Config::otherGroup;
293                }
294                close TMP;
295              # Output the genome record.              # Output the genome record.
296              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),
297                               $species, $extra, $taxonomy);                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);
298              # Now we loop through each of the genome's contigs.              # Now we loop through each of the genome's contigs.
299              my @contigs = $fig->all_contigs($genomeID);              my @contigs = $fig->all_contigs($genomeID);
300              for my $contigID (@contigs) {              for my $contigID (@contigs) {
# Line 340  Line 366 
366      my $fig = $self->{fig};      my $fig = $self->{fig};
367      # Get the genome hash.      # Get the genome hash.
368      my $genomeFilter = $self->{genomes};      my $genomeFilter = $self->{genomes};
369      my $genomeCount = (keys %{$genomeFilter});      # Set up an ID counter for the PCHs.
370      my $featureCount = $genomeCount * 4000;      my $pchID = 0;
371      # Start the loads.      # Start the loads.
372      my $loadCoupling = $self->_TableLoader('Coupling');      my $loadCoupling = $self->_TableLoader('Coupling');
373      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);      my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);
# Line 411  Line 437 
437                              }                              }
438                          }                          }
439                          for my $evidenceID (keys %evidenceMap) {                          for my $evidenceID (keys %evidenceMap) {
440                                # Get the ID for this evidence.
441                                $pchID++;
442                              # Create the evidence record.                              # Create the evidence record.
443                              my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};                              my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};
444                              $loadPCH->Put($evidenceID, $usage);                              $loadPCH->Put($pchID, $usage);
445                              # Connect it to the coupling.                              # Connect it to the coupling.
446                              $loadIsEvidencedBy->Put($coupleID, $evidenceID);                              $loadIsEvidencedBy->Put($coupleID, $pchID);
447                              # Connect it to the features.                              # Connect it to the features.
448                              $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);                              $loadUsesAsEvidence->Put($pchID, $peg3, 1);
449                              $loadUsesAsEvidence->Put($evidenceID, $peg4, 2);                              $loadUsesAsEvidence->Put($pchID, $peg4, 2);
450                          }                          }
451                      }                      }
452                  }                  }
# Line 447  Line 475 
475      FeatureUpstream      FeatureUpstream
476      IsLocatedIn      IsLocatedIn
477      HasFeature      HasFeature
478        HasRoleInSubsystem
479        FeatureEssential
480        FeatureVirulent
481        FeatureIEDB
482    
483  =over 4  =over 4
484    
# Line 461  Line 493 
493  sub LoadFeatureData {  sub LoadFeatureData {
494      # Get this object instance.      # Get this object instance.
495      my ($self) = @_;      my ($self) = @_;
496      # Get the FIG object.      # Get the FIG and Sprout objects.
497      my $fig = $self->{fig};      my $fig = $self->{fig};
498        my $sprout = $self->{sprout};
499      # Get the table of genome IDs.      # Get the table of genome IDs.
500      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
501      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
# Line 472  Line 505 
505      my $loadFeatureLink = $self->_TableLoader('FeatureLink');      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
506      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
507      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
508      my $loadHasFeature = $self->_TableLoader('HasFeature');      my $loadHasFeature = $self->_TableLoader('HasFeature', $self->PrimaryOnly);
509        my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem', $self->PrimaryOnly);
510        my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
511        my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
512        my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
513        # Get the subsystem hash.
514        my $subHash = $self->{subsystems};
515      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
516      # locations.      # locations.
517      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
# Line 485  Line 524 
524              Trace("Loading features for genome $genomeID.") if T(3);              Trace("Loading features for genome $genomeID.") if T(3);
525              $loadFeature->Add("genomeIn");              $loadFeature->Add("genomeIn");
526              # Get the feature list for this genome.              # Get the feature list for this genome.
527              my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
528                # Sort and count the list.
529                my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
530                my $count = scalar @featureTuples;
531                my @fids = map { $_->[0] } @featureTuples;
532                Trace("$count features found for genome $genomeID.") if T(3);
533                # Get the attributes for this genome and put them in a hash by feature ID.
534                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids);
535                # Set up for our duplicate-feature check.
536                my $oldFeatureID = "";
537              # Loop through the features.              # Loop through the features.
538              for my $featureData (@{$features}) {              for my $featureTuple (@featureTuples) {
                 $loadFeature->Add("featureIn");  
539                  # Split the tuple.                  # Split the tuple.
540                  my ($featureID, $locations, undef, $type) = @{$featureData};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
541                  # Create the feature record.                  # Check for duplicates.
542                  $loadFeature->Put($featureID, 1, $type);                  if ($featureID eq $oldFeatureID) {
543                  # Link it to the parent genome.                      Trace("Duplicate feature $featureID found.") if T(1);
544                  $loadHasFeature->Put($genomeID, $featureID, $type);                  } else {
545                        $oldFeatureID = $featureID;
546                        # Count this feature.
547                        $loadFeature->Add("featureIn");
548                        # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
549                        # Sprout database requires a single character.
550                        if (! defined($quality) || $quality eq "") {
551                            $quality = " ";
552                        }
553                        # Begin building the keywords. We start with the genome ID, the
554                        # feature ID, the taxonomy, and the organism name.
555                        my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
556                                        $fig->taxonomy_of($genomeID));
557                  # Create the aliases.                  # Create the aliases.
558                  for my $alias ($fig->feature_aliases($featureID)) {                  for my $alias ($fig->feature_aliases($featureID)) {
559                      $loadFeatureAlias->Put($featureID, $alias);                      $loadFeatureAlias->Put($featureID, $alias);
560                            push @keywords, $alias;
561                  }                  }
562                        Trace("Assignment for $featureID is: $assignment") if T(4);
563                        # Break the assignment into words and shove it onto the
564                        # keyword list.
565                        push @keywords, split(/\s+/, $assignment);
566                        # Link this feature to the parent genome.
567                        $loadHasFeature->Put($genomeID, $featureID, $type);
568                  # Get the links.                  # Get the links.
569                  my @links = $fig->fid_links($featureID);                  my @links = $fig->fid_links($featureID);
570                  for my $link (@links) {                  for my $link (@links) {
# Line 517  Line 583 
583                          $loadFeatureUpstream->Put($featureID, $upstream);                          $loadFeatureUpstream->Put($featureID, $upstream);
584                      }                      }
585                  }                  }
586                        # Now we need to find the subsystems this feature participates in.
587                        # We also add the subsystems to the keyword list. Before we do that,
588                        # we must convert underscores to spaces and tack on the classifications.
589                        my @subsystems = $fig->peg_to_subsystems($featureID);
590                        for my $subsystem (@subsystems) {
591                            # Only proceed if we like this subsystem.
592                            if (exists $subHash->{$subsystem}) {
593                                # Store the has-role link.
594                                $loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type);
595                                # Save the subsystem's keyword data.
596                                my $subKeywords = $subHash->{$subsystem};
597                                push @keywords, split /\s+/, $subKeywords;
598                                # Now we need to get this feature's role in the subsystem.
599                                my $subObject = $fig->get_subsystem($subsystem);
600                                my @roleColumns = $subObject->get_peg_roles($featureID);
601                                my @allRoles = $subObject->get_roles();
602                                for my $col (@roleColumns) {
603                                    my $role = $allRoles[$col];
604                                    push @keywords, split /\s+/, $role;
605                                    push @keywords, $subObject->get_role_abbr($col);
606                                }
607                            }
608                        }
609                        # There are three special attributes computed from property
610                        # data that we build next. If the special attribute is non-empty,
611                        # its name will be added to the keyword list. First, we get all
612                        # the attributes for this feature. They will come back as
613                        # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
614                        # [name, value, value with URL]. (We don't need the PEG, since
615                        # we already know it.)
616                        my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
617                                             @{$attributes->{$featureID}};
618                        # Now we process each of the special attributes.
619                        if (SpecialAttribute($featureID, \@attributes,
620                                             1, [0,2], '^(essential|potential_essential)$',
621                                             $loadFeatureEssential)) {
622                            push @keywords, 'essential';
623                            $loadFeature->Add('essential');
624                        }
625                        if (SpecialAttribute($featureID, \@attributes,
626                                             0, [2], '^virulen',
627                                             $loadFeatureVirulent)) {
628                            push @keywords, 'virulent';
629                            $loadFeature->Add('virulent');
630                        }
631                        if (SpecialAttribute($featureID, \@attributes,
632                                             0, [0,2], '^iedb_',
633                                             $loadFeatureIEDB)) {
634                            push @keywords, 'iedb';
635                            $loadFeature->Add('iedb');
636                        }
637                        # Now we need to bust up hyphenated words in the keyword
638                        # list. We keep them separate and put them at the end so
639                        # the original word order is available.
640                        my $keywordString = "";
641                        my $bustedString = "";
642                        for my $keyword (@keywords) {
643                            if (length $keyword >= 3) {
644                                $keywordString .= " $keyword";
645                                if ($keyword =~ /-/) {
646                                    my @words = split /-/, $keyword;
647                                    $bustedString .= join(" ", "", @words);
648                                }
649                            }
650                        }
651                        $keywordString .= $bustedString;
652                        # Get rid of annoying punctuation.
653                        $keywordString =~ s/[();]//g;
654                        # Clean the keyword list.
655                        my $cleanWords = $sprout->CleanKeywords($keywordString);
656                        Trace("Keyword string for $featureID: $cleanWords") if T(4);
657                        # Create the feature record.
658                        $loadFeature->Put($featureID, 1, $user, $quality, $type, $assignment, $cleanWords);
659                  # This part is the roughest. We need to relate the features to contig                  # This part is the roughest. We need to relate the features to contig
660                  # locations, and the locations must be split so that none of them exceed                  # locations, and the locations must be split so that none of them exceed
661                  # the maximum segment size. This simplifies the genes_in_region processing                  # the maximum segment size. This simplifies the genes_in_region processing
# Line 546  Line 685 
685              }              }
686          }          }
687      }      }
     # Finish the loads.  
     my $retVal = $self->_FinishAll();  
     return $retVal;  
 }  
   
 =head3 LoadBBHData  
   
 C<< my $stats = $spl->LoadBBHData(); >>  
   
 Load the bidirectional best hit data from FIG into Sprout.  
   
 Sprout does not store information on similarities. Instead, it has only the  
 bi-directional best hits. Even so, the BBH table is one of the largest in  
 the database.  
   
 The following relations are loaded by this method.  
   
     IsBidirectionalBestHitOf  
   
 =over 4  
   
 =item RETURNS  
   
 Returns a statistics object for the loads.  
   
 =back  
   
 =cut  
 #: Return Type $%;  
 sub LoadBBHData {  
     # Get this object instance.  
     my ($self) = @_;  
     # Get the FIG object.  
     my $fig = $self->{fig};  
     # Get the table of genome IDs.  
     my $genomeHash = $self->{genomes};  
     # Create load objects for each of the tables we're loading.  
     my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf');  
     if ($self->{options}->{loadOnly}) {  
         Trace("Loading from existing files.") if T(2);  
     } else {  
         Trace("Generating BBH data.") if T(2);  
         # Now we loop through the genomes, generating the data for each one.  
         for my $genomeID (sort keys %{$genomeHash}) {  
             $loadIsBidirectionalBestHitOf->Add("genomeIn");  
             Trace("Processing features for genome $genomeID.") if T(3);  
             # Get the feature list for this genome.  
             my $features = $fig->all_features_detailed($genomeID);  
             # Loop through the features.  
             for my $featureData (@{$features}) {  
                 # Split the tuple.  
                 my ($featureID, $locations, $aliases, $type) = @{$featureData};  
                 # Get the bi-directional best hits.  
                 my @bbhList = $fig->bbhs($featureID);  
                 for my $bbhEntry (@bbhList) {  
                     # Get the target feature ID and the score.  
                     my ($targetID, $score) = @{$bbhEntry};  
                     # Check the target feature's genome.  
                     my $targetGenomeID = $fig->genome_of($targetID);  
                     # Only proceed if it's one of our genomes.  
                     if ($genomeHash->{$targetGenomeID}) {  
                         $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,  
                                                            $score);  
                     }  
                 }  
             }  
         }  
688      }      }
689      # Finish the loads.      # Finish the loads.
690      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 723  Line 795 
795                  my $curator = $sub->get_curator();                  my $curator = $sub->get_curator();
796                  my $notes = $sub->get_notes();                  my $notes = $sub->get_notes();
797                  $loadSubsystem->Put($subsysID, $curator, $notes);                  $loadSubsystem->Put($subsysID, $curator, $notes);
798                  my $class = $fig->subsystem_classification($subsysID);                  # Now for the classification string. This comes back as a list
799                  if ($class) {                  # reference and we convert it to a space-delimited string.
800                      $loadSubsystemClass->Put($subsysID, $class);                  my $classList = $fig->subsystem_classification($subsysID);
801                  }                  my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
802                    $loadSubsystemClass->Put($subsysID, $classString);
803                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
804                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
805                      # Connect to this role.                      # Connect to this role.
# Line 843  Line 916 
916                      }                      }
917                  }                  }
918              }              }
919            }
920              # Now we loop through the diagrams. We need to create the diagram records              # Now we loop through the diagrams. We need to create the diagram records
921              # and link each diagram to its roles. Note that only roles which occur              # and link each diagram to its roles. Note that only roles which occur
922              # in subsystems (and therefore appear in the %ecToRoles hash) are              # in subsystems (and therefore appear in the %ecToRoles hash) are
# Line 876  Line 950 
950                  }                  }
951              }              }
952          }          }
     }  
953      # Finish the load.      # Finish the load.
954      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
955      return $retVal;      return $retVal;
# Line 929  Line 1002 
1002          my %propertyKeys = ();          my %propertyKeys = ();
1003          my $nextID = 1;          my $nextID = 1;
1004          # Loop through the genomes.          # Loop through the genomes.
1005          for my $genomeID (keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
1006              $loadProperty->Add("genomeIn");              $loadProperty->Add("genomeIn");
1007              Trace("Generating properties for $genomeID.") if T(3);              Trace("Generating properties for $genomeID.") if T(3);
1008              # Get the genome's features. The feature ID is the first field in the              # Get the genome's features. The feature ID is the first field in the
# Line 938  Line 1011 
1011              my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};              my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};
1012              my $featureCount = 0;              my $featureCount = 0;
1013              my $propertyCount = 0;              my $propertyCount = 0;
1014                # Get the properties for this genome's features.
1015                my $attributes = GetGenomeAttributes($fig, $genomeID, \@features);
1016                Trace("Property hash built for $genomeID.") if T(3);
1017              # Loop through the features, creating HasProperty records.              # Loop through the features, creating HasProperty records.
1018              for my $fid (@features) {              for my $fid (@features) {
1019                  # Get all attributes for this feature. We do this one feature at a time                  # Get all attributes for this feature. We do this one feature at a time
1020                  # to insure we do not get any genome attributes.                  # to insure we do not get any genome attributes.
1021                  my @attributeList = $fig->get_attributes($fid, '', '', '');                  my @attributeList = @{$attributes->{$fid}};
1022                  if (scalar @attributeList) {                  if (scalar @attributeList) {
1023                      $featureCount++;                      $featureCount++;
1024                  }                  }
# Line 1211  Line 1287 
1287      } else {      } else {
1288          Trace("Generating external data.") if T(2);          Trace("Generating external data.") if T(2);
1289          # We loop through the files one at a time. First, the organism file.          # We loop through the files one at a time. First, the organism file.
1290          Open(\*ORGS, "<$FIG_Config::global/ext_org.table");          Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |");
1291          my $orgLine;          my $orgLine;
1292          while (defined($orgLine = <ORGS>)) {          while (defined($orgLine = <ORGS>)) {
1293              # Clean the input line.              # Clean the input line.
# Line 1223  Line 1299 
1299          close ORGS;          close ORGS;
1300          # Now the function file.          # Now the function file.
1301          my $funcLine;          my $funcLine;
1302          Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");          Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |");
1303          while (defined($funcLine = <FUNCS>)) {          while (defined($funcLine = <FUNCS>)) {
1304              # Clean the line ending.              # Clean the line ending.
1305              chomp $funcLine;              chomp $funcLine;
# Line 1355  Line 1431 
1431    
1432      GenomeGroups      GenomeGroups
1433    
1434  There is no direct support for genome groups in FIG, so we access the SEED  Currently, we do not use groups. We used to use them for NMPDR groups,
1435    butThere is no direct support for genome groups in FIG, so we access the SEED
1436  files directly.  files directly.
1437    
1438  =over 4  =over 4
# Line 1381  Line 1458 
1458          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1459      } else {      } else {
1460          Trace("Generating group data.") if T(2);          Trace("Generating group data.") if T(2);
1461          # Loop through the genomes.          # Currently there are no groups.
         my $line;  
         for my $genomeID (keys %{$genomeHash}) {  
             Trace("Processing $genomeID.") if T(3);  
             # Open the NMPDR group file for this genome.  
             if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&  
                 defined($line = <TMP>)) {  
                 # Clean the line ending.  
                 chomp $line;  
                 # Add the group to the table. Note that there can only be one group  
                 # per genome.  
                 $loadGenomeGroups->Put($genomeID, $line);  
             }  
             close TMP;  
         }  
1462      }      }
1463      # Finish the load.      # Finish the load.
1464      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 1414  Line 1477 
1477      IsSynonymGroupFor      IsSynonymGroupFor
1478    
1479  The source information for these relations is taken from the C<maps_to_id> method  The source information for these relations is taken from the C<maps_to_id> method
1480  of the B<FIG> object. The process starts from the features, so it is possible  of the B<FIG> object. Unfortunately, to make this work, we need to use direct
1481  that there will be duplicates in the SynonymGroup load file, since the relationship  SQL against the FIG database.
 is one-to-many toward the features. The automatic sort on primary entity relations  
 will fix this for us.  
1482    
1483  =over 4  =over 4
1484    
# Line 1443  Line 1504 
1504          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1505      } else {      } else {
1506          Trace("Generating synonym group data.") if T(2);          Trace("Generating synonym group data.") if T(2);
1507            # Get the database handle.
1508            my $dbh = $fig->db_handle();
1509            # Ask for the synonyms.
1510            my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
1511            my $result = $sth->execute();
1512            if (! defined($result)) {
1513                Confess("Database error in Synonym load: " . $sth->errstr());
1514            } else {
1515                # Remember the current synonym.
1516                my $current_syn = "";
1517                # Count the features.
1518                my $featureCount = 0;
1519                # Loop through the synonym/peg pairs.
1520                while (my @row = $sth->fetchrow()) {
1521                    # Get the synonym ID and feature ID.
1522                    my ($syn_id, $peg) = @row;
1523                    # Insure it's for one of our genomes.
1524                    my $genomeID = FIG::genome_of($peg);
1525                    if (exists $genomeHash->{$genomeID}) {
1526                        # Verify the synonym.
1527                        if ($syn_id ne $current_syn) {
1528                            # It's new, so put it in the group table.
1529                            $loadSynonymGroup->Put($syn_id);
1530                            $current_syn = $syn_id;
1531                        }
1532                        # Connect the synonym to the peg.
1533                        $loadIsSynonymGroupFor->Put($syn_id, $peg);
1534                        # Count this feature.
1535                        $featureCount++;
1536                        if ($featureCount % 1000 == 0) {
1537                            Trace("$featureCount features processed.") if T(3);
1538                        }
1539                    }
1540                }
1541            }
1542        }
1543        # Finish the load.
1544        my $retVal = $self->_FinishAll();
1545        return $retVal;
1546    }
1547    
1548    =head3 LoadFamilyData
1549    
1550    C<< my $stats = $spl->LoadFamilyData(); >>
1551    
1552    Load the protein families into Sprout.
1553    
1554    The following relations are loaded by this method.
1555    
1556        Family
1557        IsFamilyForFeature
1558    
1559    The source information for these relations is taken from the C<families_for_protein>,
1560    C<family_function>, and C<sz_family> methods of the B<FIG> object.
1561    
1562    =over 4
1563    
1564    =item RETURNS
1565    
1566    Returns a statistics object for the loads.
1567    
1568    =back
1569    
1570    =cut
1571    #: Return Type $%;
1572    sub LoadFamilyData {
1573        # Get this object instance.
1574        my ($self) = @_;
1575        # Get the FIG object.
1576        my $fig = $self->{fig};
1577        # Get the genome hash.
1578        my $genomeHash = $self->{genomes};
1579        # Create load objects for the tables we're loading.
1580        my $loadFamily = $self->_TableLoader('Family');
1581        my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature');
1582        if ($self->{options}->{loadOnly}) {
1583            Trace("Loading from existing files.") if T(2);
1584        } else {
1585            Trace("Generating family data.") if T(2);
1586            # Create a hash for the family IDs.
1587            my %familyHash = ();
1588          # Loop through the genomes.          # Loop through the genomes.
1589          for my $genomeID (sort keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
1590              Trace("Processing $genomeID.") if T(3);              Trace("Processing features for $genomeID.") if T(2);
1591              # Get all of the features for this genome. The only method that does this is              # Loop through this genome's PEGs.
1592              # all_features_detailed, which returns extra baggage that we discard.              for my $fid ($fig->all_features($genomeID, "peg")) {
1593              my $featureData = $fig->all_features_detailed($genomeID);                  $loadIsFamilyForFeature->Add("features", 1);
1594              my @fids = map { $_->[0] } @{$featureData};                  # Get this feature's families.
1595              Trace(scalar(@fids) . " features found for genome $genomeID.") if T(3);                  my @families = $fig->families_for_protein($fid);
1596              # Loop through the feature IDs.                  # Loop through the families, connecting them to the feature.
1597              for my $fid (@fids) {                  for my $family (@families) {
1598                  # Get the group for this feature.                      $loadIsFamilyForFeature->Put($family, $fid);
1599                  my $synonym = $fig->maps_to_id($fid);                      # If this is a new family, create a record for it.
1600                  # Only proceed if the synonym is a real group.                      if (! exists $familyHash{$family}) {
1601                  if ($synonym ne $fid) {                          $familyHash{$family} = 1;
1602                      $loadSynonymGroup->Put($synonym);                          $loadFamily->Add("families", 1);
1603                      $loadIsSynonymGroupFor->Put($synonym, $fid);                          my $size = $fig->sz_family($family);
1604                            my $func = $fig->family_function($family);
1605                            $loadFamily->Put($family, $size, $func);
1606                        }
1607                  }                  }
1608              }              }
1609          }          }
# Line 1468  Line 1613 
1613      return $retVal;      return $retVal;
1614  }  }
1615    
1616    =head3 LoadDrugData
1617    
1618    C<< my $stats = $spl->LoadDrugData(); >>
1619    
1620    Load the drug target data into Sprout.
1621    
1622    The following relations are loaded by this method.
1623    
1624        DrugProject
1625        ContainsTopic
1626        DrugTopic
1627        ContainsAnalysisOf
1628        PDB
1629        IncludesBound
1630        IsBoundIn
1631        BindsWith
1632        Ligand
1633        DescribesProteinForFeature
1634        FeatureConservation
1635    
1636    The source information for these relations is taken from flat files in the
1637    C<$FIG_Config::drug_directory>. The file C<master_tables.list> contains
1638    a list of drug project names paired with file names. The named file (in the
1639    same directory) contains all the data for the project.
1640    
1641    =over 4
1642    
1643    =item RETURNS
1644    
1645    Returns a statistics object for the loads.
1646    
1647    =back
1648    
1649    =cut
1650    #: Return Type $%;
1651    sub LoadDrugData {
1652        # Get this object instance.
1653        my ($self) = @_;
1654        # Get the FIG object.
1655        my $fig = $self->{fig};
1656        # Get the genome hash.
1657        my $genomeHash = $self->{genomes};
1658        # Create load objects for the tables we're loading.
1659        my $loadDrugProject = $self->_TableLoader('DrugProject');
1660        my $loadContainsTopic = $self->_TableLoader('ContainsTopic');
1661        my $loadDrugTopic = $self->_TableLoader('DrugTopic');
1662        my $loadContainsAnalysisOf = $self->_TableLoader('ContainsAnalysisOf');
1663        my $loadPDB = $self->_TableLoader('PDB');
1664        my $loadIncludesBound = $self->_TableLoader('IncludesBound');
1665        my $loadIsBoundIn = $self->_TableLoader('IsBoundIn');
1666        my $loadBindsWith = $self->_TableLoader('BindsWith');
1667        my $loadLigand = $self->_TableLoader('Ligand');
1668        my $loadDescribesProteinForFeature = $self->_TableLoader('DescribesProteinForFeature');
1669        my $loadFeatureConservation = $self->_TableLoader('FeatureConservation');
1670        if ($self->{options}->{loadOnly}) {
1671            Trace("Loading from existing files.") if T(2);
1672        } else {
1673            Trace("Generating drug target data.") if T(2);
1674            # Load the project list. The file comes in as a list of chomped lines,
1675            # and we split them on the TAB character to make the project name the
1676            # key and the file name the value of the resulting hash.
1677            my %projects = map { split /\t/, $_ } Tracer::GetFile("$FIG_Config::drug_directory/master_tables.list");
1678            # Create hashes for the derived objects: PDBs, Features, and Ligands. These objects
1679            # may occur multiple times in a single project file or even in multiple project
1680            # files.
1681            my %ligands = ();
1682            my %pdbs = ();
1683            my %features = ();
1684            my %bindings = ();
1685            # Set up a counter for drug topics. This will be used as the key.
1686            my $topicCounter = 0;
1687            # Loop through the projects. We sort the keys not because we need them sorted, but
1688            # because it makes it easier to infer our progress from trace messages.
1689            for my $project (sort keys %projects) {
1690                Trace("Processing project $project.") if T(3);
1691                # Only proceed if the download file exists.
1692                my $projectFile = "$FIG_Config::drug_directory/$projects{$project}";
1693                if (! -f $projectFile) {
1694                    Trace("Project file $projectFile not found.") if T(0);
1695                } else {
1696                    # Create the project record.
1697                    $loadDrugProject->Put($project);
1698                    # Create a hash for the topics. Each project has one or more topics. The
1699                    # topic is identified by a URL, a category, and an identifier.
1700                    my %topics = ();
1701                    # Now we can open the project file.
1702                    Trace("Reading project file $projectFile.") if T(3);
1703                    Open(\*PROJECT, "<$projectFile");
1704                    # Get the first record, which is a list of column headers. We don't use this
1705                    # for anything, but it may be useful for debugging.
1706                    my $headerLine = <PROJECT>;
1707                    # Loop through the rest of the records.
1708                    while (! eof PROJECT) {
1709                        # Get the current line of data. Note that not all lines will have all
1710                        # the fields. In particular, the CLIBE data is fairly rare.
1711                        my ($authorOrganism, $category, $tag, $refURL, $peg, $conservation,
1712                            $pdbBound, $pdbBoundEval, $pdbFree, $pdbFreeEval, $pdbFreeTitle,
1713                            $protDistInfo, $passAspInfo, $passAspFile, $passWeightInfo,
1714                            $passWeightFile, $clibeInfo, $clibeURL, $clibeTotalEnergy,
1715                            $clibeVanderwaals, $clibeHBonds, $clibeEI, $clibeSolvationE)
1716                           = Tracer::GetLine(\*PROJECT);
1717                        # The tag contains an identifier for the current line of data followed
1718                        # by a text statement that generally matches a property name in the
1719                        # main database. We split it up, since the identifier goes with
1720                        # the PDB data and the text statement is part of the topic.
1721                        my ($lineID, $topicTag) = split /\s*,\s*/, $tag;
1722                        $loadDrugProject->Add("data line");
1723                        # Check for a new topic.
1724                        my $topicData = "$category\t$topicTag\t$refURL";
1725                        if (! exists $topics{$topicData}) {
1726                            # Here we have a new topic. Compute its ID.
1727                            $topicCounter++;
1728                            $topics{$topicData} = $topicCounter;
1729                            # Create its database record.
1730                            $loadDrugTopic->Put($topicCounter, $refURL, $category, $authorOrganism,
1731                                                $topicTag);
1732                            # Connect it to the project.
1733                            $loadContainsTopic->Put($project, $topicCounter);
1734                            $loadDrugTopic->Add("topic");
1735                        }
1736                        # Now we know the topic ID exists in the hash and the topic will
1737                        # appear in the database, so we get this topic's ID.
1738                        my $topicID = $topics{$topicData};
1739                        # If the feature in this line is new, we need to save its conservation
1740                        # number.
1741                        if (! exists $features{$peg}) {
1742                            $loadFeatureConservation->Put($peg, $conservation);
1743                            $features{$peg} = 1;
1744                        }
1745                        # Now we have two PDBs to deal with-- a bound PDB and a free PDB.
1746                        # The free PDB will have data about docking points; the bound PDB
1747                        # will have data about docking. We store both types as PDBs, and
1748                        # the special data comes from relationships. First we process the
1749                        # bound PDB.
1750                        if ($pdbBound) {
1751                            $loadPDB->Add("bound line");
1752                            # Insure this PDB is in the database.
1753                            $self->CreatePDB($pdbBound, lc "$pdbFreeTitle (bound)", "bound", \%pdbs, $loadPDB);
1754                            # Connect it to this topic.
1755                            $loadIncludesBound->Put($topicID, $pdbBound);
1756                            # Check for CLIBE data.
1757                            if ($clibeInfo) {
1758                                $loadLigand->Add("clibes");
1759                                # We have CLIBE data, so we create a ligand and relate it to the PDB.
1760                                if (! exists $ligands{$clibeInfo}) {
1761                                    # This is a new ligand, so create its record.
1762                                    $loadLigand->Put($clibeInfo);
1763                                    $loadLigand->Add("ligand");
1764                                    # Make sure we know this ligand already exists.
1765                                    $ligands{$clibeInfo} = 1;
1766                                }
1767                                # Now connect the PDB to the ligand using the CLIBE data.
1768                                $loadBindsWith->Put($pdbBound, $clibeInfo, $clibeURL, $clibeHBonds, $clibeEI,
1769                                                    $clibeSolvationE, $clibeVanderwaals);
1770                            }
1771                            # Connect this PDB to the feature.
1772                            $loadDescribesProteinForFeature->Put($pdbBound, $peg, $protDistInfo, $pdbBoundEval);
1773                        }
1774                        # Next is the free PDB.
1775                        if ($pdbFree) {
1776                            $loadPDB->Add("free line");
1777                            # Insure this PDB is in the database.
1778                            $self->CreatePDB($pdbFree, lc $pdbFreeTitle, "free", \%pdbs, $loadPDB);
1779                            # Connect it to this topic.
1780                            $loadContainsAnalysisOf->Put($topicID, $pdbFree, $passAspInfo,
1781                                                         $passWeightFile, $passWeightInfo, $passAspFile);
1782                            # Connect this PDB to the feature.
1783                            $loadDescribesProteinForFeature->Put($pdbFree, $peg, $protDistInfo, $pdbFreeEval);
1784                        }
1785                        # If we have both PDBs, we may need to link them.
1786                        if ($pdbFree && $pdbBound) {
1787                            $loadIsBoundIn->Add("connection");
1788                            # Insure we only link them once.
1789                            my $bindingKey =  "$pdbFree\t$pdbBound";
1790                            if (! exists $bindings{$bindingKey}) {
1791                                $loadIsBoundIn->Add("newConnection");
1792                                $loadIsBoundIn->Put($pdbFree, $pdbBound);
1793                                $bindings{$bindingKey} = 1;
1794                            }
1795                        }
1796                    }
1797                    # Close off this project.
1798                    close PROJECT;
1799                }
1800            }
1801        }
1802        # Finish the load.
1803        my $retVal = $self->_FinishAll();
1804        return $retVal;
1805    }
1806    
1807    
1808  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1809    
1810    =head3 SpecialAttribute
1811    
1812    C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >>
1813    
1814    Look for special attributes of a given type. A special attribute is found by comparing one of
1815    the columns of the incoming attribute list to a search pattern. If a match is found, then
1816    a set of columns is put into an output table connected to the specified ID.
1817    
1818    For example, when processing features, the attribute list we look at has three columns: attribute
1819    name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
1820    begins with C<iedb_>. The call signature is therefore
1821    
1822        my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB);
1823    
1824    The pattern is matched against column 0, and if we have a match, then column 2's value is put
1825    to the output along with the specified feature ID.
1826    
1827    =over 4
1828    
1829    =item id
1830    
1831    ID of the object whose special attributes are being loaded. This forms the first column of the
1832    output.
1833    
1834    =item attributes
1835    
1836    Reference to a list of tuples.
1837    
1838    =item idxMatch
1839    
1840    Index in each tuple of the column to be matched against the pattern. If the match is
1841    successful, an output record will be generated.
1842    
1843    =item idxValues
1844    
1845    Reference to a list containing the indexes in each tuple of the columns to be put as
1846    the second column of the output.
1847    
1848    =item pattern
1849    
1850    Pattern to be matched against the specified column. The match will be case-insensitive.
1851    
1852    =item loader
1853    
1854    An object to which each output record will be put. Usually this is an B<ERDBLoad> object,
1855    but technically it could be anything with a C<Put> method.
1856    
1857    =item RETURN
1858    
1859    Returns a count of the matches found.
1860    
1861    =item
1862    
1863    =back
1864    
1865    =cut
1866    
1867    sub SpecialAttribute {
1868        # Get the parameters.
1869        my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_;
1870        # Declare the return variable.
1871        my $retVal = 0;
1872        # Loop through the attribute rows.
1873        for my $row (@{$attributes}) {
1874            # Check for a match.
1875            if ($row->[$idxMatch] =~ m/$pattern/i) {
1876                # We have a match, so output a row. This is a bit tricky, since we may
1877                # be putting out multiple columns of data from the input.
1878                my $value = join(" ", map { $row->[$_] } @{$idxValues});
1879                $loader->Put($id, $value);
1880                $retVal++;
1881            }
1882        }
1883        Trace("$retVal special attributes found for $id and loader " . $loader->RelName() . ".") if T(4) && $retVal;
1884        # Return the number of matches.
1885        return $retVal;
1886    }
1887    
1888    =head3 CreatePDB
1889    
1890    C<< $loader->CreatePDB($pdbID, $title, $type, \%pdbHash); >>
1891    
1892    Insure that a PDB record exists for the identified PDB. If one does not exist, it will be
1893    created.
1894    
1895    =over 4
1896    
1897    =item pdbID
1898    
1899    ID string (usually an unqualified file name) for the desired PDB.
1900    
1901    =item title
1902    
1903    Title to use if the PDB must be created.
1904    
1905    =item type
1906    
1907    Type of PDB: C<free> or C<bound>
1908    
1909    =item pdbHash
1910    
1911    Hash containing the IDs of PDBs that have already been created.
1912    
1913    =item pdbLoader
1914    
1915    Load object for the PDB table.
1916    
1917    =back
1918    
1919    =cut
1920    
1921    sub CreatePDB {
1922        # Get the parameters.
1923        my ($self, $pdbID, $title, $type, $pdbHash, $pdbLoader) = @_;
1924        $pdbLoader->Add("PDB check");
1925        # Check to see if this is a new PDB.
1926        if (! exists $pdbHash->{$pdbID}) {
1927            # It is, so we create it.
1928            $pdbLoader->Put($pdbID, $title, $type);
1929            $pdbHash->{$pdbID} = 1;
1930            # Count it.
1931            $pdbLoader->Add("PDB-$type");
1932        }
1933    }
1934    
1935  =head3 TableLoader  =head3 TableLoader
1936    
1937  Create an ERDBLoad object for the specified table. The object is also added to  Create an ERDBLoad object for the specified table. The object is also added to
# Line 1570  Line 2031 
2031          $retVal->Accumulate($stats);          $retVal->Accumulate($stats);
2032          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);          Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
2033          }          }
     }  
2034      # Return the load statistics.      # Return the load statistics.
2035      return $retVal;      return $retVal;
2036  }  }
2037    =head3 GetGenomeAttributes
2038    
2039    C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids); >>
2040    
2041    Return a hash of attributes keyed on feature ID. This method gets all the attributes
2042    for all the features of a genome in a single call, then organizes them into a hash.
2043    
2044    =over 4
2045    
2046    =item fig
2047    
2048    FIG-like object for accessing attributes.
2049    
2050    =item genomeID
2051    
2052    ID of the genome who's attributes are desired.
2053    
2054    =item fids
2055    
2056    Reference to a list of the feature IDs whose attributes are to be kept.
2057    
2058    =item RETURN
2059    
2060    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
2061    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
2062    the attribute key, and one or more attribute values.
2063    
2064    =back
2065    
2066    =cut
2067    
2068    sub GetGenomeAttributes {
2069        # Get the parameters.
2070        my ($fig, $genomeID, $fids) = @_;
2071        # Declare the return variable.
2072        my $retVal = {};
2073        # Get the attributes.
2074        my @aList = $fig->get_attributes("fig|$genomeID%");
2075        # Initialize the hash. This not only enables us to easily determine which FIDs to
2076        # keep, it insures that the caller sees a list reference for every known fid,
2077        # simplifying the logic.
2078        for my $fid (@{$fids}) {
2079            $retVal->{$fid} = [];
2080        }
2081        # Populate the hash.
2082        for my $aListEntry (@aList) {
2083            my $fid = $aListEntry->[0];
2084            if (exists $retVal->{$fid}) {
2085                push @{$retVal->{$fid}}, $aListEntry;
2086            }
2087        }
2088        # Return the result.
2089        return $retVal;
2090    }
2091    
2092  1;  1;

Legend:
Removed from v.1.48  
changed lines
  Added in v.1.82

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3