[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.62, Sun Jul 30 05:44:57 2006 UTC revision 1.72, Sat Oct 14 18:12:14 2006 UTC
# Line 120  Line 120 
120                      # an omitted access code can be defaulted to 1.                      # an omitted access code can be defaulted to 1.
121                      for my $genomeLine (@genomeList) {                      for my $genomeLine (@genomeList) {
122                          my ($genomeID, $accessCode) = split("\t", $genomeLine);                          my ($genomeID, $accessCode) = split("\t", $genomeLine);
123                          if (undef $accessCode) {                          if (! defined($accessCode)) {
124                              $accessCode = 1;                              $accessCode = 1;
125                          }                          }
126                          $genomes{$genomeID} = $accessCode;                          $genomes{$genomeID} = $accessCode;
# Line 163  Line 163 
163                  Confess("Invalid subsystem parameter in SproutLoad constructor.");                  Confess("Invalid subsystem parameter in SproutLoad constructor.");
164              }              }
165          }          }
166            # Go through the subsys hash again, creating the keyword list for each subsystem.
167            for my $subsystem (keys %subsystems) {
168                my $name = $subsystem;
169                $name =~ s/_/ /g;
170                my $classes = $fig->subsystem_classification($subsystem);
171                my @classList = map { " $_" } @{$classes};
172                $name .= join("", @classList);
173                $subsystems{$subsystem} = $name;
174            }
175      }      }
176      # Get the data directory from the Sprout object.      # Get the data directory from the Sprout object.
177      my ($directory) = $sprout->LoadInfo();      my ($directory) = $sprout->LoadInfo();
# Line 266  Line 275 
275              my $extra = join " ", @extraData;              my $extra = join " ", @extraData;
276              # Get the full taxonomy.              # Get the full taxonomy.
277              my $taxonomy = $fig->taxonomy_of($genomeID);              my $taxonomy = $fig->taxonomy_of($genomeID);
278                # Open the NMPDR group file for this genome.
279                my $group;
280                if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
281                    defined($group = <TMP>)) {
282                    # Clean the line ending.
283                    chomp $group;
284                } else {
285                    # No group, so use the default.
286                    $group = $FIG_Config::otherGroup;
287                }
288                close TMP;
289              # Output the genome record.              # Output the genome record.
290              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,
291                               $species, $extra, $taxonomy);                               $group, $species, $extra, $taxonomy);
292              # Now we loop through each of the genome's contigs.              # Now we loop through each of the genome's contigs.
293              my @contigs = $fig->all_contigs($genomeID);              my @contigs = $fig->all_contigs($genomeID);
294              for my $contigID (@contigs) {              for my $contigID (@contigs) {
# Line 449  Line 469 
469      FeatureUpstream      FeatureUpstream
470      IsLocatedIn      IsLocatedIn
471      HasFeature      HasFeature
472        HasRoleInSubsystem
473    
474  =over 4  =over 4
475    
# Line 463  Line 484 
484  sub LoadFeatureData {  sub LoadFeatureData {
485      # Get this object instance.      # Get this object instance.
486      my ($self) = @_;      my ($self) = @_;
487      # Get the FIG object.      # Get the FIG and Sprout objects.
488      my $fig = $self->{fig};      my $fig = $self->{fig};
489        my $sprout = $self->{sprout};
490      # Get the table of genome IDs.      # Get the table of genome IDs.
491      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
492      # Create load objects for each of the tables we're loading.      # Create load objects for each of the tables we're loading.
# Line 474  Line 496 
496      my $loadFeatureLink = $self->_TableLoader('FeatureLink');      my $loadFeatureLink = $self->_TableLoader('FeatureLink');
497      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');      my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
498      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');      my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
499      my $loadHasFeature = $self->_TableLoader('HasFeature');      my $loadHasFeature = $self->_TableLoader('HasFeature', $self->PrimaryOnly);
500        my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem', $self->PrimaryOnly);
501        # Get the subsystem hash.
502        my $subHash = $self->{subsystems};
503      # Get the maximum sequence size. We need this later for splitting up the      # Get the maximum sequence size. We need this later for splitting up the
504      # locations.      # locations.
505      my $chunkSize = $self->{sprout}->MaxSegment();      my $chunkSize = $self->{sprout}->MaxSegment();
# Line 505  Line 530 
530                      $oldFeatureID = $featureID;                      $oldFeatureID = $featureID;
531                      # Count this feature.                      # Count this feature.
532                      $loadFeature->Add("featureIn");                      $loadFeature->Add("featureIn");
533                      # Create the feature record.                      # Get the functional assignment.
534                      $loadFeature->Put($featureID, 1, $type);                      my $assignment = $fig->function_of($featureID);
535                      # Link it to the parent genome.                      # Begin building the keywords.
536                        my $keywords = "$assignment $genomeID";
537                        # Link this feature to the parent genome.
538                      $loadHasFeature->Put($genomeID, $featureID, $type);                      $loadHasFeature->Put($genomeID, $featureID, $type);
539                      # Create the aliases.                      # Create the aliases.
540                      for my $alias ($fig->feature_aliases($featureID)) {                      for my $alias ($fig->feature_aliases($featureID)) {
541                          $loadFeatureAlias->Put($featureID, $alias);                          $loadFeatureAlias->Put($featureID, $alias);
542                            $keywords .= " $alias";
543                      }                      }
544                      # Get the links.                      # Get the links.
545                      my @links = $fig->fid_links($featureID);                      my @links = $fig->fid_links($featureID);
# Line 531  Line 559 
559                              $loadFeatureUpstream->Put($featureID, $upstream);                              $loadFeatureUpstream->Put($featureID, $upstream);
560                          }                          }
561                      }                      }
562                        # Now we need to find the subsystems this feature participates in.
563                        # We also add the subsystems to the keyword list. Before we do that,
564                        # we must convert underscores to spaces and tack on the classifications.
565                        my @subsystems = $fig->peg_to_subsystems($featureID);
566                        for my $subsystem (@subsystems) {
567                            # Only proceed if we like this subsystem.
568                            if (exists $subHash->{$subsystem}) {
569                                # Store the has-role link.
570                                $loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type);
571                                # Save the subsystem's keyword data.
572                                my $subKeywords = $subHash->{$subsystem};
573                                $keywords .= " $subKeywords";
574                            }
575                        }
576                        # The final task is to add virulence and essentiality attributes.
577                        if ($fig->virulent($featureID)) {
578                            $keywords .= " virulent";
579                        }
580                        if ($fig->essential($featureID)) {
581                            $keywords .= " essential";
582                        }
583                        # Clean the keyword list.
584                        my $cleanWords = $sprout->CleanKeywords($keywords);
585                        # Create the feature record.
586                        $loadFeature->Put($featureID, 1, $type, $assignment, $cleanWords);
587                      # This part is the roughest. We need to relate the features to contig                      # This part is the roughest. We need to relate the features to contig
588                      # locations, and the locations must be split so that none of them exceed                      # locations, and the locations must be split so that none of them exceed
589                      # the maximum segment size. This simplifies the genes_in_region processing                      # the maximum segment size. This simplifies the genes_in_region processing
# Line 609  Line 662 
662              Trace("Processing features for genome $genomeID.") if T(3);              Trace("Processing features for genome $genomeID.") if T(3);
663              # Get the feature list for this genome.              # Get the feature list for this genome.
664              my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed($genomeID);
665                # Count the BBHs we find.
666                my $bbhCount = 0;
667              # Loop through the features.              # Loop through the features.
668              for my $featureData (@{$features}) {              for my $featureData (@{$features}) {
669                  # Split the tuple.                  # Split the tuple.
# Line 624  Line 679 
679                      if ($genomeHash->{$targetGenomeID}) {                      if ($genomeHash->{$targetGenomeID}) {
680                          $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,                          $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,
681                                                             $score);                                                             $score);
682                            $bbhCount++;
683                      }                      }
684                  }                  }
685              }              }
686                Trace("$bbhCount BBHs found for $genomeID.") if T(3);
687          }          }
688      }      }
689      # Finish the loads.      # Finish the loads.
# Line 738  Line 795 
795                  my $curator = $sub->get_curator();                  my $curator = $sub->get_curator();
796                  my $notes = $sub->get_notes();                  my $notes = $sub->get_notes();
797                  $loadSubsystem->Put($subsysID, $curator, $notes);                  $loadSubsystem->Put($subsysID, $curator, $notes);
798                  my $class = $fig->subsystem_classification($subsysID);                  # Now for the classification string. This comes back as a list
799                  if ($class) {                  # reference and we convert it to a space-delimited string.
800                      $loadSubsystemClass->Put($subsysID, $class);                  my $classList = $fig->subsystem_classification($subsysID);
801                  }                  my $classString = join(" ", grep { $_ } @$classList);
802                    $loadSubsystemClass->Put($subsysID, $classString);
803                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
804                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
805                      # Connect to this role.                      # Connect to this role.
# Line 944  Line 1002 
1002          my %propertyKeys = ();          my %propertyKeys = ();
1003          my $nextID = 1;          my $nextID = 1;
1004          # Loop through the genomes.          # Loop through the genomes.
1005          for my $genomeID (keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
1006              $loadProperty->Add("genomeIn");              $loadProperty->Add("genomeIn");
1007              Trace("Generating properties for $genomeID.") if T(3);              Trace("Generating properties for $genomeID.") if T(3);
1008              # Get the genome's features. The feature ID is the first field in the              # Get the genome's features. The feature ID is the first field in the
# Line 958  Line 1016 
1016                  # Get all attributes for this feature. We do this one feature at a time                  # Get all attributes for this feature. We do this one feature at a time
1017                  # to insure we do not get any genome attributes.                  # to insure we do not get any genome attributes.
1018                  my @attributeList = $fig->get_attributes($fid, '', '', '');                  my @attributeList = $fig->get_attributes($fid, '', '', '');
1019                    # Add essentiality and virulence attributes.
1020                    if ($fig->essential($fid)) {
1021                        push @attributeList, [$fid, 'essential', 1, ''];
1022                    }
1023                    if ($fig->virulent($fid)) {
1024                        push @attributeList, [$fid, 'virulent', 1, ''];
1025                    }
1026                  if (scalar @attributeList) {                  if (scalar @attributeList) {
1027                      $featureCount++;                      $featureCount++;
1028                  }                  }
# Line 1370  Line 1435 
1435    
1436      GenomeGroups      GenomeGroups
1437    
1438  There is no direct support for genome groups in FIG, so we access the SEED  Currently, we do not use groups. We used to use them for NMPDR groups,
1439    but there is no direct support for genome groups in FIG, so we access the SEED
1440  files directly.  files directly.
1441    
1442  =over 4  =over 4
# Line 1396  Line 1462 
1462          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1463      } else {      } else {
1464          Trace("Generating group data.") if T(2);          Trace("Generating group data.") if T(2);
1465          # Loop through the genomes.          # Currently there are no groups.
         my $line;  
         for my $genomeID (keys %{$genomeHash}) {  
             Trace("Processing $genomeID.") if T(3);  
             # Open the NMPDR group file for this genome.  
             if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&  
                 defined($line = <TMP>)) {  
                 # Clean the line ending.  
                 chomp $line;  
                 # Add the group to the table. Note that there can only be one group  
                 # per genome.  
                 $loadGenomeGroups->Put($genomeID, $line);  
             }  
             close TMP;  
         }  
1466      }      }
1467      # Finish the load.      # Finish the load.
1468      my $retVal = $self->_FinishAll();      my $retVal = $self->_FinishAll();
# Line 1506  Line 1558 
1558  The following relations are loaded by this method.  The following relations are loaded by this method.
1559    
1560      Family      Family
1561      ContainsFeature      IsFamilyForFeature
1562    
1563  The source information for these relations is taken from the C<families_for_protein>,  The source information for these relations is taken from the C<families_for_protein>,
1564  C<family_function>, and C<sz_family> methods of the B<FIG> object.  C<family_function>, and C<sz_family> methods of the B<FIG> object.
# Line 1530  Line 1582 
1582      my $genomeHash = $self->{genomes};      my $genomeHash = $self->{genomes};
1583      # Create load objects for the tables we're loading.      # Create load objects for the tables we're loading.
1584      my $loadFamily = $self->_TableLoader('Family');      my $loadFamily = $self->_TableLoader('Family');
1585      my $loadContainsFeature = $self->_TableLoader('ContainsFeature');      my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature');
1586      if ($self->{options}->{loadOnly}) {      if ($self->{options}->{loadOnly}) {
1587          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
1588      } else {      } else {
# Line 1542  Line 1594 
1594              Trace("Processing features for $genomeID.") if T(2);              Trace("Processing features for $genomeID.") if T(2);
1595              # Loop through this genome's PEGs.              # Loop through this genome's PEGs.
1596              for my $fid ($fig->all_features($genomeID, "peg")) {              for my $fid ($fig->all_features($genomeID, "peg")) {
1597                  $loadContainsFeature->Add("features", 1);                  $loadIsFamilyForFeature->Add("features", 1);
1598                  # Get this feature's families.                  # Get this feature's families.
1599                  my @families = $fig->families_for_protein($fid);                  my @families = $fig->families_for_protein($fid);
1600                  # Loop through the families, connecting them to the feature.                  # Loop through the families, connecting them to the feature.
1601                  for my $family (@families) {                  for my $family (@families) {
1602                      $loadContainsFeature->Put($family, $fid);                      $loadIsFamilyForFeature->Put($family, $fid);
1603                      # If this is a new family, create a record for it.                      # If this is a new family, create a record for it.
1604                      if (! exists $familyHash{$family}) {                      if (! exists $familyHash{$family}) {
1605                          $familyHash{$family} = 1;                          $familyHash{$family} = 1;
# Line 1565  Line 1617 
1617      return $retVal;      return $retVal;
1618  }  }
1619    
1620    
1621    
1622  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1623    
1624  =head3 TableLoader  =head3 TableLoader

Legend:
Removed from v.1.62  
changed lines
  Added in v.1.72

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3