[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.78, Wed Nov 15 12:15:30 2006 UTC revision 1.82, Tue Apr 10 06:15:35 2007 UTC
# Line 274  Line 274 
274              my $extra = join " ", @extraData;              my $extra = join " ", @extraData;
275              # Get the full taxonomy.              # Get the full taxonomy.
276              my $taxonomy = $fig->taxonomy_of($genomeID);              my $taxonomy = $fig->taxonomy_of($genomeID);
277                # Get the version. If no version is specified, we default to the genome ID by itself.
278                my $version = $fig->genome_version($genomeID);
279                if (! defined($version)) {
280                    $version = $genomeID;
281                }
282                # Get the DNA size.
283                my $dnaSize = $fig->genome_szdna($genomeID);
284              # Open the NMPDR group file for this genome.              # Open the NMPDR group file for this genome.
285              my $group;              my $group;
286              if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&              if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
# Line 286  Line 293 
293              }              }
294              close TMP;              close TMP;
295              # Output the genome record.              # Output the genome record.
296              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),
297                               $group, $species, $extra, $taxonomy);                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);
298              # Now we loop through each of the genome's contigs.              # Now we loop through each of the genome's contigs.
299              my @contigs = $fig->all_contigs($genomeID);              my @contigs = $fig->all_contigs($genomeID);
300              for my $contigID (@contigs) {              for my $contigID (@contigs) {
# Line 517  Line 524 
524              Trace("Loading features for genome $genomeID.") if T(3);              Trace("Loading features for genome $genomeID.") if T(3);
525              $loadFeature->Add("genomeIn");              $loadFeature->Add("genomeIn");
526              # Get the feature list for this genome.              # Get the feature list for this genome.
527              my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
528              # Sort and count the list.              # Sort and count the list.
529              my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};              my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
530              my $count = scalar @featureTuples;              my $count = scalar @featureTuples;
531                my @fids = map { $_->[0] } @featureTuples;
532              Trace("$count features found for genome $genomeID.") if T(3);              Trace("$count features found for genome $genomeID.") if T(3);
533                # Get the attributes for this genome and put them in a hash by feature ID.
534                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids);
535              # Set up for our duplicate-feature check.              # Set up for our duplicate-feature check.
536              my $oldFeatureID = "";              my $oldFeatureID = "";
537              # Loop through the features.              # Loop through the features.
538              for my $featureTuple (@featureTuples) {              for my $featureTuple (@featureTuples) {
539                  # Split the tuple.                  # Split the tuple.
540                  my ($featureID, $locations, undef, $type) = @{$featureTuple};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
541                  # Check for duplicates.                  # Check for duplicates.
542                  if ($featureID eq $oldFeatureID) {                  if ($featureID eq $oldFeatureID) {
543                      Trace("Duplicate feature $featureID found.") if T(1);                      Trace("Duplicate feature $featureID found.") if T(1);
# Line 535  Line 545 
545                      $oldFeatureID = $featureID;                      $oldFeatureID = $featureID;
546                      # Count this feature.                      # Count this feature.
547                      $loadFeature->Add("featureIn");                      $loadFeature->Add("featureIn");
548                        # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
549                        # Sprout database requires a single character.
550                        if (! defined($quality) || $quality eq "") {
551                            $quality = " ";
552                        }
553                      # Begin building the keywords. We start with the genome ID, the                      # Begin building the keywords. We start with the genome ID, the
554                      # feature ID, and the organism name.                      # feature ID, the taxonomy, and the organism name.
555                      my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID));                      my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
556                      # Get the functional assignment and aliases. This                                      $fig->taxonomy_of($genomeID));
                     # depends on the feature type.  
                     my $assignment;  
                     if ($type eq "peg") {  
                         $assignment = $fig->function_of($featureID);  
557                          # Create the aliases.                          # Create the aliases.
558                          for my $alias ($fig->feature_aliases($featureID)) {                          for my $alias ($fig->feature_aliases($featureID)) {
559                              $loadFeatureAlias->Put($featureID, $alias);                              $loadFeatureAlias->Put($featureID, $alias);
560                              push @keywords, $alias;                              push @keywords, $alias;
561                          }                          }
                     } else {  
                         # For other types, the assignment is the first (and ONLY) alias.  
                         ($assignment) = $fig->feature_aliases($featureID);  
                     }  
562                      Trace("Assignment for $featureID is: $assignment") if T(4);                      Trace("Assignment for $featureID is: $assignment") if T(4);
563                      # Break the assignment into words and shove it onto the                      # Break the assignment into words and shove it onto the
564                      # keyword list.                      # keyword list.
# Line 607  Line 614 
614                      # [name, value, value with URL]. (We don't need the PEG, since                      # [name, value, value with URL]. (We don't need the PEG, since
615                      # we already know it.)                      # we already know it.)
616                      my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }                      my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
617                                           $fig->get_attributes($featureID);                                           @{$attributes->{$featureID}};
618                      # Now we process each of the special attributes.                      # Now we process each of the special attributes.
619                      if (SpecialAttribute($featureID, \@attributes,                      if (SpecialAttribute($featureID, \@attributes,
620                                           1, [0,2], '^(essential|potential_essential)$',                                           1, [0,2], '^(essential|potential_essential)$',
# Line 628  Line 635 
635                          $loadFeature->Add('iedb');                          $loadFeature->Add('iedb');
636                      }                      }
637                      # Now we need to bust up hyphenated words in the keyword                      # Now we need to bust up hyphenated words in the keyword
638                      # list.                      # list. We keep them separate and put them at the end so
639                        # the original word order is available.
640                      my $keywordString = "";                      my $keywordString = "";
641                        my $bustedString = "";
642                      for my $keyword (@keywords) {                      for my $keyword (@keywords) {
643                          if (length $keyword >= 4) {                          if (length $keyword >= 3) {
644                              $keywordString .= " $keyword";                              $keywordString .= " $keyword";
645                              if ($keyword =~ /-/) {                              if ($keyword =~ /-/) {
646                                  my @words = grep { length($_) >= 4 } split /-/, $keyword;                                  my @words = split /-/, $keyword;
647                                  $keywordString .= join(" ", "", @words);                                  $bustedString .= join(" ", "", @words);
648                              }                              }
649                          }                          }
650                      }                      }
651                        $keywordString .= $bustedString;
652                        # Get rid of annoying punctuation.
653                        $keywordString =~ s/[();]//g;
654                      # Clean the keyword list.                      # Clean the keyword list.
655                      my $cleanWords = $sprout->CleanKeywords($keywordString);                      my $cleanWords = $sprout->CleanKeywords($keywordString);
656                      Trace("Keyword string for $featureID: $cleanWords") if T(4);                      Trace("Keyword string for $featureID: $cleanWords") if T(4);
657                      # Create the feature record.                      # Create the feature record.
658                      $loadFeature->Put($featureID, 1, $type, $assignment, $cleanWords);                      $loadFeature->Put($featureID, 1, $user, $quality, $type, $assignment, $cleanWords);
659                      # This part is the roughest. We need to relate the features to contig                      # This part is the roughest. We need to relate the features to contig
660                      # locations, and the locations must be split so that none of them exceed                      # locations, and the locations must be split so that none of them exceed
661                      # the maximum segment size. This simplifies the genes_in_region processing                      # the maximum segment size. This simplifies the genes_in_region processing
# Line 999  Line 1011 
1011              my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};              my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};
1012              my $featureCount = 0;              my $featureCount = 0;
1013              my $propertyCount = 0;              my $propertyCount = 0;
1014                # Get the properties for this genome's features.
1015                my $attributes = GetGenomeAttributes($fig, $genomeID, \@features);
1016                Trace("Property hash built for $genomeID.") if T(3);
1017              # Loop through the features, creating HasProperty records.              # Loop through the features, creating HasProperty records.
1018              for my $fid (@features) {              for my $fid (@features) {
1019                  # Get all attributes for this feature. We do this one feature at a time                  # Get all attributes for this feature. We do this one feature at a time
1020                  # to insure we do not get any genome attributes.                  # to insure we do not get any genome attributes.
1021                  my @attributeList = $fig->get_attributes($fid);                  my @attributeList = @{$attributes->{$fid}};
                 # Add essentiality and virulence attributes.  
                 if ($fig->essential($fid)) {  
                     push @attributeList, [$fid, 'essential', 1, ''];  
                 }  
                 if ($fig->virulent($fid)) {  
                     push @attributeList, [$fid, 'virulent', 1, ''];  
                 }  
1022                  if (scalar @attributeList) {                  if (scalar @attributeList) {
1023                      $featureCount++;                      $featureCount++;
1024                  }                  }
# Line 2026  Line 2034 
2034      # Return the load statistics.      # Return the load statistics.
2035      return $retVal;      return $retVal;
2036  }  }
2037    =head3 GetGenomeAttributes
2038    
2039    C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids); >>
2040    
2041    Return a hash of attributes keyed on feature ID. This method gets all the attributes
2042    for all the features of a genome in a single call, then organizes them into a hash.
2043    
2044    =over 4
2045    
2046    =item fig
2047    
2048    FIG-like object for accessing attributes.
2049    
2050    =item genomeID
2051    
2052    ID of the genome whose attributes are desired.
2053    
2054    =item fids
2055    
2056    Reference to a list of the feature IDs whose attributes are to be kept.
2057    
2058    =item RETURN
2059    
2060    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
2061    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
2062    the attribute key, and one or more attribute values.
2063    
2064    =back
2065    
2066    =cut
2067    
2068    sub GetGenomeAttributes {
2069        # Get the parameters: the FIG-like object, the genome ID, and a reference to the list of feature IDs to keep.
2070        my ($fig, $genomeID, $fids) = @_;
2071        # Declare the return variable: a hash reference that will be keyed on feature ID.
2072        my $retVal = {};
2073        # Get the attributes in a single call using the pattern "fig|<genomeID>%" (presumably "%" acts as a wildcard covering every feature of the genome; confirm against get_attributes).
2074        my @aList = $fig->get_attributes("fig|$genomeID%");
2075        # Initialize the hash with an empty list for each requested fid. This not only lets us easily determine
2076        # which FIDs to keep, it ensures that the caller sees a list reference for every known fid,
2077        # simplifying the logic.
2078        for my $fid (@{$fids}) {
2079            $retVal->{$fid} = [];
2080        }
2081        # Populate the hash. The first element of each tuple is used as the feature ID; tuples for IDs that were not requested are discarded.
2082        for my $aListEntry (@aList) {
2083            my $fid = $aListEntry->[0];
2084            if (exists $retVal->{$fid}) {
2085                push @{$retVal->{$fid}}, $aListEntry;
2086            }
2087        }
2088        # Return the hash reference (feature ID => reference to a list of attribute tuples).
2089        return $retVal;
2090    }
2091    
2092  1;  1;

Legend:
Removed from v.1.78  
changed lines
  Added in v.1.82

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3