[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.76, Fri Nov 3 00:43:22 2006 UTC revision 1.82, Tue Apr 10 06:15:35 2007 UTC
# Line 274  Line 274 
274              my $extra = join " ", @extraData;              my $extra = join " ", @extraData;
275              # Get the full taxonomy.              # Get the full taxonomy.
276              my $taxonomy = $fig->taxonomy_of($genomeID);              my $taxonomy = $fig->taxonomy_of($genomeID);
277                # Get the version. If no version is specified, we default to the genome ID by itself.
278                my $version = $fig->genome_version($genomeID);
279                if (! defined($version)) {
280                    $version = $genomeID;
281                }
282                # Get the DNA size.
283                my $dnaSize = $fig->genome_szdna($genomeID);
284              # Open the NMPDR group file for this genome.              # Open the NMPDR group file for this genome.
285              my $group;              my $group;
286              if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&              if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
# Line 286  Line 293 
293              }              }
294              close TMP;              close TMP;
295              # Output the genome record.              # Output the genome record.
296              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),
297                               $group, $species, $extra, $taxonomy);                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);
298              # Now we loop through each of the genome's contigs.              # Now we loop through each of the genome's contigs.
299              my @contigs = $fig->all_contigs($genomeID);              my @contigs = $fig->all_contigs($genomeID);
300              for my $contigID (@contigs) {              for my $contigID (@contigs) {
# Line 517  Line 524 
524              Trace("Loading features for genome $genomeID.") if T(3);              Trace("Loading features for genome $genomeID.") if T(3);
525              $loadFeature->Add("genomeIn");              $loadFeature->Add("genomeIn");
526              # Get the feature list for this genome.              # Get the feature list for this genome.
527              my $features = $fig->all_features_detailed($genomeID);              my $features = $fig->all_features_detailed_fast($genomeID);
528              # Sort and count the list.              # Sort and count the list.
529              my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};              my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
530              my $count = scalar @featureTuples;              my $count = scalar @featureTuples;
531                my @fids = map { $_->[0] } @featureTuples;
532              Trace("$count features found for genome $genomeID.") if T(3);              Trace("$count features found for genome $genomeID.") if T(3);
533                # Get the attributes for this genome and put them in a hash by feature ID.
534                my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids);
535              # Set up for our duplicate-feature check.              # Set up for our duplicate-feature check.
536              my $oldFeatureID = "";              my $oldFeatureID = "";
537              # Loop through the features.              # Loop through the features.
538              for my $featureTuple (@featureTuples) {              for my $featureTuple (@featureTuples) {
539                  # Split the tuple.                  # Split the tuple.
540                  my ($featureID, $locations, undef, $type) = @{$featureTuple};                  my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
541                  # Check for duplicates.                  # Check for duplicates.
542                  if ($featureID eq $oldFeatureID) {                  if ($featureID eq $oldFeatureID) {
543                      Trace("Duplicate feature $featureID found.") if T(1);                      Trace("Duplicate feature $featureID found.") if T(1);
# Line 535  Line 545 
545                      $oldFeatureID = $featureID;                      $oldFeatureID = $featureID;
546                      # Count this feature.                      # Count this feature.
547                      $loadFeature->Add("featureIn");                      $loadFeature->Add("featureIn");
548                        # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
549                        # Sprout database requires a single character.
550                        if (! defined($quality) || $quality eq "") {
551                            $quality = " ";
552                        }
553                      # Begin building the keywords. We start with the genome ID, the                      # Begin building the keywords. We start with the genome ID, the
554                      # feature ID, and the organism name.                      # feature ID, the taxonomy, and the organism name.
555                      my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID));                      my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
556                      # Get the functional assignment and aliases. This                                      $fig->taxonomy_of($genomeID));
                     # depends on the feature type.  
                     my $assignment;  
                     if ($type eq "peg") {  
                         $assignment = $fig->function_of($featureID);  
557                          # Create the aliases.                          # Create the aliases.
558                          for my $alias ($fig->feature_aliases($featureID)) {                          for my $alias ($fig->feature_aliases($featureID)) {
559                              $loadFeatureAlias->Put($featureID, $alias);                              $loadFeatureAlias->Put($featureID, $alias);
560                              push @keywords, $alias;                              push @keywords, $alias;
561                          }                          }
                     } else {  
                         # For other types, the assignment is the first (and ONLY) alias.  
                         ($assignment) = $fig->feature_aliases($featureID);  
                     }  
562                      Trace("Assignment for $featureID is: $assignment") if T(4);                      Trace("Assignment for $featureID is: $assignment") if T(4);
563                      # Break the assignment into words and shove it onto the                      # Break the assignment into words and shove it onto the
564                      # keyword list.                      # keyword list.
# Line 607  Line 614 
614                      # [name, value, value with URL]. (We don't need the PEG, since                      # [name, value, value with URL]. (We don't need the PEG, since
615                      # we already know it.)                      # we already know it.)
616                      my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }                      my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
617                                           $fig->get_attributes($featureID);                                           @{$attributes->{$featureID}};
618                      # Now we process each of the special attributes.                      # Now we process each of the special attributes.
619                      if (SpecialAttribute($featureID, \@attributes,                      if (SpecialAttribute($featureID, \@attributes,
620                                           1, 2, '^(essential|potential_essential)$',                                           1, [0,2], '^(essential|potential_essential)$',
621                                           $loadFeatureEssential)) {                                           $loadFeatureEssential)) {
622                          push @keywords, 'essential';                          push @keywords, 'essential';
623                          $loadFeature->Add('essential');                          $loadFeature->Add('essential');
624                      }                      }
625                      if (SpecialAttribute($featureID, \@attributes,                      if (SpecialAttribute($featureID, \@attributes,
626                                           0, 2, '^virulen',                                           0, [2], '^virulen',
627                                           $loadFeatureVirulent)) {                                           $loadFeatureVirulent)) {
628                          push @keywords, 'virulent';                          push @keywords, 'virulent';
629                          $loadFeature->Add('virulent');                          $loadFeature->Add('virulent');
630                      }                      }
631                      if (SpecialAttribute($featureID, \@attributes,                      if (SpecialAttribute($featureID, \@attributes,
632                                           0, 2, '^iedb_',                                           0, [0,2], '^iedb_',
633                                           $loadFeatureIEDB)) {                                           $loadFeatureIEDB)) {
634                          push @keywords, 'iedb';                          push @keywords, 'iedb';
635                          $loadFeature->Add('iedb');                          $loadFeature->Add('iedb');
636                      }                      }
637                      # Now we need to bust up hyphenated words in the keyword                      # Now we need to bust up hyphenated words in the keyword
638                      # list.                      # list. We keep them separate and put them at the end so
639                        # the original word order is available.
640                      my $keywordString = "";                      my $keywordString = "";
641                        my $bustedString = "";
642                      for my $keyword (@keywords) {                      for my $keyword (@keywords) {
643                          if (length $keyword >= 4) {                          if (length $keyword >= 3) {
644                              $keywordString .= " $keyword";                              $keywordString .= " $keyword";
645                              if ($keyword =~ /-/) {                              if ($keyword =~ /-/) {
646                                  my @words = grep { length($_) >= 4 } split /-/, $keyword;                                  my @words = split /-/, $keyword;
647                                  $keywordString .= join(" ", "", @words);                                  $bustedString .= join(" ", "", @words);
648                              }                              }
649                          }                          }
650                      }                      }
651                        $keywordString .= $bustedString;
652                        # Get rid of annoying punctuation.
653                        $keywordString =~ s/[();]//g;
654                      # Clean the keyword list.                      # Clean the keyword list.
655                      my $cleanWords = $sprout->CleanKeywords($keywordString);                      my $cleanWords = $sprout->CleanKeywords($keywordString);
656                      Trace("Keyword string for $featureID: $cleanWords") if T(4);                      Trace("Keyword string for $featureID: $cleanWords") if T(4);
657                      # Create the feature record.                      # Create the feature record.
658                      $loadFeature->Put($featureID, 1, $type, $assignment, $cleanWords);                      $loadFeature->Put($featureID, 1, $user, $quality, $type, $assignment, $cleanWords);
659                      # This part is the roughest. We need to relate the features to contig                      # This part is the roughest. We need to relate the features to contig
660                      # locations, and the locations must be split so that none of them exceed                      # locations, and the locations must be split so that none of them exceed
661                      # the maximum segment size. This simplifies the genes_in_region processing                      # the maximum segment size. This simplifies the genes_in_region processing
# Line 786  Line 798 
798                  # Now for the classification string. This comes back as a list                  # Now for the classification string. This comes back as a list
799                  # reference and we convert it to a space-delimited string.                  # reference and we convert it to a space-delimited string.
800                  my $classList = $fig->subsystem_classification($subsysID);                  my $classList = $fig->subsystem_classification($subsysID);
801                  my $classString = join(" : ", grep { $_ } @$classList);                  my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
802                  $loadSubsystemClass->Put($subsysID, $classString);                  $loadSubsystemClass->Put($subsysID, $classString);
803                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.                  # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
804                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {                  for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
# Line 999  Line 1011 
1011              my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};              my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};
1012              my $featureCount = 0;              my $featureCount = 0;
1013              my $propertyCount = 0;              my $propertyCount = 0;
1014                # Get the properties for this genome's features.
1015                my $attributes = GetGenomeAttributes($fig, $genomeID, \@features);
1016                Trace("Property hash built for $genomeID.") if T(3);
1017              # Loop through the features, creating HasProperty records.              # Loop through the features, creating HasProperty records.
1018              for my $fid (@features) {              for my $fid (@features) {
1019                  # Get all attributes for this feature. We do this one feature at a time                  # Get all attributes for this feature. We do this one feature at a time
1020                  # to insure we do not get any genome attributes.                  # to insure we do not get any genome attributes.
1021                  my @attributeList = $fig->get_attributes($fid, '', '', '');                  my @attributeList = @{$attributes->{$fid}};
                 # Add essentiality and virulence attributes.  
                 if ($fig->essential($fid)) {  
                     push @attributeList, [$fid, 'essential', 1, ''];  
                 }  
                 if ($fig->virulent($fid)) {  
                     push @attributeList, [$fid, 'virulent', 1, ''];  
                 }  
1022                  if (scalar @attributeList) {                  if (scalar @attributeList) {
1023                      $featureCount++;                      $featureCount++;
1024                  }                  }
# Line 1801  Line 1809 
1809    
1810  =head3 SpecialAttribute  =head3 SpecialAttribute
1811    
1812  C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, $idxValue, $pattern, $loader); >>  C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >>
1813    
1814  Look for special attributes of a given type. A special attribute is found by comparing one of  Look for special attributes of a given type. A special attribute is found by comparing one of
1815  the columns of the incoming attribute list to a search pattern. If a match is found, then  the columns of the incoming attribute list to a search pattern. If a match is found, then
1816  another column is put into an output table connected to the specified ID.  a set of columns is put into an output table connected to the specified ID.
1817    
1818  For example, when processing features, the attribute list we look at has three columns: attribute  For example, when processing features, the attribute list we look at has three columns: attribute
1819  name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name  name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
1820  begins with C<iedb_>. The call signature is therefore  begins with C<iedb_>. The call signature is therefore
1821    
1822      my $found = SpecialAttribute($fid, \@attributeList, 0, 2, '^iedb_', $loadFeatureIEDB);      my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB);
1823    
1824  The pattern is matched against column 0, and if we have a match, then column 2's value is put  The pattern is matched against column 0, and if we have a match, then column 2's value is put
1825  to the output along with the specified feature ID.  to the output along with the specified feature ID.
# Line 1832  Line 1840 
1840  Index in each tuple of the column to be matched against the pattern. If the match is  Index in each tuple of the column to be matched against the pattern. If the match is
1841  successful, an output record will be generated.  successful, an output record will be generated.
1842    
1843  =item idxValue  =item idxValues
1844    
1845  Index in each tuple of the column to be put as the second column of the output.  Reference to a list containing the indexes in each tuple of the columns to be put as
1846    the second column of the output (the selected values are joined with single spaces).
1847    
1848  =item pattern  =item pattern
1849    
# Line 1857  Line 1866 
1866    
1867  sub SpecialAttribute {  sub SpecialAttribute {
1868      # Get the parameters.      # Get the parameters.
1869      my ($id, $attributes, $idxMatch, $idxValue, $pattern, $loader) = @_;      my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_;
1870      # Declare the return variable.      # Declare the return variable.
1871      my $retVal = 0;      my $retVal = 0;
1872      # Loop through the attribute rows.      # Loop through the attribute rows.
1873      for my $row (@{$attributes}) {      for my $row (@{$attributes}) {
1874          # Check for a match.          # Check for a match.
1875          if ($row->[$idxMatch] =~ m/$pattern/i) {          if ($row->[$idxMatch] =~ m/$pattern/i) {
1876              # We have a match, so output a row.              # We have a match, so output a row. This is a bit tricky, since we may
1877              $loader->Put($id, $row->[$idxValue]);              # be putting out multiple columns of data from the input.
1878                my $value = join(" ", map { $row->[$_] } @{$idxValues});
1879                $loader->Put($id, $value);
1880              $retVal++;              $retVal++;
1881          }          }
1882      }      }
# Line 2023  Line 2034 
2034      # Return the load statistics.      # Return the load statistics.
2035      return $retVal;      return $retVal;
2036  }  }
2037    =head3 GetGenomeAttributes
2038    
2039    C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids); >>
2040    
2041    Return a hash of attributes keyed on feature ID. This method gets all the attributes
2042    for all the features of a genome in a single call, then organizes them into a hash.
2043    
2044    =over 4
2045    
2046    =item fig
2047    
2048    FIG-like object for accessing attributes.
2049    
2050    =item genomeID
2051    
2052    ID of the genome whose attributes are desired.
2053    
2054    =item fids
2055    
2056    Reference to a list of the feature IDs whose attributes are to be kept.
2057    
2058    =item RETURN
2059    
2060    Returns a reference to a hash. The key of the hash is the feature ID. The value is the
2061    reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
2062    the attribute key, and one or more attribute values.
2063    
2064    =back
2065    
2066    =cut
2067    
2068    sub GetGenomeAttributes {
2069        # Get the parameters.
2070        my ($fig, $genomeID, $fids) = @_;
2071        # Declare the return variable.
2072        my $retVal = {};
2073        # Get the attributes.
2074        my @aList = $fig->get_attributes("fig|$genomeID%");
2075        # Initialize the hash. This not only enables us to easily determine which FIDs to
2076        # keep, it ensures that the caller sees a list reference for every known fid,
2077        # simplifying the logic.
2078        for my $fid (@{$fids}) {
2079            $retVal->{$fid} = [];
2080        }
2081        # Populate the hash.
2082        for my $aListEntry (@aList) {
2083            my $fid = $aListEntry->[0];
2084            if (exists $retVal->{$fid}) {
2085                push @{$retVal->{$fid}}, $aListEntry;
2086            }
2087        }
2088        # Return the result.
2089        return $retVal;
2090    }
2091    
2092  1;  1;

Legend:
Removed from v.1.76  
changed lines
  Added in v.1.82

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3