[Bio] / Sprout / FeatureSproutLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/FeatureSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.8, Mon Mar 16 00:11:19 2009 UTC revision 1.9, Thu Apr 2 01:42:00 2009 UTC
# Line 28  Line 28 
28      use HyperLink;      use HyperLink;
29      use FFs;      use FFs;
30      use SOAP::Lite;      use SOAP::Lite;
31        use Time::HiRes;
32      use base 'BaseSproutLoader';      use base 'BaseSproutLoader';
33    
34  =head1 Sprout Feature Load Group Class  =head1 Sprout Feature Load Group Class
# Line 66  Line 67 
67      # Create the table list.      # Create the table list.
68      my @tables = sort qw(Feature IsLocatedIn FeatureAlias IsAliasOf FeatureLink      my @tables = sort qw(Feature IsLocatedIn FeatureAlias IsAliasOf FeatureLink
69                           FeatureTranslation FeatureUpstream HasFeature HasRoleInSubsystem                           FeatureTranslation FeatureUpstream HasFeature HasRoleInSubsystem
70                           FeatureEssential FeatureVirulent FeatureIEDB CDD IsPresentOnProteinOf                           FeatureEssential FeatureVirulent FeatureIEDB CDD
71                           CellLocation IsPossiblePlaceFor IsAlsoFoundIn ExternalDatabase Keyword                           IsPresentOnProteinOf CellLocation IsPossiblePlaceFor
72                           ProteinFamily IsFamilyForFeature ProteinFamilyName FeatureEC);                           IsAlsoFoundIn ExternalDatabase Keyword ProteinFamily
73                             IsFamilyForFeature ProteinFamilyName FeatureEC);
74      # Create the BaseSproutLoader object.      # Create the BaseSproutLoader object.
75      my $retVal = BaseSproutLoader::new($class, $erdb, $options, @tables);      my $retVal = BaseSproutLoader::new($class, $erdb, $options, @tables);
76      # Get the list of relevant attributes.      # Get the list of relevant attributes.
# Line 100  Line 102 
102      my $stemmer = $sprout->GetStemmer();      my $stemmer = $sprout->GetStemmer();
103      # Get access to FIGfams.      # Get access to FIGfams.
104      my $figfam_data = &FIG::get_figfams_data();      my $figfam_data = &FIG::get_figfams_data();
105      my $ffs = new FFs($figfam_data);      my $ffs = new FFs($figfam_data, $fig);
106        # Compute the load directory.
107        my $loadDirectory = $sprout->LoadDirectory();
108      # Only proceed if this is not the global section.      # Only proceed if this is not the global section.
109      if (! $self->global()) {      if (! $self->global()) {
110          # Get the section ID.          # Get the section ID.
111          my $genomeID = $self->section();          my $genomeID = $self->section();
112            MemTrace("Starting section $genomeID.") if T(ERDBLoadGroup => 3);
113          # Connect to the ontology database.          # Connect to the ontology database.
114          my $sqlite_db = "/home/mkubal/Temp/Ontology/ontology.sqlite";          my $sqlite_db = "/home/mkubal/Temp/Ontology/ontology.sqlite";
115          my $ontology_dbmaster = DBMaster->new(-database => $sqlite_db, -backend => 'SQLite');          my $ontology_dbmaster = DBMaster->new(-database => $sqlite_db, -backend => 'SQLite');
116            # This is our master hash of FIG IDs to aliases.
117            my %aliasMasterHash;
118            # Open this genome's alias file. The alias files are created by the AliasCrunch
119            # script.
120            my $aliasFile = "$loadDirectory/alias.$genomeID.tbl";
121            if (! -f $aliasFile) {
122                Trace("No aliases found for $genomeID.") if T(ERDBLoadGroup => 1);
123            } else {
124                my $aliasH = Open(undef, "<$aliasFile");
125                while (! eof $aliasH) {
126                    my ($aliasFid, $aliasID, $aliasType, $aliasConf) = Tracer::GetLine($aliasH);
127                    push @{$aliasMasterHash{$aliasFid}}, [$aliasID, $aliasType, $aliasConf];
128                }
129                close $aliasH;
130                MemTrace("Aliases adjusted.") if T(ERDBLoadGroup => 3);
131            }
132          # Get the maximum sequence size. We need this later for splitting up the          # Get the maximum sequence size. We need this later for splitting up the
133          # locations.          # locations.
134          my $chunkSize = $sprout->MaxSegment();          my $chunkSize = $sprout->MaxSegment();
135          Trace("Loading features for genome $genomeID.") if T(ERDBLoadGroup => 3);          MemTrace("Loading features for genome $genomeID.") if T(ERDBLoadGroup => 3);
136          # Get the feature list for this genome.          # Get the feature list for this genome.
137          my $features = $fig->all_features_detailed_fast($genomeID);          my $features = $fig->all_features_detailed_fast($genomeID);
138          # Sort and count the list.          # Sort and count the list.
139          my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};          my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
140          my $count = scalar @featureTuples;          my $count = scalar @featureTuples;
141          Trace("$count features found for genome $genomeID.") if T(ERDBLoadGroup => 3);          MemTrace("$count features found for genome $genomeID.") if T(ERDBLoadGroup => 3);
142          # Get the attributes for this genome and put them in a hash by feature ID.          # Get the attributes for this genome and put them in a hash by feature ID.
143          my $attributes = $self->GetGenomeAttributes($genomeID, \@featureTuples);          my $attributes = $self->GetGenomeAttributes($genomeID, \@featureTuples);
144          Trace("Looping through features for $genomeID.") if T(ERDBLoadGroup => 3);          Trace("Looping through features for $genomeID.") if T(ERDBLoadGroup => 3);
145          # Loop through the features.          # Loop through the features.
146          for my $featureTuple (@featureTuples) {          for my $featureTuple (@featureTuples) {
147              # Split the tuple.              # Split the tuple.
148              my ($featureID, $locations, $aliases, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};              my ($featureID, $locations, $aliases, $type, $minloc, $maxloc, $assignment,
149                    $user, $quality) = @{$featureTuple};
150              # Make sure this feature is active.              # Make sure this feature is active.
151              if (! $fig->is_deleted_fid($featureID)) {              if (! $fig->is_deleted_fid($featureID)) {
152                  # Handle missing assignments.                  # Handle missing assignments.
# Line 149  Line 171 
171                  # feature ID, the taxonomy, and the organism name.                  # feature ID, the taxonomy, and the organism name.
172                  my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),                  my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
173                                  $fig->taxonomy_of($genomeID));                                  $fig->taxonomy_of($genomeID));
174                  # We need to insure we don't put multiple copies of the same alias                  # Next come the aliases. We put all aliases found in this hash.
175                  # in the keyword list, so we'll put all aliases found in this hash.                  # They will be output as alias names and as keywords.
176                  my %aliasHash;                  my %aliasHash;
177                  # We need a list of all the aliases. These come from two sources: one                  # Note the trick here to insure that we have a list reference even
178                  # from the annotation clearinghouse, and one from the detailed features                  # if this feature isn't in the alias table.
179                  # list.                  my $aliasList = $aliasMasterHash{$featureID} || [];
180                  my @rawAliasList = split /,/, $aliases;                  # Loop through this feature ID's aliases.
181                  # Only PEGs appear in the clearinghouse.                  for my $aliasTuple (@$aliasList) {
182                  if ($type eq 'peg') {                      my ($aliasID, $aliasType, $aliasConf) = @$aliasTuple;
183                      push @rawAliasList, AliasAnalysis::QueryACLH($fig, $featureID);                      # Only proceed if this alias is new.
184                  }                      if (! exists $aliasHash{$aliasID}) {
185                  # Loop through the aliases, recording the normal and natural forms/                          # Save this alias.
186                  for my $alias (@rawAliasList) {                          $aliasHash{$aliasID} = 1;
187                      # Save the alias.                          # Get its natural form.
188                      $aliasHash{$alias} = 1;                          my $natural = AliasAnalysis::Type($aliasType => $aliasID);
189                      # Check for a natural form.                          # Only proceed if a natural form exists.
190                      my $natural = AliasAnalysis::Format(natural => $alias);                          if ($natural) {
191                      if (defined $natural) {                              $self->Add(miscAlias => 1);
192                          # A natural form was found, so we add it to the alias                              # Save the natural form.
                         # list.  
193                          $aliasHash{$natural} = 1;                          $aliasHash{$natural} = 1;
194                      } elsif ($alias =~ /^[A-Z]{3}\d+$/) {                              # Is this a corresponding ID?
195                          # Here it's not a recognized type, but it's probably a                              if ($aliasConf eq 'A') {
196                          # locus tag, so we create a prefixed version.                                  # Yes. Connect its natural form to the feature.
197                          my $normalized = AliasAnalysis::Normalize(LocusTag => $alias);                                  $self->PutR(IsAlsoFoundIn => $featureID, $aliasType,
198                          $aliasHash{$normalized} = 1;                                              alias => $natural);
199                      }                                  $self->PutE(ExternalDatabase => $aliasType);
200                  }                              }
                 # Add the corresponding IDs. We ask for 2-tuples of the form (id, database).  
                 my @corresponders = $fig->get_corresponding_ids($featureID, 1);  
                 for my $tuple (@corresponders) {  
                     my ($id, $xdb) = @{$tuple};  
                     # Ignore SEED: that's us. Also ignore contig IDs. Those result from a bug  
                     # at PIR.  
                     if ($xdb ne 'SEED' && ! ($xdb eq 'RefSeq' && $id =~ /^[A-Z][A-Z]_\d+$/)) {  
                         # Connect this ID to the feature and mark its database.  
                         $self->PutR(IsAlsoFoundIn => $featureID, $xdb,  
                                    alias => $id);  
                         $self->PutE(ExternalDatabase => $xdb);  
                         # Compute the ID's normalized form.  
                         my $normalized = AliasAnalysis::Normalize($xdb => $id);  
                         # Add both to the alias hash.  
                         $aliasHash{$id} = 1;  
                         $aliasHash{$normalized} = 1;  
201                      }                      }
202                  }                  }
203                  # Create the aliases.                  }
204                    # Create the aliases and put them in the keyword list.
205                  for my $alias (sort keys %aliasHash) {                  for my $alias (sort keys %aliasHash) {
206                      # Connect this alias to this feature and make an Alias record for it.                      # Connect this alias to this feature and make an Alias record for it.
207                      $self->PutR(IsAliasOf => $alias, $featureID);                      $self->PutR(IsAliasOf => $alias, $featureID);
# Line 507  Line 513 
513  }  }
514    
515    
 =head3 AliasFix  
   
     $sl->AliasFix($orgName, $fidArray);  
   
 Ask the %FIG{Annotation Clearinghouse}% for additional aliases of the  
 features in the specified array. The array should be a result array from  
 the [[FigPm]] method C<all_features_detailed_fast>. The first column of  
 the array contains feature IDs. The third column contains a  
 comma-delimited list of aliases. This method modifies the third column so  
 that it contains any additional aliases available from the Clearinghouse.  
   
 =over 4  
   
 =item orgName  
   
 Organism name for the current genome.  
   
 =item fidArray  
   
 Reference to a list of feature data. Each feature is represented by an n-tuple  
 in the array: the tuple's first element is the feature ID, and the third element  
 (which we will be modifying) is a comma-delimited list of aliases.  
   
 =back  
   
 =cut  
   
 sub AliasFix {  
     # Get the parameters.  
     my ($self, $orgName, $fidArray) = @_;  
     # To control the cost of this operation, we process the features in  
     # batches. The batch size is controlled by a FIG_Config parameter.  
     my $fidCount = scalar @$fidArray;  
     my $fidNext;  
     for (my $fidIdx = 0; $fidIdx < $fidCount; $fidIdx = $fidNext) {  
         # Compute the index of the first feature in the next batch.  
         $fidNext = $fidIdx + $FIG_Config::ach_fixup_batch_size;  
         $fidNext = $fidCount if $fidNext > $fidCount;  
         Trace("Processing ACH feature batch from $fidIdx to $fidNext.") if T(3);  
         # The hash below will map the IDs of the features in this batch  
         # to the alias strings we want to fix up.  
         my %batch;  
         for (my $i = $fidIdx; $i < $fidNext; $i++) {  
             # Get this feature ID.  
             my $fid = $fidArray->[$i][0];  
             # Only proceed if it's a PEG.  
             if ($fid =~ /peg/) {  
                 # It is, so put it in the batch hash. The  
                 # value is a reference to the PEG's alias string.  
                 $batch{$fid} = \$fidArray->[$i][2];  
             }  
         }  
         # Now we have our batch. Ask the clearinghouse for any data it might  
         # have about these FIDs.  
         my $resp = SOAP::Lite->uri($FIG_Config::ach_soap)->proxy($FIG_Config::ach_proxy)  
                              ->get_annotations([keys %batch]);  
         Trace("Processing results from clearinghouse.") if T(3);  
         # Insure we got a result.  
         if (! $resp) {  
             Confess("No response from Clearinghouse.");  
         } elsif ($resp->fault) {  
             Confess("Error requesting alias data from Clearinghouse: " . $resp->faultstring);  
         } else {  
             # Extract the result. It will be a hash mapping FIDs to tuple arrays.  
             my $respHash = $resp->result;  
             # Loop through the result hash.  
             for my $respFid (keys %$respHash) {  
                 # Get a reference to the alias string.  
                 my $pointer = $batch{$respFid};  
                 # Build a hash of the existing aliases.  
                 my %aliasHash = map { $_ => 1 } split /,/, $$pointer;  
                 # Add aliases from the clearinghouse response.  
                 my $tuples = $respHash->{$respFid};  
                 my @newAliases = AliasAnalysis::AnalyzeClearinghouseArray($orgName,  
                                                                           $tuples);  
                 for my $newAlias (@newAliases) {  
                     if (! exists $aliasHash{$newAlias}) {  
                         $aliasHash{$newAlias} = 1;  
                         $self->Add(ACLHaliases => 1);  
                     }  
                 }  
                 # Save the result.  
                 $$pointer = join(",", sort keys %aliasHash);  
             }  
         }  
     }  
 }  
   
   
516  =head3 SpecialAttribute  =head3 SpecialAttribute
517    
518      my $count = $sl->SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $tableName, $field);      my $count = $sl->SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $tableName, $field);
# Line 676  Line 593 
593      return $retVal;      return $retVal;
594  }  }
595    
596    
597  1;  1;

Legend:
Removed from v.1.8  
changed lines
  Added in v.1.9

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3