[Bio] / Sprout / FeatureSproutLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/FeatureSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package FeatureSproutLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use BioWords;
26 :     use AliasAnalysis;
27 :     use base 'BaseSproutLoader';
28 :    
29 :     =head1 Sprout Feature Load Group Class
30 :    
31 :     =head2 Introduction
32 :    
33 :     The Feature Load Group includes all of the major feature-related tables.
34 :    
35 :     =head3 new
36 :    
37 :     my $sl = BaseSproutLoader->new($erdb, $source, $options, @tables);
38 :    
39 :     Construct a new BaseSproutLoader object.
40 :    
41 :     =over 4
42 :    
43 :     =item erdb
44 :    
45 :     [[SproutPm]] object for the database being loaded.
46 :    
47 :     =item source
48 :    
49 :     [[FigPm]] object used to access the source data.
50 :    
51 :     =item options
52 :    
53 :     Reference to a hash of command-line options.
54 :    
55 :     =item tables
56 :    
57 :     List of tables in this load group.
58 :    
59 :     =back
60 :    
61 :     =cut
62 :    
63 :     sub new {
64 :     # Get the parameters.
65 :     my ($class, $erdb, $source, $options) = @_;
66 :     # Create the table list.
67 :     my @tables = sort qw(Feature IsLocatedIn FeatureAlias IsAliasOf FeatureLink
68 :     FeatureTranslation FeatureUpstream HasFeature HasRoleInSubsystem
69 :     FeatureEssential FeatureVirulent FeatureIEDB CDD IsPresentOnProteinOf
70 :     CellLocation IsPossiblePlaceFor IsAlsoFoundIn ExternalDatabase Keyword);
71 :     # Create the BaseSproutLoader object.
72 :     my $retVal = BaseSproutLoader::new($class, $erdb, $source, $options, @tables);
73 : parrello 1.2 # Get the list of relevant attributes.
74 : parrello 1.1 # Bless and return it.
75 :     bless $retVal, $class;
76 :     return $retVal;
77 :     }
78 :    
79 :     =head2 Public Methods
80 :    
81 :     =head3 Generate
82 :    
83 :     $sl->Generate();
84 :    
85 :     Generate the data for the feature-related files.
86 :    
87 :     =cut
88 :    
89 :     sub Generate {
90 :     # Get the parameters.
91 :     my ($self) = @_;
92 :     # Get the section ID.
93 :     my $genomeID = $self->section();
94 :     # Get the sprout object.
95 :     my $sprout = $self->db();
96 :     # Get the FIG object.
97 :     my $fig = $self->source();
98 :     # Get the subsystem list.
99 :     my $subHash = $self->GetSubsystems();
100 :     # Get the word stemmer.
101 :     my $stemmer = $sprout->GetStemmer();
102 :     # Only proceed if this is not the global section.
103 :     if (! $self->global()) {
104 :     # Get the maximum sequence size. We need this later for splitting up the
105 :     # locations.
106 :     my $chunkSize = $sprout->MaxSegment();
107 :     Trace("Loading features for genome $genomeID.") if T(ERDBLoadGroup => 3);
108 :     # Get the feature list for this genome.
109 :     my $features = $fig->all_features_detailed_fast($genomeID);
110 :     # Sort and count the list.
111 :     my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
112 :     my $count = scalar @featureTuples;
113 :     Trace("$count features found for genome $genomeID.") if T(ERDBLoadGroup => 3);
114 :     # Get the attributes for this genome and put them in a hash by feature ID.
115 :     my $attributes = $self->GetGenomeAttributes($genomeID);
116 :     Trace("Looping through features for $genomeID.") if T(ERDBLoadGroup => 3);
117 :     # Loop through the features.
118 :     for my $featureTuple (@featureTuples) {
119 :     # Split the tuple.
120 :     my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
121 :     # Make sure this feature is active.
122 :     if (! $fig->is_deleted_fid($featureID)) {
123 :     # Handle missing assignments.
124 :     if (! defined $assignment) {
125 :     $assignment = '';
126 :     $user = '';
127 :     } else {
128 :     # The default assignment-maker is FIG.
129 :     $user ||= 'fig';
130 :     }
131 :     # Count this feature.
132 :     $self->Track(features => $featureID, 1000);
133 :     # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
134 :     # Sprout database requires a single character.
135 :     if (! defined($quality) || $quality eq "") {
136 :     $quality = " ";
137 :     }
138 :     # Begin building the keywords. We start with the genome ID, the
139 :     # feature ID, the taxonomy, and the organism name.
140 :     my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
141 :     $fig->taxonomy_of($genomeID));
142 :     # Create the aliases.
143 :     for my $alias ($fig->feature_aliases($featureID)) {
144 :     # Connect this alias to this feature and make an Alias record for it.
145 :     $self->Put('IsAliasOf', 'from-link' => $alias, 'to-link' => $featureID);
146 :     $self->Put('FeatureAlias', id => $alias);
147 :     # Add it to the keyword list.
148 :     push @keywords, $alias;
149 :     # If this is a locus tag, also add its natural form as a keyword.
150 :     my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
151 :     if ($naturalName) {
152 :     push @keywords, $naturalName;
153 :     }
154 :     }
155 :     # Add the corresponding IDs. We ask for 2-tuples of the form (id, database).
156 :     my @corresponders = $fig->get_corresponding_ids($featureID, 1);
157 :     for my $tuple (@corresponders) {
158 :     my ($id, $xdb) = @{$tuple};
159 :     # Ignore SEED: that's us.
160 :     if ($xdb ne 'SEED') {
161 :     # Connect this ID to the feature and mark its database.
162 :     $self->Put('IsAlsoFoundIn', 'from-link' => $featureID, 'to-link' => $xdb,
163 :     alias => $id);
164 :     $self->Put('ExternalDatabase', id => $xdb);
165 :     # Add it as a keyword.
166 :     push @keywords, $id;
167 :     }
168 :     }
169 :     Trace("Assignment for $featureID is: $assignment") if T(ERDBLoadGroup => 4);
170 :     # Break the assignment into words and shove it onto the
171 :     # keyword list.
172 :     push @keywords, split(/\s+/, $assignment);
173 :     # Link this feature to the parent genome.
174 :     $self->Put('HasFeature', 'from-link' => $genomeID, 'to-link' => $featureID,
175 :     type => $type);
176 :     # Get the links.
177 :     my @links = $fig->fid_links($featureID);
178 :     for my $link (@links) {
179 :     $self->Put('FeatureLink', id => $featureID, link => $link);
180 :     }
181 :     # If this is a peg, generate the translation and the upstream.
182 :     if ($type eq 'peg') {
183 :     $self->Add(pegIn => 1);
184 :     my $translation = $fig->get_translation($featureID);
185 :     if ($translation) {
186 :     $self->Put('FeatureTranslation', id => $featureID,
187 :     translation => $translation);
188 :     }
189 :     # We use the default upstream values of u=200 and c=100.
190 :     my $upstream = $fig->upstream_of($featureID, 200, 100);
191 :     if ($upstream) {
192 :     $self->Put('FeatureUpstream', id => $featureID,
193 :     'upstream-sequence' => $upstream);
194 :     }
195 :     }
196 :     # Now we need to find the subsystems this feature participates in.
197 :     my @ssList = $fig->subsystems_for_peg($featureID);
198 :     # This hash prevents us from adding the same subsystem twice.
199 :     my %seen = ();
200 :     for my $ssEntry (@ssList) {
201 :     # Get the subsystem and role.
202 :     my ($subsystem, $role) = @{$ssEntry};
203 :     # Only proceed if we like this subsystem.
204 :     if (exists $subHash->{$subsystem}) {
205 :     # If this is the first time we've seen this subsystem for
206 :     # this peg, store the has-role link.
207 :     if (! $seen{$subsystem}) {
208 :     $self->Put('HasRoleInSubsystem', 'from-link' => $featureID,
209 :     'to-link' => $subsystem, genome => $genomeID,
210 :     type => $type);
211 :     # Save the subsystem's keyword data.
212 :     push @keywords, split /[\s_]+/, $subsystem;
213 :     }
214 :     # Now add the role to the keyword list.
215 :     push @keywords, split /\s+/, $role;
216 :     }
217 :     }
218 :     # There are three special attributes computed from property
219 :     # data that we build next. If the special attribute is non-empty,
220 :     # its name will be added to the keyword list. First, we get all
221 :     # the attributes for this feature. They will come back as
222 :     # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
223 :     # [name, value, value with URL]. (We don't need the PEG, since
224 :     # we already know it.)
225 :     my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
226 :     @{$attributes->{$featureID}};
227 :     # Now we process each of the special attributes.
228 :     if ($self->SpecialAttribute($featureID, \@attributes,
229 :     1, [0,2], '^(essential|potential_essential)$',
230 :     qw(FeatureEssential essential))) {
231 :     push @keywords, 'essential';
232 :     $self->Add(essential => 1);
233 :     }
234 :     if ($self->SpecialAttribute($featureID, \@attributes,
235 :     0, [2], '^virulen',
236 :     qw(FeatureVirulent virulent))) {
237 :     push @keywords, 'virulent';
238 :     $self->Add(virulent => 1);
239 :     }
240 :     if ($self->SpecialAttribute($featureID, \@attributes,
241 :     0, [0,2], '^iedb_',
242 :     qw(FeatureIEDB iedb))) {
243 :     push @keywords, 'iedb';
244 :     $self->Add(iedb => 1);
245 :     }
246 :     # Now we have some other attributes we need to process. To get
247 :     # through them, we convert the attribute list for this feature
248 :     # into a two-layer hash: key => subkey => value.
249 :     my %attributeHash = ();
250 :     for my $attrRow (@{$attributes->{$featureID}}) {
251 :     my (undef, $key, @values) = @{$attrRow};
252 :     my ($realKey, $subKey);
253 :     if ($key =~ /^([^:]+)::(.+)/) {
254 :     ($realKey, $subKey) = ($1, $2);
255 :     } else {
256 :     ($realKey, $subKey) = ($key, "");
257 :     }
258 :     if (exists $attributeHash{$realKey}) {
259 :     $attributeHash{$realKey}->{$subKey} = \@values;
260 :     } else {
261 :     $attributeHash{$realKey} = {$subKey => \@values};
262 :     }
263 :     }
264 :     # First we handle CDD. This is a bit complicated, because
265 :     # there are multiple CDDs per protein.
266 :     if (exists $attributeHash{CDD}) {
267 :     # Get the hash of CDD IDs to scores for this feature. We
268 :     # already know it exists because of the above IF.
269 :     my $cddHash = $attributeHash{CDD};
270 :     my @cddData = sort keys %{$cddHash};
271 :     for my $cdd (@cddData) {
272 :     # Extract the score for this CDD and decode it.
273 :     my ($codeScore) = split(/\s*[,;]\s*/, $cddHash->{$cdd}->[0]);
274 :     my $realScore = FIGRules::DecodeScore($codeScore);
275 :     # We can't afford to crash because of a bad attribute
276 :     # value, hence the IF below.
277 :     if (! defined($realScore)) {
278 :     # Bad score, so count it.
279 :     $self->Add(badCDDscore => 1);
280 :     Trace("CDD score \"$codeScore\" for feature $featureID invalid.") if T(ERDBLoadGroup => 3);
281 :     } else {
282 :     # Create the connection and a CDD record.
283 :     $self->Put('IsPresentOnProteinOf', 'from-link' => $cdd,
284 :     'to-link' => $featureID, score => $realScore);
285 :     $self->Put('CDD', id => $cdd);
286 :     }
287 :     }
288 :     }
289 :     # Next we do PSORT cell locations. here the confidence value
290 :     # could have the value "unknown", which we translate to -1.
291 :     if (exists $attributeHash{PSORT}) {
292 :     # This will be a hash of cell locations to confidence
293 :     # factors.
294 :     my $psortHash = $attributeHash{PSORT};
295 :     for my $psort (keys %{$psortHash}) {
296 :     # Get the confidence, and convert it to a number if necessary.
297 :     my $confidence = $psortHash->{$psort};
298 :     if ($confidence eq 'unknown') {
299 :     $confidence = -1;
300 :     }
301 :     $self->Put('IsPossiblePlaceFor', 'from-link' => $psort,
302 :     'to-link' => $featureID, confidence => $confidence);
303 :     $self->Put('CellLocation', id => $psort);
304 :     # If this is a significant location, add it as a keyword.
305 :     if ($confidence > 2.5) {
306 :     push @keywords, $psort;
307 :     }
308 :     }
309 :     }
310 :     # Phobius data is next. This consists of the signal peptide location and
311 :     # the transmembrane locations.
312 :     my $signalList = "";
313 :     my $transList = "";
314 :     if (exists $attributeHash{Phobius}) {
315 :     # This will be a hash of two keys (transmembrane and signal) to
316 :     # location strings. If there's no value, we stuff in an empty string.
317 :     $signalList = $self->GetCommaList($attributeHash{Phobius}->{signal});
318 :     $transList = $self->GetCommaList($attributeHash{Phobius}->{transmembrane});
319 :     }
320 :     # Here are some more numbers: isoelectric point, molecular weight, and
321 :     # the similar-to-human flag.
322 :     my $isoelectric = 0;
323 :     if (exists $attributeHash{isoelectric_point}) {
324 :     $isoelectric = $attributeHash{isoelectric_point}->{""};
325 :     }
326 :     my $similarToHuman = 0;
327 :     if (exists $attributeHash{similar_to_human} && $attributeHash{similar_to_human}->{""} eq 'yes') {
328 :     $similarToHuman = 1;
329 :     }
330 :     my $molecularWeight = 0;
331 :     if (exists $attributeHash{molecular_weight}) {
332 :     $molecularWeight = $attributeHash{molecular_weight}->{""};
333 :     }
334 :     # Join the keyword string.
335 :     my $keywordString = join(" ", @keywords);
336 :     # Get rid of annoying punctuation.
337 :     $keywordString =~ s/[();@#\/]/ /g;
338 :     # Get the list of keywords in the keyword string.
339 :     my @realKeywords = $stemmer->Split($keywordString);
340 :     # We need to do two things here: create the keyword string for the feature table
341 :     # and write records to the keyword table for the keywords.
342 :     my (%keys, %stems, @realStems);
343 :     for my $keyword (@realKeywords) {
344 :     # Compute the stem and phonex for this keyword.
345 :     my ($stem, $phonex) = $stemmer->StemLookup($keyword);
346 :     # Only proceed if a stem comes back. If no stem came back, it's a
347 :     # stop word and we throw it away.
348 :     if ($stem) {
349 :     $keys{$keyword} = $stem;
350 :     $stems{$stem} = $phonex;
351 :     push @realStems, $stem;
352 :     }
353 :     }
354 :     # Now create the keyword string.
355 :     my $cleanWords = join(" ", @realStems);
356 :     Trace("Keyword string for $featureID: $cleanWords") if T(ERDBLoadGroup => 4);
357 :     # Create keyword table entries for the keywords found.
358 :     for my $key (keys %keys) {
359 :     my $stem = $keys{$key};
360 :     $self->Put('Keyword', id => $key, stem => $stem, phonex => $stems{$stem});
361 :     }
362 :     # Now we need to process the feature's locations. First, we split them up.
363 :     my @locationList = split /\s*,\s*/, $locations;
364 :     # Next, we convert them to Sprout location objects.
365 :     my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
366 :     # Assemble them into a sprout location string for later.
367 :     my $locationString = join(", ", map { $_->String } @locObjectList);
368 :     # We'll store the sequence length in here.
369 :     my $sequenceLength = 0;
370 :     # This part is the roughest. We need to relate the features to contig
371 :     # locations, and the locations must be split so that none of them exceed
372 :     # the maximum segment size. This simplifies the genes_in_region processing
373 :     # for Sprout. To start, we create the location position indicator.
374 :     my $i = 1;
375 :     # Loop through the locations.
376 :     for my $locObject (@locObjectList) {
377 :     # Record the length.
378 :     $sequenceLength += $locObject->Length;
379 :     # Split this location into a list of chunks.
380 :     my @locOList = ();
381 :     while (my $peeling = $locObject->Peel($chunkSize)) {
382 :     $self->Add(peeling => 1);
383 :     push @locOList, $peeling;
384 :     }
385 :     push @locOList, $locObject;
386 :     # Loop through the chunks, creating IsLocatedIn records. The variable
387 :     # "$i" will be used to keep the location index.
388 :     for my $locChunk (@locOList) {
389 :     $self->Put('IsLocatedIn', 'from-link' => $featureID,
390 :     'to-link' => $locChunk->Contig, beg => $locChunk->Left,
391 :     dir => $locChunk->Dir, len => $locChunk->Length,
392 :     locN => $i);
393 :     $i++;
394 :     }
395 :     }
396 :     # Now we get some ancillary flags.
397 :     my $locked = $fig->is_locked_fid($featureID);
398 :     my $in_genbank = $fig->peg_in_gendb($featureID);
399 :     # Create the feature record.
400 :     $self->Put('Feature', id => $featureID, 'assignment-maker' => $user,
401 :     'assignment-quality' => $quality, 'feature-type' => $type,
402 :     'in-genbank' => $in_genbank, 'isoelectric-point' => $isoelectric,
403 :     locked => $locked, 'molecular-weight' => $molecularWeight,
404 :     'sequence-length' => $sequenceLength,
405 :     'signal-peptide' => $signalList, 'similar-to-human' => $similarToHuman,
406 :     assignment => $assignment, keywords => $cleanWords,
407 :     'location-string' => $locationString,
408 :     'transmembrane-map' => $transList);
409 :     }
410 :     }
411 :     }
412 :     }
413 :    
414 :    
415 :     =head3 SpecialAttribute
416 :    
417 :     my $count = $sl->SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $tableName, $field);
418 :    
419 :     Look for special attributes of a given type. A special attribute is found by comparing one of
420 :     the columns of the incoming attribute list to a search pattern. If a match is found, then
421 :     a set of columns is put into an output table connected to the specified ID.
422 :    
423 :     For example, when processing features, the attribute list we look at has three columns: attribute
424 :     name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
425 :     begins with C<iedb_>. The call signature is therefore
426 :    
427 :     my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', 'FeatureIEDB', 'iedb');
428 :    
429 :     The pattern is matched against column 0, and if we have a match, then column 2's value is put
430 :     to the output along with the specified feature ID.
431 :    
432 :     =over 4
433 :    
434 :     =item id
435 :    
436 :     ID of the object whose special attributes are being loaded. This forms the first column of the
437 :     output.
438 :    
439 :     =item attributes
440 :    
441 :     Reference to a list of tuples.
442 :    
443 :     =item idxMatch
444 :    
445 :     Index in each tuple of the column to be matched against the pattern. If the match is
446 :     successful, an output record will be generated.
447 :    
448 :     =item idxValues
449 :    
450 :     Reference to a list containing the indexes in each tuple of the columns to be put as
451 :     the second column of the output.
452 :    
453 :     =item pattern
454 :    
455 :     Pattern to be matched against the specified column. The match will be case-insensitive.
456 :    
457 :     =item tableName
458 :    
459 :     Name of the table to contain the attribute values found.
460 :    
461 :     =item fieldName
462 :    
463 :     Name of the field to contain the attribute values in the output table.
464 :    
465 :     =item RETURN
466 :    
467 :     Returns a count of the matches found.
468 :    
469 :     =item
470 :    
471 :     =back
472 :    
473 :     =cut
474 :    
475 :     sub SpecialAttribute {
476 :     # Get the parameters.
477 :     my ($self, $id, $attributes, $idxMatch, $idxValues, $pattern, $tableName, $fieldName) = @_;
478 :     # Declare the return variable.
479 :     my $retVal = 0;
480 :     # Loop through the attribute rows.
481 :     for my $row (@{$attributes}) {
482 :     # Check for a match.
483 :     if ($row->[$idxMatch] =~ m/$pattern/i) {
484 :     # We have a match, so output a row. This is a bit tricky, since we may
485 :     # be putting out multiple columns of data from the input. We join
486 :     # the columns together into a single space-delimited string.
487 :     my $value = join(" ", map { $row->[$_] } @{$idxValues});
488 :     $self->Put($tableName, id => $id, $fieldName => $value);
489 :     $retVal++;
490 :     }
491 :     }
492 :     Trace("$retVal special attributes found for $id and table $tableName.") if T(ERDBLoadGroup => 4) && $retVal;
493 :     # Return the number of matches.
494 :     return $retVal;
495 :     }
496 :    
497 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3