[Bio] / Sprout / FeatureSproutLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/FeatureSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package FeatureSproutLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use BioWords;
26 :     use AliasAnalysis;
27 :     use base 'BaseSproutLoader';
28 :    
29 :     =head1 Sprout Feature Load Group Class
30 :    
31 :     =head2 Introduction
32 :    
33 :     The Feature Load Group includes all of the major feature-related tables.
34 :    
35 :     =head3 new
36 :    
37 :     my $sl = BaseSproutLoader->new($erdb, $source, $options, @tables);
38 :    
39 :     Construct a new BaseSproutLoader object.
40 :    
41 :     =over 4
42 :    
43 :     =item erdb
44 :    
45 :     [[SproutPm]] object for the database being loaded.
46 :    
47 :     =item source
48 :    
49 :     [[FigPm]] object used to access the source data.
50 :    
51 :     =item options
52 :    
53 :     Reference to a hash of command-line options.
54 :    
55 :     =item tables
56 :    
57 :     List of tables in this load group.
58 :    
59 :     =back
60 :    
61 :     =cut
62 :    
63 :     sub new {
64 :     # Get the parameters.
65 :     my ($class, $erdb, $source, $options) = @_;
66 :     # Create the table list.
67 :     my @tables = sort qw(Feature IsLocatedIn FeatureAlias IsAliasOf FeatureLink
68 :     FeatureTranslation FeatureUpstream HasFeature HasRoleInSubsystem
69 :     FeatureEssential FeatureVirulent FeatureIEDB CDD IsPresentOnProteinOf
70 :     CellLocation IsPossiblePlaceFor IsAlsoFoundIn ExternalDatabase Keyword);
71 :     # Create the BaseSproutLoader object.
72 :     my $retVal = BaseSproutLoader::new($class, $erdb, $source, $options, @tables);
73 :     # Bless and return it.
74 :     bless $retVal, $class;
75 :     return $retVal;
76 :     }
77 :    
78 :     =head2 Public Methods
79 :    
80 :     =head3 Generate
81 :    
82 :     $sl->Generate();
83 :    
84 :     Generate the data for the feature-related files.
85 :    
86 :     =cut
87 :    
88 :     sub Generate {
89 :     # Get the parameters.
90 :     my ($self) = @_;
91 :     # Get the section ID.
92 :     my $genomeID = $self->section();
93 :     # Get the sprout object.
94 :     my $sprout = $self->db();
95 :     # Get the FIG object.
96 :     my $fig = $self->source();
97 :     # Get the subsystem list.
98 :     my $subHash = $self->GetSubsystems();
99 :     # Get the word stemmer.
100 :     my $stemmer = $sprout->GetStemmer();
101 :     # Only proceed if this is not the global section.
102 :     if (! $self->global()) {
103 :     # Get the maximum sequence size. We need this later for splitting up the
104 :     # locations.
105 :     my $chunkSize = $sprout->MaxSegment();
106 :     Trace("Loading features for genome $genomeID.") if T(ERDBLoadGroup => 3);
107 :     # Get the feature list for this genome.
108 :     my $features = $fig->all_features_detailed_fast($genomeID);
109 :     # Sort and count the list.
110 :     my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
111 :     my $count = scalar @featureTuples;
112 :     Trace("$count features found for genome $genomeID.") if T(ERDBLoadGroup => 3);
113 :     # Get the attributes for this genome and put them in a hash by feature ID.
114 :     my $attributes = $self->GetGenomeAttributes($genomeID);
115 :     Trace("Looping through features for $genomeID.") if T(ERDBLoadGroup => 3);
116 :     # Loop through the features.
117 :     for my $featureTuple (@featureTuples) {
118 :     # Split the tuple.
119 :     my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
120 :     # Make sure this feature is active.
121 :     if (! $fig->is_deleted_fid($featureID)) {
122 :     # Handle missing assignments.
123 :     if (! defined $assignment) {
124 :     $assignment = '';
125 :     $user = '';
126 :     } else {
127 :     # The default assignment-maker is FIG.
128 :     $user ||= 'fig';
129 :     }
130 :     # Count this feature.
131 :     $self->Track(features => $featureID, 1000);
132 :     # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
133 :     # Sprout database requires a single character.
134 :     if (! defined($quality) || $quality eq "") {
135 :     $quality = " ";
136 :     }
137 :     # Begin building the keywords. We start with the genome ID, the
138 :     # feature ID, the taxonomy, and the organism name.
139 :     my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
140 :     $fig->taxonomy_of($genomeID));
141 :     # Create the aliases.
142 :     for my $alias ($fig->feature_aliases($featureID)) {
143 :     # Connect this alias to this feature and make an Alias record for it.
144 :     $self->Put('IsAliasOf', 'from-link' => $alias, 'to-link' => $featureID);
145 :     $self->Put('FeatureAlias', id => $alias);
146 :     # Add it to the keyword list.
147 :     push @keywords, $alias;
148 :     # If this is a locus tag, also add its natural form as a keyword.
149 :     my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
150 :     if ($naturalName) {
151 :     push @keywords, $naturalName;
152 :     }
153 :     }
154 :     # Add the corresponding IDs. We ask for 2-tuples of the form (id, database).
155 :     my @corresponders = $fig->get_corresponding_ids($featureID, 1);
156 :     for my $tuple (@corresponders) {
157 :     my ($id, $xdb) = @{$tuple};
158 :     # Ignore SEED: that's us.
159 :     if ($xdb ne 'SEED') {
160 :     # Connect this ID to the feature and mark its database.
161 :     $self->Put('IsAlsoFoundIn', 'from-link' => $featureID, 'to-link' => $xdb,
162 :     alias => $id);
163 :     $self->Put('ExternalDatabase', id => $xdb);
164 :     # Add it as a keyword.
165 :     push @keywords, $id;
166 :     }
167 :     }
168 :     Trace("Assignment for $featureID is: $assignment") if T(ERDBLoadGroup => 4);
169 :     # Break the assignment into words and shove it onto the
170 :     # keyword list.
171 :     push @keywords, split(/\s+/, $assignment);
172 :     # Link this feature to the parent genome.
173 :     $self->Put('HasFeature', 'from-link' => $genomeID, 'to-link' => $featureID,
174 :     type => $type);
175 :     # Get the links.
176 :     my @links = $fig->fid_links($featureID);
177 :     for my $link (@links) {
178 :     $self->Put('FeatureLink', id => $featureID, link => $link);
179 :     }
180 :     # If this is a peg, generate the translation and the upstream.
181 :     if ($type eq 'peg') {
182 :     $self->Add(pegIn => 1);
183 :     my $translation = $fig->get_translation($featureID);
184 :     if ($translation) {
185 :     $self->Put('FeatureTranslation', id => $featureID,
186 :     translation => $translation);
187 :     }
188 :     # We use the default upstream values of u=200 and c=100.
189 :     my $upstream = $fig->upstream_of($featureID, 200, 100);
190 :     if ($upstream) {
191 :     $self->Put('FeatureUpstream', id => $featureID,
192 :     'upstream-sequence' => $upstream);
193 :     }
194 :     }
195 :     # Now we need to find the subsystems this feature participates in.
196 :     my @ssList = $fig->subsystems_for_peg($featureID);
197 :     # This hash prevents us from adding the same subsystem twice.
198 :     my %seen = ();
199 :     for my $ssEntry (@ssList) {
200 :     # Get the subsystem and role.
201 :     my ($subsystem, $role) = @{$ssEntry};
202 :     # Only proceed if we like this subsystem.
203 :     if (exists $subHash->{$subsystem}) {
204 :     # If this is the first time we've seen this subsystem for
205 :     # this peg, store the has-role link.
206 :     if (! $seen{$subsystem}) {
207 :     $self->Put('HasRoleInSubsystem', 'from-link' => $featureID,
208 :     'to-link' => $subsystem, genome => $genomeID,
209 :     type => $type);
210 :     # Save the subsystem's keyword data.
211 :     push @keywords, split /[\s_]+/, $subsystem;
212 :     }
213 :     # Now add the role to the keyword list.
214 :     push @keywords, split /\s+/, $role;
215 :     }
216 :     }
217 :     # There are three special attributes computed from property
218 :     # data that we build next. If the special attribute is non-empty,
219 :     # its name will be added to the keyword list. First, we get all
220 :     # the attributes for this feature. They will come back as
221 :     # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
222 :     # [name, value, value with URL]. (We don't need the PEG, since
223 :     # we already know it.)
224 :     my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
225 :     @{$attributes->{$featureID}};
226 :     # Now we process each of the special attributes.
227 :     if ($self->SpecialAttribute($featureID, \@attributes,
228 :     1, [0,2], '^(essential|potential_essential)$',
229 :     qw(FeatureEssential essential))) {
230 :     push @keywords, 'essential';
231 :     $self->Add(essential => 1);
232 :     }
233 :     if ($self->SpecialAttribute($featureID, \@attributes,
234 :     0, [2], '^virulen',
235 :     qw(FeatureVirulent virulent))) {
236 :     push @keywords, 'virulent';
237 :     $self->Add(virulent => 1);
238 :     }
239 :     if ($self->SpecialAttribute($featureID, \@attributes,
240 :     0, [0,2], '^iedb_',
241 :     qw(FeatureIEDB iedb))) {
242 :     push @keywords, 'iedb';
243 :     $self->Add(iedb => 1);
244 :     }
245 :     # Now we have some other attributes we need to process. To get
246 :     # through them, we convert the attribute list for this feature
247 :     # into a two-layer hash: key => subkey => value.
248 :     my %attributeHash = ();
249 :     for my $attrRow (@{$attributes->{$featureID}}) {
250 :     my (undef, $key, @values) = @{$attrRow};
251 :     my ($realKey, $subKey);
252 :     if ($key =~ /^([^:]+)::(.+)/) {
253 :     ($realKey, $subKey) = ($1, $2);
254 :     } else {
255 :     ($realKey, $subKey) = ($key, "");
256 :     }
257 :     if (exists $attributeHash{$realKey}) {
258 :     $attributeHash{$realKey}->{$subKey} = \@values;
259 :     } else {
260 :     $attributeHash{$realKey} = {$subKey => \@values};
261 :     }
262 :     }
263 :     # First we handle CDD. This is a bit complicated, because
264 :     # there are multiple CDDs per protein.
265 :     if (exists $attributeHash{CDD}) {
266 :     # Get the hash of CDD IDs to scores for this feature. We
267 :     # already know it exists because of the above IF.
268 :     my $cddHash = $attributeHash{CDD};
269 :     my @cddData = sort keys %{$cddHash};
270 :     for my $cdd (@cddData) {
271 :     # Extract the score for this CDD and decode it.
272 :     my ($codeScore) = split(/\s*[,;]\s*/, $cddHash->{$cdd}->[0]);
273 :     my $realScore = FIGRules::DecodeScore($codeScore);
274 :     # We can't afford to crash because of a bad attribute
275 :     # value, hence the IF below.
276 :     if (! defined($realScore)) {
277 :     # Bad score, so count it.
278 :     $self->Add(badCDDscore => 1);
279 :     Trace("CDD score \"$codeScore\" for feature $featureID invalid.") if T(ERDBLoadGroup => 3);
280 :     } else {
281 :     # Create the connection and a CDD record.
282 :     $self->Put('IsPresentOnProteinOf', 'from-link' => $cdd,
283 :     'to-link' => $featureID, score => $realScore);
284 :     $self->Put('CDD', id => $cdd);
285 :     }
286 :     }
287 :     }
288 :     # Next we do PSORT cell locations. here the confidence value
289 :     # could have the value "unknown", which we translate to -1.
290 :     if (exists $attributeHash{PSORT}) {
291 :     # This will be a hash of cell locations to confidence
292 :     # factors.
293 :     my $psortHash = $attributeHash{PSORT};
294 :     for my $psort (keys %{$psortHash}) {
295 :     # Get the confidence, and convert it to a number if necessary.
296 :     my $confidence = $psortHash->{$psort};
297 :     if ($confidence eq 'unknown') {
298 :     $confidence = -1;
299 :     }
300 :     $self->Put('IsPossiblePlaceFor', 'from-link' => $psort,
301 :     'to-link' => $featureID, confidence => $confidence);
302 :     $self->Put('CellLocation', id => $psort);
303 :     # If this is a significant location, add it as a keyword.
304 :     if ($confidence > 2.5) {
305 :     push @keywords, $psort;
306 :     }
307 :     }
308 :     }
309 :     # Phobius data is next. This consists of the signal peptide location and
310 :     # the transmembrane locations.
311 :     my $signalList = "";
312 :     my $transList = "";
313 :     if (exists $attributeHash{Phobius}) {
314 :     # This will be a hash of two keys (transmembrane and signal) to
315 :     # location strings. If there's no value, we stuff in an empty string.
316 :     $signalList = $self->GetCommaList($attributeHash{Phobius}->{signal});
317 :     $transList = $self->GetCommaList($attributeHash{Phobius}->{transmembrane});
318 :     }
319 :     # Here are some more numbers: isoelectric point, molecular weight, and
320 :     # the similar-to-human flag.
321 :     my $isoelectric = 0;
322 :     if (exists $attributeHash{isoelectric_point}) {
323 :     $isoelectric = $attributeHash{isoelectric_point}->{""};
324 :     }
325 :     my $similarToHuman = 0;
326 :     if (exists $attributeHash{similar_to_human} && $attributeHash{similar_to_human}->{""} eq 'yes') {
327 :     $similarToHuman = 1;
328 :     }
329 :     my $molecularWeight = 0;
330 :     if (exists $attributeHash{molecular_weight}) {
331 :     $molecularWeight = $attributeHash{molecular_weight}->{""};
332 :     }
333 :     # Join the keyword string.
334 :     my $keywordString = join(" ", @keywords);
335 :     # Get rid of annoying punctuation.
336 :     $keywordString =~ s/[();@#\/]/ /g;
337 :     # Get the list of keywords in the keyword string.
338 :     my @realKeywords = $stemmer->Split($keywordString);
339 :     # We need to do two things here: create the keyword string for the feature table
340 :     # and write records to the keyword table for the keywords.
341 :     my (%keys, %stems, @realStems);
342 :     for my $keyword (@realKeywords) {
343 :     # Compute the stem and phonex for this keyword.
344 :     my ($stem, $phonex) = $stemmer->StemLookup($keyword);
345 :     # Only proceed if a stem comes back. If no stem came back, it's a
346 :     # stop word and we throw it away.
347 :     if ($stem) {
348 :     $keys{$keyword} = $stem;
349 :     $stems{$stem} = $phonex;
350 :     push @realStems, $stem;
351 :     }
352 :     }
353 :     # Now create the keyword string.
354 :     my $cleanWords = join(" ", @realStems);
355 :     Trace("Keyword string for $featureID: $cleanWords") if T(ERDBLoadGroup => 4);
356 :     # Create keyword table entries for the keywords found.
357 :     for my $key (keys %keys) {
358 :     my $stem = $keys{$key};
359 :     $self->Put('Keyword', id => $key, stem => $stem, phonex => $stems{$stem});
360 :     }
361 :     # Now we need to process the feature's locations. First, we split them up.
362 :     my @locationList = split /\s*,\s*/, $locations;
363 :     # Next, we convert them to Sprout location objects.
364 :     my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
365 :     # Assemble them into a sprout location string for later.
366 :     my $locationString = join(", ", map { $_->String } @locObjectList);
367 :     # We'll store the sequence length in here.
368 :     my $sequenceLength = 0;
369 :     # This part is the roughest. We need to relate the features to contig
370 :     # locations, and the locations must be split so that none of them exceed
371 :     # the maximum segment size. This simplifies the genes_in_region processing
372 :     # for Sprout. To start, we create the location position indicator.
373 :     my $i = 1;
374 :     # Loop through the locations.
375 :     for my $locObject (@locObjectList) {
376 :     # Record the length.
377 :     $sequenceLength += $locObject->Length;
378 :     # Split this location into a list of chunks.
379 :     my @locOList = ();
380 :     while (my $peeling = $locObject->Peel($chunkSize)) {
381 :     $self->Add(peeling => 1);
382 :     push @locOList, $peeling;
383 :     }
384 :     push @locOList, $locObject;
385 :     # Loop through the chunks, creating IsLocatedIn records. The variable
386 :     # "$i" will be used to keep the location index.
387 :     for my $locChunk (@locOList) {
388 :     $self->Put('IsLocatedIn', 'from-link' => $featureID,
389 :     'to-link' => $locChunk->Contig, beg => $locChunk->Left,
390 :     dir => $locChunk->Dir, len => $locChunk->Length,
391 :     locN => $i);
392 :     $i++;
393 :     }
394 :     }
395 :     # Now we get some ancillary flags.
396 :     my $locked = $fig->is_locked_fid($featureID);
397 :     my $in_genbank = $fig->peg_in_gendb($featureID);
398 :     # Create the feature record.
399 :     $self->Put('Feature', id => $featureID, 'assignment-maker' => $user,
400 :     'assignment-quality' => $quality, 'feature-type' => $type,
401 :     'in-genbank' => $in_genbank, 'isoelectric-point' => $isoelectric,
402 :     locked => $locked, 'molecular-weight' => $molecularWeight,
403 :     'sequence-length' => $sequenceLength,
404 :     'signal-peptide' => $signalList, 'similar-to-human' => $similarToHuman,
405 :     assignment => $assignment, keywords => $cleanWords,
406 :     'location-string' => $locationString,
407 :     'transmembrane-map' => $transList);
408 :     }
409 :     }
410 :     }
411 :     }
412 :    
413 :    
414 :     =head3 SpecialAttribute
415 :    
416 :     my $count = $sl->SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $tableName, $field);
417 :    
418 :     Look for special attributes of a given type. A special attribute is found by comparing one of
419 :     the columns of the incoming attribute list to a search pattern. If a match is found, then
420 :     a set of columns is put into an output table connected to the specified ID.
421 :    
422 :     For example, when processing features, the attribute list we look at has three columns: attribute
423 :     name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
424 :     begins with C<iedb_>. The call signature is therefore
425 :    
426 :     my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', 'FeatureIEDB', 'iedb');
427 :    
428 :     The pattern is matched against column 0, and if we have a match, then column 2's value is put
429 :     to the output along with the specified feature ID.
430 :    
431 :     =over 4
432 :    
433 :     =item id
434 :    
435 :     ID of the object whose special attributes are being loaded. This forms the first column of the
436 :     output.
437 :    
438 :     =item attributes
439 :    
440 :     Reference to a list of tuples.
441 :    
442 :     =item idxMatch
443 :    
444 :     Index in each tuple of the column to be matched against the pattern. If the match is
445 :     successful, an output record will be generated.
446 :    
447 :     =item idxValues
448 :    
449 :     Reference to a list containing the indexes in each tuple of the columns to be put as
450 :     the second column of the output.
451 :    
452 :     =item pattern
453 :    
454 :     Pattern to be matched against the specified column. The match will be case-insensitive.
455 :    
456 :     =item tableName
457 :    
458 :     Name of the table to contain the attribute values found.
459 :    
460 :     =item fieldName
461 :    
462 :     Name of the field to contain the attribute values in the output table.
463 :    
464 :     =item RETURN
465 :    
466 :     Returns a count of the matches found.
467 :    
468 :     =item
469 :    
470 :     =back
471 :    
472 :     =cut
473 :    
474 :     sub SpecialAttribute {
475 :     # Get the parameters.
476 :     my ($self, $id, $attributes, $idxMatch, $idxValues, $pattern, $tableName, $fieldName) = @_;
477 :     # Declare the return variable.
478 :     my $retVal = 0;
479 :     # Loop through the attribute rows.
480 :     for my $row (@{$attributes}) {
481 :     # Check for a match.
482 :     if ($row->[$idxMatch] =~ m/$pattern/i) {
483 :     # We have a match, so output a row. This is a bit tricky, since we may
484 :     # be putting out multiple columns of data from the input. We join
485 :     # the columns together into a single space-delimited string.
486 :     my $value = join(" ", map { $row->[$_] } @{$idxValues});
487 :     $self->Put($tableName, id => $id, $fieldName => $value);
488 :     $retVal++;
489 :     }
490 :     }
491 :     Trace("$retVal special attributes found for $id and table $tableName.") if T(ERDBLoadGroup => 4) && $retVal;
492 :     # Return the number of matches.
493 :     return $retVal;
494 :     }
495 :    
496 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3