[Bio] / Sprout / FeatureSproutLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/FeatureSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package FeatureSproutLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use BioWords;
26 :     use AliasAnalysis;
27 :     use base 'BaseSproutLoader';
28 :    
29 :     =head1 Sprout Feature Load Group Class
30 :    
31 :     =head2 Introduction
32 :    
33 :     The Feature Load Group includes all of the major feature-related tables.
34 :    
35 :     =head3 new
36 :    
37 : parrello 1.3 my $sl = FeatureSproutLoader->new($erdb, $source, $options, @tables);
38 : parrello 1.1
39 : parrello 1.3 Construct a new FeatureSproutLoader object.
40 : parrello 1.1
41 :     =over 4
42 :    
43 :     =item erdb
44 :    
45 :     [[SproutPm]] object for the database being loaded.
46 :    
47 :     =item source
48 :    
49 : parrello 1.3 [[FigPm]] object used to access the source data. If this parameter is undefined,
50 :     it will be created the first time the L</source> method is called.
51 : parrello 1.1
52 :     =item options
53 :    
54 :     Reference to a hash of command-line options.
55 :    
56 :     =item tables
57 :    
58 :     List of tables in this load group.
59 :    
60 :     =back
61 :    
62 :     =cut
63 :    
64 :     sub new {
65 :     # Get the parameters.
66 :     my ($class, $erdb, $source, $options) = @_;
67 :     # Create the table list.
68 :     my @tables = sort qw(Feature IsLocatedIn FeatureAlias IsAliasOf FeatureLink
69 :     FeatureTranslation FeatureUpstream HasFeature HasRoleInSubsystem
70 :     FeatureEssential FeatureVirulent FeatureIEDB CDD IsPresentOnProteinOf
71 :     CellLocation IsPossiblePlaceFor IsAlsoFoundIn ExternalDatabase Keyword);
72 :     # Create the BaseSproutLoader object.
73 :     my $retVal = BaseSproutLoader::new($class, $erdb, $source, $options, @tables);
74 : parrello 1.2 # Get the list of relevant attributes.
75 : parrello 1.1 # Bless and return it.
76 :     bless $retVal, $class;
77 :     return $retVal;
78 :     }
79 :    
80 :     =head2 Public Methods
81 :    
82 :     =head3 Generate
83 :    
84 :     $sl->Generate();
85 :    
86 :     Generate the data for the feature-related files.
87 :    
88 :     =cut
89 :    
90 :     sub Generate {
91 :     # Get the parameters.
92 :     my ($self) = @_;
93 :     # Get the sprout object.
94 :     my $sprout = $self->db();
95 :     # Get the FIG object.
96 :     my $fig = $self->source();
97 :     # Get the subsystem list.
98 :     my $subHash = $self->GetSubsystems();
99 :     # Get the word stemmer.
100 :     my $stemmer = $sprout->GetStemmer();
101 :     # Only proceed if this is not the global section.
102 :     if (! $self->global()) {
103 : parrello 1.4 # Get the section ID.
104 :     my $genomeID = $self->section();
105 : parrello 1.1 # Get the maximum sequence size. We need this later for splitting up the
106 :     # locations.
107 :     my $chunkSize = $sprout->MaxSegment();
108 :     Trace("Loading features for genome $genomeID.") if T(ERDBLoadGroup => 3);
109 :     # Get the feature list for this genome.
110 :     my $features = $fig->all_features_detailed_fast($genomeID);
111 :     # Sort and count the list.
112 :     my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
113 :     my $count = scalar @featureTuples;
114 :     Trace("$count features found for genome $genomeID.") if T(ERDBLoadGroup => 3);
115 :     # Get the attributes for this genome and put them in a hash by feature ID.
116 :     my $attributes = $self->GetGenomeAttributes($genomeID);
117 :     Trace("Looping through features for $genomeID.") if T(ERDBLoadGroup => 3);
118 :     # Loop through the features.
119 :     for my $featureTuple (@featureTuples) {
120 :     # Split the tuple.
121 : parrello 1.4 my ($featureID, $locations, $aliases, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
122 : parrello 1.1 # Make sure this feature is active.
123 :     if (! $fig->is_deleted_fid($featureID)) {
124 :     # Handle missing assignments.
125 :     if (! defined $assignment) {
126 :     $assignment = '';
127 :     $user = '';
128 :     } else {
129 :     # The default assignment-maker is FIG.
130 :     $user ||= 'fig';
131 :     }
132 :     # Count this feature.
133 :     $self->Track(features => $featureID, 1000);
134 :     # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
135 :     # Sprout database requires a single character.
136 :     if (! defined($quality) || $quality eq "") {
137 :     $quality = " ";
138 :     }
139 :     # Begin building the keywords. We start with the genome ID, the
140 :     # feature ID, the taxonomy, and the organism name.
141 :     my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
142 :     $fig->taxonomy_of($genomeID));
143 :     # Create the aliases.
144 : parrello 1.4 for my $alias (split /,/, $aliases) {
145 : parrello 1.1 # Connect this alias to this feature and make an Alias record for it.
146 :     $self->Put('IsAliasOf', 'from-link' => $alias, 'to-link' => $featureID);
147 :     $self->Put('FeatureAlias', id => $alias);
148 :     # Add it to the keyword list.
149 :     push @keywords, $alias;
150 :     # If this is a locus tag, also add its natural form as a keyword.
151 :     my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
152 :     if ($naturalName) {
153 :     push @keywords, $naturalName;
154 :     }
155 :     }
156 :     # Add the corresponding IDs. We ask for 2-tuples of the form (id, database).
157 :     my @corresponders = $fig->get_corresponding_ids($featureID, 1);
158 :     for my $tuple (@corresponders) {
159 :     my ($id, $xdb) = @{$tuple};
160 :     # Ignore SEED: that's us.
161 :     if ($xdb ne 'SEED') {
162 :     # Connect this ID to the feature and mark its database.
163 :     $self->Put('IsAlsoFoundIn', 'from-link' => $featureID, 'to-link' => $xdb,
164 :     alias => $id);
165 :     $self->Put('ExternalDatabase', id => $xdb);
166 :     # Add it as a keyword.
167 :     push @keywords, $id;
168 :     }
169 :     }
170 :     Trace("Assignment for $featureID is: $assignment") if T(ERDBLoadGroup => 4);
171 :     # Break the assignment into words and shove it onto the
172 :     # keyword list.
173 :     push @keywords, split(/\s+/, $assignment);
174 : parrello 1.5 # Add any EC numbers.
175 :     push @keywords, BioWords::ExtractECs($assignment);
176 : parrello 1.1 # Link this feature to the parent genome.
177 :     $self->Put('HasFeature', 'from-link' => $genomeID, 'to-link' => $featureID,
178 :     type => $type);
179 :     # Get the links.
180 :     my @links = $fig->fid_links($featureID);
181 :     for my $link (@links) {
182 :     $self->Put('FeatureLink', id => $featureID, link => $link);
183 :     }
184 :     # If this is a peg, generate the translation and the upstream.
185 :     if ($type eq 'peg') {
186 :     $self->Add(pegIn => 1);
187 :     my $translation = $fig->get_translation($featureID);
188 :     if ($translation) {
189 :     $self->Put('FeatureTranslation', id => $featureID,
190 :     translation => $translation);
191 :     }
192 :     # We use the default upstream values of u=200 and c=100.
193 :     my $upstream = $fig->upstream_of($featureID, 200, 100);
194 :     if ($upstream) {
195 :     $self->Put('FeatureUpstream', id => $featureID,
196 :     'upstream-sequence' => $upstream);
197 :     }
198 :     }
199 :     # Now we need to find the subsystems this feature participates in.
200 :     my @ssList = $fig->subsystems_for_peg($featureID);
201 :     # This hash prevents us from adding the same subsystem twice.
202 :     my %seen = ();
203 :     for my $ssEntry (@ssList) {
204 :     # Get the subsystem and role.
205 :     my ($subsystem, $role) = @{$ssEntry};
206 :     # Only proceed if we like this subsystem.
207 :     if (exists $subHash->{$subsystem}) {
208 :     # If this is the first time we've seen this subsystem for
209 :     # this peg, store the has-role link.
210 :     if (! $seen{$subsystem}) {
211 :     $self->Put('HasRoleInSubsystem', 'from-link' => $featureID,
212 :     'to-link' => $subsystem, genome => $genomeID,
213 :     type => $type);
214 : parrello 1.5 # Save the subsystem's keywords.
215 : parrello 1.1 push @keywords, split /[\s_]+/, $subsystem;
216 :     }
217 : parrello 1.5 # Now add the role and any embedded EC nubmers to the keyword list.
218 : parrello 1.1 push @keywords, split /\s+/, $role;
219 : parrello 1.5 push @keywords, BioWords::ExtractECs($role);
220 : parrello 1.1 }
221 :     }
222 : parrello 1.5 # For each hyphenated word, we also need the pieces.
223 :     my @hyphenated = grep { $_ =~ /-/ } @keywords;
224 :     for my $hyphenated (@hyphenated) {
225 :     # Bust it into pieces.
226 :     my @pieces = grep { length($_) > 2 } split /-/, $hyphenated;
227 :     push @keywords, @pieces;
228 :     }
229 : parrello 1.1 # There are three special attributes computed from property
230 :     # data that we build next. If the special attribute is non-empty,
231 :     # its name will be added to the keyword list. First, we get all
232 :     # the attributes for this feature. They will come back as
233 :     # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
234 :     # [name, value, value with URL]. (We don't need the PEG, since
235 :     # we already know it.)
236 :     my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
237 :     @{$attributes->{$featureID}};
238 :     # Now we process each of the special attributes.
239 :     if ($self->SpecialAttribute($featureID, \@attributes,
240 :     1, [0,2], '^(essential|potential_essential)$',
241 :     qw(FeatureEssential essential))) {
242 :     push @keywords, 'essential';
243 :     $self->Add(essential => 1);
244 :     }
245 :     if ($self->SpecialAttribute($featureID, \@attributes,
246 :     0, [2], '^virulen',
247 :     qw(FeatureVirulent virulent))) {
248 :     push @keywords, 'virulent';
249 :     $self->Add(virulent => 1);
250 :     }
251 :     if ($self->SpecialAttribute($featureID, \@attributes,
252 :     0, [0,2], '^iedb_',
253 :     qw(FeatureIEDB iedb))) {
254 :     push @keywords, 'iedb';
255 :     $self->Add(iedb => 1);
256 :     }
257 :     # Now we have some other attributes we need to process. To get
258 :     # through them, we convert the attribute list for this feature
259 :     # into a two-layer hash: key => subkey => value.
260 :     my %attributeHash = ();
261 :     for my $attrRow (@{$attributes->{$featureID}}) {
262 :     my (undef, $key, @values) = @{$attrRow};
263 :     my ($realKey, $subKey);
264 :     if ($key =~ /^([^:]+)::(.+)/) {
265 :     ($realKey, $subKey) = ($1, $2);
266 :     } else {
267 :     ($realKey, $subKey) = ($key, "");
268 :     }
269 :     if (exists $attributeHash{$realKey}) {
270 :     $attributeHash{$realKey}->{$subKey} = \@values;
271 :     } else {
272 :     $attributeHash{$realKey} = {$subKey => \@values};
273 :     }
274 :     }
275 :     # First we handle CDD. This is a bit complicated, because
276 :     # there are multiple CDDs per protein.
277 :     if (exists $attributeHash{CDD}) {
278 :     # Get the hash of CDD IDs to scores for this feature. We
279 :     # already know it exists because of the above IF.
280 :     my $cddHash = $attributeHash{CDD};
281 :     my @cddData = sort keys %{$cddHash};
282 :     for my $cdd (@cddData) {
283 :     # Extract the score for this CDD and decode it.
284 :     my ($codeScore) = split(/\s*[,;]\s*/, $cddHash->{$cdd}->[0]);
285 :     my $realScore = FIGRules::DecodeScore($codeScore);
286 :     # We can't afford to crash because of a bad attribute
287 :     # value, hence the IF below.
288 :     if (! defined($realScore)) {
289 :     # Bad score, so count it.
290 :     $self->Add(badCDDscore => 1);
291 :     Trace("CDD score \"$codeScore\" for feature $featureID invalid.") if T(ERDBLoadGroup => 3);
292 :     } else {
293 :     # Create the connection and a CDD record.
294 :     $self->Put('IsPresentOnProteinOf', 'from-link' => $cdd,
295 :     'to-link' => $featureID, score => $realScore);
296 :     $self->Put('CDD', id => $cdd);
297 :     }
298 :     }
299 :     }
300 :     # Next we do PSORT cell locations. here the confidence value
301 :     # could have the value "unknown", which we translate to -1.
302 :     if (exists $attributeHash{PSORT}) {
303 :     # This will be a hash of cell locations to confidence
304 :     # factors.
305 :     my $psortHash = $attributeHash{PSORT};
306 :     for my $psort (keys %{$psortHash}) {
307 :     # Get the confidence, and convert it to a number if necessary.
308 :     my $confidence = $psortHash->{$psort};
309 :     if ($confidence eq 'unknown') {
310 :     $confidence = -1;
311 :     }
312 :     $self->Put('IsPossiblePlaceFor', 'from-link' => $psort,
313 :     'to-link' => $featureID, confidence => $confidence);
314 :     $self->Put('CellLocation', id => $psort);
315 :     # If this is a significant location, add it as a keyword.
316 :     if ($confidence > 2.5) {
317 :     push @keywords, $psort;
318 :     }
319 :     }
320 :     }
321 :     # Phobius data is next. This consists of the signal peptide location and
322 :     # the transmembrane locations.
323 :     my $signalList = "";
324 :     my $transList = "";
325 :     if (exists $attributeHash{Phobius}) {
326 :     # This will be a hash of two keys (transmembrane and signal) to
327 :     # location strings. If there's no value, we stuff in an empty string.
328 :     $signalList = $self->GetCommaList($attributeHash{Phobius}->{signal});
329 :     $transList = $self->GetCommaList($attributeHash{Phobius}->{transmembrane});
330 :     }
331 :     # Here are some more numbers: isoelectric point, molecular weight, and
332 :     # the similar-to-human flag.
333 :     my $isoelectric = 0;
334 :     if (exists $attributeHash{isoelectric_point}) {
335 :     $isoelectric = $attributeHash{isoelectric_point}->{""};
336 :     }
337 :     my $similarToHuman = 0;
338 :     if (exists $attributeHash{similar_to_human} && $attributeHash{similar_to_human}->{""} eq 'yes') {
339 :     $similarToHuman = 1;
340 :     }
341 :     my $molecularWeight = 0;
342 :     if (exists $attributeHash{molecular_weight}) {
343 :     $molecularWeight = $attributeHash{molecular_weight}->{""};
344 :     }
345 :     # Join the keyword string.
346 :     my $keywordString = join(" ", @keywords);
347 :     # Get rid of annoying punctuation.
348 : parrello 1.5 $keywordString =~ s/[();@#\/,]/ /g;
349 : parrello 1.1 # Get the list of keywords in the keyword string.
350 :     my @realKeywords = $stemmer->Split($keywordString);
351 :     # We need to do two things here: create the keyword string for the feature table
352 :     # and write records to the keyword table for the keywords.
353 :     my (%keys, %stems, @realStems);
354 :     for my $keyword (@realKeywords) {
355 :     # Compute the stem and phonex for this keyword.
356 :     my ($stem, $phonex) = $stemmer->StemLookup($keyword);
357 :     # Only proceed if a stem comes back. If no stem came back, it's a
358 :     # stop word and we throw it away.
359 :     if ($stem) {
360 :     $keys{$keyword} = $stem;
361 :     $stems{$stem} = $phonex;
362 :     push @realStems, $stem;
363 :     }
364 :     }
365 :     # Now create the keyword string.
366 :     my $cleanWords = join(" ", @realStems);
367 :     Trace("Keyword string for $featureID: $cleanWords") if T(ERDBLoadGroup => 4);
368 :     # Create keyword table entries for the keywords found.
369 :     for my $key (keys %keys) {
370 :     my $stem = $keys{$key};
371 :     $self->Put('Keyword', id => $key, stem => $stem, phonex => $stems{$stem});
372 :     }
373 :     # Now we need to process the feature's locations. First, we split them up.
374 :     my @locationList = split /\s*,\s*/, $locations;
375 :     # Next, we convert them to Sprout location objects.
376 :     my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
377 :     # Assemble them into a sprout location string for later.
378 :     my $locationString = join(", ", map { $_->String } @locObjectList);
379 :     # We'll store the sequence length in here.
380 :     my $sequenceLength = 0;
381 :     # This part is the roughest. We need to relate the features to contig
382 :     # locations, and the locations must be split so that none of them exceed
383 :     # the maximum segment size. This simplifies the genes_in_region processing
384 :     # for Sprout. To start, we create the location position indicator.
385 :     my $i = 1;
386 :     # Loop through the locations.
387 :     for my $locObject (@locObjectList) {
388 :     # Record the length.
389 :     $sequenceLength += $locObject->Length;
390 :     # Split this location into a list of chunks.
391 :     my @locOList = ();
392 :     while (my $peeling = $locObject->Peel($chunkSize)) {
393 :     $self->Add(peeling => 1);
394 :     push @locOList, $peeling;
395 :     }
396 :     push @locOList, $locObject;
397 :     # Loop through the chunks, creating IsLocatedIn records. The variable
398 :     # "$i" will be used to keep the location index.
399 :     for my $locChunk (@locOList) {
400 :     $self->Put('IsLocatedIn', 'from-link' => $featureID,
401 :     'to-link' => $locChunk->Contig, beg => $locChunk->Left,
402 :     dir => $locChunk->Dir, len => $locChunk->Length,
403 :     locN => $i);
404 :     $i++;
405 :     }
406 :     }
407 :     # Now we get some ancillary flags.
408 :     my $locked = $fig->is_locked_fid($featureID);
409 :     my $in_genbank = $fig->peg_in_gendb($featureID);
410 :     # Create the feature record.
411 :     $self->Put('Feature', id => $featureID, 'assignment-maker' => $user,
412 :     'assignment-quality' => $quality, 'feature-type' => $type,
413 :     'in-genbank' => $in_genbank, 'isoelectric-point' => $isoelectric,
414 :     locked => $locked, 'molecular-weight' => $molecularWeight,
415 :     'sequence-length' => $sequenceLength,
416 :     'signal-peptide' => $signalList, 'similar-to-human' => $similarToHuman,
417 :     assignment => $assignment, keywords => $cleanWords,
418 :     'location-string' => $locationString,
419 :     'transmembrane-map' => $transList);
420 :     }
421 :     }
422 :     }
423 :     }
424 :    
425 :    
426 :     =head3 SpecialAttribute
427 :    
428 :     my $count = $sl->SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $tableName, $field);
429 :    
430 :     Look for special attributes of a given type. A special attribute is found by comparing one of
431 :     the columns of the incoming attribute list to a search pattern. If a match is found, then
432 :     a set of columns is put into an output table connected to the specified ID.
433 :    
434 :     For example, when processing features, the attribute list we look at has three columns: attribute
435 :     name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
436 :     begins with C<iedb_>. The call signature is therefore
437 :    
438 :     my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', 'FeatureIEDB', 'iedb');
439 :    
440 :     The pattern is matched against column 0, and if we have a match, then column 2's value is put
441 :     to the output along with the specified feature ID.
442 :    
443 :     =over 4
444 :    
445 :     =item id
446 :    
447 :     ID of the object whose special attributes are being loaded. This forms the first column of the
448 :     output.
449 :    
450 :     =item attributes
451 :    
452 :     Reference to a list of tuples.
453 :    
454 :     =item idxMatch
455 :    
456 :     Index in each tuple of the column to be matched against the pattern. If the match is
457 :     successful, an output record will be generated.
458 :    
459 :     =item idxValues
460 :    
461 :     Reference to a list containing the indexes in each tuple of the columns to be put as
462 :     the second column of the output.
463 :    
464 :     =item pattern
465 :    
466 :     Pattern to be matched against the specified column. The match will be case-insensitive.
467 :    
468 :     =item tableName
469 :    
470 :     Name of the table to contain the attribute values found.
471 :    
472 :     =item fieldName
473 :    
474 :     Name of the field to contain the attribute values in the output table.
475 :    
476 :     =item RETURN
477 :    
478 :     Returns a count of the matches found.
479 :    
480 :     =item
481 :    
482 :     =back
483 :    
484 :     =cut
485 :    
486 :     sub SpecialAttribute {
487 :     # Get the parameters.
488 :     my ($self, $id, $attributes, $idxMatch, $idxValues, $pattern, $tableName, $fieldName) = @_;
489 :     # Declare the return variable.
490 :     my $retVal = 0;
491 :     # Loop through the attribute rows.
492 :     for my $row (@{$attributes}) {
493 :     # Check for a match.
494 :     if ($row->[$idxMatch] =~ m/$pattern/i) {
495 :     # We have a match, so output a row. This is a bit tricky, since we may
496 :     # be putting out multiple columns of data from the input. We join
497 :     # the columns together into a single space-delimited string.
498 :     my $value = join(" ", map { $row->[$_] } @{$idxValues});
499 :     $self->Put($tableName, id => $id, $fieldName => $value);
500 :     $retVal++;
501 :     }
502 :     }
503 :     Trace("$retVal special attributes found for $id and table $tableName.") if T(ERDBLoadGroup => 4) && $retVal;
504 :     # Return the number of matches.
505 :     return $retVal;
506 :     }
507 :    
508 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3