Parent Directory
|
Revision Log
Revision 1.10 - (view) (download) (as text)
1 : | parrello | 1.1 | #!/usr/bin/perl -w |
2 : | |||
3 : | # | ||
4 : | # Copyright (c) 2003-2006 University of Chicago and Fellowship | ||
5 : | # for Interpretations of Genomes. All Rights Reserved. | ||
6 : | # | ||
7 : | # This file is part of the SEED Toolkit. | ||
8 : | # | ||
9 : | # The SEED Toolkit is free software. You can redistribute | ||
10 : | # it and/or modify it under the terms of the SEED Toolkit | ||
11 : | # Public License. | ||
12 : | # | ||
13 : | # You should have received a copy of the SEED Toolkit Public License | ||
14 : | # along with this program; if not write to the University of Chicago | ||
15 : | # at info@ci.uchicago.edu or the Fellowship for Interpretation of | ||
16 : | # Genomes at veronika@thefig.info or download a copy from | ||
17 : | # http://www.theseed.org/LICENSE.TXT. | ||
18 : | # | ||
19 : | |||
20 : | package FeatureSproutLoader; | ||
21 : | |||
22 : | use strict; | ||
23 : | use Tracer; | ||
24 : | use ERDB; | ||
25 : | use BioWords; | ||
26 : | use AliasAnalysis; | ||
27 : | parrello | 1.6 | use DBMaster; |
28 : | use HyperLink; | ||
29 : | use FFs; | ||
30 : | parrello | 1.8 | use SOAP::Lite; |
31 : | parrello | 1.9 | use Time::HiRes; |
32 : | parrello | 1.10 | use LoaderUtils; |
33 : | parrello | 1.1 | use base 'BaseSproutLoader'; |
34 : | |||
35 : | =head1 Sprout Feature Load Group Class | ||
36 : | |||
37 : | =head2 Introduction | ||
38 : | |||
39 : | The Feature Load Group includes all of the major feature-related tables. | ||
40 : | |||
41 : | =head3 new | ||
42 : | |||
43 : | parrello | 1.3 | my $sl = FeatureSproutLoader->new($erdb, $options); |
44 : | parrello | 1.1 | |
45 : | parrello | 1.3 | Construct a new FeatureSproutLoader object. |
46 : | parrello | 1.1 | |
47 : | =over 4 | ||
48 : | |||
49 : | =item erdb | ||
50 : | |||
51 : | [[SproutPm]] object for the database being loaded. | ||
52 : | |||
53 : | =item options | ||
54 : | |||
55 : | Reference to a hash of command-line options. | ||
56 : | |||
57 : | =item tables | ||
58 : | |||
59 : | List of tables in this load group. This list is built inside the constructor; it is not passed in by the caller. | ||
60 : | |||
61 : | =back | ||
62 : | |||
63 : | =cut | ||
64 : | |||
sub new {
    # Unpack the class name, the database object, and the option hash.
    my ($class, $erdb, $options) = @_;
    # These are the tables owned by this load group. The base-class
    # constructor receives them in sorted order.
    my @groupTables = qw(Feature IsLocatedIn FeatureAlias IsAliasOf FeatureLink
                         FeatureTranslation FeatureUpstream HasFeature HasRoleInSubsystem
                         FeatureEssential FeatureVirulent FeatureIEDB CDD
                         IsPresentOnProteinOf CellLocation IsPossiblePlaceFor
                         IsAlsoFoundIn ExternalDatabase Keyword ProteinFamily
                         IsFamilyForFeature ProteinFamilyName FeatureEC);
    my @tables = sort @groupTables;
    # Let the base class build the underlying loader object.
    my $loader = BaseSproutLoader::new($class, $erdb, $options, @tables);
    # Bless the object into the requested class and hand it back.
    return bless $loader, $class;
}
82 : | |||
83 : | =head2 Public Methods | ||
84 : | |||
85 : | =head3 Generate | ||
86 : | |||
87 : | $sl->Generate(); | ||
88 : | |||
89 : | Generate the data for the feature-related files. | ||
90 : | |||
91 : | =cut | ||
92 : | |||
sub Generate {
    # Generate the load data for every feature-related table of the current
    # section. The section ID is a genome ID; the global section produces no
    # output. For each active feature of the genome this method emits the
    # alias, link, translation, upstream, subsystem, attribute, keyword,
    # location, and protein-family records, and finally the Feature record
    # itself. Nothing is returned.

    # Get the parameters.
    my ($self) = @_;
    # Get the sprout object.
    my $sprout = $self->db();
    # Get the FIG object.
    my $fig = $self->source();
    # Get the subsystem list.
    my $subHash = $self->GetSubsystems();
    # Get the word stemmer.
    my $stemmer = $sprout->GetStemmer();
    # Get access to FIGfams.
    my $figfam_data = FIG::get_figfams_data();
    my $ffs = FFs->new($figfam_data, $fig);
    # Compute the load directory.
    my $loadDirectory = $sprout->LoadDirectory();
    # Only proceed if this is not the global section.
    if (! $self->global()) {
        # Get the section ID.
        my $genomeID = $self->section();
        MemTrace("Starting section $genomeID.") if T(ERDBLoadGroup => 3);
        # Connect to the ontology database.
        # NOTE(review): this path is hard-coded to a developer's home
        # directory; it probably belongs in configuration.
        my $sqlite_db = "/home/mkubal/Temp/Ontology/ontology.sqlite";
        my $ontology_dbmaster = DBMaster->new(-database => $sqlite_db, -backend => 'SQLite');
        # This is our master hash of FIG IDs to aliases.
        my $aliasMasterHash = LoaderUtils::ReadAliasFile($loadDirectory, $genomeID) || {};
        # Get the maximum sequence size. We need this later for splitting up the
        # locations.
        my $chunkSize = $sprout->MaxSegment();
        MemTrace("Loading features for genome $genomeID.") if T(ERDBLoadGroup => 3);
        # Get the feature list for this genome.
        my $features = $fig->all_features_detailed_fast($genomeID);
        # Sort and count the list.
        my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
        my $count = scalar @featureTuples;
        MemTrace("$count features found for genome $genomeID.") if T(ERDBLoadGroup => 3);
        # Get the attributes for this genome and put them in a hash by feature ID.
        my $attributes = $self->GetGenomeAttributes($genomeID, \@featureTuples);
        Trace("Looping through features for $genomeID.") if T(ERDBLoadGroup => 3);
        # Loop through the features.
        for my $featureTuple (@featureTuples) {
            # Split the tuple.
            my ($featureID, $locations, $aliases, $type, $minloc, $maxloc, $assignment,
                $user, $quality) = @{$featureTuple};
            # Make sure this feature is active.
            if (! $fig->is_deleted_fid($featureID)) {
                # Handle missing assignments.
                if (! defined $assignment) {
                    $assignment = '';
                    $user = '';
                } else {
                    # The default assignment-maker is FIG.
                    $user ||= 'fig';
                }
                # Count this feature.
                $self->Track(features => $featureID, 1000);
                # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
                # Sprout database requires a single character.
                if (! defined($quality) || $quality eq "") {
                    $quality = " ";
                }
                # Get the coupling count. The coupled features are returned as a list,
                # and we store it as a scalar to get the count.
                # NOTE(review): this only yields a count if coupled_to returns
                # an array (not a literal list) in scalar context -- confirm.
                my $couplingCount = $fig->coupled_to($featureID);
                # Begin building the keywords. We start with the genome ID, the
                # feature ID, the taxonomy, and the organism name.
                my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
                                $fig->taxonomy_of($genomeID));
                # Next come the aliases. We put all aliases found in this hash.
                # They will be output as alias names and as keywords.
                my %aliasHash;
                # Note the trick here to ensure that we have a list reference even
                # if this feature isn't in the alias table.
                my $aliasList = $aliasMasterHash->{$featureID} || [];
                # Loop through this feature ID's aliases.
                for my $aliasTuple (@$aliasList) {
                    my ($aliasID, $aliasType, $aliasConf) = @$aliasTuple;
                    # Only proceed if this alias is new.
                    if (! exists $aliasHash{$aliasID}) {
                        # Save this alias.
                        $aliasHash{$aliasID} = 1;
                        # Get its natural form.
                        my $natural = AliasAnalysis::Type($aliasType => $aliasID);
                        # Only proceed if a natural form exists.
                        if ($natural) {
                            $self->Add(miscAlias => 1);
                            # Save the natural form.
                            $aliasHash{$natural} = 1;
                            # Is this a corresponding ID?
                            if ($aliasConf eq 'A') {
                                # Yes. Connect its natural form to the feature.
                                $self->PutR(IsAlsoFoundIn => $featureID, $aliasType,
                                            alias => $natural);
                                $self->PutE(ExternalDatabase => $aliasType);
                            }
                        }
                    }
                }
                # Create the aliases and put them in the keyword list.
                for my $alias (sort keys %aliasHash) {
                    # Connect this alias to this feature and make an Alias record for it.
                    $self->PutR(IsAliasOf => $alias, $featureID);
                    $self->PutE(FeatureAlias => $alias);
                    # Add it to the keyword list.
                    push @keywords, $alias;
                }
                Trace("Assignment for $featureID is: $assignment") if T(ERDBLoadGroup => 4);
                # Break the assignment into words and shove it onto the
                # keyword list.
                push @keywords, split(/\s+/, $assignment);
                # Add any EC numbers.
                my @ecs = BioWords::ExtractECs($assignment);
                for my $ec (@ecs) {
                    push @keywords, $ec;
                    $self->PutE(FeatureEC => $featureID, ec => $ec);
                }
                # Link this feature to the parent genome.
                $self->PutR(HasFeature => $genomeID, $featureID,
                            type => $type);
                # Get the links.
                my @links = $fig->fid_links($featureID);
                for my $link (@links) {
                    $self->PutE(FeatureLink => $featureID, link => $link);
                }
                # If this is a peg, generate the translation and the upstream.
                if ($type eq 'peg') {
                    $self->Add(pegIn => 1);
                    my $translation = $fig->get_translation($featureID);
                    if ($translation) {
                        $self->PutE(FeatureTranslation => $featureID,
                                    translation => $translation);
                    }
                    # We use the default upstream values of u=200 and c=100.
                    my $upstream = $fig->upstream_of($featureID, 200, 100);
                    if ($upstream) {
                        $self->PutE(FeatureUpstream => $featureID,
                                    'upstream-sequence' => $upstream);
                    }
                }
                # Now we need to find the subsystems this feature participates in.
                my @ssList = $fig->subsystems_for_peg($featureID);
                # This hash prevents us from adding the same subsystem twice.
                my %seen = ();
                for my $ssEntry (@ssList) {
                    # Get the subsystem and role.
                    my ($subsystem, $role) = @{$ssEntry};
                    # Only proceed if we like this subsystem.
                    if (exists $subHash->{$subsystem}) {
                        # If this is the first time we've seen this subsystem for
                        # this peg, store the has-role link.
                        # NOTE(review): nothing visible here ever sets
                        # $seen{$subsystem}, so this test is always true --
                        # confirm whether the duplicate suppression is still wanted.
                        if (! $seen{$subsystem}) {
                            $self->PutR(HasRoleInSubsystem => $featureID, $subsystem,
                                        genome => $genomeID, type => $type);
                            # Save the subsystem's keywords.
                            push @keywords, split /[\s_]+/, $subsystem;
                        }
                        # Now add the role and any embedded EC numbers to the keyword list.
                        push @keywords, split /\s+/, $role;
                        push @keywords, BioWords::ExtractECs($role);
                    }
                }
                # For each hyphenated word, we also need the pieces.
                my @hyphenated = grep { $_ =~ /-/ } @keywords;
                for my $hyphenated (@hyphenated) {
                    # Bust it into pieces.
                    my @pieces = grep { length($_) > 2 } split /-/, $hyphenated;
                    push @keywords, @pieces;
                }
                # There are three special attributes computed from property
                # data that we build next. If the special attribute is non-empty,
                # its name will be added to the keyword list. First, we get all
                # the attributes for this feature. They will come back as
                # 4-tuples: [peg, name, value, URL]. A feature with no entry in
                # the attribute hash is treated as having no attributes rather
                # than dying on an undefined array reference.
                my @attributes = @{$attributes->{$featureID} || []};
                # Now we process each of the special attributes.
                if ($self->SpecialAttribute($featureID, \@attributes,
                                            2, [1,3], '^(essential|potential_essential)$',
                                            qw(FeatureEssential essential))) {
                    push @keywords, 'essential';
                    $self->Add(essential => 1);
                }
                if ($self->SpecialAttribute($featureID, \@attributes,
                                            1, [2,3], '^virulen',
                                            qw(FeatureVirulent virulent))) {
                    push @keywords, 'virulent';
                    $self->Add(virulent => 1);
                }
                if ($self->SpecialAttribute($featureID, \@attributes,
                                            1, [2,3], '^iedb_',
                                            qw(FeatureIEDB iedb))) {
                    push @keywords, 'iedb';
                    $self->Add(iedb => 1);
                }
                # Now we have some other attributes we need to process. To get
                # through them, we convert the attribute list for this feature
                # into a two-layer hash: key => subkey => [values]. A key of the
                # form "name::sub" is split at the "::"; otherwise the subkey is
                # the empty string.
                my %attributeHash = ();
                for my $attrRow (@attributes) {
                    my (undef, $key, @values) = @{$attrRow};
                    my ($realKey, $subKey);
                    if ($key =~ /^([^:]+)::(.+)/) {
                        ($realKey, $subKey) = ($1, $2);
                    } else {
                        ($realKey, $subKey) = ($key, "");
                    }
                    $attributeHash{$realKey}->{$subKey} = \@values;
                }
                TraceDump(AttributeHash => \%attributeHash) if T(FeatureLoadGroup => 4);
                # First we handle CDD. This is a bit complicated, because
                # there are multiple CDDs per protein.
                if (exists $attributeHash{CDD}) {
                    # Get the hash of CDD IDs to scores for this feature. We
                    # already know it exists because of the above IF.
                    my $cddHash = $attributeHash{CDD};
                    my @cddData = sort keys %$cddHash;
                    for my $cdd (@cddData) {
                        # Extract the score for this CDD and decode it.
                        my ($codeScore) = split(/\s*[,;]\s*/, $cddHash->{$cdd}->[0]);
                        my $realScore = FIGRules::DecodeScore($codeScore);
                        # We can't afford to crash because of a bad attribute
                        # value, hence the IF below.
                        if (! defined($realScore)) {
                            # Bad score, so count it.
                            $self->Add(badCDDscore => 1);
                            Trace("CDD score \"$codeScore\" for feature $featureID invalid.") if T(ERDBLoadGroup => 3);
                        } else {
                            # Create the connection and a CDD record.
                            $self->PutR(IsPresentOnProteinOf => $cdd, $featureID,
                                        score => $realScore);
                            $self->PutE(CDD => $cdd);
                        }
                    }
                }
                # A similar situation exists for protein families.
                if (exists $attributeHash{PFAM}) {
                    # Get the hash of PFAMs to scores for this feature.
                    my $pfamHash = $attributeHash{PFAM};
                    for my $pfam (sort keys %$pfamHash) {
                        # Extract the range: everything after the first semicolon.
                        # The capture is only used when the match succeeds; a
                        # failed match leaves $1 holding a stale value from an
                        # earlier regex, so we fall back to an empty range.
                        my $codeScore = $pfamHash->{$pfam}->[0];
                        my $range = ($codeScore =~ /;(.+)/ ? $1 : '');
                        # Strip off the PFAM id from the source.
                        my ($pfamID) = split /_/, $pfam, 2;
                        # Emit the ProteinFamily record.
                        $self->PutE(ProteinFamily => $pfamID);
                        # Connect it to the feature.
                        $self->PutR(IsFamilyForFeature => $pfamID, $featureID,
                                    range => $range);
                        # Get its name from the ontology database. There can
                        # be at most one.
                        my $dt_objs =
                            $ontology_dbmaster->pfam->get_objects({id => $pfamID});
                        if (defined $dt_objs->[0]) {
                            $self->PutE(ProteinFamilyName => $pfamID,
                                        common_name => $dt_objs->[0]->term());
                        }
                    }
                }
                # Next we do PSORT cell locations. Here the confidence value
                # could have the value "unknown", which we translate to -1.
                if (exists $attributeHash{PSORT}) {
                    # This will be a hash of cell locations to confidence
                    # factors.
                    my $psortHash = $attributeHash{PSORT};
                    for my $psort (keys %{$psortHash}) {
                        # Get the confidence, and convert it to a number if necessary.
                        my $confidence = $psortHash->{$psort}->[0];
                        if ($confidence eq 'unknown') {
                            $confidence = -1;
                        }
                        $self->PutR(IsPossiblePlaceFor => $psort, $featureID,
                                    confidence => $confidence);
                        $self->PutE(CellLocation => $psort);
                        # If this is a significant location, add it as a keyword.
                        if ($confidence > 2.5) {
                            # Before we add it as a keyword, we convert it from
                            # capital-case to hyphenated by inserting hyphens at
                            # case transition points.
                            $psort =~ s/([a-z])([A-Z])/$1-$2/g;
                            push @keywords, $psort;
                        }
                    }
                }
                # Phobius data is next. This consists of the signal peptide location and
                # the transmembrane locations.
                my $signalList = "";
                my $transList = "";
                my $transCount = 0;
                if (exists $attributeHash{Phobius}) {
                    # This will be a hash of two keys (transmembrane and signal) to
                    # location lists. GetCommaList converts them into comma-separated
                    # location strings. If there's no value, it returns an empty string.
                    $signalList = $self->GetCommaList($attributeHash{Phobius}->{signal});
                    # Assign to the outer $transList (do NOT redeclare it with "my")
                    # so the transmembrane map reaches the Feature record below.
                    my $transValues = $attributeHash{Phobius}->{transmembrane};
                    $transList = $self->GetCommaList($transValues);
                    # The domain count is the number of comma-separated locations
                    # in the attribute value itself (the first element of the
                    # stored value list).
                    if (defined $transValues && defined $transValues->[0]) {
                        my @transMap = split /\s*,\s*/, $transValues->[0];
                        $transCount = scalar(@transMap);
                    }
                }
                # Here are some more numbers: isoelectric point, molecular weight, and
                # the similar-to-human flag.
                my $isoelectric = 0;
                if (exists $attributeHash{isoelectric_point}) {
                    $isoelectric = $attributeHash{isoelectric_point}->{""}->[0];
                }
                my $similarToHuman = 0;
                if (exists $attributeHash{similar_to_human} && $attributeHash{similar_to_human}->{""}->[0] eq 'yes') {
                    $similarToHuman = 1;
                }
                my $molecularWeight = 0;
                if (exists $attributeHash{molecular_weight}) {
                    $molecularWeight = $attributeHash{molecular_weight}->{""}->[0];
                }
                # Join the keyword string.
                my $keywordString = join(" ", @keywords);
                # Get rid of annoying punctuation.
                $keywordString =~ s/[();@#\/,]/ /g;
                # Get the list of keywords in the keyword string, minus the delimiters.
                my @realKeywords = grep { $stemmer->IsWord($_) }
                                   $stemmer->Split($keywordString);
                # We need to do two things here: create the keyword string for the feature table
                # and write records to the keyword table for the keywords.
                my (%keys, %stems, @realStems);
                for my $keyword (@realKeywords) {
                    # Compute the stem and phonex for this keyword.
                    my ($stem, $phonex) = $stemmer->StemLookup($keyword);
                    # Only proceed if a stem comes back. If no stem came back, it's a
                    # stop word and we throw it away.
                    if ($stem) {
                        $keys{$keyword} = $stem;
                        $stems{$stem} = $phonex;
                        push @realStems, $stem;
                    }
                }
                # Now create the keyword string.
                my $cleanWords = join(" ", @realStems);
                Trace("Keyword string for $featureID: $cleanWords") if T(ERDBLoadGroup => 4);
                # Create keyword table entries for the keywords found.
                for my $key (keys %keys) {
                    my $stem = $keys{$key};
                    $self->PutE(Keyword => $key, stem => $stem, phonex => $stems{$stem});
                }
                # Now we need to process the feature's locations. First, we split them up.
                my @locationList = split /\s*,\s*/, $locations;
                # Next, we convert them to Sprout location objects.
                my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
                # Assemble them into a sprout location string for later.
                my $locationString = join(", ", map { $_->String } @locObjectList);
                # We'll store the sequence length in here.
                my $sequenceLength = 0;
                # This part is the roughest. We need to relate the features to contig
                # locations, and the locations must be split so that none of them exceed
                # the maximum segment size. This simplifies the genes_in_region processing
                # for Sprout. To start, we create the location position indicator.
                my $i = 1;
                # Loop through the locations.
                for my $locObject (@locObjectList) {
                    # Record the length. This must happen before the peeling
                    # loop below.
                    # NOTE(review): Peel presumably shortens $locObject as it
                    # removes each chunk -- confirm.
                    $sequenceLength += $locObject->Length;
                    # Split this location into a list of chunks.
                    my @locOList = ();
                    while (my $peeling = $locObject->Peel($chunkSize)) {
                        $self->Add(peeling => 1);
                        push @locOList, $peeling;
                    }
                    push @locOList, $locObject;
                    # Loop through the chunks, creating IsLocatedIn records. The variable
                    # "$i" will be used to keep the location index.
                    for my $locChunk (@locOList) {
                        $self->PutR(IsLocatedIn => $featureID, $locChunk->Contig,
                                    beg => $locChunk->Left, dir => $locChunk->Dir,
                                    len => $locChunk->Length, locN => $i);
                        $i++;
                    }
                }
                # Check for figfams. In case we find any, we need the range.
                # It's the whole sequence.
                my $range = "1-$sequenceLength";
                # Ask for the figfams.
                my @fams = $ffs->families_containing_peg($featureID);
                # Connect them to the feature (if any).
                for my $fam (@fams) {
                    $self->PutE(ProteinFamily => $fam);
                    $self->PutR(IsFamilyForFeature => $fam, $featureID,
                                range => $range);
                }
                # Now we get some ancillary flags.
                my $locked = $fig->is_locked_fid($featureID);
                my $in_genbank = $fig->peg_in_gendb($featureID);
                # Create the feature record.
                $self->PutE(Feature => $featureID, 'assignment-maker' => $user,
                            'assignment-quality' => $quality, 'feature-type' => $type,
                            'in-genbank' => $in_genbank, 'isoelectric-point' => $isoelectric,
                            locked => $locked, 'molecular-weight' => $molecularWeight,
                            'sequence-length' => $sequenceLength,
                            'signal-peptide' => $signalList, 'similar-to-human' => $similarToHuman,
                            assignment => $assignment, keywords => $cleanWords,
                            'location-string' => $locationString,
                            'transmembrane-map' => $transList,
                            'conserved-neighbors' => $couplingCount,
                            'transmembrane-domain-count' => $transCount);
            }
        }
    }
}
501 : | |||
502 : | |||
503 : | =head3 SpecialAttribute | ||
504 : | |||
505 : | my $count = $sl->SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $tableName, $field); | ||
506 : | |||
507 : | Look for special attributes of a given type. A special attribute is found by comparing one of | ||
508 : | the columns of the incoming attribute list to a search pattern. If a match is found, then | ||
509 : | a set of columns is put into an output table connected to the specified ID. | ||
510 : | |||
511 : | For example, when processing features, the attribute list we look at has four columns: feature ID, | ||
512 : | attribute name, attribute value, and attribute URL. The IEDB attribute exists if the attribute name | ||
513 : | begins with C<iedb_>. The call signature is therefore | ||
514 : | |||
515 : | my $found = $sl->SpecialAttribute($fid, \@attributeList, 1, [2,3], '^iedb_', 'FeatureIEDB', 'iedb'); | ||
516 : | |||
517 : | The pattern is matched against column 1, and if we have a match, then the values of columns 2 and 3 | ||
518 : | are combined into a HyperLink object and put to the output along with the specified feature ID. | ||
519 : | |||
520 : | =over 4 | ||
521 : | |||
522 : | =item id | ||
523 : | |||
524 : | ID of the object whose special attributes are being loaded. This forms the first column of the | ||
525 : | output. | ||
526 : | |||
527 : | =item attributes | ||
528 : | |||
529 : | Reference to a list of tuples. | ||
530 : | |||
531 : | =item idxMatch | ||
532 : | |||
533 : | Index in each tuple of the column to be matched against the pattern. If the match is | ||
534 : | successful, an output record will be generated. | ||
535 : | |||
536 : | =item idxValues | ||
537 : | |||
538 : | parrello | 1.6 | Reference to a list containing the indexes of the value and URL to put in the |
539 : | second column of the output. | ||
540 : | parrello | 1.1 | |
541 : | =item pattern | ||
542 : | |||
543 : | Pattern to be matched against the specified column. The match will be case-insensitive. | ||
544 : | |||
545 : | =item tableName | ||
546 : | |||
547 : | Name of the table to contain the attribute values found. | ||
548 : | |||
549 : | =item fieldName | ||
550 : | |||
551 : | Name of the field to contain the attribute values in the output table. | ||
552 : | |||
553 : | =item RETURN | ||
554 : | |||
555 : | Returns a count of the matches found. | ||
556 : | |||
557 : | |||
558 : | |||
559 : | =back | ||
560 : | |||
561 : | =cut | ||
562 : | |||
sub SpecialAttribute {
    # Unpack the arguments: the owning object's ID, the attribute tuples, the
    # index of the column to test, the indexes of the columns to output, the
    # search pattern, and the target table and field names.
    my ($self, $id, $attributes, $idxMatch, $idxValues, $pattern, $tableName, $fieldName) = @_;
    # Running total of attribute rows that matched the pattern.
    my $matches = 0;
    foreach my $tuple (@{$attributes}) {
        # Skip rows whose match column does not satisfy the pattern. The
        # comparison is case-insensitive.
        next unless $tuple->[$idxMatch] =~ m/$pattern/i;
        # Pull out the requested value columns and wrap them in a HyperLink
        # object, then write the output row for this ID.
        my @linkParts = @{$tuple}[@$idxValues];
        my $link = HyperLink->new(@linkParts);
        $self->PutE($tableName => $id, $fieldName => $link);
        $matches++;
    }
    # Trace the result only when tracing is active and something was found.
    if (T(ERDBLoadGroup => 4) && $matches) {
        Trace("$matches special attributes found for $id and table $tableName.");
    }
    # The caller gets the number of matching rows.
    return $matches;
}
582 : | |||
583 : | parrello | 1.9 | |
584 : | parrello | 1.1 | 1; |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |