Parent Directory
|
Revision Log
Revision 1.6 - (view) (download) (as text)
1 : | parrello | 1.1 | #!/usr/bin/perl -w |
2 : | |||
3 : | # | ||
4 : | # Copyright (c) 2003-2006 University of Chicago and Fellowship | ||
5 : | # for Interpretations of Genomes. All Rights Reserved. | ||
6 : | # | ||
7 : | # This file is part of the SEED Toolkit. | ||
8 : | # | ||
9 : | # The SEED Toolkit is free software. You can redistribute | ||
10 : | # it and/or modify it under the terms of the SEED Toolkit | ||
11 : | # Public License. | ||
12 : | # | ||
13 : | # You should have received a copy of the SEED Toolkit Public License | ||
14 : | # along with this program; if not write to the University of Chicago | ||
15 : | # at info@ci.uchicago.edu or the Fellowship for Interpretation of | ||
16 : | # Genomes at veronika@thefig.info or download a copy from | ||
17 : | # http://www.theseed.org/LICENSE.TXT. | ||
18 : | # | ||
19 : | |||
20 : | package FeatureSproutLoader; | ||
21 : | |||
22 : | use strict; | ||
23 : | use Tracer; | ||
24 : | use ERDB; | ||
25 : | use BioWords; | ||
26 : | use AliasAnalysis; | ||
27 : | parrello | 1.6 | use DBMaster; |
28 : | use HyperLink; | ||
29 : | use FFs; | ||
30 : | parrello | 1.1 | use base 'BaseSproutLoader'; |
31 : | |||
32 : | =head1 Sprout Feature Load Group Class | ||
33 : | |||
34 : | =head2 Introduction | ||
35 : | |||
36 : | The Feature Load Group includes all of the major feature-related tables. | ||
37 : | |||
38 : | =head3 new | ||
39 : | |||
40 : | parrello | 1.3 | my $sl = FeatureSproutLoader->new($erdb, $source, $options, @tables); |
41 : | parrello | 1.1 | |
42 : | parrello | 1.3 | Construct a new FeatureSproutLoader object. |
43 : | parrello | 1.1 | |
44 : | =over 4 | ||
45 : | |||
46 : | =item erdb | ||
47 : | |||
48 : | [[SproutPm]] object for the database being loaded. | ||
49 : | |||
50 : | =item options | ||
51 : | |||
52 : | Reference to a hash of command-line options. | ||
53 : | |||
54 : | =item tables | ||
55 : | |||
56 : | List of tables in this load group. | ||
57 : | |||
58 : | =back | ||
59 : | |||
60 : | =cut | ||
61 : | |||
62 : | sub new { | ||
63 : | # Get the parameters. | ||
64 : | parrello | 1.6 | my ($class, $erdb, $options) = @_; |
65 : | parrello | 1.1 | # Create the table list. |
66 : | my @tables = sort qw(Feature IsLocatedIn FeatureAlias IsAliasOf FeatureLink | ||
67 : | FeatureTranslation FeatureUpstream HasFeature HasRoleInSubsystem | ||
68 : | FeatureEssential FeatureVirulent FeatureIEDB CDD IsPresentOnProteinOf | ||
69 : | parrello | 1.6 | CellLocation IsPossiblePlaceFor IsAlsoFoundIn ExternalDatabase Keyword |
70 : | ProteinFamily IsFamilyForFeature ProteinFamilyName FeatureEC); | ||
71 : | parrello | 1.1 | # Create the BaseSproutLoader object. |
72 : | parrello | 1.6 | my $retVal = BaseSproutLoader::new($class, $erdb, $options, @tables); |
73 : | parrello | 1.2 | # Get the list of relevant attributes. |
74 : | parrello | 1.1 | # Bless and return it. |
75 : | bless $retVal, $class; | ||
76 : | return $retVal; | ||
77 : | } | ||
78 : | |||
79 : | =head2 Public Methods | ||
80 : | |||
81 : | =head3 Generate | ||
82 : | |||
83 : | $sl->Generate(); | ||
84 : | |||
85 : | Generate the data for the feature-related files. | ||
86 : | |||
87 : | =cut | ||
88 : | |||
89 : | sub Generate { | ||
90 : | # Get the parameters. | ||
91 : | my ($self) = @_; | ||
92 : | # Get the sprout object. | ||
93 : | my $sprout = $self->db(); | ||
94 : | # Get the FIG object. | ||
95 : | my $fig = $self->source(); | ||
96 : | # Get the subsystem list. | ||
97 : | my $subHash = $self->GetSubsystems(); | ||
98 : | # Get the word stemmer. | ||
99 : | my $stemmer = $sprout->GetStemmer(); | ||
100 : | parrello | 1.6 | # Get access to FIGfams. |
101 : | my $figfam_data = &FIG::get_figfams_data(); | ||
102 : | my $ffs = new FFs($figfam_data); | ||
103 : | parrello | 1.1 | # Only proceed if this is not the global section. |
104 : | if (! $self->global()) { | ||
105 : | parrello | 1.4 | # Get the section ID. |
106 : | my $genomeID = $self->section(); | ||
107 : | parrello | 1.6 | # Connect to the ontology database. |
108 : | my $sqlite_db = "/home/mkubal/Temp/Ontology/ontology.sqlite"; | ||
109 : | my $ontology_dbmaster = DBMaster->new(-database => $sqlite_db, -backend => 'SQLite'); | ||
110 : | parrello | 1.1 | # Get the maximum sequence size. We need this later for splitting up the |
111 : | # locations. | ||
112 : | my $chunkSize = $sprout->MaxSegment(); | ||
113 : | Trace("Loading features for genome $genomeID.") if T(ERDBLoadGroup => 3); | ||
114 : | # Get the feature list for this genome. | ||
115 : | my $features = $fig->all_features_detailed_fast($genomeID); | ||
116 : | # Sort and count the list. | ||
117 : | my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features}; | ||
118 : | my $count = scalar @featureTuples; | ||
119 : | Trace("$count features found for genome $genomeID.") if T(ERDBLoadGroup => 3); | ||
120 : | # Get the attributes for this genome and put them in a hash by feature ID. | ||
121 : | parrello | 1.6 | my $attributes = $self->GetGenomeAttributes($genomeID, \@featureTuples); |
122 : | parrello | 1.1 | Trace("Looping through features for $genomeID.") if T(ERDBLoadGroup => 3); |
123 : | # Loop through the features. | ||
124 : | for my $featureTuple (@featureTuples) { | ||
125 : | # Split the tuple. | ||
126 : | parrello | 1.4 | my ($featureID, $locations, $aliases, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple}; |
127 : | parrello | 1.1 | # Make sure this feature is active. |
128 : | if (! $fig->is_deleted_fid($featureID)) { | ||
129 : | # Handle missing assignments. | ||
130 : | if (! defined $assignment) { | ||
131 : | $assignment = ''; | ||
132 : | $user = ''; | ||
133 : | } else { | ||
134 : | # The default assignment-maker is FIG. | ||
135 : | $user ||= 'fig'; | ||
136 : | } | ||
137 : | # Count this feature. | ||
138 : | $self->Track(features => $featureID, 1000); | ||
139 : | # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the | ||
140 : | # Sprout database requires a single character. | ||
141 : | if (! defined($quality) || $quality eq "") { | ||
142 : | $quality = " "; | ||
143 : | } | ||
144 : | parrello | 1.6 | # Get the coupling count. The coupled features are returned as a list, |
145 : | # and we store it as a scalar to get the count. | ||
146 : | my $couplingCount = $fig->coupled_to($featureID); | ||
147 : | parrello | 1.1 | # Begin building the keywords. We start with the genome ID, the |
148 : | # feature ID, the taxonomy, and the organism name. | ||
149 : | my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID), | ||
150 : | $fig->taxonomy_of($genomeID)); | ||
151 : | # Create the aliases. | ||
152 : | parrello | 1.4 | for my $alias (split /,/, $aliases) { |
153 : | parrello | 1.1 | # Connect this alias to this feature and make an Alias record for it. |
154 : | parrello | 1.6 | $self->PutR(IsAliasOf => $alias, $featureID); |
155 : | $self->PutE(FeatureAlias => $alias); | ||
156 : | parrello | 1.1 | # Add it to the keyword list. |
157 : | push @keywords, $alias; | ||
158 : | # If this is a locus tag, also add its natural form as a keyword. | ||
159 : | my $naturalName = AliasAnalysis::Type(LocusTag => $alias); | ||
160 : | if ($naturalName) { | ||
161 : | push @keywords, $naturalName; | ||
162 : | } | ||
163 : | } | ||
164 : | # Add the corresponding IDs. We ask for 2-tuples of the form (id, database). | ||
165 : | my @corresponders = $fig->get_corresponding_ids($featureID, 1); | ||
166 : | for my $tuple (@corresponders) { | ||
167 : | my ($id, $xdb) = @{$tuple}; | ||
168 : | parrello | 1.6 | # Ignore SEED: that's us. Also ignore contig IDs. Those are due to a bug. |
169 : | if ($xdb ne 'SEED' && ! ($xdb eq 'RefSeq' && $id =~ /^[A-Z][A-Z]_\d+$/)) { | ||
170 : | parrello | 1.1 | # Connect this ID to the feature and mark its database. |
171 : | parrello | 1.6 | $self->PutR(IsAlsoFoundIn => $featureID, $xdb, |
172 : | parrello | 1.1 | alias => $id); |
173 : | parrello | 1.6 | $self->PutE(ExternalDatabase => $xdb); |
174 : | # Add it as a keyword and an alias. | ||
175 : | parrello | 1.1 | push @keywords, $id; |
176 : | parrello | 1.6 | $self->PutR(IsAliasOf => $id, $featureID); |
177 : | $self->PutE(FeatureAlias => $id); | ||
178 : | parrello | 1.1 | } |
179 : | } | ||
180 : | Trace("Assignment for $featureID is: $assignment") if T(ERDBLoadGroup => 4); | ||
181 : | # Break the assignment into words and shove it onto the | ||
182 : | # keyword list. | ||
183 : | push @keywords, split(/\s+/, $assignment); | ||
184 : | parrello | 1.5 | # Add any EC numbers. |
185 : | parrello | 1.6 | my @ecs = BioWords::ExtractECs($assignment); |
186 : | for my $ec (@ecs) { | ||
187 : | push @keywords, $ec; | ||
188 : | $self->PutE(FeatureEC => $featureID, ec => $ec); | ||
189 : | } | ||
190 : | parrello | 1.1 | # Link this feature to the parent genome. |
191 : | parrello | 1.6 | $self->PutR(HasFeature => $genomeID, $featureID, |
192 : | parrello | 1.1 | type => $type); |
193 : | # Get the links. | ||
194 : | my @links = $fig->fid_links($featureID); | ||
195 : | for my $link (@links) { | ||
196 : | parrello | 1.6 | $self->PutE(FeatureLink => $featureID, link => $link); |
197 : | parrello | 1.1 | } |
198 : | # If this is a peg, generate the translation and the upstream. | ||
199 : | if ($type eq 'peg') { | ||
200 : | $self->Add(pegIn => 1); | ||
201 : | my $translation = $fig->get_translation($featureID); | ||
202 : | if ($translation) { | ||
203 : | parrello | 1.6 | $self->PutE(FeatureTranslation => $featureID, |
204 : | parrello | 1.1 | translation => $translation); |
205 : | } | ||
206 : | # We use the default upstream values of u=200 and c=100. | ||
207 : | my $upstream = $fig->upstream_of($featureID, 200, 100); | ||
208 : | if ($upstream) { | ||
209 : | parrello | 1.6 | $self->PutE(FeatureUpstream => $featureID, |
210 : | parrello | 1.1 | 'upstream-sequence' => $upstream); |
211 : | } | ||
212 : | } | ||
213 : | # Now we need to find the subsystems this feature participates in. | ||
214 : | my @ssList = $fig->subsystems_for_peg($featureID); | ||
215 : | # This hash prevents us from adding the same subsystem twice. | ||
216 : | my %seen = (); | ||
217 : | for my $ssEntry (@ssList) { | ||
218 : | # Get the subsystem and role. | ||
219 : | my ($subsystem, $role) = @{$ssEntry}; | ||
220 : | # Only proceed if we like this subsystem. | ||
221 : | if (exists $subHash->{$subsystem}) { | ||
222 : | # If this is the first time we've seen this subsystem for | ||
223 : | # this peg, store the has-role link. | ||
224 : | if (! $seen{$subsystem}) { | ||
225 : | parrello | 1.6 | $self->PutR(HasRoleInSubsystem => $featureID, $subsystem, |
226 : | genome => $genomeID, type => $type); | ||
227 : | parrello | 1.5 | # Save the subsystem's keywords. |
228 : | parrello | 1.1 | push @keywords, split /[\s_]+/, $subsystem; |
229 : | } | ||
230 : | parrello | 1.5 | # Now add the role and any embedded EC nubmers to the keyword list. |
231 : | parrello | 1.1 | push @keywords, split /\s+/, $role; |
232 : | parrello | 1.5 | push @keywords, BioWords::ExtractECs($role); |
233 : | parrello | 1.1 | } |
234 : | } | ||
235 : | parrello | 1.5 | # For each hyphenated word, we also need the pieces. |
236 : | my @hyphenated = grep { $_ =~ /-/ } @keywords; | ||
237 : | for my $hyphenated (@hyphenated) { | ||
238 : | # Bust it into pieces. | ||
239 : | my @pieces = grep { length($_) > 2 } split /-/, $hyphenated; | ||
240 : | push @keywords, @pieces; | ||
241 : | } | ||
242 : | parrello | 1.1 | # There are three special attributes computed from property |
243 : | # data that we build next. If the special attribute is non-empty, | ||
244 : | # its name will be added to the keyword list. First, we get all | ||
245 : | # the attributes for this feature. They will come back as | ||
246 : | parrello | 1.6 | # 4-tuples: [peg, name, value, URL]. |
247 : | my @attributes = @{$attributes->{$featureID}}; | ||
248 : | parrello | 1.1 | # Now we process each of the special attributes. |
249 : | if ($self->SpecialAttribute($featureID, \@attributes, | ||
250 : | parrello | 1.6 | 2, [1,3], '^(essential|potential_essential)$', |
251 : | parrello | 1.1 | qw(FeatureEssential essential))) { |
252 : | push @keywords, 'essential'; | ||
253 : | $self->Add(essential => 1); | ||
254 : | } | ||
255 : | if ($self->SpecialAttribute($featureID, \@attributes, | ||
256 : | parrello | 1.6 | 1, [2,3], '^virulen', |
257 : | parrello | 1.1 | qw(FeatureVirulent virulent))) { |
258 : | push @keywords, 'virulent'; | ||
259 : | $self->Add(virulent => 1); | ||
260 : | } | ||
261 : | if ($self->SpecialAttribute($featureID, \@attributes, | ||
262 : | parrello | 1.6 | 1, [2,3], '^iedb_', |
263 : | parrello | 1.1 | qw(FeatureIEDB iedb))) { |
264 : | push @keywords, 'iedb'; | ||
265 : | $self->Add(iedb => 1); | ||
266 : | } | ||
267 : | # Now we have some other attributes we need to process. To get | ||
268 : | # through them, we convert the attribute list for this feature | ||
269 : | # into a two-layer hash: key => subkey => value. | ||
270 : | my %attributeHash = (); | ||
271 : | for my $attrRow (@{$attributes->{$featureID}}) { | ||
272 : | my (undef, $key, @values) = @{$attrRow}; | ||
273 : | my ($realKey, $subKey); | ||
274 : | if ($key =~ /^([^:]+)::(.+)/) { | ||
275 : | ($realKey, $subKey) = ($1, $2); | ||
276 : | } else { | ||
277 : | ($realKey, $subKey) = ($key, ""); | ||
278 : | } | ||
279 : | if (exists $attributeHash{$realKey}) { | ||
280 : | $attributeHash{$realKey}->{$subKey} = \@values; | ||
281 : | } else { | ||
282 : | $attributeHash{$realKey} = {$subKey => \@values}; | ||
283 : | } | ||
284 : | } | ||
285 : | parrello | 1.6 | TraceDump(AttributeHash => \%attributeHash) if T(FeatureLoadGroup => 4); |
286 : | parrello | 1.1 | # First we handle CDD. This is a bit complicated, because |
287 : | # there are multiple CDDs per protein. | ||
288 : | if (exists $attributeHash{CDD}) { | ||
289 : | # Get the hash of CDD IDs to scores for this feature. We | ||
290 : | # already know it exists because of the above IF. | ||
291 : | my $cddHash = $attributeHash{CDD}; | ||
292 : | parrello | 1.6 | my @cddData = sort keys %$cddHash; |
293 : | parrello | 1.1 | for my $cdd (@cddData) { |
294 : | # Extract the score for this CDD and decode it. | ||
295 : | my ($codeScore) = split(/\s*[,;]\s*/, $cddHash->{$cdd}->[0]); | ||
296 : | my $realScore = FIGRules::DecodeScore($codeScore); | ||
297 : | # We can't afford to crash because of a bad attribute | ||
298 : | # value, hence the IF below. | ||
299 : | if (! defined($realScore)) { | ||
300 : | # Bad score, so count it. | ||
301 : | $self->Add(badCDDscore => 1); | ||
302 : | Trace("CDD score \"$codeScore\" for feature $featureID invalid.") if T(ERDBLoadGroup => 3); | ||
303 : | } else { | ||
304 : | # Create the connection and a CDD record. | ||
305 : | parrello | 1.6 | $self->PutR(IsPresentOnProteinOf => $cdd, $featureID, |
306 : | score => $realScore); | ||
307 : | $self->PutE(CDD => $cdd); | ||
308 : | } | ||
309 : | } | ||
310 : | } | ||
311 : | # A similar situation exists for protein families. | ||
312 : | if (exists $attributeHash{PFAM}) { | ||
313 : | # Get the hash of PFAMs to scores for this feature. | ||
314 : | my $pfamHash = $attributeHash{PFAM}; | ||
315 : | for my $pfam (sort keys %$pfamHash) { | ||
316 : | # Extract the range. | ||
317 : | my $codeScore = $pfamHash->{$pfam}->[0]; | ||
318 : | $codeScore =~ /;(.+)/; | ||
319 : | my $range = $1; | ||
320 : | # Strip off the PFAM id from the source. | ||
321 : | my ($pfamID) = split /_/, $pfam, 2; | ||
322 : | # Emit the ProteinFamily record. | ||
323 : | $self->PutE(ProteinFamily => $pfamID); | ||
324 : | # Connect it to the feature. | ||
325 : | $self->PutR(IsFamilyForFeature => $pfamID, $featureID, | ||
326 : | range => $range); | ||
327 : | # Get its name from the ontology database. There can | ||
328 : | # be at most one. | ||
329 : | my $dt_objs = | ||
330 : | $ontology_dbmaster->pfam->get_objects({id => $pfamID}); | ||
331 : | if (defined $dt_objs->[0]) { | ||
332 : | $self->PutE(ProteinFamilyName => $pfamID, | ||
333 : | common_name => $dt_objs->[0]->term()); | ||
334 : | parrello | 1.1 | } |
335 : | } | ||
336 : | } | ||
337 : | # Next we do PSORT cell locations. here the confidence value | ||
338 : | # could have the value "unknown", which we translate to -1. | ||
339 : | if (exists $attributeHash{PSORT}) { | ||
340 : | # This will be a hash of cell locations to confidence | ||
341 : | # factors. | ||
342 : | my $psortHash = $attributeHash{PSORT}; | ||
343 : | for my $psort (keys %{$psortHash}) { | ||
344 : | # Get the confidence, and convert it to a number if necessary. | ||
345 : | parrello | 1.6 | my $confidence = $psortHash->{$psort}->[0]; |
346 : | parrello | 1.1 | if ($confidence eq 'unknown') { |
347 : | $confidence = -1; | ||
348 : | } | ||
349 : | parrello | 1.6 | $self->PutR(IsPossiblePlaceFor => $psort, $featureID, |
350 : | confidence => $confidence); | ||
351 : | $self->PutE(CellLocation => $psort); | ||
352 : | parrello | 1.1 | # If this is a significant location, add it as a keyword. |
353 : | if ($confidence > 2.5) { | ||
354 : | parrello | 1.6 | # Before we add it as a keyword, we convert it from |
355 : | # capital-case to hyphenated by inserting hyphens at | ||
356 : | # case transition points. | ||
357 : | $psort =~ s/([a-z])([A-Z])/$1-$2/g; | ||
358 : | parrello | 1.1 | push @keywords, $psort; |
359 : | } | ||
360 : | } | ||
361 : | } | ||
362 : | # Phobius data is next. This consists of the signal peptide location and | ||
363 : | # the transmembrane locations. | ||
364 : | my $signalList = ""; | ||
365 : | my $transList = ""; | ||
366 : | parrello | 1.6 | my $transCount = 0; |
367 : | parrello | 1.1 | if (exists $attributeHash{Phobius}) { |
368 : | # This will be a hash of two keys (transmembrane and signal) to | ||
369 : | parrello | 1.6 | # location lists. GetCommaList converts them into comma-separated |
370 : | # location strings. If there's no value, it returns an empty string. | ||
371 : | parrello | 1.1 | $signalList = $self->GetCommaList($attributeHash{Phobius}->{signal}); |
372 : | parrello | 1.6 | my $transList = $attributeHash{Phobius}->{transmembrane}; |
373 : | my @transMap = split /\s*,\s*/, $transList; | ||
374 : | $transCount = (defined $transList ? scalar(@transMap) : 0); | ||
375 : | parrello | 1.1 | } |
376 : | # Here are some more numbers: isoelectric point, molecular weight, and | ||
377 : | # the similar-to-human flag. | ||
378 : | my $isoelectric = 0; | ||
379 : | if (exists $attributeHash{isoelectric_point}) { | ||
380 : | parrello | 1.6 | $isoelectric = $attributeHash{isoelectric_point}->{""}->[0]; |
381 : | parrello | 1.1 | } |
382 : | my $similarToHuman = 0; | ||
383 : | parrello | 1.6 | if (exists $attributeHash{similar_to_human} && $attributeHash{similar_to_human}->{""}->[0] eq 'yes') { |
384 : | parrello | 1.1 | $similarToHuman = 1; |
385 : | } | ||
386 : | my $molecularWeight = 0; | ||
387 : | if (exists $attributeHash{molecular_weight}) { | ||
388 : | parrello | 1.6 | $molecularWeight = $attributeHash{molecular_weight}->{""}->[0]; |
389 : | parrello | 1.1 | } |
390 : | # Join the keyword string. | ||
391 : | my $keywordString = join(" ", @keywords); | ||
392 : | # Get rid of annoying punctuation. | ||
393 : | parrello | 1.5 | $keywordString =~ s/[();@#\/,]/ /g; |
394 : | parrello | 1.6 | # Get the list of keywords in the keyword string, minus the delimiters. |
395 : | my @realKeywords = grep { $stemmer->IsWord($_) } | ||
396 : | $stemmer->Split($keywordString); | ||
397 : | parrello | 1.1 | # We need to do two things here: create the keyword string for the feature table |
398 : | # and write records to the keyword table for the keywords. | ||
399 : | my (%keys, %stems, @realStems); | ||
400 : | for my $keyword (@realKeywords) { | ||
401 : | # Compute the stem and phonex for this keyword. | ||
402 : | my ($stem, $phonex) = $stemmer->StemLookup($keyword); | ||
403 : | # Only proceed if a stem comes back. If no stem came back, it's a | ||
404 : | # stop word and we throw it away. | ||
405 : | if ($stem) { | ||
406 : | $keys{$keyword} = $stem; | ||
407 : | $stems{$stem} = $phonex; | ||
408 : | push @realStems, $stem; | ||
409 : | } | ||
410 : | } | ||
411 : | # Now create the keyword string. | ||
412 : | my $cleanWords = join(" ", @realStems); | ||
413 : | Trace("Keyword string for $featureID: $cleanWords") if T(ERDBLoadGroup => 4); | ||
414 : | # Create keyword table entries for the keywords found. | ||
415 : | for my $key (keys %keys) { | ||
416 : | my $stem = $keys{$key}; | ||
417 : | parrello | 1.6 | $self->PutE(Keyword => $key, stem => $stem, phonex => $stems{$stem}); |
418 : | parrello | 1.1 | } |
419 : | # Now we need to process the feature's locations. First, we split them up. | ||
420 : | my @locationList = split /\s*,\s*/, $locations; | ||
421 : | # Next, we convert them to Sprout location objects. | ||
422 : | my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList; | ||
423 : | # Assemble them into a sprout location string for later. | ||
424 : | my $locationString = join(", ", map { $_->String } @locObjectList); | ||
425 : | # We'll store the sequence length in here. | ||
426 : | my $sequenceLength = 0; | ||
427 : | # This part is the roughest. We need to relate the features to contig | ||
428 : | # locations, and the locations must be split so that none of them exceed | ||
429 : | # the maximum segment size. This simplifies the genes_in_region processing | ||
430 : | # for Sprout. To start, we create the location position indicator. | ||
431 : | my $i = 1; | ||
432 : | # Loop through the locations. | ||
433 : | for my $locObject (@locObjectList) { | ||
434 : | # Record the length. | ||
435 : | $sequenceLength += $locObject->Length; | ||
436 : | # Split this location into a list of chunks. | ||
437 : | my @locOList = (); | ||
438 : | while (my $peeling = $locObject->Peel($chunkSize)) { | ||
439 : | $self->Add(peeling => 1); | ||
440 : | push @locOList, $peeling; | ||
441 : | } | ||
442 : | push @locOList, $locObject; | ||
443 : | # Loop through the chunks, creating IsLocatedIn records. The variable | ||
444 : | # "$i" will be used to keep the location index. | ||
445 : | for my $locChunk (@locOList) { | ||
446 : | parrello | 1.6 | $self->PutR(IsLocatedIn => $featureID, $locChunk->Contig, |
447 : | beg => $locChunk->Left, dir => $locChunk->Dir, | ||
448 : | len => $locChunk->Length, locN => $i); | ||
449 : | parrello | 1.1 | $i++; |
450 : | } | ||
451 : | } | ||
452 : | parrello | 1.6 | # Check for figfams. In case we find any, we need the range. |
453 : | # It's the whole sequence. | ||
454 : | my $range = "1-$sequenceLength"; | ||
455 : | # Ask for the figfams. | ||
456 : | my @fams = $ffs->families_containing_peg($featureID); | ||
457 : | # Connect them to the feature (if any). | ||
458 : | for my $fam (@fams) { | ||
459 : | $self->PutE(ProteinFamily => $fam); | ||
460 : | $self->PutR(IsFamilyForFeature => $fam, $featureID, | ||
461 : | range => $range); | ||
462 : | } | ||
463 : | parrello | 1.1 | # Now we get some ancillary flags. |
464 : | my $locked = $fig->is_locked_fid($featureID); | ||
465 : | my $in_genbank = $fig->peg_in_gendb($featureID); | ||
466 : | # Create the feature record. | ||
467 : | parrello | 1.6 | $self->PutE(Feature => $featureID, 'assignment-maker' => $user, |
468 : | parrello | 1.1 | 'assignment-quality' => $quality, 'feature-type' => $type, |
469 : | 'in-genbank' => $in_genbank, 'isoelectric-point' => $isoelectric, | ||
470 : | locked => $locked, 'molecular-weight' => $molecularWeight, | ||
471 : | 'sequence-length' => $sequenceLength, | ||
472 : | 'signal-peptide' => $signalList, 'similar-to-human' => $similarToHuman, | ||
473 : | assignment => $assignment, keywords => $cleanWords, | ||
474 : | 'location-string' => $locationString, | ||
475 : | parrello | 1.6 | 'transmembrane-map' => $transList, |
476 : | 'conserved-neighbors' => $couplingCount, | ||
477 : | 'transmembrane-domain-count' => $transCount); | ||
478 : | parrello | 1.1 | } |
479 : | } | ||
480 : | } | ||
481 : | } | ||
482 : | |||
483 : | |||
484 : | =head3 SpecialAttribute | ||
485 : | |||
486 : | my $count = $sl->SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $tableName, $field); | ||
487 : | |||
488 : | Look for special attributes of a given type. A special attribute is found by comparing one of | ||
489 : | the columns of the incoming attribute list to a search pattern. If a match is found, then | ||
490 : | a set of columns is put into an output table connected to the specified ID. | ||
491 : | |||
492 : | For example, when processing features, the attribute list we look at has three columns: attribute | ||
493 : | name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name | ||
494 : | begins with C<iedb_>. The call signature is therefore | ||
495 : | |||
496 : | my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', 'FeatureIEDB', 'iedb'); | ||
497 : | |||
498 : | The pattern is matched against column 0, and if we have a match, then column 2's value is put | ||
499 : | to the output along with the specified feature ID. | ||
500 : | |||
501 : | =over 4 | ||
502 : | |||
503 : | =item id | ||
504 : | |||
505 : | ID of the object whose special attributes are being loaded. This forms the first column of the | ||
506 : | output. | ||
507 : | |||
508 : | =item attributes | ||
509 : | |||
510 : | Reference to a list of tuples. | ||
511 : | |||
512 : | =item idxMatch | ||
513 : | |||
514 : | Index in each tuple of the column to be matched against the pattern. If the match is | ||
515 : | successful, an output record will be generated. | ||
516 : | |||
517 : | =item idxValues | ||
518 : | |||
519 : | parrello | 1.6 | Reference to a list containing the indexes of the value and URL to put in the |
520 : | second column of the output. | ||
521 : | parrello | 1.1 | |
522 : | =item pattern | ||
523 : | |||
524 : | Pattern to be matched against the specified column. The match will be case-insensitive. | ||
525 : | |||
526 : | =item tableName | ||
527 : | |||
528 : | Name of the table to contain the attribute values found. | ||
529 : | |||
530 : | =item fieldName | ||
531 : | |||
532 : | Name of the field to contain the attribute values in the output table. | ||
533 : | |||
534 : | =item RETURN | ||
535 : | |||
536 : | Returns a count of the matches found. | ||
537 : | |||
538 : | =item | ||
539 : | |||
540 : | =back | ||
541 : | |||
542 : | =cut | ||
543 : | |||
544 : | sub SpecialAttribute { | ||
545 : | # Get the parameters. | ||
546 : | my ($self, $id, $attributes, $idxMatch, $idxValues, $pattern, $tableName, $fieldName) = @_; | ||
547 : | # Declare the return variable. | ||
548 : | my $retVal = 0; | ||
549 : | # Loop through the attribute rows. | ||
550 : | for my $row (@{$attributes}) { | ||
551 : | # Check for a match. | ||
552 : | if ($row->[$idxMatch] =~ m/$pattern/i) { | ||
553 : | parrello | 1.6 | # We have a match, so output a row. |
554 : | my $value = HyperLink->new(map { $row->[$_] } @$idxValues); | ||
555 : | $self->PutE($tableName => $id, $fieldName => $value); | ||
556 : | parrello | 1.1 | $retVal++; |
557 : | } | ||
558 : | } | ||
559 : | Trace("$retVal special attributes found for $id and table $tableName.") if T(ERDBLoadGroup => 4) && $retVal; | ||
560 : | # Return the number of matches. | ||
561 : | return $retVal; | ||
562 : | } | ||
563 : | |||
564 : | 1; |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |