[Bio] / Sprout / FeatureSproutLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/FeatureSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package FeatureSproutLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use BioWords;
26 :     use AliasAnalysis;
27 : parrello 1.6 use DBMaster;
28 :     use HyperLink;
29 :     use FFs;
30 : parrello 1.8 use SOAP::Lite;
31 : parrello 1.1 use base 'BaseSproutLoader';
32 :    
33 :     =head1 Sprout Feature Load Group Class
34 :    
35 :     =head2 Introduction
36 :    
37 :     The Feature Load Group includes all of the major feature-related tables.
38 :    
39 :     =head3 new
40 :    
41 : parrello 1.3 my $sl = FeatureSproutLoader->new($erdb, $source, $options, @tables);
42 : parrello 1.1
43 : parrello 1.3 Construct a new FeatureSproutLoader object.
44 : parrello 1.1
45 :     =over 4
46 :    
47 :     =item erdb
48 :    
49 :     [[SproutPm]] object for the database being loaded.
50 :    
51 :     =item options
52 :    
53 :     Reference to a hash of command-line options.
54 :    
55 :     =item tables
56 :    
57 :     List of tables in this load group.
58 :    
59 :     =back
60 :    
61 :     =cut
62 :    
63 :     sub new {
64 :     # Get the parameters.
65 : parrello 1.6 my ($class, $erdb, $options) = @_;
66 : parrello 1.1 # Create the table list.
67 :     my @tables = sort qw(Feature IsLocatedIn FeatureAlias IsAliasOf FeatureLink
68 :     FeatureTranslation FeatureUpstream HasFeature HasRoleInSubsystem
69 :     FeatureEssential FeatureVirulent FeatureIEDB CDD IsPresentOnProteinOf
70 : parrello 1.6 CellLocation IsPossiblePlaceFor IsAlsoFoundIn ExternalDatabase Keyword
71 :     ProteinFamily IsFamilyForFeature ProteinFamilyName FeatureEC);
72 : parrello 1.1 # Create the BaseSproutLoader object.
73 : parrello 1.6 my $retVal = BaseSproutLoader::new($class, $erdb, $options, @tables);
74 : parrello 1.2 # Get the list of relevant attributes.
75 : parrello 1.1 # Bless and return it.
76 :     bless $retVal, $class;
77 :     return $retVal;
78 :     }
79 :    
80 :     =head2 Public Methods
81 :    
82 :     =head3 Generate
83 :    
84 :     $sl->Generate();
85 :    
86 :     Generate the data for the feature-related files.
87 :    
88 :     =cut
89 :    
90 :     sub Generate {
91 :     # Get the parameters.
92 :     my ($self) = @_;
93 :     # Get the sprout object.
94 :     my $sprout = $self->db();
95 :     # Get the FIG object.
96 :     my $fig = $self->source();
97 :     # Get the subsystem list.
98 :     my $subHash = $self->GetSubsystems();
99 :     # Get the word stemmer.
100 :     my $stemmer = $sprout->GetStemmer();
101 : parrello 1.6 # Get access to FIGfams.
102 :     my $figfam_data = &FIG::get_figfams_data();
103 :     my $ffs = new FFs($figfam_data);
104 : parrello 1.1 # Only proceed if this is not the global section.
105 :     if (! $self->global()) {
106 : parrello 1.4 # Get the section ID.
107 :     my $genomeID = $self->section();
108 : parrello 1.6 # Connect to the ontology database.
109 :     my $sqlite_db = "/home/mkubal/Temp/Ontology/ontology.sqlite";
110 :     my $ontology_dbmaster = DBMaster->new(-database => $sqlite_db, -backend => 'SQLite');
111 : parrello 1.1 # Get the maximum sequence size. We need this later for splitting up the
112 :     # locations.
113 :     my $chunkSize = $sprout->MaxSegment();
114 :     Trace("Loading features for genome $genomeID.") if T(ERDBLoadGroup => 3);
115 :     # Get the feature list for this genome.
116 :     my $features = $fig->all_features_detailed_fast($genomeID);
117 :     # Sort and count the list.
118 :     my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
119 :     my $count = scalar @featureTuples;
120 :     Trace("$count features found for genome $genomeID.") if T(ERDBLoadGroup => 3);
121 :     # Get the attributes for this genome and put them in a hash by feature ID.
122 : parrello 1.6 my $attributes = $self->GetGenomeAttributes($genomeID, \@featureTuples);
123 : parrello 1.1 Trace("Looping through features for $genomeID.") if T(ERDBLoadGroup => 3);
124 :     # Loop through the features.
125 :     for my $featureTuple (@featureTuples) {
126 :     # Split the tuple.
127 : parrello 1.4 my ($featureID, $locations, $aliases, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
128 : parrello 1.1 # Make sure this feature is active.
129 :     if (! $fig->is_deleted_fid($featureID)) {
130 :     # Handle missing assignments.
131 :     if (! defined $assignment) {
132 :     $assignment = '';
133 :     $user = '';
134 :     } else {
135 :     # The default assignment-maker is FIG.
136 :     $user ||= 'fig';
137 :     }
138 :     # Count this feature.
139 :     $self->Track(features => $featureID, 1000);
140 :     # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
141 :     # Sprout database requires a single character.
142 :     if (! defined($quality) || $quality eq "") {
143 :     $quality = " ";
144 :     }
145 : parrello 1.6 # Get the coupling count. The coupled features are returned as a list,
146 :     # and we store it as a scalar to get the count.
147 :     my $couplingCount = $fig->coupled_to($featureID);
148 : parrello 1.1 # Begin building the keywords. We start with the genome ID, the
149 :     # feature ID, the taxonomy, and the organism name.
150 :     my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
151 :     $fig->taxonomy_of($genomeID));
152 : parrello 1.7 # We need to insure we don't put multiple copies of the same alias
153 :     # in the keyword list, so we'll put all aliases found in this hash.
154 :     my %aliasHash;
155 : parrello 1.8 # We need a list of all the aliases. These come from two sources: one
156 :     # from the annotation clearinghouse, and one from the detailed features
157 :     # list.
158 :     my @rawAliasList = split /,/, $aliases;
159 :     # Only PEGs appear in the clearinghouse.
160 :     if ($type eq 'peg') {
161 :     push @rawAliasList, AliasAnalysis::QueryACLH($fig, $featureID);
162 :     }
163 : parrello 1.7 # Loop through the aliases, recording the normal and natural forms/
164 : parrello 1.8 for my $alias (@rawAliasList) {
165 : parrello 1.7 # Save the alias.
166 :     $aliasHash{$alias} = 1;
167 :     # Check for a natural form.
168 :     my $natural = AliasAnalysis::Format(natural => $alias);
169 :     if (defined $natural) {
170 :     # A natural form was found, so we add it to the alias
171 :     # list.
172 :     $aliasHash{$natural} = 1;
173 :     } elsif ($alias =~ /^[A-Z]{3}\d+$/) {
174 :     # Here it's not a recognized type, but it's probably a
175 :     # locus tag, so we create a prefixed version.
176 :     my $normalized = AliasAnalysis::Normalize(LocusTag => $alias);
177 :     $aliasHash{$normalized} = 1;
178 : parrello 1.1 }
179 :     }
180 :     # Add the corresponding IDs. We ask for 2-tuples of the form (id, database).
181 :     my @corresponders = $fig->get_corresponding_ids($featureID, 1);
182 :     for my $tuple (@corresponders) {
183 :     my ($id, $xdb) = @{$tuple};
184 : parrello 1.7 # Ignore SEED: that's us. Also ignore contig IDs. Those result from a bug
185 :     # at PIR.
186 : parrello 1.6 if ($xdb ne 'SEED' && ! ($xdb eq 'RefSeq' && $id =~ /^[A-Z][A-Z]_\d+$/)) {
187 : parrello 1.1 # Connect this ID to the feature and mark its database.
188 : parrello 1.6 $self->PutR(IsAlsoFoundIn => $featureID, $xdb,
189 : parrello 1.1 alias => $id);
190 : parrello 1.6 $self->PutE(ExternalDatabase => $xdb);
191 : parrello 1.7 # Compute the ID's normalized form.
192 :     my $normalized = AliasAnalysis::Normalize($xdb => $id);
193 :     # Add both to the alias hash.
194 :     $aliasHash{$id} = 1;
195 :     $aliasHash{$normalized} = 1;
196 : parrello 1.1 }
197 :     }
198 : parrello 1.7 # Create the aliases.
199 :     for my $alias (sort keys %aliasHash) {
200 :     # Connect this alias to this feature and make an Alias record for it.
201 :     $self->PutR(IsAliasOf => $alias, $featureID);
202 :     $self->PutE(FeatureAlias => $alias);
203 :     # Add it to the keyword list.
204 :     push @keywords, $alias;
205 :     }
206 : parrello 1.1 Trace("Assignment for $featureID is: $assignment") if T(ERDBLoadGroup => 4);
207 :     # Break the assignment into words and shove it onto the
208 :     # keyword list.
209 :     push @keywords, split(/\s+/, $assignment);
210 : parrello 1.5 # Add any EC numbers.
211 : parrello 1.6 my @ecs = BioWords::ExtractECs($assignment);
212 :     for my $ec (@ecs) {
213 :     push @keywords, $ec;
214 :     $self->PutE(FeatureEC => $featureID, ec => $ec);
215 :     }
216 : parrello 1.1 # Link this feature to the parent genome.
217 : parrello 1.6 $self->PutR(HasFeature => $genomeID, $featureID,
218 : parrello 1.1 type => $type);
219 :     # Get the links.
220 :     my @links = $fig->fid_links($featureID);
221 :     for my $link (@links) {
222 : parrello 1.6 $self->PutE(FeatureLink => $featureID, link => $link);
223 : parrello 1.1 }
224 :     # If this is a peg, generate the translation and the upstream.
225 :     if ($type eq 'peg') {
226 :     $self->Add(pegIn => 1);
227 :     my $translation = $fig->get_translation($featureID);
228 :     if ($translation) {
229 : parrello 1.6 $self->PutE(FeatureTranslation => $featureID,
230 : parrello 1.1 translation => $translation);
231 :     }
232 :     # We use the default upstream values of u=200 and c=100.
233 :     my $upstream = $fig->upstream_of($featureID, 200, 100);
234 :     if ($upstream) {
235 : parrello 1.6 $self->PutE(FeatureUpstream => $featureID,
236 : parrello 1.1 'upstream-sequence' => $upstream);
237 :     }
238 :     }
239 :     # Now we need to find the subsystems this feature participates in.
240 :     my @ssList = $fig->subsystems_for_peg($featureID);
241 :     # This hash prevents us from adding the same subsystem twice.
242 :     my %seen = ();
243 :     for my $ssEntry (@ssList) {
244 :     # Get the subsystem and role.
245 :     my ($subsystem, $role) = @{$ssEntry};
246 :     # Only proceed if we like this subsystem.
247 :     if (exists $subHash->{$subsystem}) {
248 :     # If this is the first time we've seen this subsystem for
249 :     # this peg, store the has-role link.
250 :     if (! $seen{$subsystem}) {
251 : parrello 1.6 $self->PutR(HasRoleInSubsystem => $featureID, $subsystem,
252 :     genome => $genomeID, type => $type);
253 : parrello 1.5 # Save the subsystem's keywords.
254 : parrello 1.1 push @keywords, split /[\s_]+/, $subsystem;
255 :     }
256 : parrello 1.5 # Now add the role and any embedded EC nubmers to the keyword list.
257 : parrello 1.1 push @keywords, split /\s+/, $role;
258 : parrello 1.5 push @keywords, BioWords::ExtractECs($role);
259 : parrello 1.1 }
260 :     }
261 : parrello 1.5 # For each hyphenated word, we also need the pieces.
262 :     my @hyphenated = grep { $_ =~ /-/ } @keywords;
263 :     for my $hyphenated (@hyphenated) {
264 :     # Bust it into pieces.
265 :     my @pieces = grep { length($_) > 2 } split /-/, $hyphenated;
266 :     push @keywords, @pieces;
267 :     }
268 : parrello 1.1 # There are three special attributes computed from property
269 :     # data that we build next. If the special attribute is non-empty,
270 :     # its name will be added to the keyword list. First, we get all
271 :     # the attributes for this feature. They will come back as
272 : parrello 1.6 # 4-tuples: [peg, name, value, URL].
273 :     my @attributes = @{$attributes->{$featureID}};
274 : parrello 1.1 # Now we process each of the special attributes.
275 :     if ($self->SpecialAttribute($featureID, \@attributes,
276 : parrello 1.6 2, [1,3], '^(essential|potential_essential)$',
277 : parrello 1.1 qw(FeatureEssential essential))) {
278 :     push @keywords, 'essential';
279 :     $self->Add(essential => 1);
280 :     }
281 :     if ($self->SpecialAttribute($featureID, \@attributes,
282 : parrello 1.6 1, [2,3], '^virulen',
283 : parrello 1.1 qw(FeatureVirulent virulent))) {
284 :     push @keywords, 'virulent';
285 :     $self->Add(virulent => 1);
286 :     }
287 :     if ($self->SpecialAttribute($featureID, \@attributes,
288 : parrello 1.6 1, [2,3], '^iedb_',
289 : parrello 1.1 qw(FeatureIEDB iedb))) {
290 :     push @keywords, 'iedb';
291 :     $self->Add(iedb => 1);
292 :     }
293 :     # Now we have some other attributes we need to process. To get
294 :     # through them, we convert the attribute list for this feature
295 :     # into a two-layer hash: key => subkey => value.
296 :     my %attributeHash = ();
297 :     for my $attrRow (@{$attributes->{$featureID}}) {
298 :     my (undef, $key, @values) = @{$attrRow};
299 :     my ($realKey, $subKey);
300 :     if ($key =~ /^([^:]+)::(.+)/) {
301 :     ($realKey, $subKey) = ($1, $2);
302 :     } else {
303 :     ($realKey, $subKey) = ($key, "");
304 :     }
305 :     if (exists $attributeHash{$realKey}) {
306 :     $attributeHash{$realKey}->{$subKey} = \@values;
307 :     } else {
308 :     $attributeHash{$realKey} = {$subKey => \@values};
309 :     }
310 :     }
311 : parrello 1.6 TraceDump(AttributeHash => \%attributeHash) if T(FeatureLoadGroup => 4);
312 : parrello 1.1 # First we handle CDD. This is a bit complicated, because
313 :     # there are multiple CDDs per protein.
314 :     if (exists $attributeHash{CDD}) {
315 :     # Get the hash of CDD IDs to scores for this feature. We
316 :     # already know it exists because of the above IF.
317 :     my $cddHash = $attributeHash{CDD};
318 : parrello 1.6 my @cddData = sort keys %$cddHash;
319 : parrello 1.1 for my $cdd (@cddData) {
320 :     # Extract the score for this CDD and decode it.
321 :     my ($codeScore) = split(/\s*[,;]\s*/, $cddHash->{$cdd}->[0]);
322 :     my $realScore = FIGRules::DecodeScore($codeScore);
323 :     # We can't afford to crash because of a bad attribute
324 :     # value, hence the IF below.
325 :     if (! defined($realScore)) {
326 :     # Bad score, so count it.
327 :     $self->Add(badCDDscore => 1);
328 :     Trace("CDD score \"$codeScore\" for feature $featureID invalid.") if T(ERDBLoadGroup => 3);
329 :     } else {
330 :     # Create the connection and a CDD record.
331 : parrello 1.6 $self->PutR(IsPresentOnProteinOf => $cdd, $featureID,
332 :     score => $realScore);
333 :     $self->PutE(CDD => $cdd);
334 :     }
335 :     }
336 :     }
337 :     # A similar situation exists for protein families.
338 :     if (exists $attributeHash{PFAM}) {
339 :     # Get the hash of PFAMs to scores for this feature.
340 :     my $pfamHash = $attributeHash{PFAM};
341 :     for my $pfam (sort keys %$pfamHash) {
342 :     # Extract the range.
343 :     my $codeScore = $pfamHash->{$pfam}->[0];
344 :     $codeScore =~ /;(.+)/;
345 :     my $range = $1;
346 :     # Strip off the PFAM id from the source.
347 :     my ($pfamID) = split /_/, $pfam, 2;
348 :     # Emit the ProteinFamily record.
349 :     $self->PutE(ProteinFamily => $pfamID);
350 :     # Connect it to the feature.
351 :     $self->PutR(IsFamilyForFeature => $pfamID, $featureID,
352 :     range => $range);
353 :     # Get its name from the ontology database. There can
354 :     # be at most one.
355 :     my $dt_objs =
356 :     $ontology_dbmaster->pfam->get_objects({id => $pfamID});
357 :     if (defined $dt_objs->[0]) {
358 :     $self->PutE(ProteinFamilyName => $pfamID,
359 :     common_name => $dt_objs->[0]->term());
360 : parrello 1.1 }
361 :     }
362 :     }
363 :     # Next we do PSORT cell locations. here the confidence value
364 :     # could have the value "unknown", which we translate to -1.
365 :     if (exists $attributeHash{PSORT}) {
366 :     # This will be a hash of cell locations to confidence
367 :     # factors.
368 :     my $psortHash = $attributeHash{PSORT};
369 :     for my $psort (keys %{$psortHash}) {
370 :     # Get the confidence, and convert it to a number if necessary.
371 : parrello 1.6 my $confidence = $psortHash->{$psort}->[0];
372 : parrello 1.1 if ($confidence eq 'unknown') {
373 :     $confidence = -1;
374 :     }
375 : parrello 1.6 $self->PutR(IsPossiblePlaceFor => $psort, $featureID,
376 :     confidence => $confidence);
377 :     $self->PutE(CellLocation => $psort);
378 : parrello 1.1 # If this is a significant location, add it as a keyword.
379 :     if ($confidence > 2.5) {
380 : parrello 1.6 # Before we add it as a keyword, we convert it from
381 :     # capital-case to hyphenated by inserting hyphens at
382 :     # case transition points.
383 :     $psort =~ s/([a-z])([A-Z])/$1-$2/g;
384 : parrello 1.1 push @keywords, $psort;
385 :     }
386 :     }
387 :     }
388 :     # Phobius data is next. This consists of the signal peptide location and
389 :     # the transmembrane locations.
390 :     my $signalList = "";
391 :     my $transList = "";
392 : parrello 1.6 my $transCount = 0;
393 : parrello 1.1 if (exists $attributeHash{Phobius}) {
394 :     # This will be a hash of two keys (transmembrane and signal) to
395 : parrello 1.6 # location lists. GetCommaList converts them into comma-separated
396 :     # location strings. If there's no value, it returns an empty string.
397 : parrello 1.1 $signalList = $self->GetCommaList($attributeHash{Phobius}->{signal});
398 : parrello 1.6 my $transList = $attributeHash{Phobius}->{transmembrane};
399 :     my @transMap = split /\s*,\s*/, $transList;
400 :     $transCount = (defined $transList ? scalar(@transMap) : 0);
401 : parrello 1.1 }
402 :     # Here are some more numbers: isoelectric point, molecular weight, and
403 :     # the similar-to-human flag.
404 :     my $isoelectric = 0;
405 :     if (exists $attributeHash{isoelectric_point}) {
406 : parrello 1.6 $isoelectric = $attributeHash{isoelectric_point}->{""}->[0];
407 : parrello 1.1 }
408 :     my $similarToHuman = 0;
409 : parrello 1.6 if (exists $attributeHash{similar_to_human} && $attributeHash{similar_to_human}->{""}->[0] eq 'yes') {
410 : parrello 1.1 $similarToHuman = 1;
411 :     }
412 :     my $molecularWeight = 0;
413 :     if (exists $attributeHash{molecular_weight}) {
414 : parrello 1.6 $molecularWeight = $attributeHash{molecular_weight}->{""}->[0];
415 : parrello 1.1 }
416 :     # Join the keyword string.
417 :     my $keywordString = join(" ", @keywords);
418 :     # Get rid of annoying punctuation.
419 : parrello 1.5 $keywordString =~ s/[();@#\/,]/ /g;
420 : parrello 1.6 # Get the list of keywords in the keyword string, minus the delimiters.
421 :     my @realKeywords = grep { $stemmer->IsWord($_) }
422 :     $stemmer->Split($keywordString);
423 : parrello 1.1 # We need to do two things here: create the keyword string for the feature table
424 :     # and write records to the keyword table for the keywords.
425 :     my (%keys, %stems, @realStems);
426 :     for my $keyword (@realKeywords) {
427 :     # Compute the stem and phonex for this keyword.
428 :     my ($stem, $phonex) = $stemmer->StemLookup($keyword);
429 :     # Only proceed if a stem comes back. If no stem came back, it's a
430 :     # stop word and we throw it away.
431 :     if ($stem) {
432 :     $keys{$keyword} = $stem;
433 :     $stems{$stem} = $phonex;
434 :     push @realStems, $stem;
435 :     }
436 :     }
437 :     # Now create the keyword string.
438 :     my $cleanWords = join(" ", @realStems);
439 :     Trace("Keyword string for $featureID: $cleanWords") if T(ERDBLoadGroup => 4);
440 :     # Create keyword table entries for the keywords found.
441 :     for my $key (keys %keys) {
442 :     my $stem = $keys{$key};
443 : parrello 1.6 $self->PutE(Keyword => $key, stem => $stem, phonex => $stems{$stem});
444 : parrello 1.1 }
445 :     # Now we need to process the feature's locations. First, we split them up.
446 :     my @locationList = split /\s*,\s*/, $locations;
447 :     # Next, we convert them to Sprout location objects.
448 :     my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
449 :     # Assemble them into a sprout location string for later.
450 :     my $locationString = join(", ", map { $_->String } @locObjectList);
451 :     # We'll store the sequence length in here.
452 :     my $sequenceLength = 0;
453 :     # This part is the roughest. We need to relate the features to contig
454 :     # locations, and the locations must be split so that none of them exceed
455 :     # the maximum segment size. This simplifies the genes_in_region processing
456 :     # for Sprout. To start, we create the location position indicator.
457 :     my $i = 1;
458 :     # Loop through the locations.
459 :     for my $locObject (@locObjectList) {
460 :     # Record the length.
461 :     $sequenceLength += $locObject->Length;
462 :     # Split this location into a list of chunks.
463 :     my @locOList = ();
464 :     while (my $peeling = $locObject->Peel($chunkSize)) {
465 :     $self->Add(peeling => 1);
466 :     push @locOList, $peeling;
467 :     }
468 :     push @locOList, $locObject;
469 :     # Loop through the chunks, creating IsLocatedIn records. The variable
470 :     # "$i" will be used to keep the location index.
471 :     for my $locChunk (@locOList) {
472 : parrello 1.6 $self->PutR(IsLocatedIn => $featureID, $locChunk->Contig,
473 :     beg => $locChunk->Left, dir => $locChunk->Dir,
474 :     len => $locChunk->Length, locN => $i);
475 : parrello 1.1 $i++;
476 :     }
477 :     }
478 : parrello 1.6 # Check for figfams. In case we find any, we need the range.
479 :     # It's the whole sequence.
480 :     my $range = "1-$sequenceLength";
481 :     # Ask for the figfams.
482 :     my @fams = $ffs->families_containing_peg($featureID);
483 :     # Connect them to the feature (if any).
484 :     for my $fam (@fams) {
485 :     $self->PutE(ProteinFamily => $fam);
486 :     $self->PutR(IsFamilyForFeature => $fam, $featureID,
487 :     range => $range);
488 :     }
489 : parrello 1.1 # Now we get some ancillary flags.
490 :     my $locked = $fig->is_locked_fid($featureID);
491 :     my $in_genbank = $fig->peg_in_gendb($featureID);
492 :     # Create the feature record.
493 : parrello 1.6 $self->PutE(Feature => $featureID, 'assignment-maker' => $user,
494 : parrello 1.1 'assignment-quality' => $quality, 'feature-type' => $type,
495 :     'in-genbank' => $in_genbank, 'isoelectric-point' => $isoelectric,
496 :     locked => $locked, 'molecular-weight' => $molecularWeight,
497 :     'sequence-length' => $sequenceLength,
498 :     'signal-peptide' => $signalList, 'similar-to-human' => $similarToHuman,
499 :     assignment => $assignment, keywords => $cleanWords,
500 :     'location-string' => $locationString,
501 : parrello 1.6 'transmembrane-map' => $transList,
502 :     'conserved-neighbors' => $couplingCount,
503 :     'transmembrane-domain-count' => $transCount);
504 : parrello 1.1 }
505 :     }
506 :     }
507 :     }
508 :    
509 :    
510 : parrello 1.8 =head3 AliasFix
511 :    
512 :     $sl->AliasFix($orgName, $fidArray);
513 :    
514 :     Ask the %FIG{Annotation Clearinghouse}% for additional aliases of the
515 :     features in the specified array. The array should be a result array from
516 :     the [[FigPm]] method C<all_features_detailed_fast>. The first column of
517 :     the array contains feature IDs. The third column contains a
518 :     comma-delimited list of aliases. This method modifies the third column so
519 :     that it contains any additional aliases available from the Clearinghouse.
520 :    
521 :     =over 4
522 :    
523 :     =item orgName
524 :    
525 :     Organism name for the current genome.
526 :    
527 :     =item fidArray
528 :    
529 :     Reference to a list of feature data. Each feature is represented by an n-tuple
530 :     in the array: the tuple's first element is the feature ID, and the third element
531 :     (which we will be modifying) is a comma-delimited list of aliases.
532 :    
533 :     =back
534 :    
535 :     =cut
536 :    
537 :     sub AliasFix {
538 :     # Get the parameters.
539 :     my ($self, $orgName, $fidArray) = @_;
540 :     # To control the cost of this operation, we process the features in
541 :     # batches. The batch size is controlled by a FIG_Config parameter.
542 :     my $fidCount = scalar @$fidArray;
543 :     my $fidNext;
544 :     for (my $fidIdx = 0; $fidIdx < $fidCount; $fidIdx = $fidNext) {
545 :     # Compute the index of the first feature in the next batch.
546 :     $fidNext = $fidIdx + $FIG_Config::ach_fixup_batch_size;
547 :     $fidNext = $fidCount if $fidNext > $fidCount;
548 :     Trace("Processing ACH feature batch from $fidIdx to $fidNext.") if T(3);
549 :     # The hash below will map the IDs of the features in this batch
550 :     # to the alias strings we want to fix up.
551 :     my %batch;
552 :     for (my $i = $fidIdx; $i < $fidNext; $i++) {
553 :     # Get this feature ID.
554 :     my $fid = $fidArray->[$i][0];
555 :     # Only proceed if it's a PEG.
556 :     if ($fid =~ /peg/) {
557 :     # It is, so put it in the batch hash. The
558 :     # value is a reference to the PEG's alias string.
559 :     $batch{$fid} = \$fidArray->[$i][2];
560 :     }
561 :     }
562 :     # Now we have our batch. Ask the clearinghouse for any data it might
563 :     # have about these FIDs.
564 :     my $resp = SOAP::Lite->uri($FIG_Config::ach_soap)->proxy($FIG_Config::ach_proxy)
565 :     ->get_annotations([keys %batch]);
566 :     Trace("Processing results from clearinghouse.") if T(3);
567 :     # Insure we got a result.
568 :     if (! $resp) {
569 :     Confess("No response from Clearinghouse.");
570 :     } elsif ($resp->fault) {
571 :     Confess("Error requesting alias data from Clearinghouse: " . $resp->faultstring);
572 :     } else {
573 :     # Extract the result. It will be a hash mapping FIDs to tuple arrays.
574 :     my $respHash = $resp->result;
575 :     # Loop through the result hash.
576 :     for my $respFid (keys %$respHash) {
577 :     # Get a reference to the alias string.
578 :     my $pointer = $batch{$respFid};
579 :     # Build a hash of the existing aliases.
580 :     my %aliasHash = map { $_ => 1 } split /,/, $$pointer;
581 :     # Add aliases from the clearinghouse response.
582 :     my $tuples = $respHash->{$respFid};
583 :     my @newAliases = AliasAnalysis::AnalyzeClearinghouseArray($orgName,
584 :     $tuples);
585 :     for my $newAlias (@newAliases) {
586 :     if (! exists $aliasHash{$newAlias}) {
587 :     $aliasHash{$newAlias} = 1;
588 :     $self->Add(ACLHaliases => 1);
589 :     }
590 :     }
591 :     # Save the result.
592 :     $$pointer = join(",", sort keys %aliasHash);
593 :     }
594 :     }
595 :     }
596 :     }
597 :    
598 :    
599 : parrello 1.1 =head3 SpecialAttribute
600 :    
601 :     my $count = $sl->SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $tableName, $field);
602 :    
603 :     Look for special attributes of a given type. A special attribute is found by comparing one of
604 :     the columns of the incoming attribute list to a search pattern. If a match is found, then
605 :     a set of columns is put into an output table connected to the specified ID.
606 :    
607 :     For example, when processing features, the attribute list we look at has three columns: attribute
608 :     name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
609 :     begins with C<iedb_>. The call signature is therefore
610 :    
611 :     my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', 'FeatureIEDB', 'iedb');
612 :    
613 :     The pattern is matched against column 0, and if we have a match, then column 2's value is put
614 :     to the output along with the specified feature ID.
615 :    
616 :     =over 4
617 :    
618 :     =item id
619 :    
620 :     ID of the object whose special attributes are being loaded. This forms the first column of the
621 :     output.
622 :    
623 :     =item attributes
624 :    
625 :     Reference to a list of tuples.
626 :    
627 :     =item idxMatch
628 :    
629 :     Index in each tuple of the column to be matched against the pattern. If the match is
630 :     successful, an output record will be generated.
631 :    
632 :     =item idxValues
633 :    
634 : parrello 1.6 Reference to a list containing the indexes of the value and URL to put in the
635 :     second column of the output.
636 : parrello 1.1
637 :     =item pattern
638 :    
639 :     Pattern to be matched against the specified column. The match will be case-insensitive.
640 :    
641 :     =item tableName
642 :    
643 :     Name of the table to contain the attribute values found.
644 :    
645 :     =item fieldName
646 :    
647 :     Name of the field to contain the attribute values in the output table.
648 :    
649 :     =item RETURN
650 :    
651 :     Returns a count of the matches found.
652 :    
653 :     =item
654 :    
655 :     =back
656 :    
657 :     =cut
658 :    
659 :     sub SpecialAttribute {
660 :     # Get the parameters.
661 :     my ($self, $id, $attributes, $idxMatch, $idxValues, $pattern, $tableName, $fieldName) = @_;
662 :     # Declare the return variable.
663 :     my $retVal = 0;
664 :     # Loop through the attribute rows.
665 :     for my $row (@{$attributes}) {
666 :     # Check for a match.
667 :     if ($row->[$idxMatch] =~ m/$pattern/i) {
668 : parrello 1.6 # We have a match, so output a row.
669 :     my $value = HyperLink->new(map { $row->[$_] } @$idxValues);
670 :     $self->PutE($tableName => $id, $fieldName => $value);
671 : parrello 1.1 $retVal++;
672 :     }
673 :     }
674 :     Trace("$retVal special attributes found for $id and table $tableName.") if T(ERDBLoadGroup => 4) && $retVal;
675 :     # Return the number of matches.
676 :     return $retVal;
677 :     }
678 :    
679 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3