[Bio] / FigKernelPackages / AliasAnalysis.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/AliasAnalysis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package AliasAnalysis;
4 :    
5 :     use strict;
6 :     use Tracer;
7 : parrello 1.5 use FIGRules;
8 :     use SOAP::Lite;
9 : parrello 1.7 use base qw(Exporter);
10 :     use vars qw(@EXPORT);
11 :     @EXPORT = qw(AliasCheck);
12 : parrello 1.1
13 :     =head1 Alias Analysis Module
14 :    
15 :     =head2 Introduction
16 :    
17 :     This module encapsulates data about aliases. For each alias, it tells us how to generate
18 :     the appropriate link, what the type is for the alias, its export format, and its display
19 :     format. To add new alias types, we simply update this package.
20 :    
21 :     An alias has three forms. The I<internal> form is how the alias is stored in the database.
22 :     The I<export> form is the form into which it should be translated when being exported to
23 :     BRC databases. The I<natural> form is the form it takes in its own environment. For
24 :     example, C<gi|15675083> is the internal form of a GenBank ID. Its export form is
25 :     C<NCBI_gi:15675083>, and its natural form is simply C<15675083>.
26 :    
27 :     =head2 The Alias Table
28 :    
29 :     The alias table is a hash of hashes. Each sub-hash relates to a specific type of alias, and
30 :     the key names the alias type (e.g. C<uniprot>, C<KEGG>). The sub-hashes have three fields.
31 :    
32 :     =over 4
33 :    
34 :     =item pattern
35 :    
36 :     This is a regular expression that will match aliases of the specified type in their internal
37 :     forms.
38 :    
39 :     =item convert
40 :    
41 :     This field is a hash of conversions. The key for each is the conversion type and the
42 :     data is a replacement expression. These replacement expressions rely on the pattern match
43 :     having just taken place and use the C<$1>, C<$2>, ... variables to get text from the
44 :     alias's internal form. An alias's natural form, export form, and URL are all implemented as
45 :     different types of conversions. New conversion types can be created at
46 :     will be updating the table without having to worry about changing any code. Note that for
47 :     the URL conversion, a value of C<undef> means no URL is available.
48 :    
49 :     =item normalize
50 :    
51 :     This is a prefix that can be used to convert an alias from its natural form to its
52 :     internal form.
53 :    
54 : parrello 1.4 =item home
55 :    
56 :     This is the URL of the alias's home web site.
57 :    
58 :     =item curated
59 :    
60 :     This is the external database name used when the alias appears as a corresponding ID.
61 :     If the alias type is not supported by the corresponding ID effort, this value is
62 :     undefined.
63 :    
64 : parrello 1.1 =back
65 :    
66 :     At some point the Alias Table may be converted from an inline hash to an external XML file.
67 :    
68 :     =cut
69 :    
70 :     my %AliasTable = (
71 :     RefSeq => {
72 : parrello 1.6 pattern => '(?:ref\|)?([NXYZA]P_[0-9\.]+)',
73 : parrello 1.4 home => 'http://www.ncbi.nlm.nih.gov',
74 : parrello 1.1 convert => { natural => '$1',
75 :     export => 'RefSeq_Prot:$1',
76 :     url => 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=protein;cmd=search;term=$1',
77 :     },
78 :     normalize => '',
79 :     },
80 : parrello 1.4 NCBI => {
81 : parrello 1.1 pattern => 'gi\|(\d+)',
82 : parrello 1.4 home => 'http://www.ncbi.nlm.nih.gov',
83 : parrello 1.1 convert => { natural => '$1',
84 :     export => 'NCBI_gi:$1',
85 : parrello 1.2 url => 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve;db=Protein&list_uids=$1;dopt=GenPept',
86 : parrello 1.1 },
87 :     normalize => 'gi|',
88 :     },
89 : parrello 1.4 CMR => {
90 : parrello 1.8 pattern => 'cmr\|([A-Za-z0-9_]+)',
91 : parrello 1.4 home => 'http://cmr.jcvi.org',
92 :     convert => { natural => '$1',
93 :     export => 'cmr|$1',
94 :     url => 'http://cmr.jcvi.org/tigr-scripts/CMR/shared/GenePage.cgi?locus=$1',
95 :     },
96 :     normalize => 'cmr|',
97 :     },
98 : parrello 1.1 SwissProt => {
99 :     pattern => 'sp\|([A-Z0-9]{6})',
100 : parrello 1.6 home => 'http://us.expasy.org',
101 : parrello 1.1 convert => { natural => '$1',
102 :     export => 'Swiss-Prot:$1',
103 :     url => 'http://us.expasy.org/cgi-bin/get-sprot-entry?$1',
104 :     },
105 :     normalize => 'sp|',
106 :     },
107 :     UniProt => {
108 : parrello 1.4 pattern => 'uni\|([A-Z0-9_]+?)',
109 :     home => 'http://www.uniprot.org',
110 : parrello 1.1 convert => { natural => '$1',
111 :     export => 'UniProtKB:',
112 :     url => 'http://www.ebi.uniprot.org/uniprot-srv/uniProtView.do?proteinAc=$1',
113 :     },
114 :     normalize => 'uni|',
115 :     },
116 :     KEGG => {
117 :     pattern => 'kegg\|(([a-z]{2,4}):([a-zA-Z_0-9]+))',
118 : parrello 1.6 home => 'http://www.genome.ad.jp',
119 : parrello 1.1 convert => { natural => '$1',
120 :     export => 'KEGG:$2+$3',
121 :     url => 'http://www.genome.ad.jp/dbget-bin/www_bget?$2+$3',
122 :     },
123 :     normalize => 'kegg|',
124 :     },
125 :     LocusTag => {
126 : parrello 1.6 pattern => 'LocusTag:([A-Za-z]{3}\d+(?:\.\d+)?)',
127 : parrello 1.1 convert => { natural => '$1',
128 :     export => 'Locus_Tag:$1',
129 :     url => undef,
130 :     },
131 :     normalize => 'LocusTag:',
132 :     },
133 :     GeneID => {
134 :     pattern => 'GeneID:(\d+)',
135 :     convert => { natural => '$1',
136 :     export => 'GeneID:$1',
137 :     url => undef,
138 :     },
139 :     normalize => 'GeneID:',
140 :     },
141 :     Trembl => {
142 :     pattern => 'tr\|([a-zA-Z0-9]+)',
143 : parrello 1.6 home => 'http://ca.expasy.org',
144 : parrello 1.1 convert => { natural => '$1',
145 :     export => 'TrEMBL:$1',
146 :     url => 'http://ca.expasy.org/uniprot/$1',
147 :     },
148 :     normalize => 'tr|',
149 :     },
150 : parrello 1.6 GENE => {
151 :     pattern => 'GENE:([a-zA-Z]{3,4}(?:-\d+)?)',
152 :     convert => { natural => '$1',
153 :     export => 'GENE:$1',
154 :     url => undef,
155 :     },
156 :     normalize => 'GENE:',
157 :     },
158 : parrello 1.1 );
159 :    
160 : parrello 1.7 =head3 BlackList
161 :    
162 :     The Black List contains a list of alias types that should be discarded during
163 :     alias processing. Any normalized alias whose prefix matches one of the names
164 :     in the list will be discarded (see L</AliasCheck>).
165 :    
166 :     =cut
167 :    
168 :     my %BlackList = map { lc($_) => 1 } qw(InterPro);
169 :    
170 : parrello 1.1 =head2 Public Methods
171 :    
172 : parrello 1.7 =head3 AliasCheck
173 :    
174 :     my $okFlag = AliasCheck($alias);
175 :    
176 :     Return TRUE if the specified alias is acceptable, FALSE if it is
177 :     blacklisted.
178 :    
179 :     =over 4
180 :    
181 :     =item alias
182 :    
183 :     Alias to check, in normalized (internal) form.
184 :    
185 :     =item RETURN
186 :    
187 :     Returns TRUE if the alias's type is acceptable, FALSE if it is found in the blacklist.
188 :    
189 :     =back
190 :    
191 :     =cut
192 :    
193 :     sub AliasCheck {
194 :     # Get the parameters.
195 :     my ($alias) = @_;
196 :     # Declare the return variable.
197 :     my $retVal = 1;
198 :     # Check for a prefix.
199 :     if ($alias =~ /^([^|:]+)/) {
200 :     # Check the prefix against the black list.
201 :     my $prefix = lc $1;
202 :     Trace("Prefix for $alias is $prefix.") if T(3);
203 :     if ($BlackList{$prefix}) {
204 :     $retVal = 0;
205 :     }
206 :     }
207 :     # Return the result.
208 :     return $retVal;
209 :     }
210 :    
211 :    
212 : parrello 1.1 =head3 AliasTypes
213 :    
214 : parrello 1.3 my @aliasTypes = AliasAnalysis::AliasTypes();
215 : parrello 1.1
216 :     Return a list of the alias types. The list can be used to create a menu or dropdown
217 :     for selecting a preferred alias.
218 :    
219 :     =cut
220 :    
221 :     sub AliasTypes {
222 :     return sort keys %AliasTable;
223 :     }
224 :    
225 : parrello 1.7
226 : parrello 1.1 =head3 Find
227 :    
228 : parrello 1.3 my $aliasFound = AliasAnalysis::Find($type, \@aliases);
229 : parrello 1.1
230 :     Find the first alias of the specified type in the list.
231 :    
232 :     =over 4
233 :    
234 :     =item type
235 :    
236 :     Type of alias desired. This must be one of the keys in C<%AliasTable>.
237 :    
238 :     =item aliases
239 :    
240 :     Reference of a list containing alias names. The first alias name that matches
241 :     the structure of the specified alias type will be returned. The incoming
242 :     aliases are presumed to be in internal form.
243 :    
244 :     =item RETURN
245 :    
246 :     Returns the natural form of the desired alias, or C<undef> if no alias of
247 :     the specified type could be found.
248 :    
249 :     =back
250 :    
251 :     =cut
252 :    
253 :     sub Find {
254 :     # Get the parameters.
255 :     my ($type, $aliases) = @_;
256 :     # Declare the return variable.
257 :     my $retVal;
258 :     # Insure we have a valid alias type.
259 :     if (! exists $AliasTable{$type}) {
260 :     Confess("Invalid aliase type \"$type\" specified.");
261 :     } else {
262 :     # Get the pattern for the specified alias type.
263 :     my $pattern = $AliasTable{$type}->{pattern};
264 :     Trace("Alias pattern is /$pattern/.") if T(3);
265 :     # Search for matching aliases. We can't use GREP here because we want
266 :     # to stop as soon as we find a match. That way, the $1,$2.. variables
267 :     # will be set properly.
268 :     my $found;
269 :     for my $alias (@$aliases) { last if $found;
270 :     Trace("Matching against \"$alias\".") if T(4);
271 : parrello 1.4 if ($alias =~ /^$pattern$/) {
272 : parrello 1.1 Trace("Match found.") if T(4);
273 :     # Here we have a match. Return the matching alias's natural form.
274 :     $retVal = eval($AliasTable{$type}->{convert}->{natural});
275 :     $found = 1;
276 :     }
277 :     }
278 :     }
279 :     # Return the value found.
280 :     return $retVal;
281 :     }
282 :    
283 : parrello 1.4 =head3 Normalize
284 :    
285 :     my $normalized = AliasAnalysis::Normalize($type => $naturalName);
286 :    
287 :     Convert an alias of the specified typefrom its natural form to its internal
288 :     form.
289 :    
290 :     =over 4
291 :    
292 :     =item type
293 :    
294 :     Type of the relevant alias.
295 :    
296 :     =item naturalName
297 :    
298 :     Natural-form alias to be converted to internal form.
299 :    
300 :     =item RETURN
301 :    
302 : parrello 1.6 Returns the normalized alias, or the original value if the alias type
303 : parrello 1.4 is not recognized.
304 :    
305 :     =back
306 :    
307 :     =cut
308 :    
309 :     sub Normalize {
310 :     # Get the parameters.
311 :     my ($type, $naturalName) = @_;
312 :     # Declare the return variable.
313 :     my $retVal = $naturalName;
314 :     # Only proceed if the specified type is valid.
315 :     if (exists $AliasTable{$type}) {
316 :     # Normalize the name.
317 :     $retVal = $AliasTable{$type}->{normalize} . $naturalName;
318 :     }
319 :     # Return the result.
320 :     return $retVal;
321 :     }
322 :    
323 :    
324 : parrello 1.1 =head3 Type
325 :    
326 : parrello 1.3 my $naturalName = AliasAnalysis::Type($type => $name);
327 : parrello 1.1
328 :     Return the natural name of an alias if it is of the specified type, and C<undef> otherwise.
329 :     Note that the result of this method will be TRUE if the alias is an internal form of the named
330 :     type and FALSE otherwise.
331 :    
332 :     =over 4
333 :    
334 :     =item type
335 :    
336 :     Relevant alias type.
337 :    
338 :     =item name
339 :    
340 :     Internal-form alias to be matched to the specified type.
341 :    
342 :     =item RETURN
343 :    
344 :     Returns the natural form of the alias if it is of the specified type, and C<undef> otherwise.
345 :    
346 :     =back
347 :    
348 :     =cut
349 :    
350 :     sub Type {
351 :     # Get the parameters.
352 :     my ($type, $name) = @_;
353 :     # Declare the return variable. If there is no match, it will stay undefined.
354 :     my $retVal;
355 :     # Check the alias type.
356 :     my $pattern = $AliasTable{$type}->{pattern};
357 : parrello 1.6 if ($name =~ /^$pattern$/) {
358 : parrello 1.1 # We have a match, so we return the natural form of the alias.
359 :     $retVal = eval($AliasTable{$type}->{convert}->{natural});
360 :     }
361 :     # Return the result.
362 :     return $retVal;
363 :     }
364 :    
365 : parrello 1.4 =head3 Format
366 :    
367 :     my $htmlText = AliasAnalysis::Format($type => $alias);
368 :    
369 :     Return the converted form of an alias. The alias will be compared against
370 :     the patterns in the type table to determine which type of alias it is. Then
371 :     the named conversion will be applied. If the alias is not of a recognized
372 :     type, an undefined value will be returned.
373 :    
374 :     =over 4
375 :    
376 :     =item type
377 :    
378 :     Type of conversion desired (C<natural>, C<export>, C<url>, C<internal>)
379 :    
380 :     =item alias
381 :    
382 :     Alias to be converted.
383 :    
384 :     =item RETURN
385 :    
386 :     Returns the converted alias, or C<undef> if the alias is not of a known type
387 :     or is of a type that does not support the specified conversion.
388 :    
389 :     =back
390 :    
391 :     =cut
392 :    
393 :     sub Format {
394 :     # Get the parameters.
395 :     my ($type, $alias) = @_;
396 :     # Declare the return variable.
397 :     my $retVal;
398 :     # This flag will be used to stop the loop.
399 :     my $found;
400 :     # Check this alias against all the known types.
401 :     for my $aliasType (keys %AliasTable) { last if $found;
402 :     # Get the conversion expression for this alias type.
403 :     my $convertExpression = $AliasTable{$aliasType}->{convert}->{$type};
404 :     # Check to see if we found the right type.
405 :     my $pattern = $AliasTable{$aliasType}->{pattern};
406 :     Trace("Matching \"$alias\" to /$pattern/.") if T(4);
407 :     if ($alias =~ /^$pattern$/) {
408 :     # Here we did. Denote we found the type.
409 :     $found = 1;
410 :     # Insure this type supports the conversion.
411 :     if ($convertExpression) {
412 :     # It does, so do the conversion.
413 :     $retVal = eval("\"$convertExpression\"");
414 :     Trace("Convert expression was \"$convertExpression\".") if T(3);
415 :     }
416 :     }
417 :     }
418 :     # Return the result.
419 :     return $retVal;
420 :     }
421 :    
422 :     =head3 TypeOf
423 :    
424 :     my $type = AliasAnalysis::TypeOf($alias);
425 :    
426 :     Return the type of the specified alias, or C<undef> if the alias is not
427 :     of a recognized type.
428 :    
429 :     =over 4
430 :    
431 :     =item alias
432 :    
433 :     Alias (in internal form) whose type is desired.
434 :    
435 :     =item RETURN
436 :    
437 :     Returns the type of the specified alias, or C<undef> if the alias is of an
438 :     unknown type.
439 :    
440 :     =back
441 :    
442 :     =cut
443 :    
444 :     sub TypeOf {
445 :     # Get the parameters.
446 :     my ($alias) = @_;
447 :     # Declare the return variable.
448 :     my $retVal;
449 :     # Check this alias against all the known types.
450 :     for my $aliasType (keys %AliasTable) { last if defined $retVal;
451 :     # Check to see if we found the right type.
452 :     my $pattern = $AliasTable{$aliasType}->{pattern};
453 :     Trace("Matching \"$alias\" to /$pattern/.") if T(4);
454 :     if ($alias =~ /^$pattern$/) {
455 :     # Here we did. Denote we found the type.
456 :     $retVal = $aliasType;
457 :     }
458 :     }
459 :     # Return the result.
460 :     return $retVal;
461 :     }
462 :    
463 : parrello 1.6 =head3 IsNatural
464 :    
465 :     my $normalized = AliasAnalysis::IsNatural($type => $natural);
466 :    
467 :     Return the normalized form of an alias if it is a natural name of the
468 :     specified type, or an undefined value otherwise. This is useful for
469 :     determining if a particular identifier is a natural alias.
470 :    
471 :     =over 4
472 :    
473 :     =item type
474 :    
475 :     Type of alias name to check.
476 :    
477 :     =item natural
478 :    
479 :     Natural-form alias name to check.
480 :    
481 :     =item RETURN
482 :    
483 :     Returns the normalized alias if the incoming value is a natural-form identifier
484 :     of the specified type, or C<undef> otherwise.
485 :    
486 :     =back
487 :    
488 :     =cut
489 :    
490 :     sub IsNatural {
491 :     # Get the parameters.
492 :     my ($type, $natural) = @_;
493 :     # Declare the return variable.
494 :     my $retVal;
495 :     # Attempt to convert the incoming value to its normalized form.
496 :     my $normalized = $AliasTable{$type}->{normalize} . $natural;
497 :     # Get the pattern for this alias type.
498 :     my $pattern = $AliasTable{$type}->{pattern};
499 :     if ($normalized =~ /^$pattern$/) {
500 :     # Here we have a match, so return the normalized form.
501 :     $retVal = $normalized;
502 :     }
503 :     # Return the result.
504 :     return $retVal;
505 :     }
506 :    
507 : parrello 1.4
508 : parrello 1.1 =head3 FormatHtml
509 :    
510 : parrello 1.3 my $htmlText = AliasAnalysis::FormatHtml(@aliases);
511 : parrello 1.1
512 :     Create an html string that contains the specified aliases in a comma-separated list
513 :     with hyperlinks where available. The aliases are expected to be in internal form and
514 :     will stay that way.
515 :    
516 :     =over 4
517 :    
518 :     =item aliases
519 :    
520 :     A list of aliases in internal form that are to be formatted into HTML.
521 :    
522 :     =item RETURN
523 :    
524 :     Returns a string containing the aliases in a comma-separated list, with hyperlinks
525 :     present on those for which hyperlinks are available.
526 :    
527 :     =back
528 :    
529 :     =cut
530 :    
531 :     sub FormatHtml {
532 :     # Get the parameters.
533 :     my (@aliases) = @_;
534 :     # Set up the output list. The hyperlinked aliases will be put in here, and then
535 :     # srung together before returning to the caller.
536 :     my @retVal = ();
537 :     # Loop through the incoming aliases.
538 :     for my $alias (@aliases) {
539 : parrello 1.4 # We'll put our result string in here.
540 :     my $aliasResult;
541 :     # Compute the alias's URL.
542 :     my $url = Format(url => $alias);
543 :     # Check to see if a URL does indeed exist.
544 : parrello 1.1 if ($url) {
545 : parrello 1.4 # Yes, hyperlink the alias.
546 :     $aliasResult = "<a href=\"$url\">$alias</a>";
547 :     } else {
548 :     # No, return the raw alias.
549 :     $aliasResult = $alias;
550 : parrello 1.1 }
551 : parrello 1.4 # Push the result into the return list.
552 :     push @retVal, $aliasResult;
553 : parrello 1.1 }
554 :     # Convert the aliases into a comma-separated string.
555 :     return join(", ", @retVal);
556 :     }
557 :    
558 : parrello 1.5 =head3 AnalyzeClearinghouseArray
559 :    
560 :     my @aliases = AliasAnalysis::AnalyzeClearinghouseArray($orgName, $array);
561 :    
562 :     Analyze a response array from the %FIG{Annotation Clearinghouse}%. The
563 :     response array is a list of tuples containing identifiers of essentially
564 :     identical proteins along with the name of the relevant organism, its
565 :     assignment, and other data. This method looks at the identifier and
566 :     organism name in each tuple, and if the organism name matches, it checks
567 :     to see if the identifier is of a recognized alias type. If it is, then
568 :     the identifier is added to the output list. The net effect is to harvest
569 :     the response array for aliases of the %FIG{protein encoding group}% used
570 :     to obtain the response array from the clearinghouse.
571 :    
572 :     =over 4
573 :    
574 :     =item orgName
575 :    
576 :     Name of the genome of interest.
577 :    
578 :     =item array
579 :    
580 :     Array of Annotation Clearinghouse tuples. The aliases will be harvested
581 :     from this array. In the array, the first element in each tuple is an
582 :     identifier and the fourth is a genome name.
583 :    
584 :     =item RETURN
585 :    
586 :     Returns a list of aliases from the incoming aray of tuples.
587 :    
588 :     =back
589 :    
590 :     =cut
591 :    
592 :     sub AnalyzeClearinghouseArray {
593 :     # Get the parameters.
594 :     my ($orgName, $array) = @_;
595 :     # Declare the return variable.
596 :     my @retVal;
597 :     # Loop through the response array, keeping aliases that look good.
598 :     for my $result (@$array) {
599 :     # Get the useful pieces of this result.
600 :     my ($alias, undef, undef, $org) = @$result;
601 :     # Is this ID for the correct organism?
602 :     if ($org eq $orgName) {
603 :     # Yes. If it's refseq, throw away the prefix.
604 :     if ($alias =~ /^ref\|(.+)/) {
605 :     $alias = $1;
606 :     }
607 :     # Is it a recognized type?
608 :     my $type = TypeOf($alias);
609 :     if ($type) {
610 :     # It's a recognized alias type and its for the correct
611 :     # organism, so keep it.
612 :     push @retVal, $alias;
613 :     }
614 :     }
615 :     }
616 :     # Return the result.
617 :     return @retVal;
618 :     }
619 :    
620 :    
621 : parrello 1.3 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3