[Bio] / FigKernelPackages / AliasAnalysis.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/AliasAnalysis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package AliasAnalysis;
4 :    
5 :     use strict;
6 :     use Tracer;
7 : parrello 1.7 use base qw(Exporter);
8 :     use vars qw(@EXPORT);
9 :     @EXPORT = qw(AliasCheck);
10 : parrello 1.1
11 :     =head1 Alias Analysis Module
12 :    
13 :     =head2 Introduction
14 :    
15 :     This module encapsulates data about aliases. For each alias, it tells us how to generate
16 :     the appropriate link, what the type is for the alias, its export format, and its display
17 :     format. To add new alias types, we simply update this package.
18 :    
19 :     An alias has three forms. The I<internal> form is how the alias is stored in the database.
20 :     The I<export> form is the form into which it should be translated when being exported to
21 :     BRC databases. The I<natural> form is the form it takes in its own environment. For
22 :     example, C<gi|15675083> is the internal form of a GenBank ID. Its export form is
23 :     C<NCBI_gi:15675083>, and its natural form is simply C<15675083>.
24 :    
25 :     =head2 The Alias Table
26 :    
27 :     The alias table is a hash of hashes. Each sub-hash relates to a specific type of alias, and
28 :     the key names the alias type (e.g. C<uniprot>, C<KEGG>). The sub-hashes have three fields.
29 :    
30 :     =over 4
31 :    
32 :     =item pattern
33 :    
34 :     This is a regular expression that will match aliases of the specified type in their internal
35 :     forms.
36 :    
37 :     =item convert
38 :    
39 :     This field is a hash of conversions. The key for each is the conversion type and the
40 :     data is a replacement expression. These replacement expressions rely on the pattern match
41 :     having just taken place and use the C<$1>, C<$2>, ... variables to get text from the
42 :     alias's internal form. An alias's natural form, export form, and URL are all implemented as
43 :     different types of conversions. New conversion types can be created at
44 :     will be updating the table without having to worry about changing any code. Note that for
45 :     the URL conversion, a value of C<undef> means no URL is available.
46 :    
47 :     =item normalize
48 :    
49 :     This is a prefix that can be used to convert an alias from its natural form to its
50 :     internal form.
51 :    
52 : parrello 1.4 =item home
53 :    
54 :     This is the URL of the alias's home web site.
55 :    
56 :     =item curated
57 :    
58 :     This is the external database name used when the alias appears as a corresponding ID.
59 :     If the alias type is not supported by the corresponding ID effort, this value is
60 :     undefined.
61 :    
62 : parrello 1.1 =back
63 :    
64 :     At some point the Alias Table may be converted from an inline hash to an external XML file.
65 :    
66 :     =cut
67 :    
68 :     my %AliasTable = (
69 :     RefSeq => {
70 : parrello 1.6 pattern => '(?:ref\|)?([NXYZA]P_[0-9\.]+)',
71 : parrello 1.4 home => 'http://www.ncbi.nlm.nih.gov',
72 : parrello 1.1 convert => { natural => '$1',
73 :     export => 'RefSeq_Prot:$1',
74 :     url => 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=protein;cmd=search;term=$1',
75 :     },
76 :     normalize => '',
77 :     },
78 : parrello 1.4 NCBI => {
79 : parrello 1.1 pattern => 'gi\|(\d+)',
80 : parrello 1.4 home => 'http://www.ncbi.nlm.nih.gov',
81 : parrello 1.1 convert => { natural => '$1',
82 :     export => 'NCBI_gi:$1',
83 : parrello 1.2 url => 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve;db=Protein&list_uids=$1;dopt=GenPept',
84 : parrello 1.1 },
85 :     normalize => 'gi|',
86 :     },
87 : parrello 1.4 CMR => {
88 : parrello 1.9 pattern => 'cmr\|(.+)',
89 : parrello 1.4 home => 'http://cmr.jcvi.org',
90 :     convert => { natural => '$1',
91 :     export => 'cmr|$1',
92 :     url => 'http://cmr.jcvi.org/tigr-scripts/CMR/shared/GenePage.cgi?locus=$1',
93 :     },
94 :     normalize => 'cmr|',
95 :     },
96 : parrello 1.1 SwissProt => {
97 :     pattern => 'sp\|([A-Z0-9]{6})',
98 : parrello 1.6 home => 'http://us.expasy.org',
99 : parrello 1.1 convert => { natural => '$1',
100 :     export => 'Swiss-Prot:$1',
101 :     url => 'http://us.expasy.org/cgi-bin/get-sprot-entry?$1',
102 :     },
103 :     normalize => 'sp|',
104 :     },
105 :     UniProt => {
106 : parrello 1.4 pattern => 'uni\|([A-Z0-9_]+?)',
107 :     home => 'http://www.uniprot.org',
108 : parrello 1.1 convert => { natural => '$1',
109 :     export => 'UniProtKB:',
110 :     url => 'http://www.ebi.uniprot.org/uniprot-srv/uniProtView.do?proteinAc=$1',
111 :     },
112 :     normalize => 'uni|',
113 :     },
114 :     KEGG => {
115 :     pattern => 'kegg\|(([a-z]{2,4}):([a-zA-Z_0-9]+))',
116 : parrello 1.6 home => 'http://www.genome.ad.jp',
117 : parrello 1.1 convert => { natural => '$1',
118 :     export => 'KEGG:$2+$3',
119 :     url => 'http://www.genome.ad.jp/dbget-bin/www_bget?$2+$3',
120 :     },
121 :     normalize => 'kegg|',
122 :     },
123 :     LocusTag => {
124 : parrello 1.10 pattern => 'LocusTag:([A-Za-z]{2,3}\d+)',
125 : parrello 1.1 convert => { natural => '$1',
126 :     export => 'Locus_Tag:$1',
127 :     url => undef,
128 :     },
129 :     normalize => 'LocusTag:',
130 :     },
131 :     GeneID => {
132 :     pattern => 'GeneID:(\d+)',
133 :     convert => { natural => '$1',
134 :     export => 'GeneID:$1',
135 :     url => undef,
136 :     },
137 :     normalize => 'GeneID:',
138 :     },
139 :     Trembl => {
140 :     pattern => 'tr\|([a-zA-Z0-9]+)',
141 : parrello 1.6 home => 'http://ca.expasy.org',
142 : parrello 1.1 convert => { natural => '$1',
143 :     export => 'TrEMBL:$1',
144 :     url => 'http://ca.expasy.org/uniprot/$1',
145 :     },
146 :     normalize => 'tr|',
147 :     },
148 : parrello 1.6 GENE => {
149 :     pattern => 'GENE:([a-zA-Z]{3,4}(?:-\d+)?)',
150 :     convert => { natural => '$1',
151 :     export => 'GENE:$1',
152 :     url => undef,
153 :     },
154 :     normalize => 'GENE:',
155 :     },
156 : parrello 1.1 );
157 :    
158 : parrello 1.7 =head3 BlackList
159 :    
160 :     The Black List contains a list of alias types that should be discarded during
161 :     alias processing. Any normalized alias whose prefix matches one of the names
162 :     in the list will be discarded (see L</AliasCheck>).
163 :    
164 :     =cut
165 :    
166 :     my %BlackList = map { lc($_) => 1 } qw(InterPro);
167 :    
168 : parrello 1.1 =head2 Public Methods
169 :    
170 : parrello 1.7 =head3 AliasCheck
171 :    
172 :     my $okFlag = AliasCheck($alias);
173 :    
174 :     Return TRUE if the specified alias is acceptable, FALSE if it is
175 :     blacklisted.
176 :    
177 :     =over 4
178 :    
179 :     =item alias
180 :    
181 :     Alias to check, in normalized (internal) form.
182 :    
183 :     =item RETURN
184 :    
185 :     Returns TRUE if the alias's type is acceptable, FALSE if it is found in the blacklist.
186 :    
187 :     =back
188 :    
189 :     =cut
190 :    
191 :     sub AliasCheck {
192 :     # Get the parameters.
193 :     my ($alias) = @_;
194 :     # Declare the return variable.
195 :     my $retVal = 1;
196 :     # Check for a prefix.
197 :     if ($alias =~ /^([^|:]+)/) {
198 :     # Check the prefix against the black list.
199 :     my $prefix = lc $1;
200 :     Trace("Prefix for $alias is $prefix.") if T(3);
201 :     if ($BlackList{$prefix}) {
202 :     $retVal = 0;
203 :     }
204 :     }
205 :     # Return the result.
206 :     return $retVal;
207 :     }
208 :    
209 :    
210 : parrello 1.1 =head3 AliasTypes
211 :    
212 : parrello 1.3 my @aliasTypes = AliasAnalysis::AliasTypes();
213 : parrello 1.1
214 :     Return a list of the alias types. The list can be used to create a menu or dropdown
215 :     for selecting a preferred alias.
216 :    
217 :     =cut
218 :    
219 :     sub AliasTypes {
220 :     return sort keys %AliasTable;
221 :     }
222 :    
223 : parrello 1.7
224 : parrello 1.1 =head3 Find
225 :    
226 : parrello 1.3 my $aliasFound = AliasAnalysis::Find($type, \@aliases);
227 : parrello 1.1
228 :     Find the first alias of the specified type in the list.
229 :    
230 :     =over 4
231 :    
232 :     =item type
233 :    
234 :     Type of alias desired. This must be one of the keys in C<%AliasTable>.
235 :    
236 :     =item aliases
237 :    
238 :     Reference of a list containing alias names. The first alias name that matches
239 :     the structure of the specified alias type will be returned. The incoming
240 :     aliases are presumed to be in internal form.
241 :    
242 :     =item RETURN
243 :    
244 :     Returns the natural form of the desired alias, or C<undef> if no alias of
245 :     the specified type could be found.
246 :    
247 :     =back
248 :    
249 :     =cut
250 :    
251 :     sub Find {
252 :     # Get the parameters.
253 :     my ($type, $aliases) = @_;
254 :     # Declare the return variable.
255 :     my $retVal;
256 :     # Insure we have a valid alias type.
257 :     if (! exists $AliasTable{$type}) {
258 :     Confess("Invalid aliase type \"$type\" specified.");
259 :     } else {
260 :     # Get the pattern for the specified alias type.
261 :     my $pattern = $AliasTable{$type}->{pattern};
262 :     Trace("Alias pattern is /$pattern/.") if T(3);
263 :     # Search for matching aliases. We can't use GREP here because we want
264 :     # to stop as soon as we find a match. That way, the $1,$2.. variables
265 :     # will be set properly.
266 :     my $found;
267 :     for my $alias (@$aliases) { last if $found;
268 :     Trace("Matching against \"$alias\".") if T(4);
269 : parrello 1.4 if ($alias =~ /^$pattern$/) {
270 : parrello 1.1 Trace("Match found.") if T(4);
271 :     # Here we have a match. Return the matching alias's natural form.
272 :     $retVal = eval($AliasTable{$type}->{convert}->{natural});
273 :     $found = 1;
274 :     }
275 :     }
276 :     }
277 :     # Return the value found.
278 :     return $retVal;
279 :     }
280 :    
281 : parrello 1.4 =head3 Normalize
282 :    
283 :     my $normalized = AliasAnalysis::Normalize($type => $naturalName);
284 :    
285 :     Convert an alias of the specified typefrom its natural form to its internal
286 :     form.
287 :    
288 :     =over 4
289 :    
290 :     =item type
291 :    
292 :     Type of the relevant alias.
293 :    
294 :     =item naturalName
295 :    
296 :     Natural-form alias to be converted to internal form.
297 :    
298 :     =item RETURN
299 :    
300 : parrello 1.6 Returns the normalized alias, or the original value if the alias type
301 : parrello 1.4 is not recognized.
302 :    
303 :     =back
304 :    
305 :     =cut
306 :    
307 :     sub Normalize {
308 :     # Get the parameters.
309 :     my ($type, $naturalName) = @_;
310 :     # Declare the return variable.
311 :     my $retVal = $naturalName;
312 :     # Only proceed if the specified type is valid.
313 :     if (exists $AliasTable{$type}) {
314 :     # Normalize the name.
315 :     $retVal = $AliasTable{$type}->{normalize} . $naturalName;
316 :     }
317 :     # Return the result.
318 :     return $retVal;
319 :     }
320 :    
321 :    
322 : parrello 1.1 =head3 Type
323 :    
324 : parrello 1.3 my $naturalName = AliasAnalysis::Type($type => $name);
325 : parrello 1.1
326 :     Return the natural name of an alias if it is of the specified type, and C<undef> otherwise.
327 :     Note that the result of this method will be TRUE if the alias is an internal form of the named
328 :     type and FALSE otherwise.
329 :    
330 :     =over 4
331 :    
332 :     =item type
333 :    
334 :     Relevant alias type.
335 :    
336 :     =item name
337 :    
338 :     Internal-form alias to be matched to the specified type.
339 :    
340 :     =item RETURN
341 :    
342 :     Returns the natural form of the alias if it is of the specified type, and C<undef> otherwise.
343 :    
344 :     =back
345 :    
346 :     =cut
347 :    
348 :     sub Type {
349 :     # Get the parameters.
350 :     my ($type, $name) = @_;
351 :     # Declare the return variable. If there is no match, it will stay undefined.
352 :     my $retVal;
353 :     # Check the alias type.
354 :     my $pattern = $AliasTable{$type}->{pattern};
355 : parrello 1.6 if ($name =~ /^$pattern$/) {
356 : parrello 1.1 # We have a match, so we return the natural form of the alias.
357 :     $retVal = eval($AliasTable{$type}->{convert}->{natural});
358 :     }
359 :     # Return the result.
360 :     return $retVal;
361 :     }
362 :    
363 : parrello 1.4 =head3 Format
364 :    
365 :     my $htmlText = AliasAnalysis::Format($type => $alias);
366 :    
367 :     Return the converted form of an alias. The alias will be compared against
368 :     the patterns in the type table to determine which type of alias it is. Then
369 :     the named conversion will be applied. If the alias is not of a recognized
370 :     type, an undefined value will be returned.
371 :    
372 :     =over 4
373 :    
374 :     =item type
375 :    
376 :     Type of conversion desired (C<natural>, C<export>, C<url>, C<internal>)
377 :    
378 :     =item alias
379 :    
380 :     Alias to be converted.
381 :    
382 :     =item RETURN
383 :    
384 :     Returns the converted alias, or C<undef> if the alias is not of a known type
385 :     or is of a type that does not support the specified conversion.
386 :    
387 :     =back
388 :    
389 :     =cut
390 :    
391 :     sub Format {
392 :     # Get the parameters.
393 :     my ($type, $alias) = @_;
394 :     # Declare the return variable.
395 :     my $retVal;
396 :     # This flag will be used to stop the loop.
397 :     my $found;
398 :     # Check this alias against all the known types.
399 :     for my $aliasType (keys %AliasTable) { last if $found;
400 :     # Get the conversion expression for this alias type.
401 :     my $convertExpression = $AliasTable{$aliasType}->{convert}->{$type};
402 :     # Check to see if we found the right type.
403 :     my $pattern = $AliasTable{$aliasType}->{pattern};
404 :     Trace("Matching \"$alias\" to /$pattern/.") if T(4);
405 :     if ($alias =~ /^$pattern$/) {
406 :     # Here we did. Denote we found the type.
407 :     $found = 1;
408 :     # Insure this type supports the conversion.
409 :     if ($convertExpression) {
410 :     # It does, so do the conversion.
411 :     $retVal = eval("\"$convertExpression\"");
412 :     Trace("Convert expression was \"$convertExpression\".") if T(3);
413 :     }
414 :     }
415 :     }
416 :     # Return the result.
417 :     return $retVal;
418 :     }
419 :    
420 :     =head3 TypeOf
421 :    
422 :     my $type = AliasAnalysis::TypeOf($alias);
423 :    
424 :     Return the type of the specified alias, or C<undef> if the alias is not
425 :     of a recognized type.
426 :    
427 :     =over 4
428 :    
429 :     =item alias
430 :    
431 :     Alias (in internal form) whose type is desired.
432 :    
433 :     =item RETURN
434 :    
435 :     Returns the type of the specified alias, or C<undef> if the alias is of an
436 :     unknown type.
437 :    
438 :     =back
439 :    
440 :     =cut
441 :    
442 :     sub TypeOf {
443 :     # Get the parameters.
444 :     my ($alias) = @_;
445 :     # Declare the return variable.
446 :     my $retVal;
447 :     # Check this alias against all the known types.
448 :     for my $aliasType (keys %AliasTable) { last if defined $retVal;
449 :     # Check to see if we found the right type.
450 :     my $pattern = $AliasTable{$aliasType}->{pattern};
451 :     Trace("Matching \"$alias\" to /$pattern/.") if T(4);
452 :     if ($alias =~ /^$pattern$/) {
453 :     # Here we did. Denote we found the type.
454 :     $retVal = $aliasType;
455 :     }
456 :     }
457 :     # Return the result.
458 :     return $retVal;
459 :     }
460 :    
461 : parrello 1.6 =head3 IsNatural
462 :    
463 :     my $normalized = AliasAnalysis::IsNatural($type => $natural);
464 :    
465 :     Return the normalized form of an alias if it is a natural name of the
466 :     specified type, or an undefined value otherwise. This is useful for
467 :     determining if a particular identifier is a natural alias.
468 :    
469 :     =over 4
470 :    
471 :     =item type
472 :    
473 :     Type of alias name to check.
474 :    
475 :     =item natural
476 :    
477 :     Natural-form alias name to check.
478 :    
479 :     =item RETURN
480 :    
481 :     Returns the normalized alias if the incoming value is a natural-form identifier
482 :     of the specified type, or C<undef> otherwise.
483 :    
484 :     =back
485 :    
486 :     =cut
487 :    
488 :     sub IsNatural {
489 :     # Get the parameters.
490 :     my ($type, $natural) = @_;
491 :     # Declare the return variable.
492 :     my $retVal;
493 :     # Attempt to convert the incoming value to its normalized form.
494 :     my $normalized = $AliasTable{$type}->{normalize} . $natural;
495 :     # Get the pattern for this alias type.
496 :     my $pattern = $AliasTable{$type}->{pattern};
497 :     if ($normalized =~ /^$pattern$/) {
498 :     # Here we have a match, so return the normalized form.
499 :     $retVal = $normalized;
500 :     }
501 :     # Return the result.
502 :     return $retVal;
503 :     }
504 :    
505 : parrello 1.4
506 : parrello 1.1 =head3 FormatHtml
507 :    
508 : parrello 1.3 my $htmlText = AliasAnalysis::FormatHtml(@aliases);
509 : parrello 1.1
510 :     Create an html string that contains the specified aliases in a comma-separated list
511 :     with hyperlinks where available. The aliases are expected to be in internal form and
512 :     will stay that way.
513 :    
514 :     =over 4
515 :    
516 :     =item aliases
517 :    
518 :     A list of aliases in internal form that are to be formatted into HTML.
519 :    
520 :     =item RETURN
521 :    
522 :     Returns a string containing the aliases in a comma-separated list, with hyperlinks
523 :     present on those for which hyperlinks are available.
524 :    
525 :     =back
526 :    
527 :     =cut
528 :    
529 :     sub FormatHtml {
530 :     # Get the parameters.
531 :     my (@aliases) = @_;
532 :     # Set up the output list. The hyperlinked aliases will be put in here, and then
533 :     # srung together before returning to the caller.
534 :     my @retVal = ();
535 :     # Loop through the incoming aliases.
536 :     for my $alias (@aliases) {
537 : parrello 1.4 # We'll put our result string in here.
538 :     my $aliasResult;
539 :     # Compute the alias's URL.
540 :     my $url = Format(url => $alias);
541 :     # Check to see if a URL does indeed exist.
542 : parrello 1.1 if ($url) {
543 : parrello 1.4 # Yes, hyperlink the alias.
544 :     $aliasResult = "<a href=\"$url\">$alias</a>";
545 :     } else {
546 :     # No, return the raw alias.
547 :     $aliasResult = $alias;
548 : parrello 1.1 }
549 : parrello 1.4 # Push the result into the return list.
550 :     push @retVal, $aliasResult;
551 : parrello 1.1 }
552 :     # Convert the aliases into a comma-separated string.
553 :     return join(", ", @retVal);
554 :     }
555 :    
556 : parrello 1.5 =head3 AnalyzeClearinghouseArray
557 :    
558 :     my @aliases = AliasAnalysis::AnalyzeClearinghouseArray($orgName, $array);
559 :    
560 :     Analyze a response array from the %FIG{Annotation Clearinghouse}%. The
561 :     response array is a list of tuples containing identifiers of essentially
562 :     identical proteins along with the name of the relevant organism, its
563 :     assignment, and other data. This method looks at the identifier and
564 :     organism name in each tuple, and if the organism name matches, it checks
565 :     to see if the identifier is of a recognized alias type. If it is, then
566 :     the identifier is added to the output list. The net effect is to harvest
567 :     the response array for aliases of the %FIG{protein encoding group}% used
568 :     to obtain the response array from the clearinghouse.
569 :    
570 :     =over 4
571 :    
572 :     =item orgName
573 :    
574 :     Name of the genome of interest.
575 :    
576 :     =item array
577 :    
578 :     Array of Annotation Clearinghouse tuples. The aliases will be harvested
579 :     from this array. In the array, the first element in each tuple is an
580 :     identifier and the fourth is a genome name.
581 :    
582 :     =item RETURN
583 :    
584 :     Returns a list of aliases from the incoming aray of tuples.
585 :    
586 :     =back
587 :    
588 :     =cut
589 :    
590 :     sub AnalyzeClearinghouseArray {
591 :     # Get the parameters.
592 :     my ($orgName, $array) = @_;
593 :     # Declare the return variable.
594 :     my @retVal;
595 :     # Loop through the response array, keeping aliases that look good.
596 :     for my $result (@$array) {
597 :     # Get the useful pieces of this result.
598 :     my ($alias, undef, undef, $org) = @$result;
599 :     # Is this ID for the correct organism?
600 :     if ($org eq $orgName) {
601 :     # Yes. If it's refseq, throw away the prefix.
602 :     if ($alias =~ /^ref\|(.+)/) {
603 :     $alias = $1;
604 :     }
605 :     # Is it a recognized type?
606 :     my $type = TypeOf($alias);
607 :     if ($type) {
608 :     # It's a recognized alias type and its for the correct
609 :     # organism, so keep it.
610 :     push @retVal, $alias;
611 :     }
612 :     }
613 :     }
614 :     # Return the result.
615 :     return @retVal;
616 :     }
617 :    
618 :    
619 : parrello 1.3 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3