[Bio] / Sprout / LoaderUtils.pm Repository:
ViewVC logotype

Annotation of /Sprout/LoaderUtils.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package LoaderUtils;
4 :    
5 :     use strict;
6 :     use Tracer;
7 : parrello 1.2 use SeedUtils;
8 : parrello 1.1
9 :     =head1 Common DB Load Utilities
10 :    
11 :     =head2 Introduction
12 :    
13 :     This package contains static methods used by both the Sprout and Sapling loaders.
14 :    
15 :     =head2 Public Methods
16 :    
17 :     =head3 ReadAliasFile
18 :    
19 :     my $aliasHash = LoaderUtils::ReadAliasFile($dir, $genomeID);
20 :    
21 :     This method reads the content of the alias file for the specified genome,
22 :     and returns a hash. For each feature, the hash contains a list of its
23 :     aliases. Each alias is represented by a 3-tuple consisting of the actual
24 :     alias, the alias type (e.g. C<CMR>, C<NCBI>), and the confidence code--
25 :     C<A> for a curated alias, C<B> for a non-curated feature alias, and C<C>
26 :     for a protein alias. If the alias file is not found, an undefined value is returned.
27 :    
28 :     =over 4
29 :    
30 :     =item dir
31 :    
32 :     Name of the directory containing the alias files.
33 :    
34 :     =item genomeID
35 :    
36 :     ID of the genome whose alias file is to be read.
37 :    
38 :     =item RETURN
39 :    
40 :     Returns a reference to a hash of feature IDs to alias lists. For each feature,
41 :     the alias list will be a reference to a list of 3-tuples. Each 3-tuple will
42 :     contain an alias ID, an alias type, and a confidence level from C<A> (highest)
43 :     to C<C> (lowest). If the alias file is not found, it will return an undefined
44 :     value.
45 :    
46 :     =back
47 :    
48 :     =cut
49 :    
# Read the alias file "$dir/alias.$genomeID.tbl" and return a reference to a
# hash that maps each feature ID to a list of [alias ID, alias type,
# confidence] triples, in file order.
#
# Parameters:
#   dir       name of the directory containing the alias files
#   genomeID  ID of the genome whose alias file is to be read
#
# Returns the hash reference, or undef when the alias file does not exist.
# Records that carry no feature ID (for example, blank lines) are skipped so
# that they cannot create an entry under an undefined or empty key.
sub ReadAliasFile {
    # Get the parameters.
    my ($dir, $genomeID) = @_;
    # Find the alias file. The alias files are created by "AliasCrunch.pl".
    my $aliasFile = "$dir/alias.$genomeID.tbl";
    # A missing file is reported as an explicit undef, so a caller sees the
    # same one-element result in list context as before.
    return undef if ! -f $aliasFile;
    # The file exists, so open it for input. Open is not defined in this
    # file (presumably imported from Tracer -- confirm); the undef first
    # argument is kept exactly as in the original call.
    my $aliasH = Open(undef, "<$aliasFile");
    # Accumulate the alias triples for each feature.
    my %aliasesOf;
    while (! eof $aliasH) {
        # Get this alias record.
        my ($aliasFid, $aliasID, $aliasType, $aliasConf) = Tracer::GetLine($aliasH);
        # A record without a feature ID cannot be attached to any feature.
        next if !defined($aliasFid) || $aliasFid eq '';
        # Put it in the return hash.
        push @{$aliasesOf{$aliasFid}}, [$aliasID, $aliasType, $aliasConf];
    }
    # Close the file: we're done with it.
    close $aliasH;
    # Do a memory trace. Alias files can be pretty big.
    MemTrace("Aliases adjusted.") if T(ERDBLoadGroup => 3);
    # Return the result.
    return \%aliasesOf;
}
77 :    
78 : parrello 1.2 =head3 RolesForLoading
79 :    
80 :     my ($roles, $errors) = LoaderUtils::RolesForLoading($function);
81 :    
82 :     Split a functional assignment into roles. If the functional assignment
83 :     seems suspicious, it will be flagged as invalid. A count will be returned
84 :     of the number of roles that are rejected because they are too long.
85 :    
86 :     =over 4
87 :    
88 :     =item function
89 :    
90 :     Functional assignment to parse.
91 :    
92 :     =item RETURN
93 :    
94 :     Returns a two-element list. The first is either a reference to a list of
95 :     roles, or an undefined value (indicating a suspicious functional assignment).
96 :     The second is the number of roles that are rejected for being too long.
97 :    
98 :     =back
99 :    
100 :     =cut
101 :    
# Split a functional assignment into the roles that should be loaded.
#
# Parameters:
#   function  functional assignment to parse
#
# Returns a two-element list:
#   1. a reference to the list of accepted roles, or undef when the
#      assignment looks suspicious (see the pattern below);
#   2. the number of roles rejected for being longer than the length limit.
#
# An undefined assignment yields an empty role list and zero errors without
# being passed to the pattern match or to roles_of_function.
sub RolesForLoading {
    # Get the parameters.
    my ($function) = @_;
    # Nothing to parse: avoid uninitialized-value warnings further down.
    return ([], 0) if ! defined $function;
    # Reject assignments containing suspicious elements: words starting with
    # "similarit", "fasta" or "identity", the word "blast", a percent sign,
    # or the text "E=" (all case-insensitive).
    if ($function =~ /\b(?:similarit|blast\b|fasta|identity)|%|E=/i) {
        return (undef, 0);
    }
    # Longest role accepted. NOTE(review): presumably matches a database
    # field width -- confirm against the loaders.
    my $maxRoleLength = 250;
    # Split the function into roles and keep only the good ones.
    # roles_of_function is not defined in this file (presumably imported
    # from SeedUtils -- confirm).
    my @goodRoles;
    my $errors = 0;
    for my $role (roles_of_function($function)) {
        if (length($role) > $maxRoleLength) {
            $errors++;
        } else {
            push @goodRoles, $role;
        }
    }
    # Return the results.
    return (\@goodRoles, $errors);
}
125 :    
126 :    
127 :    
128 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3