[Bio] / Sprout / ERDBTypeCountVector.pm Repository:
ViewVC logotype

Annotation of /Sprout/ERDBTypeCountVector.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package ERDBTypeCountVector;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use CGI;
26 :     use base qw(ERDBType);
27 :    
28 :     =head1 ERDB CountVector Descriptor Type Definition
29 :    
30 :     =head2 Introduction
31 :    
32 :     This object represents the data type for a vector of counts. The vector is
33 :     represented in the database as a long text string. To the user, it is represented
34 :     as a reference to a list of integers. Utility methods are provided for the most
35 :     common vector operations.
36 :    
37 :     The vectors are expected to be sparse and biased toward low numbers. A simple
38 :     base-64 representation is used for the counts, with a null string representing
39 : parrello 1.2 zero. The first character of a count is either a space (indicating 0), a plus (indicating 1),
40 :     or a minus (indicating -1, the only negative number allowed). If it is a space,
41 :     then there may be zero or more additional digits.
42 : parrello 1.1
43 :     =head3 DIGITS
44 :    
45 :     This is a constant containing the base64 digit string.
46 :    
47 :     =cut
48 :    
49 :     use constant DIGITS => '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_.';
50 :    
51 :     =head3 new
52 :    
53 :     my $et = ERDBTypeCountVector->new();
54 :    
55 :     Construct a new ERDBTypeCountVector descriptor.
56 :    
57 :     =cut
58 :    
59 :     sub new {
60 :     # Get the parameters.
61 :     my ($class) = @_;
62 :     # Create the ERDBTypeCountVector object.
63 :     my $retVal = { };
64 :     # Bless and return it.
65 :     bless $retVal, $class;
66 :     return $retVal;
67 :     }
68 :    
69 :     =head2 Virtual Methods
70 :    
71 :     =head3 averageLength
72 :    
73 :     my $value = $et->averageLength();
74 :    
75 :     Return the average length of a data item of this field type when it is stored in the
76 :     database. This value is used to compute the expected size of a database table.
77 :    
78 :     =cut
79 :    
80 :     sub averageLength {
81 :     return 10000;
82 :     }
83 :    
84 :     =head3 prettySortValue
85 :    
86 :     my $value = $et->prettySortValue();
87 :    
88 :     Number indicating where fields of this type should go in relation to other
89 :     fields. The value should be somewhere between C<1> and C<5>. A value outside
90 :     that range will make terrible things happen.
91 :    
92 :     =cut
93 :    
94 :     sub prettySortValue() {
95 :     return 5;
96 :     }
97 :    
98 :     =head3 validate
99 :    
100 :     my $okFlag = $et->validate($value);
101 :    
102 :     Return an error message if the specified value is invalid for this field type.
103 :    
104 :     The parameters are as follows.
105 :    
106 :     =over 4
107 :    
108 :     =item value
109 :    
110 :     Value of this type, for validation.
111 :    
112 :     =item RETURN
113 :    
114 :     Returns an empty string if the specified field is valid, and an error message
115 :     otherwise.
116 :    
117 :     =back
118 :    
119 :     =cut
120 :    
121 :     sub validate {
122 :     # Get the parameters.
123 :     my ($self, $value) = @_;
124 :     # Assume it's valid until we prove otherwise.
125 :     my $retVal = "";
126 :     if (! defined $value || ref $value ne 'ARRAY') {
127 :     $retVal = "Value is not a list reference.";
128 :     } else {
129 :     my $errCount;
130 :     for my $entry (@$value) {
131 : parrello 1.2 unless ($entry =~ /^\-?\d+$/) {
132 : parrello 1.1 $errCount++;
133 :     }
134 :     }
135 :     if ($errCount) {
136 :     $retVal = "$errCount invalid characters found.";
137 :     }
138 :     }
139 :     # Return the determination.
140 :     return $retVal;
141 :     }
142 :    
143 :     =head3 encode
144 :    
145 :     my $string = $et->encode($value, $mode);
146 :    
147 :     Encode a value of this field type for storage in the database (or in a database load
148 :     file.)
149 :    
150 :     The parameters are as follows.
151 :    
152 :     =over 4
153 :    
154 :     =item value
155 :    
156 :     Value of this type, for encoding.
157 :    
158 :     =item mode
159 :    
160 :     TRUE if the value is being encoding for placement in a load file, FALSE if it
161 :     is being encoded for use as an SQL statement parameter. In most cases, the
162 :     encoding is the same for both modes.
163 :    
164 :     =back
165 :    
166 :     =cut
167 :    
168 :     sub encode {
169 :     # Get the parameters.
170 :     my ($self, $value, $mode) = @_;
171 :     # Convert the numbers to strings and join them together.
172 : parrello 1.2 my $retVal = join("", map { NumToString($_) } @$value);
173 : parrello 1.1 # Return the result.
174 :     return $retVal;
175 :     }
176 :    
177 :     =head3 decode
178 :    
179 :     my $value = $et->decode($string);
180 :    
181 :     Decode a string from the database into a value of this field type.
182 :    
183 :     The parameters are as follows.
184 :    
185 :     =over 4
186 :    
187 :     =item string
188 :    
189 :     String from the database to be decoded.
190 :    
191 :     =item RETURN
192 :    
193 :     Returns a value of the desired type.
194 :    
195 :     =back
196 :    
197 :     =cut
198 :    
199 :     sub decode {
200 :     # Get the parameters.
201 :     my ($self, $string) = @_;
202 : parrello 1.2 # We'll put the values found in here.
203 :     my @retVal;
204 :     # Loop through the string.
205 :     while ($string =~ /([+\- ][^+\- ]*)/g) {
206 :     push @retVal, StringToNum($1);
207 :     }
208 : parrello 1.1 # Return the result.
209 :     return \@retVal;
210 :     }
211 :    
212 :     =head3 sqlType
213 :    
214 :     my $typeString = $et->sqlType();
215 :    
216 :     Return the SQL data type for this field type.
217 :    
218 :     =cut
219 :    
220 :     sub sqlType {
221 :     return "MEDIUMTEXT";
222 :     }
223 :    
224 :     =head3 indexMod
225 :    
226 :     my $length = $et->indexMod();
227 :    
228 :     Return the index modifier for this field type. The index modifier is the number of
229 :     characters to be indexed. If it is undefined, the field cannot be indexed. If it
230 :     is an empty string, the entire field is indexed. The default is an empty string.
231 :    
232 :     =cut
233 :    
234 :     sub indexMod {
235 :     return undef;
236 :     }
237 :    
238 :     =head3 sortType
239 :    
240 :     my $letter = $et->sortType();
241 :    
242 :     Return the sorting type for this field type. The sorting type is C<n> for integers,
243 :     C<g> for floating-point numbers, and the empty string for character fields.
244 :     The default is the empty string.
245 :    
246 :     =cut
247 :    
248 :     sub sortType {
249 :     return "";
250 :     }
251 :    
252 :     =head3 documentation
253 :    
254 :     my $docText = $et->documentation();
255 :    
256 :     Return the documentation text for this field type. This should be in TWiki markup
257 :     format, though HTML will also work.
258 :    
259 :     =cut
260 :    
261 :     sub documentation() {
262 :     return 'vector of counts';
263 :     }
264 :    
265 :     =head3 name
266 :    
267 :     my $name = $et->name();
268 :    
269 :     Return the name of this type, as it will appear in the XML database definition.
270 :    
271 :     =cut
272 :    
273 :     sub name() {
274 :     return "countVector";
275 :     }
276 :    
277 :     =head3 default
278 :    
279 :     my $defaultValue = $et->default();
280 :    
281 :     Return the default value to be used for fields of this type if no default value
282 :     is specified in the database definition or in an L<ERDBLoadGroup/Put> call
283 :     during a loader operation. The default is undefined, which means an error will
284 :     be thrown during the load.
285 :    
286 :     =cut
287 :    
288 :     sub default {
289 :     return '';
290 :     }
291 :    
292 :     =head3 align
293 :    
294 :     my $alignment = $et->align();
295 :    
296 :     Return the display alignment for fields of this type: either C<left>, C<right>, or
297 :     C<center>. The default is C<left>.
298 :    
299 :     =cut
300 :    
301 :     sub align {
302 :     return 'left';
303 :     }
304 :    
305 :     =head3 html
306 :    
307 :     my $html = $et->html($value);
308 :    
309 :     Return the HTML for displaying the content of a field of this type in an output
310 :     table. The default is the raw value, html-escaped.
311 :    
312 :     =cut
313 :    
314 :     sub html {
315 :     my ($self, $value) = @_;
316 :     # Display the number of values.
317 :     my $retVal = "&lt;" . scalar(@$value) . "-vector&gt;";
318 :     # Return the result.
319 :     return $retVal;
320 :     }
321 :    
322 :     =head2 Vector Manipulation Utilities
323 :    
324 :     =head2 Internal Utilities
325 :    
326 :     =head3 NumToString
327 :    
328 :     my $string = ERDBTypeCountVector::NumToString($number);
329 :    
330 :     Convert an unsigned integer into a base-64 character string for encoding into a count
331 :     vector.
332 :    
333 :     =over 4
334 :    
335 :     =item number
336 :    
337 :     Number to convert.
338 :    
339 :     =item RETURN
340 :    
341 :     Returns a (possibly null) string consisting of characters from the B<DIGITS> string
342 :     that represents the incoming number.
343 :    
344 :     =back
345 :    
346 :     =cut
347 :    
348 :     sub NumToString {
349 :     # Get the parameter.
350 :     my ($number) = @_;
351 : parrello 1.2 # Declare the return variable.
352 :     my $retVal;
353 : parrello 1.1 # Get a copy of the number.
354 :     my $residual = $number;
355 : parrello 1.2 # Check for the specials.
356 :     if ($residual == 0) {
357 :     $retVal = " ";
358 :     } elsif ($residual == -1) {
359 :     $retVal = "-";
360 :     } elsif ($residual == 1) {
361 :     $retVal = "+";
362 :     } else {
363 :     # We'll store our digits in here.
364 :     my @digits;
365 :     # Loop until it's zero.
366 :     while ($residual > 0) {
367 :     # Get the last digit.
368 :     push @digits, substr(DIGITS, $residual & 63, 1);
369 :     # Shift it off.
370 :     $residual >>= 6;
371 :     }
372 :     # Form the digits into a string.
373 :     $retVal = " " . join("", @digits);
374 : parrello 1.1 }
375 : parrello 1.2 # Return the result.
376 :     return $retVal;
377 : parrello 1.1 }
378 :    
379 :     =head3 StringToNum
380 :    
381 :     my $number = ERDBTypeCountVector::StringToNum($string);
382 :    
383 :     Convert a base-64 character string into an unsigned integer for a count vector.
384 :    
385 :     =over 4
386 :    
387 :     =item string
388 :    
389 :     (Possibly null) string to convert.
390 :    
391 :     =item RETURN
392 :    
393 :     Returns the number represented by the incoming string.
394 :    
395 :     =back
396 :    
397 :     =cut
398 :    
399 :     sub StringToNum {
400 :     # Get the parameter.
401 :     my ($string) = @_;
402 :     # We'll store the result in here.
403 :     my $retVal = 0;
404 : parrello 1.2 # Check for the specials.
405 :     if ($string eq ' ') {
406 :     $retVal = 0;
407 :     } elsif ($string eq '-') {
408 :     $retVal = -1;
409 :     } elsif ($string eq '+') {
410 :     $retVal = 1;
411 :     } else {
412 :     # Loop through the string. Note that we ignore the space at the
413 :     # front.
414 :     for (my $i = length($string) - 1; $i > 0; $i--) {
415 :     # Get the current digit.
416 :     my $digit = substr($string, $i, 1);
417 :     # Add it to the number result.
418 :     $retVal = ($retVal << 6) + index(DIGITS, $digit);
419 :     }
420 : parrello 1.1 }
421 :     # Return the result.
422 :     return $retVal;
423 :     }
424 :    
425 :     =head3 VectorLength
426 :    
427 :     my $length = ERDBTypeCountVector::VectorLength($vector);
428 :    
429 :     Return the length of a vector.
430 :    
431 :     =over 4
432 :    
433 :     =item vector
434 :    
435 :     A count vector, represented as a reference to a list of unsigned integers.
436 :    
437 :     =item RETURN
438 :    
439 :     Returns a real number representing the length of the vector.
440 :    
441 :     =cut
442 :    
443 :     sub VectorLength {
444 :     # Get the parameter.
445 :     my ($vector) = @_;
446 :     # Compute the sum of squares.
447 :     my $retVal = 0;
448 :     for my $value (@$vector) {
449 : parrello 1.2 if ($value == 1 || $value == -1) {
450 :     $retVal++
451 :     } elsif ($value != 0) {
452 :     $retVal += $value * $value;
453 :     }
454 : parrello 1.1 }
455 :     # Return the square root.
456 :     return sqrt($retVal);
457 :     }
458 :    
459 :     =head3 DotProduct
460 :    
461 :     my $product = ERDBTypeCountVector::DotProduct($vector1, $vector2);
462 :    
463 :     Compute the normalized dot product of the two incoming vectors. The result will be C<1>
464 :     if the vectors are parallel and C<0> if they are orthogonal.
465 :    
466 :     =over 4
467 :    
468 :     =item vector1
469 :    
470 :     A count vector, represented as a reference to a list of unsigned integers.
471 :    
472 :     =item vector2
473 :    
474 :     Another count vector, represented as a reference to a list of unsigned integers.
475 :    
476 :     =item RETURN
477 :    
478 :     Returns a value between 0 and 1 that indicates the cosine of the angle between the
479 :     two vectors.
480 :    
481 :     =back
482 :    
483 :     =cut
484 :    
485 :     sub DotProduct {
486 :     # Get the parameters.
487 :     my ($vector1, $vector2) = @_;
488 :     # Compute the length of the shortest vector.
489 :     my $len = scalar @$vector1;
490 :     my $len2 = scalar @$vector2;
491 :     if ($len2 < $len) {
492 :     $len = $len2;
493 :     }
494 :     # Compute the sum of products.
495 :     my $retVal = 0;
496 :     for (my $i = 0; $i < $len; $i++) {
497 :     $retVal += $vector1->[$i] * $vector2->[$i];
498 :     }
499 :     # Normalize the result.
500 :     $retVal /= VectorLength($vector1) * VectorLength($vector2);
501 :     # Return it.
502 :     return $retVal;
503 :     }
504 :    
505 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3