[Bio] / FigKernelPackages / raelib.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/raelib.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.47 - (view) (download) (as text)

1 : olson 1.28 #
2 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
3 :     # for Interpretations of Genomes. All Rights Reserved.
4 :     #
5 :     # This file is part of the SEED Toolkit.
6 :     #
7 :     # The SEED Toolkit is free software. You can redistribute
8 :     # it and/or modify it under the terms of the SEED Toolkit
9 :     # Public License.
10 :     #
11 :     # You should have received a copy of the SEED Toolkit Public License
12 :     # along with this program; if not write to the University of Chicago
13 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14 :     # Genomes at veronika@thefig.info or download a copy from
15 :     # http://www.theseed.org/LICENSE.TXT.
16 :     #
17 :    
18 : redwards 1.1 # -*- perl -*-
19 :    
20 :     =pod
21 :    
22 : parrello 1.13 =head1 RAE Library
23 : redwards 1.1
24 :     Some routines and things that Rob uses. Please feel free to use at will and incorporate into
25 :     your own code or move them into FIG.pm or elsewhere.
26 :    
27 : overbeek 1.31 For questions about this email RobE@theFIG.info
28 :    
29 : redwards 1.1 =cut
30 :    
31 :     package raelib;
32 :     use strict;
33 : redwards 1.17 use Bio::SeqIO;
34 :     use Bio::Seq;
35 : overbeek 1.34 use Bio::Tools::SeqStats;
36 : redwards 1.17 use Bio::SeqFeature::Generic;
37 : overbeek 1.31
38 :     # we don't know whether the Spreadsheet::WriteExcel methods are available on all systems, and even on the CI systems they are currently in my shared directory
39 :     # so we use an eval and set the boolean if we are cool.
40 :     my $useexcel;
41 : overbeek 1.38 my $excelfile;
42 :     my $excelfilelink="";
43 : overbeek 1.31 BEGIN {
44 :     eval "use Spreadsheet::WriteExcel";
45 :     unless ($@) {$useexcel=1}
46 :     }
47 :    
48 : overbeek 1.38 END {
49 :     my $self=shift;
50 :     if ($useexcel && $excelfile) {&close_excel_file($excelfile)}
51 :     }
52 :    
53 : overbeek 1.32
54 : redwards 1.1 use FIG;
55 :     my $fig=new FIG;
56 :    
57 : parrello 1.43 =head2 Methods
58 :    
59 :     =head3 new
60 : redwards 1.5
61 :     Just instantiate the object and return $self
62 :    
63 :     =cut
64 :    
65 :     sub new {
66 : redwards 1.27 my ($class)=@_;
67 :     my $self={};
68 : overbeek 1.32 $self->{'useexcel'}=1 if ($useexcel);
69 : redwards 1.27 return bless $self, $class;
70 : redwards 1.5 }
71 :    
72 :    
73 :    
74 :    
75 : parrello 1.43 =head3 features_on_contig
76 : redwards 1.4
77 :     Returns a reference to an array containing all the features on a contig in a genome.
78 :    
79 :     use:
80 :    
81 :     my $arrayref=$rae->features_on_contig($genome, $contig);
82 :    
83 :     or
84 :    
85 :     foreach my $peg (@{$rae->features_on_contig($genome, $contig)}) {
86 :     ... blah blah ...
87 :     }
88 :    
89 :     returns undef if contig is not a part of genome or there is nothing to return, otherwise returns a list of pegs
90 :    
91 :     v. experimental and guaranteed not to work!
92 :    
93 :     =cut
94 :    
95 :     sub features_on_contig {
96 :     my ($self, $genome, $contig)=@_;
97 :     # were this in FIG.pm you'd use this line:
98 :     #my $rdbH = $self->db_handle;
99 :    
100 :     my $rdbH = $fig->db_handle;
101 :     my $relational_db_response=$rdbH->SQL('SELECT id FROM features WHERE (genome = \'' . $genome . '\' AND location ~* \'' . $contig . '\')');
102 :     # this is complicated. A reference to an array of references to arrays, and we only want the first element.
103 :     # simplify.
104 :     my @results;
105 :     foreach my $res (@$relational_db_response) {push @results, $res->[0]}
106 :     return \@results;
107 :     }
108 :    
109 :    
110 : redwards 1.45 =head2 pegs_in_order
111 :    
112 :     Given a genome id, returns a list of the pegs in order along the genome.
113 :    
114 :     my @pegs_in_order = $rae->pegs-in_order($genome);
115 :    
116 :     This code is actually taken from adjacent.pl but put here to be useful
117 :    
118 :     =cut
119 :    
120 :     sub pegs_in_order {
121 :     my ($self, $genome) = @_;
122 :     my @pegs = map { $_->[0] }
123 :     sort { ($a->[1] cmp $b->[1]) or ($a->[2] <=> $b->[2]) }
124 :     map { my $peg = $_;
125 :     if (my $loc = $fig->feature_location($peg) )
126 :     {
127 :     my ($contig,$beg,$end) = $fig->boundaries_of($loc);
128 :     [$peg,$contig,&FIG::min($beg,$end)];
129 :     }
130 :     else
131 :     {
132 :     ();
133 :     }
134 :     }
135 :     $fig->pegs_of($genome);
136 :     return @pegs;
137 :     }
138 :    
139 : parrello 1.43 =head3 mol_wt
140 : redwards 1.4
141 : overbeek 1.34 Calculate the molecular weight of a protein.
142 : redwards 1.4
143 : overbeek 1.34 This just offlaods the calculation to BioPerl, which is probably dumb since we need to load the whole of bioperl in just for this, but I don't have time to rewrite their method now, and I am not going to copy and paste it since I didn't write it :)
144 : redwards 1.4
145 : overbeek 1.34 my ($lower, $upper)=$raelib->mol_wt($peg);
146 :    
147 :     $lower is the lower bound for the possible mw, upper is the upper bound.
148 :    
149 :     =cut
150 :    
151 :     sub mol_wt {
152 :     my ($self, $fid)=@_;
153 :     my $sobj=Bio::Seq->new(-seq => $fig->get_translation($fid), -id => $fid);
154 :     return Bio::Tools::SeqStats->get_mol_wt($sobj);
155 :     }
156 : redwards 1.4
157 :    
158 : parrello 1.43 =head3 pirsfcorrespondence
159 : redwards 1.1
160 : redwards 1.18 Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written. This is so that we can easily go back and forth.
161 : redwards 1.1
162 : redwards 1.18 The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat
163 : redwards 1.1
164 : redwards 1.18 This method takes three arguments:
165 : redwards 1.9 from : pirsfinfo.dat file
166 :     to : file to write information to
167 :     verbose : report on progress
168 :    
169 : redwards 1.18 Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
170 :    
171 :     Returns the number of lines in the pirsinfo file that were read.
172 : redwards 1.9
173 : redwards 1.1 =cut
174 :    
175 : redwards 1.7 sub pirsfcorrespondence {
176 : redwards 1.9 my ($self, $from, $to, $verbose)=@_;
177 : redwards 1.10 unless (-e $from) {
178 :     print STDERR "File $from does not exist as called in $0\n";
179 :     return 0;
180 :     }
181 : redwards 1.18 if ($from =~ /\.gz$/) {
182 :     open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
183 :     }
184 :     else {
185 :     open (IN, $from) || die "Can't open $from";
186 :     }
187 : redwards 1.8 open (OUT, ">$to") || die "Can't write to $to";
188 : redwards 1.9 my $linecount;
189 : redwards 1.1 while (<IN>) {
190 : redwards 1.9 $linecount++;
191 : redwards 1.14 if ($verbose && !($linecount % 10000)) {print STDERR "Parsed $linecount lines\n"}
192 : redwards 1.1 if (/^>/) {print OUT; next}
193 :     chomp;
194 : redwards 1.14 foreach my $peg ($self->swiss_pir_ids($_)) {
195 : redwards 1.1 print OUT $_, "\t", $peg, "\n";
196 :     }
197 : redwards 1.14 }
198 :     close IN;
199 :     close OUT;
200 :     return $linecount;
201 :     }
202 :    
203 : parrello 1.43 =head3 uniprotcorrespondence
204 : redwards 1.14
205 :     Generate a correspondence table between uniprot knowledge base IDs and FIG ID's.
206 :    
207 :     The uniprot KB file is in the form: UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name
208 :    
209 :     This method takes three arguments:
210 :     from : uniprotKB file
211 :     to : file to write information to
212 :     verbose : report on progress
213 :    
214 : redwards 1.18 Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
215 :    
216 :     Returns the number of lines in the uniprotkb file that were read.
217 : redwards 1.14
218 :     =cut
219 :    
220 :     sub uniprotcorrespondence {
221 :     my ($self, $from, $to, $verbose)=@_;
222 :     unless (-e $from) {
223 :     print STDERR "File $from does not exist as called in $0\n";
224 :     return 0;
225 :     }
226 : redwards 1.18 if ($from =~ /\.gz$/) {
227 :     open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
228 :     }
229 :     else {
230 :     open (IN, $from) || die "Can't open $from";
231 :     }
232 : redwards 1.14 open (OUT, ">$to") || die "Can't write to $to";
233 :     my $linecount;
234 :     while (<IN>) {
235 :     chomp;
236 :     $linecount++;
237 :     if ($verbose && !($linecount % 10000)) {print STDERR "Parsed $linecount lines\n"}
238 :     my @line=split /\s+\|\s+/;
239 : redwards 1.16 my $added;
240 : redwards 1.14 foreach my $peg ($self->swiss_pir_ids($line[0])) {
241 :     print OUT "$_ | $peg\n";
242 : redwards 1.16 $added=1;
243 : redwards 1.12 }
244 : redwards 1.16 unless ($added) {print OUT "$_\n"}
245 : redwards 1.1 }
246 :     close IN;
247 :     close OUT;
248 : redwards 1.9 return $linecount;
249 : redwards 1.1 }
250 :    
251 : parrello 1.43 =head3 prositecorrespondence
252 : redwards 1.18
253 :     Generate a correspondence table between prosite and seed using sp id's and seed ids.
254 :    
255 :     The SwissProt prosite file is from ftp://ca.expasy.org/databases/prosite/release_with_updates/prosite.dat and is in horrible swiss prot format, so we'll parse out those things that we need and put them in the file
256 :    
257 :     The output file will have the following columns:
258 :    
259 :     prosite family accession number, prosite family name, family type, swiss-prot protein id, fig protein id.
260 :    
261 :     The family type is one of rule, pattern, or matrix. Right now (Prosite Release 19.2 of 24-May-2005) there are 4 rules, 1322 patterns, and 521 matrices.
262 :    
263 :     This method takes three arguments:
264 :     from : prosite file
265 :     to : file to write information to
266 :     verbose : report on progress
267 :    
268 :     Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
269 :    
270 :     Returns the number of lines in the prosite file that were read.
271 :    
272 :     =cut
273 :    
274 :     sub prositecorrespondence {
275 :     my ($self, $from, $to, $verbose)=@_;
276 :     unless (-e $from) {
277 :     print STDERR "File $from does not exist as called in $0\n";
278 :     return 0;
279 :     }
280 :     if ($from =~ /\.gz$/) {
281 :     open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
282 :     }
283 :     else {
284 :     open (IN, $from) || die "Can't open $from";
285 :     }
286 :     open (OUT, ">$to") || die "Can't write to $to";
287 :     my $linecount;
288 :     my ($famac, $famname, $famtype)=('','','');
289 :     while (<IN>) {
290 :     chomp;
291 :     $linecount++;
292 :     if ($verbose && !($linecount % 10000)) {print STDERR "Parsed $linecount lines\n"}
293 :     if (m#//#) {($famac, $famname, $famtype)=('','',''); next}
294 :     elsif (m/^ID\s*(.*?);\s*(\S+)/) {($famname, $famtype)=($1, $2); next}
295 : redwards 1.19 elsif (m/^AC\s*(\S+)/) {$famac=$1; $famac =~ s/\;\s*$//; next}
296 : redwards 1.18 next unless (m/^DR/); # ignore all the other crap in the prosite file for now. Note we might, at some point, want to grab all that, but that is for another time.
297 :     #
298 :     # this is the format of the DR lines:
299 :     # DR P11460, FATB_VIBAN , T; P40409, FEUA_BACSU , T; P37580, FHUD_BACSU , T;
300 :     s/^DR\s*//;
301 :     foreach my $piece (split /\s*\;\s*/, $_) {
302 :     my ($acc, $nam, $unk)=split /\s*\,\s*/, $piece;
303 :     foreach my $fig ($self->swiss_pir_ids($acc)) {
304 : redwards 1.20 print OUT join "\t", ($famac, $famname, $famtype, $acc, $fig), "\n";
305 : redwards 1.18 }
306 :     }
307 :     }
308 :     }
309 : redwards 1.14
310 : parrello 1.43 =head3 swiss_pir_ids()
311 : redwards 1.14
312 : redwards 1.18 SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's, and return an array of FIG id's that match the ID.
313 : redwards 1.14
314 :     =cut
315 :    
316 :     sub swiss_pir_ids {
317 :     my ($self, $id)=@_;
318 :     return () unless ($id);
319 : redwards 1.18 $id =~ s/^\s+//; $id =~ s/\s+$//; # trim off the whitespace
320 :    
321 : redwards 1.15 my @return=($fig->by_alias("uni|$id"));
322 : redwards 1.14 return @return if ($return[0]);
323 :    
324 : redwards 1.15 @return=($fig->by_alias("tr|$id"));
325 : redwards 1.14 return @return if ($return[0]);
326 :    
327 : redwards 1.15 @return=($fig->by_alias("sp|$id"));
328 : redwards 1.14 return @return if ($return[0]);
329 :    
330 :     return ();
331 :     }
332 : redwards 1.1
333 : parrello 1.43 =head3 ss_by_id
334 : redwards 1.1
335 :     Generate a list of subsystems that a peg occurs in. This is a ; separated list.
336 :     This is a wrapper that removes roles and ignores essential things
337 :    
338 :     =cut
339 :    
340 :     sub ss_by_id {
341 :     my ($self, $peg)=@_;
342 :     my $ssout;
343 :     foreach my $ss (sort $fig->subsystems_for_peg($peg))
344 :     {
345 :     next if ($$ss[0] =~ /essential/i); # Ignore the Essential B-subtilis subsystems
346 :     $ssout.=$$ss[0]."; ";
347 :     }
348 :     $ssout =~ s/; $//;
349 :     return $ssout;
350 :     }
351 :    
352 : parrello 1.43 =head3 ss_by_homol
353 : redwards 1.3
354 :     Generate a list of subsystems that homologs of a peg occur in. This is a ; separated list.
355 :     This is also a wrapper around sims and ss, but makes everything unified
356 :    
357 :     =cut
358 :    
359 :     sub ss_by_homol {
360 :     my ($self, $peg)=@_;
361 :     return unless ($peg);
362 :     my ($maxN, $maxP)=(50, 1e-20);
363 :    
364 :     # find the sims
365 :     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
366 :    
367 :     # we are only going to keep the best hit for each peg
368 :     # in a subsystem
369 :     my $best_ss_score; my $best_ss_id;
370 :     foreach my $sim (@sims)
371 :     {
372 :     my $simpeg=$$sim[1];
373 :     my $simscore=$$sim[10];
374 :     my @subsys=$fig->subsystems_for_peg($simpeg);
375 :     foreach my $ss (@subsys)
376 :     {
377 :     if (! defined $best_ss_score->{$$ss[0]}) {$best_ss_score->{$$ss[0]}=$simscore; $best_ss_id->{$$ss[0]}=$simpeg}
378 :     elsif ($best_ss_score->{$$ss[0]} > $simscore)
379 :     {
380 :     $best_ss_score->{$$ss[0]}=$simscore;
381 :     $best_ss_id->{$$ss[0]}=$simpeg;
382 :     }
383 :     }
384 :     }
385 :    
386 :     my $ssoutput=join "", (map {"$_ (".$best_ss_id->{$_}."), "} keys %$best_ss_id);
387 :    
388 :     $ssoutput =~ s/, $//;
389 :     return $ssoutput;
390 :     }
391 :    
392 : parrello 1.43 =head3 tagvalue
393 : redwards 1.3
394 :     This will just check for tag value pairs and return either an array of values or a single ; separated list (if called as a scalar)
395 :    
396 :     e.g. $values=raelib->tagvalue($peg, "PIRSF"); print join "\n", @$values;
397 :    
398 :     Returns an empty array if no tag/value appropriate.
399 :    
400 :     Just because I use this a lot I don't want to waste rewriting it.
401 :    
402 :     =cut
403 :    
404 :     sub tagvalue {
405 :     my ($self, $peg, $tag)=@_;
406 :     my @return;
407 :     my @attr=$fig->feature_attributes($peg);
408 :     foreach my $attr (@attr) {
409 : redwards 1.11 my ($gotpeg, $gottag, $val, $link)=@$attr;
410 : redwards 1.3 push @return, $val if ($gottag eq $tag);
411 :     }
412 :     return wantarray ? @return : join "; ", @return;
413 :     }
414 : redwards 1.1
415 : parrello 1.43 =head3 locations_on_contig
416 : redwards 1.5
417 :     Return the locations of a sequence on a contig.
418 :    
419 :     This will look for exact matches to a sequence on a contig, and return a reference to an array that has all the locations.
420 :    
421 :     my $locations=$raelib->locations_on_contig($genome, $contig, 'GATC', undef);
422 :     foreach my $bp (@$locations) { ... do something ... }
423 :    
424 :     first argument : genome number
425 :     second argument : contig name
426 :     third argument : sequence to look for
427 :     fourth argument : beginning position to start looking from (can be undef)
428 :     fifth argument : end position to stop looking from (can be undef)
429 :     sixth argument : check reverse complement (0 or undef will check forward, 1 or true will check rc)
430 :    
431 :     Note, the position is calculated before the sequence is rc'd
432 :    
433 :     =cut
434 :    
435 :     sub locations_on_contig {
436 :     my ($self, $genome, $contig, $sequence, $from, $to, $check_reverse)=@_;
437 :     my $return=[];
438 :    
439 :     # get the dna sequence of the contig, and make sure it is uppercase
440 :     my $contig_ln=$fig->contig_ln($genome, $contig);
441 :     return $return unless ($contig_ln);
442 :     unless ($from) {$from=1}
443 :     unless ($to) {$to=$contig_ln}
444 :     if ($from > $to) {($from, $to)=($to, $from)}
445 :     my $dna_seq=$fig->dna_seq($genome, $contig."_".$from."_".$to);
446 :     $dna_seq=uc($dna_seq);
447 :    
448 :     # if we want to check the rc, we actually rc the query
449 :     $sequence=$fig->reverse_comp($sequence) if ($check_reverse);
450 :     $sequence=uc($sequence);
451 :    
452 :     # now find all the matches
453 :     my $posn=index($dna_seq, $sequence, 0);
454 :     while ($posn > -1) {
455 :     push @$return, $posn;
456 :     $posn=index($dna_seq, $sequence, $posn+1);
457 :     }
458 :     return $return;
459 :     }
460 :    
461 :    
462 : parrello 1.43 =head3 scrolling_org_list
463 : redwards 1.5
464 :     This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.
465 :    
466 : overbeek 1.34 use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple, $default, $limit);
467 : redwards 1.5
468 :     multiple selections will only be set if $multiple is true
469 :    
470 : overbeek 1.24 default will set a default to override (maybe) korgs
471 :    
472 : overbeek 1.34 limit is a reference to an array of organism IDs that you want to limit the list to.
473 :    
474 : redwards 1.5 =cut
475 :    
476 :     sub scrolling_org_list {
477 : overbeek 1.34 my ($self, $cgi, $multiple, $default, $limit)=@_;
478 : redwards 1.5 unless ($multiple) {$multiple=0}
479 :    
480 :     my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );
481 :    
482 :     #
483 :     # Canonical names must match the keywords used in the DBMS. They are
484 :     # defined in compute_genome_counts.pl
485 :     #
486 :     my %canonical = (
487 :     'All' => undef,
488 :     'Archaea' => 'Archaea',
489 :     'Bacteria' => 'Bacteria',
490 :     'Eucarya' => 'Eukaryota',
491 :     'Viruses' => 'Virus',
492 :     'Environmental samples' => 'Environmental Sample'
493 :     );
494 :    
495 :     my $req_dom = $cgi->param( 'domain' ) || 'All';
496 :     my @domains = $cgi->radio_group( -name => 'domain',
497 :     -default => $req_dom,
498 :     -override => 1,
499 :     -values => [ @display ]
500 :     );
501 :    
502 :     my $n_domain = 0;
503 :     my %dom_num = map { ( $_, $n_domain++ ) } @display;
504 :     my $req_dom_num = $dom_num{ $req_dom } || 0;
505 :    
506 :     #
507 :     # Viruses and Environmental samples must have completeness = All (that is
508 :     # how they are in the database). Otherwise, default is Only "complete".
509 :     #
510 :     my $req_comp = ( $req_dom_num > $dom_num{ 'Eucarya' } ) ? 'All'
511 :     : $cgi->param( 'complete' ) || 'Only "complete"';
512 :     my @complete = $cgi->radio_group( -name => 'complete',
513 :     -default => $req_comp,
514 :     -override => 1,
515 :     -values => [ 'All', 'Only "complete"' ]
516 :     );
517 :     #
518 :     # Use $fig->genomes( complete, restricted, domain ) to get org list:
519 :     #
520 :     my $complete = ( $req_comp =~ /^all$/i ) ? undef : "complete";
521 :    
522 :     my $orgs; my $label;
523 :     @$orgs = $fig->genomes( $complete, undef, $canonical{ $req_dom } );
524 : overbeek 1.34
525 :     # limit the list of organisms to a selected few if required
526 :     if ($limit)
527 :     {
528 :     my %lim=map {($_=>1)} @$limit;
529 :     my $norg;
530 :     foreach my $o (@$orgs) {push @$norg, $o if ($lim{$o})}
531 :     $orgs=$norg;
532 :     }
533 : redwards 1.5
534 :     foreach (@$orgs) {
535 :     my $gs = $fig->genus_species($_);
536 : overbeek 1.35 if ($fig->genome_domain($_) ne "Environmental Sample")
537 :     {
538 :     my $gc=$fig->number_of_contigs($_);
539 :     $label->{$_} = "$gs ($_) [$gc contigs]";
540 :     }
541 :     else
542 :     {
543 :     $label->{$_} = "$gs ($_) ";
544 :     }
545 : redwards 1.5 }
546 :    
547 :     @$orgs = sort {$label->{$a} cmp $label->{$b}} @$orgs;
548 :    
549 :     my $n_genomes = @$orgs;
550 :    
551 :     return ( "<TABLE>\n",
552 :     " <TR>\n",
553 :     " <TD>",
554 : redwards 1.6 $cgi->scrolling_list( -name => 'korgs',
555 :     -values => $orgs,
556 :     -labels => $label,
557 :     -size => 10,
558 :     -multiple => $multiple,
559 : overbeek 1.24 -default => $default,
560 : redwards 1.5 ), $cgi->br,
561 :     "$n_genomes genomes shown ",
562 :     $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,
563 :     " </TD>",
564 :     " <TD>",
565 :     join( "<br>", "<b>Domain(s) to show:</b>", @domains), "<br>\n",
566 :     join( "<br>", "<b>Completeness?</b>", @complete), "\n",
567 :     "</TD>",
568 :     " </TR>\n",
569 :     "</TABLE>\n",
570 :     );
571 :     }
572 :    
573 : redwards 1.21
574 : parrello 1.43 =head3 scrolling_subsys_list
575 : redwards 1.21
576 :     Create a scrolling list of all subsystems. Just like scrolling_org_list, this will make the list and allow you to select multiples.
577 :    
578 :     use like this
579 :    
580 :     push @$html, $raelib->scrolling_subsys_list($cgi, $multiple);
581 :    
582 :     =cut
583 :    
584 :     sub scrolling_subsys_list {
585 :     my ($self, $cgi, $multiple)=@_;
586 :     $multiple=0 unless (defined $multiple);
587 : redwards 1.22 my @ss=sort {uc($a) cmp uc($b)} $fig->all_subsystems();
588 : redwards 1.21 my $label;
589 :     # generate labels for the list
590 :     foreach my $s (@ss) {my $k=$s; $k =~ s/\_/ /g; $k =~ s/ / /g; $k =~ s/\s+$//; $label->{$s}=$k}
591 :     return $cgi->scrolling_list(
592 :     -name => 'subsystems',
593 :     -values => \@ss,
594 :     -labels => $label,
595 :     -size => 10,
596 :     -multiple=> $multiple,
597 :     );
598 :     }
599 :    
600 : parrello 1.43 =head3 subsys_names_for_display
601 : redwards 1.21
602 :     Return a list of subsystem names for display. This will take a list as an argument and return a nice clean list for display.
603 :    
604 :     $raelib->subsys_names_for_display(@ss);
605 :     or
606 :     $raelib->subsys_names_for_display($fig->all_subsystems());
607 :    
608 :     =cut
609 :    
610 :     sub subsys_names_for_display {
611 :     my ($self, @ss)=@_;
612 :     foreach (@ss) {s/\_/ /g; 1 while (s/ / /g); s/\s+$//}
613 :     return @ss;
614 :     }
615 :    
616 : parrello 1.43 =head3 GenBank
617 : redwards 1.5
618 : redwards 1.17 This object will take a genome number and return a Bio::Seq::RichSeq object that has the whole genome
619 :     in GenBank format. This should be a nice way of getting some data out, but will probably be quite slow
620 :     at building the object.
621 : redwards 1.1
622 : redwards 1.17 Note that you need to call this with the genome name and the contig. This will then go through that contig.
623 : redwards 1.1
624 : redwards 1.17 Something like this should work
625 :    
626 :     foreach my $contig ($fig->all_contigs($genome)) {
627 :     my $seqobj=FIGRob->GenBank($genome, $contig);
628 :     # process the contig
629 :     }
630 :    
631 :     =cut
632 :    
633 :     sub GenBank {
634 :     my ($self, $genome, $contig)=@_;
635 :     my $gs=$fig->genus_species($genome);
636 :     return unless ($gs);
637 :     unless ($contig) {
638 :     print STDERR "You didn't provide a contig for $gs. I think that was a mistake. Sorry\n";
639 :     return;
640 :     }
641 :     my $len=$fig->contig_ln($genome, $contig);
642 :     unless ($len) {
643 :     print STDERR "$contig from $gs doesn't appear to have a length. Is it right?\n";
644 :     return;
645 :     }
646 :    
647 :    
648 :     # first find all the pegs ...
649 :     my $features; # all the features in the genome
650 :     my $allpegs; # all the pegs
651 :     my $translation; # all the protein sequences
652 :     foreach my $peg ($fig->pegs_of($genome)) {
653 :     my @location=$fig->feature_location($peg);
654 :     my $func=$fig->function_of($peg);
655 :     foreach my $loc (@location) {
656 :     $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
657 :     my ($cg, $start, $stop)=($1, $2, $3);
658 :     next unless ($cg eq $contig);
659 :     # save this information for later
660 :     $features->{'peg'}->{$loc}=$func;
661 :     $allpegs->{'peg'}->{$loc}=$peg;
662 :     $translation->{'peg'}->{$loc}=$fig->get_translation($peg);
663 :     }
664 :     }
665 :     # ... and all the RNAs
666 :     foreach my $peg ($fig->rnas_of($genome)) {
667 :     my @location=$fig->feature_location($peg);
668 :     my $func=$fig->function_of($peg);
669 :     foreach my $loc (@location) {
670 :     $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
671 :     my ($cg, $start, $stop)=($1, $2, $3);
672 :     next unless ($cg eq $contig);
673 :     # save this information for later
674 :     $features->{'rna'}->{$loc}=$func;
675 :     $allpegs->{'rna'}->{$loc}=$peg;
676 :     }
677 :     }
678 :    
679 :    
680 :     # now get all the contigs out
681 :     my $seq=$fig->dna_seq($genome, $contig."_1_".$len);
682 :     my $description = "Contig $contig from " . $fig->genus_species($genome);
683 :     my $sobj=Bio::Seq->new(
684 :     -seq => $seq,
685 :     -id => $contig,
686 :     -desc => $description,
687 :     -accession_number => $genome
688 :     );
689 :     foreach my $prot (keys %{$features->{'peg'}}) {
690 :     $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
691 :     my ($cg, $start, $stop)=($1, $2, $3);
692 :     my $strand=1;
693 :     if ($stop < $start) {
694 :     ($stop, $start)=($start, $stop);
695 :     $strand=-1;
696 :     }
697 :    
698 :     my $feat=Bio::SeqFeature::Generic->new(
699 :     -start => $start,
700 :     -end => $stop,
701 :     -strand => $strand,
702 :     -primary => 'CDS',
703 :     -display_name => $allpegs->{'peg'}->{$prot},
704 :     -source_tag => 'the SEED',
705 :     -tag =>
706 :     {
707 :     db_xref => $allpegs->{'peg'}->{$prot},
708 :     note => 'Generated by the Fellowship for the Interpretation of Genomes',
709 :     function => $features->{'peg'}->{$prot},
710 :     translation => $translation->{'peg'}->{$prot}
711 :     }
712 :     );
713 :    
714 :     $sobj->add_SeqFeature($feat);
715 :     }
716 :    
717 :     foreach my $prot (keys %{$features->{'rna'}}) {
718 :     $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
719 :     my ($cg, $start, $stop)=($1, $2, $3);
720 :     my $strand=1;
721 :     if ($stop < $start) {
722 :     ($stop, $start)=($start, $stop);
723 :     $strand=-1;
724 :     }
725 :    
726 :     my $feat=Bio::SeqFeature::Generic->new(
727 :     -start => $start,
728 :     -end => $stop,
729 :     -strand => $strand,
730 :     -primary => 'RNA',
731 :     -source_tag => 'the SEED',
732 :     -display_name => $allpegs->{'rna'}->{$prot},
733 :     -tag =>
734 :     {
735 :     db_xref => $allpegs->{'rna'}->{$prot},
736 :     note => 'Generated by the Fellowship for the Interpretation of Genomes',
737 :     function => $features->{'rna'}->{$prot},
738 :     }
739 :     );
740 :    
741 :     $sobj->add_SeqFeature($feat);
742 :     }
743 :     return $sobj;
744 :     }
745 :    
746 : parrello 1.43 =head3 best_hit
747 : redwards 1.17
748 :     Returns the FIG id of the single best hit to a peg
749 :    
750 :     eg
751 :    
752 :     my $bh=$fr->best_hit($peg);
753 :     print 'function is ', scalar $fig->function_of($bh);
754 :    
755 :     =cut
756 :    
757 :     sub best_hit {
758 :     my ($self, $peg)=@_;
759 :     return unless ($peg);
760 :    
761 :     my ($maxN, $maxP)=(1, 1e-5);
762 :     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
763 :     return ${$sims[0]}[1];
764 :     }
765 : redwards 1.1
766 : redwards 1.23
767 : parrello 1.43 =head3 read_fasta
768 : redwards 1.23
769 :     Read a fasta format file and return a reference to a hash with the data. The key is the ID and the value is the sequence. If you supply the optional keep comments then the comments (anything after the first white space are returned as a sepaarte hash).
770 :    
771 :     Usage:
772 :     my $fasta=$raelib->read_fasta($file);
773 :     my ($fasta, $comments)=$raelib->read_fasta($file, 1);
774 :    
775 :     =cut
776 :    
777 :     sub read_fasta {
778 :     my ($self, $file, $keepcomments)=@_;
779 : redwards 1.41 if ($file =~ /\.gz$/) {open(IN, "gunzip -c $file|") || die "Can't open a pipe from gunzip -c $file"}
780 : redwards 1.46 elsif ($file =~ /.zip$/) {open(IN, "unzip -p $file|") || die "can't open a pipe from unzip -p $file"}
781 : redwards 1.41 else {open (IN, $file) || die "Can't open $file"}
782 : redwards 1.23 my %f; my $t; my $s; my %c;
783 : overbeek 1.26 while (<IN>) {
784 : redwards 1.23 chomp;
785 :     if (/^>/) {
786 :     if ($s) {
787 :     $f{$t}=$s;
788 :     undef $s;
789 :     }
790 :     s/^>(\S+)\s*//;
791 :     $t=$1;
792 :     $c{$t}=$_ if ($_);
793 :     }
794 :     else {$s .= $_}
795 :     }
796 :     $f{$t}=$s;
797 :     if ($keepcomments) {return (\%f, \%c)}
798 :     else {return \%f}
799 :     }
800 :    
801 : parrello 1.43 =head3 rc
802 : redwards 1.23
803 :     Reverse complement. It's too easy.
804 :    
805 :     =cut
806 :    
807 :     sub rc {
808 :     my ($self, $seq)=@_;
809 : redwards 1.47 $seq =~ tr/acgtrymkbdhvACGTRYMKBDHV/tgcayrkmvhdbTGCAYRKMVHDB/;
810 : redwards 1.23 $seq = reverse $seq;
811 :     return $seq;
812 :     }
813 :    
814 : redwards 1.27
815 : parrello 1.43 =head3 cookies
816 : redwards 1.27
817 :     Handle cookies. This method will get and set the value of the FIG cookie. Cookies are name/value pairs that are stored on the users computer. We then retrieve them using this method. The cookies are passed in as a reference to a hash, and the method returns a tuple of the cookie that can be passed to the browser and a reference to a hash with the data.
818 :    
819 :     If you do not pass any arguments the whole cookie will be returned.
820 :    
821 :     Use as follows:
822 :    
823 :     ($cookie, $data) = raelib->cookie($cgi, \%data);
824 :    
825 :     You do not need to pass in any data, in that case you will just get the cookie back
826 :    
827 :     Underneath, I create a single cookie called FIG which stores all the information. The names and value pairs are stored using = to join name to value and ; to concatenate. This way we can create a single cookie with all the data. I am using the FIG::clean_attribute_key method to remove unwanted characters from the name/value pairs, so don't use them.
828 :    
829 :     Note that for the moment I have put this routine here since it needs to maintain the state of the cookie (i.e. it needs to know what $self is). It should really be in HTML.pm but that is not, as far as I can tell, maintaining states?
830 :    
831 :     =cut
832 :    
833 :     sub cookie {
834 :     my ($self, $cgi, $input)=@_;
835 :     return unless ($cgi);
836 :     $self->{'cookie'}=$cgi->cookie(-name=>"FIG") unless ($self->{'cookie'});
837 :    
838 :     # first, create a hash from the existing cookie data
839 :     my $cookie;
840 :     map {
841 :     my ($kname, $kvalue)=split /\=/, $_;
842 :     $cookie->{$kname}=$kvalue;
843 :     } split /\;/, $self->{'cookie'};
844 :    
845 :     if ($input)
846 :     {
847 :     # add the values that were passed in
848 :     map {$cookie->{FIG->clean_attribute_key($_)}=$input->{$_}} keys %$input;
849 :     # put everything back together and set the cookie
850 :     my $newcookie=join ";", map {$_ . "=" . $cookie->{$_}} keys %$cookie;
851 :     $self->{'cookie'}=$cgi->cookie(-name=>"FIG", -value=>$newcookie, -expires=>'+1y');
852 :     }
853 :    
854 :     return ($self->{'cookie'}, $cookie);
855 :     }
856 :    
857 :    
858 : parrello 1.43 =head3 is_number
859 : overbeek 1.34
860 :     returns 1 if the argument is a number, and 0 if not. This is taken directly from the perl cookbook.
861 :    
862 :     =cut
863 :    
864 :     sub is_number {
865 :     my ($self, $no)=@_;
866 :     return 1 if ($no =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/); # Perl cookbook, page 44
867 :     return 0;
868 :     }
869 :    
870 :    
871 : redwards 1.27
872 : parrello 1.43 =head3 commify
873 : overbeek 1.30
874 :     Put commas in numbers. I think this comes straight from the perl cookbook and is very useful for nice displays
875 :    
876 :     =cut
877 :    
878 :     sub commify {
879 :     my($self,$n) = @_;
880 :     my(@n) = ();
881 :     my($i);
882 :    
883 :     for ($i = (length($n) - 3); ($i > 0); $i -= 3)
884 :     {
885 :     unshift(@n,",",substr($n,$i,3));
886 :     }
887 :     unshift(@n,substr($n,0,$i+3));
888 :     return join("",@n);
889 :     }
890 :    
891 :    
892 : parrello 1.43 =head3 tab2excel
893 : overbeek 1.31
894 :     This is experimental as of May, 2006.
895 :    
896 :     There are a couple of perl modules that allow you to write to excel files, and so I am trying out the idea of taking our standard $tab table respresentation that is used in HTML.pm and making an excel file that people could download. It seems like that would be a great tool for them to have.
897 :    
898 : overbeek 1.32 At the moment the excel modules are in my shared space on the CI machines, and so won't work in every seed installation. Therefore the $self->{'useexcel'} boolean is set at compile time if we successfully load the module.
899 : overbeek 1.31
900 :     The issues are:
901 :     1. creating the excel file
902 :     2. reading through @$tab and presenting the data
903 :     3. Checking @$tab because each element can be a reference to an array with color or formatting information
904 :    
905 :     Formatting
906 :    
907 :     A separate set of formats must be created for each color and font combination since the formats are applied at the end of the processing of the file.
908 :    
909 :    
910 :     Usage:
911 : overbeek 1.38
912 :     The recommended way of using this set of modules is to add the options excelfile=>"filename" to the options passed to &HTML::make_table. That should take care of EVERYTHING for you, so you should do that. You can call thinks separetly if you like, but I don't recommend it.
913 :    
914 :     Note that you can make multiple calls to the same excel file,a nd each one will get added as a new sheet.
915 :    
916 : overbeek 1.31 Note the usage is ALMOST the same as make_table, but not quite. First, options is a reference to a hash rather than the hash itself
917 :     and second, the additional option "filename" that is the filename to be written;
918 :    
919 :     $url = $raelib->tab2excel($col_hdrs, $tab, $title, $options, "filename");
920 :    
921 :     The filename will be created in $FIG_Config::temp. The extension .xls will be added to the filename if it is not present.
922 :    
923 :     Returns:
924 :     A link to the file in the format
925 :     <p><a href="...">filename</a> [Download Excel file]</p>
926 :    
927 : overbeek 1.32 Note that there are four separate methods:
928 :     1. tab2excel is the method for a single call from HTML::make_table
929 :     this will make an excel file, fill it, and return the link;
930 :     2. make_excel_workbook is the method that instantiates a file
931 :     3. make_excel_worksheet is the method that actually populates the file
932 :     this loads all the data into the excel file, but if you know what you are doing you can call this many times,
933 :     each with a different spreadsheet
934 :     4. close_excel_file
935 : overbeek 1.38 this closes the file and writes it. This is called in the END block, so you do not have to explicitly call it here.
936 : overbeek 1.32
937 :     tab2excel is a wrapper for all three so that the method in HTML::make_table is really easy.
938 :     See subsys.cgi for a more complex involvement of this!
939 :    
940 : overbeek 1.31
941 :     =cut
942 :    
943 :     sub tab2excel {
944 :     my($self, $col_hdrs, $tab, $title, $options, $filename)=@_;
945 : overbeek 1.40 return "" unless ($self->{'useexcel'});
946 :     #return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
947 : overbeek 1.38 $self->{'excel_file_link'} = $self->make_excel_workbook($filename, $options);
948 :     $excelfilelink=$self->{'excel_file_link'};
949 : overbeek 1.32 $self->make_excel_worksheet($col_hdrs, $tab, $title);
950 : overbeek 1.38 return "" if ($options->{'no_excel_link'});
951 :     return $self->{'excel_file_link'};
952 : overbeek 1.32 }
953 : overbeek 1.31
954 : overbeek 1.38
955 : parrello 1.43 =head3 excel_file_link
956 : overbeek 1.38
957 :     Just returns the link to the file, if one has been created. If not, returns a non-breaking space (&nbsp;)
958 :    
959 :     =cut
960 :    
961 :     sub excel_file_link {
962 :     my $self=shift;
963 :     # I am not sure why, but this is not working. Perhaps because I am calling it from &HTML::make_table (i.e. not OO perl?)
964 :     #print STDERR "SELF: $self LINK: ",$self->{'excel_file_link'}," or \n$excelfilelink\n";
965 :     #return $self->{'excel_file_link'};
966 :     return $excelfilelink;
967 :     }
968 :    
969 :    
970 :    
971 : parrello 1.43 =head3 make_excel_workbook
972 : overbeek 1.32
973 :     This is the method that actually makes individual workbook. You should call this once, with the name of the file that you want it to be known by. The options are to set borders and whatnot.
974 :    
975 : overbeek 1.38 This will return the link to the workbook
976 :    
977 : overbeek 1.32 =cut
978 :    
979 :     sub make_excel_workbook {
980 :     my($self, $filename, $options)=@_;
981 : overbeek 1.40 return "" unless ($self->{'useexcel'});
982 :     #return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
983 : overbeek 1.32
984 : overbeek 1.31 $filename =~ s/^.*\///; # remove any path information. We are going to only write to FIG_Config::temp
985 :     unless ($filename =~ /\.xls$/) {$filename .=".xls"}
986 : overbeek 1.37
987 : overbeek 1.38 # now generate the link to return
988 :     my $link="<p><a href=\"".$fig->temp_url."/".$filename.'">'.$filename."</a> [Download table in Excel format].</p>\n";
989 : overbeek 1.37 # do we already have this file -- if so, just return that info
990 : overbeek 1.38 return $link if ($self->{'excel_short_filename'} eq $filename); # don't do anything, just return the fact that we have the book made!
991 : overbeek 1.37
992 :    
993 : overbeek 1.32 $self->{'excel_short_filename'}=$filename;
994 :     $self->{'excel_filename'}=$FIG_Config::temp."/$filename";
995 : overbeek 1.31
996 : overbeek 1.37
997 : overbeek 1.31 # Each excel file consists of the file, and then of worksheets from within the file. These are the tabs at the bottom of the screen
998 :     # that can be added with "Insert->new worksheet" from the menus.
999 :     # Create a new workbook called simple.xls and add a worksheet
1000 : overbeek 1.32
1001 :     # instantiate the workbook
1002 :     $self->{'excel_workbook'}=Spreadsheet::WriteExcel->new($self->{'excel_filename'});
1003 : overbeek 1.38 $excelfile=$self->{'excel_workbook'}; # this is for the close on END
1004 : overbeek 1.32 $self->{'excel_workbook'}->set_tempdir($FIG_Config::temp); # you don't have to do this, but it may speed things up and reduce memory load.
1005 :    
1006 :     # define the default formats
1007 :     my $border = defined $options->{border} ? $options->{border} : 0;
1008 :     $self->{'excel_format'}->{default}=$self->{'excel_workbook'}->add_format(border=>$border, size=>10);
1009 : overbeek 1.38 return $link;
1010 : overbeek 1.32 }
1011 :    
1012 :    
1013 : parrello 1.43 =head3 make_excel_worksheet()
1014 : overbeek 1.32
1015 :     This is the method that makes the separate sheets in the file. You can add as many of these as you want.
1016 :    
1017 :     =cut
1018 :    
1019 :     sub make_excel_worksheet {
1020 :     my($self, $col_hdrs, $tab, $title)=@_;
1021 : overbeek 1.40 #return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
1022 :     return "" unless ($self->{'useexcel'});
1023 :    
1024 : overbeek 1.32 unless (defined $self->{'excel_workbook'})
1025 :     {
1026 :     print STDERR "The workbook was not defined. Couldn't fill it in\n";
1027 :     return;
1028 :     }
1029 :    
1030 : overbeek 1.37 if (length($title) > 31) {$title=substr($title, 0, 31)}
1031 : overbeek 1.32 my $worksheet = $self->{'excel_workbook'}->add_worksheet($title);
1032 : overbeek 1.31 # The general syntax for output to an excel file is write($row, $column, $value, $format). Note that row and
1033 :     # column are zero indexed
1034 :    
1035 :     # write the column headers
1036 :     # define a new format that is bold
1037 : overbeek 1.32 $self->{'excel_format'}->{header} = $self->{'excel_workbook'}->add_format();
1038 :     $self->{'excel_format'}->{header}->copy($self->{'excel_format'}->{default});
1039 :     $self->{'excel_format'}->{header}->set_bold();
1040 : overbeek 1.31
1041 :     for my $i (0 .. $#$col_hdrs)
1042 :     {
1043 : overbeek 1.39 my $cell;
1044 :     my ($useformat, $rowspan, $colspan);
1045 :     if (ref($col_hdrs->[$i]) eq "ARRAY") {($cell, $useformat, $rowspan, $colspan)=$self->parse_cell($col_hdrs->[$i])}
1046 :     else {$cell=$col_hdrs->[$i]}
1047 : redwards 1.42 $cell=$self->clean_excel_cell($cell);
1048 : overbeek 1.32 $worksheet->write(0, $i, $cell, $self->{'excel_format'}->{header});
1049 : overbeek 1.31 }
1050 :    
1051 :     # now loop through the table and write them out. Remember to break on array refs
1052 : overbeek 1.39 # we are going to have to build the table col by col so we get the breaks in the right place
1053 :     # for merged cells
1054 : overbeek 1.31 my $row_idx=1;
1055 : overbeek 1.39 my $maxrow=$#$tab;
1056 :     my $skip;
1057 :     while ($row_idx <= $maxrow+1)
1058 :     {
1059 :     my @row=@{$tab->[$row_idx-1]};
1060 :     my $col_idx=0;
1061 :     foreach my $cell (@row)
1062 : overbeek 1.31 {
1063 : overbeek 1.39 while ($skip->{$row_idx}->{$col_idx}) {$col_idx++}
1064 : overbeek 1.32 my $useformat=$self->{'excel_format'}->{default};
1065 : overbeek 1.36
1066 :     # there is an approach to setting color using \@bgcolor. Oh well.
1067 :     if ( $cell =~ /^\@([^:]+)\:(.*)$/ )
1068 :     {
1069 :     $cell=[$2, $1];
1070 :     }
1071 :    
1072 : overbeek 1.39 my ($rowspan, $colspan);
1073 : overbeek 1.31 if (ref($cell) eq "ARRAY")
1074 :     {
1075 : overbeek 1.39 ($cell, $useformat, $rowspan, $colspan)=$self->parse_cell($cell);
1076 : overbeek 1.31 }
1077 :    
1078 :     $cell=$self->clean_excel_cell($cell);
1079 : overbeek 1.39
1080 :     if ($rowspan > 1 || $colspan > 1)
1081 :     {
1082 :     # handle merged cells separately
1083 :     my $row_idx_to=$row_idx+$rowspan-1;
1084 :     my $col_idx_to=$col_idx+$colspan-1;
1085 :     # we want to not put anything in the merged cells
1086 :     for (my $x=$row_idx; $x<=$row_idx_to; $x++) {$skip->{$x}->{$col_idx}=1}
1087 :     for (my $y=$col_idx; $y<=$col_idx_to; $y++) {$skip->{$row_idx}->{$y}=1}
1088 :    
1089 :     if (ref($cell) eq "ARRAY") {$worksheet->merge_range($row_idx, $col_idx, $row_idx_to, $col_idx_to, @$cell, $useformat)}
1090 :     else {$worksheet->merge_range($row_idx, $col_idx, $row_idx_to, $col_idx_to, $cell, $useformat)}
1091 :     }
1092 :     else
1093 :     {
1094 :     # this is a botch, but in some circumstances we need to split the cell out. e.g. if it is a URL
1095 :     # in this case we have a reference to an array, and we'll use a slight modification on the process
1096 :     if ((ref($cell) eq "ARRAY" && $cell->[0] eq " &nbsp; ") || $cell eq " &nbsp; ") {$worksheet->write_blank($row_idx, $col_idx, $useformat)}
1097 :     else
1098 :     {
1099 :     if (ref($cell) eq "ARRAY") {$worksheet->write($row_idx, $col_idx, @$cell, $useformat)}
1100 :     else {$worksheet->write($row_idx, $col_idx, $cell, $useformat)}
1101 :     }
1102 :     }
1103 :    
1104 : overbeek 1.31 # increment to the next column
1105 :     $col_idx++;
1106 :     }
1107 :     # new line, and start of line
1108 :     $row_idx++;
1109 :     $col_idx=0;
1110 : overbeek 1.32 }
1111 : overbeek 1.39 }
1112 : overbeek 1.32
1113 :    
1114 :    
1115 :    
1116 : parrello 1.43 =head3 close_excel_file()
1117 : overbeek 1.32
1118 :     We must explicitly close the file before creating the link so that the file is written. This is also what returns the link
1119 :    
1120 :     =cut
1121 :    
1122 :     sub close_excel_file{
1123 : overbeek 1.38 my ($workbook)=@_;
1124 :     return unless (defined $workbook);
1125 : overbeek 1.32 # close the workbook. this writes the files
1126 : overbeek 1.38 return $workbook->close();
1127 : overbeek 1.31 }
1128 :    
1129 :    
1130 :    
1131 :    
1132 : overbeek 1.30
1133 :    
1134 : redwards 1.27
1135 : overbeek 1.31
1136 :    
1137 : parrello 1.43 =head3 parse_cell()
1138 : overbeek 1.31
1139 :     A method to take the cell from the table where there is some formatting information and figure out what we know. Return the data and the format.
1140 :    
1141 :     Requires the cell and the current $format.
1142 :    
1143 :     When applied to <td> the default formats that we'll deal with at the moment are
1144 :     align=
1145 :     background-color=
1146 :     color=
1147 :     bgcolor=
1148 :    
1149 :     Colors are funky in excel because it only has a limited palette. We rename colors as needed, and then save those so that we can use them again. We're only allowed 55 colors in excel (numbered 8..63). Because its a little stupid to mess with black and white and so on, I ignore those, and also start renumbering at color number 20, giving us 43 different colors.
1150 :    
1151 : overbeek 1.32 The reference to the hash excel_color has the custom excel colors stored in it for a few colors, and others are added to it.
1152 : overbeek 1.31
1153 :     =cut
1154 :    
1155 :     sub parse_cell {
1156 : overbeek 1.32 my ($self, $arr)=@_;
1157 :     return ($arr, $self->{'excel_format'}->{default}) unless (ref($arr) eq "ARRAY");
1158 : overbeek 1.31 my ($cell, $tag)=@$arr;
1159 :     $tag =~ s/\'/"/g; # this just makes it easier to parse the things like align='center' and align="center" that are both valid
1160 :    
1161 :     # we are going to define a series of formats that we can apply, this will have a key that is
1162 : overbeek 1.32 # th.center.bgcolor.fgcolor. Then if we already have that, we can use it, if not, we'll define it
1163 : overbeek 1.31
1164 : overbeek 1.39 my ($th, $center, $bgcolor, $fgcolor)=(0,0,0,0);
1165 : overbeek 1.31
1166 : overbeek 1.32 if ($tag =~ /^th/) {$th=1} # it is a header cell so we should make it bold
1167 : overbeek 1.31 if ($tag =~ /align\=\"(.*?)\"/i) {$center=$1}
1168 : overbeek 1.32
1169 :     # get rid of white tags because I don't care about them
1170 :     $tag =~ s/color\=.\#FFFFFF/ /ig;
1171 :    
1172 : overbeek 1.31 if ($tag =~ /background-color\=\"(.*?)\"/i || $tag =~ /bgcolor\=\"(.*?)\"/i)
1173 :     {
1174 :     my $color=$1;
1175 : overbeek 1.38 if ($color)
1176 : overbeek 1.31 {
1177 : overbeek 1.38 if (!defined $self->{'excel_color'}->{$color})
1178 :     {
1179 :     # find out the last custom color used and increment it
1180 :     my $max=19; # we are not going to use a color less than 20
1181 :     foreach my $k (keys %{$self->{'excel_color'}}) {($self->{'excel_color'}->{$k} > $max) ? ($max=$self->{'excel_color'}->{$k}) :1}
1182 :     $max++;
1183 :     $self->{'excel_color'}->{$color}=$self->{'excel_workbook'}->set_custom_color($max, $color);
1184 :     }
1185 :     $bgcolor=$self->{'excel_color'}->{$color};
1186 : overbeek 1.31 }
1187 :     }
1188 : overbeek 1.32 elsif ($tag =~ /color\=\"(.*?)\"/i || $tag =~ /color\=\'(.*?)\'/i)
1189 : overbeek 1.31 {
1190 :     my $color=$1;
1191 : overbeek 1.32 if (!defined $self->{'excel_color'}->{$color})
1192 : overbeek 1.31 {
1193 :     # find out the last custom color used and increment it
1194 :     my $max=19; # we are not going to use a color less than 20
1195 : overbeek 1.33 foreach my $k (keys %{$self->{'excel_color'}}) {($self->{'excel_color'}->{$k} > $max) ? ($max=$self->{'excel_color'}->{$k}) :1}
1196 : overbeek 1.31 $max++;
1197 : overbeek 1.32 $self->{'excel_color'}->{$color}=$self->{'excel_workbook'}->set_custom_color($max, $color);
1198 : overbeek 1.31 }
1199 : overbeek 1.32 $fgcolor=$self->{'excel_color'}->{$color};
1200 : overbeek 1.31 }
1201 :    
1202 : overbeek 1.39 # check and see if we span multiple rows or columns
1203 :     my ($rowspan, $colspan)=(1,1);
1204 :     if ($tag =~ /rowspan\=[\'\"]?(\d+)/) {$rowspan=$1} # these should match rowspan="4", rowspan='4', and rowspan=4
1205 :     if ($tag =~ /colspan\=[\'\"]?(\d+)/) {$colspan=$1}
1206 :    
1207 :     my $formatid=$th.$center.$bgcolor.$fgcolor.$rowspan.$colspan;
1208 :     if (!defined $self->{'excel_format'}->{$formatid})
1209 :     {
1210 :     $self->{'excel_format'}->{$formatid}=$self->{'excel_workbook'}->add_format();
1211 :     if ($rowspan > 1) {$self->{'excel_format'}->{$formatid}->set_align("vcenter")}
1212 :     else
1213 :     {
1214 :     if ($th) {$self->{'excel_format'}->{$formatid}->copy($self->{'excel_format'}->{header})}
1215 :     else {$self->{'excel_format'}->{$formatid}->copy($self->{'excel_format'}->{default})}
1216 :     }
1217 :     $center && $self->{'excel_format'}->{$formatid}->set_align($center);
1218 :     $bgcolor && $self->{'excel_format'}->{$formatid}->set_bg_color($bgcolor);
1219 :     $fgcolor && $self->{'excel_format'}->{$formatid}->set_color($fgcolor);
1220 : overbeek 1.31 }
1221 : overbeek 1.39
1222 :     return ($cell, $self->{'excel_format'}->{$formatid}, $rowspan, $colspan);
1223 : overbeek 1.31 }
1224 :    
1225 :    
1226 : parrello 1.43 =head3 clean_excel_cell
1227 : overbeek 1.31
1228 :     Process the cells to remove &nbsp; and also convert relative URLs to full URLs
1229 :    
1230 :     =cut
1231 :    
1232 :     sub clean_excel_cell {
1233 :     my ($self, $cell)=@_;
1234 :     if ($cell =~ /^\s*\&nbsp\;\s*$/) {$cell=undef} # ignore white space
1235 : overbeek 1.32
1236 :     # some cells have something like this:
1237 :     # <a onMouseover="javascript:if(!this.tooltip) this.tooltip=new Popup_Tooltip(this,'Role of BCAT','Branched-chain amino acid aminotransferase (EC 2.6.1.42)','','','','');this.tooltip.addHandler(); return false;" >BCAT</a>
1238 :     # we don't want those, but we do want the ones that have a real url hidden here.
1239 :     # so remove the mouseover part, and then see what is left
1240 :     if ($cell =~ s/onMouseover\=\".*?\"//)
1241 :     {
1242 :     if ($cell =~ s/\<a\s+>//i) {$cell =~ s/\<\/a>//i}
1243 :     }
1244 :    
1245 : overbeek 1.31 if ($cell =~ /\<a href=.(.*?).>(.*)<\/a>/)
1246 :     {
1247 :     # this is tricky because if the cell is a url then we need two separate things, the url and the link name
1248 :     my ($url, $link)=($1, $2);
1249 : overbeek 1.32 $url =~ s/^\.{1,2}\///; # remove notation of ./ and ../
1250 : overbeek 1.31 unless ($url =~ /^http/) {$url=$FIG_Config::cgi_url."/$url"}
1251 : overbeek 1.32 # this sucks as excel can only handle one link per cell, so we remove the other links. At the moment users will have to deal with that.
1252 :     $link =~ s/\<.*?\>//g;
1253 : overbeek 1.31 $cell=[$url, $link];
1254 :     }
1255 : overbeek 1.32 elsif ($cell =~ /<input type/)
1256 :     {
1257 :     if ($cell =~ /value='(.*?)'/) {$cell = $1}
1258 :     elsif ($cell =~ /value="(.*?)"/) {$cell = $1}
1259 :     }
1260 :     else
1261 :     {
1262 :     # this is all the html that I don't know what to do with, like <input type=...>
1263 :     $cell =~ s/\<.*?\>//g;
1264 :     }
1265 : overbeek 1.31 return $cell;
1266 :     }
1267 :    
1268 : redwards 1.44 =head1 rand
1269 : overbeek 1.31
1270 : redwards 1.44 Randomize an array using the fisher-yates shuffle described in the perl cookbook.
1271 :    
1272 :     =cut
1273 :    
1274 :     sub rand {
1275 :     my ($self, $array) = @_;
1276 :     my $i;
1277 :     for ($i = @$array; --$i; ) {
1278 :     my $j = int rand ($i+1);
1279 :     next if $i == $j;
1280 :     @$array[$i,$j] = @$array[$j,$i];
1281 :     }
1282 :     return $array;
1283 :     }
1284 : overbeek 1.31
1285 :    
1286 : redwards 1.1 1;
1287 : redwards 1.17

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3