[Bio] / FigKernelPackages / raelib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/raelib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.7, Sun Apr 3 02:15:15 2005 UTC revision 1.32, Sun May 7 20:10:07 2006 UTC
# Line 1  Line 1 
1    #
2    # Copyright (c) 2003-2006 University of Chicago and Fellowship
3    # for Interpretations of Genomes. All Rights Reserved.
4    #
5    # This file is part of the SEED Toolkit.
6    #
7    # The SEED Toolkit is free software. You can redistribute
8    # it and/or modify it under the terms of the SEED Toolkit
9    # Public License.
10    #
11    # You should have received a copy of the SEED Toolkit Public License
12    # along with this program; if not write to the University of Chicago
13    # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14    # Genomes at veronika@thefig.info or download a copy from
15    # http://www.theseed.org/LICENSE.TXT.
16    #
17    
18  # -*- perl -*-  # -*- perl -*-
19    
20  =pod  =pod
21    
22  =head1  =head1 RAE Library
23    
24   Some routines and things that Rob uses. Please feel free to use at will and incorporate into   Some routines and things that Rob uses. Please feel free to use at will and incorporate into
25   your own code or move them into FIG.pm or elsewhere.   your own code or move them into FIG.pm or elsewhere.
26    
27     For questions about this email RobE@theFIG.info
28    
29  =cut  =cut
30    
31  package raelib;  package raelib;
32  use strict;  use strict;
33    use Bio::SeqIO;
34    use Bio::Seq;
35    use Bio::SeqFeature::Generic;
36    
37    # we don't know whether the Spreadsheet::WriteExcel methods are available on all systems, and even on the CI systems they are currently in my shared directory
38    # so we use an eval and set the boolean if we are cool.
39    my $useexcel;
40    BEGIN {
41        use lib '/home/seed/Rob/perl/lib/perl5/site_perl/5.8.5/';
42        eval "use Spreadsheet::WriteExcel";
43        unless ($@) {$useexcel=1}
44    }
45    
46    
47  use FIG;  use FIG;
48  my $fig=new FIG;  my $fig=new FIG;
49    
# Line 21  Line 54 
54  =cut  =cut
55    
56  sub new {  sub new {
57   my $self=shift;   my ($class)=@_;
58   return $self   my $self={};
59     $self->{'useexcel'}=1 if ($useexcel);
60     return bless $self, $class;
61  }  }
62    
63    
# Line 70  Line 105 
105    
106  =head2 pirsfcorrespondence  =head2 pirsfcorrespondence
107    
108   Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written.  Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written. This is so that we can easily go back and forth.
  This is so that we can easily go back and forth.  
109    
110   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat
111    
112    This method takes three arguments:
113       from    : pirsfinfo.dat file
114       to      : file to write information to
115       verbose : report on progress
116    
117    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
118    
119    Returns the number of lines in the pirsinfo file that were read.
120    
121  =cut  =cut
122    
123  sub pirsfcorrespondence {  sub pirsfcorrespondence {
124   my ($self, $from, $to)=@_;   my ($self, $from, $to, $verbose)=@_;
125   die "File $from does not exist as called in $0" unless (-e $from);   unless (-e $from) {
126      print STDERR "File $from does not exist as called in $0\n";
127      return 0;
128     }
129     if ($from =~ /\.gz$/) {
130      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
131     }
132     else {
133   open (IN, $from) || die "Can't open $from";   open (IN, $from) || die "Can't open $from";
134   open (OUT, ">$to") || die "Can't write tot $to";   }
135     open (OUT, ">$to") || die "Can't write to $to";
136     my $linecount;
137   while (<IN>) {   while (<IN>) {
138      $linecount++;
139      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
140    if (/^>/) {print OUT; next}    if (/^>/) {print OUT; next}
141    chomp;    chomp;
142    my $done;    foreach my $peg ($self->swiss_pir_ids($_)) {
   foreach my $peg ($fig->by_alias("uni|$_")) {  
143     print OUT $_, "\t", $peg, "\n";     print OUT $_, "\t", $peg, "\n";
    $done=1;  
144    }    }
   unless ($done) {print OUT $_, "\t\n"}  
145   }   }
146   close IN;   close IN;
147   close OUT;   close OUT;
148     return $linecount;
149    }
150    
151    =head2 uniprotcorrespondence
152    
153    Generate a correspondence table between uniprot knowledge base IDs and FIG ID's.
154    
155    The uniprot KB file is in the form:  UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name
156    
157     This method takes three arguments:
158       from    : uniprotKB file
159       to      : file to write information to
160       verbose : report on progress
161    
162    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
163    
164     Returns the number of lines in the uniprotkb file that were read.
165    
166    =cut
167    
168    sub uniprotcorrespondence {
169     my ($self, $from, $to, $verbose)=@_;
170     unless (-e $from) {
171      print STDERR "File $from does not exist as called in $0\n";
172      return 0;
173     }
174     if ($from =~ /\.gz$/) {
175      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
176     }
177     else {
178      open (IN, $from) || die "Can't open $from";
179     }
180     open (OUT, ">$to") || die "Can't write to $to";
181     my $linecount;
182     while (<IN>) {
183      chomp;
184      $linecount++;
185      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
186      my @line=split /\s+\|\s+/;
187      my $added;
188      foreach my $peg ($self->swiss_pir_ids($line[0])) {
189       print OUT "$_ | $peg\n";
190       $added=1;
191      }
192      unless ($added) {print OUT "$_\n"}
193     }
194     close IN;
195     close OUT;
196     return $linecount;
197  }  }
198    
199    =head2 prositecorrespondence
200    
201    Generate a correspondence table between prosite and seed using sp id's and seed ids.
202    
203    The SwissProt prosite file is from ftp://ca.expasy.org/databases/prosite/release_with_updates/prosite.dat and is in horrible swiss prot format, so we'll parse out those things that we need and put them in the file
204    
205    The output file will have the following columns:
206    
207    prosite family accession number, prosite family name, family type, swiss-prot protein id, fig protein id.
208    
209    The family type is one of rule, pattern, or matrix. Right now (Prosite Release 19.2 of 24-May-2005) there are 4 rules, 1322 patterns, and 521 matrices.
210    
211     This method takes three arguments:
212       from    : prosite file
213       to      : file to write information to
214       verbose : report on progress
215    
216    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
217    
218     Returns the number of lines in the prosite file that were read.
219    
220    =cut
221    
222    sub prositecorrespondence {
223     my ($self, $from, $to, $verbose)=@_;
224     unless (-e $from) {
225      print STDERR "File $from does not exist as called in $0\n";
226      return 0;
227     }
228     if ($from =~ /\.gz$/) {
229      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
230     }
231     else {
232      open (IN, $from) || die "Can't open $from";
233     }
234     open (OUT, ">$to") || die "Can't write to $to";
235     my $linecount;
236     my ($famac, $famname, $famtype)=('','','');
237     while (<IN>) {
238      chomp;
239      $linecount++;
240      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
241      if (m#//#) {($famac, $famname, $famtype)=('','',''); next}
242      elsif (m/^ID\s*(.*?);\s*(\S+)/) {($famname, $famtype)=($1, $2); next}
243      elsif (m/^AC\s*(\S+)/) {$famac=$1; $famac =~ s/\;\s*$//; next}
244      next unless (m/^DR/); # ignore all the other crap in the prosite file for now. Note we might, at some point, want to grab all that, but that is for another time.
245      #
246      # this is the format of the DR lines:
247      # DR   P11460, FATB_VIBAN , T; P40409, FEUA_BACSU , T; P37580, FHUD_BACSU , T;
248      s/^DR\s*//;
249      foreach my $piece (split /\s*\;\s*/, $_) {
250       my ($acc, $nam, $unk)=split /\s*\,\s*/, $piece;
251       foreach my $fig ($self->swiss_pir_ids($acc)) {
252        print OUT join "\t", ($famac, $famname, $famtype, $acc, $fig), "\n";
253       }
254      }
255     }
256    }
257    
258    =head2 swiss_pir_ids()
259    
260    SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's, and return an array of FIG id's that match the ID.
261    
262    =cut
263    
264    sub swiss_pir_ids {
265     my ($self, $id)=@_;
266     return () unless ($id);
267     $id =~ s/^\s+//; $id =~ s/\s+$//; # trim off the whitespace
268    
269     my @return=($fig->by_alias("uni|$id"));
270     return @return if ($return[0]);
271    
272     @return=($fig->by_alias("tr|$id"));
273     return @return if ($return[0]);
274    
275     @return=($fig->by_alias("sp|$id"));
276     return @return if ($return[0]);
277    
278     return ();
279    }
280    
281  =head2 ss_by_id  =head2 ss_by_id
282    
# Line 173  Line 354 
354   my @return;   my @return;
355   my @attr=$fig->feature_attributes($peg);   my @attr=$fig->feature_attributes($peg);
356   foreach my $attr (@attr) {   foreach my $attr (@attr) {
357    my ($gottag, $val, $link)=@$attr;    my ($gotpeg, $gottag, $val, $link)=@$attr;
358    push @return, $val if ($gottag eq $tag);    push @return, $val if ($gottag eq $tag);
359   }   }
360   return wantarray ? @return : join "; ", @return;   return wantarray ? @return : join "; ", @return;
# Line 230  Line 411 
411    
412  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.
413    
414  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple);  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple, $default);
415    
416  multiple selections will only be set if $multiple is true  multiple selections will only be set if $multiple is true
417    
418    default will set a default to override (maybe) korgs
419    
420  =cut  =cut
421    
422  sub scrolling_org_list {  sub scrolling_org_list {
423   my ($self, $cgi, $multiple)=@_;   my ($self, $cgi, $multiple, $default)=@_;
424   unless ($multiple) {$multiple=0}   unless ($multiple) {$multiple=0}
425    
426   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );
# Line 287  Line 470 
470    
471   foreach (@$orgs) {   foreach (@$orgs) {
472     my $gs = $fig->genus_species($_);     my $gs = $fig->genus_species($_);
473     my $gc=scalar $fig->all_contigs($_);     my $gc=$fig->number_of_contigs($_);
474     $label->{$_} = "$gs ($_) [$gc contigs]";     $label->{$_} = "$gs ($_) [$gc contigs]";
475    }    }
476    
# Line 303  Line 486 
486                                          -labels   => $label,                                          -labels   => $label,
487                                          -size     => 10,                                          -size     => 10,
488                                          -multiple => $multiple,                                          -multiple => $multiple,
489                                            -default  => $default,
490                                        ), $cgi->br,                                        ), $cgi->br,
491                    "$n_genomes genomes shown ",                    "$n_genomes genomes shown ",
492                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,
# Line 313  Line 497 
497                    "</TD>",                    "</TD>",
498                    "   </TR>\n",                    "   </TR>\n",
499                    "</TABLE>\n",                    "</TABLE>\n",
                   $cgi->hr  
500          );          );
501  }  }
502    
503    
504    =head2 scrolling_subsys_list
505    
506    Create a scrolling list of all subsystems. Just like scrolling_org_list, this will make the list and allow you to select multiples.
507    
508    use like this
509    
510    push @$html, $raelib->scrolling_subsys_list($cgi, $multiple);
511    
512    =cut
513    
514    sub scrolling_subsys_list {
515     my ($self, $cgi, $multiple)=@_;
516     $multiple=0 unless (defined $multiple);
517     my @ss=sort {uc($a) cmp uc($b)} $fig->all_subsystems();
518     my $label;
519     # generate labels for the list
520     foreach my $s (@ss) {my $k=$s; $k =~ s/\_/ /g; $k =~ s/  / /g; $k =~ s/\s+$//; $label->{$s}=$k}
521     return $cgi->scrolling_list(
522      -name    => 'subsystems',
523      -values  => \@ss,
524      -labels  => $label,
525      -size    => 10,
526      -multiple=> $multiple,
527     );
528    }
529    
530    =head2 subsys_names_for_display
531    
532    Return a list of subsystem names for display. This will take a list as an argument and return a nice clean list for display.
533    
534    $raelib->subsys_names_for_display(@ss);
535    or
536    $raelib->subsys_names_for_display($fig->all_subsystems());
537    
538    =cut
539    
540    sub subsys_names_for_display {
541     my ($self, @ss)=@_;
542     foreach (@ss) {s/\_/ /g; 1 while (s/  / /g); s/\s+$//}
543     return @ss;
544    }
545    
546    =head2 GenBank
547    
548     This object will take a genome number and return a Bio::Seq::RichSeq object that has the whole genome
549     in GenBank format. This should be a nice way of getting some data out, but will probably be quite slow
550     at building the object.
551    
552     Note that you need to call this with the genome name and the contig. This will then go through that contig.
553    
554     Something like this should work
555    
556     foreach my $contig ($fig->all_contigs($genome)) {
557      my $seqobj=FIGRob->GenBank($genome, $contig);
558      # process the contig
559     }
560    
561    =cut
562    
563    sub GenBank {
564     my ($self, $genome, $contig)=@_;
565     my $gs=$fig->genus_species($genome);
566     return unless ($gs);
567     unless ($contig) {
568      print STDERR "You didn't provide a contig for $gs. I think that was a mistake. Sorry\n";
569      return;
570     }
571     my $len=$fig->contig_ln($genome, $contig);
572     unless ($len) {
573      print STDERR "$contig from $gs doesn't appear to have a length. Is it right?\n";
574      return;
575     }
576    
577    
578     # first find all the pegs ...
579     my $features; # all the features in the genome
580     my $allpegs; # all the pegs
581     my $translation; # all the protein sequences
582     foreach my $peg ($fig->pegs_of($genome)) {
583      my @location=$fig->feature_location($peg);
584      my $func=$fig->function_of($peg);
585      foreach my $loc (@location) {
586       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
587       my ($cg, $start, $stop)=($1, $2, $3);
588       next unless ($cg eq $contig);
589       # save this information for later
590       $features->{'peg'}->{$loc}=$func;
591       $allpegs->{'peg'}->{$loc}=$peg;
592       $translation->{'peg'}->{$loc}=$fig->get_translation($peg);
593      }
594     }
595     # ... and all the RNAs
596     foreach my $peg ($fig->rnas_of($genome)) {
597      my @location=$fig->feature_location($peg);
598      my $func=$fig->function_of($peg);
599      foreach my $loc (@location) {
600       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
601       my ($cg, $start, $stop)=($1, $2, $3);
602       next unless ($cg eq $contig);
603       # save this information for later
604       $features->{'rna'}->{$loc}=$func;
605       $allpegs->{'rna'}->{$loc}=$peg;
606      }
607     }
608    
609    
610     # now get all the contigs out
611     my $seq=$fig->dna_seq($genome, $contig."_1_".$len);
612     my $description = "Contig $contig from " . $fig->genus_species($genome);
613     my $sobj=Bio::Seq->new(
614              -seq              =>  $seq,
615              -id               =>  $contig,
616              -desc             =>  $description,
617              -accession_number =>  $genome
618              );
619     foreach my $prot (keys %{$features->{'peg'}}) {
620       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
621       my ($cg, $start, $stop)=($1, $2, $3);
622       my $strand=1;
623       if ($stop < $start) {
624        ($stop, $start)=($start, $stop);
625        $strand=-1;
626     }
627    
628     my $feat=Bio::SeqFeature::Generic->new(
629            -start         =>  $start,
630            -end           =>  $stop,
631            -strand        =>  $strand,
632            -primary       =>  'CDS',
633            -display_name  =>  $allpegs->{'peg'}->{$prot},
634            -source_tag    =>  'the SEED',
635            -tag           =>
636                           {
637                           db_xref     =>   $allpegs->{'peg'}->{$prot},
638                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
639                           function    =>  $features->{'peg'}->{$prot},
640                           translation =>  $translation->{'peg'}->{$prot}
641                          }
642           );
643    
644       $sobj->add_SeqFeature($feat);
645     }
646    
647     foreach my $prot (keys %{$features->{'rna'}}) {
648       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
649       my ($cg, $start, $stop)=($1, $2, $3);
650       my $strand=1;
651       if ($stop < $start) {
652        ($stop, $start)=($start, $stop);
653        $strand=-1;
654       }
655    
656       my $feat=Bio::SeqFeature::Generic->new(
657            -start         =>  $start,
658            -end           =>  $stop,
659            -strand        =>  $strand,
660            -primary       =>  'RNA',
661            -source_tag    =>  'the SEED',
662            -display_name  =>  $allpegs->{'rna'}->{$prot},
663            -tag           =>
664                          {
665                           db_xref     =>   $allpegs->{'rna'}->{$prot},
666                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
667                           function    =>  $features->{'rna'}->{$prot},
668                          }
669           );
670    
671      $sobj->add_SeqFeature($feat);
672     }
673     return $sobj;
674    }
675    
676    =head2 best_hit
677    
678     Returns the FIG id of the single best hit to a peg
679    
680     eg
681    
682     my $bh=$fr->best_hit($peg);
683     print 'function is ', scalar $fig->function_of($bh);
684    
685    =cut
686    
687    sub best_hit {
688     my ($self, $peg)=@_;
689     return unless ($peg);
690    
691     my ($maxN, $maxP)=(1, 1e-5);
692     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
693     return ${$sims[0]}[1];
694    }
695    
696    
697    =head1 read_fasta
698    
699    Read a fasta format file and return a reference to a hash with the data. The key is the ID and the value is the sequence. If you supply the optional keep comments then the comments (anything after the first white space are returned as a sepaarte hash).
700    
701    Usage:
702    my $fasta=$raelib->read_fasta($file);
703    my ($fasta, $comments)=$raelib->read_fasta($file, 1);
704    
705    =cut
706    
707    sub read_fasta {
708     my ($self, $file, $keepcomments)=@_;
709     open (IN, $file) || die "Can't open $file";
710     my %f; my $t; my $s; my %c;
711     while (<IN>) {
712      chomp;
713      if (/^>/) {
714       if ($s) {
715        $f{$t}=$s;
716        undef $s;
717       }
718       s/^>(\S+)\s*//;
719       $t=$1;
720       $c{$t}=$_ if ($_);
721      }
722      else {$s .= $_}
723     }
724     $f{$t}=$s;
725     if ($keepcomments) {return (\%f, \%c)}
726     else {return \%f}
727    }
728    
729    =head1 rc
730    
731    Reverse complement. It's too easy.
732    
733    =cut
734    
735    sub rc {
736     my ($self, $seq)=@_;
737     $seq=~tr/GATCgatc/CTAGctag/;
738     $seq = reverse $seq;
739     return $seq;
740    }
741    
742    
743    =head2 cookies
744    
745    Handle cookies. This method will get and set the value of the FIG cookie. Cookies are name/value pairs that are stored on the users computer. We then retrieve them using this method. The cookies are passed in as a reference to a hash, and the method returns a tuple of the cookie that can be passed to the browser and a reference to a hash with the data.
746    
747    If you do not pass any arguments the whole cookie will be returned.
748    
749    Use as follows:
750    
751    ($cookie, $data) = raelib->cookie($cgi, \%data);
752    
753    You do not need to pass in any data, in that case you will just get the cookie back
754    
755    Underneath, I create a single cookie called FIG which stores all the information. The names and value pairs are stored using = to join name to value and ; to concatenate. This way we can create a single cookie with all the data. I am using the FIG::clean_attribute_key method to remove unwanted characters from the name/value pairs, so don't use them.
756    
757    Note that for the moment I have put this routine here since it needs to maintain the state of the cookie (i.e. it needs to know what $self is). It should really be in HTML.pm but that is not, as far as I can tell, maintaining states?
758    
759    =cut
760    
761    sub cookie {
762     my ($self, $cgi, $input)=@_;
763     return unless ($cgi);
764     $self->{'cookie'}=$cgi->cookie(-name=>"FIG") unless ($self->{'cookie'});
765    
766     # first, create a hash from the existing cookie data
767     my $cookie;
768     map {
769      my ($kname, $kvalue)=split /\=/, $_;
770      $cookie->{$kname}=$kvalue;
771     } split /\;/, $self->{'cookie'};
772    
773     if ($input)
774     {
775      # add the values that were passed in
776      map {$cookie->{FIG->clean_attribute_key($_)}=$input->{$_}} keys %$input;
777      # put everything back together and set the cookie
778      my $newcookie=join ";", map {$_ . "=" . $cookie->{$_}} keys %$cookie;
779      $self->{'cookie'}=$cgi->cookie(-name=>"FIG", -value=>$newcookie, -expires=>'+1y');
780     }
781    
782     return ($self->{'cookie'}, $cookie);
783    }
784    
785    
786    
787    =head1 commify
788    
789    Put commas in numbers. I think this comes straight from the perl cookbook and is very useful for nice displays
790    
791    =cut
792    
793    sub commify {
794        my($self,$n) = @_;
795        my(@n) = ();
796        my($i);
797    
798        for ($i = (length($n) - 3); ($i > 0); $i -= 3)
799        {
800            unshift(@n,",",substr($n,$i,3));
801        }
802        unshift(@n,substr($n,0,$i+3));
803        return join("",@n);
804    }
805    
806    
807    =head1 tab2excel
808    
809    This is experimental as of May, 2006.
810    
811    There are a couple of perl modules that allow you to write to excel files, and so I am trying out the idea of taking our standard $tab table respresentation that is used in HTML.pm and making an excel file that people could download. It seems like that would be a great tool for them to have.
812    
813    At the moment the excel modules are in my shared space on the CI machines, and so won't work in every seed installation. Therefore the $self->{'useexcel'} boolean is set at compile time if we successfully load the module.
814    
815    The issues are:
816        1. creating the excel file
817        2. reading through @$tab and presenting the data
818        3. Checking @$tab because each element can be a reference to an array with color or formatting information
819    
820    Formatting
821    
822    A separate set of formats must be created for each color and font combination since the formats are applied at the end of the processing of the file.
823    
824    
825    Usage:
826        Note the usage is ALMOST the same as make_table, but not quite. First, options is a reference to a hash rather than the hash itself
827        and second, the additional option "filename" that is the filename to be written;
828    
829        $url = $raelib->tab2excel($col_hdrs, $tab, $title, $options, "filename");
830    
831        The filename will be created in $FIG_Config::temp. The extension .xls will be added to the filename if it is not present.
832    
833    Returns:
834        A link to the file in the format
835            <p><a href="...">filename</a> [Download Excel file]</p>
836    
837    Note that there are four separate methods:
838        1. tab2excel is the method for a single call from HTML::make_table
839            this will make an excel file, fill it, and return the link;
840        2. make_excel_workbook is the method that instantiates a file
841        3. make_excel_worksheet is the method that actually populates the file
842            this loads all the data into the excel file, but if you know what you are doing you can call this many times,
843            each with a different spreadsheet
844        4. close_excel_file
845            this closes the file and writes it. It is what returns the link.
846    
847        tab2excel is a wrapper for all three so that the method in HTML::make_table is really easy.
848        See subsys.cgi for a more complex involvement of this!
849    
850    
851    =cut
852    
853    sub tab2excel {
854        my($self, $col_hdrs, $tab, $title, $options, $filename)=@_;
855        return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
856        $self->make_excel_workbook($filename, $options);
857        $self->make_excel_worksheet($col_hdrs, $tab, $title);
858        return $self->close_excel_file();
859    }
860    
861    =head1 make_excel_workbook
862    
863    This is the method that actually makes individual workbook. You should call this once, with the name of the file that you want it to be known by. The options are to set borders and whatnot.
864    
865    =cut
866    
867    sub make_excel_workbook {
868        my($self, $filename, $options)=@_;
869        return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
870    
871        $filename =~ s/^.*\///; # remove any path information. We are going to only write to FIG_Config::temp
872        unless ($filename =~ /\.xls$/) {$filename .=".xls"}
873        $self->{'excel_short_filename'}=$filename;
874        $self->{'excel_filename'}=$FIG_Config::temp."/$filename";
875    
876        # Each excel file consists of the file, and then of worksheets from within the file. These are the tabs at the bottom of the screen
877        # that can be added with "Insert->new worksheet" from the menus.
878        # Create a new workbook called simple.xls and add a worksheet
879    
880        # instantiate the workbook
881        $self->{'excel_workbook'}=Spreadsheet::WriteExcel->new($self->{'excel_filename'});
882        $self->{'excel_workbook'}->set_tempdir($FIG_Config::temp); # you don't have to do this, but it may speed things up and reduce memory load.
883    
884        # define the default formats
885        my $border = defined $options->{border} ? $options->{border} : 0;
886        $self->{'excel_format'}->{default}=$self->{'excel_workbook'}->add_format(border=>$border, size=>10);
887    }
888    
889    
890    =head1 make_excel_worksheet()
891    
892    This is the method that makes the separate sheets in the file. You can add as many of these as you want.
893    
894    =cut
895    
896    sub make_excel_worksheet {
897        my($self, $col_hdrs, $tab, $title)=@_;
898        return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
899        unless (defined $self->{'excel_workbook'})
900        {
901            print STDERR "The workbook was not defined. Couldn't fill it in\n";
902            return;
903        }
904    
905        my $worksheet = $self->{'excel_workbook'}->add_worksheet($title);
906        # The general syntax for output to an excel file is write($row, $column, $value, $format). Note that row and
907        # column are zero indexed
908    
909        # write the column headers
910        # define a new format that is bold
911        $self->{'excel_format'}->{header} = $self->{'excel_workbook'}->add_format();
912        $self->{'excel_format'}->{header}->copy($self->{'excel_format'}->{default});
913        $self->{'excel_format'}->{header}->set_bold();
914    
915        for my $i (0 .. $#$col_hdrs)
916        {
917            my $cell=$self->clean_excel_cell($col_hdrs->[$i]);
918            $worksheet->write(0, $i, $cell, $self->{'excel_format'}->{header});
919        }
920    
921        # now loop through the table and write them out. Remember to break on array refs
922        my $row_idx=1;
923        my $col_idx=0;
924        foreach my $row (@$tab)
925        {
926            foreach my $cell (@$row)
927            {
928                my $useformat=$self->{'excel_format'}->{default};
929                if (ref($cell) eq "ARRAY")
930                {
931                    ($cell, $useformat)=$self->parse_cell($cell);
932                }
933    
934                $cell=$self->clean_excel_cell($cell);
935                # this is a botch, but in some circumstances we need to split the cell out. e.g. if it is a URL
936                # in this case we have a reference to an array, and we'll use  a slight modification on the process
937                if (ref($cell) eq "ARRAY") {$worksheet->write($row_idx, $col_idx, @$cell, $useformat)}
938                else {$worksheet->write($row_idx, $col_idx, $cell, $useformat)}
939    
940                # increment to the next column
941                $col_idx++;
942            }
943            # new line, and start of line
944            $row_idx++;
945            $col_idx=0;
946        }
947    }
948    
949    
950    
951    
952    =head1 close_excel_file()
953    
954    We must explicitly close the file before creating the link so that the file is written. This is also what returns the link
955    
956    =cut
957    
958    sub close_excel_file{
959        my ($self)=@_;
960    
961        # close the workbook. this writes the files
962        $self->{'excel_workbook'}->close();
963    
964        # now generate the link to return
965        my $size=(stat($self->{'excel_filename'}))[7];
966        $size=int($size/1000);
967        my $link="<p><a href=\"".$fig->temp_url."/".$self->{'excel_short_filename'}.'">'.
968                $self->{'excel_short_filename'}."</a> [Download table in Excel format. $size kb]</p>\n";
969        return $link;
970    
971    }
972    
973    
974    
975    
976    
977    
978    
979    
980    
981    =head2 parse_cell()
982    
983    A method to take the cell from the table where there is some formatting information and figure out what we know. Return the data and the format.
984    
985    Requires the cell and the current $format.
986    
987    When applied to <td> the default formats that we'll deal with at the moment are
988         align=
989         background-color=
990         color=
991         bgcolor=
992    
993    Colors are funky in excel because it only has a limited palette. We rename colors as needed, and then save those so that we can use them again. We're only allowed 55 colors in excel (numbered 8..63). Because its a little stupid to mess with black and white and so on, I ignore those, and also start renumbering at color number 20, giving us 43 different colors.
994    
995    The reference to the hash excel_color has the custom excel colors stored in it for a few colors, and others are added to it.
996    
997    =cut
998    
999    sub parse_cell {
1000        my ($self, $arr)=@_;
1001        return ($arr, $self->{'excel_format'}->{default}) unless (ref($arr) eq "ARRAY");
1002        my ($cell, $tag)=@$arr;
1003        $tag =~ s/\'/"/g; # this just makes it easier to parse the things like align='center' and align="center" that are both valid
1004    
1005        # we are going to define a series of formats that we can apply, this will have  a key that is
1006        # th.center.bgcolor.fgcolor. Then if we already have that, we can use it, if not, we'll define it
1007    
1008        my ($th, $center, $bgcolor, $fgcolor)=(undef, undef, undef, undef);
1009    
1010        if ($tag =~ /^th/) {$th=1} # it is a header cell so we should make it bold
1011        if ($tag =~ /align\=\"(.*?)\"/i) {$center=$1}
1012    
1013        # get rid of white tags because I don't care about them
1014        $tag =~ s/color\=.\#FFFFFF/ /ig;
1015    
1016        if ($tag =~ /background-color\=\"(.*?)\"/i || $tag =~ /bgcolor\=\"(.*?)\"/i)
1017        {
1018            my $color=$1;
1019            if (!defined $self->{'excel_color'}->{$color})
1020            {
1021                # find out the last custom color used and increment it
1022                my $max=19; # we are not going to use a color less than 20
1023                foreach my $k (keys %{$self->{'excel_color'}}) {($k > $max) ? ($max=$k) :1}
1024                $max++;
1025                $self->{'excel_color'}->{$color}=$self->{'excel_workbook'}->set_custom_color($max, $color);
1026            }
1027            $bgcolor=$self->{'excel_color'}->{$color};
1028        }
1029        elsif ($tag =~ /color\=\"(.*?)\"/i || $tag =~ /color\=\'(.*?)\'/i)
1030        {
1031            my $color=$1;
1032            if (!defined $self->{'excel_color'}->{$color})
1033            {
1034                # find out the last custom color used and increment it
1035                my $max=19; # we are not going to use a color less than 20
1036                foreach my $k (keys %{$self->{'excel_color'}}) {($k > $max) ? ($max=$k) :1}
1037                $max++;
1038                $self->{'excel_color'}->{$color}=$self->{'excel_workbook'}->set_custom_color($max, $color);
1039            }
1040            $fgcolor=$self->{'excel_color'}->{$color};
1041        }
1042    
1043        if (!defined $self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor})
1044        {
1045            $self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor}=$self->{'excel_workbook'}->add_format();
1046            if ($th) {$self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor}->copy($self->{'excel_format'}->{header})}
1047            else {$self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor}->copy($self->{'excel_format'}->{default})}
1048            $center && $self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor}->set_align($center);
1049            $bgcolor && $self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor}->set_bg_color($bgcolor);
1050            $fgcolor && $self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor}->set_color($fgcolor);
1051        }
1052    
1053        return ($cell, $self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor});
1054    }
1055    
1056    
1057    =head1 clean_excel_cell
1058    
1059    Process the cells to remove &nbsp; and also convert relative URLs to full URLs
1060    
1061    =cut
1062    
1063    sub clean_excel_cell {
1064        my ($self, $cell)=@_;
1065        if ($cell =~ /^\s*\&nbsp\;\s*$/) {$cell=undef} # ignore white space
1066    
1067        # some cells have something like this:
1068        # <a  onMouseover="javascript:if(!this.tooltip) this.tooltip=new Popup_Tooltip(this,'Role of BCAT','Branched-chain amino acid aminotransferase (EC 2.6.1.42)','','','','');this.tooltip.addHandler(); return false;" >BCAT</a>
1069        # we don't want those, but we do want the ones that have a real url hidden here.
1070        # so remove the mouseover part, and then see what is left
1071        if ($cell =~ s/onMouseover\=\".*?\"//)
1072        {
1073            if ($cell =~ s/\<a\s+>//i) {$cell =~ s/\<\/a>//i}
1074        }
1075    
1076    
1077        if ($cell =~ /\<a href=.(.*?).>(.*)<\/a>/)
1078        {
1079            # this is tricky because if the cell is a url then we need two separate things, the url and the link name
1080            my ($url, $link)=($1, $2);
1081            $url =~ s/^\.{1,2}\///; # remove notation of ./ and ../
1082            unless ($url =~ /^http/) {$url=$FIG_Config::cgi_url."/$url"}
1083            # this sucks as excel can only handle one link per cell, so we remove the other links. At the moment users will have to deal with that.
1084            $link =~ s/\<.*?\>//g;
1085            $cell=[$url, $link];
1086        }
1087        elsif ($cell =~ /<input type/)
1088        {
1089            if ($cell =~ /value='(.*?)'/) {$cell = $1}
1090            elsif ($cell =~ /value="(.*?)"/) {$cell = $1}
1091        }
1092        else
1093        {
1094            # this is all the html that I don't know what to do with, like <input type=...>
1095            $cell =~ s/\<.*?\>//g;
1096        }
1097        return $cell;
1098    }
1099    
1100    
1101    
1102    
1103  1;  1;
1104    

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.32

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3