[Bio] / FigKernelPackages / raelib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/raelib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.7, Sun Apr 3 02:15:15 2005 UTC revision 1.35, Tue Jun 6 02:25:37 2006 UTC
# Line 1  Line 1 
1    #
2    # Copyright (c) 2003-2006 University of Chicago and Fellowship
3    # for Interpretations of Genomes. All Rights Reserved.
4    #
5    # This file is part of the SEED Toolkit.
6    #
7    # The SEED Toolkit is free software. You can redistribute
8    # it and/or modify it under the terms of the SEED Toolkit
9    # Public License.
10    #
11    # You should have received a copy of the SEED Toolkit Public License
12    # along with this program; if not write to the University of Chicago
13    # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14    # Genomes at veronika@thefig.info or download a copy from
15    # http://www.theseed.org/LICENSE.TXT.
16    #
17    
18  # -*- perl -*-  # -*- perl -*-
19    
20  =pod  =pod
21    
22  =head1  =head1 RAE Library
23    
24   Some routines and things that Rob uses. Please feel free to use at will and incorporate into   Some routines and things that Rob uses. Please feel free to use at will and incorporate into
25   your own code or move them into FIG.pm or elsewhere.   your own code or move them into FIG.pm or elsewhere.
26    
27     For questions about this email RobE@theFIG.info
28    
29  =cut  =cut
30    
31  package raelib;  package raelib;
32  use strict;  use strict;
33    use Bio::SeqIO;
34    use Bio::Seq;
35    use Bio::Tools::SeqStats;
36    use Bio::SeqFeature::Generic;
37    
38    # we don't know whether the Spreadsheet::WriteExcel methods are available on all systems, and even on the CI systems they are currently in my shared directory
39    # so we use an eval and set the boolean if we are cool.
40    my $useexcel;
41    BEGIN {
42        use lib '/home/seed/Rob/perl/lib/perl5/site_perl/5.8.5/';
43        eval "use Spreadsheet::WriteExcel";
44        unless ($@) {$useexcel=1}
45    }
46    
47    
48  use FIG;  use FIG;
49  my $fig=new FIG;  my $fig=new FIG;
50    
# Line 21  Line 55 
55  =cut  =cut
56    
57  sub new {  sub new {
58   my $self=shift;   my ($class)=@_;
59   return $self   my $self={};
60     $self->{'useexcel'}=1 if ($useexcel);
61     return bless $self, $class;
62  }  }
63    
64    
# Line 63  Line 99 
99  }  }
100    
101    
102    =head2 mol_wt
103    
104    Calculate the molecular weight of a protein.
105    
106    This just offlaods the calculation to BioPerl, which is probably dumb since we need to load the whole of bioperl in just for this, but I don't have time to rewrite their method now, and I am not going to copy and paste it since I didn't write it :)
107    
108    my ($lower, $upper)=$raelib->mol_wt($peg);
109    
110    $lower is the lower bound for the possible mw, upper is the upper bound.
111    
112    =cut
113    
114    sub mol_wt {
115        my ($self, $fid)=@_;
116        my $sobj=Bio::Seq->new(-seq => $fig->get_translation($fid), -id => $fid);
117        return Bio::Tools::SeqStats->get_mol_wt($sobj);
118    }
119    
120    
121  =head2 pirsfcorrespondence  =head2 pirsfcorrespondence
122    
123   Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written.  Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written. This is so that we can easily go back and forth.
  This is so that we can easily go back and forth.  
124    
125   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat
126    
127    This method takes three arguments:
128       from    : pirsfinfo.dat file
129       to      : file to write information to
130       verbose : report on progress
131    
132    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
133    
134    Returns the number of lines in the pirsinfo file that were read.
135    
136  =cut  =cut
137    
138  sub pirsfcorrespondence {  sub pirsfcorrespondence {
139   my ($self, $from, $to)=@_;   my ($self, $from, $to, $verbose)=@_;
140   die "File $from does not exist as called in $0" unless (-e $from);   unless (-e $from) {
141      print STDERR "File $from does not exist as called in $0\n";
142      return 0;
143     }
144     if ($from =~ /\.gz$/) {
145      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
146     }
147     else {
148   open (IN, $from) || die "Can't open $from";   open (IN, $from) || die "Can't open $from";
149   open (OUT, ">$to") || die "Can't write tot $to";   }
150     open (OUT, ">$to") || die "Can't write to $to";
151     my $linecount;
152   while (<IN>) {   while (<IN>) {
153      $linecount++;
154      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
155    if (/^>/) {print OUT; next}    if (/^>/) {print OUT; next}
156    chomp;    chomp;
157    my $done;    foreach my $peg ($self->swiss_pir_ids($_)) {
   foreach my $peg ($fig->by_alias("uni|$_")) {  
158     print OUT $_, "\t", $peg, "\n";     print OUT $_, "\t", $peg, "\n";
    $done=1;  
159    }    }
   unless ($done) {print OUT $_, "\t\n"}  
160   }   }
161   close IN;   close IN;
162   close OUT;   close OUT;
163     return $linecount;
164    }
165    
166    =head2 uniprotcorrespondence
167    
168    Generate a correspondence table between uniprot knowledge base IDs and FIG ID's.
169    
170    The uniprot KB file is in the form:  UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name
171    
172     This method takes three arguments:
173       from    : uniprotKB file
174       to      : file to write information to
175       verbose : report on progress
176    
177    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
178    
179     Returns the number of lines in the uniprotkb file that were read.
180    
181    =cut
182    
183    sub uniprotcorrespondence {
184     my ($self, $from, $to, $verbose)=@_;
185     unless (-e $from) {
186      print STDERR "File $from does not exist as called in $0\n";
187      return 0;
188     }
189     if ($from =~ /\.gz$/) {
190      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
191     }
192     else {
193      open (IN, $from) || die "Can't open $from";
194     }
195     open (OUT, ">$to") || die "Can't write to $to";
196     my $linecount;
197     while (<IN>) {
198      chomp;
199      $linecount++;
200      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
201      my @line=split /\s+\|\s+/;
202      my $added;
203      foreach my $peg ($self->swiss_pir_ids($line[0])) {
204       print OUT "$_ | $peg\n";
205       $added=1;
206      }
207      unless ($added) {print OUT "$_\n"}
208     }
209     close IN;
210     close OUT;
211     return $linecount;
212    }
213    
214    =head2 prositecorrespondence
215    
216    Generate a correspondence table between prosite and seed using sp id's and seed ids.
217    
218    The SwissProt prosite file is from ftp://ca.expasy.org/databases/prosite/release_with_updates/prosite.dat and is in horrible swiss prot format, so we'll parse out those things that we need and put them in the file
219    
220    The output file will have the following columns:
221    
222    prosite family accession number, prosite family name, family type, swiss-prot protein id, fig protein id.
223    
224    The family type is one of rule, pattern, or matrix. Right now (Prosite Release 19.2 of 24-May-2005) there are 4 rules, 1322 patterns, and 521 matrices.
225    
226     This method takes three arguments:
227       from    : prosite file
228       to      : file to write information to
229       verbose : report on progress
230    
231    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
232    
233     Returns the number of lines in the prosite file that were read.
234    
235    =cut
236    
237    sub prositecorrespondence {
238     my ($self, $from, $to, $verbose)=@_;
239     unless (-e $from) {
240      print STDERR "File $from does not exist as called in $0\n";
241      return 0;
242     }
243     if ($from =~ /\.gz$/) {
244      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
245     }
246     else {
247      open (IN, $from) || die "Can't open $from";
248     }
249     open (OUT, ">$to") || die "Can't write to $to";
250     my $linecount;
251     my ($famac, $famname, $famtype)=('','','');
252     while (<IN>) {
253      chomp;
254      $linecount++;
255      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
256      if (m#//#) {($famac, $famname, $famtype)=('','',''); next}
257      elsif (m/^ID\s*(.*?);\s*(\S+)/) {($famname, $famtype)=($1, $2); next}
258      elsif (m/^AC\s*(\S+)/) {$famac=$1; $famac =~ s/\;\s*$//; next}
259      next unless (m/^DR/); # ignore all the other crap in the prosite file for now. Note we might, at some point, want to grab all that, but that is for another time.
260      #
261      # this is the format of the DR lines:
262      # DR   P11460, FATB_VIBAN , T; P40409, FEUA_BACSU , T; P37580, FHUD_BACSU , T;
263      s/^DR\s*//;
264      foreach my $piece (split /\s*\;\s*/, $_) {
265       my ($acc, $nam, $unk)=split /\s*\,\s*/, $piece;
266       foreach my $fig ($self->swiss_pir_ids($acc)) {
267        print OUT join "\t", ($famac, $famname, $famtype, $acc, $fig), "\n";
268       }
269      }
270  }  }
271    }
272    
273    =head2 swiss_pir_ids()
274    
275    SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's, and return an array of FIG id's that match the ID.
276    
277    =cut
278    
279    sub swiss_pir_ids {
280     my ($self, $id)=@_;
281     return () unless ($id);
282     $id =~ s/^\s+//; $id =~ s/\s+$//; # trim off the whitespace
283    
284     my @return=($fig->by_alias("uni|$id"));
285     return @return if ($return[0]);
286    
287     @return=($fig->by_alias("tr|$id"));
288     return @return if ($return[0]);
289    
290     @return=($fig->by_alias("sp|$id"));
291     return @return if ($return[0]);
292    
293     return ();
294    }
295    
296  =head2 ss_by_id  =head2 ss_by_id
297    
# Line 173  Line 369 
369   my @return;   my @return;
370   my @attr=$fig->feature_attributes($peg);   my @attr=$fig->feature_attributes($peg);
371   foreach my $attr (@attr) {   foreach my $attr (@attr) {
372    my ($gottag, $val, $link)=@$attr;    my ($gotpeg, $gottag, $val, $link)=@$attr;
373    push @return, $val if ($gottag eq $tag);    push @return, $val if ($gottag eq $tag);
374   }   }
375   return wantarray ? @return : join "; ", @return;   return wantarray ? @return : join "; ", @return;
# Line 230  Line 426 
426    
427  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.
428    
429  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple);  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple, $default, $limit);
430    
431  multiple selections will only be set if $multiple is true  multiple selections will only be set if $multiple is true
432    
433    default will set a default to override (maybe) korgs
434    
435    limit is a reference to an array of organism IDs that you want to limit the list to.
436    
437  =cut  =cut
438    
439  sub scrolling_org_list {  sub scrolling_org_list {
440   my ($self, $cgi, $multiple)=@_;   my ($self, $cgi, $multiple, $default, $limit)=@_;
441   unless ($multiple) {$multiple=0}   unless ($multiple) {$multiple=0}
442    
443   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );
# Line 285  Line 485 
485   my $orgs; my $label;   my $orgs; my $label;
486   @$orgs =  $fig->genomes( $complete, undef, $canonical{ $req_dom } );   @$orgs =  $fig->genomes( $complete, undef, $canonical{ $req_dom } );
487    
488     # limit the list of organisms to a selected few if required
489     if ($limit)
490     {
491        my %lim=map {($_=>1)} @$limit;
492        my $norg;
493        foreach my $o (@$orgs) {push @$norg, $o if ($lim{$o})}
494        $orgs=$norg;
495    }
496    
497   foreach (@$orgs) {   foreach (@$orgs) {
498     my $gs = $fig->genus_species($_);     my $gs = $fig->genus_species($_);
499     my $gc=scalar $fig->all_contigs($_);     if ($fig->genome_domain($_) ne "Environmental Sample")
500       {
501        my $gc=$fig->number_of_contigs($_);
502     $label->{$_} = "$gs ($_) [$gc contigs]";     $label->{$_} = "$gs ($_) [$gc contigs]";
503    }    }
504       else
505       {
506        $label->{$_} = "$gs ($_) ";
507       }
508      }
509    
510   @$orgs = sort {$label->{$a} cmp $label->{$b}} @$orgs;   @$orgs = sort {$label->{$a} cmp $label->{$b}} @$orgs;
511    
# Line 303  Line 519 
519                                          -labels   => $label,                                          -labels   => $label,
520                                          -size     => 10,                                          -size     => 10,
521                                          -multiple => $multiple,                                          -multiple => $multiple,
522                                            -default  => $default,
523                                        ), $cgi->br,                                        ), $cgi->br,
524                    "$n_genomes genomes shown ",                    "$n_genomes genomes shown ",
525                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,
# Line 313  Line 530 
530                    "</TD>",                    "</TD>",
531                    "   </TR>\n",                    "   </TR>\n",
532                    "</TABLE>\n",                    "</TABLE>\n",
                   $cgi->hr  
533          );          );
534  }  }
535    
536    
537    =head2 scrolling_subsys_list
538    
539    Create a scrolling list of all subsystems. Just like scrolling_org_list, this will make the list and allow you to select multiples.
540    
541    use like this
542    
543    push @$html, $raelib->scrolling_subsys_list($cgi, $multiple);
544    
545    =cut
546    
547    sub scrolling_subsys_list {
548     my ($self, $cgi, $multiple)=@_;
549     $multiple=0 unless (defined $multiple);
550     my @ss=sort {uc($a) cmp uc($b)} $fig->all_subsystems();
551     my $label;
552     # generate labels for the list
553     foreach my $s (@ss) {my $k=$s; $k =~ s/\_/ /g; $k =~ s/  / /g; $k =~ s/\s+$//; $label->{$s}=$k}
554     return $cgi->scrolling_list(
555      -name    => 'subsystems',
556      -values  => \@ss,
557      -labels  => $label,
558      -size    => 10,
559      -multiple=> $multiple,
560     );
561    }
562    
563    =head2 subsys_names_for_display
564    
565    Return a list of subsystem names for display. This will take a list as an argument and return a nice clean list for display.
566    
567    $raelib->subsys_names_for_display(@ss);
568    or
569    $raelib->subsys_names_for_display($fig->all_subsystems());
570    
571    =cut
572    
573    sub subsys_names_for_display {
574     my ($self, @ss)=@_;
575     foreach (@ss) {s/\_/ /g; 1 while (s/  / /g); s/\s+$//}
576     return @ss;
577    }
578    
579    =head2 GenBank
580    
581     This object will take a genome number and return a Bio::Seq::RichSeq object that has the whole genome
582     in GenBank format. This should be a nice way of getting some data out, but will probably be quite slow
583     at building the object.
584    
585     Note that you need to call this with the genome name and the contig. This will then go through that contig.
586    
587     Something like this should work
588    
589     foreach my $contig ($fig->all_contigs($genome)) {
590      my $seqobj=FIGRob->GenBank($genome, $contig);
591      # process the contig
592     }
593    
594    =cut
595    
596    sub GenBank {
597     my ($self, $genome, $contig)=@_;
598     my $gs=$fig->genus_species($genome);
599     return unless ($gs);
600     unless ($contig) {
601      print STDERR "You didn't provide a contig for $gs. I think that was a mistake. Sorry\n";
602      return;
603     }
604     my $len=$fig->contig_ln($genome, $contig);
605     unless ($len) {
606      print STDERR "$contig from $gs doesn't appear to have a length. Is it right?\n";
607      return;
608     }
609    
610    
611     # first find all the pegs ...
612     my $features; # all the features in the genome
613     my $allpegs; # all the pegs
614     my $translation; # all the protein sequences
615     foreach my $peg ($fig->pegs_of($genome)) {
616      my @location=$fig->feature_location($peg);
617      my $func=$fig->function_of($peg);
618      foreach my $loc (@location) {
619       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
620       my ($cg, $start, $stop)=($1, $2, $3);
621       next unless ($cg eq $contig);
622       # save this information for later
623       $features->{'peg'}->{$loc}=$func;
624       $allpegs->{'peg'}->{$loc}=$peg;
625       $translation->{'peg'}->{$loc}=$fig->get_translation($peg);
626      }
627     }
628     # ... and all the RNAs
629     foreach my $peg ($fig->rnas_of($genome)) {
630      my @location=$fig->feature_location($peg);
631      my $func=$fig->function_of($peg);
632      foreach my $loc (@location) {
633       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
634       my ($cg, $start, $stop)=($1, $2, $3);
635       next unless ($cg eq $contig);
636       # save this information for later
637       $features->{'rna'}->{$loc}=$func;
638       $allpegs->{'rna'}->{$loc}=$peg;
639      }
640     }
641    
642    
643     # now get all the contigs out
644     my $seq=$fig->dna_seq($genome, $contig."_1_".$len);
645     my $description = "Contig $contig from " . $fig->genus_species($genome);
646     my $sobj=Bio::Seq->new(
647              -seq              =>  $seq,
648              -id               =>  $contig,
649              -desc             =>  $description,
650              -accession_number =>  $genome
651              );
652     foreach my $prot (keys %{$features->{'peg'}}) {
653       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
654       my ($cg, $start, $stop)=($1, $2, $3);
655       my $strand=1;
656       if ($stop < $start) {
657        ($stop, $start)=($start, $stop);
658        $strand=-1;
659     }
660    
661     my $feat=Bio::SeqFeature::Generic->new(
662            -start         =>  $start,
663            -end           =>  $stop,
664            -strand        =>  $strand,
665            -primary       =>  'CDS',
666            -display_name  =>  $allpegs->{'peg'}->{$prot},
667            -source_tag    =>  'the SEED',
668            -tag           =>
669                           {
670                           db_xref     =>   $allpegs->{'peg'}->{$prot},
671                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
672                           function    =>  $features->{'peg'}->{$prot},
673                           translation =>  $translation->{'peg'}->{$prot}
674                          }
675           );
676    
677       $sobj->add_SeqFeature($feat);
678     }
679    
680     foreach my $prot (keys %{$features->{'rna'}}) {
681       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
682       my ($cg, $start, $stop)=($1, $2, $3);
683       my $strand=1;
684       if ($stop < $start) {
685        ($stop, $start)=($start, $stop);
686        $strand=-1;
687       }
688    
689       my $feat=Bio::SeqFeature::Generic->new(
690            -start         =>  $start,
691            -end           =>  $stop,
692            -strand        =>  $strand,
693            -primary       =>  'RNA',
694            -source_tag    =>  'the SEED',
695            -display_name  =>  $allpegs->{'rna'}->{$prot},
696            -tag           =>
697                          {
698                           db_xref     =>   $allpegs->{'rna'}->{$prot},
699                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
700                           function    =>  $features->{'rna'}->{$prot},
701                          }
702           );
703    
704      $sobj->add_SeqFeature($feat);
705     }
706     return $sobj;
707    }
708    
709    =head2 best_hit
710    
711     Returns the FIG id of the single best hit to a peg
712    
713     eg
714    
715     my $bh=$fr->best_hit($peg);
716     print 'function is ', scalar $fig->function_of($bh);
717    
718    =cut
719    
720    sub best_hit {
721     my ($self, $peg)=@_;
722     return unless ($peg);
723    
724     my ($maxN, $maxP)=(1, 1e-5);
725     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
726     return ${$sims[0]}[1];
727    }
728    
729    
730    =head1 read_fasta
731    
732    Read a fasta format file and return a reference to a hash with the data. The key is the ID and the value is the sequence. If you supply the optional keep comments then the comments (anything after the first white space are returned as a sepaarte hash).
733    
734    Usage:
735    my $fasta=$raelib->read_fasta($file);
736    my ($fasta, $comments)=$raelib->read_fasta($file, 1);
737    
738    =cut
739    
740    sub read_fasta {
741     my ($self, $file, $keepcomments)=@_;
742     open (IN, $file) || die "Can't open $file";
743     my %f; my $t; my $s; my %c;
744     while (<IN>) {
745      chomp;
746      if (/^>/) {
747       if ($s) {
748        $f{$t}=$s;
749        undef $s;
750       }
751       s/^>(\S+)\s*//;
752       $t=$1;
753       $c{$t}=$_ if ($_);
754      }
755      else {$s .= $_}
756     }
757     $f{$t}=$s;
758     if ($keepcomments) {return (\%f, \%c)}
759     else {return \%f}
760    }
761    
762    =head1 rc
763    
764    Reverse complement. It's too easy.
765    
766    =cut
767    
768    sub rc {
769     my ($self, $seq)=@_;
770     $seq=~tr/GATCgatc/CTAGctag/;
771     $seq = reverse $seq;
772     return $seq;
773    }
774    
775    
776    =head2 cookies
777    
778    Handle cookies. This method will get and set the value of the FIG cookie. Cookies are name/value pairs that are stored on the users computer. We then retrieve them using this method. The cookies are passed in as a reference to a hash, and the method returns a tuple of the cookie that can be passed to the browser and a reference to a hash with the data.
779    
780    If you do not pass any arguments the whole cookie will be returned.
781    
782    Use as follows:
783    
784    ($cookie, $data) = raelib->cookie($cgi, \%data);
785    
786    You do not need to pass in any data, in that case you will just get the cookie back
787    
788    Underneath, I create a single cookie called FIG which stores all the information. The names and value pairs are stored using = to join name to value and ; to concatenate. This way we can create a single cookie with all the data. I am using the FIG::clean_attribute_key method to remove unwanted characters from the name/value pairs, so don't use them.
789    
790    Note that for the moment I have put this routine here since it needs to maintain the state of the cookie (i.e. it needs to know what $self is). It should really be in HTML.pm but that is not, as far as I can tell, maintaining states?
791    
792    =cut
793    
794    sub cookie {
795     my ($self, $cgi, $input)=@_;
796     return unless ($cgi);
797     $self->{'cookie'}=$cgi->cookie(-name=>"FIG") unless ($self->{'cookie'});
798    
799     # first, create a hash from the existing cookie data
800     my $cookie;
801     map {
802      my ($kname, $kvalue)=split /\=/, $_;
803      $cookie->{$kname}=$kvalue;
804     } split /\;/, $self->{'cookie'};
805    
806     if ($input)
807     {
808      # add the values that were passed in
809      map {$cookie->{FIG->clean_attribute_key($_)}=$input->{$_}} keys %$input;
810      # put everything back together and set the cookie
811      my $newcookie=join ";", map {$_ . "=" . $cookie->{$_}} keys %$cookie;
812      $self->{'cookie'}=$cgi->cookie(-name=>"FIG", -value=>$newcookie, -expires=>'+1y');
813     }
814    
815     return ($self->{'cookie'}, $cookie);
816    }
817    
818    
819    =head1 is_number
820    
821    returns 1 if the argument is a number, and 0 if not. This is taken directly from the perl cookbook.
822    
823    =cut
824    
825    sub is_number {
826        my ($self, $no)=@_;
827        return 1 if ($no =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/); # Perl cookbook, page 44
828        return 0;
829    }
830    
831    
832    
833    =head1 commify
834    
835    Put commas in numbers. I think this comes straight from the perl cookbook and is very useful for nice displays
836    
837    =cut
838    
839    sub commify {
840        my($self,$n) = @_;
841        my(@n) = ();
842        my($i);
843    
844        for ($i = (length($n) - 3); ($i > 0); $i -= 3)
845        {
846            unshift(@n,",",substr($n,$i,3));
847        }
848        unshift(@n,substr($n,0,$i+3));
849        return join("",@n);
850    }
851    
852    
853    =head1 tab2excel
854    
855    This is experimental as of May, 2006.
856    
857    There are a couple of perl modules that allow you to write to excel files, and so I am trying out the idea of taking our standard $tab table respresentation that is used in HTML.pm and making an excel file that people could download. It seems like that would be a great tool for them to have.
858    
859    At the moment the excel modules are in my shared space on the CI machines, and so won't work in every seed installation. Therefore the $self->{'useexcel'} boolean is set at compile time if we successfully load the module.
860    
861    The issues are:
862        1. creating the excel file
863        2. reading through @$tab and presenting the data
864        3. Checking @$tab because each element can be a reference to an array with color or formatting information
865    
866    Formatting
867    
868    A separate set of formats must be created for each color and font combination since the formats are applied at the end of the processing of the file.
869    
870    
871    Usage:
872        Note the usage is ALMOST the same as make_table, but not quite. First, options is a reference to a hash rather than the hash itself
873        and second, the additional option "filename" that is the filename to be written;
874    
875        $url = $raelib->tab2excel($col_hdrs, $tab, $title, $options, "filename");
876    
877        The filename will be created in $FIG_Config::temp. The extension .xls will be added to the filename if it is not present.
878    
879    Returns:
880        A link to the file in the format
881            <p><a href="...">filename</a> [Download Excel file]</p>
882    
883    Note that there are four separate methods:
884        1. tab2excel is the method for a single call from HTML::make_table
885            this will make an excel file, fill it, and return the link;
886        2. make_excel_workbook is the method that instantiates a file
887        3. make_excel_worksheet is the method that actually populates the file
888            this loads all the data into the excel file, but if you know what you are doing you can call this many times,
889            each with a different spreadsheet
890        4. close_excel_file
891            this closes the file and writes it. It is what returns the link.
892    
893        tab2excel is a wrapper for all three so that the method in HTML::make_table is really easy.
894        See subsys.cgi for a more complex involvement of this!
895    
896    
897    =cut
898    
899    sub tab2excel {
900        my($self, $col_hdrs, $tab, $title, $options, $filename)=@_;
901        return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
902        $self->make_excel_workbook($filename, $options);
903        $self->make_excel_worksheet($col_hdrs, $tab, $title);
904        return $self->close_excel_file();
905    }
906    
907    =head1 make_excel_workbook
908    
909    This is the method that actually makes individual workbook. You should call this once, with the name of the file that you want it to be known by. The options are to set borders and whatnot.
910    
911    =cut
912    
913    sub make_excel_workbook {
914        my($self, $filename, $options)=@_;
915        return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
916    
917        $filename =~ s/^.*\///; # remove any path information. We are going to only write to FIG_Config::temp
918        unless ($filename =~ /\.xls$/) {$filename .=".xls"}
919        $self->{'excel_short_filename'}=$filename;
920        $self->{'excel_filename'}=$FIG_Config::temp."/$filename";
921    
922        # Each excel file consists of the file, and then of worksheets from within the file. These are the tabs at the bottom of the screen
923        # that can be added with "Insert->new worksheet" from the menus.
924        # Create a new workbook called simple.xls and add a worksheet
925    
926        # instantiate the workbook
927        $self->{'excel_workbook'}=Spreadsheet::WriteExcel->new($self->{'excel_filename'});
928        $self->{'excel_workbook'}->set_tempdir($FIG_Config::temp); # you don't have to do this, but it may speed things up and reduce memory load.
929    
930        # define the default formats
931        my $border = defined $options->{border} ? $options->{border} : 0;
932        $self->{'excel_format'}->{default}=$self->{'excel_workbook'}->add_format(border=>$border, size=>10);
933    }
934    
935    
936    =head1 make_excel_worksheet()
937    
938    This is the method that makes the separate sheets in the file. You can add as many of these as you want.
939    
940    =cut
941    
942    sub make_excel_worksheet {
943        my($self, $col_hdrs, $tab, $title)=@_;
944        return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
945        unless (defined $self->{'excel_workbook'})
946        {
947            print STDERR "The workbook was not defined. Couldn't fill it in\n";
948            return;
949        }
950    
951        my $worksheet = $self->{'excel_workbook'}->add_worksheet($title);
952        # The general syntax for output to an excel file is write($row, $column, $value, $format). Note that row and
953        # column are zero indexed
954    
955        # write the column headers
956        # define a new format that is bold
957        $self->{'excel_format'}->{header} = $self->{'excel_workbook'}->add_format();
958        $self->{'excel_format'}->{header}->copy($self->{'excel_format'}->{default});
959        $self->{'excel_format'}->{header}->set_bold();
960    
961        for my $i (0 .. $#$col_hdrs)
962        {
963            my $cell=$self->clean_excel_cell($col_hdrs->[$i]);
964            $worksheet->write(0, $i, $cell, $self->{'excel_format'}->{header});
965        }
966    
967        # now loop through the table and write them out. Remember to break on array refs
968        my $row_idx=1;
969        my $col_idx=0;
970        foreach my $row (@$tab)
971        {
972            foreach my $cell (@$row)
973            {
974                my $useformat=$self->{'excel_format'}->{default};
975                if (ref($cell) eq "ARRAY")
976                {
977                    ($cell, $useformat)=$self->parse_cell($cell);
978                }
979    
980                $cell=$self->clean_excel_cell($cell);
981                # this is a botch, but in some circumstances we need to split the cell out. e.g. if it is a URL
982                # in this case we have a reference to an array, and we'll use  a slight modification on the process
983                if (ref($cell) eq "ARRAY") {$worksheet->write($row_idx, $col_idx, @$cell, $useformat)}
984                else {$worksheet->write($row_idx, $col_idx, $cell, $useformat)}
985    
986                # increment to the next column
987                $col_idx++;
988            }
989            # new line, and start of line
990            $row_idx++;
991            $col_idx=0;
992        }
993    }
994    
995    
996    
997    
998    =head1 close_excel_file()
999    
1000    We must explicitly close the file before creating the link so that the file is written. This is also what returns the link
1001    
1002    =cut
1003    
1004    sub close_excel_file{
1005        my ($self)=@_;
1006    
1007        # close the workbook. this writes the files
1008        $self->{'excel_workbook'}->close();
1009    
1010        # now generate the link to return
1011        my $size=(stat($self->{'excel_filename'}))[7];
1012        $size=int($size/1000);
1013        my $link="<p><a href=\"".$fig->temp_url."/".$self->{'excel_short_filename'}.'">'.
1014                $self->{'excel_short_filename'}."</a> [Download table in Excel format. $size kb]</p>\n";
1015        return $link;
1016    
1017    }
1018    
1019    
1020    
1021    
1022    
1023    
1024    
1025    
1026    
1027    =head2 parse_cell()
1028    
1029    A method to take the cell from the table where there is some formatting information and figure out what we know. Return the data and the format.
1030    
1031    Requires the cell and the current $format.
1032    
1033    When applied to <td> the default formats that we'll deal with at the moment are
1034         align=
1035         background-color=
1036         color=
1037         bgcolor=
1038    
1039    Colors are funky in excel because it only has a limited palette. We rename colors as needed, and then save those so that we can use them again. We're only allowed 55 colors in excel (numbered 8..63). Because its a little stupid to mess with black and white and so on, I ignore those, and also start renumbering at color number 20, giving us 43 different colors.
1040    
1041    The reference to the hash excel_color has the custom excel colors stored in it for a few colors, and others are added to it.
1042    
1043    =cut
1044    
1045    sub parse_cell {
1046        my ($self, $arr)=@_;
1047        return ($arr, $self->{'excel_format'}->{default}) unless (ref($arr) eq "ARRAY");
1048        my ($cell, $tag)=@$arr;
1049        $tag =~ s/\'/"/g; # this just makes it easier to parse the things like align='center' and align="center" that are both valid
1050    
1051        # we are going to define a series of formats that we can apply, this will have  a key that is
1052        # th.center.bgcolor.fgcolor. Then if we already have that, we can use it, if not, we'll define it
1053    
1054        my ($th, $center, $bgcolor, $fgcolor)=(undef, undef, undef, undef);
1055    
1056        if ($tag =~ /^th/) {$th=1} # it is a header cell so we should make it bold
1057        if ($tag =~ /align\=\"(.*?)\"/i) {$center=$1}
1058    
1059        # get rid of white tags because I don't care about them
1060        $tag =~ s/color\=.\#FFFFFF/ /ig;
1061    
1062        if ($tag =~ /background-color\=\"(.*?)\"/i || $tag =~ /bgcolor\=\"(.*?)\"/i)
1063        {
1064            my $color=$1;
1065            if (!defined $self->{'excel_color'}->{$color})
1066            {
1067                # find out the last custom color used and increment it
1068                my $max=19; # we are not going to use a color less than 20
1069                foreach my $k (keys %{$self->{'excel_color'}}) {($self->{'excel_color'}->{$k} > $max) ? ($max=$self->{'excel_color'}->{$k}) :1}
1070                $max++;
1071                $self->{'excel_color'}->{$color}=$self->{'excel_workbook'}->set_custom_color($max, $color);
1072            }
1073            $bgcolor=$self->{'excel_color'}->{$color};
1074        }
1075        elsif ($tag =~ /color\=\"(.*?)\"/i || $tag =~ /color\=\'(.*?)\'/i)
1076        {
1077            my $color=$1;
1078            if (!defined $self->{'excel_color'}->{$color})
1079            {
1080                # find out the last custom color used and increment it
1081                my $max=19; # we are not going to use a color less than 20
1082                foreach my $k (keys %{$self->{'excel_color'}}) {($self->{'excel_color'}->{$k} > $max) ? ($max=$self->{'excel_color'}->{$k}) :1}
1083                $max++;
1084                $self->{'excel_color'}->{$color}=$self->{'excel_workbook'}->set_custom_color($max, $color);
1085            }
1086            $fgcolor=$self->{'excel_color'}->{$color};
1087        }
1088    
1089        if (!defined $self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor})
1090        {
1091            $self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor}=$self->{'excel_workbook'}->add_format();
1092            if ($th) {$self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor}->copy($self->{'excel_format'}->{header})}
1093            else {$self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor}->copy($self->{'excel_format'}->{default})}
1094            $center && $self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor}->set_align($center);
1095            $bgcolor && $self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor}->set_bg_color($bgcolor);
1096            $fgcolor && $self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor}->set_color($fgcolor);
1097        }
1098    
1099        return ($cell, $self->{'excel_format'}->{$th.$center.$bgcolor.$fgcolor});
1100    }
1101    
1102    
1103    =head1 clean_excel_cell
1104    
1105    Process the cells to remove &nbsp; and also convert relative URLs to full URLs
1106    
1107    =cut
1108    
1109    sub clean_excel_cell {
1110        my ($self, $cell)=@_;
1111        if ($cell =~ /^\s*\&nbsp\;\s*$/) {$cell=undef} # ignore white space
1112    
1113        # some cells have something like this:
1114        # <a  onMouseover="javascript:if(!this.tooltip) this.tooltip=new Popup_Tooltip(this,'Role of BCAT','Branched-chain amino acid aminotransferase (EC 2.6.1.42)','','','','');this.tooltip.addHandler(); return false;" >BCAT</a>
1115        # we don't want those, but we do want the ones that have a real url hidden here.
1116        # so remove the mouseover part, and then see what is left
1117        if ($cell =~ s/onMouseover\=\".*?\"//)
1118        {
1119            if ($cell =~ s/\<a\s+>//i) {$cell =~ s/\<\/a>//i}
1120        }
1121    
1122    
1123        if ($cell =~ /\<a href=.(.*?).>(.*)<\/a>/)
1124        {
1125            # this is tricky because if the cell is a url then we need two separate things, the url and the link name
1126            my ($url, $link)=($1, $2);
1127            $url =~ s/^\.{1,2}\///; # remove notation of ./ and ../
1128            unless ($url =~ /^http/) {$url=$FIG_Config::cgi_url."/$url"}
1129            # this sucks as excel can only handle one link per cell, so we remove the other links. At the moment users will have to deal with that.
1130            $link =~ s/\<.*?\>//g;
1131            $cell=[$url, $link];
1132        }
1133        elsif ($cell =~ /<input type/)
1134        {
1135            if ($cell =~ /value='(.*?)'/) {$cell = $1}
1136            elsif ($cell =~ /value="(.*?)"/) {$cell = $1}
1137        }
1138        else
1139        {
1140            # this is all the html that I don't know what to do with, like <input type=...>
1141            $cell =~ s/\<.*?\>//g;
1142        }
1143        return $cell;
1144    }
1145    
1146    
1147    
1148    
1149  1;  1;
1150    

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.35

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3