[Bio] / FigKernelPackages / raelib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/raelib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.7, Sun Apr 3 02:15:15 2005 UTC revision 1.42, Sun Jan 14 18:15:43 2007 UTC
# Line 1  Line 1 
1    #
2    # Copyright (c) 2003-2006 University of Chicago and Fellowship
3    # for Interpretations of Genomes. All Rights Reserved.
4    #
5    # This file is part of the SEED Toolkit.
6    #
7    # The SEED Toolkit is free software. You can redistribute
8    # it and/or modify it under the terms of the SEED Toolkit
9    # Public License.
10    #
11    # You should have received a copy of the SEED Toolkit Public License
12    # along with this program; if not write to the University of Chicago
13    # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14    # Genomes at veronika@thefig.info or download a copy from
15    # http://www.theseed.org/LICENSE.TXT.
16    #
17    
18  # -*- perl -*-  # -*- perl -*-
19    
20  =pod  =pod
21    
22  =head1  =head1 RAE Library
23    
24   Some routines and things that Rob uses. Please feel free to use at will and incorporate into   Some routines and things that Rob uses. Please feel free to use at will and incorporate into
25   your own code or move them into FIG.pm or elsewhere.   your own code or move them into FIG.pm or elsewhere.
26    
27     For questions about this email RobE@theFIG.info
28    
29  =cut  =cut
30    
31  package raelib;  package raelib;
32  use strict;  use strict;
33    use Bio::SeqIO;
34    use Bio::Seq;
35    use Bio::Tools::SeqStats;
36    use Bio::SeqFeature::Generic;
37    
38    # we don't know whether the Spreadsheet::WriteExcel methods are available on all systems, and even on the CI systems they are currently in my shared directory
39    # so we use an eval and set the boolean if we are cool.
40    my $useexcel;
41    my $excelfile;
42    my $excelfilelink="";
43    BEGIN {
44        eval "use Spreadsheet::WriteExcel";
45        unless ($@) {$useexcel=1}
46    }
47    
48    END {
49        my $self=shift;
50        if ($useexcel && $excelfile) {&close_excel_file($excelfile)}
51    }
52    
53    
54  use FIG;  use FIG;
55  my $fig=new FIG;  my $fig=new FIG;
56    
# Line 21  Line 61 
61  =cut  =cut
62    
63  sub new {  sub new {
64   my $self=shift;   my ($class)=@_;
65   return $self   my $self={};
66     $self->{'useexcel'}=1 if ($useexcel);
67     return bless $self, $class;
68  }  }
69    
70    
# Line 63  Line 105 
105  }  }
106    
107    
108    =head2 mol_wt
109    
110    Calculate the molecular weight of a protein.
111    
112    This just offlaods the calculation to BioPerl, which is probably dumb since we need to load the whole of bioperl in just for this, but I don't have time to rewrite their method now, and I am not going to copy and paste it since I didn't write it :)
113    
114    my ($lower, $upper)=$raelib->mol_wt($peg);
115    
116    $lower is the lower bound for the possible mw, upper is the upper bound.
117    
118    =cut
119    
120    sub mol_wt {
121        my ($self, $fid)=@_;
122        my $sobj=Bio::Seq->new(-seq => $fig->get_translation($fid), -id => $fid);
123        return Bio::Tools::SeqStats->get_mol_wt($sobj);
124    }
125    
126    
127  =head2 pirsfcorrespondence  =head2 pirsfcorrespondence
128    
129   Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written.  Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written. This is so that we can easily go back and forth.
  This is so that we can easily go back and forth.  
130    
131   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat
132    
133    This method takes three arguments:
134       from    : pirsfinfo.dat file
135       to      : file to write information to
136       verbose : report on progress
137    
138    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
139    
140    Returns the number of lines in the pirsinfo file that were read.
141    
142  =cut  =cut
143    
144  sub pirsfcorrespondence {  sub pirsfcorrespondence {
145   my ($self, $from, $to)=@_;   my ($self, $from, $to, $verbose)=@_;
146   die "File $from does not exist as called in $0" unless (-e $from);   unless (-e $from) {
147      print STDERR "File $from does not exist as called in $0\n";
148      return 0;
149     }
150     if ($from =~ /\.gz$/) {
151      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
152     }
153     else {
154   open (IN, $from) || die "Can't open $from";   open (IN, $from) || die "Can't open $from";
155   open (OUT, ">$to") || die "Can't write tot $to";   }
156     open (OUT, ">$to") || die "Can't write to $to";
157     my $linecount;
158   while (<IN>) {   while (<IN>) {
159      $linecount++;
160      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
161    if (/^>/) {print OUT; next}    if (/^>/) {print OUT; next}
162    chomp;    chomp;
163    my $done;    foreach my $peg ($self->swiss_pir_ids($_)) {
   foreach my $peg ($fig->by_alias("uni|$_")) {  
164     print OUT $_, "\t", $peg, "\n";     print OUT $_, "\t", $peg, "\n";
    $done=1;  
165    }    }
   unless ($done) {print OUT $_, "\t\n"}  
166   }   }
167   close IN;   close IN;
168   close OUT;   close OUT;
169     return $linecount;
170    }
171    
172    =head2 uniprotcorrespondence
173    
174    Generate a correspondence table between uniprot knowledge base IDs and FIG ID's.
175    
176    The uniprot KB file is in the form:  UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name
177    
178     This method takes three arguments:
179       from    : uniprotKB file
180       to      : file to write information to
181       verbose : report on progress
182    
183    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
184    
185     Returns the number of lines in the uniprotkb file that were read.
186    
187    =cut
188    
189    sub uniprotcorrespondence {
190     my ($self, $from, $to, $verbose)=@_;
191     unless (-e $from) {
192      print STDERR "File $from does not exist as called in $0\n";
193      return 0;
194     }
195     if ($from =~ /\.gz$/) {
196      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
197     }
198     else {
199      open (IN, $from) || die "Can't open $from";
200     }
201     open (OUT, ">$to") || die "Can't write to $to";
202     my $linecount;
203     while (<IN>) {
204      chomp;
205      $linecount++;
206      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
207      my @line=split /\s+\|\s+/;
208      my $added;
209      foreach my $peg ($self->swiss_pir_ids($line[0])) {
210       print OUT "$_ | $peg\n";
211       $added=1;
212      }
213      unless ($added) {print OUT "$_\n"}
214     }
215     close IN;
216     close OUT;
217     return $linecount;
218  }  }
219    
220    =head2 prositecorrespondence
221    
222    Generate a correspondence table between prosite and seed using sp id's and seed ids.
223    
224    The SwissProt prosite file is from ftp://ca.expasy.org/databases/prosite/release_with_updates/prosite.dat and is in horrible swiss prot format, so we'll parse out those things that we need and put them in the file
225    
226    The output file will have the following columns:
227    
228    prosite family accession number, prosite family name, family type, swiss-prot protein id, fig protein id.
229    
230    The family type is one of rule, pattern, or matrix. Right now (Prosite Release 19.2 of 24-May-2005) there are 4 rules, 1322 patterns, and 521 matrices.
231    
232     This method takes three arguments:
233       from    : prosite file
234       to      : file to write information to
235       verbose : report on progress
236    
237    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
238    
239     Returns the number of lines in the prosite file that were read.
240    
241    =cut
242    
243    sub prositecorrespondence {
244     my ($self, $from, $to, $verbose)=@_;
245     unless (-e $from) {
246      print STDERR "File $from does not exist as called in $0\n";
247      return 0;
248     }
249     if ($from =~ /\.gz$/) {
250      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
251     }
252     else {
253      open (IN, $from) || die "Can't open $from";
254     }
255     open (OUT, ">$to") || die "Can't write to $to";
256     my $linecount;
257     my ($famac, $famname, $famtype)=('','','');
258     while (<IN>) {
259      chomp;
260      $linecount++;
261      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
262      if (m#//#) {($famac, $famname, $famtype)=('','',''); next}
263      elsif (m/^ID\s*(.*?);\s*(\S+)/) {($famname, $famtype)=($1, $2); next}
264      elsif (m/^AC\s*(\S+)/) {$famac=$1; $famac =~ s/\;\s*$//; next}
265      next unless (m/^DR/); # ignore all the other crap in the prosite file for now. Note we might, at some point, want to grab all that, but that is for another time.
266      #
267      # this is the format of the DR lines:
268      # DR   P11460, FATB_VIBAN , T; P40409, FEUA_BACSU , T; P37580, FHUD_BACSU , T;
269      s/^DR\s*//;
270      foreach my $piece (split /\s*\;\s*/, $_) {
271       my ($acc, $nam, $unk)=split /\s*\,\s*/, $piece;
272       foreach my $fig ($self->swiss_pir_ids($acc)) {
273        print OUT join "\t", ($famac, $famname, $famtype, $acc, $fig), "\n";
274       }
275      }
276     }
277    }
278    
279    =head2 swiss_pir_ids()
280    
281    SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's, and return an array of FIG id's that match the ID.
282    
283    =cut
284    
285    sub swiss_pir_ids {
286     my ($self, $id)=@_;
287     return () unless ($id);
288     $id =~ s/^\s+//; $id =~ s/\s+$//; # trim off the whitespace
289    
290     my @return=($fig->by_alias("uni|$id"));
291     return @return if ($return[0]);
292    
293     @return=($fig->by_alias("tr|$id"));
294     return @return if ($return[0]);
295    
296     @return=($fig->by_alias("sp|$id"));
297     return @return if ($return[0]);
298    
299     return ();
300    }
301    
302  =head2 ss_by_id  =head2 ss_by_id
303    
# Line 173  Line 375 
375   my @return;   my @return;
376   my @attr=$fig->feature_attributes($peg);   my @attr=$fig->feature_attributes($peg);
377   foreach my $attr (@attr) {   foreach my $attr (@attr) {
378    my ($gottag, $val, $link)=@$attr;    my ($gotpeg, $gottag, $val, $link)=@$attr;
379    push @return, $val if ($gottag eq $tag);    push @return, $val if ($gottag eq $tag);
380   }   }
381   return wantarray ? @return : join "; ", @return;   return wantarray ? @return : join "; ", @return;
# Line 230  Line 432 
432    
433  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.
434    
435  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple);  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple, $default, $limit);
436    
437  multiple selections will only be set if $multiple is true  multiple selections will only be set if $multiple is true
438    
439    default will set a default to override (maybe) korgs
440    
441    limit is a reference to an array of organism IDs that you want to limit the list to.
442    
443  =cut  =cut
444    
445  sub scrolling_org_list {  sub scrolling_org_list {
446   my ($self, $cgi, $multiple)=@_;   my ($self, $cgi, $multiple, $default, $limit)=@_;
447   unless ($multiple) {$multiple=0}   unless ($multiple) {$multiple=0}
448    
449   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );
# Line 285  Line 491 
491   my $orgs; my $label;   my $orgs; my $label;
492   @$orgs =  $fig->genomes( $complete, undef, $canonical{ $req_dom } );   @$orgs =  $fig->genomes( $complete, undef, $canonical{ $req_dom } );
493    
494     # limit the list of organisms to a selected few if required
495     if ($limit)
496     {
497        my %lim=map {($_=>1)} @$limit;
498        my $norg;
499        foreach my $o (@$orgs) {push @$norg, $o if ($lim{$o})}
500        $orgs=$norg;
501    }
502    
503   foreach (@$orgs) {   foreach (@$orgs) {
504     my $gs = $fig->genus_species($_);     my $gs = $fig->genus_species($_);
505     my $gc=scalar $fig->all_contigs($_);     if ($fig->genome_domain($_) ne "Environmental Sample")
506       {
507        my $gc=$fig->number_of_contigs($_);
508     $label->{$_} = "$gs ($_) [$gc contigs]";     $label->{$_} = "$gs ($_) [$gc contigs]";
509    }    }
510       else
511       {
512        $label->{$_} = "$gs ($_) ";
513       }
514      }
515    
516   @$orgs = sort {$label->{$a} cmp $label->{$b}} @$orgs;   @$orgs = sort {$label->{$a} cmp $label->{$b}} @$orgs;
517    
# Line 303  Line 525 
525                                          -labels   => $label,                                          -labels   => $label,
526                                          -size     => 10,                                          -size     => 10,
527                                          -multiple => $multiple,                                          -multiple => $multiple,
528                                            -default  => $default,
529                                        ), $cgi->br,                                        ), $cgi->br,
530                    "$n_genomes genomes shown ",                    "$n_genomes genomes shown ",
531                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,
# Line 313  Line 536 
536                    "</TD>",                    "</TD>",
537                    "   </TR>\n",                    "   </TR>\n",
538                    "</TABLE>\n",                    "</TABLE>\n",
                   $cgi->hr  
539          );          );
540  }  }
541    
542    
543    =head2 scrolling_subsys_list
544    
545    Create a scrolling list of all subsystems. Just like scrolling_org_list, this will make the list and allow you to select multiples.
546    
547    use like this
548    
549    push @$html, $raelib->scrolling_subsys_list($cgi, $multiple);
550    
551    =cut
552    
553    sub scrolling_subsys_list {
554     my ($self, $cgi, $multiple)=@_;
555     $multiple=0 unless (defined $multiple);
556     my @ss=sort {uc($a) cmp uc($b)} $fig->all_subsystems();
557     my $label;
558     # generate labels for the list
559     foreach my $s (@ss) {my $k=$s; $k =~ s/\_/ /g; $k =~ s/  / /g; $k =~ s/\s+$//; $label->{$s}=$k}
560     return $cgi->scrolling_list(
561      -name    => 'subsystems',
562      -values  => \@ss,
563      -labels  => $label,
564      -size    => 10,
565      -multiple=> $multiple,
566     );
567    }
568    
569    =head2 subsys_names_for_display
570    
571    Return a list of subsystem names for display. This will take a list as an argument and return a nice clean list for display.
572    
573    $raelib->subsys_names_for_display(@ss);
574    or
575    $raelib->subsys_names_for_display($fig->all_subsystems());
576    
577    =cut
578    
579    sub subsys_names_for_display {
580     my ($self, @ss)=@_;
581     foreach (@ss) {s/\_/ /g; 1 while (s/  / /g); s/\s+$//}
582     return @ss;
583    }
584    
585    =head2 GenBank
586    
587     This object will take a genome number and return a Bio::Seq::RichSeq object that has the whole genome
588     in GenBank format. This should be a nice way of getting some data out, but will probably be quite slow
589     at building the object.
590    
591     Note that you need to call this with the genome name and the contig. This will then go through that contig.
592    
593     Something like this should work
594    
595     foreach my $contig ($fig->all_contigs($genome)) {
596      my $seqobj=FIGRob->GenBank($genome, $contig);
597      # process the contig
598     }
599    
600    =cut
601    
602    sub GenBank {
603     my ($self, $genome, $contig)=@_;
604     my $gs=$fig->genus_species($genome);
605     return unless ($gs);
606     unless ($contig) {
607      print STDERR "You didn't provide a contig for $gs. I think that was a mistake. Sorry\n";
608      return;
609     }
610     my $len=$fig->contig_ln($genome, $contig);
611     unless ($len) {
612      print STDERR "$contig from $gs doesn't appear to have a length. Is it right?\n";
613      return;
614     }
615    
616    
617     # first find all the pegs ...
618     my $features; # all the features in the genome
619     my $allpegs; # all the pegs
620     my $translation; # all the protein sequences
621     foreach my $peg ($fig->pegs_of($genome)) {
622      my @location=$fig->feature_location($peg);
623      my $func=$fig->function_of($peg);
624      foreach my $loc (@location) {
625       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
626       my ($cg, $start, $stop)=($1, $2, $3);
627       next unless ($cg eq $contig);
628       # save this information for later
629       $features->{'peg'}->{$loc}=$func;
630       $allpegs->{'peg'}->{$loc}=$peg;
631       $translation->{'peg'}->{$loc}=$fig->get_translation($peg);
632      }
633     }
634     # ... and all the RNAs
635     foreach my $peg ($fig->rnas_of($genome)) {
636      my @location=$fig->feature_location($peg);
637      my $func=$fig->function_of($peg);
638      foreach my $loc (@location) {
639       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
640       my ($cg, $start, $stop)=($1, $2, $3);
641       next unless ($cg eq $contig);
642       # save this information for later
643       $features->{'rna'}->{$loc}=$func;
644       $allpegs->{'rna'}->{$loc}=$peg;
645      }
646     }
647    
648    
649     # now get all the contigs out
650     my $seq=$fig->dna_seq($genome, $contig."_1_".$len);
651     my $description = "Contig $contig from " . $fig->genus_species($genome);
652     my $sobj=Bio::Seq->new(
653              -seq              =>  $seq,
654              -id               =>  $contig,
655              -desc             =>  $description,
656              -accession_number =>  $genome
657              );
658     foreach my $prot (keys %{$features->{'peg'}}) {
659       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
660       my ($cg, $start, $stop)=($1, $2, $3);
661       my $strand=1;
662       if ($stop < $start) {
663        ($stop, $start)=($start, $stop);
664        $strand=-1;
665     }
666    
667     my $feat=Bio::SeqFeature::Generic->new(
668            -start         =>  $start,
669            -end           =>  $stop,
670            -strand        =>  $strand,
671            -primary       =>  'CDS',
672            -display_name  =>  $allpegs->{'peg'}->{$prot},
673            -source_tag    =>  'the SEED',
674            -tag           =>
675                           {
676                           db_xref     =>   $allpegs->{'peg'}->{$prot},
677                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
678                           function    =>  $features->{'peg'}->{$prot},
679                           translation =>  $translation->{'peg'}->{$prot}
680                          }
681           );
682    
683       $sobj->add_SeqFeature($feat);
684     }
685    
686     foreach my $prot (keys %{$features->{'rna'}}) {
687       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
688       my ($cg, $start, $stop)=($1, $2, $3);
689       my $strand=1;
690       if ($stop < $start) {
691        ($stop, $start)=($start, $stop);
692        $strand=-1;
693       }
694    
695       my $feat=Bio::SeqFeature::Generic->new(
696            -start         =>  $start,
697            -end           =>  $stop,
698            -strand        =>  $strand,
699            -primary       =>  'RNA',
700            -source_tag    =>  'the SEED',
701            -display_name  =>  $allpegs->{'rna'}->{$prot},
702            -tag           =>
703                          {
704                           db_xref     =>   $allpegs->{'rna'}->{$prot},
705                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
706                           function    =>  $features->{'rna'}->{$prot},
707                          }
708           );
709    
710      $sobj->add_SeqFeature($feat);
711     }
712     return $sobj;
713    }
714    
715    =head2 best_hit
716    
717     Returns the FIG id of the single best hit to a peg
718    
719     eg
720    
721     my $bh=$fr->best_hit($peg);
722     print 'function is ', scalar $fig->function_of($bh);
723    
724    =cut
725    
726    sub best_hit {
727     my ($self, $peg)=@_;
728     return unless ($peg);
729    
730     my ($maxN, $maxP)=(1, 1e-5);
731     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
732     return ${$sims[0]}[1];
733    }
734    
735    
736    =head1 read_fasta
737    
738    Read a fasta format file and return a reference to a hash with the data. The key is the ID and the value is the sequence. If you supply the optional keep comments then the comments (anything after the first white space are returned as a sepaarte hash).
739    
740    Usage:
741    my $fasta=$raelib->read_fasta($file);
742    my ($fasta, $comments)=$raelib->read_fasta($file, 1);
743    
744    =cut
745    
746    sub read_fasta {
747     my ($self, $file, $keepcomments)=@_;
748     if ($file =~ /\.gz$/) {open(IN, "gunzip -c $file|") || die "Can't open a pipe from gunzip -c $file"}
749     else {open (IN, $file) || die "Can't open $file"}
750     my %f; my $t; my $s; my %c;
751     while (<IN>) {
752      chomp;
753      if (/^>/) {
754       if ($s) {
755        $f{$t}=$s;
756        undef $s;
757       }
758       s/^>(\S+)\s*//;
759       $t=$1;
760       $c{$t}=$_ if ($_);
761      }
762      else {$s .= $_}
763     }
764     $f{$t}=$s;
765     if ($keepcomments) {return (\%f, \%c)}
766     else {return \%f}
767    }
768    
769    =head1 rc
770    
771    Reverse complement. It's too easy.
772    
773    =cut
774    
775    sub rc {
776     my ($self, $seq)=@_;
777     $seq=~tr/GATCgatc/CTAGctag/;
778     $seq = reverse $seq;
779     return $seq;
780    }
781    
782    
783    =head2 cookies
784    
785    Handle cookies. This method will get and set the value of the FIG cookie. Cookies are name/value pairs that are stored on the users computer. We then retrieve them using this method. The cookies are passed in as a reference to a hash, and the method returns a tuple of the cookie that can be passed to the browser and a reference to a hash with the data.
786    
787    If you do not pass any arguments the whole cookie will be returned.
788    
789    Use as follows:
790    
791    ($cookie, $data) = raelib->cookie($cgi, \%data);
792    
793    You do not need to pass in any data, in that case you will just get the cookie back
794    
795    Underneath, I create a single cookie called FIG which stores all the information. The names and value pairs are stored using = to join name to value and ; to concatenate. This way we can create a single cookie with all the data. I am using the FIG::clean_attribute_key method to remove unwanted characters from the name/value pairs, so don't use them.
796    
797    Note that for the moment I have put this routine here since it needs to maintain the state of the cookie (i.e. it needs to know what $self is). It should really be in HTML.pm but that is not, as far as I can tell, maintaining states?
798    
799    =cut
800    
801    sub cookie {
802     my ($self, $cgi, $input)=@_;
803     return unless ($cgi);
804     $self->{'cookie'}=$cgi->cookie(-name=>"FIG") unless ($self->{'cookie'});
805    
806     # first, create a hash from the existing cookie data
807     my $cookie;
808     map {
809      my ($kname, $kvalue)=split /\=/, $_;
810      $cookie->{$kname}=$kvalue;
811     } split /\;/, $self->{'cookie'};
812    
813     if ($input)
814     {
815      # add the values that were passed in
816      map {$cookie->{FIG->clean_attribute_key($_)}=$input->{$_}} keys %$input;
817      # put everything back together and set the cookie
818      my $newcookie=join ";", map {$_ . "=" . $cookie->{$_}} keys %$cookie;
819      $self->{'cookie'}=$cgi->cookie(-name=>"FIG", -value=>$newcookie, -expires=>'+1y');
820     }
821    
822     return ($self->{'cookie'}, $cookie);
823    }
824    
825    
826    =head1 is_number
827    
828    returns 1 if the argument is a number, and 0 if not. This is taken directly from the perl cookbook.
829    
830    =cut
831    
832    sub is_number {
833        my ($self, $no)=@_;
834        return 1 if ($no =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/); # Perl cookbook, page 44
835        return 0;
836    }
837    
838    
839    
840    =head1 commify
841    
842    Put commas in numbers. I think this comes straight from the perl cookbook and is very useful for nice displays
843    
844    =cut
845    
846    sub commify {
847        my($self,$n) = @_;
848        my(@n) = ();
849        my($i);
850    
851        for ($i = (length($n) - 3); ($i > 0); $i -= 3)
852        {
853            unshift(@n,",",substr($n,$i,3));
854        }
855        unshift(@n,substr($n,0,$i+3));
856        return join("",@n);
857    }
858    
859    
860    =head1 tab2excel
861    
862    This is experimental as of May, 2006.
863    
864    There are a couple of perl modules that allow you to write to excel files, and so I am trying out the idea of taking our standard $tab table respresentation that is used in HTML.pm and making an excel file that people could download. It seems like that would be a great tool for them to have.
865    
866    At the moment the excel modules are in my shared space on the CI machines, and so won't work in every seed installation. Therefore the $self->{'useexcel'} boolean is set at compile time if we successfully load the module.
867    
868    The issues are:
869        1. creating the excel file
870        2. reading through @$tab and presenting the data
871        3. Checking @$tab because each element can be a reference to an array with color or formatting information
872    
873    Formatting
874    
875    A separate set of formats must be created for each color and font combination since the formats are applied at the end of the processing of the file.
876    
877    
878    Usage:
879    
880        The recommended way of using this set of modules is to add the options excelfile=>"filename" to the options passed to &HTML::make_table. That should take care of EVERYTHING for you, so you should do that. You can call thinks separetly if you like, but I don't recommend it.
881    
882        Note that you can make multiple calls to the same excel file,a nd each one will get added as a new sheet.
883    
884        Note the usage is ALMOST the same as make_table, but not quite. First, options is a reference to a hash rather than the hash itself
885        and second, the additional option "filename" that is the filename to be written;
886    
887        $url = $raelib->tab2excel($col_hdrs, $tab, $title, $options, "filename");
888    
889        The filename will be created in $FIG_Config::temp. The extension .xls will be added to the filename if it is not present.
890    
891    Returns:
892        A link to the file in the format
893            <p><a href="...">filename</a> [Download Excel file]</p>
894    
895    Note that there are four separate methods:
896        1. tab2excel is the method for a single call from HTML::make_table
897            this will make an excel file, fill it, and return the link;
898        2. make_excel_workbook is the method that instantiates a file
899        3. make_excel_worksheet is the method that actually populates the file
900            this loads all the data into the excel file, but if you know what you are doing you can call this many times,
901            each with a different spreadsheet
902        4. close_excel_file
903            this closes the file and writes it. This is called in the END block, so you do not have to explicitly call it here.
904    
905        tab2excel is a wrapper for all three so that the method in HTML::make_table is really easy.
906        See subsys.cgi for a more complex involvement of this!
907    
908    
909    =cut
910    
911    sub tab2excel {
912        my($self, $col_hdrs, $tab, $title, $options, $filename)=@_;
913        return "" unless ($self->{'useexcel'});
914        #return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
915        $self->{'excel_file_link'} = $self->make_excel_workbook($filename, $options);
916        $excelfilelink=$self->{'excel_file_link'};
917        $self->make_excel_worksheet($col_hdrs, $tab, $title);
918        return "" if ($options->{'no_excel_link'});
919        return $self->{'excel_file_link'};
920    }
921    
922    
923    =head1 excel_file_link
924    
925    Just returns the link to the file, if one has been created. If not, returns a non-breaking space (&nbsp;)
926    
927    =cut
928    
929    sub excel_file_link {
930        my $self=shift;
931        # I am not sure why, but this is not working. Perhaps because I am calling it from &HTML::make_table (i.e. not OO perl?)
932        #print STDERR "SELF: $self LINK: ",$self->{'excel_file_link'}," or \n$excelfilelink\n";
933        #return $self->{'excel_file_link'};
934        return $excelfilelink;
935    }
936    
937    
938    
939    =head1 make_excel_workbook
940    
941    This is the method that actually makes individual workbook. You should call this once, with the name of the file that you want it to be known by. The options are to set borders and whatnot.
942    
943    This will return the link to the workbook
944    
945    =cut
946    
947    sub make_excel_workbook {
948        my($self, $filename, $options)=@_;
949        return "" unless ($self->{'useexcel'});
950        #return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
951    
952        $filename =~ s/^.*\///; # remove any path information. We are going to only write to FIG_Config::temp
953        unless ($filename =~ /\.xls$/) {$filename .=".xls"}
954    
955        # now generate the link to return
956        my $link="<p><a href=\"".$fig->temp_url."/".$filename.'">'.$filename."</a> [Download table in Excel format].</p>\n";
957        # do we already have this file -- if so, just return that info
958        return  $link if ($self->{'excel_short_filename'} eq $filename); # don't do anything, just return the fact that we have the book made!
959    
960    
961        $self->{'excel_short_filename'}=$filename;
962        $self->{'excel_filename'}=$FIG_Config::temp."/$filename";
963    
964    
965        # Each excel file consists of the file, and then of worksheets from within the file. These are the tabs at the bottom of the screen
966        # that can be added with "Insert->new worksheet" from the menus.
967        # Create a new workbook called simple.xls and add a worksheet
968    
969        # instantiate the workbook
970        $self->{'excel_workbook'}=Spreadsheet::WriteExcel->new($self->{'excel_filename'});
971        $excelfile=$self->{'excel_workbook'}; # this is for the close on END
972        $self->{'excel_workbook'}->set_tempdir($FIG_Config::temp); # you don't have to do this, but it may speed things up and reduce memory load.
973    
974        # define the default formats
975        my $border = defined $options->{border} ? $options->{border} : 0;
976        $self->{'excel_format'}->{default}=$self->{'excel_workbook'}->add_format(border=>$border, size=>10);
977        return $link;
978    }
979    
980    
981    =head1 make_excel_worksheet()
982    
983    This is the method that makes the separate sheets in the file. You can add as many of these as you want.
984    
985    =cut
986    
987    sub make_excel_worksheet {
988        my($self, $col_hdrs, $tab, $title)=@_;
989        #return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
990        return "" unless ($self->{'useexcel'});
991    
992        unless (defined $self->{'excel_workbook'})
993        {
994            print STDERR "The workbook was not defined. Couldn't fill it in\n";
995            return;
996        }
997    
998        if (length($title) > 31) {$title=substr($title, 0, 31)}
999        my $worksheet = $self->{'excel_workbook'}->add_worksheet($title);
1000        # The general syntax for output to an excel file is write($row, $column, $value, $format). Note that row and
1001        # column are zero indexed
1002    
1003        # write the column headers
1004        # define a new format that is bold
1005        $self->{'excel_format'}->{header} = $self->{'excel_workbook'}->add_format();
1006        $self->{'excel_format'}->{header}->copy($self->{'excel_format'}->{default});
1007        $self->{'excel_format'}->{header}->set_bold();
1008    
1009        for my $i (0 .. $#$col_hdrs)
1010        {
1011            my $cell;
1012            my ($useformat, $rowspan, $colspan);
1013            if (ref($col_hdrs->[$i]) eq "ARRAY") {($cell, $useformat, $rowspan, $colspan)=$self->parse_cell($col_hdrs->[$i])}
1014            else  {$cell=$col_hdrs->[$i]}
1015            $cell=$self->clean_excel_cell($cell);
1016            $worksheet->write(0, $i, $cell, $self->{'excel_format'}->{header});
1017        }
1018    
1019        # now loop through the table and write them out. Remember to break on array refs
1020        # we are going to have to build the table col by col so we get the breaks in the right place
1021        # for merged cells
1022        my $row_idx=1;
1023        my $maxrow=$#$tab;
1024        my $skip;
1025        while ($row_idx <= $maxrow+1)
1026        {
1027            my @row=@{$tab->[$row_idx-1]};
1028            my $col_idx=0;
1029            foreach my $cell (@row)
1030            {
1031                while ($skip->{$row_idx}->{$col_idx}) {$col_idx++}
1032                my $useformat=$self->{'excel_format'}->{default};
1033    
1034                # there is an approach to setting color using \@bgcolor. Oh well.
1035                if ( $cell =~ /^\@([^:]+)\:(.*)$/ )
1036                {
1037                    $cell=[$2, $1];
1038                }
1039    
1040                my ($rowspan, $colspan);
1041                if (ref($cell) eq "ARRAY")
1042                {
1043                    ($cell, $useformat, $rowspan, $colspan)=$self->parse_cell($cell);
1044                }
1045    
1046                $cell=$self->clean_excel_cell($cell);
1047    
1048                if ($rowspan > 1 || $colspan > 1)
1049                {
1050                    # handle merged cells separately
1051                    my $row_idx_to=$row_idx+$rowspan-1;
1052                    my $col_idx_to=$col_idx+$colspan-1;
1053                    # we want to not put anything in the merged cells
1054                    for (my $x=$row_idx; $x<=$row_idx_to; $x++) {$skip->{$x}->{$col_idx}=1}
1055                    for (my $y=$col_idx; $y<=$col_idx_to; $y++) {$skip->{$row_idx}->{$y}=1}
1056    
1057                    if (ref($cell) eq "ARRAY") {$worksheet->merge_range($row_idx, $col_idx, $row_idx_to, $col_idx_to, @$cell, $useformat)}
1058                    else {$worksheet->merge_range($row_idx, $col_idx, $row_idx_to, $col_idx_to, $cell, $useformat)}
1059                }
1060                else
1061                {
1062                    # this is a botch, but in some circumstances we need to split the cell out. e.g. if it is a URL
1063                    # in this case we have a reference to an array, and we'll use  a slight modification on the process
1064                    if ((ref($cell) eq "ARRAY" && $cell->[0] eq " &nbsp; ") || $cell eq " &nbsp; ") {$worksheet->write_blank($row_idx, $col_idx, $useformat)}
1065                    else
1066                    {
1067                        if (ref($cell) eq "ARRAY") {$worksheet->write($row_idx, $col_idx, @$cell, $useformat)}
1068                        else {$worksheet->write($row_idx, $col_idx, $cell, $useformat)}
1069                    }
1070                }
1071    
1072                # increment to the next column
1073                $col_idx++;
1074            }
1075            # new line, and start of line
1076            $row_idx++;
1077            $col_idx=0;
1078        }
1079    }
1080    
1081    
1082    
1083    
1084    =head1 close_excel_file()
1085    
1086    We must explicitly close the file before creating the link so that the file is written. This is also what returns the link
1087    
1088    =cut
1089    
1090    sub close_excel_file{
1091        my ($workbook)=@_;
1092        return unless (defined $workbook);
1093        # close the workbook. this writes the files
1094        return $workbook->close();
1095    }
1096    
1097    
1098    
1099    
1100    
1101    
1102    
1103    
1104    
1105    =head2 parse_cell()
1106    
1107    A method to take the cell from the table where there is some formatting information and figure out what we know. Return the data and the format.
1108    
1109    Requires the cell and the current $format.
1110    
1111    When applied to <td> the default formats that we'll deal with at the moment are
1112         align=
1113         background-color=
1114         color=
1115         bgcolor=
1116    
1117    Colors are funky in excel because it only has a limited palette. We rename colors as needed, and then save those so that we can use them again. We're only allowed 55 colors in excel (numbered 8..63). Because its a little stupid to mess with black and white and so on, I ignore those, and also start renumbering at color number 20, giving us 43 different colors.
1118    
1119    The reference to the hash excel_color has the custom excel colors stored in it for a few colors, and others are added to it.
1120    
1121    =cut
1122    
1123    sub parse_cell {
1124        my ($self, $arr)=@_;
1125        return ($arr, $self->{'excel_format'}->{default}) unless (ref($arr) eq "ARRAY");
1126        my ($cell, $tag)=@$arr;
1127        $tag =~ s/\'/"/g; # this just makes it easier to parse the things like align='center' and align="center" that are both valid
1128    
1129        # we are going to define a series of formats that we can apply, this will have  a key that is
1130        # th.center.bgcolor.fgcolor. Then if we already have that, we can use it, if not, we'll define it
1131    
1132        my ($th, $center, $bgcolor, $fgcolor)=(0,0,0,0);
1133    
1134        if ($tag =~ /^th/) {$th=1} # it is a header cell so we should make it bold
1135        if ($tag =~ /align\=\"(.*?)\"/i) {$center=$1}
1136    
1137        # get rid of white tags because I don't care about them
1138        $tag =~ s/color\=.\#FFFFFF/ /ig;
1139    
1140        if ($tag =~ /background-color\=\"(.*?)\"/i || $tag =~ /bgcolor\=\"(.*?)\"/i)
1141        {
1142            my $color=$1;
1143            if ($color)
1144            {
1145                if (!defined $self->{'excel_color'}->{$color})
1146                {
1147    # find out the last custom color used and increment it
1148                    my $max=19; # we are not going to use a color less than 20
1149                        foreach my $k (keys %{$self->{'excel_color'}}) {($self->{'excel_color'}->{$k} > $max) ? ($max=$self->{'excel_color'}->{$k}) :1}
1150                    $max++;
1151                    $self->{'excel_color'}->{$color}=$self->{'excel_workbook'}->set_custom_color($max, $color);
1152                }
1153                $bgcolor=$self->{'excel_color'}->{$color};
1154            }
1155        }
1156        elsif ($tag =~ /color\=\"(.*?)\"/i || $tag =~ /color\=\'(.*?)\'/i)
1157        {
1158            my $color=$1;
1159            if (!defined $self->{'excel_color'}->{$color})
1160            {
1161                # find out the last custom color used and increment it
1162                my $max=19; # we are not going to use a color less than 20
1163                foreach my $k (keys %{$self->{'excel_color'}}) {($self->{'excel_color'}->{$k} > $max) ? ($max=$self->{'excel_color'}->{$k}) :1}
1164                $max++;
1165                $self->{'excel_color'}->{$color}=$self->{'excel_workbook'}->set_custom_color($max, $color);
1166            }
1167            $fgcolor=$self->{'excel_color'}->{$color};
1168        }
1169    
1170        # check and see if we span multiple rows or columns
1171        my ($rowspan, $colspan)=(1,1);
1172        if ($tag =~ /rowspan\=[\'\"]?(\d+)/) {$rowspan=$1} # these should match rowspan="4", rowspan='4', and rowspan=4
1173        if ($tag =~ /colspan\=[\'\"]?(\d+)/) {$colspan=$1}
1174    
1175        my $formatid=$th.$center.$bgcolor.$fgcolor.$rowspan.$colspan;
1176        if (!defined $self->{'excel_format'}->{$formatid})
1177        {
1178            $self->{'excel_format'}->{$formatid}=$self->{'excel_workbook'}->add_format();
1179            if ($rowspan > 1) {$self->{'excel_format'}->{$formatid}->set_align("vcenter")}
1180            else
1181            {
1182                if ($th) {$self->{'excel_format'}->{$formatid}->copy($self->{'excel_format'}->{header})}
1183                else {$self->{'excel_format'}->{$formatid}->copy($self->{'excel_format'}->{default})}
1184            }
1185            $center && $self->{'excel_format'}->{$formatid}->set_align($center);
1186            $bgcolor && $self->{'excel_format'}->{$formatid}->set_bg_color($bgcolor);
1187            $fgcolor && $self->{'excel_format'}->{$formatid}->set_color($fgcolor);
1188        }
1189    
1190        return ($cell, $self->{'excel_format'}->{$formatid}, $rowspan, $colspan);
1191    }
1192    
1193    
1194    =head1 clean_excel_cell
1195    
1196    Process the cells to remove &nbsp; and also convert relative URLs to full URLs
1197    
1198    =cut
1199    
1200    sub clean_excel_cell {
1201        my ($self, $cell)=@_;
1202        if ($cell =~ /^\s*\&nbsp\;\s*$/) {$cell=undef} # ignore white space
1203    
1204        # some cells have something like this:
1205        # <a  onMouseover="javascript:if(!this.tooltip) this.tooltip=new Popup_Tooltip(this,'Role of BCAT','Branched-chain amino acid aminotransferase (EC 2.6.1.42)','','','','');this.tooltip.addHandler(); return false;" >BCAT</a>
1206        # we don't want those, but we do want the ones that have a real url hidden here.
1207        # so remove the mouseover part, and then see what is left
1208        if ($cell =~ s/onMouseover\=\".*?\"//)
1209        {
1210            if ($cell =~ s/\<a\s+>//i) {$cell =~ s/\<\/a>//i}
1211        }
1212    
1213        if ($cell =~ /\<a href=.(.*?).>(.*)<\/a>/)
1214        {
1215            # this is tricky because if the cell is a url then we need two separate things, the url and the link name
1216            my ($url, $link)=($1, $2);
1217            $url =~ s/^\.{1,2}\///; # remove notation of ./ and ../
1218            unless ($url =~ /^http/) {$url=$FIG_Config::cgi_url."/$url"}
1219            # this sucks as excel can only handle one link per cell, so we remove the other links. At the moment users will have to deal with that.
1220            $link =~ s/\<.*?\>//g;
1221            $cell=[$url, $link];
1222        }
1223        elsif ($cell =~ /<input type/)
1224        {
1225            if ($cell =~ /value='(.*?)'/) {$cell = $1}
1226            elsif ($cell =~ /value="(.*?)"/) {$cell = $1}
1227        }
1228        else
1229        {
1230            # this is all the html that I don't know what to do with, like <input type=...>
1231            $cell =~ s/\<.*?\>//g;
1232        }
1233        return $cell;
1234    }
1235    
1236    
1237    
1238    
1239  1;  1;
1240    

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.42

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3