[Bio] / FigKernelPackages / raelib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/raelib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.7, Sun Apr 3 02:15:15 2005 UTC revision 1.39, Tue Jun 20 03:34:43 2006 UTC
# Line 1  Line 1 
1    #
2    # Copyright (c) 2003-2006 University of Chicago and Fellowship
3    # for Interpretations of Genomes. All Rights Reserved.
4    #
5    # This file is part of the SEED Toolkit.
6    #
7    # The SEED Toolkit is free software. You can redistribute
8    # it and/or modify it under the terms of the SEED Toolkit
9    # Public License.
10    #
11    # You should have received a copy of the SEED Toolkit Public License
12    # along with this program; if not write to the University of Chicago
13    # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14    # Genomes at veronika@thefig.info or download a copy from
15    # http://www.theseed.org/LICENSE.TXT.
16    #
17    
18  # -*- perl -*-  # -*- perl -*-
19    
20  =pod  =pod
21    
22  =head1  =head1 RAE Library
23    
24   Some routines and things that Rob uses. Please feel free to use at will and incorporate into   Some routines and things that Rob uses. Please feel free to use at will and incorporate into
25   your own code or move them into FIG.pm or elsewhere.   your own code or move them into FIG.pm or elsewhere.
26    
27     For questions about this email RobE@theFIG.info
28    
29  =cut  =cut
30    
31  package raelib;  package raelib;
32  use strict;  use strict;
33    use Bio::SeqIO;
34    use Bio::Seq;
35    use Bio::Tools::SeqStats;
36    use Bio::SeqFeature::Generic;
37    
38    # we don't know whether the Spreadsheet::WriteExcel methods are available on all systems, and even on the CI systems they are currently in my shared directory
39    # so we use an eval and set the boolean if we are cool.
40    my $useexcel;
41    my $excelfile;
42    my $excelfilelink="";
43    BEGIN {
44        use lib '/home/seed/Rob/perl/lib/perl5/site_perl/5.8.5/';
45        eval "use Spreadsheet::WriteExcel";
46        unless ($@) {$useexcel=1}
47    }
48    
49    END {
50        my $self=shift;
51        if ($useexcel && $excelfile) {&close_excel_file($excelfile)}
52    }
53    
54    
55  use FIG;  use FIG;
56  my $fig=new FIG;  my $fig=new FIG;
57    
# Line 21  Line 62 
62  =cut  =cut
63    
64  sub new {  sub new {
65   my $self=shift;   my ($class)=@_;
66   return $self   my $self={};
67     $self->{'useexcel'}=1 if ($useexcel);
68     return bless $self, $class;
69  }  }
70    
71    
# Line 63  Line 106 
106  }  }
107    
108    
109    =head2 mol_wt
110    
111    Calculate the molecular weight of a protein.
112    
113    This just offlaods the calculation to BioPerl, which is probably dumb since we need to load the whole of bioperl in just for this, but I don't have time to rewrite their method now, and I am not going to copy and paste it since I didn't write it :)
114    
115    my ($lower, $upper)=$raelib->mol_wt($peg);
116    
117    $lower is the lower bound for the possible mw, upper is the upper bound.
118    
119    =cut
120    
121    sub mol_wt {
122        my ($self, $fid)=@_;
123        my $sobj=Bio::Seq->new(-seq => $fig->get_translation($fid), -id => $fid);
124        return Bio::Tools::SeqStats->get_mol_wt($sobj);
125    }
126    
127    
128  =head2 pirsfcorrespondence  =head2 pirsfcorrespondence
129    
130   Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written.  Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written. This is so that we can easily go back and forth.
  This is so that we can easily go back and forth.  
131    
132   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat
133    
134    This method takes three arguments:
135       from    : pirsfinfo.dat file
136       to      : file to write information to
137       verbose : report on progress
138    
139    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
140    
141    Returns the number of lines in the pirsinfo file that were read.
142    
143  =cut  =cut
144    
145  sub pirsfcorrespondence {  sub pirsfcorrespondence {
146   my ($self, $from, $to)=@_;   my ($self, $from, $to, $verbose)=@_;
147   die "File $from does not exist as called in $0" unless (-e $from);   unless (-e $from) {
148      print STDERR "File $from does not exist as called in $0\n";
149      return 0;
150     }
151     if ($from =~ /\.gz$/) {
152      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
153     }
154     else {
155   open (IN, $from) || die "Can't open $from";   open (IN, $from) || die "Can't open $from";
156   open (OUT, ">$to") || die "Can't write tot $to";   }
157     open (OUT, ">$to") || die "Can't write to $to";
158     my $linecount;
159   while (<IN>) {   while (<IN>) {
160      $linecount++;
161      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
162    if (/^>/) {print OUT; next}    if (/^>/) {print OUT; next}
163    chomp;    chomp;
164    my $done;    foreach my $peg ($self->swiss_pir_ids($_)) {
   foreach my $peg ($fig->by_alias("uni|$_")) {  
165     print OUT $_, "\t", $peg, "\n";     print OUT $_, "\t", $peg, "\n";
    $done=1;  
166    }    }
   unless ($done) {print OUT $_, "\t\n"}  
167   }   }
168   close IN;   close IN;
169   close OUT;   close OUT;
170     return $linecount;
171    }
172    
173    =head2 uniprotcorrespondence
174    
175    Generate a correspondence table between uniprot knowledge base IDs and FIG ID's.
176    
177    The uniprot KB file is in the form:  UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name
178    
179     This method takes three arguments:
180       from    : uniprotKB file
181       to      : file to write information to
182       verbose : report on progress
183    
184    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
185    
186     Returns the number of lines in the uniprotkb file that were read.
187    
188    =cut
189    
190    sub uniprotcorrespondence {
191     my ($self, $from, $to, $verbose)=@_;
192     unless (-e $from) {
193      print STDERR "File $from does not exist as called in $0\n";
194      return 0;
195     }
196     if ($from =~ /\.gz$/) {
197      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
198     }
199     else {
200      open (IN, $from) || die "Can't open $from";
201     }
202     open (OUT, ">$to") || die "Can't write to $to";
203     my $linecount;
204     while (<IN>) {
205      chomp;
206      $linecount++;
207      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
208      my @line=split /\s+\|\s+/;
209      my $added;
210      foreach my $peg ($self->swiss_pir_ids($line[0])) {
211       print OUT "$_ | $peg\n";
212       $added=1;
213      }
214      unless ($added) {print OUT "$_\n"}
215     }
216     close IN;
217     close OUT;
218     return $linecount;
219  }  }
220    
221    =head2 prositecorrespondence
222    
223    Generate a correspondence table between prosite and seed using sp id's and seed ids.
224    
225    The SwissProt prosite file is from ftp://ca.expasy.org/databases/prosite/release_with_updates/prosite.dat and is in horrible swiss prot format, so we'll parse out those things that we need and put them in the file
226    
227    The output file will have the following columns:
228    
229    prosite family accession number, prosite family name, family type, swiss-prot protein id, fig protein id.
230    
231    The family type is one of rule, pattern, or matrix. Right now (Prosite Release 19.2 of 24-May-2005) there are 4 rules, 1322 patterns, and 521 matrices.
232    
233     This method takes three arguments:
234       from    : prosite file
235       to      : file to write information to
236       verbose : report on progress
237    
238    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
239    
240     Returns the number of lines in the prosite file that were read.
241    
242    =cut
243    
244    sub prositecorrespondence {
245     my ($self, $from, $to, $verbose)=@_;
246     unless (-e $from) {
247      print STDERR "File $from does not exist as called in $0\n";
248      return 0;
249     }
250     if ($from =~ /\.gz$/) {
251      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
252     }
253     else {
254      open (IN, $from) || die "Can't open $from";
255     }
256     open (OUT, ">$to") || die "Can't write to $to";
257     my $linecount;
258     my ($famac, $famname, $famtype)=('','','');
259     while (<IN>) {
260      chomp;
261      $linecount++;
262      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
263      if (m#//#) {($famac, $famname, $famtype)=('','',''); next}
264      elsif (m/^ID\s*(.*?);\s*(\S+)/) {($famname, $famtype)=($1, $2); next}
265      elsif (m/^AC\s*(\S+)/) {$famac=$1; $famac =~ s/\;\s*$//; next}
266      next unless (m/^DR/); # ignore all the other crap in the prosite file for now. Note we might, at some point, want to grab all that, but that is for another time.
267      #
268      # this is the format of the DR lines:
269      # DR   P11460, FATB_VIBAN , T; P40409, FEUA_BACSU , T; P37580, FHUD_BACSU , T;
270      s/^DR\s*//;
271      foreach my $piece (split /\s*\;\s*/, $_) {
272       my ($acc, $nam, $unk)=split /\s*\,\s*/, $piece;
273       foreach my $fig ($self->swiss_pir_ids($acc)) {
274        print OUT join "\t", ($famac, $famname, $famtype, $acc, $fig), "\n";
275       }
276      }
277     }
278    }
279    
280    =head2 swiss_pir_ids()
281    
282    SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's, and return an array of FIG id's that match the ID.
283    
284    =cut
285    
286    sub swiss_pir_ids {
287     my ($self, $id)=@_;
288     return () unless ($id);
289     $id =~ s/^\s+//; $id =~ s/\s+$//; # trim off the whitespace
290    
291     my @return=($fig->by_alias("uni|$id"));
292     return @return if ($return[0]);
293    
294     @return=($fig->by_alias("tr|$id"));
295     return @return if ($return[0]);
296    
297     @return=($fig->by_alias("sp|$id"));
298     return @return if ($return[0]);
299    
300     return ();
301    }
302    
303  =head2 ss_by_id  =head2 ss_by_id
304    
# Line 173  Line 376 
376   my @return;   my @return;
377   my @attr=$fig->feature_attributes($peg);   my @attr=$fig->feature_attributes($peg);
378   foreach my $attr (@attr) {   foreach my $attr (@attr) {
379    my ($gottag, $val, $link)=@$attr;    my ($gotpeg, $gottag, $val, $link)=@$attr;
380    push @return, $val if ($gottag eq $tag);    push @return, $val if ($gottag eq $tag);
381   }   }
382   return wantarray ? @return : join "; ", @return;   return wantarray ? @return : join "; ", @return;
# Line 230  Line 433 
433    
434  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.
435    
436  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple);  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple, $default, $limit);
437    
438  multiple selections will only be set if $multiple is true  multiple selections will only be set if $multiple is true
439    
440    default will set a default to override (maybe) korgs
441    
442    limit is a reference to an array of organism IDs that you want to limit the list to.
443    
444  =cut  =cut
445    
446  sub scrolling_org_list {  sub scrolling_org_list {
447   my ($self, $cgi, $multiple)=@_;   my ($self, $cgi, $multiple, $default, $limit)=@_;
448   unless ($multiple) {$multiple=0}   unless ($multiple) {$multiple=0}
449    
450   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );
# Line 285  Line 492 
492   my $orgs; my $label;   my $orgs; my $label;
493   @$orgs =  $fig->genomes( $complete, undef, $canonical{ $req_dom } );   @$orgs =  $fig->genomes( $complete, undef, $canonical{ $req_dom } );
494    
495     # limit the list of organisms to a selected few if required
496     if ($limit)
497     {
498        my %lim=map {($_=>1)} @$limit;
499        my $norg;
500        foreach my $o (@$orgs) {push @$norg, $o if ($lim{$o})}
501        $orgs=$norg;
502    }
503    
504   foreach (@$orgs) {   foreach (@$orgs) {
505     my $gs = $fig->genus_species($_);     my $gs = $fig->genus_species($_);
506     my $gc=scalar $fig->all_contigs($_);     if ($fig->genome_domain($_) ne "Environmental Sample")
507       {
508        my $gc=$fig->number_of_contigs($_);
509     $label->{$_} = "$gs ($_) [$gc contigs]";     $label->{$_} = "$gs ($_) [$gc contigs]";
510    }    }
511       else
512       {
513        $label->{$_} = "$gs ($_) ";
514       }
515      }
516    
517   @$orgs = sort {$label->{$a} cmp $label->{$b}} @$orgs;   @$orgs = sort {$label->{$a} cmp $label->{$b}} @$orgs;
518    
# Line 303  Line 526 
526                                          -labels   => $label,                                          -labels   => $label,
527                                          -size     => 10,                                          -size     => 10,
528                                          -multiple => $multiple,                                          -multiple => $multiple,
529                                            -default  => $default,
530                                        ), $cgi->br,                                        ), $cgi->br,
531                    "$n_genomes genomes shown ",                    "$n_genomes genomes shown ",
532                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,
# Line 313  Line 537 
537                    "</TD>",                    "</TD>",
538                    "   </TR>\n",                    "   </TR>\n",
539                    "</TABLE>\n",                    "</TABLE>\n",
                   $cgi->hr  
540          );          );
541  }  }
542    
543    
544    =head2 scrolling_subsys_list
545    
546    Create a scrolling list of all subsystems. Just like scrolling_org_list, this will make the list and allow you to select multiples.
547    
548    use like this
549    
550    push @$html, $raelib->scrolling_subsys_list($cgi, $multiple);
551    
552    =cut
553    
554    sub scrolling_subsys_list {
555     my ($self, $cgi, $multiple)=@_;
556     $multiple=0 unless (defined $multiple);
557     my @ss=sort {uc($a) cmp uc($b)} $fig->all_subsystems();
558     my $label;
559     # generate labels for the list
560     foreach my $s (@ss) {my $k=$s; $k =~ s/\_/ /g; $k =~ s/  / /g; $k =~ s/\s+$//; $label->{$s}=$k}
561     return $cgi->scrolling_list(
562      -name    => 'subsystems',
563      -values  => \@ss,
564      -labels  => $label,
565      -size    => 10,
566      -multiple=> $multiple,
567     );
568    }
569    
570    =head2 subsys_names_for_display
571    
572    Return a list of subsystem names for display. This will take a list as an argument and return a nice clean list for display.
573    
574    $raelib->subsys_names_for_display(@ss);
575    or
576    $raelib->subsys_names_for_display($fig->all_subsystems());
577    
578    =cut
579    
580    sub subsys_names_for_display {
581     my ($self, @ss)=@_;
582     foreach (@ss) {s/\_/ /g; 1 while (s/  / /g); s/\s+$//}
583     return @ss;
584    }
585    
586    =head2 GenBank
587    
588     This object will take a genome number and return a Bio::Seq::RichSeq object that has the whole genome
589     in GenBank format. This should be a nice way of getting some data out, but will probably be quite slow
590     at building the object.
591    
592     Note that you need to call this with the genome name and the contig. This will then go through that contig.
593    
594     Something like this should work
595    
596     foreach my $contig ($fig->all_contigs($genome)) {
597      my $seqobj=FIGRob->GenBank($genome, $contig);
598      # process the contig
599     }
600    
601    =cut
602    
603    sub GenBank {
604     my ($self, $genome, $contig)=@_;
605     my $gs=$fig->genus_species($genome);
606     return unless ($gs);
607     unless ($contig) {
608      print STDERR "You didn't provide a contig for $gs. I think that was a mistake. Sorry\n";
609      return;
610     }
611     my $len=$fig->contig_ln($genome, $contig);
612     unless ($len) {
613      print STDERR "$contig from $gs doesn't appear to have a length. Is it right?\n";
614      return;
615     }
616    
617    
618     # first find all the pegs ...
619     my $features; # all the features in the genome
620     my $allpegs; # all the pegs
621     my $translation; # all the protein sequences
622     foreach my $peg ($fig->pegs_of($genome)) {
623      my @location=$fig->feature_location($peg);
624      my $func=$fig->function_of($peg);
625      foreach my $loc (@location) {
626       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
627       my ($cg, $start, $stop)=($1, $2, $3);
628       next unless ($cg eq $contig);
629       # save this information for later
630       $features->{'peg'}->{$loc}=$func;
631       $allpegs->{'peg'}->{$loc}=$peg;
632       $translation->{'peg'}->{$loc}=$fig->get_translation($peg);
633      }
634     }
635     # ... and all the RNAs
636     foreach my $peg ($fig->rnas_of($genome)) {
637      my @location=$fig->feature_location($peg);
638      my $func=$fig->function_of($peg);
639      foreach my $loc (@location) {
640       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
641       my ($cg, $start, $stop)=($1, $2, $3);
642       next unless ($cg eq $contig);
643       # save this information for later
644       $features->{'rna'}->{$loc}=$func;
645       $allpegs->{'rna'}->{$loc}=$peg;
646      }
647     }
648    
649    
650     # now get all the contigs out
651     my $seq=$fig->dna_seq($genome, $contig."_1_".$len);
652     my $description = "Contig $contig from " . $fig->genus_species($genome);
653     my $sobj=Bio::Seq->new(
654              -seq              =>  $seq,
655              -id               =>  $contig,
656              -desc             =>  $description,
657              -accession_number =>  $genome
658              );
659     foreach my $prot (keys %{$features->{'peg'}}) {
660       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
661       my ($cg, $start, $stop)=($1, $2, $3);
662       my $strand=1;
663       if ($stop < $start) {
664        ($stop, $start)=($start, $stop);
665        $strand=-1;
666     }
667    
668     my $feat=Bio::SeqFeature::Generic->new(
669            -start         =>  $start,
670            -end           =>  $stop,
671            -strand        =>  $strand,
672            -primary       =>  'CDS',
673            -display_name  =>  $allpegs->{'peg'}->{$prot},
674            -source_tag    =>  'the SEED',
675            -tag           =>
676                           {
677                           db_xref     =>   $allpegs->{'peg'}->{$prot},
678                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
679                           function    =>  $features->{'peg'}->{$prot},
680                           translation =>  $translation->{'peg'}->{$prot}
681                          }
682           );
683    
684       $sobj->add_SeqFeature($feat);
685     }
686    
687     foreach my $prot (keys %{$features->{'rna'}}) {
688       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
689       my ($cg, $start, $stop)=($1, $2, $3);
690       my $strand=1;
691       if ($stop < $start) {
692        ($stop, $start)=($start, $stop);
693        $strand=-1;
694       }
695    
696       my $feat=Bio::SeqFeature::Generic->new(
697            -start         =>  $start,
698            -end           =>  $stop,
699            -strand        =>  $strand,
700            -primary       =>  'RNA',
701            -source_tag    =>  'the SEED',
702            -display_name  =>  $allpegs->{'rna'}->{$prot},
703            -tag           =>
704                          {
705                           db_xref     =>   $allpegs->{'rna'}->{$prot},
706                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
707                           function    =>  $features->{'rna'}->{$prot},
708                          }
709           );
710    
711      $sobj->add_SeqFeature($feat);
712     }
713     return $sobj;
714    }
715    
716    =head2 best_hit
717    
718     Returns the FIG id of the single best hit to a peg
719    
720     eg
721    
722     my $bh=$fr->best_hit($peg);
723     print 'function is ', scalar $fig->function_of($bh);
724    
725    =cut
726    
727    sub best_hit {
728     my ($self, $peg)=@_;
729     return unless ($peg);
730    
731     my ($maxN, $maxP)=(1, 1e-5);
732     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
733     return ${$sims[0]}[1];
734    }
735    
736    
737    =head1 read_fasta
738    
739    Read a fasta format file and return a reference to a hash with the data. The key is the ID and the value is the sequence. If you supply the optional keep comments then the comments (anything after the first white space are returned as a sepaarte hash).
740    
741    Usage:
742    my $fasta=$raelib->read_fasta($file);
743    my ($fasta, $comments)=$raelib->read_fasta($file, 1);
744    
745    =cut
746    
747    sub read_fasta {
748     my ($self, $file, $keepcomments)=@_;
749     open (IN, $file) || die "Can't open $file";
750     my %f; my $t; my $s; my %c;
751     while (<IN>) {
752      chomp;
753      if (/^>/) {
754       if ($s) {
755        $f{$t}=$s;
756        undef $s;
757       }
758       s/^>(\S+)\s*//;
759       $t=$1;
760       $c{$t}=$_ if ($_);
761      }
762      else {$s .= $_}
763     }
764     $f{$t}=$s;
765     if ($keepcomments) {return (\%f, \%c)}
766     else {return \%f}
767    }
768    
769    =head1 rc
770    
771    Reverse complement. It's too easy.
772    
773    =cut
774    
775    sub rc {
776     my ($self, $seq)=@_;
777     $seq=~tr/GATCgatc/CTAGctag/;
778     $seq = reverse $seq;
779     return $seq;
780    }
781    
782    
783    =head2 cookies
784    
785    Handle cookies. This method will get and set the value of the FIG cookie. Cookies are name/value pairs that are stored on the users computer. We then retrieve them using this method. The cookies are passed in as a reference to a hash, and the method returns a tuple of the cookie that can be passed to the browser and a reference to a hash with the data.
786    
787    If you do not pass any arguments the whole cookie will be returned.
788    
789    Use as follows:
790    
791    ($cookie, $data) = raelib->cookie($cgi, \%data);
792    
793    You do not need to pass in any data, in that case you will just get the cookie back
794    
795    Underneath, I create a single cookie called FIG which stores all the information. The names and value pairs are stored using = to join name to value and ; to concatenate. This way we can create a single cookie with all the data. I am using the FIG::clean_attribute_key method to remove unwanted characters from the name/value pairs, so don't use them.
796    
797    Note that for the moment I have put this routine here since it needs to maintain the state of the cookie (i.e. it needs to know what $self is). It should really be in HTML.pm but that is not, as far as I can tell, maintaining states?
798    
799    =cut
800    
801    sub cookie {
802     my ($self, $cgi, $input)=@_;
803     return unless ($cgi);
804     $self->{'cookie'}=$cgi->cookie(-name=>"FIG") unless ($self->{'cookie'});
805    
806     # first, create a hash from the existing cookie data
807     my $cookie;
808     map {
809      my ($kname, $kvalue)=split /\=/, $_;
810      $cookie->{$kname}=$kvalue;
811     } split /\;/, $self->{'cookie'};
812    
813     if ($input)
814     {
815      # add the values that were passed in
816      map {$cookie->{FIG->clean_attribute_key($_)}=$input->{$_}} keys %$input;
817      # put everything back together and set the cookie
818      my $newcookie=join ";", map {$_ . "=" . $cookie->{$_}} keys %$cookie;
819      $self->{'cookie'}=$cgi->cookie(-name=>"FIG", -value=>$newcookie, -expires=>'+1y');
820     }
821    
822     return ($self->{'cookie'}, $cookie);
823    }
824    
825    
826    =head1 is_number
827    
828    returns 1 if the argument is a number, and 0 if not. This is taken directly from the perl cookbook.
829    
830    =cut
831    
832    sub is_number {
833        my ($self, $no)=@_;
834        return 1 if ($no =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/); # Perl cookbook, page 44
835        return 0;
836    }
837    
838    
839    
840    =head1 commify
841    
842    Put commas in numbers. I think this comes straight from the perl cookbook and is very useful for nice displays
843    
844    =cut
845    
846    sub commify {
847        my($self,$n) = @_;
848        my(@n) = ();
849        my($i);
850    
851        for ($i = (length($n) - 3); ($i > 0); $i -= 3)
852        {
853            unshift(@n,",",substr($n,$i,3));
854        }
855        unshift(@n,substr($n,0,$i+3));
856        return join("",@n);
857    }
858    
859    
860    =head1 tab2excel
861    
862    This is experimental as of May, 2006.
863    
864    There are a couple of perl modules that allow you to write to excel files, and so I am trying out the idea of taking our standard $tab table respresentation that is used in HTML.pm and making an excel file that people could download. It seems like that would be a great tool for them to have.
865    
866    At the moment the excel modules are in my shared space on the CI machines, and so won't work in every seed installation. Therefore the $self->{'useexcel'} boolean is set at compile time if we successfully load the module.
867    
868    The issues are:
869        1. creating the excel file
870        2. reading through @$tab and presenting the data
871        3. Checking @$tab because each element can be a reference to an array with color or formatting information
872    
873    Formatting
874    
875    A separate set of formats must be created for each color and font combination since the formats are applied at the end of the processing of the file.
876    
877    
878    Usage:
879    
880        The recommended way of using this set of modules is to add the options excelfile=>"filename" to the options passed to &HTML::make_table. That should take care of EVERYTHING for you, so you should do that. You can call thinks separetly if you like, but I don't recommend it.
881    
882        Note that you can make multiple calls to the same excel file,a nd each one will get added as a new sheet.
883    
884        Note the usage is ALMOST the same as make_table, but not quite. First, options is a reference to a hash rather than the hash itself
885        and second, the additional option "filename" that is the filename to be written;
886    
887        $url = $raelib->tab2excel($col_hdrs, $tab, $title, $options, "filename");
888    
889        The filename will be created in $FIG_Config::temp. The extension .xls will be added to the filename if it is not present.
890    
891    Returns:
892        A link to the file in the format
893            <p><a href="...">filename</a> [Download Excel file]</p>
894    
895    Note that there are four separate methods:
896        1. tab2excel is the method for a single call from HTML::make_table
897            this will make an excel file, fill it, and return the link;
898        2. make_excel_workbook is the method that instantiates a file
899        3. make_excel_worksheet is the method that actually populates the file
900            this loads all the data into the excel file, but if you know what you are doing you can call this many times,
901            each with a different spreadsheet
902        4. close_excel_file
903            this closes the file and writes it. This is called in the END block, so you do not have to explicitly call it here.
904    
905        tab2excel is a wrapper for all three so that the method in HTML::make_table is really easy.
906        See subsys.cgi for a more complex involvement of this!
907    
908    
909    =cut
910    
911    sub tab2excel {
912        my($self, $col_hdrs, $tab, $title, $options, $filename)=@_;
913        return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
914        $self->{'excel_file_link'} = $self->make_excel_workbook($filename, $options);
915        $excelfilelink=$self->{'excel_file_link'};
916        $self->make_excel_worksheet($col_hdrs, $tab, $title);
917        return "" if ($options->{'no_excel_link'});
918        return $self->{'excel_file_link'};
919    }
920    
921    
922    =head1 excel_file_link
923    
924    Just returns the link to the file, if one has been created. If not, returns a non-breaking space (&nbsp;)
925    
926    =cut
927    
928    sub excel_file_link {
929        my $self=shift;
930        # I am not sure why, but this is not working. Perhaps because I am calling it from &HTML::make_table (i.e. not OO perl?)
931        #print STDERR "SELF: $self LINK: ",$self->{'excel_file_link'}," or \n$excelfilelink\n";
932        #return $self->{'excel_file_link'};
933        return $excelfilelink;
934    }
935    
936    
937    
938    =head1 make_excel_workbook
939    
940    This is the method that actually makes individual workbook. You should call this once, with the name of the file that you want it to be known by. The options are to set borders and whatnot.
941    
942    This will return the link to the workbook
943    
944    =cut
945    
946    sub make_excel_workbook {
947        my($self, $filename, $options)=@_;
948        return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
949    
950        $filename =~ s/^.*\///; # remove any path information. We are going to only write to FIG_Config::temp
951        unless ($filename =~ /\.xls$/) {$filename .=".xls"}
952    
953        # now generate the link to return
954        my $link="<p><a href=\"".$fig->temp_url."/".$filename.'">'.$filename."</a> [Download table in Excel format].</p>\n";
955        # do we already have this file -- if so, just return that info
956        return  $link if ($self->{'excel_short_filename'} eq $filename); # don't do anything, just return the fact that we have the book made!
957    
958    
959        $self->{'excel_short_filename'}=$filename;
960        $self->{'excel_filename'}=$FIG_Config::temp."/$filename";
961    
962    
963        # Each excel file consists of the file, and then of worksheets from within the file. These are the tabs at the bottom of the screen
964        # that can be added with "Insert->new worksheet" from the menus.
965        # Create a new workbook called simple.xls and add a worksheet
966    
967        # instantiate the workbook
968        $self->{'excel_workbook'}=Spreadsheet::WriteExcel->new($self->{'excel_filename'});
969        $excelfile=$self->{'excel_workbook'}; # this is for the close on END
970        $self->{'excel_workbook'}->set_tempdir($FIG_Config::temp); # you don't have to do this, but it may speed things up and reduce memory load.
971    
972        # define the default formats
973        my $border = defined $options->{border} ? $options->{border} : 0;
974        $self->{'excel_format'}->{default}=$self->{'excel_workbook'}->add_format(border=>$border, size=>10);
975        return $link;
976    }
977    
978    
979    =head1 make_excel_worksheet()
980    
981    This is the method that makes the separate sheets in the file. You can add as many of these as you want.
982    
983    =cut
984    
985    sub make_excel_worksheet {
986        my($self, $col_hdrs, $tab, $title)=@_;
987        return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
988        unless (defined $self->{'excel_workbook'})
989        {
990            print STDERR "The workbook was not defined. Couldn't fill it in\n";
991            return;
992        }
993    
994        if (length($title) > 31) {$title=substr($title, 0, 31)}
995        my $worksheet = $self->{'excel_workbook'}->add_worksheet($title);
996        # The general syntax for output to an excel file is write($row, $column, $value, $format). Note that row and
997        # column are zero indexed
998    
999        # write the column headers
1000        # define a new format that is bold
1001        $self->{'excel_format'}->{header} = $self->{'excel_workbook'}->add_format();
1002        $self->{'excel_format'}->{header}->copy($self->{'excel_format'}->{default});
1003        $self->{'excel_format'}->{header}->set_bold();
1004    
1005        for my $i (0 .. $#$col_hdrs)
1006        {
1007            my $cell;
1008            my ($useformat, $rowspan, $colspan);
1009            if (ref($col_hdrs->[$i]) eq "ARRAY") {($cell, $useformat, $rowspan, $colspan)=$self->parse_cell($col_hdrs->[$i])}
1010            else  {$cell=$col_hdrs->[$i]}
1011            my $cell=$self->clean_excel_cell($cell);
1012            $worksheet->write(0, $i, $cell, $self->{'excel_format'}->{header});
1013        }
1014    
1015        # now loop through the table and write them out. Remember to break on array refs
1016        # we are going to have to build the table col by col so we get the breaks in the right place
1017        # for merged cells
1018        my $row_idx=1;
1019        my $maxrow=$#$tab;
1020        my $skip;
1021        while ($row_idx <= $maxrow+1)
1022        {
1023            my @row=@{$tab->[$row_idx-1]};
1024            my $col_idx=0;
1025            foreach my $cell (@row)
1026            {
1027                while ($skip->{$row_idx}->{$col_idx}) {$col_idx++}
1028                my $useformat=$self->{'excel_format'}->{default};
1029    
1030                # there is an approach to setting color using \@bgcolor. Oh well.
1031                if ( $cell =~ /^\@([^:]+)\:(.*)$/ )
1032                {
1033                    $cell=[$2, $1];
1034                }
1035    
1036                my ($rowspan, $colspan);
1037                if (ref($cell) eq "ARRAY")
1038                {
1039                    ($cell, $useformat, $rowspan, $colspan)=$self->parse_cell($cell);
1040                }
1041    
1042                $cell=$self->clean_excel_cell($cell);
1043    
1044                if ($rowspan > 1 || $colspan > 1)
1045                {
1046                    # handle merged cells separately
1047                    my $row_idx_to=$row_idx+$rowspan-1;
1048                    my $col_idx_to=$col_idx+$colspan-1;
1049                    # we want to not put anything in the merged cells
1050                    for (my $x=$row_idx; $x<=$row_idx_to; $x++) {$skip->{$x}->{$col_idx}=1}
1051                    for (my $y=$col_idx; $y<=$col_idx_to; $y++) {$skip->{$row_idx}->{$y}=1}
1052    
1053                    if (ref($cell) eq "ARRAY") {$worksheet->merge_range($row_idx, $col_idx, $row_idx_to, $col_idx_to, @$cell, $useformat)}
1054                    else {$worksheet->merge_range($row_idx, $col_idx, $row_idx_to, $col_idx_to, $cell, $useformat)}
1055                }
1056                else
1057                {
1058                    # this is a botch, but in some circumstances we need to split the cell out. e.g. if it is a URL
1059                    # in this case we have a reference to an array, and we'll use  a slight modification on the process
1060                    if ((ref($cell) eq "ARRAY" && $cell->[0] eq " &nbsp; ") || $cell eq " &nbsp; ") {$worksheet->write_blank($row_idx, $col_idx, $useformat)}
1061                    else
1062                    {
1063                        if (ref($cell) eq "ARRAY") {$worksheet->write($row_idx, $col_idx, @$cell, $useformat)}
1064                        else {$worksheet->write($row_idx, $col_idx, $cell, $useformat)}
1065                    }
1066                }
1067    
1068                # increment to the next column
1069                $col_idx++;
1070            }
1071            # new line, and start of line
1072            $row_idx++;
1073            $col_idx=0;
1074        }
1075    }
1076    
1077    
1078    
1079    
1080    =head1 close_excel_file()
1081    
1082    We must explicitly close the file before creating the link so that the file is written. This is also what returns the link
1083    
1084    =cut
1085    
1086    sub close_excel_file{
1087        my ($workbook)=@_;
1088        return unless (defined $workbook);
1089        # close the workbook. this writes the files
1090        return $workbook->close();
1091    }
1092    
1093    
1094    
1095    
1096    
1097    
1098    
1099    
1100    
1101    =head2 parse_cell()
1102    
1103    A method to take the cell from the table where there is some formatting information and figure out what we know. Return the data and the format.
1104    
1105    Requires the cell and the current $format.
1106    
1107    When applied to <td> the default formats that we'll deal with at the moment are
1108         align=
1109         background-color=
1110         color=
1111         bgcolor=
1112    
1113    Colors are funky in excel because it only has a limited palette. We rename colors as needed, and then save those so that we can use them again. We're only allowed 55 colors in excel (numbered 8..63). Because its a little stupid to mess with black and white and so on, I ignore those, and also start renumbering at color number 20, giving us 43 different colors.
1114    
1115    The reference to the hash excel_color has the custom excel colors stored in it for a few colors, and others are added to it.
1116    
1117    =cut
1118    
1119    sub parse_cell {
1120        my ($self, $arr)=@_;
1121        return ($arr, $self->{'excel_format'}->{default}) unless (ref($arr) eq "ARRAY");
1122        my ($cell, $tag)=@$arr;
1123        $tag =~ s/\'/"/g; # this just makes it easier to parse the things like align='center' and align="center" that are both valid
1124    
1125        # we are going to define a series of formats that we can apply, this will have  a key that is
1126        # th.center.bgcolor.fgcolor. Then if we already have that, we can use it, if not, we'll define it
1127    
1128        my ($th, $center, $bgcolor, $fgcolor)=(0,0,0,0);
1129    
1130        if ($tag =~ /^th/) {$th=1} # it is a header cell so we should make it bold
1131        if ($tag =~ /align\=\"(.*?)\"/i) {$center=$1}
1132    
1133        # get rid of white tags because I don't care about them
1134        $tag =~ s/color\=.\#FFFFFF/ /ig;
1135    
1136        if ($tag =~ /background-color\=\"(.*?)\"/i || $tag =~ /bgcolor\=\"(.*?)\"/i)
1137        {
1138            my $color=$1;
1139            if ($color)
1140            {
1141                if (!defined $self->{'excel_color'}->{$color})
1142                {
1143    # find out the last custom color used and increment it
1144                    my $max=19; # we are not going to use a color less than 20
1145                        foreach my $k (keys %{$self->{'excel_color'}}) {($self->{'excel_color'}->{$k} > $max) ? ($max=$self->{'excel_color'}->{$k}) :1}
1146                    $max++;
1147                    $self->{'excel_color'}->{$color}=$self->{'excel_workbook'}->set_custom_color($max, $color);
1148                }
1149                $bgcolor=$self->{'excel_color'}->{$color};
1150            }
1151        }
1152        elsif ($tag =~ /color\=\"(.*?)\"/i || $tag =~ /color\=\'(.*?)\'/i)
1153        {
1154            my $color=$1;
1155            if (!defined $self->{'excel_color'}->{$color})
1156            {
1157                # find out the last custom color used and increment it
1158                my $max=19; # we are not going to use a color less than 20
1159                foreach my $k (keys %{$self->{'excel_color'}}) {($self->{'excel_color'}->{$k} > $max) ? ($max=$self->{'excel_color'}->{$k}) :1}
1160                $max++;
1161                $self->{'excel_color'}->{$color}=$self->{'excel_workbook'}->set_custom_color($max, $color);
1162            }
1163            $fgcolor=$self->{'excel_color'}->{$color};
1164        }
1165    
1166        # check and see if we span multiple rows or columns
1167        my ($rowspan, $colspan)=(1,1);
1168        if ($tag =~ /rowspan\=[\'\"]?(\d+)/) {$rowspan=$1} # these should match rowspan="4", rowspan='4', and rowspan=4
1169        if ($tag =~ /colspan\=[\'\"]?(\d+)/) {$colspan=$1}
1170    
1171        my $formatid=$th.$center.$bgcolor.$fgcolor.$rowspan.$colspan;
1172        if (!defined $self->{'excel_format'}->{$formatid})
1173        {
1174            $self->{'excel_format'}->{$formatid}=$self->{'excel_workbook'}->add_format();
1175            if ($rowspan > 1) {$self->{'excel_format'}->{$formatid}->set_align("vcenter")}
1176            else
1177            {
1178                if ($th) {$self->{'excel_format'}->{$formatid}->copy($self->{'excel_format'}->{header})}
1179                else {$self->{'excel_format'}->{$formatid}->copy($self->{'excel_format'}->{default})}
1180            }
1181            $center && $self->{'excel_format'}->{$formatid}->set_align($center);
1182            $bgcolor && $self->{'excel_format'}->{$formatid}->set_bg_color($bgcolor);
1183            $fgcolor && $self->{'excel_format'}->{$formatid}->set_color($fgcolor);
1184        }
1185    
1186        return ($cell, $self->{'excel_format'}->{$formatid}, $rowspan, $colspan);
1187    }
1188    
1189    
1190    =head1 clean_excel_cell
1191    
1192    Process the cells to remove &nbsp; and also convert relative URLs to full URLs
1193    
1194    =cut
1195    
1196    sub clean_excel_cell {
1197        my ($self, $cell)=@_;
1198        if ($cell =~ /^\s*\&nbsp\;\s*$/) {$cell=undef} # ignore white space
1199    
1200        # some cells have something like this:
1201        # <a  onMouseover="javascript:if(!this.tooltip) this.tooltip=new Popup_Tooltip(this,'Role of BCAT','Branched-chain amino acid aminotransferase (EC 2.6.1.42)','','','','');this.tooltip.addHandler(); return false;" >BCAT</a>
1202        # we don't want those, but we do want the ones that have a real url hidden here.
1203        # so remove the mouseover part, and then see what is left
1204        if ($cell =~ s/onMouseover\=\".*?\"//)
1205        {
1206            if ($cell =~ s/\<a\s+>//i) {$cell =~ s/\<\/a>//i}
1207        }
1208    
1209        if ($cell =~ /\<a href=.(.*?).>(.*)<\/a>/)
1210        {
1211            # this is tricky because if the cell is a url then we need two separate things, the url and the link name
1212            my ($url, $link)=($1, $2);
1213            $url =~ s/^\.{1,2}\///; # remove notation of ./ and ../
1214            unless ($url =~ /^http/) {$url=$FIG_Config::cgi_url."/$url"}
1215            # this sucks as excel can only handle one link per cell, so we remove the other links. At the moment users will have to deal with that.
1216            $link =~ s/\<.*?\>//g;
1217            $cell=[$url, $link];
1218        }
1219        elsif ($cell =~ /<input type/)
1220        {
1221            if ($cell =~ /value='(.*?)'/) {$cell = $1}
1222            elsif ($cell =~ /value="(.*?)"/) {$cell = $1}
1223        }
1224        else
1225        {
1226            # this is all the html that I don't know what to do with, like <input type=...>
1227            $cell =~ s/\<.*?\>//g;
1228        }
1229        return $cell;
1230    }
1231    
1232    
1233    
1234    
1235  1;  1;
1236    

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.39

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3