[Bio] / FigKernelPackages / raelib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/raelib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.11, Fri Apr 8 20:22:23 2005 UTC revision 1.44, Sun Mar 16 18:24:57 2008 UTC
# Line 1  Line 1 
1    #
2    # Copyright (c) 2003-2006 University of Chicago and Fellowship
3    # for Interpretations of Genomes. All Rights Reserved.
4    #
5    # This file is part of the SEED Toolkit.
6    #
7    # The SEED Toolkit is free software. You can redistribute
8    # it and/or modify it under the terms of the SEED Toolkit
9    # Public License.
10    #
11    # You should have received a copy of the SEED Toolkit Public License
12    # along with this program; if not write to the University of Chicago
13    # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14    # Genomes at veronika@thefig.info or download a copy from
15    # http://www.theseed.org/LICENSE.TXT.
16    #
17    
18  # -*- perl -*-  # -*- perl -*-
19    
20  =pod  =pod
21    
22  =head1  =head1 RAE Library
23    
24   Some routines and things that Rob uses. Please feel free to use at will and incorporate into   Some routines and things that Rob uses. Please feel free to use at will and incorporate into
25   your own code or move them into FIG.pm or elsewhere.   your own code or move them into FIG.pm or elsewhere.
26    
27     For questions about this email RobE@theFIG.info
28    
29  =cut  =cut
30    
31  package raelib;  package raelib;
32  use strict;  use strict;
33    use Bio::SeqIO;
34    use Bio::Seq;
35    use Bio::Tools::SeqStats;
36    use Bio::SeqFeature::Generic;
37    
38    # we don't know whether the Spreadsheet::WriteExcel methods are available on all systems, and even on the CI systems they are currently in my shared directory
39    # so we use an eval and set the boolean if we are cool.
40    my $useexcel;
41    my $excelfile;
42    my $excelfilelink="";
43    BEGIN {
44        eval "use Spreadsheet::WriteExcel";
45        unless ($@) {$useexcel=1}
46    }
47    
48    END {
49        my $self=shift;
50        if ($useexcel && $excelfile) {&close_excel_file($excelfile)}
51    }
52    
53    
54  use FIG;  use FIG;
55  my $fig=new FIG;  my $fig=new FIG;
56    
57  =head2 new  =head2 Methods
58    
59    =head3 new
60    
61  Just instantiate the object and return $self  Just instantiate the object and return $self
62    
63  =cut  =cut
64    
65  sub new {  sub new {
66   my $self=shift;   my ($class)=@_;
67   return $self   my $self={};
68     $self->{'useexcel'}=1 if ($useexcel);
69     return bless $self, $class;
70  }  }
71    
72    
73    
74    
75  =head2 features_on_contig  =head3 features_on_contig
76    
77   Returns a reference to an array containing all the features on a contig in a genome.   Returns a reference to an array containing all the features on a contig in a genome.
78    
# Line 63  Line 107 
107  }  }
108    
109    
110    =head3 mol_wt
111    
112    Calculate the molecular weight of a protein.
113    
114    This just offlaods the calculation to BioPerl, which is probably dumb since we need to load the whole of bioperl in just for this, but I don't have time to rewrite their method now, and I am not going to copy and paste it since I didn't write it :)
115    
116    my ($lower, $upper)=$raelib->mol_wt($peg);
117    
118    $lower is the lower bound for the possible mw, upper is the upper bound.
119    
120    =cut
121    
122    sub mol_wt {
123        my ($self, $fid)=@_;
124        my $sobj=Bio::Seq->new(-seq => $fig->get_translation($fid), -id => $fid);
125        return Bio::Tools::SeqStats->get_mol_wt($sobj);
126    }
127    
128    
129  =head2 pirsfcorrespondence  =head3 pirsfcorrespondence
130    
131   Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written.  Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written. This is so that we can easily go back and forth.
  This is so that we can easily go back and forth.  
132    
133   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat
134    
# Line 80  Line 137 
137     to      : file to write information to     to      : file to write information to
138     verbose : report on progress     verbose : report on progress
139    
140    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
141    
142   Returns the number of lines in the pirsinfo file that were read.   Returns the number of lines in the pirsinfo file that were read.
143    
144  =cut  =cut
# Line 90  Line 149 
149    print STDERR "File $from does not exist as called in $0\n";    print STDERR "File $from does not exist as called in $0\n";
150    return 0;    return 0;
151   }   }
152     if ($from =~ /\.gz$/) {
153      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
154     }
155     else {
156   open (IN, $from) || die "Can't open $from";   open (IN, $from) || die "Can't open $from";
157     }
158   open (OUT, ">$to") || die "Can't write to $to";   open (OUT, ">$to") || die "Can't write to $to";
159   my $linecount;   my $linecount;
160   while (<IN>) {   while (<IN>) {
161    $linecount++;    $linecount++;
162    unless ($linecount % 10000) {print STDERR "Correspondence of $linecount lines calculated\n"}    if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
163    if (/^>/) {print OUT; next}    if (/^>/) {print OUT; next}
164    chomp;    chomp;
165    my $done;    foreach my $peg ($self->swiss_pir_ids($_)) {
   foreach my $peg ($fig->by_alias("uni|$_")) {  
166     print OUT $_, "\t", $peg, "\n";     print OUT $_, "\t", $peg, "\n";
    $done=1;  
167    }    }
   unless ($done) {print OUT $_, "\t\n"}  
168   }   }
169   close IN;   close IN;
170   close OUT;   close OUT;
171   return $linecount;   return $linecount;
172  }  }
173    
174    =head3 uniprotcorrespondence
175    
176    Generate a correspondence table between uniprot knowledge base IDs and FIG ID's.
177    
178  =head2 ss_by_id  The uniprot KB file is in the form:  UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name
179    
180     This method takes three arguments:
181       from    : uniprotKB file
182       to      : file to write information to
183       verbose : report on progress
184    
185    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
186    
187     Returns the number of lines in the uniprotkb file that were read.
188    
189    =cut
190    
191    sub uniprotcorrespondence {
192     my ($self, $from, $to, $verbose)=@_;
193     unless (-e $from) {
194      print STDERR "File $from does not exist as called in $0\n";
195      return 0;
196     }
197     if ($from =~ /\.gz$/) {
198      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
199     }
200     else {
201      open (IN, $from) || die "Can't open $from";
202     }
203     open (OUT, ">$to") || die "Can't write to $to";
204     my $linecount;
205     while (<IN>) {
206      chomp;
207      $linecount++;
208      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
209      my @line=split /\s+\|\s+/;
210      my $added;
211      foreach my $peg ($self->swiss_pir_ids($line[0])) {
212       print OUT "$_ | $peg\n";
213       $added=1;
214      }
215      unless ($added) {print OUT "$_\n"}
216     }
217     close IN;
218     close OUT;
219     return $linecount;
220    }
221    
222    =head3 prositecorrespondence
223    
224    Generate a correspondence table between prosite and seed using sp id's and seed ids.
225    
226    The SwissProt prosite file is from ftp://ca.expasy.org/databases/prosite/release_with_updates/prosite.dat and is in horrible swiss prot format, so we'll parse out those things that we need and put them in the file
227    
228    The output file will have the following columns:
229    
230    prosite family accession number, prosite family name, family type, swiss-prot protein id, fig protein id.
231    
232    The family type is one of rule, pattern, or matrix. Right now (Prosite Release 19.2 of 24-May-2005) there are 4 rules, 1322 patterns, and 521 matrices.
233    
234     This method takes three arguments:
235       from    : prosite file
236       to      : file to write information to
237       verbose : report on progress
238    
239    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
240    
241     Returns the number of lines in the prosite file that were read.
242    
243    =cut
244    
245    sub prositecorrespondence {
246     my ($self, $from, $to, $verbose)=@_;
247     unless (-e $from) {
248      print STDERR "File $from does not exist as called in $0\n";
249      return 0;
250     }
251     if ($from =~ /\.gz$/) {
252      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
253     }
254     else {
255      open (IN, $from) || die "Can't open $from";
256     }
257     open (OUT, ">$to") || die "Can't write to $to";
258     my $linecount;
259     my ($famac, $famname, $famtype)=('','','');
260     while (<IN>) {
261      chomp;
262      $linecount++;
263      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
264      if (m#//#) {($famac, $famname, $famtype)=('','',''); next}
265      elsif (m/^ID\s*(.*?);\s*(\S+)/) {($famname, $famtype)=($1, $2); next}
266      elsif (m/^AC\s*(\S+)/) {$famac=$1; $famac =~ s/\;\s*$//; next}
267      next unless (m/^DR/); # ignore all the other crap in the prosite file for now. Note we might, at some point, want to grab all that, but that is for another time.
268      #
269      # this is the format of the DR lines:
270      # DR   P11460, FATB_VIBAN , T; P40409, FEUA_BACSU , T; P37580, FHUD_BACSU , T;
271      s/^DR\s*//;
272      foreach my $piece (split /\s*\;\s*/, $_) {
273       my ($acc, $nam, $unk)=split /\s*\,\s*/, $piece;
274       foreach my $fig ($self->swiss_pir_ids($acc)) {
275        print OUT join "\t", ($famac, $famname, $famtype, $acc, $fig), "\n";
276       }
277      }
278     }
279    }
280    
281    =head3 swiss_pir_ids()
282    
283    SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's, and return an array of FIG id's that match the ID.
284    
285    =cut
286    
287    sub swiss_pir_ids {
288     my ($self, $id)=@_;
289     return () unless ($id);
290     $id =~ s/^\s+//; $id =~ s/\s+$//; # trim off the whitespace
291    
292     my @return=($fig->by_alias("uni|$id"));
293     return @return if ($return[0]);
294    
295     @return=($fig->by_alias("tr|$id"));
296     return @return if ($return[0]);
297    
298     @return=($fig->by_alias("sp|$id"));
299     return @return if ($return[0]);
300    
301     return ();
302    }
303    
304    =head3 ss_by_id
305    
306   Generate a list of subsystems that a peg occurs in. This is a ; separated list.   Generate a list of subsystems that a peg occurs in. This is a ; separated list.
307   This is a wrapper that removes roles and ignores essential things   This is a wrapper that removes roles and ignores essential things
# Line 130  Line 320 
320   return $ssout;   return $ssout;
321  }  }
322    
323  =head2 ss_by_homol  =head3 ss_by_homol
324    
325   Generate a list of subsystems that homologs of a peg occur in. This is a ; separated list.   Generate a list of subsystems that homologs of a peg occur in. This is a ; separated list.
326   This is also a wrapper around sims and ss, but makes everything unified   This is also a wrapper around sims and ss, but makes everything unified
# Line 170  Line 360 
360   return $ssoutput;   return $ssoutput;
361  }  }
362    
363  =head2 tagvalue  =head3 tagvalue
364    
365   This will just check for tag value pairs and return either an array of values or a single ; separated list (if called as a scalar)   This will just check for tag value pairs and return either an array of values or a single ; separated list (if called as a scalar)
366    
# Line 193  Line 383 
383   return wantarray ? @return : join "; ", @return;   return wantarray ? @return : join "; ", @return;
384  }  }
385    
386  =head2 locations_on_contig  =head3 locations_on_contig
387    
388  Return the locations of a sequence on a contig.  Return the locations of a sequence on a contig.
389    
# Line 240  Line 430 
430  }  }
431    
432    
433  =head2 scrolling_org_list  =head3 scrolling_org_list
434    
435  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.
436    
437  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple);  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple, $default, $limit);
438    
439  multiple selections will only be set if $multiple is true  multiple selections will only be set if $multiple is true
440    
441    default will set a default to override (maybe) korgs
442    
443    limit is a reference to an array of organism IDs that you want to limit the list to.
444    
445  =cut  =cut
446    
447  sub scrolling_org_list {  sub scrolling_org_list {
448   my ($self, $cgi, $multiple)=@_;   my ($self, $cgi, $multiple, $default, $limit)=@_;
449   unless ($multiple) {$multiple=0}   unless ($multiple) {$multiple=0}
450    
451   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );
# Line 299  Line 493 
493   my $orgs; my $label;   my $orgs; my $label;
494   @$orgs =  $fig->genomes( $complete, undef, $canonical{ $req_dom } );   @$orgs =  $fig->genomes( $complete, undef, $canonical{ $req_dom } );
495    
496     # limit the list of organisms to a selected few if required
497     if ($limit)
498     {
499        my %lim=map {($_=>1)} @$limit;
500        my $norg;
501        foreach my $o (@$orgs) {push @$norg, $o if ($lim{$o})}
502        $orgs=$norg;
503    }
504    
505   foreach (@$orgs) {   foreach (@$orgs) {
506     my $gs = $fig->genus_species($_);     my $gs = $fig->genus_species($_);
507     my $gc=scalar $fig->all_contigs($_);     if ($fig->genome_domain($_) ne "Environmental Sample")
508       {
509        my $gc=$fig->number_of_contigs($_);
510     $label->{$_} = "$gs ($_) [$gc contigs]";     $label->{$_} = "$gs ($_) [$gc contigs]";
511    }    }
512       else
513       {
514        $label->{$_} = "$gs ($_) ";
515       }
516      }
517    
518   @$orgs = sort {$label->{$a} cmp $label->{$b}} @$orgs;   @$orgs = sort {$label->{$a} cmp $label->{$b}} @$orgs;
519    
# Line 317  Line 527 
527                                          -labels   => $label,                                          -labels   => $label,
528                                          -size     => 10,                                          -size     => 10,
529                                          -multiple => $multiple,                                          -multiple => $multiple,
530                                            -default  => $default,
531                                        ), $cgi->br,                                        ), $cgi->br,
532                    "$n_genomes genomes shown ",                    "$n_genomes genomes shown ",
533                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,
# Line 327  Line 538 
538                    "</TD>",                    "</TD>",
539                    "   </TR>\n",                    "   </TR>\n",
540                    "</TABLE>\n",                    "</TABLE>\n",
                   $cgi->hr  
541          );          );
542  }  }
543    
544    
545    =head3 scrolling_subsys_list
546    
547    Create a scrolling list of all subsystems. Just like scrolling_org_list, this will make the list and allow you to select multiples.
548    
549    use like this
550    
551    push @$html, $raelib->scrolling_subsys_list($cgi, $multiple);
552    
553    =cut
554    
555    sub scrolling_subsys_list {
556     my ($self, $cgi, $multiple)=@_;
557     $multiple=0 unless (defined $multiple);
558     my @ss=sort {uc($a) cmp uc($b)} $fig->all_subsystems();
559     my $label;
560     # generate labels for the list
561     foreach my $s (@ss) {my $k=$s; $k =~ s/\_/ /g; $k =~ s/  / /g; $k =~ s/\s+$//; $label->{$s}=$k}
562     return $cgi->scrolling_list(
563      -name    => 'subsystems',
564      -values  => \@ss,
565      -labels  => $label,
566      -size    => 10,
567      -multiple=> $multiple,
568     );
569    }
570    
571    =head3 subsys_names_for_display
572    
573    Return a list of subsystem names for display. This will take a list as an argument and return a nice clean list for display.
574    
575    $raelib->subsys_names_for_display(@ss);
576    or
577    $raelib->subsys_names_for_display($fig->all_subsystems());
578    
579    =cut
580    
581    sub subsys_names_for_display {
582     my ($self, @ss)=@_;
583     foreach (@ss) {s/\_/ /g; 1 while (s/  / /g); s/\s+$//}
584     return @ss;
585    }
586    
587    =head3 GenBank
588    
589     This object will take a genome number and return a Bio::Seq::RichSeq object that has the whole genome
590     in GenBank format. This should be a nice way of getting some data out, but will probably be quite slow
591     at building the object.
592    
593     Note that you need to call this with the genome name and the contig. This will then go through that contig.
594    
595     Something like this should work
596    
597     foreach my $contig ($fig->all_contigs($genome)) {
598      my $seqobj=FIGRob->GenBank($genome, $contig);
599      # process the contig
600     }
601    
602    =cut
603    
604    sub GenBank {
605     my ($self, $genome, $contig)=@_;
606     my $gs=$fig->genus_species($genome);
607     return unless ($gs);
608     unless ($contig) {
609      print STDERR "You didn't provide a contig for $gs. I think that was a mistake. Sorry\n";
610      return;
611     }
612     my $len=$fig->contig_ln($genome, $contig);
613     unless ($len) {
614      print STDERR "$contig from $gs doesn't appear to have a length. Is it right?\n";
615      return;
616     }
617    
618    
619     # first find all the pegs ...
620     my $features; # all the features in the genome
621     my $allpegs; # all the pegs
622     my $translation; # all the protein sequences
623     foreach my $peg ($fig->pegs_of($genome)) {
624      my @location=$fig->feature_location($peg);
625      my $func=$fig->function_of($peg);
626      foreach my $loc (@location) {
627       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
628       my ($cg, $start, $stop)=($1, $2, $3);
629       next unless ($cg eq $contig);
630       # save this information for later
631       $features->{'peg'}->{$loc}=$func;
632       $allpegs->{'peg'}->{$loc}=$peg;
633       $translation->{'peg'}->{$loc}=$fig->get_translation($peg);
634      }
635     }
636     # ... and all the RNAs
637     foreach my $peg ($fig->rnas_of($genome)) {
638      my @location=$fig->feature_location($peg);
639      my $func=$fig->function_of($peg);
640      foreach my $loc (@location) {
641       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
642       my ($cg, $start, $stop)=($1, $2, $3);
643       next unless ($cg eq $contig);
644       # save this information for later
645       $features->{'rna'}->{$loc}=$func;
646       $allpegs->{'rna'}->{$loc}=$peg;
647      }
648     }
649    
650    
651     # now get all the contigs out
652     my $seq=$fig->dna_seq($genome, $contig."_1_".$len);
653     my $description = "Contig $contig from " . $fig->genus_species($genome);
654     my $sobj=Bio::Seq->new(
655              -seq              =>  $seq,
656              -id               =>  $contig,
657              -desc             =>  $description,
658              -accession_number =>  $genome
659              );
660     foreach my $prot (keys %{$features->{'peg'}}) {
661       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
662       my ($cg, $start, $stop)=($1, $2, $3);
663       my $strand=1;
664       if ($stop < $start) {
665        ($stop, $start)=($start, $stop);
666        $strand=-1;
667     }
668    
669     my $feat=Bio::SeqFeature::Generic->new(
670            -start         =>  $start,
671            -end           =>  $stop,
672            -strand        =>  $strand,
673            -primary       =>  'CDS',
674            -display_name  =>  $allpegs->{'peg'}->{$prot},
675            -source_tag    =>  'the SEED',
676            -tag           =>
677                           {
678                           db_xref     =>   $allpegs->{'peg'}->{$prot},
679                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
680                           function    =>  $features->{'peg'}->{$prot},
681                           translation =>  $translation->{'peg'}->{$prot}
682                          }
683           );
684    
685       $sobj->add_SeqFeature($feat);
686     }
687    
688     foreach my $prot (keys %{$features->{'rna'}}) {
689       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
690       my ($cg, $start, $stop)=($1, $2, $3);
691       my $strand=1;
692       if ($stop < $start) {
693        ($stop, $start)=($start, $stop);
694        $strand=-1;
695       }
696    
697       my $feat=Bio::SeqFeature::Generic->new(
698            -start         =>  $start,
699            -end           =>  $stop,
700            -strand        =>  $strand,
701            -primary       =>  'RNA',
702            -source_tag    =>  'the SEED',
703            -display_name  =>  $allpegs->{'rna'}->{$prot},
704            -tag           =>
705                          {
706                           db_xref     =>   $allpegs->{'rna'}->{$prot},
707                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
708                           function    =>  $features->{'rna'}->{$prot},
709                          }
710           );
711    
712      $sobj->add_SeqFeature($feat);
713     }
714     return $sobj;
715    }
716    
717    =head3 best_hit
718    
719     Returns the FIG id of the single best hit to a peg
720    
721     eg
722    
723     my $bh=$fr->best_hit($peg);
724     print 'function is ', scalar $fig->function_of($bh);
725    
726    =cut
727    
728    sub best_hit {
729     my ($self, $peg)=@_;
730     return unless ($peg);
731    
732     my ($maxN, $maxP)=(1, 1e-5);
733     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
734     return ${$sims[0]}[1];
735    }
736    
737    
738    =head3 read_fasta
739    
740    Read a fasta format file and return a reference to a hash with the data. The key is the ID and the value is the sequence. If you supply the optional keep comments then the comments (anything after the first white space are returned as a sepaarte hash).
741    
742    Usage:
743    my $fasta=$raelib->read_fasta($file);
744    my ($fasta, $comments)=$raelib->read_fasta($file, 1);
745    
746    =cut
747    
748    sub read_fasta {
749     my ($self, $file, $keepcomments)=@_;
750     if ($file =~ /\.gz$/) {open(IN, "gunzip -c $file|") || die "Can't open a pipe from gunzip -c $file"}
751     else {open (IN, $file) || die "Can't open $file"}
752     my %f; my $t; my $s; my %c;
753     while (<IN>) {
754      chomp;
755      if (/^>/) {
756       if ($s) {
757        $f{$t}=$s;
758        undef $s;
759       }
760       s/^>(\S+)\s*//;
761       $t=$1;
762       $c{$t}=$_ if ($_);
763      }
764      else {$s .= $_}
765     }
766     $f{$t}=$s;
767     if ($keepcomments) {return (\%f, \%c)}
768     else {return \%f}
769    }
770    
771    =head3 rc
772    
773    Reverse complement. It's too easy.
774    
775    =cut
776    
777    sub rc {
778     my ($self, $seq)=@_;
779     $seq=~tr/GATCgatc/CTAGctag/;
780     $seq = reverse $seq;
781     return $seq;
782    }
783    
784    
785    =head3 cookies
786    
787    Handle cookies. This method will get and set the value of the FIG cookie. Cookies are name/value pairs that are stored on the users computer. We then retrieve them using this method. The cookies are passed in as a reference to a hash, and the method returns a tuple of the cookie that can be passed to the browser and a reference to a hash with the data.
788    
789    If you do not pass any arguments the whole cookie will be returned.
790    
791    Use as follows:
792    
793    ($cookie, $data) = raelib->cookie($cgi, \%data);
794    
795    You do not need to pass in any data, in that case you will just get the cookie back
796    
797    Underneath, I create a single cookie called FIG which stores all the information. The names and value pairs are stored using = to join name to value and ; to concatenate. This way we can create a single cookie with all the data. I am using the FIG::clean_attribute_key method to remove unwanted characters from the name/value pairs, so don't use them.
798    
799    Note that for the moment I have put this routine here since it needs to maintain the state of the cookie (i.e. it needs to know what $self is). It should really be in HTML.pm but that is not, as far as I can tell, maintaining states?
800    
801    =cut
802    
803    sub cookie {
804     my ($self, $cgi, $input)=@_;
805     return unless ($cgi);
806     $self->{'cookie'}=$cgi->cookie(-name=>"FIG") unless ($self->{'cookie'});
807    
808     # first, create a hash from the existing cookie data
809     my $cookie;
810     map {
811      my ($kname, $kvalue)=split /\=/, $_;
812      $cookie->{$kname}=$kvalue;
813     } split /\;/, $self->{'cookie'};
814    
815     if ($input)
816     {
817      # add the values that were passed in
818      map {$cookie->{FIG->clean_attribute_key($_)}=$input->{$_}} keys %$input;
819      # put everything back together and set the cookie
820      my $newcookie=join ";", map {$_ . "=" . $cookie->{$_}} keys %$cookie;
821      $self->{'cookie'}=$cgi->cookie(-name=>"FIG", -value=>$newcookie, -expires=>'+1y');
822     }
823    
824     return ($self->{'cookie'}, $cookie);
825    }
826    
827    
828    =head3 is_number
829    
830    returns 1 if the argument is a number, and 0 if not. This is taken directly from the perl cookbook.
831    
832    =cut
833    
834    sub is_number {
835        my ($self, $no)=@_;
836        return 1 if ($no =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/); # Perl cookbook, page 44
837        return 0;
838    }
839    
840    
841    
842    =head3 commify
843    
844    Put commas in numbers. I think this comes straight from the perl cookbook and is very useful for nice displays
845    
846    =cut
847    
848    sub commify {
849        my($self,$n) = @_;
850        my(@n) = ();
851        my($i);
852    
853        for ($i = (length($n) - 3); ($i > 0); $i -= 3)
854        {
855            unshift(@n,",",substr($n,$i,3));
856        }
857        unshift(@n,substr($n,0,$i+3));
858        return join("",@n);
859    }
860    
861    
862    =head3 tab2excel
863    
864    This is experimental as of May, 2006.
865    
866    There are a couple of perl modules that allow you to write to excel files, and so I am trying out the idea of taking our standard $tab table respresentation that is used in HTML.pm and making an excel file that people could download. It seems like that would be a great tool for them to have.
867    
868    At the moment the excel modules are in my shared space on the CI machines, and so won't work in every seed installation. Therefore the $self->{'useexcel'} boolean is set at compile time if we successfully load the module.
869    
870    The issues are:
871        1. creating the excel file
872        2. reading through @$tab and presenting the data
873        3. Checking @$tab because each element can be a reference to an array with color or formatting information
874    
875    Formatting
876    
877    A separate set of formats must be created for each color and font combination since the formats are applied at the end of the processing of the file.
878    
879    
880    Usage:
881    
882        The recommended way of using this set of modules is to add the options excelfile=>"filename" to the options passed to &HTML::make_table. That should take care of EVERYTHING for you, so you should do that. You can call thinks separetly if you like, but I don't recommend it.
883    
884        Note that you can make multiple calls to the same excel file,a nd each one will get added as a new sheet.
885    
886        Note the usage is ALMOST the same as make_table, but not quite. First, options is a reference to a hash rather than the hash itself
887        and second, the additional option "filename" that is the filename to be written;
888    
889        $url = $raelib->tab2excel($col_hdrs, $tab, $title, $options, "filename");
890    
891        The filename will be created in $FIG_Config::temp. The extension .xls will be added to the filename if it is not present.
892    
893    Returns:
894        A link to the file in the format
895            <p><a href="...">filename</a> [Download Excel file]</p>
896    
897    Note that there are four separate methods:
898        1. tab2excel is the method for a single call from HTML::make_table
899            this will make an excel file, fill it, and return the link;
900        2. make_excel_workbook is the method that instantiates a file
901        3. make_excel_worksheet is the method that actually populates the file
902            this loads all the data into the excel file, but if you know what you are doing you can call this many times,
903            each with a different spreadsheet
904        4. close_excel_file
905            this closes the file and writes it. This is called in the END block, so you do not have to explicitly call it here.
906    
907        tab2excel is a wrapper for all three so that the method in HTML::make_table is really easy.
908        See subsys.cgi for a more complex involvement of this!
909    
910    
911    =cut
912    
913    sub tab2excel {
914        my($self, $col_hdrs, $tab, $title, $options, $filename)=@_;
915        return "" unless ($self->{'useexcel'});
916        #return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
917        $self->{'excel_file_link'} = $self->make_excel_workbook($filename, $options);
918        $excelfilelink=$self->{'excel_file_link'};
919        $self->make_excel_worksheet($col_hdrs, $tab, $title);
920        return "" if ($options->{'no_excel_link'});
921        return $self->{'excel_file_link'};
922    }
923    
924    
925    =head3 excel_file_link
926    
927    Just returns the link to the file, if one has been created. If not, returns a non-breaking space (&nbsp;)
928    
929    =cut
930    
931    sub excel_file_link {
932        my $self=shift;
933        # I am not sure why, but this is not working. Perhaps because I am calling it from &HTML::make_table (i.e. not OO perl?)
934        #print STDERR "SELF: $self LINK: ",$self->{'excel_file_link'}," or \n$excelfilelink\n";
935        #return $self->{'excel_file_link'};
936        return $excelfilelink;
937    }
938    
939    
940    
941    =head3 make_excel_workbook
942    
943    This is the method that actually makes individual workbook. You should call this once, with the name of the file that you want it to be known by. The options are to set borders and whatnot.
944    
945    This will return the link to the workbook
946    
947    =cut
948    
949    sub make_excel_workbook {
950        my($self, $filename, $options)=@_;
951        return "" unless ($self->{'useexcel'});
952        #return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
953    
954        $filename =~ s/^.*\///; # remove any path information. We are going to only write to FIG_Config::temp
955        unless ($filename =~ /\.xls$/) {$filename .=".xls"}
956    
957        # now generate the link to return
958        my $link="<p><a href=\"".$fig->temp_url."/".$filename.'">'.$filename."</a> [Download table in Excel format].</p>\n";
959        # do we already have this file -- if so, just return that info
960        return  $link if ($self->{'excel_short_filename'} eq $filename); # don't do anything, just return the fact that we have the book made!
961    
962    
963        $self->{'excel_short_filename'}=$filename;
964        $self->{'excel_filename'}=$FIG_Config::temp."/$filename";
965    
966    
967        # Each excel file consists of the file, and then of worksheets from within the file. These are the tabs at the bottom of the screen
968        # that can be added with "Insert->new worksheet" from the menus.
969        # Create a new workbook called simple.xls and add a worksheet
970    
971        # instantiate the workbook
972        $self->{'excel_workbook'}=Spreadsheet::WriteExcel->new($self->{'excel_filename'});
973        $excelfile=$self->{'excel_workbook'}; # this is for the close on END
974        $self->{'excel_workbook'}->set_tempdir($FIG_Config::temp); # you don't have to do this, but it may speed things up and reduce memory load.
975    
976        # define the default formats
977        my $border = defined $options->{border} ? $options->{border} : 0;
978        $self->{'excel_format'}->{default}=$self->{'excel_workbook'}->add_format(border=>$border, size=>10);
979        return $link;
980    }
981    
982    
983    =head3 make_excel_worksheet()
984    
985    This is the method that makes the separate sheets in the file. You can add as many of these as you want.
986    
987    =cut
988    
989    sub make_excel_worksheet {
990        my($self, $col_hdrs, $tab, $title)=@_;
991        #return "<p>Couldn't load Spreadsheet::WriteExcel</p>\n" unless ($self->{'useexcel'});
992        return "" unless ($self->{'useexcel'});
993    
994        unless (defined $self->{'excel_workbook'})
995        {
996            print STDERR "The workbook was not defined. Couldn't fill it in\n";
997            return;
998        }
999    
1000        if (length($title) > 31) {$title=substr($title, 0, 31)}
1001        my $worksheet = $self->{'excel_workbook'}->add_worksheet($title);
1002        # The general syntax for output to an excel file is write($row, $column, $value, $format). Note that row and
1003        # column are zero indexed
1004    
1005        # write the column headers
1006        # define a new format that is bold
1007        $self->{'excel_format'}->{header} = $self->{'excel_workbook'}->add_format();
1008        $self->{'excel_format'}->{header}->copy($self->{'excel_format'}->{default});
1009        $self->{'excel_format'}->{header}->set_bold();
1010    
1011        for my $i (0 .. $#$col_hdrs)
1012        {
1013            my $cell;
1014            my ($useformat, $rowspan, $colspan);
1015            if (ref($col_hdrs->[$i]) eq "ARRAY") {($cell, $useformat, $rowspan, $colspan)=$self->parse_cell($col_hdrs->[$i])}
1016            else  {$cell=$col_hdrs->[$i]}
1017            $cell=$self->clean_excel_cell($cell);
1018            $worksheet->write(0, $i, $cell, $self->{'excel_format'}->{header});
1019        }
1020    
1021        # now loop through the table and write them out. Remember to break on array refs
1022        # we are going to have to build the table col by col so we get the breaks in the right place
1023        # for merged cells
1024        my $row_idx=1;
1025        my $maxrow=$#$tab;
1026        my $skip;
1027        while ($row_idx <= $maxrow+1)
1028        {
1029            my @row=@{$tab->[$row_idx-1]};
1030            my $col_idx=0;
1031            foreach my $cell (@row)
1032            {
1033                while ($skip->{$row_idx}->{$col_idx}) {$col_idx++}
1034                my $useformat=$self->{'excel_format'}->{default};
1035    
1036                # there is an approach to setting color using \@bgcolor. Oh well.
1037                if ( $cell =~ /^\@([^:]+)\:(.*)$/ )
1038                {
1039                    $cell=[$2, $1];
1040                }
1041    
1042                my ($rowspan, $colspan);
1043                if (ref($cell) eq "ARRAY")
1044                {
1045                    ($cell, $useformat, $rowspan, $colspan)=$self->parse_cell($cell);
1046                }
1047    
1048                $cell=$self->clean_excel_cell($cell);
1049    
1050                if ($rowspan > 1 || $colspan > 1)
1051                {
1052                    # handle merged cells separately
1053                    my $row_idx_to=$row_idx+$rowspan-1;
1054                    my $col_idx_to=$col_idx+$colspan-1;
1055                    # we want to not put anything in the merged cells
1056                    for (my $x=$row_idx; $x<=$row_idx_to; $x++) {$skip->{$x}->{$col_idx}=1}
1057                    for (my $y=$col_idx; $y<=$col_idx_to; $y++) {$skip->{$row_idx}->{$y}=1}
1058    
1059                    if (ref($cell) eq "ARRAY") {$worksheet->merge_range($row_idx, $col_idx, $row_idx_to, $col_idx_to, @$cell, $useformat)}
1060                    else {$worksheet->merge_range($row_idx, $col_idx, $row_idx_to, $col_idx_to, $cell, $useformat)}
1061                }
1062                else
1063                {
1064                    # this is a botch, but in some circumstances we need to split the cell out. e.g. if it is a URL
1065                    # in this case we have a reference to an array, and we'll use  a slight modification on the process
1066                    if ((ref($cell) eq "ARRAY" && $cell->[0] eq " &nbsp; ") || $cell eq " &nbsp; ") {$worksheet->write_blank($row_idx, $col_idx, $useformat)}
1067                    else
1068                    {
1069                        if (ref($cell) eq "ARRAY") {$worksheet->write($row_idx, $col_idx, @$cell, $useformat)}
1070                        else {$worksheet->write($row_idx, $col_idx, $cell, $useformat)}
1071                    }
1072                }
1073    
1074                # increment to the next column
1075                $col_idx++;
1076            }
1077            # new line, and start of line
1078            $row_idx++;
1079            $col_idx=0;
1080        }
1081    }
1082    
1083    
1084    
1085    
1086    =head3 close_excel_file()
1087    
1088    We must explicitly close the file before creating the link so that the file is written. This is also what returns the link
1089    
1090    =cut
1091    
1092    sub close_excel_file{
1093        my ($workbook)=@_;
1094        return unless (defined $workbook);
1095        # close the workbook. this writes the files
1096        return $workbook->close();
1097    }
1098    
1099    
1100    
1101    
1102    
1103    
1104    
1105    
1106    
1107    =head3 parse_cell()
1108    
1109    A method to take the cell from the table where there is some formatting information and figure out what we know. Return the data and the format.
1110    
1111    Requires the cell and the current $format.
1112    
1113    When applied to <td> the default formats that we'll deal with at the moment are
1114         align=
1115         background-color=
1116         color=
1117         bgcolor=
1118    
1119    Colors are funky in excel because it only has a limited palette. We rename colors as needed, and then save those so that we can use them again. We're only allowed 55 colors in excel (numbered 8..63). Because its a little stupid to mess with black and white and so on, I ignore those, and also start renumbering at color number 20, giving us 43 different colors.
1120    
1121    The reference to the hash excel_color has the custom excel colors stored in it for a few colors, and others are added to it.
1122    
1123    =cut
1124    
1125    sub parse_cell {
1126        my ($self, $arr)=@_;
1127        return ($arr, $self->{'excel_format'}->{default}) unless (ref($arr) eq "ARRAY");
1128        my ($cell, $tag)=@$arr;
1129        $tag =~ s/\'/"/g; # this just makes it easier to parse the things like align='center' and align="center" that are both valid
1130    
1131        # we are going to define a series of formats that we can apply, this will have  a key that is
1132        # th.center.bgcolor.fgcolor. Then if we already have that, we can use it, if not, we'll define it
1133    
1134        my ($th, $center, $bgcolor, $fgcolor)=(0,0,0,0);
1135    
1136        if ($tag =~ /^th/) {$th=1} # it is a header cell so we should make it bold
1137        if ($tag =~ /align\=\"(.*?)\"/i) {$center=$1}
1138    
1139        # get rid of white tags because I don't care about them
1140        $tag =~ s/color\=.\#FFFFFF/ /ig;
1141    
1142        if ($tag =~ /background-color\=\"(.*?)\"/i || $tag =~ /bgcolor\=\"(.*?)\"/i)
1143        {
1144            my $color=$1;
1145            if ($color)
1146            {
1147                if (!defined $self->{'excel_color'}->{$color})
1148                {
1149    # find out the last custom color used and increment it
1150                    my $max=19; # we are not going to use a color less than 20
1151                        foreach my $k (keys %{$self->{'excel_color'}}) {($self->{'excel_color'}->{$k} > $max) ? ($max=$self->{'excel_color'}->{$k}) :1}
1152                    $max++;
1153                    $self->{'excel_color'}->{$color}=$self->{'excel_workbook'}->set_custom_color($max, $color);
1154                }
1155                $bgcolor=$self->{'excel_color'}->{$color};
1156            }
1157        }
1158        elsif ($tag =~ /color\=\"(.*?)\"/i || $tag =~ /color\=\'(.*?)\'/i)
1159        {
1160            my $color=$1;
1161            if (!defined $self->{'excel_color'}->{$color})
1162            {
1163                # find out the last custom color used and increment it
1164                my $max=19; # we are not going to use a color less than 20
1165                foreach my $k (keys %{$self->{'excel_color'}}) {($self->{'excel_color'}->{$k} > $max) ? ($max=$self->{'excel_color'}->{$k}) :1}
1166                $max++;
1167                $self->{'excel_color'}->{$color}=$self->{'excel_workbook'}->set_custom_color($max, $color);
1168            }
1169            $fgcolor=$self->{'excel_color'}->{$color};
1170        }
1171    
1172        # check and see if we span multiple rows or columns
1173        my ($rowspan, $colspan)=(1,1);
1174        if ($tag =~ /rowspan\=[\'\"]?(\d+)/) {$rowspan=$1} # these should match rowspan="4", rowspan='4', and rowspan=4
1175        if ($tag =~ /colspan\=[\'\"]?(\d+)/) {$colspan=$1}
1176    
1177        my $formatid=$th.$center.$bgcolor.$fgcolor.$rowspan.$colspan;
1178        if (!defined $self->{'excel_format'}->{$formatid})
1179        {
1180            $self->{'excel_format'}->{$formatid}=$self->{'excel_workbook'}->add_format();
1181            if ($rowspan > 1) {$self->{'excel_format'}->{$formatid}->set_align("vcenter")}
1182            else
1183            {
1184                if ($th) {$self->{'excel_format'}->{$formatid}->copy($self->{'excel_format'}->{header})}
1185                else {$self->{'excel_format'}->{$formatid}->copy($self->{'excel_format'}->{default})}
1186            }
1187            $center && $self->{'excel_format'}->{$formatid}->set_align($center);
1188            $bgcolor && $self->{'excel_format'}->{$formatid}->set_bg_color($bgcolor);
1189            $fgcolor && $self->{'excel_format'}->{$formatid}->set_color($fgcolor);
1190        }
1191    
1192        return ($cell, $self->{'excel_format'}->{$formatid}, $rowspan, $colspan);
1193    }
1194    
1195    
1196    =head3 clean_excel_cell
1197    
1198    Process the cells to remove &nbsp; and also convert relative URLs to full URLs
1199    
1200    =cut
1201    
1202    sub clean_excel_cell {
1203        my ($self, $cell)=@_;
1204        if ($cell =~ /^\s*\&nbsp\;\s*$/) {$cell=undef} # ignore white space
1205    
1206        # some cells have something like this:
1207        # <a  onMouseover="javascript:if(!this.tooltip) this.tooltip=new Popup_Tooltip(this,'Role of BCAT','Branched-chain amino acid aminotransferase (EC 2.6.1.42)','','','','');this.tooltip.addHandler(); return false;" >BCAT</a>
1208        # we don't want those, but we do want the ones that have a real url hidden here.
1209        # so remove the mouseover part, and then see what is left
1210        if ($cell =~ s/onMouseover\=\".*?\"//)
1211        {
1212            if ($cell =~ s/\<a\s+>//i) {$cell =~ s/\<\/a>//i}
1213        }
1214    
1215        if ($cell =~ /\<a href=.(.*?).>(.*)<\/a>/)
1216        {
1217            # this is tricky because if the cell is a url then we need two separate things, the url and the link name
1218            my ($url, $link)=($1, $2);
1219            $url =~ s/^\.{1,2}\///; # remove notation of ./ and ../
1220            unless ($url =~ /^http/) {$url=$FIG_Config::cgi_url."/$url"}
1221            # this sucks as excel can only handle one link per cell, so we remove the other links. At the moment users will have to deal with that.
1222            $link =~ s/\<.*?\>//g;
1223            $cell=[$url, $link];
1224        }
1225        elsif ($cell =~ /<input type/)
1226        {
1227            if ($cell =~ /value='(.*?)'/) {$cell = $1}
1228            elsif ($cell =~ /value="(.*?)"/) {$cell = $1}
1229        }
1230        else
1231        {
1232            # this is all the html that I don't know what to do with, like <input type=...>
1233            $cell =~ s/\<.*?\>//g;
1234        }
1235        return $cell;
1236    }
1237    
1238    =head1 rand
1239    
1240    Randomize an array using the fisher-yates shuffle described in the perl cookbook.
1241    
1242    =cut
1243    
1244    sub rand {
1245      my ($self, $array) = @_;
1246      my $i;
1247      for ($i = @$array; --$i; ) {
1248       my $j = int rand ($i+1);
1249       next if $i == $j;
1250       @$array[$i,$j] = @$array[$j,$i];
1251      }
1252      return $array;
1253    }
1254    
1255    
1256  1;  1;
1257    

Legend:
Removed from v.1.11  
changed lines
  Added in v.1.44

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3