[Bio] / FigKernelPackages / raelib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/raelib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.7, Sun Apr 3 02:15:15 2005 UTC revision 1.28, Mon Dec 5 19:06:30 2005 UTC
# Line 1  Line 1 
1    #
2    # Copyright (c) 2003-2006 University of Chicago and Fellowship
3    # for Interpretations of Genomes. All Rights Reserved.
4    #
5    # This file is part of the SEED Toolkit.
6    #
7    # The SEED Toolkit is free software. You can redistribute
8    # it and/or modify it under the terms of the SEED Toolkit
9    # Public License.
10    #
11    # You should have received a copy of the SEED Toolkit Public License
12    # along with this program; if not write to the University of Chicago
13    # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14    # Genomes at veronika@thefig.info or download a copy from
15    # http://www.theseed.org/LICENSE.TXT.
16    #
17    
18  # -*- perl -*-  # -*- perl -*-
19    
20  =pod  =pod
21    
22  =head1  =head1 RAE Library
23    
24   Some routines and things that Rob uses. Please feel free to use at will and incorporate into   Some routines and things that Rob uses. Please feel free to use at will and incorporate into
25   your own code or move them into FIG.pm or elsewhere.   your own code or move them into FIG.pm or elsewhere.
# Line 11  Line 28 
28    
29  package raelib;  package raelib;
30  use strict;  use strict;
31    use Bio::SeqIO;
32    use Bio::Seq;
33    use Bio::SeqFeature::Generic;
34  use FIG;  use FIG;
35  my $fig=new FIG;  my $fig=new FIG;
36    
# Line 21  Line 41 
41  =cut  =cut
42    
43  sub new {  sub new {
44   my $self=shift;   my ($class)=@_;
45   return $self   my $self={};
46     return bless $self, $class;
47  }  }
48    
49    
# Line 70  Line 91 
91    
92  =head2 pirsfcorrespondence  =head2 pirsfcorrespondence
93    
94   Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written.  Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written. This is so that we can easily go back and forth.
  This is so that we can easily go back and forth.  
95    
96   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat
97    
98    This method takes three arguments:
99       from    : pirsfinfo.dat file
100       to      : file to write information to
101       verbose : report on progress
102    
103    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
104    
105    Returns the number of lines in the pirsinfo file that were read.
106    
107  =cut  =cut
108    
109  sub pirsfcorrespondence {  sub pirsfcorrespondence {
110   my ($self, $from, $to)=@_;   my ($self, $from, $to, $verbose)=@_;
111   die "File $from does not exist as called in $0" unless (-e $from);   unless (-e $from) {
112      print STDERR "File $from does not exist as called in $0\n";
113      return 0;
114     }
115     if ($from =~ /\.gz$/) {
116      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
117     }
118     else {
119   open (IN, $from) || die "Can't open $from";   open (IN, $from) || die "Can't open $from";
120   open (OUT, ">$to") || die "Can't write tot $to";   }
121     open (OUT, ">$to") || die "Can't write to $to";
122     my $linecount;
123   while (<IN>) {   while (<IN>) {
124      $linecount++;
125      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
126    if (/^>/) {print OUT; next}    if (/^>/) {print OUT; next}
127    chomp;    chomp;
128    my $done;    foreach my $peg ($self->swiss_pir_ids($_)) {
   foreach my $peg ($fig->by_alias("uni|$_")) {  
129     print OUT $_, "\t", $peg, "\n";     print OUT $_, "\t", $peg, "\n";
    $done=1;  
130    }    }
   unless ($done) {print OUT $_, "\t\n"}  
131   }   }
132   close IN;   close IN;
133   close OUT;   close OUT;
134     return $linecount;
135    }
136    
137    =head2 uniprotcorrespondence
138    
139    Generate a correspondence table between uniprot knowledge base IDs and FIG ID's.
140    
141    The uniprot KB file is in the form:  UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name
142    
143     This method takes three arguments:
144       from    : uniprotKB file
145       to      : file to write information to
146       verbose : report on progress
147    
148    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
149    
150     Returns the number of lines in the uniprotkb file that were read.
151    
152    =cut
153    
154    sub uniprotcorrespondence {
155     my ($self, $from, $to, $verbose)=@_;
156     unless (-e $from) {
157      print STDERR "File $from does not exist as called in $0\n";
158      return 0;
159     }
160     if ($from =~ /\.gz$/) {
161      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
162     }
163     else {
164      open (IN, $from) || die "Can't open $from";
165     }
166     open (OUT, ">$to") || die "Can't write to $to";
167     my $linecount;
168     while (<IN>) {
169      chomp;
170      $linecount++;
171      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
172      my @line=split /\s+\|\s+/;
173      my $added;
174      foreach my $peg ($self->swiss_pir_ids($line[0])) {
175       print OUT "$_ | $peg\n";
176       $added=1;
177      }
178      unless ($added) {print OUT "$_\n"}
179     }
180     close IN;
181     close OUT;
182     return $linecount;
183  }  }
184    
185    =head2 prositecorrespondence
186    
187    Generate a correspondence table between prosite and seed using sp id's and seed ids.
188    
189    The SwissProt prosite file is from ftp://ca.expasy.org/databases/prosite/release_with_updates/prosite.dat and is in horrible swiss prot format, so we'll parse out those things that we need and put them in the file
190    
191    The output file will have the following columns:
192    
193    prosite family accession number, prosite family name, family type, swiss-prot protein id, fig protein id.
194    
195    The family type is one of rule, pattern, or matrix. Right now (Prosite Release 19.2 of 24-May-2005) there are 4 rules, 1322 patterns, and 521 matrices.
196    
197     This method takes three arguments:
198       from    : prosite file
199       to      : file to write information to
200       verbose : report on progress
201    
202    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
203    
204     Returns the number of lines in the prosite file that were read.
205    
206    =cut
207    
208    sub prositecorrespondence {
209     my ($self, $from, $to, $verbose)=@_;
210     unless (-e $from) {
211      print STDERR "File $from does not exist as called in $0\n";
212      return 0;
213     }
214     if ($from =~ /\.gz$/) {
215      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
216     }
217     else {
218      open (IN, $from) || die "Can't open $from";
219     }
220     open (OUT, ">$to") || die "Can't write to $to";
221     my $linecount;
222     my ($famac, $famname, $famtype)=('','','');
223     while (<IN>) {
224      chomp;
225      $linecount++;
226      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
227      if (m#//#) {($famac, $famname, $famtype)=('','',''); next}
228      elsif (m/^ID\s*(.*?);\s*(\S+)/) {($famname, $famtype)=($1, $2); next}
229      elsif (m/^AC\s*(\S+)/) {$famac=$1; $famac =~ s/\;\s*$//; next}
230      next unless (m/^DR/); # ignore all the other crap in the prosite file for now. Note we might, at some point, want to grab all that, but that is for another time.
231      #
232      # this is the format of the DR lines:
233      # DR   P11460, FATB_VIBAN , T; P40409, FEUA_BACSU , T; P37580, FHUD_BACSU , T;
234      s/^DR\s*//;
235      foreach my $piece (split /\s*\;\s*/, $_) {
236       my ($acc, $nam, $unk)=split /\s*\,\s*/, $piece;
237       foreach my $fig ($self->swiss_pir_ids($acc)) {
238        print OUT join "\t", ($famac, $famname, $famtype, $acc, $fig), "\n";
239       }
240      }
241     }
242    }
243    
244    =head2 swiss_pir_ids()
245    
246    SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's, and return an array of FIG id's that match the ID.
247    
248    =cut
249    
250    sub swiss_pir_ids {
251     my ($self, $id)=@_;
252     return () unless ($id);
253     $id =~ s/^\s+//; $id =~ s/\s+$//; # trim off the whitespace
254    
255     my @return=($fig->by_alias("uni|$id"));
256     return @return if ($return[0]);
257    
258     @return=($fig->by_alias("tr|$id"));
259     return @return if ($return[0]);
260    
261     @return=($fig->by_alias("sp|$id"));
262     return @return if ($return[0]);
263    
264     return ();
265    }
266    
267  =head2 ss_by_id  =head2 ss_by_id
268    
# Line 173  Line 340 
340   my @return;   my @return;
341   my @attr=$fig->feature_attributes($peg);   my @attr=$fig->feature_attributes($peg);
342   foreach my $attr (@attr) {   foreach my $attr (@attr) {
343    my ($gottag, $val, $link)=@$attr;    my ($gotpeg, $gottag, $val, $link)=@$attr;
344    push @return, $val if ($gottag eq $tag);    push @return, $val if ($gottag eq $tag);
345   }   }
346   return wantarray ? @return : join "; ", @return;   return wantarray ? @return : join "; ", @return;
# Line 230  Line 397 
397    
398  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.
399    
400  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple);  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple, $default);
401    
402  multiple selections will only be set if $multiple is true  multiple selections will only be set if $multiple is true
403    
404    default will set a default to override (maybe) korgs
405    
406  =cut  =cut
407    
408  sub scrolling_org_list {  sub scrolling_org_list {
409   my ($self, $cgi, $multiple)=@_;   my ($self, $cgi, $multiple, $default)=@_;
410   unless ($multiple) {$multiple=0}   unless ($multiple) {$multiple=0}
411    
412   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );
# Line 303  Line 472 
472                                          -labels   => $label,                                          -labels   => $label,
473                                          -size     => 10,                                          -size     => 10,
474                                          -multiple => $multiple,                                          -multiple => $multiple,
475                                            -default  => $default,
476                                        ), $cgi->br,                                        ), $cgi->br,
477                    "$n_genomes genomes shown ",                    "$n_genomes genomes shown ",
478                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,
# Line 313  Line 483 
483                    "</TD>",                    "</TD>",
484                    "   </TR>\n",                    "   </TR>\n",
485                    "</TABLE>\n",                    "</TABLE>\n",
                   $cgi->hr  
486          );          );
487  }  }
488    
489    
490    =head2 scrolling_subsys_list
491    
492    Create a scrolling list of all subsystems. Just like scrolling_org_list, this will make the list and allow you to select multiples.
493    
494    use like this
495    
496    push @$html, $raelib->scrolling_subsys_list($cgi, $multiple);
497    
498    =cut
499    
500    sub scrolling_subsys_list {
501     my ($self, $cgi, $multiple)=@_;
502     $multiple=0 unless (defined $multiple);
503     my @ss=sort {uc($a) cmp uc($b)} $fig->all_subsystems();
504     my $label;
505     # generate labels for the list
506     foreach my $s (@ss) {my $k=$s; $k =~ s/\_/ /g; $k =~ s/  / /g; $k =~ s/\s+$//; $label->{$s}=$k}
507     return $cgi->scrolling_list(
508      -name    => 'subsystems',
509      -values  => \@ss,
510      -labels  => $label,
511      -size    => 10,
512      -multiple=> $multiple,
513     );
514    }
515    
516    =head2 subsys_names_for_display
517    
518    Return a list of subsystem names for display. This will take a list as an argument and return a nice clean list for display.
519    
520    $raelib->subsys_names_for_display(@ss);
521    or
522    $raelib->subsys_names_for_display($fig->all_subsystems());
523    
524    =cut
525    
526    sub subsys_names_for_display {
527     my ($self, @ss)=@_;
528     foreach (@ss) {s/\_/ /g; 1 while (s/  / /g); s/\s+$//}
529     return @ss;
530    }
531    
532    =head2 GenBank
533    
534     This object will take a genome number and return a Bio::Seq::RichSeq object that has the whole genome
535     in GenBank format. This should be a nice way of getting some data out, but will probably be quite slow
536     at building the object.
537    
538     Note that you need to call this with the genome name and the contig. This will then go through that contig.
539    
540     Something like this should work
541    
542     foreach my $contig ($fig->all_contigs($genome)) {
543      my $seqobj=FIGRob->GenBank($genome, $contig);
544      # process the contig
545     }
546    
547    =cut
548    
549    sub GenBank {
550     my ($self, $genome, $contig)=@_;
551     my $gs=$fig->genus_species($genome);
552     return unless ($gs);
553     unless ($contig) {
554      print STDERR "You didn't provide a contig for $gs. I think that was a mistake. Sorry\n";
555      return;
556     }
557     my $len=$fig->contig_ln($genome, $contig);
558     unless ($len) {
559      print STDERR "$contig from $gs doesn't appear to have a length. Is it right?\n";
560      return;
561     }
562    
563    
564     # first find all the pegs ...
565     my $features; # all the features in the genome
566     my $allpegs; # all the pegs
567     my $translation; # all the protein sequences
568     foreach my $peg ($fig->pegs_of($genome)) {
569      my @location=$fig->feature_location($peg);
570      my $func=$fig->function_of($peg);
571      foreach my $loc (@location) {
572       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
573       my ($cg, $start, $stop)=($1, $2, $3);
574       next unless ($cg eq $contig);
575       # save this information for later
576       $features->{'peg'}->{$loc}=$func;
577       $allpegs->{'peg'}->{$loc}=$peg;
578       $translation->{'peg'}->{$loc}=$fig->get_translation($peg);
579      }
580     }
581     # ... and all the RNAs
582     foreach my $peg ($fig->rnas_of($genome)) {
583      my @location=$fig->feature_location($peg);
584      my $func=$fig->function_of($peg);
585      foreach my $loc (@location) {
586       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
587       my ($cg, $start, $stop)=($1, $2, $3);
588       next unless ($cg eq $contig);
589       # save this information for later
590       $features->{'rna'}->{$loc}=$func;
591       $allpegs->{'rna'}->{$loc}=$peg;
592      }
593     }
594    
595    
596     # now get all the contigs out
597     my $seq=$fig->dna_seq($genome, $contig."_1_".$len);
598     my $description = "Contig $contig from " . $fig->genus_species($genome);
599     my $sobj=Bio::Seq->new(
600              -seq              =>  $seq,
601              -id               =>  $contig,
602              -desc             =>  $description,
603              -accession_number =>  $genome
604              );
605     foreach my $prot (keys %{$features->{'peg'}}) {
606       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
607       my ($cg, $start, $stop)=($1, $2, $3);
608       my $strand=1;
609       if ($stop < $start) {
610        ($stop, $start)=($start, $stop);
611        $strand=-1;
612     }
613    
614     my $feat=Bio::SeqFeature::Generic->new(
615            -start         =>  $start,
616            -end           =>  $stop,
617            -strand        =>  $strand,
618            -primary       =>  'CDS',
619            -display_name  =>  $allpegs->{'peg'}->{$prot},
620            -source_tag    =>  'the SEED',
621            -tag           =>
622                           {
623                           db_xref     =>   $allpegs->{'peg'}->{$prot},
624                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
625                           function    =>  $features->{'peg'}->{$prot},
626                           translation =>  $translation->{'peg'}->{$prot}
627                          }
628           );
629    
630       $sobj->add_SeqFeature($feat);
631     }
632    
633     foreach my $prot (keys %{$features->{'rna'}}) {
634       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
635       my ($cg, $start, $stop)=($1, $2, $3);
636       my $strand=1;
637       if ($stop < $start) {
638        ($stop, $start)=($start, $stop);
639        $strand=-1;
640       }
641    
642       my $feat=Bio::SeqFeature::Generic->new(
643            -start         =>  $start,
644            -end           =>  $stop,
645            -strand        =>  $strand,
646            -primary       =>  'RNA',
647            -source_tag    =>  'the SEED',
648            -display_name  =>  $allpegs->{'rna'}->{$prot},
649            -tag           =>
650                          {
651                           db_xref     =>   $allpegs->{'rna'}->{$prot},
652                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
653                           function    =>  $features->{'rna'}->{$prot},
654                          }
655           );
656    
657      $sobj->add_SeqFeature($feat);
658     }
659     return $sobj;
660    }
661    
662    =head2 best_hit
663    
664     Returns the FIG id of the single best hit to a peg
665    
666     eg
667    
668     my $bh=$fr->best_hit($peg);
669     print 'function is ', scalar $fig->function_of($bh);
670    
671    =cut
672    
673    sub best_hit {
674     my ($self, $peg)=@_;
675     return unless ($peg);
676    
677     my ($maxN, $maxP)=(1, 1e-5);
678     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
679     return ${$sims[0]}[1];
680    }
681    
682    
683    =head1 read_fasta
684    
685    Read a fasta format file and return a reference to a hash with the data. The key is the ID and the value is the sequence. If you supply the optional keep comments then the comments (anything after the first white space are returned as a sepaarte hash).
686    
687    Usage:
688    my $fasta=$raelib->read_fasta($file);
689    my ($fasta, $comments)=$raelib->read_fasta($file, 1);
690    
691    =cut
692    
693    sub read_fasta {
694     my ($self, $file, $keepcomments)=@_;
695     open (IN, $file) || die "Can't open $file";
696     my %f; my $t; my $s; my %c;
697     while (<IN>) {
698      chomp;
699      if (/^>/) {
700       if ($s) {
701        $f{$t}=$s;
702        undef $s;
703       }
704       s/^>(\S+)\s*//;
705       $t=$1;
706       $c{$t}=$_ if ($_);
707      }
708      else {$s .= $_}
709     }
710     $f{$t}=$s;
711     if ($keepcomments) {return (\%f, \%c)}
712     else {return \%f}
713    }
714    
715    =head1 rc
716    
717    Reverse complement. It's too easy.
718    
719    =cut
720    
721    sub rc {
722     my ($self, $seq)=@_;
723     $seq=~tr/GATCgatc/CTAGctag/;
724     $seq = reverse $seq;
725     return $seq;
726    }
727    
728    
729    =head2 cookies
730    
731    Handle cookies. This method will get and set the value of the FIG cookie. Cookies are name/value pairs that are stored on the users computer. We then retrieve them using this method. The cookies are passed in as a reference to a hash, and the method returns a tuple of the cookie that can be passed to the browser and a reference to a hash with the data.
732    
733    If you do not pass any arguments the whole cookie will be returned.
734    
735    Use as follows:
736    
737    ($cookie, $data) = raelib->cookie($cgi, \%data);
738    
739    You do not need to pass in any data, in that case you will just get the cookie back
740    
741    Underneath, I create a single cookie called FIG which stores all the information. The names and value pairs are stored using = to join name to value and ; to concatenate. This way we can create a single cookie with all the data. I am using the FIG::clean_attribute_key method to remove unwanted characters from the name/value pairs, so don't use them.
742    
743    Note that for the moment I have put this routine here since it needs to maintain the state of the cookie (i.e. it needs to know what $self is). It should really be in HTML.pm but that is not, as far as I can tell, maintaining states?
744    
745    =cut
746    
747    sub cookie {
748     my ($self, $cgi, $input)=@_;
749     return unless ($cgi);
750     $self->{'cookie'}=$cgi->cookie(-name=>"FIG") unless ($self->{'cookie'});
751    
752     # first, create a hash from the existing cookie data
753     my $cookie;
754     map {
755      my ($kname, $kvalue)=split /\=/, $_;
756      $cookie->{$kname}=$kvalue;
757     } split /\;/, $self->{'cookie'};
758    
759     if ($input)
760     {
761      # add the values that were passed in
762      map {$cookie->{FIG->clean_attribute_key($_)}=$input->{$_}} keys %$input;
763      # put everything back together and set the cookie
764      my $newcookie=join ";", map {$_ . "=" . $cookie->{$_}} keys %$cookie;
765      $self->{'cookie'}=$cgi->cookie(-name=>"FIG", -value=>$newcookie, -expires=>'+1y');
766     }
767    
768     return ($self->{'cookie'}, $cookie);
769    }
770    
771    
772    
773    
774  1;  1;
775    

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.28

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3