[Bio] / FigKernelPackages / raelib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/raelib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.11, Fri Apr 8 20:22:23 2005 UTC revision 1.27, Mon Oct 3 20:05:34 2005 UTC
# Line 2  Line 2 
2    
3  =pod  =pod
4    
5  =head1  =head1 RAE Library
6    
7   Some routines and things that Rob uses. Please feel free to use at will and incorporate into   Some routines and things that Rob uses. Please feel free to use at will and incorporate into
8   your own code or move them into FIG.pm or elsewhere.   your own code or move them into FIG.pm or elsewhere.
# Line 11  Line 11 
11    
12  package raelib;  package raelib;
13  use strict;  use strict;
14    use Bio::SeqIO;
15    use Bio::Seq;
16    use Bio::SeqFeature::Generic;
17  use FIG;  use FIG;
18  my $fig=new FIG;  my $fig=new FIG;
19    
# Line 21  Line 24 
24  =cut  =cut
25    
26  sub new {  sub new {
27   my $self=shift;   my ($class)=@_;
28   return $self   my $self={};
29     return bless $self, $class;
30  }  }
31    
32    
# Line 70  Line 74 
74    
75  =head2 pirsfcorrespondence  =head2 pirsfcorrespondence
76    
77   Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written.  Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written. This is so that we can easily go back and forth.
  This is so that we can easily go back and forth.  
78    
79   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat
80    
# Line 80  Line 83 
83     to      : file to write information to     to      : file to write information to
84     verbose : report on progress     verbose : report on progress
85    
86    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
87    
88   Returns the number of lines in the pirsinfo file that were read.   Returns the number of lines in the pirsinfo file that were read.
89    
90  =cut  =cut
# Line 90  Line 95 
95    print STDERR "File $from does not exist as called in $0\n";    print STDERR "File $from does not exist as called in $0\n";
96    return 0;    return 0;
97   }   }
98     if ($from =~ /\.gz$/) {
99      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
100     }
101     else {
102   open (IN, $from) || die "Can't open $from";   open (IN, $from) || die "Can't open $from";
103     }
104   open (OUT, ">$to") || die "Can't write to $to";   open (OUT, ">$to") || die "Can't write to $to";
105   my $linecount;   my $linecount;
106   while (<IN>) {   while (<IN>) {
107    $linecount++;    $linecount++;
108    unless ($linecount % 10000) {print STDERR "Correspondence of $linecount lines calculated\n"}    if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
109    if (/^>/) {print OUT; next}    if (/^>/) {print OUT; next}
110    chomp;    chomp;
111    my $done;    foreach my $peg ($self->swiss_pir_ids($_)) {
   foreach my $peg ($fig->by_alias("uni|$_")) {  
112     print OUT $_, "\t", $peg, "\n";     print OUT $_, "\t", $peg, "\n";
    $done=1;  
113    }    }
   unless ($done) {print OUT $_, "\t\n"}  
114   }   }
115   close IN;   close IN;
116   close OUT;   close OUT;
117   return $linecount;   return $linecount;
118  }  }
119    
120    =head2 uniprotcorrespondence
121    
122    Generate a correspondence table between uniprot knowledge base IDs and FIG ID's.
123    
124    The uniprot KB file is in the form:  UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name
125    
126     This method takes three arguments:
127       from    : uniprotKB file
128       to      : file to write information to
129       verbose : report on progress
130    
131    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
132    
133     Returns the number of lines in the uniprotkb file that were read.
134    
135    =cut
136    
137    sub uniprotcorrespondence {
138     my ($self, $from, $to, $verbose)=@_;
139     unless (-e $from) {
140      print STDERR "File $from does not exist as called in $0\n";
141      return 0;
142     }
143     if ($from =~ /\.gz$/) {
144      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
145     }
146     else {
147      open (IN, $from) || die "Can't open $from";
148     }
149     open (OUT, ">$to") || die "Can't write to $to";
150     my $linecount;
151     while (<IN>) {
152      chomp;
153      $linecount++;
154      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
155      my @line=split /\s+\|\s+/;
156      my $added;
157      foreach my $peg ($self->swiss_pir_ids($line[0])) {
158       print OUT "$_ | $peg\n";
159       $added=1;
160      }
161      unless ($added) {print OUT "$_\n"}
162     }
163     close IN;
164     close OUT;
165     return $linecount;
166    }
167    
168    =head2 prositecorrespondence
169    
170    Generate a correspondence table between prosite and seed using sp id's and seed ids.
171    
172    The SwissProt prosite file is from ftp://ca.expasy.org/databases/prosite/release_with_updates/prosite.dat and is in horrible swiss prot format, so we'll parse out those things that we need and put them in the file
173    
174    The output file will have the following columns:
175    
176    prosite family accession number, prosite family name, family type, swiss-prot protein id, fig protein id.
177    
178    The family type is one of rule, pattern, or matrix. Right now (Prosite Release 19.2 of 24-May-2005) there are 4 rules, 1322 patterns, and 521 matrices.
179    
180     This method takes three arguments:
181       from    : prosite file
182       to      : file to write information to
183       verbose : report on progress
184    
185    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
186    
187     Returns the number of lines in the prosite file that were read.
188    
189    =cut
190    
191    sub prositecorrespondence {
192     my ($self, $from, $to, $verbose)=@_;
193     unless (-e $from) {
194      print STDERR "File $from does not exist as called in $0\n";
195      return 0;
196     }
197     if ($from =~ /\.gz$/) {
198      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
199     }
200     else {
201      open (IN, $from) || die "Can't open $from";
202     }
203     open (OUT, ">$to") || die "Can't write to $to";
204     my $linecount;
205     my ($famac, $famname, $famtype)=('','','');
206     while (<IN>) {
207      chomp;
208      $linecount++;
209      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
210      if (m#//#) {($famac, $famname, $famtype)=('','',''); next}
211      elsif (m/^ID\s*(.*?);\s*(\S+)/) {($famname, $famtype)=($1, $2); next}
212      elsif (m/^AC\s*(\S+)/) {$famac=$1; $famac =~ s/\;\s*$//; next}
213      next unless (m/^DR/); # ignore all the other crap in the prosite file for now. Note we might, at some point, want to grab all that, but that is for another time.
214      #
215      # this is the format of the DR lines:
216      # DR   P11460, FATB_VIBAN , T; P40409, FEUA_BACSU , T; P37580, FHUD_BACSU , T;
217      s/^DR\s*//;
218      foreach my $piece (split /\s*\;\s*/, $_) {
219       my ($acc, $nam, $unk)=split /\s*\,\s*/, $piece;
220       foreach my $fig ($self->swiss_pir_ids($acc)) {
221        print OUT join "\t", ($famac, $famname, $famtype, $acc, $fig), "\n";
222       }
223      }
224     }
225    }
226    
227    =head2 swiss_pir_ids()
228    
229    SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's, and return an array of FIG id's that match the ID.
230    
231    =cut
232    
233    sub swiss_pir_ids {
234     my ($self, $id)=@_;
235     return () unless ($id);
236     $id =~ s/^\s+//; $id =~ s/\s+$//; # trim off the whitespace
237    
238     my @return=($fig->by_alias("uni|$id"));
239     return @return if ($return[0]);
240    
241     @return=($fig->by_alias("tr|$id"));
242     return @return if ($return[0]);
243    
244     @return=($fig->by_alias("sp|$id"));
245     return @return if ($return[0]);
246    
247     return ();
248    }
249    
250  =head2 ss_by_id  =head2 ss_by_id
251    
# Line 244  Line 380 
380    
381  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.
382    
383  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple);  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple, $default);
384    
385  multiple selections will only be set if $multiple is true  multiple selections will only be set if $multiple is true
386    
387    default will set a default to override (maybe) korgs
388    
389  =cut  =cut
390    
391  sub scrolling_org_list {  sub scrolling_org_list {
392   my ($self, $cgi, $multiple)=@_;   my ($self, $cgi, $multiple, $default)=@_;
393   unless ($multiple) {$multiple=0}   unless ($multiple) {$multiple=0}
394    
395   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );
# Line 317  Line 455 
455                                          -labels   => $label,                                          -labels   => $label,
456                                          -size     => 10,                                          -size     => 10,
457                                          -multiple => $multiple,                                          -multiple => $multiple,
458                                            -default  => $default,
459                                        ), $cgi->br,                                        ), $cgi->br,
460                    "$n_genomes genomes shown ",                    "$n_genomes genomes shown ",
461                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,
# Line 327  Line 466 
466                    "</TD>",                    "</TD>",
467                    "   </TR>\n",                    "   </TR>\n",
468                    "</TABLE>\n",                    "</TABLE>\n",
                   $cgi->hr  
469          );          );
470  }  }
471    
472    
473    =head2 scrolling_subsys_list
474    
475    Create a scrolling list of all subsystems. Just like scrolling_org_list, this will make the list and allow you to select multiples.
476    
477    use like this
478    
479    push @$html, $raelib->scrolling_subsys_list($cgi, $multiple);
480    
481    =cut
482    
483    sub scrolling_subsys_list {
484     my ($self, $cgi, $multiple)=@_;
485     $multiple=0 unless (defined $multiple);
486     my @ss=sort {uc($a) cmp uc($b)} $fig->all_subsystems();
487     my $label;
488     # generate labels for the list
489     foreach my $s (@ss) {my $k=$s; $k =~ s/\_/ /g; $k =~ s/  / /g; $k =~ s/\s+$//; $label->{$s}=$k}
490     return $cgi->scrolling_list(
491      -name    => 'subsystems',
492      -values  => \@ss,
493      -labels  => $label,
494      -size    => 10,
495      -multiple=> $multiple,
496     );
497    }
498    
499    =head2 subsys_names_for_display
500    
501    Return a list of subsystem names for display. This will take a list as an argument and return a nice clean list for display.
502    
503    $raelib->subsys_names_for_display(@ss);
504    or
505    $raelib->subsys_names_for_display($fig->all_subsystems());
506    
507    =cut
508    
509    sub subsys_names_for_display {
510     my ($self, @ss)=@_;
511     foreach (@ss) {s/\_/ /g; 1 while (s/  / /g); s/\s+$//}
512     return @ss;
513    }
514    
515    =head2 GenBank
516    
517     This object will take a genome number and return a Bio::Seq::RichSeq object that has the whole genome
518     in GenBank format. This should be a nice way of getting some data out, but will probably be quite slow
519     at building the object.
520    
521     Note that you need to call this with the genome name and the contig. This will then go through that contig.
522    
523     Something like this should work
524    
525     foreach my $contig ($fig->all_contigs($genome)) {
526      my $seqobj=FIGRob->GenBank($genome, $contig);
527      # process the contig
528     }
529    
530    =cut
531    
532    sub GenBank {
533     my ($self, $genome, $contig)=@_;
534     my $gs=$fig->genus_species($genome);
535     return unless ($gs);
536     unless ($contig) {
537      print STDERR "You didn't provide a contig for $gs. I think that was a mistake. Sorry\n";
538      return;
539     }
540     my $len=$fig->contig_ln($genome, $contig);
541     unless ($len) {
542      print STDERR "$contig from $gs doesn't appear to have a length. Is it right?\n";
543      return;
544     }
545    
546    
547     # first find all the pegs ...
548     my $features; # all the features in the genome
549     my $allpegs; # all the pegs
550     my $translation; # all the protein sequences
551     foreach my $peg ($fig->pegs_of($genome)) {
552      my @location=$fig->feature_location($peg);
553      my $func=$fig->function_of($peg);
554      foreach my $loc (@location) {
555       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
556       my ($cg, $start, $stop)=($1, $2, $3);
557       next unless ($cg eq $contig);
558       # save this information for later
559       $features->{'peg'}->{$loc}=$func;
560       $allpegs->{'peg'}->{$loc}=$peg;
561       $translation->{'peg'}->{$loc}=$fig->get_translation($peg);
562      }
563     }
564     # ... and all the RNAs
565     foreach my $peg ($fig->rnas_of($genome)) {
566      my @location=$fig->feature_location($peg);
567      my $func=$fig->function_of($peg);
568      foreach my $loc (@location) {
569       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
570       my ($cg, $start, $stop)=($1, $2, $3);
571       next unless ($cg eq $contig);
572       # save this information for later
573       $features->{'rna'}->{$loc}=$func;
574       $allpegs->{'rna'}->{$loc}=$peg;
575      }
576     }
577    
578    
579     # now get all the contigs out
580     my $seq=$fig->dna_seq($genome, $contig."_1_".$len);
581     my $description = "Contig $contig from " . $fig->genus_species($genome);
582     my $sobj=Bio::Seq->new(
583              -seq              =>  $seq,
584              -id               =>  $contig,
585              -desc             =>  $description,
586              -accession_number =>  $genome
587              );
588     foreach my $prot (keys %{$features->{'peg'}}) {
589       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
590       my ($cg, $start, $stop)=($1, $2, $3);
591       my $strand=1;
592       if ($stop < $start) {
593        ($stop, $start)=($start, $stop);
594        $strand=-1;
595     }
596    
597     my $feat=Bio::SeqFeature::Generic->new(
598            -start         =>  $start,
599            -end           =>  $stop,
600            -strand        =>  $strand,
601            -primary       =>  'CDS',
602            -display_name  =>  $allpegs->{'peg'}->{$prot},
603            -source_tag    =>  'the SEED',
604            -tag           =>
605                           {
606                           db_xref     =>   $allpegs->{'peg'}->{$prot},
607                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
608                           function    =>  $features->{'peg'}->{$prot},
609                           translation =>  $translation->{'peg'}->{$prot}
610                          }
611           );
612    
613       $sobj->add_SeqFeature($feat);
614     }
615    
616     foreach my $prot (keys %{$features->{'rna'}}) {
617       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
618       my ($cg, $start, $stop)=($1, $2, $3);
619       my $strand=1;
620       if ($stop < $start) {
621        ($stop, $start)=($start, $stop);
622        $strand=-1;
623       }
624    
625       my $feat=Bio::SeqFeature::Generic->new(
626            -start         =>  $start,
627            -end           =>  $stop,
628            -strand        =>  $strand,
629            -primary       =>  'RNA',
630            -source_tag    =>  'the SEED',
631            -display_name  =>  $allpegs->{'rna'}->{$prot},
632            -tag           =>
633                          {
634                           db_xref     =>   $allpegs->{'rna'}->{$prot},
635                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
636                           function    =>  $features->{'rna'}->{$prot},
637                          }
638           );
639    
640      $sobj->add_SeqFeature($feat);
641     }
642     return $sobj;
643    }
644    
645    =head2 best_hit
646    
647     Returns the FIG id of the single best hit to a peg
648    
649     eg
650    
651     my $bh=$fr->best_hit($peg);
652     print 'function is ', scalar $fig->function_of($bh);
653    
654    =cut
655    
656    sub best_hit {
657     my ($self, $peg)=@_;
658     return unless ($peg);
659    
660     my ($maxN, $maxP)=(1, 1e-5);
661     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
662     return ${$sims[0]}[1];
663    }
664    
665    
666    =head1 read_fasta
667    
668    Read a fasta format file and return a reference to a hash with the data. The key is the ID and the value is the sequence. If you supply the optional keep comments then the comments (anything after the first white space are returned as a sepaarte hash).
669    
670    Usage:
671    my $fasta=$raelib->read_fasta($file);
672    my ($fasta, $comments)=$raelib->read_fasta($file, 1);
673    
674    =cut
675    
676    sub read_fasta {
677     my ($self, $file, $keepcomments)=@_;
678     open (IN, $file) || die "Can't open $file";
679     my %f; my $t; my $s; my %c;
680     while (<IN>) {
681      chomp;
682      if (/^>/) {
683       if ($s) {
684        $f{$t}=$s;
685        undef $s;
686       }
687       s/^>(\S+)\s*//;
688       $t=$1;
689       $c{$t}=$_ if ($_);
690      }
691      else {$s .= $_}
692     }
693     $f{$t}=$s;
694     if ($keepcomments) {return (\%f, \%c)}
695     else {return \%f}
696    }
697    
698    =head1 rc
699    
700    Reverse complement. It's too easy.
701    
702    =cut
703    
704    sub rc {
705     my ($self, $seq)=@_;
706     $seq=~tr/GATCgatc/CTAGctag/;
707     $seq = reverse $seq;
708     return $seq;
709    }
710    
711    
712    =head2 cookies
713    
714    Handle cookies. This method will get and set the value of the FIG cookie. Cookies are name/value pairs that are stored on the users computer. We then retrieve them using this method. The cookies are passed in as a reference to a hash, and the method returns a tuple of the cookie that can be passed to the browser and a reference to a hash with the data.
715    
716    If you do not pass any arguments the whole cookie will be returned.
717    
718    Use as follows:
719    
720    ($cookie, $data) = raelib->cookie($cgi, \%data);
721    
722    You do not need to pass in any data, in that case you will just get the cookie back
723    
724    Underneath, I create a single cookie called FIG which stores all the information. The names and value pairs are stored using = to join name to value and ; to concatenate. This way we can create a single cookie with all the data. I am using the FIG::clean_attribute_key method to remove unwanted characters from the name/value pairs, so don't use them.
725    
726    Note that for the moment I have put this routine here since it needs to maintain the state of the cookie (i.e. it needs to know what $self is). It should really be in HTML.pm but that is not, as far as I can tell, maintaining states?
727    
728    =cut
729    
730    sub cookie {
731     my ($self, $cgi, $input)=@_;
732     return unless ($cgi);
733     $self->{'cookie'}=$cgi->cookie(-name=>"FIG") unless ($self->{'cookie'});
734    
735     # first, create a hash from the existing cookie data
736     my $cookie;
737     map {
738      my ($kname, $kvalue)=split /\=/, $_;
739      $cookie->{$kname}=$kvalue;
740     } split /\;/, $self->{'cookie'};
741    
742     if ($input)
743     {
744      # add the values that were passed in
745      map {$cookie->{FIG->clean_attribute_key($_)}=$input->{$_}} keys %$input;
746      # put everything back together and set the cookie
747      my $newcookie=join ";", map {$_ . "=" . $cookie->{$_}} keys %$cookie;
748      $self->{'cookie'}=$cgi->cookie(-name=>"FIG", -value=>$newcookie, -expires=>'+1y');
749     }
750    
751     return ($self->{'cookie'}, $cookie);
752    }
753    
754    
755    
756    
757  1;  1;
758    

Legend:
Removed from v.1.11  
changed lines
  Added in v.1.27

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3