[Bio] / FigKernelPackages / raelib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/raelib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.7, Sun Apr 3 02:15:15 2005 UTC revision 1.26, Fri Aug 5 16:22:53 2005 UTC
# Line 2  Line 2 
2    
3  =pod  =pod
4    
5  =head1  =head1 RAE Library
6    
7   Some routines and things that Rob uses. Please feel free to use at will and incorporate into   Some routines and things that Rob uses. Please feel free to use at will and incorporate into
8   your own code or move them into FIG.pm or elsewhere.   your own code or move them into FIG.pm or elsewhere.
# Line 11  Line 11 
11    
12  package raelib;  package raelib;
13  use strict;  use strict;
14    use Bio::SeqIO;
15    use Bio::Seq;
16    use Bio::SeqFeature::Generic;
17  use FIG;  use FIG;
18  my $fig=new FIG;  my $fig=new FIG;
19    
# Line 70  Line 73 
73    
74  =head2 pirsfcorrespondence  =head2 pirsfcorrespondence
75    
76   Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written.  Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written. This is so that we can easily go back and forth.
  This is so that we can easily go back and forth.  
77    
78   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat
79    
80    This method takes three arguments:
81       from    : pirsfinfo.dat file
82       to      : file to write information to
83       verbose : report on progress
84    
85    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
86    
87    Returns the number of lines in the pirsinfo file that were read.
88    
89  =cut  =cut
90    
91  sub pirsfcorrespondence {  sub pirsfcorrespondence {
92   my ($self, $from, $to)=@_;   my ($self, $from, $to, $verbose)=@_;
93   die "File $from does not exist as called in $0" unless (-e $from);   unless (-e $from) {
94      print STDERR "File $from does not exist as called in $0\n";
95      return 0;
96     }
97     if ($from =~ /\.gz$/) {
98      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
99     }
100     else {
101   open (IN, $from) || die "Can't open $from";   open (IN, $from) || die "Can't open $from";
102   open (OUT, ">$to") || die "Can't write tot $to";   }
103     open (OUT, ">$to") || die "Can't write to $to";
104     my $linecount;
105   while (<IN>) {   while (<IN>) {
106      $linecount++;
107      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
108    if (/^>/) {print OUT; next}    if (/^>/) {print OUT; next}
109    chomp;    chomp;
110    my $done;    foreach my $peg ($self->swiss_pir_ids($_)) {
   foreach my $peg ($fig->by_alias("uni|$_")) {  
111     print OUT $_, "\t", $peg, "\n";     print OUT $_, "\t", $peg, "\n";
    $done=1;  
112    }    }
   unless ($done) {print OUT $_, "\t\n"}  
113   }   }
114   close IN;   close IN;
115   close OUT;   close OUT;
116     return $linecount;
117    }
118    
119    =head2 uniprotcorrespondence
120    
121    Generate a correspondence table between uniprot knowledge base IDs and FIG ID's.
122    
123    The uniprot KB file is in the form:  UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name
124    
125     This method takes three arguments:
126       from    : uniprotKB file
127       to      : file to write information to
128       verbose : report on progress
129    
130    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
131    
132     Returns the number of lines in the uniprotkb file that were read.
133    
134    =cut
135    
136    sub uniprotcorrespondence {
137     my ($self, $from, $to, $verbose)=@_;
138     unless (-e $from) {
139      print STDERR "File $from does not exist as called in $0\n";
140      return 0;
141     }
142     if ($from =~ /\.gz$/) {
143      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
144     }
145     else {
146      open (IN, $from) || die "Can't open $from";
147     }
148     open (OUT, ">$to") || die "Can't write to $to";
149     my $linecount;
150     while (<IN>) {
151      chomp;
152      $linecount++;
153      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
154      my @line=split /\s+\|\s+/;
155      my $added;
156      foreach my $peg ($self->swiss_pir_ids($line[0])) {
157       print OUT "$_ | $peg\n";
158       $added=1;
159      }
160      unless ($added) {print OUT "$_\n"}
161     }
162     close IN;
163     close OUT;
164     return $linecount;
165    }
166    
167    =head2 prositecorrespondence
168    
169    Generate a correspondence table between prosite and seed using sp id's and seed ids.
170    
171    The SwissProt prosite file is from ftp://ca.expasy.org/databases/prosite/release_with_updates/prosite.dat and is in horrible swiss prot format, so we'll parse out those things that we need and put them in the file
172    
173    The output file will have the following columns:
174    
175    prosite family accession number, prosite family name, family type, swiss-prot protein id, fig protein id.
176    
177    The family type is one of rule, pattern, or matrix. Right now (Prosite Release 19.2 of 24-May-2005) there are 4 rules, 1322 patterns, and 521 matrices.
178    
179     This method takes three arguments:
180       from    : prosite file
181       to      : file to write information to
182       verbose : report on progress
183    
184    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
185    
186     Returns the number of lines in the prosite file that were read.
187    
188    =cut
189    
190    sub prositecorrespondence {
191     my ($self, $from, $to, $verbose)=@_;
192     unless (-e $from) {
193      print STDERR "File $from does not exist as called in $0\n";
194      return 0;
195     }
196     if ($from =~ /\.gz$/) {
197      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
198     }
199     else {
200      open (IN, $from) || die "Can't open $from";
201     }
202     open (OUT, ">$to") || die "Can't write to $to";
203     my $linecount;
204     my ($famac, $famname, $famtype)=('','','');
205     while (<IN>) {
206      chomp;
207      $linecount++;
208      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
209      if (m#//#) {($famac, $famname, $famtype)=('','',''); next}
210      elsif (m/^ID\s*(.*?);\s*(\S+)/) {($famname, $famtype)=($1, $2); next}
211      elsif (m/^AC\s*(\S+)/) {$famac=$1; $famac =~ s/\;\s*$//; next}
212      next unless (m/^DR/); # ignore all the other crap in the prosite file for now. Note we might, at some point, want to grab all that, but that is for another time.
213      #
214      # this is the format of the DR lines:
215      # DR   P11460, FATB_VIBAN , T; P40409, FEUA_BACSU , T; P37580, FHUD_BACSU , T;
216      s/^DR\s*//;
217      foreach my $piece (split /\s*\;\s*/, $_) {
218       my ($acc, $nam, $unk)=split /\s*\,\s*/, $piece;
219       foreach my $fig ($self->swiss_pir_ids($acc)) {
220        print OUT join "\t", ($famac, $famname, $famtype, $acc, $fig), "\n";
221       }
222      }
223     }
224  }  }
225    
226    =head2 swiss_pir_ids()
227    
228    SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's, and return an array of FIG id's that match the ID.
229    
230    =cut
231    
232    sub swiss_pir_ids {
233     my ($self, $id)=@_;
234     return () unless ($id);
235     $id =~ s/^\s+//; $id =~ s/\s+$//; # trim off the whitespace
236    
237     my @return=($fig->by_alias("uni|$id"));
238     return @return if ($return[0]);
239    
240     @return=($fig->by_alias("tr|$id"));
241     return @return if ($return[0]);
242    
243     @return=($fig->by_alias("sp|$id"));
244     return @return if ($return[0]);
245    
246     return ();
247    }
248    
249  =head2 ss_by_id  =head2 ss_by_id
250    
# Line 173  Line 322 
322   my @return;   my @return;
323   my @attr=$fig->feature_attributes($peg);   my @attr=$fig->feature_attributes($peg);
324   foreach my $attr (@attr) {   foreach my $attr (@attr) {
325    my ($gottag, $val, $link)=@$attr;    my ($gotpeg, $gottag, $val, $link)=@$attr;
326    push @return, $val if ($gottag eq $tag);    push @return, $val if ($gottag eq $tag);
327   }   }
328   return wantarray ? @return : join "; ", @return;   return wantarray ? @return : join "; ", @return;
# Line 230  Line 379 
379    
380  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.  This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.
381    
382  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple);  use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple, $default);
383    
384  multiple selections will only be set if $multiple is true  multiple selections will only be set if $multiple is true
385    
386    default will set a default to override (maybe) korgs
387    
388  =cut  =cut
389    
390  sub scrolling_org_list {  sub scrolling_org_list {
391   my ($self, $cgi, $multiple)=@_;   my ($self, $cgi, $multiple, $default)=@_;
392   unless ($multiple) {$multiple=0}   unless ($multiple) {$multiple=0}
393    
394   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );   my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );
# Line 303  Line 454 
454                                          -labels   => $label,                                          -labels   => $label,
455                                          -size     => 10,                                          -size     => 10,
456                                          -multiple => $multiple,                                          -multiple => $multiple,
457                                            -default  => $default,
458                                        ), $cgi->br,                                        ), $cgi->br,
459                    "$n_genomes genomes shown ",                    "$n_genomes genomes shown ",
460                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,                    $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,
# Line 313  Line 465 
465                    "</TD>",                    "</TD>",
466                    "   </TR>\n",                    "   </TR>\n",
467                    "</TABLE>\n",                    "</TABLE>\n",
                   $cgi->hr  
468          );          );
469  }  }
470    
471    
472    =head2 scrolling_subsys_list
473    
474    Create a scrolling list of all subsystems. Just like scrolling_org_list, this will make the list and allow you to select multiples.
475    
476    use like this
477    
478    push @$html, $raelib->scrolling_subsys_list($cgi, $multiple);
479    
480    =cut
481    
482    sub scrolling_subsys_list {
483     my ($self, $cgi, $multiple)=@_;
484     $multiple=0 unless (defined $multiple);
485     my @ss=sort {uc($a) cmp uc($b)} $fig->all_subsystems();
486     my $label;
487     # generate labels for the list
488     foreach my $s (@ss) {my $k=$s; $k =~ s/\_/ /g; $k =~ s/  / /g; $k =~ s/\s+$//; $label->{$s}=$k}
489     return $cgi->scrolling_list(
490      -name    => 'subsystems',
491      -values  => \@ss,
492      -labels  => $label,
493      -size    => 10,
494      -multiple=> $multiple,
495     );
496    }
497    
498    =head2 subsys_names_for_display
499    
500    Return a list of subsystem names for display. This will take a list as an argument and return a nice clean list for display.
501    
502    $raelib->subsys_names_for_display(@ss);
503    or
504    $raelib->subsys_names_for_display($fig->all_subsystems());
505    
506    =cut
507    
508    sub subsys_names_for_display {
509     my ($self, @ss)=@_;
510     foreach (@ss) {s/\_/ /g; 1 while (s/  / /g); s/\s+$//}
511     return @ss;
512    }
513    
514    =head2 GenBank
515    
516     This object will take a genome number and return a Bio::Seq::RichSeq object that has the whole genome
517     in GenBank format. This should be a nice way of getting some data out, but will probably be quite slow
518     at building the object.
519    
520     Note that you need to call this with the genome name and the contig. This will then go through that contig.
521    
522     Something like this should work
523    
524     foreach my $contig ($fig->all_contigs($genome)) {
525      my $seqobj=FIGRob->GenBank($genome, $contig);
526      # process the contig
527     }
528    
529    =cut
530    
531    sub GenBank {
532     my ($self, $genome, $contig)=@_;
533     my $gs=$fig->genus_species($genome);
534     return unless ($gs);
535     unless ($contig) {
536      print STDERR "You didn't provide a contig for $gs. I think that was a mistake. Sorry\n";
537      return;
538     }
539     my $len=$fig->contig_ln($genome, $contig);
540     unless ($len) {
541      print STDERR "$contig from $gs doesn't appear to have a length. Is it right?\n";
542      return;
543     }
544    
545    
546     # first find all the pegs ...
547     my $features; # all the features in the genome
548     my $allpegs; # all the pegs
549     my $translation; # all the protein sequences
550     foreach my $peg ($fig->pegs_of($genome)) {
551      my @location=$fig->feature_location($peg);
552      my $func=$fig->function_of($peg);
553      foreach my $loc (@location) {
554       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
555       my ($cg, $start, $stop)=($1, $2, $3);
556       next unless ($cg eq $contig);
557       # save this information for later
558       $features->{'peg'}->{$loc}=$func;
559       $allpegs->{'peg'}->{$loc}=$peg;
560       $translation->{'peg'}->{$loc}=$fig->get_translation($peg);
561      }
562     }
563     # ... and all the RNAs
564     foreach my $peg ($fig->rnas_of($genome)) {
565      my @location=$fig->feature_location($peg);
566      my $func=$fig->function_of($peg);
567      foreach my $loc (@location) {
568       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
569       my ($cg, $start, $stop)=($1, $2, $3);
570       next unless ($cg eq $contig);
571       # save this information for later
572       $features->{'rna'}->{$loc}=$func;
573       $allpegs->{'rna'}->{$loc}=$peg;
574      }
575     }
576    
577    
578     # now get all the contigs out
579     my $seq=$fig->dna_seq($genome, $contig."_1_".$len);
580     my $description = "Contig $contig from " . $fig->genus_species($genome);
581     my $sobj=Bio::Seq->new(
582              -seq              =>  $seq,
583              -id               =>  $contig,
584              -desc             =>  $description,
585              -accession_number =>  $genome
586              );
587     foreach my $prot (keys %{$features->{'peg'}}) {
588       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
589       my ($cg, $start, $stop)=($1, $2, $3);
590       my $strand=1;
591       if ($stop < $start) {
592        ($stop, $start)=($start, $stop);
593        $strand=-1;
594     }
595    
596     my $feat=Bio::SeqFeature::Generic->new(
597            -start         =>  $start,
598            -end           =>  $stop,
599            -strand        =>  $strand,
600            -primary       =>  'CDS',
601            -display_name  =>  $allpegs->{'peg'}->{$prot},
602            -source_tag    =>  'the SEED',
603            -tag           =>
604                           {
605                           db_xref     =>   $allpegs->{'peg'}->{$prot},
606                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
607                           function    =>  $features->{'peg'}->{$prot},
608                           translation =>  $translation->{'peg'}->{$prot}
609                          }
610           );
611    
612       $sobj->add_SeqFeature($feat);
613     }
614    
615     foreach my $prot (keys %{$features->{'rna'}}) {
616       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
617       my ($cg, $start, $stop)=($1, $2, $3);
618       my $strand=1;
619       if ($stop < $start) {
620        ($stop, $start)=($start, $stop);
621        $strand=-1;
622       }
623    
624       my $feat=Bio::SeqFeature::Generic->new(
625            -start         =>  $start,
626            -end           =>  $stop,
627            -strand        =>  $strand,
628            -primary       =>  'RNA',
629            -source_tag    =>  'the SEED',
630            -display_name  =>  $allpegs->{'rna'}->{$prot},
631            -tag           =>
632                          {
633                           db_xref     =>   $allpegs->{'rna'}->{$prot},
634                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
635                           function    =>  $features->{'rna'}->{$prot},
636                          }
637           );
638    
639      $sobj->add_SeqFeature($feat);
640     }
641     return $sobj;
642    }
643    
644    =head2 best_hit
645    
646     Returns the FIG id of the single best hit to a peg
647    
648     eg
649    
650     my $bh=$fr->best_hit($peg);
651     print 'function is ', scalar $fig->function_of($bh);
652    
653    =cut
654    
655    sub best_hit {
656     my ($self, $peg)=@_;
657     return unless ($peg);
658    
659     my ($maxN, $maxP)=(1, 1e-5);
660     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
661     return ${$sims[0]}[1];
662    }
663    
664    
665    =head1 read_fasta
666    
667    Read a fasta format file and return a reference to a hash with the data. The key is the ID and the value is the sequence. If you supply the optional keep comments then the comments (anything after the first white space are returned as a sepaarte hash).
668    
669    Usage:
670    my $fasta=$raelib->read_fasta($file);
671    my ($fasta, $comments)=$raelib->read_fasta($file, 1);
672    
673    =cut
674    
675    sub read_fasta {
676     my ($self, $file, $keepcomments)=@_;
677     open (IN, $file) || die "Can't open $file";
678     my %f; my $t; my $s; my %c;
679     while (<IN>) {
680      chomp;
681      if (/^>/) {
682       if ($s) {
683        $f{$t}=$s;
684        undef $s;
685       }
686       s/^>(\S+)\s*//;
687       $t=$1;
688       $c{$t}=$_ if ($_);
689      }
690      else {$s .= $_}
691     }
692     $f{$t}=$s;
693     if ($keepcomments) {return (\%f, \%c)}
694     else {return \%f}
695    }
696    
697    =head1 rc
698    
699    Reverse complement. It's too easy.
700    
701    =cut
702    
703    sub rc {
704     my ($self, $seq)=@_;
705     $seq=~tr/GATCgatc/CTAGctag/;
706     $seq = reverse $seq;
707     return $seq;
708    }
709    
710  1;  1;
711    

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.26

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3