[Bio] / FigKernelPackages / raelib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/raelib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.13, Wed May 4 03:07:23 2005 UTC revision 1.23, Mon Jul 11 21:25:03 2005 UTC
# Line 11  Line 11 
11    
12  package raelib;  package raelib;
13  use strict;  use strict;
14    use Bio::SeqIO;
15    use Bio::Seq;
16    use Bio::SeqFeature::Generic;
17  use FIG;  use FIG;
18  my $fig=new FIG;  my $fig=new FIG;
19    
# Line 70  Line 73 
73    
74  =head2 pirsfcorrespondence  =head2 pirsfcorrespondence
75    
76   Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written.  Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written. This is so that we can easily go back and forth.
  This is so that we can easily go back and forth.  
77    
78   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat
79    
# Line 80  Line 82 
82     to      : file to write information to     to      : file to write information to
83     verbose : report on progress     verbose : report on progress
84    
85    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
86    
87   Returns the number of lines in the pirsinfo file that were read.   Returns the number of lines in the pirsinfo file that were read.
88    
89  =cut  =cut
# Line 90  Line 94 
94    print STDERR "File $from does not exist as called in $0\n";    print STDERR "File $from does not exist as called in $0\n";
95    return 0;    return 0;
96   }   }
97     if ($from =~ /\.gz$/) {
98      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
99     }
100     else {
101   open (IN, $from) || die "Can't open $from";   open (IN, $from) || die "Can't open $from";
102     }
103   open (OUT, ">$to") || die "Can't write to $to";   open (OUT, ">$to") || die "Can't write to $to";
104   my $linecount;   my $linecount;
105   while (<IN>) {   while (<IN>) {
106    $linecount++;    $linecount++;
107    unless ($linecount % 10000) {print STDERR "Correspondence of $linecount lines calculated\n"}    if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
108    if (/^>/) {print OUT; next}    if (/^>/) {print OUT; next}
109    chomp;    chomp;
110    my $done;    foreach my $peg ($self->swiss_pir_ids($_)) {
   foreach my $peg ($fig->by_alias("uni|$_")) {  
111     print OUT $_, "\t", $peg, "\n";     print OUT $_, "\t", $peg, "\n";
    $done=1;  
112    }    }
   unless ($done) {  
    foreach my $peg ($fig->by_alias("tr|$_")) {  
     print OUT $_, "\t", $peg, "\n";  
     $done=1;  
113     }     }
114     close IN;
115     close OUT;
116     return $linecount;
117    }    }
118    unless ($done) {  
119     foreach my $peg ($fig->by_alias("sp|$_")) {  =head2 uniprotcorrespondence
120      print OUT $_, "\t", $peg, "\n";  
121      $done=1;  Generate a correspondence table between uniprot knowledge base IDs and FIG ID's.
122    
123    The uniprot KB file is in the form:  UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name
124    
125     This method takes three arguments:
126       from    : uniprotKB file
127       to      : file to write information to
128       verbose : report on progress
129    
130    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
131    
132     Returns the number of lines in the uniprotkb file that were read.
133    
134    =cut
135    
136    sub uniprotcorrespondence {
137     my ($self, $from, $to, $verbose)=@_;
138     unless (-e $from) {
139      print STDERR "File $from does not exist as called in $0\n";
140      return 0;
141     }     }
142     if ($from =~ /\.gz$/) {
143      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
144    }    }
145    unless ($done) {print OUT $_, "\t\n"}   else {
146      open (IN, $from) || die "Can't open $from";
147     }
148     open (OUT, ">$to") || die "Can't write to $to";
149     my $linecount;
150     while (<IN>) {
151      chomp;
152      $linecount++;
153      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
154      my @line=split /\s+\|\s+/;
155      my $added;
156      foreach my $peg ($self->swiss_pir_ids($line[0])) {
157       print OUT "$_ | $peg\n";
158       $added=1;
159      }
160      unless ($added) {print OUT "$_\n"}
161   }   }
162   close IN;   close IN;
163   close OUT;   close OUT;
164   return $linecount;   return $linecount;
165  }  }
166    
167    =head2 prositecorrespondence
168    
169    Generate a correspondence table between prosite and seed using sp id's and seed ids.
170    
171    The SwissProt prosite file is from ftp://ca.expasy.org/databases/prosite/release_with_updates/prosite.dat and is in horrible swiss prot format, so we'll parse out those things that we need and put them in the file
172    
173    The output file will have the following columns:
174    
175    prosite family accession number, prosite family name, family type, swiss-prot protein id, fig protein id.
176    
177    The family type is one of rule, pattern, or matrix. Right now (Prosite Release 19.2 of 24-May-2005) there are 4 rules, 1322 patterns, and 521 matrices.
178    
179     This method takes three arguments:
180       from    : prosite file
181       to      : file to write information to
182       verbose : report on progress
183    
184    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
185    
186     Returns the number of lines in the prosite file that were read.
187    
188    =cut
189    
190    sub prositecorrespondence {
191     my ($self, $from, $to, $verbose)=@_;
192     unless (-e $from) {
193      print STDERR "File $from does not exist as called in $0\n";
194      return 0;
195     }
196     if ($from =~ /\.gz$/) {
197      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
198     }
199     else {
200      open (IN, $from) || die "Can't open $from";
201     }
202     open (OUT, ">$to") || die "Can't write to $to";
203     my $linecount;
204     my ($famac, $famname, $famtype)=('','','');
205     while (<IN>) {
206      chomp;
207      $linecount++;
208      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
209      if (m#//#) {($famac, $famname, $famtype)=('','',''); next}
210      elsif (m/^ID\s*(.*?);\s*(\S+)/) {($famname, $famtype)=($1, $2); next}
211      elsif (m/^AC\s*(\S+)/) {$famac=$1; $famac =~ s/\;\s*$//; next}
212      next unless (m/^DR/); # ignore all the other crap in the prosite file for now. Note we might, at some point, want to grab all that, but that is for another time.
213      #
214      # this is the format of the DR lines:
215      # DR   P11460, FATB_VIBAN , T; P40409, FEUA_BACSU , T; P37580, FHUD_BACSU , T;
216      s/^DR\s*//;
217      foreach my $piece (split /\s*\;\s*/, $_) {
218       my ($acc, $nam, $unk)=split /\s*\,\s*/, $piece;
219       foreach my $fig ($self->swiss_pir_ids($acc)) {
220        print OUT join "\t", ($famac, $famname, $famtype, $acc, $fig), "\n";
221       }
222      }
223     }
224    }
225    
226    =head2 swiss_pir_ids()
227    
228    SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's, and return an array of FIG id's that match the ID.
229    
230    =cut
231    
232    sub swiss_pir_ids {
233     my ($self, $id)=@_;
234     return () unless ($id);
235     $id =~ s/^\s+//; $id =~ s/\s+$//; # trim off the whitespace
236    
237     my @return=($fig->by_alias("uni|$id"));
238     return @return if ($return[0]);
239    
240     @return=($fig->by_alias("tr|$id"));
241     return @return if ($return[0]);
242    
243     @return=($fig->by_alias("sp|$id"));
244     return @return if ($return[0]);
245    
246     return ();
247    }
248    
249  =head2 ss_by_id  =head2 ss_by_id
250    
# Line 339  Line 462 
462                    "</TD>",                    "</TD>",
463                    "   </TR>\n",                    "   </TR>\n",
464                    "</TABLE>\n",                    "</TABLE>\n",
                   $cgi->hr  
465          );          );
466  }  }
467    
468    
469    =head2 scrolling_subsys_list
470    
471    Create a scrolling list of all subsystems. Just like scrolling_org_list, this will make the list and allow you to select multiples.
472    
473    use like this
474    
475    push @$html, $raelib->scrolling_subsys_list($cgi, $multiple);
476    
477    =cut
478    
479    sub scrolling_subsys_list {
480     my ($self, $cgi, $multiple)=@_;
481     $multiple=0 unless (defined $multiple);
482     my @ss=sort {uc($a) cmp uc($b)} $fig->all_subsystems();
483     my $label;
484     # generate labels for the list
485     foreach my $s (@ss) {my $k=$s; $k =~ s/\_/ /g; $k =~ s/  / /g; $k =~ s/\s+$//; $label->{$s}=$k}
486     return $cgi->scrolling_list(
487      -name    => 'subsystems',
488      -values  => \@ss,
489      -labels  => $label,
490      -size    => 10,
491      -multiple=> $multiple,
492     );
493    }
494    
495    =head2 subsys_names_for_display
496    
497    Return a list of subsystem names for display. This will take a list as an argument and return a nice clean list for display.
498    
499    $raelib->subsys_names_for_display(@ss);
500    or
501    $raelib->subsys_names_for_display($fig->all_subsystems());
502    
503    =cut
504    
505    sub subsys_names_for_display {
506     my ($self, @ss)=@_;
507     foreach (@ss) {s/\_/ /g; 1 while (s/  / /g); s/\s+$//}
508     return @ss;
509    }
510    
511    =head2 GenBank
512    
513     This object will take a genome number and return a Bio::Seq::RichSeq object that has the whole genome
514     in GenBank format. This should be a nice way of getting some data out, but will probably be quite slow
515     at building the object.
516    
517     Note that you need to call this with the genome name and the contig. This will then go through that contig.
518    
519     Something like this should work
520    
521     foreach my $contig ($fig->all_contigs($genome)) {
522      my $seqobj=FIGRob->GenBank($genome, $contig);
523      # process the contig
524     }
525    
526    =cut
527    
528    sub GenBank {
529     my ($self, $genome, $contig)=@_;
530     my $gs=$fig->genus_species($genome);
531     return unless ($gs);
532     unless ($contig) {
533      print STDERR "You didn't provide a contig for $gs. I think that was a mistake. Sorry\n";
534      return;
535     }
536     my $len=$fig->contig_ln($genome, $contig);
537     unless ($len) {
538      print STDERR "$contig from $gs doesn't appear to have a length. Is it right?\n";
539      return;
540     }
541    
542    
543     # first find all the pegs ...
544     my $features; # all the features in the genome
545     my $allpegs; # all the pegs
546     my $translation; # all the protein sequences
547     foreach my $peg ($fig->pegs_of($genome)) {
548      my @location=$fig->feature_location($peg);
549      my $func=$fig->function_of($peg);
550      foreach my $loc (@location) {
551       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
552       my ($cg, $start, $stop)=($1, $2, $3);
553       next unless ($cg eq $contig);
554       # save this information for later
555       $features->{'peg'}->{$loc}=$func;
556       $allpegs->{'peg'}->{$loc}=$peg;
557       $translation->{'peg'}->{$loc}=$fig->get_translation($peg);
558      }
559     }
560     # ... and all the RNAs
561     foreach my $peg ($fig->rnas_of($genome)) {
562      my @location=$fig->feature_location($peg);
563      my $func=$fig->function_of($peg);
564      foreach my $loc (@location) {
565       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
566       my ($cg, $start, $stop)=($1, $2, $3);
567       next unless ($cg eq $contig);
568       # save this information for later
569       $features->{'rna'}->{$loc}=$func;
570       $allpegs->{'rna'}->{$loc}=$peg;
571      }
572     }
573    
574    
575     # now get all the contigs out
576     my $seq=$fig->dna_seq($genome, $contig."_1_".$len);
577     my $description = "Contig $contig from " . $fig->genus_species($genome);
578     my $sobj=Bio::Seq->new(
579              -seq              =>  $seq,
580              -id               =>  $contig,
581              -desc             =>  $description,
582              -accession_number =>  $genome
583              );
584     foreach my $prot (keys %{$features->{'peg'}}) {
585       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
586       my ($cg, $start, $stop)=($1, $2, $3);
587       my $strand=1;
588       if ($stop < $start) {
589        ($stop, $start)=($start, $stop);
590        $strand=-1;
591     }
592    
593     my $feat=Bio::SeqFeature::Generic->new(
594            -start         =>  $start,
595            -end           =>  $stop,
596            -strand        =>  $strand,
597            -primary       =>  'CDS',
598            -display_name  =>  $allpegs->{'peg'}->{$prot},
599            -source_tag    =>  'the SEED',
600            -tag           =>
601                           {
602                           db_xref     =>   $allpegs->{'peg'}->{$prot},
603                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
604                           function    =>  $features->{'peg'}->{$prot},
605                           translation =>  $translation->{'peg'}->{$prot}
606                          }
607           );
608    
609       $sobj->add_SeqFeature($feat);
610     }
611    
612     foreach my $prot (keys %{$features->{'rna'}}) {
613       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
614       my ($cg, $start, $stop)=($1, $2, $3);
615       my $strand=1;
616       if ($stop < $start) {
617        ($stop, $start)=($start, $stop);
618        $strand=-1;
619       }
620    
621       my $feat=Bio::SeqFeature::Generic->new(
622            -start         =>  $start,
623            -end           =>  $stop,
624            -strand        =>  $strand,
625            -primary       =>  'RNA',
626            -source_tag    =>  'the SEED',
627            -display_name  =>  $allpegs->{'rna'}->{$prot},
628            -tag           =>
629                          {
630                           db_xref     =>   $allpegs->{'rna'}->{$prot},
631                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
632                           function    =>  $features->{'rna'}->{$prot},
633                          }
634           );
635    
636      $sobj->add_SeqFeature($feat);
637     }
638     return $sobj;
639    }
640    
641    =head2 best_hit
642    
643     Returns the FIG id of the single best hit to a peg
644    
645     eg
646    
647     my $bh=$fr->best_hit($peg);
648     print 'function is ', scalar $fig->function_of($bh);
649    
650    =cut
651    
652    sub best_hit {
653     my ($self, $peg)=@_;
654     return unless ($peg);
655    
656     my ($maxN, $maxP)=(1, 1e-5);
657     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
658     return ${$sims[0]}[1];
659    }
660    
661    
662    =head1 read_fasta
663    
664    Read a fasta format file and return a reference to a hash with the data. The key is the ID and the value is the sequence. If you supply the optional keep comments then the comments (anything after the first white space are returned as a sepaarte hash).
665    
666    Usage:
667    my $fasta=$raelib->read_fasta($file);
668    my ($fasta, $comments)=$raelib->read_fasta($file, 1);
669    
670    =cut
671    
672    sub read_fasta {
673     my ($self, $file, $keepcomments)=@_;
674     open (IN, $file) || die "Can't open $file";
675     my %f; my $t; my $s; my %c;
676     while (<IN>) {
677      chomp;
678      if (/^>/) {
679       if ($s) {
680        $f{$t}=$s;
681        undef $s;
682       }
683       s/^>(\S+)\s*//;
684       $t=$1;
685       $c{$t}=$_ if ($_);
686      }
687      else {$s .= $_}
688     }
689     $f{$t}=$s;
690     if ($keepcomments) {return (\%f, \%c)}
691     else {return \%f}
692    }
693    
694    =head1 rc
695    
696    Reverse complement. It's too easy.
697    
698    =cut
699    
700    sub rc {
701     my ($self, $seq)=@_;
702     $seq=~tr/GATCgatc/CTAGctag/;
703     $seq = reverse $seq;
704     return $seq;
705    }
706    
707  1;  1;
708    

Legend:
Removed from v.1.13  
changed lines
  Added in v.1.23

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3