[Bio] / FigKernelPackages / raelib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/raelib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.11, Fri Apr 8 20:22:23 2005 UTC revision 1.18, Fri Jun 10 15:36:37 2005 UTC
# Line 2  Line 2 
2    
3  =pod  =pod
4    
5  =head1  =head1 RAE Library
6    
7   Some routines and things that Rob uses. Please feel free to use at will and incorporate into   Some routines and things that Rob uses. Please feel free to use at will and incorporate into
8   your own code or move them into FIG.pm or elsewhere.   your own code or move them into FIG.pm or elsewhere.
# Line 11  Line 11 
11    
12  package raelib;  package raelib;
13  use strict;  use strict;
14    use Bio::SeqIO;
15    use Bio::Seq;
16    use Bio::SeqFeature::Generic;
17  use FIG;  use FIG;
18  my $fig=new FIG;  my $fig=new FIG;
19    
# Line 70  Line 73 
73    
74  =head2 pirsfcorrespondence  =head2 pirsfcorrespondence
75    
76   Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written.  Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written. This is so that we can easily go back and forth.
  This is so that we can easily go back and forth.  
77    
78   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat
79    
# Line 80  Line 82 
82     to      : file to write information to     to      : file to write information to
83     verbose : report on progress     verbose : report on progress
84    
85    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
86    
87   Returns the number of lines in the pirsinfo file that were read.   Returns the number of lines in the pirsinfo file that were read.
88    
89  =cut  =cut
# Line 90  Line 94 
94    print STDERR "File $from does not exist as called in $0\n";    print STDERR "File $from does not exist as called in $0\n";
95    return 0;    return 0;
96   }   }
97     if ($from =~ /\.gz$/) {
98      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
99     }
100     else {
101   open (IN, $from) || die "Can't open $from";   open (IN, $from) || die "Can't open $from";
102     }
103   open (OUT, ">$to") || die "Can't write to $to";   open (OUT, ">$to") || die "Can't write to $to";
104   my $linecount;   my $linecount;
105   while (<IN>) {   while (<IN>) {
106    $linecount++;    $linecount++;
107    unless ($linecount % 10000) {print STDERR "Correspondence of $linecount lines calculated\n"}    if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
108    if (/^>/) {print OUT; next}    if (/^>/) {print OUT; next}
109    chomp;    chomp;
110    my $done;    foreach my $peg ($self->swiss_pir_ids($_)) {
   foreach my $peg ($fig->by_alias("uni|$_")) {  
111     print OUT $_, "\t", $peg, "\n";     print OUT $_, "\t", $peg, "\n";
    $done=1;  
112    }    }
   unless ($done) {print OUT $_, "\t\n"}  
113   }   }
114   close IN;   close IN;
115   close OUT;   close OUT;
116   return $linecount;   return $linecount;
117  }  }
118    
119    =head2 uniprotcorrespondence
120    
121    Generate a correspondence table between uniprot knowledge base IDs and FIG ID's.
122    
123    The uniprot KB file is in the form:  UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name
124    
125     This method takes three arguments:
126       from    : uniprotKB file
127       to      : file to write information to
128       verbose : report on progress
129    
130    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
131    
132     Returns the number of lines in the uniprotkb file that were read.
133    
134    =cut
135    
136    sub uniprotcorrespondence {
137     my ($self, $from, $to, $verbose)=@_;
138     unless (-e $from) {
139      print STDERR "File $from does not exist as called in $0\n";
140      return 0;
141     }
142     if ($from =~ /\.gz$/) {
143      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
144     }
145     else {
146      open (IN, $from) || die "Can't open $from";
147     }
148     open (OUT, ">$to") || die "Can't write to $to";
149     my $linecount;
150     while (<IN>) {
151      chomp;
152      $linecount++;
153      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
154      my @line=split /\s+\|\s+/;
155      my $added;
156      foreach my $peg ($self->swiss_pir_ids($line[0])) {
157       print OUT "$_ | $peg\n";
158       $added=1;
159      }
160      unless ($added) {print OUT "$_\n"}
161     }
162     close IN;
163     close OUT;
164     return $linecount;
165    }
166    
167    =head2 prositecorrespondence
168    
169    Generate a correspondence table between prosite and seed using sp id's and seed ids.
170    
171    The SwissProt prosite file is from ftp://ca.expasy.org/databases/prosite/release_with_updates/prosite.dat and is in horrible swiss prot format, so we'll parse out those things that we need and put them in the file
172    
173    The output file will have the following columns:
174    
175    prosite family accession number, prosite family name, family type, swiss-prot protein id, fig protein id.
176    
177    The family type is one of rule, pattern, or matrix. Right now (Prosite Release 19.2 of 24-May-2005) there are 4 rules, 1322 patterns, and 521 matrices.
178    
179     This method takes three arguments:
180       from    : prosite file
181       to      : file to write information to
182       verbose : report on progress
183    
184    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
185    
186     Returns the number of lines in the prosite file that were read.
187    
188    =cut
189    
190    sub prositecorrespondence {
191     my ($self, $from, $to, $verbose)=@_;
192     unless (-e $from) {
193      print STDERR "File $from does not exist as called in $0\n";
194      return 0;
195     }
196     if ($from =~ /\.gz$/) {
197      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
198     }
199     else {
200      open (IN, $from) || die "Can't open $from";
201     }
202     open (OUT, ">$to") || die "Can't write to $to";
203     my $linecount;
204     my ($famac, $famname, $famtype)=('','','');
205     while (<IN>) {
206      chomp;
207      $linecount++;
208      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
209      if (m#//#) {($famac, $famname, $famtype)=('','',''); next}
210      elsif (m/^ID\s*(.*?);\s*(\S+)/) {($famname, $famtype)=($1, $2); next}
211      elsif (m/^AC\s*(\S+)/) {$famac=$1; next}
212      next unless (m/^DR/); # ignore all the other crap in the prosite file for now. Note we might, at some point, want to grab all that, but that is for another time.
213      #
214      # this is the format of the DR lines:
215      # DR   P11460, FATB_VIBAN , T; P40409, FEUA_BACSU , T; P37580, FHUD_BACSU , T;
216      s/^DR\s*//;
217      foreach my $piece (split /\s*\;\s*/, $_) {
218       my ($acc, $nam, $unk)=split /\s*\,\s*/, $piece;
219       foreach my $fig ($self->swiss_pir_ids($acc)) {
220        print join "\t", ($famac, $famname, $famtype, $acc, $fig), "\n";
221       }
222      }
223     }
224    }
225    
226    =head2 swiss_pir_ids()
227    
228    SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's, and return an array of FIG id's that match the ID.
229    
230    =cut
231    
232    sub swiss_pir_ids {
233     my ($self, $id)=@_;
234     return () unless ($id);
235     $id =~ s/^\s+//; $id =~ s/\s+$//; # trim off the whitespace
236    
237     my @return=($fig->by_alias("uni|$id"));
238     return @return if ($return[0]);
239    
240     @return=($fig->by_alias("tr|$id"));
241     return @return if ($return[0]);
242    
243     @return=($fig->by_alias("sp|$id"));
244     return @return if ($return[0]);
245    
246     return ();
247    }
248    
249  =head2 ss_by_id  =head2 ss_by_id
250    
# Line 331  Line 466 
466          );          );
467  }  }
468    
469    =head2 GenBank
470    
471     This object will take a genome number and return a Bio::Seq::RichSeq object that has the whole genome
472     in GenBank format. This should be a nice way of getting some data out, but will probably be quite slow
473     at building the object.
474    
475     Note that you need to call this with the genome name and the contig. This will then go through that contig.
476    
477     Something like this should work
478    
479     foreach my $contig ($fig->all_contigs($genome)) {
480      my $seqobj=FIGRob->GenBank($genome, $contig);
481      # process the contig
482     }
483    
484    =cut
485    
486    sub GenBank {
487     my ($self, $genome, $contig)=@_;
488     my $gs=$fig->genus_species($genome);
489     return unless ($gs);
490     unless ($contig) {
491      print STDERR "You didn't provide a contig for $gs. I think that was a mistake. Sorry\n";
492      return;
493     }
494     my $len=$fig->contig_ln($genome, $contig);
495     unless ($len) {
496      print STDERR "$contig from $gs doesn't appear to have a length. Is it right?\n";
497      return;
498     }
499    
500    
501     # first find all the pegs ...
502     my $features; # all the features in the genome
503     my $allpegs; # all the pegs
504     my $translation; # all the protein sequences
505     foreach my $peg ($fig->pegs_of($genome)) {
506      my @location=$fig->feature_location($peg);
507      my $func=$fig->function_of($peg);
508      foreach my $loc (@location) {
509       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
510       my ($cg, $start, $stop)=($1, $2, $3);
511       next unless ($cg eq $contig);
512       # save this information for later
513       $features->{'peg'}->{$loc}=$func;
514       $allpegs->{'peg'}->{$loc}=$peg;
515       $translation->{'peg'}->{$loc}=$fig->get_translation($peg);
516      }
517     }
518     # ... and all the RNAs
519     foreach my $peg ($fig->rnas_of($genome)) {
520      my @location=$fig->feature_location($peg);
521      my $func=$fig->function_of($peg);
522      foreach my $loc (@location) {
523       $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
524       my ($cg, $start, $stop)=($1, $2, $3);
525       next unless ($cg eq $contig);
526       # save this information for later
527       $features->{'rna'}->{$loc}=$func;
528       $allpegs->{'rna'}->{$loc}=$peg;
529      }
530     }
531    
532    
533     # now get all the contigs out
534     my $seq=$fig->dna_seq($genome, $contig."_1_".$len);
535     my $description = "Contig $contig from " . $fig->genus_species($genome);
536     my $sobj=Bio::Seq->new(
537              -seq              =>  $seq,
538              -id               =>  $contig,
539              -desc             =>  $description,
540              -accession_number =>  $genome
541              );
542     foreach my $prot (keys %{$features->{'peg'}}) {
543       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
544       my ($cg, $start, $stop)=($1, $2, $3);
545       my $strand=1;
546       if ($stop < $start) {
547        ($stop, $start)=($start, $stop);
548        $strand=-1;
549     }
550    
551     my $feat=Bio::SeqFeature::Generic->new(
552            -start         =>  $start,
553            -end           =>  $stop,
554            -strand        =>  $strand,
555            -primary       =>  'CDS',
556            -display_name  =>  $allpegs->{'peg'}->{$prot},
557            -source_tag    =>  'the SEED',
558            -tag           =>
559                           {
560                           db_xref     =>   $allpegs->{'peg'}->{$prot},
561                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
562                           function    =>  $features->{'peg'}->{$prot},
563                           translation =>  $translation->{'peg'}->{$prot}
564                          }
565           );
566    
567       $sobj->add_SeqFeature($feat);
568     }
569    
570     foreach my $prot (keys %{$features->{'rna'}}) {
571       $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
572       my ($cg, $start, $stop)=($1, $2, $3);
573       my $strand=1;
574       if ($stop < $start) {
575        ($stop, $start)=($start, $stop);
576        $strand=-1;
577       }
578    
579       my $feat=Bio::SeqFeature::Generic->new(
580            -start         =>  $start,
581            -end           =>  $stop,
582            -strand        =>  $strand,
583            -primary       =>  'RNA',
584            -source_tag    =>  'the SEED',
585            -display_name  =>  $allpegs->{'rna'}->{$prot},
586            -tag           =>
587                          {
588                           db_xref     =>   $allpegs->{'rna'}->{$prot},
589                           note        =>  'Generated by the Fellowship for the Interpretation of Genomes',
590                           function    =>  $features->{'rna'}->{$prot},
591                          }
592           );
593    
594      $sobj->add_SeqFeature($feat);
595     }
596     return $sobj;
597    }
598    
599    =head2 best_hit
600    
601     Returns the FIG id of the single best hit to a peg
602    
603     eg
604    
605     my $bh=$fr->best_hit($peg);
606     print 'function is ', scalar $fig->function_of($bh);
607    
608    =cut
609    
610    sub best_hit {
611     my ($self, $peg)=@_;
612     return unless ($peg);
613    
614     my ($maxN, $maxP)=(1, 1e-5);
615     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
616     return ${$sims[0]}[1];
617    }
618    
619  1;  1;
620    

Legend:
Removed from v.1.11  
changed lines
  Added in v.1.18

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3