[Bio] / FigKernelPackages / raelib.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/raelib.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.17 - (view) (download) (as text)

1 : redwards 1.1 # -*- perl -*-
2 :    
3 :     =pod
4 :    
5 : parrello 1.13 =head1 RAE Library
6 : redwards 1.1
7 :     Some routines and things that Rob uses. Please feel free to use at will and incorporate into
8 :     your own code or move them into FIG.pm or elsewhere.
9 :    
10 :     =cut
11 :    
12 :     package raelib;
13 :     use strict;
14 : redwards 1.17 use Bio::SeqIO;
15 :     use Bio::Seq;
16 :     use Bio::SeqFeature::Generic;
17 : redwards 1.1 use FIG;
18 :     my $fig=new FIG;
19 :    
20 : redwards 1.5 =head2 new
21 :    
22 :     Just instantiate the object and return $self
23 :    
24 :     =cut
25 :    
26 :     sub new {
27 :     my $self=shift;
28 :     return $self
29 :     }
30 :    
31 :    
32 :    
33 :    
34 : redwards 1.4 =head2 features_on_contig
35 :    
36 :     Returns a reference to an array containing all the features on a contig in a genome.
37 :    
38 :     use:
39 :    
40 :     my $arrayref=$rae->features_on_contig($genome, $contig);
41 :    
42 :     or
43 :    
44 :     foreach my $peg (@{$rae->features_on_contig($genome, $contig)}) {
45 :     ... blah blah ...
46 :     }
47 :    
48 :     returns undef if contig is not a part of genome or there is nothing to return, otherwise returns a list of pegs
49 :    
50 :     v. experimental and guaranteed not to work!
51 :    
52 :     =cut
53 :    
54 :     sub features_on_contig {
55 :     my ($self, $genome, $contig)=@_;
56 :     # were this in FIG.pm you'd use this line:
57 :     #my $rdbH = $self->db_handle;
58 :    
59 :     my $rdbH = $fig->db_handle;
60 :     my $relational_db_response=$rdbH->SQL('SELECT id FROM features WHERE (genome = \'' . $genome . '\' AND location ~* \'' . $contig . '\')');
61 :     # this is complicated. A reference to an array of references to arrays, and we only want the first element.
62 :     # simplify.
63 :     my @results;
64 :     foreach my $res (@$relational_db_response) {push @results, $res->[0]}
65 :     return \@results;
66 :     }
67 :    
68 :    
69 :    
70 :    
71 :    
72 :    
73 :    
74 : redwards 1.7 =head2 pirsfcorrespondence
75 : redwards 1.1
76 : redwards 1.7 Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written.
77 : redwards 1.1 This is so that we can easily go back and forth.
78 :    
79 : redwards 1.7 The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat
80 : redwards 1.1
81 : redwards 1.9 This method takes three arguments:
82 :     from : pirsfinfo.dat file
83 :     to : file to write information to
84 :     verbose : report on progress
85 :    
86 :     Returns the number of lines in the pirsinfo file that were read.
87 :    
88 : redwards 1.1 =cut
89 :    
90 : redwards 1.7 sub pirsfcorrespondence {
91 : redwards 1.9 my ($self, $from, $to, $verbose)=@_;
92 : redwards 1.10 unless (-e $from) {
93 :     print STDERR "File $from does not exist as called in $0\n";
94 :     return 0;
95 :     }
96 : redwards 1.1 open (IN, $from) || die "Can't open $from";
97 : redwards 1.8 open (OUT, ">$to") || die "Can't write to $to";
98 : redwards 1.9 my $linecount;
99 : redwards 1.1 while (<IN>) {
100 : redwards 1.9 $linecount++;
101 : redwards 1.14 if ($verbose && !($linecount % 10000)) {print STDERR "Parsed $linecount lines\n"}
102 : redwards 1.1 if (/^>/) {print OUT; next}
103 :     chomp;
104 : redwards 1.14 foreach my $peg ($self->swiss_pir_ids($_)) {
105 : redwards 1.1 print OUT $_, "\t", $peg, "\n";
106 :     }
107 : redwards 1.14 }
108 :     close IN;
109 :     close OUT;
110 :     return $linecount;
111 :     }
112 :    
113 :     =head2 uniprotcorrespondence
114 :    
115 :     Generate a correspondence table between uniprot knowledge base IDs and FIG ID's.
116 :    
117 :     The uniprot KB file is in the form: UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name
118 :    
119 :     This method takes three arguments:
120 :     from : uniprotKB file
121 :     to : file to write information to
122 :     verbose : report on progress
123 :    
124 :     Returns the number of lines in the pirsinfo file that were read.
125 :    
126 :     =cut
127 :    
128 :     sub uniprotcorrespondence {
129 :     my ($self, $from, $to, $verbose)=@_;
130 :     unless (-e $from) {
131 :     print STDERR "File $from does not exist as called in $0\n";
132 :     return 0;
133 :     }
134 :     open (IN, $from) || die "Can't open $from";
135 :     open (OUT, ">$to") || die "Can't write to $to";
136 :     my $linecount;
137 :     while (<IN>) {
138 :     chomp;
139 :     $linecount++;
140 :     if ($verbose && !($linecount % 10000)) {print STDERR "Parsed $linecount lines\n"}
141 :     my @line=split /\s+\|\s+/;
142 : redwards 1.16 my $added;
143 : redwards 1.14 foreach my $peg ($self->swiss_pir_ids($line[0])) {
144 :     print OUT "$_ | $peg\n";
145 : redwards 1.16 $added=1;
146 : redwards 1.12 }
147 : redwards 1.16 unless ($added) {print OUT "$_\n"}
148 : redwards 1.1 }
149 :     close IN;
150 :     close OUT;
151 : redwards 1.9 return $linecount;
152 : redwards 1.1 }
153 :    
154 : redwards 1.14
155 :    
156 :     =head2 swiss_pir_ids()
157 :    
158 :     SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's.
159 :    
160 :     =cut
161 :    
162 :     sub swiss_pir_ids {
163 :     my ($self, $id)=@_;
164 :     return () unless ($id);
165 :    
166 : redwards 1.15 my @return=($fig->by_alias("uni|$id"));
167 : redwards 1.14 return @return if ($return[0]);
168 :    
169 : redwards 1.15 @return=($fig->by_alias("tr|$id"));
170 : redwards 1.14 return @return if ($return[0]);
171 :    
172 : redwards 1.15 @return=($fig->by_alias("sp|$id"));
173 : redwards 1.14 return @return if ($return[0]);
174 :    
175 :     return ();
176 :     }
177 : redwards 1.1
178 :     =head2 ss_by_id
179 :    
180 :     Generate a list of subsystems that a peg occurs in. This is a ; separated list.
181 :     This is a wrapper that removes roles and ignores essential things
182 :    
183 :     =cut
184 :    
185 :     sub ss_by_id {
186 :     my ($self, $peg)=@_;
187 :     my $ssout;
188 :     foreach my $ss (sort $fig->subsystems_for_peg($peg))
189 :     {
190 :     next if ($$ss[0] =~ /essential/i); # Ignore the Essential B-subtilis subsystems
191 :     $ssout.=$$ss[0]."; ";
192 :     }
193 :     $ssout =~ s/; $//;
194 :     return $ssout;
195 :     }
196 :    
197 : redwards 1.3 =head2 ss_by_homol
198 :    
199 :     Generate a list of subsystems that homologs of a peg occur in. This is a ; separated list.
200 :     This is also a wrapper around sims and ss, but makes everything unified
201 :    
202 :     =cut
203 :    
204 :     sub ss_by_homol {
205 :     my ($self, $peg)=@_;
206 :     return unless ($peg);
207 :     my ($maxN, $maxP)=(50, 1e-20);
208 :    
209 :     # find the sims
210 :     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
211 :    
212 :     # we are only going to keep the best hit for each peg
213 :     # in a subsystem
214 :     my $best_ss_score; my $best_ss_id;
215 :     foreach my $sim (@sims)
216 :     {
217 :     my $simpeg=$$sim[1];
218 :     my $simscore=$$sim[10];
219 :     my @subsys=$fig->subsystems_for_peg($simpeg);
220 :     foreach my $ss (@subsys)
221 :     {
222 :     if (! defined $best_ss_score->{$$ss[0]}) {$best_ss_score->{$$ss[0]}=$simscore; $best_ss_id->{$$ss[0]}=$simpeg}
223 :     elsif ($best_ss_score->{$$ss[0]} > $simscore)
224 :     {
225 :     $best_ss_score->{$$ss[0]}=$simscore;
226 :     $best_ss_id->{$$ss[0]}=$simpeg;
227 :     }
228 :     }
229 :     }
230 :    
231 :     my $ssoutput=join "", (map {"$_ (".$best_ss_id->{$_}."), "} keys %$best_ss_id);
232 :    
233 :     $ssoutput =~ s/, $//;
234 :     return $ssoutput;
235 :     }
236 :    
237 :     =head2 tagvalue
238 :    
239 :     This will just check for tag value pairs and return either an array of values or a single ; separated list (if called as a scalar)
240 :    
241 :     e.g. $values=raelib->tagvalue($peg, "PIRSF"); print join "\n", @$values;
242 :    
243 :     Returns an empty array if no tag/value appropriate.
244 :    
245 :     Just because I use this a lot I don't want to waste rewriting it.
246 :    
247 :     =cut
248 :    
249 :     sub tagvalue {
250 :     my ($self, $peg, $tag)=@_;
251 :     my @return;
252 :     my @attr=$fig->feature_attributes($peg);
253 :     foreach my $attr (@attr) {
254 : redwards 1.11 my ($gotpeg, $gottag, $val, $link)=@$attr;
255 : redwards 1.3 push @return, $val if ($gottag eq $tag);
256 :     }
257 :     return wantarray ? @return : join "; ", @return;
258 :     }
259 : redwards 1.1
260 : redwards 1.5 =head2 locations_on_contig
261 :    
262 :     Return the locations of a sequence on a contig.
263 :    
264 :     This will look for exact matches to a sequence on a contig, and return a reference to an array that has all the locations.
265 :    
266 :     my $locations=$raelib->locations_on_contig($genome, $contig, 'GATC', undef);
267 :     foreach my $bp (@$locations) { ... do something ... }
268 :    
269 :     first argument : genome number
270 :     second argument : contig name
271 :     third argument : sequence to look for
272 :     fourth argument : beginning position to start looking from (can be undef)
273 :     fifth argument : end position to stop looking from (can be undef)
274 :     sixth argument : check reverse complement (0 or undef will check forward, 1 or true will check rc)
275 :    
276 :     Note, the position is calculated before the sequence is rc'd
277 :    
278 :     =cut
279 :    
280 :     sub locations_on_contig {
281 :     my ($self, $genome, $contig, $sequence, $from, $to, $check_reverse)=@_;
282 :     my $return=[];
283 :    
284 :     # get the dna sequence of the contig, and make sure it is uppercase
285 :     my $contig_ln=$fig->contig_ln($genome, $contig);
286 :     return $return unless ($contig_ln);
287 :     unless ($from) {$from=1}
288 :     unless ($to) {$to=$contig_ln}
289 :     if ($from > $to) {($from, $to)=($to, $from)}
290 :     my $dna_seq=$fig->dna_seq($genome, $contig."_".$from."_".$to);
291 :     $dna_seq=uc($dna_seq);
292 :    
293 :     # if we want to check the rc, we actually rc the query
294 :     $sequence=$fig->reverse_comp($sequence) if ($check_reverse);
295 :     $sequence=uc($sequence);
296 :    
297 :     # now find all the matches
298 :     my $posn=index($dna_seq, $sequence, 0);
299 :     while ($posn > -1) {
300 :     push @$return, $posn;
301 :     $posn=index($dna_seq, $sequence, $posn+1);
302 :     }
303 :     return $return;
304 :     }
305 :    
306 :    
307 :     =head2 scrolling_org_list
308 :    
309 :     This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.
310 :    
311 :     use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple);
312 :    
313 :     multiple selections will only be set if $multiple is true
314 :    
315 :     =cut
316 :    
317 :     sub scrolling_org_list {
318 :     my ($self, $cgi, $multiple)=@_;
319 :     unless ($multiple) {$multiple=0}
320 :    
321 :     my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );
322 :    
323 :     #
324 :     # Canonical names must match the keywords used in the DBMS. They are
325 :     # defined in compute_genome_counts.pl
326 :     #
327 :     my %canonical = (
328 :     'All' => undef,
329 :     'Archaea' => 'Archaea',
330 :     'Bacteria' => 'Bacteria',
331 :     'Eucarya' => 'Eukaryota',
332 :     'Viruses' => 'Virus',
333 :     'Environmental samples' => 'Environmental Sample'
334 :     );
335 :    
336 :     my $req_dom = $cgi->param( 'domain' ) || 'All';
337 :     my @domains = $cgi->radio_group( -name => 'domain',
338 :     -default => $req_dom,
339 :     -override => 1,
340 :     -values => [ @display ]
341 :     );
342 :    
343 :     my $n_domain = 0;
344 :     my %dom_num = map { ( $_, $n_domain++ ) } @display;
345 :     my $req_dom_num = $dom_num{ $req_dom } || 0;
346 :    
347 :     #
348 :     # Viruses and Environmental samples must have completeness = All (that is
349 :     # how they are in the database). Otherwise, default is Only "complete".
350 :     #
351 :     my $req_comp = ( $req_dom_num > $dom_num{ 'Eucarya' } ) ? 'All'
352 :     : $cgi->param( 'complete' ) || 'Only "complete"';
353 :     my @complete = $cgi->radio_group( -name => 'complete',
354 :     -default => $req_comp,
355 :     -override => 1,
356 :     -values => [ 'All', 'Only "complete"' ]
357 :     );
358 :     #
359 :     # Use $fig->genomes( complete, restricted, domain ) to get org list:
360 :     #
361 :     my $complete = ( $req_comp =~ /^all$/i ) ? undef : "complete";
362 :    
363 :     my $orgs; my $label;
364 :     @$orgs = $fig->genomes( $complete, undef, $canonical{ $req_dom } );
365 :    
366 :     foreach (@$orgs) {
367 :     my $gs = $fig->genus_species($_);
368 :     my $gc=scalar $fig->all_contigs($_);
369 :     $label->{$_} = "$gs ($_) [$gc contigs]";
370 :     }
371 :    
372 :     @$orgs = sort {$label->{$a} cmp $label->{$b}} @$orgs;
373 :    
374 :     my $n_genomes = @$orgs;
375 :    
376 :     return ( "<TABLE>\n",
377 :     " <TR>\n",
378 :     " <TD>",
379 : redwards 1.6 $cgi->scrolling_list( -name => 'korgs',
380 :     -values => $orgs,
381 :     -labels => $label,
382 :     -size => 10,
383 :     -multiple => $multiple,
384 : redwards 1.5 ), $cgi->br,
385 :     "$n_genomes genomes shown ",
386 :     $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,
387 :     " </TD>",
388 :     " <TD>",
389 :     join( "<br>", "<b>Domain(s) to show:</b>", @domains), "<br>\n",
390 :     join( "<br>", "<b>Completeness?</b>", @complete), "\n",
391 :     "</TD>",
392 :     " </TR>\n",
393 :     "</TABLE>\n",
394 :     $cgi->hr
395 :     );
396 :     }
397 :    
398 : redwards 1.17 =head2 GenBank
399 : redwards 1.5
400 : redwards 1.17 This object will take a genome number and return a Bio::Seq::RichSeq object that has the whole genome
401 :     in GenBank format. This should be a nice way of getting some data out, but will probably be quite slow
402 :     at building the object.
403 : redwards 1.1
404 : redwards 1.17 Note that you need to call this with the genome name and the contig. This will then go through that contig.
405 : redwards 1.1
406 : redwards 1.17 Something like this should work
407 :    
408 :     foreach my $contig ($fig->all_contigs($genome)) {
409 :     my $seqobj=FIGRob->GenBank($genome, $contig);
410 :     # process the contig
411 :     }
412 :    
413 :     =cut
414 :    
415 :     sub GenBank {
416 :     my ($self, $genome, $contig)=@_;
417 :     my $gs=$fig->genus_species($genome);
418 :     return unless ($gs);
419 :     unless ($contig) {
420 :     print STDERR "You didn't provide a contig for $gs. I think that was a mistake. Sorry\n";
421 :     return;
422 :     }
423 :     my $len=$fig->contig_ln($genome, $contig);
424 :     unless ($len) {
425 :     print STDERR "$contig from $gs doesn't appear to have a length. Is it right?\n";
426 :     return;
427 :     }
428 :    
429 :    
430 :     # first find all the pegs ...
431 :     my $features; # all the features in the genome
432 :     my $allpegs; # all the pegs
433 :     my $translation; # all the protein sequences
434 :     foreach my $peg ($fig->pegs_of($genome)) {
435 :     my @location=$fig->feature_location($peg);
436 :     my $func=$fig->function_of($peg);
437 :     foreach my $loc (@location) {
438 :     $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
439 :     my ($cg, $start, $stop)=($1, $2, $3);
440 :     next unless ($cg eq $contig);
441 :     # save this information for later
442 :     $features->{'peg'}->{$loc}=$func;
443 :     $allpegs->{'peg'}->{$loc}=$peg;
444 :     $translation->{'peg'}->{$loc}=$fig->get_translation($peg);
445 :     }
446 :     }
447 :     # ... and all the RNAs
448 :     foreach my $peg ($fig->rnas_of($genome)) {
449 :     my @location=$fig->feature_location($peg);
450 :     my $func=$fig->function_of($peg);
451 :     foreach my $loc (@location) {
452 :     $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
453 :     my ($cg, $start, $stop)=($1, $2, $3);
454 :     next unless ($cg eq $contig);
455 :     # save this information for later
456 :     $features->{'rna'}->{$loc}=$func;
457 :     $allpegs->{'rna'}->{$loc}=$peg;
458 :     }
459 :     }
460 :    
461 :    
462 :     # now get all the contigs out
463 :     my $seq=$fig->dna_seq($genome, $contig."_1_".$len);
464 :     my $description = "Contig $contig from " . $fig->genus_species($genome);
465 :     my $sobj=Bio::Seq->new(
466 :     -seq => $seq,
467 :     -id => $contig,
468 :     -desc => $description,
469 :     -accession_number => $genome
470 :     );
471 :     foreach my $prot (keys %{$features->{'peg'}}) {
472 :     $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
473 :     my ($cg, $start, $stop)=($1, $2, $3);
474 :     my $strand=1;
475 :     if ($stop < $start) {
476 :     ($stop, $start)=($start, $stop);
477 :     $strand=-1;
478 :     }
479 :    
480 :     my $feat=Bio::SeqFeature::Generic->new(
481 :     -start => $start,
482 :     -end => $stop,
483 :     -strand => $strand,
484 :     -primary => 'CDS',
485 :     -display_name => $allpegs->{'peg'}->{$prot},
486 :     -source_tag => 'the SEED',
487 :     -tag =>
488 :     {
489 :     db_xref => $allpegs->{'peg'}->{$prot},
490 :     note => 'Generated by the Fellowship for the Interpretation of Genomes',
491 :     function => $features->{'peg'}->{$prot},
492 :     translation => $translation->{'peg'}->{$prot}
493 :     }
494 :     );
495 :    
496 :     $sobj->add_SeqFeature($feat);
497 :     }
498 :    
499 :     foreach my $prot (keys %{$features->{'rna'}}) {
500 :     $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
501 :     my ($cg, $start, $stop)=($1, $2, $3);
502 :     my $strand=1;
503 :     if ($stop < $start) {
504 :     ($stop, $start)=($start, $stop);
505 :     $strand=-1;
506 :     }
507 :    
508 :     my $feat=Bio::SeqFeature::Generic->new(
509 :     -start => $start,
510 :     -end => $stop,
511 :     -strand => $strand,
512 :     -primary => 'RNA',
513 :     -source_tag => 'the SEED',
514 :     -display_name => $allpegs->{'rna'}->{$prot},
515 :     -tag =>
516 :     {
517 :     db_xref => $allpegs->{'rna'}->{$prot},
518 :     note => 'Generated by the Fellowship for the Interpretation of Genomes',
519 :     function => $features->{'rna'}->{$prot},
520 :     }
521 :     );
522 :    
523 :     $sobj->add_SeqFeature($feat);
524 :     }
525 :     return $sobj;
526 :     }
527 :    
528 :     =head2 best_hit
529 :    
530 :     Returns the FIG id of the single best hit to a peg
531 :    
532 :     eg
533 :    
534 :     my $bh=$fr->best_hit($peg);
535 :     print 'function is ', scalar $fig->function_of($bh);
536 :    
537 :     =cut
538 :    
539 :     sub best_hit {
540 :     my ($self, $peg)=@_;
541 :     return unless ($peg);
542 :    
543 :     my ($maxN, $maxP)=(1, 1e-5);
544 :     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
545 :     return ${$sims[0]}[1];
546 :     }
547 : redwards 1.1
548 :     1;
549 : redwards 1.17

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3