[Bio] / FigKernelPackages / raelib.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/raelib.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.25 - (view) (download) (as text)

1 : redwards 1.1 # -*- perl -*-
2 :    
3 :     =pod
4 :    
5 : parrello 1.13 =head1 RAE Library
6 : redwards 1.1
7 :     Some routines and things that Rob uses. Please feel free to use at will and incorporate into
8 :     your own code or move them into FIG.pm or elsewhere.
9 :    
10 :     =cut
11 :    
12 :     package raelib;
13 :     use strict;
14 : redwards 1.17 use Bio::SeqIO;
15 :     use Bio::Seq;
16 :     use Bio::SeqFeature::Generic;
17 : redwards 1.1 use FIG;
18 :     my $fig=new FIG;
19 :    
20 : redwards 1.5 =head2 new
21 :    
22 :     Just instantiate the object and return $self
23 :    
24 :     =cut
25 :    
26 :     sub new {
27 :     my $self=shift;
28 :     return $self
29 :     }
30 :    
31 :    
32 :    
33 :    
34 : redwards 1.4 =head2 features_on_contig
35 :    
36 :     Returns a reference to an array containing all the features on a contig in a genome.
37 :    
38 :     use:
39 :    
40 :     my $arrayref=$rae->features_on_contig($genome, $contig);
41 :    
42 :     or
43 :    
44 :     foreach my $peg (@{$rae->features_on_contig($genome, $contig)}) {
45 :     ... blah blah ...
46 :     }
47 :    
48 :     returns undef if contig is not a part of genome or there is nothing to return, otherwise returns a list of pegs
49 :    
50 :     v. experimental and guaranteed not to work!
51 :    
52 :     =cut
53 :    
54 :     sub features_on_contig {
55 :     my ($self, $genome, $contig)=@_;
56 :     # were this in FIG.pm you'd use this line:
57 :     #my $rdbH = $self->db_handle;
58 :    
59 :     my $rdbH = $fig->db_handle;
60 :     my $relational_db_response=$rdbH->SQL('SELECT id FROM features WHERE (genome = \'' . $genome . '\' AND location ~* \'' . $contig . '\')');
61 :     # this is complicated. A reference to an array of references to arrays, and we only want the first element.
62 :     # simplify.
63 :     my @results;
64 :     foreach my $res (@$relational_db_response) {push @results, $res->[0]}
65 :     return \@results;
66 :     }
67 :    
68 :    
69 :    
70 :    
71 :    
72 :    
73 :    
74 : redwards 1.7 =head2 pirsfcorrespondence
75 : redwards 1.1
76 : redwards 1.18 Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written. This is so that we can easily go back and forth.
77 : redwards 1.1
78 : redwards 1.18 The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat
79 : redwards 1.1
80 : redwards 1.18 This method takes three arguments:
81 : redwards 1.9 from : pirsfinfo.dat file
82 :     to : file to write information to
83 :     verbose : report on progress
84 :    
85 : redwards 1.18 Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
86 :    
87 :     Returns the number of lines in the pirsinfo file that were read.
88 : redwards 1.9
89 : redwards 1.1 =cut
90 :    
91 : redwards 1.7 sub pirsfcorrespondence {
92 : redwards 1.9 my ($self, $from, $to, $verbose)=@_;
93 : redwards 1.10 unless (-e $from) {
94 :     print STDERR "File $from does not exist as called in $0\n";
95 :     return 0;
96 :     }
97 : redwards 1.18 if ($from =~ /\.gz$/) {
98 :     open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
99 :     }
100 :     else {
101 :     open (IN, $from) || die "Can't open $from";
102 :     }
103 : redwards 1.8 open (OUT, ">$to") || die "Can't write to $to";
104 : redwards 1.9 my $linecount;
105 : redwards 1.1 while (<IN>) {
106 : redwards 1.9 $linecount++;
107 : redwards 1.14 if ($verbose && !($linecount % 10000)) {print STDERR "Parsed $linecount lines\n"}
108 : redwards 1.1 if (/^>/) {print OUT; next}
109 :     chomp;
110 : redwards 1.14 foreach my $peg ($self->swiss_pir_ids($_)) {
111 : redwards 1.1 print OUT $_, "\t", $peg, "\n";
112 :     }
113 : redwards 1.14 }
114 :     close IN;
115 :     close OUT;
116 :     return $linecount;
117 :     }
118 :    
119 :     =head2 uniprotcorrespondence
120 :    
121 :     Generate a correspondence table between uniprot knowledge base IDs and FIG ID's.
122 :    
123 :     The uniprot KB file is in the form: UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name
124 :    
125 :     This method takes three arguments:
126 :     from : uniprotKB file
127 :     to : file to write information to
128 :     verbose : report on progress
129 :    
130 : redwards 1.18 Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
131 :    
132 :     Returns the number of lines in the uniprotkb file that were read.
133 : redwards 1.14
134 :     =cut
135 :    
136 :     sub uniprotcorrespondence {
137 :     my ($self, $from, $to, $verbose)=@_;
138 :     unless (-e $from) {
139 :     print STDERR "File $from does not exist as called in $0\n";
140 :     return 0;
141 :     }
142 : redwards 1.18 if ($from =~ /\.gz$/) {
143 :     open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
144 :     }
145 :     else {
146 :     open (IN, $from) || die "Can't open $from";
147 :     }
148 : redwards 1.14 open (OUT, ">$to") || die "Can't write to $to";
149 :     my $linecount;
150 :     while (<IN>) {
151 :     chomp;
152 :     $linecount++;
153 :     if ($verbose && !($linecount % 10000)) {print STDERR "Parsed $linecount lines\n"}
154 :     my @line=split /\s+\|\s+/;
155 : redwards 1.16 my $added;
156 : redwards 1.14 foreach my $peg ($self->swiss_pir_ids($line[0])) {
157 :     print OUT "$_ | $peg\n";
158 : redwards 1.16 $added=1;
159 : redwards 1.12 }
160 : redwards 1.16 unless ($added) {print OUT "$_\n"}
161 : redwards 1.1 }
162 :     close IN;
163 :     close OUT;
164 : redwards 1.9 return $linecount;
165 : redwards 1.1 }
166 :    
167 : redwards 1.18 =head2 prositecorrespondence
168 :    
169 :     Generate a correspondence table between prosite and seed using sp id's and seed ids.
170 :    
171 :     The SwissProt prosite file is from ftp://ca.expasy.org/databases/prosite/release_with_updates/prosite.dat and is in horrible swiss prot format, so we'll parse out those things that we need and put them in the file
172 :    
173 :     The output file will have the following columns:
174 :    
175 :     prosite family accession number, prosite family name, family type, swiss-prot protein id, fig protein id.
176 :    
177 :     The family type is one of rule, pattern, or matrix. Right now (Prosite Release 19.2 of 24-May-2005) there are 4 rules, 1322 patterns, and 521 matrices.
178 :    
179 :     This method takes three arguments:
180 :     from : prosite file
181 :     to : file to write information to
182 :     verbose : report on progress
183 :    
184 :     Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
185 :    
186 :     Returns the number of lines in the prosite file that were read.
187 :    
188 :     =cut
189 :    
190 :     sub prositecorrespondence {
191 :     my ($self, $from, $to, $verbose)=@_;
192 :     unless (-e $from) {
193 :     print STDERR "File $from does not exist as called in $0\n";
194 :     return 0;
195 :     }
196 :     if ($from =~ /\.gz$/) {
197 :     open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
198 :     }
199 :     else {
200 :     open (IN, $from) || die "Can't open $from";
201 :     }
202 :     open (OUT, ">$to") || die "Can't write to $to";
203 :     my $linecount;
204 :     my ($famac, $famname, $famtype)=('','','');
205 :     while (<IN>) {
206 :     chomp;
207 :     $linecount++;
208 :     if ($verbose && !($linecount % 10000)) {print STDERR "Parsed $linecount lines\n"}
209 :     if (m#//#) {($famac, $famname, $famtype)=('','',''); next}
210 :     elsif (m/^ID\s*(.*?);\s*(\S+)/) {($famname, $famtype)=($1, $2); next}
211 : redwards 1.19 elsif (m/^AC\s*(\S+)/) {$famac=$1; $famac =~ s/\;\s*$//; next}
212 : redwards 1.18 next unless (m/^DR/); # ignore all the other crap in the prosite file for now. Note we might, at some point, want to grab all that, but that is for another time.
213 :     #
214 :     # this is the format of the DR lines:
215 :     # DR P11460, FATB_VIBAN , T; P40409, FEUA_BACSU , T; P37580, FHUD_BACSU , T;
216 :     s/^DR\s*//;
217 :     foreach my $piece (split /\s*\;\s*/, $_) {
218 :     my ($acc, $nam, $unk)=split /\s*\,\s*/, $piece;
219 :     foreach my $fig ($self->swiss_pir_ids($acc)) {
220 : redwards 1.20 print OUT join "\t", ($famac, $famname, $famtype, $acc, $fig), "\n";
221 : redwards 1.18 }
222 :     }
223 :     }
224 :     }
225 : redwards 1.14
226 :     =head2 swiss_pir_ids()
227 :    
228 : redwards 1.18 SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's, and return an array of FIG id's that match the ID.
229 : redwards 1.14
230 :     =cut
231 :    
232 :     sub swiss_pir_ids {
233 :     my ($self, $id)=@_;
234 :     return () unless ($id);
235 : redwards 1.18 $id =~ s/^\s+//; $id =~ s/\s+$//; # trim off the whitespace
236 :    
237 : redwards 1.15 my @return=($fig->by_alias("uni|$id"));
238 : redwards 1.14 return @return if ($return[0]);
239 :    
240 : redwards 1.15 @return=($fig->by_alias("tr|$id"));
241 : redwards 1.14 return @return if ($return[0]);
242 :    
243 : redwards 1.15 @return=($fig->by_alias("sp|$id"));
244 : redwards 1.14 return @return if ($return[0]);
245 :    
246 :     return ();
247 :     }
248 : redwards 1.1
249 :     =head2 ss_by_id
250 :    
251 :     Generate a list of subsystems that a peg occurs in. This is a ; separated list.
252 :     This is a wrapper that removes roles and ignores essential things
253 :    
254 :     =cut
255 :    
256 :     sub ss_by_id {
257 :     my ($self, $peg)=@_;
258 :     my $ssout;
259 :     foreach my $ss (sort $fig->subsystems_for_peg($peg))
260 :     {
261 :     next if ($$ss[0] =~ /essential/i); # Ignore the Essential B-subtilis subsystems
262 :     $ssout.=$$ss[0]."; ";
263 :     }
264 :     $ssout =~ s/; $//;
265 :     return $ssout;
266 :     }
267 :    
268 : redwards 1.3 =head2 ss_by_homol
269 :    
270 :     Generate a list of subsystems that homologs of a peg occur in. This is a ; separated list.
271 :     This is also a wrapper around sims and ss, but makes everything unified
272 :    
273 :     =cut
274 :    
275 :     sub ss_by_homol {
276 :     my ($self, $peg)=@_;
277 :     return unless ($peg);
278 :     my ($maxN, $maxP)=(50, 1e-20);
279 :    
280 :     # find the sims
281 :     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
282 :    
283 :     # we are only going to keep the best hit for each peg
284 :     # in a subsystem
285 :     my $best_ss_score; my $best_ss_id;
286 :     foreach my $sim (@sims)
287 :     {
288 :     my $simpeg=$$sim[1];
289 :     my $simscore=$$sim[10];
290 :     my @subsys=$fig->subsystems_for_peg($simpeg);
291 :     foreach my $ss (@subsys)
292 :     {
293 :     if (! defined $best_ss_score->{$$ss[0]}) {$best_ss_score->{$$ss[0]}=$simscore; $best_ss_id->{$$ss[0]}=$simpeg}
294 :     elsif ($best_ss_score->{$$ss[0]} > $simscore)
295 :     {
296 :     $best_ss_score->{$$ss[0]}=$simscore;
297 :     $best_ss_id->{$$ss[0]}=$simpeg;
298 :     }
299 :     }
300 :     }
301 :    
302 :     my $ssoutput=join "", (map {"$_ (".$best_ss_id->{$_}."), "} keys %$best_ss_id);
303 :    
304 :     $ssoutput =~ s/, $//;
305 :     return $ssoutput;
306 :     }
307 :    
308 :     =head2 tagvalue
309 :    
310 :     This will just check for tag value pairs and return either an array of values or a single ; separated list (if called as a scalar)
311 :    
312 :     e.g. $values=raelib->tagvalue($peg, "PIRSF"); print join "\n", @$values;
313 :    
314 :     Returns an empty array if no tag/value appropriate.
315 :    
316 :     Just because I use this a lot I don't want to waste rewriting it.
317 :    
318 :     =cut
319 :    
320 :     sub tagvalue {
321 :     my ($self, $peg, $tag)=@_;
322 :     my @return;
323 :     my @attr=$fig->feature_attributes($peg);
324 :     foreach my $attr (@attr) {
325 : redwards 1.11 my ($gotpeg, $gottag, $val, $link)=@$attr;
326 : redwards 1.3 push @return, $val if ($gottag eq $tag);
327 :     }
328 :     return wantarray ? @return : join "; ", @return;
329 :     }
330 : redwards 1.1
331 : redwards 1.5 =head2 locations_on_contig
332 :    
333 :     Return the locations of a sequence on a contig.
334 :    
335 :     This will look for exact matches to a sequence on a contig, and return a reference to an array that has all the locations.
336 :    
337 :     my $locations=$raelib->locations_on_contig($genome, $contig, 'GATC', undef);
338 :     foreach my $bp (@$locations) { ... do something ... }
339 :    
340 :     first argument : genome number
341 :     second argument : contig name
342 :     third argument : sequence to look for
343 :     fourth argument : beginning position to start looking from (can be undef)
344 :     fifth argument : end position to stop looking from (can be undef)
345 :     sixth argument : check reverse complement (0 or undef will check forward, 1 or true will check rc)
346 :    
347 :     Note, the position is calculated before the sequence is rc'd
348 :    
349 :     =cut
350 :    
351 :     sub locations_on_contig {
352 :     my ($self, $genome, $contig, $sequence, $from, $to, $check_reverse)=@_;
353 :     my $return=[];
354 :    
355 :     # get the dna sequence of the contig, and make sure it is uppercase
356 :     my $contig_ln=$fig->contig_ln($genome, $contig);
357 :     return $return unless ($contig_ln);
358 :     unless ($from) {$from=1}
359 :     unless ($to) {$to=$contig_ln}
360 :     if ($from > $to) {($from, $to)=($to, $from)}
361 :     my $dna_seq=$fig->dna_seq($genome, $contig."_".$from."_".$to);
362 :     $dna_seq=uc($dna_seq);
363 :    
364 :     # if we want to check the rc, we actually rc the query
365 :     $sequence=$fig->reverse_comp($sequence) if ($check_reverse);
366 :     $sequence=uc($sequence);
367 :    
368 :     # now find all the matches
369 :     my $posn=index($dna_seq, $sequence, 0);
370 :     while ($posn > -1) {
371 :     push @$return, $posn;
372 :     $posn=index($dna_seq, $sequence, $posn+1);
373 :     }
374 :     return $return;
375 :     }
376 :    
377 :    
378 :     =head2 scrolling_org_list
379 :    
380 :     This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.
381 :    
382 : overbeek 1.24 use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple, $default);
383 : redwards 1.5
384 :     multiple selections will only be set if $multiple is true
385 :    
386 : overbeek 1.24 default will set a default to override (maybe) korgs
387 :    
388 : redwards 1.5 =cut
389 :    
390 :     sub scrolling_org_list {
391 : overbeek 1.24 my ($self, $cgi, $multiple, $default)=@_;
392 : redwards 1.5 unless ($multiple) {$multiple=0}
393 :    
394 :     my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );
395 :    
396 :     #
397 :     # Canonical names must match the keywords used in the DBMS. They are
398 :     # defined in compute_genome_counts.pl
399 :     #
400 :     my %canonical = (
401 :     'All' => undef,
402 :     'Archaea' => 'Archaea',
403 :     'Bacteria' => 'Bacteria',
404 :     'Eucarya' => 'Eukaryota',
405 :     'Viruses' => 'Virus',
406 :     'Environmental samples' => 'Environmental Sample'
407 :     );
408 :    
409 :     my $req_dom = $cgi->param( 'domain' ) || 'All';
410 :     my @domains = $cgi->radio_group( -name => 'domain',
411 :     -default => $req_dom,
412 :     -override => 1,
413 :     -values => [ @display ]
414 :     );
415 :    
416 :     my $n_domain = 0;
417 :     my %dom_num = map { ( $_, $n_domain++ ) } @display;
418 :     my $req_dom_num = $dom_num{ $req_dom } || 0;
419 :    
420 :     #
421 :     # Viruses and Environmental samples must have completeness = All (that is
422 :     # how they are in the database). Otherwise, default is Only "complete".
423 :     #
424 :     my $req_comp = ( $req_dom_num > $dom_num{ 'Eucarya' } ) ? 'All'
425 :     : $cgi->param( 'complete' ) || 'Only "complete"';
426 :     my @complete = $cgi->radio_group( -name => 'complete',
427 :     -default => $req_comp,
428 :     -override => 1,
429 :     -values => [ 'All', 'Only "complete"' ]
430 :     );
431 :     #
432 :     # Use $fig->genomes( complete, restricted, domain ) to get org list:
433 :     #
434 :     my $complete = ( $req_comp =~ /^all$/i ) ? undef : "complete";
435 :    
436 :     my $orgs; my $label;
437 :     @$orgs = $fig->genomes( $complete, undef, $canonical{ $req_dom } );
438 :    
439 :     foreach (@$orgs) {
440 :     my $gs = $fig->genus_species($_);
441 :     my $gc=scalar $fig->all_contigs($_);
442 :     $label->{$_} = "$gs ($_) [$gc contigs]";
443 :     }
444 :    
445 :     @$orgs = sort {$label->{$a} cmp $label->{$b}} @$orgs;
446 :    
447 :     my $n_genomes = @$orgs;
448 :    
449 :     return ( "<TABLE>\n",
450 :     " <TR>\n",
451 :     " <TD>",
452 : redwards 1.6 $cgi->scrolling_list( -name => 'korgs',
453 :     -values => $orgs,
454 :     -labels => $label,
455 :     -size => 10,
456 :     -multiple => $multiple,
457 : overbeek 1.24 -default => $default,
458 : redwards 1.5 ), $cgi->br,
459 :     "$n_genomes genomes shown ",
460 :     $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,
461 :     " </TD>",
462 :     " <TD>",
463 :     join( "<br>", "<b>Domain(s) to show:</b>", @domains), "<br>\n",
464 :     join( "<br>", "<b>Completeness?</b>", @complete), "\n",
465 :     "</TD>",
466 :     " </TR>\n",
467 :     "</TABLE>\n",
468 :     );
469 :     }
470 :    
471 : redwards 1.21
472 :     =head2 scrolling_subsys_list
473 :    
474 :     Create a scrolling list of all subsystems. Just like scrolling_org_list, this will make the list and allow you to select multiples.
475 :    
476 :     use like this
477 :    
478 :     push @$html, $raelib->scrolling_subsys_list($cgi, $multiple);
479 :    
480 :     =cut
481 :    
482 :     sub scrolling_subsys_list {
483 :     my ($self, $cgi, $multiple)=@_;
484 :     $multiple=0 unless (defined $multiple);
485 : redwards 1.22 my @ss=sort {uc($a) cmp uc($b)} $fig->all_subsystems();
486 : redwards 1.21 my $label;
487 :     # generate labels for the list
488 :     foreach my $s (@ss) {my $k=$s; $k =~ s/\_/ /g; $k =~ s/ / /g; $k =~ s/\s+$//; $label->{$s}=$k}
489 :     return $cgi->scrolling_list(
490 :     -name => 'subsystems',
491 :     -values => \@ss,
492 :     -labels => $label,
493 :     -size => 10,
494 :     -multiple=> $multiple,
495 :     );
496 :     }
497 :    
498 :     =head2 subsys_names_for_display
499 :    
500 :     Return a list of subsystem names for display. This will take a list as an argument and return a nice clean list for display.
501 :    
502 :     $raelib->subsys_names_for_display(@ss);
503 :     or
504 :     $raelib->subsys_names_for_display($fig->all_subsystems());
505 :    
506 :     =cut
507 :    
508 :     sub subsys_names_for_display {
509 :     my ($self, @ss)=@_;
510 :     foreach (@ss) {s/\_/ /g; 1 while (s/ / /g); s/\s+$//}
511 :     return @ss;
512 :     }
513 :    
514 : redwards 1.17 =head2 GenBank
515 : redwards 1.5
516 : redwards 1.17 This object will take a genome number and return a Bio::Seq::RichSeq object that has the whole genome
517 :     in GenBank format. This should be a nice way of getting some data out, but will probably be quite slow
518 :     at building the object.
519 : redwards 1.1
520 : redwards 1.17 Note that you need to call this with the genome name and the contig. This will then go through that contig.
521 : redwards 1.1
522 : redwards 1.17 Something like this should work
523 :    
524 :     foreach my $contig ($fig->all_contigs($genome)) {
525 :     my $seqobj=FIGRob->GenBank($genome, $contig);
526 :     # process the contig
527 :     }
528 :    
529 :     =cut
530 :    
531 :     sub GenBank {
532 :     my ($self, $genome, $contig)=@_;
533 :     my $gs=$fig->genus_species($genome);
534 :     return unless ($gs);
535 :     unless ($contig) {
536 :     print STDERR "You didn't provide a contig for $gs. I think that was a mistake. Sorry\n";
537 :     return;
538 :     }
539 :     my $len=$fig->contig_ln($genome, $contig);
540 :     unless ($len) {
541 :     print STDERR "$contig from $gs doesn't appear to have a length. Is it right?\n";
542 :     return;
543 :     }
544 :    
545 :    
546 :     # first find all the pegs ...
547 :     my $features; # all the features in the genome
548 :     my $allpegs; # all the pegs
549 :     my $translation; # all the protein sequences
550 :     foreach my $peg ($fig->pegs_of($genome)) {
551 :     my @location=$fig->feature_location($peg);
552 :     my $func=$fig->function_of($peg);
553 :     foreach my $loc (@location) {
554 :     $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
555 :     my ($cg, $start, $stop)=($1, $2, $3);
556 :     next unless ($cg eq $contig);
557 :     # save this information for later
558 :     $features->{'peg'}->{$loc}=$func;
559 :     $allpegs->{'peg'}->{$loc}=$peg;
560 :     $translation->{'peg'}->{$loc}=$fig->get_translation($peg);
561 :     }
562 :     }
563 :     # ... and all the RNAs
564 :     foreach my $peg ($fig->rnas_of($genome)) {
565 :     my @location=$fig->feature_location($peg);
566 :     my $func=$fig->function_of($peg);
567 :     foreach my $loc (@location) {
568 :     $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
569 :     my ($cg, $start, $stop)=($1, $2, $3);
570 :     next unless ($cg eq $contig);
571 :     # save this information for later
572 :     $features->{'rna'}->{$loc}=$func;
573 :     $allpegs->{'rna'}->{$loc}=$peg;
574 :     }
575 :     }
576 :    
577 :    
578 :     # now get all the contigs out
579 :     my $seq=$fig->dna_seq($genome, $contig."_1_".$len);
580 :     my $description = "Contig $contig from " . $fig->genus_species($genome);
581 :     my $sobj=Bio::Seq->new(
582 :     -seq => $seq,
583 :     -id => $contig,
584 :     -desc => $description,
585 :     -accession_number => $genome
586 :     );
587 :     foreach my $prot (keys %{$features->{'peg'}}) {
588 :     $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
589 :     my ($cg, $start, $stop)=($1, $2, $3);
590 :     my $strand=1;
591 :     if ($stop < $start) {
592 :     ($stop, $start)=($start, $stop);
593 :     $strand=-1;
594 :     }
595 :    
596 :     my $feat=Bio::SeqFeature::Generic->new(
597 :     -start => $start,
598 :     -end => $stop,
599 :     -strand => $strand,
600 :     -primary => 'CDS',
601 :     -display_name => $allpegs->{'peg'}->{$prot},
602 :     -source_tag => 'the SEED',
603 :     -tag =>
604 :     {
605 :     db_xref => $allpegs->{'peg'}->{$prot},
606 :     note => 'Generated by the Fellowship for the Interpretation of Genomes',
607 :     function => $features->{'peg'}->{$prot},
608 :     translation => $translation->{'peg'}->{$prot}
609 :     }
610 :     );
611 :    
612 :     $sobj->add_SeqFeature($feat);
613 :     }
614 :    
615 :     foreach my $prot (keys %{$features->{'rna'}}) {
616 :     $prot =~ /^(.*)\_(\d+)\_(\d+)$/;
617 :     my ($cg, $start, $stop)=($1, $2, $3);
618 :     my $strand=1;
619 :     if ($stop < $start) {
620 :     ($stop, $start)=($start, $stop);
621 :     $strand=-1;
622 :     }
623 :    
624 :     my $feat=Bio::SeqFeature::Generic->new(
625 :     -start => $start,
626 :     -end => $stop,
627 :     -strand => $strand,
628 :     -primary => 'RNA',
629 :     -source_tag => 'the SEED',
630 :     -display_name => $allpegs->{'rna'}->{$prot},
631 :     -tag =>
632 :     {
633 :     db_xref => $allpegs->{'rna'}->{$prot},
634 :     note => 'Generated by the Fellowship for the Interpretation of Genomes',
635 :     function => $features->{'rna'}->{$prot},
636 :     }
637 :     );
638 :    
639 :     $sobj->add_SeqFeature($feat);
640 :     }
641 :     return $sobj;
642 :     }
643 :    
644 :     =head2 best_hit
645 :    
646 :     Returns the FIG id of the single best hit to a peg
647 :    
648 :     eg
649 :    
650 :     my $bh=$fr->best_hit($peg);
651 :     print 'function is ', scalar $fig->function_of($bh);
652 :    
653 :     =cut
654 :    
655 :     sub best_hit {
656 :     my ($self, $peg)=@_;
657 :     return unless ($peg);
658 :    
659 :     my ($maxN, $maxP)=(1, 1e-5);
660 :     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
661 :     return ${$sims[0]}[1];
662 :     }
663 : redwards 1.1
664 : redwards 1.23
665 :     =head1 read_fasta
666 :    
667 :     Read a fasta format file and return a reference to a hash with the data. The key is the ID and the value is the sequence. If you supply the optional keep comments then the comments (anything after the first white space are returned as a sepaarte hash).
668 :    
669 :     Usage:
670 :     my $fasta=$raelib->read_fasta($file);
671 :     my ($fasta, $comments)=$raelib->read_fasta($file, 1);
672 :    
673 :     =cut
674 :    
675 :     sub read_fasta {
676 :     my ($self, $file, $keepcomments)=@_;
677 :     open (IN, $file) || die "Can't open $file";
678 :     my %f; my $t; my $s; my %c;
679 : overbeek 1.25 while (unless <IN>) {
680 : redwards 1.23 chomp;
681 :     if (/^>/) {
682 :     if ($s) {
683 :     $f{$t}=$s;
684 :     undef $s;
685 :     }
686 :     s/^>(\S+)\s*//;
687 :     $t=$1;
688 :     $c{$t}=$_ if ($_);
689 :     }
690 :     else {$s .= $_}
691 :     }
692 :     $f{$t}=$s;
693 :     if ($keepcomments) {return (\%f, \%c)}
694 :     else {return \%f}
695 :     }
696 :    
697 :     =head1 rc
698 :    
699 :     Reverse complement. It's too easy.
700 :    
701 :     =cut
702 :    
703 :     sub rc {
704 :     my ($self, $seq)=@_;
705 :     $seq=~tr/GATCgatc/CTAGctag/;
706 :     $seq = reverse $seq;
707 :     return $seq;
708 :     }
709 :    
710 : redwards 1.1 1;
711 : redwards 1.17

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3