Parent Directory
|
Revision Log
Revision 1.18 - (view) (download) (as text)
1 : | redwards | 1.1 | # -*- perl -*- |
2 : | |||
3 : | =pod | ||
4 : | |||
5 : | parrello | 1.13 | =head1 RAE Library |
6 : | redwards | 1.1 | |
7 : | Some routines and things that Rob uses. Please feel free to use at will and incorporate into | ||
8 : | your own code or move them into FIG.pm or elsewhere. | ||
9 : | |||
10 : | =cut | ||
11 : | |||
12 : | package raelib; | ||
13 : | use strict; | ||
14 : | redwards | 1.17 | use Bio::SeqIO; |
15 : | use Bio::Seq; | ||
16 : | use Bio::SeqFeature::Generic; | ||
17 : | redwards | 1.1 | use FIG; |
18 : | my $fig=new FIG; | ||
19 : | |||
20 : | redwards | 1.5 | =head2 new |
21 : | |||
22 : | Just instantiate the object and return $self | ||
23 : | |||
24 : | =cut | ||
25 : | |||
26 : | sub new { | ||
27 : | my $self=shift; | ||
28 : | return $self | ||
29 : | } | ||
30 : | |||
31 : | |||
32 : | |||
33 : | |||
34 : | redwards | 1.4 | =head2 features_on_contig |
35 : | |||
36 : | Returns a reference to an array containing all the features on a contig in a genome. | ||
37 : | |||
38 : | use: | ||
39 : | |||
40 : | my $arrayref=$rae->features_on_contig($genome, $contig); | ||
41 : | |||
42 : | or | ||
43 : | |||
44 : | foreach my $peg (@{$rae->features_on_contig($genome, $contig)}) { | ||
45 : | ... blah blah ... | ||
46 : | } | ||
47 : | |||
48 : | returns undef if contig is not a part of genome or there is nothing to return, otherwise returns a list of pegs | ||
49 : | |||
50 : | v. experimental and guaranteed not to work! | ||
51 : | |||
52 : | =cut | ||
53 : | |||
54 : | sub features_on_contig { | ||
55 : | my ($self, $genome, $contig)=@_; | ||
56 : | # were this in FIG.pm you'd use this line: | ||
57 : | #my $rdbH = $self->db_handle; | ||
58 : | |||
59 : | my $rdbH = $fig->db_handle; | ||
60 : | my $relational_db_response=$rdbH->SQL('SELECT id FROM features WHERE (genome = \'' . $genome . '\' AND location ~* \'' . $contig . '\')'); | ||
61 : | # this is complicated. A reference to an array of references to arrays, and we only want the first element. | ||
62 : | # simplify. | ||
63 : | my @results; | ||
64 : | foreach my $res (@$relational_db_response) {push @results, $res->[0]} | ||
65 : | return \@results; | ||
66 : | } | ||
67 : | |||
68 : | |||
69 : | |||
70 : | |||
71 : | |||
72 : | |||
73 : | |||
74 : | redwards | 1.7 | =head2 pirsfcorrespondence |
75 : | redwards | 1.1 | |
76 : | redwards | 1.18 | Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written. This is so that we can easily go back and forth. |
77 : | redwards | 1.1 | |
78 : | redwards | 1.18 | The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat |
79 : | redwards | 1.1 | |
80 : | redwards | 1.18 | This method takes three arguments: |
81 : | redwards | 1.9 | from : pirsfinfo.dat file |
82 : | to : file to write information to | ||
83 : | verbose : report on progress | ||
84 : | |||
85 : | redwards | 1.18 | Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly. |
86 : | |||
87 : | Returns the number of lines in the pirsinfo file that were read. | ||
88 : | redwards | 1.9 | |
89 : | redwards | 1.1 | =cut |
90 : | |||
91 : | redwards | 1.7 | sub pirsfcorrespondence { |
92 : | redwards | 1.9 | my ($self, $from, $to, $verbose)=@_; |
93 : | redwards | 1.10 | unless (-e $from) { |
94 : | print STDERR "File $from does not exist as called in $0\n"; | ||
95 : | return 0; | ||
96 : | } | ||
97 : | redwards | 1.18 | if ($from =~ /\.gz$/) { |
98 : | open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe"; | ||
99 : | } | ||
100 : | else { | ||
101 : | open (IN, $from) || die "Can't open $from"; | ||
102 : | } | ||
103 : | redwards | 1.8 | open (OUT, ">$to") || die "Can't write to $to"; |
104 : | redwards | 1.9 | my $linecount; |
105 : | redwards | 1.1 | while (<IN>) { |
106 : | redwards | 1.9 | $linecount++; |
107 : | redwards | 1.14 | if ($verbose && !($linecount % 10000)) {print STDERR "Parsed $linecount lines\n"} |
108 : | redwards | 1.1 | if (/^>/) {print OUT; next} |
109 : | chomp; | ||
110 : | redwards | 1.14 | foreach my $peg ($self->swiss_pir_ids($_)) { |
111 : | redwards | 1.1 | print OUT $_, "\t", $peg, "\n"; |
112 : | } | ||
113 : | redwards | 1.14 | } |
114 : | close IN; | ||
115 : | close OUT; | ||
116 : | return $linecount; | ||
117 : | } | ||
118 : | |||
119 : | =head2 uniprotcorrespondence | ||
120 : | |||
121 : | Generate a correspondence table between uniprot knowledge base IDs and FIG ID's. | ||
122 : | |||
123 : | The uniprot KB file is in the form: UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name | ||
124 : | |||
125 : | This method takes three arguments: | ||
126 : | from : uniprotKB file | ||
127 : | to : file to write information to | ||
128 : | verbose : report on progress | ||
129 : | |||
130 : | redwards | 1.18 | Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly. |
131 : | |||
132 : | Returns the number of lines in the uniprotkb file that were read. | ||
133 : | redwards | 1.14 | |
134 : | =cut | ||
135 : | |||
136 : | sub uniprotcorrespondence { | ||
137 : | my ($self, $from, $to, $verbose)=@_; | ||
138 : | unless (-e $from) { | ||
139 : | print STDERR "File $from does not exist as called in $0\n"; | ||
140 : | return 0; | ||
141 : | } | ||
142 : | redwards | 1.18 | if ($from =~ /\.gz$/) { |
143 : | open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe"; | ||
144 : | } | ||
145 : | else { | ||
146 : | open (IN, $from) || die "Can't open $from"; | ||
147 : | } | ||
148 : | redwards | 1.14 | open (OUT, ">$to") || die "Can't write to $to"; |
149 : | my $linecount; | ||
150 : | while (<IN>) { | ||
151 : | chomp; | ||
152 : | $linecount++; | ||
153 : | if ($verbose && !($linecount % 10000)) {print STDERR "Parsed $linecount lines\n"} | ||
154 : | my @line=split /\s+\|\s+/; | ||
155 : | redwards | 1.16 | my $added; |
156 : | redwards | 1.14 | foreach my $peg ($self->swiss_pir_ids($line[0])) { |
157 : | print OUT "$_ | $peg\n"; | ||
158 : | redwards | 1.16 | $added=1; |
159 : | redwards | 1.12 | } |
160 : | redwards | 1.16 | unless ($added) {print OUT "$_\n"} |
161 : | redwards | 1.1 | } |
162 : | close IN; | ||
163 : | close OUT; | ||
164 : | redwards | 1.9 | return $linecount; |
165 : | redwards | 1.1 | } |
166 : | |||
167 : | redwards | 1.18 | =head2 prositecorrespondence |
168 : | |||
169 : | Generate a correspondence table between prosite and seed using sp id's and seed ids. | ||
170 : | |||
171 : | The SwissProt prosite file is from ftp://ca.expasy.org/databases/prosite/release_with_updates/prosite.dat and is in horrible swiss prot format, so we'll parse out those things that we need and put them in the file | ||
172 : | |||
173 : | The output file will have the following columns: | ||
174 : | |||
175 : | prosite family accession number, prosite family name, family type, swiss-prot protein id, fig protein id. | ||
176 : | |||
177 : | The family type is one of rule, pattern, or matrix. Right now (Prosite Release 19.2 of 24-May-2005) there are 4 rules, 1322 patterns, and 521 matrices. | ||
178 : | |||
179 : | This method takes three arguments: | ||
180 : | from : prosite file | ||
181 : | to : file to write information to | ||
182 : | verbose : report on progress | ||
183 : | |||
184 : | Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly. | ||
185 : | |||
186 : | Returns the number of lines in the prosite file that were read. | ||
187 : | |||
188 : | =cut | ||
189 : | |||
190 : | sub prositecorrespondence { | ||
191 : | my ($self, $from, $to, $verbose)=@_; | ||
192 : | unless (-e $from) { | ||
193 : | print STDERR "File $from does not exist as called in $0\n"; | ||
194 : | return 0; | ||
195 : | } | ||
196 : | if ($from =~ /\.gz$/) { | ||
197 : | open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe"; | ||
198 : | } | ||
199 : | else { | ||
200 : | open (IN, $from) || die "Can't open $from"; | ||
201 : | } | ||
202 : | open (OUT, ">$to") || die "Can't write to $to"; | ||
203 : | my $linecount; | ||
204 : | my ($famac, $famname, $famtype)=('','',''); | ||
205 : | while (<IN>) { | ||
206 : | chomp; | ||
207 : | $linecount++; | ||
208 : | if ($verbose && !($linecount % 10000)) {print STDERR "Parsed $linecount lines\n"} | ||
209 : | if (m#//#) {($famac, $famname, $famtype)=('','',''); next} | ||
210 : | elsif (m/^ID\s*(.*?);\s*(\S+)/) {($famname, $famtype)=($1, $2); next} | ||
211 : | elsif (m/^AC\s*(\S+)/) {$famac=$1; next} | ||
212 : | next unless (m/^DR/); # ignore all the other crap in the prosite file for now. Note we might, at some point, want to grab all that, but that is for another time. | ||
213 : | # | ||
214 : | # this is the format of the DR lines: | ||
215 : | # DR P11460, FATB_VIBAN , T; P40409, FEUA_BACSU , T; P37580, FHUD_BACSU , T; | ||
216 : | s/^DR\s*//; | ||
217 : | foreach my $piece (split /\s*\;\s*/, $_) { | ||
218 : | my ($acc, $nam, $unk)=split /\s*\,\s*/, $piece; | ||
219 : | foreach my $fig ($self->swiss_pir_ids($acc)) { | ||
220 : | print join "\t", ($famac, $famname, $famtype, $acc, $fig), "\n"; | ||
221 : | } | ||
222 : | } | ||
223 : | } | ||
224 : | } | ||
225 : | redwards | 1.14 | |
226 : | =head2 swiss_pir_ids() | ||
227 : | |||
228 : | redwards | 1.18 | SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's, and return an array of FIG id's that match the ID. |
229 : | redwards | 1.14 | |
230 : | =cut | ||
231 : | |||
232 : | sub swiss_pir_ids { | ||
233 : | my ($self, $id)=@_; | ||
234 : | return () unless ($id); | ||
235 : | redwards | 1.18 | $id =~ s/^\s+//; $id =~ s/\s+$//; # trim off the whitespace |
236 : | |||
237 : | redwards | 1.15 | my @return=($fig->by_alias("uni|$id")); |
238 : | redwards | 1.14 | return @return if ($return[0]); |
239 : | |||
240 : | redwards | 1.15 | @return=($fig->by_alias("tr|$id")); |
241 : | redwards | 1.14 | return @return if ($return[0]); |
242 : | |||
243 : | redwards | 1.15 | @return=($fig->by_alias("sp|$id")); |
244 : | redwards | 1.14 | return @return if ($return[0]); |
245 : | |||
246 : | return (); | ||
247 : | } | ||
248 : | redwards | 1.1 | |
249 : | =head2 ss_by_id | ||
250 : | |||
251 : | Generate a list of subsystems that a peg occurs in. This is a ; separated list. | ||
252 : | This is a wrapper that removes roles and ignores essential things | ||
253 : | |||
254 : | =cut | ||
255 : | |||
256 : | sub ss_by_id { | ||
257 : | my ($self, $peg)=@_; | ||
258 : | my $ssout; | ||
259 : | foreach my $ss (sort $fig->subsystems_for_peg($peg)) | ||
260 : | { | ||
261 : | next if ($$ss[0] =~ /essential/i); # Ignore the Essential B-subtilis subsystems | ||
262 : | $ssout.=$$ss[0]."; "; | ||
263 : | } | ||
264 : | $ssout =~ s/; $//; | ||
265 : | return $ssout; | ||
266 : | } | ||
267 : | |||
268 : | redwards | 1.3 | =head2 ss_by_homol |
269 : | |||
270 : | Generate a list of subsystems that homologs of a peg occur in. This is a ; separated list. | ||
271 : | This is also a wrapper around sims and ss, but makes everything unified | ||
272 : | |||
273 : | =cut | ||
274 : | |||
275 : | sub ss_by_homol { | ||
276 : | my ($self, $peg)=@_; | ||
277 : | return unless ($peg); | ||
278 : | my ($maxN, $maxP)=(50, 1e-20); | ||
279 : | |||
280 : | # find the sims | ||
281 : | my @sims=$fig->sims($peg, $maxN, $maxP, 'fig'); | ||
282 : | |||
283 : | # we are only going to keep the best hit for each peg | ||
284 : | # in a subsystem | ||
285 : | my $best_ss_score; my $best_ss_id; | ||
286 : | foreach my $sim (@sims) | ||
287 : | { | ||
288 : | my $simpeg=$$sim[1]; | ||
289 : | my $simscore=$$sim[10]; | ||
290 : | my @subsys=$fig->subsystems_for_peg($simpeg); | ||
291 : | foreach my $ss (@subsys) | ||
292 : | { | ||
293 : | if (! defined $best_ss_score->{$$ss[0]}) {$best_ss_score->{$$ss[0]}=$simscore; $best_ss_id->{$$ss[0]}=$simpeg} | ||
294 : | elsif ($best_ss_score->{$$ss[0]} > $simscore) | ||
295 : | { | ||
296 : | $best_ss_score->{$$ss[0]}=$simscore; | ||
297 : | $best_ss_id->{$$ss[0]}=$simpeg; | ||
298 : | } | ||
299 : | } | ||
300 : | } | ||
301 : | |||
302 : | my $ssoutput=join "", (map {"$_ (".$best_ss_id->{$_}."), "} keys %$best_ss_id); | ||
303 : | |||
304 : | $ssoutput =~ s/, $//; | ||
305 : | return $ssoutput; | ||
306 : | } | ||
307 : | |||
308 : | =head2 tagvalue | ||
309 : | |||
310 : | This will just check for tag value pairs and return either an array of values or a single ; separated list (if called as a scalar) | ||
311 : | |||
312 : | e.g. $values=raelib->tagvalue($peg, "PIRSF"); print join "\n", @$values; | ||
313 : | |||
314 : | Returns an empty array if no tag/value appropriate. | ||
315 : | |||
316 : | Just because I use this a lot I don't want to waste rewriting it. | ||
317 : | |||
318 : | =cut | ||
319 : | |||
320 : | sub tagvalue { | ||
321 : | my ($self, $peg, $tag)=@_; | ||
322 : | my @return; | ||
323 : | my @attr=$fig->feature_attributes($peg); | ||
324 : | foreach my $attr (@attr) { | ||
325 : | redwards | 1.11 | my ($gotpeg, $gottag, $val, $link)=@$attr; |
326 : | redwards | 1.3 | push @return, $val if ($gottag eq $tag); |
327 : | } | ||
328 : | return wantarray ? @return : join "; ", @return; | ||
329 : | } | ||
330 : | redwards | 1.1 | |
331 : | redwards | 1.5 | =head2 locations_on_contig |
332 : | |||
333 : | Return the locations of a sequence on a contig. | ||
334 : | |||
335 : | This will look for exact matches to a sequence on a contig, and return a reference to an array that has all the locations. | ||
336 : | |||
337 : | my $locations=$raelib->locations_on_contig($genome, $contig, 'GATC', undef); | ||
338 : | foreach my $bp (@$locations) { ... do something ... } | ||
339 : | |||
340 : | first argument : genome number | ||
341 : | second argument : contig name | ||
342 : | third argument : sequence to look for | ||
343 : | fourth argument : beginning position to start looking from (can be undef) | ||
344 : | fifth argument : end position to stop looking from (can be undef) | ||
345 : | sixth argument : check reverse complement (0 or undef will check forward, 1 or true will check rc) | ||
346 : | |||
347 : | Note, the position is calculated before the sequence is rc'd | ||
348 : | |||
349 : | =cut | ||
350 : | |||
351 : | sub locations_on_contig { | ||
352 : | my ($self, $genome, $contig, $sequence, $from, $to, $check_reverse)=@_; | ||
353 : | my $return=[]; | ||
354 : | |||
355 : | # get the dna sequence of the contig, and make sure it is uppercase | ||
356 : | my $contig_ln=$fig->contig_ln($genome, $contig); | ||
357 : | return $return unless ($contig_ln); | ||
358 : | unless ($from) {$from=1} | ||
359 : | unless ($to) {$to=$contig_ln} | ||
360 : | if ($from > $to) {($from, $to)=($to, $from)} | ||
361 : | my $dna_seq=$fig->dna_seq($genome, $contig."_".$from."_".$to); | ||
362 : | $dna_seq=uc($dna_seq); | ||
363 : | |||
364 : | # if we want to check the rc, we actually rc the query | ||
365 : | $sequence=$fig->reverse_comp($sequence) if ($check_reverse); | ||
366 : | $sequence=uc($sequence); | ||
367 : | |||
368 : | # now find all the matches | ||
369 : | my $posn=index($dna_seq, $sequence, 0); | ||
370 : | while ($posn > -1) { | ||
371 : | push @$return, $posn; | ||
372 : | $posn=index($dna_seq, $sequence, $posn+1); | ||
373 : | } | ||
374 : | return $return; | ||
375 : | } | ||
376 : | |||
377 : | |||
378 : | =head2 scrolling_org_list | ||
379 : | |||
380 : | This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once. | ||
381 : | |||
382 : | use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple); | ||
383 : | |||
384 : | multiple selections will only be set if $multiple is true | ||
385 : | |||
386 : | =cut | ||
387 : | |||
388 : | sub scrolling_org_list { | ||
389 : | my ($self, $cgi, $multiple)=@_; | ||
390 : | unless ($multiple) {$multiple=0} | ||
391 : | |||
392 : | my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' ); | ||
393 : | |||
394 : | # | ||
395 : | # Canonical names must match the keywords used in the DBMS. They are | ||
396 : | # defined in compute_genome_counts.pl | ||
397 : | # | ||
398 : | my %canonical = ( | ||
399 : | 'All' => undef, | ||
400 : | 'Archaea' => 'Archaea', | ||
401 : | 'Bacteria' => 'Bacteria', | ||
402 : | 'Eucarya' => 'Eukaryota', | ||
403 : | 'Viruses' => 'Virus', | ||
404 : | 'Environmental samples' => 'Environmental Sample' | ||
405 : | ); | ||
406 : | |||
407 : | my $req_dom = $cgi->param( 'domain' ) || 'All'; | ||
408 : | my @domains = $cgi->radio_group( -name => 'domain', | ||
409 : | -default => $req_dom, | ||
410 : | -override => 1, | ||
411 : | -values => [ @display ] | ||
412 : | ); | ||
413 : | |||
414 : | my $n_domain = 0; | ||
415 : | my %dom_num = map { ( $_, $n_domain++ ) } @display; | ||
416 : | my $req_dom_num = $dom_num{ $req_dom } || 0; | ||
417 : | |||
418 : | # | ||
419 : | # Viruses and Environmental samples must have completeness = All (that is | ||
420 : | # how they are in the database). Otherwise, default is Only "complete". | ||
421 : | # | ||
422 : | my $req_comp = ( $req_dom_num > $dom_num{ 'Eucarya' } ) ? 'All' | ||
423 : | : $cgi->param( 'complete' ) || 'Only "complete"'; | ||
424 : | my @complete = $cgi->radio_group( -name => 'complete', | ||
425 : | -default => $req_comp, | ||
426 : | -override => 1, | ||
427 : | -values => [ 'All', 'Only "complete"' ] | ||
428 : | ); | ||
429 : | # | ||
430 : | # Use $fig->genomes( complete, restricted, domain ) to get org list: | ||
431 : | # | ||
432 : | my $complete = ( $req_comp =~ /^all$/i ) ? undef : "complete"; | ||
433 : | |||
434 : | my $orgs; my $label; | ||
435 : | @$orgs = $fig->genomes( $complete, undef, $canonical{ $req_dom } ); | ||
436 : | |||
437 : | foreach (@$orgs) { | ||
438 : | my $gs = $fig->genus_species($_); | ||
439 : | my $gc=scalar $fig->all_contigs($_); | ||
440 : | $label->{$_} = "$gs ($_) [$gc contigs]"; | ||
441 : | } | ||
442 : | |||
443 : | @$orgs = sort {$label->{$a} cmp $label->{$b}} @$orgs; | ||
444 : | |||
445 : | my $n_genomes = @$orgs; | ||
446 : | |||
447 : | return ( "<TABLE>\n", | ||
448 : | " <TR>\n", | ||
449 : | " <TD>", | ||
450 : | redwards | 1.6 | $cgi->scrolling_list( -name => 'korgs', |
451 : | -values => $orgs, | ||
452 : | -labels => $label, | ||
453 : | -size => 10, | ||
454 : | -multiple => $multiple, | ||
455 : | redwards | 1.5 | ), $cgi->br, |
456 : | "$n_genomes genomes shown ", | ||
457 : | $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br, | ||
458 : | " </TD>", | ||
459 : | " <TD>", | ||
460 : | join( "<br>", "<b>Domain(s) to show:</b>", @domains), "<br>\n", | ||
461 : | join( "<br>", "<b>Completeness?</b>", @complete), "\n", | ||
462 : | "</TD>", | ||
463 : | " </TR>\n", | ||
464 : | "</TABLE>\n", | ||
465 : | $cgi->hr | ||
466 : | ); | ||
467 : | } | ||
468 : | |||
469 : | redwards | 1.17 | =head2 GenBank |
470 : | redwards | 1.5 | |
471 : | redwards | 1.17 | This object will take a genome number and return a Bio::Seq::RichSeq object that has the whole genome |
472 : | in GenBank format. This should be a nice way of getting some data out, but will probably be quite slow | ||
473 : | at building the object. | ||
474 : | redwards | 1.1 | |
475 : | redwards | 1.17 | Note that you need to call this with the genome name and the contig. This will then go through that contig. |
476 : | redwards | 1.1 | |
477 : | redwards | 1.17 | Something like this should work |
478 : | |||
479 : | foreach my $contig ($fig->all_contigs($genome)) { | ||
480 : | my $seqobj=FIGRob->GenBank($genome, $contig); | ||
481 : | # process the contig | ||
482 : | } | ||
483 : | |||
484 : | =cut | ||
485 : | |||
486 : | sub GenBank { | ||
487 : | my ($self, $genome, $contig)=@_; | ||
488 : | my $gs=$fig->genus_species($genome); | ||
489 : | return unless ($gs); | ||
490 : | unless ($contig) { | ||
491 : | print STDERR "You didn't provide a contig for $gs. I think that was a mistake. Sorry\n"; | ||
492 : | return; | ||
493 : | } | ||
494 : | my $len=$fig->contig_ln($genome, $contig); | ||
495 : | unless ($len) { | ||
496 : | print STDERR "$contig from $gs doesn't appear to have a length. Is it right?\n"; | ||
497 : | return; | ||
498 : | } | ||
499 : | |||
500 : | |||
501 : | # first find all the pegs ... | ||
502 : | my $features; # all the features in the genome | ||
503 : | my $allpegs; # all the pegs | ||
504 : | my $translation; # all the protein sequences | ||
505 : | foreach my $peg ($fig->pegs_of($genome)) { | ||
506 : | my @location=$fig->feature_location($peg); | ||
507 : | my $func=$fig->function_of($peg); | ||
508 : | foreach my $loc (@location) { | ||
509 : | $loc =~ /^(.*)\_(\d+)\_(\d+)$/; | ||
510 : | my ($cg, $start, $stop)=($1, $2, $3); | ||
511 : | next unless ($cg eq $contig); | ||
512 : | # save this information for later | ||
513 : | $features->{'peg'}->{$loc}=$func; | ||
514 : | $allpegs->{'peg'}->{$loc}=$peg; | ||
515 : | $translation->{'peg'}->{$loc}=$fig->get_translation($peg); | ||
516 : | } | ||
517 : | } | ||
518 : | # ... and all the RNAs | ||
519 : | foreach my $peg ($fig->rnas_of($genome)) { | ||
520 : | my @location=$fig->feature_location($peg); | ||
521 : | my $func=$fig->function_of($peg); | ||
522 : | foreach my $loc (@location) { | ||
523 : | $loc =~ /^(.*)\_(\d+)\_(\d+)$/; | ||
524 : | my ($cg, $start, $stop)=($1, $2, $3); | ||
525 : | next unless ($cg eq $contig); | ||
526 : | # save this information for later | ||
527 : | $features->{'rna'}->{$loc}=$func; | ||
528 : | $allpegs->{'rna'}->{$loc}=$peg; | ||
529 : | } | ||
530 : | } | ||
531 : | |||
532 : | |||
533 : | # now get all the contigs out | ||
534 : | my $seq=$fig->dna_seq($genome, $contig."_1_".$len); | ||
535 : | my $description = "Contig $contig from " . $fig->genus_species($genome); | ||
536 : | my $sobj=Bio::Seq->new( | ||
537 : | -seq => $seq, | ||
538 : | -id => $contig, | ||
539 : | -desc => $description, | ||
540 : | -accession_number => $genome | ||
541 : | ); | ||
542 : | foreach my $prot (keys %{$features->{'peg'}}) { | ||
543 : | $prot =~ /^(.*)\_(\d+)\_(\d+)$/; | ||
544 : | my ($cg, $start, $stop)=($1, $2, $3); | ||
545 : | my $strand=1; | ||
546 : | if ($stop < $start) { | ||
547 : | ($stop, $start)=($start, $stop); | ||
548 : | $strand=-1; | ||
549 : | } | ||
550 : | |||
551 : | my $feat=Bio::SeqFeature::Generic->new( | ||
552 : | -start => $start, | ||
553 : | -end => $stop, | ||
554 : | -strand => $strand, | ||
555 : | -primary => 'CDS', | ||
556 : | -display_name => $allpegs->{'peg'}->{$prot}, | ||
557 : | -source_tag => 'the SEED', | ||
558 : | -tag => | ||
559 : | { | ||
560 : | db_xref => $allpegs->{'peg'}->{$prot}, | ||
561 : | note => 'Generated by the Fellowship for the Interpretation of Genomes', | ||
562 : | function => $features->{'peg'}->{$prot}, | ||
563 : | translation => $translation->{'peg'}->{$prot} | ||
564 : | } | ||
565 : | ); | ||
566 : | |||
567 : | $sobj->add_SeqFeature($feat); | ||
568 : | } | ||
569 : | |||
570 : | foreach my $prot (keys %{$features->{'rna'}}) { | ||
571 : | $prot =~ /^(.*)\_(\d+)\_(\d+)$/; | ||
572 : | my ($cg, $start, $stop)=($1, $2, $3); | ||
573 : | my $strand=1; | ||
574 : | if ($stop < $start) { | ||
575 : | ($stop, $start)=($start, $stop); | ||
576 : | $strand=-1; | ||
577 : | } | ||
578 : | |||
579 : | my $feat=Bio::SeqFeature::Generic->new( | ||
580 : | -start => $start, | ||
581 : | -end => $stop, | ||
582 : | -strand => $strand, | ||
583 : | -primary => 'RNA', | ||
584 : | -source_tag => 'the SEED', | ||
585 : | -display_name => $allpegs->{'rna'}->{$prot}, | ||
586 : | -tag => | ||
587 : | { | ||
588 : | db_xref => $allpegs->{'rna'}->{$prot}, | ||
589 : | note => 'Generated by the Fellowship for the Interpretation of Genomes', | ||
590 : | function => $features->{'rna'}->{$prot}, | ||
591 : | } | ||
592 : | ); | ||
593 : | |||
594 : | $sobj->add_SeqFeature($feat); | ||
595 : | } | ||
596 : | return $sobj; | ||
597 : | } | ||
598 : | |||
599 : | =head2 best_hit | ||
600 : | |||
601 : | Returns the FIG id of the single best hit to a peg | ||
602 : | |||
603 : | eg | ||
604 : | |||
605 : | my $bh=$fr->best_hit($peg); | ||
606 : | print 'function is ', scalar $fig->function_of($bh); | ||
607 : | |||
608 : | =cut | ||
609 : | |||
610 : | sub best_hit { | ||
611 : | my ($self, $peg)=@_; | ||
612 : | return unless ($peg); | ||
613 : | |||
614 : | my ($maxN, $maxP)=(1, 1e-5); | ||
615 : | my @sims=$fig->sims($peg, $maxN, $maxP, 'fig'); | ||
616 : | return ${$sims[0]}[1]; | ||
617 : | } | ||
618 : | redwards | 1.1 | |
619 : | 1; | ||
620 : | redwards | 1.17 |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |