Parent Directory
|
Revision Log
Revision 1.3 - (view) (download) (as text)
1 : | paarmann | 1.1 | package SeedViewer::MetagenomeAnalysis; |
2 : | |||
3 : | paarmann | 1.3 | # $Id: MetagenomeAnalysis.pm,v 1.2 2008/04/10 14:46:51 paarmann Exp $ |
4 : | paarmann | 1.1 | |
5 : | use strict; | ||
6 : | use warnings; | ||
7 : | |||
8 : | use FIG_Config; | ||
9 : | use DBI; | ||
10 : | |||
11 : | 1; | ||
12 : | |||
13 : | |||
14 : | |||
15 : | paarmann | 1.3 | use constant QUERY_DEFAULTS => |
16 : | { 1 => { evalue => '1e-05', align_len => 50 }, # RDP | ||
17 : | 2 => { evalue => '0.01' }, # SEED | ||
18 : | 3 => { evalue => '1e-05', align_len => 50 }, # Greengenes | ||
19 : | 4 => { evalue => '1e-05', align_len => 50 }, # LSU | ||
20 : | 5 => { evalue => '1e-05', align_len => 50 }, # SSU | ||
21 : | 6 => { evalue => '0.01' }, # Subsystem | ||
22 : | }; | ||
23 : | |||
24 : | |||
25 : | |||
26 : | paarmann | 1.1 | |
27 : | sub new { | ||
28 : | my ($class, $job) = @_; | ||
29 : | |||
30 : | # check job | ||
31 : | unless (ref $job and $job->isa('RAST::Job') and $job->metagenome) { | ||
32 : | return undef; | ||
33 : | } | ||
34 : | |||
35 : | # connect to database | ||
36 : | my $dbh; | ||
37 : | eval { | ||
38 : | |||
39 : | my $host = $FIG_Config::mgrast_dbhost; | ||
40 : | my $database = $FIG_Config::mgrast_db; | ||
41 : | my $user = $FIG_Config::mgrast_dbuser; | ||
42 : | my $password = ''; | ||
43 : | |||
44 : | $dbh = DBI->connect("DBI:mysql:database=$database;host=$host", $user, $password, | ||
45 : | { RaiseError => 1, AutoCommit => 0, PrintError => 0 }) || | ||
46 : | die "database connect error."; | ||
47 : | }; | ||
48 : | if ($@) { | ||
49 : | warn "Unable to connect to metagenomics database: $@\n"; | ||
50 : | return undef; | ||
51 : | } | ||
52 : | |||
53 : | # create object | ||
54 : | my $self = { job => $job, | ||
55 : | dbh => $dbh, | ||
56 : | key2taxa => undef, | ||
57 : | paarmann | 1.3 | query => {}, |
58 : | paarmann | 1.1 | }; |
59 : | bless $self, $class; | ||
60 : | |||
61 : | # load key2taxa mapping | ||
62 : | $self->get_key2taxa_mapping(); | ||
63 : | |||
64 : | return $self; | ||
65 : | |||
66 : | } | ||
67 : | |||
68 : | |||
69 : | |||
70 : | |||
71 : | sub job { | ||
72 : | return $_[0]->{job}; | ||
73 : | } | ||
74 : | |||
75 : | |||
76 : | sub dbh { | ||
77 : | return $_[0]->{dbh}; | ||
78 : | } | ||
79 : | |||
80 : | |||
81 : | sub dbtable { | ||
82 : | unless (defined $_[0]->{dbtable}) { | ||
83 : | $_[0]->{dbtable} = 'tax_sim_'.$_[0]->job->id; | ||
84 : | } | ||
85 : | return $_[0]->{dbtable}; | ||
86 : | } | ||
87 : | |||
88 : | |||
89 : | sub get_key2taxa_mapping { | ||
90 : | |||
91 : | unless (defined($_[0]->{key2taxa})) { | ||
92 : | my $sth = $_[0]->dbh->prepare("select dbkey, str from tax_item"); | ||
93 : | $sth->execute; | ||
94 : | $_[0]->{key2taxa} = $sth->fetchall_hashref('dbkey'); | ||
95 : | } | ||
96 : | |||
97 : | return $_[0]->{key2taxa}; | ||
98 : | |||
99 : | } | ||
100 : | |||
101 : | |||
102 : | sub key2taxa { | ||
103 : | if(defined $_[1]) { | ||
104 : | my $t = $_[0]->{key2taxa}->{$_[1]}->{str}; | ||
105 : | $t =~ s/\t+$//; | ||
106 : | return $t; | ||
107 : | } | ||
108 : | return ''; | ||
109 : | } | ||
110 : | |||
111 : | |||
112 : | sub split_taxstr { | ||
113 : | my @r = split(':', $_[1]); | ||
114 : | return \@r; | ||
115 : | } | ||
116 : | |||
117 : | sub join_taxstr { | ||
118 : | # do I really want an ending colon? | ||
119 : | return join(':', @{$_[1]}).':'; | ||
120 : | } | ||
121 : | |||
122 : | |||
123 : | sub evalue2log { | ||
124 : | return 10 * (log($_[1]) / log(10)); | ||
125 : | } | ||
126 : | |||
127 : | sub log2evalue { | ||
128 : | return 10**($_[1]/10); | ||
129 : | } | ||
130 : | |||
131 : | |||
132 : | sub get_dataset_id { | ||
133 : | paarmann | 1.3 | # mapping contained in table db_type |
134 : | # unstable, changes every time | ||
135 : | paarmann | 1.1 | my $db_ids = { 'SEED' => 2, |
136 : | 'Greengenes' => 3, | ||
137 : | 'RDP' => 1, | ||
138 : | 'LSU' => 4, | ||
139 : | 'SSU' => 5, | ||
140 : | 'Subsystem' => 6, | ||
141 : | }; | ||
142 : | return $db_ids->{ $_[1] } || undef; | ||
143 : | } | ||
144 : | |||
145 : | |||
146 : | paarmann | 1.3 | #****************************************************************************** |
147 : | #* MANAGING QUERY CRITERIA | ||
148 : | #****************************************************************************** | ||
149 : | |||
150 : | =pod | ||
151 : | |||
152 : | =item * B<query_evalue> (I<evalue>) | ||
153 : | |||
154 : | Set/get the expectation value which is currently used to query the database. | ||
155 : | Parameter I<evalue> has to be a float or in '1e-5'-like format or undef. | ||
156 : | |||
157 : | =cut | ||
158 : | |||
159 : | sub query_evalue { | ||
160 : | if(scalar(@_)>1) { | ||
161 : | $_[0]->{query}->{evalue} = $_[1]; | ||
162 : | } | ||
163 : | return $_[0]->{query}->{evalue}; | ||
164 : | } | ||
165 : | |||
166 : | |||
167 : | =pod | ||
168 : | |||
169 : | =item * B<query_bitscore> (I<score>) | ||
170 : | |||
171 : | Set/get the bitscore which is currently used to query the database. | ||
172 : | Parameter I<score> has to be a float or undef. | ||
173 : | |||
174 : | =cut | ||
175 : | |||
176 : | sub query_bitscore { | ||
177 : | if(scalar(@_)>1) { | ||
178 : | $_[0]->{query}->{bitscore} = $_[1]; | ||
179 : | } | ||
180 : | return $_[0]->{query}->{bitscore}; | ||
181 : | } | ||
182 : | |||
183 : | |||
184 : | =pod | ||
185 : | |||
186 : | =item * B<query_align_len> (I<length>) | ||
187 : | |||
188 : | Set/get the minimum alignment which is currently used to query the database. | ||
189 : | Parameter I<length> has to be a positive integer or undef. | ||
190 : | |||
191 : | =cut | ||
192 : | |||
193 : | sub query_align_len { | ||
194 : | if(scalar(@_)>1) { | ||
195 : | if($_[1] and $_[1]<0) { | ||
196 : | die "Alignment length has to be positive: ".$_[1]; | ||
197 : | } | ||
198 : | $_[0]->{query}->{align_len} = $_[1]; | ||
199 : | } | ||
200 : | return $_[0]->{query}->{align_len}; | ||
201 : | } | ||
202 : | |||
203 : | |||
204 : | =pod | ||
205 : | |||
206 : | =item * B<query_identity> (I<percent>) | ||
207 : | |||
208 : | Set/get the minimum percent identity which is currently used to query the database. | ||
209 : | Parameter I<percent> has to be a number in 0..100 or undef. | ||
210 : | |||
211 : | =cut | ||
212 : | |||
213 : | sub query_identity { | ||
214 : | if(scalar(@_)>1) { | ||
215 : | if($_[1] and ($_[1]<0 or $_[1]>100)) { | ||
216 : | die "Identity has to be between 0 and 100: ".$_[1]; | ||
217 : | } | ||
218 : | $_[0]->{query}->{identity} = $_[1]; | ||
219 : | } | ||
220 : | return $_[0]->{query}->{identity}; | ||
221 : | } | ||
222 : | |||
223 : | |||
224 : | =pod | ||
225 : | |||
226 : | =item * B<query_load_from_cgi> (I<cgi>, [I<dataset>]) | ||
227 : | |||
228 : | Sets all query parameter to the values provided in the CGI query object I<cgi>. | ||
229 : | This method recognises 'evalue', 'pvalue' (bitscore), 'alignment_length' and | ||
230 : | 'percent_identity' as query criteria. Any missing param will be set to undef. | ||
231 : | If the optional parameter I<dataset> is set to one of the accepted datasets | ||
232 : | (db types), the method will additionally load the defaults for this type into | ||
233 : | the CGI object. | ||
234 : | |||
235 : | |||
236 : | =cut | ||
237 : | |||
238 : | sub query_load_from_cgi { | ||
239 : | my ($self, $cgi, $dataset) = @_; | ||
240 : | |||
241 : | unless(ref $cgi and $cgi->isa("CGI")) { | ||
242 : | die "Query load from cgi requires a valid CGI object."; | ||
243 : | } | ||
244 : | |||
245 : | # load the defaults if necessary | ||
246 : | if($dataset and $self->get_dataset_id($dataset)) { | ||
247 : | |||
248 : | my $d = $self->get_dataset_id($dataset); | ||
249 : | |||
250 : | my @v = qw( evalue bitscore align_len identity ); | ||
251 : | foreach my $v (@v) { | ||
252 : | if(!defined($cgi->param($v)) and QUERY_DEFAULTS->{$d}->{$v}) { | ||
253 : | $cgi->param($v, QUERY_DEFAULTS->{$d}->{$v}); | ||
254 : | } | ||
255 : | } | ||
256 : | } | ||
257 : | |||
258 : | # set the query params | ||
259 : | my $evalue = $cgi->param('evalue') || ''; | ||
260 : | $self->query_evalue($evalue); | ||
261 : | |||
262 : | my $bitscore = $cgi->param('bitscore') || ''; | ||
263 : | $self->query_bitscore($bitscore); | ||
264 : | |||
265 : | my $align_len = $cgi->param('align_len') || ''; | ||
266 : | $self->query_align_len($align_len); | ||
267 : | |||
268 : | my $identity = $cgi->param('identity') || ''; | ||
269 : | $self->query_identity($identity); | ||
270 : | |||
271 : | return $self; | ||
272 : | |||
273 : | } | ||
274 : | |||
275 : | |||
276 : | =pod | ||
277 : | |||
278 : | =item * B<get_where_clause> () | ||
279 : | |||
280 : | Returns for the current query parameters the where clause as applicable to the | ||
281 : | tax_sim_XYZ table SQL queries. The method will take care of all conversions to | ||
282 : | eg the logscore evalues. | ||
283 : | |||
284 : | =cut | ||
285 : | |||
286 : | sub get_where_clause { | ||
287 : | my ($self) = @_; | ||
288 : | |||
289 : | my @params; | ||
290 : | |||
291 : | if($self->{query}->{evalue}) { | ||
292 : | push @params, "logpsc<=".$self->evalue2log($self->{query}->{evalue}); | ||
293 : | } | ||
294 : | |||
295 : | if($self->{query}->{bitscore}) { | ||
296 : | push @params, "bsc>=".$self->{query}->{bitscore}; | ||
297 : | } | ||
298 : | |||
299 : | if($self->{query}->{align_len}) { | ||
300 : | push @params, "ali_ln>=".$self->{query}->{align_len}; | ||
301 : | } | ||
302 : | |||
303 : | if($self->{query}->{identity}) { | ||
304 : | push @params, "iden>=".$self->{query}->{identity}; | ||
305 : | } | ||
306 : | |||
307 : | return join(' and ', @params); | ||
308 : | |||
309 : | } | ||
310 : | |||
311 : | |||
312 : | |||
313 : | #****************************************************************************** | ||
314 : | #* OTHER | ||
315 : | #****************************************************************************** | ||
316 : | |||
317 : | |||
318 : | |||
319 : | |||
320 : | |||
321 : | =pod | ||
322 : | |||
323 : | paarmann | 1.1 | =item * B<get_sequence> (I<sequence_id>) |
324 : | |||
325 : | Retrieve the sequence I<sequence_id> from the metagenome job directory. | ||
326 : | |||
327 : | =cut | ||
328 : | |||
329 : | sub get_sequence { | ||
330 : | my ($self, $id) = @_; | ||
331 : | |||
332 : | my $sequence_file = $self->job->org_dir.'/contigs'; | ||
333 : | |||
334 : | my $sequence = ''; | ||
335 : | open(FASTA, "<$sequence_file") or die "Unable to read metagenome sequences: $!"; | ||
336 : | while(<FASTA>) { | ||
337 : | next unless /^\>$id/; | ||
338 : | |||
339 : | while(<FASTA>) { | ||
340 : | last if /^>/; | ||
341 : | chomp; | ||
342 : | $sequence .= $_; | ||
343 : | } | ||
344 : | } | ||
345 : | |||
346 : | return $sequence; | ||
347 : | |||
348 : | } | ||
349 : | |||
350 : | |||
351 : | paarmann | 1.3 | =pod |
352 : | paarmann | 1.1 | |
353 : | paarmann | 1.3 | =item * B<get_hits_count> (I<dataset_name>) |
354 : | |||
355 : | Given a dataset name (db_id), this method returns | ||
356 : | paarmann | 1.1 | the total number of sequences that contain a hit. |
357 : | |||
358 : | =cut | ||
359 : | |||
360 : | sub get_hits_count { | ||
361 : | paarmann | 1.3 | my ($self, $dataset) = @_; |
362 : | paarmann | 1.1 | |
363 : | my $table = $self->dbtable; | ||
364 : | my $dbid = $self->get_dataset_id($dataset); | ||
365 : | paarmann | 1.3 | my $where = $self->get_where_clause(); |
366 : | $where = ($where) ? "and $where" : ''; | ||
367 : | paarmann | 1.1 | |
368 : | paarmann | 1.3 | my $sth = $self->dbh->prepare("select count(*) from ( select id1, min(rank_psc) from $table where dbid=$dbid $where group by id1) as b"); |
369 : | paarmann | 1.1 | $sth->execute; |
370 : | my ($result) = $sth->fetchrow_array; | ||
371 : | |||
372 : | return $result; | ||
373 : | |||
374 : | } | ||
375 : | |||
376 : | |||
377 : | paarmann | 1.3 | =pod |
378 : | |||
379 : | =item * B<get_group_counts> (I<dataset_name>, [I<group>, I<filter1>, I<filter2>]) | ||
380 : | |||
381 : | Given a dataset name (db_id), this method returns the total counts for all | ||
382 : | taxonomy groups of a certain depth which are hit. If no group name I<group> | ||
383 : | was given, the method returns counts for tax_group_1. | ||
384 : | Optionally, I<group> may be 'tax_group_2' or 'tax_group_3' and in that case | ||
385 : | any optional provided filters I<filter1> and I<filter2> will be applied to | ||
386 : | the column 'tax_group_1' and 'tax_group_2' respectively. | ||
387 : | |||
388 : | =cut | ||
389 : | |||
390 : | sub get_group_counts { | ||
391 : | my ($self, $dataset, $group, $filter1, $filter2) = @_; | ||
392 : | |||
393 : | my $table = $self->dbtable; | ||
394 : | my $dbid = $self->get_dataset_id($dataset); | ||
395 : | my $where = $self->get_where_clause(); | ||
396 : | $where = ($where) ? "and $where" : ''; | ||
397 : | $group = 'tax_group_1' unless($group); | ||
398 : | |||
399 : | my @filters; | ||
400 : | push @filters, "tax_group_1='$filter1'" if($filter1); | ||
401 : | push @filters, "tax_group_2='$filter2'" if($filter2); | ||
402 : | my $filter = (scalar(@filters)) ? 'and '.join(' and ', @filters) : ''; | ||
403 : | |||
404 : | my $sth = $self->dbh->prepare("select s.$group as tax, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid $filter group by s.$group"); | ||
405 : | $sth->execute; | ||
406 : | my $result = $sth->fetchall_arrayref(); | ||
407 : | |||
408 : | return $result; | ||
409 : | |||
410 : | } | ||
411 : | |||
412 : | |||
413 : | =pod | ||
414 : | |||
415 : | =item * B<get_taxa_counts> (I<dataset_name>) | ||
416 : | |||
417 : | Given a dataset name (db_id), this method returns the total counts for all | ||
418 : | taxonomy strings which are hit. | ||
419 : | |||
420 : | =cut | ||
421 : | |||
422 : | sub get_taxa_counts { | ||
423 : | my ($self, $dataset) = @_; | ||
424 : | |||
425 : | my $table = $self->dbtable; | ||
426 : | my $dbid = $self->get_dataset_id($dataset); | ||
427 : | my $where = $self->get_where_clause(); | ||
428 : | $where = ($where) ? "and $where" : ''; | ||
429 : | |||
430 : | my $sth = $self->dbh->prepare("select s.tax_str as tax, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_str"); | ||
431 : | |||
432 : | $sth->execute; | ||
433 : | my $result = $sth->fetchall_arrayref(); | ||
434 : | |||
435 : | return $result; | ||
436 : | |||
437 : | } | ||
438 : | |||
439 : | |||
440 : | =pod | ||
441 : | |||
442 : | =item * B<get_subsystem_counts> (I<dataset_name>) | ||
443 : | |||
444 : | Given a dataset name (db_id), this method returns the total counts for all | ||
445 : | subsystems which are hit. | ||
446 : | |||
447 : | =cut | ||
448 : | |||
449 : | sub get_subsystem_counts { | ||
450 : | my ($self, $dataset) = @_; | ||
451 : | |||
452 : | my $table = $self->dbtable; | ||
453 : | my $dbid = $self->get_dataset_id($dataset); | ||
454 : | my $where = $self->get_where_clause(); | ||
455 : | $where = ($where) ? "and $where" : ''; | ||
456 : | |||
457 : | my $sth = $self->dbh->prepare("select s.tax_group_1, s.tax_group_2, s.tax_group_3, s.tax_str, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_group_1, s.tax_group_2, s.tax_group_3"); | ||
458 : | |||
459 : | $sth->execute; | ||
460 : | my $result = $sth->fetchall_arrayref(); | ||
461 : | |||
462 : | return $result; | ||
463 : | |||
464 : | } | ||
465 : | |||
466 : | |||
467 : | =pod | ||
468 : | |||
469 : | =item * B<get_sequence_subset> (I<dataset_name>, I<filter>) | ||
470 : | |||
471 : | Given a dataset name (db_id), this method returns all sequence ids, | ||
472 : | the alignment length, the match id and the taxonomy string for all | ||
473 : | sequences which match the criteria and have their tax_str start with | ||
474 : | the filter string I<filter>. | ||
475 : | |||
476 : | =cut | ||
477 : | |||
478 : | sub get_sequence_subset { | ||
479 : | my ($self, $dataset, $filter) = @_; | ||
480 : | |||
481 : | my $table = $self->dbtable; | ||
482 : | my $dbid = $self->get_dataset_id($dataset); | ||
483 : | my $where = $self->get_where_clause(); | ||
484 : | $where = ($where) ? "and $where" : ''; | ||
485 : | |||
486 : | my $sth = $self->dbh->prepare("select s.id1, s.ali_ln, s.id2, s.tax_str from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid and s.tax_str like '$filter%'"); | ||
487 : | |||
488 : | $sth->execute; | ||
489 : | my $result = $sth->fetchall_arrayref(); | ||
490 : | |||
491 : | return $result; | ||
492 : | |||
493 : | } | ||
494 : | |||
495 : | |||
496 : | =pod | ||
497 : | |||
498 : | paarmann | 1.1 | =item * B<get_align_len_range> (I<dataset_name>) |
499 : | |||
500 : | Given a dataset name (db_id), this method returns | ||
501 : | the minimum and maximum alignment length. | ||
502 : | |||
503 : | =cut | ||
504 : | |||
505 : | sub get_align_len_range { | ||
506 : | my ($self, $dataset) = @_; | ||
507 : | |||
508 : | my $table = $self->dbtable; | ||
509 : | my $dbid = $self->get_dataset_id($dataset); | ||
510 : | |||
511 : | paarmann | 1.3 | my $sth = $self->dbh->prepare("select min(ali_ln), max(ali_ln) from $table where dbid=$dbid"); |
512 : | paarmann | 1.1 | $sth->execute; |
513 : | my ($min, $max) = $sth->fetchrow_array; | ||
514 : | |||
515 : | return ($min, $max); | ||
516 : | |||
517 : | } | ||
518 : | |||
519 : |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |