[Bio] / SeedViewer / MetagenomeAnalysis.pm Repository:
ViewVC logotype

Diff of /SeedViewer/MetagenomeAnalysis.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Thu Apr 10 14:46:51 2008 UTC revision 1.8, Wed May 7 20:31:19 2008 UTC
# Line 12  Line 12 
12    
13    
14    
15    use constant QUERY_DEFAULTS =>
16      { 1 => { evalue => '1e-05', align_len => 50 }, # RDP
17        2 => { evalue => '0.01' }, # SEED
18        3 => { evalue => '1e-05', align_len => 50 }, # Greengenes
19        4 => { evalue => '1e-05', align_len => 50 }, # LSU
20        5 => { evalue => '1e-05', align_len => 50 }, # SSU
21        6 => { evalue => '0.01' }, # Subsystem
22      };
23    
24    
25    
26    
27  sub new {  sub new {
28    my ($class, $job) = @_;    my ($class, $job) = @_;
# Line 25  Line 36 
36    my $dbh;    my $dbh;
37    eval {    eval {
38    
39        my $dbms     = $FIG_Config::mgrast_dbms;
40      my $host     = $FIG_Config::mgrast_dbhost;      my $host     = $FIG_Config::mgrast_dbhost;
41      my $database = $FIG_Config::mgrast_db;      my $database = $FIG_Config::mgrast_db;
42      my $user     = $FIG_Config::mgrast_dbuser;      my $user     = $FIG_Config::mgrast_dbuser;
43      my $password = '';      my $password = '';
44    
45        if ($dbms eq 'Pg')
46        {
47            $dbh = DBI->connect("DBI:Pg:dbname=$database;host=$host", $user, $password,
48                            { RaiseError => 1, AutoCommit => 0, PrintError => 0 }) ||
49                                die "database connect error.";
50        }
51        elsif ($dbms eq 'mysql' or $dbms eq '') # Default to mysql
52        {
53      $dbh = DBI->connect("DBI:mysql:database=$database;host=$host", $user, $password,      $dbh = DBI->connect("DBI:mysql:database=$database;host=$host", $user, $password,
54                          { RaiseError => 1, AutoCommit => 0, PrintError => 0 }) ||                          { RaiseError => 1, AutoCommit => 0, PrintError => 0 }) ||
55                                            die "database connect error.";                                            die "database connect error.";
56        }
57        else
58        {
59            die "MetagenomeAnalysis: unknown dbms '$dbms'";
60        }
61    
62    };    };
63    if ($@) {    if ($@) {
64      warn "Unable to connect to metagenomics database: $@\n";      warn "Unable to connect to metagenomics database: $@\n";
# Line 43  Line 69 
69    my $self = { job => $job,    my $self = { job => $job,
70                 dbh => $dbh,                 dbh => $dbh,
71                 key2taxa => undef,                 key2taxa => undef,
72                   query => {},
73               };               };
74    bless $self, $class;    bless $self, $class;
75    
# Line 131  Line 158 
158  }  }
159    
160    
161    #******************************************************************************
162    #* MANAGING QUERY CRITERIA
163    #******************************************************************************
164    
165    =pod
166    
167    =over 4
168    
169    =item * B<query_evalue> (I<evalue>)
170    
171    Set/get the expectation value which is currently used to query the database.
172    Parameter I<evalue> has to be a float or in '1e-5'-like format or undef.
173    
174    =cut
175    
176    sub query_evalue {
177      if(scalar(@_)>1) {
178        $_[0]->{query}->{evalue} = $_[1];
179      }
180      return $_[0]->{query}->{evalue};
181    }
182    
183    
184    =pod
185    
186    =item * B<query_bitscore> (I<score>)
187    
188    Set/get the bitscore which is currently used to query the database.
189    Parameter I<score> has to be a float or undef.
190    
191    =cut
192    
193    sub query_bitscore {
194      if(scalar(@_)>1) {
195        $_[0]->{query}->{bitscore} = $_[1];
196      }
197      return $_[0]->{query}->{bitscore};
198    }
199    
200    
201    =pod
202    
203    =item * B<query_align_len> (I<length>)
204    
205    Set/get the minimum alignment which is currently used to query the database.
206    Parameter I<length> has to be a positive integer or undef.
207    
208    =cut
209    
210    sub query_align_len {
211      if(scalar(@_)>1) {
212        if($_[1] and $_[1]<0) {
213          die "Alignment length has to be positive: ".$_[1];
214        }
215        $_[0]->{query}->{align_len} = $_[1];
216      }
217      return $_[0]->{query}->{align_len};
218    }
219    
220    
221    =pod
222    
223    =item * B<query_identity> (I<percent>)
224    
225    Set/get the minimum percent identity which is currently used to query the database.
226    Parameter I<percent> has to be a number in 0..100 or undef.
227    
228    =cut
229    
230    sub query_identity {
231      if(scalar(@_)>1) {
232        if($_[1] and ($_[1]<0 or $_[1]>100)) {
233          die "Identity has to be between 0 and 100: ".$_[1];
234        }
235        $_[0]->{query}->{identity} = $_[1];
236      }
237      return $_[0]->{query}->{identity};
238    }
239    
240    
241    =pod
242    
243    =item * B<query_load_from_cgi> (I<cgi>, [I<dataset>])
244    
245    Sets all query parameter to the values provided in the CGI query object I<cgi>.
246    This method recognises 'evalue', 'pvalue' (bitscore), 'alignment_length' and
247    'percent_identity' as query criteria. Any missing param will be set to undef.
248    If the optional parameter I<dataset> is set to one of the accepted datasets
249    (db types), the method will additionally load the defaults for this type into
250    the CGI object.
251    
252    
253    =cut
254    
255    sub query_load_from_cgi {
256      my ($self, $cgi, $dataset) = @_;
257    
258      unless(ref $cgi and $cgi->isa("CGI")) {
259        die "Query load from cgi requires a valid CGI object.";
260      }
261    
262      # load the defaults if necessary
263      if($dataset and $self->get_dataset_id($dataset)) {
264    
265        my $d = $self->get_dataset_id($dataset);
266    
267        my @v = qw( evalue bitscore align_len identity );
268        foreach my $v (@v) {
269          if(!defined($cgi->param($v)) and QUERY_DEFAULTS->{$d}->{$v}) {
270            $cgi->param($v, QUERY_DEFAULTS->{$d}->{$v});
271          }
272        }
273      }
274    
275      # set the query params
276      my $evalue = $cgi->param('evalue') || '';
277      $self->query_evalue($evalue);
278    
279      my $bitscore = $cgi->param('bitscore') || '';
280      $self->query_bitscore($bitscore);
281    
282      my $align_len = $cgi->param('align_len') || '';
283      $self->query_align_len($align_len);
284    
285      my $identity = $cgi->param('identity') || '';
286      $self->query_identity($identity);
287    
288      return $self;
289    
290    }
291    
292    
293    =pod
294    
295    =item * B<get_where_clause> ()
296    
297    Returns for the current query parameters the where clause as applicable to the
298    tax_sim_XYZ table SQL queries. The method will take care of all conversions to
299    eg the logscore evalues.
300    
301    =cut
302    
303    sub get_where_clause {
304      my ($self) = @_;
305    
306      my @params;
307    
308      if($self->{query}->{evalue}) {
309        push @params, "logpsc<=".$self->evalue2log($self->{query}->{evalue});
310      }
311    
312      if($self->{query}->{bitscore}) {
313        push @params, "bsc>=".$self->{query}->{bitscore};
314      }
315    
316      if($self->{query}->{align_len}) {
317        push @params, "ali_ln>=".$self->{query}->{align_len};
318      }
319    
320      if($self->{query}->{identity}) {
321        push @params, "iden>=".$self->{query}->{identity};
322      }
323    
324      return join(' and ', @params);
325    
326    }
327    
328    
329    
330    #******************************************************************************
331    #* OTHER
332    #******************************************************************************
333    
334    
335    
336    
337    
338    =pod
339    
340  =item * B<get_sequence> (I<sequence_id>)  =item * B<get_sequence> (I<sequence_id>)
341    
342  Retrieve the sequence I<sequence_id> from the metagenome job directory.  Retrieve the sequence I<sequence_id> from the metagenome job directory.
# Line 159  Line 365 
365  }  }
366    
367    
368  =item * B<get_hits_count> (I<dataset_name>, I<where_clause>)  =pod
369    
370    =item * B<get_hits_count> (I<dataset_name>)
371    
372  Given a dataset name (db_id) and optional a where clause, this method returns  Given a dataset name (db_id), this method returns
373  the total number of sequences that contain a hit.  the total number of sequences that contain a hit.
374    
375  =cut  =cut
376    
377  sub get_hits_count {  sub get_hits_count {
378    my ($self, $dataset, $where) = @_;    my ($self, $dataset) = @_;
379    
380    my $table = $self->dbtable;    my $table = $self->dbtable;
381    my $dbid  = $self->get_dataset_id($dataset);    my $dbid  = $self->get_dataset_id($dataset);
382    $where = '1' unless ($where);    my $where = $self->get_where_clause();
383      $where = ($where) ? "and $where" : '';
384    
385    my $sth = $self->dbh->prepare("select count(*) from ( select id1, min(rank_psc) from $table where dbid=$dbid and $where group by id1) as b");    my $sth = $self->dbh->prepare("select count(*) from ( select id1, min(rank_psc) from $table where dbid=$dbid $where group by id1) as b");
386    $sth->execute;    $sth->execute;
387    my ($result) = $sth->fetchrow_array;    my ($result) = $sth->fetchrow_array;
388    
# Line 182  Line 391 
391  }  }
392    
393    
394    =pod
395    
396    =item * B<get_group_counts> (I<dataset_name>, [I<group>, I<filter1>, I<filter2>])
397    
398    Given a dataset name (db_id), this method returns the total counts for all
399    taxonomy groups of a certain depth which are hit. If no group name I<group>
400    was given, the method returns counts for tax_group_1.
401    Optionally, I<group> may be 'tax_group_2' or 'tax_group_3' and in that case
402    any optional provided filters I<filter1> and I<filter2> will be applied to
403    the column 'tax_group_1' and 'tax_group_2' respectively.
404    
405    =cut
406    
407    sub get_group_counts {
408      my ($self, $dataset, $group, $filter1, $filter2) = @_;
409    
410      my $table = $self->dbtable;
411      my $dbid  = $self->get_dataset_id($dataset);
412      my $where = $self->get_where_clause();
413      $where = ($where) ? "and $where" : '';
414      $group = 'tax_group_1' unless($group);
415    
416      my @filters;
417      push @filters, "tax_group_1='$filter1'" if($filter1);
418      push @filters, "tax_group_2='$filter2'" if($filter2);
419      my $filter = (scalar(@filters)) ? 'and '.join(' and ', @filters) : '';
420    
421      my $sth = $self->dbh->prepare("select s.$group as tax, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid $filter group by s.$group");
422      $sth->execute;
423      my $result = $sth->fetchall_arrayref();
424    
425      return $result;
426    
427    }
428    
429    
430    =pod
431    
432    =item * B<get_taxa_counts> (I<dataset_name>)
433    
434    Given a dataset name (db_id), this method returns the total counts for all
435    taxonomy strings which are hit.
436    
437    =cut
438    
439    sub get_taxa_counts {
440      my ($self, $dataset) = @_;
441    
442      my $table = $self->dbtable;
443      my $dbid  = $self->get_dataset_id($dataset);
444      my $where = $self->get_where_clause();
445      $where = ($where) ? "and $where" : '';
446    
447      my $sth = $self->dbh->prepare("select s.tax_str as tax, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_str");
448    
449      $sth->execute;
450      my $result = $sth->fetchall_arrayref();
451    
452      return $result;
453    
454    }
455    
456    
457    =pod
458    
459    =item * B<get_subsystem_counts> (I<dataset_name>)
460    
461    Given a dataset name (db_id), this method returns the total counts for all
462    subsystems which are hit.
463    
464    =cut
465    
466    sub get_subsystem_counts {
467      my ($self, $dataset) = @_;
468    
469      my $table = $self->dbtable;
470      my $dbid  = $self->get_dataset_id($dataset);
471      my $where = $self->get_where_clause();
472      $where = ($where) ? "and $where" : '';
473    
474      #print STDERR "select s.tax_group_1, s.tax_group_2, s.tax_group_3, s.tax_str, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_group_1, s.tax_group_2, s.tax_group_3";
475     #die;
476      my $sth = $self->dbh->prepare("select s.tax_group_1, s.tax_group_2, s.tax_group_3, s.tax_str, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_group_1, s.tax_group_2, s.tax_group_3");
477    
478      $sth->execute;
479      my $result = $sth->fetchall_arrayref();
480    
481      return $result;
482    
483    }
484    
485    
486    =pod
487    
488    =item * B<get_sequence_subset> (I<dataset_name>, I<filter>)
489    
490    Given a dataset name (db_id), this method returns all sequence ids,
491    the alignment length, the match id and the taxonomy string for all
492    sequences which match the criteria and have their tax_str start with
493    the filter string I<filter>.
494    
495    =cut
496    
497    sub get_sequence_subset {
498      my ($self, $dataset, $filter) = @_;
499    
500      my $table = $self->dbtable;
501      my $dbid  = $self->get_dataset_id($dataset);
502      my $where = $self->get_where_clause();
503      $where = ($where) ? "and $where" : '';
504    
505      my $sth = $self->dbh->prepare("select s.id1, s.ali_ln, s.id2, s.tax_str from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid and s.tax_str like '$filter%'");
506    
507      $sth->execute;
508      my $result = $sth->fetchall_arrayref();
509    
510      return $result;
511    
512    }
513    
514    
515    =pod
516    
517    =item * B<get_recruitment_plot_data> (I<genome>)
518    
519    Given a genome id (83333.1), this method returns all sequence ids,
520    the alignment length, the match id and the taxonomy string for all
521    sequences which match the criteria and have their tax_str start equal
522    the genome tax string I<filter>.
523    
524    =cut
525    
526    sub get_recruitment_plot_data {
527      my ($self, $genome) = @_;
528    
529      my $table = $self->dbtable;
530      my $dbid  = $self->get_dataset_id("SEED");
531      my $where = $self->get_where_clause();
532      $where = ($where) ? "and $where" : '';
533    
534      my ($tax_id) = $self->dbh->selectrow_array("select tax_str from rdp_to_tax where seq_num='". $genome . "'");
535    
536      my $sth = $self->dbh->prepare("select s.id1, s.id2, s.b2, s.e2, s.logpsc from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid and s.tax_str='$tax_id'");
537    
538      $sth->execute;
539      my $result = $sth->fetchall_arrayref();
540    
541      return $result;
542    
543    }
544    
545    
546    
547    
548    =pod
549    
550    =item * B<get_hits_for_sequence> (I<seq_id>, I<dataset>, I<limit>)
551    
552    Given a sequence id I<seq_id> (id1) and a dataset name (db_id), this method returns
553    the first I<limit> rows of hit data for this sequence. If no I<limit> is provided, it
554    will default to 10.
555    It returns (match id, taxonomy string, log evalue, bitscore, alignment length,
556    percent identity, start1, end1) per hit.
557    
558    =cut
559    
560    sub get_hits_for_sequence {
561      my ($self, $id, $dataset, $limit) = @_;
562    
563      my $table = $self->dbtable;
564      my $dbid  = $self->get_dataset_id($dataset);
565      $limit = 10 unless ($limit);
566    
567      my $sth = $self->dbh->prepare("select id2, tax_str, logpsc, bsc, ali_ln, iden, b1, e1 from $table where id1=$id and dbid=$dbid and rank_psc<$limit;");
568      $sth->execute;
569      my $result = $sth->fetchall_arrayref();
570    
571      return $result;
572    
573    }
574    
575    
576    =pod
577    
578  =item * B<get_align_len_range> (I<dataset_name>)  =item * B<get_align_len_range> (I<dataset_name>)
579    
580  Given a dataset name (db_id), this method returns  Given a dataset name (db_id), this method returns
# Line 195  Line 588 
588    my $table = $self->dbtable;    my $table = $self->dbtable;
589    my $dbid  = $self->get_dataset_id($dataset);    my $dbid  = $self->get_dataset_id($dataset);
590    
591    my $sth = $self->dbh->prepare("select min(ali_ln), max(ali_ln) from $table");    my $sth = $self->dbh->prepare("select min(ali_ln), max(ali_ln) from $table where dbid=$dbid");
592    $sth->execute;    $sth->execute;
593    my ($min, $max) = $sth->fetchrow_array;    my ($min, $max) = $sth->fetchrow_array;
594    
# Line 203  Line 596 
596    
597  }  }
598    
599    =pod
600    
601    =back
602    
603    =cut

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.8

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3