Parent Directory
|
Revision Log
All queries updated, still need to be fully tested
package SeedViewer::MetagenomeAnalysis; # $Id: MetagenomeAnalysis.pm,v 1.10 2008/06/11 15:17:20 jared Exp $ use strict; use warnings; use FIG_Config; use DBI; 1; use constant QUERY_DEFAULTS => { 1 => { evalue => '1e-05', align_len => 50 }, # RDP 2 => { evalue => '0.01' }, # SEED 3 => { evalue => '1e-05', align_len => 50 }, # Greengenes 4 => { evalue => '1e-05', align_len => 50 }, # LSU 5 => { evalue => '1e-05', align_len => 50 }, # SSU 6 => { evalue => '0.01' }, # Subsystem }; sub new { my ($class, $job) = @_; # check job unless (ref $job and $job->isa('RAST::Job') and $job->metagenome) { return undef; } # connect to database my $dbh; eval { my $dbms = $FIG_Config::mgrast_dbms; my $host = $FIG_Config::mgrast_dbhost; my $database = $FIG_Config::mgrast_db; my $user = $FIG_Config::mgrast_dbuser; my $password = ''; if ($dbms eq 'Pg') { $dbh = DBI->connect("DBI:Pg:dbname=$database;host=$host", $user, $password, { RaiseError => 1, AutoCommit => 0, PrintError => 0 }) || die "database connect error."; } elsif ($dbms eq 'mysql' or $dbms eq '') # Default to mysql { $dbh = DBI->connect("DBI:mysql:database=$database;host=$host", $user, $password, { RaiseError => 1, AutoCommit => 0, PrintError => 0 }) || die "database connect error."; } else { die "MetagenomeAnalysis: unknown dbms '$dbms'"; } }; if ($@) { warn "Unable to connect to metagenomics database: $@\n"; return undef; } # create object my $self = { job => $job, dbh => $dbh, key2taxa => undef, query => {}, }; bless $self, $class; # load key2taxa mapping $self->get_key2taxa_mapping(); return $self; } sub job { return $_[0]->{job}; } sub dbh { return $_[0]->{dbh}; } sub dbtable { unless (defined $_[0]->{dbtable}) { $_[0]->{dbtable} = 'tax_sim_'.$_[0]->job->id; } return $_[0]->{dbtable}; } sub dbtable_best_psc { unless (defined $_[0]->{dbtable}) { $_[0]->{dbtable} = 'tax_sim_best_by_psc_'.$_[0]->job->id; } return $_[0]->{dbtable}; } sub dbtable_best_iden { unless (defined $_[0]->{dbtable}) { $_[0]->{dbtable} = 'tax_sim_best_by_iden_'.$_[0]->job->id; } return $_[0]->{dbtable}; } sub get_key2taxa_mapping { unless (defined($_[0]->{key2taxa})) { my $sth = $_[0]->dbh->prepare("select dbkey, str from tax_item"); $sth->execute; $_[0]->{key2taxa} = $sth->fetchall_hashref('dbkey'); } return $_[0]->{key2taxa}; } sub key2taxa { if(defined $_[1]) { my $t = $_[0]->{key2taxa}->{$_[1]}->{str}; $t =~ s/\t+$//; return $t; } return ''; } sub split_taxstr { my @r = split(':', $_[1]); return \@r; } sub join_taxstr { # do I really want an ending colon? return join(':', @{$_[1]}).':'; } sub evalue2log { return 10 * (log($_[1]) / log(10)); } sub log2evalue { return 10**($_[1]/10); } sub get_dataset_id { # mapping contained in table db_type # unstable, changes every time my $db_ids = { 'SEED' => 2, 'Greengenes' => 3, 'RDP' => 1, 'LSU' => 4, 'SSU' => 5, 'Subsystem' => 6, }; return $db_ids->{ $_[1] } || undef; } #****************************************************************************** #* MANAGING QUERY CRITERIA #****************************************************************************** =pod =over 4 =item * B<query_evalue> (I<evalue>) Set/get the expectation value which is currently used to query the database. Parameter I<evalue> has to be a float or in '1e-5'-like format or undef. =cut sub query_evalue { if(scalar(@_)>1) { $_[0]->{query}->{evalue} = $_[1]; } return $_[0]->{query}->{evalue}; } =pod =item * B<query_bitscore> (I<score>) Set/get the bitscore which is currently used to query the database. Parameter I<score> has to be a float or undef. =cut sub query_bitscore { if(scalar(@_)>1) { $_[0]->{query}->{bitscore} = $_[1]; } return $_[0]->{query}->{bitscore}; } =pod =item * B<query_align_len> (I<length>) Set/get the minimum alignment which is currently used to query the database. Parameter I<length> has to be a positive integer or undef. =cut sub query_align_len { if(scalar(@_)>1) { if($_[1] and $_[1]<0) { die "Alignment length has to be positive: ".$_[1]; } $_[0]->{query}->{align_len} = $_[1]; } return $_[0]->{query}->{align_len}; } =pod =item * B<query_identity> (I<percent>) Set/get the minimum percent identity which is currently used to query the database. Parameter I<percent> has to be a number in 0..100 or undef. =cut sub query_identity { if(scalar(@_)>1) { if($_[1] and ($_[1]<0 or $_[1]>100)) { die "Identity has to be between 0 and 100: ".$_[1]; } $_[0]->{query}->{identity} = $_[1]; } return $_[0]->{query}->{identity}; } =pod =item * B<query_load_from_cgi> (I<cgi>, [I<dataset>]) Sets all query parameter to the values provided in the CGI query object I<cgi>. This method recognises 'evalue', 'pvalue' (bitscore), 'alignment_length' and 'percent_identity' as query criteria. Any missing param will be set to undef. If the optional parameter I<dataset> is set to one of the accepted datasets (db types), the method will additionally load the defaults for this type into the CGI object. =cut sub query_load_from_cgi { my ($self, $cgi, $dataset) = @_; unless(ref $cgi and $cgi->isa("CGI")) { die "Query load from cgi requires a valid CGI object."; } # load the defaults if necessary if($dataset and $self->get_dataset_id($dataset)) { my $d = $self->get_dataset_id($dataset); my @v = qw( evalue bitscore align_len identity ); foreach my $v (@v) { if(!defined($cgi->param($v)) and QUERY_DEFAULTS->{$d}->{$v}) { $cgi->param($v, QUERY_DEFAULTS->{$d}->{$v}); } } } # set the query params my $evalue = $cgi->param('evalue') || ''; $self->query_evalue($evalue); my $bitscore = $cgi->param('bitscore') || ''; $self->query_bitscore($bitscore); my $align_len = $cgi->param('align_len') || ''; $self->query_align_len($align_len); my $identity = $cgi->param('identity') || ''; $self->query_identity($identity); return $self; } =pod =item * B<get_where_clause> () Returns for the current query parameters the where clause as applicable to the tax_sim_XYZ table SQL queries. The method will take care of all conversions to eg the logscore evalues. =cut sub get_where_clause { my ($self) = @_; my @params; if($self->{query}->{evalue}) { push @params, "logpsc<=".$self->evalue2log($self->{query}->{evalue}); } if($self->{query}->{bitscore}) { push @params, "bsc>=".$self->{query}->{bitscore}; } if($self->{query}->{align_len}) { push @params, "ali_ln>=".$self->{query}->{align_len}; } if($self->{query}->{identity}) { push @params, "iden>=".$self->{query}->{identity}; } return join(' and ', @params); } #****************************************************************************** #* OTHER #****************************************************************************** =pod =item * B<get_sequence> (I<sequence_id>) Retrieve the sequence I<sequence_id> from the metagenome job directory. =cut sub get_sequence { my ($self, $id) = @_; my $sequence_file = $self->job->org_dir.'/contigs'; my $sequence = ''; open(FASTA, "<$sequence_file") or die "Unable to read metagenome sequences: $!"; while(<FASTA>) { next unless /^\>$id/; while(<FASTA>) { last if /^>/; chomp; $sequence .= $_; } } return $sequence; } =pod =item * B<get_hits_count> (I<dataset_name>) Given a dataset name (db_id), this method returns the total number of sequences that contain a hit. =cut sub get_hits_count { my ($self, $dataset) = @_; my $table = $self->dbtable_best_psc; my $dbid = $self->get_dataset_id($dataset); my $where = $self->get_where_clause(); $where = ($where) ? "and $where" : ''; my $sth = $self->dbh->prepare("select count(distinct id1) from $table where dbid=$dbid $where"); $sth->execute; my ($result) = $sth->fetchrow_array; return $result; } =pod =item * B<get_group_counts> (I<dataset_name>, [I<group>, I<filter1>, I<filter2>]) Given a dataset name (db_id), this method returns the total counts for all taxonomy groups of a certain depth which are hit. If no group name I<group> was given, the method returns counts for tax_group_1. Optionally, I<group> may be 'tax_group_2' or 'tax_group_3' and in that case any optional provided filters I<filter1> and I<filter2> will be applied to the column 'tax_group_1' and 'tax_group_2' respectively. =cut sub get_group_counts { my ($self, $dataset, $group, $filter1, $filter2) = @_; my $table = $self->dbtable_best_psc; my $dbid = $self->get_dataset_id($dataset); my $where = $self->get_where_clause(); $where = ($where) ? "and $where" : ''; $group = 'tax_group_1' unless($group); my @filters; push @filters, "tax_group_1='$filter1'" if($filter1); push @filters, "tax_group_2='$filter2'" if($filter2); my $filter = (scalar(@filters)) ? 'and '.join(' and ', @filters) : ''; my $sth = $self->dbh->prepare("select $group as tax, count(*) as num from $table where dbid=$dbid $where $filter group by tax"); $sth->execute; my $result = $sth->fetchall_arrayref(); return $result; } =pod =item * B<get_taxa_counts> (I<dataset_name>) Given a dataset name (db_id), this method returns the total counts for all taxonomy strings which are hit. =cut sub get_taxa_counts { my ($self, $dataset) = @_; my $table = $self->dbtable_best_psc; my $dbid = $self->get_dataset_id($dataset); my $where = $self->get_where_clause(); $where = ($where) ? "and $where" : ''; my $sth = $self->dbh->prepare("select tax_str as tax, count(*) from $table where dbid=$dbid group by tax"); $sth->execute; my $result = $sth->fetchall_arrayref(); return $result; } =pod =item * B<get_subsystem_counts> (I<dataset_name>) Given a dataset name (db_id), this method returns the total counts for all subsystems which are hit. =cut sub get_subsystem_counts { my ($self, $dataset) = @_; my $table = $self->dbtable_best_psc; my $dbid = $self->get_dataset_id($dataset); my $where = $self->get_where_clause(); $where = ($where) ? "and $where" : ''; my $sth = $self->dbh->prepare("select tax_group_1, tax_group_2, tax_group_3, tax_str, count(*) as num from $table where dbid=$dbid group by tax_group_1, tax_group_2, tax_group_3, tax_str"); $sth->execute; my $result = $sth->fetchall_arrayref(); return $result; } =pod =item * B<get_sequence_subset> (I<dataset_name>, I<filter>) Given a dataset name (db_id), this method returns all sequence ids, the alignment length, the match id and the taxonomy string for all sequences which match the criteria and have their tax_str start with the filter string I<filter>. =cut sub get_sequence_subset { my ($self, $dataset, $filter) = @_; my $table = $self->dbtable_best_psc; my $dbid = $self->get_dataset_id($dataset); my $where = $self->get_where_clause(); $where = ($where) ? "and $where" : ''; my $sth = $self->dbh->prepare("select id1, ali_ln, id2, tax_str from $table where dbid=$dbid $where and tax_str like '$filter%'"); $sth->execute; my $result = $sth->fetchall_arrayref(); return $result; } =pod =item * B<get_recruitment_plot_data> (I<genome>) Given a genome id (83333.1), this method returns all sequence ids, the alignment length, the match id and the taxonomy string for all sequences which match the criteria and have their tax_str start equal the genome tax string I<filter>. =cut sub get_recruitment_plot_data { my ($self, $genome) = @_; my $table = $self->dbtable_best_psc; my $dbid = $self->get_dataset_id("SEED"); my $where = $self->get_where_clause(); $where = ($where) ? "and $where" : ''; my ($tax_id) = $self->dbh->selectrow_array("select tax_str from rdp_to_tax where seq_num='". $genome . "'"); if($tax_id =~ /(\S+)\s/){ $tax_id = $1; } my $sth = $self->dbh->prepare("select id1, id2, b2, e2, logpsc from $table where dbid=$dbid $where and tax_str='$tax_id'"); $sth->execute; my $result = $sth->fetchall_arrayref(); return $result; } =pod =item * B<get_hits_for_sequence> (I<seq_id>, I<dataset>, I<limit>) Given a sequence id I<seq_id> (id1) and a dataset name (db_id), this method returns the first I<limit> rows of hit data for this sequence. If no I<limit> is provided, it will default to 10. It returns (match id, taxonomy string, log evalue, bitscore, alignment length, percent identity, start1, end1) per hit. =cut sub get_hits_for_sequence { my ($self, $id, $dataset, $limit) = @_; my $table = $self->dbtable; my $dbid = $self->get_dataset_id($dataset); $limit = 10 unless ($limit); my $sth = $self->dbh->prepare("select id2, tax_str, logpsc, bsc, ali_ln, iden, b1, e1 from $table where id1=$id and dbid=$dbid and rank_psc<$limit;"); $sth->execute; my $result = $sth->fetchall_arrayref(); return $result; } =pod =item * B<get_align_len_range> (I<dataset_name>) Given a dataset name (db_id), this method returns the minimum and maximum alignment length. =cut sub get_align_len_range { my ($self, $dataset) = @_; my $table = $self->dbtable; my $dbid = $self->get_dataset_id($dataset); my $sth = $self->dbh->prepare("select min(ali_ln), max(ali_ln) from $table where dbid=$dbid"); $sth->execute; my ($min, $max) = $sth->fetchrow_array; return ($min, $max); } =pod =back =cut
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |