package SeedViewer::MetagenomeAnalysis; # $Id: MetagenomeAnalysis.pm,v 1.7 2008/04/30 21:00:42 olson Exp $ use strict; use warnings; use FIG_Config; use DBI; 1; use constant QUERY_DEFAULTS => { 1 => { evalue => '1e-05', align_len => 50 }, # RDP 2 => { evalue => '0.01' }, # SEED 3 => { evalue => '1e-05', align_len => 50 }, # Greengenes 4 => { evalue => '1e-05', align_len => 50 }, # LSU 5 => { evalue => '1e-05', align_len => 50 }, # SSU 6 => { evalue => '0.01' }, # Subsystem }; sub new { my ($class, $job) = @_; # check job unless (ref $job and $job->isa('RAST::Job') and $job->metagenome) { return undef; } # connect to database my $dbh; eval { my $dbms = $FIG_Config::mgrast_dbms; my $host = $FIG_Config::mgrast_dbhost; my $database = $FIG_Config::mgrast_db; my $user = $FIG_Config::mgrast_dbuser; my $password = ''; if ($dbms eq 'Pg') { $dbh = DBI->connect("DBI:Pg:dbname=$database", $user, $password, { RaiseError => 1, AutoCommit => 0, PrintError => 0 }) || die "database connect error."; } elsif ($dbms eq 'mysql' or $dbms eq '') # Default to mysql { $dbh = DBI->connect("DBI:mysql:database=$database;host=$host", $user, $password, { RaiseError => 1, AutoCommit => 0, PrintError => 0 }) || die "database connect error."; } else { die "MetagenomeAnalysis: unknown dbms '$dbms'"; } }; if ($@) { warn "Unable to connect to metagenomics database: $@\n"; return undef; } # create object my $self = { job => $job, dbh => $dbh, key2taxa => undef, query => {}, }; bless $self, $class; # load key2taxa mapping $self->get_key2taxa_mapping(); return $self; } sub job { return $_[0]->{job}; } sub dbh { return $_[0]->{dbh}; } sub dbtable { unless (defined $_[0]->{dbtable}) { $_[0]->{dbtable} = 'tax_sim_'.$_[0]->job->id; } return $_[0]->{dbtable}; } sub get_key2taxa_mapping { unless (defined($_[0]->{key2taxa})) { my $sth = $_[0]->dbh->prepare("select dbkey, str from tax_item"); $sth->execute; $_[0]->{key2taxa} = $sth->fetchall_hashref('dbkey'); } return $_[0]->{key2taxa}; } sub key2taxa { if(defined $_[1]) { my $t = $_[0]->{key2taxa}->{$_[1]}->{str}; $t =~ s/\t+$//; return $t; } return ''; } sub split_taxstr { my @r = split(':', $_[1]); return \@r; } sub join_taxstr { # do I really want an ending colon? return join(':', @{$_[1]}).':'; } sub evalue2log { return 10 * (log($_[1]) / log(10)); } sub log2evalue { return 10**($_[1]/10); } sub get_dataset_id { # mapping contained in table db_type # unstable, changes every time my $db_ids = { 'SEED' => 2, 'Greengenes' => 3, 'RDP' => 1, 'LSU' => 4, 'SSU' => 5, 'Subsystem' => 6, }; return $db_ids->{ $_[1] } || undef; } #****************************************************************************** #* MANAGING QUERY CRITERIA #****************************************************************************** =pod =over 4 =item * B (I) Set/get the expectation value which is currently used to query the database. Parameter I has to be a float or in '1e-5'-like format or undef. =cut sub query_evalue { if(scalar(@_)>1) { $_[0]->{query}->{evalue} = $_[1]; } return $_[0]->{query}->{evalue}; } =pod =item * B (I) Set/get the bitscore which is currently used to query the database. Parameter I has to be a float or undef. =cut sub query_bitscore { if(scalar(@_)>1) { $_[0]->{query}->{bitscore} = $_[1]; } return $_[0]->{query}->{bitscore}; } =pod =item * B (I) Set/get the minimum alignment which is currently used to query the database. Parameter I has to be a positive integer or undef. =cut sub query_align_len { if(scalar(@_)>1) { if($_[1] and $_[1]<0) { die "Alignment length has to be positive: ".$_[1]; } $_[0]->{query}->{align_len} = $_[1]; } return $_[0]->{query}->{align_len}; } =pod =item * B (I) Set/get the minimum percent identity which is currently used to query the database. Parameter I has to be a number in 0..100 or undef. =cut sub query_identity { if(scalar(@_)>1) { if($_[1] and ($_[1]<0 or $_[1]>100)) { die "Identity has to be between 0 and 100: ".$_[1]; } $_[0]->{query}->{identity} = $_[1]; } return $_[0]->{query}->{identity}; } =pod =item * B (I, [I]) Sets all query parameter to the values provided in the CGI query object I. This method recognises 'evalue', 'pvalue' (bitscore), 'alignment_length' and 'percent_identity' as query criteria. Any missing param will be set to undef. If the optional parameter I is set to one of the accepted datasets (db types), the method will additionally load the defaults for this type into the CGI object. =cut sub query_load_from_cgi { my ($self, $cgi, $dataset) = @_; unless(ref $cgi and $cgi->isa("CGI")) { die "Query load from cgi requires a valid CGI object."; } # load the defaults if necessary if($dataset and $self->get_dataset_id($dataset)) { my $d = $self->get_dataset_id($dataset); my @v = qw( evalue bitscore align_len identity ); foreach my $v (@v) { if(!defined($cgi->param($v)) and QUERY_DEFAULTS->{$d}->{$v}) { $cgi->param($v, QUERY_DEFAULTS->{$d}->{$v}); } } } # set the query params my $evalue = $cgi->param('evalue') || ''; $self->query_evalue($evalue); my $bitscore = $cgi->param('bitscore') || ''; $self->query_bitscore($bitscore); my $align_len = $cgi->param('align_len') || ''; $self->query_align_len($align_len); my $identity = $cgi->param('identity') || ''; $self->query_identity($identity); return $self; } =pod =item * B () Returns for the current query parameters the where clause as applicable to the tax_sim_XYZ table SQL queries. The method will take care of all conversions to eg the logscore evalues. =cut sub get_where_clause { my ($self) = @_; my @params; if($self->{query}->{evalue}) { push @params, "logpsc<=".$self->evalue2log($self->{query}->{evalue}); } if($self->{query}->{bitscore}) { push @params, "bsc>=".$self->{query}->{bitscore}; } if($self->{query}->{align_len}) { push @params, "ali_ln>=".$self->{query}->{align_len}; } if($self->{query}->{identity}) { push @params, "iden>=".$self->{query}->{identity}; } return join(' and ', @params); } #****************************************************************************** #* OTHER #****************************************************************************** =pod =item * B (I) Retrieve the sequence I from the metagenome job directory. =cut sub get_sequence { my ($self, $id) = @_; my $sequence_file = $self->job->org_dir.'/contigs'; my $sequence = ''; open(FASTA, "<$sequence_file") or die "Unable to read metagenome sequences: $!"; while() { next unless /^\>$id/; while() { last if /^>/; chomp; $sequence .= $_; } } return $sequence; } =pod =item * B (I) Given a dataset name (db_id), this method returns the total number of sequences that contain a hit. =cut sub get_hits_count { my ($self, $dataset) = @_; my $table = $self->dbtable; my $dbid = $self->get_dataset_id($dataset); my $where = $self->get_where_clause(); $where = ($where) ? "and $where" : ''; my $sth = $self->dbh->prepare("select count(*) from ( select id1, min(rank_psc) from $table where dbid=$dbid $where group by id1) as b"); $sth->execute; my ($result) = $sth->fetchrow_array; return $result; } =pod =item * B (I, [I, I, I]) Given a dataset name (db_id), this method returns the total counts for all taxonomy groups of a certain depth which are hit. If no group name I was given, the method returns counts for tax_group_1. Optionally, I may be 'tax_group_2' or 'tax_group_3' and in that case any optional provided filters I and I will be applied to the column 'tax_group_1' and 'tax_group_2' respectively. =cut sub get_group_counts { my ($self, $dataset, $group, $filter1, $filter2) = @_; my $table = $self->dbtable; my $dbid = $self->get_dataset_id($dataset); my $where = $self->get_where_clause(); $where = ($where) ? "and $where" : ''; $group = 'tax_group_1' unless($group); my @filters; push @filters, "tax_group_1='$filter1'" if($filter1); push @filters, "tax_group_2='$filter2'" if($filter2); my $filter = (scalar(@filters)) ? 'and '.join(' and ', @filters) : ''; my $sth = $self->dbh->prepare("select s.$group as tax, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid $filter group by s.$group"); $sth->execute; my $result = $sth->fetchall_arrayref(); return $result; } =pod =item * B (I) Given a dataset name (db_id), this method returns the total counts for all taxonomy strings which are hit. =cut sub get_taxa_counts { my ($self, $dataset) = @_; my $table = $self->dbtable; my $dbid = $self->get_dataset_id($dataset); my $where = $self->get_where_clause(); $where = ($where) ? "and $where" : ''; my $sth = $self->dbh->prepare("select s.tax_str as tax, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_str"); $sth->execute; my $result = $sth->fetchall_arrayref(); return $result; } =pod =item * B (I) Given a dataset name (db_id), this method returns the total counts for all subsystems which are hit. =cut sub get_subsystem_counts { my ($self, $dataset) = @_; my $table = $self->dbtable; my $dbid = $self->get_dataset_id($dataset); my $where = $self->get_where_clause(); $where = ($where) ? "and $where" : ''; #print STDERR "select s.tax_group_1, s.tax_group_2, s.tax_group_3, s.tax_str, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_group_1, s.tax_group_2, s.tax_group_3"; #die; my $sth = $self->dbh->prepare("select s.tax_group_1, s.tax_group_2, s.tax_group_3, s.tax_str, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_group_1, s.tax_group_2, s.tax_group_3"); $sth->execute; my $result = $sth->fetchall_arrayref(); return $result; } =pod =item * B (I, I) Given a dataset name (db_id), this method returns all sequence ids, the alignment length, the match id and the taxonomy string for all sequences which match the criteria and have their tax_str start with the filter string I. =cut sub get_sequence_subset { my ($self, $dataset, $filter) = @_; my $table = $self->dbtable; my $dbid = $self->get_dataset_id($dataset); my $where = $self->get_where_clause(); $where = ($where) ? "and $where" : ''; my $sth = $self->dbh->prepare("select s.id1, s.ali_ln, s.id2, s.tax_str from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid and s.tax_str like '$filter%'"); $sth->execute; my $result = $sth->fetchall_arrayref(); return $result; } =pod =item * B (I) Given a genome id (83333.1), this method returns all sequence ids, the alignment length, the match id and the taxonomy string for all sequences which match the criteria and have their tax_str start equal the genome tax string I. =cut sub get_recruitment_plot_data { my ($self, $genome) = @_; my $table = $self->dbtable; my $dbid = $self->get_dataset_id("SEED"); my $where = $self->get_where_clause(); $where = ($where) ? "and $where" : ''; my ($tax_id) = $self->dbh->selectrow_array("select tax_str from rdp_to_tax where seq_num='". $genome . "'"); my $sth = $self->dbh->prepare("select s.id1, s.id2, s.b2, s.e2, s.logpsc from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid and s.tax_str='$tax_id'"); $sth->execute; my $result = $sth->fetchall_arrayref(); return $result; } =pod =item * B (I, I, I) Given a sequence id I (id1) and a dataset name (db_id), this method returns the first I rows of hit data for this sequence. If no I is provided, it will default to 10. It returns (match id, taxonomy string, log evalue, bitscore, alignment length, percent identity, start1, end1) per hit. =cut sub get_hits_for_sequence { my ($self, $id, $dataset, $limit) = @_; my $table = $self->dbtable; my $dbid = $self->get_dataset_id($dataset); $limit = 10 unless ($limit); my $sth = $self->dbh->prepare("select id2, tax_str, logpsc, bsc, ali_ln, iden, b1, e1 from $table where id1=$id and dbid=$dbid and rank_psc<$limit;"); $sth->execute; my $result = $sth->fetchall_arrayref(); return $result; } =pod =item * B (I) Given a dataset name (db_id), this method returns the minimum and maximum alignment length. =cut sub get_align_len_range { my ($self, $dataset) = @_; my $table = $self->dbtable; my $dbid = $self->get_dataset_id($dataset); my $sth = $self->dbh->prepare("select min(ali_ln), max(ali_ln) from $table where dbid=$dbid"); $sth->execute; my ($min, $max) = $sth->fetchrow_array; return ($min, $max); } =pod =back =cut