[Bio] / SeedViewer / MetagenomeAnalysis.pm Repository:
ViewVC logotype

View of /SeedViewer/MetagenomeAnalysis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Tue Apr 22 19:56:18 2008 UTC (11 years, 7 months ago) by paarmann
Branch: MAIN
CVS Tags: rast_rel_2008_04_23
Changes since 1.3: +32 -2 lines
added method to get hits for sequence

package SeedViewer::MetagenomeAnalysis;

# $Id: MetagenomeAnalysis.pm,v 1.4 2008/04/22 19:56:18 paarmann Exp $

use strict;
use warnings;

use FIG_Config;
use DBI;

1;



use constant QUERY_DEFAULTS => 
  { 1 => { evalue => '1e-05', align_len => 50 }, # RDP
    2 => { evalue => '0.01' }, # SEED
    3 => { evalue => '1e-05', align_len => 50 }, # Greengenes
    4 => { evalue => '1e-05', align_len => 50 }, # LSU
    5 => { evalue => '1e-05', align_len => 50 }, # SSU
    6 => { evalue => '0.01' }, # Subsystem
  };
    
				 


sub new {
  my ($class, $job) = @_;
  
  # check job
  unless (ref $job and $job->isa('RAST::Job') and $job->metagenome) {
    return undef;
  }

  # connect to database
  my $dbh;
  eval {

    my $host     = $FIG_Config::mgrast_dbhost;
    my $database = $FIG_Config::mgrast_db;
    my $user     = $FIG_Config::mgrast_dbuser;
    my $password = '';
    
    $dbh = DBI->connect("DBI:mysql:database=$database;host=$host", $user, $password, 
			{ RaiseError => 1, AutoCommit => 0, PrintError => 0 }) ||
					  die "database connect error.";
  };
  if ($@) {
    warn "Unable to connect to metagenomics database: $@\n";
    return undef;
  }

  # create object
  my $self = { job => $job,
	       dbh => $dbh,
	       key2taxa => undef,
	       query => {},
	     };
  bless $self, $class;

  # load key2taxa mapping
  $self->get_key2taxa_mapping();
  
  return $self;

}




sub job {
  return $_[0]->{job};
}


sub dbh {
  return $_[0]->{dbh};
}


sub dbtable {
  unless (defined $_[0]->{dbtable}) {
    $_[0]->{dbtable} = 'tax_sim_'.$_[0]->job->id; 
  }
  return $_[0]->{dbtable};
}


sub get_key2taxa_mapping {

  unless (defined($_[0]->{key2taxa})) {
    my $sth  = $_[0]->dbh->prepare("select dbkey, str from tax_item");
    $sth->execute;
    $_[0]->{key2taxa} = $sth->fetchall_hashref('dbkey');
  }

  return $_[0]->{key2taxa};

}


sub key2taxa {    
  if(defined $_[1]) {
    my $t = $_[0]->{key2taxa}->{$_[1]}->{str};
    $t =~ s/\t+$//;
    return $t;
  }
  return '';
}


sub split_taxstr {
  my @r = split(':', $_[1]);
  return \@r;
}

sub join_taxstr {
    # do I really want an ending colon?
  return join(':', @{$_[1]}).':';
}


sub evalue2log {
  return 10 * (log($_[1]) / log(10));
}

sub log2evalue {
  return 10**($_[1]/10);
}


sub get_dataset_id {
  # mapping contained in table db_type
  # unstable, changes every time 
  my $db_ids = { 'SEED' => 2, 
		 'Greengenes' => 3, 
		 'RDP' => 1, 
		 'LSU' => 4, 
		 'SSU' => 5,
		 'Subsystem' => 6,
	       };
  return $db_ids->{ $_[1] } || undef;
}


#******************************************************************************
#* MANAGING QUERY CRITERIA
#******************************************************************************

=pod

=item * B<query_evalue> (I<evalue>)

Set/get the expectation value which is currently used to query the database. 
Parameter I<evalue> has to be a float or in '1e-5'-like format or undef.

=cut 

sub query_evalue {
  if(scalar(@_)>1) {
    $_[0]->{query}->{evalue} = $_[1];
  }
  return $_[0]->{query}->{evalue};
}


=pod

=item * B<query_bitscore> (I<score>)

Set/get the bitscore which is currently used to query the database. 
Parameter I<score> has to be a float or undef.

=cut 

sub query_bitscore {
  if(scalar(@_)>1) {
    $_[0]->{query}->{bitscore} = $_[1];
  }
  return $_[0]->{query}->{bitscore};
}


=pod

=item * B<query_align_len> (I<length>)

Set/get the minimum alignment which is currently used to query the database. 
Parameter I<length> has to be a positive integer or undef.

=cut 

sub query_align_len {
  if(scalar(@_)>1) {
    if($_[1] and $_[1]<0) {
      die "Alignment length has to be positive: ".$_[1];
    }
    $_[0]->{query}->{align_len} = $_[1];
  }
  return $_[0]->{query}->{align_len};
}


=pod

=item * B<query_identity> (I<percent>)

Set/get the minimum percent identity which is currently used to query the database. 
Parameter I<percent> has to be a number in 0..100 or undef.

=cut 

sub query_identity {
  if(scalar(@_)>1) {
    if($_[1] and ($_[1]<0 or $_[1]>100)) {
      die "Identity has to be between 0 and 100: ".$_[1];
    }
    $_[0]->{query}->{identity} = $_[1];
  }
  return $_[0]->{query}->{identity};
}


=pod

=item * B<query_load_from_cgi> (I<cgi>, [I<dataset>])

Sets all query parameter to the values provided in the CGI query object I<cgi>.
This method recognises 'evalue', 'pvalue' (bitscore), 'alignment_length' and
'percent_identity' as query criteria. Any missing param will be set to undef.
If the optional parameter I<dataset> is set to one of the accepted datasets
(db types), the method will additionally load the defaults for this type into
the CGI object.


=cut 

sub query_load_from_cgi {
  my ($self, $cgi, $dataset) = @_;
  
  unless(ref $cgi and $cgi->isa("CGI")) {
    die "Query load from cgi requires a valid CGI object.";
  }

  # load the defaults if necessary
  if($dataset and $self->get_dataset_id($dataset)) {

    my $d = $self->get_dataset_id($dataset);
    
    my @v = qw( evalue bitscore align_len identity );
    foreach my $v (@v) {
      if(!defined($cgi->param($v)) and QUERY_DEFAULTS->{$d}->{$v}) {
	$cgi->param($v, QUERY_DEFAULTS->{$d}->{$v});
      }
    }
  }
    
  # set the query params
  my $evalue = $cgi->param('evalue') || '';
  $self->query_evalue($evalue);

  my $bitscore = $cgi->param('bitscore') || '';
  $self->query_bitscore($bitscore);
  
  my $align_len = $cgi->param('align_len') || '';
  $self->query_align_len($align_len);
  
  my $identity = $cgi->param('identity') || '';
  $self->query_identity($identity);

  return $self;

}  
  

=pod

=item * B<get_where_clause> ()

Returns for the current query parameters the where clause as applicable to the 
tax_sim_XYZ table SQL queries. The method will take care of all conversions to
eg the logscore evalues.

=cut 

sub get_where_clause {
  my ($self) = @_;
  
  my @params;
  
  if($self->{query}->{evalue}) {
    push @params, "logpsc<=".$self->evalue2log($self->{query}->{evalue});
  }

  if($self->{query}->{bitscore}) {
    push @params, "bsc>=".$self->{query}->{bitscore};
  }
  
  if($self->{query}->{align_len}) {
    push @params, "ali_ln>=".$self->{query}->{align_len};
  }

  if($self->{query}->{identity}) {
    push @params, "iden>=".$self->{query}->{identity};
  }

  return join(' and ', @params);

}
  


#******************************************************************************
#* OTHER
#******************************************************************************





=pod

=item * B<get_sequence> (I<sequence_id>)

Retrieve the sequence I<sequence_id> from the metagenome job directory.

=cut 

sub get_sequence {
  my ($self, $id) = @_;
  
  my $sequence_file = $self->job->org_dir.'/contigs';

  my $sequence = '';
  open(FASTA, "<$sequence_file") or die "Unable to read metagenome sequences: $!";
  while(<FASTA>) {
    next unless /^\>$id/;

    while(<FASTA>) {
      last if /^>/;
      chomp;
      $sequence .= $_;
    }
  }
  
  return $sequence;

}


=pod

=item * B<get_hits_count> (I<dataset_name>)

Given a dataset name (db_id), this method returns
the total number of sequences that contain a hit. 

=cut 

sub get_hits_count {
  my ($self, $dataset) = @_;

  my $table = $self->dbtable; 
  my $dbid  = $self->get_dataset_id($dataset);
  my $where = $self->get_where_clause();
  $where = ($where) ? "and $where" : '';

  my $sth = $self->dbh->prepare("select count(*) from ( select id1, min(rank_psc) from $table where dbid=$dbid $where group by id1) as b");
  $sth->execute;
  my ($result) = $sth->fetchrow_array;

  return $result;

}


=pod

=item * B<get_group_counts> (I<dataset_name>, [I<group>, I<filter1>, I<filter2>])

Given a dataset name (db_id), this method returns the total counts for all 
taxonomy groups of a certain depth which are hit. If no group name I<group> 
was given, the method returns counts for tax_group_1.
Optionally, I<group> may be 'tax_group_2' or 'tax_group_3' and in that case
any optional provided filters I<filter1> and I<filter2> will be applied to 
the column 'tax_group_1' and 'tax_group_2' respectively.

=cut

sub get_group_counts {
  my ($self, $dataset, $group, $filter1, $filter2) = @_;

  my $table = $self->dbtable; 
  my $dbid  = $self->get_dataset_id($dataset);
  my $where = $self->get_where_clause();
  $where = ($where) ? "and $where" : '';  
  $group = 'tax_group_1' unless($group);
  
  my @filters;
  push @filters, "tax_group_1='$filter1'" if($filter1);
  push @filters, "tax_group_2='$filter2'" if($filter2);
  my $filter = (scalar(@filters)) ? 'and '.join(' and ', @filters) : '';

  my $sth = $self->dbh->prepare("select s.$group as tax, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid $filter group by s.$group");
  $sth->execute;
  my $result = $sth->fetchall_arrayref();

  return $result;

}


=pod

=item * B<get_taxa_counts> (I<dataset_name>)

Given a dataset name (db_id), this method returns the total counts for all 
taxonomy strings which are hit. 

=cut

sub get_taxa_counts {
  my ($self, $dataset) = @_;

  my $table = $self->dbtable; 
  my $dbid  = $self->get_dataset_id($dataset);
  my $where = $self->get_where_clause();
  $where = ($where) ? "and $where" : '';  
  
  my $sth = $self->dbh->prepare("select s.tax_str as tax, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_str");

  $sth->execute;
  my $result = $sth->fetchall_arrayref();

  return $result;

}


=pod

=item * B<get_subsystem_counts> (I<dataset_name>)

Given a dataset name (db_id), this method returns the total counts for all 
subsystems which are hit. 

=cut

sub get_subsystem_counts {
  my ($self, $dataset) = @_;

  my $table = $self->dbtable; 
  my $dbid  = $self->get_dataset_id($dataset);
  my $where = $self->get_where_clause();
  $where = ($where) ? "and $where" : '';  
 
  #print STDERR "select s.tax_group_1, s.tax_group_2, s.tax_group_3, s.tax_str, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_group_1, s.tax_group_2, s.tax_group_3";
 #die;
  my $sth = $self->dbh->prepare("select s.tax_group_1, s.tax_group_2, s.tax_group_3, s.tax_str, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_group_1, s.tax_group_2, s.tax_group_3");

  $sth->execute;
  my $result = $sth->fetchall_arrayref();

  return $result;

}


=pod

=item * B<get_sequence_subset> (I<dataset_name>, I<filter>)

Given a dataset name (db_id), this method returns all sequence ids,
the alignment length, the match id and the taxonomy string for all 
sequences which match the criteria and have their tax_str start with
the filter string I<filter>.

=cut

sub get_sequence_subset {
  my ($self, $dataset, $filter) = @_;

  my $table = $self->dbtable; 
  my $dbid  = $self->get_dataset_id($dataset);
  my $where = $self->get_where_clause();
  $where = ($where) ? "and $where" : '';  
  
  my $sth = $self->dbh->prepare("select s.id1, s.ali_ln, s.id2, s.tax_str from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid and s.tax_str like '$filter%'");

  $sth->execute;
  my $result = $sth->fetchall_arrayref();

  return $result;

}


=pod

=item * B<get_hits_for_sequence> (I<seq_id>, I<dataset>, I<limit>)

Given a sequence id I<seq_id> (id1) and a dataset name (db_id), this method returns
the first I<limit> rows of hit data for this sequence. If no I<limit> is provided, it
will default to 10.
It returns (match id, taxonomy string, log evalue, bitscore, alignment length, 
percent identity, start1, end1) per hit.

=cut

sub get_hits_for_sequence {
  my ($self, $id, $dataset, $limit) = @_;

  my $table = $self->dbtable; 
  my $dbid  = $self->get_dataset_id($dataset);
  $limit = 10 unless ($limit);
  
  my $sth = $self->dbh->prepare("select id2, tax_str, logpsc, bsc, ali_ln, iden, b1, e1 from $table where id1=$id and dbid=$dbid and rank_psc<$limit;");
  $sth->execute;
  my $result = $sth->fetchall_arrayref();

  return $result;

}


=pod 

=item * B<get_align_len_range> (I<dataset_name>)

Given a dataset name (db_id), this method returns
the minimum and maximum alignment length.

=cut 

sub get_align_len_range {
  my ($self, $dataset) = @_;

  my $table = $self->dbtable; 
  my $dbid  = $self->get_dataset_id($dataset);

  my $sth = $self->dbh->prepare("select min(ali_ln), max(ali_ln) from $table where dbid=$dbid");
  $sth->execute;
  my ($min, $max) = $sth->fetchrow_array;

  return ($min, $max);

}



MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3