[Bio] / SeedViewer / MetagenomeAnalysis.pm Repository:
ViewVC logotype

Diff of /SeedViewer/MetagenomeAnalysis.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.4, Tue Apr 22 19:56:18 2008 UTC revision 1.18, Mon Jul 14 22:12:45 2008 UTC
# Line 7  Line 7 
7    
8  use FIG_Config;  use FIG_Config;
9  use DBI;  use DBI;
10    use Data::Dumper;
11    
12  1;  1;
13    
   
   
14  use constant QUERY_DEFAULTS =>  use constant QUERY_DEFAULTS =>
15    { 1 => { evalue => '1e-05', align_len => 50 }, # RDP    { 1 => { evalue => '1e-05', align_len => 50 }, # RDP
16      2 => { evalue => '0.01' }, # SEED      2 => { evalue => '0.01' }, # SEED
# Line 21  Line 20 
20      6 => { evalue => '0.01' }, # Subsystem      6 => { evalue => '0.01' }, # Subsystem
21    };    };
22    
   
   
   
23  sub new {  sub new {
24    my ($class, $job) = @_;    my ($class, $job) = @_;
25    
# Line 36  Line 32 
32    my $dbh;    my $dbh;
33    eval {    eval {
34    
35        my $dbms     = $FIG_Config::mgrast_dbms;
36      my $host     = $FIG_Config::mgrast_dbhost;      my $host     = $FIG_Config::mgrast_dbhost;
37      my $database = $FIG_Config::mgrast_db;      my $database = $FIG_Config::mgrast_db;
38      my $user     = $FIG_Config::mgrast_dbuser;      my $user     = $FIG_Config::mgrast_dbuser;
39      my $password = '';      my $password = '';
40    
41        if ($dbms eq 'Pg')
42        {
43            $dbh = DBI->connect("DBI:Pg:dbname=$database;host=$host", $user, $password,
44                            { RaiseError => 1, AutoCommit => 0, PrintError => 0 }) ||
45                                die "database connect error.";
46        }
47        elsif ($dbms eq 'mysql' or $dbms eq '') # Default to mysql
48        {
49      $dbh = DBI->connect("DBI:mysql:database=$database;host=$host", $user, $password,      $dbh = DBI->connect("DBI:mysql:database=$database;host=$host", $user, $password,
50                          { RaiseError => 1, AutoCommit => 0, PrintError => 0 }) ||                          { RaiseError => 1, AutoCommit => 0, PrintError => 0 }) ||
51                                            die "database connect error.";                                            die "database connect error.";
52        }
53        else
54        {
55            die "MetagenomeAnalysis: unknown dbms '$dbms'";
56        }
57    
58    };    };
59    if ($@) {    if ($@) {
60      warn "Unable to connect to metagenomics database: $@\n";      warn "Unable to connect to metagenomics database: $@\n";
# Line 55  Line 66 
66                 dbh => $dbh,                 dbh => $dbh,
67                 key2taxa => undef,                 key2taxa => undef,
68                 query => {},                 query => {},
69                       dbid_cache => {},
70               };               };
71    bless $self, $class;    bless $self, $class;
72    
# Line 65  Line 77 
77    
78  }  }
79    
   
   
   
80  sub job {  sub job {
81    return $_[0]->{job};    return $_[0]->{job};
82  }  }
# Line 85  Line 94 
94    return $_[0]->{dbtable};    return $_[0]->{dbtable};
95  }  }
96    
97    sub dbtable_best_psc {
98      unless (defined $_[0]->{dbtable}) {
99        $_[0]->{dbtable} = 'tax_sim_best_by_psc_'.$_[0]->job->id;
100      }
101      return $_[0]->{dbtable};
102    }
103    
104    sub dbtable_best_iden {
105      unless (defined $_[0]->{dbtable}) {
106        $_[0]->{dbtable} = 'tax_sim_best_by_iden_'.$_[0]->job->id;
107      }
108      return $_[0]->{dbtable};
109    }
110    
111  sub get_key2taxa_mapping {  sub get_key2taxa_mapping {
112    
# Line 129  Line 151 
151  }  }
152    
153    
154    #
155    # Determine the correct dbid for this job. Use sims.database_list
156    # to find the version that the analysis was run with.
157    #
158  sub get_dataset_id {  sub get_dataset_id {
159    # mapping contained in table db_type      my($self, $dataset) = @_;
160    # unstable, changes every time  
161    my $db_ids = { 'SEED' => 2,      my $id = $self->{dbid_cache}->{$dataset};
162                   'Greengenes' => 3,      return $id if defined($id);
163                   'RDP' => 1,  
164                   'LSU' => 4,      my($dbname, $type) = split(/:/, $dataset);
                  'SSU' => 5,  
                  'Subsystem' => 6,  
                };  
   return $db_ids->{ $_[1] } || undef;  
 }  
165    
166        my $dbs = $self->job->metaxml->get_metadata('sims.database_list');
167    
168        my @this = grep { $_->{name} eq $dbname } @$dbs;
169        if (@this)
170        {
171            my $vers = $this[0]->{version};
172    
173            #
174            # Now we can find the dbid ni the database.
175            #
176            my $res = $self->dbh->selectcol_arrayref(qq(SELECT dbid
177                                                        FROM seq_db
178                                                        WHERE name = ? AND version = ?
179                                                            AND tax_db_name = ?), undef,
180                                                     $dbname, $vers, $type);
181            if (@$res)
182            {
183                #print STDERR "Found @$res for $dbname $type $vers\n";
184                $id = $res->[0];
185                $self->{dbid_cache}->{$dataset} = $id;
186                return $id;
187            }
188            print STDERR "Did not find anything for dataset='$dataset' '$dbname' '$type' '$vers'\n";
189        }
190        print STDERR "did not find a vers for dataset='$dataset' $dbname $type\n" . Dumper($dbs);
191    }
192    
193  #******************************************************************************  #******************************************************************************
194  #* MANAGING QUERY CRITERIA  #* MANAGING QUERY CRITERIA
# Line 149  Line 196 
196    
197  =pod  =pod
198    
199    =over 4
200    
201  =item * B<query_evalue> (I<evalue>)  =item * B<query_evalue> (I<evalue>)
202    
203  Set/get the expectation value which is currently used to query the database.  Set/get the expectation value which is currently used to query the database.
# Line 315  Line 364 
364  #******************************************************************************  #******************************************************************************
365    
366    
   
   
   
367  =pod  =pod
368    
369  =item * B<get_sequence> (I<sequence_id>)  =item * B<get_sequence> (I<sequence_id>)
# Line 347  Line 393 
393    
394  }  }
395    
396    =pod
397    
398    =item * B<get_sequences_fasta> (I<sequence_ids>)
399    
400    Retrieve the sequences for a given list of I<sequence_ids> from the metagenome job directory in fasta format.
401    
402    =cut
403    
404    sub get_sequences_fasta {
405      my ($self, $ids) = @_;
406    
407      # get the path to the sequence file
408      my $sequence_file = $self->job->org_dir.'/contigs';
409    
410      # hash the ids
411      my %idh = map { $_ => 1 } @$ids;
412    
413      # store the result
414      my $sequence = '';
415    
416      # iterate over the sequence file
417      open(FASTA, "<$sequence_file") or die "Unable to read metagenome sequences: $!";
418      while(<FASTA>) {
419    
420        # look only at id lines
421        next unless /^\>/;
422    
423        # get the id
424        my ($id) = /^\>(.+)/;
425        chomp($id);
426    
427        # find the id in the requested id hash
428        next unless exists ($idh{$id});
429    
430        # id was found, delete it from the hash
431        delete $idh{$id};
432    
433        # print the header line
434        $sequence .= $_;
435    
436        # print all the following lines to the result
437        while(<FASTA>) {
438          last if /^>/;
439          $sequence .= $_;
440        }
441    
442        last unless (scalar(keys(%idh)));
443      }
444    
445      return $sequence;
446    
447    }
448    
449  =pod  =pod
450    
# Line 360  Line 458 
458  sub get_hits_count {  sub get_hits_count {
459    my ($self, $dataset) = @_;    my ($self, $dataset) = @_;
460    
461    my $table = $self->dbtable;    my $table = $self->dbtable_best_psc;
462    my $dbid  = $self->get_dataset_id($dataset);    my $dbid  = $self->get_dataset_id($dataset);
463    my $where = $self->get_where_clause();    my $where = $self->get_where_clause();
464    $where = ($where) ? "and $where" : '';    $where = ($where) ? "and $where" : '';
465    
466    my $sth = $self->dbh->prepare("select count(*) from ( select id1, min(rank_psc) from $table where dbid=$dbid $where group by id1) as b");    my $sth = $self->dbh->prepare("select count(distinct id1) from  $table where dbid=$dbid $where");
467    $sth->execute;    $sth->execute;
468    my ($result) = $sth->fetchrow_array;    my ($result) = $sth->fetchrow_array;
469    
# Line 390  Line 488 
488  sub get_group_counts {  sub get_group_counts {
489    my ($self, $dataset, $group, $filter1, $filter2) = @_;    my ($self, $dataset, $group, $filter1, $filter2) = @_;
490    
491    my $table = $self->dbtable;    my $table = $self->dbtable_best_psc;
492    my $dbid  = $self->get_dataset_id($dataset);    my $dbid  = $self->get_dataset_id($dataset);
493    my $where = $self->get_where_clause();    my $where = $self->get_where_clause();
494    $where = ($where) ? "and $where" : '';    $where = ($where) ? "and $where" : '';
# Line 401  Line 499 
499    push @filters, "tax_group_2='$filter2'" if($filter2);    push @filters, "tax_group_2='$filter2'" if($filter2);
500    my $filter = (scalar(@filters)) ? 'and '.join(' and ', @filters) : '';    my $filter = (scalar(@filters)) ? 'and '.join(' and ', @filters) : '';
501    
502    my $sth = $self->dbh->prepare("select s.$group as tax, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid $filter group by s.$group");    my $sth = $self->dbh->prepare("select $group as tax, count(*) as num from $table where dbid=$dbid $where $filter group by tax");
503    $sth->execute;    $sth->execute;
504    my $result = $sth->fetchall_arrayref();    my $result = $sth->fetchall_arrayref();
505    
506      print STDERR "get_group_counts: ds=$dataset group=$group filter1=$filter1 filter2=$filter2\n";
507      print STDERR Dumper($result);
508    return $result;    return $result;
509    
510  }  }
# Line 422  Line 522 
522  sub get_taxa_counts {  sub get_taxa_counts {
523    my ($self, $dataset) = @_;    my ($self, $dataset) = @_;
524    
525    my $table = $self->dbtable;    my $table = $self->dbtable_best_psc;
526    my $dbid  = $self->get_dataset_id($dataset);    my $dbid  = $self->get_dataset_id($dataset);
527    my $where = $self->get_where_clause();    my $where = $self->get_where_clause();
528    $where = ($where) ? "and $where" : '';    $where = ($where) ? "and $where" : '';
529    
530    my $sth = $self->dbh->prepare("select s.tax_str as tax, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_str");    my $sth = $self->dbh->prepare("select tax_str as tax, count(*) from $table where dbid=$dbid $where group by tax");
531    
532    $sth->execute;    $sth->execute;
533    my $result = $sth->fetchall_arrayref();    my $result = $sth->fetchall_arrayref();
# Line 449  Line 549 
549  sub get_subsystem_counts {  sub get_subsystem_counts {
550    my ($self, $dataset) = @_;    my ($self, $dataset) = @_;
551    
552    my $table = $self->dbtable;    my $table = $self->dbtable_best_psc;
553    my $dbid  = $self->get_dataset_id($dataset);    my $dbid  = $self->get_dataset_id($dataset);
554    my $where = $self->get_where_clause();    my $where = $self->get_where_clause();
555    $where = ($where) ? "and $where" : '';    $where = ($where) ? "and $where" : '';
556    
557    #print STDERR "select s.tax_group_1, s.tax_group_2, s.tax_group_3, s.tax_str, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_group_1, s.tax_group_2, s.tax_group_3";    my $sth = $self->dbh->prepare("select tax_group_1, tax_group_2, tax_group_3, tax_str, count(*) as num from $table where dbid=$dbid $where group by tax_group_1, tax_group_2, tax_group_3, tax_str");
  #die;  
   my $sth = $self->dbh->prepare("select s.tax_group_1, s.tax_group_2, s.tax_group_3, s.tax_str, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_group_1, s.tax_group_2, s.tax_group_3");  
558    
559    $sth->execute;    $sth->execute;
560    my $result = $sth->fetchall_arrayref();    my $result = $sth->fetchall_arrayref();
# Line 480  Line 578 
578  sub get_sequence_subset {  sub get_sequence_subset {
579    my ($self, $dataset, $filter) = @_;    my ($self, $dataset, $filter) = @_;
580    
581    my $table = $self->dbtable;    my $table = $self->dbtable_best_psc;
582    my $dbid  = $self->get_dataset_id($dataset);    my $dbid  = $self->get_dataset_id($dataset);
583    my $where = $self->get_where_clause();    my $where = $self->get_where_clause();
584    $where = ($where) ? "and $where" : '';    $where = ($where) ? "and $where" : '';
585    
586    my $sth = $self->dbh->prepare("select s.id1, s.ali_ln, s.id2, s.tax_str from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid and s.tax_str like '$filter%'");    $filter =~ s/'/''/g;
587    
588      my $sth = $self->dbh->prepare("select id1, ali_ln, id2, tax_str from $table where dbid=$dbid $where and tax_str like '$filter%'");
589    $sth->execute;    $sth->execute;
590    my $result = $sth->fetchall_arrayref();    my $result = $sth->fetchall_arrayref();
591    
# Line 497  Line 596 
596    
597  =pod  =pod
598    
599    =item * B<get_recruitment_plot_data> (I<genome>)
600    
601    Given a genome id (83333.1), this method returns all sequence ids,
602    the alignment length, the match id and the taxonomy string for all
603    sequences which match the criteria and have their tax_str start equal
604    the genome tax string I<filter>.
605    
606    =cut
607    
608    sub get_recruitment_plot_data {
609      my ($self, $genome) = @_;
610    
611      my $table = $self->dbtable_best_psc;
612      my $dbid  = $self->get_dataset_id("SEED:seed_genome_tax");
613      my $where = $self->get_where_clause();
614      $where = ($where) ? "and $where" : '';
615    
616      my ($tax_id) = $self->dbh->selectrow_array(qq(SELECT tax_str
617                                                    FROM rdp_to_tax
618                                                    WHERE seq_num= ?), undef, $genome);
619    
620      if($tax_id =~ /(\S+)\s/){
621        $tax_id = $1;
622      }
623    
624      my $sth = $self->dbh->prepare(qq(SELECT id1, id2, b2, e2, logpsc
625                                       FROM $table
626                                       WHERE dbid=? $where and tax_str=?));
627    
628    
629      $sth->execute($dbid, $tax_id);
630      my $result = $sth->fetchall_arrayref();
631    
632      return $result;
633    
634    }
635    
636    
637    
638    
639    =pod
640    
641  =item * B<get_hits_for_sequence> (I<seq_id>, I<dataset>, I<limit>)  =item * B<get_hits_for_sequence> (I<seq_id>, I<dataset>, I<limit>)
642    
643  Given a sequence id I<seq_id> (id1) and a dataset name (db_id), this method returns  Given a sequence id I<seq_id> (id1) and a dataset name (db_id), this method returns
# Line 514  Line 655 
655    my $dbid  = $self->get_dataset_id($dataset);    my $dbid  = $self->get_dataset_id($dataset);
656    $limit = 10 unless ($limit);    $limit = 10 unless ($limit);
657    
658    my $sth = $self->dbh->prepare("select id2, tax_str, logpsc, bsc, ali_ln, iden, b1, e1 from $table where id1=$id and dbid=$dbid and rank_psc<$limit;");    my $sth = $self->dbh->prepare(qq(SELECT id2, tax_str, logpsc, bsc, ali_ln,
659    $sth->execute;                                          iden, b1, e1
660                                       FROM $table
661                                       WHERE id1=? AND dbid=? AND rank_psc < ?));
662      $sth->execute($id, $dbid, $limit);
663    
664      my $result = $sth->fetchall_arrayref();
665    
666      return $result;
667    
668    }
669    
670    =item * B<get_best_hit_for_sequence> (I<seq_id>, I<dataset>)
671    
672    Given a sequence id I<seq_id> (id1) and a dataset name (db_id), this method returns
673    the first row of hit data for this sequence.
674    It returns (match id, taxonomy string, log evalue, bitscore, alignment length,
675    percent identity, start1, end1) per hit.
676    
677    =cut
678    
679    sub get_best_hit_for_sequence {
680      my ($self, $id, $dataset) = @_;
681    
682      my $table = $self->dbtable;
683      my $dbid  = $self->get_dataset_id($dataset);
684    
685      my $sth = $self->dbh->prepare(qq(SELECT id2, tax_str, logpsc, bsc, ali_ln,
686                                            iden, b1, e1, b2, e2
687                                       FROM $table
688                                       WHERE id1=? AND dbid=?));
689      $sth->execute($id, $dbid);
690    
691    my $result = $sth->fetchall_arrayref();    my $result = $sth->fetchall_arrayref();
692    
693    return $result;    return $result;
# Line 546  Line 718 
718    
719  }  }
720    
721    =pod
722    
723    =item * B<get_genome_id> (I<tax_str>)
724    
725    =cut
726    
727    sub get_genome_id {
728      my ($self, $tax_str) = @_;
729      $tax_str =~ s/'/''/g;
730      my $retval =  $self->dbh->selectrow_array("select seq_num from rdp_to_tax where tax_str='". $tax_str . "'");
731      if (ref($retval) eq 'ARRAY') {
732        return $retval->[0];
733      } else {
734        return $retval;
735      }
736    }
737    
738    =pod
739    
740    =back
741    
742    =cut

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.18

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3