[Bio] / SeedViewer / MetagenomeAnalysis.pm Repository:
ViewVC logotype

Diff of /SeedViewer/MetagenomeAnalysis.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.4, Tue Apr 22 19:56:18 2008 UTC revision 1.21, Wed Aug 20 18:27:15 2008 UTC
# Line 7  Line 7 
7    
8  use FIG_Config;  use FIG_Config;
9  use DBI;  use DBI;
10    use Data::Dumper;
11    
12  1;  1;
13    
   
   
14  use constant QUERY_DEFAULTS =>  use constant QUERY_DEFAULTS =>
15    { 1 => { evalue => '1e-05', align_len => 50 }, # RDP    { 1 => { evalue => '1e-05', align_len => 50 }, # RDP
16      2 => { evalue => '0.01' }, # SEED      2 => { evalue => '0.01' }, # SEED
# Line 21  Line 20 
20      6 => { evalue => '0.01' }, # Subsystem      6 => { evalue => '0.01' }, # Subsystem
21    };    };
22    
   
   
   
23  sub new {  sub new {
24    my ($class, $job) = @_;    my ($class, $job) = @_;
25    
# Line 36  Line 32 
32    my $dbh;    my $dbh;
33    eval {    eval {
34    
35        my $dbms     = $FIG_Config::mgrast_dbms;
36      my $host     = $FIG_Config::mgrast_dbhost;      my $host     = $FIG_Config::mgrast_dbhost;
37      my $database = $FIG_Config::mgrast_db;      my $database = $FIG_Config::mgrast_db;
38      my $user     = $FIG_Config::mgrast_dbuser;      my $user     = $FIG_Config::mgrast_dbuser;
39      my $password = '';      my $password = '';
40    
41        if ($dbms eq 'Pg')
42        {
43            $dbh = DBI->connect("DBI:Pg:dbname=$database;host=$host", $user, $password,
44                            { RaiseError => 1, AutoCommit => 0, PrintError => 0 }) ||
45                                die "database connect error.";
46        }
47        elsif ($dbms eq 'mysql' or $dbms eq '') # Default to mysql
48        {
49      $dbh = DBI->connect("DBI:mysql:database=$database;host=$host", $user, $password,      $dbh = DBI->connect("DBI:mysql:database=$database;host=$host", $user, $password,
50                          { RaiseError => 1, AutoCommit => 0, PrintError => 0 }) ||                          { RaiseError => 1, AutoCommit => 0, PrintError => 0 }) ||
51                                            die "database connect error.";                                            die "database connect error.";
52        }
53        else
54        {
55            die "MetagenomeAnalysis: unknown dbms '$dbms'";
56        }
57    
58    };    };
59    if ($@) {    if ($@) {
60      warn "Unable to connect to metagenomics database: $@\n";      warn "Unable to connect to metagenomics database: $@\n";
# Line 55  Line 66 
66                 dbh => $dbh,                 dbh => $dbh,
67                 key2taxa => undef,                 key2taxa => undef,
68                 query => {},                 query => {},
69                   dbid_cache => {},
70               };               };
71    bless $self, $class;    bless $self, $class;
72    
# Line 65  Line 77 
77    
78  }  }
79    
   
   
   
80  sub job {  sub job {
81    return $_[0]->{job};    return $_[0]->{job};
82  }  }
# Line 85  Line 94 
94    return $_[0]->{dbtable};    return $_[0]->{dbtable};
95  }  }
96    
97    sub dbtable_best_psc {
98      unless (defined $_[0]->{dbtable}) {
99        $_[0]->{dbtable} = 'tax_sim_best_by_psc_'.$_[0]->job->id;
100      }
101      return $_[0]->{dbtable};
102    }
103    
104    sub dbtable_best_iden {
105      unless (defined $_[0]->{dbtable}) {
106        $_[0]->{dbtable} = 'tax_sim_best_by_iden_'.$_[0]->job->id;
107      }
108      return $_[0]->{dbtable};
109    }
110    
111  sub get_key2taxa_mapping {  sub get_key2taxa_mapping {
112    
# Line 98  Line 120 
120    
121  }  }
122    
   
123  sub key2taxa {  sub key2taxa {
124    if(defined $_[1]) {    if(defined $_[1]) {
125      my $t = $_[0]->{key2taxa}->{$_[1]}->{str};      my $t = $_[0]->{key2taxa}->{$_[1]}->{str};
# Line 129  Line 150 
150  }  }
151    
152    
153    #
154    # Determine the correct dbid for this job. Use sims.database_list
155    # to find the version that the analysis was run with.
156    #
157  sub get_dataset_id {  sub get_dataset_id {
158    # mapping contained in table db_type      my($self, $dataset) = @_;
159    # unstable, changes every time  
160    my $db_ids = { 'SEED' => 2,      my $id = $self->{dbid_cache}->{$dataset};
161                   'Greengenes' => 3,      return $id if defined($id);
162                   'RDP' => 1,  
163                   'LSU' => 4,      my($dbname, $type) = split(/:/, $dataset);
164                   'SSU' => 5,  
165                   'Subsystem' => 6,      my $dbs = $self->job->metaxml->get_metadata('sims.database_list');
                };  
   return $db_ids->{ $_[1] } || undef;  
 }  
166    
167        my @this = grep { $_->{name} eq $dbname } @$dbs;
168        if (@this)
169        {
170            my $vers = $this[0]->{version};
171    
172            #
173            # Now we can find the dbid ni the database.
174            #
175            my $res = $self->dbh->selectcol_arrayref(qq(SELECT dbid
176                                                        FROM seq_db
177                                                        WHERE name = ? AND version = ?
178                                                            AND tax_db_name = ?), undef,
179                                                     $dbname, $vers, $type);
180            if (@$res)
181            {
182                #print STDERR "Found @$res for $dbname $type $vers\n";
183                $id = $res->[0];
184                $self->{dbid_cache}->{$dataset} = $id;
185                return $id;
186            }
187            #print STDERR "Did not find anything for dataset='$dataset' '$dbname' '$type' '$vers'\n";
188        }
189        #print STDERR "did not find a vers for dataset='$dataset' $dbname $type\n" . Dumper($dbs);
190    }
191    
192  #******************************************************************************  #******************************************************************************
193  #* MANAGING QUERY CRITERIA  #* MANAGING QUERY CRITERIA
# Line 149  Line 195 
195    
196  =pod  =pod
197    
198    =over 4
199    
200  =item * B<query_evalue> (I<evalue>)  =item * B<query_evalue> (I<evalue>)
201    
202  Set/get the expectation value which is currently used to query the database.  Set/get the expectation value which is currently used to query the database.
# Line 315  Line 363 
363  #******************************************************************************  #******************************************************************************
364    
365    
   
   
   
366  =pod  =pod
367    
368  =item * B<get_sequence> (I<sequence_id>)  =item * B<get_sequence> (I<sequence_id>)
# Line 347  Line 392 
392    
393  }  }
394    
395    =pod
396    
397    =item * B<get_sequences_fasta> (I<sequence_ids>)
398    
399    Retrieve the sequences for a given list of I<sequence_ids> from the metagenome job directory in fasta format.
400    
401    =cut
402    
403    sub get_sequences_fasta {
404      my ($self, $ids) = @_;
405    
406      # get the path to the sequence file
407      my $sequence_file = $self->job->org_dir.'/contigs';
408    
409      # hash the ids
410      my %idh = map { $_ => 1 } @$ids;
411    
412      # store the result
413      my $sequence = '';
414    
415      # iterate over the sequence file
416      open(FASTA, "<$sequence_file") or die "Unable to read metagenome sequences: $!";
417      while(<FASTA>) {
418    
419        # look only at id lines
420        next unless /^\>/;
421    
422        # get the id
423        my ($id) = /^\>(.+)/;
424        chomp($id);
425    
426        # find the id in the requested id hash
427        next unless exists ($idh{$id});
428    
429        # id was found, delete it from the hash
430        delete $idh{$id};
431    
432        # print the header line
433        $sequence .= $_;
434    
435        # print all the following lines to the result
436        while(<FASTA>) {
437          last if /^>/;
438          $sequence .= $_;
439        }
440    
441        last unless (scalar(keys(%idh)));
442      }
443    
444      return $sequence;
445    
446    }
447    
448  =pod  =pod
449    
# Line 360  Line 457 
457  sub get_hits_count {  sub get_hits_count {
458    my ($self, $dataset) = @_;    my ($self, $dataset) = @_;
459    
460    my $table = $self->dbtable;    my $table = $self->dbtable_best_psc;
461    my $dbid  = $self->get_dataset_id($dataset);    my $dbid  = $self->get_dataset_id($dataset);
462    my $where = $self->get_where_clause();    my $where = $self->get_where_clause();
463    $where = ($where) ? "and $where" : '';    $where = ($where) ? "and $where" : '';
464    
465    my $sth = $self->dbh->prepare("select count(*) from ( select id1, min(rank_psc) from $table where dbid=$dbid $where group by id1) as b");    my $sth = $self->dbh->prepare("select count(distinct id1) from  $table where dbid=$dbid $where");
466    $sth->execute;    $sth->execute;
467    my ($result) = $sth->fetchrow_array;    my ($result) = $sth->fetchrow_array;
468    
# Line 390  Line 487 
487  sub get_group_counts {  sub get_group_counts {
488    my ($self, $dataset, $group, $filter1, $filter2) = @_;    my ($self, $dataset, $group, $filter1, $filter2) = @_;
489    
490    my $table = $self->dbtable;    my $table = $self->dbtable_best_psc;
491    my $dbid  = $self->get_dataset_id($dataset);    my $dbid  = $self->get_dataset_id($dataset);
492    my $where = $self->get_where_clause();    my $where = $self->get_where_clause();
493    $where = ($where) ? "and $where" : '';    $where = ($where) ? "and $where" : '';
# Line 401  Line 498 
498    push @filters, "tax_group_2='$filter2'" if($filter2);    push @filters, "tax_group_2='$filter2'" if($filter2);
499    my $filter = (scalar(@filters)) ? 'and '.join(' and ', @filters) : '';    my $filter = (scalar(@filters)) ? 'and '.join(' and ', @filters) : '';
500    
501    my $sth = $self->dbh->prepare("select s.$group as tax, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid $filter group by s.$group");    my $sth = $self->dbh->prepare("select $group as tax, count(*) as num from $table where dbid=$dbid $where $filter group by tax");
502    $sth->execute;    $sth->execute;
503    my $result = $sth->fetchall_arrayref();    my $result = $sth->fetchall_arrayref();
504    
505      #print STDERR "get_group_counts: ds=$dataset group=$group filter1=$filter1 filter2=$filter2\n";
506      #print STDERR Dumper($result);
507    return $result;    return $result;
508    
509  }  }
# Line 422  Line 521 
521  sub get_taxa_counts {  sub get_taxa_counts {
522    my ($self, $dataset) = @_;    my ($self, $dataset) = @_;
523    
524    my $table = $self->dbtable;    my $table = $self->dbtable_best_psc;
525    my $dbid  = $self->get_dataset_id($dataset);    my $dbid  = $self->get_dataset_id($dataset);
526    my $where = $self->get_where_clause();    my $where = $self->get_where_clause();
527    $where = ($where) ? "and $where" : '';    $where = ($where) ? "and $where" : '';
528    
529    my $sth = $self->dbh->prepare("select s.tax_str as tax, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_str");    my $sth = $self->dbh->prepare("select tax_str as tax, count(*) from $table where dbid=$dbid $where group by tax");
530    
531    $sth->execute;    $sth->execute;
532    my $result = $sth->fetchall_arrayref();    my $result = $sth->fetchall_arrayref();
# Line 437  Line 536 
536  }  }
537    
538    
539    sub get_tax2genomeid {
540      my ($self, $taxstring) = @_;
541    
542      my ($genomeid) = $self->dbh->selectrow_array(qq(SELECT seq_num
543                                                    FROM rdp_to_tax
544                                                    WHERE tax_str= ?), undef, $taxstring);
545    
546      return $genomeid;
547    }
548    
549    
550  =pod  =pod
551    
552  =item * B<get_subsystem_counts> (I<dataset_name>)  =item * B<get_subsystem_counts> (I<dataset_name>)
# Line 449  Line 559 
559  sub get_subsystem_counts {  sub get_subsystem_counts {
560    my ($self, $dataset) = @_;    my ($self, $dataset) = @_;
561    
562    my $table = $self->dbtable;    my $table = $self->dbtable_best_psc;
563    my $dbid  = $self->get_dataset_id($dataset);    my $dbid  = $self->get_dataset_id($dataset);
564    my $where = $self->get_where_clause();    my $where = $self->get_where_clause();
565    $where = ($where) ? "and $where" : '';    $where = ($where) ? "and $where" : '';
566    
567    #print STDERR "select s.tax_group_1, s.tax_group_2, s.tax_group_3, s.tax_str, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_group_1, s.tax_group_2, s.tax_group_3";    my $sth = $self->dbh->prepare("select tax_group_1, tax_group_2, tax_group_3, tax_str, count(*) as num from $table where dbid=$dbid $where group by tax_group_1, tax_group_2, tax_group_3, tax_str");
  #die;  
   my $sth = $self->dbh->prepare("select s.tax_group_1, s.tax_group_2, s.tax_group_3, s.tax_str, count(*) as num from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid group by s.tax_group_1, s.tax_group_2, s.tax_group_3");  
568    
569    $sth->execute;    $sth->execute;
570    my $result = $sth->fetchall_arrayref();    my $result = $sth->fetchall_arrayref();
# Line 480  Line 588 
588  sub get_sequence_subset {  sub get_sequence_subset {
589    my ($self, $dataset, $filter) = @_;    my ($self, $dataset, $filter) = @_;
590    
591    my $table = $self->dbtable;    my $table = $self->dbtable_best_psc;
592    my $dbid  = $self->get_dataset_id($dataset);    my $dbid  = $self->get_dataset_id($dataset);
593    my $where = $self->get_where_clause();    my $where = $self->get_where_clause();
594    $where = ($where) ? "and $where" : '';    $where = ($where) ? "and $where" : '';
595    
596    my $sth = $self->dbh->prepare("select s.id1, s.ali_ln, s.id2, s.tax_str from ( select id1, min(rank_psc) as rank from $table where dbid=$dbid $where group by id1) as b inner join $table as s on b.id1=s.id1 and b.rank=s.rank_psc where dbid=$dbid and s.tax_str like '$filter%'");    $filter =~ s/'/''/g;
597    
598      my $sth = $self->dbh->prepare("select id1, ali_ln, id2, tax_str from $table where dbid=$dbid $where and tax_str like '$filter%'");
599    $sth->execute;    $sth->execute;
600    my $result = $sth->fetchall_arrayref();    my $result = $sth->fetchall_arrayref();
601    
# Line 494  Line 603 
603    
604  }  }
605    
606    =pod
607    
608    =item * B<get_sequence_subset_genome> (I<genome>)
609    
610    Given a dataset name (db_id), this method returns all sequence ids,
611    the alignment length, the match id and the taxonomy string for all
612    sequences which match the criteria and have their tax_str start with
613    the filter string I<filter>.
614    
615    =cut
616    
617    sub get_sequence_subset_genome {
618      my ($self, $genome) = @_;
619    
620      my $table = $self->dbtable_best_psc;
621      my $dbid  = $self->get_dataset_id("SEED:seed_genome_tax");
622      my $where = $self->get_where_clause();
623      $where = ($where) ? "and $where" : '';
624    
625      my ($tax_id) = $self->dbh->selectrow_array(qq(SELECT tax_str
626                                                    FROM rdp_to_tax
627                                                    WHERE seq_num= ?), undef, $genome);
628    
629      if($tax_id =~ /(\S+)\s/){
630        $tax_id = $1;
631      }
632    
633      my $sth = $self->dbh->prepare(qq(SELECT id1, ali_ln, id2, tax_str
634                                       FROM $table
635                                       WHERE dbid=? $where and tax_str=?));
636    
637    
638      $sth->execute($dbid, $tax_id);
639      my $result = $sth->fetchall_arrayref();
640    
641      return $result;
642    
643    }
644    
645    =pod
646    
647    =item * B<get_recruitment_plot_data> (I<genome>)
648    
649    Given a genome id (83333.1), this method returns all sequence ids,
650    the alignment length, the match id and the taxonomy string for all
651    sequences which match the criteria and have their tax_str start equal
652    the genome tax string I<filter>.
653    
654    =cut
655    
656    sub get_recruitment_plot_data {
657      my ($self, $genome) = @_;
658    
659      my $table = $self->dbtable_best_psc;
660      my $dbid  = $self->get_dataset_id("SEED:seed_genome_tax");
661      my $where = $self->get_where_clause();
662      $where = ($where) ? "and $where" : '';
663    
664      my ($tax_id) = $self->dbh->selectrow_array(qq(SELECT tax_str
665                                                    FROM rdp_to_tax
666                                                    WHERE seq_num= ? and dbid= ?), undef, $genome, $dbid);
667    
668      if($tax_id =~ /(\S+)\s/){
669        $tax_id = $1;
670      }
671    
672      my $sth = $self->dbh->prepare(qq(SELECT id1, id2, b2, e2, logpsc
673                                       FROM $table
674                                       WHERE dbid=? $where and tax_str=?));
675    
676    
677      $sth->execute($dbid, $tax_id);
678      my $result = $sth->fetchall_arrayref();
679    
680      return $result;
681    
682    }
683    
684    
685    
686    
687  =pod  =pod
688    
# Line 514  Line 703 
703    my $dbid  = $self->get_dataset_id($dataset);    my $dbid  = $self->get_dataset_id($dataset);
704    $limit = 10 unless ($limit);    $limit = 10 unless ($limit);
705    
706    my $sth = $self->dbh->prepare("select id2, tax_str, logpsc, bsc, ali_ln, iden, b1, e1 from $table where id1=$id and dbid=$dbid and rank_psc<$limit;");    my $sth = $self->dbh->prepare(qq(SELECT id2, tax_str, logpsc, bsc, ali_ln,
707    $sth->execute;                                          iden, b1, e1
708                                       FROM $table
709                                       WHERE id1=? AND dbid=? AND rank_psc < ?));
710      $sth->execute($id, $dbid, $limit);
711    
712      my $result = $sth->fetchall_arrayref();
713    
714      return $result;
715    
716    }
717    
718    =item * B<get_best_hit_for_sequence> (I<seq_id>, I<dataset>)
719    
720    Given a sequence id I<seq_id> (id1) and a dataset name (db_id), this method returns
721    the first row of hit data for this sequence.
722    It returns (match id, taxonomy string, log evalue, bitscore, alignment length,
723    percent identity, start1, end1) per hit.
724    
725    =cut
726    
727    sub get_best_hit_for_sequence {
728      my ($self, $id, $dataset) = @_;
729    
730      my $table = $self->dbtable;
731      my $dbid  = $self->get_dataset_id($dataset);
732    
733      my $sth = $self->dbh->prepare(qq(SELECT id2, tax_str, logpsc, bsc, ali_ln,
734                                            iden, b1, e1, b2, e2
735                                       FROM $table
736                                       WHERE id1=? AND dbid=?));
737      $sth->execute($id, $dbid);
738    
739    my $result = $sth->fetchall_arrayref();    my $result = $sth->fetchall_arrayref();
740    
741    return $result;    return $result;
# Line 546  Line 766 
766    
767  }  }
768    
769    =pod
770    
771    =item * B<get_genome_id> (I<tax_str>)
772    
773    =cut
774    
775    sub get_genome_id {
776      my ($self, $tax_str) = @_;
777      $tax_str =~ s/'/''/g;
778      my $retval =  $self->dbh->selectrow_array("select seq_num from rdp_to_tax where tax_str='". $tax_str . "'");
779      if (ref($retval) eq 'ARRAY') {
780        return $retval->[0];
781      } else {
782        return $retval;
783      }
784    }
785    
786    =pod
787    
788    =back
789    
790    =cut

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.21

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3