[Bio] / FigKernelPackages / FIG.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FIG.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.651, Tue Feb 5 01:09:49 2008 UTC revision 1.652, Thu Feb 7 21:35:39 2008 UTC
# Line 30  Line 30 
30  #  #
31    
32  use FileLocking;  use FileLocking;
33    use DB_File;
34    
35  use Fcntl qw/:flock/;  # import LOCK_* constants  use Fcntl qw/:flock/;  # import LOCK_* constants
36    
# Line 10320  Line 10321 
10321    
10322  =head3 dsims  =head3 dsims
10323    
10324  usage: @sims = $fig->dsims($peg,$maxN,$maxP,$select)  usage: @sims = $fig->dsims($id,$seq,$maxN,$min_nbsc)
10325    
10326  Returns a list of similarities for $peg such that  Returns a list of similarities for $seq against PEGs from FIGfams such that
   
     there will be at most $maxN similarities,  
10327    
10328      each similarity will have a P-score <= $maxP, and      there will be at most $maxN similarities, and
10329    
10330      $select gives processing instructions:      each similarity will have a normalized bit-score >= $min_nbsc
   
         "raw" means that the similarities will not be expanded (by far fastest option)  
         "fig" means return only similarities to fig genes  
         "all" means that you want all the expanded similarities.  
   
 By "expanded", we refer to taking a "raw similarity" against an entry in the non-redundant  
 protein collection, and converting it to a set of similarities (one for each of the  
 proteins that are essentially identical to the representative in the nr).  
10331    
10332  The "dsims" or "dynamic sims" are not precomputed.  They are computed using a heuristic which  The "dsims" or "dynamic sims" are not precomputed.  They are computed using a heuristic which
10333  is much faster than blast, but misses some similarities.  Essentially, you have an "index" or  is much faster than blast, but misses some similarities.  Essentially, you have an "index" or
10334  representative sequences, a quick blast is done against it, and if there are any hits these are  representative sequences, a quick blast is done against it, and if there are any hits these are
10335  used to indicate which sub-databases to blast against.  used to indicate which sub-databases to blast against.  This implies that the p-scores are fairly
10336    meaningless; use the normalized bit-scores ($sim->nbsc)
10337    
10338  =cut  =cut
10339    
10340  sub dsims {  sub dsims {
10341      my($self,$id,$seq,$maxN,$maxP,$select) = @_;      my($self,$id,$seq,$maxN,$min_nbsc,$figfams_data) = @_;
10342      my($sim,$sub_dir,$db,$hit,@hits,%in);      my($sim,$partition,%hits);
10343    
10344      my @index = &blastit($id,$seq,"$FIG_Config::global/SimGen/exemplar.fasta",1.0e-3);      my %reps_hash;
10345      foreach $sim (@index)      my $reps_hash_tie = tie %reps_hash, 'DB_File', "$figfams_data/partition_reps.db", O_RDWR, 0666, $DB_HASH;
10346        $reps_hash_tie || die "tie failed";
10347    
10348        my $reps_db = "$figfams_data/partition_reps.fasta";
10349        if (! $reps_db) { $reps_db = "$FIG_Config::data/FigfamsData/partitions_rep.fasta" }
10350        (-s $reps_db) || return ();
10351    
10352        my @index = &blastit($id,$seq,$reps_db,1.0e-3);
10353        my %partitions;
10354        foreach my $rep (map { $_->id2 } @index)
10355      {      {
10356          if ($sim->id2 =~ /_(\d+)$/)          foreach $partition (split(/\t/,$reps_hash{$rep}))
10357          {          {
10358              $in{$1}++;              $partitions{$partition} = 1;
10359          }          }
10360      }      }
10361        foreach $partition (keys(%partitions))
     @hits = ();  
     foreach $db (keys(%in))  
10362      {      {
10363          $sub_dir = $db % 1000;          my $part_db = &partition_db($figfams_data,$partition);
10364          push(@hits,&blastit($id,$seq,"$FIG_Config::global/SimGen/AccessSets/$sub_dir/$db",$maxP));          my @hits = &blastit($id,$seq,$part_db,1.0e-5);
10365            foreach my $sim (@hits)
10366      }          {
10367                my $id2 = $sim->id2;
10368      if (@hits == 0)              my $nbsc = $sim->nbsc;
10369                if ((! $hits{$id2}) || ($hits{$id2}->nbsc < $sim->nbsc))
10370      {      {
10371          push(@hits,&blastit($id,$seq,"$FIG_Config::global/SimGen/nohit.fasta",$maxP));                  $hits{$id2} = $sim;
10372                }
10373      }      }
10374        }
10375        return sort { $b->nbsc <=> $a->nbsc } map { $hits{$_} } keys(%hits);
10376    }
10377    
10378    sub partition_db {
10379        my($ff,$partition) = @_;
10380    
10381      @hits = sort { ($a->psc <=> $b->psc) or ($a->iden cmp $b->iden) } grep { $_->id2 ne $id } @hits;      my $mod = $partition % 1000;
10382      if ($maxN && ($maxN < @hits)) { $#hits = $maxN - 1 }      return "$ff/Partitions/$mod/$partition/fasta";
     return expand_raw_sims( $self, \@hits, $maxP, $select );  
10383  }  }
10384    
10385  sub blastit {  sub blastit {
10386      shift if UNIVERSAL::isa($_[0],__PACKAGE__);      shift if UNIVERSAL::isa($_[0],__PACKAGE__);
10387      my($id,$seq,$db,$maxP) = @_;      my($id,$seq,$db,$maxP) = @_;
10388    
10389        if (! -s $db) { return () }
10390      if (! $maxP) { $maxP = 1.0e-5 }      if (! $maxP) { $maxP = 1.0e-5 }
10391      my $tmp = &Blast::blastp([[$id,$seq]],$db,"-e $maxP");      my $tmp = &Blast::blastp([[$id,$seq]],$db,"-e $maxP");
10392      my $tmp1 = $tmp->{$id};      my $tmp1 = $tmp->{$id};

Legend:
Removed from v.1.651  
changed lines
  Added in v.1.652

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3