[Bio] / FigKernelPackages / gjoalignandtree.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoalignandtree.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Mon Apr 12 20:27:23 2010 UTC revision 1.3, Sat Aug 21 17:43:32 2010 UTC
# Line 1  Line 1 
1  package gjoalignandtree;  package gjoalignandtree;
2    
3    # This is a SAS component.
4    
5    #-------------------------------------------------------------------------------
6    #
7    #  Order a set of sequences:
8    #
9    #    @seqs = order_sequences_by_length( \@seqs, \%opts )
10    #   \@seqs = order_sequences_by_length( \@seqs, \%opts )
11    #
12    #  Write representative seqeunce groups to a file
13    #
14    #      write_representative_groups( \%rep, $file )
15  #  #
16    #   @align = trim_align_to_median_ends( \@align, \%opts )
17    #  \@align = trim_align_to_median_ends( \@align, \%opts )
18    #
19    #   print_alignment_as_pseudoclustal( $align, \%opts )
20    #
21    #   $prof_rep = representative_for_profile( $align )
22    #
23    #    \@trimmed_seq             = extract_with_psiblast( $db, $profile, \%opts )
24    #  ( \@trimmed_seq, \%report ) = extract_with_psiblast( $db, $profile, \%opts )
25    #
26    #   $structured_blast = blastpgp(  $dbfile,  $profilefile, \%options )
27    #   $structured_blast = blastpgp( \@dbseq,   $profilefile, \%options )
28    #   $structured_blast = blastpgp(  $dbfile, \@profileseq,  \%options )
29    #   $structured_blast = blastpgp( \@dbseq,  \@profileseq,  \%options )
30    #
31    #   print_blast_as_records(  $file, \@queries )
32    #   print_blast_as_records( \*FH,   \@queries )
33    #   print_blast_as_records(         \@queries )     # STDOUT
34    #
35    #   \@queries = read_blast_from_records(  $file )
36    #   \@queries = read_blast_from_records( \*FH )
37    #   \@queries = read_blast_from_records( )          # STDIN
38    #
39    #   $exists = verify_db( $db, $type );
40    #
41    #-------------------------------------------------------------------------------
42    
43  use strict;  use strict;
44    use SeedAware;
45  use gjoseqlib;  use gjoseqlib;
46    use gjoparseblast;
47    
48  #-------------------------------------------------------------------------------  #-------------------------------------------------------------------------------
49  #  Order a set of sequences:  #  Order a set of sequences:
# Line 13  Line 53 
53  #  #
54  #  Options:  #  Options:
55  #  #
56  #     order  => key  # increasing (D), decreasing, median_outward,  #     order  => $key              # increasing (D), decreasing, median_outward,
57  #                    #       closest_to_median, closest_to_length  #                    #       closest_to_median, closest_to_length
58  #     length => prefered_length  #     length => $prefered_length
59  #  #
60  #-------------------------------------------------------------------------------  #-------------------------------------------------------------------------------
61    
# Line 152  Line 192 
192    
193  #-------------------------------------------------------------------------------  #-------------------------------------------------------------------------------
194  #  #
195    #   print_alignment_as_pseudoclustal( $align, \%opts )
196    #
197  #   Options:  #   Options:
198  #  #
199  #        file  =>  $filename  #  supply a file name to open and write  #        file  =>  $filename  #  supply a file name to open and write
# Line 181  Line 223 
223    
224      my $id;      my $id;
225      my @lines = map { $id = $_->[0]; [ map { sprintf $fmt, $id, $_ }      my @lines = map { $id = $_->[0]; [ map { sprintf $fmt, $id, $_ }
226                                         map { $case < 0 ? lc $_ : $case > 0 ? up $_ : $_ }  # map sequence only                                         map { $case < 0 ? lc $_ : $case > 0 ? uc $_ : $_ }  # map sequence only
227                                         $_->[2] =~ m/.{1,$line_len}/g                                         $_->[2] =~ m/.{1,$line_len}/g
228                                       ] }                                       ] }
229    
# Line 199  Line 241 
241    
242    
243  #-------------------------------------------------------------------------------  #-------------------------------------------------------------------------------
244    #
245    #    @seqs = read_pseudoclustal( )              #  D = STDIN
246    #   \@seqs = read_pseudoclustal( )              #  D = STDIN
247    #    @seqs = read_pseudoclustal(  $file_name )
248    #   \@seqs = read_pseudoclustal(  $file_name )
249    #    @seqs = read_pseudoclustal( \*FH )
250    #   \@seqs = read_pseudoclustal( \*FH )
251    #
252    #-------------------------------------------------------------------------------
253    
254    sub read_pseudoclustal
255    {
256        my ( $file ) = @_;
257        my ( $fh, $close ) = input_file_handle( $file );
258        my %seq;
259        my @ids;
260        while ( <$fh> )
261        {
262            chomp;
263            my ( $id, $data ) = /^(\S+)\s+(\S.*)$/;
264            if ( defined $id && defined $data )
265            {
266                push @ids, $id if ! $seq{ $id };
267                $data =~ s/\s+//g;
268                push @{ $seq{ $id } }, $data;
269            }
270        }
271        close $fh if $close;
272    
273        my @seq = map { [ $_, '', join( '', @{ $seq{ $_ } } ) ] } @ids;
274        wantarray ? @seq : \@seq;
275    }
276    
277    
278    #-------------------------------------------------------------------------------
279  #  The profile 'query' sequence:  #  The profile 'query' sequence:
280  #     1. No terminal gaps, if possible  #     1. No terminal gaps, if possible
281  #        1.1 Otherwise, no gaps at start  #        1.1 Otherwise, no gaps at start
# Line 215  Line 292 
292      my @cand;      my @cand;
293      @cand = grep { $_->[2] =~ /^[^-].*[^-]$/ } @$align;            # No end gaps      @cand = grep { $_->[2] =~ /^[^-].*[^-]$/ } @$align;            # No end gaps
294      @cand = grep { $_->[2] =~ /^[^-]/ }        @$align if ! @cand; # No start gaps      @cand = grep { $_->[2] =~ /^[^-]/ }        @$align if ! @cand; # No start gaps
     @cand = @$align if ! @cand;  
295    
296      my ( $rep ) = map  { $->[0] }               # sequence entry      my ( $rep ) = map  { $_->[0] }                     # sequence entry
297                    sort { $b->[1] <=> $a->[1] }  # max nongaps                    sort { $b->[1] <=> $a->[1] }  # max nongaps
298                    map  { [ $_, tr/-//c ] }      # count nongaps                    map  { [ $_, $_->[2] =~ tr/-//c ] }  # count nongaps
299                    @cand;                    @cand;
300    
301      $rep = [ @$rep ];       # Make a copy      $rep = [ @$rep ];       # Make a copy
# Line 231  Line 307 
307    
308  #-------------------------------------------------------------------------------  #-------------------------------------------------------------------------------
309  #  #
310    #    \@seq             = extract_with_psiblast( $db, $profile, \%opts )
311    #  ( \@seq, \%report ) = extract_with_psiblast( $db, $profile, \%opts )
312    #
313    #     $db      can be $db_file_name      or \@db_seqs
314    #     $profile can be $profile_file_name or \@profile_seqs
315    #
316    #     If supplied as file, profile is pseudoclustal
317    #
318    #     Report records:
319    #
320    #         [ $sid, $slen, $status, $frac_id, $frac_pos,
321    #                 $q_uncov_n_term, $q_uncov_c_term,
322    #                 $s_uncov_n_term, $s_uncov_c_term ]
323    #
324    #  Options:
325    #
326    #     e_value       =>  $max_e_value    #  maximum blastpgp E-value (D = 0.01)
327    #     max_e_val     =>  $max_e_value    #  maximum blastpgp E-value (D = 0.01)
328    #     max_q_uncov   =>  $aa_per_end     #  maximum unmatched query (D = 20)
329    #     max_q_uncov_c =>  $aa_per_end     #  maximum unmatched query, c-term (D = 20)
330    #     max_q_uncov_n =>  $aa_per_end     #  maximum unmatched query, n-term (D = 20)
331    #     min_ident     =>  $frac_ident     #  minimum fraction identity (D =  0.15)
332    #     min_positive  =>  $frac_positive  #  minimum fraction positive scoring (D = 0.20)
333    #     nresult       =>  $max_seq        #  maximim matching sequences (D = 5000)
334    #     query         =>  $q_file_name    #  query sequence file (D = most complete)
335    #     query         => \@q_seq_entry    #  query sequence (D = most complete)
336    #     stderr        =>  $file           #  blastpgp stderr (D = /dev/stderr)
337    #
338    #  If supplied, query must be identical to a profile sequence, but no gaps.
339    #
340    #-------------------------------------------------------------------------------
341    
342    sub extract_with_psiblast
343    {
344        my ( $profile, $seqs, $opts ) = @_;
345    
346        $opts && ref $opts eq 'HASH' or $opts = {};
347    
348        $opts->{ e_value } ||= $opts->{ max_e_val } ||= 0.01;
349        $opts->{ nresult } ||= 5000;
350        my $max_q_uncov_c = $opts->{ max_q_uncov_c } || $opts->{ max_q_uncov } || 20;
351        my $max_q_uncov_n = $opts->{ max_q_uncov_n } || $opts->{ max_q_uncov } || 20;
352        my $min_ident     = $opts->{ min_ident }    ||  0.15;
353        my $min_pos       = $opts->{ min_positive } ||  0.20;
354    
355        my $blast = blastpgp( $profile, $seqs, $opts );
356        $blast && @$blast or return ();
357    
358        my ( $qid, $qdef, $qlen, $qhits ) = @{ $blast->[0] };
359    
360        my @trimmed;
361        my %report;
362    
363        foreach my $sdata ( @$qhits )
364        {
365            my ( $sid, $sdef, $slen, $hsps ) = @$sdata;
366    
367            if ( one_real_hsp( $hsps ) )
368            {
369                #  [ scr, exp, p_n, pval, nmat, nid, nsim, ngap, dir, q1, q2, qseq, s1, s2, sseq ]
370                #     0    1    2    3     4     5    6     7     8   9   10   11   12  13   14
371    
372                my $hsp0 = $hsps->[0];
373                my ( $scr, $nmat, $nid, $npos, $q1, $q2, $qseq, $s1, $s2, $sseq ) = ( @$hsp0 )[ 0, 4, 5, 6, 9, 10, 11, 12, 13, 14 ];
374    
375                my $status;
376                if    ( $q1-1     > $max_q_uncov_n ) { $status = 'missing start' }
377                elsif ( $qlen-$q2 > $max_q_uncov_c ) { $status = 'missing end' }
378                elsif ( $nid  / $nmat < $min_ident ) { $status = 'low identity' }
379                elsif ( $npos / $nmat < $min_pos )   { $status = 'low positives' }
380                else
381                {
382                    $sseq =~ s/-+//g;
383                    $sdef .= " ($s1-$s2/$slen)" if ( ( $s1 > 1 ) || ( $s2 < $slen ) );
384                    push @trimmed, [ $sid, $sdef, $sseq ];
385                    $status = 'included';
386                }
387    
388                $report{ $sid } = [ $sid, $slen, $status, $nid/$nmat, $npos/$nmat, $q1-1, $qlen-$q2, $s1-1, $slen-$s2 ];
389            }
390        }
391    
392        wantarray ? ( \@trimmed, \%report ) : \@trimmed;
393    }
394    
395    
396    #-------------------------------------------------------------------------------
397    #
398    #  Allow fragmentary matches inside of the highest-scoring hsp:
399    #
400    #-------------------------------------------------------------------------------
401    
402    sub one_real_hsp
403    {
404        my ( $hsps ) = @_;
405        return 0 if ! ( $hsps && ( ref( $hsps ) eq 'ARRAY' ) && @$hsps );
406        return 1 if  @$hsps == 1;
407    
408        my ( $q1_0, $q2_0 ) = ( @{ $hsps->[0] } )[9, 10];
409        for ( my $i = 1; $i < @$hsps; $i++ )
410        {
411            my ($q1, $q2) = ( @{ $hsps->[$i] } )[9, 10];
412            return 0 if $q1 < $q1_0 || $q2 > $q2_0;
413        }
414    
415        return 1;
416    }
417    
418    
419    #-------------------------------------------------------------------------------
420    #
421  #   $structured_blast = blastpgp( $db, $profile, $options )  #   $structured_blast = blastpgp( $db, $profile, $options )
422  #  #
423  #  Required:  #  Required:
 #  db_file or db_seq  
 #  profile_file or profile_seq  
424  #  #
425  #  Optional:  #     $db_file      or \@db_seq
426  #  query => q_file  #     $profile_file or \@profile_seq
427  #  query => q_seq  #
428    #  Options:
429    #
430    #     e_value   =>  $max_e_value   # D = 0.01
431    #     max_e_val =>  $max_e_value   # D = 0.01
432    #     nresult   =>  $max_seq       # D = 5000
433    #     query     =>  $q_file_name   # most complete
434    #     query     => \@q_seq_entry   # most complete
435  #  #
436  #-------------------------------------------------------------------------------  #-------------------------------------------------------------------------------
437    
# Line 247  Line 439 
439  {  {
440      my ( $db, $profile, $opts ) = @_;      my ( $db, $profile, $opts ) = @_;
441      $opts ||= {};      $opts ||= {};
442        my $tmp = SeedAware::location_of_tmp();
443    
444      my ( $dbfile, $rm_db );      my ( $dbfile, $rm_db );
445      if ( defined $db && ref $db )      if ( defined $db && ref $db )
# Line 255  Line 448 
448              && @$db              && @$db
449              || print STDERR "blastpgp requires one or more database sequences.\n"              || print STDERR "blastpgp requires one or more database sequences.\n"
450                 && return undef;                 && return undef;
451          $dbfile = "blastpgp_db_$$";          $dbfile = "$tmp/blastpgp_db_$$";
452          gjoseqlib::print_alignment_as_fasta( $dbfile, $db );          gjoseqlib::print_alignment_as_fasta( $dbfile, $db );
453          $rm_db = 1;          $rm_db = 1;
454      }      }
# Line 267  Line 460 
460      {      {
461          die "blastpgp requires database.";          die "blastpgp requires database.";
462      }      }
463      SEED_utils::verify_db( $db, 'T' );  # protein      verify_db( $dbfile, 'P' );  # protein
464    
465      my ( $proffile, $rm_profile );      my ( $proffile, $rm_profile );
466      if ( defined $profile && ref $profile )      if ( defined $profile && ref $profile )
# Line 276  Line 469 
469              && @$profile              && @$profile
470              || print STDERR "blastpgp requires one or more profile sequences.\n"              || print STDERR "blastpgp requires one or more profile sequences.\n"
471                 && return undef;                 && return undef;
472          $proffile = "blastpgp_profile_$$";          $proffile = "$tmp/blastpgp_profile_$$";
473          &print_alignment_as_pseudoclustal( $profile,  { file => $proffile, upper => 1 } );          &print_alignment_as_pseudoclustal( $profile,  { file => $proffile, upper => 1 } );
474          $rm_profile = 1;          $rm_profile = 1;
475      }      }
# Line 293  Line 486 
486      my $query = $opts->{ query };      my $query = $opts->{ query };
487      if ( defined $query && ref $query )      if ( defined $query && ref $query )
488      {      {
489          ref $query eq 'ARRAY'          ref $query eq 'ARRAY' && @$query == 3
490              && @$query              or print STDERR "blastpgp invalid query sequence.\n"
491              || print STDERR "blastpgp invalid query sequence.\n"                 and return undef;
492                 && return undef;  
493          $qfile = "blastpgp_query_$$";          $qfile = "$tmp/blastpgp_query_$$";
494          gjoseqlib::print_alignment_as_fasta( $qfile, [$query] );          gjoseqlib::print_alignment_as_fasta( $qfile, [$query] );
495          $rm_query = 1;          $rm_query = 1;
496      }      }
# Line 308  Line 501 
501      elsif ( $profile )    #  Build it from profile      elsif ( $profile )    #  Build it from profile
502      {      {
503          $query = &representative_for_profile( $profile );          $query = &representative_for_profile( $profile );
504          $qfile = "blastpgp_query_$$";          $qfile = "$tmp/blastpgp_query_$$";
505          gjoseqlib::print_alignment_as_fasta( $qfile, [$query] );          gjoseqlib::print_alignment_as_fasta( $qfile, [$query] );
506          $rm_query = 1;          $rm_query = 1;
507      }      }
508      else      else
509      {;      {
510          die "blastpgp requires database.";          die "blastpgp requires database.";
511      }      }
512    
513      my $nkeep = $opts->{ nresult } || 5000;      my $nkeep = $opts->{ nresult } || $opts->{ n_result }  || 5000;
514      my $cmd = "blastpgp -B '$proffile' -j 1 -d '$dbfile -b $nkeep -v $nkeep -i '$qfile'";      my $e_val = $opts->{ e_value } || $opts->{ max_e_val } ||    0.01;
515        my @cmd = ( SeedAware::executable_for( 'blastpgp' ),
516                    '-B', $proffile,
517                    '-j', 1,
518                    '-d', $dbfile,
519                    '-b', $nkeep,
520                    '-v', $nkeep,
521                    '-i', $qfile,
522                    '-e', $e_val
523                  );
524    
525      open BLAST, "$cmd |" or print STDERR "Failed to open: '$cmd'.\n"      my $blastfh = SeedAware::read_from_pipe_with_redirect( @cmd, $opts->{stderr} ? { stderr => $opts->{stderr} } : () )
526               or print STDERR "Failed to open: '" . join( ' ', @cmd ), "'.\n"
527                              and return undef;                              and return undef;
528      my $out = &gjoparseblast::structured_blast_output( \*BLAST, 1 );  # selfmatches okay      my $out = &gjoparseblast::structured_blast_output( $blastfh, 1 );  # selfmatches okay
529      close BLAST;      close $blastfh;
530    
531      if ( $rm_db )      if ( $rm_db )
532      {      {
# Line 338  Line 541 
541    
542    
543  #-------------------------------------------------------------------------------  #-------------------------------------------------------------------------------
544    #  Get an input file handle, and boolean on whether to close or not:
545    #
546    #  ( \*FH, $close ) = input_file_handle(  $filename );
547    #  ( \*FH, $close ) = input_file_handle( \*FH );
548    #  ( \*FH, $close ) = input_file_handle( );                   # D = STDIN
549    #
550    #-------------------------------------------------------------------------------
551    
552    sub input_file_handle
553    {
554        my ( $file, $umask ) = @_;
555    
556        my ( $fh, $close );
557        if ( defined $file )
558        {
559            if ( ref $file eq 'GLOB' )
560            {
561                $fh = $file;
562                $close = 0;
563            }
564            elsif ( -f $file )
565            {
566                open( $fh, "<$file") || die "input_file_handle could not open '$file'.\n";
567                $close = 1;
568            }
569            else
570            {
571                die "input_file_handle could not find file '$file'.\n";
572            }
573        }
574        else
575        {
576            $fh = \*STDIN;
577            $close = 0;
578        }
579    
580        return ( $fh, $close );
581    }
582    
583    
584    #-------------------------------------------------------------------------------
585  #  Get an output file handle, and boolean on whether to close or not:  #  Get an output file handle, and boolean on whether to close or not:
586  #  #
587  #  ( \*FH, $close ) = output_file_handle(  $filename );  #  ( \*FH, $close ) = output_file_handle(  $filename );
# Line 444  Line 688 
688  sub read_blast_from_records  sub read_blast_from_records
689  {  {
690      my ( $file ) = @_;      my ( $file ) = @_;
691      my ( $fh, $close );  
692      if ( $file && ref $file eq 'GLOB' )      my ( $fh, $close ) = input_file_handle( $file );
693      {      $fh or print STDERR "read_blast_from_records could not open input file.\n"
         $fh = $file;  
     }  
     elsif ( defined $file )  
     {  
         -f $file && open( $fh, "<$file" )  
             or print STDERR "read_blast_from_records could not open '$file'.\n"  
694                  and return wantarray ? () : [];                  and return wantarray ? () : [];
695          $close = 1;  
     }  
     else  
     {  
         $fh = \@STDIN;  
     }  
696      my @queries = ();      my @queries = ();
697      my @qdata;      my @qdata;
698      my @subjects = ();      my @subjects = ();
# Line 501  Line 734 
734  }  }
735    
736    
737    sub verify_db
738    {
739        my ( $db, $type ) = @_;
740    
741        my @args;
742        if ($type =~ /^p/i && ((! -s "$db.psq") || (-M "$db.psq" > -M $db)))
743        {
744            @args = ( "-p", "T", "-i", $db );
745        }
746        elsif ((! -s "$db.nsq") || (-M "$db.nsq" > -M $db))
747        {
748            @args = ( "-p", "F", "-i", $db );
749        }
750        else
751        {
752            return 1;
753        }
754    
755        #
756        # Find formatdb; if we're operating in a SEED environment
757        # use it from there.
758        #
759    
760        my $prog = SeedAware::executable_for( 'formatdb' );
761        if ( ! $prog )
762        {
763            warn "gjoalignandtree::verify_db: formatdb program not found.\n";
764            return 0;
765        }
766    
767        my $rc = system( $prog, @args );
768    
769        if ( $rc != 0 )
770        {
771            my $cmd = join( ' ', $prog, @args );
772            warn "gjoalignandtree::verify_db: formatdb failed with rc = $rc: $cmd\n";
773            return 0;
774        }
775    
776        return 1;
777    }
778    
779    
780  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.3

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3