[Bio] / FigKernelScripts / build_aligns_and_trees_for_seqs.pl Repository:
ViewVC logotype

View of /FigKernelScripts/build_aligns_and_trees_for_seqs.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Sat Mar 3 20:34:59 2012 UTC (7 years, 8 months ago) by fangfang
Branch: MAIN
CVS Tags: rast_rel_2014_0729, mgrast_version_3_2, rast_rel_2014_0912, HEAD
Changes since 1.1: +0 -6 lines
removed comments

use strict;
use Getopt::Long;
use Proc::ParallelLoop;
use gjoseqlib;

my $usage = "Usage: $0 -d AT_dir -f seq_file -i i_th_seq -s start_seq -n num_seqs -np n_procs <pegs.fa\n\n";

my $atdir   = "/home/fangfang/geassm/fangfang/AT/data";
my $nproc   = 1;
my $nthread = 1;
my $ith     = 0;
my $start   = 0;
my $n;
my $seqfile;

GetOptions("d=s"  => \$atdir,
           "f=s"  => \$seqfile,
           "i=i"  => \$ith,
           "n=i"  => \$n,
           "np=i" => \$nproc,
           "nt=i" => \$nthread,
           "s=i"  => \$start);

verify_dir($atdir);

my $seqs;

if ($seqfile && -s $seqfile) {
    $seqs = gjoseqlib::read_fasta($seqfile);
} else {
    $seqs = gjoseqlib::read_fasta();
}

if ($start > 0) {
    $seqs = [ @{$seqs}[$start..@$seqs-1] ];
}

if ($n > 0 && $n < @$seqs) {
    $seqs = [ @{$seqs}[0..$n-1] ];
}

if ($ith > 0 && $ith <= @$seqs) {
    process_seq($seqs->[$ith-1]);
} else {
    if ($nproc > 1) {
        pareach($seqs, \&process_seq, { Max_Workers => $nproc });
    } else {
        process_seq($_) for @$seqs;
    }
}

sub process_seq {
    my ($seq) = @_;
    return unless $seq && $seq->[2] && length($seq->[2]) > 60;

    my $peg = $seq->[0]; 
    my $name = $peg; $name =~ s/fig\|//;

    print STDERR "Processing $peg\n";

    my $dir = "$atdir/$name";
    verify_dir($dir);

    chdir($dir);

    gjoseqlib::print_alignment_as_fasta("query", [$seq]);

    run("svr_psiblast_search -l -b SEED -a $nthread -cq 0.8 -cs 0.8 -n 200 <query >hits.1");
    run("svr_representative_sequences -s 0.99 <hits.1 >reps.1");
    run("svr_align_seqs -l -mafft -auto <reps.1 >ali.1");
    run("svr_trim_ali -l -c -cd <ali.1 >trim.1");

    run("svr_psiblast_search -l -b SEED -a $nthread -cq 0.75 -cs 0.5 -n 500 <trim.1 >hits.2");
    run("svr_representative_sequences -s 0.95 <hits.2 >reps.2");
    run("svr_align_seqs -l -mafft -auto <reps.2 >ali.2");
    run("svr_trim_ali -l -c <ali.2 >trim.2");

    run("svr_psiblast_search -l -b PPSEED -a $nthread -cq 0.7 -cs 0.5 -n 2000 <trim.2 >hits.3");

    my $nhits = `grep "^>" hits.3 |wc -l`; chomp($nhits);
    my $sim = $nhits > 500 ? 0.80 :
              $nhits > 200 ? 0.90 :
              $nhits > 100 ? 0.95 : 0.99; 

    run("svr_representative_sequences -s $sim -g ../../woese.gid <hits.3 >reps.3");

    run("svr_align_seqs -l -mafft -alg linsi -z <reps.3 >ali.3");
    run("svr_trim_ali -l -c <ali.3 >trim.3");
    run("svr_tree -l -s -c 20 <trim.3 >tree.3");

    run("svr_psiblast_search -l -b PPSEED -a $nthread -u 100 -cq 0.6 -cs 0.01 -i 0.2 -p 0.30 -r report.4 -n 20000 <trim.3 >hits.4");

    $nhits = `grep "^>" hits.4 |wc -l`; chomp($nhits);
    my $lin = $nhits >  500 ? '-auto' : '-alg linsi';
    my $spr = $nhits > 1000 ? '' : '-s';

    run("svr_align_seqs -l -mafft $lin <hits.4 >ali");
    run("svr_tree -l -c 20 $spr <ali >tree");

    return unless -s 'tree';

    run("cp ali  ../../done/$name.ali");
    run("cp tree ../../done/$name.tree");

    print "Done $peg\n";
}

sub verify_dir {
    my ($dirName) = @_;
    $dirName =~ s#/$##;
    if (! -d $dirName) {
        if ($dirName =~ m#(.+)/[^/]+$#) {
            verify_dir($1);
        }
        mkdir $dirName, 0755;
    }
}

sub run {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my ($cmd) = @_;

    if ($ENV{FIG_VERBOSE}) {
        my @tmp = `date`;
        chomp @tmp;
        print STDERR "$tmp[0]: running $cmd\n";
    }
    (system($cmd) == 0) || die("FAILED: $cmd");
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3