[Bio] / FigKernelScripts / wb.pl Repository:
ViewVC logotype

View of /FigKernelScripts/wb.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Mon Mar 8 21:42:52 2010 UTC (9 years, 8 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
workbench for alignments and trees

########################################################################
# -*- perl -*-
########################################################################
# Copyright (c) 2003-2008 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

use Carp;
use Data::Dumper;
use Time::HiRes qw(gettimeofday);
use Time::Local;
use gjoseqlib;
use gjoalignment;
use representative_sequences;
use gjoalignandtree;

use SeedEnv;
my $sapO = SAPserver->new;

my($user);

# usage: wb [-echo] [-time] [-ATdir ATdir] [command]

$echo       = 0;
$time_cmds  = 0;
$ATdir = '/home/overbeek/Ross/GaryTrees/WB/ATdir';

while ((@ARGV > 0) && ($ARGV[0] =~ /^-/))
{
    $arg = shift @ARGV;
    if ($arg =~ /^-time/i) { $time_cmds = 1 }
    if ($arg =~ /^-echo/i) { $echo      = 1 }
    if ($arg =~ /^-ATdir/)
    {
	$ATdir = shift @ARGV;
    }
}

my($t1,$t2);
if (@ARGV > 0)  { $req = join( " ", @ARGV ); }
while ( (defined($req) && $req) || ((@ARGV == 0) && ($req = &get_req)) )
{
    if ($time_cmds)
    {
	$t1 = gettimeofday;
    }

    if ($req =~ /^\s*h\s*$/ || $req =~ /^\s*help\s*$/)
    {
     &help;
    }
    elsif ($req =~ /^\s*show_subsys\s*$/)
    {
	&show_all_subsys($sapO);
    }
    elsif ($req =~ /^\s*show_FR_index\s*$/)
    {
	&show_FR_index($ATdir);
    }
    elsif ($req =~ /^\s*build_reps\s+(\d+)\s*$/)
    {
	&build_reps($sapO,$ATdir,$1);
    }
    elsif ($req =~ /^\s*summarize_FR\s+(\d+)\s*$/)
    {
	&summarize_FR($ATdir,$1);
    }
    elsif ($req =~ /^\s*get_FR\s+\'(\S[^\']*\S)\'\s+\'(.*)\'\s*$/)
    {
	my $subsys = $1;
	my @roles = split(/\'\s+\'/,$2);
	&get_FR($sapO,$ATdir,$subsys,\@roles);
    }
    elsif ($req =~ /^\s*align1\s+(\d+)\.(\d+)\s*$/) 
    {
	my ($index, $subI) = ($1, $2);
	&align1($sapO, $ATdir, $index, $subI);
    }
    elsif ($req =~ /^\s*trim1\s+(\d+)\.(\d+)\s*$/) 
    {
	my ($index, $subI) = ($1, $2);
	&trim1($sapO, $ATdir, $index, $subI);
    }
    elsif ($req =~ /^\s*process\s+\'(\S[^\']*\S)\'\s+\'(.*)\'\s*$/)
    {
	my $subsys = $1;
	my @roles = split(/\'\s+\'/,$2);
	my $index = &get_FR($sapO,$ATdir,$subsys,\@roles);
	&build_reps($sapO,$ATdir,$index);
	opendir(TMP,"$ATdir/$index") || die "could not open $ATdir/$index";
	my @subIs = map { ($_ =~ /^\d+\.(\d+)$/) ? $1 : ()  } readdir(TMP);
	closedir(TMP);
	foreach my $subI (@subIs) {
	    &align1($sapO,$ATdir,$index,$subI);
	    &trim1($sapO,$ATdir,$index,$subI);
	}
    }
    elsif ($req =~ /^\s*extend_FR\s+(\d+)\s*$/)
    {
        my $index = $1;
        &extend_FR($sapO,$ATdir,$index);
    }
    elsif ($req =~ /^\s*show_roles_in_subsys\s+\'(\S.*\S)\'\s*$/)
    {
	my $subsys = $1;
	my $ssH = $sapO->subsystem_roles( -ids => [$subsys] );
	if (my $ss_entry = $ssH->{$subsys})
	{
	    foreach my $role (@$ss_entry)
	    {
		print "$role\n";
	    }
	}
	else
	{
	    print "Invalid subsystem\n";
	}
    }
    else
    {
	print "Invalid command\n";
    }
    print "\n";
    $req = "";

    if ($time_cmds)
    {
	$t2 = gettimeofday;
	print $t2-$t1," seconds to execute command\n\n";
    }
}

sub padded {
    my($x,$n) = @_;

    if (length($x) < $n)
    {
	return $x . (" " x ($n - length($x)));
    }
    return $x;
}

sub get_req {
    my($x);

    print "?? ";
    $x = <STDIN>;
    while (defined($x) && ($x =~ /^h$/i) )
    { 
	&help;
	print "?? ";
	$x = <STDIN>;
    }
    
    if ((! defined($x)) || ($x =~ /^\s*[qQxX]/))
    {
	return "";
    }
    else
    {
        if ($echo)
	{
	    print ">> $x\n";
	}
	return $x;
    }
}

sub show_all_subsys {
    my($sapO) = @_;

    my $ssH = $sapO->all_subsystems();
    my @ss = sort keys(%$ssH);
    foreach my $subsys (@ss)
    {
	print "$subsys\n";
    }
    print "\n";
}

sub build_fr_entry {
    my($sapO,$ATdir,$subsys,$roles) = @_;
    
    my $ts = time;

    &SeedUtils::verify_dir($ATdir);
    if (! -e "$ATdir/FR-index") { system "touch $ATdir/FR-index" }

    my @index = `cat $ATdir/FR-index`;
    my $nxt = @index + 1;
    my @peg_fasta_entries = &get_ss_entries($sapO,$subsys,$roles);
    if (@peg_fasta_entries > 0)
    {
	open(INDEX,">>$ATdir/FR-index") || die "Something is amiss with $ATdir/FR-index";
	print INDEX join("\t", $nxt, $ts, $subsys, @$roles),"\n";
	close(INDEX);

        if (-e "$ATdir/$nxt") { system "rm -rf $ATdir/$nxt" }
        mkdir("$ATdir/$nxt", 0777) || die "Could not make $ATdit/$nxt";
	&gjoseqlib::print_alignment_as_fasta("$ATdir/$nxt/seqs",\@peg_fasta_entries);
	return $nxt;
    }
    else
    {
	return 0;
    }
}

sub get_FR {
    my($sapO,$ATdir,$subsys,$roles) = @_;

    my $index;
    if ((@$roles > 0) && ($index = &build_fr_entry($sapO,$ATdir,$subsys,$roles)))
    {
	print "Built FR entry for index $index\n";
    }
    else
    {
	print "Failed to build entry for $subsys\n";
	foreach my $role (@$roles)
	{
	    print "\t$role\n";
	}
    }
    return $index;
}

sub get_ss_entries {
    my($sapO,$subsys,$roles) = @_;

    my $roleH = $sapO->pegs_implementing_roles( -subsystem => $subsys, -roles => $roles );
    my @pegs = map { @{$roleH->{$_}} } keys(%$roleH);
    my $idH  = $sapO->ids_to_sequences( -protein => 1, -ids => \@pegs );
    return map { [$_,'',$idH->{$_}] } keys(%$idH);
}

sub show_FR_index {
    my($ATdir) = @_;

    if (! -s "$ATdir/FR-index")
    {
	print "Sorry, no index exists for the FR entries\n";
    }
    else
    {
	my @index = `cat $ATdir/FR-index`;
	foreach $_ (@index)
	{
	    print $_;
	}
	print "\n";
    }
}

sub build_reps {
    my($sapO,$ATdir,$index) = @_;

    my $dir   = "$ATdir/$index";
    my $seqsF = "$dir/seqs";
    -s $seqsF || die "Could not find $seqsF";

    my @seqs        = &gjoseqlib::read_fasta($seqsF);
    @seqs           = &gjoalignandtree::order_sequences_by_length(\@seqs, { order => 'median_outward' } );
    my %to_tuple    = map { $_->[0] => $_ } @seqs;
    printf "Building reps for %d sequences...\n", scalar@seqs;

    my($reps,$repH) = &representative_sequences::rep_seq(\@seqs,{ max_ref_sim => 0.8 } );
    my @repIds      = map {$_->[0] } @$reps;
    
    &gjoseqlib::print_alignment_as_fasta("$dir/reps-0.8",$reps);
    open(GROUPS,">$dir/groups-0.8") || die "Could not make $dir/groups-0.8";
    foreach my $repId (@repIds)
    {
	print GROUPS join("\t",($repId, @{$repH->{$repId}})),"\n";
    }
    close(GROUPS);
    printf "Built %d reps with max similarity 0.8 \n", scalar@$reps;

    my($reps2,$repH2) = &representative_sequences::rep_seq($reps,{ max_ref_sim => 0.2 } );
    &gjoseqlib::print_alignment_as_fasta("$dir/reps-0.2", $reps2);
    open(GROUPS,">$dir/groups-0.2") || die "Could not make $dir/groups-0.2";
    foreach my $repId (map { $_->[0] } @$reps2)
    {
	print GROUPS join("\t",($repId, @{$repH2->{$repId}})),"\n";
    }
    close(GROUPS);
    printf "Built %d rep(s) with max similarity 0.2 \n", scalar@$reps2;

    if (@$reps2 > 1)
    {
        my $subI = 1;
        my @groups = `cat $dir/groups-0.2`;
        @groups = sort { split(/\t/, $b) <=> split(/\t/, $a) } @groups;
        foreach my $group (@groups) {
            my $subdir = "$dir/$index.$subI";
            &SeedUtils::verify_dir($subdir);
            chomp($group);
            my @repIds = split(/\t/, $group);
            open(GROUPS,">$subdir/groups-0.8") || die "Could not make $subdir/groups-0.8";
            my @seqIds;
            foreach my $repId (@repIds)
            {
                print GROUPS join("\t",($repId, @{$repH->{$repId}})),"\n";
                @seqIds = (@seqIds, $repId, @{$repH->{$repId}});
            }
            close(GROUPS);
            &gjoseqlib::print_alignment_as_fasta("$subdir/seqs",     [ map { $to_tuple{$_} } @seqIds ] );
            &gjoseqlib::print_alignment_as_fasta("$subdir/reps-0.8", [ map { $to_tuple{$_} } @repIds ] );
            printf "Built sub dir $index.$subI containing %5d sequences with %3d reps\n", scalar@seqIds, scalar@repIds;
            $subI++;
        }
    }
    elsif (@$reps2 == 1) 
    {
	my $subdir = "$dir/$index.1";
	&SeedUtils::verify_dir($subdir);
	system "cp $dir/seqs $subdir/";
	system "cp $dir/reps-0.8 $subdir/";
	system "cp $dir/groups-0.8 $subdir/";
    }

}

sub summarize_FR {
    my($ATdir,$index) = @_;

    my $dir   = "$ATdir/$index";
    my $seqsF = "$dir/seqs";

    if (! -d $dir) 
    {
        print "Directory $dir does not exist\n\n";
    }
    elsif (! -s $seqsF)
    {
        print "FR directory is empty\n\n";
    } 
    else 
    {
        my $nseqs = &num_seqs_in_fasta($seqsF);
        print "FR is associated with $nseqs sequences\n";
        if (! -s "$dir/groups-0.8" || ! -s "$dir/reps-0.8" || ! -s "$dir/groups-0.2") 
        {
            print "Reps have not been built for FR\n";
        } 
        else 
        {
            my @groups = `cat $dir/groups-0.8`;
            printf "%3d reps with max similarity 0.8\n", scalar@groups;
            my @groups2 = `cat $dir/groups-0.2`;
            if (@groups2 == 1) 
            {
                printf "Only 1 rep with max similarity 0.2\n";
            }
            else 
            {
                printf "%3d reps with max similarity 0.2\n", scalar@groups2;
                my $complete = 1;
                foreach (1..@groups2) 
                {
                    if (! -s "$dir/$index.$_/seqs" || ! -s "$dir/$index.$_/groups-0.8") 
                    {
                        $complete = 0;
                        last;
                    }
                }
                if (!$complete) 
                {
                    print "subgroups incomplete. consider rebuilding reps.\n";
                } 
                else 
                {
                    foreach (1..@groups2) 
                    {
                        my $subdir = "$dir/$index.$_";
                        my $nseqs = &num_seqs_in_fasta("$subdir/seqs");
                        my $nreps = &num_seqs_in_fasta("$subdir/reps-0.8");
                        printf "Subgroup $index.$_ contains %5d sequences with %3d reps\n", $nseqs, $nreps;
                    }
                }
            }
        }
    }
    print "\n";
}

sub align1 {
    my ($sapO, $ATdir, $index, $subI) = @_;
    my $dir = "$ATdir/$index/$index.$subI";
    print "Aligning reps in $index.$subI...\n";
    my @reps  = &gjoseqlib::read_fasta("$dir/reps-0.8");
    my $align = &gjoalignment::align_with_muscle(\@reps);
    &gjoseqlib::print_alignment_as_fasta("$dir/align1", $align);
}

sub trim1 {
    my ($sapO, $ATdir, $index, $subI) = @_;
    my $dir = "$ATdir/$index/$index.$subI";
    print "Trimming align1 in $index.$subI...\n";
    my @align = &gjoseqlib::read_fasta("$dir/align1");
    my $trim = &gjoalignandtree::trim_align_to_median_ends(\@align, { begin  => 1, end => 1 } );
    &gjoseqlib::print_alignment_as_fasta("$dir/trim1", $trim);    
}

sub num_seqs_in_fasta {
    my ($fasta) = @_;
    my $nseqs = `grep ">" $fasta |wc -l`;
    chomp($nseqs);
    return $nseqs;
}

sub extend_FR {
    my ($sapO,$ATdir,$index) = @_;
    my $dir = "$ATdir/$index";
    if (! -s "$dir/seqs") {
        print "Nothing to extend on: $dir/seqs does not exist.\n";
    } else {
        if (! -s "$dir/reps-0.8" || ! -s "$dir/groups-0.8" || ! -s "$dir/groups-0.2") {
            &build_reps($sapO, $ATdir, $index);
        }
        my @groups2 = `cat $dir/groups-0.2`;
        my $complete = 1;
        foreach my $subI (1 .. @groups2) {
            my $subdir = "$dir/$index.$subI";
            if (! -s "$subdir/seqs" || ! -s "$subdir/groups-0.8" || ! -s "$subdir/reps-0.8") {
                $complete = 0;
                last;
            }
        }
        if (!$complete) {
            &build_reps($sapO, $ATdir, $index);
        }
        foreach my $subI (1 .. @groups2) {
            my $subdir = "$dir/$index.$subI";
            &align1($sapO, $ATdir, $index, $subI) unless -s "$subdir/align1";
            &trim1 ($sapO, $ATdir, $index, $subI) unless -s "$subdir/trim1";
        }        
        print "FR $index is now complete\n";
    }
}

sub help {
    print <<END;
    align1                          FRindex.subIndex
    build_reps                      FRindex
    extend_FR                       FRindex                          
    get_FR                          'Subsystem' 'FR' ... 'FR'
    process                         'Subsystem' 'FR' ... 'FR' 
    show_roles_in_subsys            'Subsystem'
    show_subsys
    show_FR_index
    summarize_FR                    FRindex
    trim1                           FRindex.subIndex
    
END
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3