[Bio] / FigKernelScripts / k-class-adaboost-matrix-parallel.pl Repository:
ViewVC logotype

View of /FigKernelScripts/k-class-adaboost-matrix-parallel.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Fri Apr 17 20:12:39 2015 UTC (4 years, 7 months ago) by jdavis
Branch: MAIN
CVS Tags: HEAD
adding k-class-adaboost-matrix-parallel.pl

#! /usr/bin/env perl 
use strict;
use Getopt::Long;
use Data::Dumper;
use POSIX qw(floor);
#use Proc::ParallelLoop;
use Parallel::ForkManager;


my $usage = "k-class-adaboost-matrix-parallel.pl -d1 Dir1 -l1 list1(optional) -d2 Dir2 -l2 list2(optional) [options]>Matrix
  This program takes k-mer count files and does parallel joining operations to merge them into an
  Adaboost matrix.  Note that column order is not input order because of the parallelization.
  
    Required parameters:
     -d1 k-mer count directory 1 (kmers are in the format k-mer tab count newline)
     -d2 k-mer count directory 2
      
     Options
     -l1 list of files to use from -d1 (optional)
     -l2 list of files to use from -d2 (optional)
     -m max number of threads
     -f keep full length matrix
        otherwise, you only get the data-reduced matrix, reduction is as follows:
        The highest alphabetical kmer with a unique mathcing pattern is kept by default, 
        and the matrix file is reduced to a list of kmers with non redundant matching 
        profiles and printed alphabetically.  This is done because otherwise boosting takes
        days, it is a cheat, but it works becuase there is good overlap in matching patterns.
        This will break down if there is a genome that has no overlapping kmer matches with any other genome.";
        
        
my ($help, $dir1, $dir2, $list1, $list2, $process, $full);
my $rc = GetOptions('d1=s' => \$dir1,
                    'l1=s' => \$list1,
                    'd2=s' => \$dir2,
                    'l2=s' => \$list2,
                    'f'    => \$full,
                    'm=s'  => \$process,
                    'h'    => \$help);

if (($help) || (! $dir1) || (! $dir2)){die "$usage\n";}
unless ($process){$process = 8;}
 
$dir1 =~ s/\/$//g;
$dir2 =~ s/\/$//g;
my (@files1, @files2);

# read and join the files from each directory
if ($list1)
{
	open (IN1, "<$list1") || die "could not open list1: $list1\n";
	chomp (@files1 = (<IN1>));
	close IN1;
}
else
{
	opendir(D1,$dir1) || die "could not open the directory: $dir1\n";
	@files1 = grep { $_ !~ /^\./ } readdir(D1);
	closedir(D1);
}
if ($list2)
{
	open (IN2, "<$list2") || die "could not open list2: $list2\n";
	chomp (@files2 = (<IN2>));
	close IN2;
}
else
{
	opendir(D2,$dir2) || die "could not open the directory: $dir2\n";
	@files2 = grep { $_ !~ /^\./ } readdir(D2);
	closedir(D2);
}


# Join the individual files of a given type:

my $file1 = &do_the_join ($process, $dir1, \@files1);
system "mv $file1 $file1.1";

my $file2 = &do_the_join ($process, $dir2, \@files2);
system "mv $file2 $file2.2";

#
#  This is the ugly part where I have to read each file and substitute 
#   numbers for 1 in file1, and zeros for 1s in file 2, so that the adaboost
#    will work.
#


# Read in the joined file (for dir1) and change all matches to ones.
# All values >= set equal to 1.

open (IN1, "<$file1.1") || die "cant open $file1.1\n";
open (OUT1, ">$file1.1.1") || die "cant open $file1.1\n";

my $size1;
my $size2;
my $switch = 0;

while (<IN1>)
{
	chomp;
	s/([1-9]+)(0+)?/1/g;
	unless ($switch)
	{
		my @array = split /\t/;
    	$size1 = scalar @array;
       	$switch = 1;
	}
	print OUT1 "$_\n";
}

close IN1;
close OUT1;
unlink "$file1.1";
$switch = 0;

# Read in the joined file (for dir2) and change all matches to zeros.
# Change all zeros to ones.

open (IN2, "<$file2.2") || die "cant open $file2.2\n";
open (OUT2, ">$file2.2.2") || die "cant open $file2.2.2\n";
while (<IN2>)
{
	chomp;
	s/([1-9]+)(0+)?/#/g;
    s/0/1/g;
    s/#/0/g;

	unless ($switch)
	{
		my @array = split /\t/;
    	$size2 = scalar @array;
       	$switch = 1;
	}
	print OUT2 "$_\n";

}
close IN2;
close OUT2;
unlink "$file2.2";

my $tab = '\t';
my $columns = &make_col($size1, $size2);
my $last_join = "join -t \$\'$tab\' -a 1 -a 2 -e\'0\' -o $columns $file1.1.1 $file2.2.2 >$file1.$file2.merged";
print STDERR "$last_join\n";
system ($last_join);
unlink "$file2.2.2";
unlink "$file1.1.1";

open (IN, "<$file1.$file2.merged") || die "cant open file merged file\n";

my $kmers;
my $rev_kmers;

while (<IN>)
{
    chomp; 
    my @array = split /\t/;
    my $kmer = shift @array;
    my $pattern = join ("\t", @array);
        
    unless (exists $rev_kmers->{$pattern})
    {
        $rev_kmers->{$pattern} = $kmer;
        print "$kmer\t$pattern\n";
    }
}		
close IN;

unless ($full)
{
    unlink "$file1.$file2.merged";
}




#--------------------------------------------------------
#
#  $filename = do_the_join ($process, $dir, \@files);
#
# This subroutine takes the max number of processes, 
#    the original k-mer count directory, 
#    and the files, and then joins them assuming each is
#    tab-delimited and in the form k-mer\tcount.  Kmers must also be
#    in alphabetical order for the joins to work.
#
# It returns the name of the final joined file.  Note that columns are not
# In order because it is joined as follows:
#   round1:   1-2; 3-4; 5-6; 7-8; 9-10;
#             ...waits for completion,
#   round2    1&2-3&4; 5&6-7&8
#              ....waits
#   round3    9&10-1&2&3&4
#             ....waits
#   round4    9&10&1&2&3&4-5&6&7&8
#--------------------------------------------------------

sub do_the_join
{
    my $max_proc = shift @_;
    my $dir1 = shift @_;
    my $filesr = shift @_;
    my @files1 = @$filesr;
    
    my $tmp = $$;
    my $count = 0;

    my @files1_ready = map("$dir1/$_", @files1);
    my %ncol = map{$_, 2}(@files1_ready);
    my @commands;
    my @to_delete;

    
    #create the necessary commands
    while ((scalar @files1_ready) > 1)
    {
        $count ++;
        my $first  = shift @files1_ready;
        my $second = shift @files1_ready;
        my $tab = '\t';
        my $cmd2 = &make_col($ncol{$first}, $ncol{$second});
        my $output = "$tmp"."."."$count";
        my $final_cmd = "join -t \$\'$tab\' -a 1 -a 2 -e\'0\' -o $cmd2 $first $second > $output";
        push @{$commands[$ncol{$second}]}, $final_cmd;
        push @files1_ready, $output;
        push @to_delete, $output;
        $ncol{$output} = (($ncol{$first} + $ncol{$second}) -1 );
    }

    #for @command arrays, do the parallel joins
    for my $i (0..$#commands)
    {
        my $arrayr = @commands[$i];
        if ($arrayr)
        {
            my @array = @$arrayr;
            
            my $process = (scalar @array);
            if ($process > $max_proc)
            {
                $process = $max_proc;
            }
            my $manager = new Parallel::ForkManager( $process );
            foreach my $command (@array) 
            {
                $manager->start and next;
                print STDERR "$command\n";
                system( $command );
                $manager->finish;
            }
            $manager->wait_all_children;
        }   
    }

    my $last = pop @to_delete;
    foreach (@to_delete)
    {
        unlink $_;
    }
    return $last;
}


#--------------------------------------------------------
#
# Given the number of columns for file 1 and file 2
# Create the join command segment that keeps those columns.
# &make_col($ncol1, $ncol2);
#--------------------------------------------------------
sub make_col
{
    my $first = shift @_;
    my $second = shift @_;
    my @cmd;
    
    push @cmd, 0;
    my $count1 = 2;
    my $count2 = 2;
    
    while ($count1 <= $first)
    {
        push @cmd, "1.$count1";
        $count1 ++;
    }
    
    while ($count2 <= $second)
    {
        push @cmd, "2.$count2";
        $count2 ++;
    }
    
    my $cmd = join (",", @cmd);
    return $cmd;
}
#--------------------------------------------------------







MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3