[Bio] / FigKernelScripts / k-class-adaboost-classify.pl Repository:
ViewVC logotype

View of /FigKernelScripts/k-class-adaboost-classify.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Fri Apr 17 21:11:37 2015 UTC (4 years, 7 months ago) by jdavis
Branch: MAIN
CVS Tags: HEAD
adding k-class-adaboost-classify.pl

#! /usr/bin/env perl
use strict;
use Data::Dumper;
use Getopt::Long;

# Usage/help text. It is printed (via die) for -h, for unparsable options, and
# for invalid option combinations.
my $usage = "usage: k-class-adaboost-classify.pl [-d directory [-l list] | -f file] < boost file > value
    The inputs: -d Directory where the genomic kmer counts files are located
                -l List of kmer count files to use from the directory (optional), single column.
                -f File name, if a single genome kmer counts file is to be used
                -h Print this help message

                STDIN is the file of adaboost kmers.
                It is formatted kmer tab epsilon val tab alpha val new line

    The  entire workflow should be:
    1. k-class-adaboost-matrix.pl   This makes the matrix file.
    2. k-class-adaboost.pl          This gets you the set of kmers.
    3. k-class-adaboost-classify.pl This allows you to classify a new genome.";

my ($help, $single_file, $dir, $list);
my $rc = GetOptions('d=s' => \$dir,
                    'f=s' => \$single_file,
                    'h'   => \$help,
                    'l=s' => \$list);

# Getopt::Long has already reported the offending option on STDERR.
if (!$rc)
{
    die "$usage\n";
}
if ($help)
{
    die "$usage\n";
}

# Test for "defined and non-empty" rather than truthiness so that a file or
# directory literally named "0" is still accepted.
my $have_file = (defined $single_file && length $single_file) ? 1 : 0;
my $have_dir  = (defined $dir         && length $dir)         ? 1 : 0;
my $have_list = (defined $list        && length $list)        ? 1 : 0;

if ($have_file && ($have_dir || $have_list))
{
    die "\nCannot declare both a single file (-f) and list or dir (-d, -l)\n\n$usage\n";
}
if (!$have_file && !$have_dir)
{
    # Without this check the script would try to open an undefined directory.
    die "\nMust declare either a single file (-f) or a directory (-d)\n\n$usage\n";
}

# Read in the adaboost k-mers and values from stdin.
# Each line is: kmer <tab> epsilon (error) <tab> alpha. Only kmer => alpha is kept.
my %ckmers;
while (my $line = <>)
{
    chomp $line;
    my ($kmer, undef, $alpha) = split /\t/, $line;
    next unless (defined $kmer && length $kmer);    # ignore blank lines
    $ckmers{$kmer} = $alpha;
}

if ($have_file)
{
    process_file($single_file, \%ckmers);
}
else
{
    unless (-d $dir)
    {
        die "could not open the directory: $dir\n";
    }

    my @files;
    if ($have_list)
    {
        # Use exactly the files named in the list, de-duplicated, in the order
        # given, so the output order is reproducible.
        open(my $list_fh, '<', $list) or die "Cannot open list file: $list\n";
        my %seen;
        while (my $name = <$list_fh>)
        {
            chomp $name;
            next if ($name eq '' or $seen{$name}++);
            push @files, $name;
        }
        close($list_fh);
    }
    else
    {
        # Every non-hidden plain file in the directory, in sorted order.
        # Subdirectories are skipped: they are not k-mer count files.
        opendir(my $dh, $dir) or die "could not open the directory: $dir\n";
        @files = sort grep { $_ !~ /^\./ && -f "$dir/$_" } readdir($dh);
        closedir($dh);
    }

    # A named loop variable keeps the list elements safe from any code in the
    # callee that assigns to the global $_.
    for my $name (@files)
    {
        process_file("$dir/$name", \%ckmers);
    }
}
     
     
# process_file($file, \%alpha_of_kmer)
#
# Classify one genome from its k-mer counts file and print the result.
#
#   $file            path to a tab-separated k-mer counts file; only the first
#                    column (the k-mer) is consulted, so a k-mer counts as
#                    present whenever it is listed, whatever its count column
#                    says.
#   \%alpha_of_kmer  reference to a hash mapping each adaboost k-mer to its
#                    alpha weight. The hash is only read, never modified.
#
# The score is the sum of alpha over adaboost k-mers present in the file minus
# the sum of alpha over those absent. One line is printed to the currently
# selected output handle:
#
#   <file base name> TAB <class: 1, -1 or 0> TAB <score>
#
# Dies if the counts file cannot be opened.
sub process_file
{
    my ($file, $kmersr) = @_;

    # Report the file by its base name (everything up to the last '/' removed).
    my $file_name = $file;
    $file_name =~ s/.+\///g;

    # Lexical handle and a named line variable: the global $_ may be aliased
    # to the caller's loop element and must not be overwritten here.
    open(my $in, '<', $file) or die "Cannot open genomic kmer counts file: $file\n";
    my %matched;
    while (my $line = <$in>)
    {
        chomp $line;
        my ($kmer) = split /\t/, $line;
        $kmer = '' unless defined $kmer;    # blank line => empty k-mer
        if (exists $kmersr->{$kmer})
        {
            $matched{$kmer} = 1;
        }
    }
    close($in);

    # Start from 0 so an empty k-mer set still prints a numeric score.
    # Sorted keys give a fixed summation order, so the floating-point score
    # is reproducible from run to run.
    my $sum = 0;
    for my $kmer (sort keys %$kmersr)
    {
        if (exists $matched{$kmer})
        {
            $sum += $kmersr->{$kmer};
        }
        else
        {
            $sum -= $kmersr->{$kmer};
        }
    }

    my $class = $sum > 0 ?  1
              : $sum < 0 ? -1
              :             0;
    print "$file_name\t$class\t$sum\n";
}
     
     
     
     
     
     
     

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3