[Bio] / FigKernelPackages / SampleDir.pm Repository:
ViewVC logotype

View of /FigKernelPackages/SampleDir.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Wed Oct 20 17:20:07 2010 UTC (9 years, 4 months ago) by olson
Branch: MAIN
new module

#
# Utilities for dealing with samples, function summaries,
# OTU summaries, and the vectors that represent them.
#

#
# A sample is stored in a directory.
#
# The raw DNA for the sample is in sample.fa
# Metadata about the sample is stored in a set of flat files:
#
# NAME
# DESCRIPTION
#
# Each analysis is stored in a directory
#
#  dataset-name/idx
#
# where dataset-name is the name of the Kmer dataset used for the analysis,
# and idx is a numeric index. Each analysis with a different set of
# parameters is stored in its own directory.
#
# The processing parameters are stored in a YAML file params.yml.
# The parameters file contains a serialized hash with the following keys:
#    kmer:		kmer size
#    max_gap:		max-gap parameter
#    min_size:		min-size parameter
#    dataset:		Kmer dataset identifier
#

package SampleDir;

use strict;
use SeedUtils;
use ANNOserver;
use Data::Dumper;
use YAML::Any qw(Dump Load DumpFile LoadFile);

use base qw(Class::Accessor);

__PACKAGE__->mk_accessors(qw(dir sequence_type anno));

our %kmer_defaults = (-kmer => 8,
		      -minHits => 3,
		      -maxGap => 600);

sub create
{
    my($class, $dir, $sample_file) = @_;

    if (-d $dir)
    {
	die "SampleDir::create: directorty $dir already exists\n";
    }

    if (! -f $sample_file)
    {
	die "SampleDir::create: sample file $sample_file does not exist\n";
    }

    mkdir($dir) or die "SampleDir::create: mkdir $dir failed: $!";
    my $seq_type = SeedUtils::validate_fasta_file($sample_file, "$dir/sample.fa");

    my $obj = $class->new($dir);
    $obj->sequence_type($seq_type);

    return $obj;
}

sub new
{
    my($class, $dir, %args ) = @_;

    my $self = {
	dir => $dir,
	anno => ANNOserver->new(exists($args{-url}) ? (-url => $args{-url}) : ()),
	args => \%args,
    };

    return bless $self, $class;
}

sub name
{
    my($self) = @_;
    my $n = &SeedUtils::file_head("$self->{dir}/NAME",1 );
    chomp $n;
    return $n;
}

sub description
{
    my($self) = @_;
    my $n = &SeedUtils::file_head("$self->{dir}/DESCRIPTION",1 );
    chomp $n;
    return $n;
}

sub perform_basic_analysis
{
    my($self, %args) = @_;

    my %params = %kmer_defaults;
    for my $k (keys %args)
    {
	if (defined($params{$k}))
	{
	    $params{$k} = $args{$k};
	}
    }

    #
    # If dataset not specified, look up the current
    # default and specify it with all future calls.
    #
    if (!defined($params{-kmerDataset}))
    {
	my $ds = $self->anno->get_dataset();
	$params{-kmerDataset} = $ds;
	print "Using default dataset $ds\n";
    }

    #
    # Create the dataset directory if not already present.
    #

    my $ds = $params{-kmerDataset};
    my $ds_dir = "$self->{dir}/$ds";
    if (!-d $ds_dir)
    {
	mkdir($ds_dir) or die "Cannot mkdir $ds_dir: $!";
    }
    
    #
    # Find a new analysis dir pathname. Start at zero.
    #
    my $analysis_num = 0;
    my $analysis_dir = "$ds_dir/$analysis_num";
    while (-e $analysis_dir)
    {
	$analysis_num++;
	$analysis_dir = "$ds_dir/$analysis_num";
    }

    mkdir($analysis_dir) or die "Cannot mkdir $analysis_dir: $!";
    
    print STDERR "Using analysis dir $analysis_dir\n";

    DumpFile("$analysis_dir/params.yml", \%params);

    my $fh;
    my $sample = "$self->{dir}/sample.fa";
    if (!open($fh, "<", $sample))
    {
	die "Cannot open sample file $sample: $!";
    }
    my %otu_summary;
    my %fn_summary;

    my $details_fh;
    open($details_fh, ">", "$analysis_dir/sample.out") or die "Cannot write $analysis_dir/sample.out: $!";

    my $otu_sum_fh;
    open($otu_sum_fh, ">", "$analysis_dir/sample.otu.sum") or die "Cannot write $analysis_dir/sample.otu.sum: $!";

    my $fn_sum_fh;
    open($fn_sum_fh, ">", "$analysis_dir/sample.fn.sum") or die "Cannot write $analysis_dir/sample.fn.sum: $!";

    my $rh = $self->anno->assign_functions_to_dna(-input => $fh, %params);
    my $totF = 0;
    my $totO = 0;
    while (my $res = $rh->get_next())
    {
	my($id, $tuple) = @$res;

	my($count, $start, $stop, $func, $otu) = @$tuple;

	my $loc = "${id}_${start}_${stop}";
	print $details_fh "$id\t$count\t$loc\t$func\t$otu\n";

	if (defined($otu))
	{
	    $otu_summary{$otu}++;
	    $totO++;
	}
	if (defined($func))
	{
	    $fn_summary{$func}++;
	    $totF++;
	}
    }

    close($details_fh);
    
    for my $fn (sort { $fn_summary{$b} <=> $fn_summary{$a} } keys %fn_summary)
    {
	print $fn_sum_fh join("\t",
			      $fn_summary{$fn},
			      sprintf("%0.6f", $fn_summary{$fn} / $totF),
			      $fn), "\n";
    }
    close($fn_sum_fh);
    
    for my $otu (sort { $otu_summary{$b} <=> $otu_summary{$a} } keys %otu_summary)
    {
	print $otu_sum_fh join("\t",
			      $otu_summary{$otu},
			      sprintf("%0.6f", $otu_summary{$otu} / $totO),
			      $otu), "\n";
    }
    close($otu_sum_fh);

}
    

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3