[Bio] / FigKernelPackages / ExpressionDir.pm Repository:
ViewVC logotype

View of /FigKernelPackages/ExpressionDir.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Fri Jan 14 20:06:51 2011 UTC (9 years, 5 months ago) by olson
Branch: MAIN
CVS Tags: rast_rel_2011_0119
Changes since 1.4: +70 -4 lines
update

package ExpressionDir;

use Data::Dumper;
use strict;
use SeedAware;
use File::Copy;
use File::Temp 'tempfile';
use File::Spec::Functions;
use base 'Class::Accessor';
use Carp;
use Fcntl ':seek';
    
__PACKAGE__->mk_accessors(qw(genome_dir expr_dir genome_id));

our @probe_parsers = qw(parse_probe_format_1lq
			parse_probe_format_1
			parse_probe_format_2
			parse_probe_format_3);

=head3 new

    my $edir = ExpressionDir->new($expr_dir);

Create a new ExpressionDir object from an existing expression dir.


=cut

sub new
{
    my($class, $expr_dir) = @_;

    my $gfile = catfile($expr_dir, "GENOME_ID");
    open(GF, "<", $gfile) or die "Cannot open $expr_dir/GENOME_ID: $!";
    my $genome_id = <GF>;
    chomp $genome_id;
    close(GF);
    
    my $self = {
	genome_dir => catfile($expr_dir, $genome_id),
	genome_id => $genome_id,
	expr_dir => $expr_dir,
    };
    return bless $self, $class;
}

=head3 create

    my $edir = ExpressionDir->create($expr_dir, $genome_id, $genome_src)

Create a new expression directory from the given genome id and genome data source. 
The data source is either a FIG or FIGV object, or a SAPserver object
that points at a server from which the data can be extracted.

=cut

sub create
{
    my($class, $expr_dir, $genome_id, $genome_src) = @_;

    if (! -d $expr_dir)
    {
	mkdir($expr_dir);
    }
    my $genome_dir = catfile($expr_dir, $genome_id);
    if (! -d $genome_dir)
    {
	mkdir($genome_dir);
    }

    open(GF, ">", catfile($expr_dir, "GENOME_ID"));
    print GF "$genome_id\n";
    close(GF);
    
    my $self = {
	genome_dir => $genome_dir,
	genome_id => $genome_id,
	expr_dir => $expr_dir,
    };
    bless $self, $class;

    if (ref($genome_src) =~ /^FIG/)
    {
	$self->create_from_fig($genome_src);
    }
    elsif (ref($genome_src =~ /^SAP/))
    {
	$self->create_from_sap($genome_src);
    }
    else
    {
	confess "Unknown genome source\n";
    }
    return $self;
}

sub create_from_fig
{
    my($self, $fig) = @_;

    my $gdir = $fig->organism_directory($self->genome_id);

    if (! -d $gdir)
    {
	confess "Genome directory $gdir not found";
    }

    copy(catfile($gdir, "contigs"), catfile($self->genome_dir, "contigs"));
    mkdir(catfile($self->genome_dir, "Features"));
    my @pegs;
    for my $ftype (qw(peg rna))
    {
	my $ofdir = catfile($gdir, "Features", $ftype);
	my $nfdir = catfile($self->genome_dir, "Features", $ftype);
	
	if (open(OT, "<", "$ofdir/tbl"))
	{
	    mkdir($nfdir);
	    open(NT, ">", "$nfdir/tbl") or confess "Cannot write $nfdir/tbl: $!";
	    while (<OT>)
	    {
		my($id) = /^(\S+)\t/;
		if (!$fig->is_deleted_fid($id))
		{
		    print NT $_;
		    push(@pegs, $id) if $ftype eq 'peg';
		}
	    }
	    close(OT);
	    close(NT);
	}
	copy("$ofdir/fasta", "$nfdir/fasta");
    }

    open(SS, ">", catfile($self->genome_dir, "subsystem.data"));
    my %phash = $fig->subsystems_for_pegs_complete(\@pegs);
    for my $peg (keys %phash)
    {
	my $list = $phash{$peg};
	next unless $list;
	
	for my $ent (@$list)
	{
	    print SS join("\t", $peg, @$ent), "\n";
	}
    }
    close(SS);

    open(AF, ">", catfile($self->genome_dir, "assigned_functions"));
    my $fns = $fig->function_of_bulk(\@pegs);
    for my $peg (@pegs)
    {
	print AF join("\t", $peg, $fns->{$peg}), "\n";
    }
    close(AF);
}

sub create_from_sap
{
    my($self, $sap) = @_;
    confess "create_from_sap not yet implemented";
}

sub parse_probe_format_1lq
{
    my($self, $in_file, $out_file) = @_;

    my($fh);

    if ($in_file !~ /\.1lq$/)
    {
	return undef;
    }

    open($fh, "<", $in_file) or confess "Cannot open $in_file for reading: $!";

    my $out;
    open($out, ">", $out_file) or confess "Cannot open $out for writing: $!";

    # Skip 3 header lines.
    $_ = <$fh> for 1..3;
    
    while (defined($_ = <$fh>))
    {
	if ($_ =~ /(\d+)\s+(\d+)\s+([ACGT]+)\s+(-?\d+)\s/)
	{
	    if (length($3) < 15)
	    {
		close($fh);
		close($out);
		confess "Bad length at line $. of $in_file";
		return undef;
	    }
	    next if ($4 =~ /\d+3$/); #mismatch probe
	    my($x,$y,$seq) = ($1,$2,$3);
	    $seq = scalar reverse $seq;
	    print $out "$x\_$y\t$seq\n";
	}
	else
	{
	    #
	    # We expect some lines not to match.
	    #
	}
    }
    
    close($fh);
    close($out);
    return 1;
}

sub parse_probe_format_1
{
    my($self, $in_file, $out_file) = @_;

    my($fh);

    open($fh, "<", $in_file) or confess "Cannot open $in_file for reading: $!";
    my $l = <$fh>;
    chomp $l;
    $l =~ s/\r//;
    my @hdrs = split(/\t/, $l);
    my %hdrs;
    $hdrs{$hdrs[$_]} = $_ for 0..$#hdrs;

    my $x_col = $hdrs{"Probe X"};
    my $y_col = $hdrs{"Probe Y"};
    my $seq_col = $hdrs{"Probe Sequence"};
    if (!(defined($x_col) && defined($y_col) && defined($seq_col)))
    {
	close($fh);
	return undef;
    }

    my $out;
    open($out, ">", $out_file) or confess "Cannot open $out for writing: $!";

    while (<$fh>)
    {
	chomp;
	s/\r//g;
	my @flds = split(/\t/,$_);
	my($x,$y,$seq);
	$x = $flds[$x_col];
	$y = $flds[$y_col];
	$seq = $flds[$seq_col];
	my $id = "$x\_$y";
	print $out "$id\t$seq\n";
    }
    close($fh);
    close($out);
    return 1;
}

sub parse_probe_format_2
{
    my($self, $in_file, $out_file) = @_;

    my($fh);

    local $/ = "\n>";

    open($fh, "<", $in_file) or confess "Cannot open $in_file for reading: $!";
    my $l = <$fh>;
    chomp $l;
    $l =~ s/\r//;

    if ($l !~ /^>?\S+:(\d+):(\d+);\s+Interrogation_Position=\d+;\s+Antisense;\n([ACGT]+)/s)
    {
	close($fh);
	return undef;
    }
    seek($fh, 0, SEEK_SET);

    my $out;
    open($out, ">", $out_file) or confess "Cannot open $out for writing: $!";

    while (<$fh>)
    {
	chomp;

	if ($_ =~ /^>?\S+:(\d+):(\d+);\s+Interrogation_Position=\d+;\s+Antisense;\n([ACGT]+)/s)
	{
	    if (length($3) < 15)
	    {
		close($fh);
		confess "Bad length at line $. of $in_file";
	    }
	    print $out "$1\_$2\t$3\n";
	}
	else
	{
	    confess "Bad input at line $. of $in_file";
	}
    }
    close($out);
    close($fh);
    return 1;
}

#
# Our "native" format, used for passing through pre-parsed data.
#
sub parse_probe_format_3
{
    my($self, $in_file, $out_file) = @_;

    my($fh);

    open($fh, "<", $in_file) or confess "Cannot open $in_file for reading: $!";
    my $l = <$fh>;
    chomp $l;
    $l =~ s/\r//;

    if ($l !~ /^\d+_\d+\t[ACGT]+$/)
    {
	close($fh);
	return undef;
    }
    seek($fh, 0, SEEK_SET);

    my $out;
    open($out, ">", $out_file) or confess "Cannot open $out for writing: $!";

    while (<$fh>)
    {
	if ($_ =~ /^\d+_\d+\t[ACGT]+$/)
	{
	    print $out $_;
	}
	else
	{
	    confess "Bad input at line $. of $in_file";
	}
    }
    close($out);
    close($fh);
    return 1;
}

sub compute_probe_to_peg
{
    my($self, $probes) = @_;

    my($probe_suffix) = $probes =~ /(\.[^.]+)$/;

    my $my_probes = catfile($self->expr_dir, "probes.in$probe_suffix");
    
    copy($probes, $my_probes) or confess "Cannot copy $probes to $my_probes: $!";

    my $probes_fasta = catfile($self->expr_dir, "probes");

    #
    # Attempt to translate probe file.
    #
    my $success;
    for my $meth (@probe_parsers)
    {
	if ($self->$meth($my_probes, $probes_fasta))
	{
	    print STDERR "Translated $probes to $probes_fasta using $meth\n";
	    $success = 1;
	    last;
	}
	else
	{
	    print STDERR "Failed to translate $probes to $probes_fasta using $meth\n";
	}
    }
    if (!$success)
    {
	confess "Could not translate $probes\n";
    }

    my $peg_probe_table = catfile($self->expr_dir, 'peg.probe.table');
    my $probe_occ_table = catfile($self->expr_dir, 'probe.occ.table');

    my $feature_dir = catfile($self->genome_dir, "Features");
    my @tbls;
    for my $ftype (qw(peg rna))
    {
	my $tfile = catfile($feature_dir, $ftype, 'tbl');
	if (-f $tfile)
	{
	    push(@tbls, $tfile);
	}
    }
    if (@tbls == 0)
    {
	confess "Could not find any tbl files in $feature_dir";
    }

    $self->run([executable_for("make_probes_to_genes"),
		$probes_fasta,
		catfile($self->genome_dir, 'contigs'),
		$tbls[0],
		$peg_probe_table,
		$probe_occ_table,
		@tbls[1..$#tbls],
		],
	   { stderr => catfile($self->expr_dir, 'problems') });
			 

    $self->run([executable_for("remove_multiple_occurring_probes"),
		$peg_probe_table,
		$probe_occ_table],
	   { stdout => catfile($self->expr_dir, 'peg.probe.table.no.multiple') } );

    $self->make_missing_probes($peg_probe_table, $probes_fasta,
			       catfile($self->expr_dir, 'probe.no.match'));
}

sub make_missing_probes
{
    my($self, $probe_table, $probes, $output) = @_;
    open(MATCH,"<", $probe_table) or die "Cannot open $probe_table: $!";
    open(PROBES,"<", $probes) or die "Cannot open $probes: $!";
    open(OUTPUT, ">", $output) or die "Cannot open $output: $!";
    my %locations;
    while(<MATCH>)
    {
	chomp;
	my($peg,$loc)=split "\t";
	$locations{$loc} = $peg;
    }
    
    while(<PROBES>)
    {
	chomp;
	my($loc,$seq) = split "\t";
	print OUTPUT $loc, "\n" if ! exists $locations{$loc};
    }
    close(MATCH);
    close(PROBES);
    close(OUTPUT);
}

#
# we don't copy the experiment files in here because
# they may be very large. This may change.
#
# We do copy the cdf.
#
sub compute_rma_normalized
{
    my($self, $cdf_file, $expt_dir) = @_;

    my $my_cdf = catfile($self->expr_dir, "expr.cdf");
    copy($cdf_file, $my_cdf) or confess "Cannot copy $cdf_file to $my_cdf: $!";

    #
    # We need to build the R library for this cdf.
    #
    my($fh, $tempfile) = tempfile();
#m = make.cdf.package("S_aureus.cdf", cdf.path="..",packagename="foo",package.path="/tmp")

    my $cdf_path = $self->expr_dir;
    my $libdir = catfile($self->expr_dir, "r_lib");
    -d $libdir or mkdir $libdir;
    my $pkgdir = catfile($self->expr_dir, "r_pkg");
    -d $pkgdir or mkdir $pkgdir;

    print $fh "library(makecdfenv);\n";
    print $fh qq(make.cdf.package("expr.cdf", cdf.path="$cdf_path", packagename="datacdf", package.path="$pkgdir", species="genome name");\n);
    close($fh);
    system("Rscript", $tempfile);
    system("R", "CMD", "INSTALL", "-l", $libdir, "$pkgdir/datacdf");

    local($ENV{R_LIBS}) = $libdir;
    $self->run([executable_for("RunRMA"),
		"data",
		catfile($self->expr_dir, "peg.probe.table"),
		catfile($self->expr_dir, "probe.no.match"),
		$expt_dir,
		$self->expr_dir]);
	       
    my $output = catfile($self->expr_dir, "rma_normalized.tab");
    if (! -f $output)
    {
	confess("Output file $output was not generated");
    }
}

sub compute_atomic_regulons
{
    my($self, $pearson_cutoff) = @_;

    $pearson_cutoff ||= 0.7;

    my $coreg_clusters = catfile($self->expr_dir, "coregulated.clusters");
    my $coreg_subsys = catfile($self->expr_dir, "coregulated.subsys");
    my $merged_clusters = catfile($self->expr_dir, "merged.clusters");
    my $probes_always_on = catfile($self->expr_dir, "probes.always.on");
    my $pegs_always_on = catfile($self->expr_dir, "pegs.always.on");

    $self->run([executable_for("call_coregulated_clusters_on_chromosome"), $self->expr_dir],
	   { stdout => $coreg_clusters });
    $self->run([executable_for("make_coreg_conjectures_based_on_subsys"), $self->expr_dir],
	   { stdout => $coreg_subsys });

    $self->run([executable_for("filter_and_merge_gene_sets"), $self->expr_dir, $coreg_clusters, $coreg_subsys],
	   { stdout => $merged_clusters });
    $self->run([executable_for("get_ON_probes"), $self->expr_dir, $probes_always_on, $pegs_always_on]);

    if (-s $probes_always_on == 0)
    {
	confess "No always-on probes were found";
    }

    $self->run([executable_for("Pipeline"), $pegs_always_on, $merged_clusters, $self->expr_dir],
	   { stdout => catfile($self->expr_dir, "comments.by.Pipeline.R") });

    $self->run([executable_for("SplitGeneSets"), $merged_clusters, $pearson_cutoff, $self->expr_dir],
	   { stdout => catfile($self->expr_dir, "split.clusters") });
    
    $self->run([executable_for("compute_atomic_regulons_for_dir"), $self->expr_dir]);
}

sub run
{
    my($self, $cmd, $redirect) = @_;

    print "Run @$cmd\n";
    my $rc = system_with_redirect($cmd, $redirect);
    if ($rc != 0)
    {
	confess "Command failed: @$cmd\n";
    }
}

sub get_experiment_names
{
    my($self) = @_;
    my $f = catfile($self->expr_dir, "experiment.names");
    my $fh;
    open($fh, "<", $f) or confess "Could not open $f: $!";
    my @out = map { chomp; my($num, $name) = split(/\t/); $name } <$fh>;
    close($fh);
    return @out;
}

sub get_experiment_on_off_calls
{
    my($self, $expt) = @_;

    my $f= catfile($self->expr_dir, "final_on_off_calls.txt");
    my $fh;
    open($fh, "<", $f) or confess "Could not open $f: $!";
    my $names = <$fh>;
    chomp $names;
    my @names = split(/\t/, $names);
    my $idx = 0;
    my $expt_idx;
    foreach my $n (@names)
    {
	if ($n eq $expt)
	{
	    $expt_idx = $idx;
	    last;
	}
	$idx++;
    }
    if (!defined($expt_idx))
    {
	confess("Could not find experiment $expt in $f");
    }

    my $calls = {};
    while (<$fh>)
    {
	chomp;
	my($peg, @calls) = split(/\t/);
	#
	# +1 because the row[0] element is the peg, and our index is
	# zero-based.
	#
	$calls->{$peg} = $calls[$expt_idx + 1];
    }

    close($fh);
    return($calls);
	
}

=head3 save_model_gene_activity

    $e->save_model_gene_activity($data)

Save the results of a modeling run for a given experiment.

$data is of the form { experiment_id => $data_hash }

=cut

sub save_model_gene_activity
{
    my($self, $data) = @_;
}

sub all_features
{
    my($self, $type) = @_;

    my @ftypes;
    my $fdir = catfile($self->genome_dir, "Features");
    if (defined($type))
    {
	@ftypes = ($type);
    }
    else
    {
	opendir(D, $fdir);
	@ftypes = grep { -f catfile($fdir, $_) && /^\./ } readdir(D);
	closedir(D);
    }
    my @out;
    for my $ftype (@ftypes)
    {
	if (open(TBL, "<", catfile($fdir, $ftype, "tbl")))
	{
	    push(@out, map { /^(\S+)/; $1 } <TBL>);
	    close(TBL);
	}
    }
    return @out;
}

sub fid_locations
{
    my($self, $fids) = @_;

    my %fids;
    $fids{$_}++ for @$fids;

    my $genome_id = $self->genome_id;

    my $fdir = catfile($self->genome_dir, "Features");
    opendir(D, $fdir);
    my @ftypes = grep { -d catfile($fdir, $_) && ! /^\./ } readdir(D);
    closedir(D);
    
    my $out = {};
    for my $ftype (@ftypes)
    {
	if (open(TBL, "<", catfile($fdir, $ftype, "tbl")))
	{
	    while (<TBL>)
	    {
		my($id, $locs) = /^(\S+)\t(\S+)\t/;
		
		if ($fids{$id})
		{
		    $out->{$id} = "$genome_id:" . SeedUtils::boundary_loc($locs);
		}
	    }
	    close(TBL);
	}
    }
    return $out;
}

sub ids_in_subsystems
{
    my($self) = @_;

    open(SS, "<", catfile($self->genome_dir, "subsystem.data"));
    my $res = {};
    while (<SS>)
    {
	chomp;
	my($peg, $ss, $role, $variant) = split(/\t/);
	$ss =~ s/\s+/_/g;
	push(@{$res->{$ss}->{$role}}, $peg);
    }
    close(SS);
    return $res;
}

sub ids_to_subsystems
{
    my($self, $ids) = @_;

    my %ids;
    $ids{$_} = 1 for @$ids;

    open(SS, "<", catfile($self->genome_dir, "subsystem.data"));
    my $res = {};
    while (<SS>)
    {
	chomp;
	my($peg, $ss, $role, $variant) = split(/\t/);
	if ($ids{$peg})
	{
	    push(@{$res->{$peg}}, $ss);
	}
    }
    close(SS);
    return $res;
}

sub ids_to_functions
{
    my($self, $ids) = @_;
    open(AF, "<", catfile($self->genome_dir, "assigned_functions"));
    my %ids;
    $ids{$_} = 1 for @$ids;
    my $res = {};

    while (<AF>)
    {
	chomp;
	my($id, $fn) = split(/\t/);
	$res->{$id} = $fn if $ids{$id};
    }
    close(AF);
    return $res;
}

1;



MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3