[Bio] / FigKernelScripts / exp_pc_compute.pl Repository:
ViewVC logotype

View of /FigKernelScripts/exp_pc_compute.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (download) (as text) (annotate)
Mon Nov 28 19:33:32 2011 UTC (7 years, 11 months ago) by parrello
Branch: MAIN
CVS Tags: rast_rel_2014_0912, rast_rel_2014_0729, mgrast_release_3_1_2, mgrast_version_3_2, mgrast_dev_12152011, HEAD
Changes since 1.9: +1 -1 lines
Updated to process directories in order.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2011 University of Chicago and Fellowship
# for Interpretations of Expressions. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Expressions at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

=head1 Compute Pearson Correlation Coefficients

    exp_pc_compute <exp_directory>

Compute the pearson coefficients for the gene-to-gene correlations in the
specified expression data directory. The coefficients greater than 0.5 will
be put in the C<pearson.tbl> file in the directory.

The single positional parameter is the name of the directory containing the
expression data.

The optional command-line options are as follows.

=over 4

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Tracing will be directly to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item recursive

If specified, then instead of treating the positional parameter as an expression
directory, it is treated as a directory of directories, and all the sub-directories
are processed.

=item passive

If specified, then no change will be made to the expression directory if the
pearson coefficients are already computed.

=item h

Display this command's parameters and options.

=back

=cut

    use strict;
    use Statistics::Descriptive;
    use Tracer;

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw() ],
                                           {
                                            recursive => ["", "if specified, all child directories will be processed"],
                                            passive => ["", "if specified, coefficients will only be computed if they don't already exist"],
                                           },
                                           "<expressionDirectory>",
                                           @ARGV);
# Get the expression directory.
my $mainDirectory = $parameters[0];
if (! -d $mainDirectory) {
    # It wasn't found, so complain.
    Trace("Directory $mainDirectory not found.") if T(0);
} else {
    # Get the list of directories to process.
    my @dirs;
    if (! $options->{recursive}) {
        # Direct mode. Process the incoming directory.
        Trace("Processing $mainDirectory.") if T(2);
        push @dirs, $mainDirectory;
    } else {
        # Recursive mode. Process the subdirectories.
        Trace("Recursive mode. Processing genomes under $mainDirectory.") if T(2);
        if (opendir(my $dh, $mainDirectory)) {
            my @genomes = grep { $_ =~ /^\d+\.\d+$/ } readdir $dh;
            push @dirs, map { "$mainDirectory/$_" } @genomes;
        }
    }
    # Process all the directories.
    for my $expDataDirectory (sort @dirs) {
        # Only proceed if we are NOT passive OR there is no pearson file.
        if ($options->{passive} && -f "$expDataDirectory/pearson.tbl") {
            Trace("Directory $expDataDirectory skipped due to passive mode.") if T(2);
        } else {
            # Get the raw data file.
            my $raw_data_file = "$expDataDirectory/rma_normalized.tab";
            if (! -f $raw_data_file) {
                Trace("No rma_normalized.tab file found in $expDataDirectory.") if T(0);
            } else {
                my $ih = Open(undef, "<$raw_data_file");
                Trace("Processing expression data from $raw_data_file.") if T(2);
                # Read the list of experiments. We don't need it, so the data is
                # allowed to disappear.
                Tracer::GetLine($ih);
                # We'll put the signal values in here. For each feature, we will
                # have a list of signals in the same order as the list of experiments.
                my $corrH = {};
                while (! eof $ih) {
                    my ($fid, @signals) = Tracer::GetLine($ih);
                    # Store the signals in the hash.
                    $corrH->{$fid} = \@signals;
                }
                # Close the raw data file.
                close $ih;
                # Now we compute the correlation coefficients. Anything over 0.5 gets stored
                # in the database.
                my @fids = sort keys %$corrH;
                Trace("Computing correlation coefficients.") if T(2);
                # Compute the Pearson coefficients for these features.
                my $pcs = compute_pc(\@fids, $corrH);
                # Open the pearson.tbl file for output.
                my $oh = Open(undef, ">$expDataDirectory/pearson.tbl");
                # Loop through the features, generating Pearson coefficients for each
                # pair.
                Trace("Storing correlation coefficients.") if T(2);
                my $found = 0;
                for (my $i = 0; $i <= $#fids; $i++) {
                    my $fid1 = $fids[$i];
                    for (my $j = $i + 1; $j <= $#fids; $j++) {
                        my $fid2 = $fids[$j];
                        # Get the pearson coefficient from the hash. We assume 0 if
                        # the coefficient isn't found.
                        my $pc = $pcs->{$fid1}{$fid2} || 0;
                        # Only store the coefficient if it's 0.5 or more.
                        if ($pc >= 0.5 || $pc <= -0.5) {
                            # Store both directions of the correlation.
                            print $oh "$fid1\t$fid2\t$pc\n";
                            # Trace our progress.
                            if (($found++ % 5000) == 0) {
                                Trace("$found coefficient pairs stored.") if T(3);
                            }
                        }
                    }
                }
            }
        }
    }
}

=head3 compute_pc

    my $hash = compute_pc(\@gene_ids, \%gxp_hash);

Compute the Pearson coefficients for each pair of features in the list of incoming
gene IDs. The coefficients will indicate the correlation between the features' gene
expression lists from the incoming hash.

=over 4

=item gene_ids

List of feature IDs for which correlation coefficients are desired.

=item gxp_hash

A hash mapping each feature ID to a list of normalized expression values.

=item RETURN

Returns a reference to a hash of hashes keyed by feature ID. Each feature ID
is mapped to a sub-hash that maps the other feature IDs to the appropriate
Pearson coefficients.

=back

=cut

sub compute_pc {
    my ($gene_ids, $gxp_hash) = @_;
    my %values = map { $_ => {} } @$gene_ids;
    require Statistics::Descriptive;
    for (my $i = 0; $i < @$gene_ids-1; $i++) {
        my $stat = Statistics::Descriptive::Full->new();
        my $iData = $gxp_hash->{$gene_ids->[$i]};
        $stat->add_data(@$iData);

        for (my $j = $i+1; $j < @$gene_ids; $j++)
        {
            my $jData = $gxp_hash->{$gene_ids->[$j]};
            my ($q, $m, $r, $err) = $stat->least_squares_fit(@$jData);
            $values{$gene_ids->[$i]}->{$gene_ids->[$j]} = $r;
            $values{$gene_ids->[$j]}->{$gene_ids->[$i]} = $r;
        }
    }
    return \%values;
}



MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3