[Bio] / FigKernelScripts / compute_pc.pl Repository:
ViewVC logotype

View of /FigKernelScripts/compute_pc.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Mon Apr 19 20:22:16 2010 UTC (9 years, 11 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2010_0526, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011
code from Hope

use Statistics::Descriptive;
use strict;

# read the raw_data file from STDIN; first line is a header with experiment names
# subsequent rows contain a gene id in the first column and experiment values
# in the other columns
<STDIN>; # skip header

# store expression values in a hash as an array indexed by gene id
my %gene_to_values;
my @gene_ids;

while (<STDIN>)
{
    chomp;
    my ($gene_id, @gxp_values) = split("\t");
    $gene_to_values{$gene_id} = \@gxp_values;
    push @gene_ids, $gene_id;
}


# NOW THAT YOU HAVE READ IN THE RAW DATA YOU CAN COMPUTE PEARSON CORRELATIONS.
# CALCULATE PEARSON CORRELATIONS FOR FIRST 10 GENES - THIS IS AN EXAMPLE.

my @ten = @gene_ids[0..9];
print STDERR "Computing pc\n";
my $values = compute_pc(\@ten, \%gene_to_values);

# print the pearson correlations
foreach my $gene_id (keys %$values)
{
    foreach my $gene_id2 (keys %{$values->{$gene_id}})
    {
	print $gene_id, "\t", $gene_id2, "\t", $values->{$gene_id}->{$gene_id2}, "\n";
    }
}

# END OF EXAMPLE


# compute the perason correlations for a set of gene ids
# returns a hash of hashes - keys are gene ids; internal hashes have correlation values
# correlation values are stored twice so that either gene id can be used for the first
# hash and the other gene id for the internal hash
sub compute_pc
{
    my ($gene_ids, $gxp_hash) = @_;
    my %values = ();

    for (my $i = 0; $i < @$gene_ids-1; $i++)
    {
	my $stat = Statistics::Descriptive::Full->new();
	$stat->add_data(@{$gxp_hash->{$gene_ids->[$i]}});

	for (my $j = $i+1; $j < @$gene_ids; $j++)
	{
	    my ($q, $m, $r, $err) = $stat->least_squares_fit(@{$gxp_hash->{$gene_ids->[$j]}});
	    $values{$gene_ids[$i]}->{$gene_ids[$j]} = $r;
	    $values{$gene_ids[$j]}->{$gene_ids[$i]} = $r;
	}
    }
    
    return \%values;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3