[Bio] / FigKernelScripts / exp_pc_compute.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/exp_pc_compute.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2011 University of Chicago and Fellowship
5 :     # for Interpretations of Expressions. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Expressions at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     =head1 Compute Pearson Correlation Coefficients
21 :    
22 :     exp_pc_compute <exp_directory>
23 :    
24 :     Compute the pearson coefficients for the gene-to-gene correlations in the
25 :     specified expression data directory. The coefficients greater than 0.5 will
26 :     be put in the C<pearson.tbl> file in the directory.
27 :    
28 :     The single positional parameter is the name of the directory containing the
29 :     expression data.
30 :    
31 :     The optional command-line options are as follows.
32 :    
33 :     =over 4
34 :    
35 :     =item user
36 :    
37 :     Name suffix to be used for log files. If omitted, the PID is used.
38 :    
39 :     =item trace
40 :    
41 :     Numeric trace level. A higher trace level causes more messages to appear. The
42 :     default trace level is 2. Tracing will be directly to the standard output
43 :     as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
44 :     where I<User> is the value of the B<user> option above.
45 :    
46 :     =item background
47 :    
48 :     Save the standard and error output to files. The files will be created
49 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
50 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
51 :     B<user> option above.
52 :    
53 : parrello 1.2 =item recursive
54 :    
55 :     If specified, then instead of treating the positional parameter as an expression
56 :     directory, it is treated as a directory of directories, and all the sub-directories
57 :     are processed.
58 :    
59 : parrello 1.7 =item passive
60 :    
61 :     If specified, then no change will be made to the expression directory if the
62 :     pearson coefficients are already computed.
63 :    
64 : parrello 1.1 =item h
65 :    
66 :     Display this command's parameters and options.
67 :    
68 :     =back
69 :    
70 :     =cut
71 :    
72 :     use strict;
73 :     use Statistics::Descriptive;
74 :     use Tracer;
75 :    
76 :     # Get the command-line options and parameters.
77 : parrello 1.2 my ($options, @parameters) = StandardSetup([qw() ],
78 : parrello 1.7 {
79 :     recursive => ["", "if specified, all child directories will be processed"],
80 :     passive => ["", "if specified, coefficients will only be computed if they don't already exist"],
81 :     },
82 : parrello 1.1 "<expressionDirectory>",
83 :     @ARGV);
84 :     # Get the expression directory.
85 : parrello 1.3 my $mainDirectory = $parameters[0];
86 :     if (! -d $mainDirectory) {
87 : parrello 1.1 # It wasn't found, so complain.
88 : parrello 1.3 Trace("Directory $mainDirectory not found.") if T(0);
89 : parrello 1.1 } else {
90 : parrello 1.2 # Get the list of directories to process.
91 :     my @dirs;
92 : parrello 1.5 if (! $options->{recursive}) {
93 : parrello 1.2 # Direct mode. Process the incoming directory.
94 : parrello 1.4 Trace("Processing $mainDirectory.") if T(2);
95 : parrello 1.3 push @dirs, $mainDirectory;
96 : parrello 1.2 } else {
97 :     # Recursive mode. Process the subdirectories.
98 : parrello 1.4 Trace("Recursive mode. Processing genomes under $mainDirectory.") if T(2);
99 : parrello 1.3 if (opendir(my $dh, $mainDirectory)) {
100 : parrello 1.6 my @genomes = grep { $_ =~ /^\d+\.\d+$/ } readdir $dh;
101 : parrello 1.3 push @dirs, map { "$mainDirectory/$_" } @genomes;
102 : parrello 1.2 }
103 :     }
104 : parrello 1.3 # Process all the directories.
105 :     for my $expDataDirectory (@dirs) {
106 : parrello 1.7 # Only proceed if we are NOT passive OR there is no pearson file.
107 :     if ($options->{passive} && -f "$expDataDirectory/pearson.tbl") {
108 :     Trace("Directory $expDataDirectory skipped due to passive mode.") if T(2);
109 : parrello 1.3 } else {
110 : parrello 1.7 # Get the raw data file.
111 :     my $raw_data_file = "$expDataDirectory/rma_normalized.tab";
112 :     if (! -f $raw_data_file) {
113 :     Trace("No rma_normalized.tab file found in $expDataDirectory.") if T(0);
114 :     } else {
115 :     my $ih = Open(undef, "<$raw_data_file");
116 :     Trace("Processing expression data from $raw_data_file.") if T(2);
117 :     # Read the list of experiments. We don't need it, so the data is
118 :     # allowed to disappear.
119 :     Tracer::GetLine($ih);
120 :     # We'll put the signal values in here. For each feature, we will
121 :     # have a list of signals in the same order as the list of experiments.
122 :     my $corrH = {};
123 :     while (! eof $ih) {
124 :     my ($fid, @signals) = Tracer::GetLine($ih);
125 :     # Store the signals in the hash.
126 :     $corrH->{$fid} = \@signals;
127 :     }
128 :     # Close the raw data file.
129 :     close $ih;
130 :     # Now we compute the correlation coefficients. Anything over 0.5 gets stored
131 :     # in the database.
132 :     my @fids = sort keys %$corrH;
133 :     Trace("Computing correlation coefficients.") if T(2);
134 :     # Compute the Pearson coefficients for these features.
135 :     my $pcs = compute_pc(\@fids, $corrH);
136 :     # Open the pearson.tbl file for output.
137 :     my $oh = Open(undef, ">$expDataDirectory/pearson.tbl");
138 :     # Loop through the features, generating Pearson coefficients for each
139 :     # pair.
140 :     Trace("Storing correlation coefficients.") if T(2);
141 :     my $found = 0;
142 :     for (my $i = 0; $i <= $#fids; $i++) {
143 :     my $fid1 = $fids[$i];
144 :     for (my $j = $i + 1; $j <= $#fids; $j++) {
145 :     my $fid2 = $fids[$j];
146 :     # Get the pearson coefficient from the hash. We assume 0 if
147 :     # the coefficient isn't found.
148 :     my $pc = $pcs->{$fid1}{$fid2} || 0;
149 :     # Only store the coefficient if it's 0.5 or more.
150 :     if ($pc >= 0.5) {
151 :     # Store both directions of the correlation.
152 :     print $oh "$fid1\t$fid2\t$pc\n";
153 :     # Trace our progress.
154 :     if (($found++ % 5000) == 0) {
155 :     Trace("$found coefficient pairs stored.") if T(3);
156 :     }
157 : parrello 1.3 }
158 : parrello 1.1 }
159 :     }
160 :     }
161 :     }
162 :     }
163 :     }
164 :    
165 :     =head3 compute_pc
166 :    
167 :     my $hash = compute_pc(\@gene_ids, \%gxp_hash);
168 :    
169 :     Compute the Pearson coefficients for each pair of features in the list of incoming
170 :     gene IDs. The coefficients will indicate the correlation between the features' gene
171 :     expression lists from the incoming hash.
172 :    
173 :     =over 4
174 :    
175 :     =item gene_ids
176 :    
177 :     List of feature IDs for which correlation coefficients are desired.
178 :    
179 :     =item gxp_hash
180 :    
181 :     A hash mapping each feature ID to a list of normalized expression values.
182 :    
183 :     =item RETURN
184 :    
185 :     Returns a reference to a hash of hashes keyed by feature ID. Each feature ID
186 :     is mapped to a sub-hash that maps the other feature IDs to the appropriate
187 :     Pearson coefficients.
188 :    
189 :     =back
190 :    
191 :     =cut
192 :    
193 :     sub compute_pc {
194 :     my ($gene_ids, $gxp_hash) = @_;
195 :     my %values = map { $_ => {} } @$gene_ids;
196 :     require Statistics::Descriptive;
197 :     for (my $i = 0; $i < @$gene_ids-1; $i++) {
198 :     my $stat = Statistics::Descriptive::Full->new();
199 : parrello 1.8 my $iData = $gxp_hash->{$gene_ids->[$i]};
200 :     $stat->add_data(@$iData);
201 : parrello 1.1
202 :     for (my $j = $i+1; $j < @$gene_ids; $j++)
203 :     {
204 : parrello 1.8 my $jData = $gxp_hash->{$gene_ids->[$j]};
205 :     my ($q, $m, $r, $err) = $stat->least_squares_fit(@$jData);
206 : parrello 1.1 $values{$gene_ids->[$i]}->{$gene_ids->[$j]} = $r;
207 :     $values{$gene_ids->[$j]}->{$gene_ids->[$i]} = $r;
208 :     }
209 :     }
210 :     return \%values;
211 :     }
212 :    
213 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3