[Bio] / FigKernelScripts / Pipeline.R Repository:
ViewVC logotype

Annotation of /FigKernelScripts/Pipeline.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download)

1 : overbeek 1.1 args = commandArgs(TRUE);
2 : olson 1.4 if (length(args) != 3) { print("usage: Rscript Pipeline.R on_genes_file gene_sets_file output_dir"); q() }
3 : overbeek 1.2 on_genes_file = args[1];
4 :     gene_sets_file = args[2];
5 : olson 1.4 output_dir = args[3];
6 : overbeek 1.1
7 :     ## Read in gene expression data ##
8 :     ## This file is the output of 'exprs' from Bioconductor's RMA procedure
9 :     ## Header line contains experiment names
10 :     ## row name is gene id, rest of the columns are normalized expression values per experiment
11 : olson 1.4 all_geneexp=as.matrix(read.table(paste(output_dir, "rma_normalized.tab", sep="/"),header=TRUE));
12 : overbeek 1.1 num_genes = dim(all_geneexp)[1];
13 :     num_experiments = dim(all_geneexp)[2];
14 :    
15 : overbeek 1.2 ## gene_sets_files contains the gene set information ##
16 :     ## Each line is a comma separated value containing gene ids corresponding to the row names of the raw data ##
17 : overbeek 1.1 ## Need to convert these to row numbers in the raw data
18 : overbeek 1.2 input_genesets = readLines(gene_sets_file);
19 : overbeek 1.1 input_genesets = strsplit(input_genesets, ",");
20 : overbeek 1.2 split_genesets = list();
21 : overbeek 1.1
22 :     for (i in 1:length(input_genesets))
23 :     {
24 :     input_geneset = input_genesets[[i]];
25 : overbeek 1.2 split_geneset = c();
26 : overbeek 1.1 for (j in 1:length(input_geneset))
27 :     {
28 :     # check if the peg is in the raw_data
29 :     row_num = which(rownames(all_geneexp)==input_geneset[j]);
30 :    
31 :     if (length(row_num) > 0) {
32 : overbeek 1.2 split_geneset = cbind(split_geneset, row_num);
33 : overbeek 1.1 }
34 :     else {
35 :     print(c("Couldn't find row for", input_geneset[j]), quote=FALSE);
36 :     }
37 :     }
38 :    
39 : overbeek 1.2 if (length(split_geneset) > 1) {
40 :     split_genesets[[length(split_genesets)+1]] = split_geneset;
41 : overbeek 1.1 }
42 :     }
43 :    
44 : overbeek 1.2 print("Starting New Algorithm");
45 : overbeek 1.1
46 : overbeek 1.2 # Read genes that are the standard for being "on"
47 :     on_genes = readLines(on_genes_file);
48 :     # get gxp values for the on genes
49 :     on_values = matrix(0,nrow=length(on_genes),ncol=num_experiments);
50 : overbeek 1.1
51 : overbeek 1.2 for (i in 1:length(on_genes)) {
52 :     on_values[i,] = all_geneexp[which(rownames(all_geneexp)==on_genes[i]),];
53 :     i = i + 1;
54 : overbeek 1.1 }
55 :    
56 :     cutoffs=matrix(0,nrow=num_experiments,ncol=2);
57 :     for (i in 1:num_experiments)
58 :     {
59 : overbeek 1.2 # "on" cutoff
60 :     cutoffs[i,2]=quantile(on_values[,i],probs=.1);
61 :    
62 :     # "off" cutoff
63 :     cutoffs[i,1] = quantile(all_geneexp[which(all_geneexp[,i]<cutoffs[i,2]),i],probs=.8);
64 : overbeek 1.1 }
65 :    
66 : overbeek 1.2 diff_25 = quantile(cutoffs[,2]-cutoffs[,1],probs=.25);
67 : overbeek 1.1
68 : overbeek 1.2 for (i in 1:num_experiments)
69 : overbeek 1.1 {
70 : overbeek 1.2 if ((cutoffs[i,2]-cutoffs[i,1])<diff_25)
71 : overbeek 1.1 {
72 : overbeek 1.2 cutoffs[i,1]=cutoffs[i,2]-diff_25;
73 : overbeek 1.1 }
74 :     }
75 :    
76 : overbeek 1.2 cutoff_calls=matrix(0,nrow=num_genes,ncol=num_experiments);
77 : overbeek 1.1
78 : overbeek 1.2 for (i in 1:num_genes)
79 : overbeek 1.1 {
80 : overbeek 1.2 for (j in 1:num_experiments)
81 : overbeek 1.1 {
82 : overbeek 1.2 if (all_geneexp[i,j] < cutoffs[j,1])
83 :     {
84 :     cutoff_calls[i,j] = all_geneexp[i,j]-cutoffs[j,1];
85 :     }
86 :     else if (all_geneexp[i,j] > cutoffs[j,2])
87 :     {
88 :     cutoff_calls[i,j] = all_geneexp[i,j]-cutoffs[j,2];
89 :     }
90 : overbeek 1.1 }
91 :     }
92 :    
93 : overbeek 1.2 final_table = matrix(0,nrow=6,ncol=2);
94 : overbeek 1.1
95 : overbeek 1.2 print("Assessing consistency of initial cutoffs");
96 : overbeek 1.1
97 : overbeek 1.2 table2=matrix(0,nrow=length(split_genesets),ncol=num_experiments);
98 :     exptcalls = matrix(0,nrow=(length(split_genesets)*num_experiments), ncol=1);
99 : overbeek 1.1 counter = 1;
100 :    
101 : overbeek 1.2 for (i in 1:length(split_genesets))
102 : overbeek 1.1 {
103 : overbeek 1.2 geneset = split_genesets[[i]];
104 : overbeek 1.1 geneset_calls = matrix(0,nrow=length(geneset),ncol=num_experiments);
105 :     j=1;
106 :    
107 :     for (k in geneset)
108 :     {
109 : overbeek 1.2 geneset_calls[j,] = cutoff_calls[k,];
110 : overbeek 1.1 j=j+1;
111 :     }
112 :    
113 :     for (j in 1:num_experiments)
114 :     {
115 : overbeek 1.2 atleastoneon=sum(geneset_calls[,j] > 0);
116 : overbeek 1.1 atleastonegray=sum(geneset_calls[,j]==0);
117 : overbeek 1.2 atleastoneoff=sum(geneset_calls[,j] < 0);
118 : overbeek 1.1
119 :     ##1 means all genes in set are on for this experiment##
120 :     if (atleastoneon>=1 && atleastoneoff==0 && atleastonegray==0) {exptcalls[counter]=1;}
121 :     ##2 means all genes in set are off for this experiment##
122 : overbeek 1.2 else if (atleastoneon==0 && atleastoneoff>=1 && atleastonegray==0) {exptcalls[counter]=2;}
123 : overbeek 1.1 ##3 means all genes in set are either off or gray for this experiment##
124 : overbeek 1.2 else if (atleastoneon==0 && atleastoneoff>=1 && atleastonegray>=1) {exptcalls[counter]=3;}
125 : overbeek 1.1 ##4 means all genes in set are either on or gray for this experiment##
126 : overbeek 1.2 else if (atleastoneon>=1 && atleastoneoff==0 && atleastonegray>=1) {exptcalls[counter]=4;}
127 : overbeek 1.1 ##5 means at least one gene in set is off and at least one gene is on for this experiment##
128 : overbeek 1.2 else if (atleastoneon>=1 && atleastoneoff>=1)
129 :     {
130 :     exptcalls[counter]=5;
131 :     }
132 : overbeek 1.1 ##6 means all genes in set are gray for this experiment##
133 : overbeek 1.2 else if (atleastoneon==0 && atleastoneoff==0 && atleastonegray>=1) {exptcalls[counter]=6;}
134 : overbeek 1.1
135 :     table2[i,j] = exptcalls[counter];
136 :     counter = counter + 1;
137 :     }
138 :     }
139 :    
140 :    
141 :     for (i in 1:6) {
142 :     final_table[i,1]=sum(exptcalls==i);
143 :     }
144 :    
145 : overbeek 1.2 print("Geneset voting")
146 :    
147 :     final_calls=matrix(0,nrow=num_genes,ncol=num_experiments);
148 : overbeek 1.1
149 : overbeek 1.2 for (i in 1:num_genes)
150 :     {
151 :     for (j in 1:num_experiments)
152 :     {
153 :     if (cutoff_calls[i,j]>0) {final_calls[i,j]=1}
154 :     else if (cutoff_calls[i,j]<0) {final_calls[i,j]=-1}
155 :     }
156 :     }
157 :    
158 :     changed_calls=matrix(0,nrow=num_genes,ncol=num_experiments);
159 :     done = 0;
160 :    
161 :     while (done != 1)
162 :     {
163 :    
164 :     done = 1;
165 :     print("looping")
166 : overbeek 1.1
167 : overbeek 1.2 for (i in 1:length(split_genesets))
168 : overbeek 1.1 {
169 : overbeek 1.2 geneset = split_genesets[[i]];
170 :     geneset_cutoff_calls = matrix(0,nrow=length(geneset),ncol=num_experiments);
171 :     geneset_final_calls = matrix(0,nrow=length(geneset),ncol=num_experiments);
172 : overbeek 1.1 j=1;
173 :    
174 :     for (k in geneset)
175 :     {
176 : overbeek 1.2 geneset_cutoff_calls[j,] = cutoff_calls[k,];
177 :     geneset_final_calls[j,] = final_calls[k,];
178 : overbeek 1.1 j=j+1;
179 :     }
180 :    
181 :     for (j in 1:num_experiments)
182 :     {
183 : overbeek 1.2 cross_product_value = 0;
184 :    
185 :     if (sum(geneset_final_calls[,j] > 0) > sum(geneset_final_calls[,j] < 0))
186 :     {
187 :     cross_product_value = 1;
188 :     }
189 :     else if (sum(geneset_final_calls[,j] > 0) < sum(geneset_final_calls[,j] < 0))
190 :     {
191 :     cross_product_value = -1;
192 :     }
193 :     else if (mean(geneset_cutoff_calls[,j]) > .5)
194 :     {
195 :     cross_product_value = 1;
196 :     }
197 :     else if (mean(geneset_cutoff_calls[,j]) < -.5)
198 :     {
199 :     cross_product_value = -1;
200 :     }
201 : overbeek 1.1
202 : overbeek 1.2 if (cross_product_value != 0)
203 :     {
204 :     for (k in geneset)
205 :     {
206 :     if (changed_calls[k,j] != -1)
207 :     {
208 :     if (changed_calls[k,j] != 0 && final_calls[k,j] != cross_product_value)
209 :     {
210 :     final_calls[k,j] = 0;
211 :     changed_calls[k,j] = -1;
212 :     done = 0;
213 :     }
214 :     else if (final_calls[k,j] != cross_product_value)
215 :     {
216 :     final_calls[k,j]=cross_product_value;
217 :     changed_calls[k,j] = changed_calls[k,j] + 1;
218 :     done = 0;
219 :     }
220 :     }
221 :     }
222 :     }
223 : overbeek 1.1 }
224 :     }
225 :    
226 : overbeek 1.2 print(c("number of changed calls: ", sum(changed_calls==-1)));
227 : overbeek 1.1
228 :     }
229 :    
230 : overbeek 1.2 print("Assessing consistency of cutoffs after voting");
231 : overbeek 1.1
232 : overbeek 1.2 table3=matrix(0,nrow=length(split_genesets),ncol=num_experiments);
233 :     exptcalls = matrix(0,nrow=(length(split_genesets)*num_experiments), ncol=1);
234 : overbeek 1.1 counter = 1;
235 : overbeek 1.2 howmanyonoffgray = c();
236 : overbeek 1.1
237 : overbeek 1.2 for (i in 1:length(split_genesets))
238 : overbeek 1.1 {
239 : overbeek 1.2 geneset = split_genesets[[i]];
240 : overbeek 1.1 geneset_calls = matrix(0,nrow=length(geneset),ncol=num_experiments);
241 :     j=1;
242 :    
243 :     for (k in geneset)
244 :     {
245 : overbeek 1.2 geneset_calls[j,] = final_calls[k,];
246 : overbeek 1.1 j=j+1;
247 :     }
248 :    
249 :     for (j in 1:num_experiments)
250 :     {
251 : overbeek 1.2 atleastoneon=sum(geneset_calls[,j] > 0);
252 : overbeek 1.1 atleastonegray=sum(geneset_calls[,j]==0);
253 : overbeek 1.2 atleastoneoff=sum(geneset_calls[,j] < 0);
254 : overbeek 1.1
255 :     ##1 means all genes in set are on for this experiment##
256 :     if (atleastoneon>=1 && atleastoneoff==0 && atleastonegray==0) {exptcalls[counter]=1;}
257 :     ##2 means all genes in set are off for this experiment##
258 : overbeek 1.2 else if (atleastoneon==0 && atleastoneoff>=1 && atleastonegray==0) {exptcalls[counter]=2;}
259 : overbeek 1.1 ##3 means all genes in set are either off or gray for this experiment##
260 : overbeek 1.2 else if (atleastoneon==0 && atleastoneoff>=1 && atleastonegray>=1) {exptcalls[counter]=3;}
261 : overbeek 1.1 ##4 means all genes in set are either on or gray for this experiment##
262 : overbeek 1.2 else if (atleastoneon>=1 && atleastoneoff==0 && atleastonegray>=1) {exptcalls[counter]=4;}
263 : overbeek 1.1 ##5 means at least one gene in set is off and at least one gene is on for this experiment##
264 : overbeek 1.2 else if (atleastoneon>=1 && atleastoneoff>=1)
265 :     {
266 :     exptcalls[counter]=5;
267 :     howmanyonoffgray = rbind(howmanyonoffgray, c(i,j,atleastoneon,atleastoneoff,atleastonegray));
268 :     }
269 : overbeek 1.1 ##6 means all genes in set are gray for this experiment##
270 : overbeek 1.2 else if (atleastoneon==0 && atleastoneoff==0 && atleastonegray>=1) {exptcalls[counter]=6;}
271 : overbeek 1.1
272 : overbeek 1.2 table3[i,j] = exptcalls[counter];
273 : overbeek 1.1 counter = counter + 1;
274 :     }
275 :     }
276 :    
277 :    
278 :     for (i in 1:6) {
279 : overbeek 1.2 final_table[i,2]=sum(exptcalls==i);
280 : overbeek 1.1 }
281 :    
282 :     rownames(final_table)=c("all on", "all off", "off or gray", "on or gray", "inconsistent", "all gray");
283 : olson 1.4 write.table(final_table, file=paste(output_dir, "final_quality.txt", sep="/"), col.names=FALSE, row.names=TRUE, sep="\t", quote=FALSE);
284 :     write.table(howmanyonoffgray,file=paste(output_dir, "howmanyonoffgray.txt", sep="/"));
285 :     write.table(cutoffs,file=paste(output_dir, "cutoffs.txt", sep="/"), col.names=FALSE, row.names=FALSE);
286 : overbeek 1.2 colnames(final_calls)=colnames(all_geneexp);
287 :     rownames(final_calls)=rownames(all_geneexp);
288 : olson 1.4 write.table(final_calls,file=paste(output_dir,"final_on_off_calls.txt", sep="/"), quote=FALSE, sep="\t", col.names=TRUE,row.names=TRUE);

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3