[Bio] / FigKernelScripts / Pipeline.R Repository:
ViewVC logotype

Annotation of /FigKernelScripts/Pipeline.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download)

1 : overbeek 1.1 args = commandArgs(TRUE);
2 : overbeek 1.3 if (length(args) != 2) { print("usage: Rscript Pipeline.R on_genes_file gene_sets_file"); q() }
3 : overbeek 1.2 on_genes_file = args[1];
4 :     gene_sets_file = args[2];
5 : overbeek 1.1
6 :     ## Read in gene expression data ##
7 :     ## This file is the output of 'exprs' from Bioconductor's RMA procedure
8 :     ## Header line contains experiment names
9 :     ## row name is gene id, rest of the columns are normalized expression values per experiment
10 :     all_geneexp=as.matrix(read.table("raw_data.tab",header=TRUE));
11 :     num_genes = dim(all_geneexp)[1];
12 :     num_experiments = dim(all_geneexp)[2];
13 :    
14 : overbeek 1.2 ## gene_sets_files contains the gene set information ##
15 :     ## Each line is a comma separated value containing gene ids corresponding to the row names of the raw data ##
16 : overbeek 1.1 ## Need to convert these to row numbers in the raw data
17 : overbeek 1.2 input_genesets = readLines(gene_sets_file);
18 : overbeek 1.1 input_genesets = strsplit(input_genesets, ",");
19 : overbeek 1.2 split_genesets = list();
20 : overbeek 1.1
21 :     for (i in 1:length(input_genesets))
22 :     {
23 :     input_geneset = input_genesets[[i]];
24 : overbeek 1.2 split_geneset = c();
25 : overbeek 1.1 for (j in 1:length(input_geneset))
26 :     {
27 :     # check if the peg is in the raw_data
28 :     row_num = which(rownames(all_geneexp)==input_geneset[j]);
29 :    
30 :     if (length(row_num) > 0) {
31 : overbeek 1.2 split_geneset = cbind(split_geneset, row_num);
32 : overbeek 1.1 }
33 :     else {
34 :     print(c("Couldn't find row for", input_geneset[j]), quote=FALSE);
35 :     }
36 :     }
37 :    
38 : overbeek 1.2 if (length(split_geneset) > 1) {
39 :     split_genesets[[length(split_genesets)+1]] = split_geneset;
40 : overbeek 1.1 }
41 :     }
42 :    
43 : overbeek 1.2 print("Starting New Algorithm");
44 : overbeek 1.1
45 : overbeek 1.2 # Read genes that are the standard for being "on"
46 :     on_genes = readLines(on_genes_file);
47 :     # get gxp values for the on genes
48 :     on_values = matrix(0,nrow=length(on_genes),ncol=num_experiments);
49 : overbeek 1.1
50 : overbeek 1.2 for (i in 1:length(on_genes)) {
51 :     on_values[i,] = all_geneexp[which(rownames(all_geneexp)==on_genes[i]),];
52 :     i = i + 1;
53 : overbeek 1.1 }
54 :    
55 :     cutoffs=matrix(0,nrow=num_experiments,ncol=2);
56 :     for (i in 1:num_experiments)
57 :     {
58 : overbeek 1.2 # "on" cutoff
59 :     cutoffs[i,2]=quantile(on_values[,i],probs=.1);
60 :    
61 :     # "off" cutoff
62 :     cutoffs[i,1] = quantile(all_geneexp[which(all_geneexp[,i]<cutoffs[i,2]),i],probs=.8);
63 : overbeek 1.1 }
64 :    
65 : overbeek 1.2 diff_25 = quantile(cutoffs[,2]-cutoffs[,1],probs=.25);
66 : overbeek 1.1
67 : overbeek 1.2 for (i in 1:num_experiments)
68 : overbeek 1.1 {
69 : overbeek 1.2 if ((cutoffs[i,2]-cutoffs[i,1])<diff_25)
70 : overbeek 1.1 {
71 : overbeek 1.2 cutoffs[i,1]=cutoffs[i,2]-diff_25;
72 : overbeek 1.1 }
73 :     }
74 :    
75 : overbeek 1.2 cutoff_calls=matrix(0,nrow=num_genes,ncol=num_experiments);
76 : overbeek 1.1
77 : overbeek 1.2 for (i in 1:num_genes)
78 : overbeek 1.1 {
79 : overbeek 1.2 for (j in 1:num_experiments)
80 : overbeek 1.1 {
81 : overbeek 1.2 if (all_geneexp[i,j] < cutoffs[j,1])
82 :     {
83 :     cutoff_calls[i,j] = all_geneexp[i,j]-cutoffs[j,1];
84 :     }
85 :     else if (all_geneexp[i,j] > cutoffs[j,2])
86 :     {
87 :     cutoff_calls[i,j] = all_geneexp[i,j]-cutoffs[j,2];
88 :     }
89 : overbeek 1.1 }
90 :     }
91 :    
92 : overbeek 1.2 final_table = matrix(0,nrow=6,ncol=2);
93 : overbeek 1.1
94 : overbeek 1.2 print("Assessing consistency of initial cutoffs");
95 : overbeek 1.1
96 : overbeek 1.2 table2=matrix(0,nrow=length(split_genesets),ncol=num_experiments);
97 :     exptcalls = matrix(0,nrow=(length(split_genesets)*num_experiments), ncol=1);
98 : overbeek 1.1 counter = 1;
99 :    
100 : overbeek 1.2 for (i in 1:length(split_genesets))
101 : overbeek 1.1 {
102 : overbeek 1.2 geneset = split_genesets[[i]];
103 : overbeek 1.1 geneset_calls = matrix(0,nrow=length(geneset),ncol=num_experiments);
104 :     j=1;
105 :    
106 :     for (k in geneset)
107 :     {
108 : overbeek 1.2 geneset_calls[j,] = cutoff_calls[k,];
109 : overbeek 1.1 j=j+1;
110 :     }
111 :    
112 :     for (j in 1:num_experiments)
113 :     {
114 : overbeek 1.2 atleastoneon=sum(geneset_calls[,j] > 0);
115 : overbeek 1.1 atleastonegray=sum(geneset_calls[,j]==0);
116 : overbeek 1.2 atleastoneoff=sum(geneset_calls[,j] < 0);
117 : overbeek 1.1
118 :     ##1 means all genes in set are on for this experiment##
119 :     if (atleastoneon>=1 && atleastoneoff==0 && atleastonegray==0) {exptcalls[counter]=1;}
120 :     ##2 means all genes in set are off for this experiment##
121 : overbeek 1.2 else if (atleastoneon==0 && atleastoneoff>=1 && atleastonegray==0) {exptcalls[counter]=2;}
122 : overbeek 1.1 ##3 means all genes in set are either off or gray for this experiment##
123 : overbeek 1.2 else if (atleastoneon==0 && atleastoneoff>=1 && atleastonegray>=1) {exptcalls[counter]=3;}
124 : overbeek 1.1 ##4 means all genes in set are either on or gray for this experiment##
125 : overbeek 1.2 else if (atleastoneon>=1 && atleastoneoff==0 && atleastonegray>=1) {exptcalls[counter]=4;}
126 : overbeek 1.1 ##5 means at least one gene in set is off and at least one gene is on for this experiment##
127 : overbeek 1.2 else if (atleastoneon>=1 && atleastoneoff>=1)
128 :     {
129 :     exptcalls[counter]=5;
130 :     }
131 : overbeek 1.1 ##6 means all genes in set are gray for this experiment##
132 : overbeek 1.2 else if (atleastoneon==0 && atleastoneoff==0 && atleastonegray>=1) {exptcalls[counter]=6;}
133 : overbeek 1.1
134 :     table2[i,j] = exptcalls[counter];
135 :     counter = counter + 1;
136 :     }
137 :     }
138 :    
139 :    
140 :     for (i in 1:6) {
141 :     final_table[i,1]=sum(exptcalls==i);
142 :     }
143 :    
144 : overbeek 1.2 print("Geneset voting")
145 :    
146 :     final_calls=matrix(0,nrow=num_genes,ncol=num_experiments);
147 : overbeek 1.1
148 : overbeek 1.2 for (i in 1:num_genes)
149 :     {
150 :     for (j in 1:num_experiments)
151 :     {
152 :     if (cutoff_calls[i,j]>0) {final_calls[i,j]=1}
153 :     else if (cutoff_calls[i,j]<0) {final_calls[i,j]=-1}
154 :     }
155 :     }
156 :    
157 :     changed_calls=matrix(0,nrow=num_genes,ncol=num_experiments);
158 :     done = 0;
159 :    
160 :     while (done != 1)
161 :     {
162 :    
163 :     done = 1;
164 :     print("looping")
165 : overbeek 1.1
166 : overbeek 1.2 for (i in 1:length(split_genesets))
167 : overbeek 1.1 {
168 : overbeek 1.2 geneset = split_genesets[[i]];
169 :     geneset_cutoff_calls = matrix(0,nrow=length(geneset),ncol=num_experiments);
170 :     geneset_final_calls = matrix(0,nrow=length(geneset),ncol=num_experiments);
171 : overbeek 1.1 j=1;
172 :    
173 :     for (k in geneset)
174 :     {
175 : overbeek 1.2 geneset_cutoff_calls[j,] = cutoff_calls[k,];
176 :     geneset_final_calls[j,] = final_calls[k,];
177 : overbeek 1.1 j=j+1;
178 :     }
179 :    
180 :     for (j in 1:num_experiments)
181 :     {
182 : overbeek 1.2 cross_product_value = 0;
183 :    
184 :     if (sum(geneset_final_calls[,j] > 0) > sum(geneset_final_calls[,j] < 0))
185 :     {
186 :     cross_product_value = 1;
187 :     }
188 :     else if (sum(geneset_final_calls[,j] > 0) < sum(geneset_final_calls[,j] < 0))
189 :     {
190 :     cross_product_value = -1;
191 :     }
192 :     else if (mean(geneset_cutoff_calls[,j]) > .5)
193 :     {
194 :     cross_product_value = 1;
195 :     }
196 :     else if (mean(geneset_cutoff_calls[,j]) < -.5)
197 :     {
198 :     cross_product_value = -1;
199 :     }
200 : overbeek 1.1
201 : overbeek 1.2 if (cross_product_value != 0)
202 :     {
203 :     for (k in geneset)
204 :     {
205 :     if (changed_calls[k,j] != -1)
206 :     {
207 :     if (changed_calls[k,j] != 0 && final_calls[k,j] != cross_product_value)
208 :     {
209 :     final_calls[k,j] = 0;
210 :     changed_calls[k,j] = -1;
211 :     done = 0;
212 :     }
213 :     else if (final_calls[k,j] != cross_product_value)
214 :     {
215 :     final_calls[k,j]=cross_product_value;
216 :     changed_calls[k,j] = changed_calls[k,j] + 1;
217 :     done = 0;
218 :     }
219 :     }
220 :     }
221 :     }
222 : overbeek 1.1 }
223 :     }
224 :    
225 : overbeek 1.2 print(c("number of changed calls: ", sum(changed_calls==-1)));
226 : overbeek 1.1
227 :     }
228 :    
229 : overbeek 1.2 print("Assessing consistency of cutoffs after voting");
230 : overbeek 1.1
231 : overbeek 1.2 table3=matrix(0,nrow=length(split_genesets),ncol=num_experiments);
232 :     exptcalls = matrix(0,nrow=(length(split_genesets)*num_experiments), ncol=1);
233 : overbeek 1.1 counter = 1;
234 : overbeek 1.2 howmanyonoffgray = c();
235 : overbeek 1.1
236 : overbeek 1.2 for (i in 1:length(split_genesets))
237 : overbeek 1.1 {
238 : overbeek 1.2 geneset = split_genesets[[i]];
239 : overbeek 1.1 geneset_calls = matrix(0,nrow=length(geneset),ncol=num_experiments);
240 :     j=1;
241 :    
242 :     for (k in geneset)
243 :     {
244 : overbeek 1.2 geneset_calls[j,] = final_calls[k,];
245 : overbeek 1.1 j=j+1;
246 :     }
247 :    
248 :     for (j in 1:num_experiments)
249 :     {
250 : overbeek 1.2 atleastoneon=sum(geneset_calls[,j] > 0);
251 : overbeek 1.1 atleastonegray=sum(geneset_calls[,j]==0);
252 : overbeek 1.2 atleastoneoff=sum(geneset_calls[,j] < 0);
253 : overbeek 1.1
254 :     ##1 means all genes in set are on for this experiment##
255 :     if (atleastoneon>=1 && atleastoneoff==0 && atleastonegray==0) {exptcalls[counter]=1;}
256 :     ##2 means all genes in set are off for this experiment##
257 : overbeek 1.2 else if (atleastoneon==0 && atleastoneoff>=1 && atleastonegray==0) {exptcalls[counter]=2;}
258 : overbeek 1.1 ##3 means all genes in set are either off or gray for this experiment##
259 : overbeek 1.2 else if (atleastoneon==0 && atleastoneoff>=1 && atleastonegray>=1) {exptcalls[counter]=3;}
260 : overbeek 1.1 ##4 means all genes in set are either on or gray for this experiment##
261 : overbeek 1.2 else if (atleastoneon>=1 && atleastoneoff==0 && atleastonegray>=1) {exptcalls[counter]=4;}
262 : overbeek 1.1 ##5 means at least one gene in set is off and at least one gene is on for this experiment##
263 : overbeek 1.2 else if (atleastoneon>=1 && atleastoneoff>=1)
264 :     {
265 :     exptcalls[counter]=5;
266 :     howmanyonoffgray = rbind(howmanyonoffgray, c(i,j,atleastoneon,atleastoneoff,atleastonegray));
267 :     }
268 : overbeek 1.1 ##6 means all genes in set are gray for this experiment##
269 : overbeek 1.2 else if (atleastoneon==0 && atleastoneoff==0 && atleastonegray>=1) {exptcalls[counter]=6;}
270 : overbeek 1.1
271 : overbeek 1.2 table3[i,j] = exptcalls[counter];
272 : overbeek 1.1 counter = counter + 1;
273 :     }
274 :     }
275 :    
276 :    
277 :     for (i in 1:6) {
278 : overbeek 1.2 final_table[i,2]=sum(exptcalls==i);
279 : overbeek 1.1 }
280 :    
281 :     rownames(final_table)=c("all on", "all off", "off or gray", "on or gray", "inconsistent", "all gray");
282 : overbeek 1.2 write.table(final_table, "final_quality.txt", col.names=FALSE, row.names=TRUE, sep="\t", quote=FALSE);
283 :     write.table(howmanyonoffgray,"howmanyonoffgray.txt");
284 :     write.table(cutoffs,"cutoffs.txt", col.names=FALSE, row.names=FALSE);
285 :     colnames(final_calls)=colnames(all_geneexp);
286 :     rownames(final_calls)=rownames(all_geneexp);
287 :     write.table(final_calls,"final_on_off_calls.txt", quote=FALSE, sep="\t", col.names=TRUE,row.names=TRUE);

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3