[Bio] / FigKernelScripts / RunRMA.R Repository:
ViewVC logotype

Annotation of /FigKernelScripts/RunRMA.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (view) (download)

1 : overbeek 1.1 args = commandArgs(TRUE);
2 : olson 1.4 if (length(args) != 5) { print("usage: Rscript RunRMA.R cdf_pkg peg_probe_table peg_probes_no_match experiment_dir output_dir"); q() }
3 :     force_chip_descriptor=args[1]
4 :     peg_probe_table = args[2];
5 :     peg_probes_no_match = args[3];
6 :     experiments_dir = args[4];
7 :     output_dir = args[5];
8 : overbeek 1.1
9 :     library(affy)
10 : olson 1.3 library(affyio)
11 :     library(preprocessCore)
12 : overbeek 1.1
13 : olson 1.3 affy_data = AllButCelsForReadAffy(celfile.path = experiments_dir)
14 :    
15 :     if (length(affy_data$filenames) == 0)
16 :     {
17 :     print("No experiments were found");
18 :     q();
19 :     }
20 :    
21 :     cdfs = unique(lapply(affy_data$filenames, whatcdf));
22 :     if (length(cdfs) > 1)
23 :     {
24 :     print("More than one CDF type appeared in the data");
25 :     q();
26 : overbeek 1.1 }
27 : olson 1.3 chip_descriptor = cdfs[[1]];
28 :     print(c("Using chip description", chip_descriptor));
29 : overbeek 1.1
30 :     # read old CDF environment for chip dimensions
31 : olson 1.4 cdf_library = cleancdfname(force_chip_descriptor)
32 : overbeek 1.1 library(cdf_library, character.only=TRUE)
33 :    
34 :     # create the new CDF environment from peg_probe_table
35 :     newcdfenv = new.env(hash=TRUE);
36 : olson 1.3 probesets = readLines(peg_probe_table);
37 : overbeek 1.1 probesets = strsplit(probesets, "\t");
38 :    
39 :     for (i in 1:length(probesets))
40 :     {
41 :     probeset = probesets[[i]];
42 :     peg = probeset[1]
43 :     x_y = strsplit(probeset[2], "_")
44 :     index = xy2indices(as.numeric(x_y[[1]][1]), as.numeric(x_y[[1]][2]),cdf=cdf_library)
45 : olson 1.3
46 :     #print(c(peg, index));
47 : overbeek 1.1 if (! exists(peg, envir= newcdfenv)) {
48 :     # we're not assigning a mismatch index for now
49 :     pm_mm = matrix(c(index,0), nrow=1,ncol=2, dimnames=list(c(), c("pm", "mm")))
50 :     assign(peg,pm_mm,envir=newcdfenv)
51 :     }
52 :     else {
53 :     pm_mm = get(peg,envir=newcdfenv)
54 :     pm_mm = rbind(pm_mm,c(index,0))
55 :     assign(peg,pm_mm,envir=newcdfenv)
56 :     }
57 :     }
58 :    
59 : overbeek 1.2 # add all the probes that didn't match pegs
60 : olson 1.3 no_match = readLines(peg_probes_no_match);
61 :    
62 : dejongh 1.5 no_match_pm_mm = matrix(nrow=length(no_match),ncol=2, dimnames=list(c(), c("pm", "mm")))
63 :     #no_match_pm_mm = matrix(nrow=0,ncol=2, dimnames=list(c(), c("pm", "mm")))
64 : olson 1.3 #no_match_pm_mm = matrix(c(index,0), nrow=1,ncol=2, dimnames=list(c(), c("pm", "mm")))
65 : overbeek 1.2 for (i in 1:length(no_match))
66 :     {
67 :     no_match_loc = no_match[[i]]
68 :     x_y = strsplit(no_match_loc, "_")
69 :     index = xy2indices(as.numeric(x_y[[1]][1]), as.numeric(x_y[[1]][2]),cdf=cdf_library)
70 : dejongh 1.5 #no_match_pm_mm = rbind(no_match_pm_mm,c(index,0))
71 :     no_match_pm_mm[i,] = c(index,0)
72 : overbeek 1.2 }
73 :     assign("no_match",no_match_pm_mm,envir=newcdfenv)
74 : olson 1.3 #
75 :     # OLD code
76 :     # #
77 :     # # Read the Data and perform RMA
78 :     # eset = justRMA(celfile.path=experiments_dir, cdfname="newcdfenv", compress=TRUE, verbose=TRUE)
79 :     # geneexp=exprs(eset)
80 :    
81 :     # # remove the no_match and write the table
82 :     # geneexp=geneexp[rownames(geneexp)!='no_match',]
83 : olson 1.4 # write.table(geneexp, file="rma_normalized.tab", row.names=TRUE, col.names=TRUE, sep="\t", quote=FALSE)
84 : olson 1.3
85 :     #
86 :     # NEW code. Use the results of the affy read above to get the filename list and other chunks of information
87 :     # that the various stages need.
88 :     #
89 :    
90 :     filenames = affy_data$filenames;
91 :     phenoData = affy_data$phenoData;
92 :    
93 :     checkValidFilenames(filenames)
94 :    
95 :     n <- length(filenames)
96 :    
97 :     pdata <- pData(phenoData)
98 :    
99 :     #
100 :     # Much of the below is lifted from the R code in justrma.R from the affy package.
101 :     #
102 :    
103 :     ##try to read sample names form phenoData. if not there use CEL filenames
104 :    
105 :     #
106 :     # #f empty pdata filename are samplenames
107 :     #
108 :    
109 :     if (dim(pdata)[1] != n)
110 :     {
111 :     warning("Incompatible phenoData object. Created a new one.\n");
112 :    
113 :     samplenames <- gsub("^/?([^/]*/)*", "", unlist(filenames));
114 :     pdata <- data.frame(sample=1:n,row.names=samplenames);
115 :     phenoData <- new("AnnotatedDataFrame",
116 :     data=pdata,
117 :     varMetadata=data.frame(
118 :     labelDescription="arbitrary numbering",
119 :     row.names="sample"));
120 :     } else {
121 :     samplenames <- rownames(pdata);
122 :     }
123 :    
124 :     write.table(samplenames, file=paste(output_dir, "experiment.names", sep="/"),
125 :     row.names = TRUE, col.names = FALSE, sep= "\t", quote = FALSE);
126 :    
127 :     description = affy_data$description;
128 :     if (is.null(description))
129 :     {
130 :     description <- new("MIAME");
131 :     description@preprocessing$filenames <- filenames;
132 :     description@preprocessing$affyversion <- library(help=affy)$info[[2]][[2]][2];
133 :     }
134 :    
135 :     #
136 :     # We're using the CDF that we constructed above.
137 :     #
138 :    
139 :     headdetails <- read.celfile.header(filenames[[1]]);
140 :     #cdfname = chip_descriptor;
141 :     cdfname = "newcdfenv";
142 :    
143 :     if(is.null(cdfname))
144 :     {
145 :     cdfname <- headdetails[[1]];
146 :     }
147 :    
148 :     scandates <-
149 :     sapply(seq_len(length(filenames)), function(i) {
150 :     sdate <- read.celfile.header(filenames[i], info = "full")[["ScanDate"]];
151 :     if (is.null(sdate) || length(sdate) == 0) NA_character_ else sdate;
152 :     });
153 :     protocol <-
154 :     new("AnnotatedDataFrame",
155 :     data=data.frame("ScanDate"=scandates, row.names = sampleNames(phenoData),
156 :     stringsAsFactors=FALSE),
157 :     dimLabels=c("sampleNames", "sampleColumns"));
158 :     tmp <- new("AffyBatch",
159 :     cdfName=cdfname,
160 :     annotation=cleancdfname(cdfname, addcdf=FALSE));
161 :     pmIndex <- pmindex(tmp);
162 :     probenames <- rep(names(pmIndex), unlist(lapply(pmIndex,length)));
163 :     pNList <- split(0:(length(probenames) -1), probenames);
164 : olson 1.4 print("foo");
165 : olson 1.3
166 :     chip_tmp <- new("AffyBatch",
167 : olson 1.4 cdfName=cdf_library,
168 :     # cdfName=chip_descriptor,
169 : olson 1.3 annotation=cleancdfname(chip_descriptor, addcdf=FALSE));
170 :     chip_pmIndex <- pmindex(chip_tmp);
171 : olson 1.4 print("foo2");
172 : olson 1.3
173 :     chip_probenames <- rep(names(chip_pmIndex), unlist(lapply(chip_pmIndex,length)));
174 : olson 1.4 print("foo3");
175 : olson 1.3
176 :     outer_index = 1
177 :     pfile = file(paste(output_dir, "probe.info", sep="/"), "w");
178 :     for (pname in (names(chip_pmIndex)))
179 :     {
180 :     for (idx in chip_pmIndex[[pname]])
181 :     {
182 :     xy = indices2xy(idx, cdf=cdf_library);
183 :    
184 :     cat(outer_index, "\t", pname, "\t", idx, "\t", xy[[1]], "\t", xy[[2]], "\n", sep="", file = pfile);
185 :     outer_index = outer_index + 1;
186 :     }
187 :     }
188 :     close(pfile);
189 :    
190 :     outer_index = 1
191 :     pfile = file(paste(output_dir, "probepeg.info", sep="/"), "w");
192 :     for (pname in (names(pmIndex)))
193 :     {
194 :     for (idx in pmIndex[[pname]])
195 :     {
196 :     xy = indices2xy(idx, cdf=cdf_library);
197 :    
198 :     cat(outer_index, "\t", pname, "\t", idx, "\t", xy[[1]], "\t", xy[[2]], "\n", sep="", file = pfile);
199 :     outer_index = outer_index + 1;
200 :     }
201 :     }
202 :     close(pfile);
203 :    
204 :    
205 :     write.table(chip_probenames, file=paste(output_dir, "probe.names", sep="/"),
206 :     row.names = TRUE, col.names=FALSE, sep="\t", quote=FALSE);
207 :    
208 :     ## read pm data into matrix
209 :    
210 :     probeintensities <- read.probematrix(filenames=filenames, cdfname = cdfname);
211 :    
212 :     #
213 :     # This is the initial data. Write to output file.
214 :     #
215 :     write.table(probeintensities$pm, file=paste(output_dir, "probes.raw", sep="/"),row.names = TRUE, col.names = TRUE, sep="\t", quote=FALSE);
216 :    
217 :     ##pass matrix of pm values to rma
218 :    
219 :     ngenes <- length(geneNames(tmp));
220 :    
221 :     #
222 :     # Default parameters from justrma.R
223 :     #
224 :     background = TRUE;
225 :     bgversion = 2;
226 :     normalize = TRUE
227 :     verbose = TRUE
228 :     notes = ""
229 :    
230 :     #
231 :     # Do bg correction first
232 :     #
233 :    
234 :     #
235 :     # The conditional here is for sanity checks; if you don't take this branch
236 :     # the justRMA code is allowed to do the background correction.
237 :     #
238 :    
239 :     if (1)
240 :     {
241 :     rma.background.correct(probeintensities$pm, FALSE)
242 :    
243 :     #
244 :     # Write the background corrected data.
245 :     #
246 :     write.table(probeintensities$pm, file=paste(output_dir, "probes.bg_corrected", sep="/"),row.names = TRUE, col.names = TRUE, sep="\t", quote=FALSE);
247 :    
248 :     background = FALSE;
249 :     }
250 :    
251 :     #
252 :     # Same story here but for the normalization.
253 :     #
254 :     if (1)
255 :     {
256 :     x = .Call("R_qnorm_c",probeintensities$pm, FALSE, PACKAGE="preprocessCore");
257 :     write.table(probeintensities$pm, file=paste(output_dir, "probes.normalized", sep="/"),row.names = TRUE, col.names = TRUE, sep="\t", quote=FALSE);
258 :     rm(x)
259 :     normalize = FALSE;
260 :     }
261 :    
262 :     #exprs <- .Call("rma_c_complete",new_intensities, pNList, ngenes, normalize, background, bgversion, verbose, PACKAGE="affy");
263 :     exprs <- .Call("rma_c_complete",probeintensities$pm, pNList, ngenes, normalize, background, bgversion, verbose, PACKAGE="affy");
264 :    
265 :     colnames(exprs) <- samplenames;
266 :     se.exprs <- array(NA, dim(exprs),
267 :     dimnames=list(rownames(exprs), colnames(exprs)));
268 :    
269 : dejongh 1.5 # remove the no_match and write the table
270 :     exprs=exprs[rownames(exprs)!='no_match',];
271 : olson 1.4 write.table(exprs, file=paste(output_dir, "rma_normalized.tab", sep = "/"), row.names = TRUE, col.names = TRUE, sep="\t", quote=FALSE);
272 : overbeek 1.2

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3