[Bio] / FigKernelScripts / RunRMA.R Repository:
ViewVC logotype

View of /FigKernelScripts/RunRMA.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (annotate)
Tue Feb 22 21:51:53 2011 UTC (8 years, 9 months ago) by dejongh
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_release_3_0, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_10262011, HEAD
Changes since 1.4: +6 -2 lines
preallocate no_match_pm_mm; delete no_match row at end

args = commandArgs(TRUE);
if (length(args) != 5) { print("usage:  Rscript RunRMA.R cdf_pkg peg_probe_table peg_probes_no_match experiment_dir output_dir"); q() }
force_chip_descriptor=args[1]
peg_probe_table = args[2];
peg_probes_no_match = args[3];
experiments_dir = args[4];
output_dir = args[5];

library(affy)
library(affyio)
library(preprocessCore)

affy_data = AllButCelsForReadAffy(celfile.path = experiments_dir)

if (length(affy_data$filenames) == 0)
{
    print("No experiments were found");
    q();
}

cdfs = unique(lapply(affy_data$filenames, whatcdf));
if (length(cdfs) > 1)
{
    print("More than one CDF type appeared in the data");
    q();
}
chip_descriptor = cdfs[[1]];
print(c("Using chip description", chip_descriptor));

# read old CDF environment for chip dimensions
cdf_library = cleancdfname(force_chip_descriptor)
library(cdf_library, character.only=TRUE)

# create the new CDF environment from peg_probe_table
newcdfenv = new.env(hash=TRUE);
probesets = readLines(peg_probe_table);
probesets = strsplit(probesets, "\t");

for (i in 1:length(probesets))
{
	probeset = probesets[[i]];
	peg = probeset[1]
	x_y = strsplit(probeset[2], "_")
	index = xy2indices(as.numeric(x_y[[1]][1]), as.numeric(x_y[[1]][2]),cdf=cdf_library)

#print(c(peg, index));	
	if (! exists(peg, envir= newcdfenv)) {
		# we're not assigning a mismatch index for now
		pm_mm = matrix(c(index,0), nrow=1,ncol=2, dimnames=list(c(), c("pm", "mm")))
		assign(peg,pm_mm,envir=newcdfenv)
	}
	else {
		pm_mm = get(peg,envir=newcdfenv)
		pm_mm = rbind(pm_mm,c(index,0))
		assign(peg,pm_mm,envir=newcdfenv)
	}
}

# add all the probes that didn't match pegs
no_match = readLines(peg_probes_no_match);

no_match_pm_mm = matrix(nrow=length(no_match),ncol=2, dimnames=list(c(), c("pm", "mm")))
#no_match_pm_mm = matrix(nrow=0,ncol=2, dimnames=list(c(), c("pm", "mm")))
#no_match_pm_mm = matrix(c(index,0), nrow=1,ncol=2, dimnames=list(c(), c("pm", "mm")))
for (i in 1:length(no_match))
{
	no_match_loc = no_match[[i]]
	x_y = strsplit(no_match_loc, "_")
	index = xy2indices(as.numeric(x_y[[1]][1]), as.numeric(x_y[[1]][2]),cdf=cdf_library)
	#no_match_pm_mm = rbind(no_match_pm_mm,c(index,0))
	no_match_pm_mm[i,] = c(index,0)
}
assign("no_match",no_match_pm_mm,envir=newcdfenv)
#
# OLD code
# #
# # Read the Data and perform RMA
# eset = justRMA(celfile.path=experiments_dir, cdfname="newcdfenv", compress=TRUE, verbose=TRUE)
# geneexp=exprs(eset)

# # remove the no_match and write the table
# geneexp=geneexp[rownames(geneexp)!='no_match',]
# write.table(geneexp, file="rma_normalized.tab", row.names=TRUE, col.names=TRUE, sep="\t", quote=FALSE)

#
# NEW code. Use the results of the affy read above to get the filename list and other chunks of information
# that the various stages need.
#

filenames = affy_data$filenames;
phenoData = affy_data$phenoData;

checkValidFilenames(filenames)
  
n <- length(filenames)

pdata <- pData(phenoData)

#
# Much of the below is lifted from the R code in justrma.R from the affy package.
#

##try to read sample names form phenoData. if not there use CEL filenames

#
# #f empty pdata filename are samplenames
#

if (dim(pdata)[1] != n)
{
    warning("Incompatible phenoData object. Created a new one.\n");
	
    samplenames <- gsub("^/?([^/]*/)*", "", unlist(filenames));
    pdata <- data.frame(sample=1:n,row.names=samplenames);
    phenoData <- new("AnnotatedDataFrame",
                     data=pdata,
                     varMetadata=data.frame(
                       labelDescription="arbitrary numbering",
                       row.names="sample"));
} else {
    samplenames <- rownames(pdata);
}

write.table(samplenames, file=paste(output_dir, "experiment.names", sep="/"),
	    row.names = TRUE, col.names = FALSE, sep= "\t", quote = FALSE);

description = affy_data$description;
if (is.null(description))
{
    description <- new("MIAME");
    description@preprocessing$filenames <- filenames;
    description@preprocessing$affyversion <- library(help=affy)$info[[2]][[2]][2];
}

#
# We're using the CDF that we constructed above.
#

headdetails <- read.celfile.header(filenames[[1]]);
#cdfname = chip_descriptor;
cdfname = "newcdfenv";

if(is.null(cdfname))
{
    cdfname <- headdetails[[1]];
}

scandates <-
    sapply(seq_len(length(filenames)), function(i) {
	sdate <- read.celfile.header(filenames[i], info = "full")[["ScanDate"]];
	if (is.null(sdate) || length(sdate) == 0) NA_character_ else sdate;
    });
protocol <-
    new("AnnotatedDataFrame",
        data=data.frame("ScanDate"=scandates, row.names = sampleNames(phenoData),
        stringsAsFactors=FALSE),
        dimLabels=c("sampleNames", "sampleColumns"));
tmp <- new("AffyBatch",
             cdfName=cdfname,
	   annotation=cleancdfname(cdfname, addcdf=FALSE));
pmIndex <- pmindex(tmp);
probenames <- rep(names(pmIndex), unlist(lapply(pmIndex,length)));
pNList <- split(0:(length(probenames) -1), probenames);
print("foo");

chip_tmp <- new("AffyBatch",
             cdfName=cdf_library,
#             cdfName=chip_descriptor,
	   annotation=cleancdfname(chip_descriptor, addcdf=FALSE));
chip_pmIndex <- pmindex(chip_tmp);
print("foo2");

chip_probenames <- rep(names(chip_pmIndex), unlist(lapply(chip_pmIndex,length)));
print("foo3");

outer_index = 1
pfile = file(paste(output_dir, "probe.info", sep="/"), "w");
for (pname in (names(chip_pmIndex)))
{
    for (idx in chip_pmIndex[[pname]])
    {
	xy = indices2xy(idx, cdf=cdf_library);

	cat(outer_index, "\t", pname, "\t", idx, "\t", xy[[1]], "\t", xy[[2]], "\n", sep="", file = pfile);
	outer_index = outer_index + 1;
    }
}
close(pfile);

outer_index = 1
pfile = file(paste(output_dir, "probepeg.info", sep="/"), "w");
for (pname in (names(pmIndex)))
{
    for (idx in pmIndex[[pname]])
    {
	xy = indices2xy(idx, cdf=cdf_library);

	cat(outer_index, "\t", pname, "\t", idx, "\t", xy[[1]], "\t", xy[[2]], "\n", sep="", file = pfile);
	outer_index = outer_index + 1;
    }
}
close(pfile);


write.table(chip_probenames, file=paste(output_dir, "probe.names", sep="/"),
	row.names = TRUE, col.names=FALSE, sep="\t", quote=FALSE);

## read pm data into matrix

probeintensities <- read.probematrix(filenames=filenames, cdfname = cdfname);

#
# This is the initial data. Write to output file.
#
write.table(probeintensities$pm, file=paste(output_dir, "probes.raw", sep="/"),row.names = TRUE, col.names = TRUE, sep="\t", quote=FALSE);

##pass matrix of pm values to rma

ngenes <- length(geneNames(tmp));

#
# Default parameters from justrma.R
#
background = TRUE;
bgversion = 2;
normalize = TRUE
verbose = TRUE
notes = ""

#
# Do bg correction first
#

#
# The conditional here is for sanity checks; if you don't take this branch
# the justRMA code is allowed to do the background correction.
#

if (1)
{
    rma.background.correct(probeintensities$pm, FALSE)
    
    #
    # Write the background corrected data.
    #
    write.table(probeintensities$pm, file=paste(output_dir, "probes.bg_corrected", sep="/"),row.names = TRUE, col.names = TRUE, sep="\t", quote=FALSE);

    background = FALSE;
}

#
# Same story here but for the normalization.
#
if (1)
{
    x = .Call("R_qnorm_c",probeintensities$pm, FALSE, PACKAGE="preprocessCore");
    write.table(probeintensities$pm, file=paste(output_dir, "probes.normalized", sep="/"),row.names = TRUE, col.names = TRUE, sep="\t", quote=FALSE);
    rm(x)
    normalize = FALSE;
}

#exprs <- .Call("rma_c_complete",new_intensities, pNList, ngenes, normalize, background, bgversion, verbose, PACKAGE="affy");
exprs <- .Call("rma_c_complete",probeintensities$pm, pNList, ngenes, normalize, background, bgversion, verbose, PACKAGE="affy");

colnames(exprs) <- samplenames;
se.exprs <- array(NA, dim(exprs),
		  dimnames=list(rownames(exprs), colnames(exprs)));

# remove the no_match and write the table
exprs=exprs[rownames(exprs)!='no_match',];
write.table(exprs, file=paste(output_dir, "rma_normalized.tab", sep = "/"), row.names = TRUE, col.names = TRUE, sep="\t", quote=FALSE);


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3