[Bio] / FigKernelScripts / RunRMA_SIF_format.R Repository:
ViewVC logotype

View of /FigKernelScripts/RunRMA_SIF_format.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (annotate)
Thu Jan 20 22:05:23 2011 UTC (9 years, 1 month ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, mgrast_release_3_0, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
SIF-processing run-rma.

args = commandArgs(TRUE);
if (length(args) != 3) { print("usage:  Rscript RunRMA_SIF_format.R cdf_pkg experiment_dir output_dir"); q() }
force_chip_descriptor=args[1]
experiments_dir = args[2];
output_dir = args[3];

library(affy)
library(affyio)
library(preprocessCore)

affy_data = AllButCelsForReadAffy(celfile.path = experiments_dir)

if (length(affy_data$filenames) == 0)
{
    print("No experiments were found");
    q();
}

cdfs = unique(lapply(affy_data$filenames, whatcdf));
if (length(cdfs) > 1)
{
    print("More than one CDF type appeared in the data");
    q();
}
chip_descriptor = cdfs[[1]];
print(c("Using chip description", chip_descriptor));

# read CDF environment
cdf_library = cleancdfname(force_chip_descriptor)
library(cdf_library, character.only=TRUE)
#require(cdf_library, character.only=TRUE, lib.loc = ".Library")

# OLD code
# #
# # Read the Data and perform RMA
# eset = justRMA(celfile.path=experiments_dir, cdfname="newcdfenv", compress=TRUE, verbose=TRUE)
# geneexp=exprs(eset)

# # remove the no_match and write the table
# geneexp=geneexp[rownames(geneexp)!='no_match',]
# write.table(geneexp, file="rma_normalized.tab", row.names=TRUE, col.names=TRUE, sep="\t", quote=FALSE)

#
# NEW code. Use the results of the affy read above to get the filename list and other chunks of information
# that the various stages need.
#

filenames = affy_data$filenames;
phenoData = affy_data$phenoData;

checkValidFilenames(filenames)
  
n <- length(filenames)

pdata <- pData(phenoData)

#
# Much of the below is lifted from the R code in justrma.R from the affy package.
#

##try to read sample names form phenoData. if not there use CEL filenames

#
# #f empty pdata filename are samplenames
#

if (dim(pdata)[1] != n)
{
    warning("Incompatible phenoData object. Created a new one.\n");
	
    samplenames <- gsub("^/?([^/]*/)*", "", unlist(filenames));
    pdata <- data.frame(sample=1:n,row.names=samplenames);
    phenoData <- new("AnnotatedDataFrame",
                     data=pdata,
                     varMetadata=data.frame(
                       labelDescription="arbitrary numbering",
                       row.names="sample"));
} else {
    samplenames <- rownames(pdata);
}

write.table(samplenames, file=paste(output_dir, "experiment.names", sep="/"),
	    row.names = TRUE, col.names = FALSE, sep= "\t", quote = FALSE);

description = affy_data$description;
if (is.null(description))
{
    description <- new("MIAME");
    description@preprocessing$filenames <- filenames;
    description@preprocessing$affyversion <- library(help=affy)$info[[2]][[2]][2];
}

#
# We're using the CDF that we constructed above.
#

headdetails <- read.celfile.header(filenames[[1]]);
cdfname = force_chip_descriptor;

if(is.null(cdfname))
{
    cdfname <- headdetails[[1]];
}

scandates <-
    sapply(seq_len(length(filenames)), function(i) {
	sdate <- read.celfile.header(filenames[i], info = "full")[["ScanDate"]];
	if (is.null(sdate) || length(sdate) == 0) NA_character_ else sdate;
    });
protocol <-
    new("AnnotatedDataFrame",
        data=data.frame("ScanDate"=scandates, row.names = sampleNames(phenoData),
        stringsAsFactors=FALSE),
        dimLabels=c("sampleNames", "sampleColumns"));
tmp <- new("AffyBatch",
             cdfName=cdfname,
	   annotation=cleancdfname(cdfname, addcdf=FALSE));
pmIndex <- pmindex(tmp);
probenames <- rep(names(pmIndex), unlist(lapply(pmIndex,length)));
pNList <- split(0:(length(probenames) -1), probenames);
print("foo");

chip_tmp <- new("AffyBatch",
             cdfName=cdf_library,
#             cdfName=chip_descriptor,
	   annotation=cleancdfname(chip_descriptor, addcdf=FALSE));
chip_pmIndex <- pmindex(chip_tmp);
print("foo2");

chip_probenames <- rep(names(chip_pmIndex), unlist(lapply(chip_pmIndex,length)));
print("foo3");

outer_index = 1
pfile = file(paste(output_dir, "probe.info", sep="/"), "w");
for (pname in (names(chip_pmIndex)))
{
    for (idx in chip_pmIndex[[pname]])
    {
	xy = indices2xy(idx, cdf=cdf_library);

	cat(outer_index, "\t", pname, "\t", idx, "\t", xy[[1]], "\t", xy[[2]], "\n", sep="", file = pfile);
	outer_index = outer_index + 1;
    }
}
close(pfile);

write.table(chip_probenames, file=paste(output_dir, "probe.names", sep="/"),
	row.names = TRUE, col.names=FALSE, sep="\t", quote=FALSE);

## read pm data into matrix

probeintensities <- read.probematrix(filenames=filenames, cdfname = cdfname);

#
# This is the initial data. Write to output file.
#
write.table(probeintensities$pm, file=paste(output_dir, "probes.raw", sep="/"),row.names = TRUE, col.names = TRUE, sep="\t", quote=FALSE);

##pass matrix of pm values to rma

ngenes <- length(geneNames(tmp));

#
# Default parameters from justrma.R
#
background = TRUE;
bgversion = 2;
normalize = TRUE
verbose = TRUE
notes = ""

#
# Do bg correction first
#

#
# The conditional here is for sanity checks; if you don't take this branch
# the justRMA code is allowed to do the background correction.
#

if (1)
{
    rma.background.correct(probeintensities$pm, FALSE)
    
    #
    # Write the background corrected data.
    #
    write.table(probeintensities$pm, file=paste(output_dir, "probes.bg_corrected", sep="/"),row.names = TRUE, col.names = TRUE, sep="\t", quote=FALSE);

    background = FALSE;
}

#
# Same story here but for the normalization.
#
if (1)
{
    x = .Call("R_qnorm_c",probeintensities$pm, FALSE, PACKAGE="preprocessCore");
    write.table(probeintensities$pm, file=paste(output_dir, "probes.normalized", sep="/"),row.names = TRUE, col.names = TRUE, sep="\t", quote=FALSE);
    rm(x)
    normalize = FALSE;
}

#exprs <- .Call("rma_c_complete",new_intensities, pNList, ngenes, normalize, background, bgversion, verbose, PACKAGE="affy");
exprs <- .Call("rma_c_complete",probeintensities$pm, pNList, ngenes, normalize, background, bgversion, verbose, PACKAGE="affy");

colnames(exprs) <- samplenames;
se.exprs <- array(NA, dim(exprs),
		  dimnames=list(rownames(exprs), colnames(exprs)));

# Read the sif2peg file and extract the geneexp rows that match, substituting peg ids
#sif2peg = read.table(paste(output_dir, "sif2peg", sep="/"), sep="\t")
sif2peg = read.table(paste(output_dir, "sif2peg", sep="/"), sep="\t", colClasses="character")


geneexp_rn = merge(exprs,sif2peg,by.x=0,by.y=1)
geneexp = geneexp_rn[,2:(dim(geneexp_rn)[2]-1)]
row.names(geneexp)=geneexp_rn[,dim(geneexp_rn)[2]]

write.table(geneexp, file=paste(output_dir, "rma_normalized.tab", sep = "/"), row.names = TRUE, col.names = TRUE, sep="\t", quote=FALSE);

# write the probepeg.info using matching pegs where possible
outer_index = 1
pfile = file(paste(output_dir, "probepeg.info", sep="/"), "w");
p2file = file(paste(output_dir, "peg.probe.table", sep="/"), "w");
for (pname in (names(pmIndex)))
{
   # changed next three lines
   peg_row = which(sif2peg[,1]==pname);
   if (length(peg_row)==1) {
	peg = sif2peg[peg_row,2];
       for (idx in pmIndex[[pname]])
       {
	    xy = indices2xy(idx, cdf=cdf_library);
	    cat(outer_index, "\t", peg, "\t", idx, "\t", xy[[1]], "\t", xy[[2]], "\n", sep="", file = pfile);
	    cat(peg, "\t", xy[[1]], "_", xy[[2]], "\n", sep="", file=p2file);
	    outer_index = outer_index + 1;
	}
   }
}
close(pfile);
close(p2file);

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3