[Bio] / FigKernelScripts / SplitGeneSets.R Repository:
ViewVC logotype

Annotation of /FigKernelScripts/SplitGeneSets.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download)

1 : overbeek 1.1 split_geneset = function(geneset, gxp_data, pearson_cutoff)
2 :     {
3 :     values=matrix(0,nrow=length(geneset), ncol=length(geneset));
4 :    
5 :     for (i in 1:(length(geneset)-1))
6 :     {
7 :     for (j in (i+1):(length(geneset)))
8 :     {
9 :     values[j,i] = cor(gxp_data[geneset[i],],gxp_data[geneset[j],], use="everything", method="pearson");
10 :     }
11 :     }
12 :    
13 :     values=as.dist(-(values-1));
14 :     hc = hclust(values, method="complete");
15 :     treecut = cutree(hc, h=(-(pearson_cutoff-1)));
16 :     tb=table(treecut);
17 :     num_singletons = sum(tb==1);
18 :     #print(c("num_singletons:", num_singletons, "out of", length(geneset), "genes"), quote=FALSE);
19 :     tb=tb[tb!=1]; # remove singletons
20 :     new_genesets = vector("list", length(tb));
21 :    
22 :     if (length(tb)==0) {return (new_genesets);}
23 :    
24 :     for (i in 1:length(tb))
25 :     {
26 :     cluster = as.integer(names(tb))[i];
27 :     new_genesets[[i]] = geneset[treecut==cluster];
28 :     }
29 :    
30 :     return (new_genesets);
31 :     }
32 :    
33 :     args = commandArgs(TRUE);
34 : olson 1.2 if (length(args) != 3) { print("usage: Rscript SplitGeneSets.R gene_sets pearson_cutoff output_dir"); q() }
35 : overbeek 1.1 gene_sets_file = args[1];
36 :     splitting_cutoff = as.numeric(args[2]);
37 : olson 1.2 output_dir = args[3];
38 : overbeek 1.1
39 :     ## Read in gene expression data ##
40 :     ## This file is the output of 'exprs' from Bioconductor's RMA procedure
41 :     ## Header line contains experiment names
42 :     ## row name is gene id, rest of the columns are normalized expression values per experiment
43 : olson 1.2 all_geneexp=as.matrix(read.table(paste(output_dir, "rma_normalized.tab", sep="/"),header=TRUE));
44 : overbeek 1.1 num_genes = dim(all_geneexp)[1];
45 :     num_experiments = dim(all_geneexp)[2];
46 :    
47 :     ## Each line is a comma separated value containing gene ids corresponding to the first column in the raw data ##
48 :     ## Need to convert these to row numbers in the raw data
49 :     input_genesets = readLines(gene_sets_file);
50 :     input_genesets = strsplit(input_genesets, ",");
51 :     unsplit_genesets = list();
52 :    
53 :     for (i in 1:length(input_genesets))
54 :     {
55 :     input_geneset = input_genesets[[i]];
56 :     unsplit_geneset = c();
57 :     for (j in 1:length(input_geneset))
58 :     {
59 :     # check if the peg is in the raw_data
60 :     row_num = which(rownames(all_geneexp)==input_geneset[j]);
61 :    
62 :     if (length(row_num) > 0) {
63 :     unsplit_geneset = cbind(unsplit_geneset, row_num);
64 :     }
65 :     else {
66 :     print(c("Couldn't find row for", input_geneset[j]), quote=FALSE);
67 :     }
68 :     }
69 :    
70 :     if (length(unsplit_geneset) > 1) {
71 :     unsplit_genesets[[length(unsplit_genesets)+1]] = unsplit_geneset;
72 :     }
73 :     }
74 :    
75 :     # Now split all the unsplit_genesets on pearson correlations
76 :     split_genesets=list();
77 :    
78 :     for (i in 1:length(unsplit_genesets))
79 :     {
80 :     split_genesets = append(split_genesets,split_geneset(unsplit_genesets[[i]], all_geneexp, splitting_cutoff));
81 :     }
82 :    
83 :     for (i in 1:length(split_genesets))
84 :     {
85 :     cat(rownames(all_geneexp)[split_genesets[[i]]], sep=",");
86 :     cat("\n");
87 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3