[Bio] / FigKernelScripts / SplitGeneSets.R Repository:
ViewVC logotype

Annotation of /FigKernelScripts/SplitGeneSets.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download)

1 : overbeek 1.1 split_geneset = function(geneset, gxp_data, pearson_cutoff)
2 :     {
3 :     values=matrix(0,nrow=length(geneset), ncol=length(geneset));
4 :    
5 :     for (i in 1:(length(geneset)-1))
6 :     {
7 :     for (j in (i+1):(length(geneset)))
8 :     {
9 :     values[j,i] = cor(gxp_data[geneset[i],],gxp_data[geneset[j],], use="everything", method="pearson");
10 :     }
11 :     }
12 :    
13 :     values=as.dist(-(values-1));
14 :     hc = hclust(values, method="complete");
15 :     treecut = cutree(hc, h=(-(pearson_cutoff-1)));
16 :     tb=table(treecut);
17 :     num_singletons = sum(tb==1);
18 :     #print(c("num_singletons:", num_singletons, "out of", length(geneset), "genes"), quote=FALSE);
19 :     tb=tb[tb!=1]; # remove singletons
20 :     new_genesets = vector("list", length(tb));
21 :    
22 :     if (length(tb)==0) {return (new_genesets);}
23 :    
24 :     for (i in 1:length(tb))
25 :     {
26 :     cluster = as.integer(names(tb))[i];
27 :     new_genesets[[i]] = geneset[treecut==cluster];
28 :     }
29 :    
30 :     return (new_genesets);
31 :     }
32 :    
33 :     args = commandArgs(TRUE);
34 :     if (length(args) != 2) { print("usage: Rscript SplitGeneSets.R gene_sets pearson_cutoff"); q() }
35 :     gene_sets_file = args[1];
36 :     splitting_cutoff = as.numeric(args[2]);
37 :    
38 :     ## Read in gene expression data ##
39 :     ## This file is the output of 'exprs' from Bioconductor's RMA procedure
40 :     ## Header line contains experiment names
41 :     ## row name is gene id, rest of the columns are normalized expression values per experiment
42 :     all_geneexp=as.matrix(read.table("raw_data.tab",header=TRUE));
43 :     num_genes = dim(all_geneexp)[1];
44 :     num_experiments = dim(all_geneexp)[2];
45 :    
46 :     ## Each line is a comma separated value containing gene ids corresponding to the first column in the raw data ##
47 :     ## Need to convert these to row numbers in the raw data
48 :     input_genesets = readLines(gene_sets_file);
49 :     input_genesets = strsplit(input_genesets, ",");
50 :     unsplit_genesets = list();
51 :    
52 :     for (i in 1:length(input_genesets))
53 :     {
54 :     input_geneset = input_genesets[[i]];
55 :     unsplit_geneset = c();
56 :     for (j in 1:length(input_geneset))
57 :     {
58 :     # check if the peg is in the raw_data
59 :     row_num = which(rownames(all_geneexp)==input_geneset[j]);
60 :    
61 :     if (length(row_num) > 0) {
62 :     unsplit_geneset = cbind(unsplit_geneset, row_num);
63 :     }
64 :     else {
65 :     print(c("Couldn't find row for", input_geneset[j]), quote=FALSE);
66 :     }
67 :     }
68 :    
69 :     if (length(unsplit_geneset) > 1) {
70 :     unsplit_genesets[[length(unsplit_genesets)+1]] = unsplit_geneset;
71 :     }
72 :     }
73 :    
74 :     # Now split all the unsplit_genesets on pearson correlations
75 :     split_genesets=list();
76 :    
77 :     for (i in 1:length(unsplit_genesets))
78 :     {
79 :     split_genesets = append(split_genesets,split_geneset(unsplit_genesets[[i]], all_geneexp, splitting_cutoff));
80 :     }
81 :    
82 :     for (i in 1:length(split_genesets))
83 :     {
84 :     cat(rownames(all_geneexp)[split_genesets[[i]]], sep=",");
85 :     cat("\n");
86 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3