################################################################################ # # File: experiment.r # RCS: $Header: $ # Description: Experiment example # Author: Staal Vinterbo # Created: Fri Dec 02 20:15:14 2005 # Modified: Mon Dec 12 13:46:12 2005 (Staal Vinterbo) staal@MGV # Language: ESS[S] # Package: N/A # Status: Experimental # # (c) Copyright 2005, Staal Vinterbo, all rights reserved. # ################################################################################ # # Usage: # 1) download the following files to a directory named 'data' under your # R directory (the directory where you keep your R programs, selected # by File->Change Dir or getwd() and setwd() ): #http://www.broad.mit.edu/mpr/publications/projects/Leukemia/ALL_vs_AML_train_set_38_sorted.res #http://www.broad.mit.edu/mpr/publications/projects/Leukemia/ALL_vs_AML_train_set_38_sorted.cls #http://www.broad.mit.edu/mpr/publications/projects/Leukemia/Leuk_ALL_AML.test.res #http://www.broad.mit.edu/mpr/publications/projects/Leukemia/Leuk_ALL_AML.test.cls # # 2) open the two .res files with Excel (or something similar) and remove the first # column with header "Description". Then remove rows 2 and 3. # Save the resulting files under their original names. # # 3) Dowload the fuzzy classification software from http://www.mit.edu/~sav/fuzzy/ # and install in your R directory # # 4) put this file in your R directory. # 5) start R and do # > source("experiment.r") # source('gcl/gcl.r') library(nnet) # lrc below only works for binary outcomes... lrc <- function(InDataFrame,...){ TrainingData <- as.data.frame(InDataFrame) nc <- ncol(TrainingData) predictors <- colnames(TrainingData)[1:min(nc - 1, nrow(TrainingData)-2)] outcome <- colnames(TrainingData)[nc] formulaText <- paste(paste(outcome, " ~ "), paste(predictors, collapse="+")) formula <- as.formula(formulaText) lrmodel <- glm(formula, family = binomial(link = logit), data=TrainingData) return(makefun(function(x, model){ p <- predict.glm(model, as.data.frame(x), type="response", se.fit=FALSE) cs <- cbind(1-p,p) colnames(cs) <- c(0,1) return(cs) }, list(model=lrmodel))) } # nrc below only works for binary outcomes... nrc <- function(InDataFrame, nunits=ceiling(ncol(InDataFrame)/5),alpha=0.001, MaxNWts=1500,...){ nips <- ncol(InDataFrame) - 1 ncls <- length(unique(InDataFrame[, ncol(InDataFrame)])) nw <- nips*nunits + ncls*nunits + 1 cat("Inputs:", nips , "\n") cat("Nclasses", ncls , "\n") cat("MaxNWts:", MaxNWts, "\n") cat("alpha:", alpha, "\n") if(nw > MaxNWts){ cat(" Max number of weights (", MaxNWts, ") exeeded: ", nw, "!\n") no <- nunits nunits <- floor((MaxNWts - 1)/(nips + ncls)) cat(" Adjusting # hidden nodes to:", nunits, " (old:", no, ")\n") } cat("nunits:", nunits, "\n") cat("N Wts:", nw <- nips*nunits + ncls*nunits + 1, "\n") TrainingData <- as.data.frame(InDataFrame) nc <- ncol(TrainingData) nn <- nnet(TrainingData[,-nc], TrainingData[, nc, drop=F], size=nunits, entropy=T, trace=F, decay=alpha,MaxNWts=MaxNWts) return(makefun(function(x, model){ p <- predict(model, x) cs <- cbind(1-p,p) colnames(cs) <- c(0,1) return(cs) # here is why only for binary outcomes... }, list(model=nn))) } filtgcor <- function(golub, ngenes=nrow(golub) - 2){ corrs <- abs(apply(golub, 2, cor, golub[,ncol(golub)], method="kendall")) corrs[ncol(golub)] <- 0 o <- order(corrs, decreasing=T) incl <- o[1:ngenes] return(golub[,c(incl, ncol(golub))]) } # make probability proxies out of memberships or similar prediction beliefs pmem <- function(m) t(apply(m, 1, function(x) x/sum(x))) #assumes first column is acc number, i.e., description column has been deleted getgolubdata <- function(fname){ gdata <- read.delim(fname) incl <- sapply(gdata[2,], is.numeric) cn <- as.vector(gdata[,1]) cn <- gsub("[-\/+\*]","_", cn) # get rid of -/+* in colnames gdata <- t(gdata[, incl]) colnames(gdata) <- cn return(gdata) } goodseed <- 885090 myexperiment <- function(golub, folds=8, seed=sample(1:1000000, 1)){ # read data cat("Data ", nrow(golub), "rows, with", ncol(golub), "genes.\n") # filter genes ngenes <- nrow(golub) - 2 cat("Filtering data...\n") golub.f <- filtgcor(golub) cat("done.\n") cat("Selected the", ngenes, "highest correlated genes.\n") res <- c() cat("Doing", folds, "fold CV...\n") rescv <- cvcomp(golub.f, tcl, nrc, ci.eval, folds=folds, seed=seed, cv.verbose=F, t.nlev=3) res <- c(res, rescv) cat("Doing 5x2 fold CV...\n") res52 <- cv52(golub.f, tcl, nrc, ci.eval, cv.verbose=F, t.nlev=3, seed=seed) res <- c(res, res52) return(res) } calp <- function(preds, acts, bins=10, do.plot=F,...){ n <- length(preds) bsize <- ceiling(n/bins) o <- order(preds) g <- rep(1:bins, 1, n, each=bsize) p <- t(sapply(unique(g), function(x,g,a,b) c(sum(a[g==x])/sum(g==x), sum(b[g==x])/sum(g==x)), g, preds[o], acts[o])) if(do.plot) plot(p, xlim=c(0,1), ylim=c(0,1), ...) return(p) } ################# The Experiment cat("Reading data...\n") train <- getgolubdata("data/ALL_vs_AML_train_set_38_sorted.res") test <- getgolubdata("data/Leuk_ALL_AML.test.res") traincls <- unlist(read.table("data/ALL_vs_AML_train_set_38_sorted.cls", fill=T)[2,],use.names=F) testcls <- unlist(read.table("data/Leuk_ALL_AML.test.cls", fill=T)[2,],use.names=F) train <- cbind(train, class=traincls) test <- cbind(test, class=testcls) cat("Filtering training set...\n") train.f <- filtgcor(train) cat("Repeating Original Classification Task:\n---------------------------------\n") cat("*Fuzzy Tree:\n") tf <- tcl(train.f, t.nlev=3) cat(tf()) cat(" Performance on Training Set:", ci.eval(tf, train.f), "\n") cat(" Performance on Test Set:", ci.eval(tf, test), "\n") cat(" Plotting calibration plot for Fuzzy tree on test data.\n") calp(pmem(tf(test))[,2], test[,ncol(test)], do.plot=T, main="Calibration: Fuzzy Tree on Test Data", type="l", xlab="Prediction", ylab="Actual") cat("\n*Neural Network:\n") nf <- nrc(train.f) cat(" Performance on Training Set:", ci.eval(nf, train.f), "\n") cat(" Performance on Test Set:", ci.eval(nf, test[,colnames(train.f)]), "\n") cat("Doing 8 fold CV and 5x2 CV performance comparison:\n----------------------------------\n") res <- myexperiment(rbind(train, test)) cat("\n\n8 fold CV T test:\n") print(res$tt) cat(" Plotting boxplots of performances for the two classifiers...\n") boxplot(res$tcis, res$lcis, names=c("Fuzzy Tree", "Neural Network"), ylab="C-Index", main="8-CV Performance") cat("\n5x2CV Results:\n") print(res$m) cat("5x2CV mean CI difference:", mean(res$m[,5:6]), "\n") cat("5x2CV F test P-value:", res$p, "\n") cat("The probability that any observed difference in performance is by chance is:", 1 - res$p, "\n") cat("That's all folks...\n")