########################################################################### # Statistics for Microarray Analysis (sma) # Discriminant analysis # # Date : August 10, 2000 # Last update: September 28, 2000 # # The scripts in this file demonstrate how the functions in the sma library # can be used in discriminant analysis. The functions are applied to # the Golub et al. data available at # http://waldo.wi.mit.edu/MPR/data_set_ALL_AML.html # You can either simply "source" the functions in Rarray.R and Rdisc.R # or load the library sma. Working with the library sma offers several # advantages, including the availability of help files. # ############################################################################ # Invoking R by "R --vsize=40M --nsize=2000k" # See help(Memory) for information on vsize and nsize. ########################################################################## # Sourcing the sma functions, if you are not using the library sma source("Rarray.R") source("Rdisc.R") ########################################################################### # Loading the library sma library(sma) # Starting the hypertext (currently HTML) version of R's online documentation. help.start() # View the help file for the function plot.cor ? plot.cor # or help(plot.cor) ########################################################################### # Reading in the Golub et al. data # Learning set (LS) golub1<-read.table("data_set_ALL_AML_train.txt",sep="\t",quote="",header=T,row.names=NULL) #g1<-cbind(dimnames(golub1)[[1]],as.character(golub1[,1])) lab.golub1<-c(rep(1,27),rep(2,11))-1 golub1<-golub1[,1+2*(1:38)] golub1<-golub1[,c(1:27,33:38,28:32)] # Test set (TS) golub2<-read.table("data_set_ALL_AML_independent.txt",sep="\t",quote="",header=T,row.names=NULL) #g2<-cbind(dimnames(golub2)[[1]],as.character(golub2[,1])) lab.golub2<-c(rep(1,11),rep(2,5),rep(1,2),rep(2,2),rep(1,1),rep(2,7),rep(1,6))-1 golub2<-golub2[,1+2*(1:34)] golub2<-golub2[,c(1:2,7,3,8:11,4:6,24,23,21:22,25,18:19,26:27,20,28:29,34,32: 33,30:31,17,15:16,12:14)] # Combine LS and TS: 7129x72 golub.data<-as.matrix(cbind(golub1,golub2)) golub.resp<-c(lab.golub1,lab.golub2) rm(golub1) rm(golub2) # Floor & ceiling golub.data[golub.data<100]<-100 golub.data[golub.data>16000]<-16000 # Preliminary selection of genes: 3571x72 tmp1<-apply(golub.data,1,max) tmp2<-apply(golub.data,1,min) which1<-(1:7129)[(tmp1/tmp2)>5] which2<-(1:7129)[(tmp1-tmp2)>500] which<-intersect(which1,which2) golub.data<-golub.data[which,] # Log_10 transformation and standardization of arrays golub.data.std<-log(golub.data,10) golub.data.std<-t(scale(golub.data.std,T,T)) # Three-class analysis: ALL B-cell/ALL T-cell/AML golub.resp2<-c(0,1,1,0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,0,0,2,2,0,2,2,2,2,2,2,2,1,0,0,0,0,0) ########################################################################### # Identifying differentially expressed genes using BSS/WSS and t # For k=2 classes (AML vs. ALL) # BSS/WSS and t with equal variances are the same: BSS/WSS = t^2/(n-2) tt<-stat.t2(t(golub.data.std),golub.resp+1,x.ratio=T) bw<-stat.bwss(t(golub.data.std),golub.resp) # Normal Q-Q plot for t qqnorm(tt$t,pch=".") #qqline(tt$t,col=2) plot.qqline(qnorm(tt$t),tt$t,a=0.1,pch=16,col=2) # Chisq Q-Q plot for bw qchisq<-rchisq(length(bw$bw),1) qqplot(qchisq,bw$bw*70,pch=".",xlab="Theoretical quantiles",ylab="Sample quantiles",main="Chi^2(1) Q-Q plot") plot.qqline(qchisq,bw$bw*70,a=0.1,col=2,pch=16) # Plot of BSS vs. WSS plot(bw$wss,bw$bss,pch=".") which<-bw$bw>quantile(bw$bw,0.99) points(bw$wss[which],bw$bss[which],col=2,pch=16) # Image of data matrix for top 1\% tmp<-t(golub.data.std)[which,] labs<-c(rep("ALL-B",38),rep("ALL-T",9),rep("AML",25)) labcols<-rep(c("blue","purple","orange"),table(labs)) plot.mat(tmp[,order(golub.resp)],nrgcols=50,rlabels=F,clabels=labs,labcols=labcols,xlab="",ylab="") ######################### # For k=3 classes (ALL, AML-B, AML-T) # BSS/WSS and t with equal variances are the same: BSS/WSS = t^2/(n-2) bw2<-stat.bwss(t(golub.data.std),golub.resp2) # Chisq Q-Q plot for bw qchisq<-rchisq(length(bw2$bw),2) qqplot(qchisq,bw2$bw*69,pch=".",xlab="Theoretical quantiles",ylab="Sample quantiles",main="Chi^2(2) Q-Q plot") plot.qqline(qchisq,bw2$bw*69,a=0.1,col=2,pch=16) # Plot of BSS vs. WSS plot(bw2$wss,bw2$bss,pch=".") which<-bw2$bw>quantile(bw2$bw,0.99) points(bw2$wss[which],bw2$bss[which],col=2,pch=16) ########################################################################### # Image of correlation matrices labs<-c(rep("ALL-B",38),rep("ALL-T",9),rep("AML",25)) labcols<-rep(c("blue","purple","orange"),table(labs)) # All genes cor.all<-cor(t(golub.data.std)) plot.cor(cor.all[order(golub.resp2),order(golub.resp2)],labels=labs,labcols=labcols,title="Golub et al. data",xlab="",ylab="") # To save as a postscript file: postscript("tmp.ps",width=7,height=7) plot.cor(cor.all[order(golub.resp2),order(golub.resp2)],labels=labs,labcols=labcols,title="Golub et al. data",xlab="",ylab="") dev.off() # Top 40 genes cor.40<-cor(t(golub.data.std[,rev(order(bw2$bw))[1:40]])) plot.cor(cor.40[order(golub.resp2),order(golub.resp2)],labels=labs,labcols=labcols,title="Golub et al. data",xlab="",ylab="") ########################################################################### # BSS/WSS and DLDA bw.ls<-stat.bwss(t(golub.data.std[1:38,]),golub.resp[1:38])$bw top.40<-rev(order(bw.ls))[1:40] ls<-golub.data.std[1:38,top.40] ts<-golub.data.std[-(1:38),top.40] stat.diag.da(ls,golub.resp[1:38],ts,1)