########################################################################### # Statistics for Microarray Analysis (sma) # (For version 0.5) # # Date : January, 2001 # # Here is a sample script to demonstrate how the functions in the # R package "Statistics for Microarray Analysis" (SMA) can be # used. The functions are applied to data from 6 different # hybridizations, 3 control and 3 treatment hybridizations. The sample # data are stored in 6 different Spot output files: sample.c1, sample.c2, # sample.c3, sample.t1, sample.t2, and sample.t3. # ############################################################################ # Invoking R by "R --vsize=20M --nsize=2000k" # See help(Memory) for information on vsize and nsize. ########################################################################### # Loading the library sma library(sma) # Starting the hypertext (currently HTML) version of R's online documentation. help.start() # View the help file for the function init.grid ? init.grid # or help(init.grid) ########################################################################### # Reading the data into R ########################################################################### ## If the data are not in R yet, you need to read them in using the ## read.table() function. The data matrix for each slide should contain ## at least 4 columns, corresponding to the red and green raw spot intensities ## and the red and green spot background intensities. ## If you are using Spot the data are probably in the system already. ## If you are using the "sma" library, the data are included in library, ## you can load it by data(MouseArray) ## Otherwise, you can untar the sample data files (from the web) and import ## the data using mouse1 <- read.spot("sample.c1") mouse2 <- read.spot("sample.c2") mouse3 <- read.spot("sample.c3") mouse4 <- read.spot("sample.t1") mouse5 <- read.spot("sample.t2") mouse6 <- read.spot("sample.t3") ## where sample.c1 etc. are Spot output files containing the 4 columns just ## described (as well as a few other columns). ########################################################################## # Initialization: slide layout and data format ########################################################################## ## 1. Slide layout ## This can be done two ways, you can preset the layout or define it ## interactively. ## First method: Pre-setting the layout of the microarray slides. sample.setup <- list(nspot.r = 19, # Number of rows of spots per grid nspot.c = 21, # Number of columns of spots per grid ngrid.r = 4, # Number of rows of grids per image ngrid.c = 4, # Number of columns of grids per image ) ## Second method: Interactive definition sample.setup <- init.grid() ## 2. Keeping track of your experiment files in R ## Here is another interactive function to let you create a table ## keeping track of the name of your experimental files and their ## corresponding name in R. After you have untar the data files (from ## the web) and import the data using "read.spot". Run init.name.exp() ## Here is what you should see: (and enter) # Are you creating a new batch.exp file or adding new data names # to a prexisting batch.exp file? # Enter "n" for creating and "a" for adding new data names: n # Enter the batch name for the new .exp file: sample # Enter the number of names of files to be entered: 6 # # Enter the R name of your 1 th dataset:mouse1 # # Enter the actual file name including the full path name for mouse1 ?sample.c1 # # Enter the R name of your 2 th dataset:mouse2 # # Enter the actual file name including the full path name for mouse2 ?sample.c2 # # Enter the R name of your 3 th dataset:mouse3 # # Enter the actual file name including the full path name for mouse3 ?sample.c3 # # Enter the R name of your 4 th dataset:mouse4 # # Enter the actual file name including the full path name for mouse4 ?sample.t1 # # Enter the R name of your 5 th dataset:mouse5 # # Enter the actual file name including the full path name for mouse5 ?sample.t2 # # Enter the R name of your 6 th dataset:mouse6 # # Enter the actual file name including the full path name for mouse6 ?sample.t3 #Finished adding names to .exp file. #NULL ## 2a To view: init.show.exp("sample") ## 3. Creation of data matrices ## This can be done interactively and the process can be simplified if the ## batch of experiments that you are analyzing have the same name structure. ## For example, in the sample data, the name of the R objects for each slide ## consists of a prefix, "mouse", followed by an integer suffix. ## Interactively putting all the data into one matrix: sample.data <- init.data() ## Here is what you should see: #> sample.data <- init.data() #Are you creating a new data matrix or adding new array data #to a prexisting data matrix? #Enter "n" for creating and "a" for adding new array data: n #Do the names of all your datasets have the following format: #prefix1, prefix2, prefix3?, ... Here prefix can be any name, #but the suffixes must be integers 1,2, ..., # of arrays. #Enter "y" for yes, "n" for no: y #Enter the prefix:mouse #Enter the number of arrays to be processed:6 #Enter the name of Cy3 raw data: Gmean #Enter the name of Cy3 background: morphG #Enter the name of Cy5 raw data: Rmean #Enter the name of Cy5 background: morphR #Finished creating new dataset. ## If later on, you would like to add a data set, run interactively sample.data <- init.data() ########################################################################## # Single Slide Analysis (Mostly visual) ########################################################################## ## Exploratory plots (How good are your data?) ## 1. Checking for correlations between spot background and background ## corrected signal intensities, one channel at a time ## For the red channel of the first experiment. plot.svb(sample.data, channel="R", image.id=1) ## For the green channel of the third experiment. plot.svb(sample.data, channel="G", image.id=3) ## 2. M vs. A plot plot.mva(sample.data, sample.setup, norm="p", image.id=3, col.ex="blue") plot.mva(sample.data, sample.setup, norm="p", image.id=5, nclass=1) ## or have a look at them all together for(i in 1:3) plot.mva(sample.data, sample.setup, norm="p", image.id=i) ## 3. Looking for spatial effects on the log ratios sample.lratio <- stat.ma(sample.data, sample.setup) ## To look at the first experiment. plot.spatial(sample.lratio$M[,1], sample.setup, col=rgcolors.func(20)) ## default 85% cutoff ## To look at the third experiment. plot.spatial(sample.lratio$M[,3], sample.setup, crit1= 50, col=rgcolors.func(20)) ## 4. Summary statistics for the log ratios M and absolute intensities A ## using standard R function summary() ## One column at a time (the log ratio M of the first experiment) summary(sample.lratio$M[,1]) ## One column at a time (the absolute intensities A of the first experiment) summary(sample.lratio$A[,1]) ## all columns simultaneously using the R function apply() apply(sample.lratio$M, 2, summary) apply(sample.lratio$A, 2, summary) ## Histogram of the log ratios M2 for the second experiment hist(sample.lratio$M[,2], nclass=100, xlab="M", main="Histogram") hist(sample.lratio$M[,2], nclass=100, xlab="M", main="Histogram",col=2) ## Methods based on three different papers: ## 1. Newton etal ## http://www.stat.wisc.edu/~newton/papers/abstracts/btr139a.html stat.Newton(mouse.data, mouse.setup, image.id=4) ## 2. Chen etal, Ratio-based decisions and the quantitative analysis ## of cDNA microarray images. Journal of Biomedical Optics , Volume ## 2 (4), 364-374, 1997. (1997) stat.Chen(mouse.data, mouse.setup, image.id=4) ## 3. Sapir and Churchill(2000), Estimating the posterior probability ## of differential gene expression from microarray ## data. http://www.jax.org/re search/churchill/ stat.ChurSap(mouse.data, mouse.setup, image.id=4) ########################################################################## # Multiple Slide Analysis (Mostly Visual) ########################################################################## ## 1. Normalization and calculation of log ratios M and overall intensities A. ## Reset the argument norm for different normalization methods. sample.lratio <-stat.ma(sample.data, sample.setup) ## 2. Two-sample t-statistics ## Defining the class labels (ctl and trt) for the slides. ## E.g. The first 3 sample experiments are from the control group (1) and ## the last 3 are from the treatment group (2). cl <- c(1,1,1,2,2,2) ## or alternatively cl <- c(rep(1, 3), rep(2,3)) ## Calculation of t-statistics sample.tstat <- stat.t2(sample.lratio, cl) ## Finding the names of the 10 genes, say, with the largest absolute ## t-statistics. You need to read in the names of the genes using the R ## function read.table(), note that they must be in the same order as in ## the data file! mouse.gnames <- read.table("sample.gnames", sep="\t", header=T, quote="") stat.gnames(abs(sample.tstat$t),mouse.gnames,crit=10) ## 3. Diagnostic plots ## Q-Q plot and histogram plot.qq(sample.tstat$t, "Sample") ## t, t-numerator, t-denominator vs. average absolute intensities plot.t2(sample.tstat, "Sample") ## Spatial distribution (on the slide) for the test statistics plot.spatial(sample.tstat$t,sample.setup,crit1=10,col=rgcolors.func(20))