#######################################
# UNIT 1: UNIX
#######################################

#######################################
# 3: Files and directories
#######################################

# logging in and copying to a remote machine
ssh -X bilbo
ssh -X paciorek@bilbo.berkeley.edu

pwd
ls
cd /accounts/gen/vis/paciorek/teaching/243
cd ~paciorek/teaching/243
cd ~/teaching/243
cd 243   # provided I am in 'teaching'
cd ..
cd -

cp
cp -r
cp -rp   # preserves timestamps and other metainfo (VERY handy for tracing your workflows if you move files between machines)
mkdir
rm
rm -r    # recursively remove a whole directory structure
rm -rf   # CAREFUL! In particular, make sure you know what directory you are in

# file transfer between machines
scp file.txt paciorek@bilbo.berkeley.edu:~/research/.
scp paciorek@bilbo.berkeley.edu:/data/file.txt ~/research/renamed.txt

# changing permissions on a file
chmod ugo+x myProg        # myProg should be compiled code or a shell script
chmod ugo+rw code.q
chmod go-w myThesisCode.q

# compression and archiving
zip files.zip a.txt b.txt c.txt
gzip a.txt b.txt c.txt    # will create a.txt.gz, b.txt.gz, c.txt.gz
tar -cvf files.tar myDirectory
tar -cvzf files.tgz myDirectory
tar -xvf files.tar
tar -xvzf files.tgz
gzip -cd file.gz | less
zcat file.zip | less      # note: zcat handles zip archives only if they contain a single file

##########################################
# 4: A variety of UNIX tools/capabilities
##########################################

man cp
which R
which matlab
which maple

emacs -nw file.txt   # let's play around with some of the keystrokes in emacs

cat /proc/meminfo
cat /proc/cpuinfo
cat /etc/issue
grep processor /proc/cpuinfo

##########################################
# UNIT 2: The bash Shell
##########################################

################
# 1 Shell basics
################

echo $SHELL
tcsh
exit
which bash

# create a shell script and try to execute it; the first line tells the
# operating system what shell to use to run the script
#! /bin/bash
# ls -al *pdf > myPdfs.txt
# we might need to change the permissions (recall from Unit 1)

#####################
# 3 Command history
#####################

!ls
!-1
!-1:p

#####################
# 4 Utilities
#####################

cd research/fusion/hei/write/revisedReport
grep pdf *q

cd ~/Desktop/243
# in R: for(i in 1:10000) write(mean(rpois(100, 1)), file = 'clt.txt', append = TRUE)
tail -f clt.txt

grep com websites.txt   # any problems? can we be a bit smarter?
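# one possible refinement (a sketch): escape the dot and require a word
# boundary, so we match '.com' as a domain suffix rather than the letters
# 'com' appearing anywhere in a line
grep '\.com' websites.txt
grep -i '\.com\>' websites.txt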
#####################
# 5 Redirection
#####################

ls | head -5

cd ~/Desktop/243
cut -d',' -f2 mileage2009.csv | sort | uniq | wc
cut -d',' -f2 mileage2009.csv | sort | uniq | nl

# you won't be able to replicate this as it uses files on my desktop
cut -b1,2,3,4,5,6,7,8,9,10,11,29,37,45,53,61,69,77,85,93,101,109,117,125,133,141,149,157,165,173,181,189,197,205,213,221,229,237,245,253,261,269 AE000041196.dly | grep "M" | more
cut -b1,2,3,4,5,6,7,8,9,10,11,29,37,45,53,61,69,77,85,93,101,109,117,125,133,141,149,157,165,173,181,189,197,205,213,221,229,237,245,253,261,269 USC*.dly | grep "M" | more

# you won't be able to replicate this as it uses files in my home directory
cd ~/research/fusion/hei/write/finalReport
ls -t *.{R,r,q} | head -4
grep pdf `ls -t *.{R,r,q} | head -4`

files=$(ls)
echo $files

####################
# 6 Wildcards
####################

cd ~/Desktop/243
ls *{pdf,R}

# in my home directory
cd ~/teaching/243/lectures
ls *[0-9]*
ls *[!~#]   # don't show automatically-generated backup files

echo cp filename{,old}

#####################
# 7 Job control
#####################

R
# i = 0; while(i < 1) print(i)
# C-c interrupts; C-\ kills the session

R --no-save < code.q >& code.Rout &   # let's parse what this will do

ssh arwen
ps -aux | grep R
# in R:
# for(i in 1:1000){
#   x = matrix(rnorm(5000*5000), nr = 5000, nc = 5000)
#   y = crossprod(x)
# }
nice -19 R CMD BATCH file.r file.Rout
# monitor with top and watch CPU and memory use
# notice the priority is 39 = 20 + 19

########################
# 8 Aliases
########################

alias ls="ls -F"
ls
\ls

# here are some aliases in my .bashrc
alias q="exit"
alias tf="tail -f"
alias m="less"
alias res="cd ~/research"
alias todo="emacs ~/todo &"
alias r="R --no-save"
alias myjobs="ps -eafl | grep paciorek"
alias scf="ssh -X legolas.berkeley.edu"

#########################
# 9 Shell variables
#########################

name="chris"
echo $name

env
echo $HOSTNAME
echo $HOME

cd lectures
export CDPATH=.:~/research:~/teaching:~/teaching/243
cd lectures

# I put the following in my .bashrc
export PS1="\u@\h:\w> "

#########################
# 10 Functions
#########################

function putscf() {
  scp $1 paciorek@bilbo.berkeley.edu:~/$2
}
putscf file.txt teaching/243/lectures/garbage.txt

# a few functions from my .bashrc
function mounts(){   # remotely mount filesystems I have access to
  sshfs carver.nersc.gov: /accounts/gen/vis/paciorek/nersc
  sshfs bilbo.berkeley.edu: /accounts/gen/vis/paciorek/scf
}
function a(){
  acroread $1 &
}
function putweb() {
  scp $1 paciorek@bilbo.berkeley.edu:/mirror/data/pub/users/paciorek/$2
}
function e() {
  emacs $1 &
}
function enw() {
  emacs -nw $1
}
function l2h(){
  latex2html $1.tex -local_icons -long_titles 5
}

#########################
# 11 If/then/else
#########################

# niceR shortcut for nicing R jobs
# usage: niceR inputRfile outputRfile
# Author: Brian Caffo (Johns Hopkins Biostat)
# Date: 10/01/03
function niceR(){
  # submits nice'd R jobs
  # syntax of a function call: niceR file.r Rout
  if [ $# != "2" ]
  then
    echo "usage: niceR inputRfile outputfile"
  elif [ -e "$2" ]
  then
    echo "$2 exists, I won't overwrite"
  elif [ ! -e "$1" ]
  then
    echo "inputRfile $1 does not exist"
  else
    echo "running R on $1"
    nice -n 19 R --no-save < $1 &> $2
  fi
}
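# a quick sanity check of niceR's argument handling (the file names here are
# hypothetical):
niceR                    # wrong number of arguments, so it prints the usage message
niceR code.q code.Rout   # runs R on code.q at niceness 19, output to code.Rout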
-e "$1" ] then echo "inputRfile $1 does not exist" else echo "running R on $1" nice -n 19 R --no-save < $1 &> $2 fi } ######################### # 12 For loops ######################### for file in $(ls *txt) do mv $file ${file/.txt/.q} # this syntax replaces .txt with .q in $file done # example of bash for loop and wget for downloading a collection of files on the web IFS=: # internal field separator mths=jan:feb:mar:apr # alternatively I could do mths="jan feb mar apr" and not set IFS for ((yr=1910; yr<=1920; yr++)) do for mth in $mths do wget ftp://ftp3.ncdc.noaa.gov/pub/data/3200/${yr}/3200${mth}${yr} done done # if I want to do some post-processing, do the following instead IFS=: # internal field separator mths=jan:feb:mar:apr for ((yr=1910; yr<=1920; yr++)) do for mth in $mths do wget ftp://ftp3.ncdc.noaa.gov/pub/data/3200/${yr}/3200${mth}${yr} grep PRCP 3200${mth}${yr} >> master${mth} # what does this do? rm 3200${mth}${yr} # clean up extraneous files done done # example of bash for loop for starting jobs n=100 # if I want to be able to vary n from outside the R program for(( it=1; it<=100; it++)); do echo "n=$n; it=$it; source('base.q')" > tmp-$n-$it.q R CMD BATCH --no-save tmp-$n-$it.q sim-n$n-it$it.Rout& done # note that base.q should NOT set either 'n' or 'it', but should make use of them, including when creating unique names for output files for each iteration # example of bash for loop and wget for downloading a collection of files on the web # usage: ./forloopDownload.sh # Author: Chris Paciorek # Date: July 28, 2011 IFS=: # internal field separator mths=jan:feb:mar:apr # I don't know how to specify a vector of characters in bash, so I'll get around this using the : as a delimiter and the bash for loop will automatically do the looping over the elements for ((yr=1910; yr<=1920; yr++)) do for mth in $mths do wget ftp://ftp3.ncdc.noaa.gov/pub/data/3200/${yr}/3200${mth}${yr} done done # if I want to do some post-processing, do the following instead IFS=: # internal field separator mths=jan:feb:mar:apr # I don't know how to specify a vector of characters in bash, so I'll get around this using the : as a delimiter and the bash for loop will automatically do the looping over the elements for ((yr=1910; yr<=1920; yr++)) do for mth in $mths do wget ftp://ftp3.ncdc.noaa.gov/pub/data/3200/${yr}/3200${mth}${yr} grep PRCP 3200${mth}${yr} >> master${mth} # what does this do? rm 3200${mth}${yr} # clean up extraneous files done done # example of bash for loop for starting jobs # usage: ./forloopJobs.sh # Author: Chris Paciorek # Date: July 28, 2011 n=100 # if I want to be able to vary n from outside the R program for(( it=1; it<=100; it++)); do echo "n=$n; it=$it; source('base.q')" > tmp-$n-$it.q R CMD BATCH --no-save tmp-$n-$it.q sim-n$n-it$it.Rout done # note that base.q should NOT set either 'n' or 'it'# niceR shortcut for nicing R jobs # usage: niceR inputRfile outputRfile # Author: Brian Caffo # Date: 10/01/03 function niceR(){ # submits nice'd R jobs if [ $# != "2" ] then echo "usage: niceR inputRfile outputfile" elif [ -e "$2" ] then echo "$2 exists, I won't overwrite" elif [ ! 
-e "$1" ] then echo "inputRfile $1 does not exist" else echo "running R on $1" nice -n 19 R --no-save < $1 &> $2 fi } ################################################## ### Demo code for Unit 3 of Stat243, "Using R" ### Chris Paciorek, September 2012 ################################################## ############################# # 1: Some basic functionality ############################# system("ls -al") # knitr/Sweave doesn't seem to show the output of system() files <- system("ls", intern = TRUE) files[1:5] file.exists("file.txt") list.files("~/research") Sys.info() # options() # this would print out a long list of options options()[1:5] options()[c('width', 'digits')] options(width = 120) # often nice to have more characters on screen options(width = 60) # for purpose of making the pdf of this document options(max.print = 5000) options(digits = 3) a <- 0.123456; b <- 0.1234561 a; b; a == b ############################# # 2: Packages ############################# library(fields) library(help = fields) # library() # I don't want to run this on SCF because so many are installed install.packages('fields', lib = '~/Rlibs') # ~/Rlibs needs to exist! search() # ls(pos = 7) # for the graphics package ls(pos = 7)[1:5] # just show the first few ############################# # 3: Objects ############################# # 3.1 Classes of objects vec = c("Sam", "0923", "Sam9", "Sam is", "Sam\t9", "Sam's the man.\nNo doubt.\n") cat(vec[5]) cat(vec[6]) print(vec[5]) print(vec[6]) # 3.2 Assignment and coercion out = mean(rnorm(7)) # OK system.time(out = rnorm(10000)) # NOT OK, system.time expects its argument to be a complete R expression system.time(out <- rnorm(10000)) mean x <- 0; y <- 0 out <- mean(x = c(3,7)) # the usual way to pass an argument to a function out <- mean(x <- c(3,7)) # this is allowable, though perhaps not useful; what does it do? out <- mean(y = c(3,7)) out <- mean(y <- c(3,7)) mat <- matrix(c(1, NA, 2, 3), nrow = 2, ncol = 2) apply(mat, 1, sum.isna <- function(vec) {return(sum(is.na(vec)))}) apply(mat, 1, sum.isna = function(vec) {return(sum(is.na(vec)))}) # NOPE vals <- c(1, 2, 3) class(vals) vals <- 1:3 class(vals) vals <- c(1L, 2L, 3L) vals class(vals) as.character(c(1,2,3)) as.numeric(c("1", "2.73")) as.factor(c("a", "b", "c")) x <- rnorm(5) x[3] <- 'hat' # What do you think is going to happen? indices = c(1, 2.73) myVec = 1:10 myVec[indices] # 3.3 Type vs. 
# 3.3 Type vs. class
a <- data.frame(x = 1:2)
class(a)
typeof(a)
m <- matrix(1:4, nrow = 2)
class(m)
typeof(m)
me <- list(firstname = 'Chris', surname = 'Paciorek')
class(me) <- 'personClass'   # it turns out R already has a 'person' class defined
class(me)
is.list(me)
typeof(me)
typeof(me$firstname)

# 3.4 Information about objects
is(me, 'personClass')
str(me)
attributes(me)
mat <- matrix(1:4, 2)
class(mat)
typeof(mat)
length(mat)   # recall that a matrix can be thought of as a vector with dimensions
attributes(mat)
dim(mat)
x <- rnorm(10 * 365)
qs <- quantile(x, c(.025, .975))
qs
qs[1] + 3
names(qs) <- NULL
qs
row.names(mtcars)[1:6]
names(mtcars)
mat <- data.frame(x = 1:2, y = 3:4)
row.names(mat) <- c("first", "second")
mat
vec <- c(first = 7, second = 1, third = 5)
vec['first']

# 3.5 The workspace
objects()   # what objects are in my workspace
identical(ls(), objects())   # synonymous
dat <- 7; dat2 <- 9; subdat <- 3; obj <- 5
objects(pattern = "^dat")
rm(dat2, subdat)
rm(list = c("dat2", "subdat"))   # a bit confusing - the 'list' argument should be a character vector
rm(list = ls(pattern = "^dat"))
exists('dat')   # can be helpful when programming
dat <- rnorm(500000)
object.size(dat)
print(object.size(dat), units = "Mb")   # this seems pretty clunky!
rm(list = ls())   # what does this do?

# 3.6 Missing values
3 / Inf
3 + Inf
0/0
x <- c(3, NA, 5, NaN)
is.na(x)
x == NA
x[is.na(x)] <- 0
x

# 3.7 Some other details
rnorm(10)
# .Last.value   # this should return the 10 random normals but knitr is messing things up, so commented out here
x <- 1e5
log10(x)
y <- 100000
x <- 1e-8
?lm   # or help(lm)
help.search('lm')
apropos('lm')
help('[[')   # operators are functions too
args(lm)
ch1 <- "Chris's\n"
ch2 <- 'He said, "hello."\n'
ch3 <- "He said, \"hello.\"\n"

##################################
# 4: Working with data structures
##################################

# 4.1 Lists and dataframes
x <- list(a = 1:2, b = 3:4, sam = rnorm(4))
x[[2]]       # extracts the indicated component, which can be anything, in this case just an integer vector
x[c(1, 3)]   # extracts sublists, which, since x is a list, will also be a list
lapply(x, length)
sapply(x, length)   # returns things in a user-friendly way
apply(CO2, 2, class)   # hmmm
sapply(CO2, class)
params <- list(a = list(mn = 7, sd = 3), b = list(mn = 6, sd = 1),
               c = list(mn = 2, sd = 1))
sapply(params, '[[', 1)
unlist(x)
tapply(mtcars$mpg, mtcars$cyl, mean)
tapply(mtcars$mpg, list(mtcars$cyl, mtcars$gear), mean)
aggregate(mtcars, list(cyl = mtcars$cyl), mean)   # applies the function to each column of mtcars, split by the 'by' argument
by(warpbreaks, warpbreaks$tension, function(x) {lm(breaks ~ wool, data = x)})
split(mtcars, mtcars$cyl)
myList <- list(a = 1:3, b = 11:13, c = 21:23)
rbind(myList$a, myList$b, myList$c)
rbind(myList)
do.call(rbind, myList)
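# do.call constructs and evaluates a single call with the list elements as its
# arguments, which is handy when the number of elements isn't fixed in advance;
# a small sketch with simulated data (not from the original notes):
sims <- lapply(1:4, function(i) rnorm(3))
do.call(rbind, sims)   # a 4 x 3 matrix, without writing out each element by hand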
# 4.2 Vectors and matrices
mat <- matrix(rnorm(500), nr = 50)
identical(mat[1:50], mat[ , 1])
identical(mat[1:10], mat[1, ])
vec <- c(mat)
mat2 <- matrix(vec, nr = 50)
identical(mat, mat2)
matrix(1:4, 2, byrow = TRUE)
mat <- matrix(1:8, 2)
mat[1, ] <- c(1, 2)
mat
mat[1, ] <- 1:3
rep(seq(1, 9, by = 2), each = 2)
rep(seq(1, 9, by = 2), times = 2)     # repeats the whole vector 'times' times
rep(seq(1, 9, by = 2), times = 1:5)   # repeats each element based on 'times'
expand.grid(x = 1:2, y = -1:1, theta = c("lo", "hi"))
x <- c(1, 10, 2, 9, 3, 8)
which(x < 3)
set.seed(0)
vec <- rnorm(8); mat <- matrix(rnorm(9), 3)
vec; mat
vec[vec < 0]
vec[vec < 0] <- 0
vec
mat[mat[ , 1] < 0, ]      # similarly for data frames
mat[mat[ , 1] < 0, 2:3]   # similarly for data frames
mat[ , mat[1, ] < 0]
mat[mat[ , 1] < 0, 2:3] <- 0
set.seed(0)   # so we get the same vec as we had before
vec <- rnorm(8)
wh <- which(vec < 0)
logicals <- vec < 0
logicals; wh
identical(vec[wh], vec[logicals])
vec <- c(1L, 2L, 1L)
is.integer(vec)
vec[vec == 1L]   # in general, not safe with numeric vectors
vec[vec != 3L]   # nor is this

# subsetting based on a 2-column matrix of {row, column} indices
mat <- matrix(rnorm(25), 5)
rowInd <- c(1, 3, 5); colInd <- c(1, 1, 4)
mat[cbind(rowInd, colInd)]   # extracts the (1,1), (3,1) and (5,4) elements

x <- matrix(1:6, nr = 2)
x
apply(x, 1, min)   # by row
apply(x, 2, min)   # by column
x <- array(1:24, c(2, 3, 4))
apply(x, 2, min)        # same as: for(j in 1:3) print(min(x[ , j, ]))
apply(x, c(2, 3), min)  # same as: for(j in 1:3) {for(k in 1:4) print(min(x[ , j, k]))}
n <- 500000; nr <- 1000; nCalcs <- n/nr
mat <- matrix(rnorm(n), nr = nr)
times <- 1:nCalcs
system.time(out1 <- apply(mat, 1, function(vec) {
  mod = lm(vec ~ times); return(mod$coef[2])}))
system.time({out2 = rep(NA, nCalcs)
  for(i in 1:nCalcs) out2[i] = lm(mat[i, ] ~ times)$coef[2]})

# 4.3 Linear algebra
X <- matrix(rnorm(9), 3)
X
X[col(X) >= row(X)]
diag(X)
diag(X) <- 1
X
d <- diag(c(rep(1, 2), rep(2, 2)))
d
# define a few more objects so the following lines can be run
Y <- matrix(rnorm(9), 3); x <- rnorm(3); y <- rnorm(3)
X %*% Y   # matrix multiplication
X * Y     # direct product
x %o% y       # outer product of vectors x, y: x times t(y)
outer(x, y)   # same thing
outer(x, y, function(x, y) cos(y)/(1 + x^2))   # evaluation of f(x,y) for all pairs of x,y values
crossprod(X, Y)   # same as, but faster than, t(X) %*% Y!
solve(X)      # inverse of X
solve(X, y)   # (inverse of X) %*% y

##################################
# 5: Functions
##################################

# 5.1 Inputs
args(lm)
pplot <- function(x, y, pch = 16, cex = 0.4, ...) {
  plot(x, y, pch = pch, cex = cex, ...)
}
myFun <- function(...){
  print(..2)
  args <- list(...)
  print(args[[2]])
}
myFun(1, 3, 5, 7)
args(dgamma)
tmpf = function(x, shape, rate = 1, scale = 1/rate, log = FALSE){
  print(shape)
  print(rate)
  print(scale)
}
tmpf(1, 2, rate = 3)
tmpf(1, 2, scale = 5)
dgamma(1, 2, scale = 5)   # does it matter that the rate and scale are inconsistent?
dgamma   # why can't we use "rate = 1/scale, scale = 1/rate"? or can we?
mat <- matrix(1:9, 3)
apply(mat, 2, function(vec) vec - vec[1])
apply(mat, 1, function(vec) vec - vec[1])   # explain why the result is transposed
f <- function(x, y = 2, z = 3 / y) { x + y + z }
args(f)
formals(f)
class(formals(f))
match.call(mean, quote(mean(y, na.rm = TRUE)))   # what do you think quote does?
f <- function(){
  oldpar <- par()
  par(cex = 2)
  # body of code
  par(oldpar)
}

# 5.2 Outputs
f <- function(x){
  invisible(x^2)
}
f(3)
a <- f(3)
a
mod <- lm(mpg ~ cyl, data = mtcars)
class(mod)
is.list(mod)

# 5.3 Variable scope
x <- 3
f <- function() {x <- x^2; print(x)}
f()
x   # what do you expect?
f <- function() {assign('x', x^2, env = .GlobalEnv)}
# careful, this could be dangerous as a variable is changed as a side effect
x <- 3
f <- function() {
  f2 <- function() { print(x) }
  f2()
}
f()   # what will happen?
f <- function() {
  f2 <- function() { print(x) }
  x <- 7
  f2()
}
f()   # what will happen?
f2 <- function() print(x)
f <- function() {
  x <- 7
  f2()
}
f()   # what will happen?
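# For reference: R uses lexical scoping, so a function looks up variables it
# doesn't define in the environment where it was *defined*, not where it is
# called from. A minimal sketch of the distinction:
x <- 'global'
g <- function() print(x)
h <- function() { x <- 'local to h'; g() }
h()   # prints 'global': g was defined in the global environment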
y <- 100
f <- function(){
  y <- 10
  g <- function(x) x + y
  return(g)
}
h <- f()
h(3)
z <- 3
x <- 100
f <- function(x, y = x*3) {x + y}
f(z*5)

#############################################
# 6: Environments and frames
#############################################

# 6.1 Frames and the call stack
sys.nframe()
f <- function() cat('Frame number is ', sys.nframe(),
                    '; parent is ', sys.parent(), '.\n', sep = '')
f()
f2 <- function() f()
f2()

# exploring functions that give us information about the frames in the stack
g <- function(y) {
  gg <- function() {
    # this gives us the information from sys.calls(), sys.parents()
    # and sys.frames() as one object
    print(sys.status())
  }
  if(y > 0) g(y-1) else gg()
}
g(3)

# 6.2 Environments and the search path
search()
searchpaths()
x <- .GlobalEnv
parent.env(x)
while (environmentName(x) != environmentName(emptyenv())) {
  print(environmentName(parent.env(x)))
  x <- parent.env(x)
}
ls(pos = 7)[1:5]   # what does this do?
ls("package:graphics")[1:5]
environment(lm)
lm <- function() {return(NULL)}   # this seems dangerous but isn't
x <- 1:3; y <- rnorm(3)
mod <- lm(y ~ x)   # errors: our lm masks stats::lm for this call
mod <- get('lm', pos = 'package:stats')(y ~ x)
mod <- stats::lm(y ~ x)   # an alternative

# 6.3 with() and within()
with(mtcars, cyl*mpg)
new.mtcars <- within(mtcars, crazy <- cyl*mpg)
names(new.mtcars)

####################################################
# 7: Flow control and logical operations
####################################################

# 7.1 Logical operators
x <- rnorm(10)
x[x > 1 | x < -1]
x <- 1:10; y <- c(rep(10, 9), NA)
x > 5 | y > 5   # note that TRUE | NA evaluates to TRUE
a <- 7; b <- NULL
a < 8 | b > 3
a < 8 || b > 3
a <- c(0, 3); b <- c(4, 2)
if(a < 7 & b < 7) print('this is buggy code')
if(a < 7 && b < 7) print('this is buggy code too, but runs w/o warnings')
if(a[1] < 7 && b[1] < 7) print('this code is correct and the condition is TRUE')
a <- 7; b <- 5
!(a < 8 && b < 6)

# 7.2 If statements
x <- 5
if(x > 7){
  x <- x + 3
} else{
  x <- x - 3
}
if(x > 7) x <- x + 3 else x <- x - 3
x <- -3
if (x > 7){
  x <- x + 3
  print(x)
} else if (x > 4){
  x <- x + 1
  print(x)
} else if (x > 0){
  x <- x - 3
  print(x)
} else{
  x <- x - 7
  print(x)
}
if(x > 7) {
  statement1
}   # what happens at this point?
else{   # what happens now?
  statement2
}
x <- rnorm(6)
truncx <- ifelse(x > 0, x, 0)
truncx
vals <- c(1, 2, NA); eps <- 1e-9
# now pretend vals comes from some other chunk of code that we don't control
if(min(vals) > eps) { print(vals) }   # not good practice
minval <- min(vals)
if(!is.na(minval) && minval > eps) { print(vals) }   # better practice

# 7.3 switch()
x <- 2; y <- 10
switch(x, log(y), sqrt(y), y)
center <- function(x, type){
  switch(type,
         mean = mean(x),   # make sure to use = and not <-
         median = median(x),
         trimmed = mean(x, trim = .1))
}
x <- rgamma(100, 1)
center(x, 'median')
center(x, 'mean')

# 7.4 Loops
nIts <- 500; means <- rep(NA, nIts)
for(it in 1:nIts){
  means[it] <- mean(rnorm(100))
  if(identical(it %% 100, 0)) cat('Iteration', it, date(), '\n')
}
for(state in c('Ohio', 'Iowa', 'Georgia'))
  print(state.x77[row.names(state.x77) == state, "Income"])
for(i in 1:10) i
for(i in 1:10){
  if(i == 5) break
  print(i)
}
for(i in 1:5){
  if(i == 2) next
  print(i)
}
# while loop example: MLE for the zero-truncated Poisson,
# from p. 59 of Venables and Ripley, 4th edition
yp = rpois(50, lambda = 1)
y = yp[yp > 0]
ybar = mean(y)
lam = ybar
it = 0
del = 1
tol = 0.0001
while(abs(del) > tol && (it <- it + 1) < 10){   # let's parse this
  del = (lam - ybar * (1 - exp(-lam)))/(1 - ybar * exp(-lam))   # adjustment to parameter estimate
  lam = lam - del   # new parameter value
  cat(it, lam, "\n")
}
mat <- matrix(1:4, 2)
submat <- mat[mat[1, ] > 5]
for(i in 1:nrow(submat)) print(i)   # what goes wrong when submat is empty?

#####################
# 8: Formulas
#####################

resp <- 'y ~'
covTerms <- 'x1'
for(i in 2:5){
  covTerms <- paste(covTerms, '+ x', i, sep = '')
}
form <- as.formula(paste(resp, covTerms, sep = ''))
# lm(form, data = dat)
form
class(form)

resp <- 'y ~'
covTerms <- paste('x', 1:5, sep = '', collapse = ' + ')
form <- as.formula(paste(resp, covTerms))
form
# lm(form, data = dat)
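# a quick check that the constructed formula behaves like a typed one
# (a sketch with simulated data, since 'dat' is not defined in these notes):
dat <- data.frame(matrix(rnorm(60), ncol = 6))
names(dat) <- c('y', paste('x', 1:5, sep = ''))
coef(lm(form, data = dat))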
##############################
# 9: Data storage and formats
##############################

text <- "_Melhore sua seguran\xe7a_"
iconv(text, from = "latin1", to = "UTF-8")
iconv(text, from = "latin1", to = "ASCII", sub = "???")

###########################################
# 10: Reading data from text files into R
###########################################

?read.table
setwd('~/Desktop/243/data')
dat <- read.table('RTAData.csv', sep = ',', head = TRUE)
# this is data from a kaggle.com competition on road traffic on an Australian freeway
lapply(dat, class)
levels(dat[ , 2])
dat2 <- read.table('RTAData.csv', sep = ',', head = TRUE,
                   na.strings = c("NA", "x"), stringsAsFactors = FALSE)
unique(dat2[ , 2])   # hmmm, what happened to the blank values this time?
which(dat[ , 2] == "")
dat2[which(dat[ , 2] == "")[1], ]   # deconstruct it!
sequ <- read.table('hivSequ.csv', sep = ',', header = TRUE,
                   colClasses = c('integer', 'integer', 'character',
                                  'character', 'numeric', 'integer'))
# this is data from a kaggle.com competition on HIV sequences and health outcomes
# let's make sure the coercion worked - sometimes R is obstinate
lapply(sequ, class)   # this makes use of the fact that a data frame is a list

dat <- readLines('~/Desktop/243/data/precip.txt')
# this is daily precipitation amounts from US weather stations, Feb. 2010
id <- as.factor(substring(dat, 4, 11))
year <- substring(dat, 17, 20)
class(year)
year <- as.integer(substring(dat, 18, 21))
month <- as.integer(substring(dat, 22, 23))
nvalues <- as.integer(substring(dat, 28, 30))

dat <- readLines(pipe("ls -al"))
dat <- read.table(pipe("unzip dat.zip"))
dat <- read.csv(gzfile("dat.csv.gz"))
dat <- readLines("http://www.stat.berkeley.edu/~paciorek/index.html")
con <- file("~/Desktop/243/data/precip.txt", "r")
# "r" for 'read' - you can also open files for writing with "w" (or "a" for appending)
class(con)
blockSize <- 1000   # obviously this would be larger in any real application
nLines <- 300000
for(i in 1:ceiling(nLines / blockSize)){
  lines <- readLines(con, n = blockSize)
  # manipulate the lines and store the key stuff
}
close(con)
dat <- readLines('~/Desktop/243/data/precip.txt')
con <- textConnection(dat[1], "r")
read.fwf(con, c(3, 8, 4, 2, 4, 2))

##################################################
# 11: Text manipulations and regular expressions
##################################################

# 11.1 Basic text manipulation
out <- paste("My", "name", "is", "Chris", ".", sep = " ")
paste(c("My", "name", "is", "Chris", "."), collapse = " ")
strsplit(out, split = ' ')
vars <- c("P", "HCA24", "SOH02")
substring(vars, 2, 3)
vars <- c("date98", "size98", "x98weights98", "sdfsd")
grep("98", vars)
gregexpr("98", vars)
gsub("98", "04", vars)

# 11.2 Regular expressions (regexp)
addresses <- c("john@att.com", "stat243@bspace.berkeley.edu", "john_smith@att.com")
grep("[[:digit:]_]", addresses)
text <- c("john", "jennifer pierce", "Juan carlos rey")
grep("^[[:upper:]]", text)   # finds text that starts with an upper case letter
grep("[[:digit:]]$", text)   # finds text with a number at the end
grep("[ \t]", text)
gregexpr("[ \t]", text)
gsub("^j", "J", text)
text <- c("hi John", "V1@gra", "here's the problem set")
grep("[[:alpha:]]+[[:digit:][:punct:]]+[[:alpha:]]*", text)
# ok, we need to do a bit more work...
grep("([[:digit:]]{1,3}\\.){3}[[:digit:]]{1,3}", text)   # a pattern for IP addresses
text <- ('"H4NY07011","ACKERMAN, GARY L.","H","$13,242",,,')
gsub("([^\",]),", "\\1", text)
gregexpr("(http|ftp):\\/\\/", c("at the site http://www.ibm.com", "other text"))
text <- "Students may participate in an internship in place of one of their courses."
gsub("<.*>", "", text)
gregexpr("[[:space:]]+.+?[[:space:]]+", "the dog jumped over the blue moon")
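# an illustrative aside on greedy vs. non-greedy matching, since .+? appears
# just above ('html' is a made-up example string):
html <- "<b>bold</b> and <i>italic</i>"
gsub("<.*>", "", html)    # greedy: .* runs from the first '<' to the last '>'
gsub("<.*?>", "", html)   # non-greedy: each tag is matched and stripped separately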
line <- "a dog\tjumped\nover \tthe moon."
cat(line)
strsplit(line, split = "[[:space:]]+")
strsplit(line, split = "[[:blank:]]+")

##################################################
# 12: Manipulating dates
##################################################

date1 <- as.Date("03-01-2011", format = "%m-%d-%Y")
date2 <- as.Date("03/01/11", format = "%m/%d/%y")
date3 <- as.Date("05-May-11", format = "%d-%b-%y")
date1
date1 + 42
date3 - date1
julianValues <- c(1, 100, 1000)
as.Date(julianValues, origin = "1990-01-01")
library(chron)
d1 <- chron("12/25/2004", "11:37:59")   # default format of m/d/Y and h:m:s
d2 <- as.chron(date1)
d1; d2

##################################################
# 13: Database manipulations
##################################################

# 13.1 Databases
library(RSQLite)   # provides the SQLite driver used below
drv <- dbDriver("SQLite")
con <- dbConnect(drv, dbname = "myDatabase")   # so we're using a connection again
mars <- dbReadTable(con, "MarsData")
myQuery <- "SELECT City, State FROM Station WHERE Lat_N > 39.7"
marsSub <- dbGetQuery(con, myQuery)
dbDisconnect(con)   # disconnect once we're done querying

# 13.2 Dataset manipulations in R
sum(mtcars$cyl == 6)    # counts the number of TRUE values
mean(mtcars$cyl == 6)   # proportion of TRUE values
unique(mtcars$cyl)
length(unique(mtcars$cyl))
duplicated(mtcars$cyl)   # tells us which elements are repeated
table(mtcars$gear)   # tabulates # of records in each unique value of mtcars$gear
table(mtcars$cyl, mtcars$gear)   # cross-tabulation by two variables
table(mtcars[c("cyl", "vs", "am", "gear")])
ftable(mtcars[c("cyl", "vs", "am", "gear")])
mtcarsSorted <- mtcars[order(mtcars$cyl, mtcars$mpg), ]
head(mtcarsSorted)
load("~/Desktop/243/data/exampleMerge.RData")
fits2 <- merge(fits, master, by.x = 'stn', by.y = 'coop',
               all.x = TRUE, sort = FALSE)
identical(fits$stn, fits2$stn)   # hmmm...., ordering has not been preserved
which(fits2$stn != fits$stn)[1:5]
f.merge <- function(x, y, ...){
  # merges two datasets but preserves the ordering based on the first dataset
  x$ordering <- 1:nrow(x)
  tmp <- merge(x, y, ...)
  tmp <- tmp[order(tmp$ordering), ]
  # sort by the indexing (note that some values may be missing if
  # all.x = FALSE and some rows of x are not matched in y)
  return(tmp[ , !(names(tmp) == "ordering")])
}

load('~/Desktop/243/data/prec.RData')
prec <- prec[1:1000, ]   # just to make the example code run faster
precVars <- 5:ncol(prec)
precStacked <- stack(prec, select = precVars)
out <- unstack(precStacked)
# to use reshape, we need a unique id for each row since reshape
# considers each row in the wide format as a subject
prec <- cbind(unique = 1:nrow(prec), prec)
precVars <- precVars + 1
precLong <- reshape(prec, varying = names(prec)[precVars],
                    idvar = 'unique', direction = 'long', sep = '')
precLong <- precLong[!is.na(precLong$prec), ]
precWide <- reshape(precLong, v.names = 'prec', idvar = 'unique',
                    direction = 'wide', sep = '')

x <- rnorm(100)
f <- cut(x, breaks = c(-Inf, -1, 1, Inf), labels = c('low', 'medium', 'high'))
levels(f)   # note that f is not explicitly ordered
f <- relevel(f, 'high')   # puts high as the first level
f <- cut(x, breaks = c(-Inf, -1, 1, Inf), labels = c('low', 'medium', 'high'),
         ordered_result = TRUE)

###############################
# 14: Output from R
###############################

# 14.2 Printing to the screen or a file
temps <- c(12.5, 37.234324, 1342434324.79997234, 2.3456e-6, 1e10)
sprintf("%9.4f C", temps)
val <- 1.5
cat('My value is ', val, '.\n', sep = '')
print(paste('My value is ', val, '.', sep = ''))

# input
x <- 7
n <- 5
# display powers
cat("Powers of", x, "\n")
cat("exponent result\n\n")
result <- 1
for (i in 1:n) {
  result <- result * x
  cat(format(i, width = 8), format(result, width = 10), "\n", sep = "")
}
x <- 7
n <- 5
# display powers
cat("Powers of", x, "\n")
cat("exponent result\n\n")
result <- 1
for (i in 1:n) {
  result <- result * x
  cat(i, '\t', result, '\n', sep = '')
}

#################################################################
### Demo code for Unit 5 of Stat243, "Debugging and Profiling"
### Chris Paciorek, September 2012
#################################################################

################################
# 1: Common syntax errors
################################

mat <- matrix(1:4, 2, 2)[1, ]   # the subsetting silently drops the matrix structure
dim(mat)
print(mat)
colSums(mat)   # errors: mat is now a plain vector
mat <- matrix(1:4, 2, 2)[1, , drop = FALSE]

################################
# 2: Debugging strategies
################################

# 2.1 Basic strategies
x <- 3
stopifnot(is(x, "matrix"))

# finding what global variables are used
library(codetools)
findGlobals(lm)
f <- function() {y = 3; print(x + y)}
findGlobals(f)

options()$warn
x = 1:3
for(i in 1:100){
  if(x > 2) print("hi")
}
warnings()
options(warn = 1)
for(i in 1:100){
  if(x > 2) print("hi")
}
options(warn = 2)
for(i in 1:100){
  if(x > 2) print("hi")
}
i

# 2.3 Using debug()
# simple use of the browser via debug()
debug(lm)
n <- nrow(mtcars)
lm(mpg ~ disp, data = mtcars)
ls()
# use browser commands: c, n, Q
# let's step through this with 'n'
print(n)   # typing just 'n' won't work inside the browser!
undebug(lm)
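# debugonce() is a convenient variant: it enters the browser on the next call
# only, so there's no need to remember to undebug() afterwards
debugonce(lm)
lm(mpg ~ disp, data = mtcars)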
# 2.4 Tracing errors in the call stack
library(MASS)
gamma.est <- function(data) {
  # this fits a gamma distribution to a collection of numbers
  m <- mean(data)
  v <- var(data)
  s <- v/m
  a <- m/s
  return(list(a = a, s = s))
}
calcVar = function(estimates){
  var.of.ests <- apply(estimates, 2, var)
  return(((n-1)^2/n) * var.of.ests)
}
gamma.jackknife.2 <- function(data) {
  # jackknife the estimation
  n <- length(data)
  jack.estimates = gamma.est(data[-1])
  for (omitted.point in 2:n) {
    jack.estimates = rbind(jack.estimates, gamma.est(data[-omitted.point]))
  }
  jack.var = calcVar(jack.estimates)
  return(sqrt(jack.var))
}

# using traceback()
gamma.jackknife.2(cats$Hwt)   # jackknife gamma dist. estimates of cat heart weights
traceback()   # something is going on when we try to calculate the variance

# using recover() upon an error
options(error = recover)
gamma.jackknife.2(cats$Hwt)
# choose '2' to go into calcVar
estimates
apply(estimates, 2, var)   # ok, so the error does occur here
var(estimates[ , 1])       # can we use var() a single time?
estimates[ , 1]
class(estimates)
options(error = NULL)   # reset things

# 2.5 Using trace()
trace(calcVar, browser)
calcVar
gamma.jackknife.2(cats$Hwt)
# at this point you can use 'n', 'c', 'Q' as with debug()

###########################
# 3: Memory management
###########################

# 3.2 Monitoring memory use
gc()
x = rnorm(1e8)   # should use about 800 Mb
object.size(x)
gc()
rm(x)
gc()   # note the "max used" column

###########################
# 4: Timing/Benchmarking
###########################

n <- 1000
x <- matrix(rnorm(n^2), n)
system.time({mns <- rep(NA, n); for(i in 1:n) mns[i] <- mean(x[i , ])})
system.time(rowMeans(x))

# speed of one calculation
library(rbenchmark)   # provides benchmark()
n <- 1000
x <- matrix(rnorm(n^2), n)
benchmark(crossprod(x), replications = 10,
          columns = c('test', 'elapsed', 'replications'))
# comparing different approaches to a task
benchmark({mns <- rep(NA, n); for(i in 1:n) mns[i] <- mean(x[i , ])},
          rowMeans(x), replications = 10,
          columns = c('test', 'elapsed', 'replications'))

###########################
# 5: Profiling
###########################

# here's an example of generating temporally correlated random variables
library(fields)   # provides rdist()
makeTS = function(param, len){
  times = seq(0, 1, length = len)
  dd = rdist(times)
  C = exp(-dd/param)
  U = chol(C)
  white = rnorm(len)
  return(crossprod(U, white))
}
Rprof("makeTS.prof")
makeTS(0.1, 2000)
Rprof(NULL)
summaryRprof("makeTS.prof")
# this takes a bit of deciphering because most of the time is in calls
# to C and Fortran code

##################################################
### Demo code for Unit 6 of Stat243, "R programming"
### Chris Paciorek, September 2012
##################################################

#############################
# 1: Efficiency
#############################

### 1.1 Fast initialization
options(width = 50)
n <- 10000
x <- 1; system.time(for(i in 2:n) x <- c(x, i))
system.time({x <- rep(as.numeric(NA), n); for(i in 1:n) x[i] <- i})
n <- 1000000
system.time(x <- rep(as.numeric(NA), n))
system.time({x <- as.numeric(NA); length(x) <- n})
nr <- nc <- 2000
system.time(x <- matrix(as.numeric(NA), nr, nc))
system.time({x <- as.numeric(NA); length(x) <- nr * nc; dim(x) <- c(nr, nc)})

### 1.2 Vectorized calculations
n <- 1e6
x <- rnorm(n)
system.time(x2 <- x^2)
x2 <- as.numeric(NA)
system.time({length(x2) <- n; for(i in 1:n){ x2[i] <- x[i]^2 }})
# how many orders of magnitude slower is the loop?
line <- c("Four score and 7 years ago, this nation")
startIndices = seq(1, by = 3, length = nchar(line)/3)
substring(line, startIndices, startIndices + 1)

# how soon does the R code send stuff off to compiled C code?
`+`
mean
mean.default
chol.default

# a surprise where "manual" coding is faster than a standard function
x <- rnorm(1000000)
system.time(truncx <- ifelse(x > 0, x, 0))
system.time({truncx <- x; truncx[x < 0] <- 0})
system.time(truncx <- x * (x > 0))

# a surprising case where vectorization with a for loop beats apply()
rmRows1 = function(x){
  omit = FALSE
  n = ncol(x)
  for(i in 1:n) omit = omit | is.na(x[ , i])
  x[!omit, ]
}
rmRows2 = function(x){
  sum.isna = function(row){
    return(sum(is.na(row)))
  }
  n.isna = apply(x, 1, sum.isna)
  return(x[!n.isna, ])
}
# this seems clever; does it improve things?
rmRows3 = function(x){
  return(x[!is.na(rowMeans(x)), ])
}

### 1.3 Using apply() and specialized functions
n <- 3000; x <- matrix(rnorm(n * n), nr = n)
system.time(out <- apply(x, 1, mean))
system.time(out <- rowMeans(x))
system.time(out <- sweep(x, 2, STATS = colMeans(x), FUN = "-"))
system.time(out2 <- t(t(x) - colMeans(x)))
identical(out, out2)

### 1.4 Matrix algebra efficiency
mat <- matrix(rnorm(500*500), 500)
system.time(apply(mat, 1, sum))
system.time(mat %*% rep(1, ncol(mat)))   # rewritten as matrix algebra
system.time(rowSums(mat))

# how do I take successive differences of columns of a matrix quickly?
nc = 500
nr = 500
mat = matrix(rnorm(nr * nc), nr, nc)
# try #1: via matrix algebra
system.time({A <- matrix(0, nr = nc, nc = nc - 1)
  A[row(A) == col(A)] <- 1
  A[row(A) == col(A) + 1] <- -1
  res <- mat %*% A})
# try #2: via a for loop
system.time({res <- matrix(NA, nr, nc - 1)
  for(j in 2:nc) res[ , j-1] = mat[ , j-1] - mat[ , j]})
# try #3: via direct matrix subtraction with clever subsetting
system.time(res <- mat[ , -nc] - mat[ , -1])

n <- 5000
A <- matrix(rnorm(5000 * 5000), 5000)
B <- matrix(rnorm(5000 * 5000), 5000)
x <- rnorm(5000)
system.time(res <- A %*% B %*% x)
# how can I force an efficient order of operations?
# (hint: compute the matrix-vector product first, i.e., A %*% (B %*% x))

# how do I find a trace efficiently?
system.time(sum(diag(A %*% B)))
system.time(sum(A * t(B)))

# avoid matrix algebra with diagonal matrices when vectorization is possible
n <- 1000
X <- matrix(rnorm(n^2), n)
diagvals <- rnorm(n)
D = diag(diagvals)
diag(X) <- diag(X) + diagvals
tmp <- diagvals * X          # instead of D %*% X
tmp2 <- t(t(X) * diagvals)   # instead of X %*% D

### 1.5 Fast mapping/lookup tables
vals <- rnorm(10)
names(vals) <- letters[1:10]
labs <- c("h", "h", "a", "c")
vals[labs]

### 1.6 Byte compiling
library(compiler); library(rbenchmark)
f <- function(x){
  for(i in 1:length(x)) x[i] <- x[i] + 1
  return(x)
}
fc <- cmpfun(f)
fc   # notice the indication that the function is byte compiled
benchmark(f(x), fc(x), replications = 5)

### 1.7 Challenges
# Challenge 1
p = 5
mns = rnorm(p)
sds = exp(rnorm(p))
n = 100000
y = rnorm(n)
system.time({
  lik <- matrix(NA, nr = n, nc = p)
  for(j in 1:p) lik[ , j] <- dnorm(y, mns[j], sds[j])
})
# can we do better?
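# one possible vectorized alternative (a sketch): dnorm is vectorized and
# recycles its arguments, so we can make a single call on n x p matrices;
# whether this actually beats the short loop above is worth checking empirically
system.time(
  lik2 <- dnorm(matrix(y, n, p),
                matrix(mns, n, p, byrow = TRUE),
                matrix(sds, n, p, byrow = TRUE))
)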
#################################################
# 2: Advanced topics in working with functions
#################################################

### 2.2 Alternatives to pass by value in R
x <- rnorm(10)
f <- function(input){
  data <- input
  g <- function(param) return(param * data)
}
myFun <- f(x)
rm(x)   # to demonstrate we no longer need x
myFun(3)
x <- rnorm(1e7)
myFun <- f(x)
object.size(myFun)

x <- rnorm(10)
myFun2 <- with(list(data = x), function(param) return(param * data))
rm(x)
myFun2(3)
x <- rnorm(1e7)
myFun2 <- with(list(data = x), function(param) return(param * data))
object.size(myFun2)

### 2.3 Operators
a <- 7; b <- 3
# let's think about the following as a mathematical function
# -- what's the function call?
a + b
`+`(a, b)
myList = list(list(a = 1:5, b = "sdf"), list(a = 6:10, b = "wer"))
myMat = sapply(myList, `[[`, 1)
# note that the index "1" is the additional argument to the [[ function
x <- 1:3; y <- c(100, 200, 300)
outer(x, y, `+`)
`%2%` <- function(a, b) {2 * (a + b)}
3 %2% 7
mat <- matrix(1:4, 2, 2)
mat[ , 1]
mat[ , 1, drop = FALSE]   # what's the difference?

### 2.4 Unexpected functions and replacement functions
if(x > 27){
  print(x)
} else{
  print("too small")
}
diag(mat) <- c(3, 2)
is.na(vec) <- 3
names(df) <- c('var1', 'var2')
mat <- matrix(rnorm(4), 2, 2)
diag(mat) <- c(3, 2)
mat <- `diag<-`(mat, c(10, 21))
base::`diag<-`
me <- list(name = 'Chris', age = 25)
`name<-` <- function(obj, value){
  obj$name <- value
  return(obj)
}
name(me) <- 'Christopher'

### 2.5 Functions as objects
x <- 3
x(2)   # errors: x is not (yet) a function
x <- function(z) z^2
x(2)
myFun = 'mean'; x = rnorm(10)
eval(as.name(myFun))(x)
f <- function(fxn, x){
  match.fun(fxn)(x)
}
f("mean", x)
f <- function(x) x
f2 <- function(x) y <- x^2
f3 <- function(x) {y <- x^2; z <- x^3; return(list(y, z))}
class(f)
typeof(body(f)); class(body(f))
typeof(body(f2)); class(body(f2))
typeof(body(f3)); class(body(f3))
f4 <- function(x, y = 2, z = 1/y) {x + y + z}
args <- formals(f4)
class(args)

### 2.6 Promises and lazy evaluation
f <- function(a, b = c) {
  c <- log(a)
  return(a*b)
}
f(7)
f <- function(x) print("hi")
system.time(mean(rnorm(1000000)))
system.time(f(3))
system.time(f(mean(rnorm(1000000))))   # the unused argument is never evaluated

#############################################
# 3: Evaluating memory use
#############################################

### 3.1 Hidden uses of memory
x <- rnorm(1e7)
gc()
dim(x) <- c(1e4, 1e3)
diag(x) <- 1
gc()
x <- rnorm(1e7)
.Internal(inspect(x))
x[5] <- 7
.Internal(inspect(x))
gc()
x <- rnorm(1e7)
gc()
y <- x[1:(length(x) - 1)]
gc()

### 3.2 Passing objects to compiled code
f <- function(x){
  print(.Internal(inspect(x)))
  return(mean(x))
}
x <- rnorm(1e7)
class(x)
debug(f)
f(x)
f(as.numeric(x))
f(as.integer(x))

### 3.3 Lazy evaluation, delayed copying (copy-on-change) and memory use
f <- function(x){
  print(gc())
  z <- x[1]
  .Internal(inspect(x))
  return(x)
}
y <- rnorm(1e7)
gc()
.Internal(inspect(y))
out <- f(y)
.Internal(inspect(out))

y <- rnorm(1e7)
gc()
.Internal(inspect(y))
x <- y
gc()
.Internal(inspect(x))
x[1] <- 5
gc()
.Internal(inspect(x))
rm(x)
x <- y
.Internal(inspect(x))
.Internal(inspect(y))
y[1] <- 5
.Internal(inspect(x))
.Internal(inspect(y))

x <- rnorm(1e7)
myfun <- function(y){
  z <- y
  return(mean(z))
}
myfun(x)
x <- c(NA, x)
myfun <- function(y){
  return(mean(y, na.rm = TRUE))
}
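# tracemem() offers a lighter-weight view of the same copy-on-change behavior
# (an aside; it is available only when R is built with memory profiling
# enabled, which is typical of distributed binaries):
y <- rnorm(1e6)
tracemem(y)
x <- y      # no copy is made yet
x[1] <- 5   # tracemem reports the duplication that happens at this point
untracemem(x)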
### 3.5 Example
fastcount <- function(xvar, yvar) {
  naline <- is.na(xvar)
  naline[is.na(yvar)] = TRUE
  xvar[naline] <- 0
  yvar[naline] <- 0
  useline <- !naline
  # table must be initialized for -1's
  tablex <- numeric(max(xvar) + 1)
  tabley <- numeric(max(xvar) + 1)
  stopifnot(length(xvar) == length(yvar))
  res <- .C("fastcount", PACKAGE = "GCcorrect",
            tablex = as.integer(tablex), tabley = as.integer(tabley),
            as.integer(xvar), as.integer(yvar), as.integer(useline),
            as.integer(length(xvar)))
  xuse <- which(res$tablex > 0)
  xnames <- xuse - 1
  resb <- rbind(res$tablex[xuse], res$tabley[xuse])
  colnames(resb) <- xnames
  return(resb)
}

##############################################
# 4: Object-oriented programming (OOP)
##############################################

### 4.1 S3 approach
library(methods)
yb <- sample(c(0, 1), 10, replace = TRUE)
yc <- rnorm(10)
x <- rnorm(10)
mod1 <- lm(yc ~ x)
mod2 <- glm(yb ~ x, family = binomial)
class(mod1)
class(mod2)
is.list(mod1)
names(mod1)
is(mod2, "lm")
# methods associated with lm
methods(class = "lm")

# creating our own class
me <- list(firstname = 'Chris', surname = 'Paciorek', age = NA)
class(me) <- 'indiv'   # there is already a 'person' class in R
indiv <- function(firstname = NA, surname = NA, age = NA){
  # constructor for the 'indiv' class
  obj <- list(firstname = firstname, surname = surname, age = age)
  class(obj) <- 'indiv'
  return(obj)
}
me <- indiv('Chris', 'Paciorek')
class(me) <- "silly"
class(me) <- "indiv"

mod <- lm(yc ~ x)
summary(mod)
gmod <- glm(yb ~ x, family = 'binomial')
summary(gmod)
mean
methods(mean)
# creating a generic
summarize <- function(object, ...) UseMethod("summarize")
# creating a specific method for a class
summarize.indiv <- function(object)
  return(with(object, cat("Individual of age ", age, " whose name is ",
                          firstname, " ", surname, ".\n", sep = "")))
summarize(me)
out <- summary(mod)
out
print(out)
getS3method(f = "print", class = "summary.lm")
class(me) <- c('BerkeleyIndiv', 'indiv')
summarize(me)

# class-specific operators
methods(`+`)
`+.indiv` <- function(object, incr) {
  object$age <- object$age + incr
  return(object)
}
old.me <- me + 15
`age<-` <- function(x, ...) UseMethod("age<-")
`age<-.indiv` <- function(object, value){
  object$age <- value
  return(object)
}
age(old.me) <- 60

### 4.2 S4 Approach
library(methods)
setClass("indiv",
         representation(
           name = "character",
           age = "numeric",
           birthday = "Date"
         ))
me <- new("indiv", name = 'Chris', age = 20, birthday = as.Date('91-08-03'))
# next notice the missing age slot
me <- new("indiv", name = 'Chris', birthday = as.Date('91-08-03'))
# finally, apparently there's not a default object of class Date
me <- new("indiv", name = 'Chris', age = 20)
me
me@age <- 60

# S4 methods are designed to be more structured than S3, with careful checking
setValidity("indiv",
            function(object) {
              if(!(object@age > 0 && object@age < 130))
                return("error: age must be between 0 and 130")
              if(length(grep("[0-9]", object@name)))
                return("error: name contains digits")
              return(TRUE)
              # what other validity check would make sense given the slots?
            })
me <- new("indiv", name = "5z%a", age = 20, birthday = as.Date('91-08-03'))
me <- new("indiv", name = "Z%a B''*", age = 20, birthday = as.Date('91-08-03'))
me@age <- 150   # so our validity check is not foolproof
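# an aside: slot assignment checks only the slot's type, but validObject()
# re-runs the validity function on demand and catches the bad value above
validObject(me)   # errors, since age is now 150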
# generic method - needed if the generic is not already defined
setGeneric("voter", function(object, ...) {
  standardGeneric("voter")
})
# class-specific method
voter.indiv = function(object){
  if(object@age > 17){
    cat(object@name, "is of voting age.\n")
  } else cat(object@name, "is not of voting age.\n")
}
setMethod(voter, signature = c("indiv"), definition = voter.indiv)
voter(me)

setClass("BerkeleyIndiv",
         representation(CalID = "character"),
         contains = "indiv")
me <- new("BerkeleyIndiv", name = "Chris", age = 20,
          birthday = as.Date('91-08-03'), CalID = "01349542")
voter(me)
is(me, "indiv")
showClass("data.frame")

### 4.3 Reference classes
tsSimClass <- setRefClass("tsSimClass",
  fields = list(
    n = "numeric",
    times = "numeric",
    corMat = "matrix",
    lagMat = "matrix",
    corParam = "numeric",
    U = "matrix",
    currentU = "logical"),
  methods = list(
    initialize = function(times = 1:10, corParam = 1, ...){
      # we seem to need default values for the copy() method to function properly
      require(fields)
      times <<- times   # field assignment requires using <<-
      n <<- length(times)
      corParam <<- corParam
      currentU <<- FALSE
      calcMats()
      callSuper(...)   # calls the initializer of the base class (envRefClass)
    },
    calcMats = function(){
      # Python-style doc string
      ' calculates correlation matrix and Cholesky factor '
      lagMat <- rdist(times)               # local variable
      corMat <<- exp(-lagMat / corParam)   # field assignment
      U <<- chol(corMat)                   # field assignment
      cat("Done updating correlation matrix and Cholesky factor\n")
      currentU <<- TRUE
    },
    changeTimes = function(newTimes){
      times <<- newTimes
      calcMats()
    },
    show = function(){   # 'print' method
      cat("Object of class 'tsSimClass' with ", n, " time points.\n", sep = '')
    }
  ))
tsSimClass$methods(list(
  simulate = function(){
    ' simulates random processes from the model '
    if(!currentU) calcMats()
    return(crossprod(U, rnorm(n)))
  }))

master <- tsSimClass$new(1:100, 10)
master
tsSimClass$help('calcMats')
devs <- master$simulate()
plot(master$times, devs, type = 'l')
mycopy <- master
myDeepCopy <- master$copy()
master$changeTimes(seq(0, 1, length = 100))
mycopy$times[1:5]
myDeepCopy$times[1:5]
master$field('times')[1:5]
# the next line is dangerous in this case, since
# currentU will no longer be accurate
master$field('times', 1:10)

###################################################
### 5: Creating and working in an environment
###################################################

e <- new.env()
assign('x', 3, envir = e)   # same as e$x <- 3
e$x
get('x', envir = e, inherits = FALSE)
# the FALSE avoids looking for x in the enclosing environments
e$y <- 5
objects(e)
rm('x', envir = e)
parent.env(e)

myWalk <- new.env(); myWalk$pos = 0
nextStep <- function(walk) walk$pos = walk$pos + sample(c(-1, 1), size = 1)
nextStep(myWalk)
# two other ways to update the state stored in the environment
eval(quote(pos <- pos + sample(c(-1, 1), 1)), envir = myWalk)
evalq(pos <- pos + sample(c(-1, 1), 1), envir = myWalk)

# if we didn't use an environment, here's how we might do it;
# note the somewhat awkward reassignment
myWalk = list(pos = 0)
nextStep = function(walk){
  walk$pos = walk$pos + sample(c(-1, 1), size = 1)
  return(walk)
}
myWalk = nextStep(myWalk)
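# usage sketch: with the environment-based version, repeated calls update the
# state in place, and we can inspect it at any point
myWalk <- new.env(); myWalk$pos <- 0
nextStep <- function(walk) walk$pos <- walk$pos + sample(c(-1, 1), size = 1)
for(i in 1:5) nextStep(myWalk)
myWalk$pos   # net displacement after five steps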
######################################
### 6: Computing on the language
######################################

### 6.1 The R interpreter
plot.xy
`%*%`

### 6.2 Parsing code and understanding language objects
obj <- quote(if (x > 1) "orange" else "apple")
as.list(obj)
class(obj)
weirdObj <- quote(`if`(x > 1, 'orange', 'apple'))
identical(obj, weirdObj)

x <- 3; typeof(quote(x))
myExpr <- expression(x <- 3)
eval(myExpr)
typeof(myExpr)

# here's what I did to gather the information for the table in the notes
obj1 = function() {}
obj2 = quote(x)
obj3 = expression(x <- 3)
obj4 = quote(f())
obj5 = quote(if(x < 3) y = 5)
obj6 = quote(x <- 3)
class(obj1); typeof(obj1)
class(obj2); typeof(obj2)
class(obj3); typeof(obj3)
class(obj4); typeof(obj4)
class(obj5); typeof(obj5)
class(obj6); typeof(obj6)
is.call(obj6)
is.language(obj2)

# exploring language objects
e0 <- quote(3)
e1 <- expression(x <- 3)
e1m <- expression({x <- 3; y <- 5})
e2 <- quote(x <- 4)
e3 <- quote(rnorm(3))
class(e0); typeof(e0)
class(e1); typeof(e1)
class(e1[[1]]); typeof(e1[[1]])
class(e1m); typeof(e1m)
class(e2); typeof(e2)
identical(e1[[1]], e2)
class(e3); typeof(e3)
e4 <- quote(-7)
class(e4); typeof(e4)   # huh? what does this imply?
as.list(e4)
eval(e1)
eval(e2)
e1mlist <- as.list(e1m)
e2list <- as.list(e2)
eval(as.call(e2list))
# here's how to do it if the language object is actually a list
eval(as.expression(e1mlist))

# delving into the list-like structure of language objects
e1 = expression(x <- 3)
# e1 is a one-element list whose element is an object of class '<-'
# and of type 'language'
typeof(e1)   # expression
class(e1)    # expression
e1[[1]]
as.list(e1[[1]])
lapply(e1[[1]], class)
y = rnorm(5)
e3 = quote(mean(y))
as.list(e3)
class(e3[[2]])
# we have recursion
e3 = quote(mean(c(12, 13, 15)))
as.list(e3)
as.list(e3[[2]])

### 6.3 Manipulating the parse tree
out <- quote(y <- 3)
out[[3]] <- 4
eval(out)
e1 <- quote(2 + 5)
e2 <- quote(plot(x, y))
e2[[1]] <- `+`
eval(e2)
e1[[3]] <- e2
e1
class(e1[[3]])   # note the nesting
eval(e1)   # what should I get?
codeText <- deparse(out)
parsedCode <- parse(text = codeText)
# parse() works like quote() except on code in the form of a string
eval(parsedCode)
deparse(quote(if (x > 1) "orange" else "apple"))
x3 <- 7
i <- 3
as.name(paste('x', i, sep = ''))
eval(as.name(paste('x', i, sep = '')))
deparse(quote(x))
f <- function(obj){
  objName <- deparse(substitute(obj))
  print(objName)
}
f(y)

### 6.4 Parsing replacement expressions
animals = c('cat', 'dog', 'rat', 'mouse')
`[<-`(animals, 4, 'cat')
animals
`[<-`(animals, 4, 'dog')
animals
out1 = quote(animals[4] <- 'rat')
out2 = quote(`<-`(animals[4], 'rat'))   # same as out1
out3 = quote(`[<-`(animals, 4, 'rat'))
# same as out2 when evaluated (see below),
# but the parse tree is different (so parsing is not completely invertible)
as.list(out1)
as.list(out2)
identical(out1, out2)
as.list(out3)
identical(out1, out3)
typeof(out1[[2]])   # language
class(out1[[2]])    # call
eval(out1)
animals
animals[4] = 'dog'
eval(out3)
animals   # both do the same thing

# here's another example of a parsed replacement expression
x = diag(rep(1, 3))
obj = quote(diag(x) <- 3)
as.list(obj)
class(obj[[2]])

### 6.5 substitute()
identical(quote(z <- x^2), substitute(z <- x^2))
e <- new.env(); e$x <- 3
substitute(z <- x^2, e)
e$z <- 5
substitute(z <- x^2, e)
f <- function(obj){
  objName <- deparse(substitute(obj))
  print(objName)
}
f(y)
substitute(a + b, list(a = 1, b = quote(x)))
e1 <- quote(x + y)
e2 <- substitute(e1, list(x = 3))
e2 <- substitute(substitute(e, list(x = 3)), list(e = e1))
substitute(substitute(e, list(x = 3)), list(e = e1))
# so e1 is substituted in as an evaluated object,
# which then allows the substitution for 'x'
e2
eval(e2)
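# an aside: bquote() gives a more direct route to this sort of partial
# substitution; terms wrapped in .() are evaluated, the rest stays unevaluated
e5 <- bquote(.(3) + y)   # comparable to substitute(x + y, list(x = 3))
e5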
############################################################
### Demo code for Unit 7 of Stat243, "Parallel processing"
### Chris Paciorek, October 2012
############################################################

#################################
# 2: Parallelization
#################################

### 2.2 Threading
X <- matrix(rnorm(8000^2), 8000)
system.time({
  X <- crossprod(X)   # X^t X produces a pos.def. matrix
  U <- chol(X)})      # U^t U = X
# exit R, execute: "export OMP_NUM_THREADS=1", and restart R
system.time({
  X <- crossprod(X)
  U <- chol(X)})

#################################
# 3: Explicit parallel code in R
#################################

### 3.1 foreach
require(parallel)     # one of the core R packages
require(doParallel)
# require(multicore); require(doMC)   # alternative to parallel/doParallel
# require(Rmpi); require(doMPI)       # when Rmpi is available as the back-end
library(foreach)
library(iterators)
taskFun <- function(){
  mn <- mean(rnorm(10000000))
  return(mn)
}
nCores <- 4
registerDoParallel(nCores)
# registerDoMC(nCores)   # alternative to registerDoParallel
# cl <- startMPIcluster(nCores); registerDoMPI(cl)   # when using Rmpi as the back-end
out <- foreach(i = 1:100, .combine = c) %dopar% {
  cat('Starting ', i, 'th job.\n', sep = '')
  outSub <- taskFun()
  cat('Finishing ', i, 'th job.\n', sep = '')
  outSub   # this will become part of the out object
}

### 3.2 Parallel apply and vectorization
require(parallel)
nCores <- 4

### using sockets
# ?clusterApply
cl <- makeCluster(nCores)   # by default this uses sockets
nSims <- 60
testFun <- function(i){
  mn <- mean(rnorm(1000000))
  return(mn)
}
# if the processes need objects (x and y, here) from the master's workspace:
# clusterExport(cl, c('x', 'y'))
system.time(
  res <- parSapply(cl, 1:nSims, testFun)
)
system.time(
  res2 <- sapply(1:nSims, testFun)
)
myList <- as.list(1:nSims)
res <- parLapply(cl, myList, testFun)

### using forking
system.time(
  res <- mclapply(seq_len(nSims), testFun, mc.cores = nCores)
)

require(parallel)
nCores <- 4
cl <- makeCluster(nCores)
library(fields)
ds <- runif(6000000, .1, 10)
system.time(
  corVals <- pvec(ds, Matern, .1, 2, mc.cores = nCores)
)
system.time(
  corVals <- Matern(ds, .1, 2)
)

### 3.3 Explicit parallel programming in R: mcparallel and forking
# mcparallel()
library(parallel)
n <- 10000000
system.time({
  p <- mcparallel(mean(rnorm(n)))
  q <- mcparallel(mean(rgamma(n, shape = 1)))
  res <- mccollect(list(p, q))
})
system.time({
  p <- mean(rnorm(n))
  q <- mean(rgamma(n, shape = 1))
})

# forking
library(fork)
# mode 1
pid <- fork(slave = myfun)
# mode 2
{  # this set of braces is REQUIRED when you don't pass a function
   # to the slave argument of fork()
  pid <- fork(slave = NULL)
  if(pid == 0) {
    cat("Starting child process execution.\n")
    tmpChild <- mean(rnorm(10000000))
    cat("Result is ", tmpChild, "\n", sep = "")
    save(tmpChild, file = 'child.RData')   # clunky
    cat("Finishing child process execution.\n")
    exit()
  } else {
    cat("Starting parent process execution.\n")
    tmpParent <- mean(rnorm(10000000))
    cat("Finishing parent process execution.\n")
    wait(pid)   # wait til the child is finished so we can read
                # in the updated child.RData below
  }
}
load('child.RData')   # clunky
print(c(tmpParent, tmpChild))

#########################################################
### Demo code for Unit 8 of Stat243, "Computer numbers"
### Chris Paciorek, October 2012
#########################################################

############################
# 1: Basic representations
############################

a <- as.integer(3423333)
a * a   # overflows R's 32-bit integer range
x <- rnorm(100000)
y <- 1:100000
z <- rep('a', 100000)
object.size(x)   # 800040 bytes
object.size(y)   # 400040 bytes - so how many bytes per integer in R?
object.size(z)   # 800088 bytes - hmm, what's going on here?
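# a partial answer (as I understand R's internals): each element of a character
# vector is a pointer (8 bytes on a 64-bit machine) into R's global string
# cache, and a given string is stored there only once, so repeats are cheap
object.size(rep('a', 100000))
object.size(rep('abcdefghij', 100000))   # barely bigger: still one cached string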
############################
# 2: Floating point basics
############################

### 2.1 Representing real numbers
0.3 - 0.2 == 0.1
0.3
0.2
0.1   # Hmmm...
options(digits = 22)
a <- 0.3
b <- 0.2
a - b
0.1
1/3
# so empirically, it looks like we're accurate up to the 16th decimal place
1e-16 + 1
1e-15 + 1
2e-16 + 1
.Machine$double.eps
.Machine$double.eps + 1
.1
.5
.25
.26
1/32
1/33

### 2.2 Overflow and underflow
log10(2^1024)   # whoops ... we've actually just barely overflowed
log10(2^1023)
.Machine$double.xmax
.Machine$double.xmin

### 2.3 Integers or floats
x <- 2^45
z <- 25
class(x)
class(z)
as.integer(x)
as.integer(z)
1e308
as.integer(1e308)
1e309
x <- 3; typeof(x)
x <- as.integer(3); typeof(x)
x <- 3L; typeof(x)

### 2.4 Precision
options(digits = 22)
# large vs. small numbers
.1234123412341234
1234.1234123412341234           # not accurate to 16 places
123412341234.123412341234       # only accurate to 4 places
1234123412341234.123412341234   # no places!
12341234123412341234            # fewer than no places!
# how precision affects calculations
1234567812345678 - 1234567812345677
12345678123456788888 - 12345678123456788887
.1234567812345678 - .1234567812345677
.12345678123456788888 - .12345678123456788887
.00001234567812345678 - .00001234567812345677
# not as close as we'd expect; should be 1e-20
.000012345678123456788888 - .000012345678123456788887
123456781234 - .0000123456781234   # the correct answer is 123456781233.99998765....
1000000.1
2^52
2^52 + 1
2^53
2^53 + 1
2^53 + 2
2^54
2^54 + 2
2^54 + 4

##################################################
# 3: Implications for calculation and comparisons
##################################################

### 3.1 Computer arithmetic is not mathematical arithmetic!
val1 <- 1/10; val2 <- 0.31; val3 <- 0.57
res1 <- val1*val2*val3
res2 <- val3*val2*val1
identical(res1, res2)
res1
res2

### 3.3 Comparisons
a = 12345678123456781000
b = 12345678123456782000
approxEqual = function(a, b){
  if(abs(a - b) < .Machine$double.eps * abs(a + b))
    print("approximately equal")
  else print("not equal")
}
approxEqual(a, b)
a = 1234567812345678
b = 1234567812345677
approxEqual(a, b)
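# R's built-in all.equal() makes a relative-difference comparison in the same
# spirit, with a default tolerance of about 1.5e-8:
all.equal(a, b)   # TRUE at the default tolerance
isTRUE(all.equal(a, b, tolerance = 1e-16))   # FALSE once we demand near machine precision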
################################################################
### Demo code for Unit 9 of Stat243, "Numerical linear algebra"
### Chris Paciorek, October 2012
################################################################

########################
# 1: Preliminaries
########################

### 1.7 Some vector and matrix properties

A + B
A %*% B # matrix multiplication
A * B # Hadamard (direct) product

### 1.9 rank, LIN, basis vectors

norm2 <- function(x) sqrt(sum(x^2)) # Euclidean norm (also defined in 2.3 below)
# suppose I want to use (1, 1) and (1, -1) as basis vectors in R^2 rather than (1, 0) and (0, 1)
# express (1, 0) in terms of the basis vectors:
x = c(1, 0)
v1 = c(1, 1)
v2 = c(1, -1)
v1 = v1/norm2(v1)
v2 = v2/norm2(v2)
c1 = sum(x*v1)
c2 = sum(x*v2)
c1*v1 + c2*v2 # this should be x

### 1.10 Invertibility, etc.

mat0 = matrix(c(1,2,2,4), 2) # symmetric, not full rank
mat1 = matrix(c(1,2,4,1), 2) # not symmetric
mat2 = matrix(c(1,.5,.25,1), 2) # not symmetric
mat3 = matrix(c(1,1,1,1), 2) # not full rank
mat4 = matrix(c(1,.5,.5,1), 2) # symmetric, full rank
# let's look at the eigendecompositions
eigen(mat0)
eigen(mat4) # numerically positive definite!

### 1.11 Generalized inverses

precMat <- matrix(c(1,-1,0,0,0,-1,2,-1,0,0,0,-1,2,-1,0,0,0,-1,2,-1,0,0,0,-1,1), 5)
e <- eigen(precMat)
e$values
e$vectors[ , 5]
# generate a realization
e$values[1:4] <- 1 / e$values[1:4]
y <- e$vectors %*% (e$values * rnorm(5))
sum(y)

###########################
# 2: Computational issues
###########################

### 2.3 Ill-conditioned problems

norm2 <- function(x) sqrt(sum(x^2))
A <- matrix(c(10,7,8,7,7,5,6,5,8,6,10,9,7,5,9,10), 4)
e <- eigen(A)
b <- c(32, 23, 33, 31)
bPerturb <- c(32.1, 22.9, 33.1, 30.9)
x <- solve(A, b)
xPerturb <- solve(A, bPerturb)
norm2(x - xPerturb)
norm2(b - bPerturb)
norm2(x - xPerturb)/norm2(x)
(e$val[1]/e$val[4])*norm2(b - bPerturb)/norm2(b)

x1 <- 1990:2010
x2 <- x1 - 2000 # centered
x3 <- x2/10 # centered and scaled
X1 <- cbind(rep(1, 21), x1, x1^2)
X2 <- cbind(rep(1, 21), x2, x2^2)
X3 <- cbind(rep(1, 21), x3, x3^2)
e1 <- eigen(crossprod(X1))
e1$values
e2 <- eigen(crossprod(X2))
e2$values
e3 <- eigen(crossprod(X3))
e3$values

##################################################
# 3: Matrix factorizations (decompositions) and
# solving systems of linear equations
##################################################

### 3.1 Triangular systems

n <- 20
X <- crossprod(matrix(rnorm(n^2), n))
b <- rnorm(n)
U <- chol(crossprod(X)) # U is upper-triangular
L <- t(U) # L is lower-triangular
out1 <- backsolve(U, b)
out2 <- forwardsolve(L, b)
all.equal(out1, c(solve(U) %*% b))
all.equal(out2, c(solve(L) %*% b))

# 3.2 LU
# example of the impact of pivoting
# eqn 1: .0001 x1 + x2 = 1
# eqn 2: x1 + x2 = 2
A = matrix(c(.0001, 1, 1, 1), nr = 2, byrow = TRUE)
b = c(1, 2)
x = solve(A, b)
options(digits = 16)
print(x)
c1 = -A[2,1]/A[1,1]
A[2, ] = c1*A[1, ] + A[2, ]
b[2] = c1*b[1] + b[2]
# compute the solution if numbers are only available with 3 digits of accuracy
x3dig = rep(NA, 2)
x3dig[2] = round(b[2], -3)/round(A[2,2], -3) # round to 3 significant digits (the entries are of order 10^4)
x3dig[1] = (b[1] - A[1,2]*x3dig[2])/A[1,1]
print(x3dig) # whoops
# switch the rows
A = matrix(c(1, 1, .0001, 1), nr = 2, byrow = TRUE)
b = c(2, 1)
c1 = -A[2,1]/A[1,1]
A[2, ] = c1*A[1, ] + A[2, ]
b[2] = c1*b[1] + b[2]
# solution based on 3 digits of accuracy with pivoting
x3dig = rep(NA, 2)
x3dig[2] = round(b[2], 3)/round(A[2,2], 3) # numbers are now less than 1 - round to 3 decimal places
x3dig[1] = (b[1] - A[1,2]*x3dig[2])/A[1,1]
print(x3dig) # now a good approximation
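# An illustrative aside (my addition): solve the system via the factorization
# rather than forming the inverse explicitly; the direct solve is both faster
# and more numerically stable.
nn <- 2000
AA <- crossprod(matrix(rnorm(nn^2), nn))
bb <- rnorm(nn)
system.time(sol1 <- solve(AA, bb))    # one factorization and solve
system.time(sol2 <- solve(AA) %*% bb) # explicit inversion, then a multiply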
### 3.3 Cholesky decomposition

backsolve(U, b, transpose = TRUE)
forwardsolve(L, b, transpose = TRUE)
backsolve(U, backsolve(U, b, transpose = TRUE))
backsolve(U, forwardsolve(t(U), b)) # equivalent but less efficient

U <- chol(A) # (A here is meant to be a positive definite matrix)
y <- crossprod(U, rnorm(n)) # i.e., t(U) %*% rnorm(n), but much faster

# numerically not positive definite
library(fields)
locs <- runif(100)
rho <- .1
C <- exp(-rdist(locs)^2/rho^2)
e <- eigen(C)
e$values[96:100]
U <- chol(C)
vals <- abs(e$values)
max(vals)/min(vals)
U <- chol(C, pivot = TRUE)

### 3.4 QR decomposition

### 3.4.3 Regression and the QR in R
X.qr = qr(X)
Q = qr.Q(X.qr)
R = qr.R(X.qr)

### 3.4.4 Computing the QR
nlz = function(vec) vec/norm2(vec) # normalize
orth = function(vec, std) vec - sum(std*vec)*std # orthogonalize vec against the unit vector std
norm2 = function(x) sqrt(sum(x^2))

# modified Gram-Schmidt
X = matrix(rnorm(5*4), nr = 5)
# X[ ,4] = 0.5*X[ ,1] - .7*X[ ,2] + rnorm(5, 0, .00000001)
Q = matrix(NA, 5, 4)
R = matrix(0, 4, 4)
Q[ ,1] = nlz(X[ ,1])
Q[ ,2] = nlz(orth(X[ ,2], Q[ ,1]))
Q[ ,3] = orth(X[ ,3], Q[ ,1])
Q[ ,4] = orth(X[ ,4], Q[ ,1])
Q[ ,3] = nlz(orth(Q[ ,3], Q[ ,2]))
Q[ ,4] = orth(Q[ ,4], Q[ ,2])
Q[ ,4] = nlz(orth(Q[ ,4], Q[ ,3]))

# original Gram-Schmidt
Q2 = matrix(NA, 5, 4)
Q2[ ,1] = nlz(X[ ,1])
Q2[ ,2] = nlz(orth(X[ ,2], Q2[ ,1]))
Q2[ ,3] = orth(X[ ,3], Q2[ ,1])
Q2[ ,3] = nlz(orth(Q2[ ,3], Q2[ ,2]))
Q2[ ,4] = orth(X[ ,4], Q2[ ,1])
Q2[ ,4] = orth(Q2[ ,4], Q2[ ,2])
Q2[ ,4] = nlz(orth(Q2[ ,4], Q2[ ,3]))

### 3.5 Determinants

myqr = qr(A)
magn = sum(log(abs(diag(myqr$qr)))) # log of the magnitude of det(A)
sign = prod(sign(diag(myqr$qr)))

### 4.1 Eigendecomposition

# computing the top few eigenvalue/eigenvector pairs via the power method
A = matrix(c(3, 1.3, .7, 1.3, 2, .5, .7, .5, 1), 3)
e = eigen(A)
z1 = c(1, 0, 0)
for(i in 1:20){
    z1 = A %*% z1
    z1 = z1/norm2(z1)
    print(z1)
}
val1 = (A %*% z1 / z1)[1, 1]
# following Gentle-NLA, p. 125, we can get the next largest eigenvalue,
# which is the largest eigenvalue of the matrix B
B = A - val1 * outer(c(z1), c(z1))
z2 = c(1, 0, 0)
for(i in 1:200){
    z2 = B %*% z2
    z2 = z2/norm2(z2)
    print(z2)
}
val2 = (B %*% z2 / z2)[1, 1] + val1

################################################################
### Demo code for Unit 10 of Stat243, "Simulation"
### Chris Paciorek, November 2012
################################################################

#########################################
# 2: Random number generation
#########################################

### 2.1 Generating random uniforms on a computer

n <- 100
a <- 171
m <- 30269
u <- rep(NA, n)
u[1] <- 7306
for(i in 2:n)
    u[i] <- (a * u[i-1]) %% m
u <- u/m
uFromR <- runif(n)
par(mfrow = c(2, 2))
plot(1:n, u, type = 'l')
plot(1:n, uFromR, type = 'l')
hist(u, nclass = 25)
hist(uFromR, nclass = 25)

RNGkind("Wichmann-Hill")
set.seed(0)
saveSeed <- .Random.seed
uFromR <- runif(10)
a <- c(171, 172, 170)
m <- c(30269, 30307, 30323)
xyz <- matrix(NA, nr = 10, nc = 3)
xyz[1, ] <- (a * saveSeed[2:4]) %% m
for(i in 2:10)
    xyz[i, ] <- (a * xyz[i-1, ]) %% m
for(i in 1:10)
    print(c(uFromR[i], sum(xyz[i, ]/m) %% 1))
# we should be able to recover the current value of the seed
xyz[10, ]
.Random.seed[2:4]

### 2.3 RNG in R

set.seed(0)
rnorm(10)
set.seed(0)
rnorm(10)

set.seed(0)
rnorm(5)
savedSeed <- .Random.seed
tmp <- sample(1:50, 2000, replace = TRUE)
.Random.seed <- savedSeed
rnorm(5)

f <- function(args) {
    oldseed <- .Random.seed
    # other code
    .Random.seed <<- oldseed # note the global assignment!
}
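# A quick hedged check of the save/restore pattern above (my addition):
set.seed(1)
s0 <- .Random.seed
f() # any random number generation inside f() leaves the caller's stream untouched
identical(.Random.seed, s0) # TRUE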
### 2.4 RNG in parallel

require(parallel)
require(rlecuyer)
nSims <- 250
testFun <- function(i){
    val <- runif(1)
    return(val)
}

nCores <- 4
RNGkind()
cl <- makeCluster(nCores)
iseed <- 0
clusterSetRNGStream(cl = cl, iseed = iseed)
RNGkind() # unchanged on the master; clusterSetRNGStream() sets "L'Ecuyer-CMRG" on the workers
res <- parSapply(cl, 1:nSims, testFun)
clusterSetRNGStream(cl = cl, iseed = iseed)
res2 <- parSapply(cl, 1:nSims, testFun)
identical(res, res2)
stopCluster(cl)

RNGkind("L'Ecuyer-CMRG")
seed <- 0
set.seed(seed)
## now start M workers
M <- nCores # e.g., one stream per core
s <- .Random.seed
for (i in 1:M) {
    s <- nextRNGStream(s)
    # now send s to worker i as .Random.seed
}

require(parallel)
require(rlecuyer)
RNGkind("L'Ecuyer-CMRG")
res <- mclapply(seq_len(nSims), testFun, mc.cores = nCores,
    mc.set.seed = TRUE)
# this also seems to automatically reset the seed when it is run
res2 <- mclapply(seq_len(nSims), testFun, mc.cores = nCores,
    mc.set.seed = TRUE)
identical(res, res2)

###################################
# 3: Generating random variables
###################################

### 3.1 Multivariate distributions

# covMat is assumed to be a positive definite covariance matrix
U <- chol(covMat)
x <- crossprod(U, rnorm(nrow(covMat))) # i.e., t(U) %*% z for standard normal z

### 3.3 Rejection sampling

# generating efficiently from a truncated normal
# suppose we want to generate from X ~ N(1, 2^2) I(x > 4)
mu <- 1; sd <- 2
tau <- 4 # truncation point
samps <- rnorm(10000, mu, sd)
samps <- samps[samps > tau]
length(samps)/10000 # around 640 of 10000
hist(samps)

tauStd <- (tau - mu)/sd # put the truncation point on the standardized scale
lamStar <- (tauStd + sqrt(tauStd^2 + 4))/2 # rate of the exponential
c <- (1/lamStar)*exp(lamStar^2/2 - lamStar*tauStd)/(sqrt(2*pi)*(1 - pnorm(tauStd)))
y <- rexp(2000, rate = lamStar) + tauStd # + tauStd does the shifting of the exponential
u <- runif(2000)
samps2 <- mu + sd * y[u <= exp(-(y - lamStar)^2/2)] # mu + sd * ... reverses the standardization
length(samps2)/2000 # empirical acceptance rate
1/c # theoretical acceptance rate

# plots here are done on the standardized scale for simplicity
par(mfrow = c(1, 3))
plot(y, u)
points(y[u <= exp(-(y - lamStar)^2/2)], u[u <= exp(-(y - lamStar)^2/2)], col = 'red')
# we accept red points and reject black
# this shows f(x) and c*g(x) on the standardized scale
yvals <- seq(tauStd, 5, len = 100)
plot(yvals, c*dexp(yvals - tauStd, rate = lamStar), type = 'l')
lines(yvals, dnorm(yvals)/(1 - pnorm(tauStd)), col = 'red')
# the exponential is a nice envelope
yvals <- seq(5, 10, len = 100)
plot(yvals, c*dexp(yvals - tauStd, rate = lamStar), type = 'l')
lines(yvals, dnorm(yvals)/(1 - pnorm(tauStd)), col = 'red')
# in the tail the exponential is much fatter than the normal, but few proposals fall in the tail anyway

### 3.5 Importance sampling

# suppose we want to estimate P(X < -3) for a standard Cauchy;
# instead of drawing from a standard Cauchy, let's draw from a Cauchy shifted to be centered at -5
# write out the code below for a single estimate
m <- 1000 # number of samples for each estimator
# standard MC estimator
set.seed(0)
y <- rt(m, df = 1)
# samples for the importance sampler
set.seed(0)
x <- rt(m, df = 1) - 5 # i.e., sample from g(x), a Cauchy centered at -5
f <- dt(x, df = 1) # density of x under f
g <- dt(x + 5, df = 1) # density of x under g (the density of x under a Cauchy centered
# at -5 is the same as the density of x+5 under a Cauchy centered at 0)
w <- f/g # weights
stdEst <- mean(y < (-3))
isEst <- mean((x < (-3))*w)
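# A hedged diagnostic (my addition): heavily skewed weights signal an
# unreliable importance sampler; a common summary is the effective sample size.
effSize <- function(w) sum(w)^2 / sum(w^2)
effSize(w) # compare with m = 1000; much smaller values indicate weight degeneracy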
# now let's do a small simulation study of the estimators with m = 1000
nSims <- 100
m <- 1000 # number of samples for each estimator
isEst <- stdEst <- varIS <- varStd <- rep(NA, nSims)
set.seed(0)
for(i in 1:nSims){
    # note this could be done w/out looping if I were trying to be efficient
    # samples for the MC estimator
    y <- rt(m, df = 1)
    # samples for the importance sampler
    x <- rt(m, df = 1) - 5 # i.e., sample from g(x), a Cauchy centered at -5
    f <- dt(x, df = 1) # density of x under f
    g <- dt(x + 5, df = 1) # density of x under g
    w <- f/g # weights
    isEst[i] <- mean((x < (-3))*w)
    stdEst[i] <- mean(y < (-3))
}
mean((isEst - pt(-3, df = 1))^2) # MSE of the IS estimator over the entire sim study
mean((stdEst - pt(-3, df = 1))^2) # MSE of the standard MC estimator over the entire sim study
# an order of magnitude gain in efficiency

############################################
# 4: Design of simulation studies
############################################

# 4.2 Overview

devs <- rnorm(100)
tdevs <- qt(pnorm(devs), df = 1)
plot(devs, tdevs)
abline(0, 1)

##############################################
# 5: Implementation of simulation studies
##############################################

thetaLevels <- c("low", "med", "hi")
n <- c(10, 100, 1000)
tVsNorm <- c("t", "norm")
levels <- expand.grid(thetaLevels, tVsNorm, n)

# example of replicate() -- generate m sets of correlated normals
library(fields) # for rdist()
set.seed(0)
genFun <- function(n, theta = 1){
    u <- rnorm(n)
    x <- runif(n)
    Cov <- exp(-rdist(x)/theta)
    U <- chol(Cov)
    return(cbind(x, crossprod(U, u)))
}
m <- 20
simData <- replicate(m, genFun(100, 1))
dim(simData) # 100 observations by {x, y} values by 20 replicates

##########################################################################
### Demo code for Unit 11 of Stat243, "Integration and Differentiation"
### Chris Paciorek, November 2012
##########################################################################

#########################################
# 1: Differentiation
#########################################

### 1.1 Numerical differentiation

# compute first and second derivatives of log(gamma(x)) at x = 1/2
options(digits = 9, width = 120)
h <- 10^(-(1:15))
x <- 1/2
fx <- lgamma(x)
# targets: the actual derivatives can be computed very accurately using built-in R functions:
digamma(x) # accurate first derivative
trigamma(x) # accurate second derivative
# calculate discrete differences
fxph <- lgamma(x + h)
fxmh <- lgamma(x - h)
fxp2h <- lgamma(x + 2*h)
fxm2h <- lgamma(x - 2*h)
# now find numerical derivatives
fp1 <- (fxph - fx)/h # forward difference
fp2 <- (fxph - fxmh)/(2*h) # central difference
# second derivatives
fpp1 <- (fxp2h - 2*fxph + fx)/(h*h) # forward difference
fpp2 <- (fxph - 2*fx + fxmh)/(h*h) # central difference
# table of results
cbind(h, fp1, fp2, fpp1, fpp2)

### 1.2 Numerical differentiation in R

x <- 1/2
numericDeriv(quote(lgamma(x)), "x")

### 1.3 Symbolic differentiation

deriv(quote(atan(x)), "x") # derivative of a simple expression
# derivative of a function; note we need to pass in an expression,
# not the entire function
f <- function(x, y) sin(x * y) + x^3 + exp(y)
newBody <- deriv(body(f), c("x", "y"), hessian = TRUE)
# now create a new version of f that provides the gradient
# and hessian as attributes of the output,
# in addition to the function value as the return value
f <- function(x, y) {} # function template
body(f) <- newBody
# try out the new function
f(3, 1)
attr(f(3, 1), "gradient")
attr(f(3, 1), "hessian")

### symbolic differentiation in Mathematica
### first partial derivative wrt x
# D[ Exp[x^n] - Cos[x y], x]
### second partial derivative
# D[ Exp[x^n] - Cos[x y], {x, 2}]
### partials
# D[ Exp[x^n] - Cos[x y], x, y]
### trig function example
# D[ ArcTan[x], x]
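# For comparison (my addition), the same partials computed in R with D();
# quote() builds the expression, and nesting D() gives higher-order/mixed partials:
ex <- quote(exp(x^n) - cos(x*y))
D(ex, "x")             # first partial wrt x
D(D(ex, "x"), "x")     # second partial wrt x
D(D(ex, "x"), "y")     # mixed partial
D(quote(atan(x)), "x") # the trig example: 1/(1 + x^2)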
#########################################
# 2: Integration
#########################################

f <- function(x) sin(x)
# mathematically, the integral from 0 to pi is 2
# quadrature through integrate()
integrate(f, 0, pi)
system.time(integrate(f, 0, pi))
# MC estimate
ninteg <- function(n) mean(sin(runif(n, 0, pi))*pi)
n <- 1000
ninteg(n)
system.time(ninteg(n))
n <- 10000
ninteg(n)
system.time(ninteg(n))
n <- 1000000
ninteg(n)
system.time(ninteg(n))
# that was fairly slow, especially if you need to do a lot of individual integrals

### 2.2 Numerical integration in R

integrate(dnorm, -Inf, Inf, 0, .1)
integrate(dnorm, -Inf, Inf, 0, .001)
integrate(dnorm, -Inf, Inf, 0, .0001) # THIS FAILS!

### 2.3 Singularities and infinite ranges

# doing it directly with integrate()
f <- function(x) exp(x)/sqrt(x)
integrate(f, 0, 1)
# subtracting off the singularity
f <- function(x) (exp(x) - 1)/sqrt(x)
x <- seq(0, 1, len = 200)
integrate(f, 0, 1)
# analytic change of variables, followed by numerical integration
f <- function(u) 2*exp(u^2)
integrate(f, 0, 1)

### 2.4 Symbolic integration
### in Mathematica
### one-dimensional integration
# Integrate[Sin[x]^2, x]
### two-dimensional integration
# Integrate[Sin[x] Exp[-y^2], x, y]

################################################################
### Demo code for Unit 12 of Stat243, "Optimization"
### Chris Paciorek, November 2012
################################################################

#######################################
# 3: Univariate function optimization
#######################################

### 3.3.3 How can Newton's method go wrong?

par(mfrow = c(1, 2))
fp <- function(x, theta = 1){
    exp(x*theta)/(1 + exp(x*theta)) - .5
}
fpp <- function(x, theta = 1){
    exp(x*theta)/((1 + exp(x*theta))^2)
}
xs <- seq(-15, 15, len = 300)
plot(xs, fp(xs), type = 'l')
lines(xs, fpp(xs), lty = 2)
# good starting point
x0 <- 2
xvals <- c(x0, rep(NA, 9))
for(t in 2:10){
    xvals[t] = xvals[t-1] - fp(xvals[t-1]) / fpp(xvals[t-1])
}
print(xvals)
points(xvals, fp(xvals), pch = as.character(1:length(xvals)))
# bad starting point
x0 <- 2.5
xvals <- c(x0, rep(NA, 9))
for(t in 2:10){
    xvals[t] = xvals[t-1] - fp(xvals[t-1]) / fpp(xvals[t-1])
}
print(xvals)
points(xvals, fp(xvals), pch = as.character(1:length(xvals)), col = 'red') # whoops

# mistakenly climbing uphill
f <- function(x) cos(x)
fp <- function(x) -sin(x)
fpp <- function(x) -cos(x)
xs <- seq(0, 2*pi, len = 300)
plot(xs, f(xs), type = 'l', lwd = 2)
lines(xs, fp(xs))
lines(xs, fpp(xs), lty = 2)
x0 <- 0.2 # starting point
fp(x0) # negative
fpp(x0) # negative
x1 <- x0 - fp(x0)/fpp(x0) # whoops, we've gone uphill
# because of the negative second derivative
xvals <- c(x0, rep(NA, 9))
for(t in 2:10){
    xvals[t] = xvals[t-1] - fp(xvals[t-1])/fpp(xvals[t-1])
}
xvals
points(xvals, fp(xvals), pch = as.character(1:length(xvals)), col = 'red')
# and we've found a maximum rather than a minimum...
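# A hedged safeguard sketch (my addition, not the course's prescribed fix):
# flip the Newton step when the second derivative is negative (so we go
# downhill) and backtrack by halving until the objective actually decreases.
dampedNewton <- function(x, f, fp, fpp, nIt = 20) {
    for (t in 1:nIt) {
        step <- -fp(x)/fpp(x)
        if (fpp(x) < 0) step <- -step # flip so we descend rather than climb
        while (f(x + step) > f(x)) step <- step/2 # backtrack until f decreases
        x <- x + step
    }
    x
}
dampedNewton(0.2, f, fp, fpp) # now converges to the minimum near pi, not the maximum at 0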
#######################################
# 4: Convergence ideas
#######################################

### 4.3 Convergence rates

options(digits = 10)
f <- function(x) cos(x)
fp <- function(x) -sin(x)
fpp <- function(x) -cos(x)
xstar <- pi # the known minimum

# N-R
x0 <- 2
xvals <- c(x0, rep(NA, 9))
for(t in 2:10){
    xvals[t] <- xvals[t-1] - fp(xvals[t-1]) / fpp(xvals[t-1])
}
print(xvals)

# bisection
bisecStep <- function(interval, fp){
    xt <- mean(interval)
    if(fp(interval[1]) * fp(xt) <= 0) interval[2] <- xt else interval[1] <- xt
    return(interval)
}
nIt <- 30
a0 <- 2; b0 <- (3*pi/2) - (xstar - a0) # have b0 be as far from the min as a0, for a fair comparison with N-R
interval <- matrix(NA, nr = nIt, nc = 2)
interval[1, ] <- c(a0, b0)
for(t in 2:nIt){
    interval[t, ] <- bisecStep(interval[t-1, ], fp)
}
rowMeans(interval)

########################################
### 5: Multivariate optimization
########################################

### 5.2 Newton-Raphson (Newton's method)

library(MASS)
head(wtloss)
attach(wtloss)
plot(Days, Weight, ylab = "Weight (kg)")
## Linear fit - not a good model
wtloss.lm <- lm(Weight ~ Days, data = wtloss)
coef(wtloss.lm) # estimates of the intercept and slope
lines(Days, fitted(wtloss.lm), col = "grey")

# we need some starting values
# guess that beta2 is approximately 100, since the midpoint of Days is near 100
beta2.init = 100
plot(2^(-Days/beta2.init), Weight)
tmpMod = lm(Weight ~ I(2^(-Days/beta2.init)))
beta0.init = tmpMod$coef[1]
beta1.init = tmpMod$coef[2]
plot(Days, Weight, ylab = "Weight (kg)")
lines(Days, beta0.init + beta1.init * 2^(-Days/beta2.init), col = 'red')
# not bad; let's go with that

expr = quote((Weight - (beta0 + beta1 * 2^(-Days/beta2)))^2)
# the objective is the sum of this quantity over the observations;
# let R do the differentiation of the basic expression for us (since human
# error in taking derivatives is very common), but R won't deal with the
# summation, so we'll have to do that ourselves
deriv(expr, c("beta0", "beta1", "beta2"), function.arg = TRUE)
deriv3(expr, c("beta0", "beta1", "beta2"), function.arg = TRUE) # same as deriv() with hessian = TRUE

f = function(betas){
    sum((Weight - (betas[1] + betas[2] * 2^(-Days/betas[3])))^2)
}

# use the basic code based on deriv() and deriv3()
fp = function(betas){ # a bit sloppy as I use Days and Weight as global vars here
    beta0 = betas[1]
    beta1 = betas[2]
    beta2 = betas[3]
    .expr3 <- 2^(-Days/beta2)
    .expr6 <- Weight - (beta0 + beta1 * .expr3)
    grad = - c(sum(2*.expr6),
        sum(2*(.expr3*.expr6)),
        sum(2*(beta1 * (.expr3 * (log(2) * (Days/beta2^2))) * .expr6)))
    return(grad)
}

fpp = function(betas){ # a bit sloppy as I use Days and Weight as global vars here
    n = length(Days)
    beta0 = betas[1]
    beta1 = betas[2]
    beta2 = betas[3]
    .expr3 <- 2^(-Days/beta2)
    .expr6 <- Weight - (beta0 + beta1 * .expr3)
    .expr11 <- log(2)
    .expr12 <- beta2^2
    .expr14 <- .expr11 * (Days/.expr12)
    .expr15 <- .expr3 * .expr14
    .expr16 <- beta1 * .expr15
    hessian = matrix(0, 3, 3)
    hessian[1, 1] <- 2*n # don't forget to do the summation (i.e., multiply by n)
    hessian[1, 2] <- sum(2 * .expr3)
    hessian[1, 3] <- sum(2 * .expr16)
    hessian[2, 2] <- sum(2 * (.expr3 * .expr3))
    hessian[2, 3] <- - sum(2 * (.expr15 * .expr6 - .expr3 * .expr16))
    hessian[3, 3] <- - sum(2 * (beta1 * (.expr15 * .expr14 - .expr3 *
        (.expr11 * (Days * (2 * beta2)/.expr12^2))) * .expr6 - .expr16 * .expr16))
    hessian[lower.tri(hessian)] = t(hessian)[lower.tri(hessian)] # fill in by symmetry
    return(hessian)
}

nIt = 20
xvals = matrix(NA, nr = nIt, nc = 3)
xvals[1, ] = c(beta0.init, beta1.init, beta2.init)
for(t in 2:nIt){
    xvals[t, ] = xvals[t-1, ] - solve(fpp(xvals[t-1, ]), fp(xvals[t-1, ]))
    if(FALSE){ # in case the Hessian is not numerically p.d., use a pseudoinverse
        e = eigen(fpp(xvals[t-1, ]))
        e$val = 1/e$val
        e$val[e$val < 0] = 0 # pseudo-inverse
        xvals[t, ] = xvals[t-1, ] - e$vec %*% ((t(e$vec)*e$val) %*% fp(xvals[t-1, ]))
    }
}
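# A quick hedged check (my addition) of the convergence rate: the maximum
# relative change in the parameters per iteration should shrink very rapidly
# (roughly quadratically) once Newton's method is near the optimum.
apply(abs(diff(xvals))/abs(xvals[-nIt, ]), 1, max)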
# let's check against R's built-in nonlinear least squares function
beta.start = xvals[1, ]
names(beta.start) = c("beta0", "beta1", "beta2")
wtloss.fm <- nls(Weight ~ beta0 + beta1*2^(-Days/beta2),
    data = wtloss, start = beta.start, trace = TRUE)
nlsEst = coef(wtloss.fm)
f(xvals[20, ])
f(nlsEst)
plot(Days, Weight, ylab = "Weight (kg)")
lines(Days, fitted(wtloss.fm), col = "blue")

### 5.3 Fisher scoring

# the data enter only through .expr6, and all Hessian terms involving .expr6 are
# linear in it, so replace .expr6 with its expectation, which is simply 0
fppFS = function(betas){ # a bit sloppy as I use Days and Weight as global vars here
    n = length(Days)
    beta0 = betas[1]
    beta1 = betas[2]
    beta2 = betas[3]
    .expr3 <- 2^(-Days/beta2)
    .expr6 <- 0 # Weight - (beta0 + beta1 * .expr3)
    .expr11 <- log(2)
    .expr12 <- beta2^2
    .expr14 <- .expr11 * (Days/.expr12)
    .expr15 <- .expr3 * .expr14
    .expr16 <- beta1 * .expr15
    hessian = matrix(0, 3, 3)
    hessian[1, 1] <- 2*n # don't forget to do the summation (i.e., multiply by n)
    hessian[1, 2] <- sum(2 * .expr3)
    hessian[1, 3] <- sum(2 * .expr16)
    hessian[2, 2] <- sum(2 * (.expr3 * .expr3))
    hessian[2, 3] <- - sum(2 * (.expr15 * .expr6 - .expr3 * .expr16))
    hessian[3, 3] <- - sum(2 * (beta1 * (.expr15 * .expr14 - .expr3 *
        (.expr11 * (Days * (2 * beta2)/.expr12^2))) * .expr6 - .expr16 * .expr16))
    hessian[lower.tri(hessian)] = t(hessian)[lower.tri(hessian)]
    return(hessian)
}

nIt = 20
xvals = matrix(NA, nr = nIt, nc = 3)
xvals[1, ] = c(beta0.init, beta1.init, beta2.init)
fpp(xvals[1, ])
fppFS(xvals[1, ]) # pretty similar - some terms in the observed FI don't involve the data, so don't change anyway
for(t in 2:nIt){
    xvals[t, ] = xvals[t-1, ] - solve(fppFS(xvals[t-1, ]), fp(xvals[t-1, ]))
}
# FS seems to give slightly faster convergence here

### 5.5.1 Descent methods and Newton-like methods

# steepest descent
f <- function(x){
    x[1]^2/1000 + 4*x[1]*x[2]/1000 + 5*x[2]^2/1000
}
fp <- function(x){
    c(2*x[1]/1000 + 4*x[2]/1000,
      4*x[1]/1000 + 10*x[2]/1000)
}
lineSearch <- function(alpha, xCurrent, direction, FUN){
    newx <- xCurrent + alpha * direction
    FUN(newx)
}
nIt <- 50
xvals <- matrix(NA, nr = nIt, nc = 2)
xvals[1, ] <- c(7, -4)
for(t in 2:50){
    newalpha <- optimize(lineSearch, interval = c(-5000, 5000),
        xCurrent = xvals[t-1, ], direction = fp(xvals[t-1, ]), FUN = f)$minimum
    xvals[t, ] <- xvals[t-1, ] + newalpha * fp(xvals[t-1, ])
}
x1s <- seq(-5, 8, len = 100); x2s = seq(-5, 2, len = 100)
fx <- apply(expand.grid(x1s, x2s), 1, f)
library(fields) # for image.plot() and tim.colors()
# plot the f(x) surface on the log scale
image.plot(x1s, x2s, matrix(log(fx), 100, 100), xlim = c(-5, 8), ylim = c(-5, 2))
lines(xvals) # overlay the optimization path
# kind of slow

### 5.6 Gauss-Seidel

f <- function(x){
    return(x[1]^2/1000 + 4*x[1]*x[2]/1000 + 5*x[2]^2/1000)
}
f1 <- function(x1, x2){ # f(x) as a function of x1
    return(x1^2/1000 + 4*x1*x2/1000 + 5*x2^2/1000)
}
f2 <- function(x2, x1){ # f(x) as a function of x2
    return(x1^2/1000 + 4*x1*x2/1000 + 5*x2^2/1000)
}
x1s <- seq(-5, 8, len = 100); x2s = seq(-5, 2, len = 100)
fx <- apply(expand.grid(x1s, x2s), 1, f)
image.plot(x1s, x2s, matrix(log(fx), 100, 100))
nIt <- 49
xvals <- matrix(NA, nr = nIt, nc = 2)
xvals[1, ] <- c(7, -4) # 5, -10
for(t in seq(2, nIt, by = 2)){
    newx1 <- optimize(f1, x2 = xvals[t-1, 2], interval = c(-40, 40))$minimum
    xvals[t, ] <- c(newx1, xvals[t-1, 2])
    newx2 <- optimize(f2, x1 = newx1, interval = c(-40, 40))$minimum
    xvals[t+1, ] <- c(newx1, newx2)
}
lines(xvals)
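# For comparison (my addition): on this simple quadratic, a quasi-Newton method
# reaches the minimum at (0, 0) essentially immediately from the same start.
optim(c(7, -4), f, method = 'BFGS')$par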
### 5.7 Nelder-Mead

# set up the tuning factors
alpha = 1  # reflection
gamma = 2  # expansion
beta = .5  # contraction
delta = .5 # shrinkage

# auxiliary function to plot line segments
plotseg = function(ind1, ind2, col = 1){
    if(length(ind1) == 1){
        segments(xs[ind1, 1], xs[ind1, 2], xs[ind2, 1], xs[ind2, 2], col = col)
    } else{
        segments(ind1[1], ind1[2], xs[ind2, 1], xs[ind2, 2], col = col)
    }
}

# initial polytope
xs = matrix(c(-2, 3, -6, 4, -4, 2), nc = 2, byrow = TRUE)
plot(xs, xlim = c(-10, 5), ylim = c(-2, 8))
plotseg(2, 3)
plotseg(1, 3)
plotseg(1, 2)
xbar = (xs[1, ] + xs[2, ])/2
points(xbar[1], xbar[2], col = 'red')

### reflection
xr = (1 + alpha)*xbar - alpha*xs[3, ]
points(xr[1], xr[2], col = 'red')
plotseg(xr, 1, col = 'red')
plotseg(xr, 2, col = 'red')
plotseg(1, 2, col = 'red')
# the red triangle is now our proposed new polytope

### consider expansion if xr is better than all the other points
xe = gamma*xr + (1 - gamma)*xbar
points(xe[1], xe[2], col = 'green')
points(xr[1], xr[2], col = 'green')
# if xe is better than xr, use xe; otherwise use xr
plotseg(xe, 1, col = 'green')
plotseg(xe, 2, col = 'green')
plotseg(1, 2, col = 'green')

### consider contraction if xr is worse than all the other points
points(xs[3, 1], xs[3, 2], col = 'blue')
points(xr[1], xr[2], col = 'blue')
# set xh to be the better of these two points
xh = xs[3, ] # suppose the original point is better than the reflection
xc = beta*xh + (1 - beta)*xbar
points(xc[1], xc[2], col = 'blue')
# if xc is better than xh, then contract
plotseg(xc, 1, col = 'blue')
plotseg(xc, 2, col = 'blue')
plotseg(1, 2, col = 'blue')
# if not, shrink the simplex toward the best point (xs[1, ])

### shrinkage
xs[2, ] = delta*xs[2, ] + (1 - delta)*xs[1, ]
xs[3, ] = delta*xs[3, ] + (1 - delta)*xs[1, ]
points(xs[2, 1], xs[2, 2], col = 'purple')
points(xs[3, 1], xs[3, 2], col = 'purple')
plotseg(1, 2, 'purple')
plotseg(1, 3, 'purple')
plotseg(2, 3, 'purple')

### 5.8 Simulated annealing

x = seq(0, 10, len = 100)
f = function(x) sin(x)
plot(x, f(x), type = 'l', ylim = c(-1, 3))
tau = 10
# plot the modified function
lines(x, exp(-f(x)/tau))
# try tau = 3, 1, .3, etc.
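# A minimal simulated annealing sketch (my addition), minimizing sin(x) on
# [0, 10] with a random-walk proposal and a geometric cooling schedule; all
# of the tuning choices below are arbitrary illustrations.
set.seed(1)
xCur <- 5; tau <- 10
for (t in 1:2000) {
    xProp <- xCur + rnorm(1, sd = 0.5) # random-walk proposal
    if (xProp > 0 && xProp < 10 &&
        runif(1) < exp(-(f(xProp) - f(xCur))/tau)) # accept worse moves with probability depending on tau
        xCur <- xProp
    tau <- tau * 0.995 # cool gradually
}
xCur # should be near 3*pi/2 (about 4.71), the minimizer of sin on [0, 10]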
##############################
# 6: Optimization in R
##############################

### 6.1 Core optimization functions

# using optim()
yHundredths = scan('precipData.txt') # precip in hundredths of inches
yHundredths = yHundredths[!is.na(yHundredths)]
y = yHundredths/100 # precip now in inches
par(mfrow = c(1, 2))
hist(y)
thresh = 3
hist(y[y > thresh])
npy = 31 + 28 + 31 # number of days in the winter season
cutoff = (1/25.4) # wet days defined as those with > 1 mm (1/25.4 inches) of precip
thresh = as.numeric(quantile(y[y > cutoff], .98, na.rm = TRUE)) # 98th percentile of wet days

# negative log likelihood for the point process model
pp.lik <- function(par, y, thresh, npy) {
    mu <- par[1]
    sc <- par[2]
    xi <- par[3]
    uInd = y > thresh
    if(sc <= 0) return(10^6)
    if((1 + ((xi * (thresh - mu))/sc)) < 0) {
        l <- 10^6
    } else {
        y <- (y - mu)/sc
        y <- 1 + xi * y
        if(min(y^uInd) <= 0){
            l <- 10^6
        } else {
            ytmp = y
            ytmp[!uInd] = 1 # 'zeroes' out those below the threshold once the log is applied in the next line
            l <- sum(uInd * log(sc)) + sum(uInd * log(ytmp) * (1/xi + 1)) +
                length(y)/npy * mean((1 + (xi * (thresh - mu))/sc)^(-1/xi))
        }
    }
    l
}

# optim() usage
# out <- optim(init, pp.lik, hessian = TRUE, method = "BFGS",
#     control = list(maxit = maxit, trace = TRUE))

yExc = y[y > thresh]
in2 <- sqrt(6 * var(yExc))/pi # have the initial values depend only on those above the threshold
in1 <- mean(yExc) - 0.57722 * in2
init0 = c(in1, in2, 0.1)

# fit with Nelder-Mead (the default) and BFGS
optim(init0, pp.lik, y = y, thresh = thresh, npy = npy,
    control = list(trace = TRUE)) # 118 function evaluations
optim(init0, pp.lik, y = y, thresh = thresh, npy = npy,
    method = 'BFGS', control = list(trace = TRUE)) # ~10 iterations
system.time(optim(init0, pp.lik, y = y, thresh = thresh, npy = npy))
system.time(optim(init0, pp.lik, y = y, thresh = thresh, npy = npy, method = 'BFGS'))

# a different starting value
init1 = c(mean(y[y > thresh]), sd(y[y > thresh]), -0.1)
optim(init1, pp.lik, y = y, thresh = thresh, npy = npy,
    control = list(trace = TRUE))
optim(init1, pp.lik, y = y, thresh = thresh, npy = npy,
    method = 'BFGS', control = list(trace = TRUE))

# a bad starting value for BFGS
init2 = c(thresh, .01, .1)
out = optim(init2, pp.lik, y = y, thresh = thresh, npy = npy,
    control = list(trace = TRUE), hessian = TRUE)
solve(out$hessian)
out2 = optim(init2, pp.lik, y = y, thresh = thresh, npy = npy,
    method = 'BFGS', control = list(trace = TRUE), hessian = TRUE)
solve(out2$hessian)
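# A hedged aside (my addition): since pp.lik is a negative log likelihood, the
# inverse Hessian at the optimum approximates the variance-covariance matrix of
# the MLE, so, provided the optimizer actually converged to a minimum,
# approximate standard errors are:
sqrt(diag(solve(out2$hessian)))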
# suppose the data were on a different scale
in2 <- sqrt(6 * var(yExc*1000))/pi # initial values again depend only on those above the threshold
in1 <- mean(yExc*1000) - 0.57722 * in2
init3 = c(in1, in2, 0.1)
optim(init3, pp.lik, y = y*1000, thresh = thresh*1000, npy = npy,
    control = list(trace = TRUE))
# note that the parameters have changed even beyond a scaling effect
optim(init3, pp.lik, y = y*1000, thresh = thresh*1000, npy = npy,
    method = 'BFGS', control = list(trace = TRUE))
# note the lack of convergence after 100 iterations
optim(init3, pp.lik, y = y*1000, thresh = thresh*1000, npy = npy,
    method = 'BFGS', control = list(trace = TRUE, maxit = 1000))
# convergence, but to values not concordant with those from fitting the original y data
# when we have y*1000, the location and scale parameters are on very different scales than the shape parameter
# can we use parscale to deal with the problems when the data are on a different scale?
optim(init3, pp.lik, y = y*1000, thresh = thresh*1000, npy = npy,
    control = list(trace = TRUE, parscale = c(1000, 1000, 1)))
optim(init3, pp.lik, y = y*1000, thresh = thresh*1000, npy = npy,
    method = 'BFGS', control = list(trace = TRUE, parscale = c(1000, 1000, 1)))
# yes, that works! the parameter estimates are now equivalent to those from the standard fitting of the original data

# the default step size for the numerical derivative is only .001; perhaps we should try higher accuracy
optim(init0, pp.lik, y = y, thresh = thresh, npy = npy, method = 'BFGS',
    control = list(trace = TRUE, ndeps = rep(1e-6, 3)))
# note that we needed only 33 function evaluations instead of the original 43,
# presumably because of higher accuracy in the derivative

#### let's do some plotting of the objective function as a sanity check
# 3-d grid of location, scale, shape
locVals = seq(0, 5, len = 30)
scaleVals = seq(.1, 3, len = 30)
shapeVals = seq(-.3, .25, by = .05)
parGrid = expand.grid(loc = locVals, scale = scaleVals, shape = shapeVals)
obj = apply(parGrid, 1, pp.lik, y = y, thresh = thresh, npy = npy) # objective function for all parameter combos
tmp = cbind(parGrid, obj)
par(mfrow = c(3, 4), mai = c(.6, .5, .3, .1), mgp = c(1.8, .7, 0))
for(i in 1:length(shapeVals)){
    tmp2 = tmp[tmp$shape == shapeVals[i], ] # slice of the objective function for a fixed shape parameter
    image.plot(locVals, scaleVals,
        matrix(tmp2$obj, length(locVals), length(scaleVals)),
        col = tim.colors(32), zlim = c(40, 80), main = as.character(shapeVals[i]))
}
# note a couple of weird things about this log likelihood: (1) strange behavior when
# the shape parameter is very close to 0, and (2) for certain parameter combinations
# the likelihood is not defined (it's set to 1e6) - this is why some of the colors go
# from aqua to white without showing red values - reparameterizing or optimizing with
# explicit constraints may be better approaches
# apart from that, it appears that optim() has probably found the minimum
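# Following up on the constraint comment above (my addition, a sketch): box
# constraints can be imposed with method 'L-BFGS-B', e.g., keeping the scale
# parameter positive rather than relying on the 10^6 penalty inside pp.lik:
out3 <- optim(init0, pp.lik, y = y, thresh = thresh, npy = npy,
    method = 'L-BFGS-B', lower = c(-Inf, 1e-6, -Inf))
out3$par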