August 2013, UC Berkeley
Chris Paciorek
Note: Remember to start recording.
2 + 2 # add numbers
## [1] 4
2 * pi # multiply by a constant
## [1] 6.283
7 + runif(1) # add a random number
## [1] 7.649
3^4 # powers
## [1] 81
sqrt(4^4) # functions
## [1] 16
log(10)
## [1] 2.303
log(100, base = 10)
## [1] 2
23%/%2
## [1] 11
23%%2
## [1] 1
# scientific notation
5e+09 * 1000
## [1] 5e+12
5e+09 * 1000
## [1] 5e+12
Think of a mathematical operation you need - can you guess how to do it in R?
A key action in R is to store values in the form of R objects, and to examine the value of R objects.
val <- 3
val
## [1] 3
print(val)
## [1] 3
Val <- 7 # case-sensitive!
print(c(val, Val))
## [1] 3 7
We can work with (and store) sequences and repetitions
mySeq <- 1:6
mySeq
## [1] 1 2 3 4 5 6
myOtherSeq <- seq(1.1, 11.1, by = 2)
myOtherSeq
## [1] 1.1 3.1 5.1 7.1 9.1 11.1
length(myOtherSeq)
## [1] 6
fours <- rep(4, 6)
fours
## [1] 4 4 4 4 4 4
# This is a comment: here is an example of non-numeric data
depts <- c("espm", "pmb", "stats")
depts
## [1] "espm" "pmb" "stats"
If we don't assign the output of a command to an object, we haven't saved it for later use.
R gives us a lot of flexibility (within certain rules) for assigning to (parts of) objects from (parts of) other objects.
If you're starting to type something you've typed before, or the long name of an R object or function, STOP! You likely don't need to type all of that.
Save your code in a file and run it with `source()`. For example: `source('myRcodeFile.R')`
Are there other tricks that anyone knows of?
The most basic form of an R object is a vector. In fact, individual (scalar) values are vectors of length one.
We can concatenate values into a vector with `c()`.
# numeric vector
nums <- c(1.1, 3, -5.7)
devs <- rnorm(5)
devs
## [1] -1.4806 1.2553 -1.3394 -0.6328 -0.6519
# integer vector
ints <- c(1L, 5L, -3L) # force storage as integer not decimal number
# 'L' is for 'long integer' (historical)
idevs <- sample(ints, 100, replace = TRUE)
# character vector
chars <- c('hi', 'hallo', "mother's", 'father\'s',
"She said, 'hi'", "He said, \"hi\"" )
chars
## [1] "hi" "hallo" "mother's" "father's"
## [5] "She said, 'hi'" "He said, \"hi\""
cat(chars, sep = "\n")
## hi
## hallo
## mother's
## father's
## She said, 'hi'
## He said, "hi"
# logical vector
bools <- c(TRUE, FALSE, TRUE)
bools
## [1] TRUE FALSE TRUE
vals <- seq(2, 12, by = 2)
vals
## [1] 2 4 6 8 10 12
vals[3]
## [1] 6
vals[3:5]
## [1] 6 8 10
vals[c(1, 3, 6)]
## [1] 2 6 12
vals[-c(1, 3, 6)]
## [1] 4 8 10
vals[c(rep(TRUE, 3), rep(FALSE, 2), TRUE)]
## [1] 2 4 6 12
We can substitute values into vectors
vals[4] <- -35
vals[1:2] <- 0
# How does R process these next subset operations?
vals <- rnorm(100)
vals[vals < 0] <- 0
vals[1:8]
## [1] 1.4354 0.0000 0.0000 0.4205 0.0000 0.1906 0.0000 0.6670
set.seed(0) # make random number generation repeatable
vals <- rnorm(100)
extremes <- vals[vals > 3]
extremes
## numeric(0)
# what happened?
At the core of R is the idea of doing calculations on entire vectors.
vec1 <- sample(1:5, 10, replace = TRUE)
vec2 <- sample(1:5, 10, replace = TRUE)
vec1
## [1] 4 2 2 3 2 1 3 3 1 2
vec2
## [1] 4 5 1 4 5 5 2 4 5 5
vec1 + vec2
## [1] 8 7 3 7 7 6 5 7 6 7
vec1^vec2
## [1] 256 32 2 81 32 1 9 81 1 32
vec1 >= vec2
## [1] TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
vec1 <= 3
## [1] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
vec1 == vec2
## [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
vec1 != vec2
## [1] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# careful:
vec1 = vec2
identical(vec1, vec2)
## [1] TRUE
An important related concept is that of recycling
vec3 <- sample(1:5, 5, replace = TRUE)
vec4 <- sample(1:5, 3, replace = TRUE)
vec1
## [1] 4 5 1 4 5 5 2 4 5 5
vec3
## [1] 2 2 1 2 3
vec4
## [1] 5 3 2
vec1 + vec3
## [1] 6 7 2 6 8 7 4 5 7 8
vec1 + vec4
## Warning: longer object length is not a multiple of shorter object length
## [1] 9 8 3 9 8 7 7 7 7 10
Tell me what's going on. What choices were made by the R developers?
sum
## function (..., na.rm = FALSE) .Primitive("sum")
class(sum)
## [1] "function"
hist(rnorm(1000))
plot of chunk unnamed-chunk-9
To get information about a function you know exists, use `help` or `?`, e.g., `?lm`. For information on a general topic, use `apropos` or `??`.
help(lm)
`?`(lm)
`?`(log)
# ??'regr' # take out the '#' - there's an issue with Markdown
# processing...
Vectors are not the only kinds of R objects.
Vectors: vectors of various types (numeric, i.e., decimal/floating point/double; integer; boolean; character); all items must be of the same type.
Matrices: matrices of various types; all items must be of the same type.
mat <- matrix(rnorm(9), nrow = 3)
t(mat) %*% mat
## [,1] [,2] [,3]
## [1,] 1.621 1.650 -1.223
## [2,] 1.650 1.954 -1.507
## [3,] -1.223 -1.507 4.047
dim(mat)
## [1] 3 3
library(foreign)
vote <- read.dta("../data/2004_labeled_processed_race.dta")
head(vote)
## state pres04 sex race age9 partyid income relign8 age60 age65
## 1 2 1 female white 25-29 <NA> <NA> <NA> 18-29 25-29
## 2 2 2 male white 18-24 <NA> <NA> <NA> 18-29 18-24
## 3 2 1 female black 30-39 <NA> <NA> <NA> 30-44 30-39
## 4 2 1 female black 30-39 <NA> <NA> <NA> 30-44 30-39
## 5 2 1 female white 40-44 <NA> <NA> <NA> 30-44 40-49
## 6 2 1 female white 30-39 <NA> <NA> <NA> 30-44 30-39
## geocode sizeplac brnagain attend year region y
## 1 3 rural <NA> <NA> 2004 4 0
## 2 3 rural <NA> <NA> 2004 4 1
## 3 3 rural <NA> <NA> 2004 4 0
## 4 3 rural <NA> <NA> 2004 4 0
## 5 3 rural <NA> <NA> 2004 4 0
## 6 3 rural <NA> <NA> 2004 4 0
dim(vote)
## [1] 76205 17
nrow(vote)
## [1] 76205
names(vote)
## [1] "state" "pres04" "sex" "race" "age9" "partyid"
## [7] "income" "relign8" "age60" "age65" "geocode" "sizeplac"
## [13] "brnagain" "attend" "year" "region" "y"
class(vote)
## [1] "data.frame"
is.matrix(vote)
## [1] FALSE
class(vote$pres04)
## [1] "integer"
class(vote$sex)
## [1] "factor"
class(vote$age9)
## [1] "factor"
myList <- list(stuff = 3, mat = matrix(1:4, nrow = 2), moreStuff = c("china",
"japan"), list(5, "bear"))
myList
## $stuff
## [1] 3
##
## $mat
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
##
## $moreStuff
## [1] "china" "japan"
##
## [[4]]
## [[4]][[1]]
## [1] 5
##
## [[4]][[2]]
## [1] "bear"
myList[[1]] # result is not (usually) a list (unless you have nested lists)
## [1] 3
identical(myList[[1]], myList$stuff)
## [1] TRUE
myList$moreStuff[2]
## [1] "japan"
myList[[4]][[2]]
## [1] "bear"
myList[1:3] # subset of a list is a list
## $stuff
## [1] 3
##
## $mat
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
##
## $moreStuff
## [1] "china" "japan"
myList$newOne <- "more weird stuff"
names(myList)
## [1] "stuff" "mat" "moreStuff" "" "newOne"
Lists can be used as vectors of complicated objects. E.g., suppose you have a linear regression for each value of a stratifying variable. You could have a list of regression fits. Each regression fit will itself be a list, so you'll have a list of lists.
R has several approaches to object-oriented programming. These are widely used, albeit a bit clunky.
The most basic is 'S3' objects. These objects are generally built upon lists.
y <- rnorm(10)
x <- rnorm(10)
mod <- lm(y ~ x)
class(mod)
## [1] "lm"
is.list(mod)
## [1] TRUE
names(mod)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
mod$coefficients
## (Intercept) x
## -0.3874 0.1842
mod[["coefficients"]]
## (Intercept) x
## -0.3874 0.1842
mod[[1]]
## (Intercept) x
## -0.3874 0.1842
The magic of OOP here is that methods (i.e., functions) can be tailored to work with specific kinds of objects.
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.880 -0.799 -0.647 -0.290 0.521 1.330
summary(mod)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.909 -0.372 -0.159 0.603 1.460
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.387 0.340 -1.14 0.29
## x 0.184 0.250 0.74 0.48
##
## Residual standard error: 0.989 on 8 degrees of freedom
## Multiple R-squared: 0.0635, Adjusted R-squared: -0.0536
## F-statistic: 0.542 on 1 and 8 DF, p-value: 0.483
What do you think R is doing behind the scenes?
Consider `summary.lm`.
You can use the `as()` family of functions (e.g., `as.character()`, `as.numeric()`) to convert between types.
ints <- 1:10
as.character(ints)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
as.numeric(c("3.7", "4.8"))
## [1] 3.7 4.8
Be careful: R tries to be helpful and convert between types/classes when it thinks it's a good idea. Sometimes it is overly optimistic.
indices <- c(1.7, 2.3)
ints[indices]
## [1] 1 2
ints[0.999999999]
## integer(0)
R has a number of functions for getting metadata about your objects. Some of this is built in to RStudio.
vec1 <- 1:4
vec2 <- c(1, 2, 3, 4)
length(vec1)
## [1] 4
str(vec1)
## int [1:4] 1 2 3 4
class(vec1)
## [1] "integer"
typeof(vec1)
## [1] "integer"
class(vec2)
## [1] "numeric"
typeof(vec2)
## [1] "double"
is.vector(vec1)
## [1] TRUE
is.list(vec1)
## [1] FALSE
is.list(myList)
## [1] TRUE
is.vector(myList)
## [1] TRUE
is.data.frame(vote)
## [1] TRUE
is.list(vote)
## [1] TRUE
What have you learned? Does it make sense?
R also has functions for learning about the collection of objects in your workspace. Some of this is built in to RStudio.
ls() # search the user workspace (global environment)
## [1] "bools" "chars" "depts" "devs" "extremes"
## [6] "fours" "idevs" "indices" "ints" "mat"
## [11] "mod" "myList" "myOtherSeq" "mySeq" "nums"
## [16] "val" "Val" "vals" "vec1" "vec2"
## [21] "vec3" "vec4" "vote" "x" "y"
rm(x) # delete a variable
ls()
## [1] "bools" "chars" "depts" "devs" "extremes"
## [6] "fours" "idevs" "indices" "ints" "mat"
## [11] "mod" "myList" "myOtherSeq" "mySeq" "nums"
## [16] "val" "Val" "vals" "vec1" "vec2"
## [21] "vec3" "vec4" "vote" "y"
ls.str() # list and describe variables
## bools : logi [1:3] TRUE FALSE TRUE
## chars : chr [1:6] "hi" "hallo" "mother's" "father's" ...
## depts : chr [1:3] "espm" "pmb" "stats"
## devs : num [1:5] -1.481 1.255 -1.339 -0.633 -0.652
## extremes : num(0)
## fours : num [1:6] 4 4 4 4 4 4
## idevs : int [1:100] -3 1 -3 5 -3 -3 1 5 1 5 ...
## indices : num [1:2] 1.7 2.3
## ints : int [1:10] 1 2 3 4 5 6 7 8 9 10
## mat : num [1:3, 1:3] -0.629 0.563 -0.953 -1.095 0.412 ...
## mod : List of 12
## $ coefficients : Named num [1:2] -0.387 0.184
## $ residuals : Named num [1:10] -0.423 -0.289 0.762 0.99 -0.199 ...
## $ effects : Named num [1:10] 0.918 -0.728 0.925 1.097 -0.157 ...
## $ rank : int 2
## $ fitted.values: Named num [1:10] -0.3858 -0.2359 -0.0471 -0.2877 -0.5698 ...
## $ assign : int [1:2] 0 1
## $ qr :List of 5
## $ df.residual : int 8
## $ xlevels : Named list()
## $ call : language lm(formula = y ~ x)
## $ terms :Classes 'terms', 'formula' length 3 y ~ x
## $ model :'data.frame': 10 obs. of 2 variables:
## myList : List of 5
## $ stuff : num 3
## $ mat : int [1:2, 1:2] 1 2 3 4
## $ moreStuff: chr [1:2] "china" "japan"
## $ :List of 2
## $ newOne : chr "more weird stuff"
## myOtherSeq : num [1:6] 1.1 3.1 5.1 7.1 9.1 11.1
## mySeq : int [1:6] 1 2 3 4 5 6
## nums : num [1:3] 1.1 3 -5.7
## val : num 3
## Val : num 7
## vals : num [1:100] 1.263 -0.326 1.33 1.272 0.415 ...
## vec1 : int [1:4] 1 2 3 4
## vec2 : num [1:4] 1 2 3 4
## vec3 : int [1:5] 2 2 1 2 3
## vec4 : int [1:3] 5 3 2
## vote : 'data.frame': 76205 obs. of 17 variables:
## $ state : int 2 2 2 2 2 2 2 2 2 2 ...
## $ pres04 : int 1 2 1 1 1 1 1 2 2 2 ...
## $ sex : Factor w/ 2 levels "male","female": 2 1 2 2 2 2 1 2 2 2 ...
## $ race : Factor w/ 5 levels "white","black",..: 1 1 2 2 1 1 1 1 1 1 ...
## $ age9 : Factor w/ 9 levels "18-24","25-29",..: 2 1 3 3 4 3 4 1 2 1 ...
## $ partyid : Factor w/ 4 levels "democrat","republican",..: NA NA NA NA NA NA NA NA NA NA ...
## $ income : Factor w/ 8 levels "under $15,000",..: NA NA NA NA NA NA NA NA NA NA ...
## $ relign8 : Factor w/ 8 levels "protestant","catholic",..: NA NA NA NA NA NA NA NA NA NA ...
## $ age60 : Factor w/ 4 levels "18-29","30-44",..: 1 1 2 2 2 2 2 1 1 1 ...
## $ age65 : Factor w/ 6 levels "18-24","25-29",..: 2 1 3 3 4 3 4 1 2 1 ...
## $ geocode : int 3 3 3 3 3 3 3 3 3 3 ...
## $ sizeplac: Factor w/ 5 levels "city over 500,000",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ brnagain: Factor w/ 2 levels "yes","no": NA NA NA NA NA NA NA NA NA NA ...
## $ attend : Factor w/ 6 levels "more than once a week",..: NA NA NA NA NA NA NA NA NA NA ...
## $ year : num 2004 2004 2004 2004 2004 ...
## $ region : num 4 4 4 4 4 4 4 4 4 4 ...
## $ y : num 0 1 0 0 0 0 0 1 1 1 ...
## y : num [1:10] -0.809 -0.525 0.715 0.702 -0.769 ...
Finally we can save the objects in our R session:
ls()
## [1] "bools" "chars" "depts" "devs" "extremes"
## [6] "fours" "idevs" "indices" "ints" "mat"
## [11] "mod" "myList" "myOtherSeq" "mySeq" "nums"
## [16] "val" "Val" "vals" "vec1" "vec2"
## [21] "vec3" "vec4" "vote" "y"
save.image("module1.Rda")
rm(list = ls())
ls()
## character(0)
load("module1.Rda")
# the result of this may not be quite right in the slide version
ls()
## [1] "copy.refObject" "D2R" "deepExtract"
## [4] "delete.refObject" "denslines" "densplot"
## [7] "dim" "dmvt" "ellipse.default"
## [10] "f.angdist" "f.ciplot" "f.dplot"
## [13] "f.ess" "f.ess.old" "f.flushplot"
## [16] "f.gm" "f.grstat" "f.identity"
## [19] "f.invlogit" "f.logit" "f.logmatern.euc"
## [22] "f.lonlat2eucl" "f.matern.ang" "f.matern.ang.cov"
## [25] "f.matern.euc" "f.merge" "format.bytes"
## [28] "f.rdist.earth" "f.sort" "f.sort2"
## [31] "f.squexp" "f.trimat" "f.vecrep"
## [34] "getNcdf" "im" "indices"
## [37] "ls.sizes" "machineName" "makePoly"
## [40] "module" "name.refObject" "plot.ell"
## [43] "pmap" "pmap2" "pointsInPoly"
## [46] "pplot" "pretty" "print.closeR"
## [49] "print.refObject" "q" "R2"
## [52] "R2D" "rcsv" "refObject"
## [55] "set.refObject" "sizes" "source"
## [58] "temp.colors" "thresh" "tplot"
## [61] "tsplot" "value.refObject" "wcsv"
Challenge: how would I find all of my objects that have 'x' in their names?
R has several different plotting systems, including base graphics, lattice, and ggplot2.
We'll see a little bit of base graphics here and then lattice and ggplot2 tomorrow in Module 8.
earnings <- read.dta("../data/heights.dta")
names(earnings)
## [1] "earn" "height1" "height2" "sex" "race" "hisp" "ed"
## [8] "yearbn" "height"
hist(earnings$earn)
plot of chunk basic_plots
plot(earnings$earn ~ earnings$height)
plot of chunk basic_plots
boxplot(earnings$earn ~ earnings$height)
plot of chunk basic_plots
boxplot(earnings$earn ~ earnings$sex)
plot of chunk basic_plots
Check out `help(par)` for various graphics settings; these are set via `par()` or within the specific graphics command (some can be set in either place), e.g.,
par(pch = 2)
plot(earnings$earn ~ earnings$height, xlab = "height (cm)", ylab = "earnings (log $)",
log = "y")
## Warning: 187 y values <= 0 omitted from logarithmic plot
plot of chunk unnamed-chunk-21
Create a new R object that contains the heights in meters using the height1 and height2 variables in the earnings dataset. Make a histogram of the new variable with breakpoints between the bins every 10 cm.
For a random subset of 100 individuals from the earnings dataset, create a vector with their earnings.