#R code for the third lecture on January 27, 2015 #The least squares estimate for beta is given by (X'X)^{-1}X'Y body = read.csv("BodyFat.csv", header = TRUE) head(body, 10) #Linear Model Fitting ft = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + THIGH, data = body) summary(ft) xmat = matrix(0, nrow(body), 5) xmat[,1] = rep(1, nrow(body)) xmat[,2] = body$AGE xmat[,3] = body$WEIGHT xmat[,4] = body$HEIGHT xmat[,5] = body$THIGH yvec = body$BODYFAT beta.est = (solve(t(xmat) %*% xmat))%*%t(xmat)%*%yvec beta.est summary(ft) #Note that the estimates of beta cannot be obtained when X'X is singular: ft1 = lm(BODYFAT ~ AGE + I(AGE + 4*THIGH) + WEIGHT + HEIGHT + THIGH , data = body) summary(ft1) #If we use singular.ok = FALSE, R will give an error if X'X is singular lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + THIGH + I(AGE + 4*THIGH), data = body, singular.ok = F) #Simple Linear Regression plot(BODYFAT ~ THIGH, data = body) summary(lm(BODYFAT ~ THIGH, data = body)) #To explain regression to mediocrity, let us scale the variables BODYFAT and THIGH to have mean zero and variance one fat.thigh = data.frame(scale(cbind(body$BODYFAT, body$THIGH))) colnames(fat.thigh) = c("fat.scaled", "thigh.scaled") #Check that each variable has mean zero and variance one apply(fat.thigh, 2, "mean") apply(fat.thigh, 2, "sd") #Let us look at a plot between the fat and thigh variables after they have been scaled to have mean zero and standard deviation one. plot(fat.scaled~thigh.scaled, data = fat.thigh) abline(0, 1) #Let us now fit the regression line g = lm(fat.scaled ~ thigh.scaled, fat.thigh) abline(coef(g), lty = 5) coef(g) cor(body$BODYFAT, body$THIGH) #Observe that the regression line has slope less than 1. A person whose thigh circumference is 1 SD above average is predicted to have Fat Percentage only 0.56 SDs above average. This is regression to mediocrity.