# Chihara and Hesterberg, chap 9
# Chapter 9 Regression

#-------------------------
# Section 9.1 covariance
spruce <- read.csv("Spruce.csv")
head(spruce)
str(spruce)
summary(spruce)
plot(spruce$Ht.change, spruce$Di.change, col = "palegreen4",
     xlab = "Change in height (cm)", ylab = "Change in diameter (cm)",
     main = "Black Spruce Seedlings")
# mark the centroid of the scatter with reference lines
mean.ht.change <- mean(spruce$Ht.change)
mean.dia.change <- mean(spruce$Di.change)
abline(v = mean.ht.change, col = "dark red")
abline(h = mean.dia.change, col = "dark red")

#-------------------------
# Section 9.2 correlation
cor(spruce$Ht.change, spruce$Di.change)
# => 0.9020819

#-------------------------
# Section 9.3 least squares regression

#-------------------------
# Example 9.2 spruce
spruce.lm <- lm(Di.change ~ Ht.change, data = spruce); spruce.lm
plot(spruce$Ht.change, spruce$Di.change, col = "sienna1",
     xlab = "Change in height (cm)", ylab = "Change in diameter (cm)",
     main = "Black Spruce Seedlings")
abline(spruce.lm, col = "plum4")

#-------------------------
# Example 9.3 spruce
# the least-squares slope equals r * s.y / s.x; verify against coef()
r <- cor(spruce$Ht.change, spruce$Di.change); r
s.y <- sd(spruce$Di.change); s.y
s.x <- sd(spruce$Ht.change); s.x
r * s.y / s.x
coef(spruce.lm)

#-------------------------
# Example 9.4 spruce
# r^2 of the variance is explained by the regression
r <- cor(spruce$Ht.change, spruce$Di.change)
r^2
# => 0.8137518

#-------------------------
# Example 9.5 predicted values; residuals
spruce.data <- cbind(spruce$Ht.change, spruce$Di.change,
                     predict(spruce.lm), resid(spruce.lm))
dimnames(spruce.data)[[2]] <- c("height change", "diameter change",
                                "predicted y.hat", "residual")
head(spruce.data)

#-------------------------
# spruce residuals; smooth spline
plot(spruce$Ht.change, resid(spruce.lm), col = "lightsalmon4",
     xlab = "Change in height (cm)", ylab = "residuals",
     main = "Black Spruce Seedlings")
abline(h = 0, col = "lightskyblue4")
lines(smooth.spline(spruce$Ht.change, resid(spruce.lm), df = 3), col = "blue")

#-------------------------
# Example 9.6 vanilla ice cream
sugar <- c(15, 13, 20, 23, 11, 21.5, 12, 23, 23.0, 19,
           19.0, 19, 21.8, 17, 20, 17, 20, 16, 11, 12)
fat <- c(8, 8, 16, 14, 7, 15.5, 8, 21, 15.5, 16,
         4.5, 13, 13.5, 12, 16, 8, 15, 8, 7, 6)
sugar.mean <- mean(sugar); sugar.mean
sugar.sd <- sd(sugar); sugar.sd
fat.mean <- mean(fat); fat.mean
fat.sd <- sd(fat); fat.sd
r <- cor(sugar, fat); r
# => 0.7919348
r^2
# => 0.6271607
vanilla.lm <- lm(fat ~ sugar); vanilla.lm
plot(sugar, fat, col = "royalblue3", xlab = "sugar (g)", ylab = "fat (g)",
     main = "Vanilla Ice Cream")
abline(vanilla.lm, col = "paleturquoise2")
plot(sugar, resid(vanilla.lm), col = "royalblue3", xlab = "sugar (g)",
     ylab = "residuals", main = "Vanilla Ice Cream - Residuals")
abline(h = 0, col = "paleturquoise2")
# figure 9.13 does not correspond to this data

#-------------------------
# multiple regression
spruce <- read.csv("Spruce.csv")
head(spruce)
lm(Di.change ~ Ht.change + Fertilizer + Competition, data = spruce)
# Coefficients:
# (Intercept)     Ht.change  FertilizerNF  CompetitionNC
#      1.0487        0.1040       -1.0266         0.4895
# this differs from the result in the text
# note the sign changes and different intercept

# recode levels
# NOTE(review): since R 4.0, read.csv() returns character columns (not
# factors), so levels() is NULL and the assignments below do not actually
# recode anything -- presumably this explains "recoding did not change the
# analysis" below; wrap the columns in factor() first to reproduce the
# book's recoding. TODO confirm against the R version the book used.
levels(spruce$Fertilizer)
levels(spruce$Fertilizer) <- c(1, 0)
levels(spruce$Competition)
levels(spruce$Competition) <- c(1, 0)
head(spruce)
lm(Di.change ~ Ht.change + Fertilizer + Competition, data = spruce)
# Coefficients:
# (Intercept)     Ht.change  FertilizerNF  CompetitionNC
#      1.0487        0.1040       -1.0266         0.4895
# recoding did not change the analysis
spruce <- read.csv("Spruce.csv")
head(spruce)

#-------------------------
# Section 9.4 the simple linear model

#-------------------------
# Example 9.7 Olympic skating
skating <- read.csv("Skating2010.csv")
head(skating)
str(skating)
summary(skating)
skating.lm <- lm(Free ~ Short, data = skating); skating.lm
# Coefficients:
# (Intercept)  Short
#       7.969  1.735
r <- cor(skating$Short, skating$Free); r
# => 0.8363805
plot(skating$Short, skating$Free, col = "slateblue3",
     xlab = "Short", ylab = "Free",
     main = "2010 Olympics Men's Figure Skating")
abline(skating.lm, col = "turquoise4")
# regression lines from 30 bootstrap samples
# see the bootstrap code on p.281
# first, draw the plot
plot(skating$Short, skating$Free, col = "slateblue3",
     xlab = "Short", ylab = "Free",
     main = "Skating - Regression Lines from 30 Bootstrap Samples")
# now, add 30 regression lines
N <- 30   # number of resamples
n <- 24   # number of skaters
for (i in 1:N) {
  index <- sample(n, replace = TRUE)   # sample from 1, 2, ... n
  Skate.boot <- skating[index, ]
  # recalculate linear model
  skateBoot.lm <- lm(Free ~ Short, data = Skate.boot)
  abline(skateBoot.lm, col = "turquoise4")
}

#-------------------------
# Example 9.8 Olympic skating - inference for alpha and beta
skating.lm
summary(skating.lm)
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)
# (Intercept)   7.9691    18.1175   0.440    0.664
# Short         1.7347     0.2424   7.157 3.56e-07 ***
# ...
# Residual standard error: 11.36 on 22 degrees of freedom
# Multiple R-squared: 0.6995, Adjusted R-squared: 0.6859
# F-statistic: 51.22 on 1 and 22 DF, p-value: 3.562e-07

# estimate for sigma
s <- 11.36

# test
# H0 : beta == 0
# H1 : beta != 0
# use t == betaHat / SE[betaHat]
betaHat <- 1.7347
betaHat.SE <- 0.2424
t <- betaHat / betaHat.SE; t
# => 7.156353
# compare to a t distribution with df == n - 2 == 22
p <- 2 * (1 - pt(t, df = 22)); p
# => 3.565114e-07
# conclusion : reject H0

# compute a 95% CI for the true beta
alpha <- .05
t.975 <- qt(1 - alpha/2, df = 22); t.975
e <- t.975 * betaHat.SE
beta.CI <- betaHat + c(-e, e); beta.CI
# => 1.231993 2.237407

#-------------------------
# Example 9.8 Olympic skating - inference for the response
# a specific x : x.s
# mean response : YHat.s = alphaHat + betaHat * x.s (point estimate)
# compute the sampling distribution of YHat.s
# theorem 9.6, p.274
# YBar is normal
#   E[YBar] == alpha + beta * xBar
#   Var[YBar] == sigma^2 / n
# YHat.s is normal
#   E[YHat.s] == E[Y.s] == alpha + beta * x.s
#   Var[YHat.s] == sigma^2 * (1 / n + (x.s - xBar)^2 / ssx)
# use the residual standard error, S, to estimate sigma
# let
#   SEHat[YHat.s] == S * sqrt(1 / n + (x.s - xBar)^2 / ssx)
# then,
#   t <- (YHat.s - E[YHat.s]) / SEHat[YHat.s]
# has a t distribution with df = n - 2
# CI for E[YHat.s] at x.s is
#   YHat.s + q * c(SEHat[YHat.s], -SEHat[YHat.s])
# where
#   q <- qt(1 - alpha/2, df=n-2)
#   s <- residual standard error

#-------------------------
# Example 9.9 Olympic skating - inference for the response
skating.lm
alphaHat <- 7.969
betaHat <- 1.735
x.s <- 60
# E[YHat.s]
YHat.s <- alphaHat + betaHat * x.s; YHat.s
# => 112.069
xBar <- mean(skating$Short); xBar
# => 74.13292
sx <- sd(skating$Short); sx
# => 9.770611
n <- 24
S <- 11.36
ssx <- (n - 1) * sx^2; ssx
# => 2195.691
# SEHat[YHat.s]
SEHat.YHat.s <- S * sqrt(1 / n + (x.s - xBar)^2 / ssx); SEHat.YHat.s
# => 4.137215
alpha <- .05
q <- qt(1 - alpha/2, df = n - 2); q
# => 2.073873
# CI for E[YHat.s] at x.s is
YHat.s.CI <- YHat.s + q * c(-SEHat.YHat.s, SEHat.YHat.s); YHat.s.CI
# => 103.4889 120.6491

#-------------------------
# Example 9.10 Olympic skating - prediction interval
SEHat.prediction <- S * sqrt(1 + 1 / n + (x.s - xBar)^2 / ssx)
SEHat.prediction
# => 12.08992
PI <- YHat.s + c(-q * SEHat.prediction, q * SEHat.prediction); PI
# => 86.99604 137.14196

# confidence band and prediction band
# figure 9.16, p.276
conf.upper <- function(x) {
  alphaHat + betaHat * x + q * S * sqrt(1 / n + (x - xBar)^2 / ssx)
}
conf.lower <- function(x) {
  alphaHat + betaHat * x - q * S * sqrt(1 / n + (x - xBar)^2 / ssx)
}
pred.upper <- function(x) {
  alphaHat + betaHat * x + q * S * sqrt(1 + 1 / n + (x - xBar)^2 / ssx)
}
pred.lower <- function(x) {
  alphaHat + betaHat * x - q * S * sqrt(1 + 1 / n + (x - xBar)^2 / ssx)
}
plot(skating$Short, skating$Free, col = "slateblue3", ylim = c(80, 190),
     xlab = "Short", ylab = "Free",
     main = "2010 Olympics Men's Figure Skating")
abline(skating.lm, col = "turquoise4")
curve(conf.upper(x), lty = 2, col = "red", add = TRUE)
curve(conf.lower(x), lty = 2, col = "red", add = TRUE)
curve(pred.upper(x), lty = 3, col = "springgreen4", add = TRUE)
curve(pred.lower(x), lty = 3, col = "springgreen4", add = TRUE)
legend(58, 185, legend = c("Confidence Band", "Prediction Band"),
       col = c("red", "springgreen4"), lty = c(2, 3))

#-------------------------
# Example 9.11 grizzly bears
# summary statistics taken from the text, not from a data file
n <- 17
alphaHat <- 32.67
betaHat <- 0.55
s <- 4.69          # residual standard error
r <- sqrt(0.792)
xBar <- 141.59
sx <- 16.19
sy <- betaHat * sx / r; sy
# => 10.00569
x.s <- 120
Ys <- alphaHat + betaHat * x.s; Ys
# => 98.67
ssx <- (n - 1) * sx^2; ssx
SEHat.YHat.s <- s * sqrt(1 / n + (x.s - xBar)^2 / ssx); SEHat.YHat.s
# => 1.93356
alpha <- .05
q <- qt(1 - alpha/2, df = n - 2); q
# => 2.13145
# CI for E[YHat.s] at x.s is
YHat.s <- Ys
YHat.s.CI <- YHat.s + q * SEHat.YHat.s * c(-1, 1); YHat.s.CI
# => 94.54871 102.79129
SEHat.prediction <- s * sqrt(1 + 1 / n + (x.s - xBar)^2 / ssx)
SEHat.prediction
# => 5.072943
PI <- YHat.s + q * SEHat.prediction * c(-1, 1); PI
# => 87.85728 109.48272

#-------------------------
# Section 9.5 resampling correlation and regression
Skating2010 <- read.csv("Skating2010.csv")
N <- 10^4
cor.boot <- numeric(N)
beta.boot <- numeric(N)
alpha.boot <- numeric(N)
yPred.boot <- numeric(N)
n <- 24   # number of skaters
for (i in 1:N) {
  index <- sample(n, replace = TRUE)   # sample from 1, 2, ... n
  Skate.boot <- Skating2010[index, ]
  cor.boot[i] <- cor(Skate.boot$Short, Skate.boot$Free)
  # recalculate linear model estimates
  skateBoot.lm <- lm(Free ~ Short, data = Skate.boot)
  alpha.boot[i] <- coef(skateBoot.lm)[1]             # new intercept
  beta.boot[i] <- coef(skateBoot.lm)[2]              # new slope
  yPred.boot[i] <- alpha.boot[i] + 60 * beta.boot[i] # recompute Y^
}
mean(cor.boot)
sd(cor.boot)
quantile(cor.boot, c(0.025, 0.975))
hist(cor.boot, main = "Bootstrap distribution of correlation",
     xlab = "Correlation", col = "mistyrose3")
observed <- cor(Skating2010$Short, Skating2010$Free)
abline(v = observed, col = "blue")   # add line at observed cor.
hist(beta.boot, main = "Bootstrap distribution of slope",
     xlab = "Slope", col = "seagreen")
observed <- coef(skating.lm)[2]
abline(v = observed, col = "wheat2")   # add line at observed slope

alphaHat <- 7.969
betaHat <- 1.735
x.s <- 60
hist(yPred.boot, main = "Bootstrap distribution of free scores",
     xlab = "Mean response at short == 60", col = "sienna2")
observed <- alphaHat + betaHat * x.s
# add line at observed mean response (original comment said "slope",
# a copy-paste error)
abline(v = observed, col = "springgreen1")
alpha <- .05
quantile(yPred.boot, probs = c(alpha/2, 1 - alpha/2))
# => 102.4691 120.4822
# not quite the book's result

#-------------------------------------------------------
# Section 9.5.1 Permutation test
N <- 9999
n <- nrow(Skating2010)   # number of observations
result <- numeric(N)
observed <- cor(Skating2010$Short, Skating2010$Free)
for (i in 1:N) {
  # permute Short against Free and recompute the correlation
  index <- sample(n, replace = FALSE)
  Short.permuted <- Skating2010$Short[index]
  result[i] <- cor(Short.permuted, Skating2010$Free)
}
(sum(observed <= result) + 1) / (N + 1)   # P-value
# => 1e-04

#-------------------------------------------------------
# Section 9.5.2 bushmeat
bushmeat <- read.csv("Bushmeat.csv")
head(bushmeat)
str(bushmeat)
summary(bushmeat)
layout(matrix(c(1, 2), 2, 1, byrow = TRUE))
plot(bushmeat$Year, bushmeat$Fish, type = "b", col = "lightsalmon4",
     xlab = "Year", ylab = "Fish per capita",
     main = "Bushmeat - Fish per capita")
plot(bushmeat$Year, bushmeat$Biomass, type = "b", col = "steelblue3",
     xlab = "Year", ylab = "Biomass", main = "Bushmeat - Biomass")
# year-over-year percent change in biomass; note the denominator is the
# current year's biomass (as in the source script), not the previous year's
percent.change <- 100 * (bushmeat$Biomass[2:30] - bushmeat$Biomass[1:29]) /
  bushmeat$Biomass[2:30]
plot(bushmeat$Fish[2:30], percent.change, col = "maroon2",
     xlab = "Per capita fish supply (kg)",
     ylab = "Percent change in biomass",
     main = "Percent change in biomass against fish supply")
# Note that the lowest value of about -29.6% does not match the
# illustration in the textbook.
bushmeat.lm <- lm(percent.change ~ bushmeat$Fish[2:30])
# Coefficients:
# (Intercept)  bushmeat$Fish[2:30]
#    -23.1373               0.6911
abline(bushmeat.lm, col = "slategray2")
cor(percent.change, bushmeat$Fish[2:30])
# => 0.6130507

alpha <- -23.1373
beta <- 0.6911
# solve alpha + beta * fish == 0 for fish
fish.intercept <- -alpha / beta; fish.intercept
# => 33.47895
abline(h = 0, lty = 2, col = "paleturquoise4")

# sample distribution of regression lines
change <- percent.change
fish <- bushmeat$Fish[2:30]
bushmeat.df <- data.frame(change, fish)
# FIX: set N (and n) BEFORE allocating the bootstrap vectors. The original
# script ran numeric(N) first, picking up the stale N = 9999 left over from
# the permutation-test section, then assigned N <- 40.
N <- 40
n <- nrow(bushmeat.df)   # number of observations
alpha.boot <- numeric(N)
beta.boot <- numeric(N)
for (i in 1:N) {
  index <- sample(n, replace = TRUE)
  bushmeat.boot <- bushmeat.df[index, ]
  boot.lm <- lm(change ~ fish, data = bushmeat.boot)
  alpha.boot[i] <- coef(boot.lm)[1]
  beta.boot[i] <- coef(boot.lm)[2]
  abline(boot.lm, col = "slategray4")
}

# sample distribution of x-intercepts
N <- 10^4
# re-allocate at the new size so the loop does not grow the vectors
# one element at a time
alpha.boot <- numeric(N)
beta.boot <- numeric(N)
for (i in 1:N) {
  index <- sample(n, replace = TRUE)
  bushmeat.boot <- bushmeat.df[index, ]
  boot.lm <- lm(change ~ fish, data = bushmeat.boot)
  alpha.boot[i] <- coef(boot.lm)[1]
  beta.boot[i] <- coef(boot.lm)[2]
}
# x-intercept of each bootstrap line: solve a + b * x == 0
x.intercept <- -alpha.boot / beta.boot
hist(x.intercept, col = "peachpuff4", xlab = "x intercepts",
     ylab = "Frequency",
     main = "Bushmeat - Bootstrap Distribution of the x-intercept")
abline(v = fish.intercept, col = "seashell")
q <- quantile(x.intercept, c(.025, .975)); q
# =>     2.5%    97.5%
#    31.69148 35.61627
q - fish.intercept

# Note : The authors mention a 22% drop in biomass in one year,
# but the data in the bushmeat file indicate a drop of 29.6% in the final year.
# This may indicate why the present numbers do not quite match the author's numbers.
# ----------------------------------------------
# Chapter 9.6.1 Inference for logistic regression
# http://www.cdc.gov/Motorvehiclesafety
# http://nhtsa.gov/FARS
Fatalities <- read.csv("Fatalities.csv")
head(Fatalities)
str(Fatalities)
summary(Fatalities)

# generalized linear model
fit <- glm(Alcohol ~ Age, data = Fatalities, family = binomial)
data.class(fit)   # is a "glm" object, so for help use: help(glm)
fit               # prints the coefficients and other basic info
coef(fit)         # the coefficients as a vector
# => (Intercept)         Age
#    -0.12262199 -0.02898332
summary(fit)      # gives standard errors for coefficients, etc.

x <- seq(17, 91, length = 500)   # vector spanning the age range
# compute predicted probabilities
# y1: inverse logit by hand with rounded coefficients;
# y2: same curve via plogis() with the fitted coefficients
y1 <- exp(-.123 - .029 * x) / (1 + exp(-.123 - .029 * x))
?plogis
y2 <- plogis(coef(fit)[1] + coef(fit)[2] * x)
plot(Fatalities$Age, Fatalities$Alcohol, ylab = "Probability of alcohol",
     xlab = "Age", col = "dark red",
     main = "Driver Fatalities - Alcohol vs Age")
lines(x, y2, col = "royalblue1")

# ----------------------------------------------
# Example 9.12 FARS fatalities in Pennsylvania
# estimated logistic equation
# ln(pHat / (1 - pHat)) == -0.123 - 0.029 * x
# pHat for 25 year old driver
pHat.25 <- plogis(coef(fit)[1] + coef(fit)[2] * 25); pHat.25
# => 0.3000195
# odds of alcohol being involved for 25 year old driver
odds.25 <- pHat.25 / (1 - pHat.25); odds.25
# => 0.4286112
# pHat for 35 year old driver
pHat.35 <- plogis(coef(fit)[1] + coef(fit)[2] * 35); pHat.35
# => 0.2428646
# odds of alcohol being involved for 35 year old driver
odds.35 <- pHat.35 / (1 - pHat.35); odds.35
# => 0.3207677
# odds ratio
odds.25 / odds.35
# => 1.336205

# ----------------------------------------------
# Example 9.13 infection vs length of hospital stay
# logistic regression
# ln(pHat / (1 - pHat)) == -1.942 + 0.023 * x
exp(0.023 * 7)
# => 1.174685
# concl : staying an additional week increases the odds of infection by 17.5%

# ----------------------------------------------
# 9.6.1 inference for logistic regression
# Full bootstrap - slope coefficient, and prediction at age 20
N <- 10^3
n <- nrow(Fatalities)   # number of observations
alpha.boot <- numeric(N)
beta.boot <- numeric(N)
pPred.boot <- numeric(N)
for (i in 1:N) {
  index <- sample(n, replace = TRUE)
  Fatal.boot <- Fatalities[index, ]   # resampled data
  fit.boot <- glm(Alcohol ~ Age, data = Fatal.boot, family = binomial)
  alpha.boot[i] <- coef(fit.boot)[1]  # new intercept
  beta.boot[i] <- coef(fit.boot)[2]   # new slope
  pPred.boot[i] <- plogis(alpha.boot[i] + 20 * beta.boot[i])
}
quantile(beta.boot, c(.025, .975))    # 95% percentile intervals
quantile(pPred.boot, c(.025, .975))
par(mfrow = c(2, 2))   # set layout
hist(beta.boot, xlab = expression(beta), col = "royalblue2", main = "")
qqnorm(beta.boot, col = "blue", main = "")
hist(pPred.boot, xlab = expression(hat(p)), col = "lightcyan4", main = "")
qqnorm(pPred.boot, col = "lightcyan2", main = "")

#--------------------
# predict
# FIX: was help(predict.glmm) -- predict.glmm does not exist; the method
# for "glm" objects is predict.glm
help(predict.glm)   # for more help on predict
n <- nrow(Fatalities)            # number of observations
x <- seq(17, 91, length = 500)   # vector spanning the age range
df.Age <- data.frame(Age = x)    # data frame to hold explanatory
# variables, will use this for making predictions
plot(Fatalities$Age, Fatalities$Alcohol, ylab = "Probability of alcohol")
for (i in 1:25) {
  index <- sample(n, replace = TRUE)
  Fatal.boot <- Fatalities[index, ]   # resampled data
  fit.boot <- glm(Alcohol ~ Age, data = Fatal.boot, family = binomial)
  pPred <- predict(fit.boot, newdata = df.Age, type = "response")
  lines(x, pPred, col = "olivedrab4")
}
# end fatalities
#---------------------