# Efron and Tibshirani, 1994, An Introduction to the Bootstrap, chapter 8 # ================================= # two-sample problem # mice survival data, in days, with and without treatment treatment <- c(94, 197, 16, 38, 99, 141, 23) control <- c(52, 104, 146, 10, 51, 30, 40, 27, 46) # ================================= # beanplot library(beanplot) beanplot(treatment, control, side="b", col=list("yellow", "orange"), xlab="Treatment - Control", main="Mouse Data") legend("topright", bty="n", c("Treatment", "Control"), fill = c("yellow", "orange")) # ================================= # model the two-sample problem (z <- treatment) (y <- control) (x <- c(z, y)) (n.z <- length(z)) (n.y <- length(y)) (n.x <- length(x)) # theta.hat == mu.hat.z - mu.hat.y == z.bar - y.bar (mu.hat.z <- mean(x[1:n.z])) # => 86.85714 (mu.hat.y <- mean(x[(n.z+1):(n.z+n.y)])) # => 56.22222 (theta.hat <- mu.hat.z - mu.hat.y) # => 30.63492 # ================================= # bootstrap estimate of the standard error of the difference of means pop1 <- z pop2 <- y n1 <- n.z n2 <- n.y nboot <- 1400 bootstrap <- function(x, nboot, theta, ...){ data <- t(apply(cbind(rep(1, nboot)), 1, samp)) return(apply(data, 1, theta, pop1, pop2)) } samp <- function(x){ c(sample(1:n1, n1, replace=TRUE), sample(1:n2, n2, replace=TRUE)) } theta <- function(row, pop1, pop2){ mean(pop1[row[1:n1]]) - mean(pop2[row[(n1+1):(n1+n2)]]) } theta.hat.star <- bootstrap("good luck!", nboot, theta) hist(theta.hat.star, col="dark red", main="Histogram of theta.hat.star for mouse data") abline(v=theta.hat, lty=2, col="lightsalmon3") (se.hat.1400 <- sd(theta.hat.star)) # => 26.42963 # analysis : how many se's is theta.hat away from 0? theta.hat / se.hat.1400 # => 1.159113 # ================================= # 8.5 lutenizing hormone hormone <- read.table("lutenhorm.data") names(hormone) <- c("period", "level1", "level2", "level3", "level4") head(hormone) str(hormone) summary(hormone) hormone.mean <- mean(hormone\$level3) plot(hormone\$period, hormone\$level3, type="l", col="orchid", asp=10, xlab="time period", ylab="hormone level", main="Lutenizing Hormone") abline(h=hormone.mean,, lty=2, col="dark red") # centered measurements t <- hormone\$period y.t <- hormone\$level3 z.t <- y.t - hormone.mean plot(t, z.t, type="l", col="orchid", asp=10, xlab="time period", ylab="hormone level", main="Lutenizing Hormone - centered measurements") abline(h=0, lty=2, col="dark red") # estimate beta.hat v <- length(z.t) u <- 2 RSE <- function(x){ sum((z.t[u:v] - x * z.t[(u-1):(v-1)])^2) } xs <- seq(from=-1, to=1, by=.001) lst <- apply(cbind(xs), 1, RSE) plot(xs, lst, type="l", col="dark red", main="RSE") min(lst) # => 9.479154 subset(cbind(xs, lst), subset=lst==min(lst)) # => xs lst # [1,] 0.586 9.479154 # conclusion : beta.hat == min b == 0.586 (beta.hat <- subset(cbind(xs, lst), subset=lst==min(lst))[1]) # => 0.586 # estimate the epsilon's epsilon.hat.t <- z.t[u:v] - beta.hat * z.t[(u-1):(v-1)] hist(epsilon.hat.t, col="lightsalmon2") mean(epsilon.hat.t) # => 0.006234043 sd(epsilon.hat.t) # => 0.453904 # another calculation of min b # take the derivative of RSE(b) wrt b and set it equal to 0; solve for b # min b occurs when b == z.t dot z.(t-1) / z.(t-1) dot z.(t-1) c <- z.t[-1] # all but the first d <- z.t[-48] # all but the last b.min <- sum(c * d) / sum(d * d) b.min # => 0.5857651 # ================================= # bootstrap time series (es <- epsilon.hat.t) n <- length(es) samp <- sample(1:n, n, replace=TRUE) z.star <- numeric(48) z.star[1] <- z.t[1] for (i in 2:48){ z.star[i] <- beta.hat * z.star[i-1] + es[samp][i-1] } z.star plot(t, z.star, type="l", col="orchid", asp=10, xlab="time period", ylab="hormone level", main="Bootstrap time series") abline(h=0, lty=2, col="dark red") # calculate min b # min b occurs when b == z.t dot z.(t-1) / z.(t-1) dot z.(t-1) # note that this is the same formula as (8.29) on p.97 c <- z.star[-1] # all but the first d <- z.star[-48] # all but the last b.min <- sum(c * d) / sum(d * d) b.min # => something close to 0.5857651 # this produces bootstrap time series on demand # now need to calculate beta.hat.star for each of 200 series, # plot them, and calculate the estimated se # ======================================== # bootstrap - Efron-Tibshirani style bootstrap <- function(x, nboot, theta, ...){ data <- matrix(sample(x, size=length(x) * nboot, replace=TRUE), nrow=nboot) return(apply(data, 1, theta)) } # arguments # x a vector containing the data # nboot the number of bootstrap samples desired # theta function to be bootstrapped; takes x as an argument and may take additional arguments # ======================================== # Efron-Tibshirani style (es <- epsilon.hat.t) n <- length(es) nboot <- 200 theta <- function(x){ z.star <- numeric(48) z.star[1] <- z.t[1] for (i in 2:48){ z.star[i] <- beta.hat * z.star[i-1] + es[x][i-1] } c <- z.star[-1] # all but the first d <- z.star[-48] # all but the last b.min <- sum(c * d) / sum(d * d) b.min } theta.hat.star <- bootstrap(1:n, nboot, theta) hist(theta.hat.star, col="dark red", main="200 bootstrap estimates of beta.hat") abline(v=beta.hat, col="lemonchiffon4") mean(theta.hat.star) # => 0.5546252 sd(theta.hat.star) # => 0.125905