# Efron and Tibshirani, 1994, An Introduction to the Bootstrap, chapter 7
# =================================
# analysis of covariance
# http://www.r-bloggers.com/analysis-of-covariance-%E2%80%93-extending-simple-linear-regression/
# cran task : multivariate statistics
# http://cran.r-project.org/web/views/Multivariate.html

# =================================
# read data from file
# http://cran.r-project.org/doc/manuals/R-intro.html#Reading-data-from-files
score <- read.table("scor.data")
names(score) <- c("mechanics", "vectors", "algebra", "analysis", "statistics")
head(score)
str(score)
summary(score)
(means <- colMeans(score))

# covariance matrix
# http://stat.ethz.ch/R-manual/R-patched/library/stats/html/cor.html
round(cov(score), 1)
# =>           mechanics vectors algebra analysis statistics
# mechanics        305.8   127.2   101.6    106.3      117.4
# vectors          127.2   172.8    85.2     94.7       99.0
# algebra          101.6    85.2   112.9    112.1      121.9
# analysis         106.3    94.7   112.1    220.4      155.5
# statistics       117.4    99.0   121.9    155.5      297.8

# eigensystem of covariance matrix
# NOTE: the original used panel::eddcmp(), but that package is not on CRAN
# and its documentation URL is dead; base R's eigen() returns the same
# eigenvalues (its eigenvectors are unit-normalized, which only rescales
# the component plots below and does not affect theta.hat)
(eigensystem <- eigen(cov(score)))
# => $values
# [1] 686.98981 202.11107 103.74731  84.63044  32.15329

# ratio of largest eigenvalue to sum of eigenvalues
# == % of variance explained by first principal component
lambdas <- eigensystem$values
(theta.hat <- lambdas[1] / sum(lambdas))
# => 0.619115

# =================================
# bootstrap replications of a statistic theta
#
# x     : vector to resample (here: row indices of the data matrix)
# nboot : number of bootstrap replications
# theta : function(x.star, ...) computing the statistic on one resample
# ...   : extra arguments forwarded to theta (e.g. the data matrix)
#
# returns the result of apply(): a vector of length nboot when theta is
# scalar-valued, or a (length(theta) x nboot) matrix when vector-valued
bootstrap <- function(x, nboot, theta, ...) {
  # one bootstrap resample per row
  samples <- matrix(sample(x, size = length(x) * nboot, replace = TRUE),
                    nrow = nboot)
  # forward ... so theta does not have to rely on global variables
  # (the original ignored ... and captured the global xdata by scoping)
  apply(samples, 1, theta, ...)
}

xdata <- as.matrix(score)
n <- nrow(xdata)

# statistic: proportion of variance explained by the first PC of the
# covariance matrix of the resampled rows
theta <- function(x, xdata) {
  lambdas.star <- eigen(cov(xdata[x, ]))$values
  lambdas.star[1] / sum(lambdas.star)
}

(results <- bootstrap(seq_len(n), nboot = 200, theta, xdata))

# ?plotmath
hist(results, col = "mediumorchid1",
     main = expression(paste(hat(theta), " for first principal component of G")))
abline(v = theta.hat, lty = 2, col = "firebrick")

mean.star <- mean(results)  # approx. 0.619
se.star <- sd(results)      # approx. 0.044
quantile(results, c(.05, .10, .16, .50, .84, .90, .95))

# standard CI
# 68% CI (+/- one bootstrap standard error)
(ci.68 <- mean.star + c(-se.star, se.star))
# 90% CI
alpha <- .05
z.95 <- qnorm(1 - alpha)
(ci.90 <- mean.star + z.95 * c(-se.star, se.star))

# =================================
# bootstrap replications : first and second principal component vectors
# theta now returns c(v1.star, v2.star), so results is a 10 x nboot matrix
# (rows 1:5 = v1.star, rows 6:10 = v2.star)
theta <- function(x, xdata) {
  evectors.star <- eigen(cov(xdata[x, ]))$vectors
  c(evectors.star[, 1], evectors.star[, 2])
}

(results <- bootstrap(seq_len(n), nboot = 200, theta, xdata))
nboot <- ncol(results)

# check the results
plot(results[, 1], type = "n", xlim = c(1, 5), ylim = c(-1, 1),
     xlab = "component", ylab = "",
     main = "Graphs of bootstrap replications of \nfirst principal component vector")
for (i in seq_len(nboot)) {
  lines(results[1:5, i], type = "l", col = "dark red")
}
plot(results[, 1], type = "n", xlim = c(1, 5), ylim = c(-1, 1),
     xlab = "component", ylab = "",
     main = "Graphs of bootstrap replications of \nsecond principal component vector")
for (i in seq_len(nboot)) {
  lines(results[6:10, i], type = "l", col = "dark red")
}

# notice the confusion in + and - signs for these eigenvectors:
# an eigenvector's sign is arbitrary, so replications come back with
# mixed orientations; reorienting them makes the pictures coherent
boxplot(t(results), col = "orange", xlab = "components of v1 and v2",
        main = "Bootstrap replications of first two principal component vectors")
# note that the means are similar to those in figure 7.3, p.70,
# but the spreads are quite different because of the arbitrary +/- signs

# reorient the eigenvectors, and plot them again
# v1.star: flip any replication whose first component is negative
vecs1 <- results[1:5, ]
flip1 <- vecs1[1, ] < 0
vecs1[, flip1] <- -vecs1[, flip1]
plot(vecs1[, 1], type = "n", ylim = c(-1, 1),
     xlab = "component", ylab = "",
     main = "Graphs of bootstrap replications of \nfirst principal component vector")
for (i in seq_len(nboot)) {
  lines(vecs1[1:5, i], type = "l", col = "dark red")
}
boxplot(t(vecs1), ylim = c(-1, 1), col = "orange",
        xlab = "components of v1",
        main = "Bootstrap replications of first principal component vector")

# v2.star: flip any replication whose first component is positive
vecs2 <- results[6:10, ]
flip2 <- vecs2[1, ] > 0
vecs2[, flip2] <- -vecs2[, flip2]
plot(vecs2[, 1], type = "n", ylim = c(-1, 1),
     xlab = "component", ylab = "",
     main = "Graphs of bootstrap replications of \nsecond principal component vector")
for (i in seq_len(nboot)) {
  lines(vecs2[1:5, i], type = "l", col = "dark red")
}
boxplot(t(vecs2), ylim = c(-1, 1), col = "orange",
        xlab = "components of v2",
        main = "Bootstrap replications of second principal component vector")

# =================================
# example 2 : cholostyramine data
cholostyramine <- read.table("cholost.data")
names(cholostyramine) <- c("compliance", "improvement")
head(cholostyramine)

# sort by compliance
# http://developmentality.wordpress.com/2010/02/12/r-sorting-a-data-frame-by-the-contents-of-a-column/
chol.ordered <- cholostyramine[order(cholostyramine$compliance), ]

# scatterplot
plot(cholostyramine$compliance, cholostyramine$improvement,
     pch = 19, col = "dark red",
     xlab = "compliance", ylab = "improvement", main = "Cholostyramine")
abline(h = 0, lty = 2, col = "maroon2")

# =================================
# quadratic approximation
# http://astrostatistics.psu.edu/su07/R/reg.html
# http://www.stat.umn.edu/geyer/5102/examp/reg.html
x <- cholostyramine$compliance
y <- cholostyramine$improvement
quad <- lm(y ~ poly(x, degree = 2))
plot(x, y, pch = 19, col = "dark red",
     xlab = "compliance", ylab = "improvement",
     main = "Cholostyramine - loess smoother and quadratic approximation")
curve(predict(quad, newdata = data.frame(x = x)),
      lwd = 2, col = "lightgreen", add = TRUE)

# =================================
# loess
# http://www.ime.unicamp.br/~dias/loess.pdf
# http://learnr.wordpress.com/2009/03/10/loess-smoothing/
# http://stat.ethz.ch/R-manual/R-patched/library/stats/html/loess.html
# http://research.stowers-institute.org/efg/R/Statistics/loess.htm
# larger spans create smoother plots
y.loess <- loess(y ~ x, span = 0.25)
y.predict <- predict(y.loess)
fit <- cbind(x, y.predict)[order(x), ]
# BUG FIX: the original computed !duplicated(x) on the UNSORTED global x
# and applied that mask to the sorted rows, dropping the wrong rows;
# the duplicate mask must come from the sorted column itself
fit2 <- fit[!duplicated(fit[, "x"]), ]
lines(fit2, type = "l", lwd = 2, col = "orange")

# another plot
lines(lowess(cholostyramine), col = "orange")

# and another
y.loess <- loess(y ~ x, span = 0.75)
y.predict <- predict(y.loess)
plot(x, y.predict, pch = 19, col = "navyblue")

# =================================
# loess with ggplot2
# (qplot() is deprecated and opts() was removed from ggplot2;
#  use ggplot() + ggtitle() instead)
library(ggplot2)
(p <- ggplot(cholostyramine, aes(compliance, improvement)) + geom_point())
(p1 <- p + geom_smooth(method = "loess") +
    ggtitle("Cholostyramine loess analysis"))