# assumptions of linear regression
# =========================================
# assumptions of simple linear regression
# from DeGroot and Schervish, chapter 11
# 1. predictor is known
# 2. normality       : Yi | x1, ..., xn ~ N
# 3. linear mean     : mean( Yi | x1, ..., xn ) = b0 + b1 * xi for 1 <= i <= n
# 4. common variance : variance( Yi | x1, ..., xn ) = sigma^2 for 1 <= i <= n
# 5. independence    : Yi | x1, ..., xn are independent
# thus, if x = (x1, ..., xn) and y = (y1, ..., yn),
# then the pdf f_n(y | x, b0, b1, sigma) ~ N(b0 + b1 * x, sigma^2)
# (multivariate normal)
# =========================================

# Verzani, chapter 10
# Example 10.1 maximum heart rate (p.281)
# Simulated data: ages 20, 25, ..., 60, each repeated three times, with
# mhr = 209 - 0.7 * age plus N(0, 4^2) noise.
# No seed is set in this section, so every run draws a different sample.
age <- rep(seq(20, 60, by = 5), 3)
mhr <- 209 - 0.7 * age + rnorm(length(age), sd = 4)

# fit the simple linear regression and print the estimated coefficients
mhr.lm <- lm(mhr ~ age)
mhr.lm

# testing the model assumptions

# does the linear model seem appropriate?
# is the general trend linear?
# scatterplot
plot(mhr ~ age,
     col = "dark red",
     main = "Maximum heart rate vs. age - scatterplot")
# reuse the model fitted above rather than refitting it for the line
abline(mhr.lm, col = "palegreen4")

# residual plot
# this can detect small deviations in the model
# this is one of the four plots produced by plot(mhr.lm)
# do the error terms have a common variance?
# variance may increase for larger values of the predictor
plot(fitted(mhr.lm), resid(mhr.lm),
     col = "dark red",
     main = "mhr residual plot")

# testing normality of the residuals
# are the error terms normally distributed?
# this is one of the four plots produced by plot(mhr.lm)
qqnorm(resid(mhr.lm), col = "dark red", main = "mhr - qqplot")
qqline(resid(mhr.lm), col = "steelblue", lty = 2)

# scale-location plot
# spread of the scattered points should not get smaller or larger
# as one scans from left to right across the plot
# this corresponds to one of the four plots produced by plot(mhr.lm);
# note that plot(mhr.lm) uses standardized residuals for that panel,
# whereas the raw residuals are used here
plot(fitted(mhr.lm), sqrt(abs(resid(mhr.lm))),
     col = "dark red",
     main = "mhr scale-location plot")

# influential points
# a regression line can be strongly influenced by outliers
# Cook's distance
# plot(mhr.lm, which = 4) draws this plot; in current versions of R it is
# not among the four default panels (the fourth default panel is
# residuals vs. leverage, with Cook's distance shown as contours)
plot(cooks.distance(mhr.lm),
     type = "h",
     col = "dark red",
     main = "mhr Cook's distance")

# plot(mhr.lm) produces four plots
# which indicate the appropriateness of the regression
plot(mhr.lm, col = "dark red", main = "Maximum heart rate vs. age")

# =================================