# assumptions of linear regression
# =========================================
# assumptions of simple linear regression
# from DeGroot and Schervish, chapter 11
# 1. predictor is known
# 2. normality : Yi | x1, ..., xn ~ N
# 3. linear mean : mean( Yi | x1, ..., xn ) = b0 + b1 * xi for 1 <= i <= n
# 4. common variance : variance( Yi | x1, ..., xn ) = sigma^2 for 1 <= i <= n
# 5. independence : Yi | x1, ..., xn are independent
# thus, if x = (x1, ..., xn) and y = (y1, ..., yn),
# then y | x, b0, b1, sigma ~ N(b0 + b1 * x, sigma^2 * I),
# i.e. the joint pdf f_n(y | x, b0, b1, sigma) is multivariate normal
# =========================================
# Verzani, chapter 10
# Example 10.1 maximum heart rate (p.281)
# simulated data: ages 20, 25, ..., 60, each repeated three times;
# maximum heart rate has mean 209 - 0.7 * age plus normal noise with sd 4
age <- rep(seq(from = 20, to = 60, by = 5), times = 3)
mhr <- 209 - 0.7 * age + rnorm(n = length(age), mean = 0, sd = 4)
# least-squares fit of mhr on age; naming the object on its own line
# auto-prints the call and coefficients when run at top level
mhr.lm <- lm(formula = mhr ~ age)
mhr.lm
# testing the model assumptions
# does the linear model seem appropriate?
# is the general trend linear?
# scatterplot
# scatterplot of the data with the fitted regression line on top;
# a roughly linear cloud of points supports the linear-mean assumption
plot(mhr ~ age, col = "dark red",
     main = "Maximum heart rate vs. age - scatterplot")
# reuse the already fitted model mhr.lm rather than refitting lm(mhr ~ age)
abline(mhr.lm, col = "palegreen4")
# residual plot
# this can detect small deviations in the model
# this is one of the four plots produced by plot(mhr.lm)
# do the error terms have a common variance?
# variance may increase for larger values of the predictor
# residuals against fitted values; the axis labels are written out so that
# they read exactly as the defaults derived from the plotted expressions
plot(
  x = fitted(mhr.lm),
  y = residuals(mhr.lm),
  xlab = "fitted(mhr.lm)",
  ylab = "resid(mhr.lm)",
  main = "mhr residual plot",
  col = "dark red"
)
# testing normality of the residuals
# are the error terms normally distributed?
# this is one of the four plots produced by plot(mhr.lm)
# normal Q-Q plot of the residuals plus a dashed reference line;
# points lying close to the line are consistent with normal errors
qqnorm(y = residuals(mhr.lm), main = "mhr - qqplot", col = "dark red")
qqline(y = residuals(mhr.lm), lty = 2, col = "steelblue")
# scale-location plot
# spread of the scattered points should not get smaller or larger
# as one scans from left to right across the plot
# this is one of the four plots produced by plot(mhr.lm)
# scale-location plot on standardized residuals, as plot(mhr.lm) draws it:
# rstandard() rescales each residual by its estimated standard deviation
# (accounting for leverage), so the spread is comparable across fitted values
plot(fitted(mhr.lm), sqrt(abs(rstandard(mhr.lm))),
     xlab = "Fitted values",
     ylab = expression(sqrt(abs("Standardized residuals"))),
     col = "dark red", main = "mhr scale-location plot")
# influential points
# a regression line can be strongly influenced by outliers
# Cook's distance
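# plot(mhr.lm, which=4) produces this plot; it is not among the four default
# plots, whose fourth panel (residuals vs. leverage) shows Cook's distance
# only as contour lines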
# one vertical spike per observation, its height being Cook's distance;
# unusually tall spikes flag observations with large influence on the fit
plot(
  x = cooks.distance(mhr.lm),
  type = "h",
  main = "mhr Cook's distance",
  col = "dark red"
)
# plot(mhr.lm) produces four plots
# which indicate the appropriateness of the regression
# draw the default diagnostic plots of the fitted model in a single call;
# the title and point colour given here are passed on to each plot
plot(
  x = mhr.lm,
  main = "Maximum heart rate vs. age",
  col = "dark red"
)
# =================================