# Chihara and Hesterberg, chap. 3

# 3.1 Hypothesis testing: mice recovery-time example
tDrug <- c(30, 25, 20)
tControl <- c(18, 21, 22)
muDrug <- mean(tDrug)
muControl <- mean(tControl)
meanDifference <- muDrug - muControl

# 3.3 Permutation tests: hot wings consumption by gender
Beerwings <- read.csv("Beerwings.csv")
boxplot(Beerwings$Hotwings ~ Beerwings$Gender, col = "plum",
        ylab = "Number of hot wings consumed",
        main = "Number of hot wings consumed, by gender")
tapply(Beerwings$Hotwings, Beerwings$Gender, mean)

# Observed difference in group means (male - female)
observed <- 14.5333 - 9.3333

# Get hotwings variable as a plain vector
hotwings <- subset(Beerwings, select = Hotwings, drop = TRUE)

# set.seed(0)
B <- 10^5 - 1              # number of times to repeat the resampling
result <- numeric(B)       # space to save the random differences
for (i in 1:B) {
  # Randomly assign 15 of the 30 observations to one group
  index <- sample(30, size = 15, replace = FALSE)
  result[i] <- mean(hotwings[index]) - mean(hotwings[-index])
}

# Plot the permutation distribution with the observed value marked
hist(result, col = "orange", xlab = "xbarM - xbarF",
     main = "Permutation distribution for hot wings")
abline(v = observed, col = "blue", lty = 5)

# Another visualization of the distribution (empirical CDF)
dev.new()
plot.ecdf(result, col = "saddlebrown")
abline(v = observed, col = "seagreen", lty = 5)

# One-sided P-value; the +1 counts the observed statistic itself
(sum(result >= observed) + 1) / (B + 1)

#----------------------------------------
# Example 3.3 Verizon
# Permutation test for difference in mean repair times
# Uncomment below if you haven't imported Verizon yet
Verizon <- read.csv("Verizon.csv")
tapply(Verizon$Time, Verizon$Group, mean)

Time <- subset(Verizon, select = Time, drop = TRUE)
Time.ILEC <- subset(Verizon, select = Time, Group == "ILEC", drop = TRUE)
Time.CLEC <- subset(Verizon, select = Time, Group == "CLEC", drop = TRUE)

observed <- mean(Time.ILEC) - mean(Time.CLEC)
observed

B <- 10^4 - 1              # number of times to repeat the resampling
# set.seed(99)
result <- numeric(B)       # space to save the random differences
for (i in 1:B) {
  # 1664 ILEC-sized labels drawn from the 1687 pooled observations
  index <- sample(1687, size = 1664, replace = FALSE)
  result[i] <- mean(Time[index]) - mean(Time[-index])
}
hist(result, xlab = "xbar1 - xbar2", col = "sandybrown",
     main = "Permutation Distribution for Verizon repair times")
abline(v = observed, col = "rosybrown4", lty = 5)
(sum(result <= observed) + 1) / (B + 1)   # P-value (lower tail)

#-------------------------------------------------------
# Example 3.5, Verizon cont.
# Other test statistics: median and trimmed means

tapply(Verizon$Time, Verizon$Group, median)

# Difference in medians
observed <- median(Time.ILEC) - median(Time.CLEC)
observed

# Difference in 25% trimmed means
observed2 <- mean(Time.ILEC, trim = .25) - mean(Time.CLEC, trim = .25)
observed2

B <- 10^4 - 1              # number of times to repeat the resampling
# set.seed(99)
result <- numeric(B)       # space to save the random differences
result2 <- numeric(B)
for (i in 1:B) {
  index <- sample(1687, size = 1664, replace = FALSE)
  result[i] <- median(Time[index]) - median(Time[-index])
  result2[i] <- mean(Time[index], trim = .25) - mean(Time[-index], trim = .25)
}
hist(result, xlab = "median1 - median2", col = "mistyrose2",
     main = "Permutation Distribution for medians")
abline(v = observed, col = "mediumorchid3", lty = 5)
dev.new()
hist(result2, xlab = "trimMean1 - trimMean2", col = "indianred3",
     main = "Permutation Distribution for trimmed means")
abline(v = observed, col = "lavenderblush3", lty = 5)

# P-value, difference in medians
(sum(result <= observed) + 1) / (B + 1)
# P-value, difference in trimmed means
(sum(result2 <= observed2) + 1) / (B + 1)

#------------------------------------------------
# Example 3.5, Verizon continued
# Difference in proportion of repair times > 10 hours,
# and ratio of variances

observed <- mean(Time.ILEC > 10) - mean(Time.CLEC > 10)
observed

# Ratio of variances
observed2 <- var(Time.ILEC) / var(Time.CLEC)

B <- 10^4 - 1              # number of times to repeat the resampling
# set.seed(99)
result <- numeric(B)
result2 <- numeric(B)
for (i in 1:B) {
  index <- sample(1687, size = 1664, replace = FALSE)
  result[i] <- mean(Time[index] > 10) - mean(Time[-index] > 10)
  result2[i] <- var(Time[index]) / var(Time[-index])
}
hist(result, xlab = "Difference in proportions", col = "red",
     main = "Repair times > 10 hours")
abline(v = observed, lty = 5, col = "tomato4")
dev.new()
hist(result2, xlab = "variance1/variance2", col = "seagreen4",
     main = "Ratio of variances")
abline(v = observed, lty = 5, col = "turquoise4")

# P-value, difference in proportions
(sum(result <= observed) + 1) / (B + 1)
# P-value, ratio of variances
(sum(result2 <= observed2) + 1) / (B + 1)

#------------------------------------------------
# Example 3.6, extreme case: pooled variance dominated by the
# larger sample even when the smaller sample's variance is huge
varA <- 10^6
nA <- 10^2
varB <- 1
nB <- 10^6
varT <- (nA * varA + nB * varB) / (nA + nB)

#------------------------------------------------
# Chapter 3.4.1
# Compute the chi-square test statistic for a contingency table.
# Obs: the observed contingency table (a matrix of counts).
# Returns sum((O - E)^2 / E) with E from the usual independence model.
chisq <- function(Obs) {
  Expected <- outer(rowSums(Obs), colSums(Obs)) / sum(Obs)
  sum((Obs - Expected)^2 / Expected)
}

#-------------------------------------------
# Uncomment below if you haven't imported GSS2002 yet.
GSS2002 <- read.csv("GSS2002.csv")
Education <- subset(GSS2002, select = Education, drop = TRUE)
DeathPenalty <- subset(GSS2002, select = DeathPenalty, drop = TRUE)
# Alternatively
# Education <- GSS2002$Education
# DeathPenalty <- GSS2002$DeathPenalty

table(Education, DeathPenalty)

# Use the function created above to calculate the chi-square statistic
observed <- chisq(table(Education, DeathPenalty))
observed

# Find those rows where there is at least one NA
index <- which(is.na(Education) | is.na(DeathPenalty))
# Remove those rows from the two variables and define Educ2 and
# DeathPenalty2 to be the new vectors with those rows removed
Educ2 <- Education[-index]
DeathPenalty2 <- DeathPenalty[-index]

B <- 10^4 - 1
result <- numeric(B)
for (i in 1:B) {
  # Permute the response to break any association with education
  DP.permutation <- sample(DeathPenalty2)
  GSS.table <- table(Educ2, DP.permutation)
  result[i] <- chisq(GSS.table)
}

# Create a histogram of the permutation distribution
hist(result, col = "lemonchiffon", xlab = "chi-square statistic",
     main = "Distribution of chi-square statistic")
abline(v = observed, col = "blue", lty = 5)

# optional: Create a histogram with the density curve
# imposed onto the histogram
# The prob=TRUE option below scales the histogram to have area 1
hist(result, xlab = "chi-square statistic", col = "salmon",
     main = "Distribution of chi-square statistic",
     ylim = c(0, .2), prob = TRUE)
abline(v = observed, col = "blue", lty = 5)
# Overlay the theoretical chi-square density (df = 4) for comparison
curve(dchisq(x, df = 4), from = 0, to = 25, col = "brown", add = TRUE)

# Compute permutation P-value
(sum(result >= observed) + 1) / (B + 1)
# Compare with the theoretical chi-square tail probability
1 - pchisq(23.45, 4)

# Built-in chi-square test (asymptotic, then simulated P-value)
chisq.test(Education, DeathPenalty)
chisq.test(Education, DeathPenalty, simulate.p.value = TRUE)

# Chapter 3.4.2 chi-square reference distribution:
# densities for several degrees of freedom on one plot
xs <- 0:60
plot(xs, dchisq(xs, df = 8), type = "l", col = "springgreen4",
     xlab = "x", ylab = "",
     main = "Densities for the chi-square distribution")
curve(dchisq(x, df = 15), type = "l", col = "paleturquoise3", add = TRUE)
curve(dchisq(x, df = 25), type = "l", col = "steelblue3", add = TRUE)
curve(dchisq(x, df = 40), type = "l", col = "slategray4", add = TRUE)
legend(x = 40, y = 0.10, c("df=8", "df=15", "df=25", "df=40"),
       lwd = c(2, 2, 2, 2),
       col = c("springgreen4", "paleturquoise3", "steelblue3", "slategray4"))

# Chapter 3.5 chi-square test of independence
bullied.mat <- rbind(c(42, 50), c(30, 87))
bullied.mat
Xsq <- chisq(bullied.mat)
# Bug fix: the statistic was stored in Xsq (there was no `cs` object)
1 - pchisq(Xsq, df = 1)
chisq.test(bullied.mat)

# Chapter 3.6 test of homogeneity
candy.mat <- rbind(c(42, 20, 38), c(33, 27, 50))
candy.mat
chisq(candy.mat)
chisq.test(candy.mat)

# Chapter 3.7 goodness-of-fit: all parameters known
# (equal expected counts across the four categories)
birth.mat <- c(150, 138, 140, 100)
birth.mat
n <- sum(birth.mat)
expected <- rep(1, 4) * n / 4
Xsq <- sum((birth.mat - expected)^2 / expected)
chisq.test(birth.mat)

# Example 3.9
# X ~ Exp(1) ?
distr.mat <- rbind(c(30, 30, 22, 18), c(22, 30.6, 18.6, 28.7))
distr.mat
chisq(distr.mat)
chisq.test(distr.mat)

# Example 3.9
# X ~ Xsq(df=10) ?
# Quintile boundaries of the chi-square(10) distribution
qchisq(c(.2, .4, .6, .8), 10)
obs <- c(7, 7, 10, 11, 15)
expected <- rep(10, 5)        # 50 observations spread over 5 equal bins
Xsq <- sum((obs - expected)^2 / expected)
chisq.test(obs)

# Chapter 3.8 goodness-of-fit: some parameters estimated
Phillies2009 <- read.csv("Phillies2009.csv")
Homeruns <- subset(Phillies2009, select = HomeRuns, drop = TRUE)
# alternatively
# Homeruns <- Phillies2009$Homeruns
nGames <- length(Homeruns)
nHomeruns <- sum(Homeruns)
nHomeruns / nGames

# Estimate the Poisson rate from the data
lambda <- mean(Homeruns)
lambda

table(Homeruns) / nGames
dpois(0:5, lambda)
table(Homeruns)
dpois(0:5, lambda) * nGames

# The last count is too small to include in a chi-square analysis,
# so combine the last two categories
obs <- c(43, 52, 40, 17, 10)
# Bug fix: the Poisson-based expected counts for the first four
# categories must be defined BEFORE computing the last one;
# the original computed lastExpectedValue from the stale `expected`
# (rep(10,5)) left over from the previous example.
expected <- c(40.6, 56.2, 38.9, 17.9)
# Last cell absorbs the remainder so expected counts sum to sum(obs)
lastExpectedValue <- sum(obs) - sum(expected)
expected <- c(expected, lastExpectedValue)
Homeruns.mat <- rbind(obs, expected)
Homeruns.mat
Xsq <- sum((obs - expected)^2 / expected)
Xsq
# df = 5 categories - 1 - 1 estimated parameter (lambda) = 3
1 - pchisq(Xsq, 3)