# Rosner, chapter 10 # ================================= # significance # from chapter 7 significance <- function(p){ if (.01 <= p & p < .05) { print('results.significant') } else if (.001 <= p & p < .01) { print('results.highly.significant') } else if (p < .001) { print('results.very.highly.significant') } else { print('results.not.statistically.significant') } } # ================================= # example 10.4 cancer # older n cancer # 683 3220 yes # 1498 10245 no # younger == age.at.first.birth <= 29 # older == age.at.first.birth >= 30 n.with.cancer <- 3220 n.without.cancer <- 10245 older.with.cancer <- 683 older.without.cancer <- 1498 younger.with.cancer <- n.with.cancer - older.with.cancer younger.without.cancer <- n.without.cancer - older.without.cancer # ================================= # two-sample test for binomial proportions (normal theory test) # H0 : p1 == p2 # H1 : p1 != p2 # test statistic two.sample.test.for.binomial.proportions <- function(x1, x2, n1, n2){ p1Hat <- x1 / n1 p2Hat <- x2 / n2 pHat <- (x1 + x2) / (n1 + n2) qHat <- 1 - pHat (abs(p1Hat - p2Hat) - (1/(2 * n1) + 1/(2 * n2))) / sqrt(pHat * qHat * (1/n1 + 1/n2)) } z <- two.sample.test.for.binomial.proportions(x1, x2, n1, n2) # ================================= # example 10.5 cancer p1Hat <- older.with.cancer / n.with.cancer p2Hat <- older.without.cancer / n.without.cancer x1 <- older.with.cancer x2 <- older.without.cancer n1 <- n.with.cancer n2 <- n.without.cancer # common proportion pHat <- (x1 + x2) / (n1 + n2); pHat qHat <- 1 - pHat; qHat # test appropriate? n1 * pHat * qHat >= 5 n2 * pHat * qHat >= 5 # test statistic z <- two.sample.test.for.binomial.proportions(x1, x2, n1, n2); z # => 8.825256 # p value p <- 2 * (1 - pnorm(z)); p significance(p) # ================================= # example 10.6 cardiovascular disease # mi n oc # 13 5000 yes # 7 10000 no # H0 : p1 == p2 # H1 : p1 != p2 n.oc <- 5000 n.oc.mi <- 13 n.non.oc <- 10000 n.non.oc.mi <- 7 n1 <- n.oc p1Hat <- n.oc.mi / n.oc; p1Hat n2 <- n.non.oc p2Hat <- n.non.oc.mi / n.non.oc; p2Hat x1 <- n.oc.mi x2 <- n.non.oc.mi # common proportion pHat <- (x1 + x2) / (n1 + n2); pHat qHat <- 1 - pHat; qHat # test appropriate? n1 * pHat * qHat >= 5 n2 * pHat * qHat >= 5 # test statistic -- not quite the same as the text z <- two.sample.test.for.binomial.proportions(x1, x2, n1, n2); z # => 2.768839 # p value p <- 2 * (1 - pnorm(z)); p # => 0.005625635 significance(p) # => "results.highly.significant" # ================================= # example 10.7 cancer # from example 10.4 cancer # older younger n cancer # 683 2537 3220 yes # 1498 8747 10245 no # 2181 11284 13465 total # younger == age.at.first.birth <= 29 # older == age.at.first.birth >= 30 a <- 683 b <- 2537 c <- 1498 d <- 8747 (cancer.table <- rbind(c(a, b), c(c, d))) # ================================= # example 10.8 cardiovascular disease # from example 10.6 cardiovascular disease # mi no.mi n oc # 13 4987 5000 yes # 7 9993 10000 no # 20 14980 15000 total a <- 13 b <- 4987 c <- 7 d <- 9993 (cardiovascular.table <- rbind(c(a, b), c(c, d))) # ================================= # example 10.9 nutrition # 1st\2nd hi normal total # hi 15 5 20 # normal 9 21 30 # total 24 26 50 a <- 15 b <- 5 c <- 9 d <- 21 (nutrition.table <- rbind(c(a, b), c(c, d))) # ================================= # expected table expected.table <- function(observed.table){ outer(rowSums(observed.table), colSums(observed.table)) / sum(observed.table); } # ================================= # example 10.10 cancer # expected table row.totals <- c(3220, 10245) col.totals <- c(2181, 11284) grand.total <- sum(row.totals); grand.total consistency.check <- sum(row.totals) == sum(col.totals); consistency.check expected.table.cancer <- outer(row.totals, col.totals) / grand.total; expected.table # check expected.table(cancer.table) # ================================= # example 10.11 cardiovascular disease # expected table row.totals <- c(5000, 10000) col.totals <- c(20, 14980) grand.total <- sum(row.totals); grand.total consistency.check <- sum(row.totals) == sum(col.totals); consistency.check expected.table.mi <- outer(row.totals, col.totals) / grand.total; expected.table.mi # check expected.table(cardiovascular.table) # ================================= # example 10.12 cardiovascular disease # from example 10.10 cancer rowSums(expected.table.cancer) colSums(expected.table.cancer) # ================================= # chi squared test, Yeats corrected chi.squared.yeats <- function(table){ o <- table; e <- outer(rowSums(table), colSums(table)) / sum(table); sum((abs(o - e) - 1/2)^2 / e) } # ================================= # example 10.13 cancer # from example 10.4 cancer # older younger cancer # 683 2537 yes # 1498 8747 no cancer.data <- rbind(c(683, 2537), c(1498, 8747)); data # test statistic cs <- chi.squared.yeats(cancer.data); cs # p value p <- 1 - pchisq(cs, df=1); p significance(p) # ================================= # example 10.14 cardiovascular disease # from example 10.8 cardiovascular disease # mi no.mi n oc # 13 4987 5000 yes # 7 9993 10000 no # 20 14980 15000 total row1 <- c(13, 4987) row2 <- c(7, 9993) data <- rbind(row1, row2); data cardiovascular.data <- data # test that procedure is valid e <- outer(rowSums(data), colSums(data)) / sum(data); e proceed <- min(e) >= 5; proceed # test statistic cs <- chi.squared.yeats(data); cs # p value p <- 1 - pchisq(cs, df=1); p significance(p) # ================================= # chi squared test, Yeats corrected, short form # mat : a b # c d chi.squared.yeats.short <- function(a, b, c, d){ n <- a + b + c + d; n * (abs(a * d - b * c) - n/2)^2 / ((a + b) * (c + d) * (a + c) * (b + d)) } # ================================= # example 10.15 nutrition a <- 15 b <- 5 c <- 9 d <- 21 cs <- chi.squared.yeats.short(a, b, c, d); cs # critical value alpha <- .05 critical.value <- qchisq(1 - alpha, df=1); critical.value accept.HO <- cs <= critical.value; accept.HO # p value p <- 1 - pchisq(cs, df=1); p # ================================= # R : chisq.test() # http://www.r-tutor.com/elementary-statistics/goodness-fit/chi-squared-test-independence data <- rbind(c(a, b), c(c, d)); data chisq.test(data) # ================================= # example 10.17 cardiovascular disease, nutrition a <- 2 b <- 23 c <- 5 d <- 30 data <- rbind(c(a, b), c(c, d)); data # test that normal methods procedure is valid e <- outer(rowSums(data), colSums(data)) / sum(data); e proceed <- min(e) >= 5; proceed # ================================= # example 10.18 exact probability mat <- rbind(c(2, 5), c(3, 1)); mat exact.probability <- function(mat){ prod(factorial(rowSums(mat))) * prod(factorial(colSums(mat))) / (factorial((sum(mat))) * prod(factorial(mat))) } p <- exact.probability(mat); p # ================================= # R : Fisher's Exact Test # R-bloggers : Fisher's Exact Test # http://www.r-bloggers.com/contingency-tables-%E2%80%93-fisher%E2%80%99s-exact-test/ # Chi-square, Fisher's exact, and McNemar's test # http://yatani.jp/HCIstats/ChiSquare mat <- matrix(c(2, 5, 3, 1), nrow=2); mat fisher.test(mat) # ================================= # example 10.19 enumeration # enumeration of all possible tables with same row and col sums # col sums : m1, m2 # row sums : n1, n2 # original matrix mat <- rbind(c(2, 23), c(5, 30)); mat rowSums(mat) colSums(mat) # 1st matrix in series : a == 0 a <- mat[1,1] b <- mat[1,2] c <- mat[2,1] d <- mat[2,2] mat0 <- rbind(c(0, a+b), c(a+c, d-a)); mat0 exact.probability(mat0) # sequence of matrices kth.mat <- function(mat, k){ mat + matrix(c(k, -k, -k, k), nrow=2) } for (i in 0:7) { m <- kth.mat(mat=mat0, k=i) print(m) cat("p == ", exact.probability(m), "\n\n") } # vector of probabilities n <- 7 ps <- numeric(n+1) for (i in 0:n) { ps[i+1] <- exact.probability(kth.mat(mat=mat0, k=i)) } ps # ================================= # example 10.20 cardiovascular disease, nutrition a <- 2 k <- 7 left <- ps[1:(a+1)]; left lefthand.tail <- sum(left); lefthand.tail right <- ps[(a+1):(k+1)]; right righthand.tail <- sum(right); righthand.tail p <- 2 * min(lefthand.tail, righthand.tail, .5); p significance(p) # alternate approach to computing p # compare with p returned by fisher.test(), below rbind(0:7, ps) # add all the p's which are <= P(original matrix) mat <- rbind(c(2, 23), c(5, 30)); mat exact.probability(mat) p <- sum(c(ps[1:3], ps[5:8])); p # using R : Fisher's Exact Test # original matrix mat <- rbind(c(2, 23), c(5, 30)); mat fisher.test(mat) # ================================= # example 10.21 cancer # matched pair data : McNemar's Test # patients as the sampling unit # units : patients # cols : survive for 5 yrs die within 5 yrs # rows : treatment A # treatment B mat <- rbind(c(526, 95), c(515, 106)); mat rowSums(mat) colSums(mat) sum(mat) # matched pairs as the sampling unit # units : treatment A patient \ treatment B patient # cols : survive for 5 yrs die within 5 yrs # rows : # survive for 5 yrs # die within 5 mat <- rbind(c(510, 16), c(5, 90)); mat rowSums(mat) colSums(mat) sum(mat) # independence? ... no, they are dependent # P(B survived | A survived) # == P(B and A survived) ? P(A survived) 510 / (510 + 16) # P(B survived | A died) # == P(B survived and A died) ? P(A died) 5 / (5 + 90) # ================================= # example 10.22 cancer concordant.pairs <- 510 + 90; concordant.pairs discordant.pairs <- 5 + 16; discordant.pairs # ================================= # example 10.23 cancer typeA.discordant.pairs <- 5; typeA.discordant.pairs typeB.discordant.pairs <- 16; typeB.discordant.pairs # ================================= # example 10.24 cancer # McNemar's Test n.A <- 5; n.D <- 5 + 16 # test appropriate? test.appropriate <- n.D >= 20; test.appropriate # test statistic cs <- (abs(n.A - n.D / 2) - .5)^2 / (n.D / 4); cs # critical value alpha <- .05 critical.value <- qchisq(1 - alpha, df=1); critical.value accept.H0 <- cs <= critical.value; accept.H0 # p value p <- 1 - pchisq(cs, df=1); p significance(p) # ================================= # R : McNemar's Test mat <- matrix(c(510, 5, 16, 90), nrow=2, dimnames=list("treatment A"=c("survived", "died"), "treatment B"=c("survived", "died"))); mat mcnemar.test(mat) # ================================= # pBinomial # X ~ Binomial(n,p) => P(X == k) == dbinom(x=k, size=n, prob=p) # ================================= # example 10.25 hypertension # Exact Test # McNemar's Test and Exact McNemar's Test # http://www.statsdirect.com/help/chi_square_tests/mat.htm # http://yatani.jp/HCIstats/ChiSquare # R : package exact2x2 # http://cran.r-project.org/web/packages/exact2x2/exact2x2.pdf mat <- matrix(c(3, 1, 7, 9), nrow=2, dimnames=list("computer"=c("hypertension +", "hypertension -"), "trained observer"=c("hypertension +", "hypertension -"))); mat n.A <- 7 n.D <- 8 # test appropriate? (for McNemar's Test) # not appropriate, so use Exact Test test.appropriate <- n.D >= 20; test.appropriate p <- 2 * sum(dbinom(x=n.A:n.D, size=n.D, prob=1/2)); p significance(p) # R : package exact2x2 with mcnemar.test() # first, install package exact2x2 (with its dependencies) library(exact2x2) mcnemar.exact(mat) # ================================= # example 10.26 and 10.27 cancer, nutrition p1 <- 150/10^5 q1 <- 1 - p1 p2 <- .8 * p1 q2 <- 1 - p2 alpha <- .05 beta <- .20 k <- 1 pBar <- (p1 + k * p2) / (1 + k); pBar qBar <- 1 - pBar; qBar z.alpha.975 <- qnorm(1 - alpha/2); z.alpha.975 z.beta.80 <- qnorm(1 - beta); z.beta.80 delta <- abs(p2 - p1) # estimate of sample sizes n1 <- (sqrt(pBar * qBar * (1 + 1/k)) * z.alpha.975 + sqrt(p1 * q1 + p2 * q2 / k) * z.beta.80)^2 / delta^2; n1 n2 <- k * n1; n2 # ================================= # example 10.28 otolaryngology p1 <- .5 q1 <- 1 - p1 p2 <- .7 q2 <- 1 - p2 n1 <- 100 n2 <- 100 alpha <- .05 delta <- .2 pBar <- (n1 * p1 + n2 * p2) / (n1 + n2); pBar qBar <- 1 - pBar; qBar # power z.alpha.975 <- qnorm(1 - alpha/2); z.alpha.975 denom <- sqrt(p1 * q1 / n1 + p2 * q2 / n2) q <- delta / denom - z.alpha.975 * sqrt(pBar * qBar * (1/n1 + 1/n2)) / denom power <- pnorm(q); power # ================================= # example 10.29 cancer p.D <- .15 p.A <- 2/3 q.A <- 1 - p.A alpha <- .05 beta <- .10 z.alpha.975 <- qnorm(1 - alpha/2) z.beta.90 <- qnorm(1 - beta) n.pairs <- (z.alpha.975 + 2 * z.beta.90 * sqrt(p.A * q.A))^2 / (4 * (p.A - .5)^2 * p.D); n.pairs n.individuals <- 2 * n.pairs; n.individuals # ================================= # example 10.30 cancer # data from example 10.29 cancer n <- 400 z.alpha.025 <- qnorm(alpha/2) z <- (z.alpha.025 + 2 * abs(p.A - .5) * sqrt(n * p.D)) / (2 * sqrt(p.A * q.A)); z power <- pnorm(z); power # ================================= # example 10.31 and 10.32 cardiovascular disease mi.incidence.per.year <- .005 relative.risk <- .8 duration.years <- 5 dropout.rate <- .10 dropin.rate <- .05 alpha <- .05 beta <- .20 incidence.5.years <- duration.years * mi.incidence.per.year p2 <- incidence.5.years q2 <- 1 - p2 p1 <- relative.risk * p2 q1 <- 1 - p1 lambda1 <- dropout.rate lambda2 <- dropin.rate p1.star <- (1 - lambda1) * p1 + lambda1 * p2; p1.star q1.star <- 1 - p1.star p2.star <- (1 - lambda2) * p2 + lambda2 * p1; p2.star q2.star <- 1 - p2.star delta.star <- abs(p1.star - p2.star); delta.star delta <- abs(p1 - p2); delta pBar.star <- (p1.star + p2.star) / 2; pBar.star qBar.star <- 1 - pBar.star; qBar.star z.alpha.975 <- qnorm(1 - alpha/2) z.beta.80 <- qnorm(1 - beta) numer <- (sqrt(2 * pBar.star * qBar.star) * z.alpha.975 + sqrt(p1.star * q1.star + p2.star * q2.star) * z.beta.80)^2 n1 <- numer / delta.star^2; n1 n2 <- n1; n2 # without conidering compliance numer <- (sqrt(2 * pBar.star * qBar.star) * z.alpha.975 + sqrt(p1 * q1 + p2 * q2) * z.beta.80)^2 n1 <- numer / delta^2; n1 n2 <- n1 # approximate formula n.perfect.compliance <- 13794 n1.approximate <- n.perfect.compliance / (1 - lambda1 - lambda2)^2; n1.approximate n2.approximate <- n1.approximate # ================================= # example 10.33 cancer mat <- matrix(c(320,1422,1206,4432,1011,2893,463,1092,220,406), nrow=2, dimnames=list("case-control"=c("case", "control"), "age at first birth"=c("<20", "20-24", "25-29", "30-34", ">=35"))); mat # ================================= # example 10.34 and 10.35 cancer o <- mat e <- outer(rowSums(mat), colSums(mat)) / sum(mat); e cs <- sum((o - e)^2 / e); cs # critical region alpha <- .05 r <- 2 c <- 5 q <- qchisq(p=(1 - alpha), df=(r-1) * (c-1)); q # p value p <- 1 - pchisq(cs, df=(r-1) * (c-1)); p significance(p) # ================================= # example 10.36 cancer scores <- 1:5 # ================================= # example 10.37 cancer ss <- 1:5 xs <- c(320,1206,1011,463,220) ns <- c(1742,5638,3904,1555,626) x <- sum(xs) n <- sum(ns) pBar <- x / n; pBar qBar <- 1 - pBar a <- sum(xs * ss) - x * sum(ns * ss) / n; a b <- pBar * qBar * (sum(ns * ss^2) - sum(ns * ss)^2 / n); b # test statistic cs <- a^2 / b; cs # critical value alpha <- .05 critical.value <- qchisq(1 - alpha, df=1); critical.value # p value p <- 1 - pchisq(cs, df=1); p significance(p) # trend trend.increasing <- a > 0; trend.increasing plot(x=ss, y= xs / ns, pch=19, col="dark red", xlab="scores", ylab="proportion of cases", main="Cancer incidence vs. age class at first birth") # ================================= # example 10.38 ophthalmology mat <- matrix(c(5,1,9,5,6,4,3,4,2,8,0,5,0,2,0,1), nrow=2, dimnames=list("case"=c("dominant", "sex-linked"), "visual acuity"=c("20-20", "20-25", "20-30", "20-40", "20-50", "20-60", "20-70", "20-80"))); mat dominant <- c(5,9,6,3,2,0,0,0) sex.linked <- c(1,5,4,4,8,5,2,1) scores <- c(3.5,13.5,25.5,34.0,42.5,50.0,53.5,55.0) xs <- dominant x <- sum(xs); x ss <- scores ns <- colSums(mat); ns n <- sum(ns); n o <- sum(dominant * scores); o e <- x * sum(ns * ss) / n; e v <- x * (n - x) * (sum(ns * ss^2) - sum(ns * ss)^2 / n) / (n * (n - 1)); v # test statistic cs <- (abs(o - e) - .5)^2 / v; cs # p value p <- 1 - pchisq(cs, df=1); p significance(p) # ================================= # example 10.39 and 10.40 hypertension o <- c(57, 330, 2132, 4584, 4604, 2119, 659, 251) e <- c(77.9, 547.1, 2126.7, 4283.3, 4478.5, 2431.1, 684.1, 107.2) total <- sum(o); total sum(e) xBar <- 80.68 s <- 12.00 # ================================= # example 10.41 hypertension k <- 2 g <- 8 cs <- sum((o - e)^2 / e); cs # p value p <- 1 - pchisq(cs, df=g-k-1); p significance(p) xs <- c(45,55,65,75,85,95,105,115) plot(x=xs, y=o, type="n", ylim=c(0,5000), xlab="group (mm Hg)", ylab="count", main="Frequency Distribution of Mean DBP") points(xs, o, pch=19, col="dark red") curve(10 * total * dnorm(x, mean=xBar, sd=s), from=45, to=115, col="orange", add=TRUE) legend(x=95, y=4200, c("observed", "expected"), lwd=c(2,2), col=c("dark red", "orange")) # ================================= # example 10.42 nutrition mat <- matrix(c(136,69,92,240), nrow=2, dimnames=list("survey 1"=c("<= 1", "> 1"), "survey 2"=c("<= 1", "> 1"))); mat p.o <- (136 + 240) / 537; p.o # ================================= # example 10.43 nutrition mat <- matrix(c(136,69,92,240), nrow=2, dimnames=list("survey 1"=c("<= 1", "> 1"), "survey 2"=c("<= 1", "> 1"))); n <- sum(mat) as <- rowSums(mat) / n bs <- colSums(mat) / n p.e <- sum(as * bs); p.e # ================================= # example 10.44 nutrition kappa <- (p.o - p.e) / (1 - p.e); kappa kappa.se <- sqrt((p.e + p.e^2 - sum(as * bs * (as + bs))) / (n * (1 - p.e)^2)); kappa.se # test statistic z <- kappa / kappa.se; z # p value p <- 1 - pnorm(z); p significance(p) # ================================= # interpreting kappa kappa.interpretation <- function(k){ if (k > 75) { print('excellent reproducibility') } else if (.4 <= k & k <= .75) { print('good reproducibility') } else if (0 <= k & k < .4) { print('marginal reproducibility') } } # ================================= # example 10.44 nutrition kappa.interpretation(kappa) # =================================