# Chihara and Hesterberg, chap 6 ## Chapter 6: Estimation # maximum likelihood estimation # X ~ Binomial(n,p) # estimate p L <- function (p) { p^5*(1-p)^3 } curve(L, from=0, to=1, col="red", xlab="p", ylab="L(p)", main="Maximum Likelihood Function, L(p)") # maximum likelihood estimation # X ~ Poisson(lambda) # estimate lambda e <- 2.71828183 L <- function (lambda) { lambda^17 * e^(-4*lambda) - log(factorial(3)*factorial(4)*factorial(3)*factorial(7))} curve(L, from=0, to=10, col="red", xlab="lambda", ylab="L(lambda)", main="Maximum Likelihood Function, L(lambda)") data <- c(3,4,3,7) lambda <- mean(data) #---------------------------------------------------------- ## Wind speed case study: fitting the Weibull distribution #------------------------------------------------------------ # This function takes input shape parameter k and # the data to compute # (1/k)+ (1/n)*sum (log(xi)) +(1/alpha)sum xi^klog(xi)=0 # where alpha= sum xi^k. weibull.shape <- function(k, data) { numer <- colSums(outer(data, k, "^") * log(data)) denom <- colSums(outer(data, k, "^")) numer/denom - 1/k - mean(log(data)) } #----- # This function takes input shape parameter k # and data to compute # k^{th} root of (1/n) sum xi^k # n=number of data values weibull.scale <- function(k, data) { mean(data^k)^(1/k) } ##----- # uniroot is a built-in R function which estimates the root # of a function. # Provide function, any arguments needed for function, # and a guess of values two values around root. # Function values must be opposite signs at lower # and upper guess. Turbine <- read.csv("Turbine.csv") head(Turbine) #Now, we do the data specific commands wind <- Turbine\$AveSpeed #alternatively, wind <- subset(Turbine, select=AveSpeed, drop=TRUE) length(wind) uniroot(weibull.shape, data = wind, lower = 1, upper = 5) # With estimate of shape parameter, now find estimate # of scale parameters weibull.scale(3.169, wind) # Plot histogram with density curve overlap # The prob=TRUE option scales histogram to area 1. hist(wind, main = "Distribution of average wind speeds", xlab = "meters/sec", prob = TRUE, col="skyblue1") curve(dweibull(x, 3.169, 7.661), add = TRUE, col = "blue", lwd = 2) dev.new() plot.ecdf(wind,main = "ECDF of wind data", col="seashell3") curve(pweibull(x,3.169,7.661), add=TRUE, col="blue",lwd=2) # Now for the chi-square goodness of fit test # Get the deciles q <- quantile(wind, seq(0,1,by=.1)) # Get the counts in each sub-interval. The plot=F command # suppresses plot and gives statistics hist(wind, breaks=q, plot=F)\$counts # repeat above but store output count <- hist(wind,breaks=q,plot=F)\$counts expected <- length(wind)*.1 # compute chi-square test statistic sum((count-expected)^2/expected) #End wind speed case study #----------------------------------------- # Example 6.13 # Simulation comparing two estimators for uniform my.mean <- numeric(1000) my.max <- numeric(1000) #set.seed(100) for (i in 1:1000) { x <- runif(25,0,12) #sample n=25 from Unif[0,12] my.mean[i] <- 2*mean(x) my.max[i] <- 25/24*max(x) } #mean and standard deviation of the method of moments estimate mean(my.mean) sd(my.mean) #mean and sd of estimate from MLE mean(my.max) sd(my.max) par(mfrow=c(1,2)) hist(my.mean, xlim=c(8,16),ylim=c(0,650),xlab="means", main="2*Sample mean", col="seagreen1") hist(my.max, xlim=c(8,16),ylim=c(0,650),xlab="maximums", main="25/24*maximum", col="mediumturquoise") par(mfrow=c(1,1)) #---------------------- #Example 6.14 #Relative efficiency for the Wind Speed Case Study # Theta = P(wind > 5), nonparametric and parametric estimates n <- length(wind) theta1 <- mean(wind > 5) theta2 <- pweibull(5, 3.169, 7.661, lower.tail = FALSE) theta1 # 0.75 theta2 # 0.7720827 eta1 <- quantile(wind, .1) eta2 <- qweibull(.1, 3.169, 7.661) eta1 # 3.77 eta2 # 3.766038 # nonparametric bootstrap to find standard errors set.seed(23) B <- 10^4 boot.theta1 <- numeric(B) boot.shape <- numeric(B) boot.scale <- numeric(B) boot.theta2 <- numeric(B) boot.eta1 <- numeric(B) boot.eta2 <- numeric(B) for (i in 1:B) { boot.wind <- wind[sample(1:n, replace=TRUE)] boot.theta1[i] <- mean(boot.wind > 5) boot.shape[i] <- uniroot(weibull.shape, data=boot.wind, lower=1,upper=5)\$root boot.scale[i] <- weibull.scale(boot.shape[i], boot.wind) boot.theta2[i] <- pweibull(5, boot.shape[i], boot.scale[i], lower.t = FALSE) boot.eta1[i] <- quantile(boot.wind, .1) boot.eta2[i] <- qweibull(.1, boot.shape[i], boot.scale[i]) } qqnorm(boot.theta1, col="royalblue4") # discrete, normal qqnorm(boot.theta2, col="rosybrown4") # continuous, normal qqnorm(boot.eta1, col="turquoise2") # discrete, irregular, roughly normal qqnorm(boot.eta2, col="violet") # continuous, very normal sd(boot.theta1) # 0.03331767 sd(boot.theta2) # 0.02342771 var(boot.theta1) / var(boot.theta2) # relative efficiency : 2.022504 # Formula standard error, for comparison sqrt(theta1 * (1-theta1) / n) # standard error = 0.03340766 sd(boot.eta1) # 0.2161054 sd(boot.eta2) # 0.1839509 var(boot.eta1) / var(boot.eta2) # relative efficiency : 1.380154 # parametric bootstrap to find standard errors set.seed(23) pboot.theta1 <- numeric(B) pboot.shape <- numeric(B) pboot.scale <- numeric(B) pboot.theta2 <- numeric(B) pboot.eta1 <- numeric(B) pboot.eta2 <- numeric(B) for (i in 1:B) { boot.wind <- rweibull(n, 3.169, 7.661) pboot.theta1[i] <- mean(boot.wind > 5) pboot.shape[i] <- uniroot(weibull.shape, data=boot.wind, lower=1,upper=5)\$root pboot.scale[i] <- weibull.scale(pboot.shape[i], boot.wind) pboot.theta2[i] <- pweibull(5, pboot.shape[i], pboot.scale[i], lower.t=FALSE) pboot.eta1[i] <- quantile(boot.wind, .1) pboot.eta2[i] <- qweibull(.1, pboot.shape[i], pboot.scale[i]) } etaRange <- range(boot.eta1, boot.eta2, pboot.eta1, pboot.eta2) qqnorm(pboot.theta1, col="sandybrown") # discrete, slight neg skewness qqnorm(pboot.theta2, col="red1") # continuous, normal qqnorm(pboot.eta1, col="papayawhip") # very normal, continuous qqnorm(pboot.eta2, col="peru") # very normal, continuous sd(pboot.theta1) # 0.03198990 sd(pboot.theta2) # 0.02604409 var(pboot.theta1) / var(pboot.theta2) # relative efficiency : 1.508716 # Similar to the nonparametric bootstrap, though relative efficiency differs sd(pboot.eta1) # 0.2823103 sd(pboot.eta2) # 0.2100147 var(pboot.eta1) / var(pboot.eta2) # relative efficiency : 1.806982 ##----------------------------------------------------- # Example 6.15 mean square error n <- 16 curve(x*(1-x)/n, from=0, to=1, xlab="p", ylab="MSE", col="plum1", main="Mean square error") curve(n*(1-x)*x/(n+2)^2 + (1-2*x)^2/(n-2)^2, add=TRUE, col="blue", lty=2) legend(x=.82, y=.014, c("p1", "p2"), lwd=c(2,2), col=c("plum1", "blue"))