#----------------------------------------------------------------
# Example of maximum likelihood estimation of a simple mean model
#----------------------------------------------------------------
# Load a few required packages
#install.packages("numDeriv")
#install.packages("animation")
library(numDeriv)
library(animation)

#----------------------------------------------------------------
# Simulate a sample from a normally distributed population
#----------------------------------------------------------------
# Specify the population parameters
mu <- 150
sigma <- 20

# Number of observations
n <- 100

# Randomly sample n observations from the normal distribution
y <- rnorm(n = n, mean = mu, sd = sigma)

# Plot the distribution of the sample
hist(y, prob = TRUE, xlim = c(0, 300))
xseq <- seq(50, 250, len = 1000)

# Plot the distribution of the population
lines(xseq, dnorm(xseq, mean = mu, sd = sigma))

#--------
# - Change the parameters mu and sigma and see how the values in the sample change (note the fixed xlim on the histogram)
# - Change the number of observations: when is the sample distribution closer to the population distribution?
#--------

#----------------------------------------------------------------
# Maximum-likelihood function
#----------------------------------------------------------------
# Calculate the likelihood yourself
# The normal pdf evaluated at the first observation is the likelihood of the first observation!
# We use theta to make it a bit more generic (here theta = c(mu, log(sigma)))
theta <- c(150, log(20))

# The likelihood of the first observation (see the description of the function with ?dnorm)
dnorm(y[1], mean = theta[1], sd = exp(theta[2]))

# Plot the normal pdf together with the first observation and its likelihood value
xseq <- seq(-200, 400, len = 1000)
val <- dnorm(y[1], mean = theta[1], sd = exp(theta[2]))
plot(xseq, dnorm(xseq, mean = theta[1], sd = exp(theta[2])), type = "l", xlab = "y", ylab = "Normal pdf")
lines(c(y[1], y[1]), c(-1, val), col = 2)
lines(c(-250, y[1]), c(val, val), col = 2)
mtext("y[1]", 1, at = y[1], line = 2, col = 2)
mtext("L(y[1]|theta)", 2, at = val, line = 2.5, col = 2, padj = 1)

# Finally, do this for all observations and take the product: that's the likelihood of the sample given the parameter values in theta
prod(dnorm(y, mean = theta[1], sd = exp(theta[2])))

# Take the log to avoid such small numbers
log(dnorm(y, mean = theta[1], sd = exp(theta[2])))
sum(log(dnorm(y, mean = theta[1], sd = exp(theta[2]))))

#--------
# - Go up and change theta. What is the likelihood? Try some different values
# - What happens with the likelihood, does it always calculate nicely? Where is it highest? (see the sketch below)
#--------
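#--------
# Optional sketch (an addition to the original exercise): evaluate the
# log-likelihood on a grid of mu values, holding sigma at the population
# value, to see where it peaks. The peak should land close to mean(y)
mu_grid <- seq(100, 200, len = 200)
ll_grid <- sapply(mu_grid, function(m) sum(dnorm(y, mean = m, sd = sigma, log = TRUE)))
plot(mu_grid, ll_grid, type = "l", xlab = "mu", ylab = "log-likelihood")
abline(v = mu_grid[which.max(ll_grid)], col = 2)  # grid maximum
abline(v = mean(y), col = 4, lty = 2)             # sample mean for comparison
#--------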
#----------------------------------------------------------------
# Negative log-likelihood is better for optimization
#----------------------------------------------------------------
# Take the negative to pass as the objective function to an optimizer
-sum(dnorm(y, mean = theta[1], sd = exp(theta[2]), log = TRUE))

#--------
# - This is the negative log-likelihood, which is minimized by an optimizer
# - Its minimum is at the maximum of the likelihood
#--------

#----------------------------------------------------------------
# Maximum-likelihood estimation
#----------------------------------------------------------------
# Run an optimization of the negative log-likelihood function

# Define the negative log-likelihood as a function of the parameters theta
neg_loglike <- function(theta) {
  # Take the mean
  mu <- theta[1]
  # Take the standard deviation as the exponential of theta[2]
  # (the optimizer works on -Inf to Inf, while sigma goes from 0 to Inf)
  sigma <- exp(theta[2])
  # Calculate the log of the probability density function (pdf) for each observation, sum, and negate
  nll <- -sum(dnorm(y, mean = mu, sd = sigma, log = TRUE))
  # Another way (equal up to an additive constant):
  # nll <- n*log(sigma) + 1/2 * sum((y - mu)^2/sigma^2)
  return(nll)
}

# Carry out the optimization
nlminb(start = c(0, 0), objective = neg_loglike, control = list(trace = 1))

#--------
# - Did it find estimates close to the population mu and sigma? (a check against the closed-form MLE follows after the animation below)
# - Should it have found the exact population mu and sigma!?
# - Have a look at the neg_loglike function to see how it takes the parameters in a vector
#--------

#----------------------------------------------------------------
# Generate an animation of the optimization
#----------------------------------------------------------------
# Optimize the negative log-likelihood function starting at (mu = 0, log(sigma) = 0)
start <- c(0, 0)

# nlminb prints the current point at every iteration. The output is captured and saved in a temporary file
ss <- capture.output(opt <- nlminb(start = start, objective = neg_loglike, control = list(trace = 1)),
                     file = out.file <- tempfile())

# Read the trace back in from the temporary file
trace <- read.table(out.file, stringsAsFactors = FALSE)[ , -1]
colnames(trace) <- c("nll", "mu", "lsigma")
# The objective value is printed with a trailing colon, so strip it before converting
trace$nll <- as.double(sub(":", "", trace$nll))
# The height of the estimated pdf at the last iteration, used to fix the y-axis
ymax <- dnorm(trace$mu, mean = trace$mu, sd = exp(trace$lsigma))[nrow(trace)] * 1.2

# Animation running the optimization
oopt <- ani.options(interval = 0.1)
xseq <- seq(-200, 400, length.out = 400)
opar <- par(mfrow = c(1, 2), oma = c(3, 3, 4, 0), mar = c(4, 3, 1, 1), las = 1, cex = 0.8)
for (i in seq_len(nrow(trace))) {
  # The estimated distribution at the current iteration
  pdf <- dnorm(x = xseq, mean = trace$mu[i], sd = exp(trace$lsigma[i]))
  #
  plot(xseq, pdf, type = "l", xlab = "Observations", ylab = "PDF", ylim = c(0, ymax))
  legend("topright", lty = 1, col = c(1, 4, 2), c("Estimated", "Population", "Observations"))
  lines(xseq, dnorm(x = xseq, mean = mu, sd = sigma), col = "blue")
  rug(y, col = "red", lwd = 1)
  #
  plot(seq_len(i), trace$nll[1:i], xlim = c(1, nrow(trace)), ylim = range(trace$nll),
       type = "p", xlab = "Iteration", log = "y", pch = 16)
  #
  title(main = bquote(atop(list(Iteration == .(i),
                                -log(L) == .(formatC(trace$nll[i], digits = 1, format = "f"))),
                           list(mu == .(formatC(trace$mu[i], digits = 1, format = "f")),
                                sigma == .(formatC(exp(trace$lsigma[i]), digits = 1, format = "f"))))),
        outer = TRUE)
  #
  ani.pause(0.8)
}
# Restore the plotting parameters
par(opar)

#--------
# - Try going all the way back up and increase sigma to make the observations more spread along the axis (the red rug marks)
#--------
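#--------
# Optional sketch (an addition to the original exercise): for this model the
# MLE is available in closed form, so the optimizer's result can be checked
# directly. Note that the MLE of sigma divides by n, not n-1
mu_hat <- mean(y)
sigma_hat <- sqrt(mean((y - mu_hat)^2))
c(mu = mu_hat, sigma = sigma_hat)
c(mu = opt$par[1], sigma = exp(opt$par[2]))  # should match closely
#--------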
#----------------------------------------------------------------
# Generate another animation of the optimization
#----------------------------------------------------------------
# Contour plot of the negative log-likelihood function over the parameter space
mu_seq <- seq(-20, 300, length.out = 100)
lsigma_seq <- seq(-3, 12, length.out = 100)
#
pargrid <- as.matrix(expand.grid(mu = mu_seq, lsigma = lsigma_seq))
#
llc <- numeric(nrow(pargrid))
#
for (i in seq_len(nrow(pargrid))) { llc[i] <- neg_loglike(pargrid[i, ]) }
#
for (i in seq_len(nrow(trace))) {
  # expand.grid varies mu fastest, so fill the z matrix with nrow = length(mu_seq)
  contour(x = mu_seq, y = lsigma_seq, z = matrix(log(llc), nrow = length(mu_seq)),
          levels = c(seq(log(min(llc)), 8, length.out = 20), seq(8, 20, length.out = 30)),
          xlab = bquote(mu), ylab = bquote(log(sigma)))
  # The population parameters
  points(mu, log(sigma), col = "blue", pch = 16)
  # The path of the optimizer so far
  lines(trace$mu[seq_len(i)], trace$lsigma[seq_len(i)], col = "red", lwd = 2)
  points(trace$mu[seq_len(i)], trace$lsigma[seq_len(i)], col = "red", pch = 16)
  #
  title(main = bquote(atop(list(Iteration == .(i),
                                -log(L) == .(formatC(trace$nll[i], digits = 1, format = "f"))),
                           list(mu == .(formatC(trace$mu[i], digits = 1, format = "f")),
                                sigma == .(formatC(exp(trace$lsigma[i]), digits = 1, format = "f"))))))
  #
  ani.pause(0.4)
}

#--------
# - Dwell a bit on how the optimizer takes steps towards the minimum
#--------

#----------------------------------------------------------------
# The maximum likelihood estimates (MLE) compared to regression
#----------------------------------------------------------------
# The population mean
mu
# The MLE of the mean
opt$par[1]

# The population standard deviation
sigma
# The MLE of the standard deviation
exp(opt$par[2])

# Use linear regression and compare
fit <- lm(y ~ 1)
summary(fit)
# The sigma is estimated from the residuals
sd(fit$residuals)

#--------
# - Are the estimates from the two methods exactly the same?
# - Note the exponential for the standard deviation. It was used such that the optimizer can work in the range from -Inf to Inf, while the value used for sigma goes from 0 to Inf (a standard deviation cannot be negative). Without it the optimizer will often crash
#--------

#----------------------------------------------------------------
# What about the estimated uncertainty of the parameters with MLE?
#----------------------------------------------------------------
# I.e. the standard error ("Std. Error" in the regression summary)
summary(fit)

# Approximate the Hessian of the negative log-likelihood at the optimum
# (this is the observed Fisher information matrix)
H <- hessian(neg_loglike, opt$par)
# Invert it to get the covariance matrix of the estimates
I <- qr.solve(H)
# The correlation matrix of the estimates
cov2cor(I)
# Standard errors (standard deviation estimates of the theta parameters, i.e. mu and log(sigma))
sqrt(diag(I))
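#--------
# Optional sketch (an addition to the original exercise): approximate 95%
# Wald confidence intervals from the standard errors. They are computed on
# the theta scale, i.e. (mu, log(sigma)), so the interval for sigma is
# transformed back with exp()
se <- sqrt(diag(I))
opt$par[1] + c(-1, 1) * qnorm(0.975) * se[1]       # CI for mu
exp(opt$par[2] + c(-1, 1) * qnorm(0.975) * se[2])  # CI for sigma
# Compare the interval for mu with the one from the regression
confint(fit)
#--------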