# OLS likelihood function (my way) --
# Note: I am going to write the LF using sigma2=sigma^2 and not sigma.
ols.lf1 <- function(theta, y, X) {
  beta <- theta[-1]
  sigma2 <- theta[1]
  if (sigma2 <= 0) return(NA)
  n <- nrow(X)
  e <- y - X%*%beta
  # t() is the matrix transpose, so t(e)%*%e is the residual sum of squares
  logl <- ((-n/2)*log(2*pi)) - ((n/2)*log(sigma2)) - ((t(e)%*%e)/(2*sigma2))
  return(drop(-logl)) # drop() the 1x1 matrix to a scalar; negated since optim() does minimisation by default.
}
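
# A quick sanity check against lm() on simulated data. The data and the
# object names (y, X, fit, theta.hat) are illustrative assumptions, not part
# of the original text; at the MLE sigma2 = RSS/n, ols.lf1() should equal
# minus the log-likelihood that lm() reports.
set.seed(1)
n <- 100
X <- cbind(1, runif(n))
y <- drop(X %*% c(2, 3) + rnorm(n))
fit <- lm(y ~ X - 1)
theta.hat <- c(mean(resid(fit)^2), coef(fit))  # MLE of sigma2 is RSS/n
ols.lf1(theta.hat, y, X)                       # should equal -logLik(fit)
-logLik(fit)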

# A variant: Shift to logs for sigma2 so I can do unconstrained maximisation --
ols.lf1.inlogs <- function(truetheta, y, X) {
  ols.lf1(c(exp(truetheta[1]), truetheta[-1]), y, X)
}
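
# Sketch: unconstrained maximisation via optim(), reusing the simulated y and
# X from the check above; the starting values here are illustrative
# assumptions. exp() back-transforms the first parameter to sigma2.
p <- optim(c(log(1), 0, 0), ols.lf1.inlogs, method = "BFGS", y = y, X = X)
exp(p$par[1])  # MLE of sigma2, approx. theta.hat[1]
p$par[-1]      # MLE of beta, approx. coef(fit)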

# Write it in another (more readable but costly) way --
ols.lf2 <- function(theta, y, X) {
  beta <- theta[-1]
  if (theta[1] <= 0) return(NA)
  sigma <- sqrt(theta[1])
  e <- (y - X%*%beta)/sigma
  logl <- sum(log(dnorm(e)/sigma))
  return(-logl)
} # Most people find this more readable than ols.lf1().
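
# "Costly" because it evaluates the density and takes logs elementwise via
# dnorm() instead of using the residual sum of squares directly; a rough
# timing sketch (the repetition count is an illustrative assumption):
system.time(for (i in 1:5000) ols.lf1(theta.hat, y, X))
system.time(for (i in 1:5000) ols.lf2(theta.hat, y, X))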

# Pretty code, by Douglas Bates --
ols.lf3 <- function(theta, y, X) {
  if (theta[1] <= 0) return(NA)
  -sum(dnorm(y, mean = X %*% theta[-1], sd = sqrt(theta[1]), log = TRUE))
}         # dnorm(..., log=TRUE) is numerically superior to log(dnorm())
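
# Why log = TRUE matters: deep in the tails dnorm() underflows to 0, so
# log(dnorm()) returns -Inf while the log-scale computation stays finite:
log(dnorm(40))         # -Inf: dnorm(40) underflows to 0
dnorm(40, log = TRUE)  # -800.9189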

# NOTE: The above four functions are all different ways of writing the same
# thing. ols.lf1(), ols.lf2() and ols.lf3() return identical values;
# ols.lf1.inlogs() is the same likelihood reparameterised in log(sigma2).
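
# Quick check that they agree (reusing the illustrative theta.hat, y, X
# defined above):
c(ols.lf1(theta.hat, y, X), ols.lf2(theta.hat, y, X), ols.lf3(theta.hat, y, X))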

# This is the gradient of ols.lf{1,2,3}() but not of ols.lf1.inlogs();
# it is written in the (mathematical) notation of ols.lf1().
ols.gradient <- function(theta, y, X) {
  beta <- theta[-1]
  sigma2 <- theta[1]
  e <- y - X%*%beta
  n <- nrow(X)

  g <- numeric(length(theta))
  g[1] <- (-n/(2*sigma2)) + (t(e)%*%e)/(2*sigma2^2)      # d logl / d sigma2
  g[-1] <- (t(X) %*% e)/sigma2                           # d logl / d beta

  return(-g) # negated to match the negative log-likelihood above
}
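
# Sketch: check the analytic gradient against a numerical one (numDeriv is an
# assumed add-on package; any numerical-derivative routine would do), then
# hand both the function and its gradient to optim() for a BFGS fit.
if (requireNamespace("numDeriv", quietly = TRUE)) {
  print(numDeriv::grad(ols.lf1, theta.hat, y = y, X = X))  # numerical
  print(ols.gradient(theta.hat, y, X))                     # analytic: should match
}
p2 <- optim(c(1, 0, 0), ols.lf1, gr = ols.gradient, method = "BFGS", y = y, X = X)
p2$par  # approx. theta.hat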