# This file contains various functions to be applied
# to a data matrix X with n rows and p columns:
# 
# MVTMLE0(X,nu) computes/approximates the unique matrix
# S such that
#    S  =  (nu + p) *
#          average_i( X_i X_i' / (nu + X_i' S^{-1} X_i) )
# with X_i being the i-th observation (as a column vector).
# In case of nu = 0, we impose the additional constraint
# that det(S) = 1.
# 
# The underlying algorithm is the new "partial Newton
# method" introduced by
# 
# L. Duembgen, K. Nordhausen and H. Schuhmacher:
#  New Algorithms for M-Estimation of Multivariate
#  Scatter and Location.
#  Preprint 2013/2014, http://arxiv.org/abs/1312.6489.
# 
# For the sake of comparison, two alternative
# algorithms are implemented as well:
# - MVTMLE0_FP():
#      Iterating the fixed point equation (with
#      "iterative whitening" of the data).
# - MVTMLE0_G():
#      A gradient method.
# 
# MVTMLE(X,nu) computes/approximates the maximum
# likelihood estimator for location and scatter of a
# multivariate t distribution with nu >= 1 degrees of
# freedom, utilizing MVTMLE0() and the augmentation trick
# of Kent and Tyler (1991).
# 
# For the sake of comparison we also implemented
# algorithm III of Arslan et al. (1995), but with
# the same stopping criterion as in MVTMLE, as
# MVTMLE_FP3().
# 
# Lutz Duembgen, Klaus Nordhausen, Heike Schuhmacher
# May 19, 2015


###########################################
# Algorithms for the scatter-only problem #
###########################################

# Our favourite algorithm PN:

MVTMLE0 <- function(X,nu=0,delta=10^(-7),
	prewhitened=FALSE,
	steps=FALSE)
{
	Xnames <- colnames(X)
	Xs <- as.matrix(X)
	n <- dim(Xs)[1]
	p <- dim(Xs)[2]
	if (!prewhitened)
	{
		S <- crossprod(Xs)/n
		tmp <- eigen(S,symmetric=TRUE)
		tmpv <- sqrt(tmp$values)
		B <- t(t(tmp$vectors) * tmpv)
		Xs <- t((t(tmp$vectors)/tmpv) %*% t(Xs))
	} else
	{
		B <- diag(1,p)
	}

	denom <- nu + rowSums(Xs^2)
	Ys <- Xs / sqrt(denom)
	Psi <- (nu + p) * crossprod(Ys) / n
	Tmp <- eigen(Psi,symmetric=TRUE)
	nG <- sqrt(sum((1 - Tmp$values)^2))
	iter <- 0
	while (nG > delta && iter < 1000)
	{
		iter <- iter + 1
		B <- B %*% Tmp$vectors
		Xs <- Xs %*% Tmp$vectors
		Zs <- Xs^2/denom
		Ht <- diag(Tmp$values) -
			(nu + p) * crossprod(Zs)/n + (nu == 0)/p
		a <- qr.solve(Ht,Tmp$values - 1)

		# Check whether the new matrix
		# parameter is really better:
        a.half <- a/2
		Xs.new <- t(t(Xs)*exp(-a.half))
		denom.new <- nu + rowSums(Xs.new^2)
		DL <- (nu + p)*mean(log(denom.new/denom)) +
			sum(a)
		DL0 <- sum(a*(1 - Tmp$values))/4
		if (steps)
		{
			print(cbind(Before.iteration=iter,
				Norm.of.gradient=nG,
				Diff.M2LL.with.PN=DL))
		}
		if (DL <= DL0)
		{
			B <- t(t(B) * exp(a.half))
			Xs <- Xs.new
			denom <- denom.new
		} else
		# perform a step of the classical
		# fixed-point algorithm:
		{
			sqrtTmpValues <- sqrt(Tmp$values)
			B <- t(t(B) * sqrtTmpValues)
			Xs <- t(t(Xs) / sqrtTmpValues)
			denom <- nu + rowSums(Xs^2)
		}
		Ys <- Xs / sqrt(denom)
		Psi <- (nu + p) * crossprod(Ys) / n
		Tmp <- eigen(Psi,symmetric=TRUE)
		nG <- sqrt(sum((1 - Tmp$values)^2))
	}
	if (steps)
	{
		print(cbind(After.iteration=iter,
			Norm.of.gradient=nG))
	}
	S <- tcrossprod(B)
	rownames(S) <- colnames(S) <- Xnames
	if (nu == 0)
	{
		S <- S / det(S)^(1/p)
	}
	return(list(B=B,S=S,iter=iter))
}


# The fixed-point algorithm with stepwise
# pre-whitening of data:

MVTMLE0_FP <- function(X,nu=0,delta=10^(-7),
	steps=FALSE)
{
	n <- dim(X)[1]
	p <- dim(X)[2]
	S <- crossprod(X)/n
	tmp <- eigen(S,symmetric=TRUE)
	tmpv <- sqrt(tmp$values)
	B <- t(t(tmp$vectors) * tmpv)
	Xs <- t((t(tmp$vectors)/tmpv) %*% t(X))

	denom <- nu + rowSums(Xs^2)
	Ys <- Xs / sqrt(denom)
	Psi <- (nu + p) * crossprod(Ys)/n
	tmp <- eigen(Psi,symmetric=TRUE)
	nG <- sqrt(sum((1 - tmp$values)^2))
	iter <- 0
	while (nG > delta && iter < 1000)
	{
		iter <- iter+1
		if (steps)
		{
			print(cbind(Before.iteration=iter,
				Norm.of.gradient=nG))
		}

		tmpv <- sqrt(tmp$values)
		B <- B %*% t(t(tmp$vectors)*tmpv)
		Xs <- Xs %*% tmp$vectors
		Xs <- t(t(Xs)/tmpv)

		denom <- nu + rowSums(Xs^2)
		Ys <- Xs/sqrt(denom)
		Psi <- (nu + p) * crossprod(Ys)/n
		tmp <- eigen(Psi,symmetric=TRUE)
		nG <- sqrt(sum((1 - tmp$values)^2))
	}
	if (steps)
	{
		print(cbind(After.iteration=iter,
			Norm.of.gradient=nG))
	}
	S <- tcrossprod(B)
	if (nu == 0)
	{
		S <- S / det(S)^(1/p)
	}
	return(list(S=S,iter=iter))
}

# A gradient descent algorithm:

MVTMLE0_G <- function(X,nu=0,delta=10^(-7),
	steps=FALSE)
{
	n <- dim(X)[1]
	p <- dim(X)[2]
	S <- crossprod(X)/n
	tmp <- eigen(S)
	tmpv <- sqrt(tmp$values)
	B <- t(t(tmp$vectors)*tmpv)
	Xs <- X %*% tmp$vectors
	Xs <- t(t(Xs)/tmpv)
	
	denom <- nu + rowSums(Xs^2)
	Ys <- Xs/sqrt(denom)
	Psi <- (nu + p) * crossprod(Ys)/n
	Tmp <- eigen(Psi)
	Gt <- 1 - Tmp$values
	nGt <- sqrt(sum(Gt^2))
	iter <- 0
	while (nGt > delta && iter < 1000)
	{
		iter <- iter + 1
		Xs <- Xs %*% Tmp$vectors
		B <- B %*% Tmp$vectors
		Zs <- Xs^2/denom

		HGG <- sum(Tmp$values * Gt^2) -
			(nu + p) * sum(colSums(t(Zs)*Gt)^2)/n
		At <- -(nGt^2/HGG)*Gt

		# Check whether the new matrix
		# parameter is really better:
		At.half <- At/2
		Xs.new <- t(t(Xs)*exp(-At.half))
		denom.new <- nu + rowSums(Xs.new^2)
		DL <- (nu + p)*mean(log(denom.new/denom)) +
			sum(At)
		DL0 <- - nGt^2/4
		if (steps)
		{
			print(cbind(Before.iteration=iter,
				Norm.of.gradient=nGt,
				Diff.M2LL.with.G=DL))
		}
		if (DL <= DL0)
		{
			B <- t(t(B)*exp(At.half))
			Xs <- Xs.new
			denom <- denom.new
		} else
		# perform a step of the classical
		# fixed-point algorithm:
		{
			sqrtTmpValues <- sqrt(Tmp$values)
			B <- t(t(B) * sqrtTmpValues)
			Xs <- t(t(Xs) / sqrtTmpValues)
			denom <- nu + rowSums(Xs^2)
		}
		
		Ys <- Xs/sqrt(denom)
		Psi <- (nu + p) * crossprod(Ys)/n
		Tmp <- eigen(Psi)
		Gt <- 1 - Tmp$values
		nGt <- sqrt(sum(Gt^2))
	}
	if (steps)
	{
		print(cbind(After.iteration=iter,
			Norm.of.gradient=nGt))
	}
	S <- tcrossprod(B)
	if (nu == 0)
	{
		S <- S / det(S)^(1/p)
	}
	return(list(S=S,iter=iter))
}


###############################################
# Algorithms for the location-scatter problem #
###############################################

# Our favourite one (PN):

MVTMLE <- function(X,nu=1,delta=10^(-7),
	steps=FALSE)
{
	if (nu < 1)
	{
		print("Need nu >= 1 degress of freedom!")
		print("Use nu == 1 now!")
		nu <- 1
	}
	Xnames <- colnames(X)
	Xs <- as.matrix(X)
	mu0 <- colMeans(Xs)
	Xs <- t(t(Xs) - mu0)
	p <- dim(Xs)[2]
	Y <- cbind(Xs,1)
	tmp <- MVTMLE0(Y,nu-1,delta,steps=steps)
	G <- tmp$S
	iter <- tmp$iter
	G <- G / G[p+1,p+1]
	mu.hat <- G[1:p,p+1]
	names(mu.hat) <- Xnames
	Sigma.hat <- G[1:p,1:p] - tcrossprod(mu.hat)
	rownames(Sigma.hat) <-
		colnames(Sigma.hat) <- Xnames
	Shape.hat <- Sigma.hat / det(Sigma.hat)^(1/p)
	return(list(mu.hat=mu0+mu.hat,
		Sigma.hat=Sigma.hat,
		Shape.hat=Shape.hat,iter=iter))
}

# The one based on FP:

MVTMLE_FP <- function(X,nu=1,delta=10^(-7),
	steps=FALSE)
{
	if (nu < 1)
	{
		print("Need nu >= 1 degress of freedom!")
		print("Use nu == 1 now!")
		nu <- 1
	}
	Xnames <- colnames(X)
	Xs <- as.matrix(X)
	mu0 <- colMeans(Xs)
	Xs <- t(t(Xs) - mu0)
	p <- dim(Xs)[2]
	Y <- cbind(Xs,1)
	tmp <- MVTMLE0_FP(Y,nu-1,delta,steps=steps)
	G <- tmp$S
	iter <- tmp$iter
	G <- G / G[p+1,p+1]
	mu.hat <- G[1:p,p+1]
	names(mu.hat) <- Xnames
	Sigma.hat <- G[1:p,1:p] - tcrossprod(mu.hat)
	rownames(Sigma.hat) <-
		colnames(Sigma.hat) <- Xnames
	Shape.hat <- Sigma.hat / det(Sigma.hat)^(1/p)
	return(list(mu.hat=mu0+mu.hat,
		Sigma.hat=Sigma.hat,
		Shape.hat=Shape.hat,iter=iter))
}

# Alternative fixed-point algorithm III
# of Arslan et al. (1995), but with our
# strict criterion for convergence:

MVTMLE_FP3 <- function(X,nu=1,delta=10^(-7),
	steps=FALSE)
{
	if (nu < 1)
	{
		print("Need nu >= 1 degress of freedom!")
		print("Use nu == 1 now!")
		nu <- 1
	}
	Xnames <- colnames(X)
	Xs <- as.matrix(X)
	n <- dim(Xs)[1]
	p <- dim(Xs)[2]
	Iq <- diag(1,p+1)

	mu.hat <- colMeans(Xs)
	Xs <- t(t(X) - mu.hat)
	S <- crossprod(Xs)/n
	tmp <- eigen(S,symmetric=TRUE)
	tmpv <- sqrt(tmp$values)
	B <- t(t(tmp$vectors) * tmpv)
	Xs <- t((t(tmp$vectors)/tmpv) %*% t(Xs))

	denom <- nu + rowSums(Xs^2)
	Ys <- cbind(Xs,1)/sqrt(denom)
	Psi <- (nu + p)*crossprod(Ys)/n
	nG <- norm(Psi - Iq,type="F")

	iter <- 0
	while (nG > delta && iter < 1000)
	{
		iter <- iter+1
		if (steps)
		{
			print(cbind(Before.iteration=iter,
				Norm.of.gradient=nG))
		}

		Psi <- Psi/Psi[p+1,p+1]
		dmu <- Psi[1:p,p+1]
		Xs <- t(t(Xs) - dmu)
		mu.hat <- mu.hat + B %*% dmu

		dB2 <- Psi[1:p,1:p] - dmu %*% t(dmu)
		tmp <- eigen(dB2,symmetric=TRUE)
		tmpv <- sqrt(tmp$values)
		B <- B %*% t(t(tmp$vectors)*tmpv)
		Xs <- t((t(tmp$vectors)/tmpv) %*% t(Xs))

		denom <- nu + rowSums(Xs^2)
		Ys <- cbind(Xs,1)/sqrt(denom)
		Psi <- (nu + p)*crossprod(Ys)/n
		nG <- norm(Psi - Iq,type="F")
	}
	if (steps)
	{
		print(cbind(After.iteration=iter,
			Norm.of.gradient=nG))
	}
	Sigma.hat <- tcrossprod(B)
	
	names(mu.hat) <- Xnames
	rownames(Sigma.hat) <-
		colnames(Sigma.hat) <- Xnames
	Shape.hat <- Sigma.hat / det(Sigma.hat)^(1/p)
	return(list(mu.hat=mu.hat,
		Sigma.hat=Sigma.hat,
		Shape.hat=Shape.hat,iter=iter))
}

# The one based on G:

MVTMLE_G <- function(X,nu=1,delta=10^(-7),
	steps=FALSE)
{
	if (nu < 1)
	{
		print("Need nu >= 1 degress of freedom!")
		print("Use nu == 1 now!")
		nu <- 1
	}
	Xnames <- colnames(X)
	Xs <- as.matrix(X)
	mu0 <- colMeans(Xs)
	Xs <- t(t(Xs) - mu0)
	p <- dim(Xs)[2]
	Y <- cbind(Xs,1)
	tmp <- MVTMLE0_G(Y,nu-1,delta,steps=steps)
	G <- tmp$S
	iter <- tmp$iter
	G <- G / G[p+1,p+1]
	mu.hat <- G[1:p,p+1]
	names(mu.hat) <- Xnames
	Sigma.hat <- G[1:p,1:p] - tcrossprod(mu.hat)
	rownames(Sigma.hat) <-
		colnames(Sigma.hat) <- Xnames
	Shape.hat <- Sigma.hat / det(Sigma.hat)^(1/p)
	return(list(mu.hat=mu0+mu.hat,
		Sigma.hat=Sigma.hat,
		Shape.hat=Shape.hat,iter=iter))
}