# This file contains various functions to be applied # to a data matrix X with n rows and p columns: # # MVTMLE0(X,nu) computes/approximates the unique matrix # S such that # S = (nu + p) * # average_i( X_i X_i' / (nu + X_i' S^{-1} X_i) ) # with X_i being the i-th observation (as a column vector). # In case of nu = 0, we impose the additional constraint # that det(S) = 1. # # The underlying algorithm is the new "partial Newton # method" introduced by # # L. Duembgen, K. Nordhausen and H. Schuhmacher: # New Algorithms for M-Estimation of Multivariate # Scatter and Location. # Preprint 2013/2014, http://arxiv.org/abs/1312.6489. # # For the sake of comparison, two alternative # algorithms are implemented as well: # - MVTMLE0_FP(): # Iterating the fixed point equation (with # "iterative whitening" of the data). # - MVTMLE0_G(): # A gradient method. # # MVTMLE(X,nu) computes/approximates the maximum # likelihood estimator for location and scatter of a # multivariate t distribution with nu >= 1 degrees of # freedom, utilizing MVTMLE0() and the augmentation trick # of Kent and Tyler (1991). # # For the sake of comparison we also implemented # algorithm III of Arslan et al. (1995), but with # the same stopping criterion as in MVTMLE, as # MVTMLE_FP3(). # # Lutz Duembgen, Klaus Nordhausen, Heike Schuhmacher # May 19, 2015 ########################################### # Algorithms for the scatter-only problem # ########################################### # Our favourite algorithm PN: MVTMLE0 <- function(X,nu=0,delta=10^(-7), prewhitened=FALSE, steps=FALSE) { Xnames <- colnames(X) Xs <- as.matrix(X) n <- dim(Xs)[1] p <- dim(Xs)[2] if (!prewhitened) { S <- crossprod(Xs)/n tmp <- eigen(S,symmetric=TRUE) tmpv <- sqrt(tmp$values) B <- t(t(tmp$vectors) * tmpv) Xs <- t((t(tmp$vectors)/tmpv) %*% t(Xs)) } else { B <- diag(1,p) } denom <- nu + rowSums(Xs^2) Ys <- Xs / sqrt(denom) Psi <- (nu + p) * crossprod(Ys) / n Tmp <- eigen(Psi,symmetric=TRUE) nG <- sqrt(sum((1 - Tmp$values)^2)) iter <- 0 while (nG > delta && iter < 1000) { iter <- iter + 1 B <- B %*% Tmp$vectors Xs <- Xs %*% Tmp$vectors Zs <- Xs^2/denom Ht <- diag(Tmp$values) - (nu + p) * crossprod(Zs)/n + (nu == 0)/p a <- qr.solve(Ht,Tmp$values - 1) # Check whether the new matrix # parameter is really better: a.half <- a/2 Xs.new <- t(t(Xs)*exp(-a.half)) denom.new <- nu + rowSums(Xs.new^2) DL <- (nu + p)*mean(log(denom.new/denom)) + sum(a) DL0 <- sum(a*(1 - Tmp$values))/4 if (steps) { print(cbind(Before.iteration=iter, Norm.of.gradient=nG, Diff.M2LL.with.PN=DL)) } if (DL <= DL0) { B <- t(t(B) * exp(a.half)) Xs <- Xs.new denom <- denom.new } else # perform a step of the classical # fixed-point algorithm: { sqrtTmpValues <- sqrt(Tmp$values) B <- t(t(B) * sqrtTmpValues) Xs <- t(t(Xs) / sqrtTmpValues) denom <- nu + rowSums(Xs^2) } Ys <- Xs / sqrt(denom) Psi <- (nu + p) * crossprod(Ys) / n Tmp <- eigen(Psi,symmetric=TRUE) nG <- sqrt(sum((1 - Tmp$values)^2)) } if (steps) { print(cbind(After.iteration=iter, Norm.of.gradient=nG)) } S <- tcrossprod(B) rownames(S) <- colnames(S) <- Xnames if (nu == 0) { S <- S / det(S)^(1/p) } return(list(B=B,S=S,iter=iter)) } # The fixed-point algorithm with stepwise # pre-whitening of data: MVTMLE0_FP <- function(X,nu=0,delta=10^(-7), steps=FALSE) { n <- dim(X)[1] p <- dim(X)[2] S <- crossprod(X)/n tmp <- eigen(S,symmetric=TRUE) tmpv <- sqrt(tmp$values) B <- t(t(tmp$vectors) * tmpv) Xs <- t((t(tmp$vectors)/tmpv) %*% t(X)) denom <- nu + rowSums(Xs^2) Ys <- Xs / sqrt(denom) Psi <- (nu + p) * crossprod(Ys)/n tmp <- eigen(Psi,symmetric=TRUE) nG <- sqrt(sum((1 - tmp$values)^2)) iter <- 0 while (nG > delta && iter < 1000) { iter <- iter+1 if (steps) { print(cbind(Before.iteration=iter, Norm.of.gradient=nG)) } tmpv <- sqrt(tmp$values) B <- B %*% t(t(tmp$vectors)*tmpv) Xs <- Xs %*% tmp$vectors Xs <- t(t(Xs)/tmpv) denom <- nu + rowSums(Xs^2) Ys <- Xs/sqrt(denom) Psi <- (nu + p) * crossprod(Ys)/n tmp <- eigen(Psi,symmetric=TRUE) nG <- sqrt(sum((1 - tmp$values)^2)) } if (steps) { print(cbind(After.iteration=iter, Norm.of.gradient=nG)) } S <- tcrossprod(B) if (nu == 0) { S <- S / det(S)^(1/p) } return(list(S=S,iter=iter)) } # A gradient descent algorithm: MVTMLE0_G <- function(X,nu=0,delta=10^(-7), steps=FALSE) { n <- dim(X)[1] p <- dim(X)[2] S <- crossprod(X)/n tmp <- eigen(S) tmpv <- sqrt(tmp$values) B <- t(t(tmp$vectors)*tmpv) Xs <- X %*% tmp$vectors Xs <- t(t(Xs)/tmpv) denom <- nu + rowSums(Xs^2) Ys <- Xs/sqrt(denom) Psi <- (nu + p) * crossprod(Ys)/n Tmp <- eigen(Psi) Gt <- 1 - Tmp$values nGt <- sqrt(sum(Gt^2)) iter <- 0 while (nGt > delta && iter < 1000) { iter <- iter + 1 Xs <- Xs %*% Tmp$vectors B <- B %*% Tmp$vectors Zs <- Xs^2/denom HGG <- sum(Tmp$values * Gt^2) - (nu + p) * sum(colSums(t(Zs)*Gt)^2)/n At <- -(nGt^2/HGG)*Gt # Check whether the new matrix # parameter is really better: At.half <- At/2 Xs.new <- t(t(Xs)*exp(-At.half)) denom.new <- nu + rowSums(Xs.new^2) DL <- (nu + p)*mean(log(denom.new/denom)) + sum(At) DL0 <- - nGt^2/4 if (steps) { print(cbind(Before.iteration=iter, Norm.of.gradient=nGt, Diff.M2LL.with.G=DL)) } if (DL <= DL0) { B <- t(t(B)*exp(At.half)) Xs <- Xs.new denom <- denom.new } else # perform a step of the classical # fixed-point algorithm: { sqrtTmpValues <- sqrt(Tmp$values) B <- t(t(B) * sqrtTmpValues) Xs <- t(t(Xs) / sqrtTmpValues) denom <- nu + rowSums(Xs^2) } Ys <- Xs/sqrt(denom) Psi <- (nu + p) * crossprod(Ys)/n Tmp <- eigen(Psi) Gt <- 1 - Tmp$values nGt <- sqrt(sum(Gt^2)) } if (steps) { print(cbind(After.iteration=iter, Norm.of.gradient=nGt)) } S <- tcrossprod(B) if (nu == 0) { S <- S / det(S)^(1/p) } return(list(S=S,iter=iter)) } ############################################### # Algorithms for the location-scatter problem # ############################################### # Our favourite one (PN): MVTMLE <- function(X,nu=1,delta=10^(-7), steps=FALSE) { if (nu < 1) { print("Need nu >= 1 degress of freedom!") print("Use nu == 1 now!") nu <- 1 } Xnames <- colnames(X) Xs <- as.matrix(X) mu0 <- colMeans(Xs) Xs <- t(t(Xs) - mu0) p <- dim(Xs)[2] Y <- cbind(Xs,1) tmp <- MVTMLE0(Y,nu-1,delta,steps=steps) G <- tmp$S iter <- tmp$iter G <- G / G[p+1,p+1] mu.hat <- G[1:p,p+1] names(mu.hat) <- Xnames Sigma.hat <- G[1:p,1:p] - tcrossprod(mu.hat) rownames(Sigma.hat) <- colnames(Sigma.hat) <- Xnames Shape.hat <- Sigma.hat / det(Sigma.hat)^(1/p) return(list(mu.hat=mu0+mu.hat, Sigma.hat=Sigma.hat, Shape.hat=Shape.hat,iter=iter)) } # The one based on FP: MVTMLE_FP <- function(X,nu=1,delta=10^(-7), steps=FALSE) { if (nu < 1) { print("Need nu >= 1 degress of freedom!") print("Use nu == 1 now!") nu <- 1 } Xnames <- colnames(X) Xs <- as.matrix(X) mu0 <- colMeans(Xs) Xs <- t(t(Xs) - mu0) p <- dim(Xs)[2] Y <- cbind(Xs,1) tmp <- MVTMLE0_FP(Y,nu-1,delta,steps=steps) G <- tmp$S iter <- tmp$iter G <- G / G[p+1,p+1] mu.hat <- G[1:p,p+1] names(mu.hat) <- Xnames Sigma.hat <- G[1:p,1:p] - tcrossprod(mu.hat) rownames(Sigma.hat) <- colnames(Sigma.hat) <- Xnames Shape.hat <- Sigma.hat / det(Sigma.hat)^(1/p) return(list(mu.hat=mu0+mu.hat, Sigma.hat=Sigma.hat, Shape.hat=Shape.hat,iter=iter)) } # Alternative fixed-point algorithm III # of Arslan et al. (1995), but with our # strict criterion for convergence: MVTMLE_FP3 <- function(X,nu=1,delta=10^(-7), steps=FALSE) { if (nu < 1) { print("Need nu >= 1 degress of freedom!") print("Use nu == 1 now!") nu <- 1 } Xnames <- colnames(X) Xs <- as.matrix(X) n <- dim(Xs)[1] p <- dim(Xs)[2] Iq <- diag(1,p+1) mu.hat <- colMeans(Xs) Xs <- t(t(X) - mu.hat) S <- crossprod(Xs)/n tmp <- eigen(S,symmetric=TRUE) tmpv <- sqrt(tmp$values) B <- t(t(tmp$vectors) * tmpv) Xs <- t((t(tmp$vectors)/tmpv) %*% t(Xs)) denom <- nu + rowSums(Xs^2) Ys <- cbind(Xs,1)/sqrt(denom) Psi <- (nu + p)*crossprod(Ys)/n nG <- norm(Psi - Iq,type="F") iter <- 0 while (nG > delta && iter < 1000) { iter <- iter+1 if (steps) { print(cbind(Before.iteration=iter, Norm.of.gradient=nG)) } Psi <- Psi/Psi[p+1,p+1] dmu <- Psi[1:p,p+1] Xs <- t(t(Xs) - dmu) mu.hat <- mu.hat + B %*% dmu dB2 <- Psi[1:p,1:p] - dmu %*% t(dmu) tmp <- eigen(dB2,symmetric=TRUE) tmpv <- sqrt(tmp$values) B <- B %*% t(t(tmp$vectors)*tmpv) Xs <- t((t(tmp$vectors)/tmpv) %*% t(Xs)) denom <- nu + rowSums(Xs^2) Ys <- cbind(Xs,1)/sqrt(denom) Psi <- (nu + p)*crossprod(Ys)/n nG <- norm(Psi - Iq,type="F") } if (steps) { print(cbind(After.iteration=iter, Norm.of.gradient=nG)) } Sigma.hat <- tcrossprod(B) names(mu.hat) <- Xnames rownames(Sigma.hat) <- colnames(Sigma.hat) <- Xnames Shape.hat <- Sigma.hat / det(Sigma.hat)^(1/p) return(list(mu.hat=mu.hat, Sigma.hat=Sigma.hat, Shape.hat=Shape.hat,iter=iter)) } # The one based on G: MVTMLE_G <- function(X,nu=1,delta=10^(-7), steps=FALSE) { if (nu < 1) { print("Need nu >= 1 degress of freedom!") print("Use nu == 1 now!") nu <- 1 } Xnames <- colnames(X) Xs <- as.matrix(X) mu0 <- colMeans(Xs) Xs <- t(t(Xs) - mu0) p <- dim(Xs)[2] Y <- cbind(Xs,1) tmp <- MVTMLE0_G(Y,nu-1,delta,steps=steps) G <- tmp$S iter <- tmp$iter G <- G / G[p+1,p+1] mu.hat <- G[1:p,p+1] names(mu.hat) <- Xnames Sigma.hat <- G[1:p,1:p] - tcrossprod(mu.hat) rownames(Sigma.hat) <- colnames(Sigma.hat) <- Xnames Shape.hat <- Sigma.hat / det(Sigma.hat)^(1/p) return(list(mu.hat=mu0+mu.hat, Sigma.hat=Sigma.hat, Shape.hat=Shape.hat,iter=iter)) }