From b291f9aa71f78804dfc7a15035dccdf0d85af5d7 Mon Sep 17 00:00:00 2001 From: bobbraswell Date: Thu, 13 Aug 2015 02:32:17 +0200 Subject: [PATCH 1/2] Create centroid The hardest part of a centroid extraction in R is dealing with the sign vector used to maximize positive manifold before each factor is extracted. This function implements a new algorithm from the IFI-Zurich report in references list. This will not be a function called by the user, but will be wrapped in the centroid extraction function. --- R/centroid | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 R/centroid diff --git a/R/centroid b/R/centroid new file mode 100644 index 0000000..1ee5936 --- /dev/null +++ b/R/centroid @@ -0,0 +1,38 @@ +FindSignVector <- function(X) { + # input is the matrix to be factored, whether original cor or residuals + # first find how many rows it has (rows=cols in our context) + n <- dim(X)[1] + # initialize the posn variable before entering loop + posn <- 0 + repeat { + # change sign at posn unless just starting, then initialize sign vector Z + if (posn==0) { + Z <- c(rep(1,n)) # starts out all ones of length n + } else { + Z[pos] <- -1 #could mult by -1 but no need in this algorithm; always doing 1 -> -1 + } + # Determine S and V + S <- Z %*% t(X) + S <- t(S) + V <- NULL + for (i in 1:n) { + V[i] <- X[i,] %*% S - Z[i] * X[i,] %*% X[i,] + print(V[i]) + } + # search next element + val <- 0 + posn <- 0 + for (i in 1:n) { + if (Z[i]*V[i] < 0) { + if (abs(V[i]) > val) { + val <- V[i] + posn <- i + } + } + } # end of for loop + # 'until' condition for repeat loop in R must be implemented as break statement + if (posn == 0) {break} + } # end of repeat loop + return(Z) +} # end of function FindSignVector + From 94eb72485124c124bc78c7728a81b4d650b7e96f Mon Sep 17 00:00:00 2001 From: bobbraswell Date: Tue, 18 Aug 2015 15:27:25 +0200 Subject: [PATCH 2/2] complete working centroid FA The function committed last time is renamed and debugged, and supplemented by all other necessary functions. Given an input matrix X (correlation matrix), "results <- qcf(X)" will return centroid FA results. At this stage it has been tested on three datasets and seems to give the same results as other standards. More testing needed. --- R/centroid | 119 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 100 insertions(+), 19 deletions(-) diff --git a/R/centroid b/R/centroid index 1ee5936..bfabaf9 100644 --- a/R/centroid +++ b/R/centroid @@ -1,32 +1,44 @@ -FindSignVector <- function(X) { +# a function to be called internally by other functions to zero the diagonal of input matrix +setdiag0 <- function(x) { + x[row(x)==col(x)] <- 0 + return(x) +} + +# a function to apply a signs vector to an input matrix to improve positive manifold +applyZ <- function (X,Z) { + m <- dim(X)[2] + for (i in 1:m) { + X[i,] <- X[i,] * Z[i] + X[,i] <- X[,i] * Z[i] + } + return(X) +} + +fsv <- function(X) { #finds sign vector that will maximize positive manifold of the input matrix # input is the matrix to be factored, whether original cor or residuals # first find how many rows it has (rows=cols in our context) n <- dim(X)[1] # initialize the posn variable before entering loop - posn <- 0 + posn <- 0 repeat { # change sign at posn unless just starting, then initialize sign vector Z if (posn==0) { Z <- c(rep(1,n)) # starts out all ones of length n } else { - Z[pos] <- -1 #could mult by -1 but no need in this algorithm; always doing 1 -> -1 - } - # Determine S and V - S <- Z %*% t(X) - S <- t(S) - V <- NULL - for (i in 1:n) { - V[i] <- X[i,] %*% S - Z[i] * X[i,] %*% X[i,] - print(V[i]) + Z[posn] <- Z[posn] * -1 } - # search next element - val <- 0 + # get column sums of X as operated on by the Z so far + X2 <- applyZ(X,Z) + S <- colSums(X2) + # now search for the biggest negative item in S + val <- 0 posn <- 0 - for (i in 1:n) { - if (Z[i]*V[i] < 0) { - if (abs(V[i]) > val) { - val <- V[i] - posn <- i + ii <- 0 # just in case + for (ii in 1:n) { + if (S[ii] < 0) { + if (S[ii] < val) { + val <- S[ii] + posn <- ii } } } # end of for loop @@ -34,5 +46,74 @@ FindSignVector <- function(X) { if (posn == 0) {break} } # end of repeat loop return(Z) -} # end of function FindSignVector +} # end of function fsv + +# Centroid extraction function -- takes correlation matrix as input and returns factor matrix and stats +qcf <- function(X) { # Q-style centroid factoring + # initialize how many factors to extract: lesser of size of cor matrix or maxfactors + maxfactors <- 7 + n <- dim(X)[1] + m <- dim(X)[2] + if (m > maxfactors) {m <- maxfactors} + # initialize loadings vector and output matrix so we can insert rather than append as we go + temploadings <- c(rep(0,n)) + EV <- c(rep(0,m)) + pctvar <- c(rep(0,m)) + F <- matrix(nrow=n,ncol=m) + # outer loop: executed once for each factor extracted + for (i in 1:m) { + # set diagonals to zero (will be fixed later by adding h^2 estimate back in to column totals) + X <- setdiag0(X) + # call a function that returns the reflection vector + Z <- fsv(X) + # apply it to the input matrix + X2 <- applyZ(X,Z) + # calculate sums of columns and store in sums vector + S <- colSums(X2) + # set initial communality estimate to 0.80: this will be improved by iteration + h2est <- rep(0.80,n) + # inner loop to iterate communalities and associated matrix totals + repeat { + # col sums plus communality estimates + hS <- S + h2est + # get matrix total, adding in communality estimates to make up for zeroed diagonals + T <- sum(hS) + # get reciprocal of sq rt of T as scaling factor + sf <- 1/sqrt(T) + # multiply column sums by scaling factor to get initial loadings estimate + for (j in 1:n) { temploadings[j] <- hS[j] * sf } + # square loadings to get vector of communality estimates + newh2est <- temploadings * temploadings + # compare new communalities to old and unless the max difference in any column is < .02, loop again + temp <- h2est - newh2est # had to break this up during debugging as putting it + temp2 <- abs(temp) # all in one if statement was not always + temp3 <- max(temp2) # breaking out of the loop when it should + if (temp3 < 0.02) {break} + h2est <- newh2est ## get ready to loop again with new communality estimate + } ## end of repeat loop -- go back unless break occurred + # finish up for this factor and prepare for next + # calculate the crossproducts matrix + Xprod <- temploadings %*% t(temploadings) + # subtract crossproducts from input matrix to create residuals + residuals <- X - Xprod + # (un)reflect and save the factor loadings + F[,i] <- temploadings * Z + # add up the diagonals and save as eigenvalue + EV[i] <- sum(newh2est) + # divide eigenvalue by n and save as percent of variance + pctvar[i] <- EV[i]/n + # (un)reflect the residuals + X <- applyZ(residuals,Z) + # now do next iteration with residuals as input (or fall through to finish up after last factor) + } # end of for loop + # calculate communalities across all factors for each sort? not implemented yet + # return results in a nice packet ready for further work + centroidresults <- NULL + centroidresults$EV <- EV + centroidresults$pctvar <- 100 * pctvar + centroidresults$loadings <- F + centroidresults$residuals <- residuals + return(centroidresults) +} # end of qcf +