-
Notifications
You must be signed in to change notification settings - Fork 6
/
glPcaFast.R
113 lines (112 loc) · 3.46 KB
/
glPcaFast.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
glPcaFast <- function(x,
center=TRUE,
scale=FALSE,
nf=NULL,
loadings=TRUE,
alleleAsUnit=FALSE,
returnDotProd=FALSE){
if(!inherits(x, "genlight")) stop("x is not a genlight object")
# keep the original mean / var code, as it's used further down
# and has some NA checks..
if(center) {
vecMeans <- glMean(x, alleleAsUnit=alleleAsUnit)
if(any(is.na(vecMeans))) stop("NAs detected in the vector of means")
}
if(scale){
vecVar <- glVar(x, alleleAsUnit=alleleAsUnit)
if(any(is.na(vecVar))) stop("NAs detected in the vector of variances")
}
# convert to full data, try to keep the NA handling as similar
# to the original as possible
# - dividing by ploidy keeps the NAs
mx <- t(sapply(x$gen, as.integer)) / ploidy(x)
# handle NAs
NAidx <- which(is.na(mx), arr.ind = T)
if (center) {
mx[NAidx] <- vecMeans[NAidx[,2]]
} else {
mx[NAidx] <- 0
}
# center and scale
mx <- scale(mx,
center = if (center) vecMeans else F,
scale = if (scale) vecVar else F)
# all dot products at once using underlying BLAS
# to support thousands of samples, this could be
# replaced by 'Truncated SVD', but it would require more changes
# in the code around
allProd <- tcrossprod(mx) / nInd(x) # assume uniform weights
## PERFORM THE ANALYSIS ##
## eigenanalysis
eigRes <- eigen(allProd, symmetric=TRUE, only.values=FALSE)
rank <- sum(eigRes$values > 1e-12)
eigRes$values <- eigRes$values[1:rank]
eigRes$vectors <- eigRes$vectors[, 1:rank, drop=FALSE]
## scan nb of axes retained
if(is.null(nf)){
barplot(eigRes$values, main="Eigenvalues", col=heat.colors(rank))
cat("Select the number of axes: ")
nf <- as.integer(readLines(n = 1))
}
## rescale PCs
res <- list()
res$eig <- eigRes$values
nf <- min(nf, sum(res$eig>1e-10))
##res$matprod <- allProd # for debugging
## use: li = XQU = V\Lambda^(1/2)
eigRes$vectors <- eigRes$vectors * sqrt(nInd(x)) # D-normalize vectors
res$scores <- sweep(eigRes$vectors[, 1:nf, drop=FALSE],2,
sqrt(eigRes$values[1:nf]), FUN="*")
## GET LOADINGS ##
## need to decompose X^TDV into a sum of n matrices of dim p*r
## but only two such matrices are represented at a time
if(loadings){
if(scale) {
vecSd <- sqrt(vecVar)
}
res$loadings <- matrix(0, nrow=nLoc(x), ncol=nf) # create empty matrix
## use: c1 = X^TDV
## and X^TV = A_1 + ... + A_n
## with A_k = X_[k-]^T v[k-]
myPloidy <- ploidy(x)
for(k in 1:nInd(x)){
temp <- as.integer(x@gen[[k]]) / myPloidy[k]
if(center) {
temp[is.na(temp)] <- vecMeans[is.na(temp)]
temp <- temp - vecMeans
} else {
temp[is.na(temp)] <- 0
}
if(scale){
temp <- temp/vecSd
}
res$loadings <- res$loadings + matrix(temp) %*% eigRes$vectors[k,
1:nf, drop=FALSE]
}
res$loadings <- res$loadings / nInd(x) # don't forget the /n of X_tDV
res$loadings <- sweep(res$loadings, 2, sqrt(eigRes$values[1:nf]),
FUN="/")
}
## FORMAT OUTPUT ##
colnames(res$scores) <- paste("PC", 1:nf, sep="")
if(!is.null(indNames(x))){
rownames(res$scores) <- indNames(x)
} else {
rownames(res$scores) <- 1:nInd(x)
}
if(!is.null(res$loadings)){
colnames(res$loadings) <- paste("Axis", 1:nf, sep="")
if(!is.null(locNames(x)) & !is.null(alleles(x))){
rownames(res$loadings) <- paste(locNames(x),alleles(x), sep=".")
} else {
rownames(res$loadings) <- 1:nLoc(x)
}
}
if(returnDotProd){
res$dotProd <- allProd
rownames(res$dotProd) <- colnames(res$dotProd) <- indNames(x)
}
res$call <- match.call()
class(res) <- "glPca"
return(res)
}