-
Notifications
You must be signed in to change notification settings - Fork 0
/
PCAdataVariance.R
executable file
·77 lines (59 loc) · 2.3 KB
/
PCAdataVariance.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
####################
# exercise 2.2.1
####################
# Load the NanoNose data set and reshape it into a matrix (datfinal) whose
# rows are observations and whose columns are sensors, together with the
# class information (class_labels, class_names, y) used by later exercises.

# Location of the toolbox checkout; edit this one line to run elsewhere.
# The data file is addressed by its full path so that the working directory
# of the R session is left untouched.
toolbox_dir <- "~/GoogleDrive/PhD/Courses/IntroductiontoMachineLearningandDataMining/ToolBox/02450Toolbox_R"
data_file <- file.path(toolbox_dir, "Data", "nanonose.csv")
if (!file.exists(data_file)) {
  stop("NanoNose data file not found: ", data_file, call. = FALSE)
}
# read data into R; check.names = FALSE keeps the header entries exactly as
# written in the file (they are reused as class labels below)
dat <- read.csv(data_file, sep = ",", check.names = FALSE)
# extract class labels of observations: the header entries from the third
# column onwards
class_labels <- colnames(dat)[-(1:2)]
# extract attributes, i.e. sensor names (first column, rows 3 to 10)
# NOTE: this variable shares its name with base::attributes(); the name is
# kept so that any code relying on it continues to work.
attributes <- dat[3:10, 1]
# remove first two rows and columns
datnew <- dat[-(1:2), -(1:2)]
# transpose data matrix so that observations become rows
datfinal <- t(datnew)
# check that dimensions are as they should be (90 rows and 8 columns)
dim(datfinal)
# assign the class labels as row names and the attributes as column names
rownames(datfinal) <- class_labels
colnames(datfinal) <- attributes
# extract the class names present in data
class_names <- unique(class_labels)
# Extract numeric class assignments: factor codes start at 1, so subtract 1
# to obtain zero-based class ids
y <- as.numeric(as.factor(class_labels)) - 1
####################
# exercise 2.2.2
####################
# Scatter plot of two sensors, first plain and then coloured by class.
# Relies on datfinal, y and class_names created in exercise 2.2.1.

# choose which sensors to plot (first and second column of the data matrix)
sensorx <- datfinal[, 1]
sensory <- datfinal[, 2]
# make simple plot
plot(sensorx, sensory)
# make more fancy plot
# first assign titles and labels to the plot, and determine its size by
# giving the minimum and maximum values of the sensors. Do not plot anything
# (the option type = "n")
plot(range(sensorx), range(sensory),
     xlab = "Sensor A", ylab = "Sensor B", main = "NanoNose data", type = "n")
# pick one colour per class id; every 10th entry of colors() is used so that
# neighbouring classes get clearly different colours
cols <- colors()
class_ids <- sort(unique(y))
class_cols <- cols[(class_ids + 1) * 10]
# plot the points of each class in its own colour
for (k in seq_along(class_ids)) {
  in_class <- y == class_ids[k]
  points(sensorx[in_class], sensory[in_class], col = class_cols[k])
}
# get the order that classes were gone through and plotted in for loop
# (class ids follow the sorted class names, as factor levels are sorted)
sorted <- sort(class_names, index.return = TRUE)
# add legend; reuse class_cols so the legend always matches the plotted
# colours, whatever the number of classes
legend("topright", legend = class_names[sorted$ix], fill = class_cols)
####################
# exercise 2.2.3
####################
# Variance explained by the principal components, computed from the singular
# value decomposition of the column-centred data matrix datfinal.

# column means, i.e. the mean of every sensor
means <- colMeans(datfinal)
# subtract the corresponding column mean from every entry of the data
datzeromean <- sweep(datfinal, 2, means, FUN = "-")
# the column means of the centred data should now be (numerically) zero
colMeans(datzeromean)
# singular value decomposition of the centred data
svdres <- svd(datzeromean)
singularvals <- svdres$d
# how much of the variance each component is expressing: the squared
# singular values normalised by their total
squaredvals <- singularvals^2
pcvariance <- squaredvals / sum(squaredvals)
# proportion of variance captured by the first three components
sum(pcvariance[1:3])
# plot the cumulative sum of the explained variance
plot(cumsum(pcvariance),
     main = "Data variance explained by PCs",
     xlab = "Number of PCs included in variance sum",
     ylab = "Proportion of variance explained")