####################################################################################
# Fit KNN
####################################################################################
# Clear workspace etc.:
rm(list=ls())
graphics.off() # close all open graphics windows
# use Ctrl+L to clear the console window
#----
#Packages
#----
#install.packages("FNN")
library(FNN)
#install.packages("cvTools")
library(cvTools)
source('categoric2numeric.R')
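# For reference: categoric2numeric() is assumed (from its use below) to return a
# list where [[1]] is a 0/1 one-out-of-K indicator matrix and [[2]] the level
# names. A minimal sketch of such a helper, in case the file is unavailable:
# categoric2numeric <- function(x) {
#   x <- as.factor(x)
#   X <- model.matrix(~ x - 1)  # one indicator column per level
#   colnames(X) <- levels(x)
#   list(X, levels(x))
# }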
#----------------------
#Load the data into R
#----------------------
#Set working directory
#setwd("~/Google Drive/PhD/Courses/Introducion to Machine Learning and Data Mining /dataset")
setwd('/Users/lenesommer/DTU/Kandidat/F2014/IntroTilMachineLearning /Project2')
#Read data in
dat <- read.table("phageDS.complete25FEB.txt", header = T, as.is = T)
dim(dat)
#Extract class labels of observations
classlabels <- dat[,5]
XC <- dat[,-c(1,4,5,6,7,8)]
# NOTE: remove columns so that no raw categorical variables remain.
# Remove the fifth column because it contains the class labels
# Remove the first column because it is the ID
# Column 4 is the phage family - categorical
# Column 8 is the predicted host - categorical
# Remove host GC content and host genome size (columns 6 and 7) because they are properties of the host
# One-out-of-K encode the two categorical attributes so that we can use them
# First, column 4 (phage family)
col4 = categoric2numeric(dat[,4])
XC[11:19] <- col4[[1]]
familynames <- col4[[2]]
colnames(XC)[11:19] <- familynames
# Then column 8 (predicted host)
col8 = categoric2numeric(dat[,8])
XC[20:28] <- col8[[1]]
hostnames <- col8[[2]]
colnames(XC)[20:28] <- hostnames
# Open question: do we need to divide by the number of columns when we are not standardizing?
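# One option (a sketch, not part of the original script): z-score the 10
# numeric attributes so that no single scale dominates the Euclidean distances.
# Strictly, the mean and sd should be re-estimated inside each CV fold to
# avoid information leakage from the test observations.
# XC[1:10] <- scale(XC[1:10])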
yC <- as.numeric(as.factor(classlabels))
yC <- yC-1 # shift to 0-based class labels
attributeNamesC <- colnames(XC)
# Define the data dimensions - NOTE: these are hard-coded values for this test data set
Nk = 126;  # number of observations
Mk = 28;   # number of attributes
Ck = 8;    # number of classes
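# Sanity check (added sketch): confirm the hard-coded sizes match the data
stopifnot(nrow(XC) == Nk, ncol(XC) == Mk, length(unique(yC)) == Ck)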
# Leave-one-out crossvalidation
CV <- cvFolds(Nk, K=Nk);
K = Nk
# K-nearest neighbors parameters
L = 40; # Maximum number of neighbors
# Variable for classification error
Error = array(rep(NA, times=K*L), dim=c(K,L))
for(k in 1:K){ # For each crossvalidation fold
  print(paste('Crossvalidation fold ', k, '/', CV$K, sep=''))
  # Extract training and test set
  X_train <- XC[CV$which!=k, ];
  y_train <- yC[CV$which!=k];
  X_test <- XC[CV$which==k, ];
  y_test <- yC[CV$which==k];
  CV$TrainSize[k] <- length(y_train)
  CV$TestSize[k] <- length(y_test)
  X_testdf <- data.frame(X_test)
  colnames(X_testdf) <- attributeNamesC
  X_traindf <- data.frame(X_train)
  colnames(X_traindf) <- attributeNamesC
  for(l in 1:L){ # For each number of neighbors
    # Use knn() from the FNN package to classify using the l nearest neighbors
    y_test_est <- knn(X_traindf, X_testdf, cl=y_train, k = l, prob = FALSE, algorithm="kd_tree")
    # Count the number of misclassified test observations
    Error[k,l] = sum(y_test!=y_test_est)
  }
}
## Plot the classification error rate
plot(colSums(Error)/sum(CV$TestSize)*100, main='Error rate', xlab='Number of neighbors', ylab='Classification error rate (%)', pch=20, type='l');
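# Report the minimizer directly (added sketch; same quantity as the plot above):
errorRate <- colSums(Error)/sum(CV$TestSize)*100
print(paste('Lowest LOO error rate at k =', which.min(errorRate)))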
# By using this plot we would choose around 3 as the optimal number of neighbors - I think
# NOTE: What more do we need to do with KNN? What plots etc. should we show? (One option is sketched below.)
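# One possible addition answering the question above (a sketch, assuming we
# settle on k = 3 as suggested by the plot): a confusion matrix of the
# leave-one-out predictions at the chosen k.
k_chosen = 3
y_est_all <- rep(NA, Nk)
for(fold in 1:K){
  pred <- knn(XC[CV$which!=fold, ], XC[CV$which==fold, ], cl=yC[CV$which!=fold],
              k=k_chosen, algorithm="kd_tree")
  y_est_all[CV$which==fold] <- as.numeric(as.character(pred))
}
print(table(True=yC, Predicted=y_est_all))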