-
Notifications
You must be signed in to change notification settings - Fork 0
/
outlierDetection.R
executable file
·150 lines (111 loc) · 4.03 KB
/
outlierDetection.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#---------------------------------------------------------------
# Outlier detection
#---------------------------------------------------------------
# Use script 11_2_2 as inspiration.
# Clear workspace etc.
# NOTE(review): rm(list = ls()) and the absolute setwd() below tie this script
# to one interactive session on one machine. They are kept because the
# relative file paths used further down depend on the working directory.
rm(list = ls())
graphics.off() # close all open graphics windows
#----------------------
# Load the data into R
#----------------------
# Set working directory
#setwd("~/Google Drive/PhD/Courses/Introducion to Machine Learning and Data Mining /dataset")
setwd("/Users/lenesommer/DTU/Kandidat/F2014/IntroTilMachineLearning/Project3")
# Read data in. TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
dat <- read.table("phageDS.complete25FEB.txt", header = TRUE, as.is = TRUE)
dim(dat)
# Library and sources
source("gausKernelDensity.R")
#install.packages("FNN")
library(FNN)
# Remove phage ID, categorical attributes and attributes related to the host
X <- dat[, -c(1, 4, 5, 6, 7, 8)]
X <- scale(X) # Standardize each remaining column
#X <- X[-77,] The outlier, which is not removed. Try removing it and see which one is then found
X <- data.frame(X)
attributeNames <- colnames(X)
#---------------------------------------------
# Gaussian kernel
#---------------------------------------------
# Estimate the optimal kernel density width by leave-one-out cross-validation:
# for every candidate width, sum the per-observation log densities returned by
# gausKernelDensity() and keep the width with the largest total.
# The matrix form of X is built once and reused for every candidate width.
X_mat <- as.matrix(X)
widths <- 2^(-10:10)
logP <- vapply(
  widths,
  function(w) sum(gausKernelDensity(X_mat, w)$log_density),
  numeric(1)
)
ind <- which.max(logP)
width <- widths[ind]
print(paste('Optimal estimated width is', width))
# Estimate the density for each observation, not including the observation
# itself in the density estimate
res <- gausKernelDensity(X_mat, width)
f <- res$density
# Sort the densities in increasing order, keeping the original row indices
sortres <- sort(f, index.return = TRUE)
y <- sortres$x
i <- sortres$ix
# Display the index of the lowest density data object
print(i[1])
# Perfect, since it is the same one that we found in the first report
# Plot density estimate outlier scores
barplot(y[1:20], main = 'Method: Gaussian kernel', ylab = "Outlier score", xlab = "20 observations with lowest score", col = "darkblue")
#---------------------------------------------
# K-nearest neighbor density estimator
#---------------------------------------------
# Use script 11_3_1 as help
# Number of neighbors
K <- 5
# Look up the K + 1 nearest neighbors of every row of X within X itself.
# The first neighbor column is skipped below (presumably the point itself,
# since query and data are the same set).
res <- get.knnx(data = X, query = X, k = K + 1)
idx <- res$nn.index
D <- res$nn.dist
# Density = inverse of the mean distance over the remaining K neighbor columns
neighbor_dist <- D[, -1]
mean_neighbor_dist <- rowSums(neighbor_dist) / K
density <- 1 / mean_neighbor_dist
# Sort the densities in increasing order, keeping the original row indices
sortres <- sort(density, index.return = TRUE)
i <- sortres$ix
y <- sortres$x
# Display the index of the lowest density data object
print(i[1])
# Plot outlier scores
barplot(
  y[seq_len(20)],
  main = "Method: KNN density",
  ylab = "Outlier score",
  xlab = "20 observations with lowest score",
  col = "blue"
)
#---------------------------------------------
# K-nearest neighbor average relative density
#---------------------------------------------
# Use script 11_3_1 as help
# Compute the average relative density: each observation's density divided by
# the mean density of the neighbors listed in columns 2.. of idx.
neighbor_idx <- idx[, 2:ncol(idx)]
neighbor_density <- matrix(density[neighbor_idx], nrow = nrow(idx))
mean_neighbor_density <- rowSums(neighbor_density) / K
avg_rel_density <- density / mean_neighbor_density
# Sort the relative densities in increasing order, keeping the row indices
sortres <- sort(avg_rel_density, index.return = TRUE)
i <- sortres$ix
y <- sortres$x
# Display the index of the lowest density data object
print(i[1])
# Plot outlier scores
barplot(
  y[seq_len(20)],
  main = "Method: KNN average relative density",
  ylab = "Outlier score",
  xlab = "20 observations with lowest score",
  col = "red"
)
# Here it is more clear to see the outlier
#---------------------------------------------
# Distance to the K’th nearest neighbor
#---------------------------------------------
# Use script 11.2.4 as help
# Neighbors to use
K <- 5
# Find the K + 1 nearest neighbors of every row of X within X itself.
# Only the distances are needed here; the neighbor indices are not used.
res <- get.knnx(data = X, query = X, k = K + 1)
D <- res$nn.dist
# Outlier score: the distance stored in neighbor column K + 1
f <- D[, K + 1]
# Sort the outlier scores in DECREASING order: here a large distance, not a
# low density, marks an outlier.
sortres <- sort(f, index.return = TRUE, decreasing = TRUE)
y <- sortres$x
i <- sortres$ix
# Display the index of the observation with the highest outlier score
print(i[1])
# Plot the 20 highest outlier scores. The title is built from K so that it
# stays correct if K is changed above.
barplot(
  y[1:20],
  main = paste0("Method: Distance to the ", K, "’th nearest neighbor"),
  xlab = "20 observations with highest score",
  ylab = "Outlier score",
  col = "green"
)