-
Notifications
You must be signed in to change notification settings - Fork 7
/
2.2_model_SVM.Rmd
151 lines (128 loc) · 4.48 KB
/
2.2_model_SVM.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
---
title: "model_SVM"
author: "Yingfan Duan"
date: "2020/11/25"
output: html_document
---
## Preparations
```{r}
# Attach the packages used in this notebook.
# library() is used rather than require(): require() only warns and returns
# FALSE when a package is missing, so the failure would surface later and
# less clearly. caTools was listed twice; it is attached once here.
library(dplyr)
library(readr)
library(caTools)
library(ROSE)
library(pROC)
library(e1071)
library(fastDummies)
# Project helpers. evaluation() is called later in this notebook and is
# presumably defined in funcs.R — TODO confirm.
# NOTE(review): confusionMatrix() is also called later, but none of the
# packages attached above is known to export it (caret does) — confirm that
# funcs.R makes it available.
source("funcs.R", encoding = "UTF-8")
# read processed dataset; load() returns the names of the restored objects,
# which are printed here wrapped in a list
list(load("../data/working/bank.RData"))
```
## Train-test split
```{r}
# Encode the target as a factor with levels 0 and 1
bank_rffixed$y <- factor(bank_rffixed$y, levels = c(0, 1))

# Drop the year, duration and month columns
bank_rffixed <- subset(bank_rffixed, select = -c(year, duration, month))

# Convert these categorical variables to numeric
# (if they are stored as factors this yields their integer level codes —
# TODO confirm the column types)
for (col in c("default", "contact", "poutcome")) {
  bank_rffixed[[col]] <- as.numeric(bank_rffixed[[col]])
}

# One-hot encode job and marital, replacing the original columns ...
dataset <- dummy_cols(
  bank_rffixed,
  select_columns = c("job", "marital"),
  remove_selected_columns = TRUE
)
# ... then drop one dummy column per variable
dataset <- subset(dataset, select = -c(job_unemployed, marital_single))

# Split into training (70%) and test (30%) sets, based on the target column;
# the seed makes the split reproducible
set.seed(123)
split <- sample.split(dataset$y, SplitRatio = 0.7)
training_set <- subset(dataset, split)
test_set <- subset(dataset, !split)
```
```{r}
# Feature Scaling
# Every column except positions 16:28 is standardised.
# NOTE(review): positions 16:28 are presumably the target (column 16 is what
# the predict() calls drop) and the dummy columns — confirm against the
# actual column order of `dataset`.
scaled_train <- scale(training_set[-(16:28)])
training_set[-(16:28)] <- scaled_train
# The test set is standardised with the TRAINING set's centre and scale, so
# both sets live on the same scale and no test-set statistics are used.
test_set[-(16:28)] <- scale(
  test_set[-(16:28)],
  center = attr(scaled_train, "scaled:center"),
  scale = attr(scaled_train, "scaled:scale")
)
```
## SVM
```{r}
# # choose type and kernel
# svm_test <- function(x,y){
# type <- c('C-classification','nu-classification','one-classification')
# kernel <- c('linear','polynomial','radial','sigmoid')
# pred <- array(0, dim=c(nrow(x),3,4))
# errors <- matrix(0,3,4)
# dimnames(errors) <- list(type, kernel)
# for(i in 1:3){
# for(j in 1:4){
# pred[,i,j] <- predict(object = svm(y ~ .,data = x, type = type[i], kernel = kernel[j]), newdata = x)
# if(i > 2) errors[i,j] <- sum(pred[,i,j] != 1)
# else errors[i,j] <- sum(pred[,i,j] != as.integer(y))
# }
# }
# return(errors)
# }
#
# svm_test(x = training_set, y = training_set$y)
# use the optimal type and kernel to build svm
# probability = TRUE fits the model so that class probabilities can be
# requested at prediction time
classifier <- svm(
  formula = y ~ .,
  data = training_set,
  type = "C-classification",
  kernel = "radial",
  probability = TRUE
)
# Predicting the Test set results
# (column 16 is dropped from the predictors; presumably it is the target y —
# TODO confirm)
y_pred_svm <- predict(classifier, newdata = test_set[-16], probability = TRUE)
# get the probability of class "1": predict.svm has no `type` argument, it
# returns the class probabilities in the "probabilities" attribute
svm_pred_prob <- attr(y_pred_svm, "probabilities")[, "1"]
# Making the Confusion Matrix against the labels of the rows just predicted
confusionMatrix(
  factor(y_pred_svm, levels = c(0, 1)),
  factor(test_set$y, levels = c(0, 1))
)$table
# evaluation
evaluation(test_set$y, y_pred_svm)
```
## Use grid search to find optimal parameters and retrain the model
```{r}
# tune parameters
# output <- data.frame()
# for(gamma in 10^(-7:-2)){
# for(cost in 10^(1:3)){
# classifier = svm(formula = y ~ .,
# data = training_set,
# type = 'C-classification',
# kernel = 'radial',
# gamma = gamma,
# cost = cost)
# y_pred = predict(classifier, newdata = test_set[-16])
# result <- evaluation(test_set$y, y_pred)
# output <- rbind(output, result)
# }
# }
# best_svm <- tune.svm(y ~., data = training_set, gamma = 10^(-7:-2), cost = 10^(1:3))
# train the model with fixed gamma and cost
classifier <- svm(
  formula = y ~ .,
  data = training_set,
  type = "C-classification",
  kernel = "radial",
  gamma = 0.001,
  cost = 100
)
# Predicting the Test set results
y_pred_svm <- predict(classifier, newdata = test_set[-16])
# Making the Confusion Matrix from the predictions computed just above
# (the previous code referred to `y_pred`, which is only assigned in the
# commented-out tuning loop)
confusionMatrix(
  factor(y_pred_svm, levels = c(0, 1)),
  factor(test_set$y, levels = c(0, 1))
)$table
# evaluation against the test labels: y_pred_svm holds one prediction per
# row of test_set, so it cannot be compared with training_set$y
evaluation(test_set$y, y_pred_svm)
```
## ROC curve
```{r}
# ROC curve of the SVM on the test set.
# svm_pred_prob must hold the predicted probabilities for the rows of
# test_set — TODO confirm it is defined before this chunk runs.
modelroc <- roc(test_set$y, as.numeric(svm_pred_prob))
# Draw the curve once, annotated with the AUC and a threshold. roc() is no
# longer asked to plot as well (that drew the curve twice), and the stray
# trailing comma in the plot() call has been removed.
plot(
  modelroc,
  print.auc = TRUE,
  auc.polygon = TRUE,
  max.auc.polygon = TRUE,
  auc.polygon.col = "skyblue",
  print.thres = TRUE
)
```
```{r}
# Overlay the ROC curves of the three models on a single plot.
# NOTE(review): bic_pred, svm_pred and sjr_roc are not created in this file;
# presumably they are restored from the .RData file or produced by the other
# model notebooks — confirm.
roc1 <- roc(bic_pred$y_true, bic_pred$pred_prob_bic)
# NOTE(review): if svm_pred$y_pred holds class labels rather than
# probabilities, this curve has only one operating point — confirm.
roc2 <- roc(svm_pred$y_true, as.numeric(svm_pred$y_pred))
roc3 <- roc(sjr_roc$y_true, sjr_roc$y_prob)

# One colour / line type / label per model, shared by the curves and legend
curve_col <- c("blue", "red", "green")
curve_lty <- c(1, 2, 3)
curve_lab <- c("Logit", "SVM", "Decision Tree")

# The first curve sets up the axes; the other two are added on top.
# Only the last call prints an AUC value.
plot(roc1, col = curve_col[1], lty = curve_lty[1], max.auc.polygon = TRUE)
plot.roc(roc2, add = TRUE, col = curve_col[2], lty = curve_lty[2])
plot.roc(
  roc3,
  add = TRUE, col = curve_col[3], lty = curve_lty[3], print.auc = TRUE
)
legend("right", curve_lab, lty = curve_lty, col = curve_col, ncol = 1)
```