code/2.2_model_SVM.Rmd

---
title: "model_SVM"
author: "Yingfan Duan"
date: "2020/11/25"
output: html_document
---

## Preparations

```{r}
# Attach packages. library() is used throughout so that a missing package
# stops the notebook here instead of failing later with a
# "could not find function" error (require() only warns and returns FALSE).
library(dplyr)
library(readr)
library(caTools)
library(ROSE)
library(pROC)
library(e1071)
library(fastDummies)

# Project helpers. NOTE(review): evaluation() and confusionMatrix() are called
# later in this notebook but are not defined or attached in this file --
# presumably they come from funcs.R; verify.
source('funcs.R', encoding = 'UTF-8')

# Read the processed dataset. Wrapping load() in list() makes the names of the
# restored objects print in the chunk output.
list(load("../data/working/bank.RData"))
```


## Train-test split

```{r}
# Turn the response into a two-level factor (levels 0 and 1).
bank_rffixed[["y"]] <- factor(bank_rffixed[["y"]], levels = c(0, 1))

# Drop the year, duration and month columns.
bank_rffixed <- subset(bank_rffixed, select = -c(year, duration, month))

# Convert the default, contact and poutcome columns to numeric.
numeric_coded <- c("default", "contact", "poutcome")
bank_rffixed[numeric_coded] <- lapply(bank_rffixed[numeric_coded], as.numeric)

# One-hot encode job and marital, then drop one dummy per variable
# (job_unemployed and marital_single).
dataset <- subset(
  dummy_cols(
    bank_rffixed,
    select_columns = c("job", "marital"),
    remove_selected_columns = TRUE
  ),
  select = -c(job_unemployed, marital_single)
)

# 70/30 split on y with a fixed seed so the partition is reproducible.
set.seed(123)
split <- sample.split(dataset$y, SplitRatio = 0.7)
training_set <- dataset[split, , drop = FALSE]
test_set <- dataset[!split, , drop = FALSE]
```

```{r}
# Feature scaling.
# Columns 16:28 are excluded from scaling.
# NOTE(review): presumably column 16 is the response y and 17:28 are the dummy
# columns -- confirm against the column order of `dataset`.
#
# The centring and scaling constants are estimated on the training set only
# and then reused for the test set. Scaling the test set with its own mean/sd
# would put the two sets on different scales and leak test-set statistics
# into preprocessing.
train_scaled <- scale(training_set[-(16:28)])
training_set[-(16:28)] <- train_scaled
test_set[-(16:28)] <- scale(
  test_set[-(16:28)],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)
```


## SVM
```{r}
# # choose type and kernel
# svm_test <- function(x,y){
#    type <- c('C-classification','nu-classification','one-classification')
#    kernel <- c('linear','polynomial','radial','sigmoid')
#    pred <- array(0, dim=c(nrow(x),3,4))
#    errors <- matrix(0,3,4)
#    dimnames(errors) <- list(type, kernel)
#    for(i in 1:3){
#      for(j in 1:4){
#        pred[,i,j] <- predict(object = svm(y ~ .,data = x, type = type[i], kernel = kernel[j]), newdata = x)
#        if(i > 2) errors[i,j] <- sum(pred[,i,j] != 1)
#        else errors[i,j] <- sum(pred[,i,j] != as.integer(y))
#        }
#      }
#    return(errors)
# }
# 
# svm_test(x = training_set, y = training_set$y)

# Fit the SVM with the chosen type and kernel. probability = TRUE must be set
# at fit time so that predict() can return class probabilities afterwards.
classifier <- svm(
  formula = y ~ .,
  data = training_set,
  type = "C-classification",
  kernel = "radial",
  probability = TRUE
)

# Predict the test set. Column 16 is dropped from the predictors
# (presumably the response y -- verify against the column order).
# predict.svm() has no `type` argument, so the former type = 'prob' was
# silently ignored; probability = TRUE is what requests the probabilities.
y_pred_svm <- predict(classifier, newdata = test_set[-16], probability = TRUE)

# Get the probability of the positive class. The probabilities are returned
# as a matrix attribute with one column per class level ("0" and "1").
svm_pred_prob <- attr(y_pred_svm, "probabilities")[, "1"]

# Confusion matrix of predicted vs. actual test labels.
# NOTE(review): confusionMatrix() is not attached by this file's library
# calls -- presumably supplied via funcs.R; verify.
confusionMatrix(
  factor(y_pred_svm, levels = c(0, 1)),
  factor(test_set$y, levels = c(0, 1))
)$table

# Evaluation on the test set (evaluation() comes from outside this file).
evaluation(test_set$y, y_pred_svm)
```

## Use grid search to find optimal parameters and retrain the model
```{r}
# tune parameters
# output <- data.frame()
# for(gamma in 10^(-7:-2)){
#   for(cost in 10^(1:3)){
#     classifier = svm(formula = y ~ .,
#                  data = training_set,
#                  type = 'C-classification',
#                  kernel = 'radial',
#                  gamma = gamma,
#                  cost = cost)
#     y_pred = predict(classifier, newdata = test_set[-16])
#     result <- evaluation(test_set$y, y_pred)
#     output <- rbind(output, result)
#   }
# }
# best_svm <- tune.svm(y ~., data = training_set, gamma = 10^(-7:-2), cost = 10^(1:3))


# Retrain the radial-kernel SVM with fixed hyper-parameters
# (gamma = 0.001, cost = 100).
# NOTE(review): presumably these are the values selected by the grid search
# that is commented out above -- confirm.
classifier <- svm(
  formula = y ~ .,
  data = training_set,
  type = "C-classification",
  kernel = "radial",
  gamma = 0.001,
  cost = 100
)

# Predict the test set. Column 16 is dropped from the predictors
# (presumably the response y -- verify against the column order).
y_pred_svm <- predict(classifier, newdata = test_set[-16])

# Confusion matrix of this model's predictions vs. the actual test labels.
# Uses y_pred_svm: `y_pred` is only assigned in commented-out code.
confusionMatrix(
  factor(y_pred_svm, levels = c(0, 1)),
  factor(test_set$y, levels = c(0, 1))
)$table

# Evaluation. Test-set predictions must be compared with test-set labels;
# training_set$y has a different length and refers to different rows.
evaluation(test_set$y, y_pred_svm)
```

## ROC curve
```{r}
# ROC curve of the SVM on the test set.
# NOTE(review): svm_pred_prob is not assigned in this chunk; it is expected to
# hold one positive-class score per row of test_set -- verify it is in scope.
# roc() is not asked to plot here because the annotated plot() call below
# draws the same curve; plotting in both places produced a duplicate figure.
modelroc <- roc(test_set$y, as.numeric(svm_pred_prob))

# Annotated plot: AUC value, shaded AUC polygon and the best threshold.
# The stray trailing comma was removed: it passed an empty argument, which
# can raise "argument is missing" when forwarded through `...`.
plot(
  modelroc,
  print.auc = TRUE,
  auc.polygon = TRUE,
  max.auc.polygon = TRUE,
  auc.polygon.col = "skyblue",
  print.thres = TRUE
)
```

```{r}
# Overlay the ROC curves of the three models on one plot.
# NOTE(review): bic_pred, svm_pred and sjr_roc are not created in this file;
# they must already be available in the session -- verify their source.
roc1 <- roc(bic_pred$y_true, bic_pred$pred_prob_bic)
roc2 <- roc(svm_pred$y_true, as.numeric(svm_pred$y_pred))
roc3 <- roc(sjr_roc$y_true, sjr_roc$y_prob)

# The first curve opens the plot; the other two are added on top of it.
# Only the last call prints an AUC value.
plot(roc1, col = "blue", lty = 1, max.auc.polygon = TRUE)
plot(roc2, add = TRUE, col = "red", lty = 2)
plot(roc3, add = TRUE, col = "green", lty = 3, print.auc = TRUE)

# Legend entries follow the drawing order above.
legend(
  "right",
  c("Logit", "SVM", "Decision Tree"),
  lty = c(1, 2, 3),
  col = c("blue", "red", "green"),
  ncol = 1
)
```