1. Data Understanding

# Load the tidyverse meta-package (ggplot2, dplyr, tidyr, readr, purrr, ...).
# NOTE(review): the original script began with rm(list = ls()); that is an
# anti-pattern in shared scripts (it silently wipes the user's global
# environment without restarting the session) and has been removed.
# Restart R for a truly clean session instead.
library("tidyverse")
## -- Attaching packages ------------------------------------------------------ tidyverse 1.3.0 --
## √ ggplot2 3.2.1     √ purrr   0.3.3
## √ tibble  2.1.3     √ dplyr   0.8.3
## √ tidyr   1.0.0     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.4.0
## -- Conflicts --------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

1.1 Collect Initial Data

# Read the semicolon-delimited UCI Bank Marketing files.
# bank: full dataset, used for the tree / Naive Bayes / random forest models.
bank <- read.csv(file = "bank-full.csv", sep = ";")
# bank_svm: 10% random sample, used for the more expensive SVM.
bank_svm <- read.csv(file = "bank.csv", sep = ";")

Notes: 1) bank-full.csv contains all examples, ordered by date (from May 2008 to November 2010). 2) bank.csv contains 10% of the examples (4,521), randomly selected from bank-full.csv. The smaller dataset is provided to test more computationally demanding machine learning algorithms (e.g. SVM).

1.2 Describe Data

# Structure of the full dataset: 45,211 rows x 17 columns
# (numeric and factor predictors plus the factor target y).
glimpse(bank)
## Observations: 45,211
## Variables: 17
## $ age       <int> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 5...
## $ job       <fct> management, technician, entrepreneur, blue-collar, unknow...
## $ marital   <fct> married, single, married, married, single, married, singl...
## $ education <fct> tertiary, secondary, secondary, unknown, unknown, tertiar...
## $ default   <fct> no, no, no, no, no, no, no, yes, no, no, no, no, no, no, ...
## $ balance   <int> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6,...
## $ housing   <fct> yes, yes, yes, yes, no, yes, yes, yes, yes, yes, yes, yes...
## $ loan      <fct> no, no, yes, no, no, no, yes, no, no, no, no, no, no, no, ...
## $ contact   <fct> unknown, unknown, unknown, unknown, unknown, unknown, unk...
## $ day       <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
## $ month     <fct> may, may, may, may, may, may, may, may, may, may, may, ma...
## $ duration  <int> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 5...
## $ campaign  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ pdays     <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...
## $ previous  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ poutcome  <fct> unknown, unknown, unknown, unknown, unknown, unknown, unk...
## $ y         <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, n...
# Structure of the 10% sample: 4,521 rows with the same 17 columns.
glimpse(bank_svm)
## Observations: 4,521
## Variables: 17
## $ age       <int> 30, 33, 35, 30, 59, 35, 36, 39, 41, 43, 39, 43, 36, 20, 3...
## $ job       <fct> unemployed, services, management, management, blue-collar...
## $ marital   <fct> married, married, single, married, married, single, marri...
## $ education <fct> primary, secondary, tertiary, tertiary, secondary, tertia...
## $ default   <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, n...
## $ balance   <int> 1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374,...
## $ housing   <fct> no, yes, yes, yes, yes, no, yes, yes, yes, yes, yes, yes,...
## $ loan      <fct> no, yes, no, yes, no, no, no, no, no, yes, no, no, no, no...
## $ contact   <fct> cellular, cellular, cellular, unknown, unknown, cellular,...
## $ day       <int> 19, 11, 16, 3, 5, 23, 14, 6, 14, 17, 20, 17, 13, 30, 29, ...
## $ month     <fct> oct, may, apr, jun, may, feb, may, may, may, apr, may, ap...
## $ duration  <int> 79, 220, 185, 199, 226, 141, 341, 151, 57, 313, 273, 113,...
## $ campaign  <int> 1, 1, 1, 4, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 5, 1, 1, ...
## $ pdays     <int> -1, 339, 330, -1, -1, 176, 330, -1, -1, 147, -1, -1, -1, ...
## $ previous  <int> 0, 4, 1, 0, 0, 3, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, ...
## $ poutcome  <fct> unknown, failure, failure, unknown, unknown, failure, oth...
## $ y         <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, yes, ...
# Numeric five-number summaries and factor level counts for the full data.
# Note the heavy target imbalance (y: 39,922 "no" vs 5,289 "yes") and that
# pdays = -1 (client never previously contacted) dominates its distribution.
summary(bank)
##       age                 job           marital          education    
##  Min.   :18.00   blue-collar:9732   divorced: 5207   primary  : 6851  
##  1st Qu.:33.00   management :9458   married :27214   secondary:23202  
##  Median :39.00   technician :7597   single  :12790   tertiary :13301  
##  Mean   :40.94   admin.     :5171                    unknown  : 1857  
##  3rd Qu.:48.00   services   :4154                                     
##  Max.   :95.00   retired    :2264                                     
##                  (Other)    :6835                                     
##  default        balance       housing      loan            contact     
##  no :44396   Min.   : -8019   no :20081   no :37967   cellular :29285  
##  yes:  815   1st Qu.:    72   yes:25130   yes: 7244   telephone: 2906  
##              Median :   448                           unknown  :13020  
##              Mean   :  1362                                            
##              3rd Qu.:  1428                                            
##              Max.   :102127                                            
##                                                                        
##       day            month          duration         campaign     
##  Min.   : 1.00   may    :13766   Min.   :   0.0   Min.   : 1.000  
##  1st Qu.: 8.00   jul    : 6895   1st Qu.: 103.0   1st Qu.: 1.000  
##  Median :16.00   aug    : 6247   Median : 180.0   Median : 2.000  
##  Mean   :15.81   jun    : 5341   Mean   : 258.2   Mean   : 2.764  
##  3rd Qu.:21.00   nov    : 3970   3rd Qu.: 319.0   3rd Qu.: 3.000  
##  Max.   :31.00   apr    : 2932   Max.   :4918.0   Max.   :63.000  
##                  (Other): 6060                                    
##      pdays          previous           poutcome       y        
##  Min.   : -1.0   Min.   :  0.0000   failure: 4901   no :39922  
##  1st Qu.: -1.0   1st Qu.:  0.0000   other  : 1840   yes: 5289  
##  Median : -1.0   Median :  0.0000   success: 1511              
##  Mean   : 40.2   Mean   :  0.5803   unknown:36959              
##  3rd Qu.: -1.0   3rd Qu.:  0.0000                              
##  Max.   :871.0   Max.   :275.0000                              
## 
# The 10% sample shows comparable distributions and the same ~88/12
# class imbalance (y: 4,000 "no" vs 521 "yes").
summary(bank_svm)
##       age                 job          marital         education    default   
##  Min.   :19.00   management :969   divorced: 528   primary  : 678   no :4445  
##  1st Qu.:33.00   blue-collar:946   married :2797   secondary:2306   yes:  76  
##  Median :39.00   technician :768   single  :1196   tertiary :1350             
##  Mean   :41.17   admin.     :478                   unknown  : 187             
##  3rd Qu.:49.00   services   :417                                              
##  Max.   :87.00   retired    :230                                              
##                  (Other)    :713                                              
##     balance      housing     loan           contact          day       
##  Min.   :-3313   no :1962   no :3830   cellular :2896   Min.   : 1.00  
##  1st Qu.:   69   yes:2559   yes: 691   telephone: 301   1st Qu.: 9.00  
##  Median :  444                         unknown  :1324   Median :16.00  
##  Mean   : 1423                                          Mean   :15.92  
##  3rd Qu.: 1480                                          3rd Qu.:21.00  
##  Max.   :71188                                          Max.   :31.00  
##                                                                        
##      month         duration       campaign          pdays       
##  may    :1398   Min.   :   4   Min.   : 1.000   Min.   : -1.00  
##  jul    : 706   1st Qu.: 104   1st Qu.: 1.000   1st Qu.: -1.00  
##  aug    : 633   Median : 185   Median : 2.000   Median : -1.00  
##  jun    : 531   Mean   : 264   Mean   : 2.794   Mean   : 39.77  
##  nov    : 389   3rd Qu.: 329   3rd Qu.: 3.000   3rd Qu.: -1.00  
##  apr    : 293   Max.   :3025   Max.   :50.000   Max.   :871.00  
##  (Other): 571                                                   
##     previous          poutcome      y       
##  Min.   : 0.0000   failure: 490   no :4000  
##  1st Qu.: 0.0000   other  : 197   yes: 521  
##  Median : 0.0000   success: 129             
##  Mean   : 0.5426   unknown:3705             
##  3rd Qu.: 0.0000                            
##  Max.   :25.0000                            
## 

bank client data:

    1. age (numeric)
    1. job : type of job (categorical: “admin.”,“unknown”,“unemployed”,“management”,“housemaid”,“entrepreneur”,“student”,“blue-collar”,“self-employed”,“retired”,“technician”,“services”)
    1. marital : marital status (categorical: “married”,“divorced”,“single”; note: “divorced” means divorced or widowed)
    1. education (categorical:“unknown”,“secondary”,“primary”,“tertiary”)
    1. default: has credit in default? (binary: “yes”,“no”)
    1. balance: average yearly balance, in euros (numeric)
    1. housing: has housing loan? (binary: “yes”,“no”)
    1. loan: has personal loan? (binary: “yes”,“no”)

related with the last contact of the current campaign:

    1. contact: contact communication type (categorical:“unknown”,“telephone”,“cellular”)
    1. day: last contact day of the month (numeric)
    1. month: last contact month of year (categorical: “jan”, “feb”, “mar”, …, “nov”, “dec”)
    1. duration: last contact duration, in seconds (numeric)

other attributes:

    1. campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
    1. pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
    1. previous: number of contacts performed before this campaign and for this client (numeric)
    1. poutcome: outcome of the previous marketing campaign (categorical: “unknown”,“other”,“failure”,“success”)

Output variable (desired target):

    1. y - has the client subscribed a term deposit? (binary: “yes”,“no”)

-> WE ARE INTERESTED IN “YES”!

2. Data Preparation

2.1 Information Gain

# Rank all 16 predictors by information gain with respect to the target y.
library(FSelector)
attribute_weights <- information.gain(y~., bank)
attribute_weights_svm <- information.gain(y~., bank_svm)
# Keep the 5 highest-scoring attributes from each dataset; the surrounding
# parentheses print the assigned value.
(filtered_attribute <- cutoff.k(attribute_weights, 5))
## [1] "duration" "poutcome" "pdays"    "month"    "contact"
(filtered_attribute_svm <- cutoff.k(attribute_weights_svm, 5))
## [1] "duration" "poutcome" "pdays"    "month"    "contact"

Notes: The two datasets have same information gain results. We will use these top 5 attributes to build our models.

2.2 Select variables for Building Models

# Keep only the five top-ranked predictors and append the target as a
# column named "class" (same result as selecting then assigning dt$class).
dt <- cbind(bank[filtered_attribute], class = bank$y)
dt_svm <- cbind(bank_svm[filtered_attribute], class = bank_svm$y)

3. Building Models

library(caTools)
# Fix the RNG seed so the train/test splits below are reproducible.
set.seed(123)

3.1 Split the data for training and testing

# 70/30 split; caTools::sample.split stratifies on the class label so both
# partitions keep the original class ratio. Returns a logical mask.
split <- sample.split(dt$class, SplitRatio = 0.7)
split_svm <- sample.split(dt_svm$class, SplitRatio = 0.7)

# TRUE rows -> training, FALSE rows -> test.
training <- dt[split, ]
test <- dt[!split, ]

training_svm <- dt_svm[split_svm, ]
test_svm <- dt_svm[!split_svm, ]

3.2 Data Balancing

  • The column class shows whether a client subscribed to a term deposit (yes) or did not subscribe to a term deposit (no).
  • Our aim is to predict subscription behavior. Therefore, our target variable is class.
  • Check the distribution of variable class by using functions table() and prop.table().
# Class counts and proportions in the full training set: ~88% "no".
table(training$class)
## 
##    no   yes 
## 27945  3702
prop.table(table(training$class))
## 
##        no       yes 
## 0.8830221 0.1169779
# Visualise the imbalance.
barplot(table(training$class), 
        xlab = "Classes", ylab = "Frequency")

# Same check for the SVM training sample: ~88% "no" as well.
table(training_svm$class)
## 
##   no  yes 
## 2800  365
prop.table(table(training_svm$class))
## 
##        no       yes 
## 0.8846761 0.1153239
barplot(table(training_svm$class), 
        xlab = "Classes", ylab = "Frequency")

Notes:

  • We need to figure out how to balance the sample training data in the following codes.
  • And we can use F-score to choose the best ways of balancing the sample training data.

3.2 Building Models Without Balancing the Sample Training Data

# ROSE supplies ovun.sample() for the class-balancing experiments below.
library(ROSE)
## Loaded ROSE 0.0-3

3.2.1 SVM

library(e1071)
# RBF-kernel SVM on the unbalanced 10% training sample.
# scale = TRUE standardises inputs; probability = TRUE fits the probability
# model needed later for ROC and gain analysis.
svm_radial_1 <- svm(class~., 
                    data = training_svm, 
                    kernel = "radial", 
                    scale = TRUE, 
                    probability = TRUE)
# Class predictions (with attached probability attribute) on the test set.
svm_predict_1 <- predict(svm_radial_1, 
                         test_svm, probability = TRUE)
results_svm_1 <- test_svm
results_svm_1$PredictionSVM <- svm_predict_1

3.2.2 Decision Tree

library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
# Conditional-inference tree on the unbalanced full training set.
decTree_1 <- ctree(class~., 
                   data = training)
# type = "response" returns predicted class labels.
decTree_predict_1 <- predict(decTree_1, 
                             test, type="response")
results_1 <- test
results_1$PredictionTree <- decTree_predict_1

3.2.3 Naive Bayes

# Naive Bayes classifier on the unbalanced full training set.
library(naivebayes)
## naivebayes 0.9.6 loaded
naivebayes_1 <- naive_bayes(class~., 
                            data = training)
# type = "class" returns predicted labels; the warning below is expected
# because `test` carries extra columns (the earlier Prediction* columns
# are not in the model's probability tables).
naivebayes_predict_1 <- predict(naivebayes_1, 
                                test, type = "class")
## Warning: predict.naive_bayes(): More features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
results_1$PredictionNB <- naivebayes_predict_1

3.2.4 Random Forest

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Random forest (default settings) on the unbalanced full training set.
model_RF_1 <- randomForest(class ~ ., 
                           data =  training)
RF_predict_1 <- predict(model_RF_1, 
                        test, type = "response")
results_1$PredictionRF <- RF_predict_1
# Raw counts of correctly classified test rows per model.
# length(which(...)) is equivalent to sum(...) on these logical vectors.
correct_svm_1 <- which (results_svm_1$class == results_svm_1$PredictionSVM)
length(correct_svm_1)
## [1] 1215
correct_Tree_1 <- which(results_1$class == results_1$PredictionTree)
length(correct_Tree_1)
## [1] 12253
correct_NB_1 <- which(results_1$class == results_1$PredictionNB)
length(correct_NB_1)
## [1] 12059
correct_RF_1 <- which(results_1$class == results_1$PredictionRF)
length(correct_RF_1)
## [1] 12298

3.2.5 F-score of Each Model Without Balancing the Sample Training Data

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Confusion matrices on the test sets, with "yes" (subscribed a term
# deposit) as the positive class, since the campaign targets subscribers.
CM_DT_1 <- confusionMatrix(decTree_predict_1, 
                           test$class, positive="yes")
CM_SVM_1 <- confusionMatrix(svm_predict_1, 
                            test_svm$class, positive="yes")
CM_NB_1 <- confusionMatrix(naivebayes_predict_1, 
                           test$class, positive="yes")
CM_RF_1 <- confusionMatrix(RF_predict_1, 
                           test$class, positive="yes")
# F-beta = (1 + b^2) * P * R / (b^2 * P + R); beta = 1 gives the usual F1.
beta <- 1
# Use caret's named byClass entries ("Precision", "Recall") instead of the
# positional indices 5 and 6 — same values, but robust against any change
# in byClass ordering and self-documenting.
F_measure_DT_1 <- ((1 + beta^2) * CM_DT_1$byClass["Precision"] * CM_DT_1$byClass["Recall"]) /
  (beta^2 * CM_DT_1$byClass["Precision"] + CM_DT_1$byClass["Recall"])
F_measure_SVM_1 <- ((1 + beta^2) * CM_SVM_1$byClass["Precision"] * CM_SVM_1$byClass["Recall"]) /
  (beta^2 * CM_SVM_1$byClass["Precision"] + CM_SVM_1$byClass["Recall"])
F_measure_NB_1 <- ((1 + beta^2) * CM_NB_1$byClass["Precision"] * CM_NB_1$byClass["Recall"]) /
  (beta^2 * CM_NB_1$byClass["Precision"] + CM_NB_1$byClass["Recall"])
F_measure_RF_1 <- ((1 + beta^2) * CM_RF_1$byClass["Precision"] * CM_RF_1$byClass["Recall"]) /
  (beta^2 * CM_RF_1$byClass["Precision"] + CM_RF_1$byClass["Recall"])

# Collect the four F1 scores into a single labelled row for later comparison.
F_measure_1 <- data.frame(DT = F_measure_DT_1, 
                          SVM = F_measure_SVM_1, 
                          NB = F_measure_NB_1, 
                          RF = F_measure_RF_1)
row.names(F_measure_1) <- "F-measure_1"

3.3 Building Models with Both Balancing the Sample Training Data

library(ROSE)
# Combined over/under sampling: under-sample "no" and over-sample "yes"
# until P(yes) is approximately p = 0.45. seed makes resampling reproducible.
training_bothsampled <- ovun.sample(class~., 
                                    data = training, 
                                    method = "both", 
                                    p = 0.45, 
                                    seed = 1)$data

table(training_bothsampled$class)
## 
##    no   yes 
## 17499 14148
prop.table(table(training_bothsampled$class))
## 
##        no       yes 
## 0.5529434 0.4470566
# Same combined over/under sampling for the SVM training sample.
training_svm_bothsampled <- ovun.sample(class~., 
                                        data = training_svm, 
                                        method = "both", 
                                        p = 0.45, 
                                        seed = 1)$data

table(training_svm_bothsampled$class)
## 
##   no  yes 
## 1793 1372
prop.table(table(training_svm_bothsampled$class))
## 
##        no       yes 
## 0.5665087 0.4334913

3.3.1 SVM

library(e1071)
# Re-fit the RBF SVM, now on the balanced SVM training sample.
svm_radial_2 <- svm(class~., data = training_svm_bothsampled, 
                    kernel = "radial", scale = TRUE, probability = TRUE)
svm_predict_2 <- predict(svm_radial_2, 
                         test_svm, probability = TRUE)

# Evaluation is always on the untouched (still imbalanced) test set.
results_svm_2 <- test_svm
results_svm_2$PredictionSVM <- svm_predict_2

3.3.2 Decision Tree

library(partykit)
# Re-fit the conditional-inference tree on the balanced training set.
decTree_2 <- ctree(class~., 
                   data = training_bothsampled)
decTree_predict_2 <- predict(decTree_2, 
                             test, type="response")
results_2 <- test
results_2$PredictionTree <- decTree_predict_2

3.3.3 Naive Bayes

# Re-fit Naive Bayes on the balanced training set.
library(naivebayes)
naivebayes_2 <- naive_bayes(class~., 
                            data = training_bothsampled)
# The warning is expected: `test` contains extra columns not present in
# the model's probability tables; they are ignored.
naivebayes_predict_2 <- predict(naivebayes_2, 
                                test, type = "class")
## Warning: predict.naive_bayes(): More features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
results_2$PredictionNB <- naivebayes_predict_2

3.3.4 Random Forest

library(randomForest)
# Re-fit the random forest on the balanced training set.
model_RF_2 <- randomForest(class ~ . , 
                           data =  training_bothsampled)
RF_predict_2 <- predict(model_RF_2, 
                        test, type = "response")
results_2$PredictionRF <- RF_predict_2
# Correct-prediction counts per model on the untouched test sets.
correct_svm_2 <- which (results_svm_2$class == results_svm_2$PredictionSVM)
length(correct_svm_2)
## [1] 1145
correct_Tree_2 <- which(results_2$class == results_2$PredictionTree)
length(correct_Tree_2)
## [1] 11107
correct_NB_2 <- which(results_2$class == results_2$PredictionNB)
length(correct_NB_2)
## [1] 11134
correct_RF_2 <- which(results_2$class == results_2$PredictionRF)
length(correct_RF_2)
## [1] 11310

3.3.5 F-score of Each Model with Both Balancing the Sample Training Data

library(caret)
# Confusion matrices for the balanced-data models ("yes" = positive class).
CM_DT_2 <- confusionMatrix(decTree_predict_2, 
                           test$class, positive="yes")
CM_SVM_2 <- confusionMatrix(svm_predict_2, 
                            test_svm$class, positive="yes")
CM_NB_2 <- confusionMatrix(naivebayes_predict_2, 
                           test$class, positive="yes")
CM_RF_2 <- confusionMatrix(RF_predict_2, 
                           test$class, positive="yes")
# F-beta with beta = 1 (i.e. F1). The named byClass entries ("Precision",
# "Recall") replace the positional indices 5/6 used originally — same
# values, but self-documenting and robust to caret ordering changes.
beta <- 1

F_measure_DT_2 <- ((1 + beta^2) * CM_DT_2$byClass["Precision"] * CM_DT_2$byClass["Recall"]) /
  (beta^2 * CM_DT_2$byClass["Precision"] + CM_DT_2$byClass["Recall"])

F_measure_SVM_2 <- ((1 + beta^2) * CM_SVM_2$byClass["Precision"] * CM_SVM_2$byClass["Recall"]) /
  (beta^2 * CM_SVM_2$byClass["Precision"] + CM_SVM_2$byClass["Recall"])

F_measure_NB_2 <- ((1 + beta^2) * CM_NB_2$byClass["Precision"] * CM_NB_2$byClass["Recall"]) /
  (beta^2 * CM_NB_2$byClass["Precision"] + CM_NB_2$byClass["Recall"])

F_measure_RF_2 <- ((1 + beta^2) * CM_RF_2$byClass["Precision"] * CM_RF_2$byClass["Recall"]) /
  (beta^2 * CM_RF_2$byClass["Precision"] + CM_RF_2$byClass["Recall"])


# One labelled row of F1 scores for the balanced models.
F_measure_2 <- data.frame(DT = F_measure_DT_2, 
                          SVM = F_measure_SVM_2, 
                          NB = F_measure_NB_2, 
                          RF = F_measure_RF_2)
row.names(F_measure_2) <- "F-measure_2"

3.4 Compare Model Results Between Both Balancing the Sample Training Data and Without Balancing the Sample Training Data

# Side-by-side F-scores: row 1 = unbalanced training, row 2 = balanced
# ("both" over/under sampled) training. Balancing helps every model.
F_compare <- data.frame(DT = c(F_measure_DT_1,F_measure_DT_2),
                        SVM = c(F_measure_SVM_1,F_measure_SVM_2),
                        NB = c(F_measure_NB_1,F_measure_NB_2),
                        RF = c(F_measure_RF_1,F_measure_RF_2))

row.names(F_compare) <- c("F_without", "F_both")

F_compare
##                  DT       SVM        NB        RF
## F_without 0.5062147 0.3380282 0.4568748 0.5244177
## F_both    0.5248501 0.5300668 0.4696639 0.5418699

Notes: - All models trained on the both-balanced sample training data achieve a higher F-score. - We therefore continue to evaluate the models built on the balanced data in the following code.

4.Evaluation

4.1 Evaluation Metrics

# Pull Accuracy (overall[1]), Sensitivity and Pos Pred Value (byClass
# entries 1 and 3) from each balanced-model confusion matrix.
summary_DT <- c(CM_DT_2$overall[1], CM_DT_2$byClass[c(1,3)])
summary_SVM <- c(CM_SVM_2$overall[1], CM_SVM_2$byClass[c(1,3)])
summary_NB <- c(CM_NB_2$overall[1], CM_NB_2$byClass[c(1,3)])
summary_RF <- c(CM_RF_2$overall[1], CM_RF_2$byClass[c(1,3)])


# NOTE(review): naming this data frame `summary` shadows base::summary()
# for the rest of the script; a rename (e.g. metric_summary) would be safer.
(summary <- data.frame(DT = summary_DT, 
                       SVM = summary_SVM,
                       NB = summary_NB, RF = summary_RF))
##                       DT       SVM        NB        RF
## Accuracy       0.8188587 0.8443953 0.8208493 0.8338248
## Sensitivity    0.8550725 0.7628205 0.6780088 0.8399496
## Pos Pred Value 0.3786272 0.4061433 0.3592654 0.3999400
# Relabel the F-score row, then append it as a fourth metric row.
row.names(F_measure_2) <- "F-measure"

summary <- rbind(summary, F_measure_2)
print(summary)
##                       DT       SVM        NB        RF
## Accuracy       0.8188587 0.8443953 0.8208493 0.8338248
## Sensitivity    0.8550725 0.7628205 0.6780088 0.8399496
## Pos Pred Value 0.3786272 0.4061433 0.3592654 0.3999400
## F-measure      0.5248501 0.5300668 0.4696639 0.5418699

This summary shows Accuracy, Sensitivity, Pos Pred Value (precision) and F-measure of each model.

  • Accuracy refers to the score of how well a model predicted the actual responses of customers correctly. SVM shows the highest Accuracy amongst the four models. However, Accuracy is not the sole metric to assess a model’s eligibility.

  • Sensitivity indicates what proportion of the actual “yes” responses are predicted correctly. This ratio is important in this direct marketing case, as better prediction for the actual “yes” responses can lead to higher marketing proficiency by contacting more actual “yes” responders. In the result of the summary, the Decision Tree model has the highest Sensitivity amongst the four.

  • Pos Pred Value shows what proportion of the predicted “yes” responses is actual “yes”. This is also crucial ratio for this bank marketing case. This is because it is directly related to the marketing efficiency. That is, higher Pos Pred Value means less errors in predicting “yes” responders, which can then lower the unnecessary marketing cost of contacting actual “no” responders. According to the result, SVM has the highest Pos Pred Value.

  • F-measure is the balanced metric of Pos Pred Value and Sensitivity. The result shows that SVM model has slightly higher F-measure than that of Decision Tree, while Naive Bayes model has relatively low F-measure.

According to the entire evaluation, it seems ambiguous to determine which model is the most suitable approach for selecting customers for the deposit marketing. However, the Sensitivity metric should be given more weight in the direct marketing case. Due to the cost-efficiency of a direct marketing campaign, the costs of contacting customers will have a relatively lower impact on the total expected value of the entire marketing effort than the profits of contacting prospective customers who will be most likely to say “yes” to the deposit offer. In other words, it is better to implement greedy direct marketing in order to obtain the optimum expected value, focusing more on the profits rather than the costs. Therefore, the Decision Tree model is determined to be the most suitable model. Even though the Decision Tree is slightly outperformed by SVM and RF in Precision and F-measure, it has the highest Sensitivity, as shown with the black dashed line of the summary figure. Therefore, the Decision Tree model will provide more marketing benefits than the other models, which can sufficiently offset the costs of wrong direct marketing and generate more profits.

# Reshape the metric table to long format for faceted plotting.
# rownames_to_column() + pivot_longer() (tidyr >= 1.0, already loaded via
# tidyverse) replace the superseded gather() and the error-prone hand-typed
# 16-element metric vector: metric labels now come directly from the
# table's row names, so they cannot drift out of sync with the data.
summary_for_viz <- summary %>%
  rownames_to_column(var = "metric") %>%
  pivot_longer(cols = -metric, names_to = "models", values_to = "values") %>%
  mutate(models = as.factor(models),
         metric = as.factor(metric))

# One panel per model; the dashed line marks the Decision Tree's
# Sensitivity (summary$DT[2]), the metric argued to matter most here.
ggplot(summary_for_viz, aes(x = metric, y = values, fill = metric)) +
  geom_bar(stat = "identity") +
  facet_grid(cols = vars(models)) +
  theme(axis.text.x = element_text(angle = 90)) +
  geom_hline(yintercept = summary$DT[2], col = "black", linetype = "dashed")

4.2 Receiver Operator Characteristic (ROC) Graph

  • The ROC curve conveys the trade-off between the costs and profits of direct marketing. The x-axis (false positive rate) can be read as a proxy for cost — wasted contacts to clients who will not subscribe — and the y-axis (true positive rate) as a proxy for profit — successfully identified subscribers. A better model therefore bends further toward the upper-left corner, indicating higher profits at lower costs. AUC is the area under the curve.
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Predicted probabilities from each balanced model. Column 2 of each
# probability matrix is used throughout — presumably the "yes" class;
# the "control = no, case = yes" messages below are consistent with that.
prob_SVM <- attr(svm_predict_2, "probabilities")[,2]
prob_DT <- predict(decTree_2, test, type = "prob")
prob_NB <- predict(naivebayes_2, test, type="prob")
## Warning: predict.naive_bayes(): More features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
prob_RF <- predict(model_RF_2, test, type = "prob")
# Build one ROC object per model, plus a (1 - specificity, sensitivity)
# data frame, i.e. (FPR, TPR) pairs, for plotting.
ROC_SVM <- roc(class~prob_SVM, test_svm)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
df_SVM <-  data.frame((1-ROC_SVM$specificities), ROC_SVM$sensitivities)
ROC_DT <- roc(class~prob_DT[,2], test)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
df_DT <- data.frame((1-ROC_DT$specificities), ROC_DT$sensitivities)
ROC_NB <- roc(class~prob_NB[,2], test)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
df_NB <- data.frame((1-ROC_NB$specificities), ROC_NB$sensitivities)
ROC_RF <- roc(class~prob_RF[,2], test)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
df_RF <- data.frame((1-ROC_RF$specificities), ROC_RF$sensitivities)
# Overlay the four ROC curves; the grey diagonal is the random-classifier
# baseline (a useful model bends toward the upper-left corner).
plot(df_DT, col="red", type="l", xlab="False Positive Rate (1-Specificity)", ylab="True Positive Rate (Sensitivity)")
lines(df_SVM, col="green")
lines(df_NB, col="yellow")
lines(df_RF, col="black")

abline(a=0, b=1, col = "grey")

legend("bottomright",
c("Decision Tree", "SVM", "Naive Bayes", "Random Forest"),
fill=c("red", "green", "yellow", "black"))

# Area under each ROC curve; higher means better ranking of "yes" cases.
# RF and DT are essentially tied for the best AUC here.
(AUC_DT <- auc(ROC_DT))
## Area under the curve: 0.9042
(AUC_SVM <- auc(ROC_SVM))
## Area under the curve: 0.8925
(AUC_NB <- auc(ROC_NB))
## Area under the curve: 0.8598
(AUC_RF <- auc(ROC_RF))
## Area under the curve: 0.9046
(AUC_summary <- data.frame(AUC_SVM, AUC_DT, AUC_NB, AUC_RF))
##     AUC_SVM    AUC_DT    AUC_NB    AUC_RF
## 1 0.8924599 0.9041929 0.8597964 0.9046292

4.3 Cumulative Response(Gain) Chart

  • “INCREASE IN THE CORRECT PREDICTION % FOR EACH INCREASE OF INFORMATION GAIN %”
library(CustomerScoringMetrics)
# Cumulative gains tables at 1% resolution; column 4 of each table is the
# series plotted below as the cumulative gains curve.
GainTable_DT <- cumGainsTable(prob_DT[,2], test$class, resolution = 1/100)
GainTable_SVM <- cumGainsTable(prob_SVM, test_svm$class, resolution = 1/100)
GainTable_NB <- cumGainsTable(prob_NB[,2], test$class, resolution = 1/100)
GainTable_RF <- cumGainsTable(prob_RF[,2], test$class, resolution = 1/100)
# Overlay the four gain curves; grey diagonal = random-targeting baseline.
plot(GainTable_DT[,4], col="red", type="l",     
xlab="Percentage of test instances", ylab="Percentage of correct predictions")
lines(GainTable_SVM[,4], col="green")
lines(GainTable_NB[,4], col="yellow")
lines(GainTable_RF[,4], col="black")

abline(a = 0, b = 1, col = "grey")

legend("bottomright",
c("Decision tree", "SVM", "Naive Bayes", "Random Forest"),
fill=c("red","green", "yellow", "black"))