In the last post we covered the concepts and theory behind boosting algorithms.
In this post we will put those concepts into practice and build boosting models in R using the xgboost package.
LAB: Boosting
- Correctly categorize the items based on their detailed feature specifications. More than 100 specifications have been collected.
- Data: Ecom_Products_Menu/train.csv
- Build a decision tree model and check the training and testing accuracy.
- Build a boosted decision tree.
- Is there any improvement over the earlier decision tree?
Solution
train <- read.csv("C:/Amrita/Datavedi/Ecom_Products_Menu/train.csv")
test <- read.csv("C:/Amrita/Datavedi/Ecom_Products_Menu/test.csv")
dim(train)
## [1] 50122 102
##Decision Tree
library(rpart)
ecom_products_ds<-rpart(Category ~ ., method="class", control=rpart.control(minsplit=30, cp=0.01), data=train[,-1])
library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
fancyRpartPlot(ecom_products_ds)
#Training accuracy
library(caret)
predicted_y<-predict(ecom_products_ds, type="class")
table(predicted_y)
## predicted_y
## Accessories Appliances Camara Ipod Laptops
## 0 10899 2733 2442 0
## Mobiles Personal_Care Tablets TV
## 0 10288 23760 0
confusionMatrix(predicted_y,train$Category)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Accessories Appliances Camara Ipod Laptops Mobiles
## Accessories 0 0 0 0 0 0
## Appliances 825 5536 1086 130 506 709
## Camara 88 387 1456 4 55 388
## Ipod 30 17 23 2032 144 5
## Laptops 0 0 0 0 0 0
## Mobiles 0 0 0 0 0 0
## Personal_Care 110 308 152 0 18 79
## Tablets 1288 615 1247 51 5743 377
## TV 0 0 0 0 0 0
## Reference
## Prediction Personal_Care Tablets TV
## Accessories 0 0 0
## Appliances 1035 932 140
## Camara 252 84 19
## Ipod 13 159 19
## Laptops 0 0 0
## Mobiles 0 0 0
## Personal_Care 9545 19 57
## Tablets 607 11885 1947
## TV 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.6076
## 95% CI : (0.6033, 0.6119)
## No Information Rate : 0.2609
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5053
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Accessories Class: Appliances Class: Camara
## Sensitivity 0.00000 0.8066 0.36731
## Specificity 1.00000 0.8760 0.97233
## Pos Pred Value NaN 0.5079 0.53275
## Neg Pred Value 0.95329 0.9662 0.94708
## Prevalence 0.04671 0.1369 0.07909
## Detection Rate 0.00000 0.1105 0.02905
## Detection Prevalence 0.00000 0.2174 0.05453
## Balanced Accuracy 0.50000 0.8413 0.66982
## Class: Ipod Class: Laptops Class: Mobiles
## Sensitivity 0.91655 0.000 0.00000
## Specificity 0.99144 1.000 1.00000
## Pos Pred Value 0.83210 NaN NaN
## Neg Pred Value 0.99612 0.871 0.96892
## Prevalence 0.04423 0.129 0.03108
## Detection Rate 0.04054 0.000 0.00000
## Detection Prevalence 0.04872 0.000 0.00000
## Balanced Accuracy 0.95400 0.500 0.50000
## Class: Personal_Care Class: Tablets Class: TV
## Sensitivity 0.8335 0.9087 0.00000
## Specificity 0.9808 0.6794 1.00000
## Pos Pred Value 0.9278 0.5002 NaN
## Neg Pred Value 0.9521 0.9547 0.95647
## Prevalence 0.2285 0.2609 0.04353
## Detection Rate 0.1904 0.2371 0.00000
## Detection Prevalence 0.2053 0.4740 0.00000
## Balanced Accuracy 0.9071 0.7941 0.50000
#Accuracy on Test data
predicted_test_ds<-predict(ecom_products_ds, test[,-1], type="class")
confusionMatrix(predicted_test_ds,test$Category)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Accessories Appliances Camara Ipod Laptops Mobiles
## Accessories 0 0 0 0 0 0
## Appliances 172 1308 269 40 92 170
## Camara 15 80 383 1 16 95
## Ipod 14 4 3 469 28 0
## Laptops 0 0 0 0 0 0
## Mobiles 0 0 0 0 0 0
## Personal_Care 23 75 42 0 1 23
## Tablets 274 134 294 12 1401 83
## TV 0 0 0 0 0 0
## Reference
## Prediction Personal_Care Tablets TV
## Accessories 0 0 0
## Appliances 234 210 42
## Camara 52 23 3
## Ipod 3 49 5
## Laptops 0 0 0
## Mobiles 0 0 0
## Personal_Care 2242 10 17
## Tablets 152 2751 442
## TV 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.6085
## 95% CI : (0.5996, 0.6173)
## No Information Rate : 0.2588
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5071
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Accessories Class: Appliances Class: Camara
## Sensitivity 0.00000 0.8170 0.38648
## Specificity 1.00000 0.8790 0.97353
## Pos Pred Value NaN 0.5156 0.57335
## Neg Pred Value 0.95764 0.9682 0.94517
## Prevalence 0.04236 0.1362 0.08430
## Detection Rate 0.00000 0.1113 0.03258
## Detection Prevalence 0.00000 0.2158 0.05682
## Balanced Accuracy 0.50000 0.8480 0.68000
## Class: Ipod Class: Laptops Class: Mobiles
## Sensitivity 0.89847 0.0000 0.00000
## Specificity 0.99056 1.0000 1.00000
## Pos Pred Value 0.81565 NaN NaN
## Neg Pred Value 0.99526 0.8692 0.96844
## Prevalence 0.04440 0.1308 0.03156
## Detection Rate 0.03989 0.0000 0.00000
## Detection Prevalence 0.04891 0.0000 0.00000
## Balanced Accuracy 0.94452 0.5000 0.50000
## Class: Personal_Care Class: Tablets Class: TV
## Sensitivity 0.8356 0.9040 0.0000
## Specificity 0.9789 0.6796 1.0000
## Pos Pred Value 0.9215 0.4963 NaN
## Neg Pred Value 0.9527 0.9530 0.9567
## Prevalence 0.2282 0.2588 0.0433
## Detection Rate 0.1907 0.2340 0.0000
## Detection Prevalence 0.2070 0.4715 0.0000
## Balanced Accuracy 0.9073 0.7918 0.5000
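Before moving on to boosting, one optional check is whether the single tree itself can be improved by tuning its complexity parameter. Below is a minimal sketch using rpart's CP table; the cp value and the object names (ecom_products_pruned, predicted_test_pruned) are only illustrative:
#Cross-validated error for each complexity parameter value of the fitted tree
printcp(ecom_products_ds)
plotcp(ecom_products_ds)
#Prune back to a simpler tree at a larger cp (0.02 is a placeholder; read it off the CP table)
ecom_products_pruned <- prune(ecom_products_ds, cp=0.02)
predicted_test_pruned <- predict(ecom_products_pruned, test[,-1], type="class")
confusionMatrix(predicted_test_pruned, test$Category)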
###Boosting
library(xgboost)
library(methods)
library(data.table)
library(magrittr)
# converting datasets to numeric format; xgboost requires all feature columns to be numeric
train[,c(-1,-102)] <- lapply( train[,c(-1,-102)], as.numeric)
test[,c(-1,-102)] <- lapply( test[,c(-1,-102)], as.numeric)
# converting datasets to Matrix format. Data frame is not supported by xgboost
trainMatrix <- train[,c(-1,-102)] %>% as.matrix
testMatrix <- test[,c(-1,-102)] %>% as.matrix
#The labels must be numeric and start from 0
y<-as.integer(train$Category)-1
table(y,train$Category)
##
## y Accessories Appliances Camara Ipod Laptops Mobiles Personal_Care
## 0 2341 0 0 0 0 0 0
## 1 0 6863 0 0 0 0 0
## 2 0 0 3964 0 0 0 0
## 3 0 0 0 2217 0 0 0
## 4 0 0 0 0 6466 0 0
## 5 0 0 0 0 0 1558 0
## 6 0 0 0 0 0 0 11452
## 7 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0
##
## y Tablets TV
## 0 0 0
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## 7 13079 0
## 8 0 2182
test_y<-as.integer(test$Category)-1
table(test_y,test$Category)
##
## test_y Accessories Appliances Camara Ipod Laptops Mobiles Personal_Care
## 0 498 0 0 0 0 0 0
## 1 0 1601 0 0 0 0 0
## 2 0 0 991 0 0 0 0
## 3 0 0 0 522 0 0 0
## 4 0 0 0 0 1538 0 0
## 5 0 0 0 0 0 371 0
## 6 0 0 0 0 0 0 2683
## 7 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0
##
## test_y Tablets TV
## 0 0 0
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## 7 3043 0
## 8 0 509
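As an optional refinement, not used in the solution below, the matrices and labels can be wrapped in xgboost's DMatrix container and passed to xgb.train() with a watchlist, so the evaluation metric is reported on both train and test data after every boosting round. A minimal sketch assuming the objects created above (dtrain, dtest and watched_model are illustrative names):
dtrain <- xgb.DMatrix(data=trainMatrix, label=y)
dtest <- xgb.DMatrix(data=testMatrix, label=test_y)
#watchlist makes xgb.train print the metric on both datasets at every round
watched_model <- xgb.train(params=list("objective"="multi:softprob", "eval_metric"="merror", "num_class"=9), data=dtrain, nrounds=40, watchlist=list(train=dtrain, test=dtest))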
#Setting the parameters for multiclass classification
param <- list("objective" = "multi:softprob", "eval_metric" = "merror", "num_class" = 9)
#"multi:softprob" makes xgboost output a probability for each class; "multi:softmax" would return the predicted class directly. Both require num_class (the number of classes).
#"merror": multiclass classification error rate, calculated as #(wrong cases)/#(all cases)
XGBModel <- xgboost(param=param, data = trainMatrix, label = y, nrounds=40)
## [0] train-merror:0.269223
## [1] train-merror:0.241750
## [2] train-merror:0.229500
## [3] train-merror:0.222776
## [4] train-merror:0.218966
## [5] train-merror:0.211923
## [6] train-merror:0.208312
## [7] train-merror:0.203703
## [8] train-merror:0.199553
## [9] train-merror:0.196481
## [10] train-merror:0.192969
## [11] train-merror:0.190695
## [12] train-merror:0.188241
## [13] train-merror:0.185487
## [14] train-merror:0.183193
## [15] train-merror:0.180400
## [16] train-merror:0.177886
## [17] train-merror:0.175552
## [18] train-merror:0.173217
## [19] train-merror:0.171362
## [20] train-merror:0.168968
## [21] train-merror:0.166474
## [22] train-merror:0.164379
## [23] train-merror:0.162743
## [24] train-merror:0.161925
## [25] train-merror:0.160389
## [26] train-merror:0.158214
## [27] train-merror:0.156478
## [28] train-merror:0.155521
## [29] train-merror:0.154284
## [30] train-merror:0.152628
## [31] train-merror:0.151271
## [32] train-merror:0.149356
## [33] train-merror:0.147879
## [34] train-merror:0.146283
## [35] train-merror:0.144827
## [36] train-merror:0.143749
## [37] train-merror:0.142053
## [38] train-merror:0.140358
## [39] train-merror:0.139240
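Because the objective above is "multi:softprob", predict() returns nine probabilities per item, which the next chunk reshapes and converts to class labels. With "multi:softmax" xgboost would return the winning class directly; a small sketch of that variant for reference (param_softmax, XGBModel_softmax and predicted_class are illustrative names):
param_softmax <- list("objective" = "multi:softmax", "eval_metric" = "merror", "num_class" = 9)
XGBModel_softmax <- xgboost(param=param_softmax, data = trainMatrix, label = y, nrounds=40)
#predict() now returns the class index (0-8) for each row; no reshaping is needed
predicted_class <- predict(XGBModel_softmax, trainMatrix)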
#Training accuracy
predicted_y<-predict(XGBModel, trainMatrix)
#predict() returns the 9 class probabilities of each item as one long vector; reshape it into an n x 9 matrix
probs <- data.frame(matrix(predicted_y, nrow=nrow(train), ncol=9, byrow = TRUE))
#For each row, pick the class (0-8) with the highest probability
probs_final<-as.data.frame(cbind(row.names(probs),apply(probs,1, function(x) c(0:8)[which(x==max(x))])))
table(probs_final$V2)
##
## 0 1 2 3 4 5 6 7 8
## 2140 6969 3997 2227 5142 1242 11418 15605 1382
confusionMatrix(probs_final$V2,y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8
## 0 1820 32 13 1 74 26 94 58 22
## 1 73 6495 123 1 13 129 119 12 4
## 2 13 78 3584 2 4 204 103 9 0
## 3 8 5 3 2192 0 1 0 8 10
## 4 84 20 4 3 3830 5 12 970 214
## 5 28 55 60 2 2 1051 37 7 0
## 6 81 105 93 1 5 95 10987 20 31
## 7 216 73 82 15 2500 46 92 11932 649
## 8 18 0 2 0 38 1 8 63 1252
##
## Overall Statistics
##
## Accuracy : 0.8608
## 95% CI : (0.8577, 0.8638)
## No Information Rate : 0.2609
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8306
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.77745 0.9464 0.90414 0.98872 0.59233 0.67458
## Specificity 0.99330 0.9890 0.99105 0.99927 0.96995 0.99607
## Pos Pred Value 0.85047 0.9320 0.89667 0.98428 0.74485 0.84622
## Neg Pred Value 0.98914 0.9915 0.99176 0.99948 0.94140 0.98963
## Prevalence 0.04671 0.1369 0.07909 0.04423 0.12901 0.03108
## Detection Rate 0.03631 0.1296 0.07151 0.04373 0.07641 0.02097
## Detection Prevalence 0.04270 0.1390 0.07975 0.04443 0.10259 0.02478
## Balanced Accuracy 0.88537 0.9677 0.94759 0.99400 0.78114 0.83532
## Class: 6 Class: 7 Class: 8
## Sensitivity 0.9594 0.9123 0.57379
## Specificity 0.9889 0.9008 0.99729
## Pos Pred Value 0.9623 0.7646 0.90593
## Neg Pred Value 0.9880 0.9668 0.98092
## Prevalence 0.2285 0.2609 0.04353
## Detection Rate 0.2192 0.2381 0.02498
## Detection Prevalence 0.2278 0.3113 0.02757
## Balanced Accuracy 0.9741 0.9066 0.78554
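The apply() step above works, but base R's max.col() performs the same row-wise argmax in a single call; a brief sketch (predicted_class_train is an illustrative name; max.col breaks ties at random, which is assumed to be negligible here):
#max.col returns the 1-based column index of each row's maximum; subtract 1 to get the class code 0-8
predicted_class_train <- max.col(probs) - 1
confusionMatrix(factor(predicted_class_train, levels=0:8), factor(y, levels=0:8))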
#Accuracy on Test data
predicted_test_boost<-predict(XGBModel, testMatrix)
probs_test <- data.frame(matrix(predicted_test_boost, nrow=nrow(test), ncol=9, byrow = TRUE))
probs_final_test<-as.data.frame(cbind(row.names(probs_test),apply(probs_test,1, function(x) c(0:8)[which(x==max(x))])))
table(probs_final_test$V2)
##
## 0 1 2 3 4 5 6 7 8
## 446 1654 1037 514 1202 231 2699 3707 266
confusionMatrix(probs_final_test$V2,test_y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8
## 0 327 15 2 1 26 8 38 22 7
## 1 27 1476 34 0 4 66 37 9 1
## 2 1 29 881 0 4 78 34 10 0
## 3 1 1 1 502 0 1 2 4 2
## 4 29 6 2 1 743 4 2 344 71
## 5 11 21 22 0 0 163 13 0 1
## 6 38 35 32 1 2 35 2526 9 21
## 7 58 18 17 15 733 16 26 2620 204
## 8 6 0 0 2 26 0 5 25 202
##
## Overall Statistics
##
## Accuracy : 0.803
## 95% CI : (0.7957, 0.8102)
## No Information Rate : 0.2588
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.76
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.65663 0.9219 0.88900 0.96169 0.4831 0.43935
## Specificity 0.98943 0.9825 0.98551 0.99893 0.9551 0.99403
## Pos Pred Value 0.73318 0.8924 0.84957 0.97665 0.6181 0.70563
## Neg Pred Value 0.98488 0.9876 0.98974 0.99822 0.9247 0.98195
## Prevalence 0.04236 0.1362 0.08430 0.04440 0.1308 0.03156
## Detection Rate 0.02782 0.1256 0.07494 0.04270 0.0632 0.01387
## Detection Prevalence 0.03794 0.1407 0.08821 0.04372 0.1022 0.01965
## Balanced Accuracy 0.82303 0.9522 0.93725 0.98031 0.7191 0.71669
## Class: 6 Class: 7 Class: 8
## Sensitivity 0.9415 0.8610 0.39686
## Specificity 0.9809 0.8752 0.99431
## Pos Pred Value 0.9359 0.7068 0.75940
## Neg Pred Value 0.9827 0.9474 0.97328
## Prevalence 0.2282 0.2588 0.04330
## Detection Rate 0.2149 0.2229 0.01718
## Detection Prevalence 0.2296 0.3153 0.02263
## Balanced Accuracy 0.9612 0.8681 0.69558
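Boosting lifts test accuracy from roughly 61% with the single decision tree to about 80%. To see which of the 100-plus specifications drive this, xgboost can report feature importance; a short sketch assuming the XGBModel trained above (the importance object name is illustrative, and xgb.plot.importance may need an extra plotting package depending on the xgboost version):
importance <- xgb.importance(feature_names=colnames(trainMatrix), model=XGBModel)
head(importance, 10) #top features, sorted by gain
xgb.plot.importance(importance)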
When does an Ensemble not work?
- The models have to be independent; we can't build the same model multiple times and expect the error to reduce.
- We may have to bring in the independence by choosing subsets of data, or subsets of features, while building the individual models.
- An ensemble may backfire if we use dependent models that are already less accurate. The final ensemble might turn out to be an even worse model.
- Yes, there is a small disclaimer in the "Wisdom of the Crowd" theory: we need good, independent individuals. If we collate dependent individuals with poor knowledge, we might end up with an even worse ensemble.
- For example, suppose we build three models where model-1 and model-2 are bad and model-3 is good. Most of the time the voting ensemble will simply return the combined (wrong) output of model-1 and model-2, as illustrated in the sketch after this list.
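A toy simulation makes this concrete. The numbers are purely illustrative: two identical (fully dependent) weak models outvote the single good model, so the majority-vote ensemble is no better than the weak models themselves.
set.seed(10)
truth <- rbinom(1000, 1, 0.5)
good_model <- ifelse(runif(1000) < 0.90, truth, 1-truth) #right ~90% of the time
bad_model1 <- ifelse(runif(1000) < 0.60, truth, 1-truth) #right ~60% of the time
bad_model2 <- bad_model1 #a fully dependent copy of the weak model
ensemble <- ifelse(good_model + bad_model1 + bad_model2 >= 2, 1, 0) #majority vote
mean(good_model == truth) #~0.90
mean(ensemble == truth) #~0.60 - the vote simply tracks the two dependent weak models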
LAB: When does an Ensemble not work?
- When the individual models/samples are dependent
#Data Import
train<- read.csv("C:/Amrita/Datavedi/Car Accidents IOT/Train.csv")
test<- read.csv("C:/Amrita/Datavedi/Car Accidents IOT/Test.csv")
####Logistic Regression
crash_model_logistic <- glm(Fatal ~ . , data=train, family = binomial())
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(crash_model_logistic)
##
## Call:
## glm(formula = Fatal ~ ., family = binomial(), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.4904 -0.8571 0.3656 0.8242 3.1945
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.954e-01 5.412e-01 1.654 0.098067 .
## S1 -1.045e-02 2.860e-03 -3.653 0.000259 ***
## S2 -3.740e-03 5.454e-03 -0.686 0.492915
## S3 2.638e-01 6.112e-02 4.316 1.59e-05 ***
## S4 1.605e-03 2.197e-04 7.304 2.80e-13 ***
## S5 3.161e-02 2.718e-03 11.631 < 2e-16 ***
## S6 3.748e-03 2.414e-03 1.553 0.120537
## S7 -8.739e-04 2.476e-04 -3.530 0.000415 ***
## S8 1.684e-01 3.209e-02 5.247 1.54e-07 ***
## S9 -8.099e-04 7.008e-04 -1.156 0.247805
## S10 -9.886e+01 9.210e+00 -10.734 < 2e-16 ***
## S11 -1.538e-02 8.875e-04 -17.334 < 2e-16 ***
## S12 -2.447e-01 2.161e-02 -11.324 < 2e-16 ***
## S13 3.227e+00 1.092e-01 29.549 < 2e-16 ***
## S14 7.233e-03 1.663e-03 4.350 1.36e-05 ***
## S15 6.571e-03 4.373e-03 1.503 0.132889
## S16 -7.763e-02 5.666e-02 -1.370 0.170693
## S17 -3.497e-04 6.861e-05 -5.097 3.46e-07 ***
## S18 -2.865e-04 4.433e-04 -0.646 0.518052
## S19 -6.798e-02 6.262e-02 -1.086 0.277665
## S20 -1.001e-02 2.043e-03 -4.902 9.49e-07 ***
## S21 -4.146e-01 2.398e-02 -17.291 < 2e-16 ***
## S22 1.678e-01 6.718e-03 24.981 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 20538 on 15108 degrees of freedom
## Residual deviance: 14794 on 15086 degrees of freedom
## AIC: 14840
##
## Number of Fisher Scoring iterations: 8
#Training accuracy
predicted_y<-round(predict(crash_model_logistic,type="response"),0)
confusionMatrix(predicted_y,crash_model_logistic$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4394 1300
## 1 1922 7493
##
## Accuracy : 0.7867
## 95% CI : (0.7801, 0.7933)
## No Information Rate : 0.582
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5556
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6957
## Specificity : 0.8522
## Pos Pred Value : 0.7717
## Neg Pred Value : 0.7959
## Prevalence : 0.4180
## Detection Rate : 0.2908
## Detection Prevalence : 0.3769
## Balanced Accuracy : 0.7739
##
## 'Positive' Class : 0
##
#Accuracy on Test data
predicted_test_logistic<-round(predict(crash_model_logistic,test, type="response"),0)
confusionMatrix(predicted_test_logistic,test$Fatal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2766 781
## 1 1126 4392
##
## Accuracy : 0.7896
## 95% CI : (0.7811, 0.798)
## No Information Rate : 0.5707
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5659
## Mcnemar's Test P-Value : 3.343e-15
##
## Sensitivity : 0.7107
## Specificity : 0.8490
## Pos Pred Value : 0.7798
## Neg Pred Value : 0.7959
## Prevalence : 0.4293
## Detection Rate : 0.3051
## Detection Prevalence : 0.3913
## Balanced Accuracy : 0.7799
##
## 'Positive' Class : 0
##
###Decision Tree
library(rpart)
crash_model_ds<-rpart(Fatal ~ ., method="class", data=train)
#Training accuracy
predicted_y<-predict(crash_model_ds, type="class")
table(predicted_y)
## predicted_y
## 0 1
## 5544 9565
confusionMatrix(predicted_y,train$Fatal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4705 839
## 1 1611 7954
##
## Accuracy : 0.8378
## 95% CI : (0.8319, 0.8437)
## No Information Rate : 0.582
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6609
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7449
## Specificity : 0.9046
## Pos Pred Value : 0.8487
## Neg Pred Value : 0.8316
## Prevalence : 0.4180
## Detection Rate : 0.3114
## Detection Prevalence : 0.3669
## Balanced Accuracy : 0.8248
##
## 'Positive' Class : 0
##
#Accuracy on Test data
predicted_test_ds<-predict(crash_model_ds, test, type="class")
confusionMatrix(predicted_test_ds,test$Fatal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2884 454
## 1 1008 4719
##
## Accuracy : 0.8387
## 95% CI : (0.831, 0.8462)
## No Information Rate : 0.5707
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.665
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7410
## Specificity : 0.9122
## Pos Pred Value : 0.8640
## Neg Pred Value : 0.8240
## Prevalence : 0.4293
## Detection Rate : 0.3181
## Detection Prevalence : 0.3682
## Balanced Accuracy : 0.8266
##
## 'Positive' Class : 0
##
####SVM Model
library(e1071)
pc <- proc.time()
crash_model_svm <- svm(Fatal ~ . , type="C", data = train)
proc.time() - pc
## user system elapsed
## 89.49 0.13 92.84
summary(crash_model_svm)
##
## Call:
## svm(formula = Fatal ~ ., data = train, type = "C")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.04545455
##
## Number of Support Vectors: 6992
##
## ( 3582 3410 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
#Confusion Matrix
library(caret)
label_predicted<-predict(crash_model_svm, type = "class")
confusionMatrix(label_predicted,train$Fatal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4811 538
## 1 1505 8255
##
## Accuracy : 0.8648
## 95% CI : (0.8592, 0.8702)
## No Information Rate : 0.582
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.716
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7617
## Specificity : 0.9388
## Pos Pred Value : 0.8994
## Neg Pred Value : 0.8458
## Prevalence : 0.4180
## Detection Rate : 0.3184
## Detection Prevalence : 0.3540
## Balanced Accuracy : 0.8503
##
## 'Positive' Class : 0
##
#Out of time validation with test data
predicted_test_svm<-predict(crash_model_svm, newdata =test[,-1] , type = "class")
confusionMatrix(predicted_test_svm,test[,1])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2933 399
## 1 959 4774
##
## Accuracy : 0.8502
## 95% CI : (0.8427, 0.8575)
## No Information Rate : 0.5707
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6887
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7536
## Specificity : 0.9229
## Pos Pred Value : 0.8803
## Neg Pred Value : 0.8327
## Prevalence : 0.4293
## Detection Rate : 0.3236
## Detection Prevalence : 0.3676
## Balanced Accuracy : 0.8382
##
## 'Positive' Class : 0
##
####Ensemble Model
#The decision tree and SVM predictions are factors, so as.numeric() maps them to 1 & 2
#Shift the logistic predictions (0 & 1) up by 1 to put all three models on the same 1/2 scale
predicted_test_logistic1<-predicted_test_logistic+1
Ens_predicted_data<-data.frame(lg=as.numeric(predicted_test_logistic1),ds=as.numeric(predicted_test_ds), svm=as.numeric(predicted_test_svm))
#Majority vote: a sum above 4.5 means at least two of the three models predicted 'Fatal' (coded 2)
Ens_predicted_data$final<-ifelse(Ens_predicted_data$lg+Ens_predicted_data$ds+Ens_predicted_data$svm<4.5,0,1)
table(Ens_predicted_data$final)
##
## 0 1
## 3340 5725
##Ensemble Model accuracy on test data
confusionMatrix(Ens_predicted_data$final,test[,1])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2878 462
## 1 1014 4711
##
## Accuracy : 0.8372
## 95% CI : (0.8294, 0.8447)
## No Information Rate : 0.5707
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6618
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7395
## Specificity : 0.9107
## Pos Pred Value : 0.8617
## Neg Pred Value : 0.8229
## Prevalence : 0.4293
## Detection Rate : 0.3175
## Detection Prevalence : 0.3685
## Balanced Accuracy : 0.8251
##
## 'Positive' Class : 0
##
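The same majority vote can be written a little more directly by putting all three predictions on a common 0/1 scale before voting; a sketch that should reproduce Ens_predicted_data$final on this data (votes and final_vote are illustrative names):
#Map the factor predictions back to 0/1 and keep the logistic predictions as they are
votes <- data.frame(lg=predicted_test_logistic, ds=as.numeric(as.character(predicted_test_ds)), svm=as.numeric(as.character(predicted_test_svm)))
#A row sum of 2 or more means at least two of the three models predicted 'Fatal'
final_vote <- ifelse(rowSums(votes) >= 2, 1, 0)
confusionMatrix(final_vote, test$Fatal)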