• No products in the cart.

203.5.5 Practice : Implementing Intermediate outputs in R

Putting intermediate outputs into practice.

In previous section, we studied about  Issue with Non Linear Decision Boundary

In this post we will learn how to implement the concept of intermediate outputs using R. We will cover many things in this session.

  • Dataset: Emp_Productivity/ Emp_Productivity_All_Sites.csv
  • Filter the data and take first 74 observations from above dataset . Filter condition is Sample_Set<3
  • Build a logistic regression model to predict Productivity using age and experience
  • Calculate the prediction probabilities for all the inputs. Store the probabilities in inter1 variable
  • Filter the data and take observations from row 34 onwards. Filter condition is Sample_Set<1
  • Build a logistic regression model to predict Productivity using age and experience
  • Calculate the prediction probabilities for all the inputs. Store the probabilities in inter2 variable
  • Build a consolidated model to predict productivity using inter-1 and inter-2 variables
  • Create the confusion matrix and find the accuracy and error rates for the consolidated model

Our sampled data Emp_Productivity1 has first 74 observations. Lets build the model on this sample data(sample-1)

####The clasification graph Sample-1
library(ggplot2)
ggplot(Emp_Productivity1)+geom_point(aes(x=Age,y=Experience,color=factor(Productivity),shape=factor(Productivity)),size=5)

###Logistic Regerssion model1
Emp_Productivity_logit<-glm(Productivity~Age+Experience,data=Emp_Productivity1, family=binomial())

coef(Emp_Productivity_logit)
## (Intercept)         Age  Experience 
##  -8.9361114   0.2762749   0.5923444
slope1 <- coef(Emp_Productivity_logit)[2]/(-coef(Emp_Productivity_logit)[3])
intercept1 <- coef(Emp_Productivity_logit)[1]/(-coef(Emp_Productivity_logit)[3]) 
####Decision boundary for model1 built on Sample-1
library(ggplot2)
base<-ggplot(Emp_Productivity1)+geom_point(aes(x=Age,y=Experience,color=factor(Productivity),shape=factor(Productivity)),size=5)
base+geom_abline(intercept = intercept1 , slope = slope1, color = "red", size = 2)

#Base is the scatter plot. Then we are adding the decision boundary
#Filter the data and take observations from row 34 onwards. 
Emp_Productivity2<-Emp_Productivity_raw[Emp_Productivity_raw$Sample_Set>1,]
####The clasification graph
library(ggplot2)
ggplot(Emp_Productivity2)+geom_point(aes(x=Age,y=Experience,color=factor(Productivity),shape=factor(Productivity)),size=5)

###Logistic Regerssion model2 built on Sample2
Emp_Productivity_logit2<-glm(Productivity~Age+Experience, data=Emp_Productivity2, family=binomial())
Emp_Productivity_logit2
## 
## Call:  glm(formula = Productivity ~ Age + Experience, family = binomial(), 
##     data = Emp_Productivity2)
## 
## Coefficients:
## (Intercept)          Age   Experience  
##     16.3184      -0.3994      -0.2440  
## 
## Degrees of Freedom: 85 Total (i.e. Null);  83 Residual
## Null Deviance:       119 
## Residual Deviance: 34.08     AIC: 40.08
coef(Emp_Productivity_logit2)
## (Intercept)         Age  Experience 
##  16.3183916  -0.3994172  -0.2439643
slope3 <- coef(Emp_Productivity_logit2)[2]/(-coef(Emp_Productivity_logit2)[3])
intercept3 <- coef(Emp_Productivity_logit2)[1]/(-coef(Emp_Productivity_logit2)[3]) 
####Drawing the Decison boundry
library(ggplot2)
base<-ggplot(Emp_Productivity2)+geom_point(aes(x=Age,y=Experience,color=factor(Productivity),shape=factor(Productivity)),size=5)
base+geom_abline(intercept = intercept3 , slope = slope3, color = "red", size = 2) 

####Accuracy of the model2
predicted_values<-round(predict(Emp_Productivity_logit2,type="response"),0)
conf_matrix<-table(predicted_values,Emp_Productivity_logit2$y)
conf_matrix
##                 
## predicted_values  0  1
##                0 43  2
##                1  2 39
accuracy<-(conf_matrix[1,1]+conf_matrix[2,2])/(sum(conf_matrix))
accuracy
## [1] 0.9534884
#Calculate the prediction probabilities for all the inputs. Store the probabilities in inter1 variable
Emp_Productivity_raw$inter1<-predict(Emp_Productivity_logit,type="response", newdata=Emp_Productivity_raw)

#Calculate the prediction probabilities for all the inputs. Store the probabilities in inter2 variable
Emp_Productivity_raw$inter2<-predict(Emp_Productivity_logit2,type="response", newdata=Emp_Productivity_raw)

head(Emp_Productivity_raw)
##    Age Experience Productivity Sample_Set     inter1    inter2
## 1 20.0        2.3            0          1 0.11423230 0.9995775
## 2 16.2        2.2            0          1 0.04080461 0.9999096
## 3 20.2        1.8            0          1 0.09202657 0.9995949
## 4 18.8        1.4            0          1 0.05152147 0.9997899
## 5 18.9        3.2            0          1 0.13955234 0.9996608
## 6 16.7        3.9            0          1 0.11793035 0.9998329
####Clasification graph with the two new coloumns
library(ggplot2)
ggplot(Emp_Productivity_raw)+geom_point(aes(x=inter1,y=inter2,color=factor(Productivity),shape=factor(Productivity)),size=5)

###Logistic Regerssion model with Intermediate outputs as input
Emp_Productivity_logit_combined<-glm(Productivity~inter1+inter2,data=Emp_Productivity_raw, family=binomial())
Emp_Productivity_logit_combined
## 
## Call:  glm(formula = Productivity ~ inter1 + inter2, family = binomial(), 
##     data = Emp_Productivity_raw)
## 
## Coefficients:
## (Intercept)       inter1       inter2  
##     -12.213        8.019        8.598  
## 
## Degrees of Freedom: 118 Total (i.e. Null);  116 Residual
## Null Deviance:       155.7 
## Residual Deviance: 49.74     AIC: 55.74
slope4 <- coef(Emp_Productivity_logit_combined)[2]/(-coef(Emp_Productivity_logit_combined)[3])
intercept4<- coef(Emp_Productivity_logit_combined)[1]/(-coef(Emp_Productivity_logit_combined)[3]) 

####Drawing the Decison boundry
library(ggplot2)
base<-ggplot(Emp_Productivity_raw)+geom_point(aes(x=inter1,y=inter2,color=factor(Productivity),shape=factor(Productivity)),size=5)
base+geom_abline(intercept = intercept4 , slope = slope4, colour = "red", size = 2) 

####Accuracy of the combined
predicted_values<-round(predict(Emp_Productivity_logit_combined,type="response"),0)
conf_matrix<-table(predicted_values,Emp_Productivity_logit_combined$y)
conf_matrix
##                 
## predicted_values  0  1
##                0 74  4
##                1  2 39
accuracy<-(conf_matrix[1,1]+conf_matrix[2,2])/(sum(conf_matrix))
accuracy
## [1] 0.9495798

We got an accuracy of 94.95% with an Intermediate model.

The next post is about Neural Network Intuition.

Statinfer

Statinfer derived from Statistical inference. We provide training in various Data Analytics and Data Science courses and assist candidates in securing placements.

Contact Us

info@statinfer.com

+91- 9676098897

+91- 9494762485

 

Our Social Links

top
© 2020. All Rights Reserved.