Skip to content

Identified the optimal order of preprocessing for the Pima Diabetes Dataset, and how feature selection and upsampling affect the evaluation metrics

Notifications You must be signed in to change notification settings

TejaswiniKatale/Prediction_of_Diabetes_using_R

Folders and files

Name
Last commit message
Last commit date

Latest commit

 

History

3 Commits
 
 

Repository files navigation

Prediction_of_Diabetes_using_R

### 1) Without Feature selection and then Upsampling ----
library(caTools)

# Load the dataset. Forward slashes are used because backslash escapes like
# "\E" are invalid in R strings (the original path did not parse), and the
# file name had been garbled into "DiabetesDiabetes_1.csv _1.csv" (sections
# 2 and 3 show the intended name, Diabetes_1.csv).
f <- read.csv("D:/Excel/Diabetes_1.csv")
f <- f[, c(-1, -2)]  # drop the first two (identifier) columns

# Data cleaning ----
# Remove duplicate rows
f <- unique(f)

# Fix and standardize categorical codes (vectorized; replaces the original
# per-row for-loops, same resulting integer codes).
# 1. Gender: 'M' -> 0, anything else -> 1
f$Gender <- ifelse(f$Gender == "M", 0L, 1L)
# 2. CLASS: 'N' -> 1, 'P' -> 2, anything else (i.e. 'Y') -> 3
f$CLASS <- ifelse(f$CLASS == "N", 1L,
                  ifelse(f$CLASS == "P", 2L, 3L))

# Dividing into training and testing data ----
# sample.split() expects the outcome vector (it stratifies on the labels);
# passing the whole data frame, as the original did, yields a split vector of
# length ncol(f) that gets recycled across rows, which is not a valid split.
set.seed(123)
split <- sample.split(f$CLASS, SplitRatio = 0.7)
tr_data <- subset(f, split == TRUE)
ts_data <- subset(f, split == FALSE)

# DECISION TREE ----
library(rpart)
model2 <- rpart(CLASS ~ ., data = tr_data, method = "class")

# plot(model2, uniform = TRUE, main = "Classification Tree for Diabetes")
# text(model2, use.n = TRUE, all = TRUE, cex = .8)

treePred <- predict(model2, ts_data, type = "class")
# Confusion matrix with rows = actual, columns = predicted -- the same
# orientation as the KNN and random-forest matrices below, so the shared
# precision/recall formulas mean the same thing. (The original
# table(treePred, ts_data$CLASS) had predicted on the rows, which silently
# swapped precision and recall for the decision tree only.)
tb1 <- table(actual = ts_data$CLASS, predicted = treePred)
cat("\n")
cat("\nDecison tree :")

# Accuracy
accuracy_Test <- sum(diag(tb1)) / sum(tb1)
cat("\n    Accuracy :", accuracy_Test)
acc_vector <- c(accuracy_Test)

# Precision: true positives / predicted (column) totals
pre91 <- tb1[1, 1] / sum(tb1[, 1])
cat("\n    Precision of Class N :", pre91)
pre92 <- tb1[2, 2] / sum(tb1[, 2])
cat("\n    Precision of Class P :", pre92)
pre93 <- tb1[3, 3] / sum(tb1[, 3])
cat("\n    Precision of Class Y :", pre93)

prec_n <- c(pre91)
prec_p <- c(pre92)
prec_y <- c(pre93)

# Recall: true positives / actual (row) totals
rec91 <- tb1[1, 1] / sum(tb1[1, ])
cat("\n    Recall of Class N :", rec91)
rec92 <- tb1[2, 2] / sum(tb1[2, ])
cat("\n    Recall of Class P :", rec92)
rec93 <- tb1[3, 3] / sum(tb1[3, ])
cat("\n    Recall of Class Y :", rec93, "\n")

rec_n <- c(rec91)
rec_p <- c(rec92)
rec_y <- c(rec93)

# KNN ----

tr1_data <- tr_data
ts1_data <- ts_data

# Normalize the numeric features (columns 2:11; column 1 is Gender, column 12
# is CLASS). The test set is scaled with the TRAINING means/SDs: scaling each
# set with its own statistics (as the original did) puts train and test on
# different scales and leaks test-set information into preprocessing.
tr_scaled <- scale(tr1_data[, 2:11])
tr1_data[, 2:11] <- tr_scaled
ts1_data[, 2:11] <- scale(ts1_data[, 2:11],
                          center = attr(tr_scaled, "scaled:center"),
                          scale = attr(tr_scaled, "scaled:scale"))

# Apply knn with k = 3; column 12 (CLASS) is excluded from the feature matrix
library(class)
y_pred <- knn(train = tr1_data[-12], test = ts1_data[-12],
              cl = tr1_data[, 12], k = 3)

# Confusion matrix: rows = actual, columns = predicted
cf1 <- table(actual = ts1_data$CLASS, predicted = y_pred)

cat("\nKNN :")
# Accuracy
acc <- sum(diag(cf1)) / sum(cf1)
cat("\n    Accuracy :", acc)
acc_vector <- c(acc_vector, acc)

# Precision: true positives / predicted (column) totals
pre1 <- cf1[1, 1] / sum(cf1[, 1])
cat("\n    Precision of Class N :", pre1)
pre2 <- cf1[2, 2] / sum(cf1[, 2])
cat("\n    Precision of Class P :", pre2)
pre3 <- cf1[3, 3] / sum(cf1[, 3])
cat("\n    Precision of Class Y :", pre3)

prec_n <- c(prec_n, pre1)
prec_p <- c(prec_p, pre2)
prec_y <- c(prec_y, pre3)

# Recall: true positives / actual (row) totals
rec1 <- cf1[1, 1] / sum(cf1[1, ])
cat("\n    Recall of Class N :", rec1)
rec2 <- cf1[2, 2] / sum(cf1[2, ])
cat("\n    Recall of Class P :", rec2)
rec3 <- cf1[3, 3] / sum(cf1[3, ])
cat("\n    Recall of Class Y :", rec3, "\n")

rec_n <- c(rec_n, rec1)
rec_p <- c(rec_p, rec2)
rec_y <- c(rec_y, rec3)

# RANDOM FOREST ----
library("randomForest")

ts2_data <- ts_data
tr2_data <- tr_data
set.seed(345)

# Fit a CLASSIFICATION forest: CLASS is a 3-level label, so train on it as a
# factor and predict classes directly. (The original fit a regression forest
# on the integer codes and then binned the numeric predictions into 1/2/3
# with hand-picked 1.5/2.5 cutoffs.)
rfm <- randomForest(as.factor(CLASS) ~ ., data = tr2_data, ntree = 10)
y_pred1 <- as.integer(as.character(predict(rfm, ts2_data)))

# Confusion matrix: rows = actual, columns = predicted.
# NOTE(review): the [2,2]/[3,3] indexing below assumes every class appears in
# both actual and predicted values (same assumption as the original code).
cf2 <- table(actual = ts2_data$CLASS, predicted = y_pred1)

cat("\nRANDOM FOREST:")

# Accuracy
acc2 <- sum(diag(cf2)) / sum(cf2)
cat("\n    Accuracy :", acc2)
acc_vector <- c(acc_vector, acc2)

# Precision: true positives / predicted (column) totals
pre12 <- cf2[1, 1] / sum(cf2[, 1])
cat("\n    Precision of Class N :", pre12)
pre22 <- cf2[2, 2] / sum(cf2[, 2])
cat("\n    Precision of Class P :", pre22)
pre33 <- cf2[3, 3] / sum(cf2[, 3])
cat("\n    Precision of Class Y :", pre33)

prec_n <- c(prec_n, pre12)
prec_p <- c(prec_p, pre22)
prec_y <- c(prec_y, pre33)

# Recall: true positives / actual (row) totals
rec12 <- cf2[1, 1] / sum(cf2[1, ])
cat("\n    Recall of Class N :", rec12)
rec22 <- cf2[2, 2] / sum(cf2[2, ])
cat("\n    Recall of Class P :", rec22)
rec33 <- cf2[3, 3] / sum(cf2[3, ])
cat("\n    Recall of Class Y :", rec33)
cat("\n")
cat("\n")
cat("\n")

rec_n <- c(rec_n, rec12)
rec_p <- c(rec_p, rec22)
rec_y <- c(rec_y, rec33)

# Visualization ----
# (The original bare "Visualization---" line was not a comment; it parsed as
# an arithmetic expression referencing an undefined object and would error.)

# Accuracy of the three models, in the order they were evaluated
plot(acc_vector, type = "o", col = "red", xlab = "Algorithms",
     ylab = "Accuracy", xaxt = "n", lwd = 5, cex.lab = 1.5)
axis(1, at = c(1, 2, 3), labels = c("DT", "KNN", "RF"), cex.axis = 2)

# Per-class precision across the three models
plot(prec_n, type = "o", col = "red", xlab = "Algorithms",
     ylab = "Precision", ylim = c(0, 1), xaxt = "n", lwd = 5, cex.lab = 1.5)
lines(prec_p, type = "o", col = "blue", lwd = 5)
lines(prec_y, type = "o", col = "green", lwd = 5)
legend(1, 0.55, legend = c("Class N", "Class P", "Class Y"),
       fill = c("red", "blue", "green"), cex = 1)
axis(1, at = c(1, 2, 3), labels = c("DT", "KNN", "RF"), cex.axis = 2)

# Per-class recall across the three models
plot(rec_n, type = "o", col = "red", xlab = "Algorithms",
     ylab = "Recall", ylim = c(0, 1), xaxt = "n", lwd = 5, cex.lab = 1.5)
lines(rec_p, type = "o", col = "blue", lwd = 5)
lines(rec_y, type = "o", col = "green", lwd = 5)
legend(1, 0.55, legend = c("Class N", "Class P", "Class Y"),
       fill = c("red", "blue", "green"), cex = 1)
axis(1, at = c(1, 2, 3), labels = c("DT", "KNN", "RF"), cex.axis = 2)

### 2) With Upsampling then Feature Selection ----
library(caTools)

# Forward slashes: "\E" and "\D" are invalid escape sequences in R strings,
# so the original backslash path did not even parse.
f <- read.csv("D:/Excel/Diabetes_1.csv")
f <- f[, c(-1, -2)]  # drop the first two (identifier) columns

# Data cleaning ----
# Remove duplicate rows
f <- unique(f)

# Fix and standardize categorical codes (vectorized; same integer codes as
# the original per-row loops)
f$Gender <- ifelse(f$Gender == "M", 0L, 1L)
f$CLASS <- ifelse(f$CLASS == "N", 1L,
                  ifelse(f$CLASS == "P", 2L, 3L))

# Upsampling ----
# Resample the two smaller classes (coded 1 = N, 2 = P) with replacement up
# to the size of class 3 (Y) so all three classes are balanced.
N <- which(f$CLASS == 1)
P <- which(f$CLASS == 2)
Y <- which(f$CLASS == 3)
length(N)
length(P)
length(Y)

set.seed(121)
Up.sampling1 <- sample(N, length(Y), replace = TRUE)
Up.sampling2 <- sample(P, length(Y), replace = TRUE)
f <- f[c(Up.sampling1, Up.sampling2, Y), ]

# Feature selection ----
# NOTE(review): CLASS is integer-coded here, so Boruta treats this as a
# regression target; confirm that is intended (a factor target would run
# Boruta in classification mode and could select different features).
library(Boruta)
set.seed(1234)
imp_features <- Boruta(CLASS ~ ., data = f, doTrace = 2)
print(imp_features)

# Removing unimportant features
# No unimportant features found

# Dividing into training and testing data ----
# sample.split() expects the outcome vector; passing the whole data frame
# (as the original did) recycles a column-length split vector across rows.
set.seed(123)
split <- sample.split(f$CLASS, SplitRatio = 0.7)
tr_data <- subset(f, split == TRUE)
ts_data <- subset(f, split == FALSE)

# DECISION TREE ----
library(rpart)
model2 <- rpart(CLASS ~ ., data = tr_data, method = "class")

# plot(model2, uniform = TRUE, main = "Classification Tree for Diabetes")
# text(model2, use.n = TRUE, all = TRUE, cex = .8)

treePred <- predict(model2, ts_data, type = "class")
# Rows = actual, columns = predicted -- same orientation as the KNN and RF
# matrices, so the shared formulas agree. (The original orientation swapped
# precision and recall for the decision tree.)
tb1 <- table(actual = ts_data$CLASS, predicted = treePred)
cat("\n")
cat("\nDecison tree :")

# Accuracy
accuracy_Test <- sum(diag(tb1)) / sum(tb1)
cat("\n    Accuracy :", accuracy_Test)
acc_vector <- c(accuracy_Test)

# Precision: true positives / predicted (column) totals
pre91 <- tb1[1, 1] / sum(tb1[, 1])
cat("\n    Precision of Class N :", pre91)
pre92 <- tb1[2, 2] / sum(tb1[, 2])
cat("\n    Precision of Class P :", pre92)
pre93 <- tb1[3, 3] / sum(tb1[, 3])
cat("\n    Precision of Class Y :", pre93)

prec_n <- c(pre91)
prec_p <- c(pre92)
prec_y <- c(pre93)

# Recall: true positives / actual (row) totals
rec91 <- tb1[1, 1] / sum(tb1[1, ])
cat("\n    Recall of Class N :", rec91)
rec92 <- tb1[2, 2] / sum(tb1[2, ])
cat("\n    Recall of Class P :", rec92)
rec93 <- tb1[3, 3] / sum(tb1[3, ])
cat("\n    Recall of Class Y :", rec93, "\n")

rec_n <- c(rec91)
rec_p <- c(rec92)
rec_y <- c(rec93)

# KNN ----

tr1_data <- tr_data
ts1_data <- ts_data

# Normalize the numeric features (columns 2:11; column 1 is Gender, column 12
# is CLASS), scaling the test set with the TRAINING means/SDs so both sets
# are on the same scale (the original scaled each set independently).
tr_scaled <- scale(tr1_data[, 2:11])
tr1_data[, 2:11] <- tr_scaled
ts1_data[, 2:11] <- scale(ts1_data[, 2:11],
                          center = attr(tr_scaled, "scaled:center"),
                          scale = attr(tr_scaled, "scaled:scale"))

# Apply knn with k = 3; column 12 (CLASS) is excluded from the feature matrix
library(class)
y_pred <- knn(train = tr1_data[-12], test = ts1_data[-12],
              cl = tr1_data[, 12], k = 3)

# Confusion matrix: rows = actual, columns = predicted
cf1 <- table(actual = ts1_data$CLASS, predicted = y_pred)

cat("\nKNN :")
# Accuracy
acc <- sum(diag(cf1)) / sum(cf1)
cat("\n    Accuracy :", acc)
acc_vector <- c(acc_vector, acc)

# Precision: true positives / predicted (column) totals
pre1 <- cf1[1, 1] / sum(cf1[, 1])
cat("\n    Precision of Class N :", pre1)
pre2 <- cf1[2, 2] / sum(cf1[, 2])
cat("\n    Precision of Class P :", pre2)
pre3 <- cf1[3, 3] / sum(cf1[, 3])
cat("\n    Precision of Class Y :", pre3)

prec_n <- c(prec_n, pre1)
prec_p <- c(prec_p, pre2)
prec_y <- c(prec_y, pre3)

# Recall: true positives / actual (row) totals
rec1 <- cf1[1, 1] / sum(cf1[1, ])
cat("\n    Recall of Class N :", rec1)
rec2 <- cf1[2, 2] / sum(cf1[2, ])
cat("\n    Recall of Class P :", rec2)
rec3 <- cf1[3, 3] / sum(cf1[3, ])
cat("\n    Recall of Class Y :", rec3, "\n")

rec_n <- c(rec_n, rec1)
rec_p <- c(rec_p, rec2)
rec_y <- c(rec_y, rec3)

# RANDOM FOREST ----
library("randomForest")

ts2_data <- ts_data
tr2_data <- tr_data
set.seed(345)

# Fit a CLASSIFICATION forest: CLASS is a 3-level label, so train on it as a
# factor and predict classes directly (the original fit a regression forest
# and binned numeric predictions with 1.5/2.5 cutoffs).
rfm <- randomForest(as.factor(CLASS) ~ ., data = tr2_data, ntree = 10)
y_pred1 <- as.integer(as.character(predict(rfm, ts2_data)))

# Confusion matrix: rows = actual, columns = predicted.
# NOTE(review): the [2,2]/[3,3] indexing assumes every class appears in both
# actual and predicted values (same assumption as the original code).
cf2 <- table(actual = ts2_data$CLASS, predicted = y_pred1)

cat("\nRANDOM FOREST:")

# Accuracy
acc2 <- sum(diag(cf2)) / sum(cf2)
cat("\n    Accuracy :", acc2)
acc_vector <- c(acc_vector, acc2)

# Precision: true positives / predicted (column) totals
pre12 <- cf2[1, 1] / sum(cf2[, 1])
cat("\n    Precision of Class N :", pre12)
pre22 <- cf2[2, 2] / sum(cf2[, 2])
cat("\n    Precision of Class P :", pre22)
pre33 <- cf2[3, 3] / sum(cf2[, 3])
cat("\n    Precision of Class Y :", pre33)

prec_n <- c(prec_n, pre12)
prec_p <- c(prec_p, pre22)
prec_y <- c(prec_y, pre33)

# Recall: true positives / actual (row) totals
rec12 <- cf2[1, 1] / sum(cf2[1, ])
cat("\n    Recall of Class N :", rec12)
rec22 <- cf2[2, 2] / sum(cf2[2, ])
cat("\n    Recall of Class P :", rec22)
rec33 <- cf2[3, 3] / sum(cf2[3, ])
cat("\n    Recall of Class Y :", rec33)
cat("\n")
cat("\n")
cat("\n")

rec_n <- c(rec_n, rec12)
rec_p <- c(rec_p, rec22)
rec_y <- c(rec_y, rec33)

# Visualization ----
# (The original bare "Visualization---" line was not a comment; it parsed as
# an arithmetic expression referencing an undefined object and would error.
# Axis labels also said "nn" for the first algorithm, which is the decision
# tree -- relabeled "DT"/"KNN"/"RF" to match section 1.)

# Accuracy of the three models, in the order they were evaluated
plot(acc_vector, type = "o", col = "red", xlab = "Algorithms",
     ylab = "Accuracy", xaxt = "n", lwd = 5, cex.lab = 1.5)
axis(1, at = c(1, 2, 3), labels = c("DT", "KNN", "RF"), cex.axis = 2)

# Per-class precision across the three models
plot(prec_n, type = "o", col = "red", xlab = "Algorithms",
     ylab = "Precision", ylim = c(0.7, 1.1), xaxt = "n", lwd = 2,
     cex.lab = 1.5)
lines(prec_p, type = "o", col = "blue", lwd = 2)
lines(prec_y, type = "o", col = "green", lwd = 2)
legend(1, 0.92, legend = c("Class N", "Class P", "Class Y"),
       fill = c("red", "blue", "green"), cex = 1)
axis(1, at = c(1, 2, 3), labels = c("DT", "KNN", "RF"), cex.axis = 2)

# Per-class recall across the three models
plot(rec_n, type = "o", col = "red", xlab = "Algorithms",
     ylab = "Recall", ylim = c(0.7, 1.1), xaxt = "n", lwd = 2, cex.lab = 1.5)
lines(rec_p, type = "o", col = "blue", lwd = 2)
lines(rec_y, type = "o", col = "green", lwd = 2)
legend(1, 0.93, legend = c("Class N", "Class P", "Class Y"),
       fill = c("red", "blue", "green"), cex = 1)
axis(1, at = c(1, 2, 3), labels = c("DT", "KNN", "RF"), cex.axis = 2)
  
  
  
  ### 3) With feature selection then Upsampling ----
library(caTools)

# Forward slashes: "\E" and "\D" are invalid escape sequences in R strings,
# so the original backslash path did not even parse.
f <- read.csv("D:/Excel/Diabetes_1.csv")
f <- f[, c(-1, -2)]  # drop the first two (identifier) columns

# Data cleaning ----
# Remove duplicate rows
f <- unique(f)

# Fix and standardize categorical codes (vectorized; same integer codes as
# the original per-row loops)
f$Gender <- ifelse(f$Gender == "M", 0L, 1L)
f$CLASS <- ifelse(f$CLASS == "N", 1L,
                  ifelse(f$CLASS == "P", 2L, 3L))

# Feature selection ----
library(Boruta)
set.seed(1234)
imp_features <- Boruta(CLASS ~ ., data = f, doTrace = 2)
print(imp_features)

# Removing unimportant features
# Unimportant features found: Gender, HDL
f$Gender <- NULL
f$HDL <- NULL

# Upsampling ----
# Resample the two smaller classes (coded 1 = N, 2 = P) with replacement up
# to the size of class 3 (Y) so all three classes are balanced.
N <- which(f$CLASS == 1)
P <- which(f$CLASS == 2)
Y <- which(f$CLASS == 3)
length(N)
length(P)
length(Y)

set.seed(121)
Up.sampling1 <- sample(N, length(Y), replace = TRUE)
Up.sampling2 <- sample(P, length(Y), replace = TRUE)
f <- f[c(Up.sampling1, Up.sampling2, Y), ]

# Dividing into training and testing data ----
# sample.split() expects the outcome vector; passing the whole data frame
# (as the original did) recycles a column-length split vector across rows.
set.seed(123)
split <- sample.split(f$CLASS, SplitRatio = 0.7)
tr_data <- subset(f, split == TRUE)
ts_data <- subset(f, split == FALSE)

# DECISION TREE ----
library(rpart)
model2 <- rpart(CLASS ~ ., data = tr_data, method = "class")

# plot(model2, uniform = TRUE, main = "Classification Tree for Diabetes")
# text(model2, use.n = TRUE, all = TRUE, cex = .8)

treePred <- predict(model2, ts_data, type = "class")
# Rows = actual, columns = predicted -- same orientation as the KNN and RF
# matrices, so the shared formulas agree. (The original orientation swapped
# precision and recall for the decision tree.)
tb1 <- table(actual = ts_data$CLASS, predicted = treePred)
cat("\n")
cat("\nDecison tree :")

# Accuracy
accuracy_Test <- sum(diag(tb1)) / sum(tb1)
cat("\n    Accuracy :", accuracy_Test)
acc_vector <- c(accuracy_Test)

# Precision: true positives / predicted (column) totals
pre91 <- tb1[1, 1] / sum(tb1[, 1])
cat("\n    Precision of Class N :", pre91)
pre92 <- tb1[2, 2] / sum(tb1[, 2])
cat("\n    Precision of Class P :", pre92)
pre93 <- tb1[3, 3] / sum(tb1[, 3])
cat("\n    Precision of Class Y :", pre93)

prec_n <- c(pre91)
prec_p <- c(pre92)
prec_y <- c(pre93)

# Recall: true positives / actual (row) totals
rec91 <- tb1[1, 1] / sum(tb1[1, ])
cat("\n    Recall of Class N :", rec91)
rec92 <- tb1[2, 2] / sum(tb1[2, ])
cat("\n    Recall of Class P :", rec92)
rec93 <- tb1[3, 3] / sum(tb1[3, ])
cat("\n    Recall of Class Y :", rec93, "\n")

rec_n <- c(rec91)
rec_p <- c(rec92)
rec_y <- c(rec93)

# KNN ----

tr1_data <- tr_data
ts1_data <- ts_data

# Normalize all feature columns (1:9 -- Gender and HDL were removed by the
# feature-selection step, so every remaining predictor is numeric; column 10
# is CLASS). The test set is scaled with the TRAINING means/SDs so both sets
# are on the same scale (the original scaled each set independently).
tr_scaled <- scale(tr1_data[, 1:9])
tr1_data[, 1:9] <- tr_scaled
ts1_data[, 1:9] <- scale(ts1_data[, 1:9],
                         center = attr(tr_scaled, "scaled:center"),
                         scale = attr(tr_scaled, "scaled:scale"))

# Apply knn with k = 3; column 10 (CLASS) is excluded from the feature matrix
library(class)
y_pred <- knn(train = tr1_data[-10], test = ts1_data[-10],
              cl = tr1_data[, 10], k = 3)

# Confusion matrix: rows = actual, columns = predicted
cf1 <- table(actual = ts1_data$CLASS, predicted = y_pred)

cat("\nKNN :")
# Accuracy
acc <- sum(diag(cf1)) / sum(cf1)
cat("\n    Accuracy :", acc)
acc_vector <- c(acc_vector, acc)

# Precision: true positives / predicted (column) totals
pre1 <- cf1[1, 1] / sum(cf1[, 1])
cat("\n    Precision of Class N :", pre1)
pre2 <- cf1[2, 2] / sum(cf1[, 2])
cat("\n    Precision of Class P :", pre2)
pre3 <- cf1[3, 3] / sum(cf1[, 3])
cat("\n    Precision of Class Y :", pre3)

prec_n <- c(prec_n, pre1)
prec_p <- c(prec_p, pre2)
prec_y <- c(prec_y, pre3)

# Recall: true positives / actual (row) totals
rec1 <- cf1[1, 1] / sum(cf1[1, ])
cat("\n    Recall of Class N :", rec1)
rec2 <- cf1[2, 2] / sum(cf1[2, ])
cat("\n    Recall of Class P :", rec2)
rec3 <- cf1[3, 3] / sum(cf1[3, ])
cat("\n    Recall of Class Y :", rec3, "\n")

rec_n <- c(rec_n, rec1)
rec_p <- c(rec_p, rec2)
rec_y <- c(rec_y, rec3)

# RANDOM FOREST ----
library("stats")
library("dplyr")
library("randomForest")

ts2_data <- ts_data
tr2_data <- tr_data
set.seed(345)

# Fit a CLASSIFICATION forest: CLASS is a 3-level label, so train on it as a
# factor and predict classes directly (the original fit a regression forest
# and binned numeric predictions with 1.5/2.5 cutoffs).
rfm <- randomForest(as.factor(CLASS) ~ ., data = tr2_data, ntree = 10)
y_pred1 <- as.integer(as.character(predict(rfm, ts2_data)))

# Confusion matrix: rows = actual, columns = predicted.
# NOTE(review): the [2,2]/[3,3] indexing assumes every class appears in both
# actual and predicted values (same assumption as the original code).
cf2 <- table(actual = ts2_data$CLASS, predicted = y_pred1)

cat("\nRANDOM FOREST:")

# Accuracy
acc2 <- sum(diag(cf2)) / sum(cf2)
cat("\n    Accuracy :", acc2)
acc_vector <- c(acc_vector, acc2)

# Precision: true positives / predicted (column) totals
pre12 <- cf2[1, 1] / sum(cf2[, 1])
cat("\n    Precision of Class N :", pre12)
pre22 <- cf2[2, 2] / sum(cf2[, 2])
cat("\n    Precision of Class P :", pre22)
pre33 <- cf2[3, 3] / sum(cf2[, 3])
cat("\n    Precision of Class Y :", pre33)

prec_n <- c(prec_n, pre12)
prec_p <- c(prec_p, pre22)
prec_y <- c(prec_y, pre33)

# Recall: true positives / actual (row) totals
rec12 <- cf2[1, 1] / sum(cf2[1, ])
cat("\n    Recall of Class N :", rec12)
rec22 <- cf2[2, 2] / sum(cf2[2, ])
cat("\n    Recall of Class P :", rec22)
rec33 <- cf2[3, 3] / sum(cf2[3, ])
cat("\n    Recall of Class Y :", rec33)
cat("\n")
cat("\n")
cat("\n")

rec_n <- c(rec_n, rec12)
rec_p <- c(rec_p, rec22)
rec_y <- c(rec_y, rec33)

# Visualization ----
# (The original bare "Visualization---" line was not a comment; it parsed as
# an arithmetic expression referencing an undefined object and would error.
# Axis labels also said "nn" for the first algorithm, which is the decision
# tree -- relabeled "DT"/"KNN"/"RF" to match section 1.)

# Accuracy of the three models, in the order they were evaluated
plot(acc_vector, type = "o", col = "red", xlab = "Algorithms",
     ylab = "Accuracy", xaxt = "n", lwd = 5, cex.lab = 1.5)
axis(1, at = c(1, 2, 3), labels = c("DT", "KNN", "RF"), cex.axis = 2)

# Per-class precision across the three models
plot(prec_n, type = "o", col = "red", xlab = "Algorithms",
     ylab = "Precision", ylim = c(0.7, 1.1), xaxt = "n", lwd = 5,
     cex.lab = 1.5)
lines(prec_p, type = "o", col = "blue", lwd = 5)
lines(prec_y, type = "o", col = "green", lwd = 5)
legend(1, 0.92, legend = c("Class N", "Class P", "Class Y"),
       fill = c("red", "blue", "green"), cex = 1)
axis(1, at = c(1, 2, 3), labels = c("DT", "KNN", "RF"), cex.axis = 2)

# Per-class recall across the three models
plot(rec_n, type = "o", col = "red", xlab = "Algorithms",
     ylab = "Recall", ylim = c(0.7, 1.1), xaxt = "n", lwd = 5, cex.lab = 1.5)
lines(rec_p, type = "o", col = "blue", lwd = 5)
lines(rec_y, type = "o", col = "green", lwd = 5)
legend(1, 0.93, legend = c("Class N", "Class P", "Class Y"),
       fill = c("red", "blue", "green"), cex = 1)
axis(1, at = c(1, 2, 3), labels = c("DT", "KNN", "RF"), cex.axis = 2)

About

Identified the optimal order of preprocessing for the Pima Diabetes Dataset, and how feature selection and upsampling affect the evaluation metrics

Topics

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published