-
Notifications
You must be signed in to change notification settings - Fork 1
/
King_county.R
179 lines (140 loc) · 6.42 KB
/
King_county.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
library(pacman)
library(readxl)
library(dplyr)
library(ggplot2)
library(ggiraph)
library(ggiraphExtra)
library(plyr)
library(caret)
#install.packages("move")
#library(move)
# Load the raw data ----
# The CSV file is picked interactively through the system file chooser.
data_raw <- read.csv(file.choose())

# First look at the data ----
class(data_raw)
head(data_raw)
str(data_raw)
glimpse(data_raw)

# All cleaning happens on a copy; data_raw keeps the untouched import.
data_clean <- data_raw

# Summary statistics for every column
summary(data_clean)

# Keep only the first eight characters of the date and store them as a number
data_clean$date <- as.numeric(substr(data_clean$date, 1, 8))
head(data_clean)
str(data_clean)

# Total number of missing cells in the data frame
sum(is.na(data_clean))

# Drop the id and date columns so they do not enter the models
data_clean[["id"]] <- NULL
data_clean[["date"]] <- NULL
# Data visualisation ----
# Scatter plots of price against individual predictors.
ggplot(data = data_clean, aes(x = sqft_living, y = price)) +
  geom_point() +
  ggtitle("Prices According to Square feet")

ggplot(data = data_clean, aes(x = bathrooms, y = price)) +
  geom_point() +
  ggtitle("Prices According to Bathrooms")

# geom_point() draws shape 19 by default, which has no fill, so mapping
# `fill` had no visible effect. Map `colour` instead, and treat waterfront
# as a factor so each level gets its own discrete colour.
ggplot(
  data = data_clean,
  aes(x = waterfront, y = price, colour = factor(waterfront))
) +
  geom_point() +
  labs(colour = "waterfront") +
  ggtitle("Prices According to WaterFront")
# Skewness check ----
# Compute the skewness of each of the first 19 columns, then log-transform
# price and the two lot-size columns.
# install.packages("moments")
library(moments)
vapply(data_clean[, 1:19], skewness, numeric(1), na.rm = TRUE)

data_clean <- transform(
  data_clean,
  price = log(price),
  sqft_lot = log(sqft_lot),
  sqft_lot15 = log(sqft_lot15)
)
# Correlation analysis ----
# Shows which variables correlate positively or negatively with price.
library(corrplot)
library(GGally)
library(ggcorrplot)
library(corrr)

# Start from a blank ggcorr grid that only carries the coefficient labels,
# then layer points on top: colour encodes the sign of the coefficient and
# only coefficients with absolute value above 0.5 get a visible marker.
correlationplot <- ggcorr(
  data_clean[, 1:19],
  geom = "blank", label = TRUE, hjust = 0.75
) +
  geom_point(
    size = 10,
    aes(color = coefficient > 0, alpha = abs(coefficient) > 0.5)
  ) +
  scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
  # guides(<aesthetic> = FALSE) is deprecated in ggplot2; "none" is the
  # supported way to suppress a legend.
  guides(color = "none", alpha = "none")
correlationplot

# Full correlation matrix rendered with corrplot
CorrelationResults <- cor(data_clean)
corrplot(CorrelationResults)
# Train/test split ----
# Fixed seed so the same rows are drawn on every run; 75% of the rows go
# to the training set and the remainder to the test set.
set.seed(1234)
samp <- sample.int(nrow(data_clean), size = 0.75 * nrow(data_clean))
train <- data_clean[samp, ]
test <- data_clean[-samp, ]
# Full linear model ----
# Regress price on every other column of the training set to inspect the
# significance of each variable.
model <- lm(price ~ ., data = train)
summary(model)

# Test-set predictions from the full model
pred_log_prob_full <- predict(model, newdata = test)

# Hold-out RMSE (lower is better) and R-squared (share of variance explained)
RMSE(pred = pred_log_prob_full, obs = test$price)
R2(pred = pred_log_prob_full, obs = test$price)
# Forward selection ----
# step() with direction = "forward" can only ADD terms. Starting it from the
# full model without a scope leaves nothing to add, so no selection takes
# place. Start from an intercept-only model and give the full model's
# (expanded) formula as the upper bound of the search instead.
null_model <- lm(price ~ 1, data = train)
frwd_model <- step(
  null_model,
  scope = list(lower = ~1, upper = formula(terms(model))),
  direction = "forward"
)

# Model with all eighteen predictors written out explicitly
Null_to_full <- lm(
  price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
    waterfront + view + condition + grade + sqft_above + sqft_basement +
    yr_built + yr_renovated + zipcode + lat + long + sqft_living15 +
    sqft_lot15,
  data = train
)
summary(Null_to_full)
# Backward selection ----
# Start from the full model and let step() drop terms.
bckd_model <- step(model, direction = "backward")

# Reduced model: the full predictor set without sqft_above and sqft_basement
reduced_model <- lm(
  price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
    waterfront + view + condition + grade + yr_built + yr_renovated +
    zipcode + lat + long + sqft_living15 + sqft_lot15,
  data = train
)
summary(reduced_model)

# Diagnostic plots to check normality and homoscedasticity, laid out on a
# 2 x 2 grid. The previous graphics settings are restored afterwards so that
# later plots in this script are not squeezed into a quarter of the device.
old_par <- par(mfrow = c(2, 2))
plot(reduced_model)
par(old_par)

# Test-set predictions from the reduced model
pred_log_prob <- predict(reduced_model, newdata = test, type = "response")

# Hold-out RMSE (lower is better) and R-squared (share of variance explained)
RMSE(pred_log_prob, test$price)
R2(pred_log_prob, test$price)
# Decision tree ----
library(rpart)
library(rpart.plot)

# Regression tree on the same predictors as the reduced linear model
reg <- rpart(
  price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
    waterfront + view + condition + grade + yr_built + yr_renovated +
    zipcode + lat + long + sqft_living15 + sqft_lot15,
  data = train
)
summary(reg)

# Test-set predictions from the tree
pred_tree <- predict(reg, newdata = test)

# Hold-out RMSE (lower is better) and R-squared (share of variance explained)
RMSE(pred = pred_tree, obs = test$price)
R2(pred = pred_tree, obs = test$price)

# Draw the fitted tree
rpart.plot(reg, box.palette = "RdBu", shadow.col = "gray", nn = TRUE)
# Random forest ----
library(randomForest)
set.seed(123)
# var.predict <- paste(names(train)[-19], collapse = "+")
# rf.form <- as.formula(paste(names(train)[19], var.predict, sep = " ~ "))

# Forest on the same predictors as the reduced linear model
rndm_frst <- randomForest(
  price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
    waterfront + view + condition + grade + yr_built + yr_renovated +
    zipcode + lat + long + sqft_living15 + sqft_lot15,
  data = train
)
print(rndm_frst)
# summary(rndm_frst)

# Importance of each variable in the fitted forest
imp <- importance(rndm_frst)
varImpPlot(rndm_frst)

# Test-set predictions from the forest
pred_rndm <- predict(rndm_frst, newdata = test)

# Hold-out RMSE (lower is better) and R-squared (share of variance explained)
RMSE(pred = pred_rndm, obs = test$price)
R2(pred = pred_rndm, obs = test$price)
# Gradient boosting ----
library(caret)
# install.packages("gbm")
# library() stops with an error when a package is missing; require() would
# only return FALSE and let the script fail later with a less clear message.
library(gbm)
library(MASS)
set.seed(123)

# 50-fold cross-validation on the training set
fitControl <- trainControl(method = "cv", number = 50)

# Single hyper-parameter combination to fit
tune_Grid <- expand.grid(
  interaction.depth = 2,
  n.trees = 500,
  shrinkage = 0.1,
  n.minobsinnode = 10
)

# tune_Grid was previously built but never handed to train(), so caret fell
# back to its default tuning grid. Pass it explicitly so the settings above
# are the ones actually fitted.
# Note: `train(` resolves to caret's function even though the training data
# frame is also called `train`, because R looks up a function in call position.
grdnt_bstng <- train(
  price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
    waterfront + view + condition + grade + yr_built + yr_renovated +
    zipcode + lat + long + sqft_living15 + sqft_lot15,
  data = train,
  method = "gbm",
  trControl = fitControl,
  tuneGrid = tune_Grid,
  verbose = FALSE
)
print(grdnt_bstng)

# Alternative kept for reference: calling gbm() directly
# grdnt_bstng <- gbm(price ~ bedrooms + bathrooms + sqft_living + sqft_lot + waterfront +
#   view + condition + grade + sqft_above + yr_built + yr_renovated +
#   zipcode + lat + long + sqft_living15 + sqft_lot15, data = train, distribution = "gaussian", n.trees = 10000,
#   shrinkage = 0.01, interaction.depth = 4)
# summary(grdnt_bstng)

# Test-set predictions from the boosted model
pred_grd <- predict(grdnt_bstng, newdata = test)

# Hold-out RMSE (lower is better) and R-squared (share of variance explained)
RMSE(pred_grd, test$price)
R2(pred_grd, test$price)