Skip to content

Latest commit

 

History

History
261 lines (158 loc) · 3.5 KB

File metadata and controls

261 lines (158 loc) · 3.5 KB

PortlandOregonHousePricesPrediction

# Ignore warning message
import warnings
warnings.filterwarnings('ignore')
import pandas as pd 
import plotnine as pl # for ploting
# Linear Regression Machine learning algorithm
from sklearn.linear_model import LinearRegression as LR
def predicted_line(X, y):
    """Fit a linear regression to (X, y) and return the fitted values for X.

    Args:
        X: 2-D feature array of shape (n_samples, n_features), as accepted
            by ``LinearRegression.fit``.
        y: Target values aligned with the rows of ``X``.

    Returns:
        The fitted model's predictions for the rows of ``X``.
    """
    _model = LR()
    _model.fit(X, y)
    # Let the fitted model evaluate itself rather than rebuilding the line
    # from coef_[0] and intercept_ by hand: that shortcut is only valid when
    # X has exactly one feature column.
    return _model.predict(X)
# Read dataset: one comma-separated record of integers per line
with open("./ex1data2.txt") as _f:
    # Iterate the file lazily and skip blank lines (e.g. an empty line at the
    # end of the file) instead of failing on int('').
    data = [
        [int(_field) for _field in _line.split(",")]
        for _line in _f
        if _line.strip()
    ]
# Create DataFrame
df = pd.DataFrame(data, columns=["size", "bedrooms", "prices"])
# Average price and number of rows for each bedroom count
_prices_by_bedrooms = df.groupby("bedrooms")["prices"]
(
    _prices_by_bedrooms
    .agg(["mean", "size"])
    .rename(columns={"mean": "avg prices", "size": "count"})
)
avg prices count
bedrooms
1 169900.000000 1
2 280866.666667 6
3 326403.920000 25
4 377449.785714 14
5 699900.000000 1
# Distribution of house prices: a histogram with a density curve layered on top.
# NOTE(review): geom_histogram presumably plots counts while geom_density
# plots a density -- confirm the two layers share a comparable y scale.
_price_distribution = pl.ggplot(df, pl.aes(x="prices"))
_price_distribution + pl.geom_histogram() + pl.geom_density()

png

<ggplot: (8773500822245)>
# Scatter of price against size, coloured by bedroom count
_scatter_mapping = pl.aes(x="size", y="prices", color="factor(bedrooms)")
pl.ggplot(df) + pl.geom_point(_scatter_mapping)

png

<ggplot: (8773501033671)>

LR - size vs prices

# Single-column feature matrix (size) and the price targets
X = df["size"].values[:, None]
y = df["prices"].values
# Fitted prices from regressing price on size
df["predicted_price_by_size"] = predicted_line(X, y)
# Observed points with the fitted regression line drawn over them
_observed = pl.geom_point(pl.aes(x="size", y="prices"), color="#958AC5")
_fitted = pl.geom_line(pl.aes(x="size", y="predicted_price_by_size"), color="red")
pl.ggplot(df) + _observed + _fitted

png

<ggplot: (8773505392625)>

LR - number of bedrooms vs prices

# Single-column feature matrix (bedroom count) and the price targets
X = df["bedrooms"].values[:, None]
y = df["prices"].values
# Fitted prices from regressing price on the number of bedrooms
df["predicted_price_by_bedrooms"] = predicted_line(X, y)
# Observed points with the fitted regression line drawn over them
_observed = pl.geom_point(pl.aes(x="bedrooms", y="prices"), color="#958AC5")
_fitted = pl.geom_line(
    pl.aes(x="bedrooms", y="predicted_price_by_bedrooms"), color="red"
)
pl.ggplot(df) + _observed + _fitted

png

<ggplot: (8773503059535)>

Separate LR model for each bedroom count

# Fit one size -> price regression per bedroom count and store each group's
# fitted values. The column starts as float so the predictions are stored
# without an int -> float dtype change on assignment.
df["predicted_by_bedrooms"] = 0.0
for v in df.groupby("bedrooms").groups.values():
    # `groups` maps each bedroom count to the index *labels* of its rows, so
    # select with label-based .loc rather than positional .iloc.
    _df = df.loc[v]
    _X, _y = _df["size"].values.reshape(-1, 1), _df["prices"].values
    # Assign through a single df.loc[rows, column] call: the chained form
    # df[column].loc[rows] = ... may write to a temporary object and leave
    # df itself unchanged.
    df.loc[v, "predicted_by_bedrooms"] = predicted_line(_X, _y)

Regression line for each bedroom count

# One panel per bedroom count: observed prices plus that group's fitted line
_points = pl.geom_point(pl.aes(x="size", y="prices", color="factor(bedrooms)"))
_fitted = pl.geom_line(pl.aes(x="size", y="predicted_by_bedrooms"), color="red")
pl.ggplot(df) + _points + _fitted + pl.facet_wrap("bedrooms")

png

<ggplot: (8773503002853)>