-
Notifications
You must be signed in to change notification settings - Fork 0
/
clothing_recommender_project.py
93 lines (68 loc) · 3.23 KB
/
clothing_recommender_project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding: utf-8 -*-
"""Clothing_Recommender Project .ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1nw0ewNdkx8o3WULAp2ynhHpbq1kVq7YZ
Clean the data and use input
"""
## Import and Organize Data ##
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# Load the cleaned review data (the CSV produced by Task 1).
df = pd.read_csv('CleanedData.csv', sep=',')

# Build an Age x ClothingID table of ratings and order its rows by ascending
# age. pivot_table combines duplicate (Age, ClothingID) pairs with its default
# aggregation (mean); pairs with no rating stay NaN because no fill_value is
# passed.
train = (
    df.pivot_table(values='Rating', index='Age', columns='ClothingID')
    .sort_values(by='Age', ascending=True)
)
### Greeting and user input ###


def _ask_number(prompt, cast):
    """Prompt until the reply parses with *cast* (int or float); return it."""
    while True:
        reply = input(prompt)
        try:
            return cast(reply)
        except ValueError:
            # Same recovery as for an unknown Clothing ID below: report the
            # problem and ask again instead of ending with a traceback.
            print('Invalid: please enter a number')


print("Welcome, let us recommend a product for you")

# Take user input
Name = input('Please enter your name: ')
Age = _ask_number('Please enter your age: ', int)
CID_user = _ask_number("Enter Clothing ID: ", int)  # e.g. 90
# Only IDs that appear as a column of the ratings table can be compared.
while CID_user not in train.columns:
    print('Invalid: No data for ID')
    CID_user = _ask_number("Enter valid Clothing ID: ", int)
rating_user = _ask_number("Enter Rating for Clothing ID: ", float)  # e.g. 4

# TODO: use this later (if the user has more than one rating to enter), e.g.
#   entries = int(input("How many ratings will you enter? "))
#   for x in range(entries): ...
# Create a frame holding the user's rating, laid out with the same columns as
# `train` so the user's row can be compared against train's rows.
# Step 1: reindex an empty frame to train's index and columns.
userArray = pd.DataFrame().reindex_like(train)
# Step 2: drop every row that has fewer than one non-NaN value.
# NOTE(review): the frame was built from an empty DataFrame, so presumably all
# rows are removed here and only the column layout remains -- confirm.
userArray.dropna(thresh=1,inplace=True)
# Step 3: store the rating at (Age, CID_user); assigning through .loc to a row
# label that is not present adds that row.
userArray.loc[Age,CID_user] = rating_user #enter user data
from sklearn.metrics.pairwise import nan_euclidean_distances
# Euclidean distance between the user's row and every age row of `train`.
# nan_euclidean_distances ignores coordinates that are missing in either row
# and rescales by the share of coordinates that were present; a pair with no
# coordinate in common gets a distance of NaN.
user_vector = userArray.loc[[Age]].to_numpy(dtype=float)  # one row
train_matrix = train.to_numpy(dtype=float)                # one row per age

# One vectorised call replaces the per-row loop. The loop converted a 2-D
# array with float(), which NumPy has deprecated, and grew the result with
# np.append, which copies the whole array on every iteration.
pairwise = nan_euclidean_distances(user_vector, train_matrix)[0]

# Same (n_ages, 2) float layout as before: column 0 = age, column 1 = distance.
distance = np.column_stack((train.index.to_numpy(dtype=float), pairwise))

# Convert array to a dataframe
dfDistance = pd.DataFrame({'Age': distance[:, 0], 'E-Distance': distance[:, 1]})
k = 5  # number of nearest ages to keep

# Rank ages by distance (closest first), cap the list at 20 candidates and
# renumber the rows from 0.
ranked = dfDistance.sort_values(by='E-Distance', ascending=True).head(20)
ranked = ranked.reset_index(drop=True)

# Keep the rows numbered 0 .. k-1, i.e. the k closest ages.
dfDistance = ranked.loc[ranked.index <= k - 1]
# NOTE: the predicted rating could instead use an inverse-distance-weighted
# (IDW) interpolation, see
# https://stackoverflow.com/questions/3104781/inverse-distance-weighted-idw-interpolation-with-python
# For now the prediction is the plain mean over the k nearest ages (k of 5);
# more accurate functions can be tried later.

# dfDistance stores the ages as floats; cast them back to the dtype of train's
# index so the label lookup below matches.
nearest_ages = dfDistance['Age'].astype(train.index.dtype)

# Rating rows of the k closest ages. DataFrame.append, used here previously,
# was removed in pandas 2.0; a single .loc selection gathers the same rows.
k_array = train.loc[nearest_ages]

# Per-item mean over those rows. mean() skips NaN; items that none of the
# neighbours rated come out as NaN and are dropped.
meanArray = k_array.mean().dropna().to_frame(name="Mean")

# Keep only the items whose neighbour mean is exactly 5.
meanArray = meanArray[meanArray.Mean == 5]

# tolist() yields plain Python numbers, so the printed list is not cluttered
# with NumPy scalar reprs.
recommend = meanArray.index.tolist()
print("recommended ClothingID's are: ")
print(recommend)
#feedback, clothingID (choose top 5), department
#reverse lookup clothingID for department
# feedback (choose first 3)