-
Notifications
You must be signed in to change notification settings - Fork 2
/
load_db.py
216 lines (187 loc) · 8.72 KB
/
load_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
'''
Created on Oct 1, 2021
@author: Matthew Moulton
'''
import sqlite3
import string
from src.database import Table, motorcycles, jewelry, furniture, jobs, housing, cars
def commitAction(action):
con = sqlite3.connect('product_qa.db')
cur = con.cursor()
action(cur)
con.commit() # Commit changes
con.close() # close after we are done
##################################################################################### Create tables
def buildCreate(table:Table):
cmd = 'CREATE TABLE "' + table.name.value + '" ('
for attr in table.dat:
cmd += '"' + attr[0][0] + '" ' + attr[1] + ', '
if not table.primKey is None:
cmd += ('PRIMARY KEY("' + table.dat[table.primKey][0][0] + '")')
cmd += ') WITHOUT ROWID;'
else:
cmd = cmd[0:len(cmd)-2] #get rid of the last comma
cmd += ');'
#print(cmd)
return cmd
def buildTables(cursor):
dropCommand = 'DROP TABLE IF EXISTS '
allTables = [motorcycles, jewelry, jobs, furniture, housing, cars]
for table in allTables:
cursor.execute(dropCommand + table.name.value)
cursor.execute(buildCreate(table))
# build the indexes (if any)
if table.idxCol is not None:
for idxC in table.idxCol: # for each of the specified indexes
iOn = table.dat[idxC][0][0]
cmd = 'CREATE INDEX idx_' + table.name.value + '_' + iOn + ' ON ' + table.name.value + '(' + iOn + ');'
print(cmd)
cursor.execute(cmd)
##################################################################################### Populate tables
def splitWithStrings(line: string) -> [string]:
comps = []
start = 0
while start < len(line):
nextDelim = line.find(',', start)
nextStrng = line.find('"', start)
while nextStrng > 1 and line[nextStrng - 1] == '\\':
nextStrng = line.find('"', nextStrng+1) # this instance is escaped, so find the next
# Verify that neither are -1, which throws off the less than comparison
if nextDelim == -1:
nextDelim = len(line)
if nextStrng == -1:
nextStrng = len(line)
if nextDelim < nextStrng:
comps.append(line[start:nextDelim])
start = nextDelim + 1
elif nextStrng < nextDelim:
# find the next " that ends the string
start = nextStrng + 1
nextStrng = line.find('"', start)
while nextStrng > 1 and line[nextStrng - 1] == '\\':
nextStrng = line.find('"', nextStrng+1)
comps.append(line[start:nextStrng])
# now move 'start' to after the next comma (if there is one)
ncom = line.find(',', nextStrng)
if ncom == -1: # no more to process
break
start = ncom + 1
else: # both are -1
# In this case, there is one more non-string field (since the list cannot end on a comma)
comps.append(line[start:(len(line)-1)])
break
return comps
def unfinishedString(line:string) -> bool:
start = 0
unfinished = False
while True:
try:
start = line.index('"', start)
# if the index immediately before is a \, then this has been escaped
if start > 1 and line[start - 1] != '\\':
unfinished = not unfinished
start += 1 # since the start arg in index is inclusive
except ValueError:
# Occurs when no more ". Stop looking.
break
return unfinished
def loadTable(cursor, loc:string, table:Table):
file = open(loc, encoding='utf-8')
title = False
sumLine = '' # this is what we will use when an entry spans several lines
unfinished = False
entry = 0
lineNo = 1
for line in file:
if not title:
title = True
continue
lineNo += 1
# We are going to trim the file to only load the first 800 entries
if entry > 800:
break
sumLine += line
# If there are an even number of non-escaped ", then we know that the line is done
unfinished = unfinishedString(line) ^ unfinished
if unfinished:
continue # get the next line to continue what we have
# If we are here, the line is considered finished
comps = splitWithStrings(sumLine)
sumLine = '' # reset the running line for next time
sqlString = 'INSERT INTO ' + table.name.value + ' VALUES('
first = True
for i in range(len(table.dat)):
if len(comps) > i:
comp = comps[i]
else:
comps = ""
if first:
first = False
else:
sqlString += ', '
# We want to see if this field should be a string. If so, it should have string chars surrounding
if "TEXT" in table.dat[i][1]:
if len(comp) == 0:
comp = '""' # default string value is nothing
elif comp[0] != '"' or comp[-1] != '"':
comp = '" ' + comp + ' "' # I put an extra space on both sides so that our LIKE comparison can match whole words
elif len(comp) == 0 and ("INTEGER" in table.dat[i][1] or "NUMERIC" in table.dat[i][1]):
comp = '-1' # default number value is -1
sqlString += comp
sqlString += ')'
#print(sqlString)
try:
cursor.execute(sqlString)
entry += 1
except sqlite3.Error:
import traceback
traceback.print_exc()
print("... when trying to execute " + sqlString)
print("... calculated from line " + str(lineNo))
print("... which is: " + line)
file.close()
def loadTables(cursor):
dataRoot = 'Datasets/'
loadTable(cursor, dataRoot+'BIKE-DETAILS.csv', motorcycles)
loadTable(cursor, dataRoot+'cartier_catalog.csv', jewelry)
loadTable(cursor, dataRoot+'DataScientist.csv', jobs)
loadTable(cursor, dataRoot+'IKEA_Furniture.csv', furniture)
loadTable(cursor, dataRoot+'melb_data.csv', housing)
loadTable(cursor, dataRoot+'vehicles4.csv', cars)
def rectifyData(cursor):
# There are some things about the dataset that we are going to change to make it easier to use
# We need to:
# change from Swedish crowns to USD on the furniture file
sekToUsd = str(0.12)
cursor.execute("UPDATE " + furniture.name.value + " SET price = price * " + sekToUsd + ";")
# change the dimensions from cm to inches
cmToInch = str(0.3937008)
cursor.execute("UPDATE " + furniture.name.value + " SET depth = depth * " + cmToInch + ";")
cursor.execute("UPDATE " + furniture.name.value + " SET height = height * " + cmToInch + ";")
cursor.execute("UPDATE " + furniture.name.value + " SET width = width * " + cmToInch + ";")
# change from Australian dollars to USD on the housing file
audToUsd = str(0.74)
cursor.execute("UPDATE " + housing.name.value + " SET price = price * " + audToUsd + ";")
# change from the single character types to the full name on the housing file
cursor.execute('UPDATE ' + housing.name.value + ' SET type = "house" WHERE type = "h";')
cursor.execute('UPDATE ' + housing.name.value + ' SET type = "unit" WHERE type = "u";')
cursor.execute('UPDATE ' + housing.name.value + ' SET type = "townhouse" WHERE type = "t";')
# change from m^2 to ft^2 in the housing file
sqmToSqft = str(10.76391)
cursor.execute("UPDATE " + housing.name.value + " SET landsize = landsize * " + sqmToSqft + " WHERE landsize <> -1;")
cursor.execute("UPDATE " + housing.name.value + " SET area = area * " + sqmToSqft + " WHERE area <> -1;")
# change from km driven to miles driven in the motorcycles file
kmToMi = str(0.6213712)
cursor.execute("UPDATE " + motorcycles.name.value + " SET mileage = mileage * " + kmToMi + ";")
# change the price from Indian rupees to USD in the motorcycle file
indianRupeeToUsd = str(0.01)
cursor.execute("UPDATE " + motorcycles.name.value + " SET price = price * " + indianRupeeToUsd + ";")
# update -1 to infinity in ranges
cursor.execute("UPDATE " + jobs.name.value + " SET salary_max = \"INFINITY\" WHERE salary_max = -1 AND salary_min <> -1;")
cursor.execute("UPDATE " + jobs.name.value + " SET size_max = \"INFINITY\" WHERE size_max = -1 AND size_min <> -1;")
cursor.execute("UPDATE " + jobs.name.value + " SET revenue_max = \"INFINITY\" WHERE revenue_max = -1 AND revenue_min <> -1;")
##################################################################################### Main Entry
if __name__ == '__main__':
commitAction(buildTables)
commitAction(loadTables)
commitAction(rectifyData)