-
Notifications
You must be signed in to change notification settings - Fork 0
/
song_graph.py
657 lines (515 loc) · 23.1 KB
/
song_graph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
"""
Song graph and related methods
"""
from __future__ import annotations
import csv
import datetime
from typing import Union, Tuple, Optional
# Paths to the three CSV data sets: per-song data, per-artist data (including the genres each
# artist makes) and per-genre average properties. They are not referenced elsewhere in this
# module; presumably callers pass them to the loader functions below.
SONG_DATA = 'Data/data.csv'
ARTIST_DATA_W_GENRES = 'Data/data_w_genres.csv'
GENRE_DATA = 'Data/data_by_genres.csv'
# Column names that load_songs copies into a song's `properties` mapping.
PROPERTIES = {'acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'mode',
              'liveness', 'loudness', 'speechiness', 'tempo', 'valence'}
# Column names that load_songs copies into a song's `information` mapping.
INFORMATION = {'artists', 'duration', 'explicit', 'id', 'name', 'release_date', 'year',
               'popularity'}
# 50 most popular genres.
# genres_to_songs uses this list as the candidate genres when an artist has no known genres.
GENRES = ['adult standards', 'album rock', 'alternative rock', 'brill building pop',
          'classic bollywood', 'classic rock', 'classical', 'contemporary country', 'cool jazz',
          'country', 'country rock', 'dance pop', 'disco', 'early romantic era', 'easy listening',
          'filmi', 'folk', 'folk rock', 'german romanticism', 'hard rock', 'impressionism',
          'italian romanticism', 'lounge', 'mellow gold', 'modern rock', 'motown', 'norteno',
          'nu metal', 'outlaw country', 'pop', 'pop rock', 'post-teen pop', 'progressive house',
          'psychedelic rock', 'punk', 'quiet storm', 'ranchera', 'rebetiko', 'rock',
          'rock-and-roll', 'singer-songwriter', 'soft rock', 'soul', 'stride', 'swing', 'tango',
          'torch song', 'vintage tango', 'vocal jazz', 'yacht rock']
# Weights of how much each property affects the rating computed by get_song_rating and
# get_genre_rating. The fractional weights scale down the properties with a wide value range
# (key, loudness, tempo).
WEIGHTS = {'acousticness': 1, 'danceability': 1, 'energy': 1, 'instrumentalness': 1, 'key': 1 / 9,
           'mode': 1, 'liveness': 1, 'loudness': 1 / 59, 'speechiness': 1, 'tempo': 1 / 145,
           'valence': 1}
class Song:
    """
    Contain information about songs that spotify collects

    Instance Attributes:
        - properties: maps data that spotify collects that are ways to describe how a song
          sounds. See below for specifics
        - information: maps data that spotify collects that are objective ways to identify /
          classify the song. See below for specifics
        - neighbours: maps the spotify id of every adjacent (similar) song to the similarity
          score of the edge joining the two songs. SongGraph.add_edge writes both directions,
          so the relation is symmetric.
        - name: name of the song (this is also in information but is convenient to have
          nonetheless)
        - genre: name of the genre assigned to this song, or '' while no genre has been
          assigned yet (song_to_genre sets it)

    Properties includes:
        - acousticness: relative measure of how acoustic a track is
        - danceability: relative measure of how danceable a track is
        - energy: relative measure of how energetic a track is
        - instrumentalness: how close a track is to being instrumental. Closer to 1 the more inst.
        - key: primary key of the track encoded as an integer from 1-11
        - mode: binary value. (1) if track starts in major chord progression, (0) if not
        - liveness: relative duration of the track sounding like a live performance
        - loudness: relative loudness of the track in decibel range [-60, 0]
        - speechiness: relative length of the track containing human voice
        - tempo: BPM of the track
        - valence: How positive a track sounds (like positivity / happiness)

    Information includes:
        - artists: List of artists credited
        - duration: length of the track in ms
        - explicit: binary value if the song is explicit. (1) if explicit
        - id: primary ID for the song generated by spotify
        - name: title of the track, as a string
        - release_date: release date in y-m-d or y-m or just y
        - year: release year
        - popularity: popularity of the song lately (in the US)

    Representation Invariants:
        - all(key in PROPERTIES for key in self.properties)
        - all(key in INFORMATION for key in self.information)
        - self.information.get('id') not in self.neighbours
    """
    properties: dict[str, Union[int, float]]
    information: dict[str, Union[str, int, float, datetime.datetime, list[str]]]
    neighbours: dict[str, Union[int, float]]
    name: str
    genre: str

    def __init__(self, properties: dict[str, Union[int, float]],
                 information: dict[str, Union[str, int, float, datetime.datetime, list[str]]],
                 name: str) -> None:
        """
        Initialize a new song with no neighbour data and no genre assigned.
        """
        self.properties = properties
        self.information = information
        self.neighbours = {}
        self.name = name
        # Give the declared attribute a value so that reading song.genre before a genre has
        # been assigned yields '' instead of raising AttributeError.
        self.genre = ''

    def get_degree(self) -> int:
        """
        Return the number of songs adjacent to this song.
        """
        return len(self.neighbours)
class SongGraph:
    """
    Graph of songs within a particular genre. Each vertex represents a song and an edge
    represents similar songs.

    Instance Attributes:
        - songs: maps spotify ID to the song with that ID
    """
    songs: dict[str, Song]

    def __init__(self) -> None:
        """
        Initialize an empty song graph.
        """
        self.songs = {}

    def add_song(self, song: Song) -> None:
        """
        Add a song to the graph as a vertex with no new edges. A song that is already stored
        under the same spotify ID is replaced.
        """
        self.songs[song.information['id']] = song

    def add_edge(self, id_1: str, id_2: str, sim_score: float) -> None:
        """
        Add an edge with the given similarity score between the songs with the two given IDs.

        The score is stored in both songs' neighbours, overwriting any earlier score.

        Raise a ValueError if either ID is not a song in this graph.
        """
        if id_1 not in self.songs or id_2 not in self.songs:
            raise ValueError(f'both songs must be in the graph: {id_1!r}, {id_2!r}')
        self.songs[id_1].neighbours[id_2] = sim_score
        self.songs[id_2].neighbours[id_1] = sim_score

    def search_song(self, song_id: str) -> bool:
        """Return whether a song with the given ID exists in this graph."""
        return song_id in self.songs

    def sg_get_song(self, song_id: str) -> Optional[Song]:
        """Return the song vertex with the given ID, or None if there is no such song."""
        return self.songs.get(song_id)

    def sg_insert_song(self, song: Song, thresh: float = 0.1) -> None:
        """
        Insert a song into the graph and connect it to every other song whose rating is within
        thresh (inclusive) of this song's rating. Each new edge is weighted by the absolute
        rating difference.

        Assumes the song is not already in the graph.
        """
        self.add_song(song)
        song_id = song.information['id']
        rating = get_song_rating(song)
        for other_id, other_song in self.songs.items():
            if other_id == song_id:
                continue
            # Compare the absolute difference: a signed difference would link every song
            # with a lower rating regardless of how far away it is.
            difference = abs(get_song_rating(other_song) - rating)
            if difference <= thresh:
                self.add_edge(song_id, other_id, difference)
class Genre:
    """
    A single genre: one vertex of the genre graph.

    Instance Attributes:
        - song_graph: the SongGraph holding only the songs that belong to this genre
        - average_properties: the average value of each property over the songs of this genre
        - median_properties: the median value of each property over the songs of this genre
          (starts out empty)
        - neighbours: similar genres, mapping genre name to similarity score (starts out empty)
        - name: the name of this genre

    Representation Invariants:
        - all(prop in PROPERTIES for prop in self.median_properties)
        - the name and every neighbour appear in Data/data_by_genres.csv or the equivalent file
    """
    song_graph: SongGraph
    average_properties: dict[str, Union[int, float]]
    median_properties: dict[str, Union[int, float]]
    neighbours: dict[str, Union[int, float]]
    name: str

    def __init__(self, song_graph: SongGraph, avg_props: dict[str, Union[int, float]],
                 name: str) -> None:
        """
        Initialize a genre with the given song graph, average properties and name.
        It starts with no median properties and no neighbouring genres.
        """
        self.name = name
        self.song_graph = song_graph
        self.average_properties = avg_props
        # Two distinct empty mappings, filled in later.
        self.median_properties, self.neighbours = {}, {}
class GenreGraph:
    """
    Graph of genres: each vertex is a Genre object and each edge joins two similar genres.

    Instance Attributes:
        - genres: maps genre name to the Genre object with that name
    """
    genres: dict[str, Genre]

    def __init__(self) -> None:
        """
        Initialize an empty genre graph.
        """
        self.genres = {}

    def add_genre(self, genre: Genre) -> None:
        """
        Add a genre to the graph as a vertex, stored under the genre's name.
        """
        self.genres[genre.name] = genre

    def add_edge(self, genre_1: str, genre_2: str, sim_score: float) -> None:
        """
        Add an edge with the given similarity score between the two named genres.

        Raise a ValueError if either name is not a genre in this graph.
        """
        known = self.genres
        if not (genre_1 in known and genre_2 in known):
            raise ValueError
        first, second = known[genre_1], known[genre_2]
        first.neighbours[genre_2] = sim_score
        second.neighbours[genre_1] = sim_score

    def get_song(self, song: Song) -> Song:
        """
        Return the song stored in this graph under the given song's genre and spotify ID.
        """
        genre_songs = self.genres[song.genre].song_graph.songs
        return genre_songs[song.information['id']]

    def insert_song(self, song: Song) -> None:
        """
        Insert the song into the song graph of the genre named by song.genre.
        """
        target_graph = self.genres[song.genre].song_graph
        target_graph.sg_insert_song(song)
def load_genres(genres_file: str) -> dict[str, dict[str, float]]:
    """
    Load all genres from the given genres file and return a dict that maps each genre to its
    average properties.

    The first column of every row is the genre name; each remaining column is converted to a
    float and stored under that column's header. An empty file yields an empty dict and blank
    rows are skipped.

    The file is decoded as ISO-8859-1, the same encoding the song and artist loaders use, so
    that genre names read here compare equal to genre names read from the artists file.

    Preconditions:
        - genres_file is formatted in the same way as data/data_by_genres.csv
        - genres_file is the path to the type of file described above
    """
    genres_to_avg_properties = {}
    # newline='' lets the csv module handle line endings itself.
    with open(genres_file, encoding='ISO-8859-1', newline='') as genre_data:
        rows = csv.reader(genre_data)
        header_row = next(rows, None)
        if header_row is None:
            return genres_to_avg_properties
        headers = header_row[1:]
        for row in rows:
            if not row:
                continue
            # Index explicitly (rather than zip) so a row that is too short still raises
            # instead of silently producing a partial mapping.
            genres_to_avg_properties[row[0]] = {
                header: float(row[column]) for column, header in enumerate(headers, start=1)
            }
    return genres_to_avg_properties
def load_artists_to_genres(art_genres_file: str) -> dict[str, list[str]]:
    """
    Return a mapping from every artist in the given artists-with-genres file to the list of
    genres of music they make. An artist with no listed genres maps to an empty list.

    The artist is read from the first column and the genres from the last column of each row;
    both are cleaned with clean_lists. Only the first piece of the cleaned artist value is
    used as the key.

    Preconditions:
        - art_genres_file is formatted in the same way as data/data_w_genres.csv
        - art_genres_file is the path to the type of file described above
    """
    artist_to_genres = {}
    with open(art_genres_file, encoding="ISO-8859-1") as art_g_data:
        rows = csv.reader(art_g_data)
        next(rows)  # skip the header row
        for row in rows:
            artist_name = clean_lists(row[0])[0]
            genre_names = clean_lists(row[-1])
            # clean_lists turns an empty genre column into [''].
            artist_to_genres[artist_name] = [] if genre_names == [''] else genre_names
    return artist_to_genres
def song_to_genre(song: Song, genres: list[str], g_to_props: dict[str, dict[str, float]]) -> str:
    """
    Return the genre out of genres that the given song most likely belongs to, and store that
    genre in song.genre as well.

    genres is a list of candidate genres (for example the genres the song's artist is known
    to make). g_to_props maps genres to their average or median properties.

    The distance between the song and a genre is the sum of the absolute differences of their
    properties, ignoring 'popularity' and 'duration_ms'. Candidates missing from g_to_props
    are skipped. The candidate with the smallest distance wins and the earliest one wins ties.
    If no candidate is usable, '' is returned.

    Preconditions:
        - g_to_props contains all genres from the data file
        - song is a song in the data file
    """
    # Divisors bringing the wide-ranged properties to roughly [0, 1] like the other properties
    scale = {'tempo': 145, 'loudness': 59, 'key': 10}
    ignored = {'popularity', 'duration_ms'}
    best_genre = ''
    # The distance between a song and a genre cannot get anywhere near this starting value
    best_distance = 999999
    for candidate in genres:
        if candidate not in g_to_props:
            continue
        reference = g_to_props[candidate]
        distance = 0
        for prop in reference:
            if prop in scale:
                distance += abs(song.properties[prop] - reference[prop]) / scale[prop]
            elif prop not in ignored:
                distance += abs(song.properties[prop] - reference[prop])
        if distance < best_distance:
            best_genre, best_distance = candidate, distance
    song.genre = best_genre
    return best_genre
def clean_lists(lst: str) -> list[str]:
    """
    Split a list-like column from the csv files (artists or genres) into its items and return
    the cleaned items as a list of strings.

    The column is inconsistent: items carry extra " and ' at the beginning and end, the whole
    value is wrapped in brackets, and there are stray backslashes in front of ' (and doubled
    backslashes). Every item is stripped of the surrounding quotes and brackets, a backslash
    in front of ' is removed and a doubled backslash is collapsed into a single one.

    The input is split on ', ', so an empty column yields [''].
    """
    cleaned = []
    for item in lst.split(', '):
        item = item.strip('"')
        item = item.removeprefix('[').removesuffix(']')
        item = item.removeprefix("'").removesuffix("'")
        item = item.strip('"')
        item = item.removeprefix("'").removesuffix("'")
        # Strings are immutable, so the unescaped text has to be stored back into the result.
        item = item.replace("\\'", "'").replace('\\\\', '\\')
        cleaned.append(item)
    return cleaned
def _parse_release_date(raw: str) -> datetime.datetime:
    """Turn a 'y', 'y-m' or 'y-m-d' string into a datetime; a missing month or day becomes 1."""
    parts = [int(piece) for piece in str(raw).split('-')]
    parts += [1] * (3 - len(parts))
    return datetime.datetime(parts[0], parts[1], parts[2])


def clean_data(properties: dict[str, str], information: dict[str, str]) -> \
        tuple[dict[str, Union[int, float]], dict[str, Union[str, int, float, datetime.datetime,
                                                           list[str]]]]:
    """
    Take in properties and information whose values are strings, as read from the csv file,
    and return them as two new dicts whose values have their appropriate data types.

    Keys that are not recognised are left out of the returned dicts.

    Preconditions:
        - properties and information are formatted as described in the Song class
    """
    # How each recognised property is converted
    prop_casts = dict.fromkeys(('acousticness', 'danceability', 'energy', 'instrumentalness',
                                'liveness', 'loudness', 'speechiness', 'tempo', 'valence'), float)
    prop_casts.update(dict.fromkeys(('key', 'mode'), int))

    new_props = {label: prop_casts[label](raw)
                 for label, raw in properties.items() if label in prop_casts}

    new_info = {}
    for label, raw in information.items():
        if label in ('duration', 'popularity'):
            new_info[label] = float(raw)
        elif label == 'explicit':
            new_info[label] = int(raw)
        elif label == 'release_date':
            new_info[label] = _parse_release_date(raw)
        elif label in ('id', 'name', 'year'):
            # These stay exactly as they were read
            new_info[label] = raw
        elif label == 'artists':
            new_info[label] = clean_lists(raw)
    return new_props, new_info
def load_songs(songs_file: str) -> dict[str, Song]:
    """
    Load every song from the given songs file and return a mapping from each song's spotify
    ID to the Song object built from its row.

    Columns named in PROPERTIES become the song's properties, columns named in INFORMATION
    become its information, and all other columns are ignored. The values are converted to
    their proper types by clean_data.

    Preconditions:
        - songs_file is the path to a CSV file corresponding to the song data
          format as described in the Song class.
    """
    songs_by_id = {}
    with open(songs_file, encoding="ISO-8859-1") as song_data:
        reader = csv.reader(song_data)
        header = next(reader)
        for row in reader:
            raw_properties = {}
            raw_information = {}
            for column, field in enumerate(header):
                if field in PROPERTIES:
                    raw_properties[field] = row[column]
                elif field in INFORMATION:
                    raw_information[field] = row[column]
            new_props, new_info = clean_data(raw_properties, raw_information)
            # The key and the song name are the raw strings from the row
            songs_by_id[raw_information['id']] = Song(new_props, new_info,
                                                      raw_information['name'])
    return songs_by_id
def genres_to_songs(songs_file: str, artists_file: str, genres_file: str) -> Tuple[dict, dict]:
    """
    Assign a genre to every song in songs_file and return a tuple of two mappings: the first
    maps each genre from genres_file to the list of songs assigned to it, the second maps
    each song's spotify ID to the genre it was assigned.

    The candidate genres of a song are the genres of its first credited artist (the second
    one if the first is 'n/a' and there is a second). If the artist has no known genres, or
    none of them appear in genres_file, the candidates fall back to GENRES. A song for which
    no genre can be determined at all is left out of both mappings.

    Preconditions:
        - songs_file is a path to a csv file structured in the same way as 'Data/data.csv'
        - artists_file is a path to a csv file structured as 'Data/data_w_genres.csv' is.
        - genres_file is a path to a csv file structured as 'Data/data_by_genres.csv' is
    """
    genres_to_prop = load_genres(genres_file)
    print('Loading genres finished. Next, loading artists:')
    artists_to_genre = load_artists_to_genres(artists_file)
    print('Loading artists finished. Next, loading Songs:')
    songs = load_songs(songs_file)
    print('loading songs finished. Next, assigning genres:')

    genre_to_songs = {genre: [] for genre in genres_to_prop}
    songs_to_genre = {}
    for song in songs.values():
        artists = song.information['artists']
        artist = artists[0]
        # Only look at the second artist when there is one; a lone 'n/a' used to raise
        # an IndexError here.
        if artist == 'n/a' and len(artists) > 1:
            artist = artists[1]

        candidates = artists_to_genre.get(artist, [])
        genre = song_to_genre(song, candidates, genres_to_prop) if candidates else ''
        if not genre:
            # No usable artist genre: choose among the most popular genres instead.
            genre = song_to_genre(song, GENRES, genres_to_prop)
        if not genre:
            # song_to_genre returns '' when no candidate is in genres_to_prop; '' is not a
            # key of genre_to_songs, so skip the song rather than fail with a KeyError.
            continue

        genre_to_songs[genre].append(song)
        songs_to_genre[song.information['id']] = genre
    return genre_to_songs, songs_to_genre
def get_song_rating(song: Song) -> float:
    """
    Return the rating of a song.

    The rating is the weighted sum of the song's properties, where the weight of each
    property is taken from WEIGHTS. (Filler rating for now.)

    Preconditions:
        - song.properties != {}
    """
    total = 0
    for prop, value in song.properties.items():
        total += WEIGHTS[prop] * value
    return total
def create_song_graph(songs: list[Song], threshold: float) -> SongGraph:
    """
    Take a list of songs and return their SongGraph. The song graph connects every pair of
    songs whose ratings differ by strictly less than threshold; each edge is weighted by the
    absolute difference between the two ratings.

    See get_song_rating for how the rating of a song is found.

    Preconditions:
        - threshold > 0
        - all([song.properties != {} for song in songs])
    """
    graph = SongGraph()
    song_ratings = []
    for song in songs:
        graph.add_song(song)
        song_ratings.append((get_song_rating(song), song.information['id']))
    song_ratings.sort(key=lambda pair: pair[0])

    # With the ratings sorted, the songs close enough to a given song form a contiguous run
    # right after it, so the inner scan can stop at the first song that is too far away.
    count = len(song_ratings)
    for index, (rating, song_id) in enumerate(song_ratings):
        for other_index in range(index + 1, count):
            other_rating, other_id = song_ratings[other_index]
            # Compute the weight for exactly the pair being linked, before testing it.
            weight = abs(rating - other_rating)
            if weight >= threshold:
                break
            graph.add_edge(song_id, other_id, weight)
    return graph
def create_genre_graph(songs_file: str, artists_file: str, genres_file: str,
                       threshold: float) -> Tuple[GenreGraph, dict]:
    """
    Return a tuple of the main genre graph to be used to recommend songs and a mapping from
    each song's spotify ID to the genre it was assigned.

    The graph connects similar 'Genre' objects together. Each genre object contains a
    song graph which connects similar songs of that genre together (built with the same
    threshold).

    The graph connects every pair of genres whose ratings differ by strictly less than
    threshold; each edge is weighted by the absolute difference between the two ratings.
    See get_genre_rating for the rating of a genre.

    Preconditions:
        - songs_file is a path to a csv file structured in the same way as 'Data/data.csv'
        - artists_file is a path to a csv file structured as 'Data/data_w_genres.csv' is.
        - genres_file is a path to a csv file structured as 'Data/data_by_genres.csv' is
        - threshold > 0
    """
    g_to_songs, songs_to_g = genres_to_songs(songs_file, artists_file, genres_file)
    print('Assigning Genres finished. Last, making graph.')
    genres_to_prop = load_genres(genres_file)

    genre_graph = GenreGraph()
    ratings = []
    for genre, genre_songs in g_to_songs.items():
        curr_song_graph = create_song_graph(genre_songs, threshold)
        curr_genre = Genre(curr_song_graph, genres_to_prop[genre], genre)
        genre_graph.add_genre(curr_genre)
        ratings.append((get_genre_rating(curr_genre), genre))
    ratings.sort(key=lambda pair: pair[0])

    # With the ratings sorted, the genres close enough to a given genre form a contiguous
    # run right after it, so the inner scan can stop at the first genre that is too far away.
    count = len(ratings)
    for index, (rating, genre) in enumerate(ratings):
        for other_index in range(index + 1, count):
            other_rating, other_genre = ratings[other_index]
            # Compute the weight for exactly the pair being linked, before testing it.
            weight = abs(rating - other_rating)
            if weight >= threshold:
                break
            genre_graph.add_edge(genre, other_genre, weight)
    return genre_graph, songs_to_g
def get_genre_rating(genre: Genre) -> float:
    """
    Return the rating for a genre.

    The rating is the weighted sum of the genre's average properties, where the weight of
    each property is taken from WEIGHTS. Average properties that have no weight are ignored.
    (Filler rating for now.)

    Preconditions:
        - genre.average_properties != {}
    """
    total = 0
    for prop, average in genre.average_properties.items():
        if prop not in WEIGHTS:
            continue
        total += WEIGHTS[prop] * average
    return total
# if __name__ == '__main__':
# import python_ta.contracts
# python_ta.contracts.check_all_contracts()
#
# import doctest
# doctest.testmod()
#
# import python_ta
# python_ta.check_all(config={
# 'max-line-length': 100,
# 'disable': ['E1136'],
# 'extra-imports': ['pygame', 'networkx', 'pygame_visualization', 'song_graph',
# 'computations', 'tkinter', 'spotify_methods', 'random', 'main',
# 'spotipy', 'spotipy.oauth2', 'main', 'graph_visualization', 'datetime',
# 'csv', 'plotly.graph_objects'],
# 'generated-members': ['pygame.*'],
# 'max-nested-blocks': 4,
# 'allowed-io': ['genres_to_songs', 'load_genres', 'load_artists_to_genres', 'load_songs',
# 'create_genre_graph']
# })