# song_graph.py

"""
Song graph and related methods
"""
from __future__ import annotations
import csv
import datetime
from typing import Union, Tuple, Optional

# Paths of the CSV data files that the loader docstrings in this module refer to.
SONG_DATA = 'Data/data.csv'
ARTIST_DATA_W_GENRES = 'Data/data_w_genres.csv'
GENRE_DATA = 'Data/data_by_genres.csv'

# Column names that load_songs stores in Song.properties (how a track sounds).
PROPERTIES = {'acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'mode',
              'liveness', 'loudness', 'speechiness', 'tempo', 'valence'}
# Column names that load_songs stores in Song.information (how a track is identified).
INFORMATION = {'artists', 'duration', 'explicit', 'id', 'name', 'release_date', 'year',
               'popularity'}

# 50 most popular genres. genres_to_songs uses this list as the candidate genres for a song
# whose artist has no known genres.
GENRES = ['adult standards', 'album rock', 'alternative rock', 'brill building pop',
          'classic bollywood', 'classic rock', 'classical', 'contemporary country', 'cool jazz',
          'country', 'country rock', 'dance pop', 'disco', 'early romantic era', 'easy listening',
          'filmi', 'folk', 'folk rock', 'german romanticism', 'hard rock', 'impressionism',
          'italian romanticism', 'lounge', 'mellow gold', 'modern rock', 'motown', 'norteno',
          'nu metal', 'outlaw country', 'pop', 'pop rock', 'post-teen pop', 'progressive house',
          'psychedelic rock', 'punk', 'quiet storm', 'ranchera', 'rebetiko', 'rock',
          'rock-and-roll', 'singer-songwriter', 'soft rock', 'soul', 'stride', 'swing', 'tango',
          'torch song', 'vintage tango', 'vocal jazz', 'yacht rock']

# Weights of how much each property affects the rating computed by get_song_rating and
# get_genre_rating. The fractional weights for key, loudness and tempo presumably scale those
# properties down to roughly the same range as the others (TODO confirm).
# NOTE(review): key is weighted by 1 / 9 here but song_to_genre divides key by 10.
WEIGHTS = {'acousticness': 1, 'danceability': 1, 'energy': 1, 'instrumentalness': 1, 'key': 1 / 9,
           'mode': 1, 'liveness': 1, 'loudness': 1 / 59, 'speechiness': 1, 'tempo': 1 / 145,
           'valence': 1}


class Song:
    """
    Contain information about songs that spotify collects

    Instance Attributes:
        - properties: maps data that spotify collects that are ways to describe how a song sounds.
            See below for specifics
        - information: maps data that spotify collects that are objective ways to identify /
            classify the song. See below for specifics
        - neighbours: maps the spotify ID of each adjacent song to the similarity score stored
            on that edge. SongGraph.add_edge writes the same score on both endpoints.
        - name: name of the song, (this is also in information but is convenient to have
            nonetheless)
        - genre: name of the genre this song was assigned to. It is the empty string until
            song_to_genre classifies the song.

    Properties includes:
        - acousticness: relative measure of how acoustic a track is
        - danceability: relative measure of how danceable a track is
        - energy: relative measure of how energetic a track is
        - instrumentalness: how close a track is to being instrumental. Closer to 1 the more inst.
        - key: primary key of the track encoded as an integer from 1-11
        - mode: binary value. (1) if track starts in major chord progression, (0) if not
        - liveness: relative duration of the track sounding like a live performance
        - loudness: relative loudness of the track in decibel range [-60, 0]
        - speechiness: relative length of the track containing human voice
        - tempo: BPM of the track
        - valence: How positive a track sounds (like positivity / happiness)

    Information includes:
        - artists: List of artists credited
        - duration: length of the track in ms
        - explicit: binary value if the song is explicit. (1) if explicit
        - id: primary ID for the song generated by spotify
        - name: title of the track, as a string
        - release_date: release date in y-m-d or y-m or just y
        - year: release year
        - popularity: popularity of the song lately (in the US)

    Representation Invariants:
        - all([key in {'acousticness', 'danceability', 'energy', 'instrumentalness', 'key',
               'mode', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence'}
               for key in self.properties])
        - all([key in {'artists', 'duration', 'explicit', 'id', 'name',
               'release_date', 'year', 'popularity'} for key in self.information])
        - self.information.get('id') not in self.neighbours

    """
    properties: dict[str, Union[int, float]]
    information: dict[str, Union[str, int, float, datetime.datetime, list[str]]]
    neighbours: dict[str, Union[int, float]]
    name: str
    genre: str

    def __init__(self, properties: dict[str, Union[int, float]],
                 information: dict[str, Union[str, int, float, datetime.datetime, list[str]]],
                 name: str) -> None:
        """
        Initialize a new song with no neighbour data and no assigned genre.

        properties and information are stored as given (not copied).
        """
        self.properties = properties
        self.information = information
        self.neighbours = {}
        self.name = name
        # Give the declared attribute a value so that reading song.genre before the song has
        # been classified yields '' instead of raising AttributeError.
        self.genre = ''

    def get_degree(self) -> int:
        """
        Return the degree of this song, i.e. how many neighbours it has.
        """
        return len(self.neighbours)


class SongGraph:
    """
    Graph of songs within a particular genre. Each vertex represents a song and an edge
    represents similar songs

    Instance attributes:
        - songs: maps spotify ID to the Song vertex with that ID
    """
    songs: dict[str, Song]

    def __init__(self) -> None:
        """
        Initialize an empty graph (no songs, no edges).
        """
        self.songs = {}

    def add_song(self, song: Song) -> None:
        """
        Add song as a vertex, keyed by song.information['id'].

        A song already stored under the same ID is replaced.
        """
        self.songs[song.information['id']] = song

    def add_edge(self, id_1: str, id_2: str, sim_score: float) -> None:
        """
        Add an edge weighted sim_score between the songs with IDs id_1 and id_2.

        The score is recorded in the neighbours mapping of both songs.

        Raise ValueError if either ID is not a vertex of this graph.
        """
        if id_1 not in self.songs or id_2 not in self.songs:
            raise ValueError(f'both songs must be in the graph: {id_1!r}, {id_2!r}')

        self.songs[id_1].neighbours[id_2] = sim_score
        self.songs[id_2].neighbours[id_1] = sim_score

    def search_song(self, song_id: str) -> bool:
        """Return whether a song with the given ID is in this graph."""
        return song_id in self.songs

    def sg_get_song(self, song_id: str) -> Optional[Song]:
        """Return the song vertex with the given ID, or None if there is no such song."""
        return self.songs.get(song_id)

    def sg_insert_song(self, song: Song, thresh: float = 0.1) -> None:
        """
        Insert song into the graph and connect it to every other song whose rating
        (see get_song_rating) differs from this song's rating by at most thresh.

        Each new edge is weighted by that absolute rating difference.
        Assume the song is not already in the graph.
        """
        self.add_song(song)
        new_id = song.information['id']
        rating = get_song_rating(song)

        # add_edge only mutates the neighbours of the Song objects, never self.songs, so it
        # is safe to call while iterating over self.songs.
        for other_id, other_song in self.songs.items():
            if other_id == new_id:
                continue

            # Compare the absolute difference: the signed difference would accept every
            # song rated lower than the new one, no matter how far apart they are.
            difference = abs(get_song_rating(other_song) - rating)
            if difference <= thresh:
                self.add_edge(new_id, other_id, difference)


class Genre:
    """
    A single genre: a vertex of the genre graph.

    Instance attributes:
        - song_graph: the SongGraph holding only the songs that belong to this genre
        - average_properties: the average value of each property over the songs of this genre
        - median_properties: the median value of each property over the songs of this genre.
          Starts out empty.
        - neighbours: maps the name of each similar genre to the similarity score of that edge.
          Starts out empty.
        - name: the name of this genre

    Representation invariants:
        - all([prop in PROPERTIES for prop in average_properties])
        - all([prop in PROPERTIES for prop in median_properties])
        - name is in Data/data_by_genres.csv or the equivalent file.
        - all neighbours are in Data/data_by_genres.csv or the equivalent file.
    """
    song_graph: SongGraph
    average_properties: dict[str, Union[int, float]]
    median_properties: dict[str, Union[int, float]]
    neighbours: dict[str, Union[int, float]]
    name: str

    def __init__(self, song_graph: SongGraph, avg_props: dict[str, Union[int, float]],
                 name: str) -> None:
        """
        Initialize a genre called name from its song graph and average properties.

        The genre starts with no median properties and no neighbours.
        """
        self.name = name
        self.song_graph = song_graph
        self.average_properties = avg_props
        # Two separate dicts: these mappings are filled in independently later on.
        self.median_properties, self.neighbours = {}, {}


class GenreGraph:
    """
    Graph of genres: each vertex is a Genre object and each edge joins two similar genres.

    Instance attributes:
        - genres: maps genre name to the Genre object with that name
    """
    genres: dict[str, Genre]

    def __init__(self) -> None:
        """
        Initialize an empty genre graph.
        """
        self.genres = {}

    def add_genre(self, genre: Genre) -> None:
        """
        Add genre as a vertex, keyed by its name.
        """
        self.genres[genre.name] = genre

    def add_edge(self, genre_1: str, genre_2: str, sim_score: float) -> None:
        """
        Add an edge weighted sim_score between the genres named genre_1 and genre_2.

        Raise ValueError if either name is not a vertex of this graph.
        """
        both_present = genre_1 in self.genres and genre_2 in self.genres
        if not both_present:
            raise ValueError

        first, second = self.genres[genre_1], self.genres[genre_2]
        first.neighbours[genre_2] = sim_score
        second.neighbours[genre_1] = sim_score

    def get_song(self, song: Song) -> Song:
        """Return the stored vertex that has song's ID, looked up in the graph of song.genre."""
        genre_songs = self.genres[song.genre].song_graph.songs
        return genre_songs[song.information['id']]

    def insert_song(self, song: Song) -> None:
        """Insert song into the song graph of the genre named song.genre."""
        target_graph = self.genres[song.genre].song_graph
        target_graph.sg_insert_song(song)


def load_genres(genres_file: str) -> dict[str, dict[str, float]]:
    """
    Read the given genres file and return a dict mapping each genre name to its average
    properties.

    The first column of every row is taken as the genre name. Every remaining column is
    converted to a float and stored under that column's header.

    Preconditions:
        - genres_file is formatted in the same way as data/data_by_genres.csv
        - genres_file is the path to the type of file described above
    """
    with open(genres_file) as genre_data:
        rows = csv.reader(genre_data)
        # Skip the header of the name column; the rest name the properties.
        property_names = next(rows)[1:]

        averages = {}
        for row in rows:
            name, values = row[0], row[1:]
            averages[name] = {prop: float(values[position])
                              for position, prop in enumerate(property_names)}

    return averages


def load_artists_to_genres(art_genres_file: str) -> dict[str, list[str]]:
    """
    Return a mapping from every artist in the given artists-with-genres file to the list of
    genres of music they make.

    The artist is read from the first column and the genres from the last column; both are
    passed through clean_lists. An artist with no listed genres maps to an empty list.

    Preconditions:
        - art_genres_file is formatted in the same way as data/data_w_genres.csv
        - art_genres_file is the path to the type of file described above
    """
    artist_to_genres = {}

    with open(art_genres_file, encoding="ISO-8859-1") as art_g_data:
        rows = csv.reader(art_g_data)
        next(rows)  # discard the header row

        for row in rows:
            artist = clean_lists(row[0])[0]
            genres = clean_lists(row[-1])
            # clean_lists turns an empty genre cell into [''], which means "no genres".
            artist_to_genres[artist] = [] if genres == [''] else genres

    return artist_to_genres


def song_to_genre(song: Song, genres: list[str], g_to_props: dict[str, dict[str, float]]) -> str:
    """
    Return the genre from genres that song most likely belongs to, and store it in song.genre.

    genres is a list of candidate genres (the genres the artist who wrote the song is known
    to make). g_to_props is a mapping of genres to their average or median properties.

    The chosen genre is the candidate whose properties have the smallest total absolute
    difference from the song's properties; on a tie the earliest candidate wins. Candidates
    missing from g_to_props are ignored, and '' is returned if no candidate is usable.

    Preconditions:
        - g_to_props contains all genres from the data file
        - song is a song in the data file
    """
    # Divisors that bring these properties to ~ [0, 1] like the other properties.
    divisors = {'tempo': 145, 'loudness': 59, 'key': 10}
    # Genre columns that take no part in the comparison.
    ignored = {'popularity', 'duration_ms'}

    # The total difference between a song and a genre cannot be greater than 15, so this
    # starting value is always beaten by the first usable candidate.
    smallest_total = 999999
    best_match = ''

    for candidate in genres:
        if candidate not in g_to_props:
            continue

        reference = g_to_props[candidate]
        total = 0

        for prop in reference:
            if prop in divisors:
                total += abs(song.properties[prop] - reference[prop]) / divisors[prop]
            elif prop not in ignored:
                total += abs(song.properties[prop] - reference[prop])

        if total < smallest_total:
            best_match, smallest_total = candidate, total

    song.genre = best_match
    return best_match


def _drop_escape_slashes(text: str) -> str:
    """Return text without the backslash that precedes an apostrophe or another backslash."""
    kept = []
    position = 0

    while position < len(text):
        is_escape = (text[position] == '\\' and position + 1 < len(text)
                     and text[position + 1] in ('\\', '\''))
        if is_escape:
            # Skip the escaping backslash; the character it escapes is kept below.
            position += 1

        kept.append(text[position])
        position += 1

    return ''.join(kept)


def clean_lists(lst: str) -> list[str]:
    """
    Cleans a list-like cell (e.g. the artists column) from the csv file.

    The cell is split on ', '. For every item, the surrounding double quotes, the opening
    and closing square brackets and the surrounding apostrophes are removed. The artists
    column is inconsistent, so quotes and apostrophes are stripped a second time after the
    brackets are gone. Finally the escaping backslash is removed from every
    backslash-apostrophe and backslash-backslash pair.

    Returns the list of cleaned items. An empty cell yields [''].
    """
    cleaned = []

    for piece in lst.split(', '):
        piece = piece.strip('"').removeprefix('[').removesuffix(']')
        piece = piece.removeprefix('\'').removesuffix('\'')

        # Second pass for items that were wrapped in more than one layer of quoting.
        piece = piece.strip('"')
        piece = piece.removeprefix('\'').removesuffix('\'')

        # Build the unescaped text as a new string and store it: editing the string while
        # indexing it by its original length can run past the end, and rebinding a loop
        # variable would leave the returned list unchanged.
        cleaned.append(_drop_escape_slashes(piece))

    return cleaned


def clean_data(properties: dict[str, str], information: dict[str, str]) -> \
        tuple[dict[str, Union[int, float]], dict[str, Union[str, int, float, datetime.datetime,
                                                            list[str]]]]:
    """
    Convert the raw string values of properties and information to their appropriate data
    types and return the two converted mappings as (properties, information).

    Keys that are not recognised are left out of the result. A release date that lacks a
    month or a day gets 1 for the missing parts.

    Preconditions:
        - Properties and information formatted as described above
    """
    int_properties = {'key', 'mode'}
    float_properties = {'acousticness', 'danceability', 'energy', 'instrumentalness',
                        'liveness', 'loudness', 'speechiness', 'tempo', 'valence'}

    new_props = {}
    for prop, raw in properties.items():
        if prop in int_properties:
            new_props[prop] = int(raw)
        elif prop in float_properties:
            new_props[prop] = float(raw)

    new_info = {}
    for info, raw in information.items():
        if info == 'release_date':
            # Dates come as y-m-d, y-m or just y; pad the missing parts with 1.
            parts = [int(piece) for piece in str(raw).split('-')]
            parts.extend([1] * (3 - len(parts)))
            new_info[info] = datetime.datetime(parts[0], parts[1], parts[2])
        elif info == 'artists':
            new_info[info] = clean_lists(raw)
        elif info == 'explicit':
            new_info[info] = int(raw)
        elif info in ('duration', 'popularity'):
            new_info[info] = float(raw)
        elif info in ('id', 'name', 'year'):
            new_info[info] = raw

    return new_props, new_info


def load_songs(songs_file: str) -> dict[str, Song]:
    """
    Load every song in songs_file and return a mapping from each song's spotify ID to its
    Song object.

    Columns whose header is in PROPERTIES become the song's properties and columns whose
    header is in INFORMATION become its information; any other column is ignored. The values
    are converted by clean_data before the Song is built.

    Preconditions:
        - songs_file is the path to a CSV file corresponding to the song data
          format as described in the Song class.
    """
    loaded = {}

    with open(songs_file, encoding="ISO-8859-1") as song_data:
        rows = csv.reader(song_data)
        header = next(rows)

        for row in rows:
            raw_properties = {}
            raw_information = {}

            for position, column in enumerate(header):
                if column in PROPERTIES:
                    raw_properties[column] = row[position]
                elif column in INFORMATION:
                    raw_information[column] = row[position]

            new_props, new_info = clean_data(raw_properties, raw_information)
            loaded[raw_information['id']] = Song(new_props, new_info, raw_information['name'])

    return loaded


def genres_to_songs(songs_file: str, artists_file: str, genres_file: str) -> Tuple[dict, dict]:
    """
    Assign a genre to every song in songs_file and return a tuple of two mappings:
        - genre name -> list of the Song objects assigned to that genre (every genre in
          genres_file is a key, possibly with an empty list)
        - spotify ID of a song -> name of the genre it was assigned to

    A song is classified by song_to_genre among the genres of its first credited artist (the
    second one if the first is 'n/a' and a second exists). If that artist has no known genres,
    or none of them appear in genres_file, the GENRES list is used instead. A song that still
    cannot be matched to any genre is left out of both mappings.

    Preconditions:
        - songs_file is a path to a csv file structured in the same way as 'Data/data.csv'
        - artists_file is a path to a csv file structured as 'Data/data_w_genres.csv' is.
        - genres_file is a path to a csv file structured as 'Data/data_by_genres.csv' is
    """
    genres_to_prop = load_genres(genres_file)
    print('Loading genres finished. Next, loading artists:')
    artists_to_genre = load_artists_to_genres(artists_file)
    print('Loading artists finished. Next, loading Songs:')
    songs = load_songs(songs_file)
    print('loading songs finished. Next, assigning genres:')

    genre_to_songs = {genre: [] for genre in genres_to_prop}
    songs_to_genre = {}

    for song in songs.values():
        credited = song.information['artists']
        artist = credited[0]

        # Only fall through to the second artist when there is one; a lone 'n/a' credit
        # would otherwise raise IndexError.
        if artist == 'n/a' and len(credited) > 1:
            artist = credited[1]

        known_genres = artists_to_genre.get(artist, [])
        genre = song_to_genre(song, known_genres, genres_to_prop) if known_genres else ''

        if not genre:
            # Either the artist has no known genres or none of them are in genres_file.
            genre = song_to_genre(song, GENRES, genres_to_prop)

        if not genre:
            # song_to_genre returns '' when nothing matched; '' is not a genre, so skip the
            # song instead of failing on genre_to_songs[''].
            continue

        genre_to_songs[genre].append(song)
        songs_to_genre[song.information['id']] = genre

    return genre_to_songs, songs_to_genre


def get_song_rating(song: Song) -> float:
    """
    Return the rating of a song.

    The rating is the sum, over every property of the song, of the property's value
    multiplied by its weight in WEIGHTS. (FILLER RATING FOR NOW)

    Preconditions:
        - song.properties != {}
    """
    # Accumulate one term at a time, in the order the properties are stored.
    rating = 0

    for prop, value in song.properties.items():
        rating += WEIGHTS[prop] * value

    return rating


def create_song_graph(songs: list[Song], threshold: float) -> SongGraph:
    """
    Takes a list of songs and returns their SongGraph. The song graph connects every pair of
    songs whose ratings (see get_song_rating) are less than 'threshold' apart; each edge is
    weighted by the absolute difference between the two ratings.

    Preconditions:
        - threshold > 0
        - all([song.properties != {} for song in songs])
    """
    graph = SongGraph()
    song_ratings = []

    for song in songs:
        graph.add_song(song)
        song_ratings.append((get_song_rating(song), song.information['id']))

    # With the ratings sorted, the songs close to a given song form a contiguous run right
    # after it, so each scan can stop at the first song that is too far away.
    song_ratings.sort(key=lambda pair: pair[0])
    total = len(song_ratings)

    for index, (rating, song_id) in enumerate(song_ratings):
        for other_index in range(index + 1, total):
            other_rating, other_id = song_ratings[other_index]

            # Compute the weight for this exact pair before testing it, so every edge gets
            # its own weight and no song beyond the threshold is linked.
            weight = abs(rating - other_rating)
            if weight >= threshold:
                break

            graph.add_edge(song_id, other_id, weight)

    return graph


def create_genre_graph(songs_file: str, artists_file: str, genres_file: str,
                       threshold: float) -> Tuple[GenreGraph, dict]:
    """
    Returns a tuple of the main genre graph to be used to recommend songs and the mapping of
    song IDs to genre names produced by genres_to_songs.

    The graph connects similar 'Genre' objects together. Each genre object contains a
    song graph (built by create_song_graph with the same threshold) which connects similar
    songs of that genre together.

    The graph connects every pair of genres whose ratings (see get_genre_rating) are less
    than threshold apart; each edge is weighted by the absolute difference between the two
    ratings.

    Preconditions:
        - songs_file is a path to a csv file structured in the same way as 'Data/data.csv'
        - artists_file is a path to a csv file structured as 'Data/data_w_genres.csv' is.
        - genres_file is a path to a csv file structured as 'Data/data_by_genres.csv' is
        - threshold > 0
    """
    g_to_songs, songs_to_g = genres_to_songs(songs_file, artists_file, genres_file)
    print('Assigning Genres finished. Last, making graph.')
    genres_to_prop = load_genres(genres_file)

    genre_graph = GenreGraph()
    ratings = []

    for genre, genre_songs in g_to_songs.items():
        curr_song_graph = create_song_graph(genre_songs, threshold)
        curr_genre = Genre(curr_song_graph, genres_to_prop[genre], genre)

        genre_graph.add_genre(curr_genre)
        ratings.append((get_genre_rating(curr_genre), genre))

    # With the ratings sorted, the genres close to a given genre form a contiguous run right
    # after it, so each scan can stop at the first genre that is too far away.
    ratings.sort(key=lambda pair: pair[0])
    total = len(ratings)

    for index, (rating, genre_name) in enumerate(ratings):
        for other_index in range(index + 1, total):
            other_rating, other_name = ratings[other_index]

            # Compute the weight for this exact pair before testing it, so every edge gets
            # its own weight and no genre beyond the threshold is linked.
            weight = abs(rating - other_rating)
            if weight >= threshold:
                break

            genre_graph.add_edge(genre_name, other_name, weight)

    return genre_graph, songs_to_g


def get_genre_rating(genre: Genre) -> float:
    """
    Return the rating for a genre.

    The rating is the sum, over every average property of the genre that has a weight in
    WEIGHTS, of the property's value multiplied by that weight. Properties without a weight
    are ignored. (FILLER RATING FOR NOW)

    Preconditions:
        - genre.average_properties != {}
    """
    # Accumulate one term at a time, in the order the properties are stored.
    rating = 0

    for prop, value in genre.average_properties.items():
        if prop not in WEIGHTS:
            continue
        rating += WEIGHTS[prop] * value

    return rating


# if __name__ == '__main__':
#     import python_ta.contracts
#     python_ta.contracts.check_all_contracts()
#
#     import doctest
#     doctest.testmod()
#
#     import python_ta
#     python_ta.check_all(config={
#         'max-line-length': 100,
#         'disable': ['E1136'],
#         'extra-imports': ['pygame', 'networkx', 'pygame_visualization', 'song_graph',
#                           'computations', 'tkinter', 'spotify_methods', 'random', 'main',
#                           'spotipy', 'spotipy.oauth2', 'main', 'graph_visualization', 'datetime',
#                           'csv', 'plotly.graph_objects'],
#         'generated-members': ['pygame.*'],
#         'max-nested-blocks': 4,
#         'allowed-io': ['genres_to_songs', 'load_genres', 'load_artists_to_genres', 'load_songs',
#                        'create_genre_graph']
#     })