# train_model.py

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os  

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Ask which lyrics file to train on (name given without the .txt extension)
lyrics_file = input('Enter the lyrics file name: ')

# Load the lyrics file. The context manager closes the handle as soon as the
# text has been read instead of leaving it open for the whole training run.
with open(f'./Lyrics/{lyrics_file}.txt', encoding="utf8") as lyrics_handle:
    data = lyrics_handle.read()

# Lowercase and split the text into one entry per line
corpus = data.lower().split("\n")

# Preview the lyrics
print(corpus)

# Initialize the Tokenizer class
tokenizer = Tokenizer()

# Generate the word index dictionary
tokenizer.fit_on_texts(corpus)

# Define the total words. You add 1 for the index `0` which is just the padding token.
total_words = len(tokenizer.word_index) + 1

print(f'word index dictionary: {tokenizer.word_index}')
print(f'total words: {total_words}')

# Build the training sequences: every line contributes each of its prefixes
# that is at least two tokens long (tokens [a, b, c] give [a, b] and [a, b, c])
input_sequences = []

for line in corpus:

    # Tokenize the current line
    token_list = tokenizer.texts_to_sequences([line])[0]

    # Lines with fewer than two tokens produce an empty range and add nothing
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

# Without this guard, max() below fails with an opaque "empty sequence" error
if not input_sequences:
    raise ValueError(
        'No training sequences could be generated: the lyrics need at least '
        'one line containing two or more words.'
    )

# Get the length of the longest subphrase
max_sequence_len = max(len(sequence) for sequence in input_sequences)

# Pad all sequences on the left so the final token of each subphrase lines up
# in the last column (pad_sequences already returns a NumPy array)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Create inputs and label by splitting the last token in the subphrases
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

# Convert the label into one-hot arrays
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# Get sample sentence
sentence = corpus[0].split()
print(f'sample sentence: {sentence}')

# Convert the sample line to token indices with the tokenizer itself.
# Looking up whitespace-split words directly in `word_index` raises KeyError
# for any word that still carries punctuation the tokenizer strips
# (e.g. "hello,"), so use the same conversion the training data went through.
token_list = tokenizer.texts_to_sequences([corpus[0]])[0]

# Print the token list
print(token_list)

# Number of training examples 
print(f'Number of training examples: {len(xs)}')

# Model and optimizer settings
embedding_dim = 100
lstm_units = 300
learning_rate = 0.01

# Assemble the network layer by layer:
#   1. embed each token index into a dense vector
#   2. read the embedded sequence with a bidirectional LSTM
#   3. output a probability for every word in the vocabulary
model = Sequential()
model.add(Embedding(total_words, embedding_dim, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(lstm_units)))
model.add(Dense(total_words, activation='softmax'))

# The labels are one-hot vectors over the vocabulary, so train with
# categorical crossentropy and track accuracy
model.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Show the layer-by-layer summary
model.summary()

epochs = 100
batch_size = 128

# Batches per epoch. The last, partially filled batch still counts as a step,
# so this must be a ceiling division; round() can undercount by one (making
# the "every 10 epochs" checkpoints drift) or even produce 0 for tiny datasets.
steps_per_epoch = -(-len(xs) // batch_size)

# Create checkpoints that save every 10 epochs
# Include epoch in the filename 
checkpoint_path = "Checkpoints/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Make sure the checkpoint directory exists before anything is written to it
os.makedirs(checkpoint_dir, exist_ok=True)

# `save_freq` is expressed in batches, hence 10 epochs' worth of steps
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=10 * steps_per_epoch)

# Create callback that stops training if loss does not improve after 5 epochs
loss_callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

# Save the initial (untrained) weights using the 'checkpoint_path' format
model.save_weights(checkpoint_path.format(epoch=0))

model_name = input('Specify a name for the model: ')

# Train the model
history = model.fit(
    xs,
    ys,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[cp_callback, loss_callback],
    verbose=1,
)

# Save the trained model under the name the user chose
model.save(f'Models/{model_name}')

# Plotting helper
def plot_graphs(history, string):
    """Plot one recorded training metric against the epoch number.

    `history` is the object returned by `model.fit`; `string` is the key of
    the metric to read from `history.history` and is also used as the y-axis
    label. The figure is displayed with `plt.show()`.
    """
    metric_values = history.history[string]
    plt.plot(metric_values)
    plt.ylabel(string)
    plt.xlabel("Epochs")
    plt.show()

# Show how the training accuracy evolved over the epochs
metric_to_plot = 'accuracy'
plot_graphs(history, metric_to_plot)