File size: 7,818 Bytes
f5586d9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
"""
This file manages the loading of the data
"""
import csv
import os
import pickle
import string
import numpy as np
import pretty_midi
def get_midi_files(midi_pickle, midi_folder, artists, names):
    """
    This function loads the midi files, using a pickle file as a cache.
    If the pickle file exists, its content is returned as is. Otherwise every
    (artist, song name) pair is turned into the file name '<artist>_-_<song name>.mid'
    (spaces replaced by underscores), the file is parsed with pretty_midi, and the
    resulting list is saved to the pickle file.
    :param midi_pickle: path for the pickle file
    :param midi_folder: path for the midi folder
    :param artists: list of artist
    :param names: list of song names
    :return: list of pretty midi objects; songs whose file is missing or cannot be
             parsed are reported with a print and skipped
    :raises ValueError: if artists and names have different lengths (checked only
                        when the pickle file does not exist)
    """
    # If the pickle file already exists, return its content
    pretty_midi_songs = _read_pickle_if_exists(pickle_path=midi_pickle)
    if pretty_midi_songs is not None:
        return pretty_midi_songs

    # The pickle file does not exist: build the list from the midi folder
    if len(artists) != len(names):
        raise ValueError('Artists and Names lengths are different.')

    pretty_midi_songs = []
    lower_upper_files = get_lower_upper_dict(midi_folder)
    for artist, song_name in zip(artists, names):
        # Drop one leading space; startswith() is also safe for an empty song name
        if song_name.startswith(" "):
            song_name = song_name[1:]
        # NOTE(review): the file map is keyed by lower-cased file names while the name
        # built here is not lower-cased, so only already lower-case names can match.
        song_file_name = f'{artist}_-_{song_name}.mid'.replace(" ", "_")
        original_file_name = lower_upper_files.get(song_file_name)
        if original_file_name is None:
            print(f'Song {song_file_name} does not exist, even though'
                  f' the song is provided in the training or testing sets')
            continue
        midi_file_path = os.path.join(midi_folder, original_file_name)
        try:
            pretty_midi_format = pretty_midi.PrettyMIDI(midi_file_path)
        except Exception:
            # Best effort: any parsing failure is reported and the song is skipped
            print(f'Exception raised from Mido using this file: {midi_file_path}')
        else:
            pretty_midi_songs.append(pretty_midi_format)

    _save_pickle(pickle_path=midi_pickle, content=pretty_midi_songs)
    return pretty_midi_songs
def get_lower_upper_dict(midi_folder):
    """
    This function maps the lower-cased name of every '.mid' file in a folder
    to the file name as it is stored on disk
    :param midi_folder: midi folder path
    :return: A dictionary from lower case file name to the original file name
    """
    return {
        entry.lower(): entry
        for entry in os.listdir(midi_folder)
        if entry.endswith(".mid")  # only the lower-case '.mid' extension is accepted
    }
def _clean_lyrics(raw_lyrics, word2vec, punctuation_table):
    """
    This function normalizes the lyrics of a single song
    :param raw_lyrics: the lyrics as they appear in the input file
    :param word2vec: dictionary maps between a word and a vector
    :param punctuation_table: translation table that deletes punctuation characters
    :return: the lower-cased, alphabetic words that exist in word2vec, joined by one space
    """
    text = raw_lyrics.replace('&', '').replace('\'', '').replace('--', ' ')
    # split() already collapses any run of whitespace, so no extra space clean-up is needed
    stripped = (token.translate(punctuation_table) for token in text.split())
    lowered = (token.lower() for token in stripped if token.isalpha())
    return ' '.join(token for token in lowered if token in word2vec)


def get_input_sets(input_file, pickle_path, word2vec, midi_folder) -> dict:
    """
    This function loads the training and testing set that provided by the course staff.
    If the pickle file exists, its content is used. Otherwise the csv input file is read,
    only songs that have a midi file that pretty_midi can parse are kept, the lyrics are
    pre-processed, and the result is saved to the pickle file.
    :param input_file: training or testing set path (csv: artist, song name, lyrics)
    :param pickle_path: training or testing pickle path
    :param word2vec: dictionary maps between a word and a vector
    :param midi_folder: the midi folder that we use to validate if song is exists
    :return: dictionary with the keys 'artists', 'names' and 'lyrics', each one a list
    """
    # If the pickle file already exists, convert its list into the returned dictionary
    pickle_value = _read_pickle_if_exists(pickle_path=pickle_path)
    if pickle_value is not None:
        artists, names, lyrics = pickle_value[0], pickle_value[1], pickle_value[2]
        return {'artists': artists, 'names': names, 'lyrics': lyrics}

    # The pickle file does not exist. We want only songs with midi file
    lower_upper_files = get_lower_upper_dict(midi_folder)
    # Built once: removes punctuation from each token
    punctuation_table = str.maketrans('', '', string.punctuation)
    artists, names, lyrics = [], [], []
    with open(input_file, newline='') as f:
        for row in csv.reader(f, delimiter=',', quotechar='|'):
            if len(row) < 3:
                # Blank or incomplete line: there is no artist/name/lyrics triple to read
                continue
            artist_name, song_name = row[0], row[1]
            # Drop one leading space; startswith() is also safe for an empty song name
            if song_name.startswith(" "):
                song_name = song_name[1:]
            song_file_name = f'{artist_name}_-_{song_name}.mid'.replace(" ", "_")
            original_file_name = lower_upper_files.get(song_file_name)
            if original_file_name is None:
                print(f'Song {song_file_name} does not exist, even though'
                      f' the song is provided in the training or testing sets')
                continue
            midi_file_path = os.path.join(midi_folder, original_file_name)
            try:
                # Parsed only to validate the file; the object itself is not kept
                pretty_midi.PrettyMIDI(midi_file_path)
            except Exception:
                print(f'Exception raised from Mido using this file: {midi_file_path}')
                continue
            artists.append(artist_name)
            names.append(song_name)
            lyrics.append(_clean_lyrics(row[2], word2vec, punctuation_table))

    _save_pickle(pickle_path=pickle_path, content=[artists, names, lyrics])
    return {'artists': artists, 'names': names, 'lyrics': lyrics}
def get_word2vec(word2vec_path, pre_trained, vector_size, encoding='utf-8') -> dict:
    """
    This function returns a dictionary that maps between word and a vector.
    If the pickle file exists, its content is returned. Otherwise the pre-trained
    embedding file is parsed and the result is saved to the pickle file.
    :param word2vec_path: path for the pickle file
    :param pre_trained: path for the pre-trained embedding file
    :param vector_size: the vector size for each word
    :param encoding: the encoding of the pre_trained file
    :return: dictionary maps between a word and a vector
    """
    # If the pickle file already exists, read that file
    word2vec = _read_pickle_if_exists(word2vec_path)
    if word2vec is None:  # The pickle file does not exist.
        with open(pre_trained, 'r', encoding=encoding) as f:  # Read a pre-trained word vectors.
            # The file object is iterated line by line, so the whole file is never
            # held in memory as a list
            word2vec = _iterate_over_glove_list(list_of_lines=f, vector_size=vector_size)
        _save_pickle(pickle_path=word2vec_path, content=word2vec)  # Save pickle for the next running
    return word2vec
def _iterate_over_glove_list(list_of_lines, vector_size):
"""
This function iterates over the glove list line by line and returns a word2vec dictionary
:param list_of_lines: List of glove lines
:param vector_size: the size of the embedding vector size
:return: dictionary maps between a word and a vector
"""
word2vec = {}
punctuation = string.punctuation
for line in list_of_lines:
values = line.split(' ')
word = values[0]
if word in punctuation:
continue
vec = np.asarray(values[1:], "float32")
if len(vec) != vector_size:
raise Warning(f"Vector size is different than {vector_size}")
else:
word2vec[word] = vec
return word2vec
def _save_pickle(pickle_path, content):
"""
This function saves a value to pickle file
:param pickle_path: path for the pickle file
:param content: the value you want to save
:return: Nothing
"""
with open(pickle_path, 'wb') as f:
pickle.dump(content, f)
def _read_pickle_if_exists(pickle_path):
"""
This function reads a pickle file
:param pickle_path:path for the pickle file
:return: the saved value in the pickle file
"""
pickle_file = None
if os.path.exists(pickle_path):
with open(pickle_path, 'rb') as f:
pickle_file = pickle.load(f)
return pickle_file
# NOTE(review): the indentation of this file is lost in this view; read as a module-level
# statement, this prints once whenever the module is imported or run - confirm it is intended.
print('Loaded Successfully')
|