File size: 7,818 Bytes

f5586d9

"""
This file manages the loading of the data
"""
import csv
import os
import pickle
import string

import numpy as np
import pretty_midi


def get_midi_files(midi_pickle, midi_folder, artists, names):
    """
    This function loads the midi files
    :param midi_pickle: path for the pickle file
    :param midi_folder: path for the midi folder
    :param artists: list of artist
    :param names: list of song names
    :return: list of pretty midi objects
    """
    # If the pickle file is already exists, read that file
    pretty_midi_songs = _read_pickle_if_exists(pickle_path=midi_pickle)
    if pretty_midi_songs is None:  # If the pickle is exists, covert the list into variables
        pretty_midi_songs = []
        lower_upper_files = get_lower_upper_dict(midi_folder)
        if len(artists) != len(names):
            raise Exception('Artists and Names lengths are different.')
        for artist, song_name in zip(artists, names):
            if song_name[0] == " ":
                song_name = song_name[1:]
            song_file_name = f'{artist}_-_{song_name}.mid'.replace(" ", "_")
            if song_file_name not in lower_upper_files:
                print(f'Song {song_file_name} does not exist, even though'
                      f' the song is provided in the training or testing sets')
                continue
            original_file_name = lower_upper_files[song_file_name]
            midi_file_path = os.path.join(midi_folder, original_file_name)
            try:
                pretty_midi_format = pretty_midi.PrettyMIDI(midi_file_path)
                pretty_midi_songs.append(pretty_midi_format)
            except Exception:
                print(f'Exception raised from Mido using this file: {midi_file_path}')

        _save_pickle(pickle_path=midi_pickle, content=pretty_midi_songs)
    return pretty_midi_songs


def get_lower_upper_dict(midi_folder):
    """
    This function maps between lower case name to upper case name
    :param midi_folder: midi folder path
    :return: A dictionary between lower case name to upper case name
    """
    lower_upper_files = {}
    for file_name in os.listdir(midi_folder):
        if file_name.endswith(".mid"):
            lower_upper_files[file_name.lower()] = file_name
    return lower_upper_files


def get_input_sets(input_file, pickle_path, word2vec, midi_folder) -> (list, list, list):
    """
    This function loads the training and testing set that provided by the course staff.
    In addition some pre-processing methods are work here.
    :param input_file: training or testing set path
    :param pickle_path: training or testing pickle path
    :param word2vec: dictionary maps between a word and a vector
    :param midi_folder: the midi folder that we use to validate if song is exists
    :return: Nothing
    """
    # If the pickle file is already exists, read that file
    pickle_value = _read_pickle_if_exists(pickle_path=pickle_path)
    # We want only songs with midi file
    lower_upper_files = get_lower_upper_dict(midi_folder)
    if pickle_value is not None:  # If the pickle is exists, covert the list into variables
        artists, names, lyrics = pickle_value[0], pickle_value[1], pickle_value[2]
    else:  # The pickle file is exists.
        artists, names, lyrics = [], [], []
        with open(input_file, newline='') as f:
            lines = csv.reader(f, delimiter=',', quotechar='|')
            for row in lines:
                artist_name = row[0]
                song_name = row[1]
                if song_name[0] == " ":
                    song_name = song_name[1:]
                song_file_name = f'{artist_name}_-_{song_name}.mid'.replace(" ", "_")
                if song_file_name not in lower_upper_files:
                    print(f'Song {song_file_name} does not exist, even though'
                          f' the song is provided in the training or testing sets')
                    continue
                original_file_name = lower_upper_files[song_file_name]
                midi_file_path = os.path.join(midi_folder, original_file_name)
                try:
                    pretty_midi.PrettyMIDI(midi_file_path)
                except Exception:
                    print(f'Exception raised from Mido using this file: {midi_file_path}')
                    continue
                song_lyrics = row[2]
                song_lyrics = song_lyrics.replace('&', '')
                song_lyrics = song_lyrics.replace('  ', ' ')
                song_lyrics = song_lyrics.replace('\'', '')
                song_lyrics = song_lyrics.replace('--', ' ')

                tokens = song_lyrics.split()
                table = str.maketrans('', '', string.punctuation)  # remove punctuation from each token
                tokens = [w.translate(table) for w in tokens]
                tokens = [word for word in tokens if
                          word.isalpha()]  # remove remaining tokens that are not alphabetic
                tokens = [word.lower() for word in tokens if word.lower() in word2vec]  # make lower case
                song_lyrics = ' '.join(tokens)
                artists.append(artist_name)
                names.append(song_name)
                lyrics.append(song_lyrics)
        _save_pickle(pickle_path=pickle_path, content=[artists, names, lyrics])

    return {'artists': artists, 'names': names, 'lyrics': lyrics}


def get_word2vec(word2vec_path, pre_trained, vector_size, encoding='utf-8') -> dict:
    """
    This function returns a dictionary that maps between word and a vector
    :param word2vec_path: path for the pickle file
    :param pre_trained: path for the pre-trained embedding file
    :param vector_size: the vector size for each word
    :param encoding: the encoding the the pre_trained file
    :return: dictionary maps between a word and a vector
    """
    # If the pickle file is already exists, read that file
    word2vec = _read_pickle_if_exists(word2vec_path)
    if word2vec is None:  # The pickle file is not exists.
        with open(pre_trained, 'r', encoding=encoding) as f:  # Read a pre-trained word vectors.
            list_of_lines = list(f)
        word2vec = _iterate_over_glove_list(list_of_lines=list_of_lines, vector_size=vector_size)
        _save_pickle(pickle_path=word2vec_path, content=word2vec)  # Save pickle for the next running
    return word2vec


def _iterate_over_glove_list(list_of_lines, vector_size):
    """
    This function iterates over the glove list line by line and returns a word2vec dictionary
    :param list_of_lines: List of glove lines
    :param vector_size: the size of the embedding vector size
    :return: dictionary maps between a word and a vector
    """
    word2vec = {}
    punctuation = string.punctuation
    for line in list_of_lines:
        values = line.split(' ')
        word = values[0]
        if word in punctuation:
            continue
        vec = np.asarray(values[1:], "float32")
        if len(vec) != vector_size:
            raise Warning(f"Vector size is different than {vector_size}")
        else:
            word2vec[word] = vec
    return word2vec


def _save_pickle(pickle_path, content):
    """
    This function saves a value to pickle file
    :param pickle_path: path for the pickle file
    :param content: the value you want to save
    :return: Nothing
    """
    with open(pickle_path, 'wb') as f:
        pickle.dump(content, f)


def _read_pickle_if_exists(pickle_path):
    """
    This function reads a pickle file
    :param pickle_path:path for the pickle file
    :return: the saved value in the pickle file
    """
    pickle_file = None
    if os.path.exists(pickle_path):
        with open(pickle_path, 'rb') as f:
            pickle_file = pickle.load(f)
    return pickle_file


print('Loaded Successfully')