"""Test vocabulary generation and tokenizer round-trips on LibriSpeech.

Character, word, and subword vocabularies are built from the
train-clean-100 transcripts, both from an in-memory text list and from a
dumped text file, and each tokenizer is checked for a lossless
encode/decode round-trip.
"""
import logging
import os
import tempfile

import pytest
from dotenv import dotenv_values

from s3prl.dataio.corpus.librispeech import LibriSpeech
from s3prl.dataio.encoder.tokenizer import load_tokenizer
from s3prl.dataio.encoder.vocabulary import generate_vocab

SAMPLE = "GOOD MORNING MY FRIEND"


def is_same_vocab(vocabs_1, vocabs_2):
    """Return True if the two vocab sequences match element-wise."""
    if len(vocabs_1) != len(vocabs_2):
        return False

    for v1, v2 in zip(vocabs_1, vocabs_2):
        if v1 != v2:
            return False

    return True
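

# Marked "corpus": needs a local LibriSpeech copy, whose path is read from
# the "LibriSpeech" entry of a .env file via dotenv_values().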
@pytest.mark.corpus
def test_vocabulary():
    config = dotenv_values()
    corpus = LibriSpeech(config["LibriSpeech"])
    text_list = corpus.data_dict["train-clean-100"]["text_list"]

    with tempfile.TemporaryDirectory() as directory:
        logging.info(directory)
        text_file = os.path.join(directory, "text.txt")

        with open(text_file, "w") as fp:
            for text in text_list:
                fp.write(text + "\n")
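
        # Character vocab: the list-based and file-based construction paths
        # should produce the same vocabulary, and the character tokenizer
        # should round-trip SAMPLE losslessly.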
        char_vocabs_1 = generate_vocab("character", text_list=text_list)
        char_vocabs_2 = generate_vocab("character", text_file=text_file)

        assert isinstance(char_vocabs_1, list)
        assert isinstance(char_vocabs_2, list)
        assert is_same_vocab(char_vocabs_1, char_vocabs_2)

        char_tokenizer = load_tokenizer("character", vocab_list=char_vocabs_1)
        assert char_tokenizer.decode(char_tokenizer.encode(SAMPLE)) == SAMPLE
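
        # Word vocab: capped at 5000 entries; again both construction paths
        # must agree and the word tokenizer must round-trip SAMPLE.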
        word_vocabs_1 = generate_vocab("word", text_list=text_list, vocab_size=5000)
        word_vocabs_2 = generate_vocab("word", text_file=text_file, vocab_size=5000)

        assert isinstance(word_vocabs_1, list)
        assert isinstance(word_vocabs_2, list)
        assert is_same_vocab(word_vocabs_1, word_vocabs_2)

        word_tokenizer = load_tokenizer("word", vocab_list=word_vocabs_1)
        assert word_tokenizer.decode(word_tokenizer.encode(SAMPLE)) == SAMPLE
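
        # Subword vocab: generate_vocab also trains a subword model, which
        # (judging from the ".model" suffix below) it saves next to
        # output_file for load_tokenizer to consume. Both tokenizers should
        # round-trip SAMPLE and agree on the same token ids.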
        vocab_file_1 = os.path.join(directory, "subword_1")
        vocab_file_2 = os.path.join(directory, "subword_2")

        subword_vocabs_1 = generate_vocab(
            "subword", text_list=text_list, vocab_size=500, output_file=vocab_file_1
        )
        subword_vocabs_2 = generate_vocab(
            "subword", text_file=text_file, vocab_size=500, output_file=vocab_file_2
        )

        subword_tokenizer_1 = load_tokenizer(
            "subword", vocab_file=vocab_file_1 + ".model"
        )
        subword_tokenizer_2 = load_tokenizer(
            "subword", vocab_file=vocab_file_2 + ".model"
        )
        assert subword_tokenizer_1.decode(subword_tokenizer_1.encode(SAMPLE)) == SAMPLE
        assert subword_tokenizer_2.decode(subword_tokenizer_2.encode(SAMPLE)) == SAMPLE
        assert subword_tokenizer_1.encode(SAMPLE) == subword_tokenizer_2.encode(SAMPLE)