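"""
Collects token-length statistics (samples, mean, stdev, min, max, median) over a text
dataset using a NeMo-supported tokenizer, and optionally saves the raw lengths, a JSON
stats file, and a length histogram.

Illustrative usage (script name, file paths, and model paths below are placeholders):

    python collect_tokenizer_dataset_stats.py \
        --tokenizer_library=sentencepiece \
        --tokenizer_model=tokenizer.model \
        --out_dir=stats \
        train.txt valid.txt
"""
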
import argparse
import json
import multiprocessing as mp
import os

import numpy as np
from matplotlib import pyplot as plt

from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

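# Per-worker global state: each process in the multiprocessing pool builds its own
# tokenizer once (in init_tokenizer) and stores it here, instead of re-creating it
# for every line.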
worker_data = {
    "tokenizer": None,
}


def init_tokenizer(library, tokenizer_model):
    tokenizer = get_nmt_tokenizer(library=library, tokenizer_model=tokenizer_model)
    worker_data["tokenizer"] = tokenizer


def read_batch(fh, batch_size):
    """
    Reads a batch (or a smaller, final chunk) of lines from an open file.
    """
    lines = []
    for i in range(batch_size):
        l = fh.readline()
        if not l:
            break
        else:
            lines.append(l.strip())

    return lines


def tokenize_line(line, tokenizer):
    """
    Returns the list of token ids for a text line.
    """
    line = line.rstrip("\n")
    tokens = tokenizer.text_to_ids(line)

    return tokens


def line_len(line, tokenizer=None):
    """
    Returns the tokenized length of a text line.
    """
    if tokenizer is None:
        # Running inside a worker process: use the tokenizer built by init_tokenizer.
        tokenizer = worker_data["tokenizer"]

    tokens = tokenize_line(line, tokenizer)

    return len(tokens)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Collects statistics over a tokenized dataset')
    parser.add_argument('input_files', metavar='N', type=str, nargs='+', help='Input files to parse')
    parser.add_argument(
        '--tokenizer_library', type=str, required=True, help='Tokenizer library (e.g., sentencepiece)'
    )
    parser.add_argument(
        '--tokenizer_model', type=str, required=True, help='Path to a pre-trained NeMo-supported tokenizer model'
    )
    parser.add_argument(
        '--num_workers', type=int, default=mp.cpu_count(), help='Number of workers (defaults to the number of CPUs)'
    )
    parser.add_argument('--max_lines', type=int, default=-1, help='Max number of lines to parse')
    parser.add_argument('--batch_size', type=int, default=10000000, help='Batch size to parse in parallel')
    parser.add_argument('--out_dir', type=str, default="", help='Path to store data and plots')

    args = parser.parse_args()

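    # Instantiate the tokenizer once in the main process; worker processes build their
    # own copies via init_tokenizer.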
    tokenizer = get_nmt_tokenizer(library=args.tokenizer_library, tokenizer_model=args.tokenizer_model)

    all_len = []

    # iterate over all input files
    for fn in args.input_files:
        print(f"Parsing fn = {fn}")

        fh = open(fn)

        # read the file in batches
        while True:
            lines = read_batch(fh, args.batch_size)

            # move on to the next file once this one is exhausted
            if not lines:
                break

            # tokenize the batch in parallel worker processes
            with mp.Pool(
                args.num_workers, initializer=init_tokenizer, initargs=(args.tokenizer_library, args.tokenizer_model)
            ) as p:
                all_len.extend(p.map(line_len, lines))

            print(f"{fn}: Parsed {len(all_len)} lines")

            # early stop, if required
            if (args.max_lines > 0) and (len(all_len) >= args.max_lines):
                all_len = all_len[: args.max_lines]
                break

        fh.close()

        # early stop across files, if required
        if (args.max_lines > 0) and (len(all_len) >= args.max_lines):
            all_len = all_len[: args.max_lines]
            break

    if args.out_dir:
        os.makedirs(args.out_dir, exist_ok=True)

    stats = {
        "samples": int(len(all_len)),
        "mean": float(np.mean(all_len)),
        "stdev": float(np.std(all_len)),
        "min": float(np.min(all_len)),
        "max": float(np.max(all_len)),
        "median": float(np.median(all_len)),
    }

    print(f"stats = \n{stats}")

    # save all results
    if args.out_dir:
        with open(os.path.join(args.out_dir, "lengths.txt"), "w") as fh:
            fh.writelines([f"{l}\n" for l in all_len])

        with open(os.path.join(args.out_dir, "stats.json"), "w") as fh:
            json.dump(stats, fh)

        plt.hist(all_len)
        plt.savefig(os.path.join(args.out_dir, "lengths_hist.pdf"))