import json
from tqdm import tqdm

DATASETS = ["apigen", "glaive", "toolace"]

# load each dataset's output and tag every sample with its source
data = []
for dataset_name in DATASETS:
    with open(f"./datasets/{dataset_name}/output.json") as f:
        subdata = json.load(f)
    subdata = [{**item, "source": dataset_name} for item in subdata]
    data.extend(subdata)

# filter out samples whose used functions are not present in their tool list
new_data = []
for sample in data:
    tools = [tool["name"] for tool in sample["tools"]]
    if any(used_tool not in tools for used_tool in sample["used_tools"]):
        continue
    new_data.append(sample)
data = new_data
print("Number of samples:", len(data))


def generate_dataset(data):
    """Build a global tool registry and map each sample's used tools to tool ids."""
    tools = {}
    dataset = []
    for sample in tqdm(data, desc="Processing samples"):
        for tool in sample["tools"]:
            if tool["name"] in tools:
                continue
            tools[tool["name"]] = {
                "name": tool["name"],
                "description": tool["description"],
                "id": len(tools),
                "source": sample["source"],
            }
        used_tools = []
        for tool_name in sample["used_tools"]:
            used_tools.append(tools[tool_name]["id"])
        new_sample = {
            "instruction": sample["instruction"],
            "tools": used_tools,
            "source": sample["source"],
        }
        dataset.append(new_sample)
    return {"tools": list(tools.values()), "samples": dataset}


from collections import defaultdict


def count(data):
    """Count how many samples use exactly k tools."""
    used_tools_count = defaultdict(int)
    for item in data:
        used_tools_count[len(item["used_tools"])] += 1
    return used_tools_count


print("Used tools count in dataset:")
for k, v in count(data).items():
    print(f"{k} used tools: {v} samples")

from random import shuffle, seed

# split the dataset based on the used-tools count:
# samples with one or two used tools get an 80/20 train/test split;
# samples with three or more used tools all go into the test set
one_two = [item for item in data if len(item["used_tools"]) in [1, 2]]
other = [item for item in data if len(item["used_tools"]) > 2]

seed(42)
shuffle(one_two)
seed(42)
shuffle(other)

train_samples = one_two[: int(len(one_two) * 0.8)]
test_samples = one_two[int(len(one_two) * 0.8) :] + other

print("Train samples count:", len(train_samples))
print("Test samples count:", len(test_samples))

# count(train_samples), count(test_samples)
print("Train samples count based on used tools count:")
for k, v in count(train_samples).items():
    print(f"{k} used tools: {v} samples")
print("Test samples count based on used tools count:")
for k, v in count(test_samples).items():
    print(f"{k} used tools: {v} samples")

train_dataset = generate_dataset(train_samples)
test_dataset = generate_dataset(test_samples)

seed(42)
shuffle(train_dataset["samples"])
seed(42)
shuffle(test_dataset["samples"])

print("Number of tools in train dataset:", len(train_dataset["tools"]))
print("Number of samples in train dataset:", len(train_dataset["samples"]))
print("Number of tools in test dataset:", len(test_dataset["tools"]))
print("Number of samples in test dataset:", len(test_dataset["samples"]))

import os

os.makedirs("./datasets/mixed", exist_ok=True)
with open("./datasets/mixed/train.json", "w") as f:
    json.dump(train_dataset, f, indent=2)
with open("./datasets/mixed/test.json", "w") as f:
    json.dump(test_dataset, f, indent=2)