"""Convert ToolACE conversation samples into instruction / tool-usage records.

Reads ``./datasets/toolace/data.json``, extracts one record per user turn that
is directly followed by an assistant tool-call turn, and writes the records to
``./datasets/toolace/output.json``.
"""

import json
import re

# Pulls (name, description) pairs out of the tool definitions embedded in the
# system prompt.  NOTE(review): this is a textual match, not a JSON parse, so a
# description containing an escaped quote is cut at that quote.
_FUNCTION_DEF_RE = re.compile(r'\{"name": "(.*?)", "description": "(.*?)"')

# An assistant turn counts as a tool call when it starts with ``[`` and has a
# closing ``]`` later on the same line.
_FUNCTION_CALL_RE = re.compile(r"\[.*\]")


def _normalize_tool_name(name):
    """Return *name* with spaces and slashes replaced by underscores."""
    return name.replace(" ", "_").replace("/", "_")


def _extract_called_tools(function_call):
    """Return the raw name of every top-level call in *function_call*.

    The expected shape is ``[Tool A(arg="x"), Tool B(arg="y")]``.  The text is
    scanned character by character while tracking parenthesis depth and string
    quotes, so commas, brackets and parentheses inside argument lists are never
    mistaken for call boundaries.  Names may contain spaces.
    """
    names = []
    name_chars = []  # characters of the candidate name currently being read
    in_list = False  # True between a top-level "[" and its "]"
    paren_depth = 0  # > 0 while inside a call's argument list
    quote = None  # the quote character of the string literal we are inside
    escaped = False  # True right after a backslash inside a string literal

    for ch in function_call:
        if quote is not None:
            # Inside a quoted argument value: only look for the closing quote.
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == quote:
                quote = None
            continue

        if paren_depth:
            # Inside an argument list: track nesting and string literals only.
            if ch in "\"'":
                quote = ch
            elif ch == "(":
                paren_depth += 1
            elif ch == ")":
                paren_depth -= 1
            continue

        # Top level, outside any argument list.
        if ch == "[":
            in_list = True
            name_chars = []
        elif ch == "]":
            in_list = False
            name_chars = []
        elif ch == ",":
            # Separator between two calls; start collecting the next name.
            name_chars = []
        elif ch == "(":
            name = "".join(name_chars).strip()
            if in_list and name:
                names.append(name)
            name_chars = []
            paren_depth = 1
        else:
            name_chars.append(ch)

    return names


def process_sample(sample: dict) -> list:
    """Build instruction records from one conversation sample.

    Args:
        sample: A dict with a ``"system"`` string (containing the tool
            definitions) and a ``"conversations"`` list of
            ``{"from": ..., "value": ...}`` turns.

    Returns:
        A list with one dict per user turn that is immediately followed by an
        assistant tool-call turn.  Each dict has the keys ``"instruction"``
        (the user text), ``"tools"`` (every tool found in the system prompt)
        and ``"used_tools"`` (normalized names of all tools called in the
        assistant turn, in call order).  Turns without any recognizable call
        are skipped.
    """
    functions = [
        {"name": _normalize_tool_name(name), "description": desc}
        for name, desc in _FUNCTION_DEF_RE.findall(sample["system"])
    ]

    conversations = sample["conversations"]
    results = []
    # Walk consecutive (turn, following turn) pairs.
    for entry, reply in zip(conversations, conversations[1:]):
        if entry["from"] != "user" or reply["from"] != "assistant":
            continue

        function_call = reply["value"]
        if not _FUNCTION_CALL_RE.match(function_call):
            continue  # plain-text answer, not a tool call

        used_tools = [
            _normalize_tool_name(tool)
            for tool in _extract_called_tools(function_call)
        ]
        if not used_tools:
            continue

        results.append(
            {
                "instruction": entry["value"],
                "tools": functions,
                "used_tools": used_tools,
            }
        )
    return results


def main():
    """Process every sample in the input file and write the combined records."""
    # Imported here so that importing this module for ``process_sample`` alone
    # does not require tqdm to be installed.
    from tqdm import tqdm

    with open("./datasets/toolace/data.json", encoding="utf-8") as f:
        data = json.load(f)

    results = []
    for sample in tqdm(data, desc="Processing ToolACE samples"):
        results.extend(process_sample(sample))

    with open("./datasets/toolace/output.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)


if __name__ == "__main__":
    main()