import re import json with open("./datasets/glaive/glaive-function-calling-v2.json", "r") as f: data = json.load(f) def process_sample(sample): system_text = sample.get("system", "") chat_text = sample.get("chat", "") # Extract tools tools_match = re.findall( r'"name": "(.*?)",.*?"description": "(.*?)"', system_text, re.S ) if not tools_match: return None # Skip samples without tools tools = [{"name": name, "description": desc} for name, desc in tools_match] # Extract function calls and corresponding instructions function_calls = re.findall(r' {"name": "(.*?)"', chat_text) if not function_calls: return None # Skip samples without function calls # Extract ASSISTANT responses and filter for valid instructions assistant_responses = re.findall( r"ASSISTANT: (.*?)<\|endoftext\|>", chat_text, re.S ) instructions = [] for call, response in zip(function_calls, assistant_responses): # Find the user prompt just before this ASSISTANT response user_prompt_match = re.search( rf'USER: (.*?)\n.*? {{.*?"{call}".*?}}', chat_text, re.S ) if user_prompt_match: instructions.append( { "instruction": user_prompt_match.group(1).strip(), "tools": tools, "used_tools": [call], # Single tool for this instruction } ) return instructions from tqdm import tqdm results = [] for sample in tqdm(data, desc="Processing GLAIVE samples"): processed = process_sample(sample) if processed: results.extend(processed) with open("./datasets/glaive/output.json", "w") as f: json.dump(results, f, indent=2)