{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token length: 726\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "KeyboardInterrupt  Traceback (most recent call last)", "Cell In[1], line 118, in <module>", "Cell In[1], line 57, in check_token_length", "transformers/tokenization_utils_base.py, in PreTrainedTokenizerBase.encode", "transformers/tokenization_utils_fast.py, in PreTrainedTokenizerFast._batch_encode_plus", "KeyboardInterrupt" ] } ],
"source": [ "import json\n", "from transformers import AutoTokenizer\n", "from transformers.utils import logging\n", "import torch\n", "model_name = \"/mnt/data/zifeng.cao/reasoning/arc-agi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct/pt_output_plus_step_output/checkpoint-274\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "\n", "system_prompt = '''You are a helpful assistant that can solve reasoning tasks by using a limited set of DSL functions that are implemented in Python. \n", "*** Task description ***\n", "- Each task consists of around a handful of training examples, where a training example consists of an input grid and an output grid. \n", "- For each training example, the output grid is the result of applying the same task-specific transformation to the input grid. 
\n", "- The goal is to infer the transformation from the few training examples.\n", "- The transformation is a task-specific grid transformation, which can be decomposed into a sequence of the DSL functions.\n", "*** DSL description ***\n", "- Types and Constants\n", " - **Types**: Define various data types like `Grid`, `Object`, `Indices`, and more to facilitate grid operations.\n", " - **Constants**: Include color constants (e.g., `ZERO`, `ONE`), boolean constants (`T`, `F`), and directional vectors (e.g., `UP`, `DOWN`).\n", "- Primitives\n", " - **Math Operations**: Functions like `add`, `subtract`, `multiply`, and `divide` perform basic arithmetic on integers or tuples.\n", " - **Logical Operations**: Functions such as `even`, `flip`, and `both` handle logical evaluations.\n", " - **Data Operations**: Functions like `identity`, `order`, `merge`, `difference`, and `dedupe` manage data containers.\n", "- Grid and Object Manipulation\n", " - **Grid Creation**: `canvas` creates grids with specified dimensions and values.\n", " - **Grid Transformation**: Functions like `rot90`, `hmirror`, `upscale`, and `downscale` transform grids in various ways.\n", " - **Subgrid Operations**: `crop`, `hsplit`, `vsplit`, and `trim` extract or modify parts of grids.\n", " - **Object and Patch Handling**: Functions like `objects`, `normalize`, `shift`, `toindices`, and `recolor` handle grid patches and objects.\n", "- Analysis and Filtering\n", " - **Color Analysis**: Functions such as `mostcolor`, `leastcolor`, `colorcount`, and `palette` analyze color distributions.\n", " - **Object Filtering**: `colorfilter` and `sizefilter` filter objects by color or size.\n", " - **Spatial Analysis**: Functions like `center`, `position`, `manhattan`, and `adjacent` analyze spatial relationships.\n", "- Connectivity and Bounding\n", " - **Connectivity**: `connect`, `neighbors`, `dneighbors`, and `ineighbors` determine connections between grid indices.\n", " - **Bounding**: Functions like `box`, `inbox`, `outbox`, and `corners` manage bounding areas of patches.\n", "- Utils\n", " - **Random Integer Generation**: `unifint` generates random integers within specified bounds and difficulty levels.\n", " - **Grid Validation**: `is_grid` checks if an input is a valid grid.\n", " - **Grid Formatting**: `format_grid` casts lists to the grid type.\n", "*** Format of the generated code ***\n", "- The only allowed operations are storing the result of a function call in a variable, where all arguments must either be the input grid, some constants such as integers or common vectors indicating directions, or a variable previously computed within the same solver, and each function that is being called must either be a DSL function or a variable previously constructed within the same solver. 
\n", "- This also means that each line of code is enforced to be a single function call.\n", "So, you are given a task and a set of examples, you need to generate a code that can solve the task.\n", "'''\n", "\n", "token_length = tokenizer.encode(system_prompt, return_tensors=\"pt\").shape[1]\n", "print(f\"Token length: {token_length}\")\n", "\n", "\n", "\n", "def check_token_length(query):\n", " # Suppress warnings\n", " logging.set_verbosity_error()\n", "\n", " # Load the tokenizer\n", "\n", " # Apply chat template and tokenize\n", " formatted_prompt = tokenizer.apply_chat_template(query, tokenize=False)\n", " tokens = tokenizer.encode(formatted_prompt, return_tensors=\"pt\")\n", "\n", " # Get token length\n", " token_length = tokens.shape[1]\n", "\n", " #print(f\"Token length: {token_length}\")\n", "\n", " # Check if it exceeds the model's context length (assuming 4096 for Llama-2)\n", "\n", " return token_length\n", "def list_of_lists_to_string_with_commas_and_newlines(list_of_lists):\n", " return '\\n'.join(','.join(str(item) for item in sublist) for sublist in list_of_lists)\n", "\n", "def transform_query(query):\n", " result_str = \"\"\n", " previous_result = \"\"\n", " for i, example in enumerate(query):\n", " try:\n", " r_i, c_i = len(example[\"input\"]), len(example[\"input\"][0])\n", " r_o, c_o = len(example[\"output\"]), len(example[\"output\"][0])\n", " except:\n", " print(example)\n", " return None\n", " # input_str = \"\\n\".join([\"|\".join(map(str, row)) for row in example[\"input\"]])\n", " # output_str = \"\\n\".join([\"|\".join(map(str, row)) for row in example[\"output\"]])\n", " input_str = list_of_lists_to_string_with_commas_and_newlines(example[\"input\"])\n", " output_str = list_of_lists_to_string_with_commas_and_newlines(example[\"output\"])\n", " result_str = previous_result + f\"** Example {i+1} ** \\n input: ({r_i} by {c_i}) Matrix \\n{input_str}\\n output: ({r_o} by {c_o}) Matrix \\n{output_str}\\n\\n\"\n", " previous_result = result_str\n", " if len(result_str) > 14000: #6000\n", " token_length = tokenizer.encode(previous_result, return_tensors=\"pt\").shape[1]\n", " #print(f\"previous Token length: {token_length}\")\n", " if token_length > 14000: #6000\n", " return None\n", " return previous_result\n", " token_length = tokenizer.encode(result_str, return_tensors=\"pt\").shape[1]\n", " #print(i, len(result_str))\n", " #print(f\"Token length: {token_length}\")\n", " \n", " return result_str\n", "\n", "import re\n", "f = open(\"multi_step_verifiers_training.txt\", \"r\")\n", "data_output = []\n", "token_len_list = []\n", "skip = 0\n", "total = 0\n", "for line in f:\n", " total += 1\n", " tmp = json.loads(line)\n", " new_dict = {\"messages\":[]}\n", " new_dict[\"messages\"].append({\"role\":\"system\", \"content\":system_prompt})\n", " tran = transform_query(tmp[\"example\"]) \n", " if tran == None:\n", " skip += 1\n", " continue\n", "\n", " new_dict[\"messages\"].append({\"role\":\"user\", \"content\":tran})\n", " tmp[\"verifier\"][0] =re.sub(r'veri.*?\\(', 'solver(', tmp[\"verifier\"][0])\n", " #tmp[\"verifier\"][0] = tmp[\"verifier\"][0].replace(\"veri.*\\(\", \"solver(\")\n", " new_dict[\"messages\"].append({\"role\":\"assistant\", \"content\":\"\\n\".join(tmp[\"verifier\"])})\n", " token_len = check_token_length(new_dict[\"messages\"])\n", " token_len_list.append(token_len)\n", " data_output.append(new_dict)\n", "print(data_output[0][\"messages\"][1][\"content\"])\n", "print(\"skip:\", skip)\n", "print(\"total:\", total)\n", "print(\"Token length max:\", 
max(token_len_list))\n", "print(\"Token length min:\", min(token_len_list))\n", "f.close()\n", "with open(\"multi_step_verifiers_training.json\", \"w\") as f:\n", " json.dump(data_output, f, indent=4)\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token length: 726\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", " 72%|███████▏ | 96270/134580 [14:57<05:57, 107.26it/s]\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "IndexError: pop from an empty deque (multiprocessing/pool.py, in IMapIterator.next)", "During handling of the above exception, another exception occurred:", "KeyboardInterrupt  Traceback (most recent call last)", "Cell In[2], line 136, in <module>", "Cell In[2], line 115, in main", "multiprocessing/pool.py, in IMapIterator.next", "threading.py, in Condition.wait", "KeyboardInterrupt" ] } ],
"source": [ "import json\n", "from transformers import AutoTokenizer\n", "from transformers.utils import logging\n", "import torch\n", "import re\n", "import multiprocessing as mp\n", "from tqdm import tqdm\n", "\n", "model_name = \"/mnt/data/zifeng.cao/reasoning/arc-agi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct/pt_output_plus_step_output/checkpoint-274\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "\n", "system_prompt = '''You are a helpful assistant that can solve reasoning tasks by using a limited set of DSL functions that are implemented in Python. \n", "*** Task description ***\n", "- Each task consists of around a handful of training examples, where a training example consists of an input grid and an output grid. \n", "- For each training example, the output grid is the result of applying the same task-specific transformation to the input grid. \n", "- The goal is to infer the transformation from the few training examples.\n", "- The transformation is a task-specific grid transformation, which can be decomposed into a sequence of the DSL functions.\n", "*** DSL description ***\n", "- Types and Constants\n", " - **Types**: Define various data types like `Grid`, `Object`, `Indices`, and more to facilitate grid operations.\n", " - **Constants**: Include color constants (e.g., `ZERO`, `ONE`), boolean constants (`T`, `F`), and directional vectors (e.g., `UP`, `DOWN`).\n", "- Primitives\n", " - **Math Operations**: Functions like `add`, `subtract`, `multiply`, and `divide` perform basic arithmetic on integers or tuples.\n", " - **Logical Operations**: Functions such as `even`, `flip`, and `both` handle logical evaluations.\n", " - **Data Operations**: Functions like `identity`, `order`, `merge`, `difference`, and `dedupe` manage data containers.\n", "- Grid and Object Manipulation\n", " - **Grid Creation**: `canvas` creates grids with specified dimensions and values.\n", " - **Grid Transformation**: Functions like `rot90`, `hmirror`, `upscale`, and `downscale` transform grids in various ways.\n", " - **Subgrid Operations**: `crop`, `hsplit`, `vsplit`, and `trim` extract or modify parts of grids.\n", " - **Object and Patch Handling**: Functions like `objects`, `normalize`, `shift`, `toindices`, and `recolor` handle grid patches and objects.\n", "- Analysis and Filtering\n", " - **Color Analysis**: Functions such as `mostcolor`, `leastcolor`, `colorcount`, and `palette` analyze color distributions.\n", " - **Object Filtering**: `colorfilter` and `sizefilter` filter objects by color or size.\n", " - **Spatial Analysis**: Functions like `center`, `position`, `manhattan`, and `adjacent` analyze spatial relationships.\n", "- Connectivity and Bounding\n", " - **Connectivity**: `connect`, `neighbors`, `dneighbors`, and `ineighbors` determine connections between grid indices.\n", " - **Bounding**: Functions like `box`, `inbox`, `outbox`, and `corners` manage bounding areas of patches.\n", "- Utils\n", " - **Random Integer Generation**: `unifint` generates random integers within specified bounds and difficulty levels.\n", " - **Grid Validation**: `is_grid` checks if an input is a valid grid.\n", " - **Grid Formatting**: `format_grid` casts lists to the grid type.\n", "*** Format of the generated code ***\n", "- The only allowed operations are storing the result of a function call in a variable, where all arguments must either be the input grid, some constants such as integers or common vectors 
indicating directions, or a variable previously computed within the same solver, and each function that is being called must either be a DSL function or a variable previously constructed within the same solver. \n", "- This also means that each line of code is enforced to be a single function call.\n", "So, you are given a task and a set of examples, you need to generate a code that can solve the task.\n", "'''\n", "\n", "token_length = tokenizer.encode(system_prompt, return_tensors=\"pt\").shape[1]\n", "print(f\"Token length: {token_length}\")\n", "\n", "def check_token_length(query):\n", " # Suppress warnings\n", " logging.set_verbosity_error()\n", "\n", " # Apply chat template and tokenize\n", " formatted_prompt = tokenizer.apply_chat_template(query, tokenize=False)\n", " tokens = tokenizer.encode(formatted_prompt, return_tensors=\"pt\")\n", "\n", " # Get token length\n", " token_length = tokens.shape[1]\n", "\n", " return token_length\n", "\n", "def list_of_lists_to_string_with_commas_and_newlines(list_of_lists):\n", " return '\\n'.join(','.join(str(item) for item in sublist) for sublist in list_of_lists)\n", "\n", "def transform_query(query):\n", " result_str = \"\"\n", " previous_result = \"\"\n", " for i, example in enumerate(query):\n", " try:\n", " r_i, c_i = len(example[\"input\"]), len(example[\"input\"][0])\n", " r_o, c_o = len(example[\"output\"]), len(example[\"output\"][0])\n", " except:\n", " print(example)\n", " return None\n", " input_str = list_of_lists_to_string_with_commas_and_newlines(example[\"input\"])\n", " output_str = list_of_lists_to_string_with_commas_and_newlines(example[\"output\"])\n", " result_str = previous_result + f\"** Example {i+1} ** \\n input: ({r_i} by {c_i}) Matrix \\n{input_str}\\n output: ({r_o} by {c_o}) Matrix \\n{output_str}\\n\\n\"\n", " previous_result = result_str\n", " if len(result_str) > 14000:\n", " token_length = tokenizer.encode(previous_result, return_tensors=\"pt\").shape[1]\n", " if token_length > 14000:\n", " return None\n", " return previous_result\n", " token_length = tokenizer.encode(result_str, return_tensors=\"pt\").shape[1]\n", " \n", " return result_str\n", "\n", "def process_line(line):\n", " tmp = json.loads(line)\n", " new_dict = {\"messages\":[]}\n", " new_dict[\"messages\"].append({\"role\":\"system\", \"content\":system_prompt})\n", " tran = transform_query(tmp[\"example\"]) \n", " if tran == None:\n", " return None\n", "\n", " new_dict[\"messages\"].append({\"role\":\"user\", \"content\":tran})\n", " tmp[\"verifier\"][0] = re.sub(r'veri.*?\\(', 'solver(', tmp[\"verifier\"][0])\n", " new_dict[\"messages\"].append({\"role\":\"assistant\", \"content\":\"\\n\".join(tmp[\"verifier\"])})\n", " token_len = check_token_length(new_dict[\"messages\"])\n", " return new_dict, token_len\n", "\n", "def main():\n", " with open(\"multi_step_verifiers_training.txt\", \"r\") as f:\n", " lines = f.readlines()\n", " \n", " total = len(lines)\n", " data_output = []\n", " token_len_list = []\n", " skip = 0\n", " \n", " # Create a pool of workers\n", " with mp.Pool(processes=mp.cpu_count()) as pool:\n", " # Process lines in parallel with progress bar\n", " results = list(tqdm(pool.imap(process_line, lines), total=total))\n", " \n", " # Collect results\n", " for result in results:\n", " if result is None:\n", " skip += 1\n", " continue\n", " new_dict, token_len = result\n", " data_output.append(new_dict)\n", " token_len_list.append(token_len)\n", " \n", " print(data_output[0][\"messages\"][1][\"content\"])\n", " print(\"skip:\", skip)\n", " 
print(\"total:\", total)\n", " print(\"Token length max:\", max(token_len_list))\n", " print(\"Token length min:\", min(token_len_list))\n", " \n", " with open(\"multi_step_verifiers_training.json\", \"w\") as f:\n", " json.dump(data_output, f, indent=4)\n", "\n", "if __name__ == '__main__':\n", " main()\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "def solver(I: Grid) -> Grid:\n" ] } ], "source": [ "import re\n", "\n", "def replace_verifier_with_solver(input_string):\n", " return re.sub(r'veri.*?\\(', 'solver(', input_string)\n", "\n", "original_string = 'def verifier_9f6b5f41(I: Grid) -> Grid:'\n", "modified_string = replace_verifier_with_solver(original_string)\n", "print(modified_string)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import json\n", "# merge two datasets\n", "with open(\"multi_step_verifiers_training.json\", \"r\") as f:\n", " data1 = json.load(f)\n", "with open(\"re_arc_v4.json\", \"r\") as f:\n", " data2 = json.load(f)\n", "data1.extend(data2)\n", "with open(\"multi_step_merged_arc_v4.json\", \"w\") as f:\n", " json.dump(data1, f, indent=4)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 2 }