{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token length: 726\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "KeyboardInterrupt  Traceback (most recent call last)", "Cell In[1], line 118, in <module>", "Cell In[1], line 57, in check_token_length", "transformers/tokenization_utils_base.py, in PreTrainedTokenizerBase.encode", "transformers/tokenization_utils_fast.py, in PreTrainedTokenizerFast._batch_encode_plus", "KeyboardInterrupt" ] } ],
"source": [ "import json\n", "from transformers import AutoTokenizer\n", "from transformers.utils import logging\n", "import torch\n", "model_name = \"/mnt/data/zifeng.cao/reasoning/arc-agi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct/pt_output_plus_step_output/checkpoint-274\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "\n", "system_prompt = '''You are a helpful assistant that can solve reasoning tasks by using a limited set of DSL functions that are implemented in Python. \n", "*** Task description ***\n", "- Each task consists of around a handful of training examples, where a training example consists of an input grid and an output grid. \n", "- For each training example, the output grid is the result of applying the same task-specific transformation to the input grid. 
\n", "- The goal is to infer the transformation from the few training examples.\n", "- The transformation is a task-specific grid transformation, which can be decomposed into a sequence of the DSL functions.\n", "*** DSL description ***\n", "- Types and Constants\n", " - **Types**: Define various data types like `Grid`, `Object`, `Indices`, and more to facilitate grid operations.\n", " - **Constants**: Include color constants (e.g., `ZERO`, `ONE`), boolean constants (`T`, `F`), and directional vectors (e.g., `UP`, `DOWN`).\n", "- Primitives\n", " - **Math Operations**: Functions like `add`, `subtract`, `multiply`, and `divide` perform basic arithmetic on integers or tuples.\n", " - **Logical Operations**: Functions such as `even`, `flip`, and `both` handle logical evaluations.\n", " - **Data Operations**: Functions like `identity`, `order`, `merge`, `difference`, and `dedupe` manage data containers.\n", "- Grid and Object Manipulation\n", " - **Grid Creation**: `canvas` creates grids with specified dimensions and values.\n", " - **Grid Transformation**: Functions like `rot90`, `hmirror`, `upscale`, and `downscale` transform grids in various ways.\n", " - **Subgrid Operations**: `crop`, `hsplit`, `vsplit`, and `trim` extract or modify parts of grids.\n", " - **Object and Patch Handling**: Functions like `objects`, `normalize`, `shift`, `toindices`, and `recolor` handle grid patches and objects.\n", "- Analysis and Filtering\n", " - **Color Analysis**: Functions such as `mostcolor`, `leastcolor`, `colorcount`, and `palette` analyze color distributions.\n", " - **Object Filtering**: `colorfilter` and `sizefilter` filter objects by color or size.\n", " - **Spatial Analysis**: Functions like `center`, `position`, `manhattan`, and `adjacent` analyze spatial relationships.\n", "- Connectivity and Bounding\n", " - **Connectivity**: `connect`, `neighbors`, `dneighbors`, and `ineighbors` determine connections between grid indices.\n", " - **Bounding**: Functions like `box`, `inbox`, `outbox`, and `corners` manage bounding areas of patches.\n", "- Utils\n", " - **Random Integer Generation**: `unifint` generates random integers within specified bounds and difficulty levels.\n", " - **Grid Validation**: `is_grid` checks if an input is a valid grid.\n", " - **Grid Formatting**: `format_grid` casts lists to the grid type.\n", "*** Format of the generated code ***\n", "- The only allowed operations are storing the result of a function call in a variable, where all arguments must either be the input grid, some constants such as integers or common vectors indicating directions, or a variable previously computed within the same solver, and each function that is being called must either be a DSL function or a variable previously constructed within the same solver. 
\n", "- This also means that each line of code is enforced to be a single function call.\n", "So, you are given a task and a set of examples, you need to generate a code that can solve the task.\n", "'''\n", "\n", "token_length = tokenizer.encode(system_prompt, return_tensors=\"pt\").shape[1]\n", "print(f\"Token length: {token_length}\")\n", "\n", "\n", "\n", "def check_token_length(query):\n", " # Suppress warnings\n", " logging.set_verbosity_error()\n", "\n", " # Load the tokenizer\n", "\n", " # Apply chat template and tokenize\n", " formatted_prompt = tokenizer.apply_chat_template(query, tokenize=False)\n", " tokens = tokenizer.encode(formatted_prompt, return_tensors=\"pt\")\n", "\n", " # Get token length\n", " token_length = tokens.shape[1]\n", "\n", " #print(f\"Token length: {token_length}\")\n", "\n", " # Check if it exceeds the model's context length (assuming 4096 for Llama-2)\n", "\n", " return token_length\n", "def list_of_lists_to_string_with_commas_and_newlines(list_of_lists):\n", " return '\\n'.join(','.join(str(item) for item in sublist) for sublist in list_of_lists)\n", "\n", "def transform_query(query):\n", " result_str = \"\"\n", " previous_result = \"\"\n", " for i, example in enumerate(query):\n", " try:\n", " r_i, c_i = len(example[\"input\"]), len(example[\"input\"][0])\n", " r_o, c_o = len(example[\"output\"]), len(example[\"output\"][0])\n", " except:\n", " print(example)\n", " return None\n", " # input_str = \"\\n\".join([\"|\".join(map(str, row)) for row in example[\"input\"]])\n", " # output_str = \"\\n\".join([\"|\".join(map(str, row)) for row in example[\"output\"]])\n", " input_str = list_of_lists_to_string_with_commas_and_newlines(example[\"input\"])\n", " output_str = list_of_lists_to_string_with_commas_and_newlines(example[\"output\"])\n", " result_str = previous_result + f\"** Example {i+1} ** \\n input: ({r_i} by {c_i}) Matrix \\n{input_str}\\n output: ({r_o} by {c_o}) Matrix \\n{output_str}\\n\\n\"\n", " previous_result = result_str\n", " if len(result_str) > 14000: #6000\n", " token_length = tokenizer.encode(previous_result, return_tensors=\"pt\").shape[1]\n", " #print(f\"previous Token length: {token_length}\")\n", " if token_length > 14000: #6000\n", " return None\n", " return previous_result\n", " token_length = tokenizer.encode(result_str, return_tensors=\"pt\").shape[1]\n", " #print(i, len(result_str))\n", " #print(f\"Token length: {token_length}\")\n", " \n", " return result_str\n", "\n", "import re\n", "f = open(\"multi_step_verifiers_training.txt\", \"r\")\n", "data_output = []\n", "token_len_list = []\n", "skip = 0\n", "total = 0\n", "for line in f:\n", " total += 1\n", " tmp = json.loads(line)\n", " new_dict = {\"messages\":[]}\n", " new_dict[\"messages\"].append({\"role\":\"system\", \"content\":system_prompt})\n", " tran = transform_query(tmp[\"example\"]) \n", " if tran == None:\n", " skip += 1\n", " continue\n", "\n", " new_dict[\"messages\"].append({\"role\":\"user\", \"content\":tran})\n", " tmp[\"verifier\"][0] =re.sub(r'veri.*?\\(', 'solver(', tmp[\"verifier\"][0])\n", " #tmp[\"verifier\"][0] = tmp[\"verifier\"][0].replace(\"veri.*\\(\", \"solver(\")\n", " new_dict[\"messages\"].append({\"role\":\"assistant\", \"content\":\"\\n\".join(tmp[\"verifier\"])})\n", " token_len = check_token_length(new_dict[\"messages\"])\n", " token_len_list.append(token_len)\n", " data_output.append(new_dict)\n", "print(data_output[0][\"messages\"][1][\"content\"])\n", "print(\"skip:\", skip)\n", "print(\"total:\", total)\n", "print(\"Token length max:\", 
max(token_len_list))\n", "print(\"Token length min:\", min(token_len_list))\n", "f.close()\n", "with open(\"multi_step_verifiers_training.json\", \"w\") as f:\n", " json.dump(data_output, f, indent=4)\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token length: 726\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", " 72%|███████▏ | 96270/134580 [14:57<05:57, 107.26it/s]\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "IndexError: pop from an empty deque (multiprocessing/pool.py, in IMapIterator.next)", "During handling of the above exception, another exception occurred:", "KeyboardInterrupt  Traceback (most recent call last)", "Cell In[2], line 136, in <module>", "Cell In[2], line 115, in main", "multiprocessing/pool.py, in IMapIterator.next", "threading.py, in Condition.wait", "KeyboardInterrupt" ] } ],
"source": [ "import json\n", "from transformers import AutoTokenizer\n", "from transformers.utils import logging\n", "import torch\n", "import re\n", "import multiprocessing as mp\n", "from tqdm import tqdm\n", "\n", "model_name = \"/mnt/data/zifeng.cao/reasoning/arc-agi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct/pt_output_plus_step_output/checkpoint-274\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "\n", "system_prompt = '''You are a helpful assistant that can solve reasoning tasks by using a limited set of DSL functions that are implemented in Python. \n", "*** Task description ***\n", "- Each task consists of around a handful of training examples, where a training example consists of an input grid and an output grid. \n", "- For each training example, the output grid is the result of applying the same task-specific transformation to the input grid. \n", "- The goal is to infer the transformation from the few training examples.\n", "- The transformation is a task-specific grid transformation, which can be decomposed into a sequence of the DSL functions.\n", "*** DSL description ***\n", "- Types and Constants\n", " - **Types**: Define various data types like `Grid`, `Object`, `Indices`, and more to facilitate grid operations.\n", " - **Constants**: Include color constants (e.g., `ZERO`, `ONE`), boolean constants (`T`, `F`), and directional vectors (e.g., `UP`, `DOWN`).\n", "- Primitives\n", " - **Math Operations**: Functions like `add`, `subtract`, `multiply`, and `divide` perform basic arithmetic on integers or tuples.\n", " - **Logical Operations**: Functions such as `even`, `flip`, and `both` handle logical evaluations.\n", " - **Data Operations**: Functions like `identity`, `order`, `merge`, `difference`, and `dedupe` manage data containers.\n", "- Grid and Object Manipulation\n", " - **Grid Creation**: `canvas` creates grids with specified dimensions and values.\n", " - **Grid Transformation**: Functions like `rot90`, `hmirror`, `upscale`, and `downscale` transform grids in various ways.\n", " - **Subgrid Operations**: `crop`, `hsplit`, `vsplit`, and `trim` extract or modify parts of grids.\n", " - **Object and Patch Handling**: Functions like `objects`, `normalize`, `shift`, `toindices`, and `recolor` handle grid patches and objects.\n", "- Analysis and Filtering\n", " - **Color Analysis**: Functions such as `mostcolor`, `leastcolor`, `colorcount`, and `palette` analyze color distributions.\n", " - **Object Filtering**: `colorfilter` and `sizefilter` filter objects by color or size.\n", " - **Spatial Analysis**: Functions like `center`, `position`, `manhattan`, and `adjacent` analyze spatial relationships.\n", "- Connectivity and Bounding\n", " - **Connectivity**: `connect`, `neighbors`, `dneighbors`, and `ineighbors` determine connections between grid indices.\n", " - **Bounding**: Functions like `box`, `inbox`, `outbox`, and `corners` manage bounding areas of patches.\n", "- Utils\n", " - **Random Integer Generation**: `unifint` generates random integers within specified bounds and difficulty levels.\n", " - **Grid Validation**: `is_grid` checks if an input is a valid grid.\n", " - **Grid Formatting**: `format_grid` casts lists to the grid type.\n", "*** Format of the generated code ***\n", "- The only allowed operations are storing the result of a function call in a variable, where all arguments must either be the input grid, some constants such as integers or common vectors 
indicating directions, or a variable previously computed within the same solver, and each function that is being called must either be a DSL function or a variable previously constructed within the same solver. \n", "- This also means that each line of code is enforced to be a single function call.\n", "So, you are given a task and a set of examples, you need to generate a code that can solve the task.\n", "'''\n", "\n", "token_length = tokenizer.encode(system_prompt, return_tensors=\"pt\").shape[1]\n", "print(f\"Token length: {token_length}\")\n", "\n", "def check_token_length(query):\n", " # Suppress warnings\n", " logging.set_verbosity_error()\n", "\n", " # Apply chat template and tokenize\n", " formatted_prompt = tokenizer.apply_chat_template(query, tokenize=False)\n", " tokens = tokenizer.encode(formatted_prompt, return_tensors=\"pt\")\n", "\n", " # Get token length\n", " token_length = tokens.shape[1]\n", "\n", " return token_length\n", "\n", "def list_of_lists_to_string_with_commas_and_newlines(list_of_lists):\n", " return '\\n'.join(','.join(str(item) for item in sublist) for sublist in list_of_lists)\n", "\n", "def transform_query(query):\n", " result_str = \"\"\n", " previous_result = \"\"\n", " for i, example in enumerate(query):\n", " try:\n", " r_i, c_i = len(example[\"input\"]), len(example[\"input\"][0])\n", " r_o, c_o = len(example[\"output\"]), len(example[\"output\"][0])\n", " except:\n", " print(example)\n", " return None\n", " input_str = list_of_lists_to_string_with_commas_and_newlines(example[\"input\"])\n", " output_str = list_of_lists_to_string_with_commas_and_newlines(example[\"output\"])\n", " result_str = previous_result + f\"** Example {i+1} ** \\n input: ({r_i} by {c_i}) Matrix \\n{input_str}\\n output: ({r_o} by {c_o}) Matrix \\n{output_str}\\n\\n\"\n", " previous_result = result_str\n", " if len(result_str) > 14000:\n", " token_length = tokenizer.encode(previous_result, return_tensors=\"pt\").shape[1]\n", " if token_length > 14000:\n", " return None\n", " return previous_result\n", " token_length = tokenizer.encode(result_str, return_tensors=\"pt\").shape[1]\n", " \n", " return result_str\n", "\n", "def process_line(line):\n", " tmp = json.loads(line)\n", " new_dict = {\"messages\":[]}\n", " new_dict[\"messages\"].append({\"role\":\"system\", \"content\":system_prompt})\n", " tran = transform_query(tmp[\"example\"]) \n", " if tran == None:\n", " return None\n", "\n", " new_dict[\"messages\"].append({\"role\":\"user\", \"content\":tran})\n", " tmp[\"verifier\"][0] = re.sub(r'veri.*?\\(', 'solver(', tmp[\"verifier\"][0])\n", " new_dict[\"messages\"].append({\"role\":\"assistant\", \"content\":\"\\n\".join(tmp[\"verifier\"])})\n", " token_len = check_token_length(new_dict[\"messages\"])\n", " return new_dict, token_len\n", "\n", "def main():\n", " with open(\"multi_step_verifiers_training.txt\", \"r\") as f:\n", " lines = f.readlines()\n", " \n", " total = len(lines)\n", " data_output = []\n", " token_len_list = []\n", " skip = 0\n", " \n", " # Create a pool of workers\n", " with mp.Pool(processes=mp.cpu_count()) as pool:\n", " # Process lines in parallel with progress bar\n", " results = list(tqdm(pool.imap(process_line, lines), total=total))\n", " \n", " # Collect results\n", " for result in results:\n", " if result is None:\n", " skip += 1\n", " continue\n", " new_dict, token_len = result\n", " data_output.append(new_dict)\n", " token_len_list.append(token_len)\n", " \n", " print(data_output[0][\"messages\"][1][\"content\"])\n", " print(\"skip:\", skip)\n", " 
print(\"total:\", total)\n", " print(\"Token length max:\", max(token_len_list))\n", " print(\"Token length min:\", min(token_len_list))\n", " \n", " with open(\"multi_step_verifiers_training.json\", \"w\") as f:\n", " json.dump(data_output, f, indent=4)\n", "\n", "if __name__ == '__main__':\n", " main()\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "def solver(I: Grid) -> Grid:\n" ] } ], "source": [ "import re\n", "\n", "def replace_verifier_with_solver(input_string):\n", " return re.sub(r'veri.*?\\(', 'solver(', input_string)\n", "\n", "original_string = 'def verifier_9f6b5f41(I: Grid) -> Grid:'\n", "modified_string = replace_verifier_with_solver(original_string)\n", "print(modified_string)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import json\n", "# merge two datasets\n", "with open(\"multi_step_verifiers_training.json\", \"r\") as f:\n", " data1 = json.load(f)\n", "with open(\"re_arc_v4.json\", \"r\") as f:\n", " data2 = json.load(f)\n", "data1.extend(data2)\n", "with open(\"multi_step_merged_arc_v4.json\", \"w\") as f:\n", " json.dump(data1, f, indent=4)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 2 }