In [8]:
import re
from typing import Dict, List, Optional
from mathruler.grader import extract_boxed_content, grade_answer


In [9]:
import json
from pathlib import Path
from typing import List, Dict, Union
from typing import Dict, List, Any
import re
from typing import List

def read_json(path: Union[str, Path]) -> List[Dict]:
 """
 Read a JSON file and return its contents as a list of dicts.

 Parameters
 ----------
 path : str or Path
 Path to a JSON file whose root is a JSON array.

 Returns
 -------
 List[Dict]
 Each element of the top-level JSON array, parsed into a Python dict.

 Raises
 ------
 ValueError
 If the JSON root is not a list.
 json.JSONDecodeError
 If the file is not valid JSON.
 """
 path = Path(path).expanduser()

 with path.open("r", encoding="utf-8") as f:
 data = json.load(f)

 if not isinstance(data, list):
 raise ValueError(f"{path} does not contain a JSON array at the top level.")

 # (Optional) sanity-check that every item is a dict
 if not all(isinstance(item, dict) for item in data):
 raise ValueError("Not every element in the JSON array is an object.")

 return data


def extract_description(predict: str) -> Optional[str]:
 """
 Extracts the content of the … block from `predict`.
 Returns the inner text (with leading/trailing whitespace stripped),
 or None if no tag is found.
 """
 match = re.search(r"([\s\S]*?)", predict, re.DOTALL)
 if not match:
 return None
 return match.group(1).strip()


def accuracy_reward(predict: str, ground_truth: str) -> float:
 answer = extract_boxed_content(predict)
 # answer = extract_answer(predict)
 return 1.0 if grade_answer(answer, ground_truth) else 0.0

In [10]:
def load_json_dir(root: str | Path, *, verbose: bool = True) -> Dict[str, List[Any]]:
 """
 Traverse *root* recursively and return {file_stem: parsed_json_data}.

 • Files that are empty or contain invalid JSON are skipped with a warning.
 Set verbose=False to silence the warnings.
 """
 root = Path(root).expanduser().resolve()
 out: Dict[str, List[Any]] = {}

 for path in root.rglob("*.json"):
 try:
 with path.open("r", encoding="utf-8") as f:
 data = json.load(f)
 out[path.stem] = data
 except json.JSONDecodeError as err:
 if verbose:
 print(f"[skip] {path} – invalid JSON ({err})")
 except Exception as err:
 if verbose:
 print(f"[skip] {path} – {err}")

 return out

In [4]:
# folder_dir = './gemini-flash'
folder_dir = './gemini-pro'
# folder_dir = './gemini-pro-pro'

In [5]:
datas = load_json_dir(folder_dir)

datas.keys()

dict_keys(['realWorldQA', 'clevr_count_70k', 'mmmu-pro', 'mathvision', 'mmstar', 'mmmu-pro-vision', 'mm-vet', 'mmmu_pro_10options', 'mathvista', 'visnumbench'])

In [6]:
indices = {}

for file, answers in datas.items():
 indices[file]=[]
 acc = 0
 for index, ele in enumerate(answers):
 solution = ele['solution']
 prediction = ele['predictions'][0]
 accuracy = accuracy_reward(prediction, solution)
 acc += accuracy
 
 if accuracy == 1:
 indices[file].append(index)
 
 print(f'{file}: {acc/len(answers)}')

realWorldQA: 0.6862745098039216
clevr_count_70k: 0.7108571428571429
mmmu-pro: 0.6105527638190955
mathvision: 0.36875
mmstar: 0.6633333333333333
mmmu-pro-vision: 0.5256410256410257
mm-vet: 0.3302752293577982
mmmu_pro_10options: 0.49243379571248425
mathvista: 0.554
visnumbench: 0.28835978835978837


In [7]:
len(datas['MLLM_rlvr_train'])

KeyError: 'MLLM_rlvr_train'

In [8]:
indices.keys()

dict_keys(['realWorldQA', 'MLLM_hotpot_train', 'mmmu-pro', 'mmstar', 'mm-vet', 'mathvista'])

In [None]:
realWorldQA: 0.6972477064220184
mmmu-pro: 0.5646606914212549
mmstar: 0.6061433447098976
mm-vet: 0.6018518518518519
mathvista: 0.5822401614530777

# Construct indices to merge datasets

In [8]:
description_folder_dir = './gpt_o1_outputs'
description_outputs = load_json_dir(description_folder_dir)

In [9]:
file = 'mathvision'
print(len(description_outputs[file]))
print(len(datas[file]))

891
2694


In [11]:
# idx = 1200
# print(description_outputs[file][idx])
# print('-'*10)
# print(datas[file][idx])

In [12]:
datas['MLLM_hotpot_train'][0]

{'index': 0,
 'problem': 'Are there any states with a number of reporters between 376-385? Options:\nA. No\nB. Yes',
 'solution': 'B',
 'predictions': [' The task is to determine if there are any states with a number of reporters between 376-385. The map shows states in two different colors corresponding to two ranges of reporters: 373-375 and 376-385. I need to identify states in the color representing the range 376-385.\n\nThe legend indicates two colors:\n- A lighter color for the range 373-375.\n- A darker color for the range 376-385.\n\nI will look at the map to identify states shaded with the darker color. The states shaded in the darker color are:\n- California\n- North Dakota\n- South Dakota\n- Iowa\n- Missouri\n- Illinois\n- Kentucky\n- West Virginia\n- New Jersey\n- Massachusetts\n\nThese states are not in the lighter color range, hence they must have a number of reporters between 376-385. Therefore, there are indeed states with a number of reporters in the range 376-385.\n\n

In [11]:
indices = {}

for file, answers in datas.items():
 # try:
 indices[file]=[]
 # description_data = description_outputs[file]
 acc = 0
 for i, ele in enumerate(answers):
 solution = ele['solution']
 prediction = ele['predictions'][0]
 datas_index = ele['index']
 
 # print(description)
 # break
 accuracy = accuracy_reward(prediction, solution)
 # acc += accuracy
 
 if accuracy == 1:
 # if description is not None:
 indices[file].append(datas_index)
 acc += accuracy
 
 print(f'{file}: {acc/len(answers)}; dataset len: {len(answers)}')
 # except Exception as e:
 # print(f"Exception caught: {e} for file: {file}")

MLLM_hotpot_train: 0.2949054259284827; dataset len: 14486
mathverse: 0.18071065989847715; dataset len: 3940


In [12]:
indices = {}
texts = {}
for file, answers in datas.items():
 try:
 indices[file]=[]
 texts[file] = []
 description_data = description_outputs[file]
 # ---------- 1) make a hash‑map: index -> description item ----------
 desc_by_idx = {item["index"]: item for item in description_data}
 
 acc = 0
 for i, ele in enumerate(answers):
 solution = ele['solution']
 prediction = ele['predictions'][0]
 data_idx = ele["index"] # the index in the answers item
 
 try:
 desc_item = desc_by_idx.get(data_idx)
 extracted_description = extract_description(desc_item['predictions'][0])
 except:
 extracted_description = None

 # print(description)
 # break
 accuracy = accuracy_reward(prediction, solution)
 # acc += accuracy 
 
 # print('data: ', description_data)
 # print('-'*10)
 # print('data1: ', ele)
 # break
 
 
 if accuracy == 1:
 if extracted_description is not None:
 indices[file].append(data_idx)
 curr_text = '\n' + extracted_description + '/n' + prediction
 texts[file].append(curr_text) 
 
 acc += accuracy
 
 print(f'{file}: {acc/len(answers)}; dataset len: {len(answers)}')
 except Exception as e:
 print(f"Exception caught: {e} for file: {file}")

Exception caught: name 'description_outputs' is not defined for file: MLLM_hotpot_train
Exception caught: name 'description_outputs' is not defined for file: mathverse


In [15]:
indices_by_dataset = indices

In [16]:
total = 0
for k, v in indices_by_dataset.items():
 print(f'K: {k}; V len: {len(v)}')
 total += len(v)
 
total

K: realWorldQA; V len: 514
K: MLLM_hotpot_train; V len: 0
K: mmmu-pro; V len: 389
K: mathvision; V len: 328
K: mmstar; V len: 512
K: mm-vet; V len: 65
K: mathvista; V len: 457


2265

### Add it for MLLM hotpot train

In [13]:
indices = {}

hotpot_description_folder_dir = './gemini-flash'
hotpot_description_outs = load_json_dir(hotpot_description_folder_dir)

for file, answers in hotpot_description_outs.items():
 try:
 print(len(answers))
 indices[file]=[]
 texts[file] = []
 acc = 0
 for i, ele in enumerate(answers):
 solution = ele['solution']
 prediction = ele['predictions'][0]
 datas_index = ele['index']
 
 # print(description)
 # break
 accuracy = accuracy_reward(prediction, solution)
 # acc += accuracy
 
 if accuracy == 1:
 indices[file].append(datas_index)
 texts[file].append(prediction)
 acc += accuracy
 
 print(f'{file}: {acc/len(answers)}; dataset len: {len(answers)}')
 except Exception as e:
 print(f"Exception caught: {e} for file: {file}")

len(indices['MLLM_hotpot_train'])

[skip] /apdcephfs_cq11/share_1603164/user/zongxia/workspace/C-gemini-answers/gemini-flash/clevr_count_70k.json – invalid JSON (Expecting value: line 1 column 1 (char 0))
14486
MLLM_hotpot_train: 0.2949054259284827; dataset len: 14486
3940
mathverse: 0.18071065989847715; dataset len: 3940


4272

In [14]:
idxs = [ele['index'] for ele in hotpot_description_outs['MLLM_hotpot_train']]


print("len(idxs) =", len(idxs), " min =", min(idxs), " max =", max(idxs))
# → len(idxs) == 6105, min == 0 (maybe), max == 6463

# 2) find every number that *should* be there but isn’t
expected = set(range(min(idxs), max(idxs) + 1)) # full consecutive range
missing = sorted(expected - set(idxs))

print("missing count :", len(missing))
print("first 20 gaps :", missing[:20])

len(idxs) = 14486 min = 0 max = 14485
missing count : 0
first 20 gaps : []


In [15]:
indices_by_dataset = indices

In [16]:
print(indices_by_dataset.keys())
print(texts.keys())

dict_keys(['MLLM_hotpot_train', 'mathverse'])
dict_keys(['MLLM_hotpot_train', 'mathverse'])


In [17]:
len(indices_by_dataset['MLLM_hotpot_train'])

4272

In [18]:
indices_by_dataset['MLLM_hotpot_train'][-10:]

[14471, 14473, 14474, 14476, 14477, 14478, 14480, 14481, 14484, 14485]

In [19]:
from datasets import load_dataset, concatenate_datasets

BASE_REPO = "zli12321/" # prefix for every dataset id
kept_splits = []

for short_name, keep in indices_by_dataset.items():
 try:
 if not keep: # nothing to keep → skip
 continue

 # -----------------------------------------------------------------
 # 1) ensure `keep` and its matching texts are sorted *together*
 # -----------------------------------------------------------------
 idxs = keep
 outs = texts[short_name]

 # idxs and outs were built in parallel, so they are aligned.
 # If you want the rows in ascending order, sort both lists together:
 order = sorted(range(len(idxs)), key=idxs.__getitem__)
 idxs = [idxs[i] for i in order] # sorted indices
 outs = [outs[i] for i in order] # matching outputs

 # -----------------------------------------------------------------
 # 2) load, slice, and keep only the three original columns
 # -----------------------------------------------------------------
 full_name = f"{BASE_REPO}{short_name}"
 
 print(f'filename: {full_name}')
 split = "train" if "MLLM_hotpot_train" in short_name else "test"

 ds = load_dataset(full_name, split=split, trust_remote_code=True)
 ds = ds.select(idxs) # keep only those rows
 
 # print(f'filename: {full_name}; len: {len(ds)}')

 cols_to_keep = {"problem", "images", "answer"}
 ds = ds.remove_columns([c for c in ds.column_names if c not in cols_to_keep])

 # -----------------------------------------------------------------
 # 3) add the new column
 # -----------------------------------------------------------------
 ds = ds.add_column("outputs", outs) # len(outs) == len(ds)

 kept_splits.append(ds)
 except Exception as e:
 print(f"dataset len: {len(ds)}")
 print(f'{short_name} Failed: {e}')

# ---------------------------------------------------------------------
# 4) concatenate everything into one big dataset
# ---------------------------------------------------------------------
combined = concatenate_datasets(kept_splits)

print(combined) # verify
# combined.save_to_disk("combined.arrow") # or .to_parquet(...)



 from .autonotebook import tqdm as notebook_tqdm


filename: zli12321/MLLM_hotpot_train


Flattening the indices: 100%|██████████| 4272/4272 [00:03<00:00, 1282.44 examples/s]


filename: zli12321/mathverse


Generating test split: 3940 examples [00:00, 13229.68 examples/s]
Flattening the indices: 100%|██████████| 712/712 [00:00<00:00, 48814.82 examples/s]

Dataset({
 features: ['problem', 'answer', 'images', 'outputs'],
 num_rows: 4984
})





In [29]:
combined.to_parquet("./hf_upload_train/train.parquet")

Creating parquet from Arrow format: 100%|██████████| 39/39 [00:17<00:00, 2.18ba/s]


909006342

In [20]:
def save_any_image(img_obj, out_base: Path) -> Path:
 """
 Save *img_obj* (str | dict | PIL.Image) to disk.
 Returns the *Path* actually written (possibly .png if alpha).
 """
 import io, shutil
 from PIL import Image

 # 1) resolve a PIL.Image ---------------------------------------------------
 if isinstance(img_obj, str): # already a path
 pil = Image.open(img_obj)

 elif isinstance(img_obj, dict): # HF Image feature
 if img_obj.get("path"):
 pil = Image.open(img_obj["path"])
 else:
 pil = Image.open(io.BytesIO(img_obj["bytes"]))

 else: # PIL.Image.Image
 pil = img_obj

 # 2) choose format & filename ---------------------------------------------
 suffix = ".jpg"
 img_mode = pil.mode

 if img_mode in ("RGBA", "LA", "P"):
 # keep alpha by switching to PNG (or call .convert("RGB") to stay JPEG)
 suffix = ".png"

 out_path = out_base.with_suffix(suffix)

 # 3) convert if you insist on JPG without alpha
 if suffix == ".jpg" and img_mode != "RGB":
 pil = pil.convert("RGB")

 # 4) write -----------------------------------------------------------------
 pil.save(out_path)
 return out_path


In [21]:
import os, io, json, shutil
from pathlib import Path
from PIL import Image
from tqdm import tqdm # optional progress bar

# ------------------------------------------------------------------ #
# directory setup
# ------------------------------------------------------------------ #
OUT_DIR = Path("sft_description")
OUT_DIR.mkdir(exist_ok=True) # creates folder if missing

json_records = []

# ------------------------------------------------------------------ #
# main loop
# ------------------------------------------------------------------ #
for idx, row in enumerate(tqdm(combined, desc="writing images")):
 img_path = save_any_image(row["images"], OUT_DIR / str(idx))
 json_records.append({
 "messages": [
 {"content": row["problem"], "role": "user"},
 {"content": row["outputs"], "role": "assistant"}
 ],
 "images": [str(img_path)]
 })

# ------------------------------------------------------------------ #
# write the JSONL / JSON
# ------------------------------------------------------------------ #
with open("sft_description.json", "w", encoding="utf-8") as f:
 json.dump(json_records, f, ensure_ascii=False, indent=2)

print(f"✅ Done: {len(json_records)} items saved.")

writing images: 100%|██████████| 4984/4984 [14:38<00:00, 5.67it/s]


✅ Done: 4984 items saved.


### Now process the data for Hotpot Train

In [1]:
import pandas as pd
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep
from typing import List, Dict, Any, Optional
from openai import OpenAI
from __future__ import annotations
import json
from pathlib import Path
from typing import Any, Dict, Iterable, List, Union

In [4]:
data[0]

{'index': 0,
 'problem': 'Are there any states with a number of reporters between 376-385? Options:\nA. No\nB. Yes',
 'solution': 'B',
 'predictions': ['The image is a map of the United States, with each state colored according to the number of reporters in that state. The title of the map is "The Number of reporters in the USA". There is a legend in the bottom right corner. States colored in a light beige color have between 373-375 reporters. States colored in a dark purple color have between 376-385 reporters. Several states are colored dark purple, including Washington, Montana, North Dakota, South Dakota, Iowa, Missouri, Louisiana, Utah, Nevada, California, Virginia, Maryland, and New Hampshire. Alaska and Hawaii are also shown. \nThe question asks if there are any states with a number of reporters between 376-385. The legend indicates that states with 376-385 reporters are colored dark purple. The map shows several states colored dark purple. Therefore, the answer is yes. \n\\boxe