"""Dataset loading helpers that return train/test splits of SMILES strings and targets."""

import importlib.resources as pkg_resources
from pathlib import Path
from typing import Dict, Optional, Tuple

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import polyatomic_complexes

# CSVs shipped as package data:
# lowercase dataset key -> (resource package, file name, target column).
_PACKAGED_DATASETS: Dict[str, Tuple[str, str, str]] = {
    "esol": (
        "polyatomic_complexes.dataset.esol",
        "ESOL.csv",
        "measured log solubility in mols per litre",
    ),
    "freesolv": (
        "polyatomic_complexes.dataset.free_solv",
        "FreeSolv.csv",
        "expt",
    ),
    "lipophil": (
        "polyatomic_complexes.dataset.lipophilicity",
        "Lipophilicity.csv",
        "exp",
    ),
}

# CSVs located relative to the directory two levels above this file:
# lowercase dataset key -> (relative path, target column).
_BENCHMARK_DATASETS: Dict[str, Tuple[str, str]] = {
    "boilingpoint": ("benchmark_csv/boiling_point.csv", "boiling_point_K"),
    "qm9": ("benchmark_csv/qm9_subset.csv", "cv"),
    "ic50": ("benchmark_csv/ic_50_subset.csv", "pIC50"),
    "bindingdb": ("benchmark_csv/bindingdb.csv", "pIC50"),
}


def _read_dataset_frame(key: str) -> Tuple[pd.DataFrame, str]:
    """Read the CSV registered under *key* and return it with its target column name.

    *key* must already be lowercase and present in one of the registries.
    """
    if key in _PACKAGED_DATASETS:
        package, filename, target_col = _PACKAGED_DATASETS[key]
        resource = pkg_resources.files(package) / filename
        # as_file() yields a real filesystem path even when the package is not
        # stored as plain files (e.g. imported from a zip archive), whereas
        # str(resource) is only a usable path for unpacked installs.
        with pkg_resources.as_file(resource) as csv_path:
            return pd.read_csv(csv_path), target_col

    relative_path, target_col = _BENCHMARK_DATASETS[key]
    return pd.read_csv(Path(__file__).parent.parent / relative_path), target_col


def load_dataset(
    name: str,
    *,
    test_size: float = 0.2,
    random_state: Optional[int] = 42,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Load a named dataset and split it into train and test arrays.

    Rows with a missing value in either the ``"smiles"`` column or the
    dataset's target column are dropped before splitting.

    Args:
        name: Dataset identifier, matched case-insensitively. Supported
            values are ``"esol"``, ``"freesolv"``, ``"lipophil"``,
            ``"boilingpoint"``, ``"qm9"``, ``"ic50"`` and ``"bindingdb"``.
        test_size: Forwarded to ``train_test_split``; defaults to ``0.2``.
        random_state: Forwarded to ``train_test_split``; defaults to ``42``
            so repeated calls produce the same split.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays, where the
        ``X`` arrays come from the ``"smiles"`` column and the ``y`` arrays
        from the dataset's target column.

    Raises:
        TypeError: If *name* is not a string.
        ValueError: If *name* does not match a supported dataset.
    """
    if not isinstance(name, str):
        raise TypeError(
            f"Dataset name must be a string, got {type(name).__name__}"
        )

    key = name.lower()
    if key not in _PACKAGED_DATASETS and key not in _BENCHMARK_DATASETS:
        raise ValueError(f"Unknown dataset: {name}")

    df, target_col = _read_dataset_frame(key)
    df = df.dropna(subset=["smiles", target_col])

    X_train, X_test, y_train, y_test = train_test_split(
        df["smiles"],
        df[target_col],
        test_size=test_size,
        random_state=random_state,
    )
    return (
        X_train.to_numpy(),
        X_test.to_numpy(),
        y_train.to_numpy(),
        y_test.to_numpy(),
    )