"""Convert SMILES/SELFIES strings into graph objects and fingerprint vectors."""

import numpy as np
import selfies as sf
import torch
from rdkit import Chem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

# Width of the folded Morgan fingerprint. Kept as a single module constant so
# that the generator and every zero-vector fallback agree on the size without
# having to query the generator object at runtime.
_FP_SIZE = 2048

# Number of per-atom features emitted by ``mol_to_graph``.
_NUM_ATOM_FEATURES = 4

mfpgen = GetMorganGenerator(
    radius=2,
    countSimulation=False,
    includeChirality=False,
    useBondTypes=True,
    onlyNonzeroInvariants=False,
    includeRingMembership=True,
    countBounds=None,
    fpSize=_FP_SIZE,
    atomInvariantsGenerator=None,
    bondInvariantsGenerator=None,
    includeRedundantEnvironments=False,
)


def smiles_to_graph(smiles):
    """Parse a SMILES string and return its molecular graph.

    Returns ``None`` when RDKit cannot parse the string.
    """
    mol = Chem.MolFromSmiles(smiles)  # type: ignore
    if mol is None:
        return None
    return mol_to_graph(mol)


def selfies_to_graph(smiles_string):
    """Build a molecular graph after round-tripping SMILES through SELFIES.

    The input is a SMILES string. It is encoded to SELFIES, decoded back to
    SMILES and then converted to a graph. If any step of that round trip
    fails, the function falls back to converting the original SMILES string
    directly.

    Returns ``None`` only when the fallback cannot parse the input either.
    """
    try:
        selfies_string = sf.encoder(smiles_string)
        smiles = sf.decoder(selfies_string)
        mol = Chem.MolFromSmiles(smiles)  # type: ignore
        if mol is None:
            raise ValueError("Decoded SELFIES is invalid")
        return mol_to_graph(mol)
    except Exception:
        # Deliberate best-effort: any failure in the SELFIES round trip is
        # handled by using the plain SMILES path instead.
        return smiles_to_graph(smiles_string)


def ecfp_to_graph(smiles_str: str, max_bits: int = _FP_SIZE, k: int = 2) -> Data | None:
    """Turn the active Morgan-fingerprint bits of a molecule into a graph.

    Each set bit of the fingerprint becomes one node whose feature vector is
    a one-hot encoding of the bit position. Nodes are ordered by bit position
    and every node is connected (in both directions) to the next ``k`` nodes
    in that ordering.

    Args:
        smiles_str: SMILES string of the molecule.
        max_bits: Width of the one-hot node features. Must be larger than
            every active bit index, i.e. at least the fingerprint size.
        k: Number of following nodes each node is linked to.

    Returns:
        A ``Data`` object with ``x`` of shape ``(n, max_bits)`` and
        ``edge_index`` of shape ``(2, num_edges)``, or ``None`` if the SMILES
        cannot be parsed or the fingerprint has no set bits.
    """
    mol = Chem.MolFromSmiles(smiles_str)  # type: ignore
    if mol is None:
        return None

    fingerprint = mfpgen.GetFingerprintAsNumPy(mol)
    active_bits = np.nonzero(fingerprint)[0]
    n = len(active_bits)
    if n == 0:
        return None

    pairs = []
    for i in range(n):
        for j in range(i + 1, min(i + 1 + k, n)):
            pairs.extend(([i, j], [j, i]))

    # reshape(-1, 2) keeps the result at shape (2, 0) when there are no edges
    # (e.g. a single active bit); a bare .t() on an empty tensor would leave a
    # 1-D tensor of shape (0,).
    edge_index = (
        torch.tensor(pairs, dtype=torch.long).reshape(-1, 2).t().contiguous()
    )

    x = torch.zeros((n, max_bits), dtype=torch.float)
    x[torch.arange(n), torch.as_tensor(active_bits, dtype=torch.long)] = 1.0

    return Data(x=x, edge_index=edge_index)


def mol_to_graph(mol):
    """Convert an RDKit molecule into a ``Data`` graph.

    Node features (one row per atom, float): atomic number, degree, formal
    charge and atom index. Every bond contributes two directed edges, each
    carrying the bond type as a single float edge attribute.

    Returns:
        ``Data`` with ``x`` of shape ``(num_atoms, 4)``, ``edge_index`` of
        shape ``(2, 2 * num_bonds)`` and ``edge_attr`` of shape
        ``(2 * num_bonds, 1)``. The shapes stay two-dimensional even when the
        molecule has no bonds or no atoms.
    """
    atom_feats = [
        [
            atom.GetAtomicNum(),
            atom.GetDegree(),
            atom.GetFormalCharge(),
            atom.GetIdx(),
        ]
        for atom in mol.GetAtoms()
    ]
    # Explicit reshapes below keep the tensors 2-D for empty inputs, where
    # torch.tensor([]) would otherwise produce shape (0,).
    x = torch.tensor(atom_feats, dtype=torch.float).reshape(-1, _NUM_ATOM_FEATURES)

    edges = []
    edge_feats = []
    for bond in mol.GetBonds():
        begin = bond.GetBeginAtomIdx()
        end = bond.GetEndAtomIdx()
        btype = bond.GetBondTypeAsDouble()
        # Store both directions so the graph is effectively undirected.
        edges.extend(((begin, end), (end, begin)))
        edge_feats.extend(([btype], [btype]))

    edge_index = (
        torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()
    )
    edge_attr = torch.tensor(edge_feats, dtype=torch.float).reshape(-1, 1)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)


def smiles_for_gp(smiles: str) -> np.ndarray:
    """Return the Morgan fingerprint of a SMILES string as a float32 vector.

    An unparsable SMILES string yields an all-zero vector of the fingerprint
    size.
    """
    mol = Chem.MolFromSmiles(smiles)  # type: ignore
    if mol is None:
        return np.zeros(_FP_SIZE, dtype=np.float32)
    return mfpgen.GetFingerprintAsNumPy(mol).astype(np.float32)


def selfies_for_gp(selfies_str, radius=2, n_bits=2048):
    """Return the Morgan fingerprint of a SELFIES string as a float32 vector.

    The SELFIES string is decoded to SMILES and fingerprinted with the
    module-level generator.

    Args:
        selfies_str: SELFIES string to decode.
        radius: Unused; retained so existing callers keep working. The
            fingerprint radius is fixed by the module-level generator.
        n_bits: Length of the all-zero vector returned when decoding or
            fingerprinting fails.

    Returns:
        A float32 fingerprint vector, or a float32 zero vector of length
        ``n_bits`` on failure.
    """
    try:
        smiles = sf.decoder(selfies_str)
        if not isinstance(smiles, str):
            raise TypeError("SELFIES decoder did not return a string")
        return smiles_for_gp(smiles)
    except Exception:
        # Best-effort: a malformed SELFIES string maps to the zero vector.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return np.zeros(n_bits, dtype=np.float32)


def ecfp_for_gp(smiles_str: str) -> np.ndarray:
    """Return the ECFP (Morgan) fingerprint of a SMILES string as float32.

    Same contract as ``smiles_for_gp``: an unparsable SMILES string yields an
    all-zero vector of the fingerprint size.
    """
    return smiles_for_gp(smiles_str)


def graph_native_loader(graph_list, batch_size=32, shuffle=True):
    """Wrap a list of graphs in a ``DataLoader``.

    Args:
        graph_list: Graph objects to batch.
        batch_size: Number of graphs per batch.
        shuffle: Passed through to the loader's ``shuffle`` option.
    """
    return DataLoader(graph_list, batch_size=batch_size, shuffle=shuffle)