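fix: apply tokenizer fixes — add `"add_prefix_space": true` and switch `tokenizer_class` from `PreTrainedTokenizerFast` to `RobertaTokenizerFast` (taken from my ModernBERT repo: https://huggingface.co/stefan-it/ModernBERT-large-tokenizer-fix/blob/main/tokenizer_config.json)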

Files changed (1)

tokenizer_config.json (changed)

@@ -929,16 +929,17 @@
       "special": false
     }
   },
   "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",
   "model_max_length": 8192,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
-  "tokenizer_class": "PreTrainedTokenizerFast",
   "model_input_names": [
     "input_ids",
     "attention_mask"
   ],
   "unk_token": "[UNK]"
-}

       "special": false
     }
   },
+  "add_prefix_space": true,
   "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",
   "model_max_length": 8192,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
+  "tokenizer_class": "RobertaTokenizerFast",
   "model_input_names": [
     "input_ids",
     "attention_mask"
   ],
   "unk_token": "[UNK]"
+}