stefan-it committed on
Commit
82d97b3
·
verified ·
1 Parent(s): 7a55cb0

fix: apply tokenizer fixes (taken from my ModernBERT repo: https://huggingface.co/stefan-it/ModernBERT-large-tokenizer-fix/blob/main/tokenizer_config.json)

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +3 -2
tokenizer_config.json CHANGED
@@ -929,16 +929,17 @@
929
  "special": false
930
  }
931
  },
 
932
  "clean_up_tokenization_spaces": true,
933
  "cls_token": "[CLS]",
934
  "mask_token": "[MASK]",
935
  "model_max_length": 8192,
936
  "pad_token": "[PAD]",
937
  "sep_token": "[SEP]",
938
- "tokenizer_class": "PreTrainedTokenizerFast",
939
  "model_input_names": [
940
  "input_ids",
941
  "attention_mask"
942
  ],
943
  "unk_token": "[UNK]"
944
- }
 
929
  "special": false
930
  }
931
  },
932
+ "add_prefix_space": true,
933
  "clean_up_tokenization_spaces": true,
934
  "cls_token": "[CLS]",
935
  "mask_token": "[MASK]",
936
  "model_max_length": 8192,
937
  "pad_token": "[PAD]",
938
  "sep_token": "[SEP]",
939
+ "tokenizer_class": "RobertaTokenizerFast",
940
  "model_input_names": [
941
  "input_ids",
942
  "attention_mask"
943
  ],
944
  "unk_token": "[UNK]"
945
+ }