yzimmermann committed
Commit 6bf5c82 · verified · 1 parent: 78043fd

Upload tokenizer

Files changed (3)
  1. special_tokens_map.json +7 -0
  2. tokenizer.json +147 -0
  3. tokenizer_config.json +54 -0
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+{
+  "bos_token": "^",
+  "eos_token": "&",
+  "mask_token": "[MASK]",
+  "pad_token": " ",
+  "unk_token": "?"
+}
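For context: the somewhat unusual choices here (a caret for BOS, an ampersand for EOS, a literal space for padding) are single characters that do not occur in ordinary SMILES strings, so they cannot collide with chemistry tokens. A minimal sketch of reading the map, assuming the file sits in the working directory:

```python
import json

# Assumes special_tokens_map.json from this commit is in the working directory.
with open("special_tokens_map.json") as f:
    special = json.load(f)

# "^" marks sequence start, "&" sequence end, " " (space) pads batches,
# and "?" stands in for any symbol missing from the vocabulary.
print(special["bos_token"], special["eos_token"], repr(special["pad_token"]))
```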
tokenizer.json ADDED
@@ -0,0 +1,147 @@
+{
+  "version": "1.0",
+  "truncation": {
+    "direction": "Left",
+    "max_length": 512,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "^",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "&",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": " ",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 3,
+      "content": "?",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 4,
+      "content": "[MASK]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Split",
+    "pattern": {
+      "Regex": "(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|-|\\+|\\\\|\\/|:|~|@|\\?|>>?|\\*|\\$|\\%[0-9]{2}|[0-9])"
+    },
+    "behavior": "Isolated",
+    "invert": false
+  },
+  "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "SpecialToken": {
+          "id": "^",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "&",
+          "type_id": 0
+        }
+      }
+    ],
+    "pair": [
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 1
+        }
+      }
+    ],
+    "special_tokens": {
+      "&": {
+        "id": "&",
+        "ids": [
+          1
+        ],
+        "tokens": [
+          "&"
+        ]
+      },
+      "^": {
+        "id": "^",
+        "ids": [
+          0
+        ],
+        "tokens": [
+          "^"
+        ]
+      }
+    }
+  },
+  "decoder": null,
+  "model": {
+    "type": "WordLevel",
+    "vocab": {
+      "^": 0,
+      "&": 1,
+      " ": 2,
+      "?": 3,
+      "[MASK]": 4,
+      "c": 5,
+      "C": 6,
+      "(": 7,
+      ")": 8,
+      "1": 9,
+      "2": 10,
+      "3": 11,
+      "=": 12,
+      "F": 13,
+      "N": 14,
+      "O": 15,
+      "o": 16
+    },
+    "unk_token": "?"
+  }
+}
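The pre_tokenizer regex here is the widely used atom-level SMILES pattern: it splits a string into bracket atoms, two-letter elements (Br, Cl), single-letter atoms, bonds, ring-closure digits, and punctuation, and the WordLevel model then looks each piece up in the 17-entry vocabulary (anything outside it falls back to the "?" unknown token). A sketch of exercising the file directly with the tokenizers library, assuming it sits in the working directory:

```python
from tokenizers import Tokenizer

# Load the uploaded file directly; the path is an assumption for this sketch.
tok = Tokenizer.from_file("tokenizer.json")

# The Split pre-tokenizer cuts the SMILES into atoms, bonds, and ring digits;
# TemplateProcessing then wraps the sequence in the "^" ... "&" markers.
enc = tok.encode("c1ccccc1O")  # phenol
print(enc.tokens)
# -> ['^', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'O', '&']
print(enc.ids)
# -> [0, 5, 9, 5, 5, 5, 5, 5, 9, 15, 1]
```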
tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "^",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "&",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "?",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "^",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "&",
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": " ",
+  "padding_side": "right",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "truncation_side": "left",
+  "unk_token": "?"
+}
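With all three files in place, the standard transformers entry point picks them up: model_max_length=512 with truncation_side="left" trims overlong inputs from the front, and padding_side="right" pads batches with the space token. A sketch, assuming the files live in a local directory (a hub repo id would work the same way):

```python
from transformers import PreTrainedTokenizerFast

# "./smiles-tokenizer" is a placeholder for wherever the three files live.
tok = PreTrainedTokenizerFast.from_pretrained("./smiles-tokenizer")

# Special tokens are added by the template, and the shorter SMILES is padded
# on the right with the pad token (id 2).
batch = tok(["CCO", "c1ccccc1"], padding=True)
print(batch["input_ids"])
# -> [[0, 6, 6, 15, 1, 2, 2, 2, 2, 2],
#     [0, 5, 9, 5, 5, 5, 5, 5, 9, 1]]
```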