yujiepan committed
Commit a3cb486 · verified · 1 Parent(s): 345fe9d

Upload folder using huggingface_hub

README.md ADDED
---
library_name: transformers
pipeline_tag: text-generation
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
base_model:
- tencent/Hunyuan-7B-Instruct
---

This tiny model is for debugging. It is randomly initialized with a config adapted from [tencent/Hunyuan-7B-Instruct](https://huggingface.co/tencent/Hunyuan-7B-Instruct).

### Example usage:

```python
import torch
from transformers import pipeline

model_id = "yujiepan/hunyuan-dense-v1-tiny-random"
messages = [
    {
        "role": "user",
        "content": "hi",
    }
]
pipe = pipeline(
    'text-generation',
    model_id,
    device='cuda',
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
print(pipe(messages, max_new_tokens=32))
```
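
For lower-level control, the same model can also be driven without `pipeline`. A minimal sketch using `apply_chat_template` and `generate()` directly (illustrative only; assumes a CUDA device as above):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "yujiepan/hunyuan-dense-v1-tiny-random"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map='cuda', trust_remote_code=True
)

messages = [{"role": "user", "content": "hi"}]
# chat_template.jinja ships with the repo, so apply_chat_template renders the prompt
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors='pt'
).to(model.device)
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```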

### Code to create this repo:

```python
import json

import torch
from huggingface_hub import file_exists, hf_hub_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    GenerationConfig,
    set_seed,
)

source_model_id = "tencent/Hunyuan-7B-Instruct"
save_folder = "/tmp/yujiepan/hunyuan-dense-v1-tiny-random"

processor = AutoProcessor.from_pretrained(source_model_id, trust_remote_code=True)
processor.save_pretrained(save_folder)

# Shrink the source config down to a tiny architecture
with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json = json.load(f)
config_json['hidden_size'] = 16
config_json['head_dim'] = 32
config_json['intermediate_size'] = 64
config_json['num_attention_heads'] = 2
config_json['num_hidden_layers'] = 2
config_json['num_key_value_heads'] = 1
config_json['tie_word_embeddings'] = True
with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)

config = AutoConfig.from_pretrained(
    save_folder,
    trust_remote_code=True,
)
print(config)
torch.set_default_dtype(torch.bfloat16)
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
torch.set_default_dtype(torch.float32)
if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
    model.generation_config = GenerationConfig.from_pretrained(
        source_model_id, trust_remote_code=True,
    )
set_seed(42)
model = model.cpu()  # CPU is more stable for random initialization across machines
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.1)
        print(name, p.shape)
model.save_pretrained(save_folder)
print(model)
```
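
As a sanity check, the saved checkpoint can be reloaded and its size inspected. A small sketch, reusing `save_folder` from the script above (the parameter count is dominated by the 128167 × 16 embedding, which is tied with `lm_head`):

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(save_folder, trust_remote_code=True)
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params:,} parameters")  # roughly 2M, almost all in the embedding table
```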

### Printing the model:

```text
HunYuanDenseV1ForCausalLM(
  (model): HunYuanDenseV1Model(
    (embed_tokens): Embedding(128167, 16, padding_idx=127961)
    (layers): ModuleList(
      (0-1): 2 x HunYuanDenseV1DecoderLayer(
        (self_attn): HunYuanDenseV1Attention(
          (q_proj): Linear(in_features=16, out_features=64, bias=False)
          (k_proj): Linear(in_features=16, out_features=32, bias=False)
          (v_proj): Linear(in_features=16, out_features=32, bias=False)
          (o_proj): Linear(in_features=64, out_features=16, bias=False)
          (query_layernorm): HunYuanDenseV1RMSNorm((32,), eps=1e-05)
          (key_layernorm): HunYuanDenseV1RMSNorm((32,), eps=1e-05)
        )
        (mlp): HunYuanDenseV1MLP(
          (gate_proj): Linear(in_features=16, out_features=64, bias=False)
          (up_proj): Linear(in_features=16, out_features=64, bias=False)
          (down_proj): Linear(in_features=64, out_features=16, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): HunYuanDenseV1RMSNorm((16,), eps=1e-05)
        (post_attention_layernorm): HunYuanDenseV1RMSNorm((16,), eps=1e-05)
      )
    )
    (norm): HunYuanDenseV1RMSNorm((16,), eps=1e-05)
    (rotary_emb): HunYuanDenseV1RotaryEmbedding()
  )
  (lm_head): Linear(in_features=16, out_features=128167, bias=False)
)
```
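
The projection widths in this printout follow from the config rather than from `hidden_size`: with `num_attention_heads=2`, `num_key_value_heads=1`, and `head_dim=32`, the query projection is 2 × 32 = 64 wide and the key/value projections are 1 × 32 = 32 wide. A quick check of the arithmetic:

```python
# Derive the printed Linear shapes from the tiny config values.
num_attention_heads, num_key_value_heads, head_dim, hidden_size = 2, 1, 32, 16
assert num_attention_heads * head_dim == 64  # q_proj.out_features, o_proj.in_features
assert num_key_value_heads * head_dim == 32  # k_proj / v_proj out_features
assert hidden_size == 16                     # every projection reads from / writes to 16 dims
```
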
chat_template.jinja ADDED
{%- if not add_generation_prompt is defined %}
{%- set add_generation_prompt = false %}
{%- endif %}
{%- set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true, is_first_user=true, is_last_user=false) %}
{%- for message in messages %}
{%- if message['role'] == 'system' %}
{%- if ns.is_first_sp %}
{%- set ns.system_prompt = ns.system_prompt + message['content'] %}
{%- set ns.is_first_sp = false %}
{%- else %}
{%- set ns.system_prompt = ns.system_prompt + '

' + message['content'] %}
{%- endif %}
{%- endif %}
{%- endfor %}
{{- bos_token }}
{{- ns.system_prompt }}
{%- if tools %}
{%- if ns.system_prompt != '' %}
{{- '

# Tools

You may call one or more functions to assist with the user query.' }}
{%- else %}
{{- '# Tools

You may call one or more functions to assist with the user query.' }}
{%- endif %}
{{- '

You are provided with function signatures within <tools></tools> XML tags:' }}
{{- '
<tools>
' }}
{%- for tool in tools %}
{%- if loop.index0 > 0 %}
{{- '
' }}
{%- endif %}
{{- tool | tojson }}
{%- endfor %}
{{- '
</tools>

' }}
{{- 'For function call returns, you should first print <tool_calls>' }}
{{- 'For each function call, you should return object like:
' }}
{{- '<tool_call>function_name
```json
function_arguments_in_json_format
```</tool_call>' }}
{{- 'At the end of function call returns, you should print </tool_calls>' }}
{%- endif %}
{%- if ns.system_prompt != '' or tools %}
{{- '<|extra_4|>' }}
{%- endif %}
{%- for message in messages %}
{%- if message['role'] == 'user' %}
{%- set ns.is_tool = false %}
{%- set ns.is_first = false %}
{%- set ns.is_last_user = true %}
{%- if ns.is_first_user %}
{{- message['content'] + '<|extra_0|>' }}
{%- set ns.is_first_user = false %}
{%- else %}
{{- bos_token + message['content'] + '<|extra_0|>' }}
{%- endif %}
{%- endif %}
{%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %}
{%- set ns.is_last_user = false %}
{%- if ns.is_tool %}
{{- '</tool_responses>' + '<|extra_0|>' }}
{%- endif %}
{%- set ns.is_first = false %}
{%- set ns.is_tool = false %}
{%- set ns.is_output_first = true %}
{%- for tool in message['tool_calls'] %}
{%- set arguments = tool['function']['arguments'] %}
{%- if arguments is not string %}
{%- set arguments = arguments | tojson %}
{%- endif %}
{%- if not ns.is_first %}
{%- if message['content'] is none %}
{{- '<tool_calls><tool_call>' + tool['function']['name'] + '
' + '```json' + '
' + arguments + '
' + '```' + '</tool_call>' }}
{%- else %}
{{- message['content'] + '<tool_calls><tool_call>' + tool['function']['name'] + '
' + '```json' + '
' + arguments + '
' + '```' + '</tool_call>' }}
{%- endif %}
{%- set ns.is_first = true %}
{%- else %}
{{- '
' + '<tool_call>' + tool['function']['name'] + '
' + '```json' + '
' + arguments + '
' + '```' + '</tool_call>' }}
{%- endif %}
{%- endfor %}
{{- '</tool_calls>' + eos_token }}
{%- endif %}
{%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none) %}
{%- set content = message['content'] %}
{%- if '<answer>' in content and not loop.last %}
{%- set content = content.split('<answer>')[-1].strip('</answer>').strip() %}
{%- endif %}
{%- set ns.is_last_user = false %}
{%- if ns.is_tool %}
{{- '</tool_responses>' + '<|extra_0|>' + content + eos_token }}
{%- set ns.is_tool = false %}
{%- else %}
{{- content + eos_token }}
{%- endif %}
{%- endif %}
{%- if message['role'] == 'tool' %}
{%- set ns.is_last_user = false %}
{%- set ns.is_tool = true %}
{%- if ns.is_output_first %}
{{- bos_token + '<tool_responses><tool_response>' + message['content'] + '</tool_response>' }}
{%- set ns.is_output_first = false %}
{%- else %}
{{- '
<tool_response>' + message['content'] + '</tool_response>' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if ns.is_tool %}
{{- '</tool_responses>' + '<|extra_0|>' }}
{%- endif %}
{%- if add_generation_prompt and not ns.is_last_user and not ns.is_tool %}
{{- '<|extra_0|>' }}
{%- endif %}
{%- if enable_thinking is defined and not enable_thinking %}
{{- '<think>

</think>
' }}
{%- endif %}
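
To see what this template emits, it can be rendered offline through the tokenizer. A minimal sketch, assuming a recent transformers with tool support in `apply_chat_template` (the tool schema and messages below are invented for illustration):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "yujiepan/hunyuan-dense-v1-tiny-random", trust_remote_code=True
)
# A made-up tool schema, just to exercise the <tools> branch of the template.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Look up the weather for a city.",
        "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
    },
}]
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the weather in Paris?"},
]
prompt = tokenizer.apply_chat_template(
    messages, tools=tools, add_generation_prompt=True, tokenize=False
)
print(prompt)  # bos + system prompt + "# Tools" block + <|extra_4|>, then the user turn ending in <|extra_0|>
```
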
config.json ADDED
{
  "add_classification_head": false,
  "architectures": [
    "HunYuanDenseV1ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.1,
  "attention_head_dim": 128,
  "bos_token_id": 1,
  "cla_share_factor": 2,
  "class_num": 0,
  "dense_list": [
    4096,
    0
  ],
  "eos_token_id": 127960,
  "head_dim": 32,
  "hidden_act": "silu",
  "hidden_size": 16,
  "im_end_id": 5,
  "im_newline_id": 11,
  "im_start_id": 4,
  "initializer_range": 0.02,
  "intermediate_size": 64,
  "mask_init_id": 12,
  "max_position_embeddings": 32768,
  "mlp_bias": false,
  "model_type": "hunyuan_v1_dense",
  "norm_type": "rms",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "num_key_value_heads": 1,
  "org_vocab_size": 128167,
  "pad_id": 127961,
  "pad_token_id": 127961,
  "pool_type": "last",
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "alpha": 1000.0,
    "beta_fast": 32,
    "beta_slow": 1,
    "factor": 1.0,
    "mscale": 1.0,
    "mscale_all_dim": 1.0,
    "type": "dynamic"
  },
  "rope_theta": 10000.0,
  "sep_token_id": 127962,
  "sliding_window": null,
  "text_end_id": 7,
  "text_start_id": 6,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.56.0.dev0",
  "use_cache": true,
  "use_cla": false,
  "use_qk_norm": true,
  "use_rotary_pos_emb": true,
  "vocab_size": 128167
}
generation_config.json ADDED
{
  "do_sample": true,
  "eos_token_id": [
    127960,
    127967
  ],
  "pad_token_id": 127961,
  "repetition_penalty": 1.05,
  "temperature": 0.7,
  "top_k": 20,
  "top_p": 0.8,
  "transformers_version": "4.56.0.dev0",
  "trust_remote_code": true
}
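
These sampling defaults (temperature 0.7, top-p 0.8, top-k 20, repetition penalty 1.05) are picked up automatically by `generate()`, and each can be overridden per call. A minimal sketch:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "yujiepan/hunyuan-dense-v1-tiny-random"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

inputs = tokenizer("hi", return_tensors="pt")
sampled = model.generate(**inputs, max_new_tokens=8)                  # uses the defaults above
greedy = model.generate(**inputs, max_new_tokens=8, do_sample=False)  # overrides do_sample per call
```
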
hy.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:30ce24ee92bb9eaab342190bd257bf36e9b06bf5c156f1c5c14c1c6526346e41
size 4128912
special_tokens_map.json ADDED
{
  "additional_special_tokens": [
    "<|startoftext|>",
    "<|extra_0|>",
    "<|extra_4|>",
    "<|extra_5|>",
    "<|eos|>"
  ],
  "bos_token": "<|startoftext|>",
  "eos_token": "<|eos|>",
  "pad_token": "<|pad|>"
}
tokenization_hy.py ADDED
import base64
import logging
import os
import unicodedata
from typing import Collection, Dict, List, Set, Tuple, Union

import tiktoken
from transformers import PreTrainedTokenizer, AddedToken

logger = logging.getLogger(__name__)


VOCAB_FILES_NAMES = {"vocab_file": "hy.tiktoken"}

PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
# PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
ENDOFTEXT = "<|endoftext|>"
STARTOFTEXT = "<|startoftext|>"
BOSTOKEN = "<|bos|>"
EOSTOKEN = "<|eos|>"
PADTOKEN = "<|pad|>"

# As the default behavior is changed to allow special tokens in
# regular texts, the surface forms of special tokens need to be
# as different as possible to minimize the impact.
EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
# Changed to use the actual index to avoid misconfiguration with vocabulary expansion.


SPECIAL_START_ID = 127957


def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
    dic = {}
    rank = 0
    for line in open(tiktoken_bpe_file, "rb"):
        if line:
            token, _ = line.split()
            if base64.b64decode(token) in dic:
                continue
            dic[base64.b64decode(token)] = int(rank)
            rank += 1
    global SPECIAL_START_ID
    SPECIAL_START_ID = rank
    return dic


# NOTE: Run the two commented lines below to verify that SPECIAL_START_ID is correct;
# it determines the IDs assigned to the special tokens.
# _load_tiktoken_bpe('/apdcephfs/share_1502809/shaneshu/tokenizer_exp/other_tokenizer_vocab/hy/' + VOCAB_FILES_NAMES['vocab_file'])
# print(SPECIAL_START_ID)

SPECIAL_TOKENS = tuple(
    enumerate(
        (
            (
                ENDOFTEXT,
                STARTOFTEXT,
                BOSTOKEN,
                EOSTOKEN,
                PADTOKEN,
            )
            + EXTRAS
        ),
        start=SPECIAL_START_ID,
    )
)
# NOTE: Unused token IDs start from 127962.
SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)


class HYTokenizer(PreTrainedTokenizer):
    """HunYuan tokenizer."""

    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        errors="replace",
        extra_vocab_file=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # How to handle errors when decoding UTF-8 byte sequences;
        # use "ignore" if you are doing streaming inference.
        self.errors = errors

        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: Dict[bytes, int]
        self.special_tokens = {
            token: index
            for index, token in SPECIAL_TOKENS
        }

        # Try to load extra vocab from file.
        if extra_vocab_file is not None:
            used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
            extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
            for token, index in extra_mergeable_ranks.items():
                if token in self.mergeable_ranks:
                    logger.info(f"extra token {token} exists, skipping")
                    continue
                if index in used_ids:
                    logger.info(f'the index {index} for extra token {token} exists, skipping')
                    continue
                self.mergeable_ranks[token] = index
            # The index may be sparse after this, but tiktoken.Encoding will handle it.

        enc = tiktoken.Encoding(
            "HunYuan",
            pat_str=PAT_STR,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        assert (
            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
        ), f"{len(self.mergeable_ranks)} + {len(self.special_tokens)} != {enc.n_vocab} in encoding"

        self.decoder = {
            v: k for k, v in self.mergeable_ranks.items()
        }  # type: dict[int, bytes|str]
        self.decoder.update({v: k for k, v in self.special_tokens.items()})

        self.tokenizer = enc  # type: tiktoken.Encoding

        self.eod_id = self.tokenizer.eot_token
        self.bod_id = self.special_tokens[STARTOFTEXT]
        self.bos_id = self.special_tokens[BOSTOKEN]
        self.eos_id = self.special_tokens[EOSTOKEN]
        self.pad_id = self.special_tokens[PADTOKEN]

    def __getstate__(self):
        # For pickle support: the tiktoken.Encoding is not picklable, so drop it.
        state = self.__dict__.copy()
        del state["tokenizer"]
        return state

    def __setstate__(self, state):
        # The tokenizer is not Python-native; rebuild it instead of unpickling it.
        self.__dict__.update(state)
        enc = tiktoken.Encoding(
            "HunYuan",
            pat_str=PAT_STR,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        self.tokenizer = enc

    def __len__(self) -> int:
        return self.tokenizer.n_vocab

    def get_vocab(self) -> Dict[bytes, int]:
        return self.mergeable_ranks

    def convert_tokens_to_ids(
        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
    ) -> Union[int, List[int]]:
        ids = []
        if isinstance(tokens, (str, bytes)):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.mergeable_ranks.get(tokens)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.mergeable_ranks.get(token))
        return ids

    def _add_tokens(
        self,
        new_tokens: Union[List[str], List[AddedToken]],
        special_tokens: bool = False,
    ) -> int:
        if not special_tokens and new_tokens:
            raise ValueError("Adding regular tokens is not supported")
        for token in new_tokens:
            surface_form = token.content if isinstance(token, AddedToken) else token
            if surface_form not in SPECIAL_TOKENS_SET:
                raise ValueError("Adding unknown special tokens is not supported")
        return 0

    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
        """
        Save only the vocabulary of the tokenizer.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        file_path = os.path.join(save_directory, "hy.tiktoken")
        with open(file_path, "w", encoding="utf-8") as w:
            for k, v in self.mergeable_ranks.items():
                line = base64.b64encode(k).decode("utf-8") + " " + str(v) + "\n"
                w.write(line)
        return (file_path,)

    def tokenize(
        self,
        text: str,
        allowed_special: Union[Set, str] = "all",
        disallowed_special: Union[Collection, str] = (),
        **kwargs,
    ) -> List[Union[bytes, str]]:
        """
        Converts a string into a sequence of tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            allowed_special (`Literal["all"]` or `set`):
                The surface forms of the tokens to be encoded as special tokens in regular texts.
                Defaults to "all".
            disallowed_special (`Literal["all"]` or `Collection`):
                The surface forms of the tokens that should not appear in regular texts and trigger errors.
                Defaults to an empty tuple.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model-specific encode method.

        Returns:
            `List[bytes|str]`: The list of tokens.
        """
        tokens = []
        text = unicodedata.normalize("NFC", text)

        # This implementation takes a detour: text -> token id -> token surface form.
        for t in self.tokenizer.encode(
            text, allowed_special=allowed_special, disallowed_special=disallowed_special
        ):
            tokens.append(self.decoder[t])
        return tokens

    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
        """
        Converts a sequence of tokens into a single string.
        """
        text = ""
        temp = b""
        for t in tokens:
            if isinstance(t, str):
                if temp:
                    text += temp.decode("utf-8", errors=self.errors)
                    temp = b""
                text += t
            elif isinstance(t, bytes):
                temp += t
            else:
                raise TypeError("token should only be of type bytes or str")
        if temp:
            text += temp.decode("utf-8", errors=self.errors)
        return text

    @property
    def vocab_size(self):
        return self.tokenizer.n_vocab

    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
        """Converts an id to a token, special tokens included."""
        if index in self.decoder:
            return self.decoder[index]
        raise ValueError("unknown ids")

    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
        """Converts a token to an id using the vocab, special tokens included."""
        if token in self.special_tokens:
            return self.special_tokens[token]
        if token in self.mergeable_ranks:
            return self.mergeable_ranks[token]
        raise ValueError("unknown token")

    def _tokenize(self, text: str, **kwargs):
        """
        Converts a string into a sequence of tokens (strings), using the tokenizer. Splits into words for word-based
        vocabularies or sub-words for sub-word-based vocabularies (BPE/SentencePiece/WordPiece).
        Does NOT take care of added tokens.
        """
        raise NotImplementedError

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        errors: str = None,
        **kwargs,
    ) -> str:
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [i for i in token_ids if i < self.eod_id]
        return self.tokenizer.decode(token_ids, errors=errors or self.errors)


# Tests
if __name__ == "__main__":
    tokenizer = HYTokenizer.from_pretrained('./other_tokenizer_vocab/hy')
    text = '你好,世界'
    tokens = tokenizer.tokenize(text)
    print(tokens)
    ids = tokenizer.convert_tokens_to_ids(tokens)
    print(ids)
    text2 = tokenizer.convert_tokens_to_string(tokens)
    print(text2)
    ids2 = tokenizer.convert_tokens_to_ids(tokens)
    print(ids2)
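
The `auto_map` entry in `tokenizer_config.json` (shown below) exposes this class through the standard `AutoTokenizer` API. A minimal round-trip sketch:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "yujiepan/hunyuan-dense-v1-tiny-random", trust_remote_code=True
)
ids = tokenizer.encode("hello world")
print(ids)
print(tokenizer.decode(ids))  # -> "hello world"
```
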
tokenizer_config.json ADDED
{
  "added_tokens_decoder": {},
  "additional_special_tokens": [
    "<|startoftext|>",
    "<|extra_0|>",
    "<|extra_4|>",
    "<|extra_5|>",
    "<|eos|>"
  ],
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_hy.HYTokenizer",
      null
    ]
  },
  "bos_token": "<|startoftext|>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|eos|>",
  "extra_special_tokens": {},
  "model_max_length": 262144,
  "model_type": "gpt2",
  "pad_token": "<|pad|>",
  "tokenizer_class": "HYTokenizer"
}