yujiepan committed
Commit a3cb486 · verified · 1 Parent(s): 345fe9d

Upload folder using huggingface_hub

README.md ADDED
---
library_name: transformers
pipeline_tag: text-generation
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
base_model:
- tencent/Hunyuan-7B-Instruct
---

This tiny model is for debugging. It is randomly initialized with a config adapted from [tencent/Hunyuan-7B-Instruct](https://huggingface.co/tencent/Hunyuan-7B-Instruct).

### Example usage:

```python
import torch
from transformers import pipeline

model_id = "yujiepan/hunyuan-dense-v1-tiny-random"
messages = [
    {
        "role": "user",
        "content": "hi",
    }
]
pipe = pipeline(
    'text-generation',
    model_id,
    device='cuda',
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
print(pipe(messages, max_new_tokens=32))
```
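
For lower-level control, the same model can also be driven without `pipeline`. A minimal sketch using `apply_chat_template` and `generate()` directly (illustrative only; assumes a CUDA device as above):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "yujiepan/hunyuan-dense-v1-tiny-random"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map='cuda', trust_remote_code=True
)

messages = [{"role": "user", "content": "hi"}]
# chat_template.jinja ships with the repo, so apply_chat_template renders the prompt
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors='pt'
).to(model.device)
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```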

### Code to create this repo:

```python
import json

import torch
from huggingface_hub import file_exists, hf_hub_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    GenerationConfig,
    set_seed,
)

source_model_id = "tencent/Hunyuan-7B-Instruct"
save_folder = "/tmp/yujiepan/hunyuan-dense-v1-tiny-random"

processor = AutoProcessor.from_pretrained(source_model_id, trust_remote_code=True)
processor.save_pretrained(save_folder)

# Shrink the source config down to a tiny architecture
with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json = json.load(f)
config_json['hidden_size'] = 16
config_json['head_dim'] = 32
config_json['intermediate_size'] = 64
config_json['num_attention_heads'] = 2
config_json['num_hidden_layers'] = 2
config_json['num_key_value_heads'] = 1
config_json['tie_word_embeddings'] = True
with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)

config = AutoConfig.from_pretrained(
    save_folder,
    trust_remote_code=True,
)
print(config)
torch.set_default_dtype(torch.bfloat16)
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
torch.set_default_dtype(torch.float32)
if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
    model.generation_config = GenerationConfig.from_pretrained(
        source_model_id, trust_remote_code=True,
    )
set_seed(42)
model = model.cpu()  # CPU is more stable for random initialization across machines
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.1)
        print(name, p.shape)
model.save_pretrained(save_folder)
print(model)
```
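
As a sanity check, the saved checkpoint can be reloaded and its size inspected. A small sketch, reusing `save_folder` from the script above (the parameter count is dominated by the 128167 × 16 embedding, which is tied with `lm_head`):

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(save_folder, trust_remote_code=True)
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params:,} parameters")  # roughly 2M, almost all in the embedding table
```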

### Printing the model:

```text
HunYuanDenseV1ForCausalLM(
  (model): HunYuanDenseV1Model(
    (embed_tokens): Embedding(128167, 16, padding_idx=127961)
    (layers): ModuleList(
      (0-1): 2 x HunYuanDenseV1DecoderLayer(
        (self_attn): HunYuanDenseV1Attention(
          (q_proj): Linear(in_features=16, out_features=64, bias=False)
          (k_proj): Linear(in_features=16, out_features=32, bias=False)
          (v_proj): Linear(in_features=16, out_features=32, bias=False)
          (o_proj): Linear(in_features=64, out_features=16, bias=False)
          (query_layernorm): HunYuanDenseV1RMSNorm((32,), eps=1e-05)
          (key_layernorm): HunYuanDenseV1RMSNorm((32,), eps=1e-05)
        )
        (mlp): HunYuanDenseV1MLP(
          (gate_proj): Linear(in_features=16, out_features=64, bias=False)
          (up_proj): Linear(in_features=16, out_features=64, bias=False)
          (down_proj): Linear(in_features=64, out_features=16, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): HunYuanDenseV1RMSNorm((16,), eps=1e-05)
        (post_attention_layernorm): HunYuanDenseV1RMSNorm((16,), eps=1e-05)
      )
    )
    (norm): HunYuanDenseV1RMSNorm((16,), eps=1e-05)
    (rotary_emb): HunYuanDenseV1RotaryEmbedding()
  )
  (lm_head): Linear(in_features=16, out_features=128167, bias=False)
)
```
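
The projection widths in this printout follow from the config rather than from `hidden_size`: with `num_attention_heads=2`, `num_key_value_heads=1`, and `head_dim=32`, the query projection is 2 × 32 = 64 wide and the key/value projections are 1 × 32 = 32 wide. A quick check of the arithmetic:

```python
# Derive the printed Linear shapes from the tiny config values.
num_attention_heads, num_key_value_heads, head_dim, hidden_size = 2, 1, 32, 16
assert num_attention_heads * head_dim == 64  # q_proj.out_features, o_proj.in_features
assert num_key_value_heads * head_dim == 32  # k_proj / v_proj out_features
assert hidden_size == 16                     # every projection reads from / writes to 16 dims
```
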
chat_template.jinja ADDED
{%- if not add_generation_prompt is defined %}
{%- set add_generation_prompt = false %}
{%- endif %}
{%- set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true, is_first_user=true, is_last_user=false) %}
{%- for message in messages %}
{%- if message['role'] == 'system' %}
{%- if ns.is_first_sp %}
{%- set ns.system_prompt = ns.system_prompt + message['content'] %}
{%- set ns.is_first_sp = false %}
{%- else %}
{%- set ns.system_prompt = ns.system_prompt + '

' + message['content'] %}
{%- endif %}
{%- endif %}
{%- endfor %}
{{- bos_token }}
{{- ns.system_prompt }}
{%- if tools %}
{%- if ns.system_prompt != '' %}
{{- '

# Tools

You may call one or more functions to assist with the user query.' }}
{%- else %}
{{- '# Tools

You may call one or more functions to assist with the user query.' }}
{%- endif %}
{{- '

You are provided with function signatures within <tools></tools> XML tags:' }}
{{- '
<tools>
' }}
{%- for tool in tools %}
{%- if loop.index0 > 0 %}
{{- '
' }}
{%- endif %}
{{- tool | tojson }}
{%- endfor %}
{{- '
</tools>

' }}
{{- 'For function call returns, you should first print <tool_calls>' }}
{{- 'For each function call, you should return object like:
' }}
{{- '<tool_call>function_name
```json
function_arguments_in_json_format
```</tool_call>' }}
{{- 'At the end of function call returns, you should print </tool_calls>' }}
{%- endif %}
{%- if ns.system_prompt != '' or tools %}
{{- '<|extra_4|>' }}
{%- endif %}
{%- for message in messages %}
{%- if message['role'] == 'user' %}
{%- set ns.is_tool = false %}
{%- set ns.is_first = false %}
{%- set ns.is_last_user = true %}
{%- if ns.is_first_user %}
{{- message['content'] + '<|extra_0|>' }}
{%- set ns.is_first_user = false %}
{%- else %}
{{- bos_token + message['content'] + '<|extra_0|>' }}
{%- endif %}
{%- endif %}
{%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %}
{%- set ns.is_last_user = false %}
{%- if ns.is_tool %}
{{- '</tool_responses>' + '<|extra_0|>' }}
{%- endif %}
{%- set ns.is_first = false %}
{%- set ns.is_tool = false %}
{%- set ns.is_output_first = true %}
{%- for tool in message['tool_calls'] %}
{%- set arguments = tool['function']['arguments'] %}
{%- if arguments is not string %}
{%- set arguments = arguments | tojson %}
{%- endif %}
{%- if not ns.is_first %}
{%- if message['content'] is none %}
{{- '<tool_calls><tool_call>' + tool['function']['name'] + '
' + '```json' + '
' + arguments + '
' + '```' + '</tool_call>' }}
{%- else %}
{{- message['content'] + '<tool_calls><tool_call>' + tool['function']['name'] + '
' + '```json' + '
' + arguments + '
' + '```' + '</tool_call>' }}
{%- endif %}
{%- set ns.is_first = true %}
{%- else %}
{{- '
' + '<tool_call>' + tool['function']['name'] + '
' + '```json' + '
' + arguments + '
' + '```' + '</tool_call>' }}
{%- endif %}
{%- endfor %}
{{- '</tool_calls>' + eos_token }}
{%- endif %}
{%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none) %}
{%- set content = message['content'] %}
{%- if '<answer>' in content and not loop.last %}
{%- set content = content.split('<answer>')[-1].strip('</answer>').strip() %}
{%- endif %}
{%- set ns.is_last_user = false %}
{%- if ns.is_tool %}
{{- '</tool_responses>' + '<|extra_0|>' + content + eos_token }}
{%- set ns.is_tool = false %}
{%- else %}
{{- content + eos_token }}
{%- endif %}
{%- endif %}
{%- if message['role'] == 'tool' %}
{%- set ns.is_last_user = false %}
{%- set ns.is_tool = true %}
{%- if ns.is_output_first %}
{{- bos_token + '<tool_responses><tool_response>' + message['content'] + '</tool_response>' }}
{%- set ns.is_output_first = false %}
{%- else %}
{{- '
<tool_response>' + message['content'] + '</tool_response>' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if ns.is_tool %}
{{- '</tool_responses>' + '<|extra_0|>' }}
{%- endif %}
{%- if add_generation_prompt and not ns.is_last_user and not ns.is_tool %}
{{- '<|extra_0|>' }}
{%- endif %}
{%- if enable_thinking is defined and not enable_thinking %}
{{- '<think>

</think>
' }}
{%- endif %}
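
To see what this template emits, it can be rendered offline through the tokenizer. A minimal sketch, assuming a recent transformers with tool support in `apply_chat_template` (the tool schema and messages below are invented for illustration):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "yujiepan/hunyuan-dense-v1-tiny-random", trust_remote_code=True
)
# A made-up tool schema, just to exercise the <tools> branch of the template.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Look up the weather for a city.",
        "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
    },
}]
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the weather in Paris?"},
]
prompt = tokenizer.apply_chat_template(
    messages, tools=tools, add_generation_prompt=True, tokenize=False
)
print(prompt)  # bos + system prompt + "# Tools" block + <|extra_4|>, then the user turn ending in <|extra_0|>
```
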
config.json ADDED
{
  "add_classification_head": false,
  "architectures": [
    "HunYuanDenseV1ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.1,
  "attention_head_dim": 128,
  "bos_token_id": 1,
  "cla_share_factor": 2,
  "class_num": 0,
  "dense_list": [
    4096,
    0
  ],
  "eos_token_id": 127960,
  "head_dim": 32,
  "hidden_act": "silu",
  "hidden_size": 16,
  "im_end_id": 5,
  "im_newline_id": 11,
  "im_start_id": 4,
  "initializer_range": 0.02,
  "intermediate_size": 64,
  "mask_init_id": 12,
  "max_position_embeddings": 32768,
  "mlp_bias": false,
  "model_type": "hunyuan_v1_dense",
  "norm_type": "rms",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "num_key_value_heads": 1,
  "org_vocab_size": 128167,
  "pad_id": 127961,
  "pad_token_id": 127961,
  "pool_type": "last",
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "alpha": 1000.0,
    "beta_fast": 32,
    "beta_slow": 1,
    "factor": 1.0,
    "mscale": 1.0,
    "mscale_all_dim": 1.0,
    "type": "dynamic"
  },
  "rope_theta": 10000.0,
  "sep_token_id": 127962,
  "sliding_window": null,
  "text_end_id": 7,
  "text_start_id": 6,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.56.0.dev0",
  "use_cache": true,
  "use_cla": false,
  "use_qk_norm": true,
  "use_rotary_pos_emb": true,
  "vocab_size": 128167
}
generation_config.json ADDED
{
  "do_sample": true,
  "eos_token_id": [
    127960,
    127967
  ],
  "pad_token_id": 127961,
  "repetition_penalty": 1.05,
  "temperature": 0.7,
  "top_k": 20,
  "top_p": 0.8,
  "transformers_version": "4.56.0.dev0",
  "trust_remote_code": true
}
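
These sampling defaults (temperature 0.7, top-p 0.8, top-k 20, repetition penalty 1.05) are picked up automatically by `generate()`, and each can be overridden per call. A minimal sketch:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "yujiepan/hunyuan-dense-v1-tiny-random"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

inputs = tokenizer("hi", return_tensors="pt")
sampled = model.generate(**inputs, max_new_tokens=8)                  # uses the defaults above
greedy = model.generate(**inputs, max_new_tokens=8, do_sample=False)  # overrides do_sample per call
```
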
hy.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:30ce24ee92bb9eaab342190bd257bf36e9b06bf5c156f1c5c14c1c6526346e41
size 4128912
special_tokens_map.json ADDED
{
  "additional_special_tokens": [
    "<|startoftext|>",
    "<|extra_0|>",
    "<|extra_4|>",
    "<|extra_5|>",
    "<|eos|>"
  ],
  "bos_token": "<|startoftext|>",
  "eos_token": "<|eos|>",
  "pad_token": "<|pad|>"
}
tokenization_hy.py ADDED
import base64
import logging
import os
import unicodedata
from typing import Collection, Dict, List, Set, Tuple, Union

import tiktoken
from transformers import PreTrainedTokenizer, AddedToken

logger = logging.getLogger(__name__)


VOCAB_FILES_NAMES = {"vocab_file": "hy.tiktoken"}

PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
# PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
ENDOFTEXT = "<|endoftext|>"
STARTOFTEXT = "<|startoftext|>"
BOSTOKEN = "<|bos|>"
EOSTOKEN = "<|eos|>"
PADTOKEN = "<|pad|>"

# As the default behavior is changed to allow special tokens in
# regular texts, the surface forms of special tokens need to be
# as different as possible to minimize the impact.
EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
# Changed to use the actual index to avoid misconfiguration with vocabulary expansion.


SPECIAL_START_ID = 127957


def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
    dic = {}
    rank = 0
    for line in open(tiktoken_bpe_file, "rb"):
        if line:
            token, _ = line.split()
            if base64.b64decode(token) in dic:
                continue
            dic[base64.b64decode(token)] = int(rank)
            rank += 1
    global SPECIAL_START_ID
    SPECIAL_START_ID = rank
    return dic


# NOTE: Run the two commented lines below to verify that SPECIAL_START_ID is correct;
# it determines the IDs assigned to the special tokens.
# _load_tiktoken_bpe('/apdcephfs/share_1502809/shaneshu/tokenizer_exp/other_tokenizer_vocab/hy/' + VOCAB_FILES_NAMES['vocab_file'])
# print(SPECIAL_START_ID)

SPECIAL_TOKENS = tuple(
    enumerate(
        (
            (
                ENDOFTEXT,
                STARTOFTEXT,
                BOSTOKEN,
                EOSTOKEN,
                PADTOKEN,
            )
            + EXTRAS
        ),
        start=SPECIAL_START_ID,
    )
)
# NOTE: Unused token IDs start from 127962.
SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)


class HYTokenizer(PreTrainedTokenizer):
    """HunYuan tokenizer."""

    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        errors="replace",
        extra_vocab_file=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # How to handle errors when decoding UTF-8 byte sequences;
        # use "ignore" if you are doing streaming inference.
        self.errors = errors

        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: Dict[bytes, int]
        self.special_tokens = {
            token: index
            for index, token in SPECIAL_TOKENS
        }

        # Try to load extra vocab from file.
        if extra_vocab_file is not None:
            used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
            extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
            for token, index in extra_mergeable_ranks.items():
                if token in self.mergeable_ranks:
                    logger.info(f"extra token {token} exists, skipping")
                    continue
                if index in used_ids:
                    logger.info(f'the index {index} for extra token {token} exists, skipping')
                    continue
                self.mergeable_ranks[token] = index
            # The index may be sparse after this, but tiktoken.Encoding will handle it.

        enc = tiktoken.Encoding(
            "HunYuan",
            pat_str=PAT_STR,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        assert (
            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
        ), f"{len(self.mergeable_ranks)} + {len(self.special_tokens)} != {enc.n_vocab} in encoding"

        self.decoder = {
            v: k for k, v in self.mergeable_ranks.items()
        }  # type: dict[int, bytes|str]
        self.decoder.update({v: k for k, v in self.special_tokens.items()})

        self.tokenizer = enc  # type: tiktoken.Encoding

        self.eod_id = self.tokenizer.eot_token
        self.bod_id = self.special_tokens[STARTOFTEXT]
        self.bos_id = self.special_tokens[BOSTOKEN]
        self.eos_id = self.special_tokens[EOSTOKEN]
        self.pad_id = self.special_tokens[PADTOKEN]

    def __getstate__(self):
        # For pickle support: the tiktoken.Encoding is not picklable, so drop it.
        state = self.__dict__.copy()
        del state["tokenizer"]
        return state

    def __setstate__(self, state):
        # The tokenizer is not Python-native; rebuild it instead of unpickling it.
        self.__dict__.update(state)
        enc = tiktoken.Encoding(
            "HunYuan",
            pat_str=PAT_STR,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        self.tokenizer = enc

    def __len__(self) -> int:
        return self.tokenizer.n_vocab

    def get_vocab(self) -> Dict[bytes, int]:
        return self.mergeable_ranks

    def convert_tokens_to_ids(
        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
    ) -> Union[int, List[int]]:
        ids = []
        if isinstance(tokens, (str, bytes)):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.mergeable_ranks.get(tokens)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.mergeable_ranks.get(token))
        return ids

    def _add_tokens(
        self,
        new_tokens: Union[List[str], List[AddedToken]],
        special_tokens: bool = False,
    ) -> int:
        if not special_tokens and new_tokens:
            raise ValueError("Adding regular tokens is not supported")
        for token in new_tokens:
            surface_form = token.content if isinstance(token, AddedToken) else token
            if surface_form not in SPECIAL_TOKENS_SET:
                raise ValueError("Adding unknown special tokens is not supported")
        return 0

    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
        """
        Save only the vocabulary of the tokenizer.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        file_path = os.path.join(save_directory, "hy.tiktoken")
        with open(file_path, "w", encoding="utf-8") as w:
            for k, v in self.mergeable_ranks.items():
                line = base64.b64encode(k).decode("utf-8") + " " + str(v) + "\n"
                w.write(line)
        return (file_path,)

    def tokenize(
        self,
        text: str,
        allowed_special: Union[Set, str] = "all",
        disallowed_special: Union[Collection, str] = (),
        **kwargs,
    ) -> List[Union[bytes, str]]:
        """
        Converts a string into a sequence of tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            allowed_special (`Literal["all"]` or `set`):
                The surface forms of the tokens to be encoded as special tokens in regular texts.
                Defaults to "all".
            disallowed_special (`Literal["all"]` or `Collection`):
                The surface forms of the tokens that should not appear in regular texts and trigger errors.
                Defaults to an empty tuple.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model-specific encode method.

        Returns:
            `List[bytes|str]`: The list of tokens.
        """
        tokens = []
        text = unicodedata.normalize("NFC", text)

        # This implementation takes a detour: text -> token id -> token surface form.
        for t in self.tokenizer.encode(
            text, allowed_special=allowed_special, disallowed_special=disallowed_special
        ):
            tokens.append(self.decoder[t])
        return tokens

    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
        """
        Converts a sequence of tokens into a single string.
        """
        text = ""
        temp = b""
        for t in tokens:
            if isinstance(t, str):
                if temp:
                    text += temp.decode("utf-8", errors=self.errors)
                    temp = b""
                text += t
            elif isinstance(t, bytes):
                temp += t
            else:
                raise TypeError("token should only be of type bytes or str")
        if temp:
            text += temp.decode("utf-8", errors=self.errors)
        return text

    @property
    def vocab_size(self):
        return self.tokenizer.n_vocab

    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
        """Converts an id to a token, special tokens included."""
        if index in self.decoder:
            return self.decoder[index]
        raise ValueError("unknown ids")

    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
        """Converts a token to an id using the vocab, special tokens included."""
        if token in self.special_tokens:
            return self.special_tokens[token]
        if token in self.mergeable_ranks:
            return self.mergeable_ranks[token]
        raise ValueError("unknown token")

    def _tokenize(self, text: str, **kwargs):
        """
        Converts a string into a sequence of tokens (strings), using the tokenizer. Splits into words for word-based
        vocabularies or sub-words for sub-word-based vocabularies (BPE/SentencePiece/WordPiece).
        Does NOT take care of added tokens.
        """
        raise NotImplementedError

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        errors: str = None,
        **kwargs,
    ) -> str:
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [i for i in token_ids if i < self.eod_id]
        return self.tokenizer.decode(token_ids, errors=errors or self.errors)


# Tests
if __name__ == "__main__":
    tokenizer = HYTokenizer.from_pretrained('./other_tokenizer_vocab/hy')
    text = '你好,世界'
    tokens = tokenizer.tokenize(text)
    print(tokens)
    ids = tokenizer.convert_tokens_to_ids(tokens)
    print(ids)
    text2 = tokenizer.convert_tokens_to_string(tokens)
    print(text2)
    ids2 = tokenizer.convert_tokens_to_ids(tokens)
    print(ids2)
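
The `auto_map` entry in `tokenizer_config.json` (shown below) exposes this class through the standard `AutoTokenizer` API. A minimal round-trip sketch:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "yujiepan/hunyuan-dense-v1-tiny-random", trust_remote_code=True
)
ids = tokenizer.encode("hello world")
print(ids)
print(tokenizer.decode(ids))  # -> "hello world"
```
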
tokenizer_config.json ADDED
{
  "added_tokens_decoder": {},
  "additional_special_tokens": [
    "<|startoftext|>",
    "<|extra_0|>",
    "<|extra_4|>",
    "<|extra_5|>",
    "<|eos|>"
  ],
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_hy.HYTokenizer",
      null
    ]
  },
  "bos_token": "<|startoftext|>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|eos|>",
  "extra_special_tokens": {},
  "model_max_length": 262144,
  "model_type": "gpt2",
  "pad_token": "<|pad|>",
  "tokenizer_class": "HYTokenizer"
}