demo

import os
import io
import uuid
import time
import asyncio
import requests
import subprocess
from time import perf_counter
from threading import Thread


import modal

APP_NAME = os.getenv("APP_NAME", "")

app = modal.App("hunyuan_mt")
img = (
    # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda/tags
    modal.Image.from_registry(
        "nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04",
        add_python="3.10",
    )
    .apt_install("git", "git-lfs", "ffmpeg", "clang", "cmake", "ninja-build")
    .pip_install("wheel")
    .pip_install("accelerate", "torch==2.6.0", "transformers==4.56.0")
    # https://github.com/Dao-AILab/flash-attention/releases/tag/v2.7.4.post1
    .pip_install("flash-attn==2.7.4.post1", extra_options="--no-build-isolation")
    .pip_install("compressed-tensors==0.11.0")
    .env(
        {
            "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
            "TQDM_DISABLE": "1",
            "LLM_MODEL": os.getenv("LLM_MODEL", "tencent/Hunyuan-MT-7B"),
        }
    )
)

if APP_NAME == "achatbot":
    img = img.pip_install(
        f"achatbot==0.0.24.post2",
        extra_index_url=os.getenv("EXTRA_INDEX_URL", "https://pypi.org/simple/"),
    )

HF_MODEL_DIR = "/root/.achatbot/models"
hf_model_vol = modal.Volume.from_name("models", create_if_missing=True)


with img.imports():
    import random

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
    from transformers.generation.streamers import TextIteratorStreamer

    MODEL_ID = os.getenv("LLM_MODEL")
    MODEL_PATH = os.path.join(HF_MODEL_DIR, MODEL_ID)

    config = None
    if MODEL_ID in ["tencent/Hunyuan-MT-7B-fp8"]:
        # https://huggingface.co/docs/transformers/v4.56.0/en/model_doc/hunyuan_v1_dense#transformers.HunYuanDenseV1Config
        config = AutoConfig.from_pretrained(MODEL_PATH)
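        # compressed-tensors>=0.11.0 expects the key "ignore"; the fp8 checkpoint's
        # config.json still ships "ignored_layers", so rename the key in memory before loading.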
        config.quantization_config["ignore"] = config.quantization_config["ignored_layers"]
        del config.quantization_config["ignored_layers"]




@app.function(
    gpu=os.getenv("IMAGE_GPU", None),
    cpu=2.0,
    retries=1,
    image=img,
    volumes={
        HF_MODEL_DIR: hf_model_vol,
    },
    timeout=1200,  # default 300s
    scaledown_window=1200,
    max_containers=1,
)
async def run(func):
    subprocess.run("nvidia-smi --version", shell=True)
    subprocess.run("nvcc --version", shell=True)
    gpu_prop = None
    if torch.cuda.is_available():
        gpu_prop = torch.cuda.get_device_properties("cuda")
        print(gpu_prop)

    if asyncio.iscoroutinefunction(func):
        await func(gpu_prop)
    else:
        func(gpu_prop)


def print_model_params(model: torch.nn.Module, extra_info=""):
    # print the number of parameters in the model
    model_million_params = sum(p.numel() for p in model.parameters()) / 1e6
    print(model)
    print(f"{extra_info} {model_million_params} M parameters")


# https://huggingface.co/collections/tencent/hunyuan-mt-68b42f76d473f82798882597
def dump_model(gpu_prop):
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map="auto",
        dtype="auto",
        attn_implementation="flash_attention_2" if gpu_prop.major >= 8 else None,
        config=config,
    )

    model = model.eval()
    # tencent/Hunyuan-MT-7B 7504.932864 M parameters # BF16
    # tencent/Hunyuan-MT-7B-fp8 7504.933312 M parameters ??? # F32 · BF16 · F8_E4M3
    print_model_params(model, f"{MODEL_ID}")
    print(f"{config=}")
    print(f"{model.config=}")
    if config is not None:  # config is only set for the fp8 checkpoint
        assert config == model.config

    del model
    torch.cuda.empty_cache()


def tokenize(gpu_prop):
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)
    # extra_203; the model supports 33 languages.
    print(f"{tokenizer=}")
    chats = [
        [
            {
                "role": "user",
                "content": "Translate the following segment into Chinese, without additional explanation.\n\nIt’s on the house.",
            },
        ],
        [
            {
                "role": "user",
                "content": "把下面的文本翻译成English,不要额外解释。\n\n奥利给!",
            },
        ],
    ]
    print(chats)

    # use chat template
    tokens_list = []
    for chat in chats:
        tokens = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        tokens_list.append(tokens)

    print(f"{tokens_list=}")

    print("---" * 20)
    # tokenize the already-templated prompt strings directly (batched)
    inputs = tokenizer(
        tokens_list,
        return_tensors="pt",
        padding=True,  # for batch
    )
    for key, value in inputs.items():
        print(f"{key}: {value.shape=}")
    input_ids = inputs["input_ids"]
    decoded_prompts = tokenizer.batch_decode(
        input_ids,
        skip_special_tokens=True,
    )
    print(f"{decoded_prompts=}")


def generate(gpu_prop):
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)
    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        dtype="auto",
        device_map="auto",
        attn_implementation="flash_attention_2" if gpu_prop.major >= 8 else None,
        config=config,
    )
    model = model.eval()
    # print(f"{model.config=}")

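    # prompts written directly in the model's chat-template form instead of calling
    # apply_chat_template: the user text is wrapped by <|startoftext|> and closed by
    # the <|extra_0|> generation prompt (see tokenize() above)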
    messages = [
        "<|startoftext|>Translate the following segment into Chinese, without additional explanation.\n\nIt’s on the house.<|extra_0|>",
        "<|startoftext|>把下面的文本翻译成English,不要额外解释。\n\n奥利给!<|extra_0|>",
    ]
    print(messages)

    inputs = tokenizer(
        messages,
        return_tensors="pt",
        padding=True,  # for batch
        padding_side="left",
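        # decoder-only models need left padding for batched generation so that
        # newly generated tokens directly follow each prompt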
    ).to(model.device)
    for key, value in inputs.items():
        print(f"{key}: {value.shape=}")
    input_ids = inputs["input_ids"]
    prompt = tokenizer.batch_decode(input_ids)
    print(f"{prompt=}")
    input_len = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        generated_ids = model.generate(
            input_ids,
            attention_mask=inputs["attention_mask"],
            do_sample=True,
            temperature=0.7,
            top_k=20,
            top_p=0.6,
            repetition_penalty=1.05,
            max_new_tokens=128,
        )
        generated_ids = [generated_id[input_len:] for generated_id in generated_ids]

    for generated_id in generated_ids:
        print(f"{generated_id.shape=}")
    generated_text = tokenizer.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )
    print(generated_text)


def generate_stream(gpu_prop):
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)
    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        dtype="auto",
        device_map="auto",
        attn_implementation="flash_attention_2" if gpu_prop.major >= 8 else None,
        config=config,
    )
    model = model.eval()
    # print(f"{model.config=}")

    messages = [
        {
            "role": "user",
            "content": "Translate the following segment into Chinese, without additional explanation.\n\nIt’s on the house.",
        },
    ]
    print(messages)

    # inputs = tokenizer.apply_chat_template(
    #    messages,
    #    add_generation_prompt=True,
    #    tokenize=True,
    #    return_dict=True,
    #    return_tensors="pt",
    # ).to(model.device)

    inputs = tokenizer(
        "<|startoftext|>Translate the following segment into Chinese, without additional explanation.\n\nIt’s on the house.<|extra_0|>",
        return_tensors="pt",
    ).to(model.device)
    for key, value in inputs.items():
        print(f"{key}: {value.shape=}")
    input_ids = inputs["input_ids"]
    prompt = tokenizer.decode(input_ids[0])
    print(f"{prompt=}")

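    # skip_prompt=True makes the streamer yield only newly generated text;
    # skip_special_tokens is forwarded to the decode calls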
    streamer = TextIteratorStreamer(tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=input_ids,
        attention_mask=inputs["attention_mask"],
        do_sample=True,
        temperature=0.7,
        top_k=20,
        top_p=0.6,
        repetition_penalty=1.05,
        max_new_tokens=256,
        streamer=streamer,
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
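    # generation runs in the background thread; consume the streamer here while
    # timing the first chunk (TTFT) and each subsequent chunk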
    generated_text = ""
    start = perf_counter()
    times = []
    with torch.inference_mode():
        for new_text in streamer:
            times.append(perf_counter() - start)
            print(new_text, end="", flush=True)
            generated_text += new_text
            start = perf_counter()
    print(f"\n{generated_text=} TTFT: {times[0]:.2f}s total time: {sum(times):.2f}s")


async def achatbot_gen_stream(gpu_prop):
    from achatbot.core.llm.transformers.generator import TransformersGenerator, GenerationConfig
    from achatbot.types.llm.transformers import TransformersLMArgs
    from achatbot.common.types import MODELS_DIR, SessionCtx
    from achatbot.common.session import Session
    from achatbot.common.logger import Logger

    Logger.init(os.getenv("LOG_LEVEL", "info").upper(), is_file=False, is_console=True)

    # tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    tokens = tokenizer(
        "<|startoftext|>Translate the following segment into Chinese, without additional explanation.\n\nIt’s on the house.<|extra_0|>"
    )
    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]

    # init generator
    args = TransformersLMArgs(lm_model_name_or_path=MODEL_PATH).__dict__
    args["config"] = config
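    # forward the patched fp8 AutoConfig (None for the bf16 model) to the generator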
    generator = TransformersGenerator(**args)

    # generation_config
    # https://huggingface.co/docs/transformers/v4.56.0/en/main_classes/text_generation#transformers.GenerationConfig
    generation_config = GenerationConfig.from_pretrained(MODEL_PATH, "generation_config.json")
    generation_config.max_new_tokens = 30
    print(generation_config.to_dict())

    session = Session(**SessionCtx(str(uuid.uuid4().hex)).__dict__)
    session.ctx.state["token_ids"] = input_ids
    first = True
    start_time = time.perf_counter()
    async for token_id in generator.generate(
        session,
        attention_mask=attention_mask,
        stop_ids=[tokenizer.eos_token_id],
        **generation_config.to_dict(),
    ):
        if first:
            ttft = time.perf_counter() - start_time
            print(f"generate TTFT time: {ttft} s")
            first = False
        gen_text = tokenizer.decode(token_id)
        print(token_id, gen_text)


# !!! To load the fp8 model with transformers, rename "ignored_layers" to "ignore" in config.json and upgrade compressed-tensors to 0.11.0.
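# A minimal sketch of that one-time config.json edit, as an alternative to the in-memory
# AutoConfig patch near the top of this script. The helper name _patch_fp8_config_json is
# illustrative (not part of the original script) and assumes the fp8 checkpoint already
# sits under model_dir:
def _patch_fp8_config_json(model_dir: str) -> None:
    import json

    cfg_path = os.path.join(model_dir, "config.json")
    with open(cfg_path) as f:
        cfg = json.load(f)
    qc = cfg.get("quantization_config", {})
    if "ignored_layers" in qc:
        # compressed-tensors>=0.11.0 expects the key "ignore"
        qc["ignore"] = qc.pop("ignored_layers")
        with open(cfg_path, "w") as f:
            json.dump(cfg, f, indent=2)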

"""
modal run src/llm/transformers/translation/hunyuan_mt.py --task tokenize

IMAGE_GPU=T4 modal run src/llm/transformers/translation/hunyuan_mt.py --task dump_model
LLM_MODEL=tencent/Hunyuan-MT-7B-fp8 IMAGE_GPU=L4 modal run src/llm/transformers/translation/hunyuan_mt.py --task dump_model

IMAGE_GPU=T4 modal run src/llm/transformers/translation/hunyuan_mt.py --task generate
IMAGE_GPU=L4 modal run src/llm/transformers/translation/hunyuan_mt.py --task generate
LLM_MODEL=tencent/Hunyuan-MT-7B-fp8 IMAGE_GPU=L4 modal run src/llm/transformers/translation/hunyuan_mt.py --task generate

IMAGE_GPU=T4 modal run src/llm/transformers/translation/hunyuan_mt.py --task generate_stream
IMAGE_GPU=L4 modal run src/llm/transformers/translation/hunyuan_mt.py --task generate_stream
LLM_MODEL=tencent/Hunyuan-MT-7B-fp8 IMAGE_GPU=L4 modal run src/llm/transformers/translation/hunyuan_mt.py --task generate_stream

APP_NAME=achatbot IMAGE_GPU=T4 modal run src/llm/transformers/translation/hunyuan_mt.py --task achatbot_gen_stream
APP_NAME=achatbot IMAGE_GPU=L4 modal run src/llm/transformers/translation/hunyuan_mt.py --task achatbot_gen_stream
LLM_MODEL=tencent/Hunyuan-MT-7B-fp8 APP_NAME=achatbot IMAGE_GPU=L4 modal run src/llm/transformers/translation/hunyuan_mt.py --task achatbot_gen_stream
"""




@app.local_entrypoint()
def main(task: str = "dump_model"):
    print(task)
    tasks = {
        "tokenize": tokenize,
        "dump_model": dump_model,
        "generate": generate,
        "generate_stream": generate_stream,
        "achatbot_gen_stream": achatbot_gen_stream,
    }
    if task not in tasks:
        raise ValueError(f"task {task} not found")
    print(f"running task {task}")
    run.remote(tasks[task])

more: https://github.com/ai-bot-pro/achatbot/pull/188
