tensorrt-llm demo
#9 · opened by weege007
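A Modal app that runs tencent/Hunyuan-MT-7B through the TensorRT-LLM LLM API in three modes: offline generation (generate), single-prompt async streaming (async_gen_stream), and concurrent batched streaming (async_batch_stream). Usage commands are in the docstring at the bottom of the script.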
import os
import sys
import asyncio
import subprocess
from time import perf_counter
import modal
app = modal.App("hunyuan7b_trtllm")
IMAGE_GPU = os.getenv("IMAGE_GPU", None)
img = (
    # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda/tags
    modal.Image.from_registry(
        "docker.cnb.cool/tencent/hunyuan/hunyuan-7b:hunyuan-7b-trtllm",
        add_python="3.12",  # modal install /usr/local/bin/python3.12.1 or 3.10.13
    )
    .entrypoint([])  # remove verbose logging by base image on entry
    .run_commands(
        "/usr/local/bin/python --version",
        "/usr/bin/python --version",
        "echo $PATH",
        "/usr/bin/pip list",
        "update-alternatives --install /usr/local/bin/python python3 /usr/local/bin/python3.12 1",
        "update-alternatives --install /usr/local/bin/python python3 /usr/bin/python3.12 2",
        "python --version",
        "pip list",
    )
    # https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html
    # .run_commands(
    #     "git lfs install",
    #     "git clone https://github.com/NVIDIA/TensorRT-LLM.git",
    #     "cd TensorRT-LLM && git checkout 064eb7a70f29f45a74b5b080aafd0f6a872ed4b5",
    #     "cd TensorRT-LLM && pip install -r requirements.txt",
    # )
    .env(
        {
            # "TQDM_DISABLE": "1",
            "LLM_MODEL": os.getenv("LLM_MODEL", "tencent/Hunyuan-MT-7B"),
            "LD_LIBRARY_PATH": "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib:$LD_LIBRARY_PATH",
        }
    )
    .run_commands(
        "cat /modal_requirements.txt",
        "pip list",
        "pip install -r /modal_requirements.txt",
    )
    .pip_install(
        "fastapi==0.115.4",
        "pydantic==2.11.4",
        "cloudpickle>=3.0.0",
    )
)
HF_MODEL_DIR = "/root/.achatbot/models"
hf_model_vol = modal.Volume.from_name("models", create_if_missing=True)
# These imports only resolve inside the container image, so defer them with img.imports()
with img.imports():
    import torch

    # sys.path.insert(0, "/TensorRT-LLM")
    from tensorrt_llm import LLM, SamplingParams
    from tensorrt_llm.bindings.executor import KvCacheConfig
MODEL_ID = os.getenv("LLM_MODEL", "tencent/Hunyuan-MT-7B")
MODEL_PATH = os.path.join(HF_MODEL_DIR, MODEL_ID)
@app.function(
    gpu=IMAGE_GPU,
    cpu=2.0,
    retries=0,
    image=img,
    volumes={
        HF_MODEL_DIR: hf_model_vol,
    },
    timeout=1200,  # default 300s
    scaledown_window=1200,
    max_containers=1,
)
async def run(func, **kwargs):
    subprocess.run("nvidia-smi --version", shell=True)
    subprocess.run("nvcc --version", shell=True)
    if torch.cuda.is_available():
        gpu_prop = torch.cuda.get_device_properties("cuda")
        print(gpu_prop)

    if asyncio.iscoroutinefunction(func):
        await func(**kwargs)
    else:
        func(**kwargs)
def generate(**kwargs):
    """
    https://nvidia.github.io/TensorRT-LLM/_modules/tensorrt_llm/llmapi/llm.html#LLM.generate
    """
    # Prompt asks the model to translate the Chinese text after the instruction ("你好" = "hello") into English
    prompts = [
        "<|startoftext|>把下面的文本翻译成English,不要额外解释。 \n\n你好<|extra_0|>",
    ]

    # load hf model, flashinfer.jit compile
    # https://nvidia.github.io/TensorRT-LLM/1.0.0rc1/
    llm = LLM(
        model=MODEL_PATH,
        max_batch_size=2,
        max_seq_len=512,
        kv_cache_config={"free_gpu_memory_fraction": 0.5},
    )

    # https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams
    sampling_params = SamplingParams(
        temperature=0.7, top_k=20, top_p=0.6, max_tokens=64, repetition_penalty=1.05
    )

    outputs = llm.generate(prompts, sampling_params, use_tqdm=False)

    # Print the outputs.
    for output in outputs:
        print(output)
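        # Each item is a RequestOutput; the decoded completion is in output.outputs[0].text
        # (per the LLM.generate docs linked above).
        print(output.outputs[0].text)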
    llm.shutdown()
async def async_gen_stream():
    """
    https://nvidia.github.io/TensorRT-LLM/_modules/tensorrt_llm/llmapi/llm.html#LLM.generate_async
    """
    from tensorrt_llm import LLM, SamplingParams

    prompts = [
        "<|startoftext|>把下面的文本翻译成English,不要额外解释。 \n\n你好<|extra_0|>",
    ]

    # load hf model, flashinfer.jit compile
    # https://nvidia.github.io/TensorRT-LLM/1.0.0rc1/
    llm = LLM(
        model=MODEL_PATH,
        max_batch_size=2,
        max_seq_len=512,
        kv_cache_config={"free_gpu_memory_fraction": 0.5},
    )

    # https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams
    sampling_params = SamplingParams(
        temperature=0.7,
        top_k=20,
        top_p=0.6,
        repetition_penalty=1.05,
        max_tokens=64,
        detokenize=True,
    )

    for i, prompt in enumerate(prompts):
        generator = llm.generate_async(prompt, sampling_params, streaming=True)
        async for output in generator:
            print(output)
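            # Each streamed item is a RequestOutput snapshot: outputs[0].token_ids accumulates the
            # generated tokens (token_ids[-1] is the newest) and, with detokenize=True,
            # outputs[0].text holds the decoded text so far.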
    llm.shutdown()
async def async_batch_stream():
    """
    https://nvidia.github.io/TensorRT-LLM/_modules/tensorrt_llm/llmapi/llm.html#LLM.generate_async
    """
    import asyncio
    import uuid

    from tensorrt_llm import LLM, SamplingParams

    # Prompts to generate
    prompts = [
        "<|startoftext|>把下面的文本翻译成English,不要额外解释。 \n\n你好<|extra_0|>",
        "<|startoftext|>把下面的文本翻译成English,不要额外解释。 \n\n奥利给<|extra_0|>",
        "<|startoftext|>把下面的文本翻译成English,不要额外解释。 \n\n我爱中国<|extra_0|>",
        "<|startoftext|>把下面的文本翻译成English,不要额外解释。 \n\n9月3日,看了阅兵仪式,东风快递好牛叉!<|extra_0|>",
    ]

    # load hf model, flashinfer.jit compile
    # https://nvidia.github.io/TensorRT-LLM/1.0.0rc1/
    llm = LLM(
        model=MODEL_PATH,
        max_batch_size=4,
        # max_batch_size=2,
        max_seq_len=512,
        kv_cache_config={"free_gpu_memory_fraction": 0.5},
    )

    # https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams
    sampling_params = SamplingParams(
        temperature=0.7,
        top_k=20,
        top_p=0.6,
        repetition_penalty=1.05,
        max_tokens=64,
        detokenize=True,
    )

    lock = asyncio.Lock()

    async def run_async_stream(llm, prompt, sampling_params, request_id: str):
        generator = llm.generate_async(prompt, sampling_params, streaming=True)
        async for item in generator:
            async with lock:
                print(f"[{request_id}] tokenId: {item.outputs[0].token_ids[-1]} {item} ")
                # you can send this response to a per-request queue/channel here

    tasks = [
        run_async_stream(llm, prompt, sampling_params, request_id=str(uuid.uuid4().hex))
        for prompt in prompts
    ]
    await asyncio.gather(*tasks)

    llm.shutdown()
"""
# https://github.com/Tencent-Hunyuan/Hunyuan-7B#tensorrt-llm
IMAGE_GPU=L4 modal run src/llm/trtllm/hunyuan_7b.py --task generate
IMAGE_GPU=L4 modal run src/llm/trtllm/hunyuan_7b.py --task async_gen_stream
IMAGE_GPU=L4 modal run src/llm/trtllm/hunyuan_7b.py --task async_batch_stream
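# LLM_MODEL selects the HF model id under the "models" volume (defaults to tencent/Hunyuan-MT-7B)
LLM_MODEL=tencent/Hunyuan-MT-7B IMAGE_GPU=L4 modal run src/llm/trtllm/hunyuan_7b.py --task generate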
"""
@app.local_entrypoint()
def main(
    task: str = "generate",
):
    print(task)
    tasks = {
        "generate": generate,
        "async_gen_stream": async_gen_stream,
        "async_batch_stream": async_batch_stream,
    }
    if task not in tasks:
        raise ValueError(f"task {task} not found")

    print(f"running task {task}")
    run.remote(tasks[task])
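
# Note: the tasks above expect the HF weights to already be present in the "models" volume at
# tencent/Hunyuan-MT-7B. A minimal sketch to pre-populate the volume (assumes huggingface_hub is
# available in the image; the function name is illustrative, not part of the original demo):
#
# @app.function(image=img, volumes={HF_MODEL_DIR: hf_model_vol}, timeout=1200)
# def download_model():
#     from huggingface_hub import snapshot_download
#     snapshot_download(repo_id=MODEL_ID, local_dir=MODEL_PATH)
#     hf_model_vol.commit()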