tensorrt-llm demo
#9 · opened by weege007
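A Modal app that runs tencent/Hunyuan-MT-7B through the TensorRT-LLM LLM API in three modes: offline generation (generate), single-prompt async streaming (async_gen_stream), and concurrent batched streaming (async_batch_stream). Usage commands are in the docstring at the bottom of the script.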
import os
import sys
import asyncio
import subprocess
from time import perf_counter
import modal
app = modal.App("hunyuan7b_trtllm")
IMAGE_GPU = os.getenv("IMAGE_GPU", None)
img = (
    # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda/tags
    modal.Image.from_registry(
        "docker.cnb.cool/tencent/hunyuan/hunyuan-7b:hunyuan-7b-trtllm",
        add_python="3.12",  # modal install /usr/local/bin/python3.12.1 or 3.10.13
    )
    .entrypoint([])  # remove verbose logging by base image on entry
    .run_commands(
        "/usr/local/bin/python --version",
        "/usr/bin/python --version",
        "echo $PATH",
        "/usr/bin/pip list",
        "update-alternatives --install /usr/local/bin/python python3 /usr/local/bin/python3.12 1",
        "update-alternatives --install /usr/local/bin/python python3 /usr/bin/python3.12 2",
        "python --version",
        "pip list",
    )
    # https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html
    # .run_commands(
    #     "git lfs install",
    #     "git clone https://github.com/NVIDIA/TensorRT-LLM.git",
    #     "cd TensorRT-LLM && git checkout 064eb7a70f29f45a74b5b080aafd0f6a872ed4b5",
    #     "cd TensorRT-LLM && pip install -r requirements.txt",
    # )
    .env(
        {
            # "TQDM_DISABLE": "1",
            "LLM_MODEL": os.getenv("LLM_MODEL", "tencent/Hunyuan-MT-7B"),
            "LD_LIBRARY_PATH": "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib:$LD_LIBRARY_PATH",
        }
    )
    .run_commands(
        "cat /modal_requirements.txt",
        "pip list",
        "pip install -r /modal_requirements.txt",
    )
    .pip_install(
        "fastapi==0.115.4",
        "pydantic==2.11.4",
        "cloudpickle>=3.0.0",
    )
)
HF_MODEL_DIR = "/root/.achatbot/models"
hf_model_vol = modal.Volume.from_name("models", create_if_missing=True)
# These imports only resolve inside the container image, so defer them with img.imports()
with img.imports():
    import torch

    # sys.path.insert(0, "/TensorRT-LLM")
    from tensorrt_llm import LLM, SamplingParams
    from tensorrt_llm.bindings.executor import KvCacheConfig
MODEL_ID = os.getenv("LLM_MODEL", "tencent/Hunyuan-MT-7B")
MODEL_PATH = os.path.join(HF_MODEL_DIR, MODEL_ID)
@app.function(
    gpu=IMAGE_GPU,
    cpu=2.0,
    retries=0,
    image=img,
    volumes={
        HF_MODEL_DIR: hf_model_vol,
    },
    timeout=1200,  # default 300s
    scaledown_window=1200,
    max_containers=1,
)
async def run(func, **kwargs):
    subprocess.run("nvidia-smi --version", shell=True)
    subprocess.run("nvcc --version", shell=True)
    if torch.cuda.is_available():
        gpu_prop = torch.cuda.get_device_properties("cuda")
        print(gpu_prop)

    if asyncio.iscoroutinefunction(func):
        await func(**kwargs)
    else:
        func(**kwargs)
def generate(**kwargs):
    """
    https://nvidia.github.io/TensorRT-LLM/_modules/tensorrt_llm/llmapi/llm.html#LLM.generate
    """
    # Prompt asks the model to translate the Chinese text after the instruction ("你好" = "hello") into English
    prompts = [
        "<|startoftext|>把下面的文本翻译成English,不要额外解释。 \n\n你好<|extra_0|>",
    ]

    # load hf model, flashinfer.jit compile
    # https://nvidia.github.io/TensorRT-LLM/1.0.0rc1/
    llm = LLM(
        model=MODEL_PATH,
        max_batch_size=2,
        max_seq_len=512,
        kv_cache_config={"free_gpu_memory_fraction": 0.5},
    )

    # https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams
    sampling_params = SamplingParams(
        temperature=0.7, top_k=20, top_p=0.6, max_tokens=64, repetition_penalty=1.05
    )

    outputs = llm.generate(prompts, sampling_params, use_tqdm=False)

    # Print the outputs.
    for output in outputs:
        print(output)
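        # Each item is a RequestOutput; the decoded completion is in output.outputs[0].text
        # (per the LLM.generate docs linked above).
        print(output.outputs[0].text)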
    llm.shutdown()
async def async_gen_stream():
    """
    https://nvidia.github.io/TensorRT-LLM/_modules/tensorrt_llm/llmapi/llm.html#LLM.generate_async
    """
    from tensorrt_llm import LLM, SamplingParams

    prompts = [
        "<|startoftext|>把下面的文本翻译成English,不要额外解释。 \n\n你好<|extra_0|>",
    ]

    # load hf model, flashinfer.jit compile
    # https://nvidia.github.io/TensorRT-LLM/1.0.0rc1/
    llm = LLM(
        model=MODEL_PATH,
        max_batch_size=2,
        max_seq_len=512,
        kv_cache_config={"free_gpu_memory_fraction": 0.5},
    )

    # https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams
    sampling_params = SamplingParams(
        temperature=0.7,
        top_k=20,
        top_p=0.6,
        repetition_penalty=1.05,
        max_tokens=64,
        detokenize=True,
    )

    for i, prompt in enumerate(prompts):
        generator = llm.generate_async(prompt, sampling_params, streaming=True)
        async for output in generator:
            print(output)
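            # Each streamed item is a RequestOutput snapshot: outputs[0].token_ids accumulates the
            # generated tokens (token_ids[-1] is the newest) and, with detokenize=True,
            # outputs[0].text holds the decoded text so far.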
    llm.shutdown()
async def async_batch_stream():
    """
    https://nvidia.github.io/TensorRT-LLM/_modules/tensorrt_llm/llmapi/llm.html#LLM.generate_async
    """
    import asyncio
    import uuid

    from tensorrt_llm import LLM, SamplingParams

    # Prompts to generate
    prompts = [
        "<|startoftext|>把下面的文本翻译成English,不要额外解释。 \n\n你好<|extra_0|>",
        "<|startoftext|>把下面的文本翻译成English,不要额外解释。 \n\n奥利给<|extra_0|>",
        "<|startoftext|>把下面的文本翻译成English,不要额外解释。 \n\n我爱中国<|extra_0|>",
        "<|startoftext|>把下面的文本翻译成English,不要额外解释。 \n\n9月3日,看了阅兵仪式,东风快递好牛叉!<|extra_0|>",
    ]

    # load hf model, flashinfer.jit compile
    # https://nvidia.github.io/TensorRT-LLM/1.0.0rc1/
    llm = LLM(
        model=MODEL_PATH,
        max_batch_size=4,
        # max_batch_size=2,
        max_seq_len=512,
        kv_cache_config={"free_gpu_memory_fraction": 0.5},
    )

    # https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams
    sampling_params = SamplingParams(
        temperature=0.7,
        top_k=20,
        top_p=0.6,
        repetition_penalty=1.05,
        max_tokens=64,
        detokenize=True,
    )

    lock = asyncio.Lock()

    async def run_async_stream(llm, prompt, sampling_params, request_id: str):
        generator = llm.generate_async(prompt, sampling_params, streaming=True)
        async for item in generator:
            async with lock:
                print(f"[{request_id}] tokenId: {item.outputs[0].token_ids[-1]} {item} ")
                # you can send this response to a per-request queue/channel here

    tasks = [
        run_async_stream(llm, prompt, sampling_params, request_id=str(uuid.uuid4().hex))
        for prompt in prompts
    ]
    await asyncio.gather(*tasks)

    llm.shutdown()
"""
# https://github.com/Tencent-Hunyuan/Hunyuan-7B#tensorrt-llm
IMAGE_GPU=L4 modal run src/llm/trtllm/hunyuan_7b.py --task generate
IMAGE_GPU=L4 modal run src/llm/trtllm/hunyuan_7b.py --task async_gen_stream
IMAGE_GPU=L4 modal run src/llm/trtllm/hunyuan_7b.py --task async_batch_stream
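# LLM_MODEL selects the HF model id under the "models" volume (defaults to tencent/Hunyuan-MT-7B)
LLM_MODEL=tencent/Hunyuan-MT-7B IMAGE_GPU=L4 modal run src/llm/trtllm/hunyuan_7b.py --task generate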
"""
@app.local_entrypoint()
def main(
    task: str = "generate",
):
    print(task)
    tasks = {
        "generate": generate,
        "async_gen_stream": async_gen_stream,
        "async_batch_stream": async_batch_stream,
    }
    if task not in tasks:
        raise ValueError(f"task {task} not found")

    print(f"running task {task}")
    run.remote(tasks[task])
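
# Note: the tasks above expect the HF weights to already be present in the "models" volume at
# tencent/Hunyuan-MT-7B. A minimal sketch to pre-populate the volume (assumes huggingface_hub is
# available in the image; the function name is illustrative, not part of the original demo):
#
# @app.function(image=img, volumes={HF_MODEL_DIR: hf_model_vol}, timeout=1200)
# def download_model():
#     from huggingface_hub import snapshot_download
#     snapshot_download(repo_id=MODEL_ID, local_dir=MODEL_PATH)
#     hf_model_vol.commit()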