Upload folder using huggingface_hub
- README.md +11 -19
- modeling_intern_vit.py +6 -12
README.md
CHANGED
@@ -239,7 +239,7 @@ tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast

# set the max number of tiles in `max_num`
pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
-generation_config = dict(max_new_tokens=1024, do_sample=False)
+generation_config = dict(max_new_tokens=1024, do_sample=True)

# pure-text conversation (纯文本对话)
question = 'Hello, who are you?'
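# Not part of the diff: a minimal sketch of how the pure-text turn typically
# continues in this quickstart. The exact `model.chat` signature
# (tokenizer, pixel_values, question, generation_config, history, return_history)
# is an assumption; pixel_values is None for a text-only turn.
response, history = model.chat(tokenizer, None, question, generation_config,
                               history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')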
@@ -391,7 +391,7 @@ for new_text in streamer:

## Finetune

-
+Many repositories now support fine-tuning of the InternVL series models, including [InternVL](https://github.com/OpenGVLab/InternVL), [SWIFT](https://github.com/modelscope/ms-swift), [XTuner](https://github.com/InternLM/xtuner), and others. Please refer to their documentation for more details on fine-tuning.

## Deployment

@@ -400,7 +400,7 @@ SWIFT from ModelScope community has supported the fine-tuning (Image/Video) of I
LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by the MMRazor and MMDeploy teams.

```sh
-pip install lmdeploy
+pip install lmdeploy==0.5.3
```

LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference pipeline.

@@ -408,14 +408,12 @@ LMDeploy abstracts the complex inference process of multi-modal Vision-Language
#### A 'Hello, world' example

```python
-from lmdeploy import pipeline, TurbomindEngineConfig
+from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image

model = 'OpenGVLab/Mini-InternVL-Chat-2B-V1-5'
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
-
-pipe = pipeline(model, chat_template_config=chat_template_config,
-                backend_config=TurbomindEngineConfig(session_len=8192))
+pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))
response = pipe(('describe this image', image))
print(response.text)
```
@@ -429,14 +427,12 @@ When dealing with multiple images, you can put them all in one list. Keep in min
> Warning: Due to the scarcity of multi-image conversation data, the performance on multi-image tasks may be unstable, and it may require multiple attempts to achieve satisfactory results.

```python
-from lmdeploy import pipeline, TurbomindEngineConfig
+from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image
from lmdeploy.vl.constants import IMAGE_TOKEN

model = 'OpenGVLab/Mini-InternVL-Chat-2B-V1-5'
-
-pipe = pipeline(model, chat_template_config=chat_template_config,
-                backend_config=TurbomindEngineConfig(session_len=8192))
+pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))

image_urls=[
    'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg',
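# Not part of the diff: a sketch of how the multi-image prompt is typically
# assembled with the imported IMAGE_TOKEN. The second demo URL and the prompt
# wording are assumptions.
image_urls = [
    'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg',
    'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/det.jpg'
]
images = [load_image(img_url) for img_url in image_urls]
# Numbering the images lets the model refer to each one unambiguously.
response = pipe((f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe these two images', images))
print(response.text)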
@@ -454,13 +450,11 @@ print(response.text)
Conducting inference with batch prompts is quite straightforward; just place them within a list structure:

```python
-from lmdeploy import pipeline, TurbomindEngineConfig
+from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image

model = 'OpenGVLab/Mini-InternVL-Chat-2B-V1-5'
-
-pipe = pipeline(model, chat_template_config=chat_template_config,
-                backend_config=TurbomindEngineConfig(session_len=8192))
+pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))

image_urls=[
    "https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg",
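# Not part of the diff: a sketch of how the batch is typically built from the
# URL list above and run in a single pipeline call (the prompt wording is an
# assumption).
prompts = [('describe this image', load_image(img_url)) for img_url in image_urls]
response = pipe(prompts)
print(response)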
@@ -476,13 +470,11 @@ print(response)
There are two ways to do the multi-turn conversations with the pipeline. One is to construct messages according to the format of OpenAI and use above introduced method, the other is to use the `pipeline.chat` interface.

```python
-from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
+from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
from lmdeploy.vl import load_image

model = 'OpenGVLab/Mini-InternVL-Chat-2B-V1-5'
-
-pipe = pipeline(model, chat_template_config=chat_template_config,
-                backend_config=TurbomindEngineConfig(session_len=8192))
+pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))

image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg')
gen_config = GenerationConfig(top_k=40, top_p=0.8, temperature=0.8)
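# Not part of the diff: a sketch of the `pipeline.chat` interface mentioned
# above, continuing from `gen_config`. The session/response attribute names are
# assumptions.
sess = pipe.chat(('describe this image', image), gen_config=gen_config)
print(sess.response.text)
sess = pipe.chat('What is the woman doing?', session=sess, gen_config=gen_config)
print(sess.response.text)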
modeling_intern_vit.py
CHANGED
@@ -20,18 +20,12 @@ from transformers.utils import logging
from .configuration_intern_vit import InternVisionConfig

try:
-    try: # v1
-        from flash_attn.flash_attn_interface import \
-            flash_attn_unpadded_qkvpacked_func
-    except: # v2
-        from flash_attn.flash_attn_interface import \
-            flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
-
    from flash_attn.bert_padding import pad_input, unpad_input
-
+    from flash_attn.flash_attn_interface import \
+        flash_attn_varlen_qkvpacked_func
    has_flash_attn = True
except:
-    print('FlashAttention is not installed.')
+    print('FlashAttention2 is not installed.')
    has_flash_attn = False

logger = logging.get_logger(__name__)

@@ -74,7 +68,7 @@ class FlashAttention(nn.Module):
            max_s = seqlen
            cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
                                      device=qkv.device)
-            output = flash_attn_unpadded_qkvpacked_func(
+            output = flash_attn_varlen_qkvpacked_func(
                qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
                softmax_scale=self.softmax_scale, causal=causal
            )

@@ -84,7 +78,7 @@ class FlashAttention(nn.Module):
            x = rearrange(qkv, 'b s three h d -> b s (three h d)')
            x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
            x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
-            output_unpad = flash_attn_unpadded_qkvpacked_func(
+            output_unpad = flash_attn_varlen_qkvpacked_func(
                x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
                softmax_scale=self.softmax_scale, causal=causal
            )

@@ -93,7 +87,7 @@ class FlashAttention(nn.Module):
                               'b s (h d) -> b s h d', h=nheads)
        else:
            assert max_s is not None
-            output = flash_attn_unpadded_qkvpacked_func(
+            output = flash_attn_varlen_qkvpacked_func(
                qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
                softmax_scale=self.softmax_scale, causal=causal
            )
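For reference, a minimal self-contained sketch of calling the renamed `flash_attn_varlen_qkvpacked_func` on a fixed-length batch, mirroring the `cu_seqlens` construction in the hunks above; the tensor sizes are arbitrary, and it assumes flash-attn v2 and a CUDA device.

```python
import torch
from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func

batch_size, seqlen, nheads, headdim = 2, 1024, 16, 64

# Packed QKV for every token in the batch: (total_tokens, 3, nheads, headdim).
qkv = torch.randn(batch_size * seqlen, 3, nheads, headdim,
                  dtype=torch.float16, device='cuda')

# Cumulative sequence lengths mark where each sequence starts, as in the
# no-padding path above.
cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen,
                          dtype=torch.int32, device='cuda')

output = flash_attn_varlen_qkvpacked_func(
    qkv, cu_seqlens, seqlen, 0.0,
    softmax_scale=None, causal=False
)
print(output.shape)  # (batch_size * seqlen, nheads, headdim)
```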