nm-research committed
Commit 51838ec · verified · 1 Parent(s): f5ef8cd

Update README.md

Files changed (1)
  1. README.md +24 -23
README.md CHANGED
@@ -35,32 +35,33 @@ This model was obtained by quantizing the weights of [google/gemma-3-27b-it](htt
  This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/latest/) backend, as shown in the example below.

  ```python
- from vllm.assets.image import ImageAsset
  from vllm import LLM, SamplingParams
+ from vllm.assets.image import ImageAsset
+ from transformers import AutoProcessor

- # prepare model
- llm = LLM(
-     model="nm-testing/gemma-3-27b-it-quantized.w4a16",
-     trust_remote_code=True,
-     max_model_len=4096,
-     max_num_seqs=2,
- )
+ # Define model name once
+ model_name = "RedHatAI/gemma-3-27b-it-quantized.w4a16"
+
+ # Load image and processor
+ image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+ processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

- # prepare inputs
- question = "What is the content of this image?"
- inputs = {
-     "prompt": f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n",
-     "multi_modal_data": {
-         "image": ImageAsset("cherry_blossom").pil_image.convert("RGB")
-     },
- }
-
- # generate response
- print("========== SAMPLE GENERATION ==============")
+ # Build multimodal prompt
+ chat = [
+     {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What is the content of this image?"}]},
+     {"role": "assistant", "content": []}
+ ]
+ prompt = processor.apply_chat_template(chat, add_generation_prompt=True)
+
+ # Initialize model
+ llm = LLM(model=model_name, trust_remote_code=True)
+
+ # Run inference
+ inputs = {"prompt": prompt, "multi_modal_data": {"image": [image]}}
  outputs = llm.generate(inputs, SamplingParams(temperature=0.2, max_tokens=64))
- print(f"PROMPT : {outputs[0].prompt}")
- print(f"RESPONSE: {outputs[0].outputs[0].text}")
- print("==========================================")
+
+ # Display result
+ print("RESPONSE:", outputs[0].outputs[0].text)
  ```

  vLLM also supports OpenAI-compatible serving. See the [documentation](https://docs.vllm.ai/en/latest/) for more details.
@@ -184,7 +185,7 @@ lm_eval \
  <th>Category</th>
  <th>Metric</th>
  <th>google/gemma-3-27b-it</th>
- <th>nm-testing/gemma-3-27b-it-quantized.w8a8</th>
+ <th>RedHatAI/gemma-3-27b-it-quantized.w8a8</th>
  <th>Recovery (%)</th>
  </tr>
  </thead>
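The updated section ends by pointing to vLLM's OpenAI-compatible serving. As a minimal sketch of querying this checkpoint that way (an illustration, not part of the commit; it assumes the server was started with `vllm serve RedHatAI/gemma-3-27b-it-quantized.w4a16` and is listening on the default `http://localhost:8000/v1` endpoint):

```python
from openai import OpenAI

# Assumes a vLLM OpenAI-compatible server started with:
#   vllm serve RedHatAI/gemma-3-27b-it-quantized.w4a16
# and listening on the default port 8000.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="RedHatAI/gemma-3-27b-it-quantized.w4a16",
    messages=[{"role": "user", "content": "Describe this model in one sentence."}],
    temperature=0.2,
    max_tokens=64,
)
print(response.choices[0].message.content)
```

The `api_key` value is a placeholder; pass a real key only if the server was launched with one configured.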
 
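The second hunk only renames the quantized-model column in the evaluation table. For context, the Recovery (%) column in such tables is typically the quantized model's score expressed as a percentage of the baseline model's score; a hypothetical illustration follows (the exact definition is not shown in this diff):

```python
# Hypothetical helper: recovery as quantized score over baseline score, in percent.
def recovery(quantized_score: float, baseline_score: float) -> float:
    return 100.0 * quantized_score / baseline_score

# Made-up numbers for illustration only.
print(f"{recovery(76.8, 77.5):.2f}%")  # 99.10%
```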