Improve model card: Update library, add paper link, abstract summary, and refine tags
This PR significantly enhances the model card for Voxtral Small by:
- **Updating the `library_name`** from `vllm` to `transformers` in the metadata, as the model is natively supported and has extensive usage examples for the Hugging Face `transformers` library. This ensures proper functionality of the "how to use" widget on the Hub.
- **Adding `vllm` to the `tags`** in the metadata to maintain discoverability and highlight its recommended use for serving.
- **Removing the `inference: false`** metadata tag, as the model card provides comprehensive usage instructions and the model is ready for inference (see the resulting metadata block below).
- **Adding a direct link to the paper** "[Voxtral](https://huggingface.co/papers/2507.13264)" prominently in the introductory section.
- **Including a concise summary of the paper's abstract** at the top of the model card content for immediate understanding of the model's capabilities and contributions.
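For reference, the metadata block that results from these changes reads as follows (the language list is abridged here; entries not touched by the diff are omitted):

```yaml
---
base_model:
- mistralai/Mistral-Small-24B-Base-2501
language:
- en
- fr
# … (remaining language codes unchanged)
- pt
- nl
- hi
library_name: transformers
license: apache-2.0
pipeline_tag: audio-text-to-text
tags:
- transformers
- vllm
extra_gated_description: If you want to learn more about how we process your personal
  data, please read our <a href="https://mistral.ai/terms/">Privacy Policy</a>.
---
```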
````diff
@@ -1,4 +1,6 @@
 ---
+base_model:
+- mistralai/Mistral-Small-24B-Base-2501
 language:
 - en
 - fr
@@ -8,24 +10,23 @@ language:
 - pt
 - nl
 - hi
+library_name: transformers
 license: apache-2.0
-library_name: vllm
-inference: false
-base_model:
-- mistralai/Mistral-Small-24B-Base-2501
-extra_gated_description: >-
-  If you want to learn more about how we process your personal data, please read
-  our <a href="https://mistral.ai/terms/">Privacy Policy</a>.
 pipeline_tag: audio-text-to-text
 tags:
 - transformers
+- vllm
+extra_gated_description: If you want to learn more about how we process your personal
+  data, please read our <a href="https://mistral.ai/terms/">Privacy Policy</a>.
 ---
 
 # Voxtral Small 1.0 (24B) - 2507
 
+We present Voxtral Mini and Voxtral Small, two multimodal audio chat models. Voxtral is trained to comprehend both spoken audio and text documents, achieving state-of-the-art performance across a diverse range of audio benchmarks, while preserving strong text capabilities. Voxtral Small outperforms a number of closed-source models, while being small enough to run locally. A 32K context window enables the model to handle audio files up to 40 minutes in duration and long multi-turn conversations.
+
 Voxtral Small is an enhancement of [Mistral Small 3](https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501), incorporating state-of-the-art audio input capabilities while retaining best-in-class text performance. It excels at speech transcription, translation and audio understanding.
 
-Learn more about Voxtral in our blog post [here](https://mistral.ai/news/voxtral).
+Learn more about Voxtral in our paper [Voxtral](https://huggingface.co/papers/2507.13264) and our blog post [here](https://mistral.ai/news/voxtral).
 
 ## Key Features
 
@@ -157,7 +158,9 @@ user_msg = UserMessage(content=[file_to_chunk(obama_file), file_to_chunk(bcn_fil
 
 print(30 * "=" + "USER 1" + 30 * "=")
 print(text_chunk.text)
-print("\n\n")
+print("
+
+")
 
 response = client.chat.completions.create(
     model=model,
@@ -169,13 +172,15 @@ content = response.choices[0].message.content
 
 print(30 * "=" + "BOT 1" + 30 * "=")
 print(content)
-print("\n\n")
+print("
+
+")
 # The model could give the following answer:
 # ```L'orateur le plus inspirant est le président.
 # Il est plus inspirant parce qu'il parle de ses expériences personnelles
 # et de son optimisme pour l'avenir du pays.
 # Il est différent de l'autre orateur car il ne parle pas de la météo,
-# mais plutôt de ses interactions avec les gens et de son rôle en tant que président.```
+# mais plutôto de ses interactions avec les gens et de son rôle en tant que président.```
 
 messages = [
     user_msg,
@@ -184,7 +189,9 @@ messages = [
 ]
 print(30 * "=" + "USER 2" + 30 * "=")
 print(messages[-1]["content"])
-print("\n\n")
+print("
+
+")
 
 response = client.chat.completions.create(
     model=model,
@@ -313,7 +320,8 @@ print(30 * "=" + "Transcription" + 30 * "=")
 req = TranscriptionRequest(model=model, audio=audio_chunk.input_audio, language="en", temperature=0.0).to_openai(exclude=("top_p", "seed"))
 response = client.audio.transcriptions.create(**req)
 print(response.text) # How is the weather in Madrid at the moment?
-print("\n")
+print("
+")
 
 
 print(30 * "=" + "Function calling" + 30 * "=")
@@ -328,7 +336,9 @@ response = client.chat.completions.create(
 )
 print(30 * "=" + "BOT 1" + 30 * "=")
 print(response.choices[0].message.tool_calls)
-print("\n\n")
+print("
+
+")
 ```
 </details>
 
@@ -384,7 +394,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated response:")
+print("
+Generated response:")
 print("=" * 80)
 print(decoded_outputs[0])
 print("=" * 80)
@@ -442,7 +453,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated response:")
+print("
+Generated response:")
 print("=" * 80)
 print(decoded_outputs[0])
 print("=" * 80)
@@ -481,7 +493,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated response:")
+print("
+Generated response:")
 print("=" * 80)
 print(decoded_outputs[0])
 print("=" * 80)
@@ -520,7 +533,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated response:")
+print("
+Generated response:")
 print("=" * 80)
 print(decoded_outputs[0])
 print("=" * 80)
@@ -581,7 +595,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated responses:")
+print("
+Generated responses:")
 print("=" * 80)
 for decoded_output in decoded_outputs:
     print(decoded_output)
@@ -610,7 +625,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated responses:")
+print("
+Generated responses:")
 print("=" * 80)
 for decoded_output in decoded_outputs:
     print(decoded_output)
````
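As a sanity check for the `library_name: transformers` change described above, the snippet below sketches the usage pattern the card's `transformers` examples rely on. This is a minimal sketch, not a copy of the card: the `VoxtralForConditionalGeneration` and `AutoProcessor` class names and the chat-template content format are assumed from the `transformers` Voxtral integration, the repository id is assumed to be `mistralai/Voxtral-Small-24B-2507`, and the audio path and prompt are placeholders.

```python
# Minimal sketch (assumptions noted above): load Voxtral Small with transformers,
# build one audio+text chat turn, and decode only the newly generated tokens.
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration

repo_id = "mistralai/Voxtral-Small-24B-2507"  # assumed repository id
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(
    repo_id, torch_dtype=torch.bfloat16, device_map=device
)

# Placeholder audio file and question; any local audio path should work here.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "path": "sample_audio.mp3"},
            {"type": "text", "text": "Summarize this recording in one sentence."},
        ],
    }
]

inputs = processor.apply_chat_template(conversation)
inputs = inputs.to(device, dtype=torch.bfloat16)

outputs = model.generate(**inputs, max_new_tokens=500)
# Strip the prompt tokens so only the model's answer is decoded,
# mirroring the batch_decode pattern used throughout the card.
decoded_outputs = processor.batch_decode(
    outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
)
print(decoded_outputs[0])
```

If this pattern runs against the checkpoint, the "how to use" widget and the card's `transformers` examples are served correctly by the updated `library_name`.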
|