nielsr (HF Staff) committed
Commit 6c4f629 · verified · 1 parent: b74a328

Improve model card: Update library, add paper link, abstract summary, and refine tags


This PR improves the model card for Voxtral Small by:
- **Updating the `library_name`** from `vllm` to `transformers` in the metadata, since the model is natively supported by, and documented with extensive usage examples for, the Hugging Face `transformers` library. This makes the "how to use" widget on the Hub work correctly (see the loading sketch after this list).
- **Adding `vllm` to the `tags`** in the metadata, so the model remains discoverable for vLLM users and its recommended use for serving stays highlighted.
- **Removing the `inference: false`** metadata flag, since the card ships comprehensive usage instructions and the model is ready for inference.
- **Adding a direct link to the paper** "[Voxtral](https://huggingface.co/papers/2507.13264)" prominently in the introductory section.
- **Including a concise summary of the paper's abstract** at the top of the model card, so readers can immediately grasp the model's capabilities and contributions.
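
As context for the `library_name` change, a minimal loading sketch of what the Hub's `transformers` snippet would correspond to is shown below. This is an illustration, not the card's full example: it assumes a `transformers` release that ships Voxtral support (`VoxtralForConditionalGeneration`), a CUDA device, and `mistralai/Voxtral-Small-24B-2507` as this repository's id; the commented generation and decoding calls mirror the examples already present in the card.

```python
# Minimal sketch (assumptions: transformers with Voxtral support, a CUDA device,
# and "mistralai/Voxtral-Small-24B-2507" as this repository's id).
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration

device = "cuda"
repo_id = "mistralai/Voxtral-Small-24B-2507"  # assumed id of this model repo

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(
    repo_id, torch_dtype=torch.bfloat16, device_map=device
)

# The card's examples then build a (possibly audio-bearing) conversation and run:
#   inputs = processor.apply_chat_template(conversation)
#   inputs = inputs.to(device, dtype=torch.bfloat16)
#   outputs = model.generate(**inputs, max_new_tokens=500)
#   decoded = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
```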

Files changed (1): README.md (+36 -20)
README.md CHANGED
@@ -1,4 +1,6 @@
 ---
+base_model:
+- mistralai/Mistral-Small-24B-Base-2501
 language:
 - en
 - fr
@@ -8,24 +10,23 @@ language:
 - pt
 - nl
 - hi
+library_name: transformers
 license: apache-2.0
-library_name: vllm
-inference: false
-base_model:
-- mistralai/Mistral-Small-24B-Base-2501
-extra_gated_description: >-
-  If you want to learn more about how we process your personal data, please read
-  our <a href="https://mistral.ai/terms/">Privacy Policy</a>.
 pipeline_tag: audio-text-to-text
 tags:
 - transformers
+- vllm
+extra_gated_description: If you want to learn more about how we process your personal
+  data, please read our <a href="https://mistral.ai/terms/">Privacy Policy</a>.
 ---
 
 # Voxtral Small 1.0 (24B) - 2507
 
+We present Voxtral Mini and Voxtral Small, two multimodal audio chat models. Voxtral is trained to comprehend both spoken audio and text documents, achieving state-of-the-art performance across a diverse range of audio benchmarks, while preserving strong text capabilities. Voxtral Small outperforms a number of closed-source models, while being small enough to run locally. A 32K context window enables the model to handle audio files up to 40 minutes in duration and long multi-turn conversations.
+
 Voxtral Small is an enhancement of [Mistral Small 3](https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501), incorporating state-of-the-art audio input capabilities while retaining best-in-class text performance. It excels at speech transcription, translation and audio understanding.
 
-Learn more about Voxtral in our blog post [here](https://mistral.ai/news/voxtral).
+Learn more about Voxtral in our paper [Voxtral](https://huggingface.co/papers/2507.13264) and our blog post [here](https://mistral.ai/news/voxtral).
 
 ## Key Features
 
@@ -157,7 +158,9 @@ user_msg = UserMessage(content=[file_to_chunk(obama_file), file_to_chunk(bcn_fil
 
 print(30 * "=" + "USER 1" + 30 * "=")
 print(text_chunk.text)
-print("\n\n")
+print("
+
+")
 
 response = client.chat.completions.create(
     model=model,
@@ -169,13 +172,15 @@ content = response.choices[0].message.content
 
 print(30 * "=" + "BOT 1" + 30 * "=")
 print(content)
-print("\n\n")
+print("
+
+")
 # The model could give the following answer:
 # ```L'orateur le plus inspirant est le président.
 # Il est plus inspirant parce qu'il parle de ses expériences personnelles
 # et de son optimisme pour l'avenir du pays.
 # Il est différent de l'autre orateur car il ne parle pas de la météo,
-# mais plutôt de ses interactions avec les gens et de son rôle en tant que président.```
+# mais plutôto de ses interactions avec les gens et de son rôle en tant que président.```
 
 messages = [
     user_msg,
@@ -184,7 +189,9 @@ messages = [
 ]
 print(30 * "=" + "USER 2" + 30 * "=")
 print(messages[-1]["content"])
-print("\n\n")
+print("
+
+")
 
 response = client.chat.completions.create(
     model=model,
@@ -313,7 +320,8 @@ print(30 * "=" + "Transcription" + 30 * "=")
 req = TranscriptionRequest(model=model, audio=audio_chunk.input_audio, language="en", temperature=0.0).to_openai(exclude=("top_p", "seed"))
 response = client.audio.transcriptions.create(**req)
 print(response.text) # How is the weather in Madrid at the moment?
-print("\n")
+print("
+")
 
 
 print(30 * "=" + "Function calling" + 30 * "=")
@@ -328,7 +336,9 @@ response = client.chat.completions.create(
 )
 print(30 * "=" + "BOT 1" + 30 * "=")
 print(response.choices[0].message.tool_calls)
-print("\n\n")
+print("
+
+")
 ```
 </details>
 
@@ -384,7 +394,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated response:")
+print("
+Generated response:")
 print("=" * 80)
 print(decoded_outputs[0])
 print("=" * 80)
@@ -442,7 +453,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated response:")
+print("
+Generated response:")
 print("=" * 80)
 print(decoded_outputs[0])
 print("=" * 80)
@@ -481,7 +493,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated response:")
+print("
+Generated response:")
 print("=" * 80)
 print(decoded_outputs[0])
 print("=" * 80)
@@ -520,7 +533,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated response:")
+print("
+Generated response:")
 print("=" * 80)
 print(decoded_outputs[0])
 print("=" * 80)
@@ -581,7 +595,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated responses:")
+print("
+Generated responses:")
 print("=" * 80)
 for decoded_output in decoded_outputs:
     print(decoded_output)
@@ -610,7 +625,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated responses:")
+print("
+Generated responses:")
 print("=" * 80)
 for decoded_output in decoded_outputs:
     print(decoded_output)
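
Putting the metadata hunks together, the frontmatter after this change would read roughly as follows. This is a sketch assembled from the additions and context lines above; language entries not shown in the diff are elided rather than guessed.

```yaml
---
base_model:
- mistralai/Mistral-Small-24B-Base-2501
language:
- en
- fr
# ... further language entries unchanged by this PR and not shown in the diff ...
- pt
- nl
- hi
library_name: transformers
license: apache-2.0
pipeline_tag: audio-text-to-text
tags:
- transformers
- vllm
extra_gated_description: If you want to learn more about how we process your personal
  data, please read our <a href="https://mistral.ai/terms/">Privacy Policy</a>.
---
```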