Bug: Timestamps reset every 30 seconds with return_timestamps=True

#59
by fortytwoprod - opened

When using whisper-large-v3-turbo to transcribe audio files longer than 30 seconds with return_timestamps=True, the timestamps reset every 30 seconds. Can this bug be fixed? When return_timestamps is set to "word", the timestamps don't reset at all.

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


# Use the first CUDA device in half precision when a GPU is available;
# otherwise run on the CPU in full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3-turbo"

# Load the checkpoint, then move it onto the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies both the tokenizer and the feature extractor
# that the pipeline is built from.
processor = AutoProcessor.from_pretrained(model_id)

asr_components = {
    "model": model,
    "tokenizer": processor.tokenizer,
    "feature_extractor": processor.feature_extractor,
    "torch_dtype": torch_dtype,
    "device": device,
}
pipe = pipeline("automatic-speech-recognition", **asr_components)

# Transcribe the audio file with return_timestamps=True, the setting under
# which the timestamp reset is reported.
audio_path = "audios/testlong-mp3.mp3"
result = pipe(
    audio_path,
    return_timestamps=True,
    generate_kwargs={"language": "english"},
)
result

Output:

{
    'text': 'blah blah',
    'chunks': [
        {'timestamp': (0.0, 4.42), 'text': " In this chapter, I'm goinsadfsdaffdsfrus."},
        {'timestamp': (4.6, 10.96), 'text': " Okay, it's time to dig into tasdfasfsdafwant it to be."},
        ...
        # later the timestamps reset
        {'timestamp': (26.1, 0.0), 'text': ''},
        {'timestamp': (6.16, 8.2), 'text': ' I like that I just thiasdfsadfasdflly good'},
    ]
}

Sign up or log in to comment