import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


def process(audio_path):
    # Load Whisper large-v3 in float32; low_cpu_mem_usage and safetensors
    # keep peak memory down while the checkpoint is loaded.
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        "openai/whisper-large-v3",
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
        use_safetensors=True
    )
    # Run on Apple Silicon's GPU via the MPS backend.
    model.to("mps")

    # The processor bundles Whisper's tokenizer and feature extractor.
    processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch.float32,
        return_timestamps=True,  # needed for audio longer than 30 seconds
        device="mps"
    )

    # Transcribe the file and return just the text portion of the output.
    output = pipe(audio_path)["text"]
    return output
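
# Minimal usage sketch (assumption: "sample.wav" is a placeholder path for a
# local audio file; the ASR pipeline decodes common formats via ffmpeg).
if __name__ == "__main__":
    print(process("sample.wav"))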