2024-10-10 14:51:48 +00:00
|
|
|
import torch
|
|
|
|
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
|
2024-10-16 13:52:53 +00:00
|
|
|
def process(audio_path, model_name, device="mps"):
    """Transcribe an audio file with a Hugging Face speech-seq2seq model.

    Args:
        audio_path: Path to the audio file to transcribe.
        model_name: Hugging Face model identifier or local checkpoint path
            (e.g. a Whisper checkpoint) accepted by ``from_pretrained``.
        device: Torch device string to run inference on. Defaults to
            ``"mps"`` (Apple Silicon GPU) to preserve prior behavior;
            pass ``"cpu"`` or ``"cuda"`` on other hardware.

    Returns:
        The transcribed text as a string.
    """
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
        # Stream weights into place instead of materializing a second
        # full copy of the model in RAM during loading.
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_name)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch.float32,
        # Required for Whisper-style models on audio longer than 30 s;
        # without it long-form transcription raises an error.
        return_timestamps=True,
        device=device,
    )

    output = pipe(audio_path)["text"]

    return output
|