import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


def process(audio_path, model_name):
    # Load the speech-to-text model in float32, using safetensors weights
    # and reduced CPU memory usage during loading.
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    # Move the model to Apple's Metal (MPS) backend.
    model.to("mps")

    # The processor bundles the tokenizer and feature extractor for this checkpoint.
    processor = AutoProcessor.from_pretrained(model_name)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch.float32,
        return_timestamps=True,
        device="mps",
    )

    # The pipeline returns a dict with the transcript under "text"
    # (segment timestamps are included under "chunks" when return_timestamps=True).
    output = pipe(audio_path)["text"]
    return output
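

# Example usage (a minimal sketch): the checkpoint name and audio path below
# are placeholders, not part of the original function. Substitute any
# Whisper-style seq2seq ASR checkpoint and a local audio file readable by
# ffmpeg; this assumes a Mac where the MPS backend is available.
if __name__ == "__main__":
    transcript = process("sample.wav", "openai/whisper-large-v3")
    print(transcript)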