import asyncio
import os
import tempfile
from typing import Any, Dict, Optional

from fastapi import HTTPException, UploadFile
from sqlmodel import Session

from core.settings import DATA_ROOT
from core.speech_service import (
    SpeechDisabledError,
    SpeechDurationError,
    SpeechServiceError,
    WhisperSpeechService,
)
from models.bot import BotInstance
from services.platform_settings_service import get_speech_runtime_settings

# Fallbacks used when the upload carries no usable filename / extension.
_DEFAULT_UPLOAD_NAME = "audio.webm"
_DEFAULT_UPLOAD_EXT = ".webm"
# Extensions longer than this are replaced by the default.
_MAX_UPLOAD_EXT_LENGTH = 12
# Size of each read from the upload while spooling it to disk.
_UPLOAD_CHUNK_BYTES = 1024 * 1024


def _sanitize_upload_name(filename: Optional[str]) -> "tuple[str, str]":
    """Return ``(safe_name, ext)`` derived from a client-supplied filename.

    ``safe_name`` is the basename with any remaining path separators replaced
    by underscores. ``ext`` is the lower-cased extension (including the dot),
    falling back to ``.webm`` when it is missing or longer than 12 characters.
    """
    original_name = str(filename or _DEFAULT_UPLOAD_NAME).strip() or _DEFAULT_UPLOAD_NAME
    # basename() only strips the platform separator; neutralise both kinds.
    safe_name = os.path.basename(original_name).replace("\\", "_").replace("/", "_")
    ext = os.path.splitext(safe_name)[1].strip().lower() or _DEFAULT_UPLOAD_EXT
    if len(ext) > _MAX_UPLOAD_EXT_LENGTH:
        ext = _DEFAULT_UPLOAD_EXT
    return safe_name, ext


async def transcribe_bot_speech_upload(
    session: Session,
    bot_id: str,
    upload: UploadFile,
    language: Optional[str],
    speech_service: WhisperSpeechService,
    logger: Any,
) -> Dict[str, Any]:
    """Transcribe an uploaded audio file on behalf of a bot.

    The upload is spooled to a temporary file under ``DATA_ROOT``, handed to
    ``speech_service.transcribe_file`` in a worker thread, and the temporary
    file is removed afterwards regardless of outcome.

    Args:
        session: Database session used to look up the bot.
        bot_id: Primary key of the ``BotInstance`` the request targets.
        upload: The uploaded audio file; it is closed before returning.
        language: Requested language; when blank, the ``default_language``
            from the speech runtime settings is used.
        speech_service: Service whose ``transcribe_file(path, language)`` is
            called with the temporary file path.
        logger: Logger-like object providing ``warning`` and ``exception``.

    Returns:
        A dict with ``bot_id``, ``text``, ``duration_seconds``,
        ``max_audio_seconds``, ``model``, ``device`` and ``language``.

    Raises:
        HTTPException: 404 when the bot does not exist; 400 when speech is
            disabled, no file was uploaded, the payload is empty, no speech
            was detected, or the speech service reports an error; 413 when
            the audio exceeds the configured duration; 500 on any other
            unexpected failure.
    """
    bot = session.get(BotInstance, bot_id)
    if not bot:
        raise HTTPException(status_code=404, detail="Bot not found")

    speech_settings = get_speech_runtime_settings()
    if not speech_settings["enabled"]:
        raise HTTPException(status_code=400, detail="Speech recognition is disabled")
    if not upload:
        raise HTTPException(status_code=400, detail="no audio file uploaded")

    safe_name, ext = _sanitize_upload_name(upload.filename)

    # Stays empty until the temp file exists, so cleanup knows whether to run.
    tmp_path = ""
    try:
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=ext, prefix=".speech_", dir=DATA_ROOT
        ) as tmp:
            tmp_path = tmp.name
            while chunk := await upload.read(_UPLOAD_CHUNK_BYTES):
                # Disk writes are blocking; keep them off the event loop.
                await asyncio.to_thread(tmp.write, chunk)

        # Checked after the file is closed so buffered data has been flushed.
        if not tmp_path or not os.path.exists(tmp_path) or os.path.getsize(tmp_path) <= 0:
            raise HTTPException(status_code=400, detail="audio payload is empty")

        resolved_language = str(language or "").strip() or speech_settings["default_language"]
        result = await asyncio.to_thread(
            speech_service.transcribe_file, tmp_path, resolved_language
        )
        text = str(result.get("text") or "").strip()
        if not text:
            raise HTTPException(status_code=400, detail="No speech detected")
        return {
            "bot_id": bot_id,
            "text": text,
            "duration_seconds": result.get("duration_seconds"),
            "max_audio_seconds": speech_settings["max_audio_seconds"],
            "model": speech_settings["model"],
            "device": speech_settings["device"],
            "language": result.get("language") or resolved_language,
        }
    except SpeechDisabledError as exc:
        logger.warning(
            "speech transcribe disabled bot_id=%s file=%s language=%s detail=%s",
            bot_id,
            safe_name,
            language,
            exc,
        )
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except SpeechDurationError as exc:
        logger.warning(
            "speech transcribe too long bot_id=%s file=%s language=%s max_seconds=%s",
            bot_id,
            safe_name,
            language,
            speech_settings["max_audio_seconds"],
        )
        raise HTTPException(
            status_code=413,
            detail=f"Audio duration exceeds {speech_settings['max_audio_seconds']} seconds",
        ) from exc
    except SpeechServiceError as exc:
        logger.exception(
            "speech transcribe failed bot_id=%s file=%s language=%s",
            bot_id,
            safe_name,
            language,
        )
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except HTTPException:
        # Already a well-formed HTTP error raised above; pass it through.
        raise
    except Exception as exc:
        logger.exception(
            "speech transcribe unexpected error bot_id=%s file=%s language=%s",
            bot_id,
            safe_name,
            language,
        )
        # NOTE(review): the raw exception text is returned to the client here.
        raise HTTPException(
            status_code=500, detail=f"speech transcription failed: {exc}"
        ) from exc
    finally:
        # Cleanup is best-effort: a failure here must not mask the real outcome.
        try:
            await upload.close()
        except Exception:
            pass
        if tmp_path:
            try:
                os.remove(tmp_path)
            except OSError:
                pass