"""dashboard-nanobot/backend/services/speech_transcribe_service.py

Service helper that transcribes an uploaded audio file for a bot via the
Whisper speech service.
"""

import asyncio
import os
import tempfile
from typing import Any, Dict, Optional
from fastapi import HTTPException, UploadFile
from sqlmodel import Session
from core.settings import DATA_ROOT
from core.speech_service import (
SpeechDisabledError,
SpeechDurationError,
SpeechServiceError,
WhisperSpeechService,
)
from models.bot import BotInstance
from services.platform_service import get_speech_runtime_settings
async def transcribe_bot_speech_upload(
    session: Session,
    bot_id: str,
    upload: UploadFile,
    language: Optional[str],
    speech_service: WhisperSpeechService,
    logger: Any,
) -> Dict[str, Any]:
    """Transcribe an uploaded audio file for a bot using the Whisper service.

    The upload is streamed to a temporary file under ``DATA_ROOT``, transcribed
    in a worker thread, and the temp file is removed afterwards regardless of
    outcome. The upload handle is always closed.

    Raises:
        HTTPException:
            404 if the bot does not exist;
            400 if speech is disabled, no/empty audio was sent, no speech was
                detected, or the speech service reports an error;
            413 if the audio exceeds the configured maximum duration;
            500 on any unexpected failure.
    """
    bot = session.get(BotInstance, bot_id)
    if not bot:
        raise HTTPException(status_code=404, detail="Bot not found")

    speech_settings = get_speech_runtime_settings()
    if not speech_settings["enabled"]:
        raise HTTPException(status_code=400, detail="Speech recognition is disabled")
    if not upload:
        raise HTTPException(status_code=400, detail="no audio file uploaded")

    # Sanitize the client-supplied filename; it is only used for logging and
    # for choosing a temp-file suffix, never as an actual path.
    original_name = str(upload.filename or "audio.webm").strip() or "audio.webm"
    sanitized_name = os.path.basename(original_name).replace("\\", "_").replace("/", "_")
    suffix = os.path.splitext(sanitized_name)[1].strip().lower() or ".webm"
    if len(suffix) > 12:
        # Implausibly long extension — fall back to a safe default.
        suffix = ".webm"

    temp_audio_path = ""
    try:
        # Stream the upload to disk in 1 MiB chunks so large files never sit
        # fully in memory.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=suffix, prefix=".speech_", dir=DATA_ROOT
        ) as spool:
            temp_audio_path = spool.name
            while data := await upload.read(1024 * 1024):
                spool.write(data)

        if (
            not temp_audio_path
            or not os.path.exists(temp_audio_path)
            or os.path.getsize(temp_audio_path) <= 0
        ):
            raise HTTPException(status_code=400, detail="audio payload is empty")

        effective_language = str(language or "").strip() or speech_settings["default_language"]
        # Transcription is blocking and compute-heavy; keep it off the event loop.
        transcription = await asyncio.to_thread(
            speech_service.transcribe_file, temp_audio_path, effective_language
        )
        transcript_text = str(transcription.get("text") or "").strip()
        if not transcript_text:
            raise HTTPException(status_code=400, detail="No speech detected")

        return {
            "bot_id": bot_id,
            "text": transcript_text,
            "duration_seconds": transcription.get("duration_seconds"),
            "max_audio_seconds": speech_settings["max_audio_seconds"],
            "model": speech_settings["model"],
            "device": speech_settings["device"],
            "language": transcription.get("language") or effective_language,
        }
    except SpeechDisabledError as exc:
        logger.warning(
            "speech transcribe disabled bot_id=%s file=%s language=%s detail=%s",
            bot_id,
            sanitized_name,
            language,
            exc,
        )
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except SpeechDurationError as exc:
        logger.warning(
            "speech transcribe too long bot_id=%s file=%s language=%s max_seconds=%s",
            bot_id,
            sanitized_name,
            language,
            speech_settings["max_audio_seconds"],
        )
        raise HTTPException(
            status_code=413,
            detail=f"Audio duration exceeds {speech_settings['max_audio_seconds']} seconds",
        ) from exc
    except SpeechServiceError as exc:
        logger.exception(
            "speech transcribe failed bot_id=%s file=%s language=%s",
            bot_id,
            sanitized_name,
            language,
        )
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except HTTPException:
        # Our own validation errors pass through untouched.
        raise
    except Exception as exc:
        logger.exception(
            "speech transcribe unexpected error bot_id=%s file=%s language=%s",
            bot_id,
            sanitized_name,
            language,
        )
        raise HTTPException(status_code=500, detail=f"speech transcription failed: {exc}") from exc
    finally:
        # Best-effort cleanup: close the upload handle and drop the temp file.
        try:
            await upload.close()
        except Exception:
            pass
        if temp_audio_path and os.path.exists(temp_audio_path):
            try:
                os.remove(temp_audio_path)
            except Exception:
                pass