100 lines
3.9 KiB
Python
100 lines
3.9 KiB
Python
import asyncio
|
|
import os
|
|
import tempfile
|
|
from typing import Any, Dict, Optional
|
|
|
|
from fastapi import HTTPException, UploadFile
|
|
from sqlmodel import Session
|
|
|
|
from core.settings import DATA_ROOT
|
|
from core.speech_service import (
|
|
SpeechDisabledError,
|
|
SpeechDurationError,
|
|
SpeechServiceError,
|
|
WhisperSpeechService,
|
|
)
|
|
from models.bot import BotInstance
|
|
from services.platform_service import get_speech_runtime_settings
|
|
|
|
|
|
_SPEECH_DEFAULT_UPLOAD_NAME = "audio.webm"
_SPEECH_DEFAULT_SUFFIX = ".webm"
_SPEECH_MAX_SUFFIX_LENGTH = 12
_SPEECH_UPLOAD_CHUNK_BYTES = 1024 * 1024


def _speech_safe_upload_name(filename: Optional[str]) -> str:
    """Return the client-supplied filename reduced to a separator-free base name."""
    original_name = str(filename or _SPEECH_DEFAULT_UPLOAD_NAME).strip() or _SPEECH_DEFAULT_UPLOAD_NAME
    return os.path.basename(original_name).replace("\\", "_").replace("/", "_")


def _speech_safe_suffix(safe_name: str) -> str:
    """Return a temp-file suffix taken from *safe_name*, or ``.webm`` when it is unusable."""
    ext = os.path.splitext(safe_name)[1].strip().lower()
    stem = ext[1:]
    if not stem or len(ext) > _SPEECH_MAX_SUFFIX_LENGTH:
        return _SPEECH_DEFAULT_SUFFIX
    # The extension is client-controlled and ends up in a filesystem path, so
    # only plain ASCII letters/digits are accepted (rejects NUL bytes, spaces,
    # control characters, ...).
    if not (stem.isascii() and stem.isalnum()):
        return _SPEECH_DEFAULT_SUFFIX
    return ext


async def _speech_close_upload(upload: UploadFile, logger: Any) -> None:
    """Close *upload* on a best-effort basis; failures are logged, never raised."""
    try:
        await upload.close()
    except Exception as exc:  # cleanup must not mask the request's real outcome
        logger.warning("speech transcribe upload close failed detail=%s", exc)


def _speech_remove_temp_file(path: str, logger: Any) -> None:
    """Delete the temporary audio file at *path* if it exists; failures are logged, never raised."""
    if not path:
        return
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as exc:
        logger.warning("speech transcribe temp cleanup failed path=%s detail=%s", path, exc)


async def transcribe_bot_speech_upload(
    session: Session,
    bot_id: str,
    upload: UploadFile,
    language: Optional[str],
    speech_service: WhisperSpeechService,
    logger: Any,
) -> Dict[str, Any]:
    """Transcribe an uploaded audio file for a bot and return the recognized text.

    The upload is streamed in 1 MiB chunks into a temporary file under
    ``DATA_ROOT``, handed to ``speech_service.transcribe_file`` in a worker
    thread, and the temporary file is removed again before returning.

    Args:
        session: Database session used to look up the ``BotInstance``.
        bot_id: Identifier of the bot the transcription is requested for.
        upload: The uploaded audio file.
        language: Requested language; when blank, the ``default_language``
            entry of the speech runtime settings is used.
        speech_service: Service whose ``transcribe_file(path, language)`` is called.
        logger: Logger exposing ``warning`` and ``exception``.

    Returns:
        A dict with the keys ``bot_id``, ``text``, ``duration_seconds``,
        ``max_audio_seconds``, ``model``, ``device`` and ``language``.

    Raises:
        HTTPException: 404 if the bot does not exist; 400 if speech recognition
            is disabled, no file was uploaded, the payload is empty, no speech
            was detected, or the speech service reported an error; 413 if the
            audio exceeds the configured maximum duration; 500 on any other
            unexpected failure.
    """
    bot = session.get(BotInstance, bot_id)
    if not bot:
        raise HTTPException(status_code=404, detail="Bot not found")
    speech_settings = get_speech_runtime_settings()
    if not speech_settings["enabled"]:
        raise HTTPException(status_code=400, detail="Speech recognition is disabled")
    if not upload:
        raise HTTPException(status_code=400, detail="no audio file uploaded")

    safe_name = _speech_safe_upload_name(upload.filename)
    suffix = _speech_safe_suffix(safe_name)

    tmp_path = ""
    try:
        bytes_written = 0
        with tempfile.NamedTemporaryFile(
            delete=False,
            suffix=suffix,
            prefix=".speech_",
            dir=DATA_ROOT,
        ) as tmp:
            tmp_path = tmp.name
            while chunk := await upload.read(_SPEECH_UPLOAD_CHUNK_BYTES):
                # Disk writes are blocking; keep them off the event loop.
                await asyncio.to_thread(tmp.write, chunk)
                bytes_written += len(chunk)

        if bytes_written <= 0:
            raise HTTPException(status_code=400, detail="audio payload is empty")

        resolved_language = str(language or "").strip() or speech_settings["default_language"]
        result = await asyncio.to_thread(speech_service.transcribe_file, tmp_path, resolved_language)
        text = str(result.get("text") or "").strip()
        if not text:
            raise HTTPException(status_code=400, detail="No speech detected")
        return {
            "bot_id": bot_id,
            "text": text,
            "duration_seconds": result.get("duration_seconds"),
            "max_audio_seconds": speech_settings["max_audio_seconds"],
            "model": speech_settings["model"],
            "device": speech_settings["device"],
            "language": result.get("language") or resolved_language,
        }
    except SpeechDisabledError as exc:
        logger.warning(
            "speech transcribe disabled bot_id=%s file=%s language=%s detail=%s",
            bot_id,
            safe_name,
            language,
            exc,
        )
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except SpeechDurationError as exc:
        logger.warning(
            "speech transcribe too long bot_id=%s file=%s language=%s max_seconds=%s",
            bot_id,
            safe_name,
            language,
            speech_settings["max_audio_seconds"],
        )
        raise HTTPException(
            status_code=413,
            detail=f"Audio duration exceeds {speech_settings['max_audio_seconds']} seconds",
        ) from exc
    except SpeechServiceError as exc:
        logger.exception("speech transcribe failed bot_id=%s file=%s language=%s", bot_id, safe_name, language)
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except HTTPException:
        # Already the response we want (empty payload / no speech); pass through.
        raise
    except Exception as exc:
        logger.exception(
            "speech transcribe unexpected error bot_id=%s file=%s language=%s",
            bot_id,
            safe_name,
            language,
        )
        # NOTE(review): the detail echoes the raw exception text to the client;
        # kept for compatibility, consider a generic message if that is unintended.
        raise HTTPException(status_code=500, detail=f"speech transcription failed: {exc}") from exc
    finally:
        await _speech_close_upload(upload, logger)
        _speech_remove_temp_file(tmp_path, logger)
|