# dashboard-nanobot/backend/services/bot_runtime_snapshot_servic... (path truncated in the original listing)
#
# 289 lines
# 13 KiB
# Python

import asyncio
import os
import time
from typing import Any, Callable, Dict
from sqlmodel import Session
from clients.edge.errors import log_edge_failure
from models.bot import BotInstance
from providers.target import provider_target_to_dict
class BotRuntimeSnapshotService:
    """Read bot runtime configuration and turn bot records into plain dicts.

    Every collaborator (config readers, provider-target resolution, Docker
    access, activity recording, cache invalidation) is injected through the
    constructor; this class only contains the glue logic between them.
    """

    # Text searched for (case-insensitively) in recent bot logs to decide
    # that the agent loop has started.
    _AGENT_LOOP_READY_MARKER = "Agent loop started"
    # wait_for_agent_loop_ready never waits less than this many seconds.
    _MIN_READY_TIMEOUT_SECONDS = 1.0
    # wait_for_agent_loop_ready never polls more often than this.
    _MIN_POLL_INTERVAL_SECONDS = 0.1
    # Provider name reported when the bot config names no provider.
    _DEFAULT_LLM_PROVIDER = "dashscope"

    def __init__(
        self,
        *,
        engine: Any,
        logger: Any,
        docker_manager: Any,
        default_soul_md: str,
        default_agents_md: str,
        default_user_md: str,
        default_tools_md: str,
        default_identity_md: str,
        workspace_root: Callable[[str], str],
        resolve_edge_state_context: Callable[[str], Any],
        read_bot_config: Callable[[str], Dict[str, Any]],
        resolve_bot_env_params: Callable[[str], Dict[str, str]],
        resolve_bot_provider_target_for_instance: Callable[[BotInstance], Any],
        read_global_delivery_flags: Callable[[Any], tuple[bool, bool]],
        safe_float: Callable[[Any, float], float],
        safe_int: Callable[[Any, int], int],
        get_default_system_timezone: Callable[[], str],
        read_bot_resources: Callable[[str, Any], Dict[str, Any]],
        node_display_name: Callable[[str], str],
        get_runtime_provider: Callable[[Any, BotInstance], Any],
        invalidate_bot_detail_cache: Callable[[str], None],
        record_activity_event: Callable[..., None],
    ) -> None:
        """Store the injected collaborators; nothing is called or read here."""
        self._engine = engine
        self._logger = logger
        self._docker_manager = docker_manager
        self._default_soul_md = default_soul_md
        self._default_agents_md = default_agents_md
        self._default_user_md = default_user_md
        self._default_tools_md = default_tools_md
        self._default_identity_md = default_identity_md
        self._workspace_root = workspace_root
        self._resolve_edge_state_context = resolve_edge_state_context
        self._read_bot_config = read_bot_config
        self._resolve_bot_env_params = resolve_bot_env_params
        self._resolve_bot_provider_target_for_instance = resolve_bot_provider_target_for_instance
        self._read_global_delivery_flags = read_global_delivery_flags
        self._safe_float = safe_float
        self._safe_int = safe_int
        self._get_default_system_timezone = get_default_system_timezone
        self._read_bot_resources = read_bot_resources
        self._node_display_name = node_display_name
        self._get_runtime_provider = get_runtime_provider
        self._invalidate_bot_detail_cache = invalidate_bot_detail_cache
        self._record_activity_event = record_activity_event

    def read_workspace_md(self, bot_id: str, filename: str, default_value: str) -> str:
        """Return the stripped text of a workspace markdown file, or a default.

        When ``resolve_edge_state_context`` yields a context for the bot, the
        file is fetched through that context's client and the local filesystem
        is never consulted. Otherwise the file is read from the directory
        returned by ``workspace_root``.

        Args:
            bot_id: Identifier of the bot whose workspace is read.
            filename: File name relative to the workspace root.
            default_value: Returned whenever the file is missing, is not
                reported as markdown by the edge client, or cannot be read.

        Returns:
            The file content with surrounding whitespace removed, or
            ``default_value``.
        """
        edge_context = self._resolve_edge_state_context(bot_id)
        if edge_context is not None:
            client, workspace_root, node_id = edge_context
            try:
                payload = client.read_file(
                    bot_id=bot_id,
                    path=filename,
                    max_bytes=1_000_000,
                    workspace_root=workspace_root,
                )
                if bool(payload.get("is_markdown")):
                    content = payload.get("content")
                    if isinstance(content, str):
                        return content.strip()
            except Exception as exc:
                log_edge_failure(
                    self._logger,
                    key=f"workspace-md-read:{node_id}:{bot_id}:{filename}",
                    exc=exc,
                    message=f"Failed to read edge workspace markdown for bot_id={bot_id}, file={filename}",
                )
            # Edge-managed bots never fall through to the local filesystem.
            return default_value

        path = os.path.join(self._workspace_root(bot_id), filename)
        if not os.path.isfile(path):
            return default_value
        try:
            with open(path, "r", encoding="utf-8") as handle:
                return handle.read().strip()
        except Exception as exc:
            # Still best-effort, but leave a trace instead of failing silently.
            self._logger.warning(
                "Failed to read workspace markdown for bot_id=%s, file=%s: %s",
                bot_id,
                filename,
                exc,
            )
            return default_value

    @staticmethod
    def _read_agents_defaults(config_data: Dict[str, Any]) -> Dict[str, Any]:
        """Return ``config_data["agents"]["defaults"]`` if it is a dict, else ``{}``."""
        agents_cfg = config_data.get("agents")
        if isinstance(agents_cfg, dict):
            defaults = agents_cfg.get("defaults")
            if isinstance(defaults, dict):
                return defaults
        return {}

    def _read_llm_settings(self, config_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract provider name, model, API key and API base from a bot config.

        Only the first entry of ``config_data["providers"]`` is considered.
        The returned dict has exactly the keys ``llm_provider``, ``llm_model``,
        ``api_key`` and ``api_base``, in that order.
        """
        provider_name = ""
        provider_cfg: Dict[str, Any] = {}
        providers_cfg = config_data.get("providers")
        if isinstance(providers_cfg, dict) and providers_cfg:
            first_name, first_cfg = next(iter(providers_cfg.items()))
            provider_name = str(first_name or "").strip()
            if isinstance(first_cfg, dict):
                provider_cfg = first_cfg

        llm_provider = provider_name or self._DEFAULT_LLM_PROVIDER
        api_base = str(provider_cfg.get("apiBase") or "").strip()
        # An "openai" provider pointed at an xf-yun.com endpoint is reported
        # as "xunfei". The former extra check for "spark-api-open.xf-yun.com"
        # was subsumed by this substring test.
        if llm_provider == "openai" and "xf-yun.com" in api_base.lower():
            llm_provider = "xunfei"

        return {
            "llm_provider": llm_provider,
            "llm_model": str(self._read_agents_defaults(config_data).get("model") or ""),
            "api_key": str(provider_cfg.get("apiKey") or "").strip(),
            "api_base": api_base,
        }

    def read_bot_runtime_snapshot(self, bot: BotInstance) -> Dict[str, Any]:
        """Collect the bot's current runtime settings into one dictionary.

        The result starts with the fields of ``provider_target_to_dict`` for
        the bot's provider target, then adds LLM settings, sampling parameters
        (defaults 0.2 / 1.0 / 8192), resource limits, time zone (``TZ`` env
        param or the default system time zone), delivery flags and the five
        workspace markdown documents. ``system_prompt`` mirrors ``soul_md``.

        Note that the snapshot contains ``api_key``; ``serialize_bot`` and
        ``serialize_bot_list_item`` deliberately do not copy it.
        """
        config_data = self._read_bot_config(bot.id)
        env_params = self._resolve_bot_env_params(bot.id)
        target = self._resolve_bot_provider_target_for_instance(bot)

        llm_settings = self._read_llm_settings(config_data)
        agents_defaults = self._read_agents_defaults(config_data)
        send_progress, send_tool_hints = self._read_global_delivery_flags(config_data.get("channels"))

        soul_md = self.read_workspace_md(bot.id, "SOUL.md", self._default_soul_md)
        resources = self._read_bot_resources(bot.id, config_data=config_data)

        return {
            **provider_target_to_dict(target),
            **llm_settings,
            "temperature": self._safe_float(agents_defaults.get("temperature"), 0.2),
            "top_p": self._safe_float(agents_defaults.get("topP"), 1.0),
            "max_tokens": self._safe_int(agents_defaults.get("maxTokens"), 8192),
            "cpu_cores": resources["cpu_cores"],
            "memory_mb": resources["memory_mb"],
            "storage_gb": resources["storage_gb"],
            "system_timezone": env_params.get("TZ") or self._get_default_system_timezone(),
            "send_progress": send_progress,
            "send_tool_hints": send_tool_hints,
            "soul_md": soul_md,
            "agents_md": self.read_workspace_md(bot.id, "AGENTS.md", self._default_agents_md),
            "user_md": self.read_workspace_md(bot.id, "USER.md", self._default_user_md),
            "tools_md": self.read_workspace_md(bot.id, "TOOLS.md", self._default_tools_md),
            "identity_md": self.read_workspace_md(bot.id, "IDENTITY.md", self._default_identity_md),
            "system_prompt": soul_md,
        }

    def serialize_bot(self, bot: BotInstance) -> Dict[str, Any]:
        """Return the full dictionary representation of ``bot``.

        Combines the bot record's own attributes with its runtime snapshot and
        provider target. The snapshot's ``api_key`` is not included.
        """
        runtime = self.read_bot_runtime_snapshot(bot)
        target = self._resolve_bot_provider_target_for_instance(bot)
        return {
            "id": bot.id,
            "name": bot.name,
            "enabled": bool(getattr(bot, "enabled", True)),
            "avatar_model": "base",
            "avatar_skin": "blue_suit",
            "image_tag": bot.image_tag,
            "llm_provider": runtime.get("llm_provider") or "",
            "llm_model": runtime.get("llm_model") or "",
            "system_prompt": runtime.get("system_prompt") or "",
            "api_base": runtime.get("api_base") or "",
            "temperature": self._safe_float(runtime.get("temperature"), 0.2),
            "top_p": self._safe_float(runtime.get("top_p"), 1.0),
            "max_tokens": self._safe_int(runtime.get("max_tokens"), 8192),
            "cpu_cores": self._safe_float(runtime.get("cpu_cores"), 1.0),
            "memory_mb": self._safe_int(runtime.get("memory_mb"), 1024),
            "storage_gb": self._safe_int(runtime.get("storage_gb"), 10),
            "system_timezone": str(runtime.get("system_timezone") or self._get_default_system_timezone()),
            "send_progress": bool(runtime.get("send_progress")),
            "send_tool_hints": bool(runtime.get("send_tool_hints")),
            "node_id": target.node_id,
            "node_display_name": self._node_display_name(target.node_id),
            "transport_kind": target.transport_kind,
            "runtime_kind": target.runtime_kind,
            "core_adapter": target.core_adapter,
            "soul_md": runtime.get("soul_md") or "",
            "agents_md": runtime.get("agents_md") or "",
            "user_md": runtime.get("user_md") or "",
            "tools_md": runtime.get("tools_md") or "",
            "identity_md": runtime.get("identity_md") or "",
            "workspace_dir": bot.workspace_dir,
            "docker_status": bot.docker_status,
            "current_state": bot.current_state,
            "last_action": bot.last_action,
            "created_at": bot.created_at,
            "updated_at": bot.updated_at,
        }

    def serialize_bot_list_item(self, bot: BotInstance) -> Dict[str, Any]:
        """Return the compact dictionary representation of ``bot``.

        Only the bot config is read for the LLM fields. Building the full
        runtime snapshot here would also read five workspace markdown files,
        the resource limits and the env params, none of which appear in the
        result.
        """
        llm_settings = self._read_llm_settings(self._read_bot_config(bot.id))
        target = self._resolve_bot_provider_target_for_instance(bot)
        return {
            "id": bot.id,
            "name": bot.name,
            "enabled": bool(getattr(bot, "enabled", True)),
            "image_tag": bot.image_tag,
            "llm_provider": llm_settings["llm_provider"] or "",
            "llm_model": llm_settings["llm_model"] or "",
            "node_id": target.node_id,
            "node_display_name": self._node_display_name(target.node_id),
            "transport_kind": target.transport_kind,
            "runtime_kind": target.runtime_kind,
            "core_adapter": target.core_adapter,
            "docker_status": bot.docker_status,
            "current_state": bot.current_state,
            "last_action": bot.last_action,
            "updated_at": bot.updated_at,
        }

    def refresh_bot_runtime_status(self, app_state: Any, bot: BotInstance) -> str:
        """Query the runtime provider and update ``bot`` with the fresh status.

        On success ``bot.docker_status`` is overwritten and, when the bot is
        not running and not in the ``ERROR`` state, ``bot.current_state`` is
        reset to ``"IDLE"``. The bot object is only mutated in memory; this
        method does not touch a database session.

        Returns:
            The upper-cased status reported by the provider, or the bot's
            previously stored status (default ``"STOPPED"``) if the provider
            lookup or call raised.
        """
        previous_status = str(bot.docker_status or "STOPPED").upper()
        try:
            provider = self._get_runtime_provider(app_state, bot)
            reported = provider.get_runtime_status(bot_id=str(bot.id or ""))
            status = str(reported or "STOPPED").upper()
        except Exception as exc:
            log_edge_failure(
                self._logger,
                key=f"bot-runtime-status:{bot.id}",
                exc=exc,
                message=f"Failed to refresh runtime status for bot_id={bot.id}",
            )
            return previous_status

        bot.docker_status = status
        if status != "RUNNING" and str(bot.current_state or "").upper() != "ERROR":
            bot.current_state = "IDLE"
        return status

    async def wait_for_agent_loop_ready(
        self,
        bot_id: str,
        timeout_seconds: float = 12.0,
        poll_interval_seconds: float = 0.5,
    ) -> bool:
        """Poll recent bot logs until the ready marker shows up or time runs out.

        Args:
            bot_id: Identifier of the bot whose logs are inspected.
            timeout_seconds: Maximum time to wait; values below 1.0 are
                raised to 1.0.
            poll_interval_seconds: Pause between polls; values below 0.1 are
                raised to 0.1.

        Returns:
            ``True`` as soon as a log line contains the marker
            (case-insensitive), ``False`` once the deadline has passed.
        """
        deadline = time.monotonic() + max(self._MIN_READY_TIMEOUT_SECONDS, timeout_seconds)
        interval = max(self._MIN_POLL_INTERVAL_SECONDS, poll_interval_seconds)
        marker = self._AGENT_LOOP_READY_MARKER.lower()
        while True:
            # The log fetch is a synchronous call; run it in a worker thread
            # so the event loop keeps serving other tasks while it blocks.
            # NOTE(review): assumes docker_manager may be called from a
            # worker thread - confirm against its implementation.
            logs = await asyncio.to_thread(self._docker_manager.get_recent_logs, bot_id, tail=200)
            if any(marker in str(line or "").lower() for line in logs or ()):
                return True
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return False
            # Never sleep past the deadline; the loop then does one last check.
            await asyncio.sleep(min(interval, remaining))

    async def record_agent_loop_ready_warning(
        self,
        bot_id: str,
        timeout_seconds: float = 12.0,
        poll_interval_seconds: float = 0.5,
    ) -> None:
        """Record a ``bot_warning`` activity event if the agent loop never reports ready.

        Nothing is recorded when the ready marker is found, when the bot is
        no longer ``RUNNING`` after the wait, or when the bot row no longer
        exists. Any exception is logged and swallowed, so this coroutine
        never raises.

        Args:
            bot_id: Identifier of the bot to watch.
            timeout_seconds: Forwarded to ``wait_for_agent_loop_ready``.
            poll_interval_seconds: Forwarded to ``wait_for_agent_loop_ready``.
        """
        try:
            agent_loop_ready = await self.wait_for_agent_loop_ready(
                bot_id,
                timeout_seconds=timeout_seconds,
                poll_interval_seconds=poll_interval_seconds,
            )
            if agent_loop_ready:
                return
            # Synchronous status lookup, offloaded for the same reason as the
            # log polling in wait_for_agent_loop_ready.
            status = await asyncio.to_thread(self._docker_manager.get_bot_status, bot_id)
            if status != "RUNNING":
                return

            # Report the time actually waited: the wait clamps short timeouts,
            # so the raw argument could otherwise show up as "0s".
            effective_timeout = max(self._MIN_READY_TIMEOUT_SECONDS, timeout_seconds)
            detail = (
                "Bot container started, but ready marker was not found in logs within "
                f"{int(effective_timeout)}s. Check bot logs or MCP config if the bot stays unavailable."
            )
            self._logger.warning(
                "bot_id=%s agent loop ready marker not found within %ss",
                bot_id,
                effective_timeout,
            )
            with Session(self._engine) as background_session:
                if not background_session.get(BotInstance, bot_id):
                    return
                self._record_activity_event(
                    background_session,
                    bot_id,
                    "bot_warning",
                    channel="system",
                    detail=detail,
                    metadata={
                        "kind": "agent_loop_ready_timeout",
                        "marker": self._AGENT_LOOP_READY_MARKER,
                        "timeout_seconds": effective_timeout,
                    },
                )
                background_session.commit()
                self._invalidate_bot_detail_cache(bot_id)
        except Exception:
            self._logger.exception("Failed to record agent loop readiness warning for bot_id=%s", bot_id)