# dashboard-nanobot/dashboard-edge/app/services/runtime_service.py
# (Python source; the repository viewer listed this file as 512 lines, 21 KiB.)

import json
import os
import shlex
import shutil
import csv
from datetime import datetime, timezone
import psutil
from fastapi import HTTPException
from app.core.settings import EDGE_BOTS_WORKSPACE_ROOT, EDGE_NODE_ID, EDGE_NODE_NAME
from app.runtime.base import EdgeRuntimeBackend
from app.runtime.factory import build_edge_runtime_backends, preferred_edge_runtime_kind
from app.schemas.edge import (
EdgeCommandRequest,
EdgeLogsResponse,
EdgeMonitorEnsureResponse,
EdgeMonitorPacket,
EdgeMonitorPacketsResponse,
EdgeNodeHeartbeatResponse,
EdgeNodeResourcesResponse,
EdgeNodeSelfResponse,
EdgeStatusResponse,
NODE_PROTOCOL_VERSION,
)
from app.schemas.runtime import EdgeStartBotRequest
class EdgeRuntimeService:
    """Service layer of dashboard-edge that fronts the configured runtime backends.

    It routes bot lifecycle calls (start/stop/command/logs/status) to the
    backend selected for each bot, keeps a bounded in-memory buffer of monitor
    packets per bot, reports node identity and resource usage, and persists
    each bot's chosen runtime kind in a ``runtime-target.json`` file.
    """

    # Maximum number of monitor packets retained in memory per bot.
    _MAX_RECENT_PACKETS = 200
    # Number of trailing log lines scanned when backfilling monitor packets.
    _BACKFILL_LOG_TAIL = 500

    def __init__(self) -> None:
        """Build the backend registry (keyed by lower-cased kind) and empty packet buffers."""
        self._runtime_backends: dict[str, EdgeRuntimeBackend] = {
            str(kind).strip().lower(): backend
            for kind, backend in build_edge_runtime_backends().items()
            if isinstance(kind, str)
        }
        self._recent_packets: dict[str, list[dict]] = {}
        self._packet_counters: dict[str, int] = {}
        self._backfilled_bots: set[str] = set()

    # ------------------------------------------------------------------
    # Capabilities
    # ------------------------------------------------------------------

    def _runtime_kind(self) -> str:
        """Return the node-level preferred runtime kind for the registered backends."""
        return preferred_edge_runtime_kind(self._runtime_backends)

    @staticmethod
    def _merge_flag_caps(target: dict[str, bool], section: object, *, lowercase: bool = False) -> None:
        """OR the boolean flags of one capability section into ``target``.

        A flag counts as enabled only when its value is literally ``True``;
        once enabled by any backend it stays enabled. Blank keys are skipped.
        """
        for key, value in dict(section or {}).items():
            name = str(key or "").strip()
            if lowercase:
                name = name.lower()
            if not name:
                continue
            target[name] = bool(target.get(name) or value is True)

    def capabilities(self) -> dict:
        """Return the merged capability map of all registered backends.

        The ``protocol`` section is always present; ``runtime``, ``workspace``,
        ``monitor`` and ``process`` are included only when non-empty.
        """
        runtime_caps: dict[str, bool] = {}
        workspace_caps: dict[str, bool] = {}
        monitor_caps: dict[str, bool] = {}
        process_caps: dict[str, object] = {}
        for backend in self._runtime_backends.values():
            current = dict(backend.capabilities() if hasattr(backend, "capabilities") else {})
            # Only the runtime section has its keys lower-cased.
            self._merge_flag_caps(runtime_caps, current.get("runtime"), lowercase=True)
            self._merge_flag_caps(workspace_caps, current.get("workspace"))
            self._merge_flag_caps(monitor_caps, current.get("monitor"))
            # Process capabilities are arbitrary values; the last backend wins.
            for key, value in dict(current.get("process") or {}).items():
                name = str(key or "").strip()
                if name:
                    process_caps[name] = value

        caps: dict = {"protocol": {"version": NODE_PROTOCOL_VERSION}}
        sections = (
            ("runtime", runtime_caps),
            ("workspace", workspace_caps),
            ("monitor", monitor_caps),
            ("process", process_caps),
        )
        for section_name, values in sections:
            if values:
                caps[section_name] = values
        return caps

    # ------------------------------------------------------------------
    # Bot lifecycle
    # ------------------------------------------------------------------

    async def start_bot(self, *, bot_id: str, payload: EdgeStartBotRequest) -> EdgeStatusResponse:
        """Persist the runtime target for ``bot_id`` and start it on the chosen backend.

        Raises:
            HTTPException: 501 when no backend exists for the resolved kind,
                500 when the backend reports that the start failed.
        """
        runtime_kind = self._resolve_runtime_kind(bot_id, preferred=payload.runtime_kind)
        backend = self._backend_for_bot(bot_id, preferred=runtime_kind)
        workspace_root = str(payload.workspace_root or "").strip() or None
        self._write_runtime_target(
            bot_id=bot_id,
            runtime_kind=runtime_kind,
            workspace_root=workspace_root,
        )
        started = backend.start_bot(
            bot_id=bot_id,
            image_tag=str(payload.image_tag or "").strip(),
            env_vars=dict(payload.env_vars or {}),
            workspace_root=workspace_root,
            native_command=str(payload.native_command or "").strip() or None,
            native_workdir=str(payload.native_workdir or "").strip() or None,
            cpu_cores=float(payload.cpu_cores),
            memory_mb=int(payload.memory_mb),
            storage_gb=int(payload.storage_gb),
            on_state_change=self._record_monitor_packet,
        )
        if not started:
            detail = backend.get_last_delivery_error(bot_id) or f"Failed to start bot {bot_id} on dashboard-edge"
            raise HTTPException(status_code=500, detail=detail)
        return EdgeStatusResponse(status="started")

    def stop_bot(self, *, bot_id: str) -> EdgeStatusResponse:
        """Ask every backend to stop ``bot_id``, trying the resolved kind first.

        Backend failures are ignored, so the response is always ``"stopped"``.
        """
        resolved_kind = self._resolve_runtime_kind(bot_id)
        candidates = ([resolved_kind] if resolved_kind else []) + list(self._runtime_backends)
        # dict.fromkeys de-duplicates while keeping the resolved kind in front.
        for kind in dict.fromkeys(candidates):
            backend = self._runtime_backends.get(kind)
            if backend is None:
                continue
            try:
                backend.stop_bot(bot_id)
            except Exception:
                # Deliberate best-effort: one failing backend must not
                # prevent the remaining backends from being asked.
                continue
        return EdgeStatusResponse(status="stopped")

    def send_command(self, *, bot_id: str, payload: EdgeCommandRequest) -> EdgeStatusResponse:
        """Deliver a command (with optional media) to the bot's backend.

        Raises:
            HTTPException: 502 when the backend reports a delivery failure.
        """
        backend = self._backend_for_bot(bot_id)
        delivered = backend.send_command(bot_id, payload.command, media=list(payload.media or []))
        if not delivered:
            detail = backend.get_last_delivery_error(bot_id) or "command delivery failed"
            raise HTTPException(status_code=502, detail=detail)
        return EdgeStatusResponse(status="ok")

    def ensure_monitor(self, *, bot_id: str) -> EdgeMonitorEnsureResponse:
        """Ask the backend to attach a monitor that feeds this service's packet buffer."""
        backend = self._backend_for_bot(bot_id)
        ensured = backend.ensure_monitor(bot_id, self._record_monitor_packet)
        return EdgeMonitorEnsureResponse(ensured=bool(ensured))

    def get_recent_logs(self, *, bot_id: str, tail: int) -> EdgeLogsResponse:
        """Return the last ``tail`` log lines reported by the bot's backend."""
        backend = self._backend_for_bot(bot_id)
        return EdgeLogsResponse(bot_id=bot_id, logs=backend.get_recent_logs(bot_id, tail=tail))

    def get_monitor_packets(self, *, bot_id: str, after_seq: int = 0, limit: int = 200) -> EdgeMonitorPacketsResponse:
        """Return buffered monitor packets with ``seq`` greater than ``after_seq``.

        Packets are returned in ascending ``seq`` order. A positive ``limit``
        caps the number of rows; zero, negative or ``None`` means no cap.
        The first call for a bot triggers a one-time backfill from its logs.
        """
        self._backfill_monitor_packets(bot_id=bot_id)
        floor_seq = max(0, int(after_seq or 0))  # computed once, not per row
        rows = sorted(
            (
                dict(row)
                for row in self._recent_packets.get(bot_id, [])
                if int(row.get("seq") or 0) > floor_seq
            ),
            key=lambda row: int(row.get("seq") or 0),
        )
        if limit is not None and limit > 0:
            rows = rows[: int(limit)]
        latest_seq = int(self._packet_counters.get(bot_id, 0) or 0)
        packets = [
            EdgeMonitorPacket.model_validate(
                {
                    "protocol_version": NODE_PROTOCOL_VERSION,
                    "node_id": EDGE_NODE_ID,
                    "bot_id": bot_id,
                    **row,
                }
            )
            for row in rows
        ]
        return EdgeMonitorPacketsResponse(
            protocol_version=NODE_PROTOCOL_VERSION,
            node_id=EDGE_NODE_ID,
            bot_id=bot_id,
            latest_seq=latest_seq,
            packets=packets,
        )

    def get_runtime_status(self, *, bot_id: str) -> EdgeStatusResponse:
        """Return the status string reported by the bot's backend."""
        backend = self._backend_for_bot(bot_id)
        return EdgeStatusResponse(status=backend.get_bot_status(bot_id))

    def get_resource_snapshot(self, *, bot_id: str) -> dict:
        """Return the backend's resource snapshot, adding ``runtime_kind`` if it is missing."""
        backend = self._backend_for_bot(bot_id)
        snapshot = dict(backend.get_bot_resource_snapshot(bot_id) or {})
        snapshot.setdefault("runtime_kind", self._resolve_runtime_kind(bot_id))
        return snapshot

    # ------------------------------------------------------------------
    # Node identity and resources
    # ------------------------------------------------------------------

    @staticmethod
    def _utc_now_iso() -> str:
        """Return the current UTC time as an ISO-8601 string with a ``Z`` suffix."""
        return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    def get_node_identity(self) -> EdgeNodeSelfResponse:
        """Describe this node: identity, capabilities and current resource figures."""
        resources = self.get_node_resource_summary()
        return EdgeNodeSelfResponse(
            protocol_version=resources.protocol_version,
            node_id=EDGE_NODE_ID,
            display_name=EDGE_NODE_NAME,
            service="dashboard-edge",
            transport_kind="edge",
            runtime_kind=self._runtime_kind(),
            core_adapter="nanobot",
            capabilities=self.capabilities(),
            resources=dict(resources.resources or {}),
            reported_at=resources.reported_at,
        )

    def get_node_resource_summary(self) -> EdgeNodeResourcesResponse:
        """Collect CPU, memory and workspace-disk figures for this node.

        Each probe is independent and falls back to zero on failure. When the
        disk probe fails, workspace usage is computed by walking the
        workspace tree instead.
        """
        # psutil probes are deliberately best-effort: a failing probe must
        # degrade the report, not break it.
        try:
            cpu_percent = float(psutil.cpu_percent(interval=None) or 0.0)
        except Exception:
            cpu_percent = 0.0

        try:
            memory = psutil.virtual_memory()
            memory_total = int(getattr(memory, "total", 0) or 0)
            memory_used = int(getattr(memory, "used", 0) or 0)
        except Exception:
            memory_total = memory_used = 0

        try:
            disk = psutil.disk_usage(EDGE_BOTS_WORKSPACE_ROOT)
            workspace_limit = int(getattr(disk, "total", 0) or 0)
            workspace_used = int(getattr(disk, "used", 0) or 0)
        except Exception:
            workspace_limit = 0
            workspace_used = self._calc_workspace_used_bytes()

        try:
            cpu_cores = float(psutil.cpu_count(logical=True) or 0)
        except Exception:
            cpu_cores = 0.0

        return EdgeNodeResourcesResponse(
            protocol_version=NODE_PROTOCOL_VERSION,
            node_id=EDGE_NODE_ID,
            display_name=EDGE_NODE_NAME,
            transport_kind="edge",
            runtime_kind=self._runtime_kind(),
            core_adapter="nanobot",
            resources={
                "configured_cpu_cores": round(cpu_cores, 2),
                "configured_memory_bytes": memory_total,
                "configured_storage_bytes": workspace_limit,
                "live_cpu_percent": round(cpu_percent, 2),
                "live_memory_used_bytes": memory_used,
                "live_memory_limit_bytes": memory_total,
                "workspace_used_bytes": workspace_used,
                "workspace_limit_bytes": workspace_limit,
            },
            reported_at=self._utc_now_iso(),
        )

    def heartbeat(self) -> EdgeNodeHeartbeatResponse:
        """Return a heartbeat carrying capabilities, resources and a fresh timestamp."""
        node_resources = self.get_node_resource_summary()
        return EdgeNodeHeartbeatResponse(
            protocol_version=NODE_PROTOCOL_VERSION,
            node_id=EDGE_NODE_ID,
            display_name=EDGE_NODE_NAME,
            service="dashboard-edge",
            transport_kind="edge",
            runtime_kind=self._runtime_kind(),
            core_adapter="nanobot",
            capabilities=self.capabilities(),
            resources=dict(node_resources.resources or {}),
            reported_at=self._utc_now_iso(),
        )

    # ------------------------------------------------------------------
    # Native launcher preflight
    # ------------------------------------------------------------------

    def native_preflight(self, *, native_command: str | None = None, native_workdir: str | None = None) -> dict:
        """Check whether a native launch command and working directory are usable.

        When no command is supplied, the command advertised in the ``native``
        backend's ``process`` capabilities is checked instead. An unset
        working directory counts as existing.

        Returns:
            A dict with ``ok``, ``command``, ``workdir``, ``command_available``,
            ``workdir_exists`` and a human-readable ``detail`` string.
        """
        raw_command = str(native_command or "").strip()
        parse_error = ""
        if raw_command:
            command_parts, parse_error = self._parse_native_command(raw_command)
        else:
            backend = self._runtime_backends.get("native")
            process_caps: dict = {}
            if backend is not None:
                process_caps = dict((backend.capabilities() or {}).get("process") or {})
            command_parts = [
                str(item or "").strip()
                for item in list(process_caps.get("command") or [])
                if str(item or "").strip()
            ]
        command_available = bool(command_parts and shutil.which(command_parts[0]))

        configured_workdir = str(native_workdir or "").strip()
        if configured_workdir:
            workdir = os.path.abspath(configured_workdir)
            workdir_exists = os.path.isdir(workdir)
        else:
            workdir = ""
            workdir_exists = True

        detail_parts: list[str] = []
        if not command_available:
            detail_parts.append("native command not available")
        if not workdir_exists:
            detail_parts.append("native workdir does not exist")
        if parse_error:
            detail_parts.append(parse_error)
        if not detail_parts:
            detail_parts.append("native launcher ready")
        return {
            "ok": bool(command_available and workdir_exists),
            "command": command_parts,
            "workdir": workdir,
            "command_available": command_available,
            "workdir_exists": workdir_exists,
            "detail": "; ".join(detail_parts),
        }

    @staticmethod
    def _parse_native_command(raw_command: str) -> tuple[list[str], str]:
        """Split a native command string into argv parts.

        Three notations are tried in order: a JSON list (``["a", "b"]``), a
        quoted comma-separated list (``"a", "b"``), and shell syntax.

        Returns:
            ``(parts, error)`` where ``error`` is empty on success and
            ``parts`` is empty on failure. Blank input yields ``([], "")``.
        """
        text = str(raw_command or "").strip()
        if not text:
            return [], ""

        if text.startswith("[") and text.endswith("]"):
            try:
                payload = json.loads(text)
            except (ValueError, RecursionError):
                return [], "native command JSON is invalid"
            if isinstance(payload, list):
                rows = [str(item or "").strip() for item in payload if str(item or "").strip()]
                if rows:
                    return rows, ""
                return [], "native command JSON list is empty"

        if "," in text and any(mark in text for mark in ['"', "'"]):
            try:
                fields = next(csv.reader([text], skipinitialspace=True))
            except csv.Error:
                fields = []  # fall through to shell-style parsing
            rows = [str(item or "").strip() for item in fields if str(item or "").strip()]
            if rows:
                return rows, ""

        try:
            tokens = shlex.split(text)
        except ValueError:
            return [], "native command format is invalid"
        rows = [str(item or "").strip() for item in tokens if str(item or "").strip()]
        if rows:
            return rows, ""
        return [], "native command is empty"

    # ------------------------------------------------------------------
    # Monitor packet buffer
    # ------------------------------------------------------------------

    def _record_monitor_packet(self, bot_id: str, packet: dict) -> None:
        """Append a packet to the bot's buffer under the next sequence number.

        Only the newest ``_MAX_RECENT_PACKETS`` rows are kept per bot.
        """
        rows = self._recent_packets.setdefault(bot_id, [])
        next_seq = int(self._packet_counters.get(bot_id, 0) or 0) + 1
        self._packet_counters[bot_id] = next_seq
        rows.append(
            {
                "protocol_version": NODE_PROTOCOL_VERSION,
                "node_id": EDGE_NODE_ID,
                "bot_id": bot_id,
                "seq": next_seq,
                "captured_at": self._utc_now_iso(),
                "packet": dict(packet or {}),
            }
        )
        if len(rows) > self._MAX_RECENT_PACKETS:
            del rows[: -self._MAX_RECENT_PACKETS]

    def _backfill_monitor_packets(self, bot_id: str) -> None:
        """Seed the packet buffer from recent logs, at most once per bot.

        The bot is marked as backfilled before the logs are read, so a
        failing backfill is not retried on later calls.
        """
        if bot_id in self._backfilled_bots:
            return
        self._backfilled_bots.add(bot_id)
        backend = self._backend_for_bot(bot_id)
        for line in backend.get_recent_logs(bot_id, tail=self._BACKFILL_LOG_TAIL):
            packet = backend.parse_monitor_packet(line)
            if packet:
                self._record_monitor_packet(bot_id, packet)

    # ------------------------------------------------------------------
    # Backend resolution
    # ------------------------------------------------------------------

    def _backend_for_bot(self, bot_id: str, preferred: str | None = None) -> EdgeRuntimeBackend:
        """Return the backend for ``bot_id``.

        Raises:
            HTTPException: 501 when no backend is registered for the resolved kind.
        """
        runtime_kind = self._resolve_runtime_kind(bot_id, preferred=preferred)
        backend = self._runtime_backends.get(runtime_kind)
        if backend is None:
            raise HTTPException(status_code=501, detail=f"dashboard-edge runtime is not available: {runtime_kind}")
        return backend

    def _resolve_runtime_kind(self, bot_id: str, preferred: str | None = None) -> str:
        """Pick the runtime kind for ``bot_id``.

        Precedence: the caller's preference, then the persisted runtime
        target, then any backend reporting the bot as ``RUNNING``, then the
        node-level default. The first two are used only if a backend of that
        kind is registered.
        """
        normalized_preferred = self._normalize_runtime_kind(preferred, allow_empty=True)
        if normalized_preferred and normalized_preferred in self._runtime_backends:
            return normalized_preferred

        persisted = self._normalize_runtime_kind(self._read_runtime_target(bot_id), allow_empty=True)
        if persisted and persisted in self._runtime_backends:
            return persisted

        for runtime_kind, backend in self._runtime_backends.items():
            try:
                status = str(backend.get_bot_status(bot_id) or "").strip().upper()
            except Exception:
                # Best-effort probe: a backend that cannot answer is skipped.
                continue
            if status == "RUNNING":
                return runtime_kind
        return self._runtime_kind()

    @staticmethod
    def _normalize_runtime_kind(value: str | None, *, allow_empty: bool = False) -> str:
        """Lower-case and validate a runtime kind.

        Unknown values become ``"docker"``. Blank input becomes ``""`` when
        ``allow_empty`` is set and ``"docker"`` otherwise.
        """
        text = str(value or "").strip().lower()
        if allow_empty and not text:
            return ""
        return text if text in {"docker", "native"} else "docker"

    # ------------------------------------------------------------------
    # Runtime-target persistence
    # ------------------------------------------------------------------

    @staticmethod
    def _target_path_under(root: str, bot_id: str) -> str:
        """Return ``<root>/<bot_id>/.nanobot/runtime-target.json``."""
        return os.path.join(root, str(bot_id or "").strip(), ".nanobot", "runtime-target.json")

    @staticmethod
    def _runtime_target_path(bot_id: str) -> str:
        """Return the runtime-target path inside the default bots workspace root."""
        return os.path.join(EDGE_BOTS_WORKSPACE_ROOT, str(bot_id or "").strip(), ".nanobot", "runtime-target.json")

    @staticmethod
    def _config_path(bot_id: str) -> str:
        """Return the bot's ``config.json`` path inside the default bots workspace root."""
        return os.path.join(EDGE_BOTS_WORKSPACE_ROOT, str(bot_id or "").strip(), ".nanobot", "config.json")

    def _read_runtime_target(self, bot_id: str) -> str:
        """Return the persisted runtime kind for ``bot_id`` (lower-cased), or ``""``."""
        payload = self._read_runtime_target_payload(bot_id)
        if isinstance(payload, dict):
            return str(payload.get("runtime_kind") or "").strip().lower()
        return ""

    def _read_runtime_target_payload(self, bot_id: str) -> dict:
        """Return the first readable runtime-target JSON object, or ``{}``.

        Missing, unreadable, malformed and non-object files are skipped.
        """
        for path in self._runtime_target_paths_for_read(bot_id):
            if not os.path.isfile(path):
                continue
            try:
                with open(path, "r", encoding="utf-8") as fh:
                    payload = json.load(fh)
            except (OSError, ValueError):
                # ValueError covers both invalid JSON and invalid UTF-8.
                continue
            if isinstance(payload, dict):
                return payload
        return {}

    def _write_runtime_target(self, *, bot_id: str, runtime_kind: str, workspace_root: str | None = None) -> None:
        """Persist the runtime kind (and optional workspace root) for ``bot_id``.

        The existing payload is updated rather than replaced. ``None`` for
        ``workspace_root`` keeps the stored value; a blank string removes it.
        When the target is written outside the default workspace, a stale
        file at the default location is removed.
        """
        payload = dict(self._read_runtime_target_payload(bot_id))
        payload["runtime_kind"] = self._normalize_runtime_kind(runtime_kind)
        if workspace_root is not None:
            normalized_root = str(workspace_root or "").strip()
            if normalized_root:
                payload["workspace_root"] = os.path.abspath(os.path.expanduser(normalized_root))
            else:
                payload.pop("workspace_root", None)

        paths = self._runtime_target_paths(bot_id=bot_id, payload=payload)
        for path in paths:
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, "w", encoding="utf-8") as fh:
                json.dump(payload, fh, ensure_ascii=False, indent=2)

        primary = self._runtime_target_path(bot_id)
        if primary not in paths and os.path.isfile(primary):
            try:
                os.remove(primary)
            except OSError:
                # Cleanup is best-effort; the new target has been written.
                pass

    def _runtime_target_paths(self, *, bot_id: str, payload: dict) -> list[str]:
        """Return where to write the runtime target.

        This is the external workspace when the payload names one that
        differs from the default location, otherwise the default location.
        """
        primary = self._runtime_target_path(bot_id)
        workspace_root = str(payload.get("workspace_root") or "").strip()
        if workspace_root:
            external = self._target_path_under(
                os.path.abspath(os.path.expanduser(workspace_root)),
                bot_id,
            )
            if os.path.abspath(external) != os.path.abspath(primary):
                return [external]
        return [primary]

    def _runtime_target_paths_for_read(self, bot_id: str) -> list[str]:
        """Return candidate runtime-target paths to read, external workspace first."""
        primary = self._runtime_target_path(bot_id)
        candidates: list[str] = [primary]
        workspace_root = self._workspace_root_from_config(bot_id)
        if workspace_root:
            external = self._target_path_under(workspace_root, bot_id)
            if os.path.abspath(external) != os.path.abspath(primary):
                candidates.insert(0, external)
        return candidates

    def _workspace_root_from_config(self, bot_id: str) -> str:
        """Derive the workspace root from ``agents.defaults.workspace`` in the bot's config.

        The configured path must contain ``/<bot_id>/.nanobot/workspace``;
        everything before the last occurrence of that marker is returned.
        Returns ``""`` when the config is missing, unreadable or does not
        match.
        """
        path = self._config_path(bot_id)
        if not os.path.isfile(path):
            return ""
        try:
            with open(path, "r", encoding="utf-8") as fh:
                payload = json.load(fh)
            if not isinstance(payload, dict):
                return ""
            agents = payload.get("agents")
            if not isinstance(agents, dict):
                return ""
            defaults = agents.get("defaults")
            if not isinstance(defaults, dict):
                return ""
            workspace = str(defaults.get("workspace") or "").strip()
            if not workspace:
                return ""
            normalized_workspace = os.path.abspath(os.path.expanduser(workspace))
            marker = f"{os.sep}{str(bot_id or '').strip()}{os.sep}.nanobot{os.sep}workspace"
            if marker in normalized_workspace:
                return normalized_workspace.rsplit(marker, 1)[0]
        except (OSError, ValueError):
            return ""
        return ""

    @staticmethod
    def _calc_workspace_used_bytes() -> int:
        """Sum the sizes of all files under the bots workspace root.

        Files whose size cannot be read are skipped.
        """
        total = 0
        for root, _, files in os.walk(EDGE_BOTS_WORKSPACE_ROOT):
            for filename in files:
                try:
                    total += int(os.path.getsize(os.path.join(root, filename)))
                except OSError:
                    continue
        return total
# Module-level service instance, created once when this module is imported.
edge_runtime_service = EdgeRuntimeService()