diff --git a/backend/core/docker_manager.py b/backend/core/docker_manager.py index 63e45c8..442b10b 100644 --- a/backend/core/docker_manager.py +++ b/backend/core/docker_manager.py @@ -13,6 +13,14 @@ import docker class BotDockerManager: _RUNTIME_BOOTSTRAP_LABEL_KEY = "dashboard.runtime_bootstrap" _RUNTIME_BOOTSTRAP_LABEL_VALUE = "env-json-v1" + _DASHBOARD_READY_LOG_MARKERS = ( + "nanobot.channels.dashboard:start", + "dashboard channel 代理已上线", + ) + _DASHBOARD_FAILURE_LOG_MARKERS = ( + "failed to start channel dashboard", + "dashboard channel not available", + ) def __init__( self, @@ -265,6 +273,32 @@ class BotDockerManager: return cpu_quota / cpu_period return 0.0 + @staticmethod + def _normalize_image_id(raw: Any) -> str: + text = str(raw or "").strip().lower() + if text.startswith("sha256:"): + return text[7:] + return text + + @classmethod + def _get_container_image_id(cls, container: Any) -> str: + attrs = getattr(container, "attrs", {}) or {} + image_id = attrs.get("Image") + if image_id: + return cls._normalize_image_id(image_id) + image = getattr(container, "image", None) + return cls._normalize_image_id(getattr(image, "id", "")) + + def _resolve_image_id(self, image_ref: str) -> str: + if not self.client: + return "" + try: + image = self.client.images.get(image_ref) + except Exception as e: + print(f"[DockerManager] failed to resolve image id for {image_ref}: {e}") + return "" + return self._normalize_image_id(getattr(image, "id", "")) + def _container_storage_matches(self, actual_storage_bytes: Optional[int], desired_storage_gb: int) -> bool: expected_storage_bytes = self._desired_storage_bytes(desired_storage_gb) if expected_storage_bytes is None: @@ -277,7 +311,7 @@ class BotDockerManager: self, container: Any, *, - image: str, + image_id: str, cpu_cores: float, memory_mb: int, storage_gb: int, @@ -285,10 +319,10 @@ class BotDockerManager: network_name: str, ) -> bool: attrs = getattr(container, "attrs", {}) or {} - config = attrs.get("Config") or {} host_cfg = attrs.get("HostConfig") or {} - current_image = str(config.get("Image") or "").strip() - if current_image != image: + current_image_id = self._get_container_image_id(container) + desired_image_id = self._normalize_image_id(image_id) + if not desired_image_id or not current_image_id or current_image_id != desired_image_id: return False if not self._container_uses_expected_bootstrap(container): return False @@ -367,6 +401,10 @@ class BotDockerManager: if not self.has_image(image): print(f"❌ 错误: 镜像不存在: {image}") return False + desired_image_id = self._resolve_image_id(image) + if not desired_image_id: + print(f"❌ 错误: 无法解析镜像 ID: {image}") + return False bot_workspace = os.path.join(self.host_data_root, bot_id, ".nanobot") container_name = f"worker_{bot_id}" @@ -397,25 +435,29 @@ class BotDockerManager: try: container = self.client.containers.get(container_name) container.reload() - if container.status in {"running", "restarting"} and self._container_uses_network(container, target_network): - if on_state_change: - self.ensure_monitor(bot_id, on_state_change) - return True - if container.status in {"running", "restarting"}: - print( - f"[DockerManager] recreating {container_name} to switch network " - f"from current attachment to '{target_network}'" - ) - container.remove(force=True) - elif self._container_matches_runtime( + runtime_matches = self._container_matches_runtime( container, - image=image, + image_id=desired_image_id, cpu_cores=cpu, memory_mb=memory, storage_gb=storage, bot_workspace=bot_workspace, network_name=target_network, - ): + ) + if container.status in {"running", "restarting"} and runtime_matches: + if on_state_change: + self.ensure_monitor(bot_id, on_state_change) + return True + if container.status in {"running", "restarting"}: + if not self._container_uses_network(container, target_network): + print( + f"[DockerManager] recreating {container_name} to switch network " + f"from current attachment to '{target_network}'" + ) + else: + print(f"[DockerManager] recreating {container_name} because container config no longer matches desired runtime") + container.remove(force=True) + elif runtime_matches: container.start() if on_state_change: self.ensure_monitor(bot_id, on_state_change) @@ -502,6 +544,11 @@ class BotDockerManager: media_paths = [str(v).strip().replace("\\", "/") for v in (media or []) if str(v).strip()] self._last_delivery_error.pop(bot_id, None) + if not self._wait_for_dashboard_ready(bot_id): + if bot_id not in self._last_delivery_error: + self._last_delivery_error[bot_id] = "Dashboard channel is not ready" + return False + # Primary path on Docker Desktop/Mac: execute curl inside container namespace. for attempt in range(3): if self._send_command_via_exec(bot_id, command, media_paths): @@ -520,6 +567,45 @@ class BotDockerManager: def get_last_delivery_error(self, bot_id: str) -> str: return str(self._last_delivery_error.get(bot_id, "") or "").strip() + @classmethod + def _log_indicates_dashboard_ready(cls, line: str) -> bool: + lowered = str(line or "").strip().lower() + return any(marker in lowered for marker in cls._DASHBOARD_READY_LOG_MARKERS) + + @classmethod + def _log_indicates_dashboard_failure(cls, line: str) -> bool: + lowered = str(line or "").strip().lower() + return any(marker in lowered for marker in cls._DASHBOARD_FAILURE_LOG_MARKERS) + + def _wait_for_dashboard_ready( + self, + bot_id: str, + timeout_seconds: float = 15.0, + poll_interval_seconds: float = 0.5, + ) -> bool: + deadline = time.monotonic() + max(1.0, timeout_seconds) + while time.monotonic() < deadline: + status = self.get_bot_status(bot_id) + if status != "RUNNING": + self._last_delivery_error[bot_id] = f"Container status is {status.lower()}" + return False + + logs = self.get_recent_logs(bot_id, tail=200) + for line in logs: + if self._log_indicates_dashboard_failure(line): + detail = str(line or "").strip() + self._last_delivery_error[bot_id] = detail[:300] if detail else "Dashboard channel failed to start" + return False + if self._log_indicates_dashboard_ready(line): + return True + + time.sleep(max(0.1, poll_interval_seconds)) + + self._last_delivery_error[bot_id] = ( + f"Dashboard channel was not ready within {int(max(1.0, timeout_seconds))}s" + ) + return False + def get_bot_status(self, bot_id: str) -> str: """Return normalized runtime status from Docker: RUNNING or STOPPED.""" if not self.client: diff --git a/backend/tests/test_docker_manager.py b/backend/tests/test_docker_manager.py index 8b7390e..5d776f4 100644 --- a/backend/tests/test_docker_manager.py +++ b/backend/tests/test_docker_manager.py @@ -2,7 +2,7 @@ import sys import tempfile import types import unittest -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch docker_stub = types.ModuleType("docker") docker_stub.errors = types.SimpleNamespace( @@ -38,6 +38,7 @@ class BotDockerManagerTests(unittest.TestCase): *, status: str, image: str, + image_id: str | None = None, nano_cpus: int, memory_bytes: int, storage_opt_size: str, @@ -45,13 +46,16 @@ class BotDockerManagerTests(unittest.TestCase): network_name: str, bootstrap_label: str | None = "env-json-v1", ) -> MagicMock: + actual_image_id = image_id or image container = MagicMock() container.status = status container.reload = MagicMock() container.start = MagicMock() container.stop = MagicMock() container.remove = MagicMock() + container.image = types.SimpleNamespace(id=actual_image_id) container.attrs = { + "Image": actual_image_id, "Config": { "Image": image, "Labels": ( @@ -111,17 +115,19 @@ class BotDockerManagerTests(unittest.TestCase): def test_start_bot_reuses_compatible_stopped_container(self) -> None: manager = self._make_manager() image_tag = "nanobot-base:v1" + image_id = "sha256:img-v1" workspace_mount = f"{self._tmpdir.name}/demo/.nanobot" container = self._build_container( status="exited", image=image_tag, + image_id=image_id, nano_cpus=1_000_000_000, memory_bytes=1024 * 1024 * 1024, storage_opt_size="10G", source_mount=workspace_mount, network_name="bridge", ) - manager.client.images.get.return_value = MagicMock() + manager.client.images.get.return_value = types.SimpleNamespace(id=image_id) manager.client.containers.get.return_value = container result = manager.start_bot( @@ -141,17 +147,51 @@ class BotDockerManagerTests(unittest.TestCase): def test_start_bot_recreates_incompatible_stopped_container(self) -> None: manager = self._make_manager() image_tag = "nanobot-base:v1" + desired_image_id = "sha256:img-v1" workspace_mount = f"{self._tmpdir.name}/demo/.nanobot" container = self._build_container( status="exited", image="nanobot-base:old", + image_id="sha256:img-old", nano_cpus=1_000_000_000, memory_bytes=1024 * 1024 * 1024, storage_opt_size="10G", source_mount=workspace_mount, network_name="bridge", ) - manager.client.images.get.return_value = MagicMock() + manager.client.images.get.return_value = types.SimpleNamespace(id=desired_image_id) + manager.client.containers.get.return_value = container + manager._run_container_with_storage_fallback = MagicMock(return_value=MagicMock()) + + result = manager.start_bot( + "demo", + image_tag=image_tag, + env_vars={"TZ": "Asia/Shanghai"}, + cpu_cores=1.0, + memory_mb=1024, + storage_gb=10, + ) + + self.assertTrue(result) + container.start.assert_not_called() + container.remove.assert_called_once_with(force=True) + manager._run_container_with_storage_fallback.assert_called_once() + + def test_start_bot_recreates_stopped_container_when_image_id_changes_under_same_tag(self) -> None: + manager = self._make_manager() + image_tag = "nanobot-base:v1" + workspace_mount = f"{self._tmpdir.name}/demo/.nanobot" + container = self._build_container( + status="exited", + image=image_tag, + image_id="sha256:img-old", + nano_cpus=1_000_000_000, + memory_bytes=1024 * 1024 * 1024, + storage_opt_size="10G", + source_mount=workspace_mount, + network_name="bridge", + ) + manager.client.images.get.return_value = types.SimpleNamespace(id="sha256:img-new") manager.client.containers.get.return_value = container manager._run_container_with_storage_fallback = MagicMock(return_value=MagicMock()) @@ -172,10 +212,12 @@ class BotDockerManagerTests(unittest.TestCase): def test_start_bot_recreates_container_without_new_entrypoint(self) -> None: manager = self._make_manager() image_tag = "nanobot-base:v1" + image_id = "sha256:img-v1" workspace_mount = f"{self._tmpdir.name}/demo/.nanobot" container = self._build_container( status="exited", image=image_tag, + image_id=image_id, nano_cpus=1_000_000_000, memory_bytes=1024 * 1024 * 1024, storage_opt_size="10G", @@ -183,7 +225,7 @@ class BotDockerManagerTests(unittest.TestCase): network_name="bridge", bootstrap_label=None, ) - manager.client.images.get.return_value = MagicMock() + manager.client.images.get.return_value = types.SimpleNamespace(id=image_id) manager.client.containers.get.return_value = container manager._run_container_with_storage_fallback = MagicMock(return_value=MagicMock()) @@ -201,6 +243,110 @@ class BotDockerManagerTests(unittest.TestCase): container.remove.assert_called_once_with(force=True) manager._run_container_with_storage_fallback.assert_called_once() + def test_start_bot_recreates_running_container_when_image_id_changes_under_same_tag(self) -> None: + manager = self._make_manager() + image_tag = "nanobot-base:v1" + workspace_mount = f"{self._tmpdir.name}/demo/.nanobot" + container = self._build_container( + status="running", + image=image_tag, + image_id="sha256:img-old", + nano_cpus=1_000_000_000, + memory_bytes=1024 * 1024 * 1024, + storage_opt_size="10G", + source_mount=workspace_mount, + network_name="bridge", + ) + manager.client.images.get.return_value = types.SimpleNamespace(id="sha256:img-new") + manager.client.containers.get.return_value = container + manager._run_container_with_storage_fallback = MagicMock(return_value=MagicMock()) + + result = manager.start_bot( + "demo", + image_tag=image_tag, + env_vars={"TZ": "Asia/Shanghai"}, + cpu_cores=1.0, + memory_mb=1024, + storage_gb=10, + ) + + self.assertTrue(result) + container.remove.assert_called_once_with(force=True) + manager._run_container_with_storage_fallback.assert_called_once() + + def test_send_command_waits_for_dashboard_ready(self) -> None: + manager = self._make_manager() + manager._wait_for_dashboard_ready = MagicMock(return_value=True) + manager._send_command_via_exec = MagicMock(return_value=True) + + result = manager.send_command("demo", "hello") + + self.assertTrue(result) + manager._wait_for_dashboard_ready.assert_called_once_with("demo") + manager._send_command_via_exec.assert_called_once_with("demo", "hello", []) + + def test_send_command_returns_false_when_dashboard_never_becomes_ready(self) -> None: + manager = self._make_manager() + def _wait_timeout(bot_id: str) -> bool: + manager._last_delivery_error[bot_id] = "Dashboard channel was not ready within 15s" + return False + + manager._wait_for_dashboard_ready = MagicMock(side_effect=_wait_timeout) + manager._send_command_via_exec = MagicMock() + manager._send_command_via_host_http = MagicMock() + + result = manager.send_command("demo", "hello") + + self.assertFalse(result) + manager._send_command_via_exec.assert_not_called() + manager._send_command_via_host_http.assert_not_called() + self.assertEqual( + manager.get_last_delivery_error("demo"), + "Dashboard channel was not ready within 15s", + ) + + def test_wait_for_dashboard_ready_returns_true_after_start_log(self) -> None: + manager = self._make_manager() + manager.get_bot_status = MagicMock(return_value="RUNNING") + manager.get_recent_logs = MagicMock( + side_effect=[ + ["Agent loop started"], + ["2026-04-25 | INFO | nanobot.channels.dashboard:start:66 - ready"], + ] + ) + + with patch("core.docker_manager.time.sleep", return_value=None): + result = manager._wait_for_dashboard_ready( + "demo", + timeout_seconds=2.0, + poll_interval_seconds=0.1, + ) + + self.assertTrue(result) + + def test_wait_for_dashboard_ready_sets_timeout_error(self) -> None: + manager = self._make_manager() + manager.get_bot_status = MagicMock(return_value="RUNNING") + manager.get_recent_logs = MagicMock(return_value=["Agent loop started"]) + + time_values = iter([0.0, 0.2, 0.4, 1.2]) + + with ( + patch("core.docker_manager.time.monotonic", side_effect=lambda: next(time_values)), + patch("core.docker_manager.time.sleep", return_value=None), + ): + result = manager._wait_for_dashboard_ready( + "demo", + timeout_seconds=1.0, + poll_interval_seconds=0.1, + ) + + self.assertFalse(result) + self.assertEqual( + manager.get_last_delivery_error("demo"), + "Dashboard channel was not ready within 1s", + ) + if __name__ == "__main__": unittest.main() diff --git a/bot-images/Dashboard.Dockerfile b/bot-images/Dashboard.Dockerfile index 31d8b0e..41871ad 100644 --- a/bot-images/Dashboard.Dockerfile +++ b/bot-images/Dashboard.Dockerfile @@ -1,30 +1,70 @@ -FROM python:3.12-slim +FROM python:3.12-slim AS builder ENV PYTHONUNBUFFERED=1 ENV LANG=C.UTF-8 ENV LC_ALL=C.UTF-8 ENV PYTHONIOENCODING=utf-8 +ENV PATH=/opt/venv/bin:$PATH # 1. 替换 Debian 源为国内镜像 RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources && \ sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources -# 2. 安装基础依赖 +# 2. 仅在构建阶段安装编译依赖 RUN apt-get update && apt-get install -y --no-install-recommends \ - curl \ gcc \ - libpq-dev \ && rm -rf /var/lib/apt/lists/* -# 3. 安装 aiohttp 和基础 python 工具 -RUN python -m pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ --upgrade \ - pip setuptools wheel aiohttp +RUN python -m venv /opt/venv WORKDIR /app -# 这一步会把您修改好的 nanobot/channels/dashboard.py 一起拷进去 -COPY . /app +COPY pyproject.toml README.md LICENSE THIRD_PARTY_NOTICES.md ./ +COPY nanobot/ nanobot/ +COPY bridge/ bridge/ -# 4. 安装 nanobot(包含 WeCom 渠道依赖) -RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ ".[wecom]" +# 3. 在 builder 中完成 Python 依赖安装,避免源码和编译工具进入最终镜像 +RUN python -m pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ --upgrade \ + --no-compile pip setuptools wheel aiohttp && \ + pip install --no-cache-dir --no-compile -i https://mirrors.aliyun.com/pypi/simple/ ".[wecom]" && \ + find /opt/venv -type d -name __pycache__ -prune -exec rm -rf {} + && \ + find /opt/venv -name '*.pyc' -delete + + +FROM python:3.12-slim +ENV PYTHONUNBUFFERED=1 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV PYTHONIOENCODING=utf-8 +ENV PATH=/opt/venv/bin:$PATH + +# 1. 替换 Debian 源为国内镜像 +RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources && \ + sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources + +# 2. 安装运行时依赖与技能所需 CLI +RUN apt-get update && apt-get install -y --no-install-recommends \ + bubblewrap \ + ca-certificates \ + curl \ + git \ + gnupg \ + openssh-client \ + tmux \ + && mkdir -p /etc/apt/keyrings /etc/apt/sources.list.d \ + && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \ + && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_22.x nodistro main" > /etc/apt/sources.list.d/nodesource.list \ + && curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg > /etc/apt/keyrings/githubcli-archive-keyring.gpg \ + && chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" > /etc/apt/sources.list.d/github-cli.list \ + && apt-get update && apt-get install -y --no-install-recommends \ + gh \ + nodejs \ + && apt-get purge -y --auto-remove gnupg \ + && git config --global --add url."https://github.com/".insteadOf ssh://git@github.com/ \ + && git config --global --add url."https://github.com/".insteadOf git@github.com: \ + && rm -rf /var/lib/apt/lists/* + +# 3. 仅复制已安装好的运行环境,避免把源码目录打进最终镜像 +COPY --from=builder /opt/venv /opt/venv WORKDIR /root # 官方 gateway 模式,现在它会自动加载您的 DashboardChannel diff --git a/frontend/src/app/BotRouteAccessGate.tsx b/frontend/src/app/BotRouteAccessGate.tsx index 924545f..1b07f48 100644 --- a/frontend/src/app/BotRouteAccessGate.tsx +++ b/frontend/src/app/BotRouteAccessGate.tsx @@ -120,6 +120,7 @@ export function BotRouteAccessGate({ const customEvent = event as CustomEvent<{ botId?: string }>; const invalidBotId = String(customEvent.detail?.botId || '').trim(); if (!invalidBotId || invalidBotId !== normalizedBotId) return; + if (!unlocked) return; setUnlocked(false); setAuthRefreshNonce((value) => value + 1); setPassword(''); @@ -128,7 +129,7 @@ export function BotRouteAccessGate({ }; window.addEventListener(BOT_AUTH_INVALID_EVENT, handleBotAuthInvalid as EventListener); return () => window.removeEventListener(BOT_AUTH_INVALID_EVENT, handleBotAuthInvalid as EventListener); - }, [copy.errorExpired, normalizedBotId, passwordEnabled]); + }, [copy.errorExpired, normalizedBotId, passwordEnabled, unlocked]); const unlockBot = async () => { const entered = String(password || '').trim(); diff --git a/frontend/src/app/PanelLoginGate.tsx b/frontend/src/app/PanelLoginGate.tsx index 25dab73..78e5259 100644 --- a/frontend/src/app/PanelLoginGate.tsx +++ b/frontend/src/app/PanelLoginGate.tsx @@ -82,6 +82,7 @@ export function PanelLoginGate({ useEffect(() => { if (typeof window === 'undefined' || bypass) return undefined; const handlePanelAuthInvalid = () => { + if (!authenticated) return; setRequired(true); setAuthenticated(false); setChecking(false); @@ -94,7 +95,7 @@ export function PanelLoginGate({ }; window.addEventListener(PANEL_AUTH_INVALID_EVENT, handlePanelAuthInvalid); return () => window.removeEventListener(PANEL_AUTH_INVALID_EVENT, handlePanelAuthInvalid); - }, [bypass, isZh]); + }, [authenticated, bypass, isZh]); const onSubmit = async () => { const next = String(password || '').trim();