From aeaaa4fde5fb04edc0609ebda57b53a29c34958a Mon Sep 17 00:00:00 2001 From: "mula.liu" Date: Fri, 24 Apr 2026 16:57:29 +0800 Subject: [PATCH] v0.1.5 --- backend/core/docker_manager.py | 169 +++++++++++++++++- backend/services/bot_lifecycle_service.py | 2 +- backend/tests/test_docker_manager.py | 206 ++++++++++++++++++++++ 3 files changed, 369 insertions(+), 8 deletions(-) create mode 100644 backend/tests/test_docker_manager.py diff --git a/backend/core/docker_manager.py b/backend/core/docker_manager.py index 60b69c5..63e45c8 100644 --- a/backend/core/docker_manager.py +++ b/backend/core/docker_manager.py @@ -11,6 +11,9 @@ import docker class BotDockerManager: + _RUNTIME_BOOTSTRAP_LABEL_KEY = "dashboard.runtime_bootstrap" + _RUNTIME_BOOTSTRAP_LABEL_VALUE = "env-json-v1" + def __init__( self, host_data_root: str, @@ -180,6 +183,136 @@ class BotDockerManager: return str(network_settings.get("IPAddress") or "").strip() + @classmethod + def _container_uses_expected_bootstrap(cls, container: Any) -> bool: + attrs = getattr(container, "attrs", {}) or {} + config = attrs.get("Config") or {} + labels = config.get("Labels") or {} + return str(labels.get(cls._RUNTIME_BOOTSTRAP_LABEL_KEY) or "").strip() == cls._RUNTIME_BOOTSTRAP_LABEL_VALUE + + @staticmethod + def _runtime_bootstrap_entrypoint() -> List[str]: + bootstrap_code = "\n".join( + [ + "import json", + "import os", + "import pathlib", + "import re", + "", + "path = pathlib.Path('/root/.nanobot/env.json')", + "pattern = re.compile(r'^[A-Z_][A-Z0-9_]{0,127}$')", + "data = {}", + "if path.is_file():", + " try:", + " data = json.loads(path.read_text(encoding='utf-8'))", + " except Exception:", + " data = {}", + "if not isinstance(data, dict):", + " data = {}", + "for raw_key, raw_value in data.items():", + " key = str(raw_key or '').strip().upper()", + " if not pattern.fullmatch(key):", + " continue", + " os.environ[key] = str(raw_value or '').strip()", + "os.execvp('nanobot', ['nanobot', 'gateway'])", + ] + ) + return [ + "python", + "-c", + bootstrap_code, + ] + + @staticmethod + def _container_has_mount(container: Any, source: str, destination: str) -> bool: + attrs = getattr(container, "attrs", {}) or {} + mounts = attrs.get("Mounts") or [] + expected_source = os.path.normpath(source) + expected_destination = str(destination or "").strip() + for mount in mounts: + if not isinstance(mount, dict): + continue + current_source = os.path.normpath(str(mount.get("Source") or "")) + current_destination = str(mount.get("Destination") or "").strip() + if current_source != expected_source or current_destination != expected_destination: + continue + if mount.get("RW") is False: + continue + return True + return False + + @staticmethod + def _desired_memory_bytes(memory_mb: int) -> int: + return int(memory_mb) * 1024 * 1024 if int(memory_mb or 0) > 0 else 0 + + @staticmethod + def _desired_storage_bytes(storage_gb: int) -> Optional[int]: + storage = int(storage_gb or 0) + if storage <= 0: + return None + return storage * 1024 * 1024 * 1024 + + @staticmethod + def _get_container_cpu_cores(container: Any) -> float: + attrs = getattr(container, "attrs", {}) or {} + host_cfg = attrs.get("HostConfig") or {} + nano_cpus = int(host_cfg.get("NanoCpus") or 0) + if nano_cpus > 0: + return nano_cpus / 1_000_000_000 + cpu_quota = int(host_cfg.get("CpuQuota") or 0) + cpu_period = int(host_cfg.get("CpuPeriod") or 0) + if cpu_quota > 0 and cpu_period > 0: + return cpu_quota / cpu_period + return 0.0 + + def _container_storage_matches(self, actual_storage_bytes: Optional[int], desired_storage_gb: int) -> bool: + expected_storage_bytes = self._desired_storage_bytes(desired_storage_gb) + if expected_storage_bytes is None: + return actual_storage_bytes in {None, 0} + if actual_storage_bytes == expected_storage_bytes: + return True + return actual_storage_bytes is None and self._storage_limit_supported is not True + + def _container_matches_runtime( + self, + container: Any, + *, + image: str, + cpu_cores: float, + memory_mb: int, + storage_gb: int, + bot_workspace: str, + network_name: str, + ) -> bool: + attrs = getattr(container, "attrs", {}) or {} + config = attrs.get("Config") or {} + host_cfg = attrs.get("HostConfig") or {} + current_image = str(config.get("Image") or "").strip() + if current_image != image: + return False + if not self._container_uses_expected_bootstrap(container): + return False + if not self._container_uses_network(container, network_name): + return False + if not self._container_has_mount(container, bot_workspace, "/root/.nanobot"): + return False + + actual_memory_bytes = int(host_cfg.get("Memory") or 0) + if actual_memory_bytes != self._desired_memory_bytes(memory_mb): + return False + + desired_cpu = float(cpu_cores or 0) + actual_cpu = self._get_container_cpu_cores(container) + if abs(actual_cpu - desired_cpu) > 0.01: + return False + + storage_opt = host_cfg.get("StorageOpt") or {} + actual_storage_bytes = self._parse_size_to_bytes(storage_opt.get("size")) + if not self._container_storage_matches(actual_storage_bytes, storage_gb): + return False + + return True + def _run_container_with_storage_fallback( self, bot_id: str, @@ -246,7 +379,10 @@ class BotDockerManager: "detach": True, "stdin_open": True, "tty": True, - "environment": env_vars or {}, + "entrypoint": self._runtime_bootstrap_entrypoint(), + "labels": { + self._RUNTIME_BOOTSTRAP_LABEL_KEY: self._RUNTIME_BOOTSTRAP_LABEL_VALUE, + }, "volumes": { bot_workspace: {"bind": "/root/.nanobot", "mode": "rw"}, }, @@ -261,16 +397,32 @@ class BotDockerManager: try: container = self.client.containers.get(container_name) container.reload() - if container.status == "running" and self._container_uses_network(container, target_network): + if container.status in {"running", "restarting"} and self._container_uses_network(container, target_network): if on_state_change: self.ensure_monitor(bot_id, on_state_change) return True - if container.status == "running": + if container.status in {"running", "restarting"}: print( f"[DockerManager] recreating {container_name} to switch network " f"from current attachment to '{target_network}'" ) - container.remove(force=True) + container.remove(force=True) + elif self._container_matches_runtime( + container, + image=image, + cpu_cores=cpu, + memory_mb=memory, + storage_gb=storage, + bot_workspace=bot_workspace, + network_name=target_network, + ): + container.start() + if on_state_change: + self.ensure_monitor(bot_id, on_state_change) + return True + else: + print(f"[DockerManager] recreating {container_name} because container config no longer matches desired runtime") + container.remove(force=True) except docker.errors.NotFound: pass @@ -322,14 +474,17 @@ class BotDockerManager: print(f"[DockerManager] Error ensuring monitor for {bot_id}: {e}") return False - def stop_bot(self, bot_id: str) -> bool: + def stop_bot(self, bot_id: str, remove: bool = False) -> bool: if not self.client: return False container_name = f"worker_{bot_id}" try: container = self.client.containers.get(container_name) - container.stop(timeout=5) - container.remove() + container.reload() + if str(container.status or "").strip().lower() in {"running", "restarting", "paused"}: + container.stop(timeout=5) + if remove: + container.remove() self.active_monitors.pop(bot_id, None) return True except docker.errors.NotFound: diff --git a/backend/services/bot_lifecycle_service.py b/backend/services/bot_lifecycle_service.py index 3af28f6..9e15c82 100644 --- a/backend/services/bot_lifecycle_service.py +++ b/backend/services/bot_lifecycle_service.py @@ -125,7 +125,7 @@ def deactivate_bot_instance(session: Session, bot_id: str) -> Dict[str, Any]: def delete_bot_instance(session: Session, bot_id: str, delete_workspace: bool = True) -> Dict[str, Any]: bot = _get_bot_or_404(session, bot_id) - docker_manager.stop_bot(bot_id) + docker_manager.stop_bot(bot_id, remove=True) messages = session.exec(select(BotMessage).where(BotMessage.bot_id == bot_id)).all() for row in messages: diff --git a/backend/tests/test_docker_manager.py b/backend/tests/test_docker_manager.py new file mode 100644 index 0000000..8b7390e --- /dev/null +++ b/backend/tests/test_docker_manager.py @@ -0,0 +1,206 @@ +import sys +import tempfile +import types +import unittest +from unittest.mock import MagicMock + +docker_stub = types.ModuleType("docker") +docker_stub.errors = types.SimpleNamespace( + ImageNotFound=type("ImageNotFound", (Exception,), {}), + NotFound=type("NotFound", (Exception,), {}), +) +sys.modules.setdefault("docker", docker_stub) + +from core.docker_manager import BotDockerManager + + +class BotDockerManagerTests(unittest.TestCase): + def setUp(self) -> None: + self._tmpdir = tempfile.TemporaryDirectory() + + def tearDown(self) -> None: + self._tmpdir.cleanup() + + def _make_manager(self) -> BotDockerManager: + manager = BotDockerManager.__new__(BotDockerManager) + manager.client = MagicMock() + manager.host_data_root = self._tmpdir.name + manager.base_image = "nanobot-base" + manager.network_name = "" + manager.active_monitors = {} + manager._last_delivery_error = {} + manager._storage_limit_supported = True + manager._storage_limit_warning_emitted = False + return manager + + @staticmethod + def _build_container( + *, + status: str, + image: str, + nano_cpus: int, + memory_bytes: int, + storage_opt_size: str, + source_mount: str, + network_name: str, + bootstrap_label: str | None = "env-json-v1", + ) -> MagicMock: + container = MagicMock() + container.status = status + container.reload = MagicMock() + container.start = MagicMock() + container.stop = MagicMock() + container.remove = MagicMock() + container.attrs = { + "Config": { + "Image": image, + "Labels": ( + {"dashboard.runtime_bootstrap": bootstrap_label} + if bootstrap_label is not None + else {} + ), + }, + "HostConfig": { + "NanoCpus": nano_cpus, + "Memory": memory_bytes, + "StorageOpt": {"size": storage_opt_size}, + }, + "Mounts": [ + { + "Source": source_mount, + "Destination": "/root/.nanobot", + "RW": True, + } + ], + "NetworkSettings": { + "Networks": {network_name: {"IPAddress": "172.18.0.2"}}, + }, + } + return container + + def test_stop_bot_keeps_container_by_default(self) -> None: + manager = self._make_manager() + container = MagicMock() + container.status = "running" + container.reload = MagicMock() + container.stop = MagicMock() + container.remove = MagicMock() + manager.client.containers.get.return_value = container + + result = manager.stop_bot("demo") + + self.assertTrue(result) + container.stop.assert_called_once_with(timeout=5) + container.remove.assert_not_called() + + def test_stop_bot_remove_true_deletes_container(self) -> None: + manager = self._make_manager() + container = MagicMock() + container.status = "exited" + container.reload = MagicMock() + container.stop = MagicMock() + container.remove = MagicMock() + manager.client.containers.get.return_value = container + + result = manager.stop_bot("demo", remove=True) + + self.assertTrue(result) + container.stop.assert_not_called() + container.remove.assert_called_once_with() + + def test_start_bot_reuses_compatible_stopped_container(self) -> None: + manager = self._make_manager() + image_tag = "nanobot-base:v1" + workspace_mount = f"{self._tmpdir.name}/demo/.nanobot" + container = self._build_container( + status="exited", + image=image_tag, + nano_cpus=1_000_000_000, + memory_bytes=1024 * 1024 * 1024, + storage_opt_size="10G", + source_mount=workspace_mount, + network_name="bridge", + ) + manager.client.images.get.return_value = MagicMock() + manager.client.containers.get.return_value = container + + result = manager.start_bot( + "demo", + image_tag=image_tag, + env_vars={"TZ": "UTC", "API_KEY": "updated-secret"}, + cpu_cores=1.0, + memory_mb=1024, + storage_gb=10, + ) + + self.assertTrue(result) + container.start.assert_called_once_with() + container.remove.assert_not_called() + manager.client.containers.run.assert_not_called() + + def test_start_bot_recreates_incompatible_stopped_container(self) -> None: + manager = self._make_manager() + image_tag = "nanobot-base:v1" + workspace_mount = f"{self._tmpdir.name}/demo/.nanobot" + container = self._build_container( + status="exited", + image="nanobot-base:old", + nano_cpus=1_000_000_000, + memory_bytes=1024 * 1024 * 1024, + storage_opt_size="10G", + source_mount=workspace_mount, + network_name="bridge", + ) + manager.client.images.get.return_value = MagicMock() + manager.client.containers.get.return_value = container + manager._run_container_with_storage_fallback = MagicMock(return_value=MagicMock()) + + result = manager.start_bot( + "demo", + image_tag=image_tag, + env_vars={"TZ": "Asia/Shanghai"}, + cpu_cores=1.0, + memory_mb=1024, + storage_gb=10, + ) + + self.assertTrue(result) + container.start.assert_not_called() + container.remove.assert_called_once_with(force=True) + manager._run_container_with_storage_fallback.assert_called_once() + + def test_start_bot_recreates_container_without_new_entrypoint(self) -> None: + manager = self._make_manager() + image_tag = "nanobot-base:v1" + workspace_mount = f"{self._tmpdir.name}/demo/.nanobot" + container = self._build_container( + status="exited", + image=image_tag, + nano_cpus=1_000_000_000, + memory_bytes=1024 * 1024 * 1024, + storage_opt_size="10G", + source_mount=workspace_mount, + network_name="bridge", + bootstrap_label=None, + ) + manager.client.images.get.return_value = MagicMock() + manager.client.containers.get.return_value = container + manager._run_container_with_storage_fallback = MagicMock(return_value=MagicMock()) + + result = manager.start_bot( + "demo", + image_tag=image_tag, + env_vars={"TZ": "Asia/Shanghai"}, + cpu_cores=1.0, + memory_mb=1024, + storage_gb=10, + ) + + self.assertTrue(result) + container.start.assert_not_called() + container.remove.assert_called_once_with(force=True) + manager._run_container_with_storage_fallback.assert_called_once() + + +if __name__ == "__main__": + unittest.main()