main
mula.liu 2026-04-24 16:57:29 +08:00
parent ad2af1e71f
commit aeaaa4fde5
3 changed files with 369 additions and 8 deletions

View File

@ -11,6 +11,9 @@ import docker
class BotDockerManager: class BotDockerManager:
_RUNTIME_BOOTSTRAP_LABEL_KEY = "dashboard.runtime_bootstrap"
_RUNTIME_BOOTSTRAP_LABEL_VALUE = "env-json-v1"
def __init__( def __init__(
self, self,
host_data_root: str, host_data_root: str,
@ -180,6 +183,136 @@ class BotDockerManager:
return str(network_settings.get("IPAddress") or "").strip() return str(network_settings.get("IPAddress") or "").strip()
@classmethod
def _container_uses_expected_bootstrap(cls, container: Any) -> bool:
attrs = getattr(container, "attrs", {}) or {}
config = attrs.get("Config") or {}
labels = config.get("Labels") or {}
return str(labels.get(cls._RUNTIME_BOOTSTRAP_LABEL_KEY) or "").strip() == cls._RUNTIME_BOOTSTRAP_LABEL_VALUE
@staticmethod
def _runtime_bootstrap_entrypoint() -> List[str]:
bootstrap_code = "\n".join(
[
"import json",
"import os",
"import pathlib",
"import re",
"",
"path = pathlib.Path('/root/.nanobot/env.json')",
"pattern = re.compile(r'^[A-Z_][A-Z0-9_]{0,127}$')",
"data = {}",
"if path.is_file():",
" try:",
" data = json.loads(path.read_text(encoding='utf-8'))",
" except Exception:",
" data = {}",
"if not isinstance(data, dict):",
" data = {}",
"for raw_key, raw_value in data.items():",
" key = str(raw_key or '').strip().upper()",
" if not pattern.fullmatch(key):",
" continue",
" os.environ[key] = str(raw_value or '').strip()",
"os.execvp('nanobot', ['nanobot', 'gateway'])",
]
)
return [
"python",
"-c",
bootstrap_code,
]
@staticmethod
def _container_has_mount(container: Any, source: str, destination: str) -> bool:
attrs = getattr(container, "attrs", {}) or {}
mounts = attrs.get("Mounts") or []
expected_source = os.path.normpath(source)
expected_destination = str(destination or "").strip()
for mount in mounts:
if not isinstance(mount, dict):
continue
current_source = os.path.normpath(str(mount.get("Source") or ""))
current_destination = str(mount.get("Destination") or "").strip()
if current_source != expected_source or current_destination != expected_destination:
continue
if mount.get("RW") is False:
continue
return True
return False
@staticmethod
def _desired_memory_bytes(memory_mb: int) -> int:
return int(memory_mb) * 1024 * 1024 if int(memory_mb or 0) > 0 else 0
@staticmethod
def _desired_storage_bytes(storage_gb: int) -> Optional[int]:
storage = int(storage_gb or 0)
if storage <= 0:
return None
return storage * 1024 * 1024 * 1024
@staticmethod
def _get_container_cpu_cores(container: Any) -> float:
attrs = getattr(container, "attrs", {}) or {}
host_cfg = attrs.get("HostConfig") or {}
nano_cpus = int(host_cfg.get("NanoCpus") or 0)
if nano_cpus > 0:
return nano_cpus / 1_000_000_000
cpu_quota = int(host_cfg.get("CpuQuota") or 0)
cpu_period = int(host_cfg.get("CpuPeriod") or 0)
if cpu_quota > 0 and cpu_period > 0:
return cpu_quota / cpu_period
return 0.0
def _container_storage_matches(self, actual_storage_bytes: Optional[int], desired_storage_gb: int) -> bool:
expected_storage_bytes = self._desired_storage_bytes(desired_storage_gb)
if expected_storage_bytes is None:
return actual_storage_bytes in {None, 0}
if actual_storage_bytes == expected_storage_bytes:
return True
return actual_storage_bytes is None and self._storage_limit_supported is not True
def _container_matches_runtime(
self,
container: Any,
*,
image: str,
cpu_cores: float,
memory_mb: int,
storage_gb: int,
bot_workspace: str,
network_name: str,
) -> bool:
attrs = getattr(container, "attrs", {}) or {}
config = attrs.get("Config") or {}
host_cfg = attrs.get("HostConfig") or {}
current_image = str(config.get("Image") or "").strip()
if current_image != image:
return False
if not self._container_uses_expected_bootstrap(container):
return False
if not self._container_uses_network(container, network_name):
return False
if not self._container_has_mount(container, bot_workspace, "/root/.nanobot"):
return False
actual_memory_bytes = int(host_cfg.get("Memory") or 0)
if actual_memory_bytes != self._desired_memory_bytes(memory_mb):
return False
desired_cpu = float(cpu_cores or 0)
actual_cpu = self._get_container_cpu_cores(container)
if abs(actual_cpu - desired_cpu) > 0.01:
return False
storage_opt = host_cfg.get("StorageOpt") or {}
actual_storage_bytes = self._parse_size_to_bytes(storage_opt.get("size"))
if not self._container_storage_matches(actual_storage_bytes, storage_gb):
return False
return True
def _run_container_with_storage_fallback( def _run_container_with_storage_fallback(
self, self,
bot_id: str, bot_id: str,
@ -246,7 +379,10 @@ class BotDockerManager:
"detach": True, "detach": True,
"stdin_open": True, "stdin_open": True,
"tty": True, "tty": True,
"environment": env_vars or {}, "entrypoint": self._runtime_bootstrap_entrypoint(),
"labels": {
self._RUNTIME_BOOTSTRAP_LABEL_KEY: self._RUNTIME_BOOTSTRAP_LABEL_VALUE,
},
"volumes": { "volumes": {
bot_workspace: {"bind": "/root/.nanobot", "mode": "rw"}, bot_workspace: {"bind": "/root/.nanobot", "mode": "rw"},
}, },
@ -261,16 +397,32 @@ class BotDockerManager:
try: try:
container = self.client.containers.get(container_name) container = self.client.containers.get(container_name)
container.reload() container.reload()
if container.status == "running" and self._container_uses_network(container, target_network): if container.status in {"running", "restarting"} and self._container_uses_network(container, target_network):
if on_state_change: if on_state_change:
self.ensure_monitor(bot_id, on_state_change) self.ensure_monitor(bot_id, on_state_change)
return True return True
if container.status == "running": if container.status in {"running", "restarting"}:
print( print(
f"[DockerManager] recreating {container_name} to switch network " f"[DockerManager] recreating {container_name} to switch network "
f"from current attachment to '{target_network}'" f"from current attachment to '{target_network}'"
) )
container.remove(force=True) container.remove(force=True)
elif self._container_matches_runtime(
container,
image=image,
cpu_cores=cpu,
memory_mb=memory,
storage_gb=storage,
bot_workspace=bot_workspace,
network_name=target_network,
):
container.start()
if on_state_change:
self.ensure_monitor(bot_id, on_state_change)
return True
else:
print(f"[DockerManager] recreating {container_name} because container config no longer matches desired runtime")
container.remove(force=True)
except docker.errors.NotFound: except docker.errors.NotFound:
pass pass
@ -322,14 +474,17 @@ class BotDockerManager:
print(f"[DockerManager] Error ensuring monitor for {bot_id}: {e}") print(f"[DockerManager] Error ensuring monitor for {bot_id}: {e}")
return False return False
def stop_bot(self, bot_id: str) -> bool: def stop_bot(self, bot_id: str, remove: bool = False) -> bool:
if not self.client: if not self.client:
return False return False
container_name = f"worker_{bot_id}" container_name = f"worker_{bot_id}"
try: try:
container = self.client.containers.get(container_name) container = self.client.containers.get(container_name)
container.stop(timeout=5) container.reload()
container.remove() if str(container.status or "").strip().lower() in {"running", "restarting", "paused"}:
container.stop(timeout=5)
if remove:
container.remove()
self.active_monitors.pop(bot_id, None) self.active_monitors.pop(bot_id, None)
return True return True
except docker.errors.NotFound: except docker.errors.NotFound:

View File

@ -125,7 +125,7 @@ def deactivate_bot_instance(session: Session, bot_id: str) -> Dict[str, Any]:
def delete_bot_instance(session: Session, bot_id: str, delete_workspace: bool = True) -> Dict[str, Any]: def delete_bot_instance(session: Session, bot_id: str, delete_workspace: bool = True) -> Dict[str, Any]:
bot = _get_bot_or_404(session, bot_id) bot = _get_bot_or_404(session, bot_id)
docker_manager.stop_bot(bot_id) docker_manager.stop_bot(bot_id, remove=True)
messages = session.exec(select(BotMessage).where(BotMessage.bot_id == bot_id)).all() messages = session.exec(select(BotMessage).where(BotMessage.bot_id == bot_id)).all()
for row in messages: for row in messages:

View File

@ -0,0 +1,206 @@
import sys
import tempfile
import types
import unittest
from unittest.mock import MagicMock
docker_stub = types.ModuleType("docker")
docker_stub.errors = types.SimpleNamespace(
ImageNotFound=type("ImageNotFound", (Exception,), {}),
NotFound=type("NotFound", (Exception,), {}),
)
sys.modules.setdefault("docker", docker_stub)
from core.docker_manager import BotDockerManager
class BotDockerManagerTests(unittest.TestCase):
def setUp(self) -> None:
self._tmpdir = tempfile.TemporaryDirectory()
def tearDown(self) -> None:
self._tmpdir.cleanup()
def _make_manager(self) -> BotDockerManager:
manager = BotDockerManager.__new__(BotDockerManager)
manager.client = MagicMock()
manager.host_data_root = self._tmpdir.name
manager.base_image = "nanobot-base"
manager.network_name = ""
manager.active_monitors = {}
manager._last_delivery_error = {}
manager._storage_limit_supported = True
manager._storage_limit_warning_emitted = False
return manager
@staticmethod
def _build_container(
*,
status: str,
image: str,
nano_cpus: int,
memory_bytes: int,
storage_opt_size: str,
source_mount: str,
network_name: str,
bootstrap_label: str | None = "env-json-v1",
) -> MagicMock:
container = MagicMock()
container.status = status
container.reload = MagicMock()
container.start = MagicMock()
container.stop = MagicMock()
container.remove = MagicMock()
container.attrs = {
"Config": {
"Image": image,
"Labels": (
{"dashboard.runtime_bootstrap": bootstrap_label}
if bootstrap_label is not None
else {}
),
},
"HostConfig": {
"NanoCpus": nano_cpus,
"Memory": memory_bytes,
"StorageOpt": {"size": storage_opt_size},
},
"Mounts": [
{
"Source": source_mount,
"Destination": "/root/.nanobot",
"RW": True,
}
],
"NetworkSettings": {
"Networks": {network_name: {"IPAddress": "172.18.0.2"}},
},
}
return container
def test_stop_bot_keeps_container_by_default(self) -> None:
manager = self._make_manager()
container = MagicMock()
container.status = "running"
container.reload = MagicMock()
container.stop = MagicMock()
container.remove = MagicMock()
manager.client.containers.get.return_value = container
result = manager.stop_bot("demo")
self.assertTrue(result)
container.stop.assert_called_once_with(timeout=5)
container.remove.assert_not_called()
def test_stop_bot_remove_true_deletes_container(self) -> None:
manager = self._make_manager()
container = MagicMock()
container.status = "exited"
container.reload = MagicMock()
container.stop = MagicMock()
container.remove = MagicMock()
manager.client.containers.get.return_value = container
result = manager.stop_bot("demo", remove=True)
self.assertTrue(result)
container.stop.assert_not_called()
container.remove.assert_called_once_with()
def test_start_bot_reuses_compatible_stopped_container(self) -> None:
manager = self._make_manager()
image_tag = "nanobot-base:v1"
workspace_mount = f"{self._tmpdir.name}/demo/.nanobot"
container = self._build_container(
status="exited",
image=image_tag,
nano_cpus=1_000_000_000,
memory_bytes=1024 * 1024 * 1024,
storage_opt_size="10G",
source_mount=workspace_mount,
network_name="bridge",
)
manager.client.images.get.return_value = MagicMock()
manager.client.containers.get.return_value = container
result = manager.start_bot(
"demo",
image_tag=image_tag,
env_vars={"TZ": "UTC", "API_KEY": "updated-secret"},
cpu_cores=1.0,
memory_mb=1024,
storage_gb=10,
)
self.assertTrue(result)
container.start.assert_called_once_with()
container.remove.assert_not_called()
manager.client.containers.run.assert_not_called()
def test_start_bot_recreates_incompatible_stopped_container(self) -> None:
manager = self._make_manager()
image_tag = "nanobot-base:v1"
workspace_mount = f"{self._tmpdir.name}/demo/.nanobot"
container = self._build_container(
status="exited",
image="nanobot-base:old",
nano_cpus=1_000_000_000,
memory_bytes=1024 * 1024 * 1024,
storage_opt_size="10G",
source_mount=workspace_mount,
network_name="bridge",
)
manager.client.images.get.return_value = MagicMock()
manager.client.containers.get.return_value = container
manager._run_container_with_storage_fallback = MagicMock(return_value=MagicMock())
result = manager.start_bot(
"demo",
image_tag=image_tag,
env_vars={"TZ": "Asia/Shanghai"},
cpu_cores=1.0,
memory_mb=1024,
storage_gb=10,
)
self.assertTrue(result)
container.start.assert_not_called()
container.remove.assert_called_once_with(force=True)
manager._run_container_with_storage_fallback.assert_called_once()
def test_start_bot_recreates_container_without_new_entrypoint(self) -> None:
manager = self._make_manager()
image_tag = "nanobot-base:v1"
workspace_mount = f"{self._tmpdir.name}/demo/.nanobot"
container = self._build_container(
status="exited",
image=image_tag,
nano_cpus=1_000_000_000,
memory_bytes=1024 * 1024 * 1024,
storage_opt_size="10G",
source_mount=workspace_mount,
network_name="bridge",
bootstrap_label=None,
)
manager.client.images.get.return_value = MagicMock()
manager.client.containers.get.return_value = container
manager._run_container_with_storage_fallback = MagicMock(return_value=MagicMock())
result = manager.start_bot(
"demo",
image_tag=image_tag,
env_vars={"TZ": "Asia/Shanghai"},
cpu_cores=1.0,
memory_mb=1024,
storage_gb=10,
)
self.assertTrue(result)
container.start.assert_not_called()
container.remove.assert_called_once_with(force=True)
manager._run_container_with_storage_fallback.assert_called_once()
if __name__ == "__main__":
unittest.main()