"""
Container lifecycle operations via docker compose.

Three operations:
  restart  - docker restart of the matching containers (fast, no downtime window)
  rebuild  - docker compose down && docker compose up -d --build
  recreate - docker compose down --volumes && docker compose up -d --build (destructive DR)
"""
import json
import os
import sys
from datetime import datetime, timezone
from typing import AsyncGenerator

from fastapi import APIRouter, Depends, Query
from fastapi.responses import StreamingResponse

from app.auth import verify_token
from app.ops_runner import (
    OPS_CLI,
    _BACKUP_TIMEOUT,
    new_op_id,
    is_cancelled,
    clear_cancelled,
    run_command,
    run_command_host,
    stream_command_host,
)

# Must run before the lazy ``toolkit`` import inside _descriptor().
sys.path.insert(0, "/opt/infrastructure")

router = APIRouter()


# ---------------------------------------------------------------------------
# Descriptor helpers
# ---------------------------------------------------------------------------

def _descriptor(project: str):
    """Return the descriptor object for *project*.

    The lookup is delegated to ``toolkit.descriptor.find``; the import is done
    lazily so that it happens after the module-level ``sys.path`` tweak.

    Raises:
        ValueError: if ``find`` returns ``None`` for the given project name.
    """
    from toolkit.descriptor import find as find_project

    desc = find_project(project)
    if desc is None:
        raise ValueError(f"Unknown project '{project}' — no project.yaml found")
    return desc


def _compose_dir(project: str, env: str) -> str:
    """Return the main compose directory of *project* for *env*."""
    return _descriptor(project).compose_dir(env)


def _container_prefix(project: str, env: str) -> str:
    """Return expanded container prefix, e.g. 'dev-mdf' or 'prod-seriousletter'."""
    return _descriptor(project).container_prefix_for(env)


def _all_compose_dirs(project: str, env: str) -> list[tuple[str, str]]:
    """Return list of (label, compose_dir) for all compose files to manage.

    Always includes the main compose_dir for the env (labelled with the env
    name) when the descriptor reports a non-empty one. Additionally includes
    any subsystem compose dirs defined in the descriptor's raw config that are
    applicable to the given env.

    Currently supports:
    - seafile: prod-only extra compose at descriptor.raw['seafile']['compose_dir']
    """
    desc = _descriptor(project)
    dirs: list[tuple[str, str]] = []

    main_dir = desc.compose_dir(env)
    if main_dir:
        dirs.append((env, main_dir))

    # Seafile subsystem: prod-only, lives in its own compose dir.
    seafile = desc.raw.get("seafile")
    if seafile and env == "prod" and "compose_dir" in seafile:
        dirs.append(("seafile", seafile["compose_dir"]))

    return dirs


def _compose_cmd(project: str, env: str) -> list[str]:
    """Build the base docker compose command for the project's main compose dir.

    Resolves the compose directory through the descriptor and delegates to
    ``_compose_cmd_for`` so both helpers always build the command identically.
    """
    return _compose_cmd_for(_compose_dir(project, env), env)


def _compose_cmd_for(compose_dir: str, env: str) -> list[str]:
    """Build the base docker compose command for a specific compose directory.

    Uses ``docker-compose.yaml`` when that file exists in *compose_dir*,
    otherwise falls back to ``docker-compose.yml`` (without checking that it
    exists). Searches for ``.env.{env}`` first, then ``.env``, and passes the
    first one found via ``--env-file``. Always adds ``--profile {env}``.
    """
    compose_file = "docker-compose.yaml"
    if not os.path.isfile(os.path.join(compose_dir, compose_file)):
        compose_file = "docker-compose.yml"

    cmd = ["docker", "compose", "-f", f"{compose_dir}/{compose_file}"]

    for candidate in (f".env.{env}", ".env"):
        path = os.path.join(compose_dir, candidate)
        if os.path.isfile(path):
            cmd += ["--env-file", path]
            break

    cmd += ["--profile", env]
    return cmd


# ---------------------------------------------------------------------------
# Container discovery
# ---------------------------------------------------------------------------

async def _find_containers(project: str, env: str) -> list[str]:
    """Find all running containers matching the project/env prefix.

    Returns an empty list when the ``docker ps`` call does not succeed.
    """
    pattern = f"{_container_prefix(project, env)}-"
    result = await run_command(
        ["docker", "ps", "--filter", f"name={pattern}", "--format", "{{.Names}}"],
        timeout=15,
    )
    if not result["success"]:
        return []

    # Re-check with startswith: only names that truly begin with the prefix
    # are accepted, regardless of how loosely the docker filter matched.
    stripped = (raw.strip() for raw in result["output"].strip().splitlines())
    return [name for name in stripped if name and name.startswith(pattern)]


# ---------------------------------------------------------------------------
# SSE helpers
# ---------------------------------------------------------------------------

def _sse(payload: dict) -> str:
    """Serialize *payload* as one SSE ``data:`` frame."""
    return f"data: {json.dumps(payload)}\n\n"


def _now() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


def _line(text: str) -> str:
    """Return an SSE frame carrying one timestamped log line."""
    return _sse({"line": text, "timestamp": _now()})


def _done(success: bool, project: str, env: str, action: str, cancelled: bool = False) -> str:
    """Return the terminal SSE frame for an operation.

    The ``cancelled`` key is only present in the payload when *cancelled* is true.
    """
    payload = {
        "done": True,
        "success": success,
        "project": project,
        "env": env,
        "action": action,
    }
    if cancelled:
        payload["cancelled"] = True
    return _sse(payload)


def _result_lines(result: dict) -> list[str]:
    """Return the non-blank lines of a command result, stdout first, then stderr.

    The two streams are joined with a newline so that the last stdout line and
    the first stderr line can never be fused into a single line when stdout
    does not end with a newline.
    """
    combined = "\n".join((result["output"], result["error"]))
    return [text for text in combined.strip().splitlines() if text.strip()]


# ---------------------------------------------------------------------------
# Operation: Restart
# ---------------------------------------------------------------------------

async def _op_restart(project: str, env: str, op_id: str | None = None) -> AsyncGenerator[str, None]:
    """Restart: docker restart of every matching container. Fast, no compose cycle.

    Uses _find_containers which matches all containers with the project/env
    prefix (e.g. 'prod-mdf-'). This naturally includes any subsystem containers
    such as prod-mdf-seafile, prod-mdf-seafile-mysql, prod-mdf-seafile-redis.

    Yields SSE frames: an optional op_id frame, log lines, and a final done frame.
    """
    if op_id:
        yield _sse({"op_id": op_id})

    yield _line(f"[restart] Finding containers for {project}/{env}...")

    try:
        containers = await _find_containers(project, env)
    except Exception as exc:
        yield _line(f"[error] Descriptor lookup failed: {exc}")
        yield _done(False, project, env, "restart")
        return

    if not containers:
        yield _line(f"[error] No running containers found for {project}/{env}")
        yield _done(False, project, env, "restart")
        return

    yield _line(f"[restart] Restarting {len(containers)} container(s): {', '.join(containers)}")

    result = await run_command(["docker", "restart"] + containers, timeout=120)

    for output_line in result["output"].strip().splitlines():
        if output_line.strip():
            yield _line(output_line)

    for err_line in result["error"].strip().splitlines():
        if err_line.strip():
            yield _line(f"[stderr] {err_line}")

    if result["success"]:
        yield _line("[restart] All containers restarted successfully.")
        yield _done(True, project, env, "restart")
    else:
        yield _line("[error] docker restart failed")
        yield _done(False, project, env, "restart")


# ---------------------------------------------------------------------------
# Operation: Rebuild
# ---------------------------------------------------------------------------

async def _op_rebuild(project: str, env: str, op_id: str | None = None) -> AsyncGenerator[str, None]:
    """Rebuild: docker compose down && docker compose up -d --build. No data loss.

    Iterates over all compose dirs (main + any subsystem dirs like seafile for
    prod). Each compose is brought down then rebuilt in sequence.

    Yields SSE frames: an optional op_id frame, log lines, and a final done frame.
    """
    if op_id:
        yield _sse({"op_id": op_id})

    try:
        compose_dirs = _all_compose_dirs(project, env)
    except Exception as exc:
        yield _line(f"[error] Descriptor lookup failed: {exc}")
        yield _done(False, project, env, "rebuild")
        return

    if not compose_dirs:
        yield _line(f"[error] No compose directories found for {project}/{env}")
        yield _done(False, project, env, "rebuild")
        return

    for label, cdir in compose_dirs:
        yield _line(f"[rebuild] Compose dir ({label}): {cdir}")

        # Step 1: docker compose down
        yield _line(f"[rebuild] Stopping {label} via docker compose down...")
        result = await run_command_host(
            _compose_cmd_for(cdir, env) + ["down"],
            timeout=120,
        )
        for output_line in _result_lines(result):
            yield _line(output_line)

        if not result["success"]:
            yield _line(f"[error] docker compose down failed for {label}")
            yield _done(False, project, env, "rebuild")
            return

        yield _line(f"[rebuild] {label} containers stopped.")

        if op_id and is_cancelled(op_id):
            yield _line("[rebuild] Cancelled after stop. Run docker compose up manually to recover.")
            yield _done(False, project, env, "rebuild", cancelled=True)
            return

        # Step 2: docker compose up -d --build (streaming for real-time build output)
        yield _line(f"[rebuild] Building and starting {label}...")

        async for build_line in stream_command_host(
            _compose_cmd_for(cdir, env) + ["up", "-d", "--build"],
            timeout=_BACKUP_TIMEOUT,
            op_id=op_id,
        ):
            yield _line(f"[rebuild] {build_line}")

        if op_id and is_cancelled(op_id):
            yield _line(f"[rebuild] Cancelled during build/start of {label}.")
            yield _done(False, project, env, "rebuild", cancelled=True)
            return

    # Verify all containers came up
    containers = await _find_containers(project, env)
    if containers:
        yield _line(f"[rebuild] {len(containers)} container(s) running: {', '.join(containers)}")
        yield _done(True, project, env, "rebuild")
    else:
        yield _line("[warn] No running containers detected after rebuild — check compose logs")
        yield _done(False, project, env, "rebuild")


# ---------------------------------------------------------------------------
# Operation: Recreate (Disaster Recovery)
# ---------------------------------------------------------------------------

async def _op_recreate(project: str, env: str, op_id: str | None = None) -> AsyncGenerator[str, None]:
    """Recreate: docker compose down --volumes && up --build. DESTRUCTIVE — wipes volumes.

    Iterates over all compose dirs (main + any subsystem dirs like seafile for
    prod). A safety backup is taken first. Then each compose is wiped and
    rebuilt in sequence.

    Yields SSE frames: an optional op_id frame, log lines, and a final done frame.
    """
    if op_id:
        yield _sse({"op_id": op_id})

    try:
        compose_dirs = _all_compose_dirs(project, env)
    except Exception as exc:
        yield _line(f"[error] Descriptor lookup failed: {exc}")
        yield _done(False, project, env, "recreate")
        return

    if not compose_dirs:
        yield _line(f"[error] No compose directories found for {project}/{env}")
        yield _done(False, project, env, "recreate")
        return

    # Log all compose dirs we will operate on
    for label, cdir in compose_dirs:
        yield _line(f"[recreate] Compose dir ({label}): {cdir}")

    # Step 1: Safety backup before destroying anything
    yield _line("[recreate] Creating safety backup before wipe...")

    # NOTE(review): the streamed backup exposes no success/failure status here,
    # so a failed backup is not detected before the volumes are wiped below.
    # Confirm whether stream_command_host can report the exit status.
    async for backup_line in stream_command_host(
        [OPS_CLI, "backup", project, env],
        timeout=_BACKUP_TIMEOUT,
        op_id=op_id,
    ):
        yield _line(f"[recreate] {backup_line}")

    if op_id and is_cancelled(op_id):
        yield _line("[recreate] Cancelled during safety backup. No data was lost.")
        yield _done(False, project, env, "recreate", cancelled=True)
        return

    yield _line("[recreate] Safety backup complete.")

    for label, cdir in compose_dirs:
        # Step 2: docker compose down --volumes (removes named volumes)
        yield _line(f"[recreate] WARNING: Running docker compose down --volumes for {label} (data will be wiped)...")
        result = await run_command_host(
            _compose_cmd_for(cdir, env) + ["down", "--volumes"],
            timeout=120,
        )
        for output_line in _result_lines(result):
            yield _line(output_line)

        if not result["success"]:
            yield _line(f"[error] docker compose down --volumes failed for {label}")
            yield _done(False, project, env, "recreate")
            return

        yield _line(f"[recreate] {label} containers and volumes removed.")

        if op_id and is_cancelled(op_id):
            yield _line(f"[recreate] Cancelled after volume wipe of {label}. Restore a backup to recover.")
            yield _done(False, project, env, "recreate", cancelled=True)
            return

        # Step 3: docker compose up -d --build
        yield _line(f"[recreate] Building and starting fresh {label}...")

        async for build_line in stream_command_host(
            _compose_cmd_for(cdir, env) + ["up", "-d", "--build"],
            timeout=_BACKUP_TIMEOUT,
            op_id=op_id,
        ):
            yield _line(f"[recreate] {build_line}")

        if op_id and is_cancelled(op_id):
            yield _line(f"[recreate] Cancelled during build/start of {label}.")
            yield _done(False, project, env, "recreate", cancelled=True)
            return

    # Verify containers came up
    containers = await _find_containers(project, env)
    if containers:
        yield _line(f"[recreate] {len(containers)} container(s) running. Restore a backup to complete recovery.")
        yield _done(True, project, env, "recreate")
    else:
        yield _line("[warn] No running containers after recreate — check compose logs")
        yield _done(False, project, env, "recreate")


# ---------------------------------------------------------------------------
# Dispatch
# ---------------------------------------------------------------------------

async def _op_generator(project: str, env: str, action: str) -> AsyncGenerator[str, None]:
    """Run the operation named by *action* and relay its SSE frames.

    A fresh op_id is allocated for every call and its cancellation flag is
    cleared in ``finally``, whether the stream finishes, fails, or is closed
    early. An unknown action yields an error line plus a failed done frame.
    """
    op_id = new_op_id()
    try:
        if action == "restart":
            async for chunk in _op_restart(project, env, op_id=op_id):
                yield chunk
        elif action == "rebuild":
            async for chunk in _op_rebuild(project, env, op_id=op_id):
                yield chunk
        elif action == "recreate":
            async for chunk in _op_recreate(project, env, op_id=op_id):
                yield chunk
        else:
            yield _line(f"[error] Unknown action '{action}'. Valid: restart, rebuild, recreate")
            yield _done(False, project, env, action)
    finally:
        clear_cancelled(op_id)


# ---------------------------------------------------------------------------
# Endpoint
# ---------------------------------------------------------------------------

@router.get(
    "/{project}/{env}",
    summary="Container lifecycle operation with real-time SSE output",
)
async def lifecycle_op(
    project: str,
    env: str,
    action: str = Query(
        default="restart",
        description="Operation: restart | rebuild | recreate",
    ),
    _: str = Depends(verify_token),
) -> StreamingResponse:
    """
    Stream a container lifecycle operation via SSE.

    - restart:  docker restart containers (safe, fast)
    - rebuild:  docker compose down && up --build (no data loss)
    - recreate: docker compose down --volumes && up --build (destructive — DR only)
    """
    return StreamingResponse(
        _op_generator(project, env, action),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            # Response header asking a fronting proxy not to buffer the stream.
            "X-Accel-Buffering": "no",
        },
    )