From fd03c16eca085423267c163137b28ccb60de8db0 Mon Sep 17 00:00:00 2001
From: Matthias Nott <mnott@mnsoft.org>
Date: Wed, 25 Feb 2026 00:45:13 +0100
Subject: [PATCH] feat: multi-compose rebuild (Seafile), cancel endpoint, schedule router, project descriptor

---
 static/index.html        |   73 +
 app/routers/cancel.py    |   17 
 app/main.py              |    4 
 project.yaml             |   56 +
 app/routers/registry.py  |   48 
 app/routers/restore.py   |  116 ++
 static/js/app.js         |  685 +++++++++++++++--
 app/routers/services.py  |   71 -
 docker-compose.yml       |   13 
 app/routers/sync_data.py |    8 
 app/routers/schedule.py  |  199 +++++
 app/routers/backups.py   |  149 +++
 app/routers/rebuild.py   |  597 +++++++--------
 static/css/style.css     |   42 +
 app/ops_runner.py        |  158 ++-
 15 files changed, 1655 insertions(+), 581 deletions(-)

diff --git a/app/main.py b/app/main.py
index 8b702ec..3bd1d5b 100644
--- a/app/main.py
+++ b/app/main.py
@@ -10,7 +10,7 @@
 from starlette.datastructures import MutableHeaders
 from starlette.types import ASGIApp, Receive, Scope, Send
 
-from app.routers import backups, promote, rebuild, registry, restore, services, status, sync_data, system
+from app.routers import backups, cancel, promote, rebuild, registry, restore, schedule, services, status, sync_data, system
 
 logging.basicConfig(
     level=logging.INFO,
@@ -64,6 +64,8 @@
 app.include_router(sync_data.router, prefix="/api/sync", tags=["sync"])
 app.include_router(registry.router, prefix="/api/registry", tags=["registry"])
 app.include_router(rebuild.router, prefix="/api/rebuild", tags=["rebuild"])
+app.include_router(schedule.router, prefix="/api/schedule", tags=["schedule"])
+app.include_router(cancel.router, prefix="/api/operations", tags=["operations"])
 
 # ---------------------------------------------------------------------------
 # Index route — serves index.html with content-hashed asset URLs.
diff --git a/app/ops_runner.py b/app/ops_runner.py
index d9a460a..460fe80 100644
--- a/app/ops_runner.py
+++ b/app/ops_runner.py
@@ -1,6 +1,7 @@
 import asyncio
 import json
 import os
+import uuid
 from typing import AsyncGenerator
 
 OPS_CLI = os.environ.get("OPS_CLI", "/opt/infrastructure/ops")
@@ -9,6 +10,48 @@
 
 _DEFAULT_TIMEOUT = 300
 _BACKUP_TIMEOUT = 3600
+
+# ---------------------------------------------------------------------------
+# Operation registry — tracks running processes for cancel support
+# ---------------------------------------------------------------------------
+_active_ops: dict[str, asyncio.subprocess.Process] = {}
+_cancelled_ops: set[str] = set()
+
+
+def new_op_id() -> str:
+    return uuid.uuid4().hex[:12]
+
+
+def register_op(op_id: str, proc: asyncio.subprocess.Process) -> None:
+    _active_ops[op_id] = proc
+
+
+def deregister_op(op_id: str) -> None:
+    _active_ops.pop(op_id, None)
+    # NOTE: do NOT clear _cancelled_ops here — callers check is_cancelled()
+    # after the stream ends. The flag is cleared by clear_cancelled() instead.
+
+
+def clear_cancelled(op_id: str) -> None:
+    """Call after the generator has finished checking is_cancelled()."""
+    _cancelled_ops.discard(op_id)
+
+
+def cancel_op(op_id: str) -> bool:
+    """Terminate a running operation. Returns True if found and killed."""
+    proc = _active_ops.get(op_id)
+    if proc is None:
+        return False
+    _cancelled_ops.add(op_id)
+    try:
+        proc.terminate()
+    except ProcessLookupError:
+        pass
+    return True
+
+
+def is_cancelled(op_id: str) -> bool:
+    return op_id in _cancelled_ops
 
 # nsenter via Docker: run commands on the host from inside the container.
 # Required because ops backup/restore delegate to host Python venvs (3.12)
@@ -90,9 +133,9 @@
         }
 
 
-async def stream_ops_host(args: list[str], timeout: int = _DEFAULT_TIMEOUT) -> AsyncGenerator[str, None]:
+async def stream_ops_host(args: list[str], timeout: int = _DEFAULT_TIMEOUT, op_id: str | None = None) -> AsyncGenerator[str, None]:
     """Stream ops CLI output from the host via nsenter."""
-    async for line in _stream_exec(_NSENTER_PREFIX + [OPS_CLI] + args, timeout=timeout):
+    async for line in _stream_exec(_NSENTER_PREFIX + [OPS_CLI] + args, timeout=timeout, op_id=op_id):
         yield line
 
 
@@ -101,9 +144,9 @@
     return await _run_exec(_NSENTER_PREFIX + args, timeout=timeout)
 
 
-async def stream_command_host(args: list[str], timeout: int = _DEFAULT_TIMEOUT) -> AsyncGenerator[str, None]:
+async def stream_command_host(args: list[str], timeout: int = _DEFAULT_TIMEOUT, op_id: str | None = None) -> AsyncGenerator[str, None]:
     """Stream arbitrary command output from the host via nsenter."""
-    async for line in _stream_exec(_NSENTER_PREFIX + args, timeout=timeout):
+    async for line in _stream_exec(_NSENTER_PREFIX + args, timeout=timeout, op_id=op_id):
         yield line
 
 
@@ -137,7 +180,7 @@
         return {"success": False, "output": "", "error": str(exc)}
 
 
-async def _stream_exec(args: list[str], timeout: int = _DEFAULT_TIMEOUT) -> AsyncGenerator[str, None]:
+async def _stream_exec(args: list[str], timeout: int = _DEFAULT_TIMEOUT, op_id: str | None = None) -> AsyncGenerator[str, None]:
     """Execute a command and yield interleaved stdout/stderr lines."""
     try:
         proc = await asyncio.create_subprocess_exec(
@@ -152,53 +195,68 @@
         yield f"[error] Failed to start process: {exc}"
         return
 
-    async def _readline(stream, prefix=""):
-        while True:
+    if op_id:
+        register_op(op_id, proc)
+
+    try:
+        async def _readline(stream, prefix=""):
+            while True:
+                try:
+                    line = await asyncio.wait_for(stream.readline(), timeout=timeout)
+                except asyncio.TimeoutError:
+                    yield f"{prefix}[timeout] Command exceeded {timeout}s"
+                    break
+                if not line:
+                    break
+                yield prefix + line.decode("utf-8", errors="replace").rstrip("\n")
+
+        stdout_gen = _readline(proc.stdout).__aiter__()
+        stderr_gen = _readline(proc.stderr, "[stderr] ").__aiter__()
+
+        stdout_done = stderr_done = False
+        pending_out = pending_err = None
+
+        async def _next(it):
             try:
-                line = await asyncio.wait_for(stream.readline(), timeout=timeout)
-            except asyncio.TimeoutError:
-                yield f"{prefix}[timeout] Command exceeded {timeout}s"
+                return await it.__anext__()
+            except StopAsyncIteration:
+                return None
+
+        pending_out = asyncio.create_task(_next(stdout_gen))
+        pending_err = asyncio.create_task(_next(stderr_gen))
+
+        while not (stdout_done and stderr_done):
+            tasks = [t for t in (pending_out, pending_err) if t is not None]
+            if not tasks:
                 break
-            if not line:
-                break
-            yield prefix + line.decode("utf-8", errors="replace").rstrip("\n")
+            done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
 
-    stdout_gen = _readline(proc.stdout).__aiter__()
-    stderr_gen = _readline(proc.stderr, "[stderr] ").__aiter__()
+            for task in done:
+                val = task.result()
+                if task is pending_out:
+                    if val is None:
+                        stdout_done = True
+                        pending_out = None
+                    else:
+                        yield val
+                        pending_out = asyncio.create_task(_next(stdout_gen))
+                elif task is pending_err:
+                    if val is None:
+                        stderr_done = True
+                        pending_err = None
+                    else:
+                        yield val
+                        pending_err = asyncio.create_task(_next(stderr_gen))
 
-    stdout_done = stderr_done = False
-    pending_out = pending_err = None
-
-    async def _next(it):
+        await proc.wait()
+    except (asyncio.CancelledError, GeneratorExit):
+        # Browser disconnected or generator closed — kill the process
         try:
-            return await it.__anext__()
-        except StopAsyncIteration:
-            return None
-
-    pending_out = asyncio.create_task(_next(stdout_gen))
-    pending_err = asyncio.create_task(_next(stderr_gen))
-
-    while not (stdout_done and stderr_done):
-        tasks = [t for t in (pending_out, pending_err) if t is not None]
-        if not tasks:
-            break
-        done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
-
-        for task in done:
-            val = task.result()
-            if task is pending_out:
-                if val is None:
-                    stdout_done = True
-                    pending_out = None
-                else:
-                    yield val
-                    pending_out = asyncio.create_task(_next(stdout_gen))
-            elif task is pending_err:
-                if val is None:
-                    stderr_done = True
-                    pending_err = None
-                else:
-                    yield val
-                    pending_err = asyncio.create_task(_next(stderr_gen))
-
-    await proc.wait()
+            proc.terminate()
+        except ProcessLookupError:
+            pass
+        await proc.wait()
+        raise
+    finally:
+        if op_id:
+            deregister_op(op_id)
diff --git a/app/routers/backups.py b/app/routers/backups.py
index de5a15c..badf0bf 100644
--- a/app/routers/backups.py
+++ b/app/routers/backups.py
@@ -1,11 +1,26 @@
-from typing import Any
+import json
+from datetime import datetime, timezone
+from typing import Any, AsyncGenerator
 
 from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi.responses import StreamingResponse
 
 from app.auth import verify_token
-from app.ops_runner import run_ops, run_ops_json, run_ops_host, run_ops_host_json, run_command_host, _BACKUP_TIMEOUT
+from app.ops_runner import (
+    run_ops, run_ops_json, run_ops_host, run_ops_host_json, run_command_host,
+    stream_ops_host, stream_command_host, new_op_id, is_cancelled, clear_cancelled,
+    _BACKUP_TIMEOUT, OFFSITE_PYTHON,
+)
 
 router = APIRouter()
+
+
+def _sse(payload: dict) -> str:
+    return f"data: {json.dumps(payload)}\n\n"
+
+
+def _now() -> str:
+    return datetime.now(timezone.utc).isoformat()
 
 
 @router.get("/", summary="List local backups")
@@ -82,6 +97,43 @@
     }
 
 
+async def _backup_stream(project: str, env: str) -> AsyncGenerator[str, None]:
+    """Stream backup creation progress via SSE."""
+    op_id = new_op_id()
+    yield _sse({"op_id": op_id})
+    yield _sse({"line": f"Creating backup for {project}/{env}...", "timestamp": _now()})
+
+    try:
+        success = True
+        async for line in stream_ops_host(
+            ["backup", project, env], timeout=_BACKUP_TIMEOUT, op_id=op_id
+        ):
+            yield _sse({"line": line, "timestamp": _now()})
+            if line.startswith("[error]") or line.startswith("ERROR"):
+                success = False
+
+        if is_cancelled(op_id):
+            yield _sse({"done": True, "success": False, "cancelled": True})
+        else:
+            yield _sse({"done": True, "success": success, "project": project, "env": env})
+    finally:
+        clear_cancelled(op_id)
+
+
+@router.get("/stream/{project}/{env}", summary="Create backup with streaming output")
+async def create_backup_stream(
+    project: str,
+    env: str,
+    _: str = Depends(verify_token),
+) -> StreamingResponse:
+    """Create a backup with real-time SSE progress output."""
+    return StreamingResponse(
+        _backup_stream(project, env),
+        media_type="text/event-stream",
+        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
+    )
+
+
 @router.post("/offsite/upload/{project}/{env}", summary="Upload backup to offsite")
 async def upload_offsite(
     project: str,
@@ -100,6 +152,99 @@
     return {"success": True, "output": result["output"], "project": project, "env": env}
 
 
+async def _upload_stream(project: str, env: str, name: str | None = None) -> AsyncGenerator[str, None]:
+    """Stream offsite upload progress via SSE."""
+    op_id = new_op_id()
+    yield _sse({"op_id": op_id})
+    label = f"{project}/{env}/{name}" if name else f"{project}/{env} (latest)"
+    yield _sse({"line": f"Uploading {label} to offsite storage...", "timestamp": _now()})
+
+    cmd = ["offsite", "upload", project, env]
+    if name:
+        cmd.append(name)
+
+    try:
+        success = True
+        async for line in stream_ops_host(
+            cmd, timeout=_BACKUP_TIMEOUT, op_id=op_id
+        ):
+            yield _sse({"line": line, "timestamp": _now()})
+            if line.startswith("[error]") or line.startswith("ERROR"):
+                success = False
+
+        if is_cancelled(op_id):
+            yield _sse({"done": True, "success": False, "cancelled": True})
+        else:
+            yield _sse({"done": True, "success": success, "project": project, "env": env})
+    finally:
+        clear_cancelled(op_id)
+
+
+@router.get("/offsite/stream/{project}/{env}", summary="Upload to offsite with streaming output")
+async def upload_offsite_stream(
+    project: str,
+    env: str,
+    name: str | None = Query(None),
+    _: str = Depends(verify_token),
+) -> StreamingResponse:
+    """Upload backup to offsite with real-time SSE progress output."""
+    return StreamingResponse(
+        _upload_stream(project, env, name),
+        media_type="text/event-stream",
+        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
+    )
+
+
+async def _download_stream(project: str, env: str, name: str) -> AsyncGenerator[str, None]:
+    """Stream offsite-to-local download progress via SSE."""
+    op_id = new_op_id()
+    yield _sse({"op_id": op_id})
+    yield _sse({"line": f"Downloading {name} from offsite to local storage...", "timestamp": _now()})
+
+    # Download to the local backup directory so it appears in the backup list
+    local_path = f"/opt/data/backups/{project}/{env}/{name}"
+    cmd = [
+        OFFSITE_PYTHON, "-c",
+        f"import sys; sys.stdout.reconfigure(line_buffering=True); "
+        f"sys.path.insert(0, '/opt/data/scripts'); "
+        f"from offsite import download; from pathlib import Path; "
+        f"import os; os.makedirs('/opt/data/backups/{project}/{env}', exist_ok=True); "
+        f"ok = download('{name}', Path('{local_path}'), '{project}', '{env}'); "
+        f"sys.exit(0 if ok else 1)"
+    ]
+
+    try:
+        success = True
+        async for line in stream_command_host(cmd, timeout=_BACKUP_TIMEOUT, op_id=op_id):
+            yield _sse({"line": line, "timestamp": _now()})
+            if line.startswith("[error]") or line.startswith("ERROR") or "failed" in line.lower():
+                success = False
+
+        if is_cancelled(op_id):
+            yield _sse({"done": True, "success": False, "cancelled": True})
+        else:
+            yield _sse({"done": True, "success": success, "project": project, "env": env, "name": name})
+    finally:
+        clear_cancelled(op_id)
+
+
+@router.get("/offsite/download/stream/{project}/{env}", summary="Download offsite backup to local storage with streaming output")
+async def download_offsite_stream(
+    project: str,
+    env: str,
+    name: str = Query(...),
+    _: str = Depends(verify_token),
+) -> StreamingResponse:
+    """Download an offsite backup to local storage with real-time SSE progress output."""
+    if "/" in name or "\\" in name or ".." in name:
+        raise HTTPException(status_code=400, detail="Invalid backup name")
+    return StreamingResponse(
+        _download_stream(project, env, name),
+        media_type="text/event-stream",
+        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
+    )
+
+
 @router.post("/offsite/retention", summary="Apply offsite retention policy")
 async def apply_retention(
     _: str = Depends(verify_token),
diff --git a/app/routers/cancel.py b/app/routers/cancel.py
new file mode 100644
index 0000000..068138c
--- /dev/null
+++ b/app/routers/cancel.py
@@ -0,0 +1,17 @@
+from fastapi import APIRouter, Depends, HTTPException
+
+from app.auth import verify_token
+from app.ops_runner import cancel_op
+
+router = APIRouter()
+
+
+@router.delete("/{op_id}", summary="Cancel a running operation")
+async def cancel_operation(
+    op_id: str,
+    _: str = Depends(verify_token),
+) -> dict:
+    """Terminate a running operation by its op_id."""
+    if cancel_op(op_id):
+        return {"cancelled": True, "op_id": op_id}
+    raise HTTPException(status_code=404, detail=f"No active operation with id '{op_id}'")
diff --git a/app/routers/rebuild.py b/app/routers/rebuild.py
index 447979f..5ff798f 100644
--- a/app/routers/rebuild.py
+++ b/app/routers/rebuild.py
@@ -1,19 +1,16 @@
 """
-Container lifecycle operations via Coolify API + SSH.
+Container lifecycle operations via docker compose.
 
 Three operations:
-  restart  – docker restart {containers} via SSH (no Coolify, no image pruning)
-  rebuild  – Coolify stop → docker build → Coolify start
-  recreate – Coolify stop → wipe data → docker build → Coolify start → show backups banner
+  restart  - docker restart <containers> (fast, no downtime window)
+  rebuild  - docker compose down && docker compose up -d --build
+  recreate - docker compose down --volumes && docker compose up -d --build (destructive DR)
 """
 import json
-import os
-import urllib.request
-import urllib.error
+import sys
 from datetime import datetime, timezone
 from typing import AsyncGenerator
 
-import yaml
 from fastapi import APIRouter, Depends, Query
 from fastapi.responses import StreamingResponse
 
@@ -21,87 +18,121 @@
 from app.ops_runner import (
     OPS_CLI,
     _BACKUP_TIMEOUT,
+    new_op_id,
+    is_cancelled,
+    clear_cancelled,
     run_command,
     run_command_host,
     stream_command_host,
 )
 
+sys.path.insert(0, "/opt/infrastructure")
+
 router = APIRouter()
 
+
 # ---------------------------------------------------------------------------
-# Configuration
+# Descriptor helpers
 # ---------------------------------------------------------------------------
 
-_REGISTRY_PATH = os.environ.get(
-    "REGISTRY_PATH",
-    "/opt/infrastructure/servers/hetzner-vps/registry.yaml",
-)
+def _descriptor(project: str):
+    """Load the project descriptor from /opt/data/{project}/project.yaml."""
+    from toolkit.descriptor import find as find_project
+    desc = find_project(project)
+    if desc is None:
+        raise ValueError(f"Unknown project '{project}' — no project.yaml found")
+    return desc
 
-_COOLIFY_BASE = os.environ.get(
-    "COOLIFY_BASE_URL",
-    "https://cockpit.tekmidian.com/api/v1",
-)
 
-_COOLIFY_TOKEN = os.environ.get(
-    "COOLIFY_API_TOKEN",
-    "3|f1fa8ee5791440ddd37e6cecafd964c8cd734dd4a8891180c424efad6bfdb7f5",
-)
+def _compose_dir(project: str, env: str) -> str:
+    desc = _descriptor(project)
+    return desc.compose_dir(env)
 
-_COOLIFY_TIMEOUT = 30   # seconds for API calls
-_POLL_INTERVAL  = 5    # seconds between container status polls
-_POLL_MAX_WAIT  = 180  # max seconds to wait for containers to stop/start
+
+def _container_prefix(project: str, env: str) -> str:
+    """Return expanded container prefix, e.g. 'dev-mdf' or 'prod-seriousletter'."""
+    desc = _descriptor(project)
+    return desc.container_prefix_for(env)
+
+
+def _all_compose_dirs(project: str, env: str) -> list[tuple[str, str]]:
+    """Return list of (label, compose_dir) for all compose files to manage.
+
+    Always includes the main compose_dir for the env. Additionally includes
+    any subsystem compose dirs defined in the descriptor's raw config that
+    are applicable to the given env. Currently supports:
+      - seafile: prod-only extra compose at descriptor.raw['seafile']['compose_dir']
+    """
+    desc = _descriptor(project)
+    dirs = []
+    main_dir = desc.compose_dir(env)
+    if main_dir:
+        dirs.append((env, main_dir))
+    # Check for seafile subsystem (prod-only, lives in its own compose dir)
+    seafile = desc.raw.get("seafile")
+    if seafile and env == "prod" and "compose_dir" in seafile:
+        dirs.append(("seafile", seafile["compose_dir"]))
+    return dirs
+
+
+def _compose_cmd(project: str, env: str) -> list[str]:
+    """Build the base docker compose command with env-file and profile."""
+    import os
+    compose_dir = _compose_dir(project, env)
+    # Find compose file
+    compose_file = "docker-compose.yaml"
+    if not os.path.isfile(os.path.join(compose_dir, compose_file)):
+        compose_file = "docker-compose.yml"
+    cmd = ["docker", "compose", "-f", f"{compose_dir}/{compose_file}"]
+    # Find env file
+    for candidate in [f".env.{env}", ".env"]:
+        path = os.path.join(compose_dir, candidate)
+        if os.path.isfile(path):
+            cmd += ["--env-file", path]
+            break
+    cmd += ["--profile", env]
+    return cmd
+
+
+def _compose_cmd_for(compose_dir: str, env: str) -> list[str]:
+    """Build the base docker compose command for a specific compose directory.
+
+    Searches for .env.{env} first, then .env. Adds --profile {env}.
+    """
+    import os
+    compose_file = "docker-compose.yaml"
+    if not os.path.isfile(os.path.join(compose_dir, compose_file)):
+        compose_file = "docker-compose.yml"
+    cmd = ["docker", "compose", "-f", f"{compose_dir}/{compose_file}"]
+    for candidate in [f".env.{env}", ".env"]:
+        path = os.path.join(compose_dir, candidate)
+        if os.path.isfile(path):
+            cmd += ["--env-file", path]
+            break
+    cmd += ["--profile", env]
+    return cmd
 
 
 # ---------------------------------------------------------------------------
-# Registry helpers
+# Container discovery
 # ---------------------------------------------------------------------------
 
-def _load_registry() -> dict:
-    with open(_REGISTRY_PATH) as f:
-        return yaml.safe_load(f) or {}
+async def _find_containers(project: str, env: str) -> list[str]:
+    """Find all running containers matching the project/env prefix."""
+    prefix = _container_prefix(project, env)
+    pattern = f"{prefix}-"
 
-
-def _project_cfg(project: str) -> dict:
-    reg = _load_registry()
-    projects = reg.get("projects", {})
-    if project not in projects:
-        raise ValueError(f"Unknown project '{project}'")
-    return projects[project]
-
-
-def _coolify_uuid(project: str, env: str) -> str:
-    cfg = _project_cfg(project)
-    uuids = cfg.get("coolify_uuids", {})
-    uuid = uuids.get(env)
-    if not uuid:
-        raise ValueError(
-            f"No coolify_uuid configured for {project}/{env} in registry.yaml"
-        )
-    return uuid
-
-
-def _data_dir(project: str, env: str) -> str:
-    cfg = _project_cfg(project)
-    template = cfg.get("data_dir", "")
-    if not template:
-        raise ValueError(f"No data_dir configured for {project} in registry.yaml")
-    return template.replace("{env}", env)
-
-
-def _build_cfg(project: str, env: str) -> dict | None:
-    """Return build config or None if the project uses registry-only images."""
-    cfg = _project_cfg(project)
-    build = cfg.get("build", {})
-    if build.get("no_local_image"):
-        return None
-    ctx_template = build.get("build_context", "")
-    if not ctx_template:
-        return None
-    return {
-        "build_context": ctx_template.replace("{env}", env),
-        "image_name": build.get("image_name", project),
-        "env": env,
-    }
+    result = await run_command(
+        ["docker", "ps", "--filter", f"name={pattern}", "--format", "{{.Names}}"],
+        timeout=15,
+    )
+    containers = []
+    if result["success"]:
+        for name in result["output"].strip().splitlines():
+            name = name.strip()
+            if name and name.startswith(pattern):
+                containers.append(name)
+    return containers
 
 
 # ---------------------------------------------------------------------------
@@ -120,151 +151,38 @@
     return _sse({"line": text, "timestamp": _now()})
 
 
-def _done(success: bool, project: str, env: str, action: str) -> str:
-    return _sse({
+def _done(success: bool, project: str, env: str, action: str, cancelled: bool = False) -> str:
+    payload = {
         "done": True,
         "success": success,
         "project": project,
         "env": env,
         "action": action,
-    })
-
-
-# ---------------------------------------------------------------------------
-# Coolify API (synchronous — called from async context via run_in_executor)
-# ---------------------------------------------------------------------------
-
-def _coolify_request(method: str, path: str) -> dict:
-    """Make a Coolify API request. Returns parsed JSON body."""
-    url = f"{_COOLIFY_BASE}{path}"
-    req = urllib.request.Request(
-        url,
-        method=method,
-        headers={
-            "Authorization": f"Bearer {_COOLIFY_TOKEN}",
-            "Content-Type": "application/json",
-            "Accept": "application/json",
-        },
-    )
-    try:
-        with urllib.request.urlopen(req, timeout=_COOLIFY_TIMEOUT) as resp:
-            body = resp.read()
-            return json.loads(body) if body else {}
-    except urllib.error.HTTPError as exc:
-        body = exc.read()
-        raise RuntimeError(
-            f"Coolify API {method} {path} returned HTTP {exc.code}: {body.decode(errors='replace')[:500]}"
-        ) from exc
-    except Exception as exc:
-        raise RuntimeError(f"Coolify API call failed: {exc}") from exc
-
-
-async def _coolify_action(action: str, uuid: str) -> dict:
-    """Call a Coolify service action endpoint (stop/start/restart)."""
-    import asyncio
-    loop = asyncio.get_event_loop()
-    return await loop.run_in_executor(
-        None, _coolify_request, "POST", f"/services/{uuid}/{action}"
-    )
-
-
-# ---------------------------------------------------------------------------
-# Container polling helpers
-# ---------------------------------------------------------------------------
-
-async def _find_containers_for_service(project: str, env: str) -> list[str]:
-    """
-    Find all running Docker containers belonging to a project/env.
-    Uses the registry name_prefix and matches {env}-{prefix}-* pattern.
-    """
-    cfg = _project_cfg(project)
-    prefix = cfg.get("name_prefix", project)
-    name_pattern = f"{env}-{prefix}-"
-
-    result = await run_command(
-        ["docker", "ps", "--filter", f"name={name_pattern}", "--format", "{{.Names}}"],
-        timeout=15,
-    )
-    containers = []
-    if result["success"]:
-        for name in result["output"].strip().splitlines():
-            name = name.strip()
-            if name and name.startswith(name_pattern):
-                containers.append(name)
-    return containers
-
-
-async def _poll_until_stopped(
-    project: str,
-    env: str,
-    max_wait: int = _POLL_MAX_WAIT,
-) -> bool:
-    """Poll until no containers for project/env are running. Returns True if stopped."""
-    import asyncio
-    cfg = _project_cfg(project)
-    prefix = cfg.get("name_prefix", project)
-    name_pattern = f"{env}-{prefix}-"
-    waited = 0
-    while waited < max_wait:
-        result = await run_command(
-            ["docker", "ps", "--filter", f"name={name_pattern}", "--format", "{{.Names}}"],
-            timeout=15,
-        )
-        running = [
-            n.strip()
-            for n in result["output"].strip().splitlines()
-            if n.strip().startswith(name_pattern)
-        ] if result["success"] else []
-        if not running:
-            return True
-        await asyncio.sleep(_POLL_INTERVAL)
-        waited += _POLL_INTERVAL
-    return False
-
-
-async def _poll_until_running(
-    project: str,
-    env: str,
-    max_wait: int = _POLL_MAX_WAIT,
-) -> bool:
-    """Poll until at least one container for project/env is running. Returns True if up."""
-    import asyncio
-    cfg = _project_cfg(project)
-    prefix = cfg.get("name_prefix", project)
-    name_pattern = f"{env}-{prefix}-"
-    waited = 0
-    while waited < max_wait:
-        result = await run_command(
-            ["docker", "ps", "--filter", f"name={name_pattern}", "--format", "{{.Names}}"],
-            timeout=15,
-        )
-        running = [
-            n.strip()
-            for n in result["output"].strip().splitlines()
-            if n.strip().startswith(name_pattern)
-        ] if result["success"] else []
-        if running:
-            return True
-        await asyncio.sleep(_POLL_INTERVAL)
-        waited += _POLL_INTERVAL
-    return False
+    }
+    if cancelled:
+        payload["cancelled"] = True
+    return _sse(payload)
 
 
 # ---------------------------------------------------------------------------
 # Operation: Restart
 # ---------------------------------------------------------------------------
 
-async def _op_restart(project: str, env: str) -> AsyncGenerator[str, None]:
+async def _op_restart(project: str, env: str, op_id: str | None = None) -> AsyncGenerator[str, None]:
+    """Restart: docker restart <containers>. Fast, no compose cycle.
+
+    Uses _find_containers which matches all containers with the project/env
+    prefix (e.g. 'prod-mdf-'). This naturally includes any subsystem containers
+    such as prod-mdf-seafile, prod-mdf-seafile-mysql, prod-mdf-seafile-redis.
     """
-    Restart: docker restart {containers} via SSH/nsenter.
-    No Coolify involvement — avoids the image-pruning stop/start cycle.
-    """
+    if op_id:
+        yield _sse({"op_id": op_id})
     yield _line(f"[restart] Finding containers for {project}/{env}...")
 
     try:
-        containers = await _find_containers_for_service(project, env)
+        containers = await _find_containers(project, env)
     except Exception as exc:
-        yield _line(f"[error] Registry lookup failed: {exc}")
+        yield _line(f"[error] Descriptor lookup failed: {exc}")
         yield _done(False, project, env, "restart")
         return
 
@@ -275,21 +193,20 @@
 
     yield _line(f"[restart] Restarting {len(containers)} container(s): {', '.join(containers)}")
 
-    cmd = ["docker", "restart"] + containers
-    result = await run_command(cmd, timeout=120)
+    result = await run_command(["docker", "restart"] + containers, timeout=120)
 
-    if result["output"].strip():
-        for line in result["output"].strip().splitlines():
-            yield _line(line)
-    if result["error"].strip():
-        for line in result["error"].strip().splitlines():
-            yield _line(f"[stderr] {line}")
+    for output_line in result["output"].strip().splitlines():
+        if output_line.strip():
+            yield _line(output_line)
+    for err_line in result["error"].strip().splitlines():
+        if err_line.strip():
+            yield _line(f"[stderr] {err_line}")
 
     if result["success"]:
         yield _line(f"[restart] All containers restarted successfully.")
         yield _done(True, project, env, "restart")
     else:
-        yield _line(f"[error] docker restart failed (exit code non-zero)")
+        yield _line(f"[error] docker restart failed")
         yield _done(False, project, env, "restart")
 
 
@@ -297,43 +214,73 @@
 # Operation: Rebuild
 # ---------------------------------------------------------------------------
 
-async def _op_rebuild(project: str, env: str) -> AsyncGenerator[str, None]:
+async def _op_rebuild(project: str, env: str, op_id: str | None = None) -> AsyncGenerator[str, None]:
+    """Rebuild: docker compose down && docker compose up -d --build. No data loss.
+
+    Iterates over all compose dirs (main + any subsystem dirs like seafile for prod).
+    Each compose is brought down then rebuilt in sequence.
     """
-    Rebuild: docker compose down → build image → docker compose up.
-    Uses `ops rebuild` on the host which handles env files, profiles, and cd correctly.
-    No data loss. For code/Dockerfile changes.
-    """
-    yield _line(f"[rebuild] Rebuilding {project}/{env} via ops CLI...")
+    if op_id:
+        yield _sse({"op_id": op_id})
 
-    had_output = False
-    success = True
-    async for line in stream_command_host(
-        [OPS_CLI, "rebuild", project, env],
-        timeout=_BACKUP_TIMEOUT,
-    ):
-        had_output = True
-        if line.startswith("[stderr] "):
-            yield _line(line)
-        elif line.startswith("ERROR") or line.startswith("[error]"):
-            yield _line(f"[error] {line}")
-            success = False
-        else:
-            yield _line(f"[rebuild] {line}")
+    try:
+        compose_dirs = _all_compose_dirs(project, env)
+    except Exception as exc:
+        yield _line(f"[error] Descriptor lookup failed: {exc}")
+        yield _done(False, project, env, "rebuild")
+        return
 
-    if not had_output:
-        yield _line(f"[error] ops rebuild produced no output — check registry config for {project}")
-        success = False
+    if not compose_dirs:
+        yield _line(f"[error] No compose directories found for {project}/{env}")
+        yield _done(False, project, env, "rebuild")
+        return
 
-    if success:
-        # Verify containers came up
-        containers = await _find_containers_for_service(project, env)
-        if containers:
-            yield _line(f"[rebuild] {len(containers)} container(s) running: {', '.join(containers)}")
-            yield _done(True, project, env, "rebuild")
-        else:
-            yield _line(f"[warn] No containers found after rebuild — check docker compose logs")
+    for label, cdir in compose_dirs:
+        yield _line(f"[rebuild] Compose dir ({label}): {cdir}")
+
+        # Step 1: docker compose down
+        yield _line(f"[rebuild] Stopping {label} via docker compose down...")
+        result = await run_command_host(
+            _compose_cmd_for(cdir, env) + ["down"],
+            timeout=120,
+        )
+        for output_line in (result["output"] + result["error"]).strip().splitlines():
+            if output_line.strip():
+                yield _line(output_line)
+
+        if not result["success"]:
+            yield _line(f"[error] docker compose down failed for {label}")
             yield _done(False, project, env, "rebuild")
+            return
+
+        yield _line(f"[rebuild] {label} containers stopped.")
+
+        if op_id and is_cancelled(op_id):
+            yield _line(f"[rebuild] Cancelled after stop. Run docker compose up manually to recover.")
+            yield _done(False, project, env, "rebuild", cancelled=True)
+            return
+
+        # Step 2: docker compose up -d --build (streaming for real-time build output)
+        yield _line(f"[rebuild] Building and starting {label}...")
+        async for build_line in stream_command_host(
+            _compose_cmd_for(cdir, env) + ["up", "-d", "--build"],
+            timeout=_BACKUP_TIMEOUT,
+            op_id=op_id,
+        ):
+            yield _line(f"[rebuild] {build_line}")
+
+        if op_id and is_cancelled(op_id):
+            yield _line(f"[rebuild] Cancelled during build/start of {label}.")
+            yield _done(False, project, env, "rebuild", cancelled=True)
+            return
+
+    # Verify all containers came up
+    containers = await _find_containers(project, env)
+    if containers:
+        yield _line(f"[rebuild] {len(containers)} container(s) running: {', '.join(containers)}")
+        yield _done(True, project, env, "rebuild")
     else:
+        yield _line(f"[warn] No running containers detected after rebuild — check compose logs")
         yield _done(False, project, env, "rebuild")
 
 
@@ -341,105 +288,115 @@
 # Operation: Recreate (Disaster Recovery)
 # ---------------------------------------------------------------------------
 
-async def _op_recreate(project: str, env: str) -> AsyncGenerator[str, None]:
+async def _op_recreate(project: str, env: str, op_id: str | None = None) -> AsyncGenerator[str, None]:
+    """Recreate: docker compose down --volumes && up --build. DESTRUCTIVE — wipes volumes.
+
+    Iterates over all compose dirs (main + any subsystem dirs like seafile for prod).
+    A safety backup is taken first. Then each compose is wiped and rebuilt in sequence.
     """
-    Recreate: docker compose down → wipe data → docker build → docker compose up.
-    DESTRUCTIVE — wipes all data volumes. Shows "Go to Backups" banner on success.
-    """
+    if op_id:
+        yield _sse({"op_id": op_id})
+
     try:
-        data_dir = _data_dir(project, env)
-        cfg = _project_cfg(project)
-    except ValueError as exc:
-        yield _line(f"[error] Config error: {exc}")
+        compose_dirs = _all_compose_dirs(project, env)
+    except Exception as exc:
+        yield _line(f"[error] Descriptor lookup failed: {exc}")
         yield _done(False, project, env, "recreate")
         return
 
-    # Step 1: Find and stop containers via docker compose
-    code_dir = cfg.get("path", "") + f"/{env}/code"
-    yield _line(f"[recreate] Stopping {project}/{env} containers...")
-
-    stop_result = await run_command_host(
-        ["sh", "-c", f"cd {code_dir} && docker compose -p {env}-{cfg.get('name_prefix', project)} --profile {env} down 2>&1 || true"],
-        timeout=120,
-    )
-    if stop_result["output"].strip():
-        for line in stop_result["output"].strip().splitlines():
-            yield _line(line)
-
-    # Step 2: Verify containers are stopped
-    name_prefix = cfg.get("name_prefix", project)
-    verify = await run_command_host(
-        ["sh", "-c", f"docker ps --format '{{{{.Names}}}}' | grep '^{env}-{name_prefix}-' || true"],
-        timeout=30,
-    )
-    running_containers = verify["output"].strip()
-    if running_containers:
-        yield _line(f"[error] Containers still running for {project}/{env}:")
-        for line in running_containers.splitlines():
-            yield _line(f"  {line}")
-        yield _done(False, project, env, "recreate")
-        return
-    yield _line(f"[recreate] All containers stopped.")
-
-    # Step 3: Wipe data volumes
-    yield _line(f"[recreate] WARNING: Wiping data directory: {data_dir}")
-    wipe_result = await run_command_host(
-        ["sh", "-c", f"rm -r {data_dir}/* 2>&1; echo EXIT_CODE=$?"],
-        timeout=120,
-    )
-    for line in (wipe_result["output"].strip() + "\n" + wipe_result["error"].strip()).strip().splitlines():
-        if line:
-            yield _line(line)
-    if "EXIT_CODE=0" in wipe_result["output"]:
-        yield _line(f"[recreate] Data directory wiped.")
-    else:
-        yield _line(f"[error] Wipe may have failed — check output above.")
+    if not compose_dirs:
+        yield _line(f"[error] No compose directories found for {project}/{env}")
         yield _done(False, project, env, "recreate")
         return
 
-    # Step 4: Rebuild via ops CLI (handles image build + compose up)
-    yield _line(f"[recreate] Rebuilding containers...")
-    async for line in stream_command_host(
-        [OPS_CLI, "rebuild", project, env],
+    # Log all compose dirs we will operate on
+    for label, cdir in compose_dirs:
+        yield _line(f"[recreate] Compose dir ({label}): {cdir}")
+
+    # Step 1: Safety backup before destroying anything
+    yield _line(f"[recreate] Creating safety backup before wipe...")
+    async for backup_line in stream_command_host(
+        [OPS_CLI, "backup", project, env],
         timeout=_BACKUP_TIMEOUT,
+        op_id=op_id,
     ):
-        if line.startswith("[stderr] "):
-            yield _line(line)
-        else:
-            yield _line(f"[recreate] {line}")
+        yield _line(f"[recreate] {backup_line}")
 
-    # Step 5: Verify containers came up
-    containers = await _find_containers_for_service(project, env)
+    if op_id and is_cancelled(op_id):
+        yield _line(f"[recreate] Cancelled during safety backup. No data was lost.")
+        yield _done(False, project, env, "recreate", cancelled=True)
+        return
+
+    yield _line(f"[recreate] Safety backup complete.")
+
+    for label, cdir in compose_dirs:
+        # Step 2: docker compose down --volumes (removes named volumes)
+        yield _line(f"[recreate] WARNING: Running docker compose down --volumes for {label} (data will be wiped)...")
+        result = await run_command_host(
+            _compose_cmd_for(cdir, env) + ["down", "--volumes"],
+            timeout=120,
+        )
+        for output_line in (result["output"] + result["error"]).strip().splitlines():
+            if output_line.strip():
+                yield _line(output_line)
+
+        if not result["success"]:
+            yield _line(f"[error] docker compose down --volumes failed for {label}")
+            yield _done(False, project, env, "recreate")
+            return
+
+        yield _line(f"[recreate] {label} containers and volumes removed.")
+
+        if op_id and is_cancelled(op_id):
+            yield _line(f"[recreate] Cancelled after volume wipe of {label}. Restore a backup to recover.")
+            yield _done(False, project, env, "recreate", cancelled=True)
+            return
+
+        # Step 3: docker compose up -d --build
+        yield _line(f"[recreate] Building and starting fresh {label}...")
+        async for build_line in stream_command_host(
+            _compose_cmd_for(cdir, env) + ["up", "-d", "--build"],
+            timeout=_BACKUP_TIMEOUT,
+            op_id=op_id,
+        ):
+            yield _line(f"[recreate] {build_line}")
+
+        if op_id and is_cancelled(op_id):
+            yield _line(f"[recreate] Cancelled during build/start of {label}.")
+            yield _done(False, project, env, "recreate", cancelled=True)
+            return
+
+    # Verify containers came up
+    containers = await _find_containers(project, env)
     if containers:
         yield _line(f"[recreate] {len(containers)} container(s) running. Restore a backup to complete recovery.")
         yield _done(True, project, env, "recreate")
     else:
-        yield _line(f"[warn] No containers found after recreate — check docker compose logs")
-        yield _done(True, project, env, "recreate")
+        yield _line(f"[warn] No running containers after recreate — check compose logs")
+        yield _done(False, project, env, "recreate")
 
 
 # ---------------------------------------------------------------------------
-# Dispatch wrapper
+# Dispatch
 # ---------------------------------------------------------------------------
 
-async def _op_generator(
-    project: str,
-    env: str,
-    action: str,
-) -> AsyncGenerator[str, None]:
-    """Route to the correct operation generator."""
-    if action == "restart":
-        async for chunk in _op_restart(project, env):
-            yield chunk
-    elif action == "rebuild":
-        async for chunk in _op_rebuild(project, env):
-            yield chunk
-    elif action == "recreate":
-        async for chunk in _op_recreate(project, env):
-            yield chunk
-    else:
-        yield _line(f"[error] Unknown action '{action}'. Valid: restart, rebuild, recreate")
-        yield _done(False, project, env, action)
+async def _op_generator(project: str, env: str, action: str) -> AsyncGenerator[str, None]:
+    op_id = new_op_id()
+    try:
+        if action == "restart":
+            async for chunk in _op_restart(project, env, op_id=op_id):
+                yield chunk
+        elif action == "rebuild":
+            async for chunk in _op_rebuild(project, env, op_id=op_id):
+                yield chunk
+        elif action == "recreate":
+            async for chunk in _op_recreate(project, env, op_id=op_id):
+                yield chunk
+        else:
+            yield _line(f"[error] Unknown action '{action}'. Valid: restart, rebuild, recreate")
+            yield _done(False, project, env, action)
+    finally:
+        clear_cancelled(op_id)
 
 
 # ---------------------------------------------------------------------------
@@ -463,8 +420,8 @@
     Stream a container lifecycle operation via SSE.
 
     - restart:  docker restart containers (safe, fast)
-    - rebuild:  stop via Coolify, rebuild image, start via Coolify
-    - recreate: stop, wipe data, rebuild image, start (destructive — DR only)
+    - rebuild:  docker compose down && up --build (no data loss)
+    - recreate: docker compose down --volumes && up --build (destructive — DR only)
     """
     return StreamingResponse(
         _op_generator(project, env, action),
diff --git a/app/routers/registry.py b/app/routers/registry.py
index 99e8f20..8643808 100644
--- a/app/routers/registry.py
+++ b/app/routers/registry.py
@@ -1,20 +1,35 @@
-import yaml
-from pathlib import Path
+import sys
 from typing import Any
 
 from fastapi import APIRouter, Depends
 
 from app.auth import verify_token
 
+sys.path.insert(0, "/opt/infrastructure")
+from toolkit.discovery import all_projects  # noqa: E402
+
 router = APIRouter()
 
-_REGISTRY_PATH = Path("/opt/infrastructure/servers/hetzner-vps/registry.yaml")
 
+def _serialize_project(desc: Any) -> dict:
+    """Serialize a ProjectDescriptor to a response dict."""
+    environments = [
+        {
+            "name": e.name,
+            "domain": e.domain,
+            "compose_dir": e.compose_dir,
+        }
+        for e in desc.environments
+    ]
 
-def _load_registry() -> dict:
-    """Load and return the registry YAML."""
-    with open(_REGISTRY_PATH) as f:
-        return yaml.safe_load(f)
+    return {
+        "environments": environments,
+        "domains": desc.domains,
+        "promote": desc.promote or None,
+        "has_cli": bool(desc.sync.get("type")),
+        "backup": desc.backup or None,
+        "type": desc.type,
+    }
 
 
 @router.get("/", summary="Get project registry")
@@ -22,19 +37,8 @@
     _: str = Depends(verify_token),
 ) -> dict[str, Any]:
     """Return project list with environments, promote config, and domains."""
-    registry = _load_registry()
-    projects = {}
-
-    for name, cfg in registry.get("projects", {}).items():
-        projects[name] = {
-            "environments": cfg.get("environments", []),
-            "domains": cfg.get("domains", {}),
-            "promote": cfg.get("promote"),
-            "has_cli": bool(cfg.get("cli")),
-            "static": cfg.get("static", False),
-            "infrastructure": cfg.get("infrastructure", False),
-            "backup_dir": cfg.get("backup_dir"),
-            "has_coolify": bool(cfg.get("coolify_uuids")),
-        }
-
+    projects = {
+        name: _serialize_project(desc)
+        for name, desc in all_projects().items()
+    }
     return {"projects": projects}
diff --git a/app/routers/restore.py b/app/routers/restore.py
index fc1e60f..cd10c22 100644
--- a/app/routers/restore.py
+++ b/app/routers/restore.py
@@ -1,3 +1,4 @@
+import asyncio
 import json
 from datetime import datetime, timezone
 from typing import AsyncGenerator, Literal
@@ -6,7 +7,9 @@
 from fastapi.responses import StreamingResponse
 
 from app.auth import verify_token
-from app.ops_runner import _BACKUP_TIMEOUT, stream_ops_host
+from app.ops_runner import _BACKUP_TIMEOUT, new_op_id, is_cancelled, clear_cancelled, stream_ops_host
+
+_KEEPALIVE_INTERVAL = 15  # seconds between SSE keepalive pings
 
 router = APIRouter()
 
@@ -29,52 +32,101 @@
     Runs on the host via nsenter because ops restore delegates to project CLIs
     that use host Python venvs incompatible with the container's Python.
     """
-    base_args = ["restore", project, env]
+    op_id = new_op_id()
+    yield _sse_line({"op_id": op_id})
 
-    # Pass the backup file path to avoid interactive selection prompt
-    if name:
-        backup_path = f"/opt/data/backups/{project}/{env}/{name}"
-        base_args.append(backup_path)
+    try:
+        base_args = ["restore", project, env]
 
-    if dry_run:
-        base_args.append("--dry-run")
+        # Pass the backup file path to avoid interactive selection prompt
+        if name:
+            backup_path = f"/opt/data/backups/{project}/{env}/{name}"
+            base_args.append(backup_path)
 
-    # Granular restore mode
-    if mode == "db":
-        base_args.append("--db-only")
-    elif mode == "wp":
-        base_args.append("--wp-only")
+        if dry_run:
+            base_args.append("--dry-run")
 
-    if source == "offsite":
-        # ops offsite restore <project> <env>
-        download_args = ["offsite", "restore", project, env]
-        yield _sse_line({"line": f"Downloading {project}/{env} from offsite...", "timestamp": _now()})
+        # Granular restore mode
+        if mode == "db":
+            base_args.append("--db-only")
+        elif mode == "wp":
+            base_args.append("--wp-only")
 
-        download_ok = True
-        async for line in stream_ops_host(download_args, timeout=_BACKUP_TIMEOUT):
-            yield _sse_line({"line": line, "timestamp": _now()})
-            if line.startswith("[error]"):
-                download_ok = False
+        if source == "offsite":
+            # ops offsite restore <project> <env>
+            download_args = ["offsite", "restore", project, env]
+            yield _sse_line({"line": f"Downloading {project}/{env} from offsite...", "timestamp": _now()})
 
-        if not download_ok:
-            yield _sse_line({"done": True, "success": False})
-            return
+            download_ok = True
+            downloaded_path = None
+            async for line in stream_ops_host(download_args, timeout=_BACKUP_TIMEOUT, op_id=op_id):
+                yield _sse_line({"line": line, "timestamp": _now()})
+                if line.startswith("[error]"):
+                    download_ok = False
+                # Capture downloaded file path from offsite.py output
+                if "Downloaded to" in line and "/tmp/" in line:
+                    # Parse "Downloaded to: /tmp/filename.tar.gz" or similar
+                    for part in line.split():
+                        if part.startswith("/tmp/") and part.endswith(".tar.gz"):
+                            downloaded_path = part
+                elif line.startswith("  ✓ Downloaded to "):
+                    for part in line.split():
+                        if part.startswith("/tmp/") and part.endswith(".tar.gz"):
+                            downloaded_path = part
 
-        yield _sse_line({"line": "Download complete. Starting restore...", "timestamp": _now()})
+            if is_cancelled(op_id):
+                yield _sse_line({"done": True, "success": False, "cancelled": True})
+                return
 
-    success = True
-    async for line in stream_ops_host(base_args, timeout=_BACKUP_TIMEOUT):
-        yield _sse_line({"line": line, "timestamp": _now()})
-        if line.startswith("[error]"):
-            success = False
+            if not download_ok:
+                yield _sse_line({"done": True, "success": False})
+                return
 
-    yield _sse_line({"done": True, "success": success})
+            # Use the downloaded offsite file for restore
+            if downloaded_path:
+                base_args.append(downloaded_path)
+                yield _sse_line({"line": f"Download complete. Restoring from {downloaded_path}...", "timestamp": _now()})
+            else:
+                yield _sse_line({"line": "Download complete. Starting restore...", "timestamp": _now()})
+
+        success = True
+        async for item in _stream_with_keepalive(stream_ops_host(base_args, timeout=_BACKUP_TIMEOUT, op_id=op_id)):
+            if item is None:
+                # Keepalive ping — SSE comment to prevent idle timeout
+                yield ": keepalive\n\n"
+            else:
+                yield _sse_line({"line": item, "timestamp": _now()})
+                if item.startswith("[error]"):
+                    success = False
+
+        if is_cancelled(op_id):
+            yield _sse_line({"done": True, "success": False, "cancelled": True})
+        else:
+            yield _sse_line({"done": True, "success": success})
+    finally:
+        clear_cancelled(op_id)
 
 
 def _now() -> str:
     return datetime.now(timezone.utc).isoformat()
 
 
+async def _stream_with_keepalive(gen: AsyncGenerator[str, None]) -> AsyncGenerator[str | None, None]:
+    """Wrap an async generator to yield None as keepalive when no data arrives within the interval."""
+    aiter = gen.__aiter__()
+    pending = asyncio.ensure_future(aiter.__anext__())
+    while True:
+        done, _ = await asyncio.wait({pending}, timeout=_KEEPALIVE_INTERVAL)
+        if done:
+            try:
+                yield pending.result()
+            except StopAsyncIteration:
+                break
+            pending = asyncio.ensure_future(aiter.__anext__())
+        else:
+            yield None  # keepalive — prevents Traefik idle timeout
+
+
 @router.get("/{project}/{env}", summary="Restore a backup with real-time output")
 async def restore_backup(
     project: str,
diff --git a/app/routers/schedule.py b/app/routers/schedule.py
new file mode 100644
index 0000000..1c43ca7
--- /dev/null
+++ b/app/routers/schedule.py
@@ -0,0 +1,199 @@
+import json
+import sys
+import yaml
+from pathlib import Path
+from typing import Any, AsyncGenerator
+from datetime import datetime, timezone
+
+from fastapi import APIRouter, Depends, HTTPException
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+
+from app.auth import verify_token
+from app.ops_runner import (
+    run_command_host, stream_ops_host, new_op_id, is_cancelled, clear_cancelled,
+    _BACKUP_TIMEOUT,
+)
+
+sys.path.insert(0, "/opt/infrastructure")
+from toolkit.discovery import all_projects  # noqa: E402
+from toolkit.descriptor import find as find_project  # noqa: E402
+
+router = APIRouter()
+
+
+class ScheduleUpdate(BaseModel):
+    enabled: bool = True
+    schedule: str = "03:00"
+    environments: list[str] | None = None
+    command: str | None = None
+    offsite: bool = False
+    offsite_envs: list[str] | None = None
+    retention_local_days: int | None = 7
+    retention_offsite_days: int | None = 30
+
+
+@router.get("/", summary="Get backup schedules for all projects")
+async def get_schedules(
+    _: str = Depends(verify_token),
+) -> list[dict[str, Any]]:
+    """Return backup schedule config for each project from descriptors."""
+    projects = all_projects()
+    result = []
+
+    for name, desc in sorted(projects.items()):
+        backup = desc.backup or {}
+        result.append({
+            "project": name,
+            "has_backup_dir": bool(backup.get("backup_dir") or backup.get("volumes")),
+            "has_cli": desc.sync.get("type") == "cli",
+            "static": desc.type == "static",
+            "infrastructure": desc.type == "infrastructure",
+            "environments": [e.name for e in desc.environments],
+            # Backup schedule fields
+            "enabled": backup.get("enabled", False),
+            "schedule": backup.get("schedule", ""),
+            "backup_environments": backup.get("environments"),
+            "command": backup.get("command"),
+            "offsite": backup.get("offsite", False),
+            "offsite_envs": backup.get("offsite_envs"),
+            "retention_local_days": backup.get("retention", {}).get("local_days"),
+            "retention_offsite_days": backup.get("retention", {}).get("offsite_days"),
+        })
+
+    return result
+
+
+@router.put("/{project}", summary="Update backup schedule for a project")
+async def update_schedule(
+    project: str,
+    body: ScheduleUpdate,
+    _: str = Depends(verify_token),
+) -> dict[str, Any]:
+    """Update the backup schedule in project.yaml and regenerate timers."""
+    desc = find_project(project)
+    if not desc:
+        raise HTTPException(status_code=404, detail=f"Project '{project}' not found")
+
+    # Read the full project.yaml
+    yaml_path = Path(desc.path) / "project.yaml"
+    try:
+        with open(yaml_path) as f:
+            project_yaml = yaml.safe_load(f) or {}
+    except FileNotFoundError:
+        raise HTTPException(status_code=404, detail=f"project.yaml not found at {yaml_path}")
+
+    # Merge the request into the backup block: enabled/schedule/offsite are always written; optional fields only when provided
+    backup = project_yaml.get("backup", {})
+    backup["enabled"] = body.enabled
+    backup["schedule"] = body.schedule
+    if body.command:
+        backup["command"] = body.command
+    if body.environments:
+        backup["environments"] = body.environments
+    if body.offsite:
+        backup["offsite"] = True
+        if body.offsite_envs:
+            backup["offsite_envs"] = body.offsite_envs
+    else:
+        backup["offsite"] = False
+    retention = backup.get("retention", {})
+    if body.retention_local_days is not None:
+        retention["local_days"] = body.retention_local_days
+    if body.offsite and body.retention_offsite_days is not None:
+        retention["offsite_days"] = body.retention_offsite_days
+    if retention:
+        backup["retention"] = retention
+
+    project_yaml["backup"] = backup
+    new_yaml = yaml.dump(project_yaml, default_flow_style=False, sort_keys=False)
+
+    write_result = await run_command_host([
+        "bash", "-c",
+        f"cat > {yaml_path} << 'YAMLEOF'\n{new_yaml}YAMLEOF"
+    ])
+    if not write_result["success"]:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to write project.yaml: {write_result['error']}"
+        )
+
+    gen_result = await run_command_host([
+        "/usr/local/bin/ops", "gen-timers"
+    ])
+    if not gen_result["success"]:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to regenerate timers: {gen_result['error'] or gen_result['output']}"
+        )
+
+    return {
+        "success": True,
+        "project": project,
+        "backup": backup,
+        "gen_timers_output": gen_result["output"],
+    }
+
+
+def _sse(payload: dict) -> str:
+    return f"data: {json.dumps(payload)}\n\n"
+
+
+def _now() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+async def _run_now_stream(project: str) -> AsyncGenerator[str, None]:
+    """Run backup for a project (all configured envs)."""
+    op_id = new_op_id()
+    yield _sse({"op_id": op_id})
+
+    desc = find_project(project)
+    if not desc:
+        yield _sse({"line": f"[error] Project '{project}' not found", "timestamp": _now()})
+        yield _sse({"done": True, "success": False})
+        return
+
+    envs = [e.name for e in desc.environments] or [None]
+
+    success = True
+    for env in envs:
+        if is_cancelled(op_id):
+            yield _sse({"line": "Cancelled.", "timestamp": _now()})
+            yield _sse({"done": True, "success": False, "cancelled": True})
+            clear_cancelled(op_id)
+            return
+
+        cmd = ["backup", project]
+        if env:
+            cmd.append(env)
+        label = f"{project}/{env}" if env else project
+        yield _sse({"line": f"=== Backing up {label} ===", "timestamp": _now()})
+
+        async for line in stream_ops_host(cmd, timeout=_BACKUP_TIMEOUT, op_id=op_id):
+            yield _sse({"line": line, "timestamp": _now()})
+            if line.startswith("[error]") or line.startswith("ERROR"):
+                success = False
+
+    if is_cancelled(op_id):
+        yield _sse({"done": True, "success": False, "cancelled": True})
+    else:
+        yield _sse({"done": True, "success": success})
+    clear_cancelled(op_id)
+
+
+@router.get("/{project}/run", summary="Run backup now (streaming)")
+async def run_backup_now(
+    project: str,
+    _: str = Depends(verify_token),
+) -> StreamingResponse:
+    """Trigger an immediate backup for a project, streaming output via SSE."""
+    desc = find_project(project)
+    if not desc:
+        raise HTTPException(status_code=404, detail=f"Project '{project}' not found")
+
+    return StreamingResponse(
+        _run_now_stream(project),
+        media_type="text/event-stream",
+        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
+    )
diff --git a/app/routers/services.py b/app/routers/services.py
index 7cdad19..56ca932 100644
--- a/app/routers/services.py
+++ b/app/routers/services.py
@@ -1,41 +1,17 @@
-import os
+import sys
 from typing import Any
 
-import yaml
 from fastapi import APIRouter, Depends, HTTPException, Query
 
 from app.auth import verify_token
 from app.ops_runner import run_command
 
+sys.path.insert(0, "/opt/infrastructure")
+from toolkit.descriptor import find as find_project  # noqa: E402
+
 router = APIRouter()
 
 _DOCKER = "docker"
-_REGISTRY_PATH = os.environ.get(
-    "REGISTRY_PATH",
-    "/opt/infrastructure/servers/hetzner-vps/registry.yaml",
-)
-
-# ---------------------------------------------------------------------------
-# Registry-based name prefix lookup (cached)
-# ---------------------------------------------------------------------------
-_prefix_cache: dict[str, str] | None = None
-
-
-def _load_prefixes() -> dict[str, str]:
-    """Load project -> name_prefix mapping from the ops registry."""
-    global _prefix_cache
-    if _prefix_cache is not None:
-        return _prefix_cache
-
-    try:
-        with open(_REGISTRY_PATH) as f:
-            data = yaml.safe_load(f)
-        _prefix_cache = {}
-        for proj_name, cfg in data.get("projects", {}).items():
-            _prefix_cache[proj_name] = cfg.get("name_prefix", proj_name)
-        return _prefix_cache
-    except Exception:
-        return {}
 
 
 # ---------------------------------------------------------------------------
@@ -76,33 +52,28 @@
     """
     Resolve the actual Docker container name from project/env/service.
 
-    Uses the ops registry name_prefix mapping and tries patterns in order:
-      1. {env}-{prefix}-{service}  (mdf, seriousletter: dev-mdf-mysql-UUID)
-      2. {prefix}-{service}        (ringsaday: ringsaday-website-UUID, coolify: coolify-db)
-      3. {prefix}-{env}            (ringsaday: ringsaday-dev-UUID)
-      4. exact {prefix}            (coolify infra: coolify)
+    Loads the project descriptor and expands container_prefix for the given
+    env (e.g. "{env}-mdf" -> "dev-mdf"), then tries:
+      1. {expanded_prefix}-{service}   e.g. dev-mdf-wordpress
+      2. exact match on expanded_prefix (infra containers with no service suffix)
     """
-    prefixes = _load_prefixes()
-    prefix = prefixes.get(project, project)
+    desc = find_project(project)
+    if desc is None:
+        raise HTTPException(
+            status_code=404,
+            detail=f"Project '{project}' not found",
+        )
 
-    # Pattern 1: {env}-{prefix}-{service}
-    hit = await _find_by_prefix(f"{env}-{prefix}-{service}")
+    expanded_prefix = desc.container_prefix_for(env)
+
+    # Pattern 1: {expanded_prefix}-{service}
+    hit = await _find_by_prefix(f"{expanded_prefix}-{service}")
     if hit:
         return hit
 
-    # Pattern 2: {prefix}-{service}
-    hit = await _find_by_prefix(f"{prefix}-{service}")
-    if hit:
-        return hit
-
-    # Pattern 3: {prefix}-{env}
-    hit = await _find_by_prefix(f"{prefix}-{env}")
-    if hit:
-        return hit
-
-    # Pattern 4: exact match when service == prefix (e.g., coolify)
-    if service == prefix:
-        hit = await _find_exact(prefix)
+    # Pattern 2: exact match on prefix (infrastructure containers, e.g. "coolify")
+    if service == expanded_prefix or service == desc.name:
+        hit = await _find_exact(expanded_prefix)
         if hit:
             return hit
 
diff --git a/app/routers/sync_data.py b/app/routers/sync_data.py
index 46f1089..a51b47b 100644
--- a/app/routers/sync_data.py
+++ b/app/routers/sync_data.py
@@ -28,13 +28,16 @@
     to_env: str,
     db_only: bool,
     uploads_only: bool,
+    dry_run: bool = False,
 ) -> AsyncGenerator[str, None]:
     """Stream sync output via SSE."""
-    args = ["sync", project, "--from", from_env, "--to", to_env, "--yes"]
+    args = ["sync", project, "--from", from_env, "--to", to_env, "--yes"]
     if db_only:
         args.append("--db-only")
     if uploads_only:
         args.append("--uploads-only")
+    if dry_run:
+        args.append("--dry-run")
 
     mode = "db-only" if db_only else ("uploads-only" if uploads_only else "full")
     yield _sse_line({
@@ -58,6 +61,7 @@
     to_env: str = Query(default="int", alias="to"),
     db_only: bool = Query(default=False),
     uploads_only: bool = Query(default=False),
+    dry_run: bool = Query(default=False),
     _: str = Depends(verify_token),
 ) -> StreamingResponse:
     """Sync data backward (prod->int, int->dev) with SSE streaming."""
@@ -67,7 +71,7 @@
             detail=f"Invalid sync path '{from_env} -> {to_env}'. Only adjacent pairs are allowed: prod->int, int->dev.",
         )
     return StreamingResponse(
-        _sync_generator(project, from_env, to_env, db_only, uploads_only),
+        _sync_generator(project, from_env, to_env, db_only, uploads_only, dry_run),
         media_type="text/event-stream",
         headers={
             "Cache-Control": "no-cache",
diff --git a/docker-compose.yml b/docker-compose.yml
index 7999f59..c8c5a68 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -4,21 +4,28 @@
     container_name: ops-dashboard
     restart: unless-stopped
     env_file: .env
+    healthcheck:
+      test: ["CMD-SHELL", "curl -sf http://localhost:8080/api/registry/?token=$$AUTH_TOKEN"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 10s
     volumes:
       - /opt/infrastructure:/opt/infrastructure
       - /opt/data:/opt/data
       - /var/run/docker.sock:/var/run/docker.sock
       - ./static:/app/static
+      - ./app:/app/app
     labels:
       - "traefik.enable=true"
       - "traefik.http.routers.ops-dashboard.rule=Host(`cockpit.tekmidian.com`)"
-      - "traefik.http.routers.ops-dashboard.entrypoints=websecure"
+      - "traefik.http.routers.ops-dashboard.entrypoints=https"
       - "traefik.http.routers.ops-dashboard.tls=true"
       - "traefik.http.routers.ops-dashboard.tls.certresolver=letsencrypt"
       - "traefik.http.services.ops-dashboard.loadbalancer.server.port=8080"
     networks:
-      - coolify
+      - proxy
 
 networks:
-  coolify:
+  proxy:
     external: true
diff --git a/project.yaml b/project.yaml
new file mode 100644
index 0000000..b6f3405
--- /dev/null
+++ b/project.yaml
@@ -0,0 +1,56 @@
+name: ops-dashboard
+type: infrastructure
+description: "Ops Dashboard \u2014 FastAPI-based web dashboard for monitoring containers,\
+  \ triggering backups/restores, and managing server health across all projects"
+path: /opt/data/ops-dashboard
+container_prefix: ops-dashboard
+environments:
+- name: prod
+  domain: ops.tekmidian.com
+  compose_dir: /opt/data/ops-dashboard
+networks:
+  proxy: coolify
+build:
+  context: /opt/data/ops-dashboard
+  image: ops-dashboard
+  tag: latest
+  description: "Built locally from Dockerfile \u2014 no registry push"
+services:
+  ops-dashboard:
+    container: ops-dashboard
+    image: ops-dashboard:latest
+    volumes:
+    - /opt/infrastructure:/opt/infrastructure
+    - /opt/data:/opt/data
+    - /var/run/docker.sock:/var/run/docker.sock
+    - /opt/data/ops-dashboard/static:/app/static
+    - /opt/data/ops-dashboard/app:/app/app
+backup:
+  enabled: true
+  schedule: "04:15"
+  retention:
+    local_days: 30
+    offsite_days: 30
+  offsite: true
+  backup_dir: /opt/data/backups/ops-dashboard
+  volumes:
+  - /opt/data/ops-dashboard
+  environments:
+  - prod
+  offsite_envs:
+  - prod
+restore:
+  volumes:
+  - /opt/data/ops-dashboard
+  post_restore:
+  - docker restart ops-dashboard
+promote:
+  type: git
+  description: "Infrastructure tool \u2014 deploy by rebuilding image from source"
+  post_pull: rebuild
+health:
+- env: prod
+  url: https://ops.tekmidian.com/
+  status: 200
+domains:
+  prod: ops.tekmidian.com
diff --git a/static/css/style.css b/static/css/style.css
index 9b1a219..e3b0a6d 100644
--- a/static/css/style.css
+++ b/static/css/style.css
@@ -525,3 +525,45 @@
   color: #f3f4f6;
   font-weight: 600;
 }
+
+/* ---------- Operation Progress Bar ---------- */
+.op-progress {
+  height: 3px;
+  border-radius: 2px;
+  margin-bottom: 0.75rem;
+  overflow: hidden;
+  background: #1f2937;
+  transition: opacity 0.3s;
+}
+.op-progress.hidden {
+  opacity: 0;
+  height: 0;
+  margin: 0;
+}
+.op-progress.running {
+  opacity: 1;
+}
+.op-progress.running .op-progress-fill {
+  width: 100%;
+  height: 100%;
+  background: linear-gradient(90deg, #3b82f6 0%, #60a5fa 50%, #3b82f6 100%);
+  background-size: 200% 100%;
+  animation: progress-slide 1.5s ease-in-out infinite;
+}
+.op-progress.done-ok .op-progress-fill {
+  width: 100%;
+  height: 100%;
+  background: #10b981;
+  animation: none;
+}
+.op-progress.done-fail .op-progress-fill {
+  width: 100%;
+  height: 100%;
+  background: #ef4444;
+  animation: none;
+}
+
+@keyframes progress-slide {
+  0%   { background-position: 200% 0; }
+  100% { background-position: -200% 0; }
+}
diff --git a/static/index.html b/static/index.html
index d51b7bf..8ef0b80 100644
--- a/static/index.html
+++ b/static/index.html
@@ -83,6 +83,10 @@
         <svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M12 2L2 7l10 5 10-5-10-5z"/><path d="M2 17l10 5 10-5"/><path d="M2 12l10 5 10-5"/></svg>
         Operations
       </a>
+      <a class="sidebar-link" data-page="schedules" onclick="showPage('schedules')">
+        <svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><circle cx="12" cy="12" r="10"/><polyline points="12 6 12 12 16 14"/></svg>
+        Schedules
+      </a>
       <a class="sidebar-link" data-page="system" onclick="showPage('system')">
         <svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><circle cx="12" cy="12" r="3"/><path d="M19.4 15a1.65 1.65 0 00.33 1.82l.06.06a2 2 0 010 2.83 2 2 0 01-2.83 0l-.06-.06a1.65 1.65 0 00-1.82-.33 1.65 1.65 0 00-1 1.51V21a2 2 0 01-4 0v-.09A1.65 1.65 0 009 19.4a1.65 1.65 0 00-1.82.33l-.06.06a2 2 0 01-2.83-2.83l.06-.06A1.65 1.65 0 004.68 15a1.65 1.65 0 00-1.51-1H3a2 2 0 010-4h.09A1.65 1.65 0 004.6 9a1.65 1.65 0 00-.33-1.82l-.06-.06a2 2 0 012.83-2.83l.06.06A1.65 1.65 0 009 4.68a1.65 1.65 0 001-1.51V3a2 2 0 014 0v.09a1.65 1.65 0 001 1.51 1.65 1.65 0 001.82-.33l.06-.06a2 2 0 012.83 2.83l-.06.06A1.65 1.65 0 0019.4 9a1.65 1.65 0 001.51 1H21a2 2 0 010 4h-.09a1.65 1.65 0 00-1.51 1z"/></svg>
         System
@@ -180,6 +184,7 @@
 
       <!-- SSE output (shown after start) -->
       <div id="restore-modal-output" style="display:none;">
+        <div id="restore-progress-bar" class="op-progress hidden"><div class="op-progress-fill"></div></div>
         <div style="font-size:0.8125rem;font-weight:500;color:#9ca3af;margin-bottom:0.375rem;">Output</div>
         <div id="restore-modal-terminal" class="terminal" style="max-height:300px;"></div>
       </div>
@@ -205,6 +210,7 @@
         Dry run (preview only)
       </label>
       <div id="ops-modal-output" style="display:none;">
+        <div id="ops-progress-bar" class="op-progress hidden"><div class="op-progress-fill"></div></div>
         <div style="font-size:0.8125rem;font-weight:500;color:#9ca3af;margin-bottom:0.375rem;">Output</div>
         <div id="ops-modal-terminal" class="terminal" style="max-height:350px;"></div>
       </div>
@@ -216,6 +222,71 @@
   </div>
 </div>
 
-<script src="/static/js/app.js?v=12"></script>
+<!-- Schedule Edit Modal -->
+<div id="schedule-modal" class="modal-overlay" style="display:none;" onclick="if(event.target===this)closeScheduleModal()">
+  <div class="modal-box" style="max-width:520px;">
+    <div class="modal-header">
+      <span id="schedule-modal-title" style="font-weight:600;color:#f3f4f6;">Edit Schedule</span>
+      <button onclick="closeScheduleModal()" style="background:none;border:none;color:#9ca3af;font-size:1.25rem;cursor:pointer;">&times;</button>
+    </div>
+    <div class="modal-body">
+      <input type="hidden" id="sched-project">
+
+      <div style="margin-bottom:1rem;">
+        <label style="display:flex;align-items:center;gap:0.5rem;font-size:0.875rem;color:#d1d5db;cursor:pointer;">
+          <input type="checkbox" id="sched-enabled" style="width:1rem;height:1rem;accent-color:#3b82f6;">
+          Enabled
+        </label>
+      </div>
+
+      <div style="margin-bottom:1rem;">
+        <div style="font-size:0.8125rem;font-weight:500;color:#9ca3af;margin-bottom:0.375rem;">Schedule (HH:MM UTC)</div>
+        <div style="display:flex;align-items:center;gap:0.75rem;">
+          <input type="time" id="sched-time" class="form-input" style="width:120px;">
+          <span id="sched-server-clock" style="font-size:0.75rem;color:#6b7280;font-variant-numeric:tabular-nums;"></span>
+        </div>
+      </div>
+
+      <div style="margin-bottom:1rem;">
+        <div style="font-size:0.8125rem;font-weight:500;color:#9ca3af;margin-bottom:0.375rem;">Environments</div>
+        <div id="sched-envs" style="display:flex;gap:1rem;flex-wrap:wrap;"></div>
+      </div>
+
+      <div style="margin-bottom:1rem;">
+        <div style="font-size:0.8125rem;font-weight:500;color:#9ca3af;margin-bottom:0.375rem;">Custom Command (optional)</div>
+        <input type="text" id="sched-command" class="form-input" placeholder="Leave empty for default ops backup" style="width:100%;font-size:0.8125rem;">
+      </div>
+
+      <div style="margin-bottom:1rem;">
+        <label style="display:flex;align-items:center;gap:0.5rem;font-size:0.875rem;color:#d1d5db;cursor:pointer;">
+          <input type="checkbox" id="sched-offsite" onchange="toggleOffsiteSection()" style="width:1rem;height:1rem;accent-color:#3b82f6;">
+          Offsite Upload
+        </label>
+      </div>
+
+      <div id="sched-offsite-section" style="display:none;margin-bottom:1rem;padding-left:1.5rem;">
+        <div style="font-size:0.8125rem;font-weight:500;color:#9ca3af;margin-bottom:0.375rem;">Offsite Environments</div>
+        <div id="sched-offsite-envs" style="display:flex;gap:1rem;flex-wrap:wrap;"></div>
+      </div>
+
+      <div style="display:flex;gap:1rem;margin-bottom:1rem;">
+        <div>
+          <div style="font-size:0.8125rem;font-weight:500;color:#9ca3af;margin-bottom:0.375rem;">Local Retention (days)</div>
+          <input type="number" id="sched-retention-local" class="form-input" style="width:80px;" min="1" max="365">
+        </div>
+        <div>
+          <div style="font-size:0.8125rem;font-weight:500;color:#9ca3af;margin-bottom:0.375rem;">Offsite Retention (days)</div>
+          <input type="number" id="sched-retention-offsite" class="form-input" style="width:80px;" min="1" max="365">
+        </div>
+      </div>
+    </div>
+    <div class="modal-footer">
+      <button class="btn btn-ghost btn-sm" onclick="closeScheduleModal()">Cancel</button>
+      <button id="sched-save-btn" class="btn btn-primary btn-sm" onclick="saveSchedule()">Save</button>
+    </div>
+  </div>
+</div>
+
+<script src="/static/js/app.js?v=13"></script>
 </body>
 </html>
diff --git a/static/js/app.js b/static/js/app.js
index ee4c5e8..a23028a 100644
--- a/static/js/app.js
+++ b/static/js/app.js
@@ -1,5 +1,5 @@
 'use strict';
-const APP_VERSION = 'v13-20260222';
+const APP_VERSION = 'v14-20260222';
 
 // ============================================================
 // OPS Dashboard — Vanilla JS Application (v6)
@@ -38,6 +38,7 @@
 let opsEventSource = null;
 let opsCtx = { type: null, project: null, fromEnv: null, toEnv: null };
 let cachedRegistry = null;
+let currentOpId = null;
 
 // ---------------------------------------------------------------------------
 // Helpers
@@ -104,6 +105,15 @@
 }
 
 // ---------------------------------------------------------------------------
+// Progress Bar
+// ---------------------------------------------------------------------------
+function _setProgressState(barId, state) {
+  const bar = document.getElementById(barId);
+  if (!bar) return;
+  bar.className = 'op-progress ' + (state === 'running' ? 'running' : state === 'ok' ? 'done-ok' : state === 'fail' ? 'done-fail' : 'hidden');
+}
+
+// ---------------------------------------------------------------------------
 // Auth
 // ---------------------------------------------------------------------------
 function getToken() { return localStorage.getItem('ops_token'); }
@@ -142,7 +152,7 @@
 async function api(path, opts = {}) {
   const token = getToken();
   const headers = { ...(opts.headers || {}), 'Authorization': 'Bearer ' + token };
-  const resp = await fetch(path, { ...opts, headers });
+  const resp = await fetch(path, { ...opts, headers, cache: 'no-store' });
   if (resp.status === 401) { doLogout(); throw new Error('Session expired'); }
   if (!resp.ok) { const b = await resp.text(); throw new Error(b || 'HTTP ' + resp.status); }
   const ct = resp.headers.get('content-type') || '';
@@ -197,6 +207,7 @@
     case 'backups':   renderBackups(); break;
     case 'system':    renderSystem(); break;
     case 'operations': renderOperations(); break;
+    case 'schedules': renderSchedules(); break;
     default:          renderDashboard();
   }
 }
@@ -288,6 +299,8 @@
     } else if (backupDrillLevel === 2) {
       h = '<a onclick="backupDrillBack(0)">Backups</a><span class="sep">/</span><a onclick="backupDrillBack(1)">' + esc(backupDrillProject) + '</a><span class="sep">/</span><span class="current">' + esc(backupDrillEnv) + '</span>';
     }
+  } else if (currentPage === 'schedules') {
+    h = '<span class="current">Schedules</span>';
   } else if (currentPage === 'system') {
     h = '<span class="current">System</span>';
   } else if (currentPage === 'operations') {
@@ -499,15 +512,21 @@
   // YYYYMMDD_HHMMSS -> YYYY-MM-DD HH:MM
   const m = String(raw).match(/^(\d{4})(\d{2})(\d{2})[_T](\d{2})(\d{2})/);
   if (m) return `${m[1]}-${m[2]}-${m[3]} ${m[4]}:${m[5]}`;
-  // YYYY-MM-DD passthrough
+  // ISO 8601: YYYY-MM-DDTHH:MM:SS
+  const iso = String(raw).match(/^(\d{4})-(\d{2})-(\d{2})[T ](\d{2}):(\d{2})/);
+  if (iso) return `${iso[1]}-${iso[2]}-${iso[3]} ${iso[4]}:${iso[5]}`;
   return raw;
 }
 
-// Parse YYYYMMDD_HHMMSS -> { dateKey: 'YYYY-MM-DD', timeStr: 'HH:MM' }
+// Parse backup date -> { dateKey: 'YYYY-MM-DD', timeStr: 'HH:MM' }
 function parseBackupDate(raw) {
   if (!raw) return { dateKey: '', timeStr: '' };
+  // YYYYMMDD_HHMMSS
   const m = String(raw).match(/^(\d{4})(\d{2})(\d{2})[_T](\d{2})(\d{2})/);
   if (m) return { dateKey: `${m[1]}-${m[2]}-${m[3]}`, timeStr: `${m[4]}:${m[5]}` };
+  // ISO 8601: YYYY-MM-DDTHH:MM:SS
+  const iso = String(raw).match(/^(\d{4})-(\d{2})-(\d{2})[T ](\d{2}):(\d{2})/);
+  if (iso) return { dateKey: `${iso[1]}-${iso[2]}-${iso[3]}`, timeStr: `${iso[4]}:${iso[5]}` };
   return { dateKey: raw, timeStr: '' };
 }
 
@@ -536,6 +555,16 @@
   if (chevron) chevron.classList.toggle('open', !isOpen);
 }
 
+// Normalize any backup date to ISO-sortable format (YYYY-MM-DDTHH:MM:SS)
+function normalizeBackupDate(raw) {
+  if (!raw) return '';
+  // Compact: YYYYMMDD_HHMMSS -> YYYY-MM-DDTHH:MM:SS
+  const m = String(raw).match(/^(\d{4})(\d{2})(\d{2})[_T](\d{2})(\d{2})(\d{2})?/);
+  if (m) return `${m[1]}-${m[2]}-${m[3]}T${m[4]}:${m[5]}:${m[6] || '00'}`;
+  // Already ISO-ish: pass through
+  return String(raw);
+}
+
 // ---------------------------------------------------------------------------
 // Backups — merge helper (dedup local+offsite by filename)
 // ---------------------------------------------------------------------------
@@ -544,12 +573,12 @@
 
   for (const b of local) {
     const name = b.name || b.file || '';
-    const key = name || (b.project + '/' + b.env + '/' + (b.date || b.timestamp));
+    const key = name || (b.project + '/' + b.env + '/' + (b.date || b.mtime || b.timestamp));
     byName.set(key, {
       project: b.project || '',
       env: b.env || b.environment || '',
       name: name,
-      date: b.date || b.timestamp || '',
+      date: normalizeBackupDate(b.date || b.mtime || b.timestamp || ''),
       size_human: b.size_human || b.size || '',
       size_bytes: Number(b.size || 0),
       hasLocal: true,
@@ -561,13 +590,15 @@
     const name = b.name || '';
     const key = name || (b.project + '/' + b.env + '/' + (b.date || ''));
     if (byName.has(key)) {
-      byName.get(key).hasOffsite = true;
+      const existing = byName.get(key);
+      existing.hasOffsite = true;
+      if (!existing.date && b.date) existing.date = normalizeBackupDate(b.date);
     } else {
       byName.set(key, {
         project: b.project || '',
         env: b.env || b.environment || '',
         name: name,
-        date: b.date || '',
+        date: normalizeBackupDate(b.date || ''),
         size_human: b.size || '',
         size_bytes: Number(b.size_bytes || 0),
         hasLocal: false,
@@ -628,12 +659,12 @@
   h += '</div></div>';
 
   // Global stat tiles
-  h += '<div class="grid-stats" style="margin-bottom:1.5rem;">';
+  h += '<div class="grid-stats" style="margin-bottom:0.5rem;">';
   h += statTile('Local', localCount, '#3b82f6');
   h += statTile('Offsite', offsiteCount, '#8b5cf6');
   h += statTile('Synced', syncedCount, '#10b981');
-  h += statTile('Latest', latestDisplay, '#f59e0b');
   h += '</div>';
+  h += `<div style="margin-bottom:1.5rem;font-size:0.8125rem;color:#9ca3af;">Latest backup: <span style="color:#f59e0b;">${esc(latestDisplay)}</span></div>`;
 
   // Project cards
   const projects = groupBy(all, 'project');
@@ -722,6 +753,15 @@
 
   let h = '<div class="page-enter">';
 
+  // Action bar: Create Backup + Upload
+  h += `<div style="display:flex;gap:0.5rem;margin-bottom:0.75rem;">`;
+  h += `<button class="btn btn-primary btn-sm" onclick="createBackup('${esc(backupDrillProject)}','${esc(backupDrillEnv)}')">Create Backup</button>`;
+  h += `<button class="btn btn-ghost btn-sm" style="color:#a78bfa;border-color:rgba(167,139,250,0.25);" onclick="uploadOffsiteBackup('${esc(backupDrillProject)}','${esc(backupDrillEnv)}')">Upload to Offsite</button>`;
+  if (filtered.some(b => b.hasOffsite && !b.hasLocal)) {
+    h += `<button class="btn btn-ghost btn-sm" style="color:#34d399;border-color:rgba(52,211,153,0.25);" onclick="downloadOffsiteBackup('${esc(backupDrillProject)}','${esc(backupDrillEnv)}')">Download from Offsite</button>`;
+  }
+  h += `</div>`;
+
   // Selection action bar
   h += `<div id="backup-selection-bar" class="selection-bar" style="display:${selectedBackups.size > 0 ? 'flex' : 'none'};">`;
   h += `<span id="selection-count">${selectedBackups.size} selected</span>`;
@@ -782,7 +822,10 @@
         const checked = selectedBackups.has(b.name) ? ' checked' : '';
         const deleteBtn = `<button class="btn btn-ghost btn-xs" style="color:#f87171;border-color:#7f1d1d;" onclick="deleteBackup('${esc(b.project)}','${esc(b.env)}','${esc(b.name)}',${b.hasLocal},${b.hasOffsite})">Delete</button>`;
         const uploadBtn = (b.hasLocal && !b.hasOffsite)
-          ? `<button class="btn btn-ghost btn-xs" style="color:#a78bfa;border-color:rgba(167,139,250,0.25);" onclick="uploadOffsiteBackup('${esc(b.project)}','${esc(b.env)}')">Upload</button>`
+          ? `<button class="btn btn-ghost btn-xs" style="color:#a78bfa;border-color:rgba(167,139,250,0.25);" onclick="uploadOffsiteBackup('${esc(b.project)}','${esc(b.env)}','${esc(b.name)}')">Upload</button>`
+          : '';
+        const downloadBtn = (!b.hasLocal && b.hasOffsite)
+          ? `<button class="btn btn-ghost btn-xs" style="color:#34d399;border-color:rgba(52,211,153,0.25);" onclick="downloadOffsiteBackup('${esc(b.project)}','${esc(b.env)}','${esc(b.name)}')">Download</button>`
           : '';
         h += `<tr>
           <td style="padding-left:0.75rem;"><input type="checkbox" class="backup-cb" value="${esc(b.name)}"${checked} onclick="toggleBackupSelect('${esc(b.name)}')" style="accent-color:#3b82f6;cursor:pointer;"></td>
@@ -792,6 +835,7 @@
           <td style="white-space:nowrap;">
             <button class="btn btn-danger btn-xs" onclick="openRestoreModal('${esc(b.project)}','${esc(b.env)}','${restoreSource}','${esc(b.name)}',${b.hasLocal},${b.hasOffsite})">Restore</button>
             ${uploadBtn}
+            ${downloadBtn}
             ${deleteBtn}
           </td>
         </tr>`;
@@ -870,7 +914,7 @@
     if (allOffsite) target = 'offsite';
   }
   const label = target === 'both' ? 'local + offsite' : target;
-  if (!confirm(`Delete ${names.length} backup${names.length > 1 ? 's' : ''} (${label})?\n\nThis cannot be undone.`)) return;
+  if (!await showConfirmDialog(`Delete ${names.length} backup${names.length > 1 ? 's' : ''} (${label})?\n\nThis cannot be undone.`, 'Delete', true)) return;
   toast(`Deleting ${names.length} backups (${label})...`, 'info');
   let ok = 0, fail = 0;
   for (const name of names) {
@@ -885,15 +929,153 @@
   if (currentPage === 'backups') renderBackups();
 }
 
-async function uploadOffsiteBackup(project, env) {
-  if (!confirm(`Upload latest ${project}/${env} backup to offsite storage?`)) return;
-  toast('Uploading to offsite...', 'info');
-  try {
-    await api(`/api/backups/offsite/upload/${encodeURIComponent(project)}/${encodeURIComponent(env)}`, { method: 'POST' });
-    toast('Offsite upload complete for ' + project + '/' + env, 'success');
-    cachedBackups = null;
-    if (currentPage === 'backups') renderBackups();
-  } catch (e) { toast('Upload failed: ' + e.message, 'error'); }
+async function uploadOffsiteBackup(project, env, name) {
+  const label = name ? name : `latest ${project}/${env}`;
+  if (!await showConfirmDialog(`Upload ${label} to offsite storage?`, 'Upload')) return;
+
+  // Open the ops modal with streaming output
+  opsCtx = { type: 'upload', project, fromEnv: env, toEnv: null };
+  if (opsEventSource) { opsEventSource.close(); opsEventSource = null; }
+
+  const title = document.getElementById('ops-modal-title');
+  const info = document.getElementById('ops-modal-info');
+  const startBtn = document.getElementById('ops-start-btn');
+  const dryRunRow = document.getElementById('ops-dry-run-row');
+  const outputDiv = document.getElementById('ops-modal-output');
+  const term = document.getElementById('ops-modal-terminal');
+
+  title.textContent = 'Upload to Offsite';
+  let infoHtml = '<div class="restore-info-row"><span class="restore-info-label">Project</span><span class="restore-info-value">' + esc(project) + '</span></div>'
+    + '<div class="restore-info-row"><span class="restore-info-label">Environment</span><span class="restore-info-value">' + esc(env) + '</span></div>';
+  if (name) infoHtml += '<div class="restore-info-row"><span class="restore-info-label">File</span><span class="restore-info-value mono">' + esc(name) + '</span></div>';
+  info.innerHTML = infoHtml;
+  if (dryRunRow) dryRunRow.style.display = 'none';
+  startBtn.style.display = 'none';
+
+  outputDiv.style.display = 'block';
+  term.textContent = 'Starting upload...\n';
+  currentOpId = null;
+  _setProgressState('ops-progress-bar', 'running');
+
+  document.getElementById('ops-modal').style.display = 'flex';
+
+  let url = '/api/backups/offsite/stream/' + encodeURIComponent(project) + '/' + encodeURIComponent(env) + '?token=' + encodeURIComponent(getToken());
+  if (name) url += '&name=' + encodeURIComponent(name);
+  const es = new EventSource(url);
+  opsEventSource = es;
+
+  es.onmessage = function(e) {
+    try {
+      const d = JSON.parse(e.data);
+      if (d.op_id && !currentOpId) { currentOpId = d.op_id; return; }
+      if (d.done) {
+        es.close();
+        opsEventSource = null;
+        currentOpId = null;
+        const msg = d.cancelled ? '\n--- Cancelled ---\n' : d.success ? '\n--- Upload complete ---\n' : '\n--- Upload FAILED ---\n';
+        term.textContent += msg;
+        term.scrollTop = term.scrollHeight;
+        toast(d.cancelled ? 'Upload cancelled' : d.success ? 'Offsite upload complete for ' + project + '/' + env : 'Upload failed', d.success ? 'success' : d.cancelled ? 'warning' : 'error');
+        _setProgressState('ops-progress-bar', d.success ? 'ok' : 'fail');
+        cachedBackups = null;
+        if (d.success && currentPage === 'backups') renderBackups();
+        return;
+      }
+      if (d.line) {
+        term.textContent += d.line + '\n';
+        term.scrollTop = term.scrollHeight;
+      }
+    } catch (_) {}
+  };
+
+  es.onerror = function() {
+    es.close();
+    opsEventSource = null;
+    currentOpId = null;
+    term.textContent += '\n--- Connection lost ---\n';
+    toast('Connection lost', 'error');
+    _setProgressState('ops-progress-bar', 'fail');
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Offsite Download (download to local storage, no restore)
+// ---------------------------------------------------------------------------
+async function downloadOffsiteBackup(project, env, name) {
+  const label = name ? name : `latest offsite backup for ${project}/${env}`;
+  if (!name) {
+    // No specific name: find the latest offsite-only backup for this env
+    const latest = cachedBackups && cachedBackups.find(b => b.project === project && b.env === env && b.hasOffsite && !b.hasLocal);
+    if (!latest) {
+      toast('No offsite-only backup found for ' + project + '/' + env, 'warning');
+      return;
+    }
+    name = latest.name;
+  }
+  if (!await showConfirmDialog(`Download "${name}" from offsite to local storage?`, 'Download')) return;
+
+  // Open the ops modal with streaming output
+  opsCtx = { type: 'download', project, fromEnv: env, toEnv: null };
+  if (opsEventSource) { opsEventSource.close(); opsEventSource = null; }
+
+  const title = document.getElementById('ops-modal-title');
+  const info = document.getElementById('ops-modal-info');
+  const startBtn = document.getElementById('ops-start-btn');
+  const dryRunRow = document.getElementById('ops-dry-run-row');
+  const outputDiv = document.getElementById('ops-modal-output');
+  const term = document.getElementById('ops-modal-terminal');
+
+  title.textContent = 'Download from Offsite';
+  let infoHtml = '<div class="restore-info-row"><span class="restore-info-label">Project</span><span class="restore-info-value">' + esc(project) + '</span></div>'
+    + '<div class="restore-info-row"><span class="restore-info-label">Environment</span><span class="restore-info-value">' + esc(env) + '</span></div>'
+    + '<div class="restore-info-row"><span class="restore-info-label">File</span><span class="restore-info-value mono">' + esc(name) + '</span></div>';
+  info.innerHTML = infoHtml;
+  if (dryRunRow) dryRunRow.style.display = 'none';
+  startBtn.style.display = 'none';
+
+  outputDiv.style.display = 'block';
+  term.textContent = 'Starting download...\n';
+  currentOpId = null;
+  _setProgressState('ops-progress-bar', 'running');
+
+  document.getElementById('ops-modal').style.display = 'flex';
+
+  const url = '/api/backups/offsite/download/stream/' + encodeURIComponent(project) + '/' + encodeURIComponent(env) + '?name=' + encodeURIComponent(name) + '&token=' + encodeURIComponent(getToken());
+  const es = new EventSource(url);
+  opsEventSource = es;
+
+  es.onmessage = function(e) {
+    try {
+      const d = JSON.parse(e.data);
+      if (d.op_id && !currentOpId) { currentOpId = d.op_id; return; }
+      if (d.done) {
+        es.close();
+        opsEventSource = null;
+        currentOpId = null;
+        const msg = d.cancelled ? '\n--- Cancelled ---\n' : d.success ? '\n--- Download complete ---\n' : '\n--- Download FAILED ---\n';
+        term.textContent += msg;
+        term.scrollTop = term.scrollHeight;
+        toast(d.cancelled ? 'Download cancelled' : d.success ? 'Downloaded ' + (d.name || name) + ' to local storage' : 'Download failed', d.success ? 'success' : d.cancelled ? 'warning' : 'error');
+        _setProgressState('ops-progress-bar', d.success ? 'ok' : 'fail');
+        cachedBackups = null;
+        if (d.success && currentPage === 'backups') renderBackups();
+        return;
+      }
+      if (d.line) {
+        term.textContent += d.line + '\n';
+        term.scrollTop = term.scrollHeight;
+      }
+    } catch (_) {}
+  };
+
+  es.onerror = function() {
+    es.close();
+    opsEventSource = null;
+    currentOpId = null;
+    term.textContent += '\n--- Connection lost ---\n';
+    toast('Connection lost', 'error');
+    _setProgressState('ops-progress-bar', 'fail');
+  };
 }
 
 // ---------------------------------------------------------------------------
@@ -943,7 +1125,12 @@
 }
 
 function closeRestoreModal() {
+  if (currentOpId && restoreEventSource) {
+    fetch('/api/operations/' + currentOpId, { method: 'DELETE', headers: { 'Authorization': 'Bearer ' + getToken() } }).catch(() => {});
+  }
   if (restoreEventSource) { restoreEventSource.close(); restoreEventSource = null; }
+  currentOpId = null;
+  _setProgressState('restore-progress-bar', 'hidden');
   document.getElementById('restore-modal').style.display = 'none';
   restoreCtx = { project: null, env: null, source: null, name: null };
 }
@@ -975,19 +1162,25 @@
   const modeEl = document.querySelector('input[name="restore-mode"]:checked');
   const mode = modeEl ? modeEl.value : 'full';
   const url = `/api/restore/${encodeURIComponent(project)}/${encodeURIComponent(env)}?source=${encodeURIComponent(source)}${dryRun ? '&dry_run=true' : ''}&token=${encodeURIComponent(getToken())}${name ? '&name=' + encodeURIComponent(name) : ''}&mode=${encodeURIComponent(mode)}`;
+  currentOpId = null;
+  _setProgressState('restore-progress-bar', 'running');
   const es = new EventSource(url);
   restoreEventSource = es;
 
   es.onmessage = function(e) {
     try {
       const d = JSON.parse(e.data);
+      if (d.op_id && !currentOpId) { currentOpId = d.op_id; return; }
       if (d.done) {
         es.close();
         restoreEventSource = null;
-        const msg = d.success ? '\n--- Restore complete ---\n' : '\n--- Restore FAILED ---\n';
+        currentOpId = null;
+        const msg = d.cancelled ? '\n--- Cancelled ---\n' : d.success ? '\n--- Restore complete ---\n' : '\n--- Restore FAILED ---\n';
         term.textContent += msg;
         term.scrollTop = term.scrollHeight;
-        toast(d.success ? 'Restore completed' : 'Restore failed', d.success ? 'success' : 'error');
+        const toastMsg = d.cancelled ? 'Restore cancelled' : d.success ? 'Restore completed' : 'Restore failed';
+        toast(toastMsg, d.success ? 'success' : d.cancelled ? 'warning' : 'error');
+        _setProgressState('restore-progress-bar', d.success ? 'ok' : 'fail');
         startBtn.disabled = false;
         startBtn.textContent = 'Start Restore';
         return;
@@ -1002,8 +1195,10 @@
   es.onerror = function() {
     es.close();
     restoreEventSource = null;
+    currentOpId = null;
     term.textContent += '\n--- Connection lost ---\n';
     toast('Connection lost', 'error');
+    _setProgressState('restore-progress-bar', 'fail');
     startBtn.disabled = false;
     startBtn.textContent = 'Start Restore';
   };
@@ -1120,6 +1315,220 @@
 }
 
 // ---------------------------------------------------------------------------
+// Schedules Page
+// ---------------------------------------------------------------------------
+let cachedSchedules = null;
+
+// Render the "Backup Schedules" page: fetch /api/schedule/ and build one
+// table row per non-static project. The raw response is kept in
+// `cachedSchedules` so openScheduleEdit() can populate its form without
+// a second fetch.
+async function renderSchedules() {
+  updateBreadcrumbs();
+  const c = document.getElementById('page-content');
+  try {
+    const schedules = await api('/api/schedule/');
+    cachedSchedules = schedules;
+
+    let h = '<div class="page-enter">';
+    h += '<h2 style="font-size:1.125rem;font-weight:600;color:#f3f4f6;margin-bottom:0.75rem;">Backup Schedules</h2>';
+    h += '<p style="font-size:0.8125rem;color:#6b7280;margin-bottom:1rem;">Managed via registry.yaml. Changes regenerate systemd timers on the server.</p>';
+
+    h += '<div class="table-wrapper"><table class="ops-table"><thead><tr>'
+       + '<th>Project</th><th>Enabled</th><th>Schedule</th><th>Environments</th>'
+       + '<th>Offsite</th><th>Retention</th><th></th>'
+       + '</tr></thead><tbody>';
+
+    for (const s of schedules) {
+      if (s.static) continue; // skip static sites
+
+      const enabled = s.enabled;
+      const enabledBadge = enabled
+        ? '<span class="badge badge-green">On</span>'
+        : '<span class="badge badge-gray">Off</span>';
+      // '\u2014' (em dash) stands in for missing values throughout.
+      const schedule = s.schedule || '\u2014';
+      const envs = (s.backup_environments || s.environments || []).join(', ') || '\u2014';
+      const offsiteBadge = s.offsite
+        ? '<span class="badge badge-blue" style="background:rgba(59,130,246,0.15);color:#60a5fa;border-color:rgba(59,130,246,0.3);">Yes</span>'
+        : '<span class="badge badge-gray">No</span>';
+      // `!= null` keeps a retention of 0 visible (only null/undefined hide it).
+      const retLocal = s.retention_local_days != null ? s.retention_local_days + 'd local' : '';
+      const retOffsite = s.retention_offsite_days != null ? s.retention_offsite_days + 'd offsite' : '';
+      const retention = [retLocal, retOffsite].filter(Boolean).join(', ') || '\u2014';
+
+      // Edit/Run only make sense when the project has backup tooling.
+      const canEdit = s.has_backup_dir || s.has_cli;
+      const editBtn = canEdit
+        ? `<button class="btn btn-ghost btn-xs" onclick="openScheduleEdit('${esc(s.project)}')">Edit</button>`
+        : '<span style="color:#4b5563;font-size:0.75rem;">n/a</span>';
+      const runBtn = canEdit
+        ? `<button class="btn btn-ghost btn-xs" onclick="runBackupNow('${esc(s.project)}')">Run Now</button>`
+        : '';
+
+      h += `<tr>
+        <td style="font-weight:500;">${esc(s.project)}</td>
+        <td>${enabledBadge}</td>
+        <td class="mono">${esc(schedule)}</td>
+        <td>${esc(envs)}</td>
+        <td>${offsiteBadge}</td>
+        <td style="font-size:0.8125rem;color:#9ca3af;">${esc(retention)}</td>
+        <td style="display:flex;gap:0.25rem;">${editBtn} ${runBtn}</td>
+      </tr>`;
+    }
+    h += '</tbody></table></div>';
+    h += '</div>';
+    c.innerHTML = h;
+  } catch (e) {
+    c.innerHTML = '<div class="card" style="color:#f87171;">Failed to load schedules: ' + esc(e.message) + '</div>';
+  }
+}
+
+let _schedClockInterval = null;
+function _startScheduleClock() {
+  _stopScheduleClock();
+  const el = document.getElementById('sched-server-clock');
+  const tick = () => {
+    const now = new Date();
+    el.textContent = 'Server now: ' + now.toISOString().slice(11, 19) + ' UTC';
+  };
+  tick();
+  _schedClockInterval = setInterval(tick, 1000);
+}
+function _stopScheduleClock() {
+  if (_schedClockInterval) { clearInterval(_schedClockInterval); _schedClockInterval = null; }
+}
+
+// Open the schedule-edit modal for `project`, populating the form from
+// the record cached by renderSchedules(). Silently does nothing when
+// the project is not in the cache.
+function openScheduleEdit(project) {
+  const s = (cachedSchedules || []).find(x => x.project === project);
+  if (!s) return;
+
+  // One checkbox per environment; pre-check the ones currently selected
+  // for backups (falls back to all environments when none are set).
+  const envOptions = (s.environments || []).map(e => {
+    const checked = (s.backup_environments || s.environments || []).includes(e) ? 'checked' : '';
+    return `<label style="display:flex;align-items:center;gap:0.375rem;font-size:0.875rem;color:#d1d5db;cursor:pointer;">
+      <input type="checkbox" name="sched-env" value="${esc(e)}" ${checked} style="accent-color:#3b82f6;"> ${esc(e)}
+    </label>`;
+  }).join('');
+
+  // Same for the offsite copy; defaults to just 'prod' when the record
+  // has no explicit offsite_envs.
+  const offsiteEnvOptions = (s.environments || []).map(e => {
+    const checked = (s.offsite_envs || ['prod']).includes(e) ? 'checked' : '';
+    return `<label style="display:flex;align-items:center;gap:0.375rem;font-size:0.875rem;color:#d1d5db;cursor:pointer;">
+      <input type="checkbox" name="sched-offsite-env" value="${esc(e)}" ${checked} style="accent-color:#3b82f6;"> ${esc(e)}
+    </label>`;
+  }).join('');
+
+  const modal = document.getElementById('schedule-modal');
+  document.getElementById('schedule-modal-title').textContent = 'Edit Schedule: ' + project;
+  document.getElementById('sched-project').value = project;
+  document.getElementById('sched-enabled').checked = s.enabled;
+  document.getElementById('sched-time').value = s.schedule || '03:00';
+  document.getElementById('sched-envs').innerHTML = envOptions;
+  document.getElementById('sched-command').value = s.command || '';
+  document.getElementById('sched-offsite').checked = s.offsite;
+  document.getElementById('sched-offsite-envs').innerHTML = offsiteEnvOptions;
+  // Offsite options are only visible while offsite is enabled (kept in
+  // sync afterwards by toggleOffsiteSection()).
+  document.getElementById('sched-offsite-section').style.display = s.offsite ? '' : 'none';
+  document.getElementById('sched-retention-local').value = s.retention_local_days != null ? s.retention_local_days : 7;
+  document.getElementById('sched-retention-offsite').value = s.retention_offsite_days != null ? s.retention_offsite_days : 30;
+  // Reset the Save button in case a previous save left it disabled.
+  document.getElementById('sched-save-btn').disabled = false;
+  document.getElementById('sched-save-btn').textContent = 'Save';
+  _startScheduleClock();
+  modal.style.display = 'flex';
+}
+
+function closeScheduleModal() {
+  _stopScheduleClock();
+  document.getElementById('schedule-modal').style.display = 'none';
+}
+
+function toggleOffsiteSection() {
+  const show = document.getElementById('sched-offsite').checked;
+  document.getElementById('sched-offsite-section').style.display = show ? '' : 'none';
+}
+
+async function saveSchedule() {
+  const project = document.getElementById('sched-project').value;
+  const btn = document.getElementById('sched-save-btn');
+  btn.disabled = true;
+  btn.textContent = 'Saving...';
+
+  const envCheckboxes = document.querySelectorAll('input[name="sched-env"]:checked');
+  const environments = Array.from(envCheckboxes).map(cb => cb.value);
+  const offsiteEnvCheckboxes = document.querySelectorAll('input[name="sched-offsite-env"]:checked');
+  const offsite_envs = Array.from(offsiteEnvCheckboxes).map(cb => cb.value);
+
+  const body = {
+    enabled: document.getElementById('sched-enabled').checked,
+    schedule: document.getElementById('sched-time').value,
+    environments: environments.length ? environments : null,
+    command: document.getElementById('sched-command').value || null,
+    offsite: document.getElementById('sched-offsite').checked,
+    offsite_envs: offsite_envs.length ? offsite_envs : null,
+    retention_local_days: parseInt(document.getElementById('sched-retention-local').value) || null,
+    retention_offsite_days: parseInt(document.getElementById('sched-retention-offsite').value) || null,
+  };
+
+  try {
+    await api('/api/schedule/' + encodeURIComponent(project), {
+      method: 'PUT',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify(body),
+    });
+    toast('Schedule updated for ' + project, 'success');
+    closeScheduleModal();
+    cachedSchedules = null;
+    renderSchedules();
+  } catch (e) {
+    toast('Failed to save schedule: ' + e.message, 'error');
+    btn.disabled = false;
+    btn.textContent = 'Save';
+  }
+}
+
+async function runBackupNow(project) {
+  if (!await showConfirmDialog(`Run backup now for ${project}?`, 'Run Backup')) return;
+
+  opsCtx = { type: 'backup', project, fromEnv: null, toEnv: null };
+  if (opsEventSource) { opsEventSource.close(); opsEventSource = null; }
+
+  const title = document.getElementById('ops-modal-title');
+  const info = document.getElementById('ops-modal-info');
+  const startBtn = document.getElementById('ops-start-btn');
+  const dryRunRow = document.getElementById('ops-dry-run-row');
+  const outputDiv = document.getElementById('ops-modal-output');
+  const term = document.getElementById('ops-modal-terminal');
+
+  title.textContent = 'Backup: ' + project;
+  info.innerHTML = '<div class="restore-info-row"><span class="restore-info-label">Project</span><span class="restore-info-value">' + esc(project) + '</span></div>';
+  if (dryRunRow) dryRunRow.style.display = 'none';
+  startBtn.style.display = 'none';
+
+  outputDiv.style.display = 'block';
+  term.textContent = 'Starting backup...\n';
+  currentOpId = null;
+  _setProgressState('ops-progress-bar', 'running');
+
+  document.getElementById('ops-modal').style.display = 'flex';
+
+  const url = '/api/schedule/' + encodeURIComponent(project) + '/run?token=' + encodeURIComponent(getToken());
+  const es = new EventSource(url);
+  opsEventSource = es;
+
+  es.onmessage = function(e) {
+    try {
+      const d = JSON.parse(e.data);
+      if (d.op_id && !currentOpId) { currentOpId = d.op_id; return; }
+      if (d.done) {
+        es.close();
+        opsEventSource = null;
+        _setProgressState('ops-progress-bar', d.success ? 'done' : 'error');
+        if (d.cancelled) term.textContent += '\n--- Cancelled ---\n';
+        else if (d.success) term.textContent += '\n--- Done ---\n';
+        else term.textContent += '\n--- Failed ---\n';
+        return;
+      }
+      if (d.line != null) {
+        term.textContent += d.line + '\n';
+        term.scrollTop = term.scrollHeight;
+      }
+    } catch {}
+  };
+  es.onerror = function() { es.close(); opsEventSource = null; _setProgressState('ops-progress-bar', 'error'); };
+}
+
+// ---------------------------------------------------------------------------
 // Operations Page
 // ---------------------------------------------------------------------------
 async function renderOperations() {
@@ -1148,7 +1557,7 @@
   for (const [name, cfg] of Object.entries(projects)) {
     if (!cfg.promote || cfg.static || cfg.infrastructure) continue;
     const pType = cfg.promote.type || 'unknown';
-    const envs = cfg.environments || [];
+    const envs = (cfg.environments || []).map(e => typeof e === 'string' ? e : e.name);
     const typeBadge = pType === 'git'
       ? '<span class="badge badge-blue" style="font-size:0.6875rem;">git</span>'
       : '<span class="badge badge-purple" style="font-size:0.6875rem;">rsync</span>';
@@ -1187,7 +1596,7 @@
 
   for (const [name, cfg] of Object.entries(projects)) {
     if (!cfg.has_cli || cfg.static || cfg.infrastructure) continue;
-    const envs = cfg.environments || [];
+    const envs = (cfg.environments || []).map(e => typeof e === 'string' ? e : e.name);
 
     h += '<div class="card">';
     h += '<div style="margin-bottom:0.75rem;font-weight:600;color:#f3f4f6;">' + esc(name) + '</div>';
@@ -1215,15 +1624,14 @@
 
   // Section: Container Lifecycle
   h += '<h2 style="font-size:1.125rem;font-weight:600;color:#f3f4f6;margin-bottom:0.375rem;">Container Lifecycle</h2>';
-  h += '<p style="font-size:0.8125rem;color:#9ca3af;margin-bottom:1rem;">Manage container state via Coolify API. '
+  h += '<p style="font-size:0.8125rem;color:#9ca3af;margin-bottom:1rem;">Manage container state via docker compose. '
     + '<span style="color:#6ee7b7;">Restart</span> is safe. '
-    + '<span style="color:#fbbf24;">Rebuild</span> refreshes the image. '
-    + '<span style="color:#f87171;">Recreate</span> wipes data (disaster recovery only).</p>';
-  h += '<div class="grid-auto" style="margin-bottom:2rem;">';
+    + '<span style="color:#fbbf24;">Rebuild</span> refreshes the image.</p>';
+  h += '<div style="display:grid;grid-template-columns:repeat(auto-fill,minmax(320px,1fr));gap:1rem;margin-bottom:2rem;">';
 
   for (const [name, cfg] of Object.entries(projects)) {
-    if (cfg.static || cfg.infrastructure || !cfg.has_coolify) continue;
-    const envs = (cfg.environments || []).filter(e => e !== 'infra');
+    if (cfg.type === 'static' || cfg.type === 'infrastructure') continue;
+    const envs = (cfg.environments || []).map(e => typeof e === 'string' ? e : e.name).filter(e => e !== 'infra');
     if (!envs.length) continue;
 
     h += '<div class="card">';
@@ -1231,21 +1639,25 @@
     h += '<div style="display:flex;flex-direction:column;gap:0.625rem;">';
 
     for (const env of envs) {
-      h += '<div style="display:flex;align-items:center;gap:0.5rem;">';
+      h += '<div style="display:flex;align-items:center;gap:0.375rem;">';
       // Environment label
-      h += '<span style="min-width:2.5rem;font-size:0.75rem;color:#9ca3af;font-weight:500;">' + esc(env) + '</span>';
+      h += '<span style="min-width:2.25rem;font-size:0.75rem;color:#9ca3af;font-weight:500;">' + esc(env) + '</span>';
       // Restart (green)
-      h += '<button class="btn btn-ghost btn-xs" style="color:#6ee7b7;border-color:rgba(110,231,179,0.3);" '
+      h += '<button class="btn btn-ghost btn-xs" style="color:#6ee7b7;border-color:rgba(110,231,179,0.3);padding:0.125rem 0.375rem;font-size:0.6875rem;" '
         + 'onclick="openLifecycleModal(&apos;restart&apos;,&apos;' + esc(name) + '&apos;,&apos;' + esc(env) + '&apos;)">'
         + 'Restart</button>';
       // Rebuild (yellow)
-      h += '<button class="btn btn-ghost btn-xs" style="color:#fbbf24;border-color:rgba(251,191,36,0.3);" '
+      h += '<button class="btn btn-ghost btn-xs" style="color:#fbbf24;border-color:rgba(251,191,36,0.3);padding:0.125rem 0.375rem;font-size:0.6875rem;" '
         + 'onclick="openLifecycleModal(&apos;rebuild&apos;,&apos;' + esc(name) + '&apos;,&apos;' + esc(env) + '&apos;)">'
         + 'Rebuild</button>';
-      // Recreate (red)
-      h += '<button class="btn btn-ghost btn-xs" style="color:#f87171;border-color:rgba(248,113,113,0.3);" '
-        + 'onclick="openLifecycleModal(&apos;recreate&apos;,&apos;' + esc(name) + '&apos;,&apos;' + esc(env) + '&apos;)">'
-        + 'Recreate</button>';
+      // Backup (blue)
+      h += '<button class="btn btn-ghost btn-xs" style="color:#60a5fa;border-color:rgba(96,165,250,0.3);padding:0.125rem 0.375rem;font-size:0.6875rem;" '
+        + 'onclick="openLifecycleModal(&apos;backup&apos;,&apos;' + esc(name) + '&apos;,&apos;' + esc(env) + '&apos;)">'
+        + 'Backup</button>';
+      // Restore (navigate to backups page)
+      h += '<button class="btn btn-ghost btn-xs" style="color:#a78bfa;border-color:rgba(167,139,250,0.3);padding:0.125rem 0.375rem;font-size:0.6875rem;" '
+        + 'onclick="currentPage=&apos;backups&apos;;backupDrillLevel=2;backupDrillProject=&apos;' + esc(name) + '&apos;;backupDrillEnv=&apos;' + esc(env) + '&apos;;cachedBackups=null;selectedBackups.clear();document.querySelectorAll(&apos;#sidebar-nav .sidebar-link&apos;).forEach(el=>el.classList.toggle(&apos;active&apos;,el.dataset.page===&apos;backups&apos;));renderPage();pushHash();">'
+        + 'Restore</button>';
       h += '</div>';
     }
 
@@ -1353,7 +1765,7 @@
 }
 
 // ---------------------------------------------------------------------------
-// Lifecycle Modal (Restart / Rebuild / Recreate)
+// Lifecycle Modal (Restart / Rebuild / Backup)
 // ---------------------------------------------------------------------------
 function openLifecycleModal(action, project, env) {
   opsCtx = { type: action, project, fromEnv: env, toEnv: null };
@@ -1385,61 +1797,49 @@
       + '<div class="restore-info-row"><span class="restore-info-label">Project</span><span class="restore-info-value">' + esc(project) + '</span></div>'
       + '<div class="restore-info-row"><span class="restore-info-label">Environment</span><span class="restore-info-value">' + esc(env) + '</span></div>'
       + '<div style="background:rgba(251,191,36,0.08);border:1px solid rgba(251,191,36,0.25);border-radius:0.5rem;padding:0.625rem 0.875rem;font-size:0.8125rem;color:#fde68a;margin-top:0.75rem;">'
-      + 'Stops containers via Coolify, rebuilds the Docker image, then starts again. No data loss.</div>';
+      + 'Runs <code>docker compose down</code>, rebuilds the image, then starts again. No data loss.</div>';
     startBtn.className = 'btn btn-sm';
     startBtn.style.cssText = 'background:#78350f;color:#fde68a;border:1px solid rgba(251,191,36,0.3);';
     startBtn.textContent = 'Rebuild';
 
-  } else if (action === 'recreate') {
-    title.textContent = 'Recreate Environment';
+  } else if (action === 'backup') {
+    title.textContent = 'Create Backup';
     info.innerHTML = ''
       + '<div class="restore-info-row"><span class="restore-info-label">Project</span><span class="restore-info-value">' + esc(project) + '</span></div>'
       + '<div class="restore-info-row"><span class="restore-info-label">Environment</span><span class="restore-info-value">' + esc(env) + '</span></div>'
-      + '<div style="background:rgba(220,38,38,0.1);border:1px solid rgba(220,38,38,0.3);border-radius:0.5rem;padding:0.75rem 1rem;font-size:0.8125rem;color:#fca5a5;margin-top:0.75rem;">'
-      + '<strong style="display:block;margin-bottom:0.375rem;">DESTRUCTIVE — Disaster Recovery Only</strong>'
-      + 'Stops containers, wipes all data volumes, rebuilds image, starts fresh. '
-      + 'You must restore a backup afterwards.</div>'
-      + '<div style="margin-top:0.875rem;">'
-      + '<label style="font-size:0.8125rem;color:#9ca3af;display:block;margin-bottom:0.375rem;">Type the environment name to confirm:</label>'
-      + '<input id="recreate-confirm-input" type="text" placeholder="' + esc(env) + '" '
-      + 'style="width:100%;box-sizing:border-box;padding:0.5rem 0.75rem;background:#1f2937;border:1px solid rgba(220,38,38,0.4);border-radius:0.375rem;color:#f3f4f6;font-size:0.875rem;" '
-      + 'oninput="checkRecreateConfirm(\'' + esc(env) + '\')">'
-      + '</div>';
-    startBtn.className = 'btn btn-danger btn-sm';
+      + '<div style="background:rgba(59,130,246,0.08);border:1px solid rgba(59,130,246,0.25);border-radius:0.5rem;padding:0.625rem 0.875rem;font-size:0.8125rem;color:#93c5fd;margin-top:0.75rem;">'
+      + 'Creates a backup of the database and uploads for this environment.</div>';
+    startBtn.className = 'btn btn-primary btn-sm';
     startBtn.style.cssText = '';
-    startBtn.textContent = 'Recreate';
-    startBtn.disabled = true;  // enabled after typing env name
+    startBtn.textContent = 'Create Backup';
   }
 
   document.getElementById('ops-modal-output').style.display = 'none';
   document.getElementById('ops-modal-terminal').textContent = '';
 
   document.getElementById('ops-modal').style.display = 'flex';
-  if (action === 'recreate') {
-    setTimeout(() => {
-      const inp = document.getElementById('recreate-confirm-input');
-      if (inp) inp.focus();
-    }, 100);
-  }
-}
-
-function checkRecreateConfirm(expectedEnv) {
-  const inp = document.getElementById('recreate-confirm-input');
-  const startBtn = document.getElementById('ops-start-btn');
-  if (!inp || !startBtn) return;
-  startBtn.disabled = inp.value.trim() !== expectedEnv;
 }
 
+// Close the shared ops modal, cancelling any in-flight operation and
+// restoring the modal's default state for the next use.
 function closeOpsModal() {
+  // Closing while a stream is still open: ask the server to cancel the
+  // running operation (fire-and-forget; failures are ignored).
+  if (currentOpId && opsEventSource) {
+    fetch('/api/operations/' + currentOpId, { method: 'DELETE', headers: { 'Authorization': 'Bearer ' + getToken() } }).catch(() => {});
+  }
   if (opsEventSource) { opsEventSource.close(); opsEventSource = null; }
+  currentOpId = null;
+  _setProgressState('ops-progress-bar', 'hidden');
   document.getElementById('ops-modal').style.display = 'none';
+  // Refresh backup list if we just ran a backup or upload
+  if ((opsCtx.type === 'backup' || opsCtx.type === 'upload') && currentPage === 'backups') {
+    cachedBackups = null;
+    renderBackups();
+  }
   opsCtx = { type: null, project: null, fromEnv: null, toEnv: null };
   // Restore dry-run row visibility for promote/sync operations
   const dryRunRow = document.getElementById('ops-dry-run-row');
   if (dryRunRow) dryRunRow.style.display = '';
-  // Reset start button style
+  // Reset start button style and visibility
+  // (backup flows hide the Start button; undo that for the next op)
   const startBtn = document.getElementById('ops-start-btn');
-  if (startBtn) { startBtn.style.cssText = ''; startBtn.disabled = false; }
+  if (startBtn) { startBtn.style.cssText = ''; startBtn.style.display = ''; startBtn.disabled = false; }
 }
 
 function _btnLabelForType(type) {
@@ -1447,7 +1847,7 @@
   if (type === 'sync') return 'Sync';
   if (type === 'restart') return 'Restart';
   if (type === 'rebuild') return 'Rebuild';
-  if (type === 'recreate') return 'Recreate';
+  if (type === 'backup') return 'Create Backup';
   return 'Run';
 }
 
@@ -1461,6 +1861,8 @@
   const term = document.getElementById('ops-modal-terminal');
 
   outputDiv.style.display = 'block';
+  // Remove leftover banners from previous operations
+  outputDiv.querySelectorAll('div').forEach(el => { if (el !== term) el.remove(); });
   term.textContent = 'Starting...\n';
   startBtn.disabled = true;
   startBtn.textContent = 'Running...';
@@ -1470,12 +1872,16 @@
     url = '/api/promote/' + encodeURIComponent(project) + '/' + encodeURIComponent(fromEnv) + '/' + encodeURIComponent(toEnv) + '?dry_run=' + dryRun + '&token=' + encodeURIComponent(getToken());
   } else if (type === 'sync') {
     url = '/api/sync/' + encodeURIComponent(project) + '?from=' + encodeURIComponent(fromEnv) + '&to=' + encodeURIComponent(toEnv) + '&dry_run=' + dryRun + '&token=' + encodeURIComponent(getToken());
-  } else if (type === 'restart' || type === 'rebuild' || type === 'recreate') {
-    // All three lifecycle ops go through /api/rebuild/{project}/{env}?action=...
+  } else if (type === 'restart' || type === 'rebuild') {
     url = '/api/rebuild/' + encodeURIComponent(project) + '/' + encodeURIComponent(fromEnv)
       + '?action=' + encodeURIComponent(type) + '&token=' + encodeURIComponent(getToken());
+  } else if (type === 'backup') {
+    url = '/api/backups/stream/' + encodeURIComponent(project) + '/' + encodeURIComponent(fromEnv)
+      + '?token=' + encodeURIComponent(getToken());
   }
 
+  currentOpId = null;
+  _setProgressState('ops-progress-bar', 'running');
   const es = new EventSource(url);
   opsEventSource = es;
   let opDone = false;
@@ -1483,29 +1889,24 @@
   es.onmessage = function(e) {
     try {
       const d = JSON.parse(e.data);
+      if (d.op_id && !currentOpId) { currentOpId = d.op_id; return; }
       if (d.done) {
         opDone = true;
         es.close();
         opsEventSource = null;
-        const msg = d.success ? '\n--- Operation complete ---\n' : '\n--- Operation FAILED ---\n';
+        currentOpId = null;
+        const msg = d.cancelled ? '\n--- Cancelled ---\n' : d.success ? '\n--- Operation complete ---\n' : '\n--- Operation FAILED ---\n';
         term.textContent += msg;
         term.scrollTop = term.scrollHeight;
-        toast(d.success ? 'Operation completed' : 'Operation failed', d.success ? 'success' : 'error');
+        const toastMsg = d.cancelled ? 'Operation cancelled' : d.success ? 'Operation completed' : 'Operation failed';
+        toast(toastMsg, d.success ? 'success' : d.cancelled ? 'warning' : 'error');
+        _setProgressState('ops-progress-bar', d.success ? 'ok' : 'fail');
         startBtn.disabled = false;
         startBtn.textContent = _btnLabelForType(type);
 
-        // Show "Go to Backups" banner after recreate (or legacy rebuild)
-        const showBackupBanner = (type === 'recreate') && d.success && d.project && d.env;
-        if (showBackupBanner) {
-          const restoreProject = d.project;
-          const restoreEnv = d.env;
-          const banner = document.createElement('div');
-          banner.style.cssText = 'margin-top:1rem;padding:0.75rem 1rem;background:rgba(16,185,129,0.1);border:1px solid rgba(16,185,129,0.3);border-radius:0.5rem;display:flex;align-items:center;gap:0.75rem;';
-          banner.innerHTML = '<span style="color:#6ee7b7;font-size:0.8125rem;flex:1;">Environment recreated. Next step: restore a backup.</span>'
-            + '<button class="btn btn-ghost btn-sm" style="color:#6ee7b7;border-color:rgba(110,231,179,0.3);white-space:nowrap;" '
-            + 'onclick="closeOpsModal();currentPage=\'backups\';backupDrillLevel=2;backupDrillProject=\'' + restoreProject + '\';backupDrillEnv=\'' + restoreEnv + '\';cachedBackups=null;selectedBackups.clear();document.querySelectorAll(\'#sidebar-nav .sidebar-link\').forEach(el=>el.classList.toggle(\'active\',el.dataset.page===\'backups\'));renderPage();pushHash();">'
-            + 'Go to Backups &rarr;</button>';
-          outputDiv.appendChild(banner);
+        // After a successful backup, invalidate cache so backups page refreshes
+        if (type === 'backup' && d.success) {
+          cachedBackups = null;
         }
 
         return;
@@ -1520,9 +1921,11 @@
   es.onerror = function() {
     es.close();
     opsEventSource = null;
+    currentOpId = null;
     if (opDone) return;
     term.textContent += '\n--- Connection lost ---\n';
     toast('Connection lost', 'error');
+    _setProgressState('ops-progress-bar', 'fail');
     startBtn.disabled = false;
     startBtn.textContent = _btnLabelForType(type);
   };
@@ -1532,7 +1935,7 @@
 // Service Actions
 // ---------------------------------------------------------------------------
 async function restartService(project, env, service) {
-  if (!confirm(`Restart ${service} in ${project}/${env}?`)) return;
+  if (!await showConfirmDialog(`Restart ${service} in ${project}/${env}?`, 'Restart')) return;
   toast('Restarting ' + service + '...', 'info');
   try {
     const r = await api(`/api/services/restart/${project}/${env}/${service}`, { method: 'POST' });
@@ -1565,14 +1968,67 @@
 }
 
+// Create a backup for project/env with live streamed output in the
+// shared ops modal (SSE from /api/backups/stream/...). Replaces the old
+// fire-and-forget POST below.
 async function createBackup(project, env) {
-  if (!confirm(`Create backup for ${project}/${env}?`)) return;
-  toast('Creating backup...', 'info');
-  try {
-    await api(`/api/backups/${project}/${env}`, { method: 'POST' });
-    toast('Backup created for ' + project + '/' + env, 'success');
-    cachedBackups = null;
-    if (currentPage === 'backups') renderBackups();
-  } catch (e) { toast('Backup failed: ' + e.message, 'error'); }
+  if (!await showConfirmDialog(`Create backup for ${project}/${env}?`, 'Create Backup')) return;
+
+  // Open the ops modal with streaming output
+  opsCtx = { type: 'backup', project, fromEnv: env, toEnv: null };
+  if (opsEventSource) { opsEventSource.close(); opsEventSource = null; }
+
+  const title = document.getElementById('ops-modal-title');
+  const info = document.getElementById('ops-modal-info');
+  const startBtn = document.getElementById('ops-start-btn');
+  const dryRunRow = document.getElementById('ops-dry-run-row');
+  const outputDiv = document.getElementById('ops-modal-output');
+  const term = document.getElementById('ops-modal-terminal');
+
+  title.textContent = 'Create Backup';
+  info.innerHTML = '<div class="restore-info-row"><span class="restore-info-label">Project</span><span class="restore-info-value">' + esc(project) + '</span></div>'
+    + '<div class="restore-info-row"><span class="restore-info-label">Environment</span><span class="restore-info-value">' + esc(env) + '</span></div>';
+  if (dryRunRow) dryRunRow.style.display = 'none';
+  // Backup starts immediately — hide the Start button for this flow.
+  startBtn.style.display = 'none';
+
+  outputDiv.style.display = 'block';
+  term.textContent = 'Starting backup...\n';
+  currentOpId = null;
+  _setProgressState('ops-progress-bar', 'running');
+
+  document.getElementById('ops-modal').style.display = 'flex';
+
+  const url = '/api/backups/stream/' + encodeURIComponent(project) + '/' + encodeURIComponent(env) + '?token=' + encodeURIComponent(getToken());
+  const es = new EventSource(url);
+  opsEventSource = es;
+
+  es.onmessage = function(e) {
+    try {
+      const d = JSON.parse(e.data);
+      // First event carries the op id so the Cancel path can DELETE it.
+      if (d.op_id && !currentOpId) { currentOpId = d.op_id; return; }
+      if (d.done) {
+        es.close();
+        opsEventSource = null;
+        currentOpId = null;
+        const msg = d.cancelled ? '\n--- Cancelled ---\n' : d.success ? '\n--- Backup complete ---\n' : '\n--- Backup FAILED ---\n';
+        term.textContent += msg;
+        term.scrollTop = term.scrollHeight;
+        toast(d.cancelled ? 'Backup cancelled' : d.success ? 'Backup created for ' + project + '/' + env : 'Backup failed', d.success ? 'success' : d.cancelled ? 'warning' : 'error');
+        _setProgressState('ops-progress-bar', d.success ? 'ok' : 'fail');
+        // Invalidate so closeOpsModal() can re-render a fresh list.
+        cachedBackups = null;
+        return;
+      }
+      // NOTE(review): truthiness check drops empty-string lines, unlike
+      // runBackupNow's `d.line != null` — confirm which is intended.
+      if (d.line) {
+        term.textContent += d.line + '\n';
+        term.scrollTop = term.scrollHeight;
+      }
+    } catch (_) {}
+  };
+
+  es.onerror = function() {
+    es.close();
+    opsEventSource = null;
+    currentOpId = null;
+    term.textContent += '\n--- Connection lost ---\n';
+    toast('Connection lost', 'error');
+    _setProgressState('ops-progress-bar', 'fail');
+  };
 }
 
 async function deleteBackup(project, env, name, hasLocal, hasOffsite) {
@@ -1586,7 +2042,7 @@
     target = 'offsite';
   }
   const label = target === 'both' ? 'local + offsite' : target;
-  if (!confirm(`Delete ${label} copy of ${name}?\n\nThis cannot be undone.`)) return;
+  if (!await showConfirmDialog(`Delete ${label} copy of ${name}?\n\nThis cannot be undone.`, 'Delete', true)) return;
   toast('Deleting backup (' + label + ')...', 'info');
   try {
     await api(`/api/backups/${encodeURIComponent(project)}/${encodeURIComponent(env)}/${encodeURIComponent(name)}?target=${target}`, { method: 'DELETE' });
@@ -1594,6 +2050,35 @@
     cachedBackups = null;
     if (currentPage === 'backups') renderBackups();
   } catch (e) { toast('Delete failed: ' + e.message, 'error'); }
+}
+
+function showConfirmDialog(message, confirmLabel = 'Confirm', isDanger = false) {
+  return new Promise(resolve => {
+    const overlay = document.createElement('div');
+    overlay.style.cssText = 'position:fixed;inset:0;background:rgba(0,0,0,0.6);backdrop-filter:blur(2px);display:flex;align-items:center;justify-content:center;z-index:9999;animation:fadeIn 0.15s ease-out;';
+    const box = document.createElement('div');
+    box.style.cssText = 'background:#1e293b;border:1px solid #334155;border-radius:0.75rem;padding:1.5rem;min-width:320px;max-width:420px;color:#e2e8f0;animation:modalIn 0.2s ease-out;';
+    const btnClass = isDanger ? 'btn btn-danger' : 'btn btn-primary';
+    box.innerHTML = `
+      <p style="margin:0 0 1.25rem;font-size:0.9rem;color:#d1d5db;white-space:pre-line;">${esc(message)}</p>
+      <div style="display:flex;gap:0.75rem;justify-content:flex-end;">
+        <button class="btn btn-ghost" data-action="cancel">Cancel</button>
+        <button class="${btnClass}" data-action="confirm">${esc(confirmLabel)}</button>
+      </div>`;
+    overlay.appendChild(box);
+    document.body.appendChild(overlay);
+    box.addEventListener('click', e => {
+      const btn = e.target.closest('[data-action]');
+      if (!btn) return;
+      document.body.removeChild(overlay);
+      resolve(btn.dataset.action === 'confirm');
+    });
+    overlay.addEventListener('click', e => {
+      if (e.target === overlay) { document.body.removeChild(overlay); resolve(false); }
+    });
+    const onKey = e => { if (e.key === 'Escape') { document.removeEventListener('keydown', onKey); document.body.removeChild(overlay); resolve(false); } };
+    document.addEventListener('keydown', onKey);
+  });
 }
 
 function showDeleteTargetDialog(name) {
@@ -1658,6 +2143,8 @@
     } else {
       hash = '/backups';
     }
+  } else if (currentPage === 'schedules') {
+    hash = '/schedules';
   } else if (currentPage === 'system') {
     hash = '/system';
   } else if (currentPage === 'operations') {
@@ -1708,6 +2195,8 @@
     document.querySelectorAll('#sidebar-nav .sidebar-link').forEach(el =>
       el.classList.toggle('active', el.dataset.page === 'backups'));
     renderPage();
+  } else if (page === 'schedules') {
+    showPage('schedules');
   } else if (page === 'system') {
     showPage('system');
   } else if (page === 'operations') {

--
Gitblit v1.3.1