#!/bin/bash
################################################################################
# nvr — Unified CLI for NVR Docker management
#
# Runs on the HOST. Shipped inside the backend image and synced to
# /opt/dividia/ on container startup, so it stays current with the
# running software version.
#
# Usage: nvr <command> [args...]
################################################################################

# Abort on the first unhandled command failure.
set -o errexit

INSTALL_DIR="/opt/dividia"

# Put /usr/local/{bin,sbin} at the front of PATH.
#
# Why: CO6/CO7 hosts keep docker at /usr/local/bin/docker, a directory
# that sudo's default secure_path leaves out. When nvr runs through
# non-interactive sudo (cron, remote ssh exec) the compose detection
# further down would otherwise fail with "Docker Compose not found".
# On CO9/UB24 hosts docker lives at /usr/bin/docker, so the extra
# directories are harmless there.
#
# Why the `:-` default: with PATH unset or empty, a plain "$PATH" suffix
# would leave a trailing colon, which bash reads as "current directory".
# The script later changes into INSTALL_DIR (/opt/dividia, writable by the
# dividia user per install-nvr.sh), so a trailing colon would let that
# user drop a fake tar/grep/mv into /opt/dividia and have root execute it
# on the next `sudo nvr update` or scheduled cron run.
PATH="/usr/local/bin:/usr/local/sbin:${PATH:-/usr/sbin:/usr/bin:/sbin:/bin}"
export PATH

# Choose the compose front-end. The v2 plugin (`docker compose`) wins when
# it answers `version`; otherwise fall back to the standalone v1 binary
# (`docker-compose`). With neither available the script cannot do anything
# useful, so it stops here.
if docker compose version >/dev/null 2>&1; then
    COMPOSE="docker compose"
elif command -v docker-compose >/dev/null 2>&1; then
    COMPOSE="docker-compose"
else
    echo "ERROR: Docker Compose not found"
    exit 1
fi

# All commands below run from the install directory (they read `.env`
# by relative path).
cd "$INSTALL_DIR"

# ANSI colour sequences, stored as un-interpreted backslash escapes;
# callers render them with `echo -e`.
NC='\033[0m'          # reset
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'

################################################################################
# Commands
#
# Windows-side access to `db / shell / channel <set> / vm-shell /
# migrate-scalewatcher` is gated at the SSH layer: the ro-key in
# ~/.ssh/authorized_keys uses a forced-command wrapper that only lets a
# subset of subcommands through, and the admin key is passphrase-protected
# on the Windows host. On Linux-host Docker installs there is no
# script-level gate — the tech is already authenticated as `dividia` via
# SSH public-key or console login before they ever type `nvr`.
################################################################################

# Print a three-part status report: compose container list, the
# CHANNEL/REGISTRY settings from .env, and the first schedule line of the
# update cron file (or a hint when that file is absent).
cmd_status() {
    local cron_file="/etc/cron.d/dividia-nvr-update"

    echo -e "${BLUE}=== NVR Docker Stack ===${NC}"
    $COMPOSE ps
    echo ""

    echo -e "${BLUE}=== Configuration ===${NC}"
    # A missing .env or no matching keys is not an error for a status view.
    grep -E '^(CHANNEL|REGISTRY)=' .env 2>/dev/null || true
    echo ""

    echo -e "${BLUE}=== Update cron ===${NC}"
    if [[ ! -f "$cron_file" ]]; then
        echo "(update cron not installed; will land on next nvr update)"
    else
        # The trailing `|| true` keeps this line harmless should pipefail
        # ever be enabled: head exits after one line and grep may then
        # die of SIGPIPE (status 141).
        grep -E '^[0-9]+ ' "$cron_file" 2>/dev/null | head -1 || true
    fi
}

# Show container logs. Every argument is handed to `compose logs`
# unchanged (e.g. -f, --tail N, service names).
cmd_logs() {
    set -- logs "$@"
    $COMPOSE "$@"
}

# Pull the latest images, refresh host-side files from the new backend
# image, recreate the stack, prune old images, and retire watchtower.
#
# Steps, in order: take the update lock (root only), (re)install the
# maintenance crons, remove untracked project containers, `compose pull`,
# extract compose files / CLI / ro-wrapper from the backend image,
# `compose up -d --remove-orphans`, prune images, and -- only when
# compose-up succeeded -- stop/remove the watchtower container. Finishes
# by printing `compose ps`.
#
# Arguments: none.
# Exits 0 early when another update already holds the lock. Runs under
# the script-wide `set -e`, so any unguarded failing step aborts the
# script at that point.
cmd_update() {
    # Serialize concurrent invocations: the cron-fired `nvr update` and
    # an operator who types `sudo nvr update` at the same minute must
    # not race. Same-shell FD form: the kernel attaches the lock to
    # FD 9 and auto-releases when this process exits (normal exit,
    # SIGKILL, OOM, hardware reset, anything) — no stale-lockfile risk.
    # Non-blocking `-n`: if the lock is already held, the second
    # invocation exits 0 (no cron email) instead of piling up.
    # Skipped for non-root callers (dev mode, container) so /var/lock
    # permission errors don't break local iteration.
    if [[ $EUID -eq 0 ]]; then
        exec 9>/var/lock/nvr-update.lock
        flock -n 9 || {
            echo "another nvr update in progress, exiting"
            exit 0
        }
    fi

    # Idempotent maintenance crons. Land here (not just install-nvr.sh)
    # so NVRs that were installed before each cron existed pick them up
    # on their next update without needing a re-install.
    ensure_prune_cron
    ensure_update_cron

    # Self-heal containers that compose has lost track of. Without this,
    # a single untracked container aborts `compose up -d` mid-recreate
    # and leaves the stack half-broken in the field. See function for
    # full root-cause notes.
    repair_untracked_compose_containers

    echo "Pulling latest images..."
    $COMPOSE pull --quiet

    # Extract updated compose files + CLI from new backend image
    echo "Extracting updated files from backend image..."
    local image
    # First image reference containing "backend"; empty when compose
    # config fails or nothing matches (the extraction is then skipped).
    image=$($COMPOSE config --images 2>/dev/null | grep backend | head -1)
    if [[ -n "$image" ]]; then
        local cid
        # Created (never started) container used purely as a file source.
        cid=$(docker create "$image" 2>/dev/null) || true
        if [[ -n "$cid" ]]; then
            # Extract files via tar stream so they're owned by calling user
            # (not root). Use atomic write-temp-then-rename: a failed
            # `docker cp | tar -xO` would otherwise truncate the destination
            # to zero bytes via the `>` redirect, leaving (e.g.)
            # /usr/local/bin/nvr-ro-wrap as an empty file — which sshd's
            # forced-command exec can't run, locking out ALL Windows-side
            # ro-key SSH until manual recovery. Verify the temp file is
            # non-empty before swapping it in.
            #
            # Each entry is "<path inside image>:<destination on host>".
            local extract_files=(
                "/usr/share/nvr/compose/docker-compose.yml:$INSTALL_DIR/docker-compose.yml"
                "/usr/share/nvr/compose/docker-compose.prod.yml:$INSTALL_DIR/docker-compose.prod.yml"
                "/usr/share/nvr/bin/nvr:$INSTALL_DIR/nvr"
                "/usr/share/nvr/bin/install-nvr.sh:$INSTALL_DIR/install-nvr.sh"
                "/usr/local/bin/nvr-ro-wrap:/usr/local/bin/nvr-ro-wrap"
            )
            # Platform-specific compose overlays: refresh from the image only
            # when the host already has the file on disk. Presence at install
            # time = host needs this overlay (install-nvr.sh writes it
            # conditionally per OS). Without this, co6.yml / windows.yml fixes
            # never reach customers via `nvr update` — they stay on the
            # version that install-nvr.sh originally downloaded. CO9/UB24
            # hosts have no co6.yml on disk and the entry is skipped, so we
            # don't sprout an unused overlay on the wrong platform.
            local optional_extract_files=(
                "/usr/share/nvr/compose/docker-compose.co6.yml:$INSTALL_DIR/docker-compose.co6.yml"
                "/usr/share/nvr/compose/docker-compose.windows.yml:$INSTALL_DIR/docker-compose.windows.yml"
            )
            for entry in "${optional_extract_files[@]}"; do
                if [[ -f "${entry#*:}" ]]; then
                    extract_files+=("$entry")
                fi
            done
            for entry in "${extract_files[@]}"; do
                # Split at the first ':' — source before, destination after.
                local src="${entry%%:*}" dest="${entry#*:}"
                # Temp file sits next to the destination so the mv below is
                # a rename within one directory.
                local tmp="${dest}.new.$$"
                if docker cp "$cid:$src" - 2>/dev/null | tar -xO > "$tmp" && [[ -s "$tmp" ]]; then
                    mv -f "$tmp" "$dest"
                else
                    rm -f "$tmp" 2>/dev/null || true
                    echo "WARN: extract of $src failed or produced empty file; keeping existing $dest" >&2
                fi
            done
            # Restore the mode on the scripts that were just replaced
            # (best-effort; failures are ignored).
            chmod 555 "$INSTALL_DIR/nvr" "$INSTALL_DIR/install-nvr.sh" 2>/dev/null || true
            chmod 555 /usr/local/bin/nvr-ro-wrap 2>/dev/null || true
            # NOTE(review): unguarded under set -e — a failing `docker rm`
            # here would abort before the restart step below; confirm that
            # is intended.
            docker rm "$cid" > /dev/null
            echo "Files updated from image."
        fi
    fi

    echo "Restarting services..."
    # --remove-orphans: when an NVR migrates off watchtower for the first
    # time, the existing dividia-nvr-watchtower-1 container becomes an
    # orphan of the new compose project (the watchtower service block
    # was stripped from prod.yml in feature/replace-watchtower-with-nvr-cron).
    # Without --remove-orphans, compose emits a "Found orphan containers"
    # warning but leaves it alive — cmd_update_disable_watchtower then has
    # to catch it at the tail of cmd_update. That tail path turned out to
    # be set-e-fragile (cs2427 2026-05-28: chain broke between the
    # ensure_update_cron call above and the disable call, leaving cron
    # installed but watchtower running). With --remove-orphans, compose
    # itself kills the orphan as part of the upgrade.
    #
    # Safe across all dividia-nvr-* compose project owners: --remove-orphans
    # only touches containers labeled com.docker.compose.project=dividia-nvr
    # that aren't declared in the current compose files. Other compose
    # projects on the host (homebridge, dragon-pilot, etc.) carry their
    # own project label and stay untouched.
    local compose_up_ok=0
    # The trailing `|| true` keeps a failed compose-up from tripping set -e;
    # the outcome is carried in compose_up_ok instead.
    $COMPOSE up -d --quiet-pull --remove-orphans && compose_up_ok=1 || true

    # Reclaim disk from images obsoleted by the pull. 168h = 7 days:
    # anything not used for a week is gone. Closes the gap that used
    # to be covered partly by watchtower's WATCHTOWER_CLEANUP=true.
    # `|| true` so a prune failure (transient docker error, race with
    # another process pruning concurrently, etc.) doesn't break the
    # watchtower-drop contract below — that contract is gated on
    # compose-up success, not on cmd_prune success.
    cmd_prune --quiet || true

    # Atomic handoff from watchtower to cron-driven updates. Gated on the
    # explicit compose_up_ok flag, not on `set -e` chain reaching this
    # line. cs2427 2026-05-28: prior version sat at the tail of cmd_update
    # under set -e and silently never ran, even though compose up had
    # succeeded — leaving the migrated NVR half-handed-off (cron in place,
    # watchtower still active). See
    # operational_nvr_update_first_migration_skip_watchtower_drop.md.
    #
    # If compose-up succeeded we know the new stack is at least running
    # (subsequent health checks may still fail, but watchtower can't help
    # with that either — watchtower 1.7.1 is the broken version we're
    # trying to escape). If compose-up failed, leave watchtower in place
    # as the rollback fallback (original atomic-handoff intent).
    if [[ $compose_up_ok -eq 1 ]]; then
        cmd_update_disable_watchtower
    fi

    echo ""
    echo "Update complete!"
    $COMPOSE ps
}

# Install or refresh /etc/cron.d/dividia-docker-prune, the daily image
# prune job. Idempotent: the file is rewritten only when its content
# differs from the body below.
#
# Arguments: none.
# Returns:   0 when the file is already current, was written, or when the
#            caller is neither root nor able to use passwordless sudo;
#            non-zero when the write or chmod fails.
ensure_prune_cron() {
    local cron_path="/etc/cron.d/dividia-docker-prune"
    local cron_body
    cron_body=$(cat <<'EOF'
# Daily Docker image prune for Dividia NVR.
#
# Reclaims disk from images that have been replaced (by watchtower auto-pull,
# by `nvr update`, by `nvr channel <new>` + manual `docker compose pull`, or
# by ad-hoc operator pulls). Two passes:
#   - dangling (no -a): immediately removes <none>:<none> images that were
#     replaced. Safe by definition; nothing references them.
#   - 168h time-filtered (-a): catches still-tagged images that haven't
#     been used in a week.
# Without the dangling pass, active dev iteration on dev/dev-* channels
# accumulates GB of dangling images that the time filter holds for a week.
#
# Owned by /usr/share/nvr/bin/nvr ensure_prune_cron; do not edit by hand.
SHELL=/bin/sh
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
30 3 * * * root (/usr/bin/docker image prune -f; /usr/bin/docker image prune -a -f --filter "until=168h") >/var/log/dividia-docker-prune.log 2>&1
EOF
)
    # The comparison feeds the expected body to cmp on stdin ("-").
    # Process substitution (<(...)) must not be used with sudo: sudo closes
    # every descriptor above stderr before running the command, so the
    # /dev/fd/N path would be unreadable, cmp would always report a
    # difference, and the file would be rewritten on every run.
    if [[ $EUID -ne 0 ]]; then
        # Not root (containers, dev mode): manage the cron only when
        # passwordless sudo is available; otherwise skip silently.
        if ! sudo -n true 2>/dev/null; then return 0; fi
        if [[ -f "$cron_path" ]] && printf '%s\n' "$cron_body" | sudo cmp -s - "$cron_path"; then return 0; fi
        printf '%s\n' "$cron_body" | sudo tee "$cron_path" >/dev/null && \
            sudo chmod 0644 "$cron_path"
    else
        if [[ -f "$cron_path" ]] && printf '%s\n' "$cron_body" | cmp -s - "$cron_path"; then return 0; fi
        printf '%s\n' "$cron_body" > "$cron_path" && chmod 0644 "$cron_path"
    fi
}

# Compute a stable per-host minute jitter in [0,59] so the fleet
# doesn't all hit DockerHub at the same instant. Primary source:
# the bSerial (customer ID) recorded in dvs.conf at install time —
# stable across reboots, replayable for support. Fallback: a hash
# of the hostname so even a zero-ID dev install gets spread.
#
# Arguments: $1 - optional path to dvs.conf
#                 (default: /opt/dividia/data/config/dvs.conf)
# Outputs:   the jitter minute on stdout.
# Returns:   0. Safe to invoke from cron-install paths.
update_cron_jitter() {
    local dvs_conf="${1:-/opt/dividia/data/config/dvs.conf}"
    local id=""
    if [[ -f "$dvs_conf" ]]; then
        # First `ID=` line; strip optional double quotes and whitespace.
        id=$(grep -E '^ID=' "$dvs_conf" 2>/dev/null \
             | head -1 \
             | sed -E 's/^ID="?([^"]*)"?.*$/\1/' \
             | tr -d '[:space:]')
    fi
    # Force base 10 with `10#`: bash arithmetic treats a leading zero as
    # octal, so an ID such as "0089" would raise "value too great for base"
    # and "010" would silently evaluate to 8.
    if [[ "$id" =~ ^[0-9]+$ ]] && (( 10#$id > 0 )); then
        echo $(( 10#$id % 60 ))
        return 0
    fi
    # Fallback: hostname hash. cksum is on every distro back to
    # CO6; awk does the modulo so the value lands in [0,59].
    local h
    h=$(hostname 2>/dev/null | cksum 2>/dev/null | awk '{print $1 % 60}')
    if [[ -n "$h" ]]; then
        echo "$h"
    else
        echo 0
    fi
}

# Install or refresh /etc/cron.d/dividia-nvr-update, the daily
# `nvr update` job scheduled at 02:<jitter>. Idempotent: the file is
# rewritten only when its content differs from the body below.
#
# Arguments: none.
# Returns:   0 when the file is already current, was written, or when the
#            caller is neither root nor able to use passwordless sudo;
#            non-zero when the write or chmod fails.
ensure_update_cron() {
    local cron_path="/etc/cron.d/dividia-nvr-update"
    local jitter
    jitter=$(update_cron_jitter)
    # Note: $jitter interpolates because this heredoc is NOT quoted.
    # Everything else is a literal comment or PATH/SHELL line.
    local cron_body
    cron_body=$(cat <<EOF
# Daily nvr update for Dividia NVR.
#
# Replaces watchtower auto-pull (containrrr/watchtower 1.7.1, unmaintained
# since 2024-01 and known to strip com.docker.compose.project labels
# during recreate — see [[pitfall-watchtower-strips-compose-labels]]).
#
# nvr update is the right loop: self-heals orphaned compose project
# labels, pulls images, extracts updated compose + CLI from the new
# backend image, calls compose up -d, prunes obsolete images, then
# removes the watchtower container at the tail of a successful run.
#
# Schedule: 02:NN host-local, where NN = bSerial % 60 from dvs.conf.
# Spreads the fleet across the 02:00-02:59 hour so DockerHub doesn't
# get a thundering herd. Concurrent operator-typed and cron-fired
# invocations serialize via flock inside cmd_update.
#
# Owned by /usr/share/nvr/bin/nvr ensure_update_cron; do not edit by hand.
SHELL=/bin/sh
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
$jitter 2 * * * root /opt/dividia/nvr update >/var/log/dividia-nvr-update.log 2>&1
EOF
)
    # The comparison feeds the expected body to cmp on stdin ("-").
    # Process substitution (<(...)) must not be used with sudo: sudo closes
    # every descriptor above stderr before running the command, so the
    # /dev/fd/N path would be unreadable, cmp would always report a
    # difference, and the file would be rewritten on every run.
    if [[ $EUID -ne 0 ]]; then
        # Not root (containers, dev mode): manage the cron only when
        # passwordless sudo is available; otherwise skip silently.
        if ! sudo -n true 2>/dev/null; then return 0; fi
        if [[ -f "$cron_path" ]] && printf '%s\n' "$cron_body" | sudo cmp -s - "$cron_path"; then return 0; fi
        printf '%s\n' "$cron_body" | sudo tee "$cron_path" >/dev/null && \
            sudo chmod 0644 "$cron_path"
    else
        if [[ -f "$cron_path" ]] && printf '%s\n' "$cron_body" | cmp -s - "$cron_path"; then return 0; fi
        printf '%s\n' "$cron_body" > "$cron_path" && chmod 0644 "$cron_path"
    fi
}

# Retire the watchtower container as the final step of the handoff from
# watchtower to cron-driven updates. cmd_update calls this only after
# `$COMPOSE up -d --quiet-pull --remove-orphans` succeeded, using an
# explicit `compose_up_ok` flag rather than relying on the `set -e` chain
# reaching the call: cs2427 (2026-05-28) showed the cron being installed
# while this function never ran because set -e aborted in between. See
# operational_nvr_update_first_migration_skip_watchtower_drop.md.
#
# Both steps are always attempted and never fail the caller. That covers
# a fresh install (no watchtower yet), an already-removed container
# (every later update), and partial states (e.g. an earlier stop worked
# but `docker rm` raced with a manual cleanup). The `|| true` is
# load-bearing: under set -e a "no such container" error would otherwise
# abort cmd_update on every steady-state run.
cmd_update_disable_watchtower() {
    local step
    for step in stop rm; do
        docker "$step" dividia-nvr-watchtower-1 2>/dev/null || true
    done
}

# Remove unused Docker images and report how much space was reclaimed.
#
# Arguments: $1 - optional `--quiet`: suppress the progress line, the
#                 failure message, and the summary when nothing was freed.
# Outputs:   one "Total reclaimed space: <n><unit>" line on stdout.
# Returns:   always 0 — a prune failure is reported, never propagated.
cmd_prune() {
    local quiet=0
    if [[ "${1:-}" == "--quiet" ]]; then quiet=1; fi

    # Two-pass prune. Dangling-only pass runs first with no time filter:
    # those <none>:<none> images are by definition replaced/orphaned and
    # immediately safe to remove. The 168h pass catches still-tagged
    # images that haven't been used in a week. Without the dangling pass,
    # active dev iteration (e.g. a feature channel rebuilt several times
    # in one day) accumulates GB of dangling images that the time filter
    # won't release until a week later — surfaced when cs256 hit /opt full
    # mid-pilot.
    if [[ $quiet -eq 0 ]]; then echo "Pruning unused images..."; fi
    local out
    out=$(docker image prune -f 2>&1; docker image prune -a -f --filter "until=168h" 2>&1) || {
        if [[ $quiet -eq 0 ]]; then echo -e "${RED}prune failed:${NC} $out"; fi
        return 0
    }

    # Sum the two "Total reclaimed space:" lines into a single byte count.
    # docker prints sizes like "4.154GB" or "523.1MB" using decimal
    # (base-1000) units, so the multipliers are powers of 1000, not 1024.
    # One awk pass handles every line; %.0f avoids the 32-bit cap some awk
    # builds apply to %d.
    local total_bytes
    total_bytes=$(printf '%s\n' "$out" | awk '
        /^Total reclaimed space/ {
            s = $0
            sub(/^Total reclaimed space:[ \t]*/, "", s)
            num = s
            sub(/[ \t]*[A-Za-z]+[ \t]*$/, "", num)
            unit = s
            sub(/[ \t]+$/, "", unit)
            sub(/^.*[^A-Za-z]/, "", unit)
            mult = 1
            if (unit == "kB" || unit == "KB") mult = 1e3
            else if (unit == "MB") mult = 1e6
            else if (unit == "GB") mult = 1e9
            else if (unit == "TB") mult = 1e12
            total += (num + 0) * mult
        }
        END { printf "%.0f", total }
    ')
    total_bytes=${total_bytes:-0}

    # Format the total back into the same style docker uses.
    local reclaimed
    reclaimed=$(awk -v b="$total_bytes" 'BEGIN {
        if (b >= 1e9)      printf "Total reclaimed space: %.2fGB", b / 1e9
        else if (b >= 1e6) printf "Total reclaimed space: %.1fMB", b / 1e6
        else if (b >= 1e3) printf "Total reclaimed space: %.1fkB", b / 1e3
        else               printf "Total reclaimed space: %dB", b
    }')

    # In quiet mode only print when something was actually freed, to keep
    # `nvr update` output tight. An explicit `if` (instead of a trailing
    # `[[ ]] && echo`) keeps the function status at 0 when nothing prints.
    if [[ $quiet -eq 0 || $total_bytes -gt 0 ]]; then
        echo "$reclaimed"
    fi
    return 0
}

# Run a database backup inside the backend container and stash a copy of
# the host's .env next to the backups.
#
# Arguments: none.
# Outputs:   progress and result messages on stdout.
# Exits:     1 when the backend service is not listed by `compose ps` or
#            when `rda-db --backup` fails. Failing to locate the backup
#            directory or to save .env only affects the messages printed.
cmd_backup() {
    if ! $COMPOSE ps --format '{{.Service}}' 2>/dev/null | grep -q backend; then
        echo -e "${RED}ERROR: NVR backend container is not running${NC}"
        exit 1
    fi

    echo "Starting NVR backup..."
    # Capture the status with `|| result=$?`: under the script-wide set -e
    # a plain `cmd; result=$?` never reaches the assignment on failure, so
    # the error message below would be dead code.
    local result=0
    $COMPOSE exec backend rda-db --backup || result=$?

    if [[ $result -ne 0 ]]; then
        echo -e "${RED}ERROR: Backup failed (exit code: $result)${NC}"
        exit 1
    fi

    # Newest backup directory, for display only. A lookup failure must not
    # abort the run after the backup itself already succeeded.
    local backup_dir=""
    backup_dir=$($COMPOSE exec -T backend sh -c 'ls -dt /videostore/vs1/backups/[0-9]* 2>/dev/null | head -1') || backup_dir=""
    if [[ -n "$backup_dir" ]]; then
        echo "Backup saved to: $backup_dir"
    fi

    # Save .env to VideoStore via backend container's /videostore bind mount
    if [[ -f .env ]]; then
        if $COMPOSE cp .env backend:/videostore/vs1/backups/.env.save 2>/dev/null; then
            echo "Saved .env to VideoStore"
        else
            echo -e "${YELLOW}WARN: Could not save .env to VideoStore${NC}"
        fi
    fi

    echo "Backup complete!"
}

# Show or change the release channel stored as CHANNEL= in ./.env.
#
# Arguments: $1 - optional new channel: "dev", "dev-<suffix>", or a
#                 major.minor version such as 6.2. Without it the current
#                 channel and a usage hint are printed.
# Exits:     1 on an invalid channel name or when .env does not exist.
cmd_channel() {
    if [[ -z "${1:-}" ]]; then
        echo "Current channel: $(grep '^CHANNEL=' .env 2>/dev/null | cut -d= -f2)"
        echo ""
        echo "Usage: nvr channel <version|dev|dev-<suffix>>  (e.g., nvr channel 6.2)"
        return
    fi

    # Channel: "dev", "dev-<suffix>" (per-branch test channel), or major.minor (e.g., 6.2, 7.0).
    # The strict pattern also guarantees the value is safe to splice into
    # the sed expression below.
    if [[ ! "$1" =~ ^(dev(-[a-z0-9-]+)?|[0-9]+\.[0-9]+)$ ]]; then
        echo -e "${RED}Invalid channel: $1 (must be 'dev', 'dev-<suffix>', or a version like 6.2)${NC}"; exit 1
    fi

    if [[ ! -f .env ]]; then
        echo -e "${RED}ERROR: .env not found in $PWD${NC}" >&2
        exit 1
    fi

    if grep -q '^CHANNEL=' .env; then
        sed -i "s/^CHANNEL=.*/CHANNEL=$1/" .env
    else
        # No CHANNEL line yet: sed would change nothing while we still
        # reported success. Append the setting instead, adding a newline
        # first when the file does not already end with one.
        if [[ -s .env && -n "$(tail -c1 .env)" ]]; then
            echo "" >> .env
        fi
        printf 'CHANNEL=%s\n' "$1" >> .env
    fi
    echo "Channel switched to: $1"
    echo "Run 'nvr update' to pull images from the new channel."
}

# Open a mariadb client as root against the `dtech` database in the db
# container. The password is everything after the first '=' of the
# MYSQL_ROOT_PASSWORD line in .env; a built-in default applies when that
# yields nothing. Extra arguments go to mariadb ahead of the database name.
cmd_db() {
    local root_pw
    root_pw=$(sed -n 's/^MYSQL_ROOT_PASSWORD=//p' .env 2>/dev/null) || true
    if [[ -z "$root_pw" ]]; then
        root_pw="lynn1094"
    fi
    $COMPOSE exec db mariadb -u root "-p${root_pw}" "$@" dtech
}

# Start the stack in the background. Containers that compose has lost
# track of are removed first so `up -d` cannot collide with their names.
cmd_start() {
    repair_untracked_compose_containers
    local -a compose_cmd=($COMPOSE)
    "${compose_cmd[@]}" up -d
}

# Force-remove containers that carry this project's name prefix but whose
# com.docker.compose.project label is absent or names another project.
# Compose treats such containers as strangers, so the next `compose up -d`
# trips over the name ("Conflict. The container name
# '/dividia-nvr-<svc>-1' is already in use"), aborts mid-recreate, and
# leaves the stack half-broken until someone runs `docker rm -f` by hand.
#
# Known ways a container gets into this state:
#   - Older watchtower releases recreate containers through the Docker API
#     without carrying over compose's labels: right name and image, but no
#     `com.docker.compose.project`, hence invisible to `compose ps` /
#     `compose up`. cs2585 hit this 2026-05, four days after a watchtower
#     session had updated engine + connector.
#   - A manual `docker run --name dividia-nvr-...` (rare).
#   - A half-created container left behind by an earlier failed up.
#
# Removing them is safe: by convention everything under the project name
# prefix belongs to this compose project, and the next `compose up -d`
# recreates the container with the full label set.
#
# Outputs: a summary plus one line per container on stderr.
# Returns: 0, including when an individual removal fails (warned only).
repair_untracked_compose_containers() {
    local project="dividia-nvr"

    # "<name>|<project label>" per container; keep names whose label is
    # not this project.
    local stale_list
    stale_list=$(docker ps -a \
        --filter "name=^${project}-" \
        --format '{{.Names}}|{{.Label "com.docker.compose.project"}}' \
        2>/dev/null \
        | awk -F'|' -v p="$project" 'NF>=2 && $2 != p { print $1 }')
    [[ -n "$stale_list" ]] || return 0

    local -a stale=()
    local cname
    while IFS= read -r cname; do
        stale+=("$cname")
    done <<< "$stale_list"

    echo -e "${YELLOW}Detected ${#stale[@]} container(s) with missing/stale compose labels; removing so compose can recreate cleanly:${NC}" >&2
    for cname in "${stale[@]}"; do
        if [[ -z "$cname" ]]; then
            continue
        fi
        echo "  - $cname" >&2
        if ! docker rm -f "$cname" >/dev/null 2>&1; then
            echo -e "    ${RED}WARN: failed to remove $cname${NC}" >&2
        fi
    done
}

# Take the stack down via `compose down`.
cmd_stop() {
    local -a compose_cmd=($COMPOSE)
    "${compose_cmd[@]}" down
}

# Restart services via `compose restart`; arguments (typically service
# names) are forwarded unchanged.
cmd_restart() {
    local -a targets=("$@")
    $COMPOSE restart "${targets[@]}"
}

# Print one line per compose service showing the version, revision and
# channel labels of its current container, or "(not running)" when
# `compose ps -q` returns no container id for the service.
cmd_version() {
    echo -e "${BLUE}=== NVR Image Versions ===${NC}"

    local service_names
    service_names=$($COMPOSE config --services 2>/dev/null) || true

    local svc cid label
    local -a fields
    for svc in $service_names; do
        cid=$($COMPOSE ps -q "$svc" 2>/dev/null)
        if [[ -z "$cid" ]]; then
            printf "  %-12s (not running)\n" "$svc"
            continue
        fi
        # Read the three labels in display order; "?" is appended when
        # the inspect call fails.
        fields=()
        for label in org.opencontainers.image.version org.opencontainers.image.revision channel; do
            fields+=("$(docker inspect --format "{{index .Config.Labels \"$label\"}}" "$cid" 2>/dev/null || echo "?")")
        done
        printf "  %-12s version=%-8s commit=%-10s channel=%s\n" "$svc" "${fields[@]}"
    done
}

# Open an interactive bash inside a service container.
# Arguments: $1 - service name; `backend` when omitted or empty.
cmd_shell() {
    $COMPOSE exec "${1:-backend}" bash
}

# Replace this process with a login shell for the `dividia` user.
# Takes no arguments and does not return: `exec` hands the process over
# to sudo.
cmd_vm_shell() {
    # Called by the Windows nvr.cmd `nvr shell` branch. The Windows-side
    # admin-key passphrase prompt is what authenticated the tech; by the
    # time we reach this function, SSH has already accepted the admin
    # key and `sudo` has run without prompt. We drop back to the ssh
    # login user with `sudo -u dividia -i` — root would give an unexpected
    # shell with the wrong home/prompt.
    #
    # Hardcoded to `dividia` (not `${SUDO_USER}`) because cloud-init only
    # provisions the `dividia` account. Trusting SUDO_USER would silently
    # drop into whichever account sudo happened to be invoked from — if
    # ops later adds another sudoer (e.g. a `deployer` service account),
    # `nvr shell` would become identity-laundering. Fail loud instead.
    exec sudo -u dividia -i
}

# Import a legacy 2014 Windows Scale Watcher backup zip into this NVR.
#
# Arguments:
#   $1   path to the zip file, either:
#          - inside the backend container's /videostore mount (customer
#            placed the zip on the Windows-host SMB share at
#            C:\Dividia\VideoStore\migrate-staging\ — shows up as
#            /videostore/vs1/migrate-staging/ inside the container), OR
#          - a /mnt/videostore/... path, rewritten to /videostore/vs1/..., OR
#          - any other host path — copied into the container first.
#   $2+  optional --dry-run flag.
# Exits: 1 on a usage error, a rejected path, a stopped backend, a failed
#        copy, or a failed import. The in-container staging directory is
#        removed on every path that created it.
cmd_migrate_scalewatcher() {
    local zip_path="$1"
    local dry_flag=""
    local arg
    shift || true
    for arg in "$@"; do
        case "$arg" in
            --dry-run) dry_flag="--dry-run" ;;
            *)
                echo -e "${RED}ERROR: unknown flag: $arg${NC}" >&2
                exit 1
                ;;
        esac
    done

    if [[ -z "$zip_path" ]]; then
        echo -e "${RED}ERROR: usage: nvr migrate-scalewatcher <path-to-zip> [--dry-run]${NC}" >&2
        exit 1
    fi

    # Defense in depth against shell injection via zip_path: reject the
    # common shell metacharacters (backtick, $, ;, quotes, &, |, <, >).
    # This is a deny-list, not an allow-list. The value is never
    # shell-interpolated into the bash -c script below, but refusing these
    # characters avoids pushing the problem to downstream tools that may
    # be less careful.
    if [[ "$zip_path" =~ [\`\$\;\"\'\&\|\<\>] ]]; then
        echo -e "${RED}ERROR: zip path contains shell metacharacters${NC}" >&2
        exit 1
    fi
    # Defense against path-traversal: reject any ../ component. The
    # metacharacter guard above allows dots, so '/videostore/../etc/passwd'
    # would otherwise match the /videostore/* branch below.
    # realpath-based canonicalization would be stronger but realpath(1) is
    # inconsistent across BSD/GNU; the explicit ..-rejection is portable
    # and deliberately conservative (it also refuses names that merely
    # start or end with "..").
    if [[ "$zip_path" == *'/..'* || "$zip_path" == *'..'/* || "$zip_path" == '..' || "$zip_path" == *'/../'* ]]; then
        echo -e "${RED}ERROR: zip path contains .. traversal components${NC}" >&2
        exit 1
    fi

    if ! $COMPOSE ps --format '{{.Service}}' 2>/dev/null | grep -q backend; then
        echo -e "${RED}ERROR: backend container is not running${NC}" >&2
        exit 1
    fi

    # Stage inside the backend container so cleanup is bounded to one mount.
    local stage_dir="/tmp/migrate-scalewatcher-$$"
    $COMPOSE exec -T backend mkdir -p "$stage_dir"

    # Resolve the in-container path for the zip.  Inside the backend
    # container the SMB-mapped VideoStore share is mounted at /videostore/
    # vs1/ (the host-side path /mnt/videostore/ documented in the plan
    # does NOT exist inside the container).  If the caller handed us a
    # /videostore path, it's already container-local.  Everything else
    # goes through docker cp.
    local container_zip
    if [[ "$zip_path" == /videostore/* ]]; then
        container_zip="$zip_path"
    elif [[ "$zip_path" == /mnt/videostore/* ]]; then
        # Rewrite the documented host alias to the real container path
        container_zip="/videostore/vs1/${zip_path#/mnt/videostore/}"
    else
        # Arbitrary host path: docker cp into the stage dir
        container_zip="$stage_dir/scalewatcher-backup.zip"
        $COMPOSE cp "$zip_path" "backend:$container_zip" || {
            # The stage dir already exists in the container; remove it
            # before bailing out so a failed copy leaves nothing in /tmp.
            $COMPOSE exec -T backend rm -rf "$stage_dir" 2>/dev/null || true
            echo -e "${RED}ERROR: cannot copy $zip_path into backend container${NC}" >&2
            exit 1
        }
    fi

    # Pass the paths and flag via environment (-e), not string
    # interpolation: the single-quoted script below only ever sees them as
    # quoted variable expansions, which prevents shell injection even if
    # the earlier guards are bypassed.
    # Use `|| result=$?` so `set -e` on the outer script doesn't abort
    # before we can capture the exit status and clean up stage_dir.  The
    # naive `cmd; local result=$?` idiom is dead code under set -e: if cmd
    # fails, the script exits immediately and the cleanup + pretty-error
    # block never run, leaving /tmp/migrate-scalewatcher-<pid> behind on
    # the backend container.
    local result=0
    $COMPOSE exec -T \
        -e MIGRATE_STAGE="$stage_dir" \
        -e MIGRATE_ZIP="$container_zip" \
        -e MIGRATE_DRY_FLAG="$dry_flag" \
        backend bash -c '
            set -e
            cd "$MIGRATE_STAGE"
            unzip -o "$MIGRATE_ZIP" -d unpacked/
            # The zip normally unpacks to a single top-level directory
            # holding manifest.json + dtech.sql.  Accept that shape, or a
            # flat archive with no top-level directory; reject several
            # top-level directories.  A manifest is required either way.
            mapfile -t export_dirs < <(find unpacked -maxdepth 1 -mindepth 1 -type d)
            if [[ ${#export_dirs[@]} -eq 0 ]]; then
                export_dir="unpacked"
            elif [[ ${#export_dirs[@]} -eq 1 ]]; then
                export_dir="${export_dirs[0]}"
            else
                echo "ERROR: zip contains multiple top-level directories" >&2
                exit 1
            fi
            if [[ ! -f "$export_dir/manifest.json" ]]; then
                echo "ERROR: manifest.json not found under $export_dir" >&2
                exit 1
            fi
            # Only pass the flag when it is set, and quote it, so an empty
            # value never becomes a stray empty argument.
            if [[ -n "$MIGRATE_DRY_FLAG" ]]; then
                rda-db --migrate-scalewatcher "$export_dir" "$MIGRATE_DRY_FLAG"
            else
                rda-db --migrate-scalewatcher "$export_dir"
            fi
        ' || result=$?

    # Cleanup — always, even on failure, to avoid /tmp buildup.
    $COMPOSE exec -T backend rm -rf "$stage_dir" 2>/dev/null || true

    if [[ $result -ne 0 ]]; then
        echo -e "${RED}ERROR: migrate-scalewatcher failed (exit $result)${NC}" >&2
        exit 1
    fi

    if [[ -z "$dry_flag" ]]; then
        echo -e "${GREEN}Migration complete. Restarting services so new Camera/Device/POS config takes effect...${NC}"
        # Full restart: engine/playback/viewer all cache dvs.conf + DB
        # rows at startup and won't see imported data otherwise.  Skip db
        # (kept up).
        $COMPOSE restart backend engine connector playback viewer
    fi
}

cmd_find() {
    # Discover hosts on the local network using arp-scan in the backend
    # container.  Default mode auto-detects physical IPv4 NICs (eth*, en*,
    # bond*, br0) and runs arp-scan --localnet against each.  --interface
    # and --cidr override.  Vendor names come from the IEEE OUI database
    # bundled with the arp-scan package; duplicates are NOT deduplicated
    # so IP conflicts are visible.
    #
    # Globals:   COMPOSE (read), RED / NC (read)
    # Arguments: [-i|--interface IFACE] [-c|--cidr CIDR] [-h|--help]
    # Outputs:   scan results on stdout; diagnostics on stderr
    # Returns:   0 after --help; 1 on a bad or unknown option; exits 1 when
    #            the backend service is not listed by `compose ps`;
    #            otherwise the status of the `compose exec` that ran.
    local iface=""
    local cidr=""

    while [[ $# -gt 0 ]]; do
        case "$1" in
            -i|--interface)
                # A missing value or one that looks like another flag is
                # rejected here rather than being swallowed as the value.
                if [[ -z "${2:-}" || "${2:0:1}" == "-" ]]; then
                    echo -e "${RED}ERROR: --interface requires a value${NC}" >&2
                    return 1
                fi
                iface="$2"; shift 2 ;;
            -c|--cidr)
                if [[ -z "${2:-}" || "${2:0:1}" == "-" ]]; then
                    echo -e "${RED}ERROR: --cidr requires a value${NC}" >&2
                    return 1
                fi
                cidr="$2"; shift 2 ;;
            -h|--help)
                cat <<'FINDHELP'
Usage: nvr find [options]

Discover hosts on the local network using arp-scan. Shows IP, MAC,
and vendor (looked up from the IEEE OUI database). Duplicates are
NOT deduplicated -- that's how you spot IP conflicts.

Options:
  -i, --interface IFACE   Scan only the named interface
  -c, --cidr CIDR         Scan a specific subnet (e.g. 192.168.0.0/24)
  -h, --help              Show this help

With no options, scans every IPv4-bearing physical NIC (eth*, en*,
bond*, br0). Docker bridges, veth, VPN tunnels are skipped.

Examples:
  nvr find                       # all physical NICs
  nvr find -i eth0               # eth0 only
  nvr find -c 10.0.0.0/24        # specific subnet on default iface
  nvr find -i eth1 -c 10.0.0.0/24
FINDHELP
                return 0 ;;
            *)
                # $1 is raw user input: print it with %s so backslash
                # sequences in it are shown literally, not interpreted.
                printf '%b%s%b\n' "$RED" "ERROR: unknown option: $1" "$NC" >&2
                cmd_find --help >&2
                return 1 ;;
        esac
    done

    # Validate option values.  docker compose exec passes argv directly,
    # so shell-injection isn't possible -- this is defense in depth and
    # gives the user a clearer error than arp-scan's own complaint.
    # Iface regex anchors the first char to alnum so a value like `-rf`
    # can't survive validation and end up parsed as a flag by arp-scan.
    if [[ -n "$iface" && ! "$iface" =~ ^[a-zA-Z0-9][a-zA-Z0-9._-]*$ ]]; then
        echo -e "${RED}ERROR: interface name contains illegal characters${NC}" >&2
        return 1
    fi
    if [[ -n "$cidr" ]]; then
        if [[ ! "$cidr" =~ ^[0-9]{1,3}(\.[0-9]{1,3}){3}(/[0-9]{1,2})?$ ]]; then
            echo -e "${RED}ERROR: --cidr must be a dotted-quad with optional /mask${NC}" >&2
            return 1
        fi
        # Bounds-check octets and prefix.  Loose regex would accept
        # 999.999.999.999/99 and let arp-scan emit a less actionable
        # error 100ms later.
        local cidr_addr="${cidr%%/*}"
        local cidr_pfx=""
        if [[ "$cidr" == */* ]]; then
            cidr_pfx="${cidr#*/}"
        fi
        # IFS is overridden for this one `read` only, so the function's
        # (and the script's) word-splitting rules are never disturbed.
        local -a octets
        local octet
        IFS=. read -r -a octets <<< "$cidr_addr"
        for octet in "${octets[@]}"; do
            # 10# forces decimal.  Without it a leading zero makes bash
            # parse the value as octal, and `08` / `09` abort the
            # comparison with "value too great for base".
            if (( 10#$octet > 255 )); then
                echo -e "${RED}ERROR: CIDR octet $octet > 255${NC}" >&2
                return 1
            fi
        done
        if [[ -n "$cidr_pfx" ]] && (( 10#$cidr_pfx > 32 )); then
            echo -e "${RED}ERROR: CIDR prefix /$cidr_pfx > 32${NC}" >&2
            return 1
        fi
    fi

    # `grep -qx backend` (exact-line match) so a future service named
    # `backend-foo` doesn't accidentally satisfy the gate.
    if ! $COMPOSE ps --format '{{.Service}}' 2>/dev/null | grep -qx backend; then
        echo -e "${RED}ERROR: backend container is not running${NC}" >&2
        exit 1
    fi

    if [[ -n "$iface" || -n "$cidr" ]]; then
        # Explicit target: build a single argv.  An explicit CIDR replaces
        # --localnet; the value is passed to arp-scan exactly as typed.
        local -a scan_args=()
        if [[ -n "$iface" ]]; then
            scan_args+=(-I "$iface")
        fi
        if [[ -n "$cidr" ]]; then
            scan_args+=(--plain "$cidr")
        else
            scan_args+=(--localnet --plain)
        fi
        $COMPOSE exec -T backend arp-scan "${scan_args[@]}"
    else
        # Default: physical NICs only.  Filter is name-prefix + IPv4-
        # bearing.  Track per-iface success so an all-failed run exits
        # 1 (e.g. NET_RAW missing) -- partial success still exits 0.
        # No user input reaches the bash -c body; shell-quoting is fine.
        $COMPOSE exec -T backend bash -c '
            set -u
            mapfile -t candidates < <(
                ip -4 -o addr show \
                    | awk "{print \$2}" \
                    | sort -u \
                    | grep -E "^(eth|en|bond|br0)" \
                    | grep -vE "^(docker|veth|br-|tun|wg|virbr)"
            )
            if [[ ${#candidates[@]} -eq 0 ]]; then
                echo "ERROR: no physical IPv4 interfaces found (looking for eth*, en*, bond*, br0)" >&2
                exit 1
            fi
            ok=0
            first=1
            for i in "${candidates[@]}"; do
                [[ $first -eq 1 ]] || echo
                echo "=== $i ==="
                if arp-scan -I "$i" --localnet --plain; then
                    ok=$((ok+1))
                fi
                first=0
            done
            if [[ $ok -eq 0 ]]; then
                echo "ERROR: every interface scan failed (NET_RAW missing? arp-scan absent?)" >&2
                exit 1
            fi
            exit 0
        '
    fi
}

cmd_help() {
    # Print the top-level usage text to stdout.
    #
    # The heredoc delimiter MUST stay quoted.  A quoted delimiter makes the
    # body fully literal: no variable expansion and no command
    # substitution.  The body mentions `nvr update` inside backticks; with
    # an unquoted delimiter bash would treat that as a command
    # substitution and actually run `nvr update` (docker compose pull/up)
    # each time help is rendered.  This was observed with
    # `bash -x /opt/dividia/nvr help`, which showed `++ nvr update` in the
    # middle of the cat.  Keep the quotes.
    cat <<'NVR_HELP_TEXT'
NVR Docker Management CLI

Usage: nvr <command> [args...]

Commands:
  status              Show service status and configuration
  logs [svc] [-f]     View logs (pass-through to docker compose logs)
  update              Pull latest images, extract files, restart
  backup              Backup NVR to VideoStore
  channel [name]      Show or set update channel (setter gated on Windows
                      via admin-key passphrase; open on Linux)
  db [args...]        Connect to MariaDB (dtech database)
  start               Start all services
  stop                Stop all services
  restart [svc]       Restart all or specific service
  version             Show image version labels
  shell [svc]         Open a shell in a container
  find [options]      Scan local network for hosts (vendor + dup detection)
  prune               Reclaim disk from unused images >7 days old
                      (also runs automatically at the end of `nvr update`)
  migrate-scalewatcher <zip> [--dry-run]
                      Import legacy 2014 Scale Watcher export (see runbook)
  help                Show this help message

Examples:
  nvr status
  nvr logs backend -f --tail=50
  nvr update
  nvr backup
  nvr channel dev
  nvr channel dev-smartrec       # per-branch test channel
  nvr db -e "SELECT COUNT(*) FROM Camera"
  nvr restart engine
  nvr shell backend
  nvr find
  nvr find -i eth0
  nvr find -c 192.168.0.0/24
  nvr prune                                          # reclaim disk now
  nvr migrate-scalewatcher /videostore/vs1/migrate-staging/scalewatcher-backup.zip --dry-run

NVR_HELP_TEXT
}

################################################################################
# Main
################################################################################

# Resolve the subcommand once (no argument, or an empty one, means "help"),
# drop it from the positional parameters, and pass whatever remains to the
# matching handler.
nvr_subcommand="${1:-help}"
if [[ $# -gt 0 ]]; then
    shift
fi

case "$nvr_subcommand" in
    status)
        cmd_status "$@" ;;
    logs)
        cmd_logs "$@" ;;
    update)
        cmd_update "$@" ;;
    backup)
        cmd_backup "$@" ;;
    channel)
        cmd_channel "$@" ;;
    db)
        cmd_db "$@" ;;
    start)
        cmd_start "$@" ;;
    stop)
        cmd_stop "$@" ;;
    restart)
        cmd_restart "$@" ;;
    version)
        cmd_version "$@" ;;
    shell)
        cmd_shell "$@" ;;
    vm-shell)
        cmd_vm_shell "$@" ;;
    find)
        cmd_find "$@" ;;
    prune)
        cmd_prune "$@" ;;
    migrate-scalewatcher)
        cmd_migrate_scalewatcher "$@" ;;
    help|--help|-h)
        # Help takes no arguments; anything after it is ignored.
        cmd_help ;;
    *)
        echo "Unknown command: $nvr_subcommand"
        echo ""
        cmd_help
        exit 1 ;;
esac
