huskies: merge 1149 story huskies health chat command — surface gateway, sled, matrix, creds, and build-hash status

2026-05-19 20:07:03 +00:00
parent 5d0801854c
commit 9a286315a3
6 changed files with 732 additions and 1 deletions
@@ -269,6 +269,11 @@ pub fn commands() -> &'static [BotCommand] {
            description: "List orphaned worktrees (dry run), or `cleanup_worktrees --confirm` to remove them",
            handler: handle_cleanup_worktrees_fallback,
        },
+        BotCommand {
+            name: "health",
+            description: "Show subsystem health: gateway, sled, matrix-sync, creds, and build-hash",
+            handler: handle_health_fallback,
+        },
        BotCommand {
            name: "new",
            description: "Bootstrap a new project container (gateway only): `new project <name>`",
@@ -446,6 +451,16 @@ fn handle_project_rebuild_fallback(_ctx: &CommandContext) -> Option<String> {
    None
 }

+/// Registry placeholder for the `health` command.
+///
+/// `on_room_message` detects `health` and answers it asynchronously, returning
+/// before the registry-based command dispatch runs, so in practice this handler
+/// is never called.  The entry exists in the registry only so `help` lists it.
+///
+/// Always returns `None`.  The original author's stated intent is to keep
+/// "health" from reaching the LLM as a prompt.
+/// NOTE(review): how the dispatcher treats a `None` result is defined outside
+/// this view — confirm it does not fall through to the LLM.
+fn handle_health_fallback(_ctx: &CommandContext) -> Option<String> {
+    None
+}
+
 // ---------------------------------------------------------------------------
 // Tests
 // ---------------------------------------------------------------------------
@@ -6,6 +6,7 @@ use crate::services::Services;
 use matrix_sdk::ruma::{OwnedEventId, OwnedRoomId, OwnedUserId};
 use std::collections::{BTreeMap, HashSet, VecDeque};
 use std::sync::Arc;
+use std::sync::atomic::AtomicI64;
 use tokio::sync::Mutex as TokioMutex;
 use tokio::sync::RwLock;

@@ -104,6 +105,10 @@ pub struct BotContext {
    /// Used by the "rebuild gateway" command to construct the health-check URL
    /// passed to the trampoline.  `None` in standalone single-project mode.
    pub gateway_port: Option<u16>,
+    /// Timestamp (ms since Unix epoch) of the last Matrix event received in any
+    /// configured room.  Updated atomically on every `on_room_message` call so
+    /// the `health` command can detect a stale or dead sync loop.
+    pub last_matrix_event_ms: Arc<AtomicI64>,
 }

 impl BotContext {
@@ -299,6 +304,7 @@ mod tests {
                SEEN_EVENT_IDS_CAP,
            ))),
            gateway_port: None,
+            last_matrix_event_ms: Arc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())),
        }
    }

@@ -19,6 +19,20 @@ use super::super::verification::check_sender_verified;

 use super::handle_message;

+/// Report whether `message` is the `health` command addressed to the bot.
+///
+/// The bot mention is stripped first, followed by surrounding whitespace and any
+/// leading non-alphanumeric characters.  The first whitespace-delimited word is
+/// then compared with `health`, ignoring ASCII case; anything after that word is
+/// silently ignored.
+fn extract_health_command(message: &str, bot_name: &str, bot_user_id: &str) -> bool {
+    let without_mention = crate::chat::util::strip_bot_mention(message, bot_name, bot_user_id);
+    without_mention
+        .trim()
+        .trim_start_matches(|c: char| !c.is_alphanumeric())
+        .split_whitespace()
+        .next()
+        .is_some_and(|word| word.eq_ignore_ascii_case("health"))
+}
+
 /// Return `true` when the message is a "rebuild gateway" command addressed to the bot.
 ///
 /// The command is recognised case-insensitively as `rebuild gateway` after stripping
@@ -100,6 +114,12 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
        return;
    }

+    // Update last-event timestamp so the `health` command can detect a stale sync loop.
+    ctx.last_matrix_event_ms.store(
+        chrono::Utc::now().timestamp_millis(),
+        std::sync::atomic::Ordering::Relaxed,
+    );
+
    // Ignore the bot's own messages to prevent echo loops.
    if ev.sender == ctx.matrix_user_id {
        return;
@@ -249,6 +269,7 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
            "config",
            "project-rebuild",
            "upgrade",
+            "health",
        ];

        let stripped = crate::chat::util::strip_bot_mention(
@@ -546,6 +567,26 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
        return;
    }

+    // `health` — async subsystem health report (gateway + standalone).
+    if extract_health_command(
+        &user_message,
+        &ctx.services.bot_name,
+        ctx.matrix_user_id.as_str(),
+    ) {
+        slog!("[matrix-bot] Handling 'health' from {sender}");
+        let response = super::super::super::health::run_health_check(&ctx).await;
+        let html = markdown_to_html(&response);
+        if let Ok(msg_id) = ctx
+            .transport
+            .send_message(&room_id_str, &response, &html)
+            .await
+            && let Ok(event_id) = msg_id.parse()
+        {
+            ctx.bot_sent_event_ids.lock().await.insert(event_id);
+        }
+        return;
+    }
+
    // Check for bot-level commands (help, status, ambient, …) before invoking
    // the LLM.  All commands are registered in commands.rs — no special-casing
    // needed here.
@@ -6,7 +6,7 @@ use matrix_sdk::ruma::OwnedRoomId;
 use matrix_sdk::{Client, LoopCtrl, config::SyncSettings};
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
-use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
+use std::sync::atomic::{AtomicBool, AtomicI64, AtomicU64, Ordering};
 use tokio::sync::Mutex as TokioMutex;
 use tokio::sync::{RwLock, watch};

@@ -336,6 +336,7 @@ pub async fn run_bot(
            super::context::SEEN_EVENT_IDS_CAP,
        ))),
        gateway_port,
+        last_matrix_event_ms: Arc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())),
    };

    slog!(
@@ -0,0 +1,666 @@
+//! `health` chat command — surface gateway, sled, matrix, creds, and build-hash status.
+//!
+//! Runs one check per subsystem (each sled probe is bounded by a 5-second
+//! timeout) and returns a compact report: one line per subsystem with
+//! PASS / WARN / FAIL and a remediation hint on every non-PASS row.  Output
+//! targets at most 20 lines: when more would be produced, the oldest WARN rows
+//! are dropped first.  PASS and FAIL rows are never dropped, so the report can
+//! exceed 20 lines when there are too few WARN rows to remove.
+
+use crate::chat::transport::matrix::bot::context::BotContext;
+use std::collections::BTreeMap;
+use std::sync::atomic::Ordering;
+use std::time::Duration;
+use tokio::time::timeout;
+
+// ── Status ─────────────────────────────────────────────────────────────────────
+
+/// Health status for a single subsystem.
+///
+/// A fieldless enum, so it is `Copy` and `Eq` in addition to the comparison
+/// support needed when filtering rows by status.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum Status {
+    /// Subsystem is operating normally.
+    Pass,
+    /// Subsystem is degraded but not fully broken.
+    Warn,
+    /// Subsystem has failed and needs intervention.
+    Fail,
+}
+
+// ── HealthLine ─────────────────────────────────────────────────────────────────
+
+/// One output row from the health check.
+///
+/// Rows are built through [`HealthLine::pass`], [`HealthLine::warn`] and
+/// [`HealthLine::fail`]; those constructors leave `detail` and `hint` as `None`
+/// for PASS rows and fill both for WARN / FAIL rows.
+#[derive(Debug, Clone)]
+struct HealthLine {
+    /// Label printed at the start of the row (e.g. `creds`, `sled:<name>`).
+    subsystem: String,
+    /// Outcome of the check; also consulted when the report is truncated.
+    status: Status,
+    /// Short description of why the check is non-PASS.
+    detail: Option<String>,
+    /// Remediation hint shown after " — " on WARN/FAIL rows.
+    hint: Option<String>,
+}
+
+impl HealthLine {
+    /// Build a PASS row, which carries neither a detail nor a hint.
+    fn pass(subsystem: impl Into<String>) -> Self {
+        Self::row(subsystem.into(), Status::Pass, None, None)
+    }
+
+    /// Build a WARN row with its explanation and remediation hint.
+    fn warn(
+        subsystem: impl Into<String>,
+        detail: impl Into<String>,
+        hint: impl Into<String>,
+    ) -> Self {
+        Self::row(
+            subsystem.into(),
+            Status::Warn,
+            Some(detail.into()),
+            Some(hint.into()),
+        )
+    }
+
+    /// Build a FAIL row with its explanation and remediation hint.
+    fn fail(
+        subsystem: impl Into<String>,
+        detail: impl Into<String>,
+        hint: impl Into<String>,
+    ) -> Self {
+        Self::row(
+            subsystem.into(),
+            Status::Fail,
+            Some(detail.into()),
+            Some(hint.into()),
+        )
+    }
+
+    /// Shared field-by-field constructor behind `pass`, `warn` and `fail`.
+    fn row(
+        subsystem: String,
+        status: Status,
+        detail: Option<String>,
+        hint: Option<String>,
+    ) -> Self {
+        Self {
+            subsystem,
+            status,
+            detail,
+            hint,
+        }
+    }
+
+    /// Render the row as one Markdown-friendly line.
+    ///
+    /// The line always starts with `<subsystem> <STATUS>`; a detail and/or hint,
+    /// when present, follow after `": "` with the hint introduced by `" — "`.
+    fn format(&self) -> String {
+        let label = match self.status {
+            Status::Pass => "PASS",
+            Status::Warn => "WARN",
+            Status::Fail => "FAIL",
+        };
+        let mut out = format!("{} {}", self.subsystem, label);
+        match (self.detail.as_deref(), self.hint.as_deref()) {
+            (None, None) => {}
+            (Some(detail), None) => {
+                out.push_str(": ");
+                out.push_str(detail);
+            }
+            (Some(detail), Some(hint)) => {
+                out.push_str(": ");
+                out.push_str(detail);
+                out.push_str(" — ");
+                out.push_str(hint);
+            }
+            (None, Some(hint)) => {
+                out.push_str(": — ");
+                out.push_str(hint);
+            }
+        }
+        out
+    }
+}
+
+// ── Truncation ────────────────────────────────────────────────────────────────
+
+/// Target maximum number of report rows.
+const MAX_LINES: usize = 20;
+
+/// Shrink `lines` towards [`MAX_LINES`] by dropping WARN rows, oldest (earliest
+/// in the vector) first.
+///
+/// PASS and FAIL rows are never removed, so the result can still be longer than
+/// [`MAX_LINES`] when there are not enough WARN rows to drop.  The relative
+/// order of the surviving rows is unchanged.
+fn truncate_lines(mut lines: Vec<HealthLine>) -> Vec<HealthLine> {
+    // Number of rows over budget; zero means there is nothing to do.
+    let mut excess = lines.len().saturating_sub(MAX_LINES);
+    if excess == 0 {
+        return lines;
+    }
+    // `retain` visits rows front to back, so the first `excess` WARN rows are
+    // dropped in a single pass instead of one search-and-shift per removal.
+    lines.retain(|line| {
+        let drop_row = excess > 0 && line.status == Status::Warn;
+        if drop_row {
+            excess -= 1;
+        }
+        !drop_row
+    });
+    lines
+}
+
+// ── Individual checks ────────────────────────────────────────────────────────
+
+/// Probe the `perm_rx` receiver lock.
+///
+/// PASS when the lock is currently held (taken to mean the permission listener
+/// owns the receiver); FAIL when it can be acquired, i.e. no task is holding it
+/// (listener has died or was never started).  The probe uses a non-blocking
+/// `try_lock`, and any guard it obtains is released immediately.
+fn check_perm_rx(ctx: &BotContext) -> HealthLine {
+    let lock_is_held = ctx.services.perm_rx.try_lock().is_err();
+    if !lock_is_held {
+        return HealthLine::fail("perm_rx", "listener not holding lock", "restart bot");
+    }
+    HealthLine::pass("perm_rx")
+}
+
+/// Grade the Matrix sync loop by the age of the last recorded event.
+///
+/// PASS under 60 s of silence, WARN from 60 s up to (but excluding) 120 s, FAIL
+/// from 120 s onwards.  The timestamp is refreshed by `on_room_message` for
+/// incoming events.
+///
+/// NOTE(review): `on_room_message` stores the timestamp before it dispatches
+/// the `health` command, so a chat-triggered check reads an age close to zero
+/// and reports PASS; staleness is only observable to other callers.
+fn check_matrix_sync(ctx: &BotContext) -> HealthLine {
+    let last_ms = ctx.last_matrix_event_ms.load(Ordering::Relaxed);
+    let now_ms = chrono::Utc::now().timestamp_millis();
+    // Clamp at zero so a timestamp ahead of "now" never yields a negative age.
+    let age_secs = (now_ms - last_ms).max(0) / 1000;
+
+    match age_secs {
+        0..=59 => HealthLine::pass("matrix-sync"),
+        60..=119 => HealthLine::warn(
+            "matrix-sync",
+            format!("no events in {age_secs}s"),
+            "check sync loop — may be a quiet room",
+        ),
+        _ => HealthLine::fail(
+            "matrix-sync",
+            format!("no events in {age_secs}s"),
+            "sync loop may be dead — restart bot",
+        ),
+    }
+}
+
+/// Inspect the LLM credentials loaded by `crate::llm::oauth::read_credentials`
+/// (`~/.claude/.credentials.json`).
+///
+/// FAIL when the credentials cannot be read (the reader's error becomes the
+/// detail) or when the access token's expiry is already in the past; WARN when
+/// fewer than 7 whole days remain; PASS otherwise.
+fn check_creds() -> HealthLine {
+    let creds = match crate::llm::oauth::read_credentials() {
+        Ok(creds) => creds,
+        Err(e) => return HealthLine::fail("creds", e, "run `claude login`"),
+    };
+
+    // A system clock before the Unix epoch degrades to "now = 0" instead of panicking.
+    let now_secs = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap_or_default()
+        .as_secs();
+    // NOTE(review): compared directly against seconds — confirm `expires_at`
+    // is stored in seconds rather than milliseconds.
+    let expires_at = creds.claude_ai_oauth.expires_at;
+
+    if expires_at < now_secs {
+        return HealthLine::fail("creds", "token expired", "run `claude login` to refresh");
+    }
+
+    let days_left = (expires_at - now_secs) / 86400;
+    if days_left >= 7 {
+        return HealthLine::pass("creds");
+    }
+    HealthLine::warn(
+        "creds",
+        format!("token expires in {days_left}d"),
+        "run `claude login` to refresh",
+    )
+}
+
+/// Compare the compile-time build hash against the current HEAD of the workspace.
+///
+/// `BUILD_GIT_HASH` (captured at compile time, `"unknown"` when unset) is
+/// compared with `git rev-parse --short HEAD` run in `project_root`.
+///
+/// * PASS when the hashes refer to the same commit, when the build hash is
+///   unknown, or when HEAD cannot be read (git missing, not a repository, or
+///   no answer within 5 seconds).
+/// * WARN when the two hashes differ — the checkout is at a different commit
+///   than the running binary, so a rebuild is available but not urgent.
+///
+/// Hashes are matched by prefix in either direction, so a full-length build
+/// hash or a different abbreviation length does not raise a false WARN.
+async fn check_build_hash(project_root: &std::path::Path) -> HealthLine {
+    let running = option_env!("BUILD_GIT_HASH").unwrap_or("unknown");
+
+    // `git` is a blocking child process, so it runs on the blocking pool.
+    let repo_root = project_root.to_path_buf();
+    let git_task = tokio::task::spawn_blocking(move || {
+        std::process::Command::new("git")
+            .args(["rev-parse", "--short", "HEAD"])
+            .current_dir(&repo_root)
+            .output()
+            .ok()
+            .filter(|o| o.status.success())
+            .and_then(|o| String::from_utf8(o.stdout).ok())
+            .map(|s| s.trim().to_string())
+            // An empty answer carries no information; treat it as unreadable.
+            .filter(|s| !s.is_empty())
+    });
+
+    // Bound the wait so a hung git process cannot stall the whole report.
+    // A timeout or a failed task is handled like an unreadable HEAD.
+    let head = match timeout(Duration::from_secs(5), git_task).await {
+        Ok(Ok(head)) => head,
+        _ => None,
+    };
+
+    let Some(head_hash) = head else {
+        return HealthLine::pass("build-hash");
+    };
+
+    let same_commit =
+        head_hash.starts_with(running) || running.starts_with(head_hash.as_str());
+    if running == "unknown" || same_commit {
+        HealthLine::pass("build-hash")
+    } else {
+        HealthLine::warn(
+            "build-hash",
+            format!("running {running}, HEAD is {head_hash}"),
+            "run `rebuild` to update",
+        )
+    }
+}
+
+/// Check each registered sled's `/health` endpoint with a 5-second timeout.
+///
+/// Returns one [`HealthLine`] per sled, in the store's iteration order.  PASS
+/// when the sled responds with HTTP 2xx; FAIL when the request times out, cannot
+/// be sent, or returns a non-2xx status; WARN when the sled has no URL.  When no
+/// sled is registered a single WARN row is returned instead.
+///
+/// All probes are started before any is awaited, so the total wait is bounded
+/// by roughly one timeout rather than one timeout per sled.
+async fn check_sleds(
+    store: &tokio::sync::RwLock<BTreeMap<String, crate::service::gateway::config::ProjectEntry>>,
+) -> Vec<HealthLine> {
+    // Copy out what is needed so the read lock is released before any network I/O.
+    let entries: Vec<(String, Option<String>)> = store
+        .read()
+        .await
+        .iter()
+        .map(|(n, e)| (n.clone(), e.url.clone()))
+        .collect();
+
+    if entries.is_empty() {
+        return vec![HealthLine::warn(
+            "sled",
+            "no sleds registered",
+            "add projects to projects.toml",
+        )];
+    }
+
+    let client = reqwest::Client::new();
+
+    // `collect` forces every spawn to happen up front; the label is kept next
+    // to each handle so a failed task can still be reported by name.
+    let probes: Vec<_> = entries
+        .into_iter()
+        .map(|(name, url)| {
+            let subsystem = format!("sled:{name}");
+            let task = tokio::spawn(probe_sled(client.clone(), subsystem.clone(), url));
+            (subsystem, task)
+        })
+        .collect();
+
+    let mut lines = Vec::with_capacity(probes.len());
+    for (subsystem, task) in probes {
+        let line = match task.await {
+            Ok(line) => line,
+            // The probe task panicked or was cancelled.
+            Err(_) => HealthLine::fail(subsystem, "probe task failed", "check bot logs"),
+        };
+        lines.push(line);
+    }
+
+    lines
+}
+
+/// Probe one sled's `/health` endpoint and classify the outcome.
+async fn probe_sled(
+    client: reqwest::Client,
+    subsystem: String,
+    url: Option<String>,
+) -> HealthLine {
+    let Some(url) = url else {
+        return HealthLine::warn(subsystem, "no URL configured", "set url in projects.toml");
+    };
+
+    let health_url = format!("{}/health", url.trim_end_matches('/'));
+    match timeout(Duration::from_secs(5), client.get(&health_url).send()).await {
+        Err(_) => HealthLine::fail(subsystem, "timed out", "check container is running"),
+        Ok(Err(e)) => HealthLine::fail(
+            subsystem,
+            format!("unreachable: {}", short_error(&e.to_string())),
+            "check container is running",
+        ),
+        Ok(Ok(resp)) if resp.status().is_success() => HealthLine::pass(subsystem),
+        Ok(Ok(resp)) => HealthLine::fail(
+            subsystem,
+            format!("HTTP {}", resp.status().as_u16()),
+            "check container logs",
+        ),
+    }
+}
+
+/// Check the gateway process: pidfile validity and (on macOS) binary codesign.
+///
+/// * FAIL (macOS only) when the codesign verification does not pass; the hint
+///   points at `script/local-release`.
+/// * WARN when the pidfile does not record this process's PID.
+/// * PASS otherwise.
+///
+/// The codesign verdict takes precedence over the pidfile verdict.
+fn check_gateway_process() -> HealthLine {
+    // Does the pidfile name this very process (i.e. this IS the live gateway)?
+    let pidfile_ok = check_pidfile_matches_self();
+
+    // The signature check exists only on macOS builds.
+    #[cfg(target_os = "macos")]
+    {
+        let signature_ok = check_codesign_macos();
+        if !signature_ok {
+            return HealthLine::fail(
+                "gateway-process",
+                "codesign invalid",
+                "run `script/local-release`",
+            );
+        }
+    }
+
+    if pidfile_ok {
+        HealthLine::pass("gateway-process")
+    } else {
+        HealthLine::warn(
+            "gateway-process",
+            "pidfile missing or stale",
+            "restart gateway with --gateway flag",
+        )
+    }
+}
+
+/// Return `true` when `$HOME/.huskies/gateway.pid` exists and contains our PID.
+///
+/// Returns `false` when the home directory cannot be determined, the file is
+/// missing or unreadable, or its trimmed content is not a valid `u32`.
+fn check_pidfile_matches_self() -> bool {
+    let Some(home) = homedir::my_home().ok().flatten() else {
+        return false;
+    };
+    let path = home.join(".huskies").join("gateway.pid");
+
+    // Carry "no usable PID" as `None` instead of a sentinel PID of 0.
+    std::fs::read_to_string(&path)
+        .ok()
+        .and_then(|content| content.trim().parse::<u32>().ok())
+        .is_some_and(|recorded| recorded == std::process::id())
+}
+
+/// On macOS, return `true` when `~/bin/huskies-bin` passes `codesign --verify`.
+///
+/// Falls back to the current executable when `$HOME` is unset or
+/// `~/bin/huskies-bin` does not exist.  Returns `true` (assume ok) when neither
+/// target can be determined or the `codesign` tool cannot be run.
+#[cfg(target_os = "macos")]
+fn check_codesign_macos() -> bool {
+    let installed = std::env::var_os("HOME")
+        .map(|home| {
+            std::path::PathBuf::from(home)
+                .join("bin")
+                .join("huskies-bin")
+        })
+        .filter(|candidate| candidate.exists());
+
+    let target = match installed {
+        Some(path) => path,
+        None => match std::env::current_exe() {
+            Ok(path) => path,
+            Err(_) => return true,
+        },
+    };
+
+    // Pass the path as an OS string: converting through `&str` would turn a
+    // non-UTF-8 path into "" and report a spurious signature failure.
+    std::process::Command::new("codesign")
+        .arg("--verify")
+        .arg("--quiet")
+        .arg(&target)
+        .output()
+        .map(|o| o.status.success())
+        .unwrap_or(true)
+}
+
+// ── Entry point ────────────────────────────────────────────────────────────────
+
+/// Run all health checks and return the formatted report, one row per line.
+///
+/// Gateway-specific checks (gateway-process, per-sled probes) are included
+/// only when running in gateway mode.  All other checks run in every mode.
+/// Rows are passed through `truncate_lines` before rendering, which trims WARN
+/// rows when the report is over its line budget.
+pub async fn run_health_check(ctx: &BotContext) -> String {
+    let mut report: Vec<HealthLine> = Vec::new();
+
+    // Gateway-only rows come first.
+    if ctx.is_gateway() {
+        report.push(check_gateway_process());
+        if let Some(store) = ctx.gateway_projects_store.as_ref() {
+            report.extend(check_sleds(store).await);
+        }
+    }
+
+    // Shared rows, evaluated one after another in display order.
+    report.extend([
+        check_perm_rx(ctx),
+        check_matrix_sync(ctx),
+        check_creds(),
+        check_build_hash(&ctx.services.project_root).await,
+    ]);
+
+    let rendered: Vec<String> = truncate_lines(report)
+        .iter()
+        .map(HealthLine::format)
+        .collect();
+    rendered.join("\n")
+}
+
+// ── Utilities ────────────────────────────────────────────────────────────────
+
+/// Keep at most the first 60 characters of `s` for compact display.
+///
+/// Counts `char`s rather than bytes, so multi-byte text is never cut in the
+/// middle of a character.
+fn short_error(s: &str) -> String {
+    match s.char_indices().nth(60) {
+        // Byte offset of the 61st character: everything before it is kept.
+        Some((cut, _)) => s[..cut].to_owned(),
+        // 60 characters or fewer: nothing to trim.
+        None => s.to_owned(),
+    }
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // -- HealthLine formatting ------------------------------------------------
+
+    #[test]
+    fn pass_line_formats_without_detail() {
+        let line = HealthLine::pass("perm_rx");
+        assert_eq!(line.format(), "perm_rx PASS");
+    }
+
+    #[test]
+    fn fail_line_formats_with_detail_and_hint() {
+        let line = HealthLine::fail(
+            "gateway-process",
+            "codesign invalid",
+            "run script/local-release",
+        );
+        assert_eq!(
+            line.format(),
+            "gateway-process FAIL: codesign invalid — run script/local-release"
+        );
+    }
+
+    #[test]
+    fn warn_line_formats_with_detail_and_hint() {
+        let line = HealthLine::warn("build-hash", "running abc, HEAD is def", "run rebuild");
+        assert_eq!(
+            line.format(),
+            "build-hash WARN: running abc, HEAD is def — run rebuild"
+        );
+    }
+
+    // -- Truncation -----------------------------------------------------------
+
+    #[test]
+    fn truncate_drops_oldest_warn_first() {
+        let mut lines: Vec<HealthLine> = (0..22)
+            .map(|i| {
+                if i % 3 == 0 {
+                    HealthLine::fail(format!("sled:{i}"), "down", "fix it")
+                } else {
+                    HealthLine::warn(format!("check:{i}"), "slow", "investigate")
+                }
+            })
+            .collect();
+
+        // Put a known WARN at position 0 and a known FAIL at position 1.
+        lines.insert(0, HealthLine::warn("oldest-warn", "stale", "restart"));
+        lines.insert(1, HealthLine::fail("important-fail", "broken", "fix"));
+
+        // 24 rows, 15 of them WARN: exactly four WARN rows have to go.
+        let result = truncate_lines(lines.clone());
+        assert_eq!(
+            result.len(),
+            MAX_LINES,
+            "output must be trimmed to {MAX_LINES} lines"
+        );
+
+        // The earliest WARN is dropped first ...
+        assert!(
+            result.iter().all(|l| l.subsystem != "oldest-warn"),
+            "the oldest WARN row must be the first one dropped"
+        );
+        // ... while a WARN beyond the first four survives.
+        assert!(
+            result.iter().any(|l| l.subsystem == "check:5"),
+            "only the four oldest WARN rows may be dropped"
+        );
+
+        // FAILs must be preserved, in their original relative order.
+        let fail_count = result.iter().filter(|l| l.status == Status::Fail).count();
+        let orig_fail_count = lines.iter().filter(|l| l.status == Status::Fail).count();
+        assert_eq!(
+            fail_count, orig_fail_count,
+            "all FAIL lines must be kept when they fit"
+        );
+        assert_eq!(result[0].subsystem, "important-fail");
+    }
+
+    #[test]
+    fn truncate_noop_when_under_limit() {
+        let lines: Vec<HealthLine> = (0..5).map(|i| HealthLine::pass(format!("s{i}"))).collect();
+        let result = truncate_lines(lines.clone());
+        assert_eq!(result.len(), 5);
+    }
+
+    #[test]
+    fn truncate_stops_at_fails_when_no_warns_left() {
+        // 25 FAIL lines — there is no WARN row to drop, so the output stays at
+        // 25 rows even though that exceeds MAX_LINES.
+        let lines: Vec<HealthLine> = (0..25)
+            .map(|i| HealthLine::fail(format!("s{i}"), "broken", "fix"))
+            .collect();
+        let result = truncate_lines(lines);
+        // When only FAILs are present, truncation stops because no WARNs can be removed.
+        assert_eq!(result.len(), 25, "FAILs are never dropped by truncation");
+    }
+
+    // -- perm_rx check --------------------------------------------------------
+
+    #[tokio::test]
+    async fn perm_rx_pass_when_locked() {
+        use crate::services::Services;
+        use std::sync::Arc;
+        use tokio::sync::Mutex as TokioMutex;
+
+        let (perm_tx, perm_rx) = tokio::sync::mpsc::unbounded_channel();
+        let perm_rx_arc = Arc::new(TokioMutex::new(perm_rx));
+
+        // Acquire the lock to simulate the permission listener holding it.
+        let _guard = perm_rx_arc.try_lock().unwrap();
+
+        // Build a minimal services bundle referencing our locked perm_rx.
+        let services = Arc::new(Services {
+            project_root: std::path::PathBuf::from("/tmp"),
+            agents: Arc::new(crate::agents::AgentPool::new_test(3000)),
+            bot_name: "test".to_string(),
+            bot_user_id: "@bot:test".to_string(),
+            ambient_rooms: Arc::new(std::sync::Mutex::new(std::collections::HashSet::new())),
+            perm_rx: Arc::clone(&perm_rx_arc),
+            pending_perm_replies: Arc::new(TokioMutex::new(std::collections::HashMap::new())),
+            permission_timeout_secs: 120,
+            status: Arc::new(crate::service::status::StatusBroadcaster::new()),
+            chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
+        });
+
+        // Build a minimal BotContext just to pass services.
+        let ctx = make_test_ctx(services);
+
+        let line = check_perm_rx(&ctx);
+        assert_eq!(
+            line.status,
+            Status::Pass,
+            "perm_rx should PASS when a task holds the lock"
+        );
+
+        drop(perm_tx); // suppress unused warning
+    }
+
+    #[tokio::test]
+    async fn perm_rx_fail_when_unlocked() {
+        use crate::services::Services;
+        use std::sync::Arc;
+        use tokio::sync::Mutex as TokioMutex;
+
+        let (_perm_tx, perm_rx) = tokio::sync::mpsc::unbounded_channel();
+        let perm_rx_arc = Arc::new(TokioMutex::new(perm_rx));
+        // Lock is NOT held by anyone.
+
+        let services = Arc::new(Services {
+            project_root: std::path::PathBuf::from("/tmp"),
+            agents: Arc::new(crate::agents::AgentPool::new_test(3000)),
+            bot_name: "test".to_string(),
+            bot_user_id: "@bot:test".to_string(),
+            ambient_rooms: Arc::new(std::sync::Mutex::new(std::collections::HashSet::new())),
+            perm_rx: Arc::clone(&perm_rx_arc),
+            pending_perm_replies: Arc::new(TokioMutex::new(std::collections::HashMap::new())),
+            permission_timeout_secs: 120,
+            status: Arc::new(crate::service::status::StatusBroadcaster::new()),
+            chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
+        });
+
+        let ctx = make_test_ctx(services);
+
+        let line = check_perm_rx(&ctx);
+        assert_eq!(
+            line.status,
+            Status::Fail,
+            "perm_rx should FAIL when no task holds the lock"
+        );
+    }
+
+    // -- matrix-sync check ----------------------------------------------------
+
+    #[tokio::test]
+    async fn matrix_sync_pass_when_recent() {
+        let services = crate::services::Services::new_test(
+            std::path::PathBuf::from("/tmp"),
+            "bot".to_string(),
+        );
+        let ctx = make_test_ctx(services);
+        // Set last event to just now.
+        ctx.last_matrix_event_ms
+            .store(chrono::Utc::now().timestamp_millis(), Ordering::Relaxed);
+        let line = check_matrix_sync(&ctx);
+        assert_eq!(line.status, Status::Pass);
+    }
+
+    #[tokio::test]
+    async fn matrix_sync_fail_when_stale() {
+        let services = crate::services::Services::new_test(
+            std::path::PathBuf::from("/tmp"),
+            "bot".to_string(),
+        );
+        let ctx = make_test_ctx(services);
+        // Simulate 200 seconds of silence.
+        let old_ms = chrono::Utc::now().timestamp_millis() - 200_000;
+        ctx.last_matrix_event_ms.store(old_ms, Ordering::Relaxed);
+        let line = check_matrix_sync(&ctx);
+        assert_eq!(line.status, Status::Fail);
+
+        // Parse the reported age back out of the detail so the assertion
+        // actually constrains it (a bare `contains("s")` always passes).
+        let detail = line.detail.as_deref().unwrap_or("");
+        let reported_secs = detail
+            .strip_prefix("no events in ")
+            .and_then(|rest| rest.strip_suffix('s'))
+            .and_then(|n| n.parse::<i64>().ok());
+        assert!(
+            reported_secs.is_some_and(|secs| secs >= 200),
+            "detail should report an age of at least 200s, got {detail:?}"
+        );
+    }
+
+    // -- creds check ----------------------------------------------------------
+
+    #[test]
+    fn creds_fail_when_file_missing() {
+        // In the test environment there is unlikely to be a ~/.claude/.credentials.json
+        // with a valid non-expired token, so we just confirm the function returns a
+        // HealthLine without panicking.
+        let line = check_creds();
+        // We don't assert a specific status — the check should not panic.
+        let _ = line.format();
+    }
+
+    // -- build_hash check -----------------------------------------------------
+
+    #[tokio::test]
+    async fn build_hash_pass_when_git_unavailable() {
+        // `/tmp/nonexistent` is not a usable git work tree, so HEAD cannot be
+        // read and the check must fall back to PASS rather than panicking.
+        let line = check_build_hash(std::path::Path::new("/tmp/nonexistent")).await;
+        assert_eq!(
+            line.status,
+            Status::Pass,
+            "an unreadable HEAD must be reported as PASS"
+        );
+        assert_eq!(line.format(), "build-hash PASS");
+    }
+
+    // -- health command registration ------------------------------------------
+
+    #[test]
+    fn health_command_registered_in_commands() {
+        let cmds = crate::chat::commands::commands();
+        assert!(
+            cmds.iter().any(|c| c.name == "health"),
+            "health must be registered in commands()"
+        );
+    }
+
+    #[test]
+    fn health_command_has_description() {
+        let cmds = crate::chat::commands::commands();
+        let cmd = cmds.iter().find(|c| c.name == "health").unwrap();
+        assert!(!cmd.description.is_empty());
+    }
+
+    // -- Helper ---------------------------------------------------------------
+
+    /// Build a minimal `BotContext` for testing purposes.
+    fn make_test_ctx(services: std::sync::Arc<crate::services::Services>) -> BotContext {
+        use std::collections::HashSet;
+        use std::sync::Arc;
+        use std::sync::atomic::AtomicI64;
+        use tokio::sync::Mutex as TokioMutex;
+
+        BotContext {
+            services,
+            matrix_user_id: "@bot:example.com".parse().unwrap(),
+            target_room_ids: vec![],
+            allowed_users: vec![],
+            history: Arc::new(TokioMutex::new(std::collections::HashMap::new())),
+            history_size: 20,
+            bot_sent_event_ids: Arc::new(TokioMutex::new(HashSet::new())),
+            htop_sessions: Arc::new(TokioMutex::new(std::collections::HashMap::new())),
+            transport: Arc::new(crate::chat::transport::whatsapp::WhatsAppTransport::new(
+                "test-phone".to_string(),
+                "test-token".to_string(),
+                "pipeline_notification".to_string(),
+            )),
+            timer_store: Arc::new(crate::service::timer::TimerStore::load(
+                std::path::PathBuf::from("/tmp/timers-health.json"),
+            )),
+            gateway_active_project: None,
+            gateway_projects_store: None,
+            handled_incoming_event_ids: Arc::new(TokioMutex::new(
+                crate::chat::transport::matrix::bot::context::SeenEventIds::new(
+                    crate::chat::transport::matrix::bot::context::SEEN_EVENT_IDS_CAP,
+                ),
+            )),
+            gateway_port: None,
+            last_matrix_event_ms: Arc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())),
+        }
+    }
+}
@@ -25,6 +25,8 @@ pub mod commands;
 pub(crate) mod config;
 /// Story deletion command — handles `!delete` bot commands to remove work items.
 pub mod delete;
+/// `health` chat command — surface gateway, sled, matrix, creds, and build-hash status.
+pub mod health;
 /// htop-style agent monitor command — renders a live process table in Matrix.
 pub mod htop;
 /// `new project <name>` chat command — Phase 1 gateway project bootstrap.