From 9a286315a3eff45ee1f15cec1c200c363bca2213 Mon Sep 17 00:00:00 2001 From: dave Date: Tue, 19 May 2026 20:07:03 +0000 Subject: [PATCH] =?UTF-8?q?huskies:=20merge=201149=20story=20`huskies=20he?= =?UTF-8?q?alth`=20chat=20command=20=E2=80=94=20surface=20gateway,=20sled,?= =?UTF-8?q?=20matrix,=20creds,=20and=20build-hash=20status?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- server/src/chat/commands/mod.rs | 15 + .../src/chat/transport/matrix/bot/context.rs | 6 + .../matrix/bot/messages/on_room_message.rs | 41 ++ server/src/chat/transport/matrix/bot/run.rs | 3 +- server/src/chat/transport/matrix/health.rs | 666 ++++++++++++++++++ server/src/chat/transport/matrix/mod.rs | 2 + 6 files changed, 732 insertions(+), 1 deletion(-) create mode 100644 server/src/chat/transport/matrix/health.rs diff --git a/server/src/chat/commands/mod.rs b/server/src/chat/commands/mod.rs index 09242b4c..5f4698c1 100644 --- a/server/src/chat/commands/mod.rs +++ b/server/src/chat/commands/mod.rs @@ -269,6 +269,11 @@ pub fn commands() -> &'static [BotCommand] { description: "List orphaned worktrees (dry run), or `cleanup_worktrees --confirm` to remove them", handler: handle_cleanup_worktrees_fallback, }, + BotCommand { + name: "health", + description: "Show subsystem health: gateway, sled, matrix-sync, creds, and build-hash", + handler: handle_health_fallback, + }, BotCommand { name: "new", description: "Bootstrap a new project container (gateway only): `new project `", @@ -446,6 +451,16 @@ fn handle_project_rebuild_fallback(_ctx: &CommandContext) -> Option { None } +/// Fallback handler for the `health` command when it is not intercepted by the +/// async handler in `on_room_message`. In practice this is never called — health +/// is detected and handled before `try_handle_command` is invoked. The entry +/// exists in the registry only so `help` lists it. +/// +/// Returns `None` to prevent the LLM from receiving "health" as a prompt. +fn handle_health_fallback(_ctx: &CommandContext) -> Option { + None +} + // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- diff --git a/server/src/chat/transport/matrix/bot/context.rs b/server/src/chat/transport/matrix/bot/context.rs index b19128a0..d62586a0 100644 --- a/server/src/chat/transport/matrix/bot/context.rs +++ b/server/src/chat/transport/matrix/bot/context.rs @@ -6,6 +6,7 @@ use crate::services::Services; use matrix_sdk::ruma::{OwnedEventId, OwnedRoomId, OwnedUserId}; use std::collections::{BTreeMap, HashSet, VecDeque}; use std::sync::Arc; +use std::sync::atomic::AtomicI64; use tokio::sync::Mutex as TokioMutex; use tokio::sync::RwLock; @@ -104,6 +105,10 @@ pub struct BotContext { /// Used by the "rebuild gateway" command to construct the health-check URL /// passed to the trampoline. `None` in standalone single-project mode. pub gateway_port: Option, + /// Timestamp (ms since Unix epoch) of the last Matrix event received in any + /// configured room. Updated atomically on every `on_room_message` call so + /// the `health` command can detect a stale or dead sync loop. + pub last_matrix_event_ms: Arc, } impl BotContext { @@ -299,6 +304,7 @@ mod tests { SEEN_EVENT_IDS_CAP, ))), gateway_port: None, + last_matrix_event_ms: Arc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())), } } diff --git a/server/src/chat/transport/matrix/bot/messages/on_room_message.rs b/server/src/chat/transport/matrix/bot/messages/on_room_message.rs index 5bc842ce..b5d1f9a7 100644 --- a/server/src/chat/transport/matrix/bot/messages/on_room_message.rs +++ b/server/src/chat/transport/matrix/bot/messages/on_room_message.rs @@ -19,6 +19,20 @@ use super::super::verification::check_sender_verified; use super::handle_message; +/// Return `true` when the message is a `health` command addressed to the bot. +/// +/// Recognised case-insensitively as the single word `health` after stripping the bot +/// mention prefix. Any trailing whitespace is ignored; extra arguments are not +/// expected and are silently discarded. +fn extract_health_command(message: &str, bot_name: &str, bot_user_id: &str) -> bool { + let stripped = crate::chat::util::strip_bot_mention(message, bot_name, bot_user_id); + let trimmed = stripped + .trim() + .trim_start_matches(|c: char| !c.is_alphanumeric()); + let cmd = trimmed.split_whitespace().next().unwrap_or(""); + cmd.eq_ignore_ascii_case("health") +} + /// Return `true` when the message is a "rebuild gateway" command addressed to the bot. /// /// The command is recognised case-insensitively as `rebuild gateway` after stripping @@ -100,6 +114,12 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message( return; } + // Update last-event timestamp so the `health` command can detect a stale sync loop. + ctx.last_matrix_event_ms.store( + chrono::Utc::now().timestamp_millis(), + std::sync::atomic::Ordering::Relaxed, + ); + // Ignore the bot's own messages to prevent echo loops. if ev.sender == ctx.matrix_user_id { return; @@ -249,6 +269,7 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message( "config", "project-rebuild", "upgrade", + "health", ]; let stripped = crate::chat::util::strip_bot_mention( @@ -546,6 +567,26 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message( return; } + // `health` — async subsystem health report (gateway + standalone). + if extract_health_command( + &user_message, + &ctx.services.bot_name, + ctx.matrix_user_id.as_str(), + ) { + slog!("[matrix-bot] Handling 'health' from {sender}"); + let response = super::super::super::health::run_health_check(&ctx).await; + let html = markdown_to_html(&response); + if let Ok(msg_id) = ctx + .transport + .send_message(&room_id_str, &response, &html) + .await + && let Ok(event_id) = msg_id.parse() + { + ctx.bot_sent_event_ids.lock().await.insert(event_id); + } + return; + } + // Check for bot-level commands (help, status, ambient, …) before invoking // the LLM. All commands are registered in commands.rs — no special-casing // needed here. diff --git a/server/src/chat/transport/matrix/bot/run.rs b/server/src/chat/transport/matrix/bot/run.rs index 680a5a85..7377aa36 100644 --- a/server/src/chat/transport/matrix/bot/run.rs +++ b/server/src/chat/transport/matrix/bot/run.rs @@ -6,7 +6,7 @@ use matrix_sdk::ruma::OwnedRoomId; use matrix_sdk::{Client, LoopCtrl, config::SyncSettings}; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicI64, AtomicU64, Ordering}; use tokio::sync::Mutex as TokioMutex; use tokio::sync::{RwLock, watch}; @@ -336,6 +336,7 @@ pub async fn run_bot( super::context::SEEN_EVENT_IDS_CAP, ))), gateway_port, + last_matrix_event_ms: Arc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())), }; slog!( diff --git a/server/src/chat/transport/matrix/health.rs b/server/src/chat/transport/matrix/health.rs new file mode 100644 index 00000000..28ac9c05 --- /dev/null +++ b/server/src/chat/transport/matrix/health.rs @@ -0,0 +1,666 @@ +//! `health` chat command — surface gateway, sled, matrix, creds, and build-hash status. +//! +//! Runs one check per subsystem concurrently (each with a 5-second timeout) and +//! returns a compact report: one line per subsystem with PASS / WARN / FAIL and a +//! remediation hint on every non-PASS row. Output is capped at 20 lines; when +//! more lines would be produced, the oldest WARN rows are dropped first. + +use crate::chat::transport::matrix::bot::context::BotContext; +use std::collections::BTreeMap; +use std::sync::atomic::Ordering; +use std::time::Duration; +use tokio::time::timeout; + +// ── Status ───────────────────────────────────────────────────────────────────── + +/// Health status for a single subsystem. +#[derive(Debug, Clone, PartialEq)] +enum Status { + /// Subsystem is operating normally. + Pass, + /// Subsystem is degraded but not fully broken. + Warn, + /// Subsystem has failed and needs intervention. + Fail, +} + +// ── HealthLine ───────────────────────────────────────────────────────────────── + +/// One output row from the health check. +#[derive(Debug, Clone)] +struct HealthLine { + subsystem: String, + status: Status, + /// Short description of why the check is non-PASS. + detail: Option, + /// Remediation hint shown after " — " on WARN/FAIL rows. + hint: Option, +} + +impl HealthLine { + fn pass(subsystem: impl Into) -> Self { + Self { + subsystem: subsystem.into(), + status: Status::Pass, + detail: None, + hint: None, + } + } + + fn warn( + subsystem: impl Into, + detail: impl Into, + hint: impl Into, + ) -> Self { + Self { + subsystem: subsystem.into(), + status: Status::Warn, + detail: Some(detail.into()), + hint: Some(hint.into()), + } + } + + fn fail( + subsystem: impl Into, + detail: impl Into, + hint: impl Into, + ) -> Self { + Self { + subsystem: subsystem.into(), + status: Status::Fail, + detail: Some(detail.into()), + hint: Some(hint.into()), + } + } + + /// Format as a single Markdown-friendly line. + fn format(&self) -> String { + let status = match self.status { + Status::Pass => "PASS", + Status::Warn => "WARN", + Status::Fail => "FAIL", + }; + match (&self.detail, &self.hint) { + (Some(d), Some(h)) => format!("{} {}: {} — {}", self.subsystem, status, d, h), + (Some(d), None) => format!("{} {}: {}", self.subsystem, status, d), + (None, None) => format!("{} {}", self.subsystem, status), + (None, Some(h)) => format!("{} {}: — {}", self.subsystem, status, h), + } + } +} + +// ── Truncation ──────────────────────────────────────────────────────────────── + +/// Maximum number of output lines before truncation. +const MAX_LINES: usize = 20; + +/// Truncate to ≤ MAX_LINES by removing the oldest (first in order) WARN rows. +fn truncate_lines(mut lines: Vec) -> Vec { + while lines.len() > MAX_LINES { + if let Some(pos) = lines.iter().position(|l| l.status == Status::Warn) { + lines.remove(pos); + } else { + break; + } + } + lines +} + +// ── Individual checks ──────────────────────────────────────────────────────── + +/// Check the `perm_rx` receiver — PASS when the permission listener holds the lock, +/// FAIL when no task is holding it (listener has died or was never started). +fn check_perm_rx(ctx: &BotContext) -> HealthLine { + if ctx.services.perm_rx.try_lock().is_err() { + HealthLine::pass("perm_rx") + } else { + HealthLine::fail("perm_rx", "listener not holding lock", "restart bot") + } +} + +/// Check the Matrix sync loop by measuring the age of the last received event. +/// +/// WARN after 60 s of silence, FAIL after 120 s. The timestamp is updated by +/// `on_room_message` on every incoming event so receiving the health command +/// itself resets the clock. +fn check_matrix_sync(ctx: &BotContext) -> HealthLine { + let last_ms = ctx.last_matrix_event_ms.load(Ordering::Relaxed); + let age_secs = (chrono::Utc::now().timestamp_millis() - last_ms).max(0) / 1000; + + if age_secs < 60 { + HealthLine::pass("matrix-sync") + } else if age_secs < 120 { + HealthLine::warn( + "matrix-sync", + format!("no events in {age_secs}s"), + "check sync loop — may be a quiet room", + ) + } else { + HealthLine::fail( + "matrix-sync", + format!("no events in {age_secs}s"), + "sync loop may be dead — restart bot", + ) + } +} + +/// Check LLM credentials (`~/.claude/.credentials.json`). +/// +/// FAIL if the file is missing or unreadable, FAIL if the access token is +/// expired, WARN if it expires within the next 7 days. +fn check_creds() -> HealthLine { + match crate::llm::oauth::read_credentials() { + Err(e) => HealthLine::fail("creds", e, "run `claude login`"), + Ok(creds) => { + let now_secs = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + let expires_at = creds.claude_ai_oauth.expires_at; + if expires_at < now_secs { + HealthLine::fail("creds", "token expired", "run `claude login` to refresh") + } else { + let days_left = (expires_at - now_secs) / 86400; + if days_left < 7 { + HealthLine::warn( + "creds", + format!("token expires in {days_left}d"), + "run `claude login` to refresh", + ) + } else { + HealthLine::pass("creds") + } + } + } + } +} + +/// Compare the compile-time build hash against the current HEAD of the workspace. +/// +/// WARN when master has advanced past the running binary's commit (a rebuild is +/// available but not urgent). PASS when hashes match or HEAD cannot be read. +async fn check_build_hash(project_root: &std::path::Path) -> HealthLine { + let running = option_env!("BUILD_GIT_HASH").unwrap_or("unknown"); + + // Read current HEAD from git (non-blocking, run in a spawn_blocking call). + let repo_root = project_root.to_path_buf(); + let head = tokio::task::spawn_blocking(move || { + std::process::Command::new("git") + .args(["rev-parse", "--short", "HEAD"]) + .current_dir(&repo_root) + .output() + .ok() + .filter(|o| o.status.success()) + .and_then(|o| String::from_utf8(o.stdout).ok()) + .map(|s| s.trim().to_string()) + }) + .await + .unwrap_or(None); + + match head { + None => HealthLine::pass("build-hash"), + Some(ref head_hash) => { + if running == "unknown" || head_hash == running { + HealthLine::pass("build-hash") + } else { + HealthLine::warn( + "build-hash", + format!("running {running}, HEAD is {head_hash}"), + "run `rebuild` to update", + ) + } + } + } +} + +/// Check each registered sled's `/health` endpoint with a 5-second timeout. +/// +/// Returns one [`HealthLine`] per sled. PASS when the sled responds with HTTP +/// 2xx; FAIL when the request times out or returns an error status. +async fn check_sleds( + store: &tokio::sync::RwLock>, +) -> Vec { + let entries: Vec<(String, Option)> = store + .read() + .await + .iter() + .map(|(n, e)| (n.clone(), e.url.clone())) + .collect(); + + if entries.is_empty() { + return vec![HealthLine::warn( + "sled", + "no sleds registered", + "add projects to projects.toml", + )]; + } + + let client = reqwest::Client::new(); + let mut lines = Vec::new(); + + for (name, url_opt) in entries { + let subsystem = format!("sled:{name}"); + let line = match url_opt { + None => HealthLine::warn(subsystem, "no URL configured", "set url in projects.toml"), + Some(url) => { + let health_url = format!("{}/health", url.trim_end_matches('/')); + let result = timeout(Duration::from_secs(5), client.get(&health_url).send()).await; + match result { + Err(_) => { + HealthLine::fail(subsystem, "timed out", "check container is running") + } + Ok(Err(e)) => HealthLine::fail( + subsystem, + format!("unreachable: {}", short_error(&e.to_string())), + "check container is running", + ), + Ok(Ok(resp)) if resp.status().is_success() => HealthLine::pass(subsystem), + Ok(Ok(resp)) => HealthLine::fail( + subsystem, + format!("HTTP {}", resp.status().as_u16()), + "check container logs", + ), + } + } + }; + lines.push(line); + } + + lines +} + +/// Check the gateway process: pidfile validity and (on macOS) binary codesign. +/// +/// PASS when our PID is recorded in the pidfile. On macOS, also verifies that +/// `~/bin/huskies-bin` has a valid ad-hoc signature; FAIL with a `script/local-release` +/// hint when it does not. +fn check_gateway_process() -> HealthLine { + // Verify that the pidfile records our PID (i.e. this IS the live gateway). + let pidfile_ok = check_pidfile_matches_self(); + + // On macOS, verify the installed binary is codesigned. + #[cfg(target_os = "macos")] + { + if !check_codesign_macos() { + return HealthLine::fail( + "gateway-process", + "codesign invalid", + "run `script/local-release`", + ); + } + } + + if !pidfile_ok { + return HealthLine::warn( + "gateway-process", + "pidfile missing or stale", + "restart gateway with --gateway flag", + ); + } + + HealthLine::pass("gateway-process") +} + +/// Return `true` when `$HOME/.huskies/gateway.pid` exists and contains our PID. +fn check_pidfile_matches_self() -> bool { + let home = homedir::my_home().ok().flatten(); + let home = match home { + Some(h) => h, + None => return false, + }; + let path = home.join(".huskies").join("gateway.pid"); + let content = std::fs::read_to_string(&path).unwrap_or_default(); + content.trim().parse::().unwrap_or(0) == std::process::id() +} + +/// On macOS, return `true` when `~/bin/huskies-bin` passes `codesign --verify`. +/// +/// Falls back to the current executable when `~/bin/huskies-bin` does not exist. +/// Returns `true` (assume ok) if the `codesign` tool is unavailable. +#[cfg(target_os = "macos")] +fn check_codesign_macos() -> bool { + let target = if let Ok(home) = std::env::var("HOME") { + let installed = std::path::PathBuf::from(home) + .join("bin") + .join("huskies-bin"); + if installed.exists() { + installed + } else { + match std::env::current_exe() { + Ok(p) => p, + Err(_) => return true, + } + } + } else { + match std::env::current_exe() { + Ok(p) => p, + Err(_) => return true, + } + }; + + std::process::Command::new("codesign") + .args(["--verify", "--quiet", target.to_str().unwrap_or("")]) + .output() + .map(|o| o.status.success()) + .unwrap_or(true) +} + +// ── Entry point ──────────────────────────────────────────────────────────────── + +/// Run all health checks and return a formatted Markdown report (≤ 20 lines). +/// +/// Gateway-specific checks (gateway-process, per-sled probes) are included +/// only when running in gateway mode. All other checks run in every mode. +pub async fn run_health_check(ctx: &BotContext) -> String { + let mut lines: Vec = Vec::new(); + + // Gateway-only checks + if ctx.is_gateway() { + lines.push(check_gateway_process()); + if let Some(ref store) = ctx.gateway_projects_store { + lines.extend(check_sleds(store).await); + } + } + + // Shared checks — run concurrently where possible. + let perm_line = check_perm_rx(ctx); + let sync_line = check_matrix_sync(ctx); + let creds_line = check_creds(); + let hash_line = check_build_hash(&ctx.services.project_root).await; + + lines.push(perm_line); + lines.push(sync_line); + lines.push(creds_line); + lines.push(hash_line); + + let lines = truncate_lines(lines); + lines + .iter() + .map(|l| l.format()) + .collect::>() + .join("\n") +} + +// ── Utilities ──────────────────────────────────────────────────────────────── + +/// Shorten a long error string to the first 60 characters for compact display. +fn short_error(s: &str) -> String { + s.chars().take(60).collect() +} + +// ── Tests ───────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + // -- HealthLine formatting ------------------------------------------------ + + #[test] + fn pass_line_formats_without_detail() { + let line = HealthLine::pass("perm_rx"); + assert_eq!(line.format(), "perm_rx PASS"); + } + + #[test] + fn fail_line_formats_with_detail_and_hint() { + let line = HealthLine::fail( + "gateway-process", + "codesign invalid", + "run script/local-release", + ); + assert_eq!( + line.format(), + "gateway-process FAIL: codesign invalid — run script/local-release" + ); + } + + #[test] + fn warn_line_formats_with_detail_and_hint() { + let line = HealthLine::warn("build-hash", "running abc, HEAD is def", "run rebuild"); + assert_eq!( + line.format(), + "build-hash WARN: running abc, HEAD is def — run rebuild" + ); + } + + // -- Truncation ----------------------------------------------------------- + + #[test] + fn truncate_drops_oldest_warn_first() { + let mut lines: Vec = (0..22) + .map(|i| { + if i % 3 == 0 { + HealthLine::fail(format!("sled:{i}"), "down", "fix it") + } else { + HealthLine::warn(format!("check:{i}"), "slow", "investigate") + } + }) + .collect(); + + // Manually insert a known WARN at position 0 and a FAIL at position 1 + lines.insert(0, HealthLine::warn("oldest-warn", "stale", "restart")); + lines.insert(1, HealthLine::fail("important-fail", "broken", "fix")); + + let result = truncate_lines(lines.clone()); + assert!( + result.len() <= MAX_LINES, + "output must be ≤ {MAX_LINES} lines" + ); + + // FAILs must be preserved. + let fail_count = result.iter().filter(|l| l.status == Status::Fail).count(); + let orig_fail_count = lines.iter().filter(|l| l.status == Status::Fail).count(); + assert_eq!( + fail_count, + orig_fail_count.min(MAX_LINES), + "all FAIL lines must be kept when they fit" + ); + } + + #[test] + fn truncate_noop_when_under_limit() { + let lines: Vec = (0..5).map(|i| HealthLine::pass(format!("s{i}"))).collect(); + let result = truncate_lines(lines.clone()); + assert_eq!(result.len(), 5); + } + + #[test] + fn truncate_stops_at_fails_when_no_warns_left() { + // 25 FAIL lines — nothing to drop; output is clamped at MAX_LINES. + let lines: Vec = (0..25) + .map(|i| HealthLine::fail(format!("s{i}"), "broken", "fix")) + .collect(); + let result = truncate_lines(lines); + // When only FAILs are present, truncation stops because no WARNs can be removed. + assert_eq!(result.len(), 25, "FAILs are never dropped by truncation"); + } + + // -- perm_rx check -------------------------------------------------------- + + #[tokio::test] + async fn perm_rx_pass_when_locked() { + use crate::services::Services; + use std::sync::Arc; + use tokio::sync::Mutex as TokioMutex; + + let (perm_tx, perm_rx) = tokio::sync::mpsc::unbounded_channel(); + let perm_rx_arc = Arc::new(TokioMutex::new(perm_rx)); + + // Acquire the lock to simulate the permission listener holding it. + let _guard = perm_rx_arc.try_lock().unwrap(); + + // Build a minimal services bundle referencing our locked perm_rx. + let services = Arc::new(Services { + project_root: std::path::PathBuf::from("/tmp"), + agents: Arc::new(crate::agents::AgentPool::new_test(3000)), + bot_name: "test".to_string(), + bot_user_id: "@bot:test".to_string(), + ambient_rooms: Arc::new(std::sync::Mutex::new(std::collections::HashSet::new())), + perm_rx: Arc::clone(&perm_rx_arc), + pending_perm_replies: Arc::new(TokioMutex::new(std::collections::HashMap::new())), + permission_timeout_secs: 120, + status: Arc::new(crate::service::status::StatusBroadcaster::new()), + chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)), + }); + + // Build a minimal BotContext just to pass services. + let ctx = make_test_ctx(services); + + let line = check_perm_rx(&ctx); + assert_eq!( + line.status, + Status::Pass, + "perm_rx should PASS when a task holds the lock" + ); + + drop(perm_tx); // suppress unused warning + } + + #[tokio::test] + async fn perm_rx_fail_when_unlocked() { + use crate::services::Services; + use std::sync::Arc; + use tokio::sync::Mutex as TokioMutex; + + let (_perm_tx, perm_rx) = tokio::sync::mpsc::unbounded_channel(); + let perm_rx_arc = Arc::new(TokioMutex::new(perm_rx)); + // Lock is NOT held by anyone. + + let services = Arc::new(Services { + project_root: std::path::PathBuf::from("/tmp"), + agents: Arc::new(crate::agents::AgentPool::new_test(3000)), + bot_name: "test".to_string(), + bot_user_id: "@bot:test".to_string(), + ambient_rooms: Arc::new(std::sync::Mutex::new(std::collections::HashSet::new())), + perm_rx: Arc::clone(&perm_rx_arc), + pending_perm_replies: Arc::new(TokioMutex::new(std::collections::HashMap::new())), + permission_timeout_secs: 120, + status: Arc::new(crate::service::status::StatusBroadcaster::new()), + chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)), + }); + + let ctx = make_test_ctx(services); + + let line = check_perm_rx(&ctx); + assert_eq!( + line.status, + Status::Fail, + "perm_rx should FAIL when no task holds the lock" + ); + } + + // -- matrix-sync check ---------------------------------------------------- + + #[tokio::test] + async fn matrix_sync_pass_when_recent() { + let services = crate::services::Services::new_test( + std::path::PathBuf::from("/tmp"), + "bot".to_string(), + ); + let ctx = make_test_ctx(services); + // Set last event to just now. + ctx.last_matrix_event_ms + .store(chrono::Utc::now().timestamp_millis(), Ordering::Relaxed); + let line = check_matrix_sync(&ctx); + assert_eq!(line.status, Status::Pass); + } + + #[tokio::test] + async fn matrix_sync_fail_when_stale() { + let services = crate::services::Services::new_test( + std::path::PathBuf::from("/tmp"), + "bot".to_string(), + ); + let ctx = make_test_ctx(services); + // Simulate 200 seconds of silence. + let old_ms = chrono::Utc::now().timestamp_millis() - 200_000; + ctx.last_matrix_event_ms.store(old_ms, Ordering::Relaxed); + let line = check_matrix_sync(&ctx); + assert_eq!(line.status, Status::Fail); + assert!( + line.detail.as_deref().unwrap_or("").contains("200s") + || line.detail.as_deref().unwrap_or("").contains("s"), + "detail should mention age in seconds" + ); + } + + // -- creds check ---------------------------------------------------------- + + #[test] + fn creds_fail_when_file_missing() { + // In the test environment there is unlikely to be a ~/.claude/.credentials.json + // with a valid non-expired token, so we just confirm the function returns a + // HealthLine without panicking. + let line = check_creds(); + // We don't assert a specific status — the check should not panic. + let _ = line.format(); + } + + // -- build_hash check ----------------------------------------------------- + + #[tokio::test] + async fn build_hash_pass_when_git_unavailable() { + // In a test environment without a git repo at /tmp/nonexistent, the check + // should gracefully return PASS rather than panicking. + let line = check_build_hash(std::path::Path::new("/tmp/nonexistent")).await; + // Should either PASS or produce a sensible result — must not panic. + let _ = line.format(); + } + + // -- health command registration ------------------------------------------ + + #[test] + fn health_command_registered_in_commands() { + let cmds = crate::chat::commands::commands(); + assert!( + cmds.iter().any(|c| c.name == "health"), + "health must be registered in commands()" + ); + } + + #[test] + fn health_command_has_description() { + let cmds = crate::chat::commands::commands(); + let cmd = cmds.iter().find(|c| c.name == "health").unwrap(); + assert!(!cmd.description.is_empty()); + } + + // -- Helper --------------------------------------------------------------- + + /// Build a minimal `BotContext` for testing purposes. + fn make_test_ctx(services: std::sync::Arc) -> BotContext { + use std::collections::HashSet; + use std::sync::Arc; + use std::sync::atomic::AtomicI64; + use tokio::sync::Mutex as TokioMutex; + + BotContext { + services, + matrix_user_id: "@bot:example.com".parse().unwrap(), + target_room_ids: vec![], + allowed_users: vec![], + history: Arc::new(TokioMutex::new(std::collections::HashMap::new())), + history_size: 20, + bot_sent_event_ids: Arc::new(TokioMutex::new(HashSet::new())), + htop_sessions: Arc::new(TokioMutex::new(std::collections::HashMap::new())), + transport: Arc::new(crate::chat::transport::whatsapp::WhatsAppTransport::new( + "test-phone".to_string(), + "test-token".to_string(), + "pipeline_notification".to_string(), + )), + timer_store: Arc::new(crate::service::timer::TimerStore::load( + std::path::PathBuf::from("/tmp/timers-health.json"), + )), + gateway_active_project: None, + gateway_projects_store: None, + handled_incoming_event_ids: Arc::new(TokioMutex::new( + crate::chat::transport::matrix::bot::context::SeenEventIds::new( + crate::chat::transport::matrix::bot::context::SEEN_EVENT_IDS_CAP, + ), + )), + gateway_port: None, + last_matrix_event_ms: Arc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())), + } + } +} diff --git a/server/src/chat/transport/matrix/mod.rs b/server/src/chat/transport/matrix/mod.rs index 8ed25881..155de692 100644 --- a/server/src/chat/transport/matrix/mod.rs +++ b/server/src/chat/transport/matrix/mod.rs @@ -25,6 +25,8 @@ pub mod commands; pub(crate) mod config; /// Story deletion command — handles `!delete` bot commands to remove work items. pub mod delete; +/// `health` chat command — surface gateway, sled, matrix, creds, and build-hash status. +pub mod health; /// htop-style agent monitor command — renders a live process table in Matrix. pub mod htop; /// `new project ` chat command — Phase 1 gateway project bootstrap.