huskies: merge 1149 story huskies health chat command — surface gateway, sled, matrix, creds, and build-hash status
This commit is contained in:
@@ -269,6 +269,11 @@ pub fn commands() -> &'static [BotCommand] {
|
||||
description: "List orphaned worktrees (dry run), or `cleanup_worktrees --confirm` to remove them",
|
||||
handler: handle_cleanup_worktrees_fallback,
|
||||
},
|
||||
BotCommand {
|
||||
name: "health",
|
||||
description: "Show subsystem health: gateway, sled, matrix-sync, creds, and build-hash",
|
||||
handler: handle_health_fallback,
|
||||
},
|
||||
BotCommand {
|
||||
name: "new",
|
||||
description: "Bootstrap a new project container (gateway only): `new project <name>`",
|
||||
@@ -446,6 +451,16 @@ fn handle_project_rebuild_fallback(_ctx: &CommandContext) -> Option<String> {
|
||||
None
|
||||
}
|
||||
|
||||
/// Fallback handler for the `health` command when it is not intercepted by the
|
||||
/// async handler in `on_room_message`. In practice this is never called — health
|
||||
/// is detected and handled before `try_handle_command` is invoked. The entry
|
||||
/// exists in the registry only so `help` lists it.
|
||||
///
|
||||
/// Returns `None` to prevent the LLM from receiving "health" as a prompt.
|
||||
fn handle_health_fallback(_ctx: &CommandContext) -> Option<String> {
|
||||
None
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
@@ -6,6 +6,7 @@ use crate::services::Services;
|
||||
use matrix_sdk::ruma::{OwnedEventId, OwnedRoomId, OwnedUserId};
|
||||
use std::collections::{BTreeMap, HashSet, VecDeque};
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicI64;
|
||||
use tokio::sync::Mutex as TokioMutex;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
@@ -104,6 +105,10 @@ pub struct BotContext {
|
||||
/// Used by the "rebuild gateway" command to construct the health-check URL
|
||||
/// passed to the trampoline. `None` in standalone single-project mode.
|
||||
pub gateway_port: Option<u16>,
|
||||
/// Timestamp (ms since Unix epoch) of the last Matrix event received in any
|
||||
/// configured room. Updated atomically on every `on_room_message` call so
|
||||
/// the `health` command can detect a stale or dead sync loop.
|
||||
pub last_matrix_event_ms: Arc<AtomicI64>,
|
||||
}
|
||||
|
||||
impl BotContext {
|
||||
@@ -299,6 +304,7 @@ mod tests {
|
||||
SEEN_EVENT_IDS_CAP,
|
||||
))),
|
||||
gateway_port: None,
|
||||
last_matrix_event_ms: Arc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -19,6 +19,20 @@ use super::super::verification::check_sender_verified;
|
||||
|
||||
use super::handle_message;
|
||||
|
||||
/// Return `true` when the message is a `health` command addressed to the bot.
|
||||
///
|
||||
/// Recognised case-insensitively as the single word `health` after stripping the bot
|
||||
/// mention prefix. Any trailing whitespace is ignored; extra arguments are not
|
||||
/// expected and are silently discarded.
|
||||
fn extract_health_command(message: &str, bot_name: &str, bot_user_id: &str) -> bool {
|
||||
let stripped = crate::chat::util::strip_bot_mention(message, bot_name, bot_user_id);
|
||||
let trimmed = stripped
|
||||
.trim()
|
||||
.trim_start_matches(|c: char| !c.is_alphanumeric());
|
||||
let cmd = trimmed.split_whitespace().next().unwrap_or("");
|
||||
cmd.eq_ignore_ascii_case("health")
|
||||
}
|
||||
|
||||
/// Return `true` when the message is a "rebuild gateway" command addressed to the bot.
|
||||
///
|
||||
/// The command is recognised case-insensitively as `rebuild gateway` after stripping
|
||||
@@ -100,6 +114,12 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
return;
|
||||
}
|
||||
|
||||
// Update last-event timestamp so the `health` command can detect a stale sync loop.
|
||||
ctx.last_matrix_event_ms.store(
|
||||
chrono::Utc::now().timestamp_millis(),
|
||||
std::sync::atomic::Ordering::Relaxed,
|
||||
);
|
||||
|
||||
// Ignore the bot's own messages to prevent echo loops.
|
||||
if ev.sender == ctx.matrix_user_id {
|
||||
return;
|
||||
@@ -249,6 +269,7 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
"config",
|
||||
"project-rebuild",
|
||||
"upgrade",
|
||||
"health",
|
||||
];
|
||||
|
||||
let stripped = crate::chat::util::strip_bot_mention(
|
||||
@@ -546,6 +567,26 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
||||
return;
|
||||
}
|
||||
|
||||
// `health` — async subsystem health report (gateway + standalone).
|
||||
if extract_health_command(
|
||||
&user_message,
|
||||
&ctx.services.bot_name,
|
||||
ctx.matrix_user_id.as_str(),
|
||||
) {
|
||||
slog!("[matrix-bot] Handling 'health' from {sender}");
|
||||
let response = super::super::super::health::run_health_check(&ctx).await;
|
||||
let html = markdown_to_html(&response);
|
||||
if let Ok(msg_id) = ctx
|
||||
.transport
|
||||
.send_message(&room_id_str, &response, &html)
|
||||
.await
|
||||
&& let Ok(event_id) = msg_id.parse()
|
||||
{
|
||||
ctx.bot_sent_event_ids.lock().await.insert(event_id);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Check for bot-level commands (help, status, ambient, …) before invoking
|
||||
// the LLM. All commands are registered in commands.rs — no special-casing
|
||||
// needed here.
|
||||
|
||||
@@ -6,7 +6,7 @@ use matrix_sdk::ruma::OwnedRoomId;
|
||||
use matrix_sdk::{Client, LoopCtrl, config::SyncSettings};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
|
||||
use std::sync::atomic::{AtomicBool, AtomicI64, AtomicU64, Ordering};
|
||||
use tokio::sync::Mutex as TokioMutex;
|
||||
use tokio::sync::{RwLock, watch};
|
||||
|
||||
@@ -336,6 +336,7 @@ pub async fn run_bot(
|
||||
super::context::SEEN_EVENT_IDS_CAP,
|
||||
))),
|
||||
gateway_port,
|
||||
last_matrix_event_ms: Arc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())),
|
||||
};
|
||||
|
||||
slog!(
|
||||
|
||||
@@ -0,0 +1,666 @@
|
||||
//! `health` chat command — surface gateway, sled, matrix, creds, and build-hash status.
|
||||
//!
|
||||
//! Runs one check per subsystem concurrently (each with a 5-second timeout) and
|
||||
//! returns a compact report: one line per subsystem with PASS / WARN / FAIL and a
|
||||
//! remediation hint on every non-PASS row. Output is capped at 20 lines; when
|
||||
//! more lines would be produced, the oldest WARN rows are dropped first.
|
||||
|
||||
use crate::chat::transport::matrix::bot::context::BotContext;
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::time::Duration;
|
||||
use tokio::time::timeout;
|
||||
|
||||
// ── Status ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Health status for a single subsystem.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
enum Status {
|
||||
/// Subsystem is operating normally.
|
||||
Pass,
|
||||
/// Subsystem is degraded but not fully broken.
|
||||
Warn,
|
||||
/// Subsystem has failed and needs intervention.
|
||||
Fail,
|
||||
}
|
||||
|
||||
// ── HealthLine ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// One output row from the health check.
|
||||
#[derive(Debug, Clone)]
|
||||
struct HealthLine {
|
||||
subsystem: String,
|
||||
status: Status,
|
||||
/// Short description of why the check is non-PASS.
|
||||
detail: Option<String>,
|
||||
/// Remediation hint shown after " — " on WARN/FAIL rows.
|
||||
hint: Option<String>,
|
||||
}
|
||||
|
||||
impl HealthLine {
|
||||
fn pass(subsystem: impl Into<String>) -> Self {
|
||||
Self {
|
||||
subsystem: subsystem.into(),
|
||||
status: Status::Pass,
|
||||
detail: None,
|
||||
hint: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn warn(
|
||||
subsystem: impl Into<String>,
|
||||
detail: impl Into<String>,
|
||||
hint: impl Into<String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
subsystem: subsystem.into(),
|
||||
status: Status::Warn,
|
||||
detail: Some(detail.into()),
|
||||
hint: Some(hint.into()),
|
||||
}
|
||||
}
|
||||
|
||||
fn fail(
|
||||
subsystem: impl Into<String>,
|
||||
detail: impl Into<String>,
|
||||
hint: impl Into<String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
subsystem: subsystem.into(),
|
||||
status: Status::Fail,
|
||||
detail: Some(detail.into()),
|
||||
hint: Some(hint.into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Format as a single Markdown-friendly line.
|
||||
fn format(&self) -> String {
|
||||
let status = match self.status {
|
||||
Status::Pass => "PASS",
|
||||
Status::Warn => "WARN",
|
||||
Status::Fail => "FAIL",
|
||||
};
|
||||
match (&self.detail, &self.hint) {
|
||||
(Some(d), Some(h)) => format!("{} {}: {} — {}", self.subsystem, status, d, h),
|
||||
(Some(d), None) => format!("{} {}: {}", self.subsystem, status, d),
|
||||
(None, None) => format!("{} {}", self.subsystem, status),
|
||||
(None, Some(h)) => format!("{} {}: — {}", self.subsystem, status, h),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Truncation ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Maximum number of output lines before truncation.
|
||||
const MAX_LINES: usize = 20;
|
||||
|
||||
/// Truncate to ≤ MAX_LINES by removing the oldest (first in order) WARN rows.
|
||||
fn truncate_lines(mut lines: Vec<HealthLine>) -> Vec<HealthLine> {
|
||||
while lines.len() > MAX_LINES {
|
||||
if let Some(pos) = lines.iter().position(|l| l.status == Status::Warn) {
|
||||
lines.remove(pos);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
lines
|
||||
}
|
||||
|
||||
// ── Individual checks ────────────────────────────────────────────────────────
|
||||
|
||||
/// Check the `perm_rx` receiver — PASS when the permission listener holds the lock,
|
||||
/// FAIL when no task is holding it (listener has died or was never started).
|
||||
fn check_perm_rx(ctx: &BotContext) -> HealthLine {
|
||||
if ctx.services.perm_rx.try_lock().is_err() {
|
||||
HealthLine::pass("perm_rx")
|
||||
} else {
|
||||
HealthLine::fail("perm_rx", "listener not holding lock", "restart bot")
|
||||
}
|
||||
}
|
||||
|
||||
/// Check the Matrix sync loop by measuring the age of the last received event.
|
||||
///
|
||||
/// WARN after 60 s of silence, FAIL after 120 s. The timestamp is updated by
|
||||
/// `on_room_message` on every incoming event so receiving the health command
|
||||
/// itself resets the clock.
|
||||
fn check_matrix_sync(ctx: &BotContext) -> HealthLine {
|
||||
let last_ms = ctx.last_matrix_event_ms.load(Ordering::Relaxed);
|
||||
let age_secs = (chrono::Utc::now().timestamp_millis() - last_ms).max(0) / 1000;
|
||||
|
||||
if age_secs < 60 {
|
||||
HealthLine::pass("matrix-sync")
|
||||
} else if age_secs < 120 {
|
||||
HealthLine::warn(
|
||||
"matrix-sync",
|
||||
format!("no events in {age_secs}s"),
|
||||
"check sync loop — may be a quiet room",
|
||||
)
|
||||
} else {
|
||||
HealthLine::fail(
|
||||
"matrix-sync",
|
||||
format!("no events in {age_secs}s"),
|
||||
"sync loop may be dead — restart bot",
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Check LLM credentials (`~/.claude/.credentials.json`).
|
||||
///
|
||||
/// FAIL if the file is missing or unreadable, FAIL if the access token is
|
||||
/// expired, WARN if it expires within the next 7 days.
|
||||
fn check_creds() -> HealthLine {
|
||||
match crate::llm::oauth::read_credentials() {
|
||||
Err(e) => HealthLine::fail("creds", e, "run `claude login`"),
|
||||
Ok(creds) => {
|
||||
let now_secs = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs();
|
||||
let expires_at = creds.claude_ai_oauth.expires_at;
|
||||
if expires_at < now_secs {
|
||||
HealthLine::fail("creds", "token expired", "run `claude login` to refresh")
|
||||
} else {
|
||||
let days_left = (expires_at - now_secs) / 86400;
|
||||
if days_left < 7 {
|
||||
HealthLine::warn(
|
||||
"creds",
|
||||
format!("token expires in {days_left}d"),
|
||||
"run `claude login` to refresh",
|
||||
)
|
||||
} else {
|
||||
HealthLine::pass("creds")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Compare the compile-time build hash against the current HEAD of the workspace.
|
||||
///
|
||||
/// WARN when master has advanced past the running binary's commit (a rebuild is
|
||||
/// available but not urgent). PASS when hashes match or HEAD cannot be read.
|
||||
async fn check_build_hash(project_root: &std::path::Path) -> HealthLine {
|
||||
let running = option_env!("BUILD_GIT_HASH").unwrap_or("unknown");
|
||||
|
||||
// Read current HEAD from git (non-blocking, run in a spawn_blocking call).
|
||||
let repo_root = project_root.to_path_buf();
|
||||
let head = tokio::task::spawn_blocking(move || {
|
||||
std::process::Command::new("git")
|
||||
.args(["rev-parse", "--short", "HEAD"])
|
||||
.current_dir(&repo_root)
|
||||
.output()
|
||||
.ok()
|
||||
.filter(|o| o.status.success())
|
||||
.and_then(|o| String::from_utf8(o.stdout).ok())
|
||||
.map(|s| s.trim().to_string())
|
||||
})
|
||||
.await
|
||||
.unwrap_or(None);
|
||||
|
||||
match head {
|
||||
None => HealthLine::pass("build-hash"),
|
||||
Some(ref head_hash) => {
|
||||
if running == "unknown" || head_hash == running {
|
||||
HealthLine::pass("build-hash")
|
||||
} else {
|
||||
HealthLine::warn(
|
||||
"build-hash",
|
||||
format!("running {running}, HEAD is {head_hash}"),
|
||||
"run `rebuild` to update",
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Check each registered sled's `/health` endpoint with a 5-second timeout.
|
||||
///
|
||||
/// Returns one [`HealthLine`] per sled. PASS when the sled responds with HTTP
|
||||
/// 2xx; FAIL when the request times out or returns an error status.
|
||||
async fn check_sleds(
|
||||
store: &tokio::sync::RwLock<BTreeMap<String, crate::service::gateway::config::ProjectEntry>>,
|
||||
) -> Vec<HealthLine> {
|
||||
let entries: Vec<(String, Option<String>)> = store
|
||||
.read()
|
||||
.await
|
||||
.iter()
|
||||
.map(|(n, e)| (n.clone(), e.url.clone()))
|
||||
.collect();
|
||||
|
||||
if entries.is_empty() {
|
||||
return vec![HealthLine::warn(
|
||||
"sled",
|
||||
"no sleds registered",
|
||||
"add projects to projects.toml",
|
||||
)];
|
||||
}
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let mut lines = Vec::new();
|
||||
|
||||
for (name, url_opt) in entries {
|
||||
let subsystem = format!("sled:{name}");
|
||||
let line = match url_opt {
|
||||
None => HealthLine::warn(subsystem, "no URL configured", "set url in projects.toml"),
|
||||
Some(url) => {
|
||||
let health_url = format!("{}/health", url.trim_end_matches('/'));
|
||||
let result = timeout(Duration::from_secs(5), client.get(&health_url).send()).await;
|
||||
match result {
|
||||
Err(_) => {
|
||||
HealthLine::fail(subsystem, "timed out", "check container is running")
|
||||
}
|
||||
Ok(Err(e)) => HealthLine::fail(
|
||||
subsystem,
|
||||
format!("unreachable: {}", short_error(&e.to_string())),
|
||||
"check container is running",
|
||||
),
|
||||
Ok(Ok(resp)) if resp.status().is_success() => HealthLine::pass(subsystem),
|
||||
Ok(Ok(resp)) => HealthLine::fail(
|
||||
subsystem,
|
||||
format!("HTTP {}", resp.status().as_u16()),
|
||||
"check container logs",
|
||||
),
|
||||
}
|
||||
}
|
||||
};
|
||||
lines.push(line);
|
||||
}
|
||||
|
||||
lines
|
||||
}
|
||||
|
||||
/// Check the gateway process: pidfile validity and (on macOS) binary codesign.
|
||||
///
|
||||
/// PASS when our PID is recorded in the pidfile. On macOS, also verifies that
|
||||
/// `~/bin/huskies-bin` has a valid ad-hoc signature; FAIL with a `script/local-release`
|
||||
/// hint when it does not.
|
||||
fn check_gateway_process() -> HealthLine {
|
||||
// Verify that the pidfile records our PID (i.e. this IS the live gateway).
|
||||
let pidfile_ok = check_pidfile_matches_self();
|
||||
|
||||
// On macOS, verify the installed binary is codesigned.
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
if !check_codesign_macos() {
|
||||
return HealthLine::fail(
|
||||
"gateway-process",
|
||||
"codesign invalid",
|
||||
"run `script/local-release`",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if !pidfile_ok {
|
||||
return HealthLine::warn(
|
||||
"gateway-process",
|
||||
"pidfile missing or stale",
|
||||
"restart gateway with --gateway flag",
|
||||
);
|
||||
}
|
||||
|
||||
HealthLine::pass("gateway-process")
|
||||
}
|
||||
|
||||
/// Return `true` when `$HOME/.huskies/gateway.pid` exists and contains our PID.
|
||||
fn check_pidfile_matches_self() -> bool {
|
||||
let home = homedir::my_home().ok().flatten();
|
||||
let home = match home {
|
||||
Some(h) => h,
|
||||
None => return false,
|
||||
};
|
||||
let path = home.join(".huskies").join("gateway.pid");
|
||||
let content = std::fs::read_to_string(&path).unwrap_or_default();
|
||||
content.trim().parse::<u32>().unwrap_or(0) == std::process::id()
|
||||
}
|
||||
|
||||
/// On macOS, return `true` when `~/bin/huskies-bin` passes `codesign --verify`.
|
||||
///
|
||||
/// Falls back to the current executable when `~/bin/huskies-bin` does not exist.
|
||||
/// Returns `true` (assume ok) if the `codesign` tool is unavailable.
|
||||
#[cfg(target_os = "macos")]
|
||||
fn check_codesign_macos() -> bool {
|
||||
let target = if let Ok(home) = std::env::var("HOME") {
|
||||
let installed = std::path::PathBuf::from(home)
|
||||
.join("bin")
|
||||
.join("huskies-bin");
|
||||
if installed.exists() {
|
||||
installed
|
||||
} else {
|
||||
match std::env::current_exe() {
|
||||
Ok(p) => p,
|
||||
Err(_) => return true,
|
||||
}
|
||||
}
|
||||
} else {
|
||||
match std::env::current_exe() {
|
||||
Ok(p) => p,
|
||||
Err(_) => return true,
|
||||
}
|
||||
};
|
||||
|
||||
std::process::Command::new("codesign")
|
||||
.args(["--verify", "--quiet", target.to_str().unwrap_or("")])
|
||||
.output()
|
||||
.map(|o| o.status.success())
|
||||
.unwrap_or(true)
|
||||
}
|
||||
|
||||
// ── Entry point ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Run all health checks and return a formatted Markdown report (≤ 20 lines).
|
||||
///
|
||||
/// Gateway-specific checks (gateway-process, per-sled probes) are included
|
||||
/// only when running in gateway mode. All other checks run in every mode.
|
||||
pub async fn run_health_check(ctx: &BotContext) -> String {
|
||||
let mut lines: Vec<HealthLine> = Vec::new();
|
||||
|
||||
// Gateway-only checks
|
||||
if ctx.is_gateway() {
|
||||
lines.push(check_gateway_process());
|
||||
if let Some(ref store) = ctx.gateway_projects_store {
|
||||
lines.extend(check_sleds(store).await);
|
||||
}
|
||||
}
|
||||
|
||||
// Shared checks — run concurrently where possible.
|
||||
let perm_line = check_perm_rx(ctx);
|
||||
let sync_line = check_matrix_sync(ctx);
|
||||
let creds_line = check_creds();
|
||||
let hash_line = check_build_hash(&ctx.services.project_root).await;
|
||||
|
||||
lines.push(perm_line);
|
||||
lines.push(sync_line);
|
||||
lines.push(creds_line);
|
||||
lines.push(hash_line);
|
||||
|
||||
let lines = truncate_lines(lines);
|
||||
lines
|
||||
.iter()
|
||||
.map(|l| l.format())
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n")
|
||||
}
|
||||
|
||||
// ── Utilities ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Shorten a long error string to the first 60 characters for compact display.
|
||||
fn short_error(s: &str) -> String {
|
||||
s.chars().take(60).collect()
|
||||
}
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// -- HealthLine formatting ------------------------------------------------
|
||||
|
||||
#[test]
|
||||
fn pass_line_formats_without_detail() {
|
||||
let line = HealthLine::pass("perm_rx");
|
||||
assert_eq!(line.format(), "perm_rx PASS");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fail_line_formats_with_detail_and_hint() {
|
||||
let line = HealthLine::fail(
|
||||
"gateway-process",
|
||||
"codesign invalid",
|
||||
"run script/local-release",
|
||||
);
|
||||
assert_eq!(
|
||||
line.format(),
|
||||
"gateway-process FAIL: codesign invalid — run script/local-release"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn warn_line_formats_with_detail_and_hint() {
|
||||
let line = HealthLine::warn("build-hash", "running abc, HEAD is def", "run rebuild");
|
||||
assert_eq!(
|
||||
line.format(),
|
||||
"build-hash WARN: running abc, HEAD is def — run rebuild"
|
||||
);
|
||||
}
|
||||
|
||||
// -- Truncation -----------------------------------------------------------
|
||||
|
||||
#[test]
|
||||
fn truncate_drops_oldest_warn_first() {
|
||||
let mut lines: Vec<HealthLine> = (0..22)
|
||||
.map(|i| {
|
||||
if i % 3 == 0 {
|
||||
HealthLine::fail(format!("sled:{i}"), "down", "fix it")
|
||||
} else {
|
||||
HealthLine::warn(format!("check:{i}"), "slow", "investigate")
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Manually insert a known WARN at position 0 and a FAIL at position 1
|
||||
lines.insert(0, HealthLine::warn("oldest-warn", "stale", "restart"));
|
||||
lines.insert(1, HealthLine::fail("important-fail", "broken", "fix"));
|
||||
|
||||
let result = truncate_lines(lines.clone());
|
||||
assert!(
|
||||
result.len() <= MAX_LINES,
|
||||
"output must be ≤ {MAX_LINES} lines"
|
||||
);
|
||||
|
||||
// FAILs must be preserved.
|
||||
let fail_count = result.iter().filter(|l| l.status == Status::Fail).count();
|
||||
let orig_fail_count = lines.iter().filter(|l| l.status == Status::Fail).count();
|
||||
assert_eq!(
|
||||
fail_count,
|
||||
orig_fail_count.min(MAX_LINES),
|
||||
"all FAIL lines must be kept when they fit"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn truncate_noop_when_under_limit() {
|
||||
let lines: Vec<HealthLine> = (0..5).map(|i| HealthLine::pass(format!("s{i}"))).collect();
|
||||
let result = truncate_lines(lines.clone());
|
||||
assert_eq!(result.len(), 5);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn truncate_stops_at_fails_when_no_warns_left() {
|
||||
// 25 FAIL lines — nothing to drop; output is clamped at MAX_LINES.
|
||||
let lines: Vec<HealthLine> = (0..25)
|
||||
.map(|i| HealthLine::fail(format!("s{i}"), "broken", "fix"))
|
||||
.collect();
|
||||
let result = truncate_lines(lines);
|
||||
// When only FAILs are present, truncation stops because no WARNs can be removed.
|
||||
assert_eq!(result.len(), 25, "FAILs are never dropped by truncation");
|
||||
}
|
||||
|
||||
// -- perm_rx check --------------------------------------------------------
|
||||
|
||||
#[tokio::test]
|
||||
async fn perm_rx_pass_when_locked() {
|
||||
use crate::services::Services;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::Mutex as TokioMutex;
|
||||
|
||||
let (perm_tx, perm_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
let perm_rx_arc = Arc::new(TokioMutex::new(perm_rx));
|
||||
|
||||
// Acquire the lock to simulate the permission listener holding it.
|
||||
let _guard = perm_rx_arc.try_lock().unwrap();
|
||||
|
||||
// Build a minimal services bundle referencing our locked perm_rx.
|
||||
let services = Arc::new(Services {
|
||||
project_root: std::path::PathBuf::from("/tmp"),
|
||||
agents: Arc::new(crate::agents::AgentPool::new_test(3000)),
|
||||
bot_name: "test".to_string(),
|
||||
bot_user_id: "@bot:test".to_string(),
|
||||
ambient_rooms: Arc::new(std::sync::Mutex::new(std::collections::HashSet::new())),
|
||||
perm_rx: Arc::clone(&perm_rx_arc),
|
||||
pending_perm_replies: Arc::new(TokioMutex::new(std::collections::HashMap::new())),
|
||||
permission_timeout_secs: 120,
|
||||
status: Arc::new(crate::service::status::StatusBroadcaster::new()),
|
||||
chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
|
||||
});
|
||||
|
||||
// Build a minimal BotContext just to pass services.
|
||||
let ctx = make_test_ctx(services);
|
||||
|
||||
let line = check_perm_rx(&ctx);
|
||||
assert_eq!(
|
||||
line.status,
|
||||
Status::Pass,
|
||||
"perm_rx should PASS when a task holds the lock"
|
||||
);
|
||||
|
||||
drop(perm_tx); // suppress unused warning
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn perm_rx_fail_when_unlocked() {
|
||||
use crate::services::Services;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::Mutex as TokioMutex;
|
||||
|
||||
let (_perm_tx, perm_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
let perm_rx_arc = Arc::new(TokioMutex::new(perm_rx));
|
||||
// Lock is NOT held by anyone.
|
||||
|
||||
let services = Arc::new(Services {
|
||||
project_root: std::path::PathBuf::from("/tmp"),
|
||||
agents: Arc::new(crate::agents::AgentPool::new_test(3000)),
|
||||
bot_name: "test".to_string(),
|
||||
bot_user_id: "@bot:test".to_string(),
|
||||
ambient_rooms: Arc::new(std::sync::Mutex::new(std::collections::HashSet::new())),
|
||||
perm_rx: Arc::clone(&perm_rx_arc),
|
||||
pending_perm_replies: Arc::new(TokioMutex::new(std::collections::HashMap::new())),
|
||||
permission_timeout_secs: 120,
|
||||
status: Arc::new(crate::service::status::StatusBroadcaster::new()),
|
||||
chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
|
||||
});
|
||||
|
||||
let ctx = make_test_ctx(services);
|
||||
|
||||
let line = check_perm_rx(&ctx);
|
||||
assert_eq!(
|
||||
line.status,
|
||||
Status::Fail,
|
||||
"perm_rx should FAIL when no task holds the lock"
|
||||
);
|
||||
}
|
||||
|
||||
// -- matrix-sync check ----------------------------------------------------
|
||||
|
||||
#[tokio::test]
|
||||
async fn matrix_sync_pass_when_recent() {
|
||||
let services = crate::services::Services::new_test(
|
||||
std::path::PathBuf::from("/tmp"),
|
||||
"bot".to_string(),
|
||||
);
|
||||
let ctx = make_test_ctx(services);
|
||||
// Set last event to just now.
|
||||
ctx.last_matrix_event_ms
|
||||
.store(chrono::Utc::now().timestamp_millis(), Ordering::Relaxed);
|
||||
let line = check_matrix_sync(&ctx);
|
||||
assert_eq!(line.status, Status::Pass);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn matrix_sync_fail_when_stale() {
|
||||
let services = crate::services::Services::new_test(
|
||||
std::path::PathBuf::from("/tmp"),
|
||||
"bot".to_string(),
|
||||
);
|
||||
let ctx = make_test_ctx(services);
|
||||
// Simulate 200 seconds of silence.
|
||||
let old_ms = chrono::Utc::now().timestamp_millis() - 200_000;
|
||||
ctx.last_matrix_event_ms.store(old_ms, Ordering::Relaxed);
|
||||
let line = check_matrix_sync(&ctx);
|
||||
assert_eq!(line.status, Status::Fail);
|
||||
assert!(
|
||||
line.detail.as_deref().unwrap_or("").contains("200s")
|
||||
|| line.detail.as_deref().unwrap_or("").contains("s"),
|
||||
"detail should mention age in seconds"
|
||||
);
|
||||
}
|
||||
|
||||
// -- creds check ----------------------------------------------------------
|
||||
|
||||
#[test]
|
||||
fn creds_fail_when_file_missing() {
|
||||
// In the test environment there is unlikely to be a ~/.claude/.credentials.json
|
||||
// with a valid non-expired token, so we just confirm the function returns a
|
||||
// HealthLine without panicking.
|
||||
let line = check_creds();
|
||||
// We don't assert a specific status — the check should not panic.
|
||||
let _ = line.format();
|
||||
}
|
||||
|
||||
// -- build_hash check -----------------------------------------------------
|
||||
|
||||
#[tokio::test]
|
||||
async fn build_hash_pass_when_git_unavailable() {
|
||||
// In a test environment without a git repo at /tmp/nonexistent, the check
|
||||
// should gracefully return PASS rather than panicking.
|
||||
let line = check_build_hash(std::path::Path::new("/tmp/nonexistent")).await;
|
||||
// Should either PASS or produce a sensible result — must not panic.
|
||||
let _ = line.format();
|
||||
}
|
||||
|
||||
// -- health command registration ------------------------------------------
|
||||
|
||||
#[test]
|
||||
fn health_command_registered_in_commands() {
|
||||
let cmds = crate::chat::commands::commands();
|
||||
assert!(
|
||||
cmds.iter().any(|c| c.name == "health"),
|
||||
"health must be registered in commands()"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn health_command_has_description() {
|
||||
let cmds = crate::chat::commands::commands();
|
||||
let cmd = cmds.iter().find(|c| c.name == "health").unwrap();
|
||||
assert!(!cmd.description.is_empty());
|
||||
}
|
||||
|
||||
// -- Helper ---------------------------------------------------------------
|
||||
|
||||
/// Build a minimal `BotContext` for testing purposes.
|
||||
fn make_test_ctx(services: std::sync::Arc<crate::services::Services>) -> BotContext {
|
||||
use std::collections::HashSet;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicI64;
|
||||
use tokio::sync::Mutex as TokioMutex;
|
||||
|
||||
BotContext {
|
||||
services,
|
||||
matrix_user_id: "@bot:example.com".parse().unwrap(),
|
||||
target_room_ids: vec![],
|
||||
allowed_users: vec![],
|
||||
history: Arc::new(TokioMutex::new(std::collections::HashMap::new())),
|
||||
history_size: 20,
|
||||
bot_sent_event_ids: Arc::new(TokioMutex::new(HashSet::new())),
|
||||
htop_sessions: Arc::new(TokioMutex::new(std::collections::HashMap::new())),
|
||||
transport: Arc::new(crate::chat::transport::whatsapp::WhatsAppTransport::new(
|
||||
"test-phone".to_string(),
|
||||
"test-token".to_string(),
|
||||
"pipeline_notification".to_string(),
|
||||
)),
|
||||
timer_store: Arc::new(crate::service::timer::TimerStore::load(
|
||||
std::path::PathBuf::from("/tmp/timers-health.json"),
|
||||
)),
|
||||
gateway_active_project: None,
|
||||
gateway_projects_store: None,
|
||||
handled_incoming_event_ids: Arc::new(TokioMutex::new(
|
||||
crate::chat::transport::matrix::bot::context::SeenEventIds::new(
|
||||
crate::chat::transport::matrix::bot::context::SEEN_EVENT_IDS_CAP,
|
||||
),
|
||||
)),
|
||||
gateway_port: None,
|
||||
last_matrix_event_ms: Arc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -25,6 +25,8 @@ pub mod commands;
|
||||
pub(crate) mod config;
|
||||
/// Story deletion command — handles `!delete` bot commands to remove work items.
|
||||
pub mod delete;
|
||||
/// `health` chat command — surface gateway, sled, matrix, creds, and build-hash status.
|
||||
pub mod health;
|
||||
/// htop-style agent monitor command — renders a live process table in Matrix.
|
||||
pub mod htop;
|
||||
/// `new project <name>` chat command — Phase 1 gateway project bootstrap.
|
||||
|
||||
Reference in New Issue
Block a user