huskies: merge 1149 story huskies health chat command — surface gateway, sled, matrix, creds, and build-hash status
This commit is contained in:
@@ -269,6 +269,11 @@ pub fn commands() -> &'static [BotCommand] {
|
|||||||
description: "List orphaned worktrees (dry run), or `cleanup_worktrees --confirm` to remove them",
|
description: "List orphaned worktrees (dry run), or `cleanup_worktrees --confirm` to remove them",
|
||||||
handler: handle_cleanup_worktrees_fallback,
|
handler: handle_cleanup_worktrees_fallback,
|
||||||
},
|
},
|
||||||
|
BotCommand {
|
||||||
|
name: "health",
|
||||||
|
description: "Show subsystem health: gateway, sled, matrix-sync, creds, and build-hash",
|
||||||
|
handler: handle_health_fallback,
|
||||||
|
},
|
||||||
BotCommand {
|
BotCommand {
|
||||||
name: "new",
|
name: "new",
|
||||||
description: "Bootstrap a new project container (gateway only): `new project <name>`",
|
description: "Bootstrap a new project container (gateway only): `new project <name>`",
|
||||||
@@ -446,6 +451,16 @@ fn handle_project_rebuild_fallback(_ctx: &CommandContext) -> Option<String> {
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Fallback handler for the `health` command when it is not intercepted by the
|
||||||
|
/// async handler in `on_room_message`. In practice this is never called — health
|
||||||
|
/// is detected and handled before `try_handle_command` is invoked. The entry
|
||||||
|
/// exists in the registry only so `help` lists it.
|
||||||
|
///
|
||||||
|
/// Returns `None` to prevent the LLM from receiving "health" as a prompt.
|
||||||
|
fn handle_health_fallback(_ctx: &CommandContext) -> Option<String> {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Tests
|
// Tests
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ use crate::services::Services;
|
|||||||
use matrix_sdk::ruma::{OwnedEventId, OwnedRoomId, OwnedUserId};
|
use matrix_sdk::ruma::{OwnedEventId, OwnedRoomId, OwnedUserId};
|
||||||
use std::collections::{BTreeMap, HashSet, VecDeque};
|
use std::collections::{BTreeMap, HashSet, VecDeque};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use std::sync::atomic::AtomicI64;
|
||||||
use tokio::sync::Mutex as TokioMutex;
|
use tokio::sync::Mutex as TokioMutex;
|
||||||
use tokio::sync::RwLock;
|
use tokio::sync::RwLock;
|
||||||
|
|
||||||
@@ -104,6 +105,10 @@ pub struct BotContext {
|
|||||||
/// Used by the "rebuild gateway" command to construct the health-check URL
|
/// Used by the "rebuild gateway" command to construct the health-check URL
|
||||||
/// passed to the trampoline. `None` in standalone single-project mode.
|
/// passed to the trampoline. `None` in standalone single-project mode.
|
||||||
pub gateway_port: Option<u16>,
|
pub gateway_port: Option<u16>,
|
||||||
|
/// Timestamp (ms since Unix epoch) of the last Matrix event received in any
|
||||||
|
/// configured room. Updated atomically on every `on_room_message` call so
|
||||||
|
/// the `health` command can detect a stale or dead sync loop.
|
||||||
|
pub last_matrix_event_ms: Arc<AtomicI64>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl BotContext {
|
impl BotContext {
|
||||||
@@ -299,6 +304,7 @@ mod tests {
|
|||||||
SEEN_EVENT_IDS_CAP,
|
SEEN_EVENT_IDS_CAP,
|
||||||
))),
|
))),
|
||||||
gateway_port: None,
|
gateway_port: None,
|
||||||
|
last_matrix_event_ms: Arc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -19,6 +19,20 @@ use super::super::verification::check_sender_verified;
|
|||||||
|
|
||||||
use super::handle_message;
|
use super::handle_message;
|
||||||
|
|
||||||
|
/// Return `true` when the message is a `health` command addressed to the bot.
|
||||||
|
///
|
||||||
|
/// Recognised case-insensitively as the single word `health` after stripping the bot
|
||||||
|
/// mention prefix. Any trailing whitespace is ignored; extra arguments are not
|
||||||
|
/// expected and are silently discarded.
|
||||||
|
fn extract_health_command(message: &str, bot_name: &str, bot_user_id: &str) -> bool {
|
||||||
|
let stripped = crate::chat::util::strip_bot_mention(message, bot_name, bot_user_id);
|
||||||
|
let trimmed = stripped
|
||||||
|
.trim()
|
||||||
|
.trim_start_matches(|c: char| !c.is_alphanumeric());
|
||||||
|
let cmd = trimmed.split_whitespace().next().unwrap_or("");
|
||||||
|
cmd.eq_ignore_ascii_case("health")
|
||||||
|
}
|
||||||
|
|
||||||
/// Return `true` when the message is a "rebuild gateway" command addressed to the bot.
|
/// Return `true` when the message is a "rebuild gateway" command addressed to the bot.
|
||||||
///
|
///
|
||||||
/// The command is recognised case-insensitively as `rebuild gateway` after stripping
|
/// The command is recognised case-insensitively as `rebuild gateway` after stripping
|
||||||
@@ -100,6 +114,12 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Update last-event timestamp so the `health` command can detect a stale sync loop.
|
||||||
|
ctx.last_matrix_event_ms.store(
|
||||||
|
chrono::Utc::now().timestamp_millis(),
|
||||||
|
std::sync::atomic::Ordering::Relaxed,
|
||||||
|
);
|
||||||
|
|
||||||
// Ignore the bot's own messages to prevent echo loops.
|
// Ignore the bot's own messages to prevent echo loops.
|
||||||
if ev.sender == ctx.matrix_user_id {
|
if ev.sender == ctx.matrix_user_id {
|
||||||
return;
|
return;
|
||||||
@@ -249,6 +269,7 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
|||||||
"config",
|
"config",
|
||||||
"project-rebuild",
|
"project-rebuild",
|
||||||
"upgrade",
|
"upgrade",
|
||||||
|
"health",
|
||||||
];
|
];
|
||||||
|
|
||||||
let stripped = crate::chat::util::strip_bot_mention(
|
let stripped = crate::chat::util::strip_bot_mention(
|
||||||
@@ -546,6 +567,26 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// `health` — async subsystem health report (gateway + standalone).
|
||||||
|
if extract_health_command(
|
||||||
|
&user_message,
|
||||||
|
&ctx.services.bot_name,
|
||||||
|
ctx.matrix_user_id.as_str(),
|
||||||
|
) {
|
||||||
|
slog!("[matrix-bot] Handling 'health' from {sender}");
|
||||||
|
let response = super::super::super::health::run_health_check(&ctx).await;
|
||||||
|
let html = markdown_to_html(&response);
|
||||||
|
if let Ok(msg_id) = ctx
|
||||||
|
.transport
|
||||||
|
.send_message(&room_id_str, &response, &html)
|
||||||
|
.await
|
||||||
|
&& let Ok(event_id) = msg_id.parse()
|
||||||
|
{
|
||||||
|
ctx.bot_sent_event_ids.lock().await.insert(event_id);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// Check for bot-level commands (help, status, ambient, …) before invoking
|
// Check for bot-level commands (help, status, ambient, …) before invoking
|
||||||
// the LLM. All commands are registered in commands.rs — no special-casing
|
// the LLM. All commands are registered in commands.rs — no special-casing
|
||||||
// needed here.
|
// needed here.
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ use matrix_sdk::ruma::OwnedRoomId;
|
|||||||
use matrix_sdk::{Client, LoopCtrl, config::SyncSettings};
|
use matrix_sdk::{Client, LoopCtrl, config::SyncSettings};
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
|
use std::sync::atomic::{AtomicBool, AtomicI64, AtomicU64, Ordering};
|
||||||
use tokio::sync::Mutex as TokioMutex;
|
use tokio::sync::Mutex as TokioMutex;
|
||||||
use tokio::sync::{RwLock, watch};
|
use tokio::sync::{RwLock, watch};
|
||||||
|
|
||||||
@@ -336,6 +336,7 @@ pub async fn run_bot(
|
|||||||
super::context::SEEN_EVENT_IDS_CAP,
|
super::context::SEEN_EVENT_IDS_CAP,
|
||||||
))),
|
))),
|
||||||
gateway_port,
|
gateway_port,
|
||||||
|
last_matrix_event_ms: Arc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())),
|
||||||
};
|
};
|
||||||
|
|
||||||
slog!(
|
slog!(
|
||||||
|
|||||||
@@ -0,0 +1,666 @@
|
|||||||
|
//! `health` chat command — surface gateway, sled, matrix, creds, and build-hash status.
//!
//! Runs one check per subsystem (each sled HTTP probe is bounded by a 5-second
//! timeout) and returns a compact report: one line per subsystem with
//! PASS / WARN / FAIL and a remediation hint on every non-PASS row. The report
//! targets at most 20 lines: when more would be produced, the oldest WARN rows
//! are dropped first. PASS and FAIL rows are never dropped, so a report with
//! too few WARN rows can still exceed 20 lines.
|
||||||
|
|
||||||
|
use crate::chat::transport::matrix::bot::context::BotContext;
|
||||||
|
use std::collections::BTreeMap;
|
||||||
|
use std::sync::atomic::Ordering;
|
||||||
|
use std::time::Duration;
|
||||||
|
use tokio::time::timeout;
|
||||||
|
|
||||||
|
// ── Status ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Health status for a single subsystem.
///
/// The enum carries no data, so it is `Copy` and `Eq` in addition to
/// `PartialEq`; values can be compared and passed around freely.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Status {
    /// Subsystem is operating normally.
    Pass,
    /// Subsystem is degraded but not fully broken.
    Warn,
    /// Subsystem has failed and needs intervention.
    Fail,
}
|
||||||
|
|
||||||
|
// ── HealthLine ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// One output row from the health check.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct HealthLine {
|
||||||
|
subsystem: String,
|
||||||
|
status: Status,
|
||||||
|
/// Short description of why the check is non-PASS.
|
||||||
|
detail: Option<String>,
|
||||||
|
/// Remediation hint shown after " — " on WARN/FAIL rows.
|
||||||
|
hint: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl HealthLine {
|
||||||
|
fn pass(subsystem: impl Into<String>) -> Self {
|
||||||
|
Self {
|
||||||
|
subsystem: subsystem.into(),
|
||||||
|
status: Status::Pass,
|
||||||
|
detail: None,
|
||||||
|
hint: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn warn(
|
||||||
|
subsystem: impl Into<String>,
|
||||||
|
detail: impl Into<String>,
|
||||||
|
hint: impl Into<String>,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
subsystem: subsystem.into(),
|
||||||
|
status: Status::Warn,
|
||||||
|
detail: Some(detail.into()),
|
||||||
|
hint: Some(hint.into()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn fail(
|
||||||
|
subsystem: impl Into<String>,
|
||||||
|
detail: impl Into<String>,
|
||||||
|
hint: impl Into<String>,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
subsystem: subsystem.into(),
|
||||||
|
status: Status::Fail,
|
||||||
|
detail: Some(detail.into()),
|
||||||
|
hint: Some(hint.into()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Format as a single Markdown-friendly line.
|
||||||
|
fn format(&self) -> String {
|
||||||
|
let status = match self.status {
|
||||||
|
Status::Pass => "PASS",
|
||||||
|
Status::Warn => "WARN",
|
||||||
|
Status::Fail => "FAIL",
|
||||||
|
};
|
||||||
|
match (&self.detail, &self.hint) {
|
||||||
|
(Some(d), Some(h)) => format!("{} {}: {} — {}", self.subsystem, status, d, h),
|
||||||
|
(Some(d), None) => format!("{} {}: {}", self.subsystem, status, d),
|
||||||
|
(None, None) => format!("{} {}", self.subsystem, status),
|
||||||
|
(None, Some(h)) => format!("{} {}: — {}", self.subsystem, status, h),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Truncation ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Maximum number of output lines before truncation.
|
||||||
|
const MAX_LINES: usize = 20;
|
||||||
|
|
||||||
|
/// Truncate to ≤ MAX_LINES by removing the oldest (first in order) WARN rows.
|
||||||
|
fn truncate_lines(mut lines: Vec<HealthLine>) -> Vec<HealthLine> {
|
||||||
|
while lines.len() > MAX_LINES {
|
||||||
|
if let Some(pos) = lines.iter().position(|l| l.status == Status::Warn) {
|
||||||
|
lines.remove(pos);
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lines
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Individual checks ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Check the `perm_rx` receiver — PASS when the permission listener holds the lock,
|
||||||
|
/// FAIL when no task is holding it (listener has died or was never started).
|
||||||
|
fn check_perm_rx(ctx: &BotContext) -> HealthLine {
|
||||||
|
if ctx.services.perm_rx.try_lock().is_err() {
|
||||||
|
HealthLine::pass("perm_rx")
|
||||||
|
} else {
|
||||||
|
HealthLine::fail("perm_rx", "listener not holding lock", "restart bot")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check the Matrix sync loop by measuring the age of the last received event.
|
||||||
|
///
|
||||||
|
/// WARN after 60 s of silence, FAIL after 120 s. The timestamp is updated by
|
||||||
|
/// `on_room_message` on every incoming event so receiving the health command
|
||||||
|
/// itself resets the clock.
|
||||||
|
fn check_matrix_sync(ctx: &BotContext) -> HealthLine {
|
||||||
|
let last_ms = ctx.last_matrix_event_ms.load(Ordering::Relaxed);
|
||||||
|
let age_secs = (chrono::Utc::now().timestamp_millis() - last_ms).max(0) / 1000;
|
||||||
|
|
||||||
|
if age_secs < 60 {
|
||||||
|
HealthLine::pass("matrix-sync")
|
||||||
|
} else if age_secs < 120 {
|
||||||
|
HealthLine::warn(
|
||||||
|
"matrix-sync",
|
||||||
|
format!("no events in {age_secs}s"),
|
||||||
|
"check sync loop — may be a quiet room",
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
HealthLine::fail(
|
||||||
|
"matrix-sync",
|
||||||
|
format!("no events in {age_secs}s"),
|
||||||
|
"sync loop may be dead — restart bot",
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check LLM credentials (`~/.claude/.credentials.json`).
|
||||||
|
///
|
||||||
|
/// FAIL if the file is missing or unreadable, FAIL if the access token is
|
||||||
|
/// expired, WARN if it expires within the next 7 days.
|
||||||
|
fn check_creds() -> HealthLine {
|
||||||
|
match crate::llm::oauth::read_credentials() {
|
||||||
|
Err(e) => HealthLine::fail("creds", e, "run `claude login`"),
|
||||||
|
Ok(creds) => {
|
||||||
|
let now_secs = std::time::SystemTime::now()
|
||||||
|
.duration_since(std::time::UNIX_EPOCH)
|
||||||
|
.unwrap_or_default()
|
||||||
|
.as_secs();
|
||||||
|
let expires_at = creds.claude_ai_oauth.expires_at;
|
||||||
|
if expires_at < now_secs {
|
||||||
|
HealthLine::fail("creds", "token expired", "run `claude login` to refresh")
|
||||||
|
} else {
|
||||||
|
let days_left = (expires_at - now_secs) / 86400;
|
||||||
|
if days_left < 7 {
|
||||||
|
HealthLine::warn(
|
||||||
|
"creds",
|
||||||
|
format!("token expires in {days_left}d"),
|
||||||
|
"run `claude login` to refresh",
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
HealthLine::pass("creds")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compare the compile-time build hash against the current HEAD of the workspace.
|
||||||
|
///
|
||||||
|
/// WARN when master has advanced past the running binary's commit (a rebuild is
|
||||||
|
/// available but not urgent). PASS when hashes match or HEAD cannot be read.
|
||||||
|
async fn check_build_hash(project_root: &std::path::Path) -> HealthLine {
|
||||||
|
let running = option_env!("BUILD_GIT_HASH").unwrap_or("unknown");
|
||||||
|
|
||||||
|
// Read current HEAD from git (non-blocking, run in a spawn_blocking call).
|
||||||
|
let repo_root = project_root.to_path_buf();
|
||||||
|
let head = tokio::task::spawn_blocking(move || {
|
||||||
|
std::process::Command::new("git")
|
||||||
|
.args(["rev-parse", "--short", "HEAD"])
|
||||||
|
.current_dir(&repo_root)
|
||||||
|
.output()
|
||||||
|
.ok()
|
||||||
|
.filter(|o| o.status.success())
|
||||||
|
.and_then(|o| String::from_utf8(o.stdout).ok())
|
||||||
|
.map(|s| s.trim().to_string())
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.unwrap_or(None);
|
||||||
|
|
||||||
|
match head {
|
||||||
|
None => HealthLine::pass("build-hash"),
|
||||||
|
Some(ref head_hash) => {
|
||||||
|
if running == "unknown" || head_hash == running {
|
||||||
|
HealthLine::pass("build-hash")
|
||||||
|
} else {
|
||||||
|
HealthLine::warn(
|
||||||
|
"build-hash",
|
||||||
|
format!("running {running}, HEAD is {head_hash}"),
|
||||||
|
"run `rebuild` to update",
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check each registered sled's `/health` endpoint with a 5-second timeout.
|
||||||
|
///
|
||||||
|
/// Returns one [`HealthLine`] per sled. PASS when the sled responds with HTTP
|
||||||
|
/// 2xx; FAIL when the request times out or returns an error status.
|
||||||
|
async fn check_sleds(
|
||||||
|
store: &tokio::sync::RwLock<BTreeMap<String, crate::service::gateway::config::ProjectEntry>>,
|
||||||
|
) -> Vec<HealthLine> {
|
||||||
|
let entries: Vec<(String, Option<String>)> = store
|
||||||
|
.read()
|
||||||
|
.await
|
||||||
|
.iter()
|
||||||
|
.map(|(n, e)| (n.clone(), e.url.clone()))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if entries.is_empty() {
|
||||||
|
return vec![HealthLine::warn(
|
||||||
|
"sled",
|
||||||
|
"no sleds registered",
|
||||||
|
"add projects to projects.toml",
|
||||||
|
)];
|
||||||
|
}
|
||||||
|
|
||||||
|
let client = reqwest::Client::new();
|
||||||
|
let mut lines = Vec::new();
|
||||||
|
|
||||||
|
for (name, url_opt) in entries {
|
||||||
|
let subsystem = format!("sled:{name}");
|
||||||
|
let line = match url_opt {
|
||||||
|
None => HealthLine::warn(subsystem, "no URL configured", "set url in projects.toml"),
|
||||||
|
Some(url) => {
|
||||||
|
let health_url = format!("{}/health", url.trim_end_matches('/'));
|
||||||
|
let result = timeout(Duration::from_secs(5), client.get(&health_url).send()).await;
|
||||||
|
match result {
|
||||||
|
Err(_) => {
|
||||||
|
HealthLine::fail(subsystem, "timed out", "check container is running")
|
||||||
|
}
|
||||||
|
Ok(Err(e)) => HealthLine::fail(
|
||||||
|
subsystem,
|
||||||
|
format!("unreachable: {}", short_error(&e.to_string())),
|
||||||
|
"check container is running",
|
||||||
|
),
|
||||||
|
Ok(Ok(resp)) if resp.status().is_success() => HealthLine::pass(subsystem),
|
||||||
|
Ok(Ok(resp)) => HealthLine::fail(
|
||||||
|
subsystem,
|
||||||
|
format!("HTTP {}", resp.status().as_u16()),
|
||||||
|
"check container logs",
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
lines.push(line);
|
||||||
|
}
|
||||||
|
|
||||||
|
lines
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check the gateway process: pidfile validity and (on macOS) binary codesign.
|
||||||
|
///
|
||||||
|
/// PASS when our PID is recorded in the pidfile. On macOS, also verifies that
|
||||||
|
/// `~/bin/huskies-bin` has a valid ad-hoc signature; FAIL with a `script/local-release`
|
||||||
|
/// hint when it does not.
|
||||||
|
fn check_gateway_process() -> HealthLine {
|
||||||
|
// Verify that the pidfile records our PID (i.e. this IS the live gateway).
|
||||||
|
let pidfile_ok = check_pidfile_matches_self();
|
||||||
|
|
||||||
|
// On macOS, verify the installed binary is codesigned.
|
||||||
|
#[cfg(target_os = "macos")]
|
||||||
|
{
|
||||||
|
if !check_codesign_macos() {
|
||||||
|
return HealthLine::fail(
|
||||||
|
"gateway-process",
|
||||||
|
"codesign invalid",
|
||||||
|
"run `script/local-release`",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !pidfile_ok {
|
||||||
|
return HealthLine::warn(
|
||||||
|
"gateway-process",
|
||||||
|
"pidfile missing or stale",
|
||||||
|
"restart gateway with --gateway flag",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
HealthLine::pass("gateway-process")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return `true` when `$HOME/.huskies/gateway.pid` exists and contains our PID.
|
||||||
|
fn check_pidfile_matches_self() -> bool {
|
||||||
|
let home = homedir::my_home().ok().flatten();
|
||||||
|
let home = match home {
|
||||||
|
Some(h) => h,
|
||||||
|
None => return false,
|
||||||
|
};
|
||||||
|
let path = home.join(".huskies").join("gateway.pid");
|
||||||
|
let content = std::fs::read_to_string(&path).unwrap_or_default();
|
||||||
|
content.trim().parse::<u32>().unwrap_or(0) == std::process::id()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// On macOS, return `true` when the gateway binary passes `codesign --verify`.
///
/// The binary checked is `~/bin/huskies-bin` when `HOME` is set and that file
/// exists; otherwise the currently running executable. Returns `true`
/// (assume ok) when no binary path can be determined or when the `codesign`
/// tool cannot be launched.
///
/// The path is handed to `codesign` as an OS string, so a path that is not
/// valid UTF-8 is verified as-is instead of being replaced by an empty
/// argument.
#[cfg(target_os = "macos")]
fn check_codesign_macos() -> bool {
    // Preferred target: the installed binary, if present.
    let installed = std::env::var("HOME")
        .ok()
        .map(|home| std::path::PathBuf::from(home).join("bin").join("huskies-bin"))
        .filter(|candidate| candidate.exists());

    // Fallback: whatever executable we are running right now.
    let target = match installed.or_else(|| std::env::current_exe().ok()) {
        Some(path) => path,
        None => return true,
    };

    std::process::Command::new("codesign")
        .args(["--verify", "--quiet"])
        .arg(&target)
        .output()
        .map(|o| o.status.success())
        .unwrap_or(true)
}
|
||||||
|
|
||||||
|
// ── Entry point ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Run all health checks and return a formatted Markdown report (≤ 20 lines).
|
||||||
|
///
|
||||||
|
/// Gateway-specific checks (gateway-process, per-sled probes) are included
|
||||||
|
/// only when running in gateway mode. All other checks run in every mode.
|
||||||
|
pub async fn run_health_check(ctx: &BotContext) -> String {
|
||||||
|
let mut lines: Vec<HealthLine> = Vec::new();
|
||||||
|
|
||||||
|
// Gateway-only checks
|
||||||
|
if ctx.is_gateway() {
|
||||||
|
lines.push(check_gateway_process());
|
||||||
|
if let Some(ref store) = ctx.gateway_projects_store {
|
||||||
|
lines.extend(check_sleds(store).await);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Shared checks — run concurrently where possible.
|
||||||
|
let perm_line = check_perm_rx(ctx);
|
||||||
|
let sync_line = check_matrix_sync(ctx);
|
||||||
|
let creds_line = check_creds();
|
||||||
|
let hash_line = check_build_hash(&ctx.services.project_root).await;
|
||||||
|
|
||||||
|
lines.push(perm_line);
|
||||||
|
lines.push(sync_line);
|
||||||
|
lines.push(creds_line);
|
||||||
|
lines.push(hash_line);
|
||||||
|
|
||||||
|
let lines = truncate_lines(lines);
|
||||||
|
lines
|
||||||
|
.iter()
|
||||||
|
.map(|l| l.format())
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Utilities ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Keep at most the first 60 characters of `s` for compact display.
///
/// The limit counts `char`s, not bytes, so multi-byte text is never split in
/// the middle of a character. Shorter input is returned unchanged.
fn short_error(s: &str) -> String {
    const LIMIT: usize = 60;
    match s.char_indices().nth(LIMIT) {
        // Byte offset of the first character past the limit.
        Some((cut, _)) => s[..cut].to_string(),
        None => s.to_string(),
    }
}
|
||||||
|
|
||||||
|
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
// -- HealthLine formatting ------------------------------------------------
|
||||||
|
|
||||||
|
#[test]
fn pass_line_formats_without_detail() {
    // A PASS row renders as just "<subsystem> PASS".
    assert_eq!(HealthLine::pass("perm_rx").format(), "perm_rx PASS");
}
|
||||||
|
|
||||||
|
#[test]
fn fail_line_formats_with_detail_and_hint() {
    // FAIL rows render detail after a colon and the hint after " — ".
    let rendered = HealthLine::fail(
        "gateway-process",
        "codesign invalid",
        "run script/local-release",
    )
    .format();
    let expected = "gateway-process FAIL: codesign invalid — run script/local-release";
    assert_eq!(rendered, expected);
}
|
||||||
|
|
||||||
|
#[test]
fn warn_line_formats_with_detail_and_hint() {
    // WARN rows use the same layout as FAIL rows.
    let rendered =
        HealthLine::warn("build-hash", "running abc, HEAD is def", "run rebuild").format();
    let expected = "build-hash WARN: running abc, HEAD is def — run rebuild";
    assert_eq!(rendered, expected);
}
|
||||||
|
|
||||||
|
// -- Truncation -----------------------------------------------------------
|
||||||
|
|
||||||
|
#[test]
fn truncate_drops_oldest_warn_first() {
    // 22 generated rows: every third one is a FAIL, the rest are WARNs.
    let mut lines: Vec<HealthLine> = (0..22)
        .map(|i| {
            if i % 3 == 0 {
                HealthLine::fail(format!("sled:{i}"), "down", "fix it")
            } else {
                HealthLine::warn(format!("check:{i}"), "slow", "investigate")
            }
        })
        .collect();

    // Manually insert a known WARN at position 0 and a FAIL at position 1.
    lines.insert(0, HealthLine::warn("oldest-warn", "stale", "restart"));
    lines.insert(1, HealthLine::fail("important-fail", "broken", "fix"));

    let orig_fail_count = lines.iter().filter(|l| l.status == Status::Fail).count();
    let result = truncate_lines(lines);

    // 24 rows with 15 WARNs: exactly four WARNs must go.
    assert_eq!(
        result.len(),
        MAX_LINES,
        "output must be ≤ {MAX_LINES} lines"
    );

    // The earliest WARN is the first one to be removed ...
    assert!(
        result.iter().all(|l| l.subsystem != "oldest-warn"),
        "the oldest WARN row must be dropped first"
    );
    // ... which leaves the FAIL that followed it at the front.
    assert_eq!(result[0].subsystem, "important-fail");

    // FAILs must be preserved.
    let fail_count = result.iter().filter(|l| l.status == Status::Fail).count();
    assert_eq!(
        fail_count, orig_fail_count,
        "all FAIL lines must be kept when they fit"
    );
}
|
||||||
|
|
||||||
|
#[test]
fn truncate_noop_when_under_limit() {
    // Five rows are well under the limit, so nothing is removed.
    let short_report: Vec<HealthLine> = (0..5)
        .map(|i| HealthLine::pass(format!("s{i}")))
        .collect();
    let kept = truncate_lines(short_report).len();
    assert_eq!(kept, 5);
}
|
||||||
|
|
||||||
|
#[test]
fn truncate_stops_at_fails_when_no_warns_left() {
    // 25 FAIL rows and no WARN rows: there is nothing truncation is allowed
    // to drop, so the report stays longer than MAX_LINES.
    let all_fails: Vec<HealthLine> = (0..25)
        .map(|i| HealthLine::fail(format!("s{i}"), "broken", "fix"))
        .collect();
    let kept = truncate_lines(all_fails).len();
    assert_eq!(kept, 25, "FAILs are never dropped by truncation");
}
|
||||||
|
|
||||||
|
// -- perm_rx check --------------------------------------------------------
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn perm_rx_pass_when_locked() {
|
||||||
|
use crate::services::Services;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::sync::Mutex as TokioMutex;
|
||||||
|
|
||||||
|
let (perm_tx, perm_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||||
|
let perm_rx_arc = Arc::new(TokioMutex::new(perm_rx));
|
||||||
|
|
||||||
|
// Acquire the lock to simulate the permission listener holding it.
|
||||||
|
let _guard = perm_rx_arc.try_lock().unwrap();
|
||||||
|
|
||||||
|
// Build a minimal services bundle referencing our locked perm_rx.
|
||||||
|
let services = Arc::new(Services {
|
||||||
|
project_root: std::path::PathBuf::from("/tmp"),
|
||||||
|
agents: Arc::new(crate::agents::AgentPool::new_test(3000)),
|
||||||
|
bot_name: "test".to_string(),
|
||||||
|
bot_user_id: "@bot:test".to_string(),
|
||||||
|
ambient_rooms: Arc::new(std::sync::Mutex::new(std::collections::HashSet::new())),
|
||||||
|
perm_rx: Arc::clone(&perm_rx_arc),
|
||||||
|
pending_perm_replies: Arc::new(TokioMutex::new(std::collections::HashMap::new())),
|
||||||
|
permission_timeout_secs: 120,
|
||||||
|
status: Arc::new(crate::service::status::StatusBroadcaster::new()),
|
||||||
|
chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
|
||||||
|
});
|
||||||
|
|
||||||
|
// Build a minimal BotContext just to pass services.
|
||||||
|
let ctx = make_test_ctx(services);
|
||||||
|
|
||||||
|
let line = check_perm_rx(&ctx);
|
||||||
|
assert_eq!(
|
||||||
|
line.status,
|
||||||
|
Status::Pass,
|
||||||
|
"perm_rx should PASS when a task holds the lock"
|
||||||
|
);
|
||||||
|
|
||||||
|
drop(perm_tx); // suppress unused warning
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn perm_rx_fail_when_unlocked() {
|
||||||
|
use crate::services::Services;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::sync::Mutex as TokioMutex;
|
||||||
|
|
||||||
|
let (_perm_tx, perm_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||||
|
let perm_rx_arc = Arc::new(TokioMutex::new(perm_rx));
|
||||||
|
// Lock is NOT held by anyone.
|
||||||
|
|
||||||
|
let services = Arc::new(Services {
|
||||||
|
project_root: std::path::PathBuf::from("/tmp"),
|
||||||
|
agents: Arc::new(crate::agents::AgentPool::new_test(3000)),
|
||||||
|
bot_name: "test".to_string(),
|
||||||
|
bot_user_id: "@bot:test".to_string(),
|
||||||
|
ambient_rooms: Arc::new(std::sync::Mutex::new(std::collections::HashSet::new())),
|
||||||
|
perm_rx: Arc::clone(&perm_rx_arc),
|
||||||
|
pending_perm_replies: Arc::new(TokioMutex::new(std::collections::HashMap::new())),
|
||||||
|
permission_timeout_secs: 120,
|
||||||
|
status: Arc::new(crate::service::status::StatusBroadcaster::new()),
|
||||||
|
chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
|
||||||
|
});
|
||||||
|
|
||||||
|
let ctx = make_test_ctx(services);
|
||||||
|
|
||||||
|
let line = check_perm_rx(&ctx);
|
||||||
|
assert_eq!(
|
||||||
|
line.status,
|
||||||
|
Status::Fail,
|
||||||
|
"perm_rx should FAIL when no task holds the lock"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// -- matrix-sync check ----------------------------------------------------
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn matrix_sync_pass_when_recent() {
|
||||||
|
let services = crate::services::Services::new_test(
|
||||||
|
std::path::PathBuf::from("/tmp"),
|
||||||
|
"bot".to_string(),
|
||||||
|
);
|
||||||
|
let ctx = make_test_ctx(services);
|
||||||
|
// Set last event to just now.
|
||||||
|
ctx.last_matrix_event_ms
|
||||||
|
.store(chrono::Utc::now().timestamp_millis(), Ordering::Relaxed);
|
||||||
|
let line = check_matrix_sync(&ctx);
|
||||||
|
assert_eq!(line.status, Status::Pass);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn matrix_sync_fail_when_stale() {
|
||||||
|
let services = crate::services::Services::new_test(
|
||||||
|
std::path::PathBuf::from("/tmp"),
|
||||||
|
"bot".to_string(),
|
||||||
|
);
|
||||||
|
let ctx = make_test_ctx(services);
|
||||||
|
// Simulate 200 seconds of silence.
|
||||||
|
let old_ms = chrono::Utc::now().timestamp_millis() - 200_000;
|
||||||
|
ctx.last_matrix_event_ms.store(old_ms, Ordering::Relaxed);
|
||||||
|
let line = check_matrix_sync(&ctx);
|
||||||
|
assert_eq!(line.status, Status::Fail);
|
||||||
|
assert!(
|
||||||
|
line.detail.as_deref().unwrap_or("").contains("200s")
|
||||||
|
|| line.detail.as_deref().unwrap_or("").contains("s"),
|
||||||
|
"detail should mention age in seconds"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// -- creds check ----------------------------------------------------------
|
||||||
|
|
||||||
|
#[test]
fn creds_fail_when_file_missing() {
    // Smoke test only: whether ~/.claude/.credentials.json exists (and what
    // it contains) depends on the machine running the tests, so no particular
    // status is asserted. The check merely has to produce a renderable row
    // without panicking.
    let _ = check_creds().format();
}
|
||||||
|
|
||||||
|
// -- build_hash check -----------------------------------------------------
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn build_hash_pass_when_git_unavailable() {
|
||||||
|
// In a test environment without a git repo at /tmp/nonexistent, the check
|
||||||
|
// should gracefully return PASS rather than panicking.
|
||||||
|
let line = check_build_hash(std::path::Path::new("/tmp/nonexistent")).await;
|
||||||
|
// Should either PASS or produce a sensible result — must not panic.
|
||||||
|
let _ = line.format();
|
||||||
|
}
|
||||||
|
|
||||||
|
// -- health command registration ------------------------------------------
|
||||||
|
|
||||||
|
/// The command registry must contain an entry named `health`.
#[test]
fn health_command_registered_in_commands() {
    let registered = crate::chat::commands::commands()
        .iter()
        .any(|entry| entry.name == "health");

    assert!(registered, "health must be registered in commands()");
}
|
||||||
|
|
||||||
|
/// The registry entry for `health` must carry a non-empty description.
#[test]
fn health_command_has_description() {
    // Panics via `unwrap` if `health` is missing from the registry.
    let cmd = crate::chat::commands::commands()
        .iter()
        .find(|entry| entry.name == "health")
        .unwrap();

    assert!(!cmd.description.is_empty());
}
|
||||||
|
|
||||||
|
// -- Helper ---------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Build a minimal `BotContext` for testing purposes.
|
||||||
|
fn make_test_ctx(services: std::sync::Arc<crate::services::Services>) -> BotContext {
|
||||||
|
use std::collections::HashSet;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::sync::atomic::AtomicI64;
|
||||||
|
use tokio::sync::Mutex as TokioMutex;
|
||||||
|
|
||||||
|
BotContext {
|
||||||
|
services,
|
||||||
|
matrix_user_id: "@bot:example.com".parse().unwrap(),
|
||||||
|
target_room_ids: vec![],
|
||||||
|
allowed_users: vec![],
|
||||||
|
history: Arc::new(TokioMutex::new(std::collections::HashMap::new())),
|
||||||
|
history_size: 20,
|
||||||
|
bot_sent_event_ids: Arc::new(TokioMutex::new(HashSet::new())),
|
||||||
|
htop_sessions: Arc::new(TokioMutex::new(std::collections::HashMap::new())),
|
||||||
|
transport: Arc::new(crate::chat::transport::whatsapp::WhatsAppTransport::new(
|
||||||
|
"test-phone".to_string(),
|
||||||
|
"test-token".to_string(),
|
||||||
|
"pipeline_notification".to_string(),
|
||||||
|
)),
|
||||||
|
timer_store: Arc::new(crate::service::timer::TimerStore::load(
|
||||||
|
std::path::PathBuf::from("/tmp/timers-health.json"),
|
||||||
|
)),
|
||||||
|
gateway_active_project: None,
|
||||||
|
gateway_projects_store: None,
|
||||||
|
handled_incoming_event_ids: Arc::new(TokioMutex::new(
|
||||||
|
crate::chat::transport::matrix::bot::context::SeenEventIds::new(
|
||||||
|
crate::chat::transport::matrix::bot::context::SEEN_EVENT_IDS_CAP,
|
||||||
|
),
|
||||||
|
)),
|
||||||
|
gateway_port: None,
|
||||||
|
last_matrix_event_ms: Arc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -25,6 +25,8 @@ pub mod commands;
|
|||||||
pub(crate) mod config;
|
pub(crate) mod config;
|
||||||
/// Story deletion command — handles `!delete` bot commands to remove work items.
|
/// Story deletion command — handles `!delete` bot commands to remove work items.
|
||||||
pub mod delete;
|
pub mod delete;
|
||||||
|
/// `health` chat command — surface gateway, sled, matrix, creds, and build-hash status.
|
||||||
|
pub mod health;
|
||||||
/// htop-style agent monitor command — renders a live process table in Matrix.
|
/// htop-style agent monitor command — renders a live process table in Matrix.
|
||||||
pub mod htop;
|
pub mod htop;
|
||||||
/// `new project <name>` chat command — Phase 1 gateway project bootstrap.
|
/// `new project <name>` chat command — Phase 1 gateway project bootstrap.
|
||||||
|
|||||||
Reference in New Issue
Block a user