huskies: merge 534_refactor_unify_timer_tick_watchdog_and_watcher_sweep_into_a_single_1_second_tick_loop

dave
2026-04-10 17:34:41 +00:00
parent 808935b446
commit 91be0ac47f
5 changed files with 88 additions and 125 deletions
+5 -25
@@ -1,8 +1,7 @@
 //! Watchdog task: detects orphaned agents and triggers auto-assign.

 use std::collections::HashMap;
-use std::path::PathBuf;
-use std::sync::{Arc, Mutex};
+use std::sync::Mutex;
 use tokio::sync::broadcast;

 use crate::slog;
@@ -73,30 +72,11 @@ impl AgentPool {
         check_orphaned_agents(&self.agents);
     }

-    /// Spawn a background watchdog task that periodically checks for Running agents
-    /// whose underlying task has already finished (orphaned entries). Any such agent
-    /// is marked Failed and an Error event is emitted so that `wait_for_agent` unblocks.
+    /// Run one watchdog pass and return the number of orphaned agents detected.
     ///
-    /// The watchdog runs every 30 seconds. It is a safety net for edge cases where the
-    /// PTY read loop exits without updating the agent status (e.g. a panic in the
-    /// spawn_blocking task, or an external SIGKILL that closes the PTY fd immediately).
-    ///
-    /// When orphaned agents are detected and a `project_root` is provided, auto-assign
-    /// is triggered so that free agents can pick up unassigned work.
-    pub fn spawn_watchdog(pool: Arc<AgentPool>, project_root: Option<PathBuf>) {
-        tokio::spawn(async move {
-            let mut interval = tokio::time::interval(std::time::Duration::from_secs(30));
-            loop {
-                interval.tick().await;
-                let found = check_orphaned_agents(&pool.agents);
-                if found > 0
-                    && let Some(ref root) = project_root
-                {
-                    slog!("[watchdog] {found} orphaned agent(s) detected; triggering auto-assign.");
-                    pool.auto_assign_available_work(root).await;
-                }
-            }
-        });
+    /// Called by the unified background tick loop every 30 ticks.
+    pub fn run_watchdog_pass(&self) -> usize {
+        check_orphaned_agents(&self.agents)
     }
 }
+4 -35
@@ -1,8 +1,8 @@
 //! Deferred agent start via one-shot timers.
 //!
-//! Provides [`TimerStore`] for persisting timers to `.huskies/timers.json`,
-//! a 30-second tick loop ([`spawn_timer_tick_loop`]) that fires due timers,
+//! Provides [`TimerStore`] for persisting timers to `.huskies/timers.json`
 //! and command parsing / handling for the `timer` bot command.
+//! Due timers are fired by the unified background tick loop in `main`.

 use chrono::{DateTime, Duration, Local, NaiveTime, TimeZone, Utc};
 use chrono_tz::Tz;
@@ -134,43 +134,12 @@ impl TimerStore {
 // ── Tick loop ──────────────────────────────────────────────────────────────

-/// Spawn a background tokio task that fires due timers every 1 second.
-///
-/// Same pattern as the watchdog in `agents::pool::auto_assign`.
-/// When a timer fires, `start_agent` is called for the story. If all coders
-/// are busy the story remains in `2_current/` and auto-assign will pick it up.
-///
-/// The loop body is wrapped in `catch_unwind` so a panic on any single tick
-/// does not silently kill the background task.
-pub fn spawn_timer_tick_loop(
-    store: Arc<TimerStore>,
-    agents: Arc<crate::agents::AgentPool>,
-    project_root: PathBuf,
-) {
-    let pending_count = store.list().len();
-    crate::slog!(
-        "[timer] Tick loop started; {pending_count} pending timer(s) loaded"
-    );
-    tokio::spawn(async move {
-        let mut interval = tokio::time::interval(std::time::Duration::from_secs(1));
-        loop {
-            interval.tick().await;
-            // Wrap the tick body so a panic doesn't kill the loop.
-            let tick_result = tick_once(&store, &agents, &project_root).await;
-            if let Err(msg) = tick_result {
-                crate::slog_error!("[timer] Tick panicked: {msg}");
-            }
-        }
-    });
-}

 /// Execute one tick of the timer loop.
 ///
+/// Called by the unified background tick loop every second.
-/// Separated from the loop so we can catch panics at the call-site.
 /// Returns `Err` only when the tick panicked (the panic message is returned).
-async fn tick_once(
+pub(crate) async fn tick_once(
     store: &Arc<TimerStore>,
     agents: &Arc<crate::agents::AgentPool>,
     project_root: &Path,
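With the local loop gone, `tick_once` keeps its old contract: a panic inside the tick is caught and surfaced as an `Err` so the calling loop survives. The diff does not show how the panic is trapped; one plausible shape is sketched below, assuming the `futures` crate and a hypothetical `tick_body`:

```rust
use std::panic::AssertUnwindSafe;
use futures::FutureExt; // assumes the `futures` crate is available

// Hypothetical tick body standing in for the real timer logic.
async fn tick_body() {}

// Run one tick, converting a panic into an `Err(message)` so the
// caller's loop keeps running.
async fn guarded_tick() -> Result<(), String> {
    AssertUnwindSafe(tick_body())
        .catch_unwind()
        .await
        .map_err(|payload| {
            // Panic payloads are usually `&str` or `String`.
            payload
                .downcast_ref::<&str>()
                .map(|s| s.to_string())
                .or_else(|| payload.downcast_ref::<String>().cloned())
                .unwrap_or_else(|| "non-string panic payload".to_string())
        })
}
```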
+0 -5
@@ -220,11 +220,6 @@ pub async fn run_bot(
     let timer_store = Arc::new(crate::chat::timer::TimerStore::load(
         project_root.join(".huskies").join("timers.json"),
     ));
-    crate::chat::timer::spawn_timer_tick_loop(
-        Arc::clone(&timer_store),
-        Arc::clone(&agents),
-        project_root.clone(),
-    );

     // Auto-schedule timers when an agent hits a hard rate limit.
     crate::chat::timer::spawn_rate_limit_auto_scheduler(
         Arc::clone(&timer_store),
+20 -49
@@ -19,7 +19,6 @@
 //! via exit-code inspection and silently skips the commit while still broadcasting
 //! the event so connected clients stay in sync.

-use crate::config::{ProjectConfig, WatcherConfig};
 use crate::slog;
 use notify::{EventKind, RecommendedWatcher, RecursiveMode, Watcher, recommended_watcher};
 use serde::Serialize;
@@ -328,7 +327,7 @@ fn flush_pending(
 /// All state is read from and written to CRDT — no filesystem access.
 /// Worktree pruning is handled separately by the CRDT event subscriber.
 pub(crate) fn sweep_done_to_archived(done_retention: Duration) {
-    use crate::pipeline_state::{Stage, read_all_typed};
+    use crate::pipeline_state::{PipelineEvent, Stage, stage_dir_name, transition, read_all_typed};

     for item in read_all_typed() {
         if let Stage::Done { merged_at, .. } = &item.stage {
@@ -337,9 +336,24 @@ pub(crate) fn sweep_done_to_archived(done_retention: Duration) {
                 .to_std()
                 .unwrap_or_default();
             if age >= done_retention {
-                let story_id = &item.story_id.0;
-                crate::db::move_item_stage(story_id, "6_archived", None);
-                slog!("[watcher] sweep: promoted {story_id} → 6_archived/");
+                let story_id = item.story_id.0.clone();
+                match transition(item.stage.clone(), PipelineEvent::Accepted) {
+                    Ok(new_stage) => {
+                        crate::crdt_state::write_item(
+                            &story_id,
+                            stage_dir_name(&new_stage),
+                            None,
+                            None,
+                            None,
+                            Some(false),
+                            None,
+                        );
+                        slog!("[watcher] sweep: promoted {story_id} → 6_archived/");
+                    }
+                    Err(e) => {
+                        slog!("[watcher] sweep: transition error for {story_id}: {e}");
+                    }
+                }
             }
         }
     }
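The rewritten sweep routes through the typed state machine instead of writing the stage string directly, so an illegal hop (say, an item that is no longer `Done`) becomes a logged `Err` rather than a silent bad write. A trimmed-down sketch of that pattern; the variants and payloads here are hypothetical, not the real `pipeline_state` types:

```rust
// Hypothetical, trimmed-down sketch of a pipeline state machine;
// the real Stage has more variants and richer payloads.
#[derive(Clone, Debug)]
enum Stage {
    Done { merged_at: i64 }, // unix seconds, stands in for the real timestamp
    Archived,
}

#[derive(Debug)]
enum PipelineEvent {
    Accepted,
}

fn transition(stage: Stage, event: PipelineEvent) -> Result<Stage, String> {
    match (stage, event) {
        // Only Done items may be promoted to the archive.
        (Stage::Done { .. }, PipelineEvent::Accepted) => Ok(Stage::Archived),
        (other, ev) => Err(format!("illegal transition: {ev:?} from {other:?}")),
    }
}

fn main() {
    assert!(matches!(
        transition(Stage::Done { merged_at: 0 }, PipelineEvent::Accepted),
        Ok(Stage::Archived)
    ));
    assert!(transition(Stage::Archived, PipelineEvent::Accepted).is_err());
}
```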
@@ -360,7 +374,6 @@ pub(crate) fn sweep_done_to_archived(done_retention: Duration) {
 pub fn start_watcher(
     git_root: PathBuf,
     event_tx: broadcast::Sender<WatcherEvent>,
-    watcher_config: WatcherConfig,
 ) {
     std::thread::spawn(move || {
         let (notify_tx, notify_rx) = mpsc::channel::<notify::Result<notify::Event>>();
@@ -389,27 +402,13 @@ pub fn start_watcher(
             }
         }

-        slog!("[watcher] watching config files and running sweep timer");
+        slog!("[watcher] watching config files for hot-reload");

         const DEBOUNCE: Duration = Duration::from_millis(300);

-        // Mutable sweep config — hot-reloaded when project.toml changes.
-        let mut sweep_interval = Duration::from_secs(watcher_config.sweep_interval_secs);
-        let mut done_retention = Duration::from_secs(watcher_config.done_retention_secs);
-        slog!(
-            "[watcher] sweep_interval={}s done_retention={}s",
-            watcher_config.sweep_interval_secs,
-            watcher_config.done_retention_secs
-        );

         // Whether a config file change is pending in the current debounce window.
         let mut config_changed_pending = false;
         let mut deadline: Option<Instant> = None;

-        // Track when we last swept 5_done/ → 6_archived/.
-        // Initialise to "now minus interval" so the first sweep runs on startup.
-        let mut last_sweep = Instant::now()
-            .checked_sub(sweep_interval)
-            .unwrap_or_else(Instant::now);

         loop {
             // How long until the debounce window closes (or wait for next event).
@@ -454,37 +453,9 @@ pub fn start_watcher(
slog!("[watcher] broadcasting agent_config_changed");
let _ = event_tx.send(WatcherEvent::ConfigChanged);
// Hot-reload sweep config from project.toml.
match ProjectConfig::load(&git_root) {
Ok(cfg) => {
let new_sweep = Duration::from_secs(cfg.watcher.sweep_interval_secs);
let new_retention =
Duration::from_secs(cfg.watcher.done_retention_secs);
if new_sweep != sweep_interval || new_retention != done_retention {
slog!(
"[watcher] hot-reload: sweep_interval={}s done_retention={}s",
cfg.watcher.sweep_interval_secs,
cfg.watcher.done_retention_secs
);
sweep_interval = new_sweep;
done_retention = new_retention;
}
}
Err(e) => {
slog!("[watcher] hot-reload: failed to parse config: {e}");
}
}
config_changed_pending = false;
}
deadline = None;
// Periodically promote old items from 5_done/ to 6_archived/.
let now = Instant::now();
if now.duration_since(last_sweep) >= sweep_interval {
last_sweep = now;
sweep_done_to_archived(done_retention);
}
}
}
});
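With the sweep gone, the watcher thread keeps only its debounce-and-broadcast core: file events set a pending flag and push out a deadline, and a single config-changed notification fires per quiet window. A minimal sketch of that pattern; the channel payload and function names are hypothetical, not the watcher's real types:

```rust
use std::sync::mpsc;
use std::time::{Duration, Instant};

const DEBOUNCE: Duration = Duration::from_millis(300);

// Coalesce bursts of file events into a single "config changed" signal.
fn debounce_loop(rx: mpsc::Receiver<()>, mut on_change: impl FnMut()) {
    let mut pending = false;
    let mut deadline: Option<Instant> = None;
    loop {
        // Wait until the debounce window closes, or indefinitely-ish if idle.
        let timeout = deadline
            .map(|d| d.saturating_duration_since(Instant::now()))
            .unwrap_or(Duration::from_secs(3600));
        match rx.recv_timeout(timeout) {
            Ok(()) => {
                // New event: (re)open the debounce window.
                pending = true;
                deadline = Some(Instant::now() + DEBOUNCE);
            }
            Err(mpsc::RecvTimeoutError::Timeout) => {
                if pending {
                    on_change(); // window closed with events pending: fire once
                    pending = false;
                }
                deadline = None;
            }
            Err(mpsc::RecvTimeoutError::Disconnected) => break,
        }
    }
}
```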
+59 -11
@@ -323,19 +323,12 @@ async fn main() -> Result<(), std::io::Error> {
     let (watcher_tx, _) = broadcast::channel::<io::watcher::WatcherEvent>(1024);
     let agents = Arc::new(AgentPool::new(port, watcher_tx.clone()));

-    // Start the background watchdog that detects and cleans up orphaned Running agents.
-    // When orphans are found, auto-assign is triggered to reassign free agents.
-    let watchdog_root: Option<PathBuf> = app_state.project_root.lock().unwrap().clone();
-    AgentPool::spawn_watchdog(Arc::clone(&agents), watchdog_root);

     // Filesystem watcher: watches config files (project.toml, agents.toml) for
-    // hot-reload and runs the CRDT-based done→archived sweep. Work-item pipeline
-    // events are driven by CRDT state transitions via crdt_state::subscribe().
+    // hot-reload. Work-item pipeline events are driven by CRDT state transitions
+    // via crdt_state::subscribe(). Sweep (done→archived) is handled by the unified
+    // background tick loop below.
     if let Some(ref root) = *app_state.project_root.lock().unwrap() {
-        let watcher_config = config::ProjectConfig::load(root)
-            .map(|c| c.watcher)
-            .unwrap_or_default();
-        io::watcher::start_watcher(root.clone(), watcher_tx.clone(), watcher_config);
+        io::watcher::start_watcher(root.clone(), watcher_tx.clone());
     }

     // Bridge CRDT state-transition events to the watcher broadcast channel.
@@ -655,6 +648,8 @@ async fn main() -> Result<(), std::io::Error> {
             .unwrap_or_else(|| std::path::PathBuf::from("/tmp/huskies-timers.json")),
     ));
+    let timer_store_for_tick = Arc::clone(&timer_store);

     let ctx = AppContext {
         state: app_state,
         store,
@@ -672,6 +667,59 @@ async fn main() -> Result<(), std::io::Error> {
     let app = build_routes(ctx, whatsapp_ctx.clone(), slack_ctx.clone(), port);

+    // Unified 1-second background tick loop: fires due timers, detects orphaned
+    // agents (watchdog), and promotes done→archived items (sweep). Replaces the
+    // three separate background loops that previously ran independently.
+    {
+        let tick_agents = Arc::clone(&startup_agents);
+        let tick_timer = timer_store_for_tick;
+        let tick_root = startup_root.clone();
+        let sweep_cfg = tick_root
+            .as_ref()
+            .and_then(|r| config::ProjectConfig::load(r).ok())
+            .map(|c| c.watcher)
+            .unwrap_or_default();
+        let sweep_every = sweep_cfg.sweep_interval_secs.max(1);
+        let done_retention = std::time::Duration::from_secs(sweep_cfg.done_retention_secs);
+        let pending_count = tick_timer.list().len();
+        crate::slog!("[tick] Unified tick loop started; {pending_count} pending timer(s)");
+        tokio::spawn(async move {
+            let mut interval = tokio::time::interval(std::time::Duration::from_secs(1));
+            let mut tick_count: u64 = 0;
+            loop {
+                interval.tick().await;
+                tick_count = tick_count.wrapping_add(1);
+
+                // Timer: fire due timers every second.
+                if let Some(ref root) = tick_root {
+                    let result =
+                        crate::chat::timer::tick_once(&tick_timer, &tick_agents, root).await;
+                    if let Err(msg) = result {
+                        crate::slog_error!("[tick] Timer tick panicked: {msg}");
+                    }
+                }
+
+                // Watchdog: detect orphaned Running agents every 30 ticks.
+                if tick_count.is_multiple_of(30) {
+                    let found = tick_agents.run_watchdog_pass();
+                    if found > 0 {
+                        crate::slog!(
+                            "[tick] {found} orphaned agent(s) detected; triggering auto-assign."
+                        );
+                        if let Some(ref root) = tick_root {
+                            tick_agents.auto_assign_available_work(root).await;
+                        }
+                    }
+                }
+
+                // Sweep: promote done→archived every sweep_interval_secs ticks.
+                if tick_count.is_multiple_of(sweep_every) {
+                    crate::io::watcher::sweep_done_to_archived(done_retention);
+                }
+            }
+        });
+    }
+
     // Optional Matrix bot: connect to the homeserver and start listening for
     // messages if `.huskies/bot.toml` is present and enabled.
     if let Some(ref root) = startup_root {
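The added block above is the heart of the commit: one 1 Hz `tokio::time::interval` drives jobs of different periods by testing divisibility of a tick counter, so the per-second timer check, the 30-tick watchdog, and the configurable sweep share a single task. A standalone skeleton of the pattern, with placeholder job functions, assuming tokio with its `macros`, `rt`, and `time` features:

```rust
use std::time::Duration;

// Placeholder jobs standing in for timer tick, watchdog pass, and sweep.
async fn fire_due_timers() {}
fn watchdog_pass() {}
fn sweep_done_to_archived() {}

#[tokio::main]
async fn main() {
    let sweep_every: u64 = 300; // e.g. sweep_interval_secs from config, min 1

    tokio::spawn(async move {
        // Note: the first interval.tick() completes immediately.
        let mut interval = tokio::time::interval(Duration::from_secs(1));
        let mut tick: u64 = 0;
        loop {
            interval.tick().await;
            tick = tick.wrapping_add(1);

            fire_due_timers().await; // every tick (1 s)

            if tick % 30 == 0 {
                watchdog_pass(); // every 30 ticks (30 s)
            }
            if tick % sweep_every == 0 {
                sweep_done_to_archived(); // every sweep_every ticks
            }
        }
    });

    // The loop runs for the life of the process; park main briefly here
    // just so this sketch exits cleanly.
    tokio::time::sleep(Duration::from_secs(2)).await;
}
```

The trade-off of multiplexing on one task is that a slow job stalls every other job's schedule, so each pass has to stay cheap or hand real work off to a separate task.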