wip(501): timer cancellation infrastructure (parallel session WIP + main.rs wiring)

Bundles in-progress work from a parallel Claude session toward fixing
bug 501 (rate-limit retry timer doesn't cancel on stop_agent / move_story
/ successful completion). This commit lands the foundation but the MCP
tool wiring is still TODO.

  - server/src/chat/timer.rs: defense-in-depth check in tick_once that
    skips firing a timer for stories that have reached 3_qa or a later
    stage (3_qa, 4_merge, 5_done, 6_archived). The primary cancellation
    path will be in the MCP tools; this guards against races where a
    timer was scheduled before the story was advanced and the tool
    didn't get a chance to cancel it.

  - server/src/http/context.rs: adds `timer_store: Arc<TimerStore>` field
    on AppContext so MCP tools (move_story, stop_agent, ...) can reach
    the shared timer store and cancel pending entries when the user
    intervenes manually. The test helper is updated to construct one.

  - server/src/main.rs: wires up a TimerStore instance in the AppContext
    initialiser so the binary actually compiles after the context.rs
    field addition. TODO: the matrix bot's spawn_bot still creates its
    own TimerStore instance (in chat/transport/matrix/bot/run.rs:220-227)
    rather than consuming the shared one — that refactor is the next
    step in the bug 501 fix.

What is NOT in this commit and is needed to actually fix bug 501:
  - The MCP tool side (move_story, stop_agent, delete_story) does not
    yet call timer_store.cancel(story_id) when invoked
  - The matrix bot's spawn_bot does not yet consume the shared
    timer_store from AppContext — it still creates its own

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Timmy
2026-04-09 21:28:48 +01:00
parent 1707277bb7
commit 13635b01bc
3 changed files with 50 additions and 0 deletions
+22
View File
@@ -200,6 +200,28 @@ async fn tick_once(
for entry in due {
crate::slog!("[timer] Timer fired for story {}", entry.story_id);
// Bug 501: Defense-in-depth check. If the story has already advanced
// past the active-work stages and is now in 3_qa, 4_merge, 5_done, or
// 6_archived, there is nothing to resume — the timer is stale and
// should no-op. The primary cancellation paths (move_story MCP →
// backlog, stop_agent) are responsible for removing the timer before
// it fires; this guard covers the case where cancellation was not yet
// called or the story raced forward through the pipeline while the
// timer was pending.
if let Some(item) = crate::crdt_state::read_item(&entry.story_id) {
match item.stage.as_str() {
"3_qa" | "4_merge" | "5_done" | "6_archived" => {
crate::slog!(
"[timer] Skipping timer for story {} — currently in '{}', \
not in backlog/current; timer is stale",
entry.story_id,
item.stage
);
continue;
}
_ => {}
}
}
// Move from backlog to current if needed — the auto-assign
// watcher will then start an agent automatically.
if let Err(e) =
+12
View File
@@ -1,4 +1,5 @@
use crate::agents::{AgentPool, ReconciliationEvent};
use crate::chat::timer::TimerStore;
use crate::io::watcher::WatcherEvent;
use crate::rebuild::{BotShutdownNotifier, ShutdownReason};
use crate::state::SessionState;
@@ -67,6 +68,13 @@ pub struct AppContext {
/// `None` when no Matrix bot is configured.
pub matrix_shutdown_tx:
Option<Arc<tokio::sync::watch::Sender<Option<ShutdownReason>>>>,
/// Shared rate-limit retry timer store.
///
/// Exposed so MCP tools (`move_story`, `stop_agent`) can cancel pending
/// timers when the user manually intervenes (bug 501). Intended to be
/// shared with the tick loop spawned by the bot so that cancellations take
/// effect in-memory rather than only on disk; until the bot is refactored
/// to consume this instance, it still constructs its own `TimerStore`.
pub timer_store: Arc<TimerStore>,
}
#[cfg(test)]
@@ -78,6 +86,9 @@ impl AppContext {
let (watcher_tx, _) = broadcast::channel(64);
let (reconciliation_tx, _) = broadcast::channel(64);
let (perm_tx, perm_rx) = mpsc::unbounded_channel();
let timer_store = Arc::new(TimerStore::load(
project_root.join(".huskies").join("timers.json"),
));
Self {
state: Arc::new(state),
store: Arc::new(JsonFileStore::new(store_path).unwrap()),
@@ -90,6 +101,7 @@ impl AppContext {
qa_app_process: Arc::new(std::sync::Mutex::new(None)),
bot_shutdown: None,
matrix_shutdown_tx: None,
timer_store,
}
}
}
+16
View File
@@ -631,6 +631,21 @@ async fn main() -> Result<(), std::io::Error> {
let matrix_shutdown_tx = Arc::new(matrix_shutdown_tx);
let matrix_shutdown_tx_for_rebuild = Arc::clone(&matrix_shutdown_tx);
// Bug 501: shared rate-limit retry timer store, accessible from MCP tools
// via AppContext so manual interventions (move_story → backlog, stop_agent)
// can cancel pending timers in-memory rather than only on disk.
//
// TODO(bug 501): the matrix bot currently spawns its own TimerStore instance
// in `chat::transport::matrix::bot::run::spawn_bot`. Refactor to consume this
// shared instance via `AppContext.timer_store` so cancellations from MCP
// tools and the bot's tick loop see the same in-memory state.
let timer_store = std::sync::Arc::new(crate::chat::timer::TimerStore::load(
startup_root
.as_ref()
.map(|r| r.join(".huskies").join("timers.json"))
.unwrap_or_else(|| std::path::PathBuf::from("/tmp/huskies-timers.json")),
));
let ctx = AppContext {
state: app_state,
store,
@@ -643,6 +658,7 @@ async fn main() -> Result<(), std::io::Error> {
qa_app_process: Arc::new(std::sync::Mutex::new(None)),
bot_shutdown: bot_shutdown_notifier.clone(),
matrix_shutdown_tx: Some(Arc::clone(&matrix_shutdown_tx)),
timer_store,
};
let app = build_routes(ctx, whatsapp_ctx.clone(), slack_ctx.clone(), port);