wip(501): timer cancellation infrastructure (parallel session WIP + main.rs wiring)
Bundles in-progress work from a parallel Claude session toward fixing
bug 501 (rate-limit retry timer doesn't cancel on stop_agent / move_story
/ successful completion). This commit lands the foundation, but the
MCP tool wiring is still TODO.
- server/src/chat/timer.rs: defense-in-depth check in tick_once that
skips firing a timer for stories already at or past 3_qa (3_qa,
4_merge, 5_done, 6_archived). The primary cancellation path will be
in the MCP tools; this guards against races where a timer was
scheduled before the story advanced and the tool never got a chance
to cancel it.
- server/src/http/context.rs: adds a `timer_store: Arc<TimerStore>`
field to AppContext so MCP tools (move_story, stop_agent, ...) can
reach the shared timer store and cancel pending entries when the user
intervenes manually. The test helper is updated to construct one.
- server/src/main.rs: wires up a TimerStore instance in the AppContext
initialiser so the binary actually compiles after the context.rs
field addition. TODO: the matrix bot's spawn_bot still creates its
own TimerStore instance (in chat/transport/matrix/bot/run.rs:220-227)
rather than consuming the shared one; that refactor is the next step
in the bug 501 fix (a rough sketch follows after this list).
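A rough, hypothetical sketch of that refactor (not in this commit;
spawn_bot's real parameter list isn't shown here, and the AppContext /
TimerStore types below are stand-in stubs, so only the ownership
pattern is meant to be accurate):

    use std::sync::Arc;

    struct TimerStore;                // stand-in for chat::timer::TimerStore
    struct AppContext {
        timer_store: Arc<TimerStore>, // the field added by this commit
    }

    // After the refactor the bot receives the shared store instead of
    // calling TimerStore::load(..) itself, so MCP-tool cancellations and
    // the bot's tick loop operate on the same in-memory instance.
    async fn spawn_bot(timer_store: Arc<TimerStore>) {
        let _shared = timer_store;    // tick loop would consume this
    }

    async fn wire_up(ctx: &AppContext) {
        spawn_bot(Arc::clone(&ctx.timer_store)).await;
    }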
What is NOT in this commit and is needed to actually fix bug 501:
- The MCP tool side (move_story, stop_agent, delete_story) does not
yet call timer_store.cancel(story_id) when invoked (a sketch of that
wiring follows below)
- The matrix bot's spawn_bot does not yet consume the shared
timer_store from AppContext — it still creates its own
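A hypothetical sketch of the missing MCP-tool wiring (the cancel call
is taken from the description above; the handler name, parameters, and
stub types are illustrative, not the real ones):

    use std::sync::Arc;

    struct TimerStore;
    impl TimerStore {
        // stand-in; assumed to drop any pending retry timer for the story
        fn cancel(&self, _story_id: &str) {}
    }

    struct AppContext {
        timer_store: Arc<TimerStore>,
    }

    // e.g. inside the move_story / stop_agent / delete_story handlers:
    fn handle_move_story(ctx: &AppContext, story_id: &str) {
        // A manual intervention makes any pending rate-limit retry stale,
        // so cancel it before applying the stage change.
        ctx.timer_store.cancel(story_id);
        // ...then perform the actual move as the tool does today...
    }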
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
--- a/server/src/chat/timer.rs
+++ b/server/src/chat/timer.rs
@@ -200,6 +200,28 @@ async fn tick_once(
     for entry in due {
         crate::slog!("[timer] Timer fired for story {}", entry.story_id);
 
+        // Bug 501: Defense-in-depth check. If the story has already advanced
+        // past the active-work stages (3_qa, 4_merge, 5_done, 6_archived),
+        // there is nothing to resume — the timer is stale and should no-op.
+        // The primary cancellation paths (move_story MCP → backlog, stop_agent)
+        // remove the timer before it fires; this guard covers the case where
+        // cancellation was not yet called or the story raced forward through
+        // the pipeline while the timer was pending.
+        if let Some(item) = crate::crdt_state::read_item(&entry.story_id) {
+            match item.stage.as_str() {
+                "3_qa" | "4_merge" | "5_done" | "6_archived" => {
+                    crate::slog!(
+                        "[timer] Skipping timer for story {} — currently in '{}', \
+                         not in backlog/current; timer is stale",
+                        entry.story_id,
+                        item.stage
+                    );
+                    continue;
+                }
+                _ => {}
+            }
+        }
+
         // Move from backlog to current if needed — the auto-assign
         // watcher will then start an agent automatically.
         if let Err(e) =
--- a/server/src/http/context.rs
+++ b/server/src/http/context.rs
@@ -1,4 +1,5 @@
 use crate::agents::{AgentPool, ReconciliationEvent};
+use crate::chat::timer::TimerStore;
 use crate::io::watcher::WatcherEvent;
 use crate::rebuild::{BotShutdownNotifier, ShutdownReason};
 use crate::state::SessionState;
@@ -67,6 +68,13 @@ pub struct AppContext {
     /// `None` when no Matrix bot is configured.
     pub matrix_shutdown_tx:
         Option<Arc<tokio::sync::watch::Sender<Option<ShutdownReason>>>>,
+    /// Shared rate-limit retry timer store.
+    ///
+    /// Used by MCP tools (`move_story`, `stop_agent`) to cancel pending timers
+    /// when the user manually intervenes (bug 501). Shared with the tick loop
+    /// spawned by the bot so that cancellations take effect in-memory rather
+    /// than only on disk.
+    pub timer_store: Arc<TimerStore>,
 }
 
 #[cfg(test)]
@@ -78,6 +86,9 @@ impl AppContext {
         let (watcher_tx, _) = broadcast::channel(64);
         let (reconciliation_tx, _) = broadcast::channel(64);
         let (perm_tx, perm_rx) = mpsc::unbounded_channel();
+        let timer_store = Arc::new(TimerStore::load(
+            project_root.join(".huskies").join("timers.json"),
+        ));
         Self {
             state: Arc::new(state),
             store: Arc::new(JsonFileStore::new(store_path).unwrap()),
@@ -90,6 +101,7 @@ impl AppContext {
             qa_app_process: Arc::new(std::sync::Mutex::new(None)),
             bot_shutdown: None,
             matrix_shutdown_tx: None,
+            timer_store,
         }
     }
 }
--- a/server/src/main.rs
+++ b/server/src/main.rs
@@ -631,6 +631,21 @@ async fn main() -> Result<(), std::io::Error> {
     let matrix_shutdown_tx = Arc::new(matrix_shutdown_tx);
     let matrix_shutdown_tx_for_rebuild = Arc::clone(&matrix_shutdown_tx);
 
+    // Bug 501: shared rate-limit retry timer store, accessible from MCP tools
+    // via AppContext so manual interventions (move_story → backlog, stop_agent)
+    // can cancel pending timers in-memory rather than only on disk.
+    //
+    // TODO(bug 501): the matrix bot currently spawns its own TimerStore instance
+    // in `chat::transport::matrix::bot::run::spawn_bot`. Refactor to consume this
+    // shared instance via `AppContext.timer_store` so cancellations from MCP
+    // tools and the bot's tick loop see the same in-memory state.
+    let timer_store = std::sync::Arc::new(crate::chat::timer::TimerStore::load(
+        startup_root
+            .as_ref()
+            .map(|r| r.join(".huskies").join("timers.json"))
+            .unwrap_or_else(|| std::path::PathBuf::from("/tmp/huskies-timers.json")),
+    ));
+
     let ctx = AppContext {
         state: app_state,
         store,
@@ -643,6 +658,7 @@ async fn main() -> Result<(), std::io::Error> {
         qa_app_process: Arc::new(std::sync::Mutex::new(None)),
         bot_shutdown: bot_shutdown_notifier.clone(),
         matrix_shutdown_tx: Some(Arc::clone(&matrix_shutdown_tx)),
+        timer_store,
     };
 
     let app = build_routes(ctx, whatsapp_ctx.clone(), slack_ctx.clone(), port);