huskies: merge 967

This commit is contained in:
dave
2026-05-13 12:34:35 +00:00
parent 40ea100eae
commit 93f774fcbb
9 changed files with 165 additions and 0 deletions
+84
View File
@@ -59,6 +59,7 @@ mod tests {
child_killers,
watcher_tx,
None,
None,
)
.await;
@@ -113,6 +114,7 @@ mod tests {
child_killers,
watcher_tx,
None,
None,
)
.await;
@@ -175,6 +177,7 @@ mod tests {
child_killers,
watcher_tx,
None,
None,
)
.await;
let after = chrono::Utc::now();
@@ -242,6 +245,7 @@ mod tests {
child_killers,
watcher_tx,
None,
None,
)
.await;
@@ -373,4 +377,84 @@ mod tests {
assert!(rx.try_recv().is_err());
}
// ── bug 967: eager session recording survives watchdog kill + task abort ──
/// AC2 regression: simulates a watchdog kill of an agent that emitted a
/// session_id mid-run. The script emits a `"system"` JSON event and then
/// sleeps; a concurrent task kills the child after 500 ms (simulating the
/// watchdog). The eager-recording path in `run_agent_pty_blocking` must
/// have already persisted the session_id before the kill, so
/// `lookup_session` returns it (warm) rather than `None` (cold).
///
/// Unix-only: relies on `PermissionsExt` to mark the shell script
/// executable and on `sh` being available on the test host.
#[tokio::test]
async fn watchdog_kill_session_id_survives_abort() {
    use std::os::unix::fs::PermissionsExt;
    let tmp = tempfile::tempdir().unwrap();
    let project_root = tmp.path().to_path_buf();
    // The session store lives under `.huskies/` inside the project root;
    // create it up front so `record_session` has somewhere to write.
    std::fs::create_dir_all(project_root.join(".huskies")).unwrap();
    // Script emits a system event immediately, then sleeps so the process
    // stays alive long enough for us to kill it (simulating the watchdog).
    let script = tmp.path().join("emit_then_sleep.sh");
    std::fs::write(
        &script,
        "#!/bin/sh\nprintf '%s\\n' '{\"type\":\"system\",\"session_id\":\"sess-967-watchdog\"}'\nsleep 60\n",
    )
    .unwrap();
    // 0o755 so the spawned `sh` child can exec the script.
    std::fs::set_permissions(&script, std::fs::Permissions::from_mode(0o755)).unwrap();
    // Keep `_rx`/`_watcher_rx` bound (not discarded with `_`) so the
    // broadcast channels retain at least one live receiver for the run.
    let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
    let (watcher_tx, _watcher_rx) = broadcast::channel::<WatcherEvent>(16);
    let event_log = Arc::new(Mutex::new(Vec::new()));
    // Shared registry of per-child killers; `run_agent_pty_streaming` is
    // expected to insert the spawned child's killer here so the watchdog
    // task below can find and kill it.
    let child_killers: Arc<
        Mutex<HashMap<String, Box<dyn portable_pty::ChildKiller + Send + Sync>>>,
    > = Arc::new(Mutex::new(HashMap::new()));
    let child_killers_for_kill = Arc::clone(&child_killers);
    // Spawn a task to kill the child after a short delay (simulating watchdog).
    // NOTE(review): 500 ms assumes the child has been spawned and registered
    // in `child_killers` by then; if not, this pass of the loop kills nothing
    // and the test would wait out the script's 60 s sleep — confirm the
    // spawn path registers the killer before emitting output.
    tokio::spawn(async move {
        tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
        // std Mutex: `lock()` returns Err only if poisoned; best-effort skip.
        if let Ok(mut killers) = child_killers_for_kill.lock() {
            for (_, killer) in killers.iter_mut() {
                let _ = killer.kill();
            }
        }
    });
    // Run the PTY directly — it returns once the child is killed.
    let script_arg = script.to_string_lossy().to_string();
    let _ = run_agent_pty_streaming(
        "967_story_watchdog",
        "coder-1",
        "sh",
        &[script_arg],
        "--",
        "/tmp",
        &tx,
        &event_log,
        None,
        0, // no inactivity timeout
        child_killers,
        watcher_tx,
        None, // no session to resume
        // `eager_record`: (project_root, model) — enables the in-thread
        // record_session() call this test exists to exercise.
        Some((project_root.clone(), "sonnet".to_string())),
    )
    .await;
    // The session_id must be in the store (eagerly recorded when the
    // "system" event was seen, before the kill).
    let recorded = crate::agents::session_store::lookup_session(
        &project_root,
        "967_story_watchdog",
        "coder-1",
        "sonnet",
    );
    assert_eq!(
        recorded,
        Some("sess-967-watchdog".to_string()),
        "session_id must be recorded eagerly before the watchdog kill so \
         the respawn's lookup_session returns it (warm), not None (cold)"
    );
}
}
+22
View File
@@ -33,6 +33,16 @@ use super::types::{ChildKillerGuard, PtyResult, composite_key};
/// If the agent committed valid work before crashing, the "work survived" check
/// in `pipeline::advance` detects the committed code and advances the story to
/// QA instead of entering the retry/block path.
///
/// ## `eager_record` — watchdog-kill race fix (bug 967)
///
/// When `Some((project_root, model))` is passed, the blocking thread calls
/// `session_store::record_session()` immediately when the `"system"` JSON event
/// is parsed. This runs inside the OS blocking thread, which cannot be
/// cancelled by a tokio task abort. If the watchdog later kills the PTY child
/// and aborts the spawned tokio task, the session_id is already persisted and
/// the respawn's `lookup_session()` returns it (warm start instead of cold).
/// Pass `None` when session persistence is not needed (e.g. in tests).
#[allow(clippy::too_many_arguments)]
pub(in crate::agents) async fn run_agent_pty_streaming(
story_id: &str,
@@ -48,6 +58,7 @@ pub(in crate::agents) async fn run_agent_pty_streaming(
child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
watcher_tx: broadcast::Sender<WatcherEvent>,
session_id_to_resume: Option<&str>,
eager_record: Option<(std::path::PathBuf, String)>,
) -> Result<PtyResult, String> {
let sid = story_id.to_string();
let aname = agent_name.to_string();
@@ -74,6 +85,7 @@ pub(in crate::agents) async fn run_agent_pty_streaming(
&child_killers,
&watcher_tx,
resume_sid.as_deref(),
eager_record,
)
})
.await
@@ -95,6 +107,7 @@ fn run_agent_pty_blocking(
child_killers: &Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
watcher_tx: &broadcast::Sender<WatcherEvent>,
session_id_to_resume: Option<&str>,
eager_record: Option<(std::path::PathBuf, String)>,
) -> Result<PtyResult, String> {
let pty_system = native_pty_system();
@@ -319,6 +332,15 @@ fn run_agent_pty_blocking(
.get("session_id")
.and_then(|s| s.as_str())
.map(|s| s.to_string());
// Eagerly persist the session_id so it survives a watchdog kill
// that aborts the tokio task before run_agent_spawn's
// record_session() call (bug 967). Runs in the OS blocking
// thread — not cancellable by tokio task abort.
if let (Some(sid), Some((root, model))) = (&session_id, &eager_record) {
crate::agents::session_store::record_session(
root, story_id, agent_name, model, sid,
);
}
}
// With --include-partial-messages, thinking and text arrive
// incrementally via stream_event → content_block_delta. Handle