fix(903): run_tests attaches to in-flight job instead of kill+respawn
Bug 903: every `run_tests` MCP call killed the prior `cargo test` child for the same worktree and spawned a fresh one. Combined with the ~60s MCP client-side timeout and the 896 agent prompt that told agents to "call run_tests again — it attaches to the in-flight test job", this produced a respawn loop: agent calls, MCP times out at 60s, agent retries, run_tests kills the running build and starts a new one. The test suite never reaches the finish line. Server log evidence: "Started test job for <worktree> (pid N)" with a new PID every ~60-90s for the same worktree. Fix: when `run_tests` is called and a job is already in flight for that worktree, ATTACH to it instead of killing+respawning. The original job's poll loop already writes the final status to the CRDT `test_jobs` collection; attached callers just poll that CRDT entry (the same pattern `get_test_result` uses) and return the result when the in-flight job transitions out of "running". The 896 prompt's claim is now actually true. Worktrees remain isolated from each other and may run `cargo test` concurrently — there is no cross-worktree serialisation. The single invariant is "at most one test job per worktree at a time". New test: `tool_run_tests_concurrent_calls_attach_to_single_job` spawns two concurrent calls on the same worktree against a 2s `sleep`-based script and asserts total elapsed stays close to 2s (attach) rather than 4s (respawn). Note: the cross-worktree linker-OOM symptom Timmy reported in the field was downstream of the respawn loop. Killed-but-not-fully-reaped cargo invocations stack memory pressure beyond the nominal N worktrees. With the attach fix, each worktree runs exactly one in-flight build at a time and old builds finish cleanly. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -70,15 +70,31 @@ pub(crate) async fn tool_run_tests(args: &Value, ctx: &AppContext) -> Result<Str
|
||||
|
||||
let sid = story_key(&working_dir);
|
||||
|
||||
// Kill any existing in-flight job for this worktree before starting a new one.
|
||||
{
|
||||
let mut jobs = active_jobs().lock().map_err(|e| e.to_string())?;
|
||||
if let Some(mut old_job) = jobs.remove(&working_dir) {
|
||||
let _ = old_job.child.kill();
|
||||
let _ = old_job.child.wait();
|
||||
}
|
||||
// If a test job is already in flight for this worktree, ATTACH to it
|
||||
// rather than kill+respawn. This makes the agent system_prompt advice
|
||||
// ("if run_tests appears to time out, call run_tests again — it
|
||||
// attaches to the in-flight test job") actually true, and eliminates
|
||||
// the respawn-loop bug where MCP client-side timeouts (~60s) cause
|
||||
// agents to retry, killing the still-running cargo build each time
|
||||
// and never making progress. The original job's poll loop below
|
||||
// updates the CRDT on completion; attached callers just poll the CRDT.
|
||||
let already_running = {
|
||||
let jobs = active_jobs().lock().map_err(|e| e.to_string())?;
|
||||
jobs.contains_key(&working_dir)
|
||||
};
|
||||
if already_running {
|
||||
crate::slog!(
|
||||
"[run_tests] Attaching to in-flight test job for {}",
|
||||
working_dir.display()
|
||||
);
|
||||
return attach_to_in_flight_test_job(&sid).await;
|
||||
}
|
||||
|
||||
// Worktrees are isolated and may run cargo tests concurrently — no
|
||||
// cross-worktree serialisation. The only invariant enforced here is
|
||||
// "at most one test job per worktree at a time", which the
|
||||
// already_running check above gives us.
|
||||
|
||||
// Spawn the test process with piped stdout/stderr so we can capture output.
|
||||
// Pipes are drained in background threads to prevent deadlock when the
|
||||
// child fills the 64KB OS pipe buffer.
|
||||
@@ -233,6 +249,39 @@ pub(crate) async fn tool_run_tests(args: &Value, ctx: &AppContext) -> Result<Str
|
||||
}
|
||||
}
|
||||
|
||||
/// Poll the CRDT `test_jobs` collection for `sid` until the entry transitions
|
||||
/// out of "running" or we hit [`TEST_TIMEOUT_SECS`].
|
||||
///
|
||||
/// Used by `run_tests` to attach to a job that another caller already spawned
|
||||
/// for the same worktree, so concurrent callers all observe the same single
|
||||
/// `cargo test` run rather than racing to kill+respawn.
|
||||
async fn attach_to_in_flight_test_job(sid: &str) -> Result<String, String> {
|
||||
let start = std::time::Instant::now();
|
||||
loop {
|
||||
match crate::crdt_state::read_test_job(sid) {
|
||||
None => {
|
||||
// The job entry disappeared from the CRDT — most likely the
|
||||
// spawning task lost the race between insert/read. Treat as a
|
||||
// transient error so the agent can retry.
|
||||
return Err("In-flight test job vanished from CRDT before completing".to_string());
|
||||
}
|
||||
Some(view) if view.status == "pass" || view.status == "fail" => {
|
||||
return format_crdt_result(&view);
|
||||
}
|
||||
Some(_) => {
|
||||
// Still "running" — wait and re-poll.
|
||||
}
|
||||
}
|
||||
|
||||
if start.elapsed().as_secs() > TEST_TIMEOUT_SECS {
|
||||
return Err(format!(
|
||||
"Attached test job for '{sid}' did not complete within {TEST_TIMEOUT_SECS}s"
|
||||
));
|
||||
}
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
|
||||
// ── get_test_result ──────────────────────────────────────────────────────────
|
||||
|
||||
/// How long `get_test_result` blocks server-side before returning "running".
|
||||
@@ -536,6 +585,60 @@ mod tests {
|
||||
assert_eq!(parsed["exit_code"], 1);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn tool_run_tests_concurrent_calls_attach_to_single_job() {
|
||||
// Bug 903 regression: a second `run_tests` call for the same worktree
|
||||
// while the first is still in flight must ATTACH to that job (i.e.
|
||||
// observe its result) rather than kill+respawn. Pre-fix, every call
|
||||
// killed the prior `cargo test` child and spawned a fresh one,
|
||||
// creating a respawn loop driven by the agent's MCP-timeout retries.
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let script_dir = tmp.path().join("script");
|
||||
std::fs::create_dir_all(&script_dir).unwrap();
|
||||
let script_path = script_dir.join("test");
|
||||
// Slow enough that the second call definitely overlaps the first.
|
||||
std::fs::write(
|
||||
&script_path,
|
||||
"#!/usr/bin/env bash\nsleep 2\necho 'test result: ok. 0 passed'\nexit 0\n",
|
||||
)
|
||||
.unwrap();
|
||||
#[cfg(unix)]
|
||||
{
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
std::fs::set_permissions(&script_path, std::fs::Permissions::from_mode(0o755)).unwrap();
|
||||
}
|
||||
|
||||
let ctx = test_ctx(tmp.path());
|
||||
crate::crdt_state::init_for_test();
|
||||
crate::db::ensure_content_store();
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
let ctx_a = ctx.clone();
|
||||
let ctx_b = ctx.clone();
|
||||
let t1 = tokio::spawn(async move { tool_run_tests(&json!({}), &ctx_a).await });
|
||||
// Give T1 time to spawn its child + insert into active_jobs before T2
|
||||
// arrives, so T2 deterministically takes the attach path.
|
||||
tokio::time::sleep(std::time::Duration::from_millis(200)).await;
|
||||
let t2 = tokio::spawn(async move { tool_run_tests(&json!({}), &ctx_b).await });
|
||||
|
||||
let r1 = t1.await.unwrap().unwrap();
|
||||
let r2 = t2.await.unwrap().unwrap();
|
||||
let elapsed = start.elapsed().as_secs_f64();
|
||||
|
||||
let p1: serde_json::Value = serde_json::from_str(&r1).unwrap();
|
||||
let p2: serde_json::Value = serde_json::from_str(&r2).unwrap();
|
||||
assert_eq!(p1["passed"], true, "first call must succeed: {p1}");
|
||||
assert_eq!(p2["passed"], true, "second call must succeed: {p2}");
|
||||
|
||||
// If the second call had killed + respawned, total elapsed would be
|
||||
// ~4s (two 2s runs serially). With attach, both calls observe the
|
||||
// SAME single 2s run, so elapsed stays close to 2s.
|
||||
assert!(
|
||||
elapsed < 3.5,
|
||||
"concurrent calls must share one job (~2s), not respawn (~4s); elapsed={elapsed:.2}s"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn tool_run_tests_worktree_path_must_be_inside_worktrees() {
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
|
||||
Reference in New Issue
Block a user