From d64f1e94ff79de3ab6ed190b74afcc5d1c617c47 Mon Sep 17 00:00:00 2001
From: Timmy
Date: Tue, 12 May 2026 14:22:35 +0100
Subject: [PATCH] fix(903): run_tests attaches to in-flight job instead of
 kill+respawn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bug 903: every `run_tests` MCP call killed the prior `cargo test` child
for the same worktree and spawned a fresh one. Combined with the ~60s MCP
client-side timeout and the 896 agent prompt that told agents to "call
run_tests again — it attaches to the in-flight test job", this produced a
respawn loop: agent calls, MCP times out at 60s, agent retries, run_tests
kills the running build and starts a new one. The test suite never reaches
the finish line. Server log evidence: "Started test job for <worktree>
(pid N)" with a new PID every ~60-90s for the same worktree.

Fix: when `run_tests` is called and a job is already in flight for that
worktree, ATTACH to it instead of killing+respawning. The original job's
poll loop already writes the final status to the CRDT `test_jobs`
collection; attached callers just poll that CRDT entry (the same pattern
`get_test_result` uses) and return the result when the in-flight job
transitions out of "running". The 896 prompt's claim is now actually true.

Worktrees remain isolated from each other and may run `cargo test`
concurrently — there is no cross-worktree serialisation. The single
invariant is "at most one test job per worktree at a time".

New test: `tool_run_tests_concurrent_calls_attach_to_single_job` spawns
two concurrent calls on the same worktree against a 2s `sleep`-based
script and asserts total elapsed stays close to 2s (attach) rather than
4s (respawn).

Note: the cross-worktree linker-OOM symptom Timmy reported in the field
was downstream of the respawn loop. Killed-but-not-fully-reaped cargo
invocations pile up memory pressure beyond the nominal N worktrees.
With the attach fix, each worktree runs exactly one in-flight build at a time and old builds finish cleanly. Co-Authored-By: Claude Opus 4.7 (1M context) --- server/src/http/mcp/shell_tools/script.rs | 117 ++++++++++++++++++++-- 1 file changed, 110 insertions(+), 7 deletions(-) diff --git a/server/src/http/mcp/shell_tools/script.rs b/server/src/http/mcp/shell_tools/script.rs index 0c4d589c..43091286 100644 --- a/server/src/http/mcp/shell_tools/script.rs +++ b/server/src/http/mcp/shell_tools/script.rs @@ -70,15 +70,31 @@ pub(crate) async fn tool_run_tests(args: &Value, ctx: &AppContext) -> Result Result Result { + let start = std::time::Instant::now(); + loop { + match crate::crdt_state::read_test_job(sid) { + None => { + // The job entry disappeared from the CRDT — most likely the + // spawning task lost the race between insert/read. Treat as a + // transient error so the agent can retry. + return Err("In-flight test job vanished from CRDT before completing".to_string()); + } + Some(view) if view.status == "pass" || view.status == "fail" => { + return format_crdt_result(&view); + } + Some(_) => { + // Still "running" — wait and re-poll. + } + } + + if start.elapsed().as_secs() > TEST_TIMEOUT_SECS { + return Err(format!( + "Attached test job for '{sid}' did not complete within {TEST_TIMEOUT_SECS}s" + )); + } + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + } +} + // ── get_test_result ────────────────────────────────────────────────────────── /// How long `get_test_result` blocks server-side before returning "running". @@ -536,6 +585,60 @@ mod tests { assert_eq!(parsed["exit_code"], 1); } + #[tokio::test] + async fn tool_run_tests_concurrent_calls_attach_to_single_job() { + // Bug 903 regression: a second `run_tests` call for the same worktree + // while the first is still in flight must ATTACH to that job (i.e. + // observe its result) rather than kill+respawn. 
Pre-fix, every call + // killed the prior `cargo test` child and spawned a fresh one, + // creating a respawn loop driven by the agent's MCP-timeout retries. + let tmp = tempfile::tempdir().unwrap(); + let script_dir = tmp.path().join("script"); + std::fs::create_dir_all(&script_dir).unwrap(); + let script_path = script_dir.join("test"); + // Slow enough that the second call definitely overlaps the first. + std::fs::write( + &script_path, + "#!/usr/bin/env bash\nsleep 2\necho 'test result: ok. 0 passed'\nexit 0\n", + ) + .unwrap(); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions(&script_path, std::fs::Permissions::from_mode(0o755)).unwrap(); + } + + let ctx = test_ctx(tmp.path()); + crate::crdt_state::init_for_test(); + crate::db::ensure_content_store(); + + let start = std::time::Instant::now(); + let ctx_a = ctx.clone(); + let ctx_b = ctx.clone(); + let t1 = tokio::spawn(async move { tool_run_tests(&json!({}), &ctx_a).await }); + // Give T1 time to spawn its child + insert into active_jobs before T2 + // arrives, so T2 deterministically takes the attach path. + tokio::time::sleep(std::time::Duration::from_millis(200)).await; + let t2 = tokio::spawn(async move { tool_run_tests(&json!({}), &ctx_b).await }); + + let r1 = t1.await.unwrap().unwrap(); + let r2 = t2.await.unwrap().unwrap(); + let elapsed = start.elapsed().as_secs_f64(); + + let p1: serde_json::Value = serde_json::from_str(&r1).unwrap(); + let p2: serde_json::Value = serde_json::from_str(&r2).unwrap(); + assert_eq!(p1["passed"], true, "first call must succeed: {p1}"); + assert_eq!(p2["passed"], true, "second call must succeed: {p2}"); + + // If the second call had killed + respawned, total elapsed would be + // ~4s (two 2s runs serially). With attach, both calls observe the + // SAME single 2s run, so elapsed stays close to 2s. 
+ assert!( + elapsed < 3.5, + "concurrent calls must share one job (~2s), not respawn (~4s); elapsed={elapsed:.2}s" + ); + } + #[tokio::test] async fn tool_run_tests_worktree_path_must_be_inside_worktrees() { let tmp = tempfile::tempdir().unwrap();