diff --git a/server/src/agents/gates.rs b/server/src/agents/gates.rs index 4c124f1d..3f11d76d 100644 --- a/server/src/agents/gates.rs +++ b/server/src/agents/gates.rs @@ -47,8 +47,9 @@ pub(crate) fn worktree_has_committed_work(wt_path: &Path) -> bool { /// is evaluated, then restores them afterward. Uncommitted work in worktrees is /// never junk — it may be the next agent session's starting point (bug 651). /// -/// Used as part of the "work survived" check when an agent crashes mid-output -/// (bug 645). +/// No longer called from main pipeline code (bug 668 replaced cargo-check with +/// run_tests evidence), but retained for the bug-651 stash/restore regression test. +#[cfg(test)] pub(crate) fn cargo_check_in_worktree(wt_path: &Path) -> bool { // Stash uncommitted changes (including untracked files) so cargo check // evaluates only committed code. We restore them afterward. diff --git a/server/src/agents/pool/pipeline/advance/mod.rs b/server/src/agents/pool/pipeline/advance/mod.rs index 32f510e3..d8e50a4d 100644 --- a/server/src/agents/pool/pipeline/advance/mod.rs +++ b/server/src/agents/pool/pipeline/advance/mod.rs @@ -116,20 +116,28 @@ impl AgentPool { } } } else { - // Bug 645: Before retry/block, check if the agent left committed - // work that compiles. An agent may crash mid-output (e.g. Claude - // Code CLI PTY write assertion) after having already committed valid - // code. When committed work survives and `cargo check` passes, - // advance to QA instead of wasting retries. - let work_survived = worktree_path.as_ref().is_some_and(|wt_path| { - crate::agents::gates::worktree_has_committed_work(wt_path) - && crate::agents::gates::cargo_check_in_worktree(wt_path) - }); + // Bug 645 / 668: Before retry/block, check if the agent left committed + // work AND the agent had a passing run_tests result captured during its + // session. An agent may crash mid-output (e.g. Claude Code CLI PTY write + // assertion) after having already committed valid code and run tests. + // We require positive test evidence (not just cargo check) so that only + // stories with genuinely passing test suites are salvaged. + // + // The `run_tests` MCP tool writes `{story_id}:run_tests_ok` to the DB + // whenever script/test exits 0 inside a story worktree. Consume the + // evidence here so it does not persist to the next agent session. + let has_test_evidence = + crate::db::read_content(&format!("{story_id}:run_tests_ok")).is_some(); + crate::db::delete_content(&format!("{story_id}:run_tests_ok")); + let work_survived = has_test_evidence + && worktree_path.as_ref().is_some_and(|wt_path| { + crate::agents::gates::worktree_has_committed_work(wt_path) + }); if work_survived { slog!( "[pipeline] Coder '{agent_name}' failed gates for '{story_id}' but \ - committed work survives and compiles. Advancing to QA instead of \ - retrying (bug 645)." + committed work survives with captured passing tests. Advancing to QA \ + instead of retrying (bug 645)." ); let qa_mode = { let item_type = crate::agents::lifecycle::item_type_from_id(story_id); @@ -1111,6 +1119,10 @@ stage = "qa" "---\nname: Survived Test\n---\n", ); + // Simulate a passing run_tests call during the agent's session (bug 668): + // the agent ran script/test, it passed, and the server captured the evidence. + crate::db::write_content("9945_story_survived:run_tests_ok", "1"); + let pool = AgentPool::new_test(3001); // Simulate coder failing gates (e.g. agent crashed, dirty worktree). @@ -1241,4 +1253,257 @@ stage = "qa" "Story with no committed work should be blocked after exceeding retry limit" ); } + + // ── bug 668: pipeline must NOT advance when gates_passed=false and no test evidence ── + + /// Path (a): gates_passed=false with committed work but NO captured run_tests + /// evidence → story stays in coding (retries), does NOT advance to QA/merge. + #[tokio::test] + async fn gates_failed_no_test_evidence_does_not_advance() { + use std::fs; + use std::process::Command; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Init a git repo with committed work on a feature branch. + Command::new("git") + .args(["init"]) + .current_dir(root) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.email", "test@test.com"]) + .current_dir(root) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.name", "Test"]) + .current_dir(root) + .output() + .unwrap(); + fs::write( + root.join("Cargo.toml"), + "[package]\nname=\"t\"\nversion=\"0.1.0\"\nedition=\"2021\"\n", + ) + .unwrap(); + fs::create_dir_all(root.join("src")).unwrap(); + fs::write(root.join("src/lib.rs"), "// empty\n").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(root) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "init"]) + .current_dir(root) + .output() + .unwrap(); + + // Create a worktree with committed work on feature branch. + let wt_path = tmp.path().join("wt"); + Command::new("git") + .args([ + "worktree", + "add", + &wt_path.to_string_lossy(), + "-b", + "feature/story-9947_story_no_evidence", + ]) + .current_dir(root) + .output() + .unwrap(); + + fs::write(wt_path.join("src/lib.rs"), "pub fn added() {}\n").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(&wt_path) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "add fn"]) + .current_dir(&wt_path) + .output() + .unwrap(); + + // Set up the story with max_retries=1 so we can observe the retry/block. + crate::db::ensure_content_store(); + crate::db::write_content( + "9947_story_no_evidence", + "---\nname: No Evidence Test\n---\n", + ); + crate::db::write_item_with_content( + "9947_story_no_evidence", + "2_current", + "---\nname: No Evidence Test\n---\n", + ); + + // Explicitly ensure no test evidence exists for this story. + crate::db::delete_content("9947_story_no_evidence:run_tests_ok"); + + fs::create_dir_all(root.join(".huskies")).unwrap(); + fs::write( + root.join(".huskies/project.toml"), + "max_retries = 1\n\n[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n", + ) + .unwrap(); + + let pool = AgentPool::new_test(3001); + let mut rx = pool.watcher_tx.subscribe(); + + // gates_passed=false, no run_tests evidence, but committed work exists. + pool.run_pipeline_advance( + "9947_story_no_evidence", + "coder-1", + CompletionReport { + summary: "Gates failed".to_string(), + gates_passed: false, + gate_output: "Tests failed".to_string(), + }, + Some(root.to_path_buf()), + Some(wt_path), + false, + None, + ) + .await; + + // Story must NOT advance — it should be blocked (max_retries=1 means + // first failure triggers block) rather than moving to QA/merge. + let mut got_blocked = false; + while let Ok(evt) = rx.try_recv() { + if let WatcherEvent::StoryBlocked { story_id, .. } = &evt + && story_id == "9947_story_no_evidence" + { + got_blocked = true; + break; + } + } + assert!( + got_blocked, + "gates_passed=false without run_tests evidence must NOT advance to QA/merge — \ + story should stay in coding (bug 668)" + ); + } + + /// Path (b): gates_passed=false WITH captured run_tests evidence AND committed + /// work → advances to QA/merge (the legitimate bug-645 salvage case). + /// This is the case where the agent ran passing tests then crashed before server + /// gates could confirm results. + #[tokio::test] + async fn gates_failed_with_test_evidence_and_committed_work_advances() { + use std::fs; + use std::process::Command; + + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Init a git repo with committed work. + Command::new("git") + .args(["init"]) + .current_dir(root) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.email", "test@test.com"]) + .current_dir(root) + .output() + .unwrap(); + Command::new("git") + .args(["config", "user.name", "Test"]) + .current_dir(root) + .output() + .unwrap(); + fs::write( + root.join("Cargo.toml"), + "[package]\nname=\"t\"\nversion=\"0.1.0\"\nedition=\"2021\"\n", + ) + .unwrap(); + fs::create_dir_all(root.join("src")).unwrap(); + fs::write(root.join("src/lib.rs"), "// empty\n").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(root) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "init"]) + .current_dir(root) + .output() + .unwrap(); + + let wt_path = tmp.path().join("wt"); + Command::new("git") + .args([ + "worktree", + "add", + &wt_path.to_string_lossy(), + "-b", + "feature/story-9948_story_with_evidence", + ]) + .current_dir(root) + .output() + .unwrap(); + + fs::write(wt_path.join("src/lib.rs"), "pub fn salvaged() {}\n").unwrap(); + Command::new("git") + .args(["add", "."]) + .current_dir(&wt_path) + .output() + .unwrap(); + Command::new("git") + .args(["commit", "-m", "add salvaged fn"]) + .current_dir(&wt_path) + .output() + .unwrap(); + + crate::db::ensure_content_store(); + crate::db::write_content( + "9948_story_with_evidence", + "---\nname: With Evidence Test\n---\n", + ); + crate::db::write_item_with_content( + "9948_story_with_evidence", + "2_current", + "---\nname: With Evidence Test\n---\n", + ); + + // Write the run_tests evidence — simulates the agent having called run_tests + // MCP and getting a passing result before it crashed. + crate::db::write_content("9948_story_with_evidence:run_tests_ok", "1"); + + let pool = AgentPool::new_test(3001); + + // gates_passed=false (agent crashed), but test evidence exists. + pool.run_pipeline_advance( + "9948_story_with_evidence", + "coder-1", + CompletionReport { + summary: "Agent crashed".to_string(), + gates_passed: false, + gate_output: "PTY write assertion failed".to_string(), + }, + Some(root.to_path_buf()), + Some(wt_path), + false, + None, + ) + .await; + + // Story should advance (not blocked, no retry_count). + let content = crate::db::read_content("9948_story_with_evidence") + .expect("story must exist in content store"); + assert!( + !content.contains("blocked"), + "story must NOT be blocked when test evidence exists and work committed: {content}" + ); + assert!( + !content.contains("retry_count"), + "story must NOT have retry_count when salvaged via test evidence: {content}" + ); + // Evidence must be consumed (cleared) after use. + assert!( + crate::db::read_content("9948_story_with_evidence:run_tests_ok").is_none(), + "run_tests evidence must be cleared after pipeline advance consumes it" + ); + } } diff --git a/server/src/http/mcp/shell_tools/script.rs b/server/src/http/mcp/shell_tools/script.rs index 63bb3808..ebd450b1 100644 --- a/server/src/http/mcp/shell_tools/script.rs +++ b/server/src/http/mcp/shell_tools/script.rs @@ -119,6 +119,17 @@ pub(crate) async fn tool_run_tests(args: &Value, ctx: &AppContext) -> Result, + /// Set to `true` when an agent's `run_tests` call returns `passed=true`. + /// Used by the bug-645 salvage path to require real test evidence, not just + /// compilation success. + pub run_tests_passed: Option, } #[derive(Debug, Clone, PartialEq, Eq)] @@ -94,6 +98,10 @@ struct FrontMatter { depends_on: Option>, /// When `true`, the story is frozen. frozen: Option, + /// Set to `true` when an agent's `run_tests` call returns `passed=true`. + /// Used by the bug-645 salvage path to distinguish a genuine test-passing + /// session from one that merely compiled. + run_tests_passed: Option, } pub fn parse_front_matter(contents: &str) -> Result { @@ -135,6 +143,7 @@ fn build_metadata(front: FrontMatter) -> StoryMetadata { blocked: front.blocked, depends_on: front.depends_on, frozen: front.frozen, + run_tests_passed: front.run_tests_passed, } }