fix: async run_tests to prevent zombie cargo processes blocking gates

The run_tests MCP tool now spawns tests in the background and returns immediately. Agents poll get_test_result to check completion. This prevents zombie cargo processes from holding the build lock when the CLI times out the MCP call before tests finish.

Also fixes the agent permission mode: acceptEdits replaces the invalid allowFullAutoEdit, which was causing agents to crash-loop on spawn.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 22:00:05 +00:00
parent 8393a67c89
commit f958f57e56
6 changed files with 262 additions and 63 deletions
@@ -5,8 +5,8 @@ role = "Full-stack engineer. Implements features across all components."
 model = "sonnet"
 max_turns = 50
 max_budget_usd = 5.00
-prompt = "You are working in a git worktree on story {{story_id}}. Read CLAUDE.md first, then .story_kit/README.md to understand the dev process. The story details are in your prompt above. Follow the SDTW process through implementation and verification (Steps 1-3). The worktree and feature branch already exist - do not create them. Check .mcp.json for MCP tools. Do NOT accept the story or merge - commit your work and stop. If the user asks to review your changes, tell them to run: cd \"{{worktree_path}}\" && git difftool {{base_branch}}...HEAD\n\nIMPORTANT: Commit all your work before your process exits. The server will automatically run acceptance gates when your process exits and advance the pipeline based on the results. To verify before committing, use the run_tests MCP tool — never run script/test or cargo test directly via Bash.\n\n## Bug Workflow: Trust the Story, Act Fast\nWhen working on bugs:\n1. READ THE STORY DESCRIPTION FIRST. If it specifies exact files, functions, and line numbers — go directly there and make the fix. Do NOT explore git history, grep the whole codebase, or re-investigate the root cause when the story already tells you what to do.\n2. If the story does NOT specify the exact location, THEN investigate: use targeted grep to find the relevant code.\n3. Fix with a surgical, minimal change. Do NOT add new abstractions or workarounds.\n4. Commit early. If you've made the fix and tests pass, commit and exit. Do not spend turns verifying that master also has the same failures — that wastes budget.\n5. Write commit messages that explain what broke and why."
+prompt = "You are working in a git worktree on story {{story_id}}. Read CLAUDE.md first, then .story_kit/README.md to understand the dev process. The story details are in your prompt above. Follow the SDTW process through implementation and verification (Steps 1-3). The worktree and feature branch already exist - do not create them. Check .mcp.json for MCP tools. Do NOT accept the story or merge - commit your work and stop. If the user asks to review your changes, tell them to run: cd \"{{worktree_path}}\" && git difftool {{base_branch}}...HEAD\n\nIMPORTANT: Commit all your work before your process exits. The server will automatically run acceptance gates when your process exits and advance the pipeline based on the results. To verify before committing, use the run_tests MCP tool (it starts tests in the background — poll get_test_result to check completion) — never run script/test or cargo test directly via Bash.\n\n## Acceptance Criteria Tracking\nAs you complete each acceptance criterion, call the check_criterion MCP tool (story_id, criterion_index) to mark it done. Index 0 is the first unchecked criterion, 1 is the second, etc. Do this as you go — not all at once at the end.\n\n## Bug Workflow: Trust the Story, Act Fast\nWhen working on bugs:\n1. READ THE STORY DESCRIPTION FIRST. If it specifies exact files, functions, and line numbers — go directly there and make the fix. Do NOT explore git history, grep the whole codebase, or re-investigate the root cause when the story already tells you what to do.\n2. If the story does NOT specify the exact location, THEN investigate: use targeted grep to find the relevant code.\n3. Fix with a surgical, minimal change. Do NOT add new abstractions or workarounds.\n4. Commit early. If you've made the fix and tests pass, commit and exit. Do not spend turns verifying that master also has the same failures — that wastes budget.\n5. Write commit messages that explain what broke and why."
-system_prompt = "You are a full-stack engineer working autonomously in a git worktree. Follow the Story-Driven Test Workflow strictly. Use the run_tests MCP tool to verify your changes pass — do NOT run script/test or cargo test via Bash (the MCP tool returns a truncated summary, saving tokens). Commit all your work before finishing - use a descriptive commit message. Do not accept stories, move them to archived, or merge to master - a human will do that. Do not coordinate with other agents - focus on your assigned story. The server automatically runs acceptance gates when your process exits. For bugs, trust the story description — if it specifies exact files and functions, go directly there. Do not explore git history or grep the whole codebase when the story already tells you where to look. Make surgical fixes, commit early."
+system_prompt = "You are a full-stack engineer working autonomously in a git worktree. Follow the Story-Driven Test Workflow strictly. Use the run_tests MCP tool to verify your changes pass — it starts tests in the background, then poll get_test_result to check completion. Never run script/test or cargo test directly via Bash. As you complete each acceptance criterion, call check_criterion MCP tool to mark it done. Commit all your work before finishing - use a descriptive commit message. Do not accept stories, move them to archived, or merge to master - a human will do that. Do not coordinate with other agents - focus on your assigned story. The server automatically runs acceptance gates when your process exits. For bugs, trust the story description — if it specifies exact files and functions, go directly there. Do not explore git history or grep the whole codebase when the story already tells you where to look. Make surgical fixes, commit early."
 [[agent]]
 name = "coder-2"
@@ -15,8 +15,8 @@ role = "Full-stack engineer. Implements features across all components."
 model = "sonnet"
 max_turns = 50
 max_budget_usd = 5.00
-prompt = "You are working in a git worktree on story {{story_id}}. Read CLAUDE.md first, then .story_kit/README.md to understand the dev process. The story details are in your prompt above. Follow the SDTW process through implementation and verification (Steps 1-3). The worktree and feature branch already exist - do not create them. Check .mcp.json for MCP tools. Do NOT accept the story or merge - commit your work and stop. If the user asks to review your changes, tell them to run: cd \"{{worktree_path}}\" && git difftool {{base_branch}}...HEAD\n\nIMPORTANT: Commit all your work before your process exits. The server will automatically run acceptance gates when your process exits and advance the pipeline based on the results. To verify before committing, use the run_tests MCP tool — never run script/test or cargo test directly via Bash.\n\n## Bug Workflow: Trust the Story, Act Fast\nWhen working on bugs:\n1. READ THE STORY DESCRIPTION FIRST. If it specifies exact files, functions, and line numbers — go directly there and make the fix. Do NOT explore git history, grep the whole codebase, or re-investigate the root cause when the story already tells you what to do.\n2. If the story does NOT specify the exact location, THEN investigate: use targeted grep to find the relevant code.\n3. Fix with a surgical, minimal change. Do NOT add new abstractions or workarounds.\n4. Commit early. If you've made the fix and tests pass, commit and exit. Do not spend turns verifying that master also has the same failures — that wastes budget.\n5. Write commit messages that explain what broke and why."
+prompt = "You are working in a git worktree on story {{story_id}}. Read CLAUDE.md first, then .story_kit/README.md to understand the dev process. The story details are in your prompt above. Follow the SDTW process through implementation and verification (Steps 1-3). The worktree and feature branch already exist - do not create them. Check .mcp.json for MCP tools. Do NOT accept the story or merge - commit your work and stop. If the user asks to review your changes, tell them to run: cd \"{{worktree_path}}\" && git difftool {{base_branch}}...HEAD\n\nIMPORTANT: Commit all your work before your process exits. The server will automatically run acceptance gates when your process exits and advance the pipeline based on the results. To verify before committing, use the run_tests MCP tool (it starts tests in the background — poll get_test_result to check completion) — never run script/test or cargo test directly via Bash.\n\n## Acceptance Criteria Tracking\nAs you complete each acceptance criterion, call the check_criterion MCP tool (story_id, criterion_index) to mark it done. Index 0 is the first unchecked criterion, 1 is the second, etc. Do this as you go — not all at once at the end.\n\n## Bug Workflow: Trust the Story, Act Fast\nWhen working on bugs:\n1. READ THE STORY DESCRIPTION FIRST. If it specifies exact files, functions, and line numbers — go directly there and make the fix. Do NOT explore git history, grep the whole codebase, or re-investigate the root cause when the story already tells you what to do.\n2. If the story does NOT specify the exact location, THEN investigate: use targeted grep to find the relevant code.\n3. Fix with a surgical, minimal change. Do NOT add new abstractions or workarounds.\n4. Commit early. If you've made the fix and tests pass, commit and exit. Do not spend turns verifying that master also has the same failures — that wastes budget.\n5. Write commit messages that explain what broke and why."
-system_prompt = "You are a full-stack engineer working autonomously in a git worktree. Follow the Story-Driven Test Workflow strictly. Use the run_tests MCP tool to verify your changes pass — do NOT run script/test or cargo test via Bash (the MCP tool returns a truncated summary, saving tokens). Commit all your work before finishing - use a descriptive commit message. Do not accept stories, move them to archived, or merge to master - a human will do that. Do not coordinate with other agents - focus on your assigned story. The server automatically runs acceptance gates when your process exits. For bugs, trust the story description — if it specifies exact files and functions, go directly there. Do not explore git history or grep the whole codebase when the story already tells you where to look. Make surgical fixes, commit early."
+system_prompt = "You are a full-stack engineer working autonomously in a git worktree. Follow the Story-Driven Test Workflow strictly. Use the run_tests MCP tool to verify your changes pass — it starts tests in the background, then poll get_test_result to check completion. Never run script/test or cargo test directly via Bash. As you complete each acceptance criterion, call check_criterion MCP tool to mark it done. Commit all your work before finishing - use a descriptive commit message. Do not accept stories, move them to archived, or merge to master - a human will do that. Do not coordinate with other agents - focus on your assigned story. The server automatically runs acceptance gates when your process exits. For bugs, trust the story description — if it specifies exact files and functions, go directly there. Do not explore git history or grep the whole codebase when the story already tells you where to look. Make surgical fixes, commit early."
 [[agent]]
 name = "coder-3"
@@ -25,8 +25,8 @@ role = "Full-stack engineer. Implements features across all components."
 model = "sonnet"
 max_turns = 50
 max_budget_usd = 5.00
-prompt = "You are working in a git worktree on story {{story_id}}. Read CLAUDE.md first, then .story_kit/README.md to understand the dev process. The story details are in your prompt above. Follow the SDTW process through implementation and verification (Steps 1-3). The worktree and feature branch already exist - do not create them. Check .mcp.json for MCP tools. Do NOT accept the story or merge - commit your work and stop. If the user asks to review your changes, tell them to run: cd \"{{worktree_path}}\" && git difftool {{base_branch}}...HEAD\n\nIMPORTANT: Commit all your work before your process exits. The server will automatically run acceptance gates when your process exits and advance the pipeline based on the results. To verify before committing, use the run_tests MCP tool — never run script/test or cargo test directly via Bash.\n\n## Bug Workflow: Trust the Story, Act Fast\nWhen working on bugs:\n1. READ THE STORY DESCRIPTION FIRST. If it specifies exact files, functions, and line numbers — go directly there and make the fix. Do NOT explore git history, grep the whole codebase, or re-investigate the root cause when the story already tells you what to do.\n2. If the story does NOT specify the exact location, THEN investigate: use targeted grep to find the relevant code.\n3. Fix with a surgical, minimal change. Do NOT add new abstractions or workarounds.\n4. Commit early. If you've made the fix and tests pass, commit and exit. Do not spend turns verifying that master also has the same failures — that wastes budget.\n5. Write commit messages that explain what broke and why."
+prompt = "You are working in a git worktree on story {{story_id}}. Read CLAUDE.md first, then .story_kit/README.md to understand the dev process. The story details are in your prompt above. Follow the SDTW process through implementation and verification (Steps 1-3). The worktree and feature branch already exist - do not create them. Check .mcp.json for MCP tools. Do NOT accept the story or merge - commit your work and stop. If the user asks to review your changes, tell them to run: cd \"{{worktree_path}}\" && git difftool {{base_branch}}...HEAD\n\nIMPORTANT: Commit all your work before your process exits. The server will automatically run acceptance gates when your process exits and advance the pipeline based on the results. To verify before committing, use the run_tests MCP tool (it starts tests in the background — poll get_test_result to check completion) — never run script/test or cargo test directly via Bash.\n\n## Acceptance Criteria Tracking\nAs you complete each acceptance criterion, call the check_criterion MCP tool (story_id, criterion_index) to mark it done. Index 0 is the first unchecked criterion, 1 is the second, etc. Do this as you go — not all at once at the end.\n\n## Bug Workflow: Trust the Story, Act Fast\nWhen working on bugs:\n1. READ THE STORY DESCRIPTION FIRST. If it specifies exact files, functions, and line numbers — go directly there and make the fix. Do NOT explore git history, grep the whole codebase, or re-investigate the root cause when the story already tells you what to do.\n2. If the story does NOT specify the exact location, THEN investigate: use targeted grep to find the relevant code.\n3. Fix with a surgical, minimal change. Do NOT add new abstractions or workarounds.\n4. Commit early. If you've made the fix and tests pass, commit and exit. Do not spend turns verifying that master also has the same failures — that wastes budget.\n5. Write commit messages that explain what broke and why."
-system_prompt = "You are a full-stack engineer working autonomously in a git worktree. Follow the Story-Driven Test Workflow strictly. Use the run_tests MCP tool to verify your changes pass — do NOT run script/test or cargo test via Bash (the MCP tool returns a truncated summary, saving tokens). Commit all your work before finishing - use a descriptive commit message. Do not accept stories, move them to archived, or merge to master - a human will do that. Do not coordinate with other agents - focus on your assigned story. The server automatically runs acceptance gates when your process exits. For bugs, trust the story description — if it specifies exact files and functions, go directly there. Do not explore git history or grep the whole codebase when the story already tells you where to look. Make surgical fixes, commit early."
+system_prompt = "You are a full-stack engineer working autonomously in a git worktree. Follow the Story-Driven Test Workflow strictly. Use the run_tests MCP tool to verify your changes pass — it starts tests in the background, then poll get_test_result to check completion. Never run script/test or cargo test directly via Bash. As you complete each acceptance criterion, call check_criterion MCP tool to mark it done. Commit all your work before finishing - use a descriptive commit message. Do not accept stories, move them to archived, or merge to master - a human will do that. Do not coordinate with other agents - focus on your assigned story. The server automatically runs acceptance gates when your process exits. For bugs, trust the story description — if it specifies exact files and functions, go directly there. Do not explore git history or grep the whole codebase when the story already tells you where to look. Make surgical fixes, commit early."
 [[agent]]
 name = "qa-2"
@@ -48,7 +48,7 @@ Read CLAUDE.md first, then .story_kit/README.md to understand the dev process.
 ### 1. Deterministic Gates (Prerequisites)
 Run these first — if any fail, reject immediately without proceeding to AC review:
- Call the `run_tests` MCP tool — all gates must pass (0 lint errors/warnings, all tests green, frontend build clean if applicable). Do NOT run script/test via Bash.
+- Call the `run_tests` MCP tool to start tests, then poll `get_test_result` until complete — all gates must pass (0 lint errors/warnings, all tests green, frontend build clean if applicable). Do NOT run script/test via Bash.
 ### 2. Code Change Review
 - Run `git diff master...HEAD --stat` to see what files changed
@@ -126,8 +126,8 @@ role = "Senior full-stack engineer for complex tasks. Implements features across
 model = "opus"
 max_turns = 80
 max_budget_usd = 20.00
-prompt = "You are working in a git worktree on story {{story_id}}. Read CLAUDE.md first, then .story_kit/README.md to understand the dev process. The story details are in your prompt above. Follow the SDTW process through implementation and verification (Steps 1-3). The worktree and feature branch already exist - do not create them. Check .mcp.json for MCP tools. Do NOT accept the story or merge - commit your work and stop. If the user asks to review your changes, tell them to run: cd \"{{worktree_path}}\" && git difftool {{base_branch}}...HEAD\n\nIMPORTANT: Commit all your work before your process exits. The server will automatically run acceptance gates when your process exits and advance the pipeline based on the results. To verify before committing, use the run_tests MCP tool — never run script/test or cargo test directly via Bash.\n\n## Bug Workflow: Trust the Story, Act Fast\nWhen working on bugs:\n1. READ THE STORY DESCRIPTION FIRST. If it specifies exact files, functions, and line numbers — go directly there and make the fix. Do NOT explore git history, grep the whole codebase, or re-investigate the root cause when the story already tells you what to do.\n2. If the story does NOT specify the exact location, THEN investigate: use targeted grep to find the relevant code.\n3. Fix with a surgical, minimal change. Do NOT add new abstractions or workarounds.\n4. Commit early. If you've made the fix and tests pass, commit and exit. Do not spend turns verifying that master also has the same failures — that wastes budget.\n5. Write commit messages that explain what broke and why."
+prompt = "You are working in a git worktree on story {{story_id}}. Read CLAUDE.md first, then .story_kit/README.md to understand the dev process. The story details are in your prompt above. Follow the SDTW process through implementation and verification (Steps 1-3). The worktree and feature branch already exist - do not create them. Check .mcp.json for MCP tools. Do NOT accept the story or merge - commit your work and stop. If the user asks to review your changes, tell them to run: cd \"{{worktree_path}}\" && git difftool {{base_branch}}...HEAD\n\nIMPORTANT: Commit all your work before your process exits. The server will automatically run acceptance gates when your process exits and advance the pipeline based on the results. To verify before committing, use the run_tests MCP tool (it starts tests in the background — poll get_test_result to check completion) — never run script/test or cargo test directly via Bash.\n\n## Acceptance Criteria Tracking\nAs you complete each acceptance criterion, call the check_criterion MCP tool (story_id, criterion_index) to mark it done. Index 0 is the first unchecked criterion, 1 is the second, etc. Do this as you go — not all at once at the end.\n\n## Bug Workflow: Trust the Story, Act Fast\nWhen working on bugs:\n1. READ THE STORY DESCRIPTION FIRST. If it specifies exact files, functions, and line numbers — go directly there and make the fix. Do NOT explore git history, grep the whole codebase, or re-investigate the root cause when the story already tells you what to do.\n2. If the story does NOT specify the exact location, THEN investigate: use targeted grep to find the relevant code.\n3. Fix with a surgical, minimal change. Do NOT add new abstractions or workarounds.\n4. Commit early. If you've made the fix and tests pass, commit and exit. Do not spend turns verifying that master also has the same failures — that wastes budget.\n5. Write commit messages that explain what broke and why."
-system_prompt = "You are a senior full-stack engineer working autonomously in a git worktree. You handle complex tasks requiring deep architectural understanding. Follow the Story-Driven Test Workflow strictly. Use the run_tests MCP tool to verify your changes pass — do NOT run script/test or cargo test via Bash (the MCP tool returns a truncated summary, saving tokens). Commit all your work before finishing - use a descriptive commit message. Do not accept stories, move them to archived, or merge to master - a human will do that. Do not coordinate with other agents - focus on your assigned story. The server automatically runs acceptance gates when your process exits. For bugs, trust the story description — if it specifies exact files and functions, go directly there. Do not explore git history or grep the whole codebase when the story already tells you where to look. Make surgical fixes, commit early."
+system_prompt = "You are a senior full-stack engineer working autonomously in a git worktree. You handle complex tasks requiring deep architectural understanding. Follow the Story-Driven Test Workflow strictly. Use the run_tests MCP tool to verify your changes pass — it starts tests in the background, then poll get_test_result to check completion. Never run script/test or cargo test directly via Bash. As you complete each acceptance criterion, call check_criterion MCP tool to mark it done. Commit all your work before finishing - use a descriptive commit message. Do not accept stories, move them to archived, or merge to master - a human will do that. Do not coordinate with other agents - focus on your assigned story. The server automatically runs acceptance gates when your process exits. For bugs, trust the story description — if it specifies exact files and functions, go directly there. Do not explore git history or grep the whole codebase when the story already tells you where to look. Make surgical fixes, commit early."
 [[agent]]
 name = "qa"
@@ -149,7 +149,7 @@ Read CLAUDE.md first, then .story_kit/README.md to understand the dev process.
 ### 1. Deterministic Gates (Prerequisites)
 Run these first — if any fail, reject immediately without proceeding to AC review:
- Call the `run_tests` MCP tool — all gates must pass (0 lint errors/warnings, all tests green, frontend build clean if applicable). Do NOT run script/test via Bash.
+- Call the `run_tests` MCP tool to start tests, then poll `get_test_result` until complete — all gates must pass (0 lint errors/warnings, all tests green, frontend build clean if applicable). Do NOT run script/test via Bash.
 ### 2. Code Change Review
 - Run `git diff master...HEAD --stat` to see what files changed
@@ -250,7 +250,7 @@ When the auto-resolver fails, you have access to the merge worktree at `.story_k
 4. **Understand intent, not just syntax.** The feature branch may be behind master — master's version of shared infrastructure is almost always correct. The feature branch's contribution is the NEW functionality it adds. Your job is to integrate the new into master's structure, not pick one side.
 5. Resolve by integrating the feature's new functionality into master's code structure
 6. Stage resolved files with `git add`
-7. Call the `run_tests` MCP tool to verify compilation and tests pass
+7. Call the `run_tests` MCP tool to start tests, then poll `get_test_result` until complete
 8. If it compiles, commit and re-trigger merge_agent_work
 ### Common conflict patterns in this project:
@@ -265,7 +265,7 @@ When the auto-resolver fails, you have access to the merge worktree at `.story_k
 ## Fixing Gate Failures
-If quality gates fail, attempt to fix issues yourself in the merge worktree. Use the run_tests MCP tool to verify — do not run script/test via Bash.
+If quality gates fail, attempt to fix issues yourself in the merge worktree. Use the run_tests MCP tool (then poll get_test_result) to verify — do not run script/test via Bash.
 **Fix yourself (up to 3 attempts total):**
 - Syntax errors (missing semicolons, brackets, commas)
@@ -198,11 +198,12 @@ fn run_agent_pty_blocking(
    // and instead leak as unstructured PTY text.
    cmd.arg("--include-partial-messages");
-    // Agents use allowFullAutoEdit so the worktree's .claude/settings.json
+    // Agents use acceptEdits so file edits are auto-approved while other
-    // controls which tools are pre-approved.  Anything not in the allowlist
+    // tools (e.g. Bash) trigger the permission prompt tool, which auto-denies
-    // triggers the permission prompt tool, which auto-denies for agents.
+    // for agents.  The worktree's .claude/settings.json allowlist further
    // controls which tools are pre-approved.
    cmd.arg("--permission-mode");
-    cmd.arg("allowFullAutoEdit");
+    cmd.arg("acceptEdits");
    cmd.arg("--permission-prompt-tool");
    cmd.arg("mcp__huskies__prompt_permission");
@@ -6,9 +6,36 @@ use crate::state::SessionState;
 use crate::store::JsonFileStore;
 use crate::workflow::WorkflowState;
 use poem::http::StatusCode;
 use std::collections::HashMap;
 use std::path::PathBuf;
 use std::sync::Arc;
 use tokio::sync::{broadcast, mpsc, oneshot};
 /// A running or completed test job spawned by the `run_tests` MCP tool.
 ///
 /// Jobs are stored in a [`TestJobRegistry`], keyed by the worktree path the
 /// tests were started for.
 pub struct TestJob {
    /// The child process handle. `None` once the process has exited and results
    /// have been collected.
    pub child: Option<std::process::Child>,
    /// The collected outcome of the run. `None` while the child is still
    /// running; populated once the child exits.
    pub result: Option<TestJobResult>,
    /// When the job was started. Used to report how long a still-running job
    /// has been in progress.
    pub started_at: std::time::Instant,
 }
 /// The result of a completed test job.
 ///
 /// `Clone` lets a result be stored on the job and also formatted for the
 /// caller; `Debug` lets it appear in logs and assertion messages.
 #[derive(Debug, Clone)]
 pub struct TestJobResult {
    /// Overall verdict for the run: `true` when the run is considered passing.
    pub passed: bool,
    /// Exit code recorded for the run (presumably the exit status of the
    /// test process — confirm against the code that builds this value).
    pub exit_code: i32,
    /// Number of tests counted as passed.
    pub tests_passed: u64,
    /// Number of tests counted as failed.
    pub tests_failed: u64,
    /// Captured output of the run.
    /// NOTE(review): may be truncated before being stored — confirm.
    pub output: String,
 }
 /// Shared registry of in-flight and recently completed test jobs, keyed by
 /// worktree path.
 ///
 /// Because the map is keyed by path, it holds at most one [`TestJob`] per
 /// worktree. The lock is a blocking `std::sync::Mutex`, so keep the guard
 /// tightly scoped and do not hold it across an `.await`.
 pub type TestJobRegistry = Arc<std::sync::Mutex<HashMap<PathBuf, TestJob>>>;
 /// The user's decision when responding to a permission dialog.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum PermissionDecision {
@@ -75,6 +102,9 @@ pub struct AppContext {
    /// spawned by the bot so that cancellations take effect in-memory rather
    /// than only on disk.
    pub timer_store: Arc<TimerStore>,
    /// Registry of running/completed test jobs spawned by the `run_tests` MCP
    /// tool. Keyed by worktree path so each worktree has at most one active job.
    pub test_jobs: TestJobRegistry,
 }
 #[cfg(test)]
@@ -102,6 +132,7 @@ impl AppContext {
            bot_shutdown: None,
            matrix_shutdown_tx: None,
            timer_store,
            test_jobs: Arc::new(std::sync::Mutex::new(HashMap::new())),
        }
    }
 }
@@ -1134,7 +1134,7 @@ fn handle_tools_list(id: Option<Value>) -> JsonRpcResponse {
                },
                {
                    "name": "run_tests",
-                    "description": "Run the project's test suite (script/test) and return a structured result with pass/fail, test counts, and truncated output. Runs from the project root by default, or from a specific worktree if worktree_path is provided.",
+                    "description": "Start the project's test suite (script/test) as a background job. Returns immediately with {\"status\": \"started\"}. Poll get_test_result with the same worktree_path to check for completion. If the previous run already finished, returns the result inline.",
                    "inputSchema": {
                        "type": "object",
                        "properties": {
@@ -1146,6 +1146,20 @@ fn handle_tools_list(id: Option<Value>) -> JsonRpcResponse {
                        "required": []
                    }
                },
                {
                    "name": "get_test_result",
                    "description": "Check on a running test job started by run_tests. Returns {\"status\": \"running\", \"elapsed_secs\": N} if still in progress, or the full test result (passed, exit_code, test counts, output) if finished.",
                    "inputSchema": {
                        "type": "object",
                        "properties": {
                            "worktree_path": {
                                "type": "string",
                                "description": "Optional absolute path to the worktree. Must match the worktree_path used in run_tests."
                            }
                        },
                        "required": []
                    }
                },
                {
                    "name": "git_status",
                    "description": "Return the working tree status of an agent's worktree (staged, unstaged, and untracked files). The worktree_path must be inside .huskies/worktrees/. Push and remote operations are not available.",
@@ -1402,6 +1416,7 @@ async fn handle_tools_call(
        // Shell command execution
        "run_command" => shell_tools::tool_run_command(&args, ctx).await,
        "run_tests" => shell_tools::tool_run_tests(&args, ctx).await,
        "get_test_result" => shell_tools::tool_get_test_result(&args, ctx).await,
        // Git operations
        "git_status" => git_tools::tool_git_status(&args, ctx).await,
        "git_diff" => git_tools::tool_git_diff(&args, ctx).await,
@@ -1526,6 +1541,7 @@ mod tests {
        assert!(names.contains(&"delete_story"));
        assert!(names.contains(&"run_command"));
        assert!(names.contains(&"run_tests"));
        assert!(names.contains(&"get_test_result"));
        assert!(names.contains(&"git_status"));
        assert!(names.contains(&"git_diff"));
        assert!(names.contains(&"git_add"));
@@ -371,10 +371,15 @@ fn extract_count(line: &str, label: &str) -> Option<u64> {
    num_str.parse().ok()
 }
-/// Run the project's `script/test` and return a structured result.
+/// Start the project's test suite (`script/test`) as a background process.
 ///
-/// If `worktree_path` is provided the script is run from that worktree
+/// Returns immediately with `{"status": "started"}`. The agent should poll
-/// (must be inside `.huskies/worktrees/`). Otherwise the project root is used.
+/// `get_test_result` with the same `worktree_path` to retrieve results once
 /// the tests complete.
 ///
 /// If a test job is already running for the same worktree, returns
 /// `{"status": "running", "elapsed_secs": N}` instead of starting a new one.
 /// If a previous job completed and its results haven't been consumed yet,
 /// they are returned inline and the job is cleared.
 pub(super) async fn tool_run_tests(args: &Value, ctx: &AppContext) -> Result<String, String> {
    let project_root = ctx.agents.get_project_root(&ctx.state)?;
@@ -393,52 +398,197 @@ pub(super) async fn tool_run_tests(args: &Value, ctx: &AppContext) -> Result<Str
        ));
    }
-    let result = tokio::time::timeout(
+    // Check for an existing job on this worktree.
-        std::time::Duration::from_secs(TEST_TIMEOUT_SECS),
+    {
-        tokio::task::spawn_blocking({
+        let mut jobs = ctx.test_jobs.lock().map_err(|e| e.to_string())?;
-            let dir = working_dir.clone();
+        if let Some(job) = jobs.get_mut(&working_dir) {
-            let script = script_path.clone();
+            // Check if the child has finished.
-            move || {
+            if let Some(child) = job.child.as_mut() {
-                std::process::Command::new("bash")
+                match child.try_wait() {
-                    .arg(&script)
+                    Ok(Some(status)) => {
-                    .current_dir(&dir)
+                        // Child finished — collect results now.
-                    .output()
+                        let result = collect_child_result(child, status);
                        job.child = None;
                        job.result = Some(result.clone());
                        // Return the completed result inline.
                        let resp = format_test_result(&result);
                        jobs.remove(&working_dir);
                        return resp;
                    }
                    Ok(None) => {
                        // Still running.
                        let elapsed = job.started_at.elapsed().as_secs();
                        return serde_json::to_string_pretty(&json!({
                            "status": "running",
                            "elapsed_secs": elapsed,
                        }))
                        .map_err(|e| format!("Serialization error: {e}"));
                    }
                    Err(e) => {
                        jobs.remove(&working_dir);
                        return Err(format!("Failed to check child status: {e}"));
                    }
                }
            }
            // Job exists with result but no child — return cached result.
            if let Some(result) = job.result.clone() {
                jobs.remove(&working_dir);
                return format_test_result(&result);
            }
        }),
    )
    .await;
    match result {
        Err(_) => serde_json::to_string_pretty(&json!({
            "passed": false,
            "exit_code": -1,
            "timed_out": true,
            "tests_passed": 0,
            "tests_failed": 0,
            "output": format!("Test suite timed out after {TEST_TIMEOUT_SECS}s"),
        }))
        .map_err(|e| format!("Serialization error: {e}")),
        Ok(Err(e)) => Err(format!("Task join error: {e}")),
        Ok(Ok(Err(e))) => Err(format!("Failed to execute test script: {e}")),
        Ok(Ok(Ok(output))) => {
            let passed = output.status.success();
            let exit_code = output.status.code().unwrap_or(-1);
            let stdout = String::from_utf8_lossy(&output.stdout).to_string();
            let stderr = String::from_utf8_lossy(&output.stderr).to_string();
            let combined = format!("{stdout}{stderr}");
            let (tests_passed, tests_failed) = parse_test_counts(&combined);
            let truncated = truncate_output(&combined, MAX_OUTPUT_LINES);
            serde_json::to_string_pretty(&json!({
                "passed": passed,
                "exit_code": exit_code,
                "timed_out": false,
                "tests_passed": tests_passed,
                "tests_failed": tests_failed,
                "output": truncated,
            }))
            .map_err(|e| format!("Serialization error: {e}"))
        }
    }
    // Spawn the test process.
    let child = std::process::Command::new("bash")
        .arg(&script_path)
        .current_dir(&working_dir)
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped())
        .spawn()
        .map_err(|e| format!("Failed to spawn test script: {e}"))?;
    crate::slog!(
        "[run_tests] Started test job for {} (pid {})",
        working_dir.display(),
        child.id()
    );
    {
        let mut jobs = ctx.test_jobs.lock().map_err(|e| e.to_string())?;
        jobs.insert(
            working_dir,
            crate::http::context::TestJob {
                child: Some(child),
                result: None,
                started_at: std::time::Instant::now(),
            },
        );
    }
    serde_json::to_string_pretty(&json!({
        "status": "started",
    }))
    .map_err(|e| format!("Serialization error: {e}"))
 }
 /// Check on a running test job and return results if complete.
 ///
 /// The job is looked up by working directory: the validated `worktree_path`
 /// string from `args` when present, otherwise the canonicalized project root.
 ///
 /// # Returns
 ///
 /// * `{"status": "running", "elapsed_secs": N}` while the child is still alive
 ///   and within `TEST_TIMEOUT_SECS`.
 /// * The full test result (see `format_test_result`) once the child has
 ///   exited, or when a cached result is stored on the job. The job entry is
 ///   removed when a result is handed out.
 /// * A result with `"timed_out": true` if the child is still running after
 ///   more than `TEST_TIMEOUT_SECS`; the child is killed and the job removed.
 ///
 /// # Errors
 ///
 /// Returns `Err` when the project root or worktree path cannot be resolved,
 /// the job table lock is poisoned, no job exists for the working directory,
 /// the child's status cannot be queried (the job is dropped in that case),
 /// the job holds neither a child nor a result, or serialization fails.
 pub(super) async fn tool_get_test_result(
    args: &Value,
    ctx: &AppContext,
 ) -> Result<String, String> {
    let project_root = ctx.agents.get_project_root(&ctx.state)?;
    let working_dir = match args.get("worktree_path").and_then(|v| v.as_str()) {
        Some(wt) => validate_working_dir(wt, ctx)?,
        None => project_root
            .canonicalize()
            .map_err(|e| format!("Cannot canonicalize project root: {e}"))?,
    };

    // The guard is held for the rest of the function; nothing below awaits.
    let mut jobs = ctx.test_jobs.lock().map_err(|e| e.to_string())?;
    let job = jobs.get_mut(&working_dir).ok_or_else(|| {
        "No test job running for this worktree. Call run_tests first.".to_string()
    })?;

    // Check if child has finished.
    if let Some(child) = job.child.as_mut() {
        match child.try_wait() {
            Ok(Some(status)) => {
                // The entry is removed right away, so there is no point in
                // caching the result on the job first.
                let result = collect_child_result(child, status);
                jobs.remove(&working_dir);
                return format_test_result(&result);
            }
            Ok(None) => {
                let elapsed = job.started_at.elapsed().as_secs();
                // If exceeded our max timeout, kill it. This check only runs
                // when this function is called, i.e. when the agent polls.
                if elapsed > TEST_TIMEOUT_SECS {
                    // NOTE(review): `kill` targets the direct child only;
                    // processes started by the script may outlive it — confirm
                    // whether they need to be terminated as well.
                    let _ = child.kill();
                    // Reap the killed child so it does not linger as a zombie.
                    let _ = child.wait();
                    crate::slog!(
                        "[run_tests] Killed test job for {} after {elapsed}s timeout",
                        working_dir.display()
                    );
                    jobs.remove(&working_dir);
                    return serde_json::to_string_pretty(&json!({
                        "passed": false,
                        "exit_code": -1,
                        "timed_out": true,
                        "tests_passed": 0,
                        "tests_failed": 0,
                        "output": format!("Test suite timed out after {elapsed}s"),
                    }))
                    .map_err(|e| format!("Serialization error: {e}"));
                }
                return serde_json::to_string_pretty(&json!({
                    "status": "running",
                    "elapsed_secs": elapsed,
                }))
                .map_err(|e| format!("Serialization error: {e}"));
            }
            Err(e) => {
                jobs.remove(&working_dir);
                return Err(format!("Failed to check child status: {e}"));
            }
        }
    }

    // Job exists with cached result: move it out instead of cloning, since
    // the entry is discarded immediately afterwards.
    let cached = job.result.take();
    match cached {
        Some(result) => {
            jobs.remove(&working_dir);
            format_test_result(&result)
        }
        None => Err("Test job in unexpected state".to_string()),
    }
 }
 /// Collect stdout/stderr from a finished child and build a `TestJobResult`.
 ///
 /// Both pipes are drained as raw bytes and decoded lossily, so output that
 /// contains invalid UTF-8 is kept (with replacement characters) instead of
 /// being dropped. The combined text is stdout followed by stderr; test counts
 /// are parsed from the full text and the stored `output` is truncated to
 /// `MAX_OUTPUT_LINES`.
 ///
 /// `exit_code` is `-1` when the status carries no exit code.
 fn collect_child_result(
    child: &mut std::process::Child,
    status: std::process::ExitStatus,
 ) -> crate::http::context::TestJobResult {
    use std::io::Read;

    // Reading is best-effort: on an I/O error the bytes read so far stay in
    // the buffer and whatever was captured is still reported.
    let mut stdout_bytes = Vec::new();
    if let Some(out) = child.stdout.as_mut() {
        let _ = out.read_to_end(&mut stdout_bytes);
    }
    let mut stderr_bytes = Vec::new();
    if let Some(err) = child.stderr.as_mut() {
        let _ = err.read_to_end(&mut stderr_bytes);
    }

    // Decode each stream on its own, then concatenate stdout before stderr.
    let mut combined = String::from_utf8_lossy(&stdout_bytes).into_owned();
    combined.push_str(&String::from_utf8_lossy(&stderr_bytes));

    let (tests_passed, tests_failed) = parse_test_counts(&combined);
    let exit_code = status.code().unwrap_or(-1);
    crate::http::context::TestJobResult {
        passed: status.success(),
        exit_code,
        tests_passed,
        tests_failed,
        output: truncate_output(&combined, MAX_OUTPUT_LINES),
    }
 }
 /// Render a `TestJobResult` as the pretty-printed JSON text sent back to the agent.
 ///
 /// The payload carries `passed`, `exit_code`, `tests_passed`, `tests_failed`
 /// and `output` from `result`, plus a `timed_out` field that is always
 /// `false` here. A serialization failure is returned as an `Err` string.
 fn format_test_result(
    result: &crate::http::context::TestJobResult,
 ) -> Result<String, String> {
    let payload = json!({
        "passed": result.passed,
        "exit_code": result.exit_code,
        "timed_out": false,
        "tests_passed": result.tests_passed,
        "tests_failed": result.tests_failed,
        "output": result.output,
    });
    match serde_json::to_string_pretty(&payload) {
        Ok(text) => Ok(text),
        Err(e) => Err(format!("Serialization error: {e}")),
    }
 }
 #[cfg(test)]
@@ -722,6 +722,7 @@ async fn main() -> Result<(), std::io::Error> {
        bot_shutdown: bot_shutdown_notifier.clone(),
        matrix_shutdown_tx: Some(Arc::clone(&matrix_shutdown_tx)),
        timer_store,
        test_jobs: std::sync::Arc::new(std::sync::Mutex::new(std::collections::HashMap::new())),
    };
    let app = build_routes(ctx, whatsapp_ctx.clone(), slack_ctx.clone(), port);