story-kit: merge 157_story_make_start_agent_non_blocking_by_deferring_worktree_creation

Make start_agent non-blocking by deferring worktree creation. The
start_agent MCP call now returns Ok with status Pending immediately,
while worktree setup and the agent process launch happen asynchronously
in a background task, improving responsiveness of the start_agent call.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in: (branch list omitted)
Author: Dave
Date:   2026-02-24 16:50:56 +00:00
Parent: 71502507cc
Commit: 68af8c5ba9

View File

@@ -376,52 +376,118 @@ impl AgentPool {
// Move story from upcoming/ to current/ and auto-commit before creating the worktree.
move_story_to_current(project_root, story_id)?;
// Create worktree
let wt_info = worktree::create_worktree(project_root, story_id, &config, self.port).await?;
// Update with worktree info
{
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
if let Some(agent) = agents.get_mut(&key) {
agent.worktree_info = Some(wt_info.clone());
}
}
// Spawn the agent process
let wt_path_str = wt_info.path.to_string_lossy().to_string();
let (command, args, mut prompt) =
config.render_agent_args(&wt_path_str, story_id, Some(&resolved_name), Some(&wt_info.base_branch))?;
// Append resume context if this is a restart with failure information.
if let Some(ctx) = resume_context {
prompt.push_str(ctx);
}
// Extract inactivity timeout from the agent config before moving config.
// Extract inactivity timeout from the agent config before cloning config.
let inactivity_timeout_secs = config
.find_agent(&resolved_name)
.map(|a| a.inactivity_timeout_secs)
.unwrap_or(300);
// Clone all values needed inside the background spawn.
let project_root_clone = project_root.to_path_buf();
let config_clone = config.clone();
let resume_context_owned = resume_context.map(str::to_string);
let sid = story_id.to_string();
let aname = resolved_name.clone();
let tx_clone = tx.clone();
let agents_ref = self.agents.clone();
let cwd = wt_path_str.clone();
let key_clone = key.clone();
let log_clone = event_log.clone();
let port_for_task = self.port;
let log_writer_clone = log_writer.clone();
// Spawn the background task. Worktree creation and agent launch happen here
// so `start_agent` returns immediately after registering the agent as
// Pending — non-blocking by design (story 157).
let handle = tokio::spawn(async move {
// Step 1: create the worktree (slow — git checkout, pnpm install, etc.)
let wt_info = match worktree::create_worktree(
&project_root_clone,
&sid,
&config_clone,
port_for_task,
)
.await
{
Ok(wt) => wt,
Err(e) => {
// Worktree creation failed — mark agent as Failed so the UI shows the error.
if let Ok(mut agents) = agents_ref.lock()
&& let Some(agent) = agents.get_mut(&key_clone)
{
agent.status = AgentStatus::Failed;
agent.completed_at = Some(Instant::now());
}
let _ = tx_clone.send(AgentEvent::Error {
story_id: sid.clone(),
agent_name: aname.clone(),
message: format!("Failed to create worktree: {e}"),
});
return;
}
};
// Step 2: store worktree info and render agent command/args/prompt.
let wt_path_str = wt_info.path.to_string_lossy().to_string();
{
if let Ok(mut agents) = agents_ref.lock()
&& let Some(agent) = agents.get_mut(&key_clone)
{
agent.worktree_info = Some(wt_info.clone());
}
}
let (command, args, mut prompt) = match config_clone.render_agent_args(
&wt_path_str,
&sid,
Some(&aname),
Some(&wt_info.base_branch),
) {
Ok(result) => result,
Err(e) => {
if let Ok(mut agents) = agents_ref.lock()
&& let Some(agent) = agents.get_mut(&key_clone)
{
agent.status = AgentStatus::Failed;
agent.completed_at = Some(Instant::now());
}
let _ = tx_clone.send(AgentEvent::Error {
story_id: sid.clone(),
agent_name: aname.clone(),
message: format!("Failed to render agent args: {e}"),
});
return;
}
};
// Append resume context if this is a restart with failure information.
if let Some(ctx) = resume_context_owned {
prompt.push_str(&ctx);
}
// Step 3: transition to Running now that the worktree is ready.
{
if let Ok(mut agents) = agents_ref.lock()
&& let Some(agent) = agents.get_mut(&key_clone)
{
agent.status = AgentStatus::Running;
}
}
let _ = tx_clone.send(AgentEvent::Status {
story_id: sid.clone(),
agent_name: aname.clone(),
status: "running".to_string(),
});
// Step 4: launch the agent process.
match run_agent_pty_streaming(
&sid, &aname, &command, &args, &prompt, &cwd, &tx_clone, &log_clone,
&sid,
&aname,
&command,
&args,
&prompt,
&wt_path_str,
&tx_clone,
&log_clone,
log_writer_clone,
inactivity_timeout_secs,
)
@@ -455,11 +521,10 @@ impl AgentPool {
}
});
// Update status to running with task handle
// Store the task handle while the agent is still Pending.
{
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
if let Some(agent) = agents.get_mut(&key) {
agent.status = AgentStatus::Running;
agent.task_handle = Some(handle);
}
}
@@ -470,10 +535,10 @@ impl AgentPool {
Ok(AgentInfo {
story_id: story_id.to_string(),
agent_name: resolved_name,
status: AgentStatus::Running,
status: AgentStatus::Pending,
session_id: None,
worktree_path: Some(wt_path_str),
base_branch: Some(wt_info.base_branch.clone()),
worktree_path: None,
base_branch: None,
completion: None,
log_session_id: Some(log_session_id),
})
@@ -1766,11 +1831,12 @@ fn check_orphaned_agents(agents: &Mutex<HashMap<String, StoryAgent>>) {
Err(_) => return,
};
// Collect orphaned entries: Running agents whose task handle is finished.
let orphaned: Vec<(String, String, broadcast::Sender<AgentEvent>)> = lock
// Collect orphaned entries: Running or Pending agents whose task handle is finished.
// Pending agents can be orphaned if worktree creation panics before setting status.
let orphaned: Vec<(String, String, broadcast::Sender<AgentEvent>, AgentStatus)> = lock
.iter()
.filter_map(|(key, agent)| {
if agent.status == AgentStatus::Running
if matches!(agent.status, AgentStatus::Running | AgentStatus::Pending)
&& let Some(handle) = &agent.task_handle
&& handle.is_finished()
{
@@ -1778,17 +1844,17 @@ fn check_orphaned_agents(agents: &Mutex<HashMap<String, StoryAgent>>) {
.rsplit_once(':')
.map(|(s, _)| s.to_string())
.unwrap_or_else(|| key.clone());
return Some((key.clone(), story_id, agent.tx.clone()));
return Some((key.clone(), story_id, agent.tx.clone(), agent.status.clone()));
}
None
})
.collect();
for (key, story_id, tx) in orphaned {
for (key, story_id, tx, prev_status) in orphaned {
if let Some(agent) = lock.get_mut(&key) {
agent.status = AgentStatus::Failed;
slog!(
"[watchdog] Orphaned agent '{key}': task finished but status was Running. \
"[watchdog] Orphaned agent '{key}': task finished but status was {prev_status}. \
Marking Failed."
);
let _ = tx.send(AgentEvent::Error {
@@ -4804,10 +4870,15 @@ stage = "qa"
// ── bug 118: pending entry cleanup on start_agent failure ────────────────
/// Regression test for bug 118: when `start_agent` fails (e.g. because
/// `create_worktree` cannot find a git repo), the Pending entry that was
/// inserted into the agent HashMap must be cleaned up so it does not
/// permanently block `find_free_agent_for_stage` / auto-assign.
/// Regression test for bug 118: when worktree creation fails (e.g. because
/// there is no git repo), the Pending entry that was inserted into the agent
/// HashMap must not remain Pending — it must transition to Failed. This
/// prevents `find_free_agent_for_stage` / auto-assign from being permanently
/// blocked.
///
/// With story 157 the worktree creation moved into the background spawn, so
/// `start_agent` returns `Ok(Pending)` immediately. We use `wait_for_agent`
/// to block until the background task resolves.
#[tokio::test]
async fn start_agent_cleans_up_pending_entry_on_failure() {
use std::fs;
@@ -4825,7 +4896,7 @@ stage = "qa"
.unwrap();
// Create the story in upcoming so `move_story_to_current` succeeds,
// but do NOT init a git repo — `create_worktree` will fail.
// but do NOT init a git repo — `create_worktree` will fail in the spawn.
let upcoming = root.join(".story_kit/work/1_upcoming");
fs::create_dir_all(&upcoming).unwrap();
fs::write(
@@ -4840,10 +4911,32 @@ stage = "qa"
.start_agent(root, "50_story_test", Some("qa"), None)
.await;
// The call must fail (no git repo for worktree creation).
assert!(result.is_err(), "start_agent should fail without a git repo");
// With the non-blocking flow, start_agent returns Ok(Pending) immediately.
// Worktree creation failure happens asynchronously in the background.
assert!(
result.is_ok(),
"start_agent should return Ok(Pending) immediately: {:?}",
result.err()
);
assert_eq!(
result.unwrap().status,
AgentStatus::Pending,
"initial status must be Pending"
);
// The pool must NOT retain a Pending entry for this agent.
// Wait for the background task to reach a terminal state.
// It must fail (no git repo → create_worktree returns an error).
let final_info = pool
.wait_for_agent("50_story_test", "qa", 5000)
.await
.expect("wait_for_agent should not time out");
assert_eq!(
final_info.status,
AgentStatus::Failed,
"agent must transition to Failed after worktree creation error"
);
// The pool must NOT retain a Pending or Running entry for this agent.
let agents = pool.agents.lock().unwrap();
let leaked = agents.values().any(|a| {
a.agent_name == "qa"
@@ -4851,7 +4944,7 @@ stage = "qa"
});
assert!(
!leaked,
"agent pool must not retain a Pending/Running entry after start_agent fails"
"agent pool must not retain a Pending/Running entry after worktree creation fails"
);
}