From 68af8c5ba96e727f49de55215990a8a8c219001c Mon Sep 17 00:00:00 2001 From: Dave Date: Tue, 24 Feb 2026 16:50:56 +0000 Subject: [PATCH] story-kit: merge 157_story_make_start_agent_non_blocking_by_deferring_worktree_creation Make start_agent non-blocking by deferring worktree creation. The agent spawn now returns immediately while worktree setup happens asynchronously, improving responsiveness of the start_agent MCP call. Co-Authored-By: Claude Opus 4.6 --- server/src/agents.rs | 181 ++++++++++++++++++++++++++++++++----------- 1 file changed, 137 insertions(+), 44 deletions(-) diff --git a/server/src/agents.rs b/server/src/agents.rs index aa76111..ede58be 100644 --- a/server/src/agents.rs +++ b/server/src/agents.rs @@ -376,52 +376,118 @@ impl AgentPool { // Move story from upcoming/ to current/ and auto-commit before creating the worktree. move_story_to_current(project_root, story_id)?; - // Create worktree - let wt_info = worktree::create_worktree(project_root, story_id, &config, self.port).await?; - - // Update with worktree info - { - let mut agents = self.agents.lock().map_err(|e| e.to_string())?; - if let Some(agent) = agents.get_mut(&key) { - agent.worktree_info = Some(wt_info.clone()); - } - } - - // Spawn the agent process - let wt_path_str = wt_info.path.to_string_lossy().to_string(); - let (command, args, mut prompt) = - config.render_agent_args(&wt_path_str, story_id, Some(&resolved_name), Some(&wt_info.base_branch))?; - - // Append resume context if this is a restart with failure information. - if let Some(ctx) = resume_context { - prompt.push_str(ctx); - } - - // Extract inactivity timeout from the agent config before moving config. + // Extract inactivity timeout from the agent config before cloning config. let inactivity_timeout_secs = config .find_agent(&resolved_name) .map(|a| a.inactivity_timeout_secs) .unwrap_or(300); + // Clone all values needed inside the background spawn. + let project_root_clone = project_root.to_path_buf(); + let config_clone = config.clone(); + let resume_context_owned = resume_context.map(str::to_string); let sid = story_id.to_string(); let aname = resolved_name.clone(); let tx_clone = tx.clone(); let agents_ref = self.agents.clone(); - let cwd = wt_path_str.clone(); let key_clone = key.clone(); let log_clone = event_log.clone(); let port_for_task = self.port; let log_writer_clone = log_writer.clone(); + // Spawn the background task. Worktree creation and agent launch happen here + // so `start_agent` returns immediately after registering the agent as + // Pending — non-blocking by design (story 157). let handle = tokio::spawn(async move { + // Step 1: create the worktree (slow — git checkout, pnpm install, etc.) + let wt_info = match worktree::create_worktree( + &project_root_clone, + &sid, + &config_clone, + port_for_task, + ) + .await + { + Ok(wt) => wt, + Err(e) => { + // Worktree creation failed — mark agent as Failed so the UI shows the error. + if let Ok(mut agents) = agents_ref.lock() + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.status = AgentStatus::Failed; + agent.completed_at = Some(Instant::now()); + } + let _ = tx_clone.send(AgentEvent::Error { + story_id: sid.clone(), + agent_name: aname.clone(), + message: format!("Failed to create worktree: {e}"), + }); + return; + } + }; + + // Step 2: store worktree info and render agent command/args/prompt. + let wt_path_str = wt_info.path.to_string_lossy().to_string(); + { + if let Ok(mut agents) = agents_ref.lock() + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.worktree_info = Some(wt_info.clone()); + } + } + + let (command, args, mut prompt) = match config_clone.render_agent_args( + &wt_path_str, + &sid, + Some(&aname), + Some(&wt_info.base_branch), + ) { + Ok(result) => result, + Err(e) => { + if let Ok(mut agents) = agents_ref.lock() + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.status = AgentStatus::Failed; + agent.completed_at = Some(Instant::now()); + } + let _ = tx_clone.send(AgentEvent::Error { + story_id: sid.clone(), + agent_name: aname.clone(), + message: format!("Failed to render agent args: {e}"), + }); + return; + } + }; + + // Append resume context if this is a restart with failure information. + if let Some(ctx) = resume_context_owned { + prompt.push_str(&ctx); + } + + // Step 3: transition to Running now that the worktree is ready. + { + if let Ok(mut agents) = agents_ref.lock() + && let Some(agent) = agents.get_mut(&key_clone) + { + agent.status = AgentStatus::Running; + } + } let _ = tx_clone.send(AgentEvent::Status { story_id: sid.clone(), agent_name: aname.clone(), status: "running".to_string(), }); + // Step 4: launch the agent process. match run_agent_pty_streaming( - &sid, &aname, &command, &args, &prompt, &cwd, &tx_clone, &log_clone, + &sid, + &aname, + &command, + &args, + &prompt, + &wt_path_str, + &tx_clone, + &log_clone, log_writer_clone, inactivity_timeout_secs, ) @@ -455,11 +521,10 @@ impl AgentPool { } }); - // Update status to running with task handle + // Store the task handle while the agent is still Pending. { let mut agents = self.agents.lock().map_err(|e| e.to_string())?; if let Some(agent) = agents.get_mut(&key) { - agent.status = AgentStatus::Running; agent.task_handle = Some(handle); } } @@ -470,10 +535,10 @@ impl AgentPool { Ok(AgentInfo { story_id: story_id.to_string(), agent_name: resolved_name, - status: AgentStatus::Running, + status: AgentStatus::Pending, session_id: None, - worktree_path: Some(wt_path_str), - base_branch: Some(wt_info.base_branch.clone()), + worktree_path: None, + base_branch: None, completion: None, log_session_id: Some(log_session_id), }) @@ -1766,11 +1831,12 @@ fn check_orphaned_agents(agents: &Mutex>) { Err(_) => return, }; - // Collect orphaned entries: Running agents whose task handle is finished. - let orphaned: Vec<(String, String, broadcast::Sender)> = lock + // Collect orphaned entries: Running or Pending agents whose task handle is finished. + // Pending agents can be orphaned if worktree creation panics before setting status. + let orphaned: Vec<(String, String, broadcast::Sender, AgentStatus)> = lock .iter() .filter_map(|(key, agent)| { - if agent.status == AgentStatus::Running + if matches!(agent.status, AgentStatus::Running | AgentStatus::Pending) && let Some(handle) = &agent.task_handle && handle.is_finished() { @@ -1778,17 +1844,17 @@ fn check_orphaned_agents(agents: &Mutex>) { .rsplit_once(':') .map(|(s, _)| s.to_string()) .unwrap_or_else(|| key.clone()); - return Some((key.clone(), story_id, agent.tx.clone())); + return Some((key.clone(), story_id, agent.tx.clone(), agent.status.clone())); } None }) .collect(); - for (key, story_id, tx) in orphaned { + for (key, story_id, tx, prev_status) in orphaned { if let Some(agent) = lock.get_mut(&key) { agent.status = AgentStatus::Failed; slog!( - "[watchdog] Orphaned agent '{key}': task finished but status was Running. \ + "[watchdog] Orphaned agent '{key}': task finished but status was {prev_status}. \ Marking Failed." ); let _ = tx.send(AgentEvent::Error { @@ -4804,10 +4870,15 @@ stage = "qa" // ── bug 118: pending entry cleanup on start_agent failure ──────────────── - /// Regression test for bug 118: when `start_agent` fails (e.g. because - /// `create_worktree` cannot find a git repo), the Pending entry that was - /// inserted into the agent HashMap must be cleaned up so it does not - /// permanently block `find_free_agent_for_stage` / auto-assign. + /// Regression test for bug 118: when worktree creation fails (e.g. because + /// there is no git repo), the Pending entry that was inserted into the agent + /// HashMap must not remain Pending — it must transition to Failed. This + /// prevents `find_free_agent_for_stage` / auto-assign from being permanently + /// blocked. + /// + /// With story 157 the worktree creation moved into the background spawn, so + /// `start_agent` returns `Ok(Pending)` immediately. We use `wait_for_agent` + /// to block until the background task resolves. #[tokio::test] async fn start_agent_cleans_up_pending_entry_on_failure() { use std::fs; @@ -4825,7 +4896,7 @@ stage = "qa" .unwrap(); // Create the story in upcoming so `move_story_to_current` succeeds, - // but do NOT init a git repo — `create_worktree` will fail. + // but do NOT init a git repo — `create_worktree` will fail in the spawn. let upcoming = root.join(".story_kit/work/1_upcoming"); fs::create_dir_all(&upcoming).unwrap(); fs::write( @@ -4840,10 +4911,32 @@ stage = "qa" .start_agent(root, "50_story_test", Some("qa"), None) .await; - // The call must fail (no git repo for worktree creation). - assert!(result.is_err(), "start_agent should fail without a git repo"); + // With the non-blocking flow, start_agent returns Ok(Pending) immediately. + // Worktree creation failure happens asynchronously in the background. + assert!( + result.is_ok(), + "start_agent should return Ok(Pending) immediately: {:?}", + result.err() + ); + assert_eq!( + result.unwrap().status, + AgentStatus::Pending, + "initial status must be Pending" + ); - // The pool must NOT retain a Pending entry for this agent. + // Wait for the background task to reach a terminal state. + // It must fail (no git repo → create_worktree returns an error). + let final_info = pool + .wait_for_agent("50_story_test", "qa", 5000) + .await + .expect("wait_for_agent should not time out"); + assert_eq!( + final_info.status, + AgentStatus::Failed, + "agent must transition to Failed after worktree creation error" + ); + + // The pool must NOT retain a Pending or Running entry for this agent. let agents = pool.agents.lock().unwrap(); let leaked = agents.values().any(|a| { a.agent_name == "qa" @@ -4851,7 +4944,7 @@ stage = "qa" }); assert!( !leaked, - "agent pool must not retain a Pending/Running entry after start_agent fails" + "agent pool must not retain a Pending/Running entry after worktree creation fails" ); }