story-kit: merge 157_story_make_start_agent_non_blocking_by_deferring_worktree_creation

Make start_agent non-blocking by deferring worktree creation. The
start_agent MCP call now returns Ok with status Pending immediately,
while worktree setup and the agent process launch happen asynchronously
in a background task, improving responsiveness of the start_agent call.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in: (branch list omitted)
Author: Dave
Date:   2026-02-24 16:50:56 +00:00
Parent: 71502507cc
Commit: 68af8c5ba9

View File

@@ -376,52 +376,118 @@ impl AgentPool {
// Move story from upcoming/ to current/ and auto-commit before creating the worktree.
move_story_to_current(project_root, story_id)?;
// Create worktree
let wt_info = worktree::create_worktree(project_root, story_id, &config, self.port).await?;
// Update with worktree info
{
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
if let Some(agent) = agents.get_mut(&key) {
agent.worktree_info = Some(wt_info.clone());
}
}
// Spawn the agent process
let wt_path_str = wt_info.path.to_string_lossy().to_string();
let (command, args, mut prompt) =
config.render_agent_args(&wt_path_str, story_id, Some(&resolved_name), Some(&wt_info.base_branch))?;
// Append resume context if this is a restart with failure information.
if let Some(ctx) = resume_context {
prompt.push_str(ctx);
}
// Extract inactivity timeout from the agent config before moving config.
// Extract inactivity timeout from the agent config before cloning config.
let inactivity_timeout_secs = config
.find_agent(&resolved_name)
.map(|a| a.inactivity_timeout_secs)
.unwrap_or(300);
// Clone all values needed inside the background spawn.
let project_root_clone = project_root.to_path_buf();
let config_clone = config.clone();
let resume_context_owned = resume_context.map(str::to_string);
let sid = story_id.to_string();
let aname = resolved_name.clone();
let tx_clone = tx.clone();
let agents_ref = self.agents.clone();
let cwd = wt_path_str.clone();
let key_clone = key.clone();
let log_clone = event_log.clone();
let port_for_task = self.port;
let log_writer_clone = log_writer.clone();
// Spawn the background task. Worktree creation and agent launch happen here
// so `start_agent` returns immediately after registering the agent as
// Pending — non-blocking by design (story 157).
let handle = tokio::spawn(async move {
// Step 1: create the worktree (slow — git checkout, pnpm install, etc.)
let wt_info = match worktree::create_worktree(
&project_root_clone,
&sid,
&config_clone,
port_for_task,
)
.await
{
Ok(wt) => wt,
Err(e) => {
// Worktree creation failed — mark agent as Failed so the UI shows the error.
if let Ok(mut agents) = agents_ref.lock()
&& let Some(agent) = agents.get_mut(&key_clone)
{
agent.status = AgentStatus::Failed;
agent.completed_at = Some(Instant::now());
}
let _ = tx_clone.send(AgentEvent::Error {
story_id: sid.clone(),
agent_name: aname.clone(),
message: format!("Failed to create worktree: {e}"),
});
return;
}
};
// Step 2: store worktree info and render agent command/args/prompt.
let wt_path_str = wt_info.path.to_string_lossy().to_string();
{
if let Ok(mut agents) = agents_ref.lock()
&& let Some(agent) = agents.get_mut(&key_clone)
{
agent.worktree_info = Some(wt_info.clone());
}
}
let (command, args, mut prompt) = match config_clone.render_agent_args(
&wt_path_str,
&sid,
Some(&aname),
Some(&wt_info.base_branch),
) {
Ok(result) => result,
Err(e) => {
if let Ok(mut agents) = agents_ref.lock()
&& let Some(agent) = agents.get_mut(&key_clone)
{
agent.status = AgentStatus::Failed;
agent.completed_at = Some(Instant::now());
}
let _ = tx_clone.send(AgentEvent::Error {
story_id: sid.clone(),
agent_name: aname.clone(),
message: format!("Failed to render agent args: {e}"),
});
return;
}
};
// Append resume context if this is a restart with failure information.
if let Some(ctx) = resume_context_owned {
prompt.push_str(&ctx);
}
// Step 3: transition to Running now that the worktree is ready.
{
if let Ok(mut agents) = agents_ref.lock()
&& let Some(agent) = agents.get_mut(&key_clone)
{
agent.status = AgentStatus::Running;
}
}
let _ = tx_clone.send(AgentEvent::Status {
story_id: sid.clone(),
agent_name: aname.clone(),
status: "running".to_string(),
});
// Step 4: launch the agent process.
match run_agent_pty_streaming(
&sid, &aname, &command, &args, &prompt, &cwd, &tx_clone, &log_clone,
&sid,
&aname,
&command,
&args,
&prompt,
&wt_path_str,
&tx_clone,
&log_clone,
log_writer_clone,
inactivity_timeout_secs,
)
@@ -455,11 +521,10 @@ impl AgentPool {
}
});
// Update status to running with task handle
// Store the task handle while the agent is still Pending.
{
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
if let Some(agent) = agents.get_mut(&key) {
agent.status = AgentStatus::Running;
agent.task_handle = Some(handle);
}
}
@@ -470,10 +535,10 @@ impl AgentPool {
Ok(AgentInfo {
story_id: story_id.to_string(),
agent_name: resolved_name,
status: AgentStatus::Running,
status: AgentStatus::Pending,
session_id: None,
worktree_path: Some(wt_path_str),
base_branch: Some(wt_info.base_branch.clone()),
worktree_path: None,
base_branch: None,
completion: None,
log_session_id: Some(log_session_id),
})
@@ -1766,11 +1831,12 @@ fn check_orphaned_agents(agents: &Mutex<HashMap<String, StoryAgent>>) {
Err(_) => return,
};
// Collect orphaned entries: Running agents whose task handle is finished.
let orphaned: Vec<(String, String, broadcast::Sender<AgentEvent>)> = lock
// Collect orphaned entries: Running or Pending agents whose task handle is finished.
// Pending agents can be orphaned if worktree creation panics before setting status.
let orphaned: Vec<(String, String, broadcast::Sender<AgentEvent>, AgentStatus)> = lock
.iter()
.filter_map(|(key, agent)| {
if agent.status == AgentStatus::Running
if matches!(agent.status, AgentStatus::Running | AgentStatus::Pending)
&& let Some(handle) = &agent.task_handle
&& handle.is_finished()
{
@@ -1778,17 +1844,17 @@ fn check_orphaned_agents(agents: &Mutex<HashMap<String, StoryAgent>>) {
.rsplit_once(':')
.map(|(s, _)| s.to_string())
.unwrap_or_else(|| key.clone());
return Some((key.clone(), story_id, agent.tx.clone()));
return Some((key.clone(), story_id, agent.tx.clone(), agent.status.clone()));
}
None
})
.collect();
for (key, story_id, tx) in orphaned {
for (key, story_id, tx, prev_status) in orphaned {
if let Some(agent) = lock.get_mut(&key) {
agent.status = AgentStatus::Failed;
slog!(
"[watchdog] Orphaned agent '{key}': task finished but status was Running. \
"[watchdog] Orphaned agent '{key}': task finished but status was {prev_status}. \
Marking Failed."
);
let _ = tx.send(AgentEvent::Error {
@@ -4804,10 +4870,15 @@ stage = "qa"
// ── bug 118: pending entry cleanup on start_agent failure ────────────────
/// Regression test for bug 118: when `start_agent` fails (e.g. because
/// `create_worktree` cannot find a git repo), the Pending entry that was
/// inserted into the agent HashMap must be cleaned up so it does not
/// permanently block `find_free_agent_for_stage` / auto-assign.
/// Regression test for bug 118: when worktree creation fails (e.g. because
/// there is no git repo), the Pending entry that was inserted into the agent
/// HashMap must not remain Pending — it must transition to Failed. This
/// prevents `find_free_agent_for_stage` / auto-assign from being permanently
/// blocked.
///
/// With story 157 the worktree creation moved into the background spawn, so
/// `start_agent` returns `Ok(Pending)` immediately. We use `wait_for_agent`
/// to block until the background task resolves.
#[tokio::test]
async fn start_agent_cleans_up_pending_entry_on_failure() {
use std::fs;
@@ -4825,7 +4896,7 @@ stage = "qa"
.unwrap();
// Create the story in upcoming so `move_story_to_current` succeeds,
// but do NOT init a git repo — `create_worktree` will fail.
// but do NOT init a git repo — `create_worktree` will fail in the spawn.
let upcoming = root.join(".story_kit/work/1_upcoming");
fs::create_dir_all(&upcoming).unwrap();
fs::write(
@@ -4840,10 +4911,32 @@ stage = "qa"
.start_agent(root, "50_story_test", Some("qa"), None)
.await;
// The call must fail (no git repo for worktree creation).
assert!(result.is_err(), "start_agent should fail without a git repo");
// With the non-blocking flow, start_agent returns Ok(Pending) immediately.
// Worktree creation failure happens asynchronously in the background.
assert!(
result.is_ok(),
"start_agent should return Ok(Pending) immediately: {:?}",
result.err()
);
assert_eq!(
result.unwrap().status,
AgentStatus::Pending,
"initial status must be Pending"
);
// The pool must NOT retain a Pending entry for this agent.
// Wait for the background task to reach a terminal state.
// It must fail (no git repo → create_worktree returns an error).
let final_info = pool
.wait_for_agent("50_story_test", "qa", 5000)
.await
.expect("wait_for_agent should not time out");
assert_eq!(
final_info.status,
AgentStatus::Failed,
"agent must transition to Failed after worktree creation error"
);
// The pool must NOT retain a Pending or Running entry for this agent.
let agents = pool.agents.lock().unwrap();
let leaked = agents.values().any(|a| {
a.agent_name == "qa"
@@ -4851,7 +4944,7 @@ stage = "qa"
});
assert!(
!leaked,
"agent pool must not retain a Pending/Running entry after start_agent fails"
"agent pool must not retain a Pending/Running entry after worktree creation fails"
);
}