Spike 3: Sub-agent infrastructure fixes for multi-agent coordination

- Fix CLAUDECODE env var blocking nested Claude Code sessions
- Add drain-based event_log for reliable get_agent_output polling
- Add non-SSE get_agent_output fallback (critical for MCP tool calls)
- Preserve worktrees on agent stop instead of destroying work
- Reap zombie processes with child.wait() after kill
- Increase broadcast buffer from 256 to 1024
- Engineer supervisor and coder prompts in project.toml
- Point .mcp.json to test port 3002

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Dave
2026-02-20 11:57:25 +00:00
parent b089d314ba
commit db2d055f60
5 changed files with 161 additions and 46 deletions

View File

@@ -7,34 +7,55 @@ teardown = []
[[component]] [[component]]
name = "server" name = "server"
path = "." path = "."
setup = ["cargo check"] setup = ["mkdir -p frontend/dist", "cargo check"]
teardown = [] teardown = []
[[agent]] [[agent]]
name = "supervisor" name = "supervisor"
role = "Coordinates work, reviews PRs, decomposes stories." role = "Coordinates work, reviews PRs, decomposes stories."
model = "opus" model = "opus"
max_turns = 50 max_turns = 200
max_budget_usd = 10.00 max_budget_usd = 15.00
system_prompt = "You are a senior engineering lead. Coordinate the work, review code, and ensure quality." prompt = """You are the supervisor for story {{story_id}}. Your job is to coordinate coder agents to implement this story.
## Your MCP Tools
You have these tools via the story-kit MCP server:
- start_agent(story_id, agent_name) - Start a coder agent on a story
- get_agent_output(story_id, agent_name, timeout_ms) - Poll agent output (returns recent events, call repeatedly)
- list_agents() - See all running agents and their status
- stop_agent(story_id, agent_name) - Stop a running agent
- get_story_todos(story_id) - Get unchecked acceptance criteria for a story in current/
- ensure_acceptance(story_id) - Check if a story passes acceptance gates
## Your Workflow
1. Read the story file from .story_kit/stories/ to understand requirements
2. Move it to current/ if it is in upcoming/
3. Start coder-1 on the story: call start_agent with story_id="{{story_id}}" and agent_name="coder-1"
4. Monitor progress: call get_agent_output every 30-60 seconds to check on the agent
5. If the agent completes, review the worktree changes
6. If the agent gets stuck or fails, stop it and start a fresh agent
7. When the work looks complete, call ensure_acceptance to verify
## Rules
- Do NOT implement code yourself - delegate to coder agents
- Only run one coder at a time per story
- Focus on coordination, monitoring, and quality review"""
system_prompt = "You are a supervisor agent. Use MCP tools to coordinate sub-agents. Never implement code directly - always delegate to coder agents and monitor their progress."
[[agent]] [[agent]]
name = "coder-1" name = "coder-1"
role = "Full-stack engineer. Implements features across all components." role = "Full-stack engineer. Implements features across all components."
model = "sonnet" model = "sonnet"
max_turns = 30 max_turns = 50
max_budget_usd = 5.00 max_budget_usd = 5.00
prompt = "You are working in a git worktree on story {{story_id}}. Read CLAUDE.md first, then .story_kit/README.md to understand the dev process. Pick up the story from .story_kit/stories/ - move it to current/ if needed. Follow the SDTW process end-to-end. The worktree and feature branch already exist - do not create them. Check .mcp.json for MCP tools."
system_prompt = "You are a full-stack engineer working autonomously in a git worktree. Follow the Story-Driven Test Workflow strictly. Run cargo clippy and biome checks before considering work complete. Do not coordinate with other agents - focus on your assigned story."
[[agent]] [[agent]]
name = "coder-2" name = "coder-2"
role = "Full-stack engineer. Implements features across all components." role = "Full-stack engineer. Implements features across all components."
model = "sonnet" model = "sonnet"
max_turns = 30 max_turns = 50
max_budget_usd = 5.00 max_budget_usd = 5.00
prompt = "You are working in a git worktree on story {{story_id}}. Read CLAUDE.md first, then .story_kit/README.md to understand the dev process. Pick up the story from .story_kit/stories/ - move it to current/ if needed. Follow the SDTW process end-to-end. The worktree and feature branch already exist - do not create them. Check .mcp.json for MCP tools."
[[agent]] system_prompt = "You are a full-stack engineer working autonomously in a git worktree. Follow the Story-Driven Test Workflow strictly. Run cargo clippy and biome checks before considering work complete. Do not coordinate with other agents - focus on your assigned story."
name = "reviewer"
role = "Reviews code changes, runs tests, checks quality gates."
model = "sonnet"
max_turns = 20
max_budget_usd = 3.00

View File

@@ -82,10 +82,11 @@ struct StoryAgent {
agent_name: String, agent_name: String,
status: AgentStatus, status: AgentStatus,
worktree_info: Option<WorktreeInfo>, worktree_info: Option<WorktreeInfo>,
config: ProjectConfig,
session_id: Option<String>, session_id: Option<String>,
tx: broadcast::Sender<AgentEvent>, tx: broadcast::Sender<AgentEvent>,
task_handle: Option<tokio::task::JoinHandle<()>>, task_handle: Option<tokio::task::JoinHandle<()>>,
/// Accumulated events for polling via get_agent_output.
event_log: Arc<Mutex<Vec<AgentEvent>>>,
} }
/// Manages concurrent story agents, each in its own worktree. /// Manages concurrent story agents, each in its own worktree.
@@ -140,7 +141,9 @@ impl AgentPool {
} }
} }
let (tx, _) = broadcast::channel::<AgentEvent>(256); let (tx, _) = broadcast::channel::<AgentEvent>(1024);
let event_log: Arc<Mutex<Vec<AgentEvent>>> = Arc::new(Mutex::new(Vec::new()));
// Register as pending // Register as pending
{ {
@@ -151,10 +154,10 @@ impl AgentPool {
agent_name: resolved_name.clone(), agent_name: resolved_name.clone(),
status: AgentStatus::Pending, status: AgentStatus::Pending,
worktree_info: None, worktree_info: None,
config: config.clone(),
session_id: None, session_id: None,
tx: tx.clone(), tx: tx.clone(),
task_handle: None, task_handle: None,
event_log: event_log.clone(),
}, },
); );
} }
@@ -187,6 +190,7 @@ impl AgentPool {
let agents_ref = self.agents.clone(); let agents_ref = self.agents.clone();
let cwd = wt_path_str.clone(); let cwd = wt_path_str.clone();
let key_clone = key.clone(); let key_clone = key.clone();
let log_clone = event_log.clone();
let handle = tokio::spawn(async move { let handle = tokio::spawn(async move {
let _ = tx_clone.send(AgentEvent::Status { let _ = tx_clone.send(AgentEvent::Status {
@@ -195,7 +199,9 @@ impl AgentPool {
status: "running".to_string(), status: "running".to_string(),
}); });
match run_agent_pty_streaming(&sid, &aname, &command, &args, &prompt, &cwd, &tx_clone) match run_agent_pty_streaming(
&sid, &aname, &command, &args, &prompt, &cwd, &tx_clone, &log_clone,
)
.await .await
{ {
Ok(session_id) => { Ok(session_id) => {
@@ -244,27 +250,26 @@ impl AgentPool {
}) })
} }
/// Stop a running agent and clean up its worktree. /// Stop a running agent. Worktree is preserved for inspection.
pub async fn stop_agent( pub async fn stop_agent(
&self, &self,
project_root: &Path, _project_root: &Path,
story_id: &str, story_id: &str,
agent_name: &str, agent_name: &str,
) -> Result<(), String> { ) -> Result<(), String> {
let key = composite_key(story_id, agent_name); let key = composite_key(story_id, agent_name);
let (worktree_info, config, task_handle, tx) = { let (worktree_info, task_handle, tx) = {
let mut agents = self.agents.lock().map_err(|e| e.to_string())?; let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
let agent = agents let agent = agents
.get_mut(&key) .get_mut(&key)
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
let wt = agent.worktree_info.clone(); let wt = agent.worktree_info.clone();
let cfg = agent.config.clone();
let handle = agent.task_handle.take(); let handle = agent.task_handle.take();
let tx = agent.tx.clone(); let tx = agent.tx.clone();
agent.status = AgentStatus::Failed; agent.status = AgentStatus::Failed;
(wt, cfg, handle, tx) (wt, handle, tx)
}; };
// Abort the task // Abort the task
@@ -273,11 +278,12 @@ impl AgentPool {
let _ = handle.await; let _ = handle.await;
} }
// Remove worktree // Preserve worktree for inspection — don't destroy agent's work on stop.
if let Some(ref wt) = worktree_info if let Some(ref wt) = worktree_info {
&& let Err(e) = worktree::remove_worktree(project_root, wt, &config).await eprintln!(
{ "[agents] Worktree preserved for {story_id}:{agent_name}: {}",
eprintln!("[agents] Worktree cleanup warning for {story_id}:{agent_name}: {e}"); wt.path.display()
);
} }
let _ = tx.send(AgentEvent::Status { let _ = tx.send(AgentEvent::Status {
@@ -334,6 +340,21 @@ impl AgentPool {
Ok(agent.tx.subscribe()) Ok(agent.tx.subscribe())
} }
/// Remove and return every event accumulated since the previous drain.
///
/// Backs the polling-based `get_agent_output` path: each call hands the
/// caller the backlog and leaves the log empty for the next poll.
/// Errors if the agent is unknown or a registry/log mutex is poisoned.
pub fn drain_events(
    &self,
    story_id: &str,
    agent_name: &str,
) -> Result<Vec<AgentEvent>, String> {
    let composite = composite_key(story_id, agent_name);
    let registry = self.agents.lock().map_err(|e| e.to_string())?;
    let entry = match registry.get(&composite) {
        Some(agent) => agent,
        None => return Err(format!("No agent '{agent_name}' for story '{story_id}'")),
    };
    let mut pending = entry.event_log.lock().map_err(|e| e.to_string())?;
    // Swap the whole buffer out in one move; equivalent to drain(..).collect().
    Ok(std::mem::take(&mut *pending))
}
/// Get project root helper. /// Get project root helper.
pub fn get_project_root( pub fn get_project_root(
&self, &self,
@@ -344,6 +365,7 @@ impl AgentPool {
} }
/// Spawn claude agent in a PTY and stream events through the broadcast channel. /// Spawn claude agent in a PTY and stream events through the broadcast channel.
#[allow(clippy::too_many_arguments)]
async fn run_agent_pty_streaming( async fn run_agent_pty_streaming(
story_id: &str, story_id: &str,
agent_name: &str, agent_name: &str,
@@ -352,6 +374,7 @@ async fn run_agent_pty_streaming(
prompt: &str, prompt: &str,
cwd: &str, cwd: &str,
tx: &broadcast::Sender<AgentEvent>, tx: &broadcast::Sender<AgentEvent>,
event_log: &Arc<Mutex<Vec<AgentEvent>>>,
) -> Result<Option<String>, String> { ) -> Result<Option<String>, String> {
let sid = story_id.to_string(); let sid = story_id.to_string();
let aname = agent_name.to_string(); let aname = agent_name.to_string();
@@ -360,14 +383,28 @@ async fn run_agent_pty_streaming(
let prompt = prompt.to_string(); let prompt = prompt.to_string();
let cwd = cwd.to_string(); let cwd = cwd.to_string();
let tx = tx.clone(); let tx = tx.clone();
let event_log = event_log.clone();
tokio::task::spawn_blocking(move || { tokio::task::spawn_blocking(move || {
run_agent_pty_blocking(&sid, &aname, &cmd, &args, &prompt, &cwd, &tx) run_agent_pty_blocking(&sid, &aname, &cmd, &args, &prompt, &cwd, &tx, &event_log)
}) })
.await .await
.map_err(|e| format!("Agent task panicked: {e}"))? .map_err(|e| format!("Agent task panicked: {e}"))?
} }
/// Record an event in the poll log, then fan it out to live subscribers.
///
/// Both halves are best-effort: a poisoned log mutex skips the append
/// rather than panicking, and a broadcast with no active receivers is
/// silently ignored.
fn emit_event(
    event: AgentEvent,
    tx: &broadcast::Sender<AgentEvent>,
    event_log: &Mutex<Vec<AgentEvent>>,
) {
    // Append a copy for pollers first so drain-based readers never miss
    // an event that a live subscriber already saw.
    let _ = event_log
        .lock()
        .map(|mut buffered| buffered.push(event.clone()));
    let _ = tx.send(event);
}
#[allow(clippy::too_many_arguments)]
fn run_agent_pty_blocking( fn run_agent_pty_blocking(
story_id: &str, story_id: &str,
agent_name: &str, agent_name: &str,
@@ -376,6 +413,7 @@ fn run_agent_pty_blocking(
prompt: &str, prompt: &str,
cwd: &str, cwd: &str,
tx: &broadcast::Sender<AgentEvent>, tx: &broadcast::Sender<AgentEvent>,
event_log: &Mutex<Vec<AgentEvent>>,
) -> Result<Option<String>, String> { ) -> Result<Option<String>, String> {
let pty_system = native_pty_system(); let pty_system = native_pty_system();
@@ -410,6 +448,10 @@ fn run_agent_pty_blocking(
cmd.cwd(cwd); cmd.cwd(cwd);
cmd.env("NO_COLOR", "1"); cmd.env("NO_COLOR", "1");
// Allow spawning Claude Code from within a Claude Code session
cmd.env_remove("CLAUDECODE");
cmd.env_remove("CLAUDE_CODE_ENTRYPOINT");
eprintln!("[agent:{story_id}:{agent_name}] Spawning {command} in {cwd} with args: {args:?}"); eprintln!("[agent:{story_id}:{agent_name}] Spawning {command} in {cwd} with args: {args:?}");
let mut child = pair let mut child = pair
@@ -445,11 +487,15 @@ fn run_agent_pty_blocking(
Ok(j) => j, Ok(j) => j,
Err(_) => { Err(_) => {
// Non-JSON output (terminal escapes etc.) — send as raw output // Non-JSON output (terminal escapes etc.) — send as raw output
let _ = tx.send(AgentEvent::Output { emit_event(
AgentEvent::Output {
story_id: story_id.to_string(), story_id: story_id.to_string(),
agent_name: agent_name.to_string(), agent_name: agent_name.to_string(),
text: trimmed.to_string(), text: trimmed.to_string(),
}); },
tx,
event_log,
);
continue; continue;
} }
}; };
@@ -469,11 +515,15 @@ fn run_agent_pty_blocking(
{ {
for block in content { for block in content {
if let Some(text) = block.get("text").and_then(|t| t.as_str()) { if let Some(text) = block.get("text").and_then(|t| t.as_str()) {
let _ = tx.send(AgentEvent::Output { emit_event(
AgentEvent::Output {
story_id: story_id.to_string(), story_id: story_id.to_string(),
agent_name: agent_name.to_string(), agent_name: agent_name.to_string(),
text: text.to_string(), text: text.to_string(),
}); },
tx,
event_log,
);
} }
} }
} }
@@ -482,14 +532,19 @@ fn run_agent_pty_blocking(
} }
// Forward all JSON events // Forward all JSON events
let _ = tx.send(AgentEvent::AgentJson { emit_event(
AgentEvent::AgentJson {
story_id: story_id.to_string(), story_id: story_id.to_string(),
agent_name: agent_name.to_string(), agent_name: agent_name.to_string(),
data: json, data: json,
}); },
tx,
event_log,
);
} }
let _ = child.kill(); let _ = child.kill();
let _ = child.wait();
eprintln!( eprintln!(
"[agent:{story_id}:{agent_name}] Done. Session: {:?}", "[agent:{story_id}:{agent_name}] Done. Session: {:?}",

View File

@@ -11,6 +11,7 @@ pub struct ProjectConfig {
} }
#[derive(Debug, Clone, Deserialize)] #[derive(Debug, Clone, Deserialize)]
#[allow(dead_code)]
pub struct ComponentConfig { pub struct ComponentConfig {
pub name: String, pub name: String,
#[serde(default = "default_path")] #[serde(default = "default_path")]

View File

@@ -438,7 +438,7 @@ fn handle_tools_list(id: Option<Value>) -> JsonRpcResponse {
}, },
{ {
"name": "stop_agent", "name": "stop_agent",
"description": "Stop a running agent and clean up its worktree.", "description": "Stop a running agent. Worktree is preserved for inspection.",
"inputSchema": { "inputSchema": {
"type": "object", "type": "object",
"properties": { "properties": {
@@ -532,7 +532,7 @@ async fn handle_tools_call(
"list_agents" => tool_list_agents(ctx), "list_agents" => tool_list_agents(ctx),
"get_agent_config" => tool_get_agent_config(ctx), "get_agent_config" => tool_get_agent_config(ctx),
"reload_agent_config" => tool_get_agent_config(ctx), "reload_agent_config" => tool_get_agent_config(ctx),
"get_agent_output" => Err("get_agent_output requires Accept: text/event-stream for SSE streaming".into()), "get_agent_output" => tool_get_agent_output_poll(&args, ctx).await,
_ => Err(format!("Unknown tool: {tool_name}")), _ => Err(format!("Unknown tool: {tool_name}")),
}; };
@@ -737,6 +737,40 @@ fn tool_list_agents(ctx: &AppContext) -> Result<String, String> {
.map_err(|e| format!("Serialization error: {e}")) .map_err(|e| format!("Serialization error: {e}"))
} }
async fn tool_get_agent_output_poll(args: &Value, ctx: &AppContext) -> Result<String, String> {
let story_id = args
.get("story_id")
.and_then(|v| v.as_str())
.ok_or("Missing required argument: story_id")?;
let agent_name = args
.get("agent_name")
.and_then(|v| v.as_str())
.ok_or("Missing required argument: agent_name")?;
// Drain all accumulated events since the last poll.
let drained = ctx.agents.drain_events(story_id, agent_name)?;
let done = drained.iter().any(|e| {
matches!(
e,
crate::agents::AgentEvent::Done { .. } | crate::agents::AgentEvent::Error { .. }
)
});
let events: Vec<serde_json::Value> = drained
.into_iter()
.filter_map(|e| serde_json::to_value(&e).ok())
.collect();
serde_json::to_string_pretty(&json!({
"events": events,
"done": done,
"event_count": events.len(),
"message": if done { "Agent stream ended." } else if events.is_empty() { "No new events. Call again to continue." } else { "Events returned. Call again to continue." }
}))
.map_err(|e| format!("Serialization error: {e}"))
}
fn tool_get_agent_config(ctx: &AppContext) -> Result<String, String> { fn tool_get_agent_config(ctx: &AppContext) -> Result<String, String> {
let project_root = ctx.agents.get_project_root(&ctx.state)?; let project_root = ctx.agents.get_project_root(&ctx.state)?;
let config = ProjectConfig::load(&project_root)?; let config = ProjectConfig::load(&project_root)?;

View File

@@ -3,6 +3,7 @@ use std::path::{Path, PathBuf};
use std::process::Command; use std::process::Command;
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct WorktreeInfo { pub struct WorktreeInfo {
pub path: PathBuf, pub path: PathBuf,
pub branch: String, pub branch: String,
@@ -104,6 +105,7 @@ fn create_worktree_sync(
} }
/// Remove a git worktree and its branch. /// Remove a git worktree and its branch.
#[allow(dead_code)]
pub async fn remove_worktree( pub async fn remove_worktree(
project_root: &Path, project_root: &Path,
info: &WorktreeInfo, info: &WorktreeInfo,
@@ -120,6 +122,7 @@ pub async fn remove_worktree(
.map_err(|e| format!("spawn_blocking: {e}"))? .map_err(|e| format!("spawn_blocking: {e}"))?
} }
#[allow(dead_code)]
fn remove_worktree_sync( fn remove_worktree_sync(
project_root: &Path, project_root: &Path,
wt_path: &Path, wt_path: &Path,
@@ -161,6 +164,7 @@ async fn run_setup_commands(wt_path: &Path, config: &ProjectConfig) -> Result<()
Ok(()) Ok(())
} }
#[allow(dead_code)]
async fn run_teardown_commands(wt_path: &Path, config: &ProjectConfig) -> Result<(), String> { async fn run_teardown_commands(wt_path: &Path, config: &ProjectConfig) -> Result<(), String> {
for component in &config.component { for component in &config.component {
let cmd_dir = wt_path.join(&component.path); let cmd_dir = wt_path.join(&component.path);