//! Agent start — spawns a new agent process in a worktree for a given story.

#![allow(unused_imports, dead_code)]

use crate::agent_log::AgentLogWriter;
use crate::config::ProjectConfig;
use crate::slog_error;
use std::path::Path;
use std::sync::{Arc, Mutex};
use tokio::sync::broadcast;

use super::super::runtime::{
    AgentRuntime, ClaudeCodeRuntime, GeminiRuntime, OpenAiRuntime, RuntimeContext,
};
use super::super::{
    AgentEvent, AgentInfo, AgentStatus, PipelineStage, agent_config_stage, pipeline_stage,
};
use super::types::{PendingGuard, StoryAgent, composite_key};
use super::worktree::find_active_story_stage;
use super::{AgentPool, auto_assign};

mod spawn;
mod validation;

use validation::{read_front_matter_agent, validate_agent_stage};

impl AgentPool {
    /// Start an agent for a story: load config, create worktree, spawn agent.
    ///
    /// When `agent_name` is `None`, automatically selects the first idle coder
    /// agent (story 190). If all coders are busy the call fails with an error
    /// indicating the story will be picked up when one becomes available.
    ///
    /// If `resume_context` is provided and `session_id_to_resume` is `None`,
    /// the context is appended to the rendered prompt so the agent can pick up
    /// from a previous failed attempt.
    ///
    /// If `session_id_to_resume` is provided, the agent is launched with
    /// `--resume <session_id>` instead of `-p <full_prompt>`. Only
    /// `resume_context` (if any) is sent as the new message. This lets
    /// the agent re-enter the previous conversation without re-reading
    /// CLAUDE.md and README, satisfying story 543.
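    ///
    /// # Example
    ///
    /// A minimal call sketch (not a doctest; assumes an already-constructed
    /// `pool: AgentPool` and uses an illustrative story id):
    ///
    /// ```ignore
    /// let info = pool
    ///     .start_agent(Path::new("/path/to/project"), "368", None, None, None)
    ///     .await?;
    /// assert_eq!(info.status, AgentStatus::Pending);
    /// ```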
    pub async fn start_agent(
        &self,
        project_root: &Path,
        story_id: &str,
        agent_name: Option<&str>,
        resume_context: Option<&str>,
        session_id_to_resume: Option<String>,
    ) -> Result<AgentInfo, String> {
        let config = ProjectConfig::load(project_root)?;

        // Validate explicit agent name early (no lock needed).
        if let Some(name) = agent_name {
            config
                .find_agent(name)
                .ok_or_else(|| format!("No agent named '{name}' in config"))?;
        }

        // Create name-independent shared resources before the lock so they are
        // ready for the atomic check-and-insert (story 132).
        let (tx, _) = broadcast::channel::<AgentEvent>(1024);
        let event_log: Arc<Mutex<Vec<AgentEvent>>> = Arc::new(Mutex::new(Vec::new()));
        let log_session_id = uuid::Uuid::new_v4().to_string();

        // Create the per-session status buffer subscribed to this project's
        // broadcaster. On restart a fresh buffer replaces the old one,
        // giving each session an independent, clean subscription (story 735).
        let status_buffer =
            crate::service::status::buffer::StatusEventBuffer::new(&self.status_broadcaster);

        // Move story from backlog/ to current/ before checking agent
        // availability so that auto_assign_available_work can pick it up even
        // when all coders are currently busy (story 203). Only do this for
        // Coder-stage agents — QA and Mergemaster must attach to the story
        // at its existing stage (3_qa or 4_merge) and must NOT be demoted
        // back to 2_current/ on attach (bug 502). When `agent_name` is None
        // we are auto-selecting an idle coder, so still move.
        let starting_a_coder = agent_name
            .and_then(|n| config.find_agent(n).map(agent_config_stage))
            .map(|s| s == PipelineStage::Coder)
            .unwrap_or(true);
        if starting_a_coder {
            crate::agents::lifecycle::move_story_to_current(story_id)?;
        }

        // Validate that the agent's configured stage matches the story's
        // pipeline stage. (See validation::validate_agent_stage.)
        validate_agent_stage(&config, project_root, story_id, agent_name)?;

        // Read the preferred agent from the story's front matter before acquiring
        // the lock. (See validation::read_front_matter_agent.)
        let front_matter_agent: Option<String> = read_front_matter_agent(story_id, agent_name);

        // Atomically resolve agent name, check availability, and register as
        // Pending. When `agent_name` is `None` the first idle coder is
        // selected inside the lock so no TOCTOU race can occur between the
        // availability check and the Pending insert (story 132, story 190).
        //
        // The `PendingGuard` ensures that if any step below fails the entry is
        // removed from the pool so it does not permanently block auto-assign
        // (bug 118).
        let resolved_name: String;
        let key: String;
        // Buffered status events accumulated while the agent was idle. Drained
        // inside the lock (before the new entry replaces the old one) and
        // formatted as a `<recent-events>` block for prepending to the first
        // agent turn (story 736).
        let prior_events: Option<String>;
        {
            let mut agents = self.agents.lock().map_err(|e| e.to_string())?;

            resolved_name = match agent_name {
                Some(name) => name.to_string(),
                None => {
                    // Honour the `agent:` field in the story's front matter so that
                    // `start 368` after `assign 368 opus` picks the right agent
                    // (bug 379). Mirrors the auto_assign selection logic. The
                    // preference only applies when it names a Coder-stage agent.
                    let preferred = front_matter_agent.as_ref().filter(|pref| {
                        config
                            .find_agent(pref)
                            .map(|cfg| agent_config_stage(cfg) == PipelineStage::Coder)
                            .unwrap_or(false)
                    });
                    if let Some(pref) = preferred {
                        if auto_assign::is_agent_free(&agents, pref) {
                            pref.clone()
                        } else {
                            return Err(format!(
                                "Preferred agent '{pref}' from story front matter is busy; \
                                story '{story_id}' has been queued in work/2_current/ and will \
                                be auto-assigned when it becomes available"
                            ));
                        }
                    } else {
                        // No usable front-matter preference (absent or stage
                        // mismatch): fall back to any free coder.
                        auto_assign::find_free_agent_for_stage(
                            &config,
                            &agents,
                            &PipelineStage::Coder,
                        )
                        .map(|s| s.to_string())
                        .ok_or_else(|| {
                            if config
                                .agent
                                .iter()
                                .any(|a| agent_config_stage(a) == PipelineStage::Coder)
                            {
                                format!(
                                    "All coder agents are busy; story '{story_id}' has been \
                                    queued in work/2_current/ and will be auto-assigned when \
                                    one becomes available"
                                )
                            } else {
                                "No coder agent configured. Specify an agent_name explicitly."
                                    .to_string()
                            }
                        })?
                    }
                }
            };

            key = composite_key(story_id, &resolved_name);
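            // The composite key is "<story_id>:<agent_name>" (e.g. "368:opus"),
            // which is why the checks below recover the story id with
            // rsplit_once(':').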

            // Check for duplicate assignment (same story + same agent already active).
            if let Some(agent) = agents.get(&key)
                && (agent.status == AgentStatus::Running || agent.status == AgentStatus::Pending)
            {
                return Err(format!(
                    "Agent '{resolved_name}' for story '{story_id}' is already {}",
                    agent.status
                ));
            }

            // Enforce single-stage concurrency: reject if there is already a
            // Running/Pending agent at the same pipeline stage for this story.
            // This prevents two coders (or two QA/mergemaster agents) from
            // corrupting each other's work in the same worktree.
            // Applies to both explicit and auto-selected agents; the Other
            // stage (supervisors, unknown agents) is exempt.
            let resolved_stage = config
                .find_agent(&resolved_name)
                .map(agent_config_stage)
                .unwrap_or_else(|| pipeline_stage(&resolved_name));
            if resolved_stage != PipelineStage::Other
                && let Some(conflicting_name) = agents.iter().find_map(|(k, a)| {
                    let k_story = k.rsplit_once(':').map(|(s, _)| s).unwrap_or(k);
                    if k_story == story_id
                        && a.agent_name != resolved_name
                        && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
                    {
                        let a_stage = config
                            .find_agent(&a.agent_name)
                            .map(agent_config_stage)
                            .unwrap_or_else(|| pipeline_stage(&a.agent_name));
                        if a_stage == resolved_stage {
                            Some(a.agent_name.clone())
                        } else {
                            None
                        }
                    } else {
                        None
                    }
                })
            {
                return Err(format!(
                    "Cannot start '{resolved_name}' on story '{story_id}': \
                    '{conflicting_name}' is already active at the same pipeline stage"
                ));
            }

            // Enforce single-instance concurrency for explicitly-named agents:
            // if this agent is already running on any other story, reject.
            // Auto-selected agents are already guaranteed idle by
            // find_free_agent_for_stage, so this check is only needed for
            // explicit requests.
            if agent_name.is_some()
                && let Some(busy_story) = agents.iter().find_map(|(k, a)| {
                    if a.agent_name == resolved_name
                        && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
                    {
                        Some(
                            k.rsplit_once(':')
                                .map(|(sid, _)| sid)
                                .unwrap_or(k)
                                .to_string(),
                        )
                    } else {
                        None
                    }
                })
            {
                return Err(format!(
                    "Agent '{resolved_name}' is already running on story '{busy_story}'; \
                    story '{story_id}' will be picked up when the agent becomes available"
                ));
            }

            // Drain accumulated status events from the previous session before
            // replacing the entry with the new one. The drained items are
            // formatted and prepended to the first agent turn (story 736).
            prior_events = {
                let items = agents
                    .get(&key)
                    .and_then(|a| a.status_buffer.as_ref().map(|b| b.drain()))
                    .unwrap_or_default();
                crate::service::status::buffer::format_buffered_items(&items)
            };

            agents.insert(
                key.clone(),
                StoryAgent {
                    agent_name: resolved_name.clone(),
                    status: AgentStatus::Pending,
                    worktree_info: None,
                    session_id: None,
                    tx: tx.clone(),
                    task_handle: None,
                    event_log: event_log.clone(),
                    completion: None,
                    project_root: Some(project_root.to_path_buf()),
                    log_session_id: Some(log_session_id.clone()),
                    merge_failure_reported: false,
                    throttled: false,
                    termination_reason: None,
                    status_buffer: Some(status_buffer),
                },
            );
        }
        let mut pending_guard = PendingGuard::new(self.agents.clone(), key.clone());
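        // `PendingGuard` lives in super::types; conceptually it is a drop
        // guard along these lines (hypothetical sketch, not the real code):
        //
        //     impl Drop for PendingGuard {
        //         fn drop(&mut self) {
        //             if self.armed {
        //                 if let Ok(mut map) = self.agents.lock() {
        //                     map.remove(&self.key);
        //                 }
        //             }
        //         }
        //     }
        //
        // `disarm()` below flips `armed` off once spawning has succeeded, so
        // the Pending entry survives.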

        // Create persistent log writer (needs resolved_name, so must be after
        // the atomic resolution above).
        let log_writer =
            match AgentLogWriter::new(project_root, story_id, &resolved_name, &log_session_id) {
                Ok(w) => Some(Arc::new(Mutex::new(w))),
                Err(e) => {
                    eprintln!(
                        "[agents] Failed to create log writer for {story_id}:{resolved_name}: {e}"
                    );
                    None
                }
            };

        // Notify WebSocket clients that a new agent is pending.
        Self::notify_agent_state_changed(&self.watcher_tx);

        let _ = tx.send(AgentEvent::Status {
            story_id: story_id.to_string(),
            agent_name: resolved_name.clone(),
            status: "pending".to_string(),
        });
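        // The send result is deliberately ignored: a broadcast send only
        // errors when there are no subscribers yet, which is harmless for
        // this fire-and-forget status event.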

        // Extract inactivity timeout from the agent config before cloning config.
        let inactivity_timeout_secs = config
            .find_agent(&resolved_name)
            .map(|a| a.inactivity_timeout_secs)
            .unwrap_or(300);
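        // (Falls back to 300 s = 5 minutes if the agent is somehow absent
        // from the config.)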

        // If no explicit session_id_to_resume was provided, look up from the
        // persistent session store. The key includes the model so a model
        // change (e.g. sonnet → opus) produces a cache miss — intentional.
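        // Conceptually the store maps (story_id, agent_name, model) to a
        // session id.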
        let effective_session_id = session_id_to_resume.or_else(|| {
            let model = config
                .find_agent(&resolved_name)
                .and_then(|a| a.model.clone())
                .unwrap_or_default();
            crate::agents::session_store::lookup_session(
                project_root,
                story_id,
                &resolved_name,
                &model,
            )
        });

        // Spawn the background task, cloning everything it needs. Worktree
        // creation and agent launch happen there so `start_agent` returns
        // immediately after registering the agent as Pending — non-blocking
        // by design (story 157).
        let handle = tokio::spawn(spawn::run_agent_spawn(
            project_root.to_path_buf(),
            config.clone(),
            resume_context.map(str::to_string),
            effective_session_id,
            story_id.to_string(),
            resolved_name.clone(),
            tx.clone(),
            self.agents.clone(),
            key.clone(),
            event_log.clone(),
            self.port,
            log_writer.clone(),
            self.child_killers.clone(),
            self.watcher_tx.clone(),
            inactivity_timeout_secs,
            prior_events,
        ));
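        // Callers can follow progress without blocking on the spawn itself,
        // e.g. (sketch):
        //
        //     let mut rx = tx.subscribe();
        //     while let Ok(ev) = rx.recv().await {
        //         // forward each AgentEvent to the UI or logs
        //     }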

        // Store the task handle while the agent is still Pending.
        {
            let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
            if let Some(agent) = agents.get_mut(&key) {
                agent.task_handle = Some(handle);
            }
        }

        // Agent successfully spawned — prevent the guard from removing the entry.
        pending_guard.disarm();

        Ok(AgentInfo {
            story_id: story_id.to_string(),
            agent_name: resolved_name,
            status: AgentStatus::Pending,
            session_id: None,
            worktree_path: None,
            base_branch: None,
            completion: None,
            log_session_id: Some(log_session_id),
            throttled: false,
            termination_reason: None,
        })
    }
}

#[cfg(test)]
mod tests_concurrency;
#[cfg(test)]
mod tests_selection;