//! Agent start — spawns a new agent process in a worktree for a given story.
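//!
//! Flow: the agent name is resolved and the agent is registered as
//! `Pending` under the pool lock; worktree creation and the actual launch
//! then run in a spawned background task (see `spawn`), so callers return
//! immediately.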
#![allow(unused_imports, dead_code)]
use crate::agent_log::AgentLogWriter;
use crate::config::ProjectConfig;
use crate::slog_error;
use std::path::Path;
use std::sync::{Arc, Mutex};
use tokio::sync::broadcast;
use super::super::runtime::{
AgentRuntime, ClaudeCodeRuntime, GeminiRuntime, OpenAiRuntime, RuntimeContext,
};
use super::super::{
AgentEvent, AgentInfo, AgentStatus, PipelineStage, agent_config_stage, pipeline_stage,
};
use super::types::{PendingGuard, StoryAgent, composite_key};
use super::worktree::find_active_story_stage;
use super::{AgentPool, auto_assign};
mod spawn;
mod validation;
use validation::{read_front_matter_agent, validate_agent_stage};
impl AgentPool {
/// Start an agent for a story: load config, create worktree, spawn agent.
///
    /// When `agent_name` is `None`, automatically selects the first idle coder
    /// agent (story 190). If all coders are busy, the call fails with an error
    /// indicating that the story will be picked up when one becomes available.
///
/// If `resume_context` is provided and `session_id_to_resume` is `None`,
/// the context is appended to the rendered prompt so the agent can pick up
/// from a previous failed attempt.
///
/// If `session_id_to_resume` is provided, the agent is launched with
/// `--resume <session_id>` instead of `-p <full_prompt>`. Only
/// `resume_context` (if any) is sent as the new message. This lets
/// the agent re-enter the previous conversation without re-reading
/// CLAUDE.md and README, satisfying story 543.
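    ///
    /// # Example
    ///
    /// A minimal usage sketch; the pool handle, project path, and story id
    /// below are illustrative placeholders:
    ///
    /// ```ignore
    /// // Auto-select an idle coder for the story, with no resume state.
    /// let info = pool
    ///     .start_agent(Path::new("/repo"), "123", None, None, None)
    ///     .await?;
    /// assert_eq!(info.status, AgentStatus::Pending);
    /// ```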
pub async fn start_agent(
&self,
project_root: &Path,
story_id: &str,
agent_name: Option<&str>,
resume_context: Option<&str>,
session_id_to_resume: Option<String>,
) -> Result<AgentInfo, String> {
let config = ProjectConfig::load(project_root)?;
// Validate explicit agent name early (no lock needed).
if let Some(name) = agent_name {
config
.find_agent(name)
.ok_or_else(|| format!("No agent named '{name}' in config"))?;
}
// Create name-independent shared resources before the lock so they are
// ready for the atomic check-and-insert (story 132).
let (tx, _) = broadcast::channel::<AgentEvent>(1024);
let event_log: Arc<Mutex<Vec<AgentEvent>>> = Arc::new(Mutex::new(Vec::new()));
let log_session_id = uuid::Uuid::new_v4().to_string();
// Create the per-session status buffer subscribed to this project's
// broadcaster. On restart a fresh buffer replaces the old one,
// giving each session an independent, clean subscription (story 735).
let status_buffer =
crate::service::status::buffer::StatusEventBuffer::new(&self.status_broadcaster);
// Move story from backlog/ to current/ before checking agent
// availability so that auto_assign_available_work can pick it up even
// when all coders are currently busy (story 203). Only do this for
// Coder-stage agents — QA and Mergemaster must attach to the story
// at its existing stage (3_qa or 4_merge) and must NOT be demoted
// back to 2_current/ on attach (bug 502). When `agent_name` is None
// we are auto-selecting an idle coder, so still move.
let starting_a_coder = agent_name
.and_then(|n| config.find_agent(n).map(agent_config_stage))
.map(|s| s == PipelineStage::Coder)
.unwrap_or(true);
if starting_a_coder {
crate::agents::lifecycle::move_story_to_current(project_root, story_id)?;
}
// Validate that the agent's configured stage matches the story's
// pipeline stage. (See validation::validate_agent_stage.)
validate_agent_stage(&config, project_root, story_id, agent_name)?;
// Read the preferred agent from the story's front matter before acquiring
// the lock. (See validation::read_front_matter_agent.)
let front_matter_agent: Option<String> = read_front_matter_agent(story_id, agent_name);
// Atomically resolve agent name, check availability, and register as
// Pending. When `agent_name` is `None` the first idle coder is
// selected inside the lock so no TOCTOU race can occur between the
// availability check and the Pending insert (story 132, story 190).
//
// The `PendingGuard` ensures that if any step below fails the entry is
// removed from the pool so it does not permanently block auto-assign
// (bug 118).
let resolved_name: String;
let key: String;
// Buffered status events accumulated while the agent was idle. Drained
// inside the lock (before the new entry replaces the old one) and
// formatted as a `<recent-events>` block for prepending to the first
// agent turn (story 736).
let prior_events: Option<String>;
{
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
resolved_name = match agent_name {
Some(name) => name.to_string(),
                None => {
                    // Shared fallback: pick any idle coder, or report why none
                    // is available. Used both when the story has no front-matter
                    // preference and when the preferred agent's stage does not
                    // match.
                    let pick_free_coder = || {
                        auto_assign::find_free_agent_for_stage(
                            &config,
                            &agents,
                            &PipelineStage::Coder,
                        )
                        .map(|s| s.to_string())
                        .ok_or_else(|| {
                            if config
                                .agent
                                .iter()
                                .any(|a| agent_config_stage(a) == PipelineStage::Coder)
                            {
                                format!(
                                    "All coder agents are busy; story '{story_id}' has been \
                                     queued in work/2_current/ and will be auto-assigned when \
                                     one becomes available"
                                )
                            } else {
                                "No coder agent configured. Specify an agent_name explicitly."
                                    .to_string()
                            }
                        })
                    };
                    // Honour the `agent:` field in the story's front matter so that
                    // `start 368` after `assign 368 opus` picks the right agent
                    // (bug 379). Mirrors the auto_assign selection logic.
                    if let Some(ref pref) = front_matter_agent {
                        let stage_matches = config
                            .find_agent(pref)
                            .map(|cfg| agent_config_stage(cfg) == PipelineStage::Coder)
                            .unwrap_or(false);
                        if stage_matches {
                            if auto_assign::is_agent_free(&agents, pref) {
                                pref.clone()
                            } else {
                                return Err(format!(
                                    "Preferred agent '{pref}' from story front matter is busy; \
                                     story '{story_id}' has been queued in work/2_current/ and will \
                                     be auto-assigned when it becomes available"
                                ));
                            }
                        } else {
                            // Stage mismatch — fall back to any free coder.
                            pick_free_coder()?
                        }
                    } else {
                        pick_free_coder()?
                    }
                }
};
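            // Pool keys have the form "<story_id>:<agent_name>" (see
            // composite_key); the conflict checks below recover the story id
            // with rsplit_once(':').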
key = composite_key(story_id, &resolved_name);
// Check for duplicate assignment (same story + same agent already active).
if let Some(agent) = agents.get(&key)
&& (agent.status == AgentStatus::Running || agent.status == AgentStatus::Pending)
{
return Err(format!(
"Agent '{resolved_name}' for story '{story_id}' is already {}",
agent.status
));
}
// Enforce single-stage concurrency: reject if there is already a
// Running/Pending agent at the same pipeline stage for this story.
// This prevents two coders (or two QA/mergemaster agents) from
// corrupting each other's work in the same worktree.
// Applies to both explicit and auto-selected agents; the Other
// stage (supervisors, unknown agents) is exempt.
let resolved_stage = config
.find_agent(&resolved_name)
.map(agent_config_stage)
.unwrap_or_else(|| pipeline_stage(&resolved_name));
if resolved_stage != PipelineStage::Other
&& let Some(conflicting_name) = agents.iter().find_map(|(k, a)| {
let k_story = k.rsplit_once(':').map(|(s, _)| s).unwrap_or(k);
if k_story == story_id
&& a.agent_name != resolved_name
&& matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
{
let a_stage = config
.find_agent(&a.agent_name)
.map(agent_config_stage)
.unwrap_or_else(|| pipeline_stage(&a.agent_name));
if a_stage == resolved_stage {
Some(a.agent_name.clone())
} else {
None
}
} else {
None
}
})
{
return Err(format!(
"Cannot start '{resolved_name}' on story '{story_id}': \
'{conflicting_name}' is already active at the same pipeline stage"
));
}
// Enforce single-instance concurrency for explicitly-named agents:
// if this agent is already running on any other story, reject.
// Auto-selected agents are already guaranteed idle by
// find_free_agent_for_stage, so this check is only needed for
// explicit requests.
if agent_name.is_some()
&& let Some(busy_story) = agents.iter().find_map(|(k, a)| {
if a.agent_name == resolved_name
&& matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
{
Some(
k.rsplit_once(':')
.map(|(sid, _)| sid)
.unwrap_or(k)
.to_string(),
)
} else {
None
}
})
{
return Err(format!(
"Agent '{resolved_name}' is already running on story '{busy_story}'; \
story '{story_id}' will be picked up when the agent becomes available"
));
}
// Drain accumulated status events from the previous session before
// replacing the entry with the new one. The drained items are
// formatted and prepended to the first agent turn (story 736).
prior_events = {
let items = agents
.get(&key)
.and_then(|a| a.status_buffer.as_ref().map(|b| b.drain()))
.unwrap_or_default();
crate::service::status::buffer::format_buffered_items(&items)
};
agents.insert(
key.clone(),
StoryAgent {
agent_name: resolved_name.clone(),
status: AgentStatus::Pending,
worktree_info: None,
session_id: None,
tx: tx.clone(),
task_handle: None,
event_log: event_log.clone(),
completion: None,
project_root: Some(project_root.to_path_buf()),
log_session_id: Some(log_session_id.clone()),
merge_failure_reported: false,
throttled: false,
termination_reason: None,
status_buffer: Some(status_buffer),
},
);
}
let mut pending_guard = PendingGuard::new(self.agents.clone(), key.clone());
// Create persistent log writer (needs resolved_name, so must be after
// the atomic resolution above).
let log_writer =
match AgentLogWriter::new(project_root, story_id, &resolved_name, &log_session_id) {
Ok(w) => Some(Arc::new(Mutex::new(w))),
Err(e) => {
eprintln!(
"[agents] Failed to create log writer for {story_id}:{resolved_name}: {e}"
);
None
}
};
// Notify WebSocket clients that a new agent is pending.
Self::notify_agent_state_changed(&self.watcher_tx);
let _ = tx.send(AgentEvent::Status {
story_id: story_id.to_string(),
agent_name: resolved_name.clone(),
status: "pending".to_string(),
});
// Extract inactivity timeout from the agent config before cloning config.
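        // Falls back to 300 seconds (5 minutes) when the config omits a value.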
let inactivity_timeout_secs = config
.find_agent(&resolved_name)
.map(|a| a.inactivity_timeout_secs)
.unwrap_or(300);
// If no explicit session_id_to_resume was provided, look up from the
// persistent session store. The key includes the model so a model
// change (e.g. sonnet → opus) produces a cache miss — intentional.
let effective_session_id = session_id_to_resume.or_else(|| {
let model = config
.find_agent(&resolved_name)
.and_then(|a| a.model.clone())
.unwrap_or_default();
crate::agents::session_store::lookup_session(
project_root,
story_id,
&resolved_name,
&model,
)
});
// Clone all values needed inside the background spawn.
// Spawn the background task. Worktree creation and agent launch happen here
// so `start_agent` returns immediately after registering the agent as
// Pending — non-blocking by design (story 157).
let handle = tokio::spawn(spawn::run_agent_spawn(
project_root.to_path_buf(),
config.clone(),
resume_context.map(str::to_string),
effective_session_id,
story_id.to_string(),
resolved_name.clone(),
tx.clone(),
self.agents.clone(),
key.clone(),
event_log.clone(),
self.port,
log_writer.clone(),
self.child_killers.clone(),
self.watcher_tx.clone(),
Arc::clone(&self.merge_jobs),
inactivity_timeout_secs,
prior_events,
));
// Store the task handle while the agent is still Pending.
{
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
if let Some(agent) = agents.get_mut(&key) {
agent.task_handle = Some(handle);
}
}
// Agent successfully spawned — prevent the guard from removing the entry.
pending_guard.disarm();
Ok(AgentInfo {
story_id: story_id.to_string(),
agent_name: resolved_name,
status: AgentStatus::Pending,
session_id: None,
worktree_path: None,
base_branch: None,
completion: None,
log_session_id: Some(log_session_id),
throttled: false,
termination_reason: None,
})
}
}
#[cfg(test)]
mod tests_concurrency;
#[cfg(test)]
mod tests_selection;