story-kit: merge 190_story_auto_select_available_agent_for_stage_in_start_agent

This commit is contained in:
Dave
2026-02-25 16:17:38 +00:00
parent 8c338a6c63
commit 411653ab15
3 changed files with 316 additions and 168 deletions

View File

@@ -323,7 +323,11 @@ impl AgentPool {
}
/// Start an agent for a story: load config, create worktree, spawn agent.
/// If `agent_name` is None, defaults to the first configured agent.
///
/// When `agent_name` is `None`, automatically selects the first idle coder
/// agent (story 190). If all coders are busy the call fails with an error
/// indicating the story will be picked up when one becomes available.
///
/// If `resume_context` is provided, it is appended to the rendered prompt
/// so the agent can pick up from a previous failed attempt.
pub async fn start_agent(
@@ -335,59 +339,56 @@ impl AgentPool {
) -> Result<AgentInfo, String> {
let config = ProjectConfig::load(project_root)?;
// Resolve agent name from config
let resolved_name = match agent_name {
Some(name) => {
config
.find_agent(name)
.ok_or_else(|| format!("No agent named '{name}' in config"))?;
name.to_string()
}
None => config
.default_coder_agent()
.ok_or_else(|| {
"No coder agent configured. Specify an agent_name explicitly.".to_string()
})?
.name
.clone(),
};
// Validate explicit agent name early (no lock needed).
if let Some(name) = agent_name {
config
.find_agent(name)
.ok_or_else(|| format!("No agent named '{name}' in config"))?;
}
let key = composite_key(story_id, &resolved_name);
// Create shared resources before the atomic check-and-insert so the
// agents lock is held continuously from the availability check through
// the Pending insert, eliminating the TOCTOU race (story 132).
// Create name-independent shared resources before the lock so they are
// ready for the atomic check-and-insert (story 132).
let (tx, _) = broadcast::channel::<AgentEvent>(1024);
let event_log: Arc<Mutex<Vec<AgentEvent>>> = Arc::new(Mutex::new(Vec::new()));
// Generate a unique session ID for the persistent log file.
let log_session_id = uuid::Uuid::new_v4().to_string();
// Create persistent log writer.
let log_writer = match AgentLogWriter::new(
project_root,
story_id,
&resolved_name,
&log_session_id,
) {
Ok(w) => Some(Arc::new(Mutex::new(w))),
Err(e) => {
eprintln!("[agents] Failed to create log writer for {story_id}:{resolved_name}: {e}");
None
}
};
// Atomically check availability and register as Pending. The lock is
// held continuously from the duplicate check through the HashMap insert
// so no concurrent start_agent call can slip through the check before
// this insert completes (fixes TOCTOU race, story 132).
// Atomically resolve agent name, check availability, and register as
// Pending. When `agent_name` is `None` the first idle coder is
// selected inside the lock so no TOCTOU race can occur between the
// availability check and the Pending insert (story 132, story 190).
//
// The `PendingGuard` ensures that if any step below fails the entry is
// removed from the pool so it does not permanently block auto-assign
// (bug 118).
let resolved_name: String;
let key: String;
{
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
resolved_name = match agent_name {
Some(name) => name.to_string(),
None => find_free_agent_for_stage(&config, &agents, &PipelineStage::Coder)
.map(|s| s.to_string())
.ok_or_else(|| {
if config
.agent
.iter()
.any(|a| agent_config_stage(a) == PipelineStage::Coder)
{
format!(
"All coder agents are busy; story '{story_id}' will be \
picked up when one becomes available"
)
} else {
"No coder agent configured. Specify an agent_name explicitly."
.to_string()
}
})?,
};
key = composite_key(story_id, &resolved_name);
// Check for duplicate assignment (same story + same agent already active).
if let Some(agent) = agents.get(&key)
&& (agent.status == AgentStatus::Running || agent.status == AgentStatus::Pending)
{
@@ -396,24 +397,27 @@ impl AgentPool {
agent.status
));
}
// Enforce single-instance concurrency: if this agent is already running on
// any other story, reject the request. The story remains in its current pipeline
// directory and `auto_assign_available_work` will pick it up when the agent
// becomes free.
if let Some(busy_story) = agents.iter().find_map(|(k, a)| {
if a.agent_name == resolved_name
&& matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
{
Some(
k.rsplit_once(':')
.map(|(sid, _)| sid)
.unwrap_or(k)
.to_string(),
)
} else {
None
}
}) {
// Enforce single-instance concurrency for explicitly-named agents:
// if this agent is already running on any other story, reject.
// Auto-selected agents are already guaranteed idle by
// find_free_agent_for_stage, so this check is only needed for
// explicit requests.
if agent_name.is_some()
&& let Some(busy_story) = agents.iter().find_map(|(k, a)| {
if a.agent_name == resolved_name
&& matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
{
Some(
k.rsplit_once(':')
.map(|(sid, _)| sid)
.unwrap_or(k)
.to_string(),
)
} else {
None
}
})
{
return Err(format!(
"Agent '{resolved_name}' is already running on story '{busy_story}'; \
story '{story_id}' will be picked up when the agent becomes available"
@@ -437,6 +441,23 @@ impl AgentPool {
}
let mut pending_guard = PendingGuard::new(self.agents.clone(), key.clone());
// Create persistent log writer (needs resolved_name, so must be after
// the atomic resolution above).
let log_writer = match AgentLogWriter::new(
project_root,
story_id,
&resolved_name,
&log_session_id,
) {
Ok(w) => Some(Arc::new(Mutex::new(w))),
Err(e) => {
eprintln!(
"[agents] Failed to create log writer for {story_id}:{resolved_name}: {e}"
);
None
}
};
// Notify WebSocket clients that a new agent is pending.
Self::notify_agent_state_changed(&self.watcher_tx);
@@ -674,6 +695,28 @@ impl AgentPool {
Ok(())
}
/// Return the names of configured agents for `stage` that are not currently
/// running or pending.
pub fn available_agents_for_stage(
    &self,
    config: &ProjectConfig,
    stage: &PipelineStage,
) -> Result<Vec<String>, String> {
    let agents = self.agents.lock().map_err(|e| e.to_string())?;
    // Precompute the set of agent names that are currently occupied
    // (Running or Pending) so each configured agent below costs a single
    // set lookup instead of a scan over all pool entries.
    let busy: std::collections::HashSet<&str> = agents
        .values()
        .filter(|a| matches!(a.status, AgentStatus::Running | AgentStatus::Pending))
        .map(|a| a.agent_name.as_str())
        .collect();
    // Preserve declaration order: idle agents are reported in the order
    // they appear in the config file.
    let mut idle = Vec::new();
    for cfg in &config.agent {
        if agent_config_stage(cfg) == *stage && !busy.contains(cfg.name.as_str()) {
            idle.push(cfg.name.clone());
        }
    }
    Ok(idle)
}
/// List all agents with their status.
pub fn list_agents(&self) -> Result<Vec<AgentInfo>, String> {
let agents = self.agents.lock().map_err(|e| e.to_string())?;
@@ -6299,4 +6342,198 @@ stage = "qa"
(bug 173: lozenges must update when agents are assigned during pipeline advance)"
);
}
// ── available_agents_for_stage tests (story 190) ──────────────────────────
/// Parse an inline TOML snippet into a `ProjectConfig`.
///
/// Panics on invalid TOML — acceptable here because every caller is a test
/// supplying a hand-written, known-good snippet.
fn make_config(toml_str: &str) -> ProjectConfig {
ProjectConfig::parse(toml_str).unwrap()
}
// Two coders and one QA agent configured; only coder-1 is busy. The query
// must return the idle coder for the Coder stage and the (idle) qa agent
// for the Qa stage — i.e. filtering is per-stage AND per-busy-status.
#[test]
fn available_agents_for_stage_returns_idle_agents() {
let config = make_config(
r#"
[[agent]]
name = "coder-1"
stage = "coder"
[[agent]]
name = "coder-2"
stage = "coder"
[[agent]]
name = "qa"
stage = "qa"
"#,
);
let pool = AgentPool::new_test(3001);
// coder-1 is busy on story-1
pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
// Coder stage: coder-1 is Running, so only coder-2 is available.
let available = pool
.available_agents_for_stage(&config, &PipelineStage::Coder)
.unwrap();
assert_eq!(available, vec!["coder-2"]);
// Qa stage: the qa agent has no pool entry at all, so it counts as idle.
let available_qa = pool
.available_agents_for_stage(&config, &PipelineStage::Qa)
.unwrap();
assert_eq!(available_qa, vec!["qa"]);
}
// With a single configured coder that is Running, the Coder stage has no
// available agents and the result must be an empty list (not an error).
#[test]
fn available_agents_for_stage_returns_empty_when_all_busy() {
let config = make_config(
r#"
[[agent]]
name = "coder-1"
stage = "coder"
"#,
);
let pool = AgentPool::new_test(3001);
// Mark the only coder as busy.
pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
let available = pool
.available_agents_for_stage(&config, &PipelineStage::Coder)
.unwrap();
assert!(available.is_empty());
}
// Only Running and Pending statuses count as "busy": an agent whose last
// run Completed must be reported as available again.
#[test]
fn available_agents_for_stage_ignores_completed_agents() {
let config = make_config(
r#"
[[agent]]
name = "coder-1"
stage = "coder"
"#,
);
let pool = AgentPool::new_test(3001);
// Completed agents should not count as busy.
pool.inject_test_agent("story-1", "coder-1", AgentStatus::Completed);
let available = pool
.available_agents_for_stage(&config, &PipelineStage::Coder)
.unwrap();
assert_eq!(available, vec!["coder-1"]);
}
// story 190: when `agent_name` is None, start_agent must auto-select the
// first *idle* coder — skipping both non-coder agents (supervisor) and
// busy coders (coder-1) — rather than always defaulting to the first
// configured coder.
#[tokio::test]
async fn start_agent_auto_selects_second_coder_when_first_busy() {
// Build a minimal on-disk project: a temp dir with .story_kit/project.toml.
let tmp = tempfile::tempdir().unwrap();
let sk = tmp.path().join(".story_kit");
std::fs::create_dir_all(&sk).unwrap();
std::fs::write(
sk.join("project.toml"),
r#"
[[agent]]
name = "supervisor"
stage = "other"
[[agent]]
name = "coder-1"
stage = "coder"
[[agent]]
name = "coder-2"
stage = "coder"
"#,
)
.unwrap();
let pool = AgentPool::new_test(3001);
// coder-1 is busy on another story
pool.inject_test_agent("other-story", "coder-1", AgentStatus::Running);
// Call start_agent without agent_name — should pick coder-2
let result = pool
.start_agent(tmp.path(), "42_my_story", None, None)
.await;
// Will fail for infrastructure reasons (no git repo), but should NOT
// fail with "All coder agents are busy" — that would mean it didn't
// try coder-2.
match result {
Ok(info) => {
// If infrastructure happened to succeed, the selected agent must be coder-2.
assert_eq!(info.agent_name, "coder-2");
}
Err(err) => {
// Selection-phase errors would indicate the bug; infrastructure
// errors (e.g. worktree/git failures) are tolerated.
assert!(
!err.contains("All coder agents are busy"),
"should have selected coder-2 but got: {err}"
);
assert!(
!err.contains("No coder agent configured"),
"should not fail on agent selection, got: {err}"
);
}
}
}
// story 190: when every configured coder is Running or Pending, an
// auto-select start_agent call must fail with the "All coder agents are
// busy" message rather than a generic configuration error.
#[tokio::test]
async fn start_agent_returns_busy_when_all_coders_occupied() {
// Minimal on-disk project config with two coders.
let tmp = tempfile::tempdir().unwrap();
let sk = tmp.path().join(".story_kit");
std::fs::create_dir_all(&sk).unwrap();
std::fs::write(
sk.join("project.toml"),
r#"
[[agent]]
name = "coder-1"
stage = "coder"
[[agent]]
name = "coder-2"
stage = "coder"
"#,
)
.unwrap();
let pool = AgentPool::new_test(3001);
// Both Running and Pending statuses count as occupied.
pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
pool.inject_test_agent("story-2", "coder-2", AgentStatus::Pending);
let result = pool
.start_agent(tmp.path(), "story-3", None, None)
.await;
assert!(result.is_err());
let err = result.unwrap_err();
assert!(
err.contains("All coder agents are busy"),
"expected busy error, got: {err}"
);
}
// story 190 regression guard: explicitly naming a busy agent must still
// fail with the "already running" error — auto-selection must NOT kick in
// and silently substitute the idle coder-2 for the requested coder-1.
#[tokio::test]
async fn start_agent_explicit_name_unchanged_when_busy() {
// Minimal on-disk project config with two coders.
let tmp = tempfile::tempdir().unwrap();
let sk = tmp.path().join(".story_kit");
std::fs::create_dir_all(&sk).unwrap();
std::fs::write(
sk.join("project.toml"),
r#"
[[agent]]
name = "coder-1"
stage = "coder"
[[agent]]
name = "coder-2"
stage = "coder"
"#,
)
.unwrap();
let pool = AgentPool::new_test(3001);
pool.inject_test_agent("story-1", "coder-1", AgentStatus::Running);
// Explicit request for coder-1 (busy) should fail even though coder-2 is free.
let result = pool
.start_agent(tmp.path(), "story-2", Some("coder-1"), None)
.await;
assert!(result.is_err());
let err = result.unwrap_err();
assert!(
err.contains("coder-1") && err.contains("already running"),
"expected explicit busy error, got: {err}"
);
}
}