story-kit: merge 118_bug_agent_pool_retains_stale_running_state_after_completion_blocking_auto_assign

This commit is contained in:
Dave
2026-02-23 22:53:23 +00:00
parent ff3ea26ac6
commit 908fcef353

View File

@@ -28,6 +28,53 @@ fn composite_key(story_id: &str, agent_name: &str) -> String {
format!("{story_id}:{agent_name}")
}
/// RAII guard that removes a pending agent entry from the pool on drop.
///
/// Created after inserting a `Pending` entry into the agent HashMap.
/// If `start_agent` succeeds (the agent process is spawned and status
/// transitions to `Running`), call [`disarm`](Self::disarm) to prevent
/// cleanup. If any intermediate step fails and the guard is dropped
/// without being disarmed, the pending entry is removed so it cannot
/// block future auto-assign dispatches.
struct PendingGuard {
agents: Arc<Mutex<HashMap<String, StoryAgent>>>,
key: String,
armed: bool,
}
impl PendingGuard {
fn new(agents: Arc<Mutex<HashMap<String, StoryAgent>>>, key: String) -> Self {
Self {
agents,
key,
armed: true,
}
}
/// Prevent the guard from cleaning up the entry (call after
/// successful spawn).
fn disarm(&mut self) {
self.armed = false;
}
}
impl Drop for PendingGuard {
fn drop(&mut self) {
if self.armed
&& let Ok(mut agents) = self.agents.lock()
&& agents
.get(&self.key)
.is_some_and(|a| a.status == AgentStatus::Pending)
{
agents.remove(&self.key);
slog!(
"[agents] Cleaned up leaked Pending entry for '{}'",
self.key
);
}
}
}
/// Events streamed from a running agent to SSE clients.
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
@@ -269,7 +316,9 @@ impl AgentPool {
}
};
// Register as pending
// Register as pending. The `PendingGuard` ensures that if any
// step below fails the entry is removed from the pool so it does
// not permanently block auto-assign (bug 118).
{
let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
agents.insert(
@@ -288,6 +337,7 @@ impl AgentPool {
},
);
}
let mut pending_guard = PendingGuard::new(self.agents.clone(), key.clone());
let _ = tx.send(AgentEvent::Status {
story_id: story_id.to_string(),
@@ -378,6 +428,9 @@ impl AgentPool {
}
}
// Agent successfully spawned — prevent the guard from removing the entry.
pending_guard.disarm();
Ok(AgentInfo {
story_id: story_id.to_string(),
agent_name: resolved_name,
@@ -3835,4 +3888,95 @@ name = "qa"
assert_eq!(entries[0].event["type"], "status");
assert_eq!(entries[0].event["status"], "running");
}
// ── bug 118: pending entry cleanup on start_agent failure ────────────────
/// Regression test for bug 118: when `start_agent` fails (e.g. because
/// `create_worktree` cannot find a git repo), the Pending entry that was
/// inserted into the agent HashMap must be cleaned up so it does not
/// permanently block `find_free_agent_for_stage` / auto-assign.
#[tokio::test]
async fn start_agent_cleans_up_pending_entry_on_failure() {
use std::fs;
let tmp = tempfile::tempdir().unwrap();
let root = tmp.path();
// Minimal project.toml with a "qa" agent.
let sk_dir = root.join(".story_kit");
fs::create_dir_all(&sk_dir).unwrap();
fs::write(
sk_dir.join("project.toml"),
"[[agent]]\nname = \"qa\"\n",
)
.unwrap();
// Create the story in upcoming so `move_story_to_current` succeeds,
// but do NOT init a git repo — `create_worktree` will fail.
let upcoming = root.join(".story_kit/work/1_upcoming");
fs::create_dir_all(&upcoming).unwrap();
fs::write(
upcoming.join("50_story_test.md"),
"---\nname: Test\n---\n",
)
.unwrap();
let pool = AgentPool::new(3099);
let result = pool
.start_agent(root, "50_story_test", Some("qa"), None)
.await;
// The call must fail (no git repo for worktree creation).
assert!(result.is_err(), "start_agent should fail without a git repo");
// The pool must NOT retain a Pending entry for this agent.
let agents = pool.agents.lock().unwrap();
let leaked = agents.values().any(|a| {
a.agent_name == "qa"
&& matches!(a.status, AgentStatus::Pending | AgentStatus::Running)
});
assert!(
!leaked,
"agent pool must not retain a Pending/Running entry after start_agent fails"
);
}
/// Verify that a successful start_agent keeps the Running entry (guard is
/// disarmed). We cannot truly spawn an agent in tests, but we verify that
/// the concurrency check still blocks a second concurrent start — which
/// proves the first entry survived the guard.
#[tokio::test]
async fn start_agent_guard_does_not_remove_running_entry() {
use std::fs;
let tmp = tempfile::tempdir().unwrap();
let root = tmp.path();
let sk_dir = root.join(".story_kit");
fs::create_dir_all(&sk_dir).unwrap();
fs::write(
sk_dir.join("project.toml"),
"[[agent]]\nname = \"qa\"\n",
)
.unwrap();
let pool = AgentPool::new(3099);
// Manually inject a Running agent (simulates successful start).
pool.inject_test_agent("story-x", "qa", AgentStatus::Running);
// Attempting to start the same agent on a different story must be
// rejected — the Running entry must still be there.
let result = pool
.start_agent(root, "story-y", Some("qa"), None)
.await;
assert!(result.is_err());
let err = result.unwrap_err();
assert!(
err.contains("already running") || err.contains("becomes available"),
"running entry must survive: got '{err}'"
);
}
}