Bump version to 0.11.1

huskies: merge 1105 bug Freeze from Backlog stores wrong resume_to — Unfreeze restores to Coding instead of Backlog
fix(1102): require non-empty origin.id on create_* MCP tools
2026-05-15 23:38:09 +01:00 · 2026-05-15 22:33:54 +00:00 · 2026-05-15 23:13:54 +01:00 · 2026-05-15 23:13:38 +01:00 · 2026-05-15 21:02:37 +00:00 · 2026-05-15 20:24:31 +00:00
66 changed files with 2332 additions and 374 deletions
@@ -541,6 +541,7 @@
    "enum TerminationReason",
    "enum PipelineStage",
    "fn pipeline_stage",
+    "fn canonical_pipeline_stage",
    "fn agent_config_stage",
    "struct CompletionReport",
    "struct TokenUsage",
@@ -678,9 +679,7 @@
  "server/src/agents/pool/pipeline/mod.rs": [],
  "server/src/agents/pool/process.rs": [
    "fn kill_all_children",
-    "fn kill_child_for_key",
-    "fn inject_child_killer",
-    "fn child_killer_count"
+    "fn kill_child_for_key"
  ],
  "server/src/agents/pool/query.rs": [
    "fn available_agents_for_stage",
@@ -707,6 +706,7 @@
  ],
  "server/src/agents/pool/stop.rs": [
    "fn stop_agent",
+    "fn reconcile_canonical_agents",
    "fn remove_agents_for_story"
  ],
  "server/src/agents/pool/test_helpers.rs": [
@@ -752,9 +752,7 @@
    "fn run_agent_pty_streaming"
  ],
  "server/src/agents/pty/types.rs": [
-    "struct PtyResult",
-    "fn composite_key",
-    "struct ChildKillerGuard"
+    "struct PtyResult"
  ],
  "server/src/agents/runtime/claude_code.rs": [
    "struct ClaudeCodeRuntime",
@@ -898,6 +896,13 @@
  "server/src/chat/commands/unreleased.rs": [
    "fn handle_unreleased"
  ],
+  "server/src/chat/dispatcher.rs": [
+    "type SpawnFn",
+    "struct ChatDispatcher",
+    "fn new",
+    "fn submit",
+    "fn stop"
+  ],
  "server/src/chat/history.rs": [
    "type ChatConversationHistory",
    "fn load_chat_history",
@@ -908,6 +913,7 @@
  ],
  "server/src/chat/mod.rs": [
    "mod commands",
+    "mod dispatcher",
    "mod history",
    "mod lookup",
    "mod test_helpers",
@@ -1037,6 +1043,7 @@
    "fn default_permission_timeout_secs",
    "fn default_aggregated_notifications_poll_interval_secs",
    "fn default_aggregated_notifications_enabled",
+    "fn default_coalesce_window_ms",
    "fn default_transport",
    "fn default_whatsapp_provider",
    "struct BotConfig"
@@ -1441,7 +1448,9 @@
    "fn migrate_legacy_stage_strings",
    "fn migrate_node_claims_to_agent_claims",
    "fn migrate_merge_job",
-    "fn purge_done_stage_merge_jobs"
+    "fn purge_done_stage_merge_jobs",
+    "fn migrate_zombie_pipeline_rows",
+    "fn sweep_zombie_rows"
  ],
  "server/src/crdt_state/write/mod.rs": [],
  "server/src/crdt_state/write/tests.rs": [],
@@ -1550,7 +1559,11 @@
    "fn named",
    "fn write_item_with_content",
    "fn move_item_stage",
+    "fn sync_item_agent",
    "fn delete_item",
+    "fn delete_item_sync",
+    "fn sync_item_name",
+    "fn sync_item_depends_on",
    "fn next_item_number"
  ],
  "server/src/db/recover.rs": [
@@ -1565,6 +1578,7 @@
    "struct PipelineWriteMsg",
    "struct PipelineDb",
    "static PIPELINE_DB",
+    "static SHADOW_DB_PATH",
    "fn init",
    "fn backup_pre_pipeline_status",
    "fn check_schema_drift"
@@ -2175,6 +2189,7 @@
    "mod mesh",
    "mod node_identity",
    "mod pipeline_state",
+    "mod process_kill",
    "mod rebuild",
    "mod services",
    "mod sled_uplink",
@@ -2273,6 +2288,11 @@
    "fn stage_label",
    "fn stage_dir_name"
  ],
+  "server/src/process_kill.rs": [
+    "fn sigkill_pids_and_verify",
+    "fn pids_matching",
+    "fn descendant_pids"
+  ],
  "server/src/rebuild.rs": [
    "enum ShutdownReason",
    "struct BotShutdownNotifier",
@@ -1911,7 +1911,7 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"

 [[package]]
 name = "huskies"
-version = "0.11.0"
+version = "0.11.1"
 dependencies = [
 "ammonia",
 "async-stream",
@@ -1,12 +1,12 @@
 {
 	"name": "huskies",
-	"version": "0.11.0",
+	"version": "0.11.1",
 	"lockfileVersion": 3,
 	"requires": true,
 	"packages": {
 		"": {
 			"name": "huskies",
-			"version": "0.11.0",
+			"version": "0.11.1",
 			"dependencies": {
 				"@types/react-syntax-highlighter": "^15.5.13",
 				"react": "^19.1.0",
@@ -1,7 +1,7 @@
 {
 	"name": "huskies",
 	"private": true,
-	"version": "0.11.0",
+	"version": "0.11.1",
 	"type": "module",
 	"scripts": {
 		"dev": "vite",
@@ -1,6 +1,6 @@
 [package]
 name = "huskies"
-version = "0.11.0"
+version = "0.11.1"
 edition = "2024"
 build = "build.rs"

@@ -78,6 +78,7 @@ pub(super) fn build_agent_app_context(
        pending_perm_replies: Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new())),
        permission_timeout_secs: 120,
        status: agents.status_broadcaster(),
+        chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
    });
    crate::http::context::AppContext {
        state: Arc::new(state),
@@ -161,6 +161,42 @@ pub fn pipeline_stage(agent_name: &str) -> PipelineStage {
    }
 }

+/// Map a pipeline [`Stage`] to the canonical [`PipelineStage`] for LLM agent spawning.
+///
+/// Returns `None` for stages where no LLM agent should be active (terminal states,
+/// blocked, frozen, or unclassified merge failures requiring human intervention).
+/// Returns `Some(stage)` naming the single LLM-agent type that may run on this story.
+/// Used by `validate_agent_stage` and `reconcile_canonical_agents` to enforce the
+/// one-agent-per-story invariant (story 1100).
+pub fn canonical_pipeline_stage(s: &crate::pipeline_state::Stage) -> Option<PipelineStage> {
+    use crate::pipeline_state::{MergeFailureKind, Stage};
+    match s {
+        Stage::Coding { .. } => Some(PipelineStage::Coder),
+        Stage::Qa => Some(PipelineStage::Qa),
+        Stage::Merge { .. } => Some(PipelineStage::Mergemaster),
+        Stage::MergeFailure {
+            kind: MergeFailureKind::ConflictDetected(_),
+            ..
+        } => Some(PipelineStage::Mergemaster),
+        Stage::MergeFailure {
+            kind: MergeFailureKind::GatesFailed(_),
+            ..
+        } => Some(PipelineStage::Coder),
+        Stage::MergeFailureFinal { .. } => Some(PipelineStage::Mergemaster),
+        Stage::Upcoming
+        | Stage::Backlog
+        | Stage::MergeFailure { .. }
+        | Stage::Done { .. }
+        | Stage::Blocked { .. }
+        | Stage::Archived { .. }
+        | Stage::Frozen { .. }
+        | Stage::ReviewHold { .. }
+        | Stage::Abandoned { .. }
+        | Stage::Superseded { .. }
+        | Stage::Rejected { .. } => None,
+    }
+}
+
 /// Determine the pipeline stage for a configured agent.
 ///
 /// Prefers the explicit `stage` config field (added in Bug 150) over the
@@ -105,12 +105,6 @@ impl AgentPool {
                    }
                }

-                // Step 4: drop the (now-stale) child_killers entry — the
-                // process it pointed at is gone.
-                if let Ok(mut killers) = self.child_killers.lock() {
-                    killers.remove(key);
-                }
-
                // Use the retry mechanism: increment retry_count and only block
                // when the limit is exceeded, matching the pipeline's behaviour.
                let story_id = key.rsplit_once(':').map(|(s, _)| s).unwrap_or(key);
@@ -18,7 +18,6 @@ mod test_helpers;

 use crate::io::watcher::WatcherEvent;
 use crate::service::status::StatusBroadcaster;
-use portable_pty::ChildKiller;
 use std::collections::HashMap;
 use std::sync::{Arc, Mutex};
 use tokio::sync::broadcast;
@@ -31,10 +30,6 @@ use types::{StoryAgent, composite_key};
 pub struct AgentPool {
    agents: Arc<Mutex<HashMap<String, StoryAgent>>>,
    port: u16,
-    /// Registry of active PTY child process killers, keyed by "{story_id}:{agent_name}".
-    /// Used to terminate child processes on server shutdown or agent stop, preventing
-    /// orphaned Claude Code processes from running after the server exits.
-    child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
    /// Broadcast channel for notifying WebSocket clients of agent state changes.
    /// When an agent transitions state (Pending, Running, Completed, Failed, Stopped),
    /// an `AgentStateChanged` event is emitted so the frontend can refresh the
@@ -56,7 +51,6 @@ impl AgentPool {
        let pool = Self {
            agents: Arc::new(Mutex::new(HashMap::new())),
            port,
-            child_killers: Arc::new(Mutex::new(HashMap::new())),
            watcher_tx: watcher_tx.clone(),
            status_broadcaster: Arc::new(StatusBroadcaster::new()),
        };
@@ -33,7 +33,6 @@ pub(crate) fn spawn_pipeline_advance(
        let pool = AgentPool {
            agents,
            port,
-            child_killers: Arc::new(Mutex::new(HashMap::new())),
            watcher_tx,
            status_broadcaster: Arc::new(crate::service::status::StatusBroadcaster::new()),
        };
@@ -78,21 +78,34 @@ impl AgentPool {
                    // The coder exited with uncommitted content but no commits
                    // (typical "claude-code session boundary mid-sweep" pattern).
                    // Use a PROGRESS-AWARE retry cap: the agent gets unlimited
-                    // respawns as long as file edits keep growing between
-                    // attempts; only when the worktree diff is byte-identical
-                    // to the previous attempt do we count it as "no progress".
-                    // After NO_PROGRESS_CAP consecutive no-progress respawns,
-                    // block for human attention.
+                    // respawns as long as progress is being made between attempts.
+                    // Progress is satisfied if EITHER (a) the worktree diff grew,
+                    // OR (b) the set of files the agent read grew.  Raw tool-call
+                    // count does NOT count — a looping agent can produce many calls.
+                    // Only self-exited sessions with no file or read progress count
+                    // toward the cap; forced exits (API error, network, budget
+                    // exhaustion) are excluded (story 1089).
+                    // After NO_PROGRESS_CAP consecutive qualifying no-progress
+                    // respawns, block for human attention.
                    //
                    // TOTAL_ATTEMPTS_CAP is the OUTER bound: even if the agent
                    // keeps making file-edit progress every session, after this
-                    // many total respawns without a commit we escalate — caught
-                    // the "agent flaps between different edits but never
-                    // commits" pattern that the progress-aware counter would
-                    // never trigger.
+                    // many total respawns without a commit we escalate — catches
+                    // the "agent flaps between different edits but never commits"
+                    // pattern that the progress-aware counter would never trigger.
                    const NO_PROGRESS_CAP: u32 = 3;
                    const TOTAL_ATTEMPTS_CAP: u32 = 8;

+                    // AC1: consume the forced-exit flag written by spawn.rs when
+                    // the agent process exited with a non-zero code.
+                    let forced_exit = crate::db::read_content(
+                        crate::db::ContentKey::CommitRecoveryForcedExit(story_id),
+                    )
+                    .is_some();
+                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryForcedExit(
+                        story_id,
+                    ));
+
                    let current_fingerprint = worktree_path.as_deref().and_then(|p| {
                        std::process::Command::new("git")
                            .args(["diff", "master"])
@@ -104,18 +117,31 @@ impl AgentPool {
                    let stored_fingerprint = crate::db::read_content(
                        crate::db::ContentKey::CommitRecoveryDiffFingerprint(story_id),
                    );
-                    let made_progress = current_fingerprint.is_some()
+                    let diff_progress = current_fingerprint.is_some()
                        && stored_fingerprint.as_ref() != current_fingerprint.as_ref();
-                    let no_progress_count = if made_progress || stored_fingerprint.is_none() {
+
+                    // AC2: check read-file set progress as an additional signal.
+                    let read_progress = previous_session_id.as_deref().is_some_and(|session_id| {
+                        collect_read_progress(&project_root, story_id, agent_name, session_id)
+                    });
+
+                    let made_progress = diff_progress || read_progress;
+
+                    let prev_no_progress_count = crate::db::read_content(
+                        crate::db::ContentKey::CommitRecoveryPending(story_id),
+                    )
+                    .and_then(|s| s.trim().parse::<u32>().ok())
+                    .unwrap_or(0);
+
+                    // AC1: forced exits do not increment the stuck-respawn counter.
+                    let no_progress_count = if forced_exit {
+                        prev_no_progress_count
+                    } else if made_progress || stored_fingerprint.is_none() {
                        1
                    } else {
-                        crate::db::read_content(crate::db::ContentKey::CommitRecoveryPending(
-                            story_id,
-                        ))
-                        .and_then(|s| s.trim().parse::<u32>().ok())
-                        .unwrap_or(0)
-                            + 1
+                        prev_no_progress_count + 1
                    };
+
                    let total_attempts = crate::db::read_content(
                        crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
                    )
@@ -136,13 +162,17 @@ impl AgentPool {
                        crate::db::delete_content(
                            crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
                        );
+                        crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
+                            story_id,
+                        ));
                        slog!(
                            "[pipeline] Coder '{agent_name}' for '{story_id}' hit total \
                             commit-recovery cap ({total_attempts}/{TOTAL_ATTEMPTS_CAP}) \
                             without a commit. Blocking story."
                        );
                        let reason = format!(
-                            "agent flapped — {total_attempts} respawns without ever committing"
+                            "commit absent after {total_attempts} respawns — \
+                             agent kept making edits but never committed"
                        );
                        if let Err(e) =
                            crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
@@ -167,14 +197,18 @@ impl AgentPool {
                        crate::db::delete_content(
                            crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id),
                        );
+                        crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
+                            story_id,
+                        ));
                        slog!(
                            "[pipeline] Coder '{agent_name}' for '{story_id}' made no \
-                             file-edit progress over {no_progress_count} consecutive \
-                             commit-recovery respawns. Blocking story."
+                             file or read progress over {no_progress_count} consecutive \
+                             self-exit commit-recovery respawns. Blocking story."
                        );
+                        // AC4: block message names the specific cause.
                        let reason = format!(
-                            "agent stuck — {no_progress_count} respawns without commits or \
-                             new file edits"
+                            "stuck-respawn cap reached: {NO_PROGRESS_CAP} consecutive \
+                             self-exits with no file or read progress"
                        );
                        if let Err(e) =
                            crate::agents::lifecycle::transition_to_blocked(story_id, &reason)
@@ -206,7 +240,8 @@ impl AgentPool {
                            "[pipeline] Coder '{agent_name}' exited with uncommitted work \
                             for '{story_id}' (no-progress {no_progress_count}/\
                             {NO_PROGRESS_CAP}, total {total_attempts}/\
-                             {TOTAL_ATTEMPTS_CAP}; progress_made={made_progress}). \
+                             {TOTAL_ATTEMPTS_CAP}; diff_progress={diff_progress}, \
+                             read_progress={read_progress}, forced_exit={forced_exit}). \
                             Issuing commit-only respawn."
                        );
                        let addendum = "\n\nYou have uncommitted work in this worktree. \
@@ -302,10 +337,13 @@ impl AgentPool {
                        });
                    }
                } else if completion.gates_passed {
-                    // Clear any stale recovery key when the coder succeeds normally.
+                    // Clear any stale recovery keys when the coder succeeds normally.
                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
                        story_id,
                    ));
+                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
+                        story_id,
+                    ));
                    // Determine effective QA mode for this story.
                    let qa_mode = {
                        let item_type = crate::agents::lifecycle::item_type_from_id(story_id);
@@ -361,11 +399,14 @@ impl AgentPool {
                        }
                    }
                } else {
-                    // Clear any stale recovery key when gates fail normally (agent committed
+                    // Clear any stale recovery keys when gates fail normally (agent committed
                    // but the build is broken — treat as a standard retry, not a recovery).
                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(
                        story_id,
                    ));
+                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(
+                        story_id,
+                    ));
                    // Bug 645 / 668: Before retry/block, check if the agent left committed
                    // work AND the agent had a passing run_tests result captured during its
                    // session.  An agent may crash mid-output (e.g. Claude Code CLI PTY write
@@ -724,6 +765,109 @@ mod helpers;
 use helpers::{resolve_qa_mode_from_store, write_review_hold_to_store};
 pub(crate) use helpers::{should_block_story, spawn_pipeline_advance};

+/// Parse a huskies agent log and return the set of file paths passed to the
+/// Read tool in that session.  Returns an empty set if the log cannot be read.
+///
+/// Used by [`collect_read_progress`] to detect read-exploration progress even
+/// when the worktree diff did not grow (story 1089, AC2).
+fn collect_read_files_from_log(
+    project_root: &std::path::Path,
+    story_id: &str,
+    agent_name: &str,
+    session_id: &str,
+) -> std::collections::HashSet<String> {
+    let log_path = crate::agent_log::log_file_path(project_root, story_id, agent_name, session_id);
+    let mut files = std::collections::HashSet::new();
+
+    let log_text = match std::fs::read_to_string(&log_path) {
+        Ok(t) => t,
+        Err(_) => return files,
+    };
+
+    for line in log_text.lines() {
+        let trimmed = line.trim();
+        if trimmed.is_empty() {
+            continue;
+        }
+        let entry: serde_json::Value = match serde_json::from_str(trimmed) {
+            Ok(v) => v,
+            Err(_) => continue,
+        };
+        // Only look at agent_json events where data.type == "assistant".
+        if entry.get("type").and_then(|t| t.as_str()) != Some("agent_json") {
+            continue;
+        }
+        let data = match entry.get("data") {
+            Some(d) => d,
+            None => continue,
+        };
+        if data.get("type").and_then(|t| t.as_str()) != Some("assistant") {
+            continue;
+        }
+        let content = match data.pointer("/message/content").and_then(|c| c.as_array()) {
+            Some(c) => c,
+            None => continue,
+        };
+        for item in content {
+            if item.get("type").and_then(|t| t.as_str()) != Some("tool_use") {
+                continue;
+            }
+            if item.get("name").and_then(|n| n.as_str()) != Some("Read") {
+                continue;
+            }
+            if let Some(path) = item.pointer("/input/file_path").and_then(|p| p.as_str()) {
+                files.insert(path.to_string());
+            }
+        }
+    }
+
+    files
+}
+
+/// Return `true` if the agent read any files in `session_id` that were not in
+/// the cumulative read set for `story_id`.  Updates the stored cumulative set
+/// when new files are found (story 1089, AC2).
+fn collect_read_progress(
+    project_root: &std::path::Path,
+    story_id: &str,
+    agent_name: &str,
+    session_id: &str,
+) -> bool {
+    let session_files = collect_read_files_from_log(project_root, story_id, agent_name, session_id);
+    if session_files.is_empty() {
+        return false;
+    }
+
+    let stored_set: std::collections::HashSet<String> =
+        crate::db::read_content(crate::db::ContentKey::CommitRecoveryReadSet(story_id))
+            .map(|s| {
+                s.lines()
+                    .filter(|l| !l.is_empty())
+                    .map(str::to_string)
+                    .collect()
+            })
+            .unwrap_or_default();
+
+    let union: std::collections::HashSet<String> =
+        stored_set.union(&session_files).cloned().collect();
+
+    if union.len() > stored_set.len() {
+        let mut sorted: Vec<&String> = union.iter().collect();
+        sorted.sort();
+        crate::db::write_content(
+            crate::db::ContentKey::CommitRecoveryReadSet(story_id),
+            &sorted
+                .into_iter()
+                .map(String::as_str)
+                .collect::<Vec<_>>()
+                .join("\n"),
+        );
+        true
+    } else {
+        false
+    }
+}
+
 #[cfg(test)]
 mod tests;
 #[cfg(test)]
@@ -1077,7 +1077,7 @@ stage = "coder"
        "Story must be blocked after NO_PROGRESS_CAP consecutive no-progress respawns"
    );
    assert!(
-        block_reason.contains("without commits or new file edits"),
+        block_reason.contains("self-exits with no file or read progress"),
        "Block reason should describe the no-progress condition, got: {block_reason}"
    );

@@ -1193,7 +1193,7 @@ stage = "coder"
        "Story must be blocked once total commit-recovery attempts hits the outer cap"
    );
    assert!(
-        block_reason.contains("flapped") && block_reason.contains("without ever committing"),
+        block_reason.contains("commit absent") && block_reason.contains("never committed"),
        "Block reason should describe the flapping pattern, got: {block_reason}"
    );

@@ -111,7 +111,6 @@ impl AgentPool {
        let pool_clone = Self {
            agents: Arc::clone(&self.agents),
            port: self.port,
-            child_killers: Arc::clone(&self.child_killers),
            watcher_tx: self.watcher_tx.clone(),
            status_broadcaster: Arc::clone(&self.status_broadcaster),
        };
@@ -74,25 +74,11 @@ pub(in crate::agents::pool) async fn run_server_owned_completion(

    // Kill any in-flight cargo test processes for this worktree so they don't
    // hold the build lock while gates try to run.
-    if let Some(wt_path) = worktree_path.as_ref()
-        && let Ok(output) = std::process::Command::new("pgrep")
-            .args([
-                "-f",
-                &format!("--manifest-path {}/Cargo.toml", wt_path.display()),
-            ])
-            .output()
-    {
-        let pids = String::from_utf8_lossy(&output.stdout);
-        for pid_str in pids.lines() {
-            if let Ok(pid) = pid_str.trim().parse::<i32>() {
-                crate::slog!(
-                    "[agents] Killing stale cargo process (pid {pid}) for '{story_id}' before running gates"
-                );
-                unsafe {
-                    libc::kill(pid, libc::SIGKILL);
-                }
-            }
-        }
+    if let Some(wt_path) = worktree_path.as_ref() {
+        let pattern = format!("--manifest-path {}/Cargo.toml", wt_path.display());
+        let _ = crate::process_kill::sigkill_pids_and_verify(&crate::process_kill::pids_matching(
+            &pattern,
+        ));
    }

    // Run acceptance gates.  Third element of the tuple is `needs_commit_recovery`:
@@ -18,7 +18,6 @@ impl AgentPool {
        let pool = Arc::new(Self {
            agents: Arc::clone(&self.agents),
            port: self.port,
-            child_killers: Arc::clone(&self.child_killers),
            watcher_tx: self.watcher_tx.clone(),
            status_broadcaster: Arc::clone(&self.status_broadcaster),
        });
@@ -186,6 +186,50 @@ impl AgentPool {
                        .map(|k| k.is_self_evident_fix())
                        .unwrap_or(false);

+                // Bug 1101 diagnostic: log the classified failure_kind and the
+                // matched classifier-trigger substring with surrounding context,
+                // so we can confirm whether classify() is incorrectly matching
+                // a passing-step stdout substring (e.g. "Diff in " inside a
+                // failing test's panic message) and bouncing the story to a
+                // fixup coder. Remove once the fix lands.
+                if let Ok(r) = report.as_ref()
+                    && let crate::agents::merge::MergeResult::GateFailure {
+                        output: gate_output,
+                        failure_kind: Some(k),
+                    } = &r.result
+                {
+                    const TRIGGERS: &[&str] = &[
+                        "CONFLICT (content):",
+                        "Merge conflict:",
+                        "Diff in ",
+                        "would reformat",
+                        "missing-docs direction",
+                        "error[clippy::",
+                        "warning[clippy::",
+                        "missing_doc_comments",
+                        "error[E",
+                    ];
+                    let matched = TRIGGERS
+                        .iter()
+                        .find_map(|t| gate_output.find(t).map(|i| (*t, i)));
+                    let (trigger, context) = match matched {
+                        Some((t, i)) => {
+                            let start = i.saturating_sub(30);
+                            let end = (i + t.len() + 60).min(gate_output.len());
+                            let ctx = gate_output
+                                .get(start..end)
+                                .unwrap_or("<context unavailable>")
+                                .replace('\n', " ");
+                            (Some(t), ctx)
+                        }
+                        None => (None, String::from("<no trigger matched>")),
+                    };
+                    slog!(
+                        "[merge] classify diagnostic for '{sid}': failure_kind={k:?} \
+                         is_fixup={is_fixup} trigger={trigger:?} context='{context}'"
+                    );
+                }
+
                if is_no_commits {
                    let reason = kind.display_reason();
                    if let Err(e) = crate::agents::lifecycle::transition_to_blocked(&sid, &reason) {
@@ -1,12 +1,20 @@
 //! Process management — kills orphaned PTY child processes on server shutdown.
 //!
-//! See [`crate::process_kill`] for the general process-termination primitives
-//! this module's existing methods (`kill_all_children`, `kill_child_for_key`)
-//! should eventually be migrated to.  Those methods currently use
-//! `portable_pty::ChildKiller::kill()`, which sends `SIGHUP` — a signal
-//! claude-code ignores — so they leave orphans on every shutdown/stop.  The
-//! migration is tracked in a separate story to keep its diff focused.
+//! As of story 1090 (2026-05-15), all process termination in this module uses
+//! [`crate::process_kill::sigkill_pids_and_verify`] — SIGHUP-based killing via
+//! `portable_pty::ChildKiller` has been removed entirely from the server.
+//!
+//! ## History
+//!
+//! Prior to commit `fe9804b3`, the watchdog and all kill paths sent SIGHUP via
+//! `portable_pty::ChildKiller::kill()`.  Claude Code ignores SIGHUP, so agents
+//! survived "kills" and ran concurrently with their replacements — the root cause
+//! of the 2026-05-15 duplicate-spawn incident.  `fe9804b3` migrated the watchdog;
+//! story 1090 completes the migration by rewriting `kill_all_children` and
+//! `kill_child_for_key` (this file) to use `pids_matching` + `sigkill_pids_and_verify`.
+use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
 use crate::slog;
+use crate::slog_warn;

 use super::AgentPool;

@@ -14,53 +22,97 @@ impl AgentPool {
    /// Kill all active PTY child processes.
    ///
    /// Called on server shutdown to prevent orphaned Claude Code processes from
-    /// continuing to run after the server exits. Each registered killer is called
-    /// once, then the registry is cleared.
+    /// continuing to run after the server exits.  Collects each agent's worktree
+    /// path, then SIGKILLs every process running inside that path and verifies
+    /// termination before returning.
    pub fn kill_all_children(&self) {
-        if let Ok(mut killers) = self.child_killers.lock() {
-            for (key, killer) in killers.iter_mut() {
-                slog!("[agents] Killing child process for {key} on shutdown");
-                let _ = killer.kill();
+        let worktree_paths: Vec<(String, std::path::PathBuf)> = {
+            let Ok(agents) = self.agents.lock() else {
+                return;
+            };
+            agents
+                .iter()
+                .filter_map(|(key, agent)| {
+                    agent
+                        .worktree_info
+                        .as_ref()
+                        .map(|wt| (key.clone(), wt.path.clone()))
+                })
+                .collect()
+        };
+
+        for (key, path) in worktree_paths {
+            let pattern = path.display().to_string();
+            let pids = pids_matching(&pattern);
+            if pids.is_empty() {
+                slog!(
+                    "[agents] No processes found in worktree {} for '{key}' on shutdown",
+                    path.display()
+                );
+                continue;
+            }
+            match sigkill_pids_and_verify(&pids) {
+                Ok(n) => slog!(
+                    "[agents] SIGKILL'd {n} process(es) in worktree {} for '{key}' on shutdown",
+                    path.display()
+                ),
+                Err(survivors) => slog_warn!(
+                    "[agents] SIGKILL incomplete for '{key}' on shutdown: \
+                     pids still alive: {survivors:?}"
+                ),
            }
-            killers.clear();
        }
    }

    /// Kill and deregister the child process for a specific agent key.
    ///
-    /// Used by `stop_agent` to ensure the PTY child is terminated even though
-    /// aborting a `spawn_blocking` task handle does not interrupt the blocking thread.
+    /// Fallback used by `stop_agent` when no worktree path is recorded for the
+    /// agent.  Also the primary kill path for any caller that has only a composite
+    /// key and not a worktree path directly.
    pub(super) fn kill_child_for_key(&self, key: &str) {
-        if let Ok(mut killers) = self.child_killers.lock()
-            && let Some(mut killer) = killers.remove(key)
-        {
-            slog!("[agents] Killing child process for {key} on stop");
-            let _ = killer.kill();
+        let worktree_path = {
+            let Ok(agents) = self.agents.lock() else {
+                return;
+            };
+            agents
+                .get(key)
+                .and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone()))
+        };
+
+        let Some(path) = worktree_path else {
+            slog_warn!(
+                "[agents] No worktree path recorded for '{key}'; \
+                 cannot SIGKILL via process_kill (no-op)"
+            );
+            return;
+        };
+
+        let pattern = path.display().to_string();
+        let pids = pids_matching(&pattern);
+        if pids.is_empty() {
+            slog!(
+                "[agents] No processes found in worktree {} for '{key}' on stop",
+                path.display()
+            );
+            return;
+        }
+        match sigkill_pids_and_verify(&pids) {
+            Ok(n) => slog!(
+                "[agents] SIGKILL'd {n} process(es) in worktree {} for '{key}' on stop",
+                path.display()
+            ),
+            Err(survivors) => slog_warn!(
+                "[agents] SIGKILL incomplete for '{key}' on stop: \
+                 pids still alive: {survivors:?}"
+            ),
        }
-    }
-
-    /// Test helper: inject a child killer into the registry.
-    #[cfg(test)]
-    pub fn inject_child_killer(
-        &self,
-        key: &str,
-        killer: Box<dyn portable_pty::ChildKiller + Send + Sync>,
-    ) {
-        let mut killers = self.child_killers.lock().unwrap();
-        killers.insert(key.to_string(), killer);
-    }
-
-    /// Test helper: return the number of registered child killers.
-    #[cfg(test)]
-    pub fn child_killer_count(&self) -> usize {
-        self.child_killers.lock().unwrap().len()
    }
 }

 #[cfg(test)]
 mod tests {
    use super::super::AgentPool;
-    use portable_pty::{CommandBuilder, PtySize, native_pty_system};
+    use crate::agents::AgentStatus;
    use std::process::Command;

    /// Returns true if a process with the given PID is currently running.
@@ -75,79 +127,100 @@ mod tests {
    #[test]
    fn kill_all_children_is_safe_on_empty_pool() {
        let pool = AgentPool::new_test(3001);
-        pool.kill_all_children();
-        assert_eq!(pool.child_killer_count(), 0);
+        pool.kill_all_children(); // must not panic
    }

+    /// AC 4 — `kill_child_for_key` SIGKILLs the single agent's process and
+    /// verifies it is gone within 2 s.  The sleeper has the worktree path in
+    /// its argv[0] so `pgrep -f` can locate it, mirroring how claude-code is
+    /// launched with `--directory <worktree>` in production.
    #[test]
-    fn kill_all_children_kills_real_process() {
-        let pool = AgentPool::new_test(3001);
+    fn kill_child_for_key_kills_real_process() {
+        use std::os::unix::process::CommandExt;

-        let pty_system = native_pty_system();
-        let pair = pty_system
-            .openpty(PtySize {
-                rows: 24,
-                cols: 80,
-                pixel_width: 0,
-                pixel_height: 0,
-            })
-            .expect("failed to open pty");
+        let pool = AgentPool::new_test(3002);
+        let tmp = tempfile::tempdir().unwrap();
+        let worktree = tmp.path();

-        let mut cmd = CommandBuilder::new("sleep");
-        cmd.arg("100");
-        let mut child = pair
-            .slave
-            .spawn_command(cmd)
-            .expect("failed to spawn sleep");
-        let pid = child.process_id().expect("no pid");
+        // argv[0] = worktree path → pgrep -f <path> finds this process.
+        let mut child = Command::new("sleep")
+            .arg0(worktree.to_string_lossy().as_ref())
+            .arg("100")
+            .spawn()
+            .expect("spawn sleeper");
+        let pid = child.id();

-        pool.inject_child_killer("story:agent", child.clone_killer());
+        // Give pgrep a moment to see the new process.
+        std::thread::sleep(std::time::Duration::from_millis(100));
+
+        pool.inject_test_agent_with_path(
+            "story-1090-kill",
+            "coder",
+            AgentStatus::Running,
+            worktree.to_path_buf(),
+        );

        assert!(
            process_is_running(pid),
-            "process {pid} should be running before kill_all_children"
+            "sleeper pid {pid} should be running before kill_child_for_key"
        );

-        pool.kill_all_children();
-        let _ = child.wait();
+        pool.kill_child_for_key("story-1090-kill:coder");
+        let _ = child.wait(); // reap zombie so ps -p returns false

        assert!(
            !process_is_running(pid),
-            "process {pid} should have been killed by kill_all_children"
+            "sleeper pid {pid} should be dead after kill_child_for_key"
        );
    }

+    /// AC 5 — `kill_all_children` SIGKILLs all agents' processes.  Two agents
+    /// with distinct worktree paths are injected; both must be gone after the call.
    #[test]
-    fn kill_all_children_clears_registry() {
-        let pool = AgentPool::new_test(3001);
+    fn kill_all_children_kills_multiple_real_processes() {
+        use std::os::unix::process::CommandExt;

-        let pty_system = native_pty_system();
-        let pair = pty_system
-            .openpty(PtySize {
-                rows: 24,
-                cols: 80,
-                pixel_width: 0,
-                pixel_height: 0,
+        let pool = AgentPool::new_test(3003);
+
+        let mut sleepers: Vec<(u32, std::process::Child, tempfile::TempDir)> = (0..2_u32)
+            .map(|i| {
+                let tmp = tempfile::tempdir().unwrap();
+                let worktree = tmp.path();
+                // argv[0] = worktree path for pgrep discoverability.
+                let child = Command::new("sleep")
+                    .arg0(worktree.to_string_lossy().as_ref())
+                    .arg("100")
+                    .spawn()
+                    .expect("spawn sleeper");
+                let pid = child.id();
+                pool.inject_test_agent_with_path(
+                    &format!("story-1090-all-{i}"),
+                    "coder",
+                    AgentStatus::Running,
+                    worktree.to_path_buf(),
+                );
+                (pid, child, tmp)
            })
-            .expect("failed to open pty");
+            .collect();

-        let mut cmd = CommandBuilder::new("sleep");
-        cmd.arg("1");
-        let mut child = pair
-            .slave
-            .spawn_command(cmd)
-            .expect("failed to spawn sleep");
+        // Give pgrep a moment to see the new processes.
+        std::thread::sleep(std::time::Duration::from_millis(100));

-        pool.inject_child_killer("story:agent", child.clone_killer());
-        assert_eq!(pool.child_killer_count(), 1);
+        for (pid, _, _) in &sleepers {
+            assert!(
+                process_is_running(*pid),
+                "pid {pid} should be running before kill_all_children"
+            );
+        }

        pool.kill_all_children();
-        let _ = child.wait();

-        assert_eq!(
-            pool.child_killer_count(),
-            0,
-            "child_killers should be cleared after kill_all_children"
-        );
+        for (pid, child, _tmp) in &mut sleepers {
+            let _ = child.wait(); // reap zombie
+            assert!(
+                !process_is_running(*pid),
+                "pid {pid} should be dead after kill_all_children"
+            );
+        }
    }
 }
@@ -271,6 +271,42 @@ impl AgentPool {
                     '{conflicting_name}' is already active at the same pipeline stage"
                ));
            }
+            // Cross-stage LLM agent guard: reject if any Coder/Qa/Mergemaster agent
+            // is already Running or Pending on this story at a *different* pipeline stage.
+            // These are stale agents left over from a previous stage transition that has
+            // since advanced. The periodic reconciler (reconcile_canonical_agents) stops
+            // them; here we surface the conflict so the caller waits for reconciliation.
+            if matches!(
+                resolved_stage,
+                PipelineStage::Coder | PipelineStage::Qa | PipelineStage::Mergemaster
+            ) && let Some(stale_name) = agents.iter().find_map(|(k, a)| {
+                let k_story = k.rsplit_once(':').map(|(s, _)| s).unwrap_or(k);
+                if k_story != story_id || a.agent_name == resolved_name {
+                    return None;
+                }
+                if !matches!(a.status, AgentStatus::Running | AgentStatus::Pending) {
+                    return None;
+                }
+                let a_stage = config
+                    .find_agent(&a.agent_name)
+                    .map(agent_config_stage)
+                    .unwrap_or_else(|| pipeline_stage(&a.agent_name));
+                if matches!(
+                    a_stage,
+                    PipelineStage::Coder | PipelineStage::Qa | PipelineStage::Mergemaster
+                ) && a_stage != resolved_stage
+                {
+                    Some(a.agent_name.clone())
+                } else {
+                    None
+                }
+            }) {
+                return Err(format!(
+                    "story '{story_id}' already has an active LLM agent '{stale_name}'; \
+                     refusing to spawn '{resolved_name}'"
+                ));
+            }
+
            // Enforce single-instance concurrency for explicitly-named agents:
            // if this agent is already running on any other story, reject.
            // Auto-selected agents are already guaranteed idle by
@@ -392,7 +428,6 @@ impl AgentPool {
            event_log.clone(),
            self.port,
            log_writer.clone(),
-            self.child_killers.clone(),
            self.watcher_tx.clone(),
            inactivity_timeout_secs,
            prior_events,
@@ -8,7 +8,6 @@ use std::collections::HashMap;
 use std::path::PathBuf;
 use std::sync::{Arc, Mutex};

-use portable_pty::ChildKiller;
 use tokio::sync::broadcast;

 use crate::agent_log::AgentLogWriter;
@@ -135,7 +134,6 @@ pub(super) async fn run_agent_spawn(
    event_log: Arc<Mutex<Vec<AgentEvent>>>,
    port: u16,
    log_writer: Option<Arc<Mutex<AgentLogWriter>>>,
-    child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
    watcher_tx: broadcast::Sender<WatcherEvent>,
    inactivity_timeout_secs: u64,
    // Formatted `<recent-events>` block drained from the previous session's
@@ -159,7 +157,6 @@ pub(super) async fn run_agent_spawn(
    let log_clone = event_log;
    let port_for_task = port;
    let log_writer_clone = log_writer;
-    let child_killers_clone = child_killers;
    let watcher_tx_clone = watcher_tx;
    let _ = inactivity_timeout_secs; // currently unused inside the closure body

@@ -371,8 +368,7 @@ pub(super) async fn run_agent_spawn(

    let run_result = match runtime_name {
        "claude-code" => {
-            let runtime =
-                ClaudeCodeRuntime::new(child_killers_clone.clone(), watcher_tx_clone.clone());
+            let runtime = ClaudeCodeRuntime::new(watcher_tx_clone.clone());
            let ctx = RuntimeContext {
                story_id: sid.clone(),
                agent_name: aname.clone(),
@@ -566,7 +562,6 @@ pub(super) async fn run_agent_spawn(
                        let pool = AgentPool {
                            agents: agents_for_respawn,
                            port: port_r,
-                            child_killers: Arc::new(Mutex::new(HashMap::new())),
                            watcher_tx: watcher_for_respawn,
                            status_broadcaster: Arc::new(
                                crate::service::status::StatusBroadcaster::new(),
@@ -654,7 +649,6 @@ pub(super) async fn run_agent_spawn(
                            let pool = AgentPool {
                                agents: agents_for_cd,
                                port: port_for_cd,
-                                child_killers: Arc::new(Mutex::new(HashMap::new())),
                                watcher_tx: watcher_for_cd,
                                status_broadcaster: Arc::new(
                                    crate::service::status::StatusBroadcaster::new(),
@@ -774,7 +768,6 @@ pub(super) async fn run_agent_spawn(
                        let pool = AgentPool {
                            agents: agents_for_cd,
                            port: port_for_cd,
-                            child_killers: Arc::new(Mutex::new(HashMap::new())),
                            watcher_tx: watcher_for_cd,
                            status_broadcaster: Arc::new(
                                crate::service::status::StatusBroadcaster::new(),
@@ -815,6 +808,7 @@ pub(super) async fn run_agent_spawn(
                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(
                        &sid,
                    ));
+                    crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(&sid));

                    // Remove agent from the pool and unblock any wait_for_agent callers.
                    let tx_done = {
@@ -862,7 +856,6 @@ pub(super) async fn run_agent_spawn(
                        let pool = AgentPool {
                            agents: agents_for_respawn,
                            port: port_r,
-                            child_killers: Arc::new(Mutex::new(HashMap::new())),
                            watcher_tx: watcher_for_respawn,
                            status_broadcaster: Arc::new(
                                crate::service::status::StatusBroadcaster::new(),
@@ -881,6 +874,17 @@ pub(super) async fn run_agent_spawn(
                    return;
                }

+                // AC1 (story 1089): mark forced exits so the commit-recovery
+                // stuck counter is not incremented for API errors, network
+                // failures, or Claude-API budget exhaustion.  A non-zero exit
+                // code means the CLI was forced out, not that it chose to stop.
+                if !result.exit_ok {
+                    crate::db::write_content(
+                        crate::db::ContentKey::CommitRecoveryForcedExit(&sid),
+                        "1",
+                    );
+                }
+
                // Server-owned completion: run acceptance gates automatically
                // when the agent process exits normally.
                super::super::pipeline::run_server_owned_completion(
@@ -1254,12 +1258,13 @@ mod tests {
                "abc123",
            );

-            // Rate-limit exit handler: reset all three counters (the fix).
+            // Rate-limit exit handler: reset all counters (the fix).
            crate::db::delete_content(crate::db::ContentKey::CommitRecoveryPending(story_id));
            crate::db::delete_content(crate::db::ContentKey::CommitRecoveryDiffFingerprint(
                story_id,
            ));
            crate::db::delete_content(crate::db::ContentKey::CommitRecoveryTotalAttempts(story_id));
+            crate::db::delete_content(crate::db::ContentKey::CommitRecoveryReadSet(story_id));

            // CommitRecoveryPending must be cleared after each rate-limit exit.
            assert!(
@@ -602,6 +602,266 @@ async fn start_agent_allows_correct_stage_agent() {
    }
 }

+// ── story-1100: cross-stage LLM agent rejection ─────────────────────────
+
+#[tokio::test]
+async fn start_agent_rejects_mergemaster_when_coder_running_same_story() {
+    use std::fs;
+
+    let tmp = tempfile::tempdir().unwrap();
+    let root = tmp.path();
+
+    let sk_dir = root.join(".huskies");
+    fs::create_dir_all(&sk_dir).unwrap();
+    fs::write(
+        sk_dir.join("project.toml"),
+        "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\
+         [[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n",
+    )
+    .unwrap();
+
+    let pool = AgentPool::new_test(3099);
+    pool.inject_test_agent("999_story_cross", "coder-1", AgentStatus::Running);
+
+    let result = pool
+        .start_agent(root, "999_story_cross", Some("mergemaster"), None, None)
+        .await;
+
+    assert!(
+        result.is_err(),
+        "mergemaster must be rejected when coder-1 is still running on same story"
+    );
+    let err = result.unwrap_err();
+    assert!(
+        err.contains("active LLM agent") || err.contains("stale agent"),
+        "error must mention active LLM agent conflict, got: '{err}'"
+    );
+}
+
+#[tokio::test]
+async fn start_agent_rejects_coder_when_mergemaster_running_same_story() {
+    use std::fs;
+
+    let tmp = tempfile::tempdir().unwrap();
+    let root = tmp.path();
+
+    let sk_dir = root.join(".huskies");
+    fs::create_dir_all(&sk_dir).unwrap();
+    fs::write(
+        sk_dir.join("project.toml"),
+        "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\
+         [[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n",
+    )
+    .unwrap();
+
+    let pool = AgentPool::new_test(3099);
+    pool.inject_test_agent("888_story_cross2", "mergemaster", AgentStatus::Running);
+
+    let result = pool
+        .start_agent(root, "888_story_cross2", Some("coder-1"), None, None)
+        .await;
+
+    assert!(
+        result.is_err(),
+        "coder-1 must be rejected when mergemaster is running on same story"
+    );
+    let err = result.unwrap_err();
+    assert!(
+        err.contains("active LLM agent") || err.contains("stale agent"),
+        "error must mention active LLM agent conflict, got: '{err}'"
+    );
+}
+
+#[tokio::test]
+async fn start_agent_cross_stage_does_not_block_different_stories() {
+    use std::fs;
+
+    let tmp = tempfile::tempdir().unwrap();
+    let root = tmp.path();
+
+    let sk_dir = root.join(".huskies");
+    fs::create_dir_all(sk_dir.join("work/1_backlog")).unwrap();
+    fs::write(
+        root.join(".huskies/project.toml"),
+        "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\
+         [[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n",
+    )
+    .unwrap();
+    fs::write(
+        root.join(".huskies/work/1_backlog/777_story_other.md"),
+        "---\nname: Other\n---\n",
+    )
+    .unwrap();
+
+    let pool = AgentPool::new_test(3099);
+    // mergemaster running on story-x should NOT block coder on story-y
+    pool.inject_test_agent("111_story_x", "mergemaster", AgentStatus::Running);
+
+    let result = pool
+        .start_agent(root, "777_story_other", Some("coder-1"), None, None)
+        .await;
+
+    if let Err(ref e) = result {
+        assert!(
+            !e.contains("active LLM agent") && !e.contains("stale agent"),
+            "cross-stage guard must not fire for agents on different stories, got: '{e}'"
+        );
+    }
+}
+
+#[tokio::test]
+async fn reconcile_canonical_agents_stops_stale_coder_in_qa_stage() {
+    use std::fs;
+
+    let tmp = tempfile::tempdir().unwrap();
+    let root = tmp.path();
+
+    let sk_dir = root.join(".huskies");
+    fs::create_dir_all(&sk_dir).unwrap();
+    fs::write(
+        sk_dir.join("project.toml"),
+        "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
+    )
+    .unwrap();
+
+    // Write story to CRDT in QA stage: canonical = Qa, but coder-1 is Running.
+    crate::db::ensure_content_store();
+    crate::db::write_item_with_content(
+        "777_story_reconcile",
+        "qa",
+        "---\nname: Reconcile Test\n---\n",
+        crate::db::ItemMeta::named("Reconcile Test"),
+    );
+
+    let pool = AgentPool::new_test(3099);
+    pool.inject_test_agent("777_story_reconcile", "coder-1", AgentStatus::Running);
+
+    let before = pool.list_agents().unwrap();
+    assert!(
+        before.iter().any(|a| a.agent_name == "coder-1"
+            && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)),
+        "coder-1 should be Running before reconciliation"
+    );
+
+    pool.reconcile_canonical_agents(root).await;
+
+    let after = pool.list_agents().unwrap();
+    let still_active = after.iter().any(|a| {
+        a.story_id == "777_story_reconcile"
+            && a.agent_name == "coder-1"
+            && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
+    });
+    assert!(
+        !still_active,
+        "reconciler must have stopped coder-1 (CRDT stage is QA, coder is wrong stage)"
+    );
+}
+
+#[tokio::test]
+async fn reconcile_canonical_agents_leaves_correct_stage_agent_alone() {
+    use std::fs;
+
+    let tmp = tempfile::tempdir().unwrap();
+    let root = tmp.path();
+
+    let sk_dir = root.join(".huskies");
+    fs::create_dir_all(&sk_dir).unwrap();
+    fs::write(
+        sk_dir.join("project.toml"),
+        "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n",
+    )
+    .unwrap();
+
+    // Story is in coding stage: canonical = Coder. coder-1 is correct.
+    crate::db::ensure_content_store();
+    crate::db::write_item_with_content(
+        "555_story_correct",
+        "coding",
+        "---\nname: Correct Stage\n---\n",
+        crate::db::ItemMeta::named("Correct Stage"),
+    );
+
+    let pool = AgentPool::new_test(3099);
+    pool.inject_test_agent("555_story_correct", "coder-1", AgentStatus::Running);
+
+    pool.reconcile_canonical_agents(root).await;
+
+    let after = pool.list_agents().unwrap();
+    let still_active = after.iter().any(|a| {
+        a.story_id == "555_story_correct"
+            && a.agent_name == "coder-1"
+            && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
+    });
+    assert!(
+        still_active,
+        "reconciler must NOT stop coder-1 when it matches the canonical stage"
+    );
+}
+
+/// Regression test for story 1100: a stale coder left running after a stage
+/// transition blocks both a same-stage coder and a cross-stage mergemaster.
+/// The periodic reconciler stops the stale coder, after which the pool no
+/// longer has a cross-stage conflict.
+#[tokio::test]
+async fn regression_1100_stale_coder_blocks_mergemaster_then_reconciler_clears() {
+    use std::fs;
+
+    let tmp = tempfile::tempdir().unwrap();
+    let root = tmp.path();
+
+    let sk_dir = root.join(".huskies");
+    fs::create_dir_all(&sk_dir).unwrap();
+    fs::write(
+        sk_dir.join("project.toml"),
+        "[[agent]]\nname = \"coder-1\"\nstage = \"coder\"\n\n\
+         [[agent]]\nname = \"coder-2\"\nstage = \"coder\"\n\n\
+         [[agent]]\nname = \"mergemaster\"\nstage = \"mergemaster\"\n",
+    )
+    .unwrap();
+
+    let pool = AgentPool::new_test(3099);
+    // Simulate coder-1 still Running after the story advanced past the coding stage.
+    pool.inject_test_agent("1100_reg", "coder-1", AgentStatus::Running);
+
+    // coder-2 blocked by same-stage check (both are Coder stage)
+    let r1 = pool
+        .start_agent(root, "1100_reg", Some("coder-2"), None, None)
+        .await;
+    assert!(r1.is_err(), "coder-2 must be rejected by same-stage guard");
+    assert!(
+        r1.unwrap_err().contains("same pipeline stage"),
+        "same-stage check must fire for coder-2"
+    );
+
+    // mergemaster blocked by cross-stage LLM guard (coder-1 is a different LLM stage)
+    let r2 = pool
+        .start_agent(root, "1100_reg", Some("mergemaster"), None, None)
+        .await;
+    assert!(
+        r2.is_err(),
+        "mergemaster must be rejected because coder-1 (different LLM stage) is still running"
+    );
+    let r2_err = r2.unwrap_err();
+    assert!(
+        r2_err.contains("active LLM agent") || r2_err.contains("stale agent"),
+        "cross-stage rejection expected, got: '{r2_err}'"
+    );
+
+    // Reconciler: story "1100_reg" has no CRDT entry → canonical = None → stop coder-1.
+    pool.reconcile_canonical_agents(root).await;
+
+    // coder-1 must be gone from the active pool.
+    let remaining = pool.list_agents().unwrap();
+    assert!(
+        !remaining.iter().any(|a| {
+            a.story_id == "1100_reg"
+                && a.agent_name == "coder-1"
+                && matches!(a.status, AgentStatus::Running | AgentStatus::Pending)
+        }),
+        "reconciler must have removed stale coder-1 from the active pool"
+    );
+}
+
 /// Bug 502: when start_agent is called for a non-Coder agent (mergemaster
 /// or qa) on a story that's in 4_merge/, the unconditional
 /// move_story_to_current at the top of start_agent must NOT fire — even
@@ -2,11 +2,11 @@

 use std::path::Path;

-use crate::config::ProjectConfig;
-use crate::pipeline_state::Stage;
-
-use super::super::super::{PipelineStage, agent_config_stage, pipeline_stage};
+use super::super::super::{
+    PipelineStage, agent_config_stage, canonical_pipeline_stage, pipeline_stage,
+};
 use super::super::worktree::find_active_story_stage;
+use crate::config::ProjectConfig;

 /// Validate that an explicit `agent_name` is allowed to attach to `story_id`'s
 /// current pipeline stage.
@@ -34,16 +34,15 @@ pub(super) fn validate_agent_stage(
    let Some(story_stage) = find_active_story_stage(project_root, story_id) else {
        return Ok(());
    };
-    let expected_stage = match story_stage {
-        Stage::Coding { .. } => PipelineStage::Coder,
-        Stage::Qa => PipelineStage::Qa,
-        Stage::Merge { .. } => PipelineStage::Mergemaster,
-        _ => PipelineStage::Other,
-    };
-    if expected_stage != PipelineStage::Other && expected_stage != agent_stage {
+    let canonical = canonical_pipeline_stage(&story_stage);
+    let is_llm = matches!(
+        agent_stage,
+        PipelineStage::Coder | PipelineStage::Qa | PipelineStage::Mergemaster
+    );
+    if is_llm && (canonical.is_none() || canonical.as_ref() != Some(&agent_stage)) {
        return Err(format!(
            "Agent '{name}' (stage: {agent_stage:?}) cannot be assigned to \
-             story '{story_id}' in {}/ (requires stage: {expected_stage:?})",
+             story '{story_id}' in {}/ (requires stage: {canonical:?})",
            story_stage.dir_name()
        ));
    }
@@ -5,7 +5,10 @@ use crate::slog_error;
 use crate::slog_warn;
 use std::path::Path;

-use super::super::{AgentEvent, AgentStatus};
+use super::super::{
+    AgentEvent, AgentStatus, PipelineStage, agent_config_stage, canonical_pipeline_stage,
+    pipeline_stage,
+};
 use super::AgentPool;
 use super::types::composite_key;

@@ -71,8 +74,7 @@ impl AgentPool {
            self.kill_child_for_key(&key);
        }

-        // Step 3: now safe to mutate.  Status flip, handle abort, drop the
-        // child_killers entry.
+        // Step 3: now safe to mutate.  Status flip and handle abort.
        let (task_handle, tx) = {
            let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
            let agent = agents
@@ -88,9 +90,6 @@ impl AgentPool {
            handle.abort();
            let _ = handle.await;
        }
-        if let Ok(mut killers) = self.child_killers.lock() {
-            killers.remove(&key);
-        }

        // Preserve worktree for inspection — don't destroy agent's work on stop.
        if let Some(ref wt) = worktree_info {
@@ -118,6 +117,82 @@ impl AgentPool {
        Ok(())
    }

+    /// Stop LLM agents whose pipeline stage no longer matches the story's canonical stage.
+    ///
+    /// Called periodically by the tick loop (story 1100).  For each Running or Pending
+    /// LLM agent (Coder, Qa, or Mergemaster) whose stage does not match the canonical
+    /// stage derived from the story's current CRDT state, the agent is stopped via the
+    /// existing SIGKILL path.  Idempotent: agents already at the correct stage are left
+    /// untouched.  Also stops LLM agents on stories that have no active pipeline stage
+    /// (terminal, blocked, or frozen), since no LLM agent should run there.
+    pub async fn reconcile_canonical_agents(&self, root: &std::path::Path) {
+        use crate::config::ProjectConfig;
+
+        let config = match ProjectConfig::load(root) {
+            Ok(c) => c,
+            Err(e) => {
+                slog_warn!("[reconcile] Cannot load config for canonical reconcile: {e}");
+                return;
+            }
+        };
+
+        // Snapshot active LLM agents without holding the lock during async stops.
+        let snapshot: Vec<(String, String, PipelineStage)> = {
+            let Ok(agents) = self.agents.lock() else {
+                return;
+            };
+            agents
+                .iter()
+                .filter_map(|(key, a)| {
+                    if !matches!(a.status, AgentStatus::Running | AgentStatus::Pending) {
+                        return None;
+                    }
+                    let stage = config
+                        .find_agent(&a.agent_name)
+                        .map(agent_config_stage)
+                        .unwrap_or_else(|| pipeline_stage(&a.agent_name));
+                    if !matches!(
+                        stage,
+                        PipelineStage::Coder | PipelineStage::Qa | PipelineStage::Mergemaster
+                    ) {
+                        return None;
+                    }
+                    let story_id = key
+                        .rsplit_once(':')
+                        .map(|(s, _)| s)
+                        .unwrap_or(key)
+                        .to_string();
+                    Some((story_id, a.agent_name.clone(), stage))
+                })
+                .collect()
+        };
+
+        for (story_id, agent_name, agent_stage) in snapshot {
+            let canonical = crate::pipeline_state::read_typed(&story_id)
+                .ok()
+                .flatten()
+                .and_then(|item| canonical_pipeline_stage(&item.stage));
+
+            let should_stop = match &canonical {
+                None => true,
+                Some(c) if *c != agent_stage => true,
+                _ => false,
+            };
+
+            if !should_stop {
+                continue;
+            }
+
+            slog!(
+                "[reconcile] stopping '{agent_name}' on '{story_id}': \
+                 canonical={canonical:?} actual={agent_stage:?}"
+            );
+            if let Err(e) = self.stop_agent(root, &story_id, &agent_name).await {
+                slog_warn!("[reconcile] failed to stop '{agent_name}' on '{story_id}': {e}");
+            }
+        }
+    }
+
    /// Remove all agent entries for a given story_id from the pool.
    ///
    /// Called when a story is archived so that stale entries don't accumulate.
@@ -33,6 +33,8 @@ pub(super) fn find_active_story_stage(
            crate::pipeline_state::Stage::Coding { .. }
                | crate::pipeline_state::Stage::Qa
                | crate::pipeline_state::Stage::Merge { .. }
+                | crate::pipeline_state::Stage::MergeFailure { .. }
+                | crate::pipeline_state::Stage::MergeFailureFinal { .. }
        )
    {
        return Some(item.stage);
@@ -13,7 +13,6 @@ mod tests {
    use super::*;
    use crate::agents::AgentEvent;
    use crate::io::watcher::WatcherEvent;
-    use std::collections::HashMap;
    use std::sync::{Arc, Mutex};
    use tokio::sync::broadcast;

@@ -41,7 +40,6 @@ mod tests {
        let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
        let (watcher_tx, mut watcher_rx) = broadcast::channel::<WatcherEvent>(16);
        let event_log = Arc::new(Mutex::new(Vec::new()));
-        let child_killers = Arc::new(Mutex::new(HashMap::new()));

        // sh -p "--" <script>: -p = privileged mode, "--" = end options,
        // then the script path is the file operand.
@@ -56,7 +54,6 @@ mod tests {
            &event_log,
            None,
            0,
-            child_killers,
            watcher_tx,
            None,
            None,
@@ -98,7 +95,6 @@ mod tests {
        let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
        let (watcher_tx, mut watcher_rx) = broadcast::channel::<WatcherEvent>(16);
        let event_log = Arc::new(Mutex::new(Vec::new()));
-        let child_killers = Arc::new(Mutex::new(HashMap::new()));

        let result = run_agent_pty_streaming(
            "423_story_rate_limit",
@@ -111,7 +107,6 @@ mod tests {
            &event_log,
            None,
            0,
-            child_killers,
            watcher_tx,
            None,
            None,
@@ -160,7 +155,6 @@ mod tests {
        let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
        let (watcher_tx, mut watcher_rx) = broadcast::channel::<WatcherEvent>(16);
        let event_log = Arc::new(Mutex::new(Vec::new()));
-        let child_killers = Arc::new(Mutex::new(HashMap::new()));

        let before = chrono::Utc::now();
        let result = run_agent_pty_streaming(
@@ -174,7 +168,6 @@ mod tests {
            &event_log,
            None,
            0,
-            child_killers,
            watcher_tx,
            None,
            None,
@@ -229,7 +222,6 @@ mod tests {
        let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
        let (watcher_tx, _watcher_rx) = broadcast::channel::<WatcherEvent>(16);
        let event_log = Arc::new(Mutex::new(Vec::new()));
-        let child_killers = Arc::new(Mutex::new(HashMap::new()));

        let result = run_agent_pty_streaming(
            "916_story_rate_limit_extension",
@@ -242,7 +234,6 @@ mod tests {
            &event_log,
            None,
            1, // inactivity_timeout_secs = 1s; would expire before the 3s sleep without the extension
-            child_killers,
            watcher_tx,
            None,
            None,
@@ -407,18 +398,16 @@ mod tests {
        let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
        let (watcher_tx, _watcher_rx) = broadcast::channel::<WatcherEvent>(16);
        let event_log = Arc::new(Mutex::new(Vec::new()));
-        let child_killers: Arc<
-            Mutex<HashMap<String, Box<dyn portable_pty::ChildKiller + Send + Sync>>>,
-        > = Arc::new(Mutex::new(HashMap::new()));
-        let child_killers_for_kill = Arc::clone(&child_killers);

        // Spawn a task to kill the child after a short delay (simulating watchdog).
+        // Uses pids_matching on the script path — same mechanism as the production
+        // watchdog after the process_kill migration (story 1090).
+        let script_path_for_kill = script.to_string_lossy().to_string();
        tokio::spawn(async move {
            tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
-            if let Ok(mut killers) = child_killers_for_kill.lock() {
-                for (_, killer) in killers.iter_mut() {
-                    let _ = killer.kill();
-                }
+            let pids = crate::process_kill::pids_matching(&script_path_for_kill);
+            if !pids.is_empty() {
+                let _ = crate::process_kill::sigkill_pids_and_verify(&pids);
            }
        });

@@ -435,7 +424,6 @@ mod tests {
            &event_log,
            None,
            0, // no inactivity timeout
-            child_killers,
            watcher_tx,
            None, // no session to resume
            Some((project_root.clone(), "sonnet".to_string())),
@@ -457,4 +445,62 @@ mod tests {
             the respawn's lookup_session returns it (warm), not None (cold)"
        );
    }
+
+    // ── bug 1103: soft rate-limit warning (status=allowed) must NOT set rate_limit_exit ──
+
+    /// Regression: a `rate_limit_event` with `status="allowed"` is a soft
+    /// warning — the request was permitted.  The session that follows should
+    /// complete normally and report `rate_limit_exit == false`, not trigger the
+    /// rate-limit respawn path in the spawn handler.
+    #[tokio::test]
+    async fn rate_limit_allowed_status_does_not_set_rate_limit_exit() {
+        use std::os::unix::fs::PermissionsExt;
+
+        let tmp = tempfile::tempdir().unwrap();
+        let script = tmp.path().join("emit_allowed_then_exit.sh");
+        // Emit status="allowed" (soft warning), then exit cleanly.
+        std::fs::write(
+            &script,
+            "#!/bin/sh\nprintf '%s\\n' '{\"type\":\"rate_limit_event\",\"rate_limit_info\":{\"status\":\"allowed\",\"reset_at\":\"2099-01-01T12:00:00Z\"}}'\n",
+        )
+        .unwrap();
+        std::fs::set_permissions(&script, std::fs::Permissions::from_mode(0o755)).unwrap();
+
+        let (tx, _rx) = broadcast::channel::<AgentEvent>(64);
+        let (watcher_tx, mut watcher_rx) = broadcast::channel::<WatcherEvent>(16);
+        let event_log = Arc::new(Mutex::new(Vec::new()));
+
+        let result = run_agent_pty_streaming(
+            "1103_soft_warning_no_exit_flag",
+            "coder-1",
+            "sh",
+            &[script.to_string_lossy().to_string()],
+            "--",
+            "/tmp",
+            &tx,
+            &event_log,
+            None,
+            0,
+            watcher_tx,
+            None,
+            None,
+        )
+        .await;
+
+        let pty = result.expect("PTY run should succeed");
+        assert!(
+            !pty.rate_limit_exit,
+            "rate_limit_exit must be false for a soft 'allowed' warning; \
+             only genuine hard blocks (rejected) should set it"
+        );
+
+        // Watcher must have received RateLimitWarning, not RateLimitHardBlock.
+        let evt = watcher_rx
+            .try_recv()
+            .expect("Expected a RateLimitWarning watcher event");
+        assert!(
+            matches!(evt, WatcherEvent::RateLimitWarning { .. }),
+            "Expected RateLimitWarning for status=allowed, got: {evt:?}"
+        );
+    }
 }
@@ -1,10 +1,9 @@
 //! PTY process spawning and output loop: builds the command, drives the reader thread,
 //! and dispatches parsed JSON events to the broadcast channel.
-use std::collections::HashMap;
 use std::io::{BufRead, BufReader};
 use std::sync::{Arc, Mutex};

-use portable_pty::{ChildKiller, CommandBuilder, PtySize, native_pty_system};
+use portable_pty::{CommandBuilder, PtySize, native_pty_system};
 use tokio::sync::broadcast;

 use crate::agent_log::AgentLogWriter;
@@ -14,7 +13,7 @@ use crate::slog;
 use crate::slog_warn;

 use super::events::{emit_event, handle_agent_stream_event};
-use super::types::{ChildKillerGuard, PtyResult, composite_key};
+use super::types::PtyResult;

 /// Spawn claude agent in a PTY and stream events through the broadcast channel.
 ///
@@ -55,7 +54,6 @@ pub(in crate::agents) async fn run_agent_pty_streaming(
    event_log: &Arc<Mutex<Vec<AgentEvent>>>,
    log_writer: Option<Arc<Mutex<AgentLogWriter>>>,
    inactivity_timeout_secs: u64,
-    child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
    watcher_tx: broadcast::Sender<WatcherEvent>,
    session_id_to_resume: Option<&str>,
    eager_record: Option<(std::path::PathBuf, String)>,
@@ -82,7 +80,6 @@ pub(in crate::agents) async fn run_agent_pty_streaming(
            &event_log,
            log_writer.as_deref(),
            inactivity_timeout_secs,
-            &child_killers,
            &watcher_tx,
            resume_sid.as_deref(),
            eager_record,
@@ -104,7 +101,6 @@ fn run_agent_pty_blocking(
    event_log: &Mutex<Vec<AgentEvent>>,
    log_writer: Option<&Mutex<AgentLogWriter>>,
    inactivity_timeout_secs: u64,
-    child_killers: &Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
    watcher_tx: &broadcast::Sender<WatcherEvent>,
    session_id_to_resume: Option<&str>,
    eager_record: Option<(std::path::PathBuf, String)>,
@@ -204,21 +200,6 @@ fn run_agent_pty_blocking(
        .spawn_command(cmd)
        .map_err(|e| format!("Failed to spawn agent for {story_id}:{agent_name}: {e}"))?;

-    // Register the child killer so that kill_all_children() / stop_agent() can
-    // terminate this process on server shutdown, even if the blocking thread
-    // cannot be interrupted.  The ChildKillerGuard deregisters on function exit.
-    let killer_key = composite_key(story_id, agent_name);
-    {
-        let killer = child.clone_killer();
-        if let Ok(mut killers) = child_killers.lock() {
-            killers.insert(killer_key.clone(), killer);
-        }
-    }
-    let _killer_guard = ChildKillerGuard {
-        killers: Arc::clone(child_killers),
-        key: killer_key,
-    };
-
    drop(pair.slave);

    let reader = pair
@@ -366,7 +347,11 @@ fn run_agent_pty_blocking(
                    .and_then(|i| i.get("status"))
                    .and_then(|s| s.as_str())
                    .unwrap_or("");
-                let is_hard_block = !status.is_empty() && status != "allowed_warning";
+                // "allowed" and "allowed_warning" are soft warnings — the request was
+                // permitted; only statuses that actually block the request (e.g. "rejected")
+                // are genuine hard blocks that warrant a rate-limit exit respawn.
+                let is_hard_block =
+                    !status.is_empty() && status != "allowed" && status != "allowed_warning";
                let reset_at = rate_limit_info
                    .and_then(|i| i.get("reset_at"))
                    .and_then(|r| r.as_str())
@@ -1,9 +1,4 @@
 //! Core types for the PTY runner: result container and process lifecycle helpers.
-use std::collections::HashMap;
-use std::sync::{Arc, Mutex};
-
-use portable_pty::ChildKiller;
-
 use crate::agents::TokenUsage;

 /// Result from a PTY agent session, containing the session ID and token usage.
@@ -23,20 +18,3 @@ pub(in crate::agents) struct PtyResult {
    /// event was seen or when the `reset_at` field was absent from the event.
    pub rate_limit_reset_at: Option<chrono::DateTime<chrono::Utc>>,
 }
-
-pub(super) fn composite_key(story_id: &str, agent_name: &str) -> String {
-    format!("{story_id}:{agent_name}")
-}
-
-pub(super) struct ChildKillerGuard {
-    pub killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
-    pub key: String,
-}
-
-impl Drop for ChildKillerGuard {
-    fn drop(&mut self) {
-        if let Ok(mut killers) = self.killers.lock() {
-            killers.remove(&self.key);
-        }
-    }
-}
@@ -1,8 +1,6 @@
 //! Claude Code runtime — launches Claude Code CLI sessions as agent backends.
-use std::collections::HashMap;
 use std::sync::{Arc, Mutex};

-use portable_pty::ChildKiller;
 use tokio::sync::broadcast;

 use crate::agent_log::AgentLogWriter;
@@ -17,20 +15,13 @@ use super::{AgentEvent, AgentRuntime, RuntimeContext, RuntimeResult, RuntimeStat
 /// It wraps the existing PTY-based execution logic, preserving all streaming,
 /// token tracking, and inactivity timeout behaviour.
 pub struct ClaudeCodeRuntime {
-    child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
    watcher_tx: broadcast::Sender<WatcherEvent>,
 }

 impl ClaudeCodeRuntime {
-    /// Create a new Claude Code runtime with shared child-killer registry and event channel.
-    pub fn new(
-        child_killers: Arc<Mutex<HashMap<String, Box<dyn ChildKiller + Send + Sync>>>>,
-        watcher_tx: broadcast::Sender<WatcherEvent>,
-    ) -> Self {
-        Self {
-            child_killers,
-            watcher_tx,
-        }
+    /// Create a new Claude Code runtime with a shared event channel.
+    pub fn new(watcher_tx: broadcast::Sender<WatcherEvent>) -> Self {
+        Self { watcher_tx }
    }
 }

@@ -57,7 +48,6 @@ impl AgentRuntime for ClaudeCodeRuntime {
            &event_log,
            log_writer.clone(),
            ctx.inactivity_timeout_secs,
-            Arc::clone(&self.child_killers),
            self.watcher_tx.clone(),
            ctx.session_id_to_resume.as_deref(),
            eager_record.clone(),
@@ -69,6 +59,7 @@ impl AgentRuntime for ClaudeCodeRuntime {
                // Abort+no-session: CLI crashed (e.g. SIGABRT) before emitting its
                // first "system" event.  Detected by: non-zero exit AND no session.
                aborted_signal: !result.exit_ok && result.session_id.is_none(),
+                exit_ok: result.exit_ok,
                session_id: result.session_id,
                token_usage: result.token_usage,
                rate_limit_exit: result.rate_limit_exit,
@@ -94,7 +85,6 @@ impl AgentRuntime for ClaudeCodeRuntime {
                    &event_log,
                    log_writer,
                    ctx.inactivity_timeout_secs,
-                    Arc::clone(&self.child_killers),
                    self.watcher_tx.clone(),
                    None, // no --resume on fallback
                    eager_record,
@@ -103,6 +93,7 @@ impl AgentRuntime for ClaudeCodeRuntime {
                Ok(RuntimeResult {
                    aborted_signal: !fallback_result.exit_ok
                        && fallback_result.session_id.is_none(),
+                    exit_ok: fallback_result.exit_ok,
                    session_id: fallback_result.session_id,
                    token_usage: fallback_result.token_usage,
                    rate_limit_exit: fallback_result.rate_limit_exit,
@@ -115,7 +106,6 @@ impl AgentRuntime for ClaudeCodeRuntime {

    fn stop(&self) {
        // Stopping is handled externally by the pool via kill_child_for_key().
-        // The ChildKillerGuard in pty.rs deregisters automatically on process exit.
    }

    fn get_status(&self) -> RuntimeStatus {
@@ -135,6 +135,7 @@ impl AgentRuntime for GeminiRuntime {
                return Ok(RuntimeResult {
                    session_id: None,
                    token_usage: Some(total_usage),
+                    exit_ok: true,
                    aborted_signal: false,
                    rate_limit_exit: false,
                    rate_limit_reset_at: None,
@@ -151,6 +152,7 @@ impl AgentRuntime for GeminiRuntime {
                return Ok(RuntimeResult {
                    session_id: None,
                    token_usage: Some(total_usage),
+                    exit_ok: true,
                    aborted_signal: false,
                    rate_limit_exit: false,
                    rate_limit_reset_at: None,
@@ -254,6 +256,7 @@ impl AgentRuntime for GeminiRuntime {
                return Ok(RuntimeResult {
                    session_id: None,
                    token_usage: Some(total_usage),
+                    exit_ok: true,
                    aborted_signal: false,
                    rate_limit_exit: false,
                    rate_limit_reset_at: None,
@@ -339,6 +342,7 @@ impl AgentRuntime for GeminiRuntime {
        Ok(RuntimeResult {
            session_id: None,
            token_usage: Some(total_usage),
+            exit_ok: true,
            aborted_signal: false,
            rate_limit_exit: false,
            rate_limit_reset_at: None,
@@ -55,6 +55,12 @@ pub struct RuntimeContext {
 pub struct RuntimeResult {
    pub session_id: Option<String>,
    pub token_usage: Option<TokenUsage>,
+    /// `true` when the process exited with exit code 0; `false` for non-zero exits
+    /// (API errors, network failures, or Claude-API-level budget exhaustion).  Always
+    /// `true` for API-based runtimes (OpenAI, Gemini) which have no exit-code concept.
+    /// Used by the commit-recovery path to skip the stuck-respawn counter for forced
+    /// exits (story 1089, AC1).
+    pub exit_ok: bool,
    /// `true` when the process exited with a failure AND no session was established.
    ///
    /// This indicates the Claude Code CLI crashed (e.g. SIGABRT from an assertion
@@ -169,6 +175,7 @@ mod tests {
                cache_read_input_tokens: 0,
                total_cost_usd: 0.01,
            }),
+            exit_ok: true,
            aborted_signal: false,
            rate_limit_exit: false,
            rate_limit_reset_at: None,
@@ -186,6 +193,7 @@ mod tests {
        let result = RuntimeResult {
            session_id: None,
            token_usage: None,
+            exit_ok: true,
            aborted_signal: false,
            rate_limit_exit: false,
            rate_limit_reset_at: None,
@@ -204,20 +212,16 @@ mod tests {
    #[test]
    fn claude_code_runtime_get_status_returns_idle() {
        use crate::io::watcher::WatcherEvent;
-        use std::collections::HashMap;
-        let killers = Arc::new(Mutex::new(HashMap::new()));
        let (watcher_tx, _) = broadcast::channel::<WatcherEvent>(16);
-        let runtime = ClaudeCodeRuntime::new(killers, watcher_tx);
+        let runtime = ClaudeCodeRuntime::new(watcher_tx);
        assert_eq!(runtime.get_status(), RuntimeStatus::Idle);
    }

    #[test]
    fn claude_code_runtime_stream_events_empty() {
        use crate::io::watcher::WatcherEvent;
-        use std::collections::HashMap;
-        let killers = Arc::new(Mutex::new(HashMap::new()));
        let (watcher_tx, _) = broadcast::channel::<WatcherEvent>(16);
-        let runtime = ClaudeCodeRuntime::new(killers, watcher_tx);
+        let runtime = ClaudeCodeRuntime::new(watcher_tx);
        assert!(runtime.stream_events().is_empty());
    }
 }
@@ -122,6 +122,7 @@ impl AgentRuntime for OpenAiRuntime {
                return Ok(RuntimeResult {
                    session_id: None,
                    token_usage: Some(total_usage),
+                    exit_ok: true,
                    aborted_signal: false,
                    rate_limit_exit: false,
                    rate_limit_reset_at: None,
@@ -138,6 +139,7 @@ impl AgentRuntime for OpenAiRuntime {
                return Ok(RuntimeResult {
                    session_id: None,
                    token_usage: Some(total_usage),
+                    exit_ok: true,
                    aborted_signal: false,
                    rate_limit_exit: false,
                    rate_limit_reset_at: None,
@@ -224,6 +226,7 @@ impl AgentRuntime for OpenAiRuntime {
                return Ok(RuntimeResult {
                    session_id: None,
                    token_usage: Some(total_usage),
+                    exit_ok: true,
                    aborted_signal: false,
                    rate_limit_exit: false,
                    rate_limit_reset_at: None,
@@ -0,0 +1,367 @@
+//! Protocol-agnostic chat dispatcher — coalesce window + per-session serial lock.
+//!
+//! Sits between every inbound transport (Matrix, Slack, WhatsApp, …) and the
+//! `claude -p` spawner.  Transport handlers call [`ChatDispatcher::submit`]
+//! instead of spawning directly; the dispatcher enforces two invariants:
+//!
+//! 1. **Coalesce window**: messages arriving for the same session within
+//!    `coalesce_ms` of each other are concatenated and delivered to a single
+//!    spawn.  The window is a *debounce*: each new message extends the window by
+//!    `coalesce_ms` from its arrival time, so bursts flush as one batch.
+//!
+//! 2. **Per-session serial lock**: while one `claude -p` run is active, further
+//!    messages for that session queue up and are dispatched as a single batch
+//!    once the running invocation completes.
+//!
+//! A [`ChatDispatcher::stop`] call cancels the active run for a session and
+//! discards the pending queue.
+
+use crate::slog;
+use std::collections::HashMap;
+use std::pin::Pin;
+use std::sync::{Arc, Mutex};
+use std::time::Duration;
+use tokio::sync::{mpsc, watch};
+
+/// A factory function that produces one LLM execution future per dispatch.
+///
+/// Arguments:
+/// - `String` — the (possibly concatenated) prompt to send to `claude -p`.
+/// - `watch::Receiver<bool>` — send `true` on this channel to cancel the run.
+///
+/// Returns a boxed, pinned `Send + 'static` future that resolves when the LLM
+/// session ends (whether normally or via cancellation).
+pub type SpawnFn = Arc<
+    dyn Fn(
+            String,
+            watch::Receiver<bool>,
+        ) -> Pin<Box<dyn std::future::Future<Output = ()> + Send + 'static>>
+        + Send
+        + Sync,
+>;
+
+enum SessionMsg {
+    UserMessage { text: String, factory: SpawnFn },
+    Stop,
+}
+
+struct SessionHandle {
+    tx: mpsc::UnboundedSender<SessionMsg>,
+}
+
+/// Coalescing, serialising dispatcher for chat-to-LLM message routing.
+///
+/// Construct once at startup via [`ChatDispatcher::new`] and share via `Arc`.
+/// Call [`submit`](ChatDispatcher::submit) from every transport handler instead
+/// of spawning `claude -p` directly.
+pub struct ChatDispatcher {
+    sessions: Mutex<HashMap<String, SessionHandle>>,
+    coalesce_ms: u64,
+}
+
+impl ChatDispatcher {
+    /// Create a new dispatcher with the given coalesce window in milliseconds.
+    pub fn new(coalesce_ms: u64) -> Self {
+        Self {
+            sessions: Mutex::new(HashMap::new()),
+            coalesce_ms,
+        }
+    }
+
+    /// Submit a message for a chat session.
+    ///
+    /// If no session task exists for `session_key`, one is created lazily.
+    /// The `factory` is called by the session task when the coalesce window
+    /// closes (or immediately after the current run finishes, for pending
+    /// messages).
+    pub fn submit(&self, session_key: String, message: String, factory: SpawnFn) {
+        let mut guard = self.sessions.lock().unwrap();
+        let coalesce_ms = self.coalesce_ms;
+        let handle = guard.entry(session_key.clone()).or_insert_with(|| {
+            let (tx, rx) = mpsc::unbounded_channel();
+            tokio::spawn(session_task(session_key.clone(), rx, coalesce_ms));
+            SessionHandle { tx }
+        });
+        let _ = handle.tx.send(SessionMsg::UserMessage {
+            text: message,
+            factory,
+        });
+    }
+
+    /// Stop the active LLM run for `session_key` and clear its pending queue.
+    ///
+    /// Returns `true` if the session existed (whether or not anything was
+    /// actually running), `false` if no session for that key has been created.
+    pub fn stop(&self, session_key: &str) -> bool {
+        let guard = self.sessions.lock().unwrap();
+        if let Some(handle) = guard.get(session_key) {
+            let _ = handle.tx.send(SessionMsg::Stop);
+            true
+        } else {
+            false
+        }
+    }
+}
+
+/// Per-session background task.
+///
+/// Phases:
+/// 1. **Wait** — blocks until the first `UserMessage` arrives.
+/// 2. **Coalesce** — extends the window by `coalesce_ms` on each new message;
+///    fires when no message arrives within the window.
+/// 3. **Run** — calls the factory with the concatenated batch; while running,
+///    collects further `UserMessage`s into a pending list and logs a warn per
+///    message.  A `Stop` message cancels the running call and clears pending.
+/// 4. **Drain** — after the run, if pending is non-empty, fires a second run
+///    with the accumulated batch and loops back to step 3.
+/// 5. Returns to step 1 when pending is empty.
+async fn session_task(
+    session_key: String,
+    mut rx: mpsc::UnboundedReceiver<SessionMsg>,
+    coalesce_ms: u64,
+) {
+    let coalesce_dur = Duration::from_millis(coalesce_ms);
+
+    loop {
+        // ── Phase 1: wait for the first message ─────────────────────────────
+        let (first_text, first_factory) = loop {
+            match rx.recv().await {
+                None => return,
+                Some(SessionMsg::Stop) => continue,
+                Some(SessionMsg::UserMessage { text, factory }) => break (text, factory),
+            }
+        };
+
+        // ── Phase 2: coalesce window (debounce) ──────────────────────────────
+        let mut batch: Vec<String> = vec![first_text];
+        let mut latest_factory: SpawnFn = first_factory;
+        let mut deadline = tokio::time::Instant::now() + coalesce_dur;
+
+        'coalesce: loop {
+            let now = tokio::time::Instant::now();
+            if now >= deadline {
+                break 'coalesce;
+            }
+            let remaining = deadline - now;
+            match tokio::time::timeout(remaining, rx.recv()).await {
+                Err(_) => break 'coalesce, // window closed
+                Ok(None) => return,        // channel closed → exit task
+                Ok(Some(SessionMsg::Stop)) => {
+                    batch.clear();
+                    break 'coalesce;
+                }
+                Ok(Some(SessionMsg::UserMessage { text, factory })) => {
+                    batch.push(text);
+                    latest_factory = factory;
+                    // Extend deadline on each new message (debounce).
+                    deadline = tokio::time::Instant::now() + coalesce_dur;
+                }
+            }
+        }
+
+        if batch.is_empty() {
+            continue; // Stop received during coalesce — restart
+        }
+
+        // ── Phase 3 + 4: run → drain pending → repeat ───────────────────────
+        let mut prompt = batch.join("\n\n");
+        let mut factory = latest_factory;
+
+        loop {
+            let (cancel_tx, cancel_rx) = watch::channel(false);
+            let llm_fut = factory(prompt, cancel_rx);
+            let mut llm_task = tokio::spawn(llm_fut);
+
+            let mut pending_texts: Vec<String> = vec![];
+            let mut pending_factory: Option<SpawnFn> = None;
+            let mut stopped = false;
+
+            // Wait for the LLM to finish, collecting messages that arrive during the run.
+            loop {
+                tokio::select! {
+                    _ = &mut llm_task => { break; }
+                    msg = rx.recv() => {
+                        match msg {
+                            None => {
+                                llm_task.abort();
+                                return;
+                            }
+                            Some(SessionMsg::Stop) => {
+                                let _ = cancel_tx.send(true);
+                                let _ = llm_task.await;
+                                pending_texts.clear();
+                                stopped = true;
+                                break;
+                            }
+                            Some(SessionMsg::UserMessage { text, factory: f }) => {
+                                pending_texts.push(text);
+                                let depth = pending_texts.len();
+                                slog!(
+                                    "[chat-dispatcher] coalescing message for session={}, queue_depth={}",
+                                    session_key,
+                                    depth,
+                                );
+                                pending_factory = Some(f);
+                            }
+                        }
+                    }
+                }
+            }
+
+            if stopped || pending_texts.is_empty() {
+                break; // back to Phase 1
+            }
+
+            // Fire the pending batch as the next run (no additional coalesce window).
+            prompt = pending_texts.join("\n\n");
+            factory = pending_factory.unwrap();
+        }
+    }
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+
+    fn make_factory(spawn_count: Arc<AtomicUsize>, run_ms: u64) -> SpawnFn {
+        Arc::new(move |_prompt: String, _cancel_rx: watch::Receiver<bool>| {
+            let count = Arc::clone(&spawn_count);
+            Box::pin(async move {
+                count.fetch_add(1, Ordering::SeqCst);
+                tokio::time::sleep(Duration::from_millis(run_ms)).await;
+            })
+        })
+    }
+
+    /// AC 6 regression: three messages arriving 200 ms / (long gap) / (after run)
+    /// apart on the same session must produce at most two spawns, never three
+    /// concurrent processes.
+    ///
+    /// Setup:
+    ///   coalesce_ms = 50 ms   (short window so test runs fast)
+    ///   LLM "run" = 150 ms
+    ///   msg1 @ t=0
+    ///   msg2 @ t=20 ms  — within coalesce window, merged with msg1 → 1 spawn
+    ///   msg3 @ t=300 ms — after run completes → 2nd spawn
+    ///
+    /// Expected: exactly 2 spawns, never 3.
+    #[tokio::test]
+    async fn three_messages_never_three_concurrent_spawns() {
+        let spawn_count = Arc::new(AtomicUsize::new(0));
+        let dispatcher = Arc::new(ChatDispatcher::new(50));
+        let session = "room1".to_string();
+
+        // msg1 at t=0
+        dispatcher.submit(
+            session.clone(),
+            "msg1".to_string(),
+            make_factory(Arc::clone(&spawn_count), 150),
+        );
+
+        // msg2 at t=20 ms — inside the 50 ms coalesce window
+        tokio::time::sleep(Duration::from_millis(20)).await;
+        dispatcher.submit(
+            session.clone(),
+            "msg2".to_string(),
+            make_factory(Arc::clone(&spawn_count), 150),
+        );
+
+        // msg3 at t=300 ms — after the coalesce window fires (t≈70 ms) and the
+        // 150 ms run completes (t≈220 ms), so msg3 starts a second coalesce cycle.
+        tokio::time::sleep(Duration::from_millis(280)).await;
+        dispatcher.submit(
+            session.clone(),
+            "msg3".to_string(),
+            make_factory(Arc::clone(&spawn_count), 150),
+        );
+
+        // Wait long enough for both runs to finish.
+        tokio::time::sleep(Duration::from_millis(500)).await;
+
+        let count = spawn_count.load(Ordering::SeqCst);
+        assert!(
+            (1..=2).contains(&count),
+            "expected 1 or 2 spawns (msgs 1+2 coalesced, msg3 separate), got {count}"
+        );
+    }
+
+    /// Messages that arrive while the LLM is running are not lost — they are
+    /// delivered as a single follow-up spawn once the first run completes.
+    #[tokio::test]
+    async fn pending_messages_dispatched_after_run_completes() {
+        let spawn_count = Arc::new(AtomicUsize::new(0));
+        let dispatcher = Arc::new(ChatDispatcher::new(50));
+        let session = "room2".to_string();
+
+        // First message — starts a 200 ms run.
+        dispatcher.submit(
+            session.clone(),
+            "first".to_string(),
+            make_factory(Arc::clone(&spawn_count), 200),
+        );
+
+        // Wait for coalesce window to fire, then send two more.
+        tokio::time::sleep(Duration::from_millis(100)).await;
+        dispatcher.submit(
+            session.clone(),
+            "second".to_string(),
+            make_factory(Arc::clone(&spawn_count), 50),
+        );
+        dispatcher.submit(
+            session.clone(),
+            "third".to_string(),
+            make_factory(Arc::clone(&spawn_count), 50),
+        );
+
+        // Wait long enough for both runs.
+        tokio::time::sleep(Duration::from_millis(600)).await;
+
+        let count = spawn_count.load(Ordering::SeqCst);
+        assert_eq!(
+            count, 2,
+            "first run + one pending-batch run = 2 total spawns"
+        );
+    }
+
+    /// Stop cancels the running LLM and discards pending messages.
+    #[tokio::test]
+    async fn stop_cancels_run_and_clears_pending() {
+        let spawn_count = Arc::new(AtomicUsize::new(0));
+        let dispatcher = Arc::new(ChatDispatcher::new(30));
+        let session = "room3".to_string();
+
+        // Start a long run.
+        dispatcher.submit(
+            session.clone(),
+            "long-running".to_string(),
+            make_factory(Arc::clone(&spawn_count), 500),
+        );
+
+        // Wait for coalesce window to fire.
+        tokio::time::sleep(Duration::from_millis(80)).await;
+
+        // Queue a pending message.
+        dispatcher.submit(
+            session.clone(),
+            "pending".to_string(),
+            make_factory(Arc::clone(&spawn_count), 50),
+        );
+
+        // Stop immediately.
+        dispatcher.stop(&session);
+
+        // Wait longer than the run would have taken if not stopped.
+        tokio::time::sleep(Duration::from_millis(700)).await;
+
+        let count = spawn_count.load(Ordering::SeqCst);
+        // The first run was started before stop (spawn_count=1).
+        // The pending message should NOT have produced a second spawn.
+        assert!(
+            count <= 1,
+            "stop should discard pending; got {count} spawns"
+        );
+    }
+}
@@ -6,6 +6,8 @@

 /// Bot command registry and dispatch — parses and routes incoming chat messages.
 pub mod commands;
+/// Protocol-agnostic chat dispatcher — coalesce window and per-session serial lock.
+pub mod dispatcher;
 /// Chat history utilities — loading and serialising conversation history.
 pub mod history;
 pub(crate) mod lookup;
@@ -268,6 +268,7 @@ mod tests {
            pending_perm_replies: Arc::new(TokioMutex::new(HashMap::new())),
            permission_timeout_secs: 120,
            status: Arc::new(crate::service::status::StatusBroadcaster::new()),
+            chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
        })
    }

@@ -21,6 +21,7 @@ pub(in crate::chat::transport::matrix::bot) async fn handle_message(
    ctx: BotContext,
    sender: String,
    user_message: String,
+    mut cancel_rx: watch::Receiver<bool>,
 ) {
    // Look up the room's existing Claude Code session ID (if any) so we can
    // resume the conversation with structured API messages instead of
@@ -68,9 +69,6 @@ pub(in crate::chat::transport::matrix::bot) async fn handle_message(
    );

    let provider = ClaudeCodeProvider::new();
-    let (cancel_tx, mut cancel_rx) = watch::channel(false);
-    // Keep the sender alive for the duration of the call.
-    let _cancel_tx = cancel_tx;

    // Channel for sending complete paragraphs to the Matrix posting task.
    let (msg_tx, mut msg_rx) = tokio::sync::mpsc::unbounded_channel::<String>();
@@ -608,9 +608,56 @@ pub(in crate::chat::transport::matrix::bot) async fn on_room_message(
        return;
    }

-    // Spawn a separate task so the Matrix sync loop is not blocked while we
-    // wait for the LLM response (which can take several seconds).
-    tokio::spawn(async move {
-        handle_message(room_id_str, incoming_room_id, ctx, sender, user_message).await;
-    });
+    // "stop" — cancel the running LLM turn for this session and clear pending queue.
+    {
+        let stripped = crate::chat::util::strip_bot_mention(
+            &user_message,
+            &ctx.services.bot_name,
+            ctx.matrix_user_id.as_str(),
+        )
+        .trim()
+        .to_ascii_lowercase();
+        if stripped == "stop" {
+            slog!("[matrix-bot] stop command from {sender} for session {room_id_str}");
+            ctx.services.chat_dispatcher.stop(&room_id_str);
+            let msg = "Stopped.";
+            let html = markdown_to_html(msg);
+            if let Ok(msg_id) = ctx.transport.send_message(&room_id_str, msg, &html).await
+                && let Ok(event_id) = msg_id.parse()
+            {
+                ctx.bot_sent_event_ids.lock().await.insert(event_id);
+            }
+            return;
+        }
+    }
+
+    // Hand the message to the protocol-agnostic dispatcher instead of spawning
+    // directly.  The dispatcher applies a coalesce window and a per-session
+    // serial lock, preventing duplicate concurrent Timmy spawns.
+    let ctx_for_factory = ctx.clone();
+    let factory: crate::chat::dispatcher::SpawnFn = {
+        let room_id_str2 = room_id_str.clone();
+        std::sync::Arc::new(
+            move |coalesced: String, cancel_rx: tokio::sync::watch::Receiver<bool>| {
+                let room_id_str = room_id_str2.clone();
+                let incoming_room_id = incoming_room_id.clone();
+                let ctx = ctx_for_factory.clone();
+                let sender = sender.clone();
+                Box::pin(async move {
+                    handle_message(
+                        room_id_str,
+                        incoming_room_id,
+                        ctx,
+                        sender,
+                        coalesced,
+                        cancel_rx,
+                    )
+                    .await;
+                })
+            },
+        )
+    };
+    ctx.services
+        .chat_dispatcher
+        .submit(room_id_str, user_message, factory);
 }
@@ -150,6 +150,7 @@ mod tests {
            pending_perm_replies: Arc::new(TokioMutex::new(HashMap::new())),
            permission_timeout_secs: 120,
            status: Arc::new(crate::service::status::StatusBroadcaster::new()),
+            chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
        });
        (services, perm_tx)
    }
@@ -17,6 +17,11 @@ pub(super) fn default_aggregated_notifications_enabled() -> bool {
    true
 }

+/// Default coalesce window for the chat dispatcher (1 500 ms).
+pub(super) fn default_coalesce_window_ms() -> u64 {
+    1_500
+}
+
 pub(super) fn default_transport() -> String {
    "matrix".to_string()
 }
@@ -187,4 +192,14 @@ pub struct BotConfig {
    /// Defaults to `true`.
    #[serde(default = "default_aggregated_notifications_enabled")]
    pub aggregated_notifications_enabled: bool,
+
+    /// Duration in milliseconds of the chat dispatcher's coalesce window.
+    ///
+    /// Messages for the same session arriving within this window are
+    /// concatenated into a single `claude -p` call.  The window is a
+    /// debounce: each new message extends the deadline by this duration.
+    ///
+    /// Defaults to 1 500 ms (1.5 s).
+    #[serde(default = "default_coalesce_window_ms")]
+    pub coalesce_window_ms: u64,
 }
@@ -310,6 +310,7 @@ mod tests {
            perm_rx: Arc::new(tokio::sync::Mutex::new(perm_rx)),
            pending_perm_replies: Arc::new(tokio::sync::Mutex::new(Default::default())),
            permission_timeout_secs: 120,
+            chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
        });
        Arc::new(WhatsAppWebhookContext {
            services,
@@ -54,10 +54,10 @@ pub use types::{
 };
 pub use write::{
    bump_retry_count, migrate_legacy_stage_strings, migrate_merge_job, migrate_names_from_slugs,
-    migrate_node_claims_to_agent_claims, migrate_story_ids_to_numeric, name_from_story_id,
-    purge_done_stage_merge_jobs, set_agent, set_depends_on, set_epic, set_item_type, set_name,
-    set_origin, set_plan_state, set_qa_mode, set_resume_to, set_resume_to_raw, set_retry_count,
-    write_item,
+    migrate_node_claims_to_agent_claims, migrate_story_ids_to_numeric,
+    migrate_zombie_pipeline_rows, name_from_story_id, purge_done_stage_merge_jobs, set_agent,
+    set_depends_on, set_epic, set_item_type, set_name, set_origin, set_plan_state, set_qa_mode,
+    set_resume_to, set_resume_to_raw, set_retry_count, write_item,
 };

 #[cfg(test)]
@@ -21,21 +21,26 @@ use crate::pipeline_state::{AgentClaim, Stage, stage_dir_name};
 ///
 /// Returns `true` if the item was found and the op was applied, `false` otherwise.
 pub fn set_depends_on(story_id: &str, deps: &[u32]) -> bool {
-    let Some(state_mutex) = get_crdt() else {
-        return false;
-    };
-    let Ok(mut state) = state_mutex.lock() else {
-        return false;
-    };
-    let Some(&idx) = state.index.get(story_id) else {
-        return false;
-    };
-    let value = if deps.is_empty() {
-        String::new()
-    } else {
-        serde_json::to_string(deps).unwrap_or_default()
-    };
-    apply_and_persist(&mut state, |s| s.crdt.doc.items[idx].depends_on.set(value));
+    {
+        let Some(state_mutex) = get_crdt() else {
+            return false;
+        };
+        let Ok(mut state) = state_mutex.lock() else {
+            return false;
+        };
+        let Some(&idx) = state.index.get(story_id) else {
+            return false;
+        };
+        let value = if deps.is_empty() {
+            String::new()
+        } else {
+            serde_json::to_string(deps).unwrap_or_default()
+        };
+        apply_and_persist(&mut state, |s| s.crdt.doc.items[idx].depends_on.set(value));
+    }
+    // Drop the CRDT lock before calling sync: read_item acquires the same
+    // mutex and would deadlock if the lock were still held here.
+    crate::db::ops::sync_item_depends_on(story_id);
    true
 }

@@ -155,6 +160,9 @@ pub fn set_name(story_id: &str, name: Option<&str>) -> bool {
    apply_and_persist(&mut state, |s| {
        s.crdt.doc.items[idx].name.set(value.clone())
    });
+    // Drop the lock before the shadow write so `read_item` can acquire it.
+    drop(state);
+    crate::db::sync_item_name(story_id);
    true
 }

@@ -175,16 +183,21 @@ pub fn set_agent(story_id: &str, agent: Option<crate::config::AgentName>) -> boo
    let Some(state_mutex) = get_crdt() else {
        return false;
    };
-    let Ok(mut state) = state_mutex.lock() else {
-        return false;
-    };
-    let Some(&idx) = state.index.get(story_id) else {
-        return false;
-    };
-    let value = agent.map(|a| a.as_str().to_string()).unwrap_or_default();
-    apply_and_persist(&mut state, |s| {
-        s.crdt.doc.items[idx].agent.set(value.clone())
-    });
+    {
+        let Ok(mut state) = state_mutex.lock() else {
+            return false;
+        };
+        let Some(&idx) = state.index.get(story_id) else {
+            return false;
+        };
+        let value = agent.map(|a| a.as_str().to_string()).unwrap_or_default();
+        apply_and_persist(&mut state, |s| {
+            s.crdt.doc.items[idx].agent.set(value.clone())
+        });
+    }
+    // Sync the updated agent to the SQLite shadow table.  Must be called after
+    // releasing the CRDT mutex so read_item can re-acquire it without deadlock.
+    crate::db::ops::sync_item_agent(story_id);
    true
 }

@@ -556,6 +569,24 @@ pub fn set_retry_count(story_id: &str, count: i64) {
        _ => return,
    };
    write_item(story_id, &new_stage, None, None, None, None);
+    if let Some(db) = crate::db::shadow_write::PIPELINE_DB.get() {
+        let stage = stage_dir_name(&new_stage).to_string();
+        let name = Some(item.name().to_string());
+        let agent = item.agent().map(|a| a.to_string());
+        let depends_on = (!item.depends_on().is_empty())
+            .then(|| serde_json::to_string(item.depends_on()).ok())
+            .flatten();
+        let msg = crate::db::shadow_write::PipelineWriteMsg {
+            story_id: story_id.to_string(),
+            stage,
+            name,
+            agent,
+            retry_count: Some(count.max(0)),
+            depends_on,
+            content: None,
+        };
+        let _ = db.tx.send(msg);
+    }
 }

 /// Increment `retries` by 1 and return the new value.
@@ -605,5 +636,23 @@ pub fn bump_retry_count(story_id: &str) -> i64 {
        _ => return 0,
    };
    write_item(story_id, &new_stage, None, None, None, None);
+    if let Some(db) = crate::db::shadow_write::PIPELINE_DB.get() {
+        let stage = stage_dir_name(&new_stage).to_string();
+        let name = Some(item.name().to_string());
+        let agent = item.agent().map(|a| a.to_string());
+        let depends_on = (!item.depends_on().is_empty())
+            .then(|| serde_json::to_string(item.depends_on()).ok())
+            .flatten();
+        let msg = crate::db::shadow_write::PipelineWriteMsg {
+            story_id: story_id.to_string(),
+            stage,
+            name,
+            agent,
+            retry_count: Some(new_retries as i64),
+            depends_on,
+            content: None,
+        };
+        let _ = db.tx.send(msg);
+    }
    new_retries as i64
 }
@@ -705,6 +705,59 @@ pub fn purge_done_stage_merge_jobs() {
    slog!("[crdt] Purged {count} stale MergeJob entries for terminal-stage stories");
 }

+/// Delete `pipeline_items` rows that correspond to CRDT-tombstoned stories.
+///
+/// Pre-1094 code deleted pipeline_items via a fire-and-forget channel that
+/// could be lost on an abrupt restart, leaving rows with non-terminal stage
+/// values for stories that no longer exist in the CRDT.  This migration
+/// removes those zombie rows on startup.
+///
+/// Idempotent: rows already absent are unaffected; running twice produces the
+/// same result.
+pub async fn migrate_zombie_pipeline_rows() {
+    let pool = match crate::db::get_shared_pool() {
+        Some(p) => p,
+        None => return,
+    };
+    let tombstone_ids = crate::crdt_state::tombstoned_ids();
+    sweep_zombie_rows(pool, &tombstone_ids).await;
+}
+
+/// Inner sweep used by [`migrate_zombie_pipeline_rows`] and its tests.
+///
+/// Deletes every `pipeline_items` row in `ids` whose stage is not already a
+/// terminal value.  Returns the number of rows deleted.
+#[cfg_attr(test, allow(dead_code))]
+pub(crate) async fn sweep_zombie_rows(pool: &sqlx::SqlitePool, ids: &[String]) -> u32 {
+    if ids.is_empty() {
+        return 0;
+    }
+    let mut cleaned = 0u32;
+    for story_id in ids {
+        match sqlx::query(
+            "DELETE FROM pipeline_items WHERE id = ?1 AND stage NOT IN \
+             ('done','archived','abandoned','superseded','rejected')",
+        )
+        .bind(story_id)
+        .execute(pool)
+        .await
+        {
+            Ok(r) if r.rows_affected() > 0 => cleaned += 1,
+            Ok(_) => {}
+            Err(e) => {
+                slog!(
+                    "[crdt] migrate_zombie_pipeline_rows: failed to delete '{}': {e}",
+                    story_id
+                );
+            }
+        }
+    }
+    if cleaned > 0 {
+        slog!("[crdt] Swept {cleaned} zombie pipeline_items rows for tombstoned stories");
+    }
+    cleaned
+}
+
 #[cfg(test)]
 mod merge_job_migration_tests {
    use super::super::super::state::init_for_test;
@@ -909,3 +962,100 @@ mod merge_job_migration_tests {
        migrate_merge_job(std::path::Path::new("/nonexistent/pipeline.db"));
    }
 }
+
+#[cfg(test)]
+mod zombie_row_migration_tests {
+    use super::super::super::state::init_for_test;
+    use super::*;
+    use sqlx::Row as _;
+
+    async fn make_pool() -> sqlx::SqlitePool {
+        let options = sqlx::sqlite::SqliteConnectOptions::new()
+            .filename(":memory:")
+            .create_if_missing(true);
+        let pool = sqlx::pool::PoolOptions::new()
+            .max_connections(1)
+            .connect_with(options)
+            .await
+            .unwrap();
+        sqlx::migrate!("./migrations").run(&pool).await.unwrap();
+        pool
+    }
+
+    async fn insert_row(pool: &sqlx::SqlitePool, story_id: &str, stage: &str) {
+        let now = chrono::Utc::now().to_rfc3339();
+        sqlx::query(
+            "INSERT INTO pipeline_items \
+             (id, name, stage, agent, retry_count, depends_on, content, created_at, updated_at) \
+             VALUES (?1, ?2, ?3, NULL, 0, NULL, NULL, ?4, ?4)",
+        )
+        .bind(story_id)
+        .bind(story_id)
+        .bind(stage)
+        .bind(&now)
+        .execute(pool)
+        .await
+        .unwrap();
+    }
+
+    async fn row_stage(pool: &sqlx::SqlitePool, story_id: &str) -> Option<String> {
+        sqlx::query("SELECT stage FROM pipeline_items WHERE id = ?1")
+            .bind(story_id)
+            .fetch_optional(pool)
+            .await
+            .unwrap()
+            .map(|r| r.get(0))
+    }
+
+    /// Bug 1094 regression: delete a story in `coding` stage, assert the
+    /// `pipeline_items` row is gone; then re-run the sweep and confirm no
+    /// further changes (idempotent).
+    #[tokio::test]
+    async fn sweep_removes_zombie_coding_row_and_is_idempotent() {
+        init_for_test();
+        let pool = make_pool().await;
+        let story_id = "1094_zombie_regression";
+
+        // Seed: insert a pipeline_items row in the "coding" stage.
+        insert_row(&pool, story_id, "coding").await;
+        assert_eq!(row_stage(&pool, story_id).await.as_deref(), Some("coding"));
+
+        // Tombstone the story in the CRDT (simulate evict_item outcome).
+        crate::crdt_state::write_item_str(
+            story_id,
+            "coding",
+            Some("Zombie regression story"),
+            None,
+            None,
+            None,
+        );
+        crate::crdt_state::evict_item(story_id).ok();
+
+        // Run the sweep — row must be deleted.
+        let deleted = sweep_zombie_rows(&pool, &[story_id.to_string()]).await;
+        assert_eq!(deleted, 1, "expected one zombie row to be cleaned");
+        assert!(
+            row_stage(&pool, story_id).await.is_none(),
+            "pipeline_items row must be gone after sweep"
+        );
+
+        // Re-run is a no-op (idempotent).
+        let second = sweep_zombie_rows(&pool, &[story_id.to_string()]).await;
+        assert_eq!(second, 0, "second sweep must be a no-op");
+    }
+
+    /// Rows already in a terminal stage must be left alone.
+    #[tokio::test]
+    async fn sweep_skips_terminal_stage_rows() {
+        let pool = make_pool().await;
+        let story_id = "1094_terminal_skip";
+        insert_row(&pool, story_id, "done").await;
+
+        let deleted = sweep_zombie_rows(&pool, &[story_id.to_string()]).await;
+        assert_eq!(deleted, 0, "terminal-stage row must not be deleted");
+        assert!(
+            row_stage(&pool, story_id).await.is_some(),
+            "terminal-stage row must survive sweep"
+        );
+    }
+}
@@ -18,6 +18,6 @@ pub use item::{
 pub use item::write_item_str;
 pub use migrations::{
    migrate_legacy_stage_strings, migrate_merge_job, migrate_names_from_slugs,
-    migrate_node_claims_to_agent_claims, migrate_story_ids_to_numeric, name_from_story_id,
-    purge_done_stage_merge_jobs,
+    migrate_node_claims_to_agent_claims, migrate_story_ids_to_numeric,
+    migrate_zombie_pipeline_rows, name_from_story_id, purge_done_stage_merge_jobs,
 };
@@ -60,6 +60,17 @@ pub enum ContentKey<'a> {
    /// completion.  Read by `get_merge_status` to surface gate output for the
    /// "completed" state without a separate MergeJob CRDT register (story 1036).
    MergeReport(&'a str),
+    /// Flag written by spawn.rs when a coder session exits with a non-zero exit
+    /// code (API error, network failure, or Claude-API-level budget exhaustion).
+    /// Prevents the stuck-respawn counter from incrementing for forced exits —
+    /// only self-exits with no file or read changes count toward the cap.
+    /// Consumed (read + deleted) by the commit-recovery path in pipeline advance.
+    CommitRecoveryForcedExit(&'a str),
+    /// Cumulative set of files read across all commit-recovery sessions for a
+    /// story, stored as a newline-separated sorted list.  Used to detect whether
+    /// the agent made read-exploration progress even when the worktree diff did
+    /// not grow (story 1089, AC2).  Cleared when a commit lands or the story blocks.
+    CommitRecoveryReadSet(&'a str),
 }

 impl<'a> ContentKey<'a> {
@@ -85,6 +96,10 @@ impl<'a> ContentKey<'a> {
            ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"),
            ContentKey::MergeSuccess(id) => format!("{id}:merge_success"),
            ContentKey::MergeReport(id) => format!("{id}:merge_report"),
+            ContentKey::CommitRecoveryForcedExit(id) => {
+                format!("{id}:commit_recovery_forced_exit")
+            }
+            ContentKey::CommitRecoveryReadSet(id) => format!("{id}:commit_recovery_read_set"),
        }
    }
 }
@@ -28,7 +28,10 @@ pub mod recover;
 pub mod shadow_write;

 pub use content_store::{ContentKey, all_content_ids, delete_content, read_content, write_content};
-pub use ops::{ItemMeta, delete_item, move_item_stage, next_item_number, write_item_with_content};
+pub use ops::{
+    ItemMeta, delete_item, delete_item_sync, move_item_stage, next_item_number, sync_item_name,
+    write_item_with_content,
+};
 pub use shadow_write::{check_schema_drift, get_shared_pool, init};

 #[cfg(test)]
@@ -589,6 +592,144 @@ mod tests {
        );
    }

+    /// `shadow_write::init` spawns its background task on the calling runtime,
+    /// which under `#[tokio::test]` is per-test and dies when the test ends.
+    /// Park the init on a leaked multi-thread runtime so the bg task lives for
+    /// the whole test process; mirrors `db::ops::tests::ensure_shadow_db`.
+    #[cfg(test)]
+    static SHADOW_RT: std::sync::OnceLock<tokio::runtime::Runtime> = std::sync::OnceLock::new();
+
+    #[cfg(test)]
+    async fn ensure_shadow_db() {
+        static INIT: std::sync::OnceLock<()> = std::sync::OnceLock::new();
+        if INIT.get().is_some() {
+            return;
+        }
+        let rt = SHADOW_RT.get_or_init(|| {
+            tokio::runtime::Builder::new_multi_thread()
+                .worker_threads(1)
+                .enable_all()
+                .build()
+                .expect("shadow rt")
+        });
+        rt.spawn(async {
+            static INNER: std::sync::OnceLock<()> = std::sync::OnceLock::new();
+            if INNER.get().is_some() {
+                return;
+            }
+            let tmp = tempfile::tempdir().expect("tmp");
+            let db_path = tmp.path().join("pipeline.db");
+            std::mem::forget(tmp);
+            shadow_write::init(&db_path).await.expect("shadow init");
+            let _ = INNER.set(());
+        })
+        .await
+        .expect("shadow init task");
+        let _ = INIT.set(());
+    }
+
+    /// Regression for story 1095: `set_name` must propagate the new name to the
+    /// SQLite shadow table via `sync_item_name`.  Before the fix, the CRDT
+    /// register was updated but `pipeline_items.name` stayed stale.
+    #[tokio::test]
+    async fn set_name_updates_shadow_name_column() {
+        crate::crdt_state::init_for_test();
+        ensure_content_store();
+        ensure_shadow_db().await;
+
+        let story_id = "9095_story_set_name_shadow";
+        write_item_with_content(
+            story_id,
+            "1_backlog",
+            "---\nname: Original Name\n---\n",
+            ItemMeta::named("Original Name"),
+        );
+
+        // Wait for the initial insert to land.
+        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
+
+        // Rename via the CRDT setter — now also triggers sync_item_name.
+        crate::crdt_state::set_name(story_id, Some("Updated Name"));
+
+        // Wait for the background write task to flush.
+        tokio::time::sleep(std::time::Duration::from_millis(100)).await;
+
+        // Open a fresh pool on this test's runtime — sqlx pools are not safe
+        // to share across runtimes, so we can't reuse `get_shared_pool()`
+        // (which was created on the leaked shadow-write runtime).
+        let path = shadow_write::SHADOW_DB_PATH
+            .get()
+            .expect("SHADOW_DB_PATH set by init");
+        let opts = sqlx::sqlite::SqliteConnectOptions::new()
+            .filename(path)
+            .create_if_missing(false);
+        let pool = sqlx::SqlitePool::connect_with(opts).await.unwrap();
+        let row: (Option<String>,) =
+            sqlx::query_as("SELECT name FROM pipeline_items WHERE id = ?1")
+                .bind(story_id)
+                .fetch_one(&pool)
+                .await
+                .unwrap();
+
+        assert_eq!(
+            row.0.as_deref(),
+            Some("Updated Name"),
+            "set_name must propagate the new name to the shadow table"
+        );
+    }
+
+    /// Bug 1098: `bump_retry_count` must mirror the new value to the SQLite
+    /// shadow table, not only to the CRDT register.
+    ///
+    /// Before the fix, calling `bump_retry_count` updated the CRDT but left
+    /// `pipeline_items.retry_count` stale.
+    #[tokio::test]
+    async fn bump_retry_count_updates_shadow_table() {
+        crate::crdt_state::init_for_test();
+        ensure_content_store();
+        ensure_shadow_db().await;
+
+        let story_id = "9899_story_retry_shadow_1098";
+
+        // Insert the story into both CRDT and the shadow table.
+        write_item_with_content(
+            story_id,
+            "2_current",
+            "# Retry shadow test\n",
+            ItemMeta::named("Retry Shadow Test"),
+        );
+
+        // Let the background write task process the initial insert.
+        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
+
+        // Three bumps → retry_count must reach 3 in SQLite.
+        crate::crdt_state::bump_retry_count(story_id);
+        crate::crdt_state::bump_retry_count(story_id);
+        crate::crdt_state::bump_retry_count(story_id);
+
+        // Let the background write task process all three updates.
+        tokio::time::sleep(std::time::Duration::from_millis(100)).await;
+
+        let path = shadow_write::SHADOW_DB_PATH
+            .get()
+            .expect("SHADOW_DB_PATH set by init");
+        let opts = sqlx::sqlite::SqliteConnectOptions::new()
+            .filename(path)
+            .create_if_missing(false);
+        let pool = sqlx::SqlitePool::connect_with(opts).await.unwrap();
+        let (count,): (i64,) =
+            sqlx::query_as("SELECT retry_count FROM pipeline_items WHERE id = ?1")
+                .bind(story_id)
+                .fetch_one(&pool)
+                .await
+                .unwrap();
+
+        assert_eq!(
+            count, 3,
+            "retry_count must be 3 after three bump_retry_count calls"
+        );
+    }
+
    /// Story 1087, AC2: the split-stage migration projects every supported
    /// wire-form `stage` string into the canonical `(pipeline, status)` pair.
    /// The fixture covers each Stage variant (and the legacy numeric-prefix
@@ -176,6 +176,43 @@ pub fn move_item_stage(
    }
 }

+/// Shadow-write the updated agent field for an existing pipeline item.
+///
+/// Called by [`crate::crdt_state::set_agent`] after the CRDT register is updated
+/// so `pipeline_items.agent` stays in sync.  Reads the full current metadata from
+/// the CRDT (stage, name, depends_on, retry_count) to avoid overwriting other
+/// columns with stale values — only the `agent` column carries the new data.
+pub fn sync_item_agent(story_id: &str) {
+    let Some(db) = PIPELINE_DB.get() else {
+        return;
+    };
+    let Some(view) = crate::crdt_state::read_item(story_id) else {
+        return;
+    };
+    let stage = view.stage().dir_name().to_string();
+    let name = Some(view.name().to_string());
+    let agent = view.agent().map(|a| a.as_str().to_string());
+    let depends_on = {
+        let d = view.depends_on();
+        if d.is_empty() {
+            None
+        } else {
+            serde_json::to_string(d).ok()
+        }
+    };
+    let retry_count = Some(i64::from(view.retry_count()));
+    let msg = PipelineWriteMsg {
+        story_id: story_id.to_string(),
+        stage,
+        name,
+        agent,
+        retry_count,
+        depends_on,
+        content: None,
+    };
+    let _ = db.tx.send(msg);
+}
+
 /// Delete a story from the shadow table (fire-and-forget).
 pub fn delete_item(story_id: &str) {
    delete_content(ContentKey::Story(story_id));
@@ -198,6 +235,111 @@ pub fn delete_item(story_id: &str) {
    }
 }

+/// Delete a story from the shadow table, awaiting the SQLite write.
+///
+/// Unlike [`delete_item`], this function issues a direct `DELETE FROM
+/// pipeline_items` via the shared pool and awaits the result — so the row
+/// is gone before this function returns.  Use this from async call sites
+/// where durability of the deletion matters (e.g. story deletion, startup
+/// migration).  Falls back to the fire-and-forget channel when the shared
+/// pool is not yet initialised.
+pub async fn delete_item_sync(story_id: &str) {
+    delete_content(ContentKey::Story(story_id));
+
+    if let Some(pool) = super::shadow_write::get_shared_pool() {
+        if let Err(e) = sqlx::query("DELETE FROM pipeline_items WHERE id = ?1")
+            .bind(story_id)
+            .execute(pool)
+            .await
+        {
+            crate::slog_warn!(
+                "[db] Synchronous delete from pipeline_items failed for '{}': {e}",
+                story_id
+            );
+        }
+    } else if let Some(db) = PIPELINE_DB.get() {
+        let msg = PipelineWriteMsg {
+            story_id: story_id.to_string(),
+            stage: "deleted".to_string(),
+            name: None,
+            agent: None,
+            retry_count: None,
+            depends_on: None,
+            content: None,
+        };
+        let _ = db.tx.send(msg);
+    }
+}
+
+/// Sync the shadow table's `name` column after a CRDT name-register write.
+///
+/// Reads the current item from the CRDT (which already holds the new name after
+/// `apply_and_persist`) and sends a `PipelineWriteMsg` so the SQLite mirror
+/// stays in sync.  All other columns (stage, agent, retry_count, depends_on)
+/// are preserved from the live CRDT view; `content` is left as `None` so the
+/// UPSERT's `COALESCE` keeps the existing value.
+///
+/// No-ops if the DB is not initialised or the item is not in the CRDT.
+pub fn sync_item_name(story_id: &str) {
+    let Some(db) = PIPELINE_DB.get() else { return };
+    let Some(view) = crate::crdt_state::read_item(story_id) else {
+        return;
+    };
+    let depends_on = {
+        let d = view.depends_on();
+        if d.is_empty() {
+            None
+        } else {
+            serde_json::to_string(d).ok()
+        }
+    };
+    let msg = PipelineWriteMsg {
+        story_id: story_id.to_string(),
+        stage: view.stage().dir_name().to_string(),
+        name: Some(view.name().to_string()),
+        agent: view.agent().map(|a| a.to_string()),
+        retry_count: Some(view.retry_count() as i64),
+        depends_on,
+        content: None,
+    };
+    let _ = db.tx.send(msg);
+}
+
+/// Sync the `depends_on` field of a pipeline item from the CRDT to the shadow table.
+///
+/// Called after [`crate::crdt_state::set_depends_on`] updates the CRDT register so
+/// that the SQLite shadow table stays in lock-step.  Reads the full current view from
+/// the CRDT (stage, name, agent, retry_count, depends_on) and sends a
+/// [`PipelineWriteMsg`] over [`PIPELINE_DB`]`.tx`.  Pattern mirrors
+/// [`move_item_stage`] lines 157-176.  No-op when the CRDT is uninitialised or the
+/// story_id is not found.
+pub fn sync_item_depends_on(story_id: &str) {
+    let Some(db) = PIPELINE_DB.get() else {
+        return;
+    };
+    let Some(view) = crate::crdt_state::read_item(story_id) else {
+        return;
+    };
+    let depends_on = {
+        let d = view.depends_on();
+        if d.is_empty() {
+            None
+        } else {
+            serde_json::to_string(d).ok()
+        }
+    };
+    let msg = PipelineWriteMsg {
+        story_id: story_id.to_string(),
+        stage: view.stage().dir_name().to_string(),
+        name: Some(view.name().to_string()),
+        agent: view.agent().map(|a| a.to_string()),
+        retry_count: Some(view.retry_count() as i64),
+        depends_on,
+        content: None,
+    };
+    let _ = db.tx.send(msg);
+}
+
 /// Get the next available item number by scanning the CRDT state, the
 /// in-memory content store, AND the tombstone set for the highest existing
 /// number.
@@ -248,3 +390,88 @@ pub fn next_item_number() -> u32 {

    max_num + 1
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::db::shadow_write;
+
+    /// `shadow_write::init` spawns its background task on the calling runtime.
+    /// Under `#[tokio::test]` that runtime is per-test and drops when the test
+    /// ends, killing the task.  This OnceLock holds a multi-thread runtime that
+    /// persists for the lifetime of the test binary so the write loop stays alive
+    /// across all tests that share `PIPELINE_DB`.
+    static SHADOW_RT: std::sync::OnceLock<tokio::runtime::Runtime> = std::sync::OnceLock::new();
+
+    async fn ensure_shadow_db() {
+        static INIT: std::sync::OnceLock<()> = std::sync::OnceLock::new();
+        if INIT.get().is_some() {
+            return;
+        }
+        let rt = SHADOW_RT.get_or_init(|| {
+            tokio::runtime::Builder::new_multi_thread()
+                .worker_threads(1)
+                .enable_all()
+                .build()
+                .expect("shadow rt")
+        });
+        rt.spawn(async {
+            static INNER: std::sync::OnceLock<()> = std::sync::OnceLock::new();
+            if INNER.get().is_some() {
+                return;
+            }
+            let tmp = tempfile::tempdir().expect("tmp");
+            let db_path = tmp.path().join("pipeline.db");
+            std::mem::forget(tmp);
+            shadow_write::init(&db_path).await.expect("shadow init");
+            let _ = INNER.set(());
+        })
+        .await
+        .expect("shadow init task");
+        let _ = INIT.set(());
+    }
+
+    /// Regression test for story 1097: `set_depends_on` must sync the shadow
+    /// table.  Before the fix, the CRDT register was updated but the
+    /// `pipeline_items.depends_on` column was never written.
+    #[tokio::test]
+    async fn set_depends_on_syncs_shadow_table() {
+        crate::crdt_state::init_for_test();
+        ensure_content_store();
+        ensure_shadow_db().await;
+
+        let story_id = "1097_story_depends_on_shadow_drift";
+
+        // Insert the story so it exists in both the CRDT and the shadow table.
+        write_item_with_content(
+            story_id,
+            "backlog",
+            "---\nname: Depends On Shadow Drift\n---\n",
+            ItemMeta::named("Depends On Shadow Drift"),
+        );
+
+        // Let the initial shadow write land.
+        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
+
+        // This is the write under test: it must update the shadow table.
+        let ok = crate::crdt_state::set_depends_on(story_id, &[1, 2]);
+        assert!(ok, "set_depends_on must return true for an existing item");
+
+        // Let the shadow write land.
+        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
+
+        let pool = shadow_write::get_shared_pool().expect("pool must be initialised");
+        let row: (Option<String>,) =
+            sqlx::query_as("SELECT depends_on FROM pipeline_items WHERE id = ?1")
+                .bind(story_id)
+                .fetch_one(pool)
+                .await
+                .expect("row must exist in shadow table");
+
+        assert_eq!(
+            row.0.as_deref(),
+            Some("[1,2]"),
+            "pipeline_items.depends_on must reflect the set_depends_on call"
+        );
+    }
+}
@@ -41,23 +41,30 @@ pub fn get_shared_pool() -> Option<&'static SqlitePool> {
 }

 /// A pending shadow write for one pipeline item.
-pub(super) struct PipelineWriteMsg {
-    pub(super) story_id: String,
-    pub(super) stage: String,
-    pub(super) name: Option<String>,
-    pub(super) agent: Option<String>,
-    pub(super) retry_count: Option<i64>,
-    pub(super) depends_on: Option<String>,
-    pub(super) content: Option<String>,
+pub(crate) struct PipelineWriteMsg {
+    pub(crate) story_id: String,
+    pub(crate) stage: String,
+    pub(crate) name: Option<String>,
+    pub(crate) agent: Option<String>,
+    pub(crate) retry_count: Option<i64>,
+    pub(crate) depends_on: Option<String>,
+    pub(crate) content: Option<String>,
 }

 /// Handle to the background shadow-write task.
 pub struct PipelineDb {
-    pub(super) tx: mpsc::UnboundedSender<PipelineWriteMsg>,
+    pub(crate) tx: mpsc::UnboundedSender<PipelineWriteMsg>,
 }

 /// Process-global handle to the background shadow-write task, set once during `init`.
-pub(super) static PIPELINE_DB: OnceLock<PipelineDb> = OnceLock::new();
+pub(crate) static PIPELINE_DB: OnceLock<PipelineDb> = OnceLock::new();
+
+/// Path of the SQLite file opened by [`init`], set once by the first successful caller.
+///
+/// Tests that need to open their own pool (because sqlx pools are not safe to
+/// share across Tokio runtimes) read this to find the right file regardless of
+/// which test won the `PIPELINE_DB` init race.
+pub(crate) static SHADOW_DB_PATH: OnceLock<std::path::PathBuf> = OnceLock::new();

 /// Initialise the pipeline database.
 ///
@@ -68,6 +75,10 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
    if PIPELINE_DB.get().is_some() {
        return Ok(());
    }
+    // Record the path before doing any real work so tests can always find the
+    // correct file even if two callers race — the OnceLock ensures only one
+    // path wins, and whichever wins will also win the PIPELINE_DB set below.
+    let _ = SHADOW_DB_PATH.set(db_path.to_path_buf());

    // Story 1087: before running the migration that splits `stage` into
    // (`pipeline`, `status`), take a timestamped side-car copy of the live DB
@@ -118,6 +118,7 @@ impl AppContext {
            )),
            permission_timeout_secs: 120,
            status: agents.status_broadcaster(),
+            chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
        });
        Self {
            state: Arc::new(state),
@@ -26,6 +26,10 @@ pub(crate) fn tool_create_bug(args: &Value, ctx: &AppContext) -> Result<String,
    let acs = req.acceptance_criteria_strings();
    let depends_on = req.depends_on_ids();

+    // Bug 1102: resolve and validate origin BEFORE creating the bug file so a
+    // missing-attribution call leaves no half-state behind.
+    let origin = super::build_origin(args)?;
+
    let root = ctx.state.get_project_root()?;
    let bug_id = create_bug_file(
        &root,
@@ -38,7 +42,7 @@ pub(crate) fn tool_create_bug(args: &Value, ctx: &AppContext) -> Result<String,
        depends_on.as_deref(),
    )?;

-    crate::crdt_state::set_origin(&bug_id, &super::build_origin(args));
+    crate::crdt_state::set_origin(&bug_id, &origin);

    let _ = ctx
        .watcher_tx
@@ -243,7 +247,8 @@ mod tests {
                "steps_to_reproduce": "1. Open app\n2. Click login",
                "actual_result": "500 error",
                "expected_result": "Successful login",
-                "acceptance_criteria": ["Login succeeds without error"]
+                "acceptance_criteria": ["Login succeeds without error"],
+                "origin": {"kind": "test", "id": "test-suite"}
            }),
            &ctx,
        )
@@ -364,7 +369,8 @@ mod tests {
                "steps_to_reproduce": "s",
                "actual_result": "a",
                "expected_result": "e",
-                "acceptance_criteria": ["Bug is fixed"]
+                "acceptance_criteria": ["Bug is fixed"],
+                "origin": {"kind": "test", "id": "test-suite"}
            }),
            &ctx,
        );
@@ -406,7 +412,8 @@ mod tests {
                "steps_to_reproduce": "s",
                "actual_result": "a",
                "expected_result": "e",
-                "acceptance_criteria": ["TODO", "Real AC"]
+                "acceptance_criteria": ["TODO", "Real AC"],
+                "origin": {"kind": "test", "id": "test-suite"}
            }),
            &ctx,
        );
@@ -13,6 +13,10 @@ use serde_json::{Value, json};
 pub(crate) fn tool_create_epic(args: &Value, ctx: &AppContext) -> Result<String, String> {
    let req = CreateEpicRequest::from_json(args)?;

+    // Bug 1102: resolve and validate origin BEFORE creating the epic so a
+    // missing-attribution call leaves no half-state behind.
+    let origin = super::build_origin(args)?;
+
    let root = ctx.state.get_project_root()?;
    let success_criteria = req.success_criteria_strings();

@@ -29,7 +33,7 @@ pub(crate) fn tool_create_epic(args: &Value, ctx: &AppContext) -> Result<String,
        },
    )?;

-    crate::crdt_state::set_origin(&epic_id, &super::build_origin(args));
+    crate::crdt_state::set_origin(&epic_id, &origin);

    Ok(format!("Created epic: {epic_id}"))
 }
@@ -170,7 +174,8 @@ mod tests {
                "name": "My Test Epic",
                "goal": "Achieve something great",
                "motivation": "Because it matters",
-                "success_criteria": ["All stories done", "Tests pass"]
+                "success_criteria": ["All stories done", "Tests pass"],
+                "origin": {"kind": "test", "id": "test-suite"}
            }),
            &ctx,
        );
@@ -223,7 +228,11 @@ mod tests {

        // Create an epic.
        tool_create_epic(
-            &json!({"name": "List Epics Test Epic", "goal": "Testing list"}),
+            &json!({
+                "name": "List Epics Test Epic",
+                "goal": "Testing list",
+                "origin": {"kind": "test", "id": "test-suite"}
+            }),
            &ctx,
        )
        .unwrap();
@@ -14,29 +14,50 @@ mod story;

 /// Build a compact origin JSON string for a newly-created work item (story 1088).
 ///
-/// `args` may contain an `"origin"` object with `kind`, `id`, and `ts` fields
-/// supplied by the caller (e.g. a coder agent passing its own identity).  When
-/// absent the default is `{"kind":"user","id":"","ts":<now>}`.
+/// `args` must contain an `"origin"` object with a non-empty `id` field and an
+/// optional `kind` (defaulting to `"user"`) and `ts` (defaulting to now).  The
+/// `id` MUST identify the calling actor — e.g. `coder-1@story=42` for a coder
+/// agent, `chat-bot:Timmy@<room_id>` for a chat-bot session, or a human's user
+/// id for a CLI/MCP-direct call.  Empty / whitespace-only `id` is rejected so
+/// that every work item carries a usable provenance trail (bug 1102 — we lost
+/// 1102's attribution because the default was `id=""`).
 ///
-/// Callers that create items on behalf of system automation (e.g. gate-failure
-/// auto-filing) should pass `kind = "system"` and `id = "<automation-name>"`.
-pub(super) fn build_origin(args: &serde_json::Value) -> String {
+/// Returns the canonical origin JSON string on success.  Returns `Err` with a
+/// human-readable explanation when the caller failed to identify itself; the
+/// caller (`tool_create_*` handlers) must propagate the error without creating
+/// the work item, so a missing-attribution call leaves no half-state behind.
+pub(super) fn build_origin(args: &serde_json::Value) -> Result<String, String> {
    let ts = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs_f64();

-    if let Some(origin_obj) = args.get("origin").and_then(|v| v.as_object()) {
-        let kind = origin_obj
-            .get("kind")
-            .and_then(|v| v.as_str())
-            .unwrap_or("user");
-        let id = origin_obj.get("id").and_then(|v| v.as_str()).unwrap_or("");
-        let ts_val = origin_obj.get("ts").and_then(|v| v.as_f64()).unwrap_or(ts);
-        serde_json::json!({"kind": kind, "id": id, "ts": ts_val}).to_string()
-    } else {
-        serde_json::json!({"kind": "user", "id": "", "ts": ts}).to_string()
-    }
+    let origin_obj = args.get("origin").and_then(|v| v.as_object()).ok_or(
+        "Missing required argument: 'origin'. Every create_* MCP call must \
+         identify the calling actor. Pass origin = { \"kind\": \"agent\" | \
+         \"chat-bot\" | \"user\" | \"system\", \"id\": \"<your-identifier>\", \
+         \"ts\": <unix-seconds, optional> }. Example: { \"kind\": \"agent\", \
+         \"id\": \"coder-1@story=42\" }.",
+    )?;
+
+    let id = origin_obj
+        .get("id")
+        .and_then(|v| v.as_str())
+        .map(str::trim)
+        .filter(|s| !s.is_empty())
+        .ok_or(
+            "origin.id must be a non-empty string identifying the calling \
+             actor (e.g. \"coder-1@story=42\", \"chat-bot:Timmy@!room:home\", \
+             or a human user id). See bug 1102 / story 1104 for the rationale.",
+        )?;
+
+    let kind = origin_obj
+        .get("kind")
+        .and_then(|v| v.as_str())
+        .unwrap_or("user");
+    let ts_val = origin_obj.get("ts").and_then(|v| v.as_f64()).unwrap_or(ts);
+
+    Ok(serde_json::json!({"kind": kind, "id": id, "ts": ts_val}).to_string())
 }

 pub(crate) use bug::{tool_close_bug, tool_create_bug, tool_list_bugs};
@@ -27,6 +27,10 @@ pub(crate) fn tool_create_refactor(args: &Value, ctx: &AppContext) -> Result<Str
    let description = req.description.as_ref().map(|d| d.as_str());
    let depends_on = req.depends_on_ids();

+    // Bug 1102: resolve and validate origin BEFORE creating the refactor file
+    // so a missing-attribution call leaves no half-state behind.
+    let origin = super::build_origin(args)?;
+
    let root = ctx.state.get_project_root()?;
    let refactor_id = create_refactor_file(
        &root,
@@ -36,7 +40,7 @@ pub(crate) fn tool_create_refactor(args: &Value, ctx: &AppContext) -> Result<Str
        depends_on.as_deref(),
    )?;

-    crate::crdt_state::set_origin(&refactor_id, &super::build_origin(args));
+    crate::crdt_state::set_origin(&refactor_id, &origin);

    let _ = ctx
        .watcher_tx
@@ -114,7 +118,11 @@ mod tests {
        let tmp = tempfile::tempdir().unwrap();
        let ctx = test_ctx(tmp.path());
        let result = tool_create_refactor(
-            &json!({"name": "Single Criterion Refactor", "acceptance_criteria": ["Code is clean"]}),
+            &json!({
+                "name": "Single Criterion Refactor",
+                "acceptance_criteria": ["Code is clean"],
+                "origin": {"kind": "test", "id": "test-suite"}
+            }),
            &ctx,
        );
        assert!(result.is_ok(), "expected ok: {result:?}");
@@ -141,7 +149,11 @@ mod tests {
        let tmp = tempfile::tempdir().unwrap();
        let ctx = test_ctx(tmp.path());
        let result = tool_create_refactor(
-            &json!({"name": "Mixed Refactor", "acceptance_criteria": ["TODO", "Real AC"]}),
+            &json!({
+                "name": "Mixed Refactor",
+                "acceptance_criteria": ["TODO", "Real AC"],
+                "origin": {"kind": "test", "id": "test-suite"}
+            }),
            &ctx,
        );
        assert!(result.is_ok(), "expected ok for mixed AC: {result:?}");
@@ -27,6 +27,10 @@ pub(crate) fn tool_create_spike(args: &Value, ctx: &AppContext) -> Result<String
    let description = req.description.as_ref().map(|d| d.as_str());
    let depends_on = req.depends_on_ids();

+    // Bug 1102: resolve and validate origin BEFORE creating the spike file so
+    // a missing-attribution call leaves no half-state behind.
+    let origin = super::build_origin(args)?;
+
    let root = ctx.state.get_project_root()?;
    let spike_id = create_spike_file(
        &root,
@@ -36,7 +40,7 @@ pub(crate) fn tool_create_spike(args: &Value, ctx: &AppContext) -> Result<String
        depends_on.as_deref(),
    )?;

-    crate::crdt_state::set_origin(&spike_id, &super::build_origin(args));
+    crate::crdt_state::set_origin(&spike_id, &origin);

    let _ = ctx
        .watcher_tx
@@ -85,8 +89,14 @@ mod tests {
    fn tool_create_spike_rejects_empty_name() {
        let tmp = tempfile::tempdir().unwrap();
        let ctx = test_ctx(tmp.path());
-        let result =
-            tool_create_spike(&json!({"name": "!!!", "acceptance_criteria": ["AC"]}), &ctx);
+        let result = tool_create_spike(
+            &json!({
+                "name": "!!!",
+                "acceptance_criteria": ["AC"],
+                "origin": {"kind": "test", "id": "test-suite"}
+            }),
+            &ctx,
+        );
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("alphanumeric"));
    }
@@ -115,7 +125,8 @@ mod tests {
            &json!({
                "name": "Compare Encoders",
                "description": "Which encoder is fastest?",
-                "acceptance_criteria": ["Encoder comparison is documented"]
+                "acceptance_criteria": ["Encoder comparison is documented"],
+                "origin": {"kind": "test", "id": "test-suite"}
            }),
            &ctx,
        )
@@ -140,7 +151,11 @@ mod tests {
        let ctx = test_ctx(tmp.path());

        let result = tool_create_spike(
-            &json!({"name": "My Spike", "acceptance_criteria": ["Spike findings documented"]}),
+            &json!({
+                "name": "My Spike",
+                "acceptance_criteria": ["Spike findings documented"],
+                "origin": {"kind": "test", "id": "test-suite"}
+            }),
            &ctx,
        )
        .unwrap();
@@ -190,7 +205,11 @@ mod tests {
        let tmp = tempfile::tempdir().unwrap();
        let ctx = test_ctx(tmp.path());
        let result = tool_create_spike(
-            &json!({"name": "Single Criterion Spike", "acceptance_criteria": ["Findings documented"]}),
+            &json!({
+                "name": "Single Criterion Spike",
+                "acceptance_criteria": ["Findings documented"],
+                "origin": {"kind": "test", "id": "test-suite"}
+            }),
            &ctx,
        );
        assert!(result.is_ok(), "expected ok: {result:?}");
@@ -217,7 +236,11 @@ mod tests {
        let tmp = tempfile::tempdir().unwrap();
        let ctx = test_ctx(tmp.path());
        let result = tool_create_spike(
-            &json!({"name": "Mixed Spike", "acceptance_criteria": ["TODO", "Real AC"]}),
+            &json!({
+                "name": "Mixed Spike",
+                "acceptance_criteria": ["TODO", "Real AC"],
+                "origin": {"kind": "test", "id": "test-suite"}
+            }),
            &ctx,
        );
        assert!(result.is_ok(), "expected ok for mixed AC: {result:?}");
@@ -15,6 +15,10 @@ use serde_json::Value;
 pub(crate) fn tool_create_story(args: &Value, ctx: &AppContext) -> Result<String, String> {
    let req = CreateStoryRequest::from_json(args)?;

+    // Bug 1102: resolve and validate origin BEFORE creating the story file so
+    // a missing-attribution call leaves no half-state behind.
+    let origin = super::super::build_origin(args)?;
+
    let root = ctx.state.get_project_root()?;
    let depends_on_ids = req.depends_on_ids();

@@ -31,7 +35,7 @@ pub(crate) fn tool_create_story(args: &Value, ctx: &AppContext) -> Result<String
        false,
    )?;

-    crate::crdt_state::set_origin(&story_id, &super::super::build_origin(args));
+    crate::crdt_state::set_origin(&story_id, &origin);

    let _ = ctx
        .watcher_tx
@@ -255,7 +259,11 @@ mod tests {
        let tmp = tempfile::tempdir().unwrap();
        let ctx = test_ctx(tmp.path());
        let result = tool_create_story(
-            &json!({"name": "Single Criterion Story", "acceptance_criteria": ["It works"]}),
+            &json!({
+                "name": "Single Criterion Story",
+                "acceptance_criteria": ["It works"],
+                "origin": {"kind": "test", "id": "test-suite"}
+            }),
            &ctx,
        );
        assert!(result.is_ok(), "expected ok: {result:?}");
@@ -278,7 +286,11 @@ mod tests {
        let tmp = tempfile::tempdir().unwrap();
        let ctx = test_ctx(tmp.path());
        let result = tool_create_story(
-            &json!({"name": "Mixed Story", "acceptance_criteria": ["TODO", "Real AC"]}),
+            &json!({
+                "name": "Mixed Story",
+                "acceptance_criteria": ["TODO", "Real AC"],
+                "origin": {"kind": "test", "id": "test-suite"}
+            }),
            &ctx,
        );
        assert!(result.is_ok(), "expected ok for mixed AC: {result:?}");
@@ -294,7 +306,8 @@ mod tests {
            &json!({
                "name": "Story With Description",
                "description": "This is the background context.",
-                "acceptance_criteria": ["Described well"]
+                "acceptance_criteria": ["Described well"],
+                "origin": {"kind": "test", "id": "test-suite"}
            }),
            &ctx,
        )
@@ -361,7 +374,8 @@ mod tests {
        let result = tool_create_story(
            &json!({
                "name": "Story with <b>bold</b> name",
-                "acceptance_criteria": ["AC1"]
+                "acceptance_criteria": ["AC1"],
+                "origin": {"kind": "test", "id": "test-suite"}
            }),
            &ctx,
        );
@@ -130,7 +130,11 @@ mod tests {
        let ctx = test_ctx(tmp.path());

        let result = super::super::tool_create_story(
-            &json!({"name": "Test Story", "acceptance_criteria": ["AC1", "AC2"]}),
+            &json!({
+                "name": "Test Story",
+                "acceptance_criteria": ["AC1", "AC2"],
+                "origin": {"kind": "test", "id": "test-suite"}
+            }),
            &ctx,
        )
        .unwrap();
@@ -2,6 +2,31 @@

 use serde_json::{Value, json};

+/// JSON schema fragment for the `origin` argument required by every `create_*`
+/// tool (bug 1102).  The caller MUST identify itself — empty `id` is rejected
+/// server-side so every work item carries a usable provenance trail.
+fn origin_schema() -> Value {
+    json!({
+        "type": "object",
+        "description": "Required: identifies the calling actor so every work item carries provenance. Empty/missing id is rejected (bug 1102). Examples: { \"kind\": \"agent\", \"id\": \"coder-1@story=42\" }, { \"kind\": \"chat-bot\", \"id\": \"Timmy@!room:home.local\" }, { \"kind\": \"user\", \"id\": \"dave\" }.",
+        "properties": {
+            "kind": {
+                "type": "string",
+                "description": "One of: \"agent\" (LLM coder/mergemaster/qa), \"chat-bot\" (Timmy or other chat-routed bot), \"user\" (human via CLI/MCP), \"system\" (server-automation)."
+            },
+            "id": {
+                "type": "string",
+                "description": "Non-empty identifier of the caller. For agents include the story id (e.g. \"coder-1@story=42\"); for chat-bots include the room/session (e.g. \"Timmy@!room:home.local\"); for users the user id or short name."
+            },
+            "ts": {
+                "type": "number",
+                "description": "Optional unix-seconds timestamp. Defaults to the server's clock when absent."
+            }
+        },
+        "required": ["kind", "id"]
+    })
+}
+
 /// Returns tool schemas for story/work-item lifecycle management.
 pub(super) fn story_tools() -> Vec<Value> {
    vec![
@@ -37,9 +62,10 @@ pub(super) fn story_tools() -> Vec<Value> {
                    "commit": {
                        "type": "boolean",
                        "description": "If true, git-add and git-commit the new story file to the current branch"
-                    }
+                    },
+                    "origin": origin_schema()
                },
-                "required": ["name", "acceptance_criteria"]
+                "required": ["name", "acceptance_criteria", "origin"]
            }
        }),
        json!({
@@ -282,9 +308,10 @@ pub(super) fn story_tools() -> Vec<Value> {
                        "items": { "type": "string" },
                        "minItems": 1,
                        "description": "List of acceptance criteria (at least one required)"
-                    }
+                    },
+                    "origin": origin_schema()
                },
-                "required": ["name", "acceptance_criteria"]
+                "required": ["name", "acceptance_criteria", "origin"]
            }
        }),
        json!({
@@ -323,9 +350,10 @@ pub(super) fn story_tools() -> Vec<Value> {
                        "type": "array",
                        "items": { "type": "integer" },
                        "description": "Optional list of story numbers this bug depends on (e.g. [42, 43]). Persisted as depends_on in YAML front matter."
-                    }
+                    },
+                    "origin": origin_schema()
                },
-                "required": ["name", "description", "steps_to_reproduce", "actual_result", "expected_result", "acceptance_criteria"]
+                "required": ["name", "description", "steps_to_reproduce", "actual_result", "expected_result", "acceptance_criteria", "origin"]
            }
        }),
        json!({
@@ -360,9 +388,10 @@ pub(super) fn story_tools() -> Vec<Value> {
                        "type": "array",
                        "items": { "type": "integer" },
                        "description": "Optional list of story numbers this refactor depends on (e.g. [42, 43]). Persisted as depends_on in YAML front matter."
-                    }
+                    },
+                    "origin": origin_schema()
                },
-                "required": ["name", "acceptance_criteria"]
+                "required": ["name", "acceptance_criteria", "origin"]
            }
        }),
        json!({
@@ -399,9 +428,10 @@ pub(super) fn story_tools() -> Vec<Value> {
                        "type": "array",
                        "items": { "type": "string" },
                        "description": "Optional: list of high-level success criteria for the epic"
-                    }
+                    },
+                    "origin": origin_schema()
                },
-                "required": ["name", "goal"]
+                "required": ["name", "goal", "origin"]
            }
        }),
        json!({
@@ -238,6 +238,12 @@ async fn main() -> Result<(), std::io::Error> {
            .map(|c| c.permission_timeout_secs)
            .unwrap_or(120),
        status: agents.status_broadcaster(),
+        chat_dispatcher: std::sync::Arc::new(chat::dispatcher::ChatDispatcher::new(
+            bot_cfg
+                .as_ref()
+                .map(|c| c.coalesce_window_ms)
+                .unwrap_or(1_500),
+        )),
    });

    // Sled uplink: forward permission requests to an upstream gateway when configured.
@@ -78,6 +78,16 @@ pub fn apply_transition(
        super::Stage::Rejected { reason, .. } | super::Stage::Blocked { reason } => {
            crate::crdt_state::set_resume_to_raw(story_id, reason);
        }
+        // Story 1105: write the resume target so read-back can reconstruct the
+        // correct variant.  Without this, the register is stale (or empty) and
+        // the deserialiser falls back to Coding regardless of where the story
+        // was when it was frozen.
+        super::Stage::Frozen { resume_to } => {
+            crate::crdt_state::set_resume_to_raw(story_id, resume_to.dir_name());
+        }
+        super::Stage::ReviewHold { resume_to, .. } => {
+            crate::crdt_state::set_resume_to_raw(story_id, resume_to.dir_name());
+        }
        _ => {}
    }

@@ -656,6 +656,52 @@ fn freeze_transitions_to_frozen_variant_with_resume_to() {
    );
 }

+// ── Story 1105: Freeze from Backlog round-trip ───────────────────────────────
+
+/// Regression test (story 1105): freezing a Backlog story must store
+/// `resume_to = Backlog` in the CRDT register so that unfreeze returns the
+/// story to Backlog, not to Coding.
+#[test]
+fn freeze_from_backlog_round_trips_resume_to_backlog() {
+    crate::crdt_state::init_for_test();
+    crate::db::ensure_content_store();
+
+    let story_id = "991105_freeze_backlog_roundtrip";
+    crate::db::write_item_with_content(
+        story_id,
+        "1_backlog",
+        "---\nname: Freeze Backlog Round-trip\n---\n# Story\n",
+        crate::db::ItemMeta::named("Freeze Backlog Round-trip"),
+    );
+
+    let item = read_typed(story_id).unwrap().unwrap();
+    assert!(
+        matches!(item.stage, Stage::Backlog),
+        "expected Backlog, got {:?}",
+        item.stage
+    );
+
+    super::apply::transition_to_frozen(story_id).expect("freeze should succeed");
+
+    let item = read_typed(story_id).unwrap().unwrap();
+    match &item.stage {
+        Stage::Frozen { resume_to } => assert!(
+            matches!(**resume_to, Stage::Backlog),
+            "resume_to should be Backlog after freezing from Backlog; got {resume_to:?}"
+        ),
+        other => panic!("expected Stage::Frozen after freeze; got {other:?}"),
+    }
+
+    super::apply::transition_to_unfrozen(story_id).expect("unfreeze should succeed");
+
+    let item = read_typed(story_id).unwrap().unwrap();
+    assert!(
+        matches!(item.stage, Stage::Backlog),
+        "unfreeze should restore to Backlog, not Coding; got {:?}",
+        item.stage
+    );
+}
+
 // ── Story 868: MergeFailure regression ─────────────────────────────

 /// Regression test (story 868): applying `PipelineEvent::MergeFailed` to a story
@@ -91,8 +91,10 @@ pub fn sigkill_pids_and_verify(pids: &[u32]) -> Result<usize, Vec<u32>> {
 /// "every process running in `<worktree>/`" or "every cargo invocation
 /// against this `Cargo.toml`".
 pub fn pids_matching(pattern: &str) -> Vec<u32> {
+    // `--` prevents pgrep (getopt_long-based) from interpreting patterns that
+    // start with `--` (e.g. `--manifest-path ...`) as unrecognised long options.
    let Ok(output) = std::process::Command::new("pgrep")
-        .args(["-f", pattern])
+        .args(["-f", "--", pattern])
        .output()
    else {
        return Vec::new();
@@ -267,6 +269,50 @@ mod tests {
        let _ = reaper.join();
    }

+    /// Verify that the merge-gate's stale-cargo kill pattern actually kills
+    /// processes whose command line contains `--manifest-path .../Cargo.toml`.
+    #[test]
+    fn stale_cargo_pattern_kills_matching_process() {
+        use std::os::unix::process::CommandExt;
+        let unique = format!("{}-{}", std::process::id(), rand_u64());
+        let fake_manifest = format!("/tmp/huskies-test-{unique}/Cargo.toml");
+        let argv0 = format!("cargo test --manifest-path {fake_manifest}");
+
+        let child: Child = Command::new("sleep")
+            .arg0(&argv0)
+            .arg("60")
+            .stdout(Stdio::null())
+            .stderr(Stdio::null())
+            .stdin(Stdio::null())
+            .spawn()
+            .expect("failed to spawn sleeper");
+        let pid = child.id();
+        let reaper = std::thread::spawn(move || {
+            let mut c = child;
+            let _ = c.wait();
+        });
+
+        std::thread::sleep(std::time::Duration::from_millis(100));
+
+        let pattern = format!("--manifest-path {fake_manifest}");
+        let pids = pids_matching(&pattern);
+        assert!(
+            pids.contains(&pid),
+            "pids_matching must find the stale-cargo-like process (pid {pid}); got {pids:?}"
+        );
+
+        let result = sigkill_pids_and_verify(&pids);
+        assert!(
+            result.is_ok(),
+            "sigkill_pids_and_verify must succeed for stale cargo pids: {result:?}"
+        );
+        assert!(
+            !pid_is_alive(pid),
+            "stale cargo-like process (pid {pid}) must be dead after kill"
+        );
+        let _ = reaper.join();
+    }
+
    #[test]
    fn pids_matching_returns_empty_when_no_match() {
        let pattern = format!("nonexistent-pattern-{}-{}", std::process::id(), rand_u64());
@@ -147,6 +147,7 @@ pub(super) fn call_sync(
        pending_perm_replies: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        permission_timeout_secs: 120,
        status: Arc::new(crate::service::status::StatusBroadcaster::new()),
+        chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
    });

    let dispatch = CommandDispatch {
@@ -558,6 +558,12 @@ pub fn spawn_gateway_bot(
            .as_ref()
            .map(|c| c.permission_timeout_secs)
            .unwrap_or(120),
+        chat_dispatcher: std::sync::Arc::new(crate::chat::dispatcher::ChatDispatcher::new(
+            bot_cfg
+                .as_ref()
+                .map(|c| c.coalesce_window_ms)
+                .unwrap_or(1_500),
+        )),
    });

    let timer_store = std::sync::Arc::new(crate::service::timer::TimerStore::load(
@@ -142,7 +142,10 @@ pub async fn delete_work_item(
    }

    // 5. Delete from database content store and shadow table.
-    crate::db::delete_item(story_id);
+    // Use the synchronous variant so the pipeline_items row is gone before we
+    // return — the fire-and-forget channel cannot guarantee the DELETE commits
+    // before a restart, which leaves zombie rows (bug 1094).
+    crate::db::delete_item_sync(story_id).await;
    slog_warn!("[delete_work_item] Deleted '{story_id}' from content store / shadow table");

    // 6. Remove the filesystem shadow file from work/N_stage/.
@@ -6,6 +6,7 @@
 //! transport's context struct.

 use crate::agents::AgentPool;
+use crate::chat::dispatcher::ChatDispatcher;
 use crate::http::context::{PermissionDecision, PermissionForward};
 use crate::service::status::StatusBroadcaster;
 use std::collections::{HashMap, HashSet};
@@ -44,6 +45,12 @@ pub struct Services {
    /// only to subscribers of this instance, providing natural multi-project
    /// isolation.
    pub status: Arc<StatusBroadcaster>,
+    /// Protocol-agnostic chat dispatcher shared by all transport handlers.
+    ///
+    /// Transport handlers call [`ChatDispatcher::submit`] instead of spawning
+    /// `claude -p` directly.  The dispatcher applies a coalesce window and a
+    /// per-session serial lock, preventing duplicate concurrent spawns.
+    pub chat_dispatcher: Arc<ChatDispatcher>,
 }

 #[cfg(test)]
@@ -63,6 +70,7 @@ impl Services {
            perm_rx: std::sync::Arc::new(TokioMutex::new(perm_rx)),
            pending_perm_replies: std::sync::Arc::new(TokioMutex::new(HashMap::new())),
            permission_timeout_secs: 120,
+            chat_dispatcher: std::sync::Arc::new(ChatDispatcher::new(1_500)),
        })
    }
 }
@@ -532,6 +532,7 @@ mod tests {
            perm_rx: Arc::new(tokio::sync::Mutex::new(perm_rx)),
            pending_perm_replies: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
            permission_timeout_secs: 120,
+            chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
        });
        // Empty URL → noop; if it panicked or blocked the test would fail.
        spawn_uplink_task(
@@ -603,6 +604,7 @@ mod tests {
            perm_rx: Arc::new(tokio::sync::Mutex::new(perm_rx)),
            pending_perm_replies: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
            permission_timeout_secs: 120,
+            chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
        });

        spawn_uplink_task(
@@ -700,6 +702,7 @@ mod tests {
            perm_rx: Arc::new(tokio::sync::Mutex::new(perm_rx)),
            pending_perm_replies: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
            permission_timeout_secs: 120,
+            chat_dispatcher: Arc::new(crate::chat::dispatcher::ChatDispatcher::new(1_500)),
        });

        spawn_uplink_task(
@@ -360,6 +360,10 @@ pub(crate) async fn init_subsystems(app_state: &Arc<SessionState>, cwd: &Path, i
            // Story 1052: remove stale MergeJob entries for terminal-stage
            // stories so they can never cause "FAILED" labels in the UI.
            crdt_state::purge_done_stage_merge_jobs();
+            // Story 1094: delete pipeline_items rows whose CRDT entry is
+            // tombstoned but whose row survived with a non-terminal stage
+            // (pre-1094 fire-and-forget delete could be lost on restart).
+            crdt_state::migrate_zombie_pipeline_rows().await;
        }
    }
 }
@@ -209,6 +209,10 @@ pub(crate) fn spawn_tick_loop(
            {
                crate::slog!("[reconcile] Running periodic reconcile pass.");
                run_reconcile_pass(r, &agents, done_retention).await;
+                // Stop LLM agents whose pipeline stage no longer matches the
+                // story's current canonical stage.  Cleans up stale agents left
+                // behind after a stage transition (story 1100).
+                agents.reconcile_canonical_agents(r).await;
            }
        }
    });
Author	SHA1	Message	Date
Timmy	7db0b78e88	Bump version to 0.11.1	2026-05-15 23:38:09 +01:00
dave	979492449e	huskies: merge 1105 bug Freeze from Backlog stores wrong resume_to — Unfreeze restores to Coding instead of Backlog	2026-05-15 22:33:54 +00:00
Timmy	6fbe239313	fix(1102): require non-empty origin.id on create_* MCP tools bug 1102 was created today with origin={kind:user, id:""} because build_origin silently defaulted id to empty when the caller didn't pass one — we couldn't tell who filed it. Bug 1088's origin field is useless as audit if every caller can omit themselves. Changes: - build_origin (server/src/http/mcp/story_tools/mod.rs) now returns Result<String, String> and rejects missing/empty/whitespace-only id with an instructional error pointing at bug 1102 / story 1104. - 5 create_* tool handlers (bug, spike, refactor, epic, story) now resolve origin BEFORE create_*_file so an attribution-less call leaves no half-state behind. - 5 tool input schemas now advertise origin as a required object via a shared origin_schema() helper. The schema description gives every caller (coder agent, chat bot, user, system) a concrete example so the LLM populates the field correctly on first sight. - Test fixtures pass origin = {kind:"test", id:"test-suite"}. Story 1104 (signed actions) is the longer-term replacement; this is the quick attribution win agreed for master ahead of that design work. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-15 23:13:54 +01:00
Timmy	26527e7dae	diag(1101): log classify verdict + matched trigger on merge gate failures Bug 1101's reframed AC1: when a non-success merge runs, log the typed GateFailureKind, the matched classifier-trigger substring (if any) and ~90 chars of surrounding context. Fires on every gate failure regardless of routing, so the next fixup-loop bounce will tell us which substring is fooling classify() into Fmt\|Lint\|SourceMapCheck on what's actually a Test failure. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-15 23:13:38 +01:00
dave	04a57e92c2	huskies: merge 1103 bug Rate-limit warning at session start sticks the `rate_limit_exit` flag, causing 1053's fast-path bypass to skip completion on clean session exits	2026-05-15 21:02:37 +00:00
dave	d59efa0b5c	huskies: regen source-map.json	2026-05-15 20:24:31 +00:00
dave	4216ced493	huskies: merge 1100 bug Multiple LLM agents can run concurrently on the same story (coder + mergemaster + others) — enforce one-agent-per-story invariant	2026-05-15 20:24:31 +00:00
dave	9f4f493486	huskies: regen source-map.json	2026-05-15 19:05:56 +00:00
dave	63d86f1263	huskies: merge 1096 bug Shadow drift: set_agent writes CRDT agent register without updating pipeline_items.agent	2026-05-15 19:05:56 +00:00
dave	398a5806e7	huskies: regen source-map.json	2026-05-15 18:25:25 +00:00
dave	1adc734801	huskies: merge 1098 bug Shadow drift: set_retry_count / bump_retry_count write CRDT register without updating pipeline_items.retry_count	2026-05-15 18:25:25 +00:00
dave	0ae6dfd565	huskies: regen source-map.json	2026-05-15 12:40:17 +00:00
dave	8531bac6cd	huskies: merge 1097 bug Shadow drift: set_depends_on writes CRDT depends_on register without updating pipeline_items.depends_on	2026-05-15 12:40:17 +00:00
dave	ce13c00ebd	huskies: regen source-map.json	2026-05-15 12:27:48 +00:00
dave	2857c3b46b	huskies: merge 1094 bug delete_story leaks zombie rows in pipeline_items shadow table — 176 tombstoned items still report non-terminal stages	2026-05-15 12:27:48 +00:00
dave	d944885ce9	huskies: regen source-map.json	2026-05-15 12:10:11 +00:00
dave	62d1535e76	huskies: merge 1095 bug Shadow drift: set_name writes CRDT name register without updating pipeline_items.name	2026-05-15 12:10:11 +00:00
dave	46556d308a	huskies: regen source-map.json	2026-05-15 12:03:09 +00:00
dave	fc5481dbe4	huskies: merge 1093 bug Chat dispatcher spawns one Timmy per inbound message — needs coalesce window + per-session serial lock	2026-05-15 12:03:09 +00:00
dave	01e60a670c	huskies: merge 1091 refactor Migrate the merge-gate's stale-cargo kill path to `process_kill`	2026-05-15 11:50:03 +00:00
dave	c4010854a5	huskies: merge 1089 bug Stuck-agent detector blocks stories on legitimate exploration / debugging — uses too narrow a "progress" signal	2026-05-15 11:40:44 +00:00
dave	fb1311cdae	huskies: regen source-map.json	2026-05-15 11:16:16 +00:00
dave	4aa76ce673	huskies: merge 1090 refactor Migrate `AgentPool::kill_all_children` and `kill_child_for_key` to `process_kill` so server shutdown and `stop_agent` actually kill claude	2026-05-15 11:16:16 +00:00