Compare commits

...

29 Commits

Author SHA1 Message Date
Timmy fb82bd7bca test(tick_loop): de-flake reconcile_never_floods_broadcast_channel
The test asserted msg_count == 0 on a process-global broadcast channel
(TRANSITION_TX is a single OnceLock<Sender> shared across the test
binary), so any concurrent test calling apply_transition could land
events in our receiver between the drain and the post-reconcile check.
Observed failure: 3 stray transitions from parallel tests.

Drop the strict count check.  The real "never floods" invariant is
captured by the Lagged check alone: 1000 seeded items must not overflow
the 256-slot channel, which can only hold if the reconcile path
bypasses the broadcast (AC4).  The sibling test
`reconcile_pass_scales_to_1000_items_without_lagged_divergence` already
uses this Lagged-only pattern.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 11:13:31 +01:00
Timmy b7df5cbe4e fix(agents): kill-then-status reorder in stop_agent
stop_agent had the same order-of-operations bug fixed in the watchdog:
status flipped to Failed before the claude process was verified gone,
opening the idempotency window that allowed a duplicate spawn to race
in alongside the surviving process.

Now follows the three-step protocol:
1. Read worktree path under a read-only lock (no mutation).
2. SIGKILL the worktree's process tree via process_kill and block
   until verified gone — start_agent's Running/Pending whitelist
   continues to reject duplicate spawns throughout.
3. Only then mutate the agent record, abort the task handle, and
   drop the child_killers entry.

Falls back to the old portable_pty SIGHUP path (with a warning) when
no worktree was recorded, matching the watchdog's behaviour.
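
The three-step order above can be sketched as follows. This is a hypothetical stand-in, not the real `stop_agent` — the `Agent`/`AgentStatus` types and the kill helper are assumptions invented for illustration; the point is that the kill-and-verify happens while the record still reads Running, so the Running/Pending whitelist keeps the idempotency window closed:

```rust
use std::collections::HashMap;
use std::sync::RwLock;

// Hypothetical stand-ins for the real agent-pool types (names assumed).
#[derive(Debug, PartialEq)]
enum AgentStatus {
    Running,
    Failed,
}

struct Agent {
    status: AgentStatus,
    worktree: Option<String>,
}

// Stand-in for the process_kill call: SIGKILL the worktree's process
// tree and block until every pid is verified gone.
fn kill_worktree_tree_and_verify(_worktree: &str) {}

fn stop_agent(agents: &RwLock<HashMap<String, Agent>>, id: &str) -> bool {
    // Step 1: read the worktree path under a read-only lock — no mutation yet.
    let worktree = match agents.read().unwrap().get(id) {
        Some(agent) => agent.worktree.clone(),
        None => return false,
    };
    // Step 2: kill and verify BEFORE touching the record. The agent still
    // reads Running here, so a concurrent start_agent (which whitelists
    // Running/Pending) keeps rejecting duplicate spawns for this story.
    if let Some(wt) = &worktree {
        kill_worktree_tree_and_verify(wt);
    }
    // Step 3: only after the process is verified gone, mutate the record
    // (the real code also aborts the task handle and drops child_killers).
    if let Some(agent) = agents.write().unwrap().get_mut(id) {
        agent.status = AgentStatus::Failed;
    }
    true
}
```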

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 10:46:02 +01:00
Timmy fe9804b32c feat: add process_kill module + use it to fix watchdog double-spawn
Adds `crate::process_kill` — reliable SIGKILL-with-verify primitives used
across the server in place of the various ad-hoc kill paths that ignored
whether the kill actually took effect. The module exposes three pieces:

  - `sigkill_pids_and_verify(pids)`: SIGKILL each pid and block (up to 2s)
    until every pid is verified gone. Returns survivors if not.
  - `pids_matching(pattern)`: pgrep -f wrapper.
  - `descendant_pids(root)`: recursive pgrep -P walker for process trees.

Wires the watchdog's limit-termination path through it, and reorders the
protocol to fix the duplicate-coder bug observed on story 1086 (2026-05-15):

  Before: check_agent_limits set status=Failed before the kill ran. The
  kill itself was `portable_pty::ChildKiller::kill()`, which sends SIGHUP
  on Unix — claude-code ignores SIGHUP, so the process kept running while
  the agent record was already marked terminated. The idempotency check
  in `start_agent` whitelists Running/Pending, so the next auto-assign
  pass spawned a fresh agent alongside the still-alive prior one. Two
  claude PIDs sharing one session_id, racing on the same worktree.

  After: status update is moved OUT of check_agent_limits and into the
  caller AFTER the kill is verified. The kill itself is now SIGKILL-the-
  process-tree-in-the-worktree, with explicit verification that every pid
  is gone. The idempotency window is closed.

The existing watchdog test suite (14 tests) still passes; 7 new tests
cover the process_kill primitives directly.

`agents/pool/process.rs`'s `kill_all_children` and `kill_child_for_key`
still use the old portable_pty SIGHUP path — they have the same bug but
in lower-impact code paths (shutdown, operator stop). They will be
migrated under a separate story to keep this commit focused.
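
A minimal sketch of what `sigkill_pids_and_verify` plausibly looks like — this shells out to `kill(1)` to keep the sketch self-contained (the real module may well use `libc`/`pgrep` directly), but the shape matches the contract described above: signal everything first, then poll up to the 2s deadline and return any survivors:

```rust
use std::process::Command;
use std::thread::sleep;
use std::time::{Duration, Instant};

/// SIGKILL each pid, then poll (up to 2s) until every pid is verified gone.
/// Returns the survivors; an empty vec means the kill was effective.
fn sigkill_pids_and_verify(pids: &[u32]) -> Vec<u32> {
    for pid in pids {
        // SIGKILL; ignore the exit status — the pid may already be gone.
        let _ = Command::new("kill").args(["-9", &pid.to_string()]).status();
    }
    let deadline = Instant::now() + Duration::from_secs(2);
    loop {
        // `kill -0` succeeds iff the pid still exists (no signal is sent).
        let survivors: Vec<u32> = pids
            .iter()
            .copied()
            .filter(|pid| {
                Command::new("kill")
                    .args(["-0", &pid.to_string()])
                    .status()
                    .map(|s| s.success())
                    .unwrap_or(false)
            })
            .collect();
        if survivors.is_empty() || Instant::now() >= deadline {
            return survivors;
        }
        sleep(Duration::from_millis(50));
    }
}
```

The key design point either way: the caller gets a *verified* answer back, instead of the fire-and-forget SIGHUP that let claude-code keep running.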

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 10:36:33 +01:00
Timmy 8446ab1c71 chore: gitignore .huskies/double_timmy_log.md
Local-only scratchpad for tracking suspected duplicate-Timmy /
duplicate-create_story incidents while we hunt the cause.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 10:06:37 +01:00
dave b5054b08d3 huskies: regen source-map.json 2026-05-15 08:47:38 +00:00
dave df32a1542b huskies: merge 1087 story Pipeline+Status split — Step D: migrate CRDT storage to (Pipeline, Status) and remove the Stage enum 2026-05-15 08:47:38 +00:00
dave e82602db77 huskies: merge 1086 story Pipeline+Status split — Step C: migrate auto-assign, subscribers, and lifecycle transitions to read Pipeline + Status 2026-05-15 08:26:39 +00:00
Timmy 2d6105c778 fix: skip setup commands on worktree reuse so reconciler doesn't fire npm ci every 30s
Story 1066 (merged 2026-05-14 23:39) introduced a periodic reconciler that
calls `reconcile_worktree_create` every 30 seconds (default
`reconcile_interval_secs`). The reconciler's docstring promises it is a no-op
for stories whose worktree already exists — but the implementation calls
`create_worktree`, whose reuse path was running `run_setup_commands`
unconditionally. Setup includes destructive `npm ci` (rm -rf node_modules
then reinstall), so every Coding story got `npm ci` fired every 30 seconds.

When story 1086 hit a gate-failure retry loop on 2026-05-15, the merge gate's
own `npm install`/`npm run build` raced one of these reconciler-driven
`npm ci` runs that was wiping node_modules — leaving `.bin/tsc` as a broken
symlink pointing into a half-populated `typescript/` package and producing
`sh: 1: tsc: not found`. 37 npm ci fires for 1086 in 5 hours against only
3 real Coding transitions, a 12x amplification driven entirely by the
30-second reconcile cadence.

Fix: align `create_worktree`'s behaviour with the contract `reconcile_worktree_create`
already documents — reuse is a no-op for setup commands. Sparse checkout
and `.mcp.json` rewrite still run (both cheap and idempotent).
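
The shape of the fix, sketched with hypothetical helper names (`run_setup_commands`, `configure_sparse_checkout`, `rewrite_mcp_json` are stand-ins; only the guard logic is the point): detect reuse up front, gate the destructive setup on it, and leave the cheap idempotent steps running on every reconcile pass.

```rust
use std::io;
use std::path::Path;

// Stand-ins for the real helpers (names assumed from the commit message).
fn run_setup_commands(_worktree: &Path) -> io::Result<()> {
    // Destructive: includes `npm ci`, which removes node_modules first.
    Ok(())
}
fn configure_sparse_checkout(_worktree: &Path) -> io::Result<()> { Ok(()) }
fn rewrite_mcp_json(_worktree: &Path) -> io::Result<()> { Ok(()) }

/// Returns whether the worktree was reused. Setup commands run only on
/// fresh creation; sparse checkout and the `.mcp.json` rewrite are cheap
/// and idempotent, so they still run on every reconcile pass.
fn create_worktree(path: &Path) -> io::Result<bool> {
    let reused = path.exists();
    if !reused {
        std::fs::create_dir_all(path)?;
        run_setup_commands(path)?; // fresh worktrees only — never on reuse
    }
    configure_sparse_checkout(path)?;
    rewrite_mcp_json(path)?;
    Ok(reused)
}
```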

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 08:57:38 +01:00
Timmy d89940e85b fix: drop source-map.json from agent orientation bundle
The orientation bundle was 96 KB per coder spawn with 85 KB of that being
source-map.json — a static symbol listing that drowned out the workflow rules
in AGENT.md and likely explains why PLAN.md ceremony is being skipped (the
instruction is ~5% of the bundle, buried under a wall of symbols). Agents are
excellent at grep on demand, so the source map adds little value as a preloaded
cheat sheet. File stays on disk for the merge-time source-map-check doc-coverage
gate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 07:48:18 +01:00
dave 60fceee204 huskies: regen source-map.json 2026-05-15 02:03:30 +00:00
dave 13f7dab5f0 huskies: merge 1088 2026-05-15 02:03:30 +00:00
dave f7413cc711 huskies: regen source-map.json 2026-05-15 01:38:05 +00:00
dave b053f14d58 huskies: merge 1085 2026-05-15 01:38:05 +00:00
dave 56179d712e huskies: merge 1078 2026-05-15 01:32:29 +00:00
dave a06bf6778b huskies: regen source-map.json 2026-05-15 01:27:25 +00:00
dave 1506141155 huskies: merge 1072 2026-05-15 01:27:25 +00:00
dave ae69cd50b1 huskies: regen source-map.json 2026-05-15 00:58:57 +00:00
dave 0c23d209a0 huskies: merge 1077 2026-05-15 00:58:57 +00:00
dave eac5763e03 huskies: merge 1075 2026-05-15 00:48:06 +00:00
dave 6530eeab6d huskies: merge 811 2026-05-15 00:42:14 +00:00
dave 5eb8f2f8a7 huskies: regen source-map.json 2026-05-15 00:37:01 +00:00
dave f9b140add9 huskies: merge 1073 2026-05-15 00:37:01 +00:00
dave d4db96f709 huskies: merge 1070 2026-05-15 00:20:29 +00:00
dave 5f08573db8 huskies: merge 1076 2026-05-15 00:10:15 +00:00
dave da83fcb78d huskies: merge 1074 2026-05-15 00:01:58 +00:00
dave f04bdd1f14 huskies: regen source-map.json 2026-05-14 23:45:53 +00:00
dave bb6a6063e8 huskies: merge 1066 2026-05-14 23:45:53 +00:00
dave bf813d910b huskies: regen source-map.json 2026-05-14 23:29:32 +00:00
dave 374aa77f27 huskies: merge 1069 2026-05-14 23:29:32 +00:00
75 changed files with 3137 additions and 578 deletions
+23
@@ -0,0 +1,23 @@
#!/bin/sh
#
# Pre-commit hook installed by huskies.
# Runs script/check (fmt-check, clippy, cargo check, source-map-check)
# before every commit. Aborts if any gate fails.
#
# Emergency bypass: git commit --no-verify (see AGENT.md — avoid this)
REPO_ROOT="$(git rev-parse --show-toplevel)"
printf '[pre-commit] Running script/check ...\n'
OUTPUT=$("$REPO_ROOT/script/check" 2>&1)
STATUS=$?
if [ "$STATUS" -ne 0 ]; then
printf '\n=== PRE-COMMIT HOOK FAILED ===\n\n'
printf '%s\n' "$OUTPUT"
printf '\nFix the issues above, then re-validate with:\n'
printf ' script/check\n'
printf '\nEmergency bypass (see AGENT.md -- avoid this):\n'
printf ' git commit --no-verify\n\n'
exit 1
fi
+1
@@ -29,6 +29,7 @@ timers.json
 # Misc
 wishlist.md
+double_timmy_log.md
 # Database
 pipeline.db
+30 -4
@@ -172,6 +172,8 @@
 "interface WizardStepInfo",
 "interface WizardStateData",
 "interface AgentAssignment",
+"type Pipeline",
+"type Status",
 "interface PipelineStageItem",
 "interface PipelineState",
 "type WsResponse",
@@ -200,6 +202,8 @@
 "interface JoinedAgent",
 "interface GatewayProject",
 "interface GatewayInfo",
+"type Pipeline",
+"type Status",
 "interface PipelineItem",
 "interface ProjectPipelineStatus",
 "interface AllProjectsPipeline",
@@ -517,6 +521,7 @@
 ],
 "server/src/agents/merge/squash/tests_advanced.rs": [],
 "server/src/agents/merge/squash/tests_basic.rs": [],
+"server/src/agents/merge/squash/tests_changelog.rs": [],
 "server/src/agents/mod.rs": [
 "mod gates",
 "mod lifecycle",
@@ -558,9 +563,11 @@
 "fn assign_merge_stage"
 ],
 "server/src/agents/pool/auto_assign/merge_failure_block_subscriber.rs": [
+"fn reconcile_merge_failure_block",
 "fn spawn_merge_failure_block_subscriber"
 ],
 "server/src/agents/pool/auto_assign/merge_failure_subscriber.rs": [
+"fn reconcile_merge_failure",
 "fn spawn_merge_failure_subscriber"
 ],
 "server/src/agents/pool/auto_assign/mod.rs": [
@@ -612,6 +619,7 @@
 ],
 "server/src/agents/pool/auto_assign/watchdog/tests/orphan_tests.rs": [],
 "server/src/agents/pool/cost_rollup_subscriber.rs": [
+"fn reconcile_cost_rollup",
 "fn spawn_cost_rollup_subscriber",
 "fn on_terminal_transition"
 ],
@@ -730,6 +738,8 @@
 "server/src/agents/pool/worktree_lifecycle.rs": [
 "fn spawn_worktree_create_subscriber",
 "fn spawn_worktree_cleanup_subscriber",
+"fn reconcile_worktree_create",
+"fn reconcile_worktree_cleanup",
 "fn on_coding_transition",
 "fn on_terminal_transition"
 ],
@@ -1390,6 +1400,7 @@
 "fn qa_mode",
 "fn item_type",
 "fn epic",
+"fn origin",
 "fn for_test",
 "type PipelineItemView",
 "struct NodePresenceView",
@@ -1416,6 +1427,7 @@
 "fn set_agent",
 "fn set_qa_mode",
 "fn set_plan_state",
+"fn set_origin",
 "fn write_item",
 "fn write_item_str",
 "fn set_retry_count",
@@ -1548,11 +1560,14 @@
 "fn recover_half_written_items"
 ],
 "server/src/db/shadow_write.rs": [
+"struct UnknownMigration",
 "fn get_shared_pool",
 "struct PipelineWriteMsg",
 "struct PipelineDb",
 "static PIPELINE_DB",
-"fn init"
+"fn init",
+"fn backup_pre_pipeline_status",
+"fn check_schema_drift"
 ],
 "server/src/gateway/mod.rs": [
 "fn build_gateway_route",
@@ -1734,7 +1749,9 @@
 "fn tool_list_epics",
 "fn tool_show_epic"
 ],
-"server/src/http/mcp/story_tools/mod.rs": [],
+"server/src/http/mcp/story_tools/mod.rs": [
+"fn build_origin"
+],
 "server/src/http/mcp/story_tools/refactor.rs": [
 "fn tool_create_refactor",
 "fn tool_list_refactors"
@@ -2193,7 +2210,6 @@
 "server/src/pipeline_state/events.rs": [
 "fn subscribe_transitions",
 "fn try_broadcast",
-"fn replay_current_pipeline_state",
 "struct TransitionFired",
 "trait TransitionSubscriber",
 "struct EventBus",
@@ -2210,6 +2226,7 @@
 "server/src/pipeline_state/subscribers.rs": [
 "fn format_audit_entry",
 "struct AuditLogSubscriber",
+"fn reconcile_audit_log",
 "fn spawn_audit_log_subscriber",
 "struct MatrixBotSubscriber",
 "struct FileRendererSubscriber",
@@ -2243,6 +2260,12 @@
 "enum ArchiveReason",
 "fn dir_name",
 "fn from_dir",
+"enum Pipeline",
+"fn as_str",
+"enum Status",
+"fn as_str",
+"fn pipeline",
+"fn status",
 "enum ExecutionState",
 "struct PipelineItem",
 "fn retry_count",
@@ -2579,7 +2602,9 @@
 "fn format_oauth_accounts_exhausted",
 "fn format_agent_started_notification",
 "fn format_agent_completed_notification",
-"fn merge_failure_snippet"
+"fn format_new_item_notification",
+"const MERGE_FAILURE_TAIL_LINES",
+"fn truncate_gate_output"
 ],
 "server/src/service/notifications/io/listener.rs": [
 "fn spawn_notification_listener"
@@ -2965,6 +2990,7 @@
 "fn spawn_tick_loop",
 "fn spawn_gateway_relay",
 "fn spawn_event_trigger_subscriber",
+"fn run_reconcile_pass",
 "fn spawn_startup_reconciliation"
 ],
 "server/src/state.rs": [
+29
@@ -50,6 +50,29 @@ export interface AgentAssignment {
   status: string;
 }
 
+/** Display column for a work item — derived server-side from `Stage::pipeline()` (story 1085). */
+export type Pipeline =
+  | "backlog"
+  | "coding"
+  | "qa"
+  | "merge"
+  | "done"
+  | "closed"
+  | "archived";
+
+/** Badge/indicator for a work item — derived server-side from `Stage::status()` (story 1085). */
+export type Status =
+  | "active"
+  | "frozen"
+  | "review-hold"
+  | "blocked"
+  | "merge-failure"
+  | "merge-failure-final"
+  | "abandoned"
+  | "superseded"
+  | "rejected"
+  | "done";
+
 /** A single item in any pipeline stage (backlog, current, QA, merge, or done). */
 export interface PipelineStageItem {
   story_id: string;
@@ -57,6 +80,10 @@ export interface PipelineStageItem {
   error: string | null;
   merge_failure: string | null;
   agent: AgentAssignment | null;
+  /** Display column (story 1085); falls back to the bucket name on legacy servers. */
+  pipeline?: Pipeline;
+  /** Display badge (story 1085); falls back to derived `blocked`/`frozen` on legacy servers. */
+  status?: Status;
   review_hold: boolean | null;
   qa: string | null;
   depends_on: number[] | null;
@@ -214,6 +241,8 @@ export interface WorkItemContent {
   stage: string;
   name: string;
   agent: string | null;
+  /** Origin JSON string (story 1088), or null for pre-origin items. */
+  origin: string | null;
 }
 
 /** Result for a single test case from the server's test runner. */
/** Result for a single test case from the server's test runner. */ /** Result for a single test case from the server's test runner. */
+28
@@ -24,10 +24,38 @@ export interface GatewayInfo {
   projects: GatewayProject[];
 }
 
+/** Display column for a work item — derived server-side from `Stage::pipeline()` (story 1085). */
+export type Pipeline =
+  | "backlog"
+  | "coding"
+  | "qa"
+  | "merge"
+  | "done"
+  | "closed"
+  | "archived";
+
+/** Badge/indicator for a work item — derived server-side from `Stage::status()` (story 1085). */
+export type Status =
+  | "active"
+  | "frozen"
+  | "review-hold"
+  | "blocked"
+  | "merge-failure"
+  | "merge-failure-final"
+  | "abandoned"
+  | "superseded"
+  | "rejected"
+  | "done";
+
 export interface PipelineItem {
   story_id: string;
   name: string;
+  /** Legacy stage string (kept for back-compat); prefer `pipeline` + `status`. */
   stage: string;
+  /** Display column (story 1085). Optional until all servers are upgraded. */
+  pipeline?: Pipeline;
+  /** Display badge (story 1085). Optional until all servers are upgraded. */
+  status?: Status;
   agent?: { agent_name: string; model: string; status: string } | null;
   blocked?: boolean;
   retry_count?: number;
+44 -7
@@ -69,29 +69,34 @@ describe("StoryRow", () => {
     expect(screen.getByText("awaiting-slot (#2)")).toBeInTheDocument();
   });
 
-  // AC2: failure kind labels derived from merge_failure string
-  it("shows ConflictDetected for merge_failure with conflict text", () => {
+  // Story 1085: failure kind no longer derived from substring. Items in
+  // the merge_failure / merge_failure_final status get a generic FAILED badge;
+  // the kind detail is exposed via the typed `status` field for callers that
+  // need it (instead of being squeezed into the badge text).
+  it("shows ✕ FAILED badge for merge-failure status", () => {
     const item: PipelineItem = {
       story_id: "73_story_conflict",
       name: "Conflict Story",
       stage: "merge",
-      blocked: true,
+      pipeline: "merge",
+      status: "merge-failure",
       merge_failure: "Merge conflict: conflicts detected",
     };
     render(<StoryRow item={item} />);
-    expect(screen.getByText("ConflictDetected")).toBeInTheDocument();
+    expect(screen.getByText("✕ FAILED")).toBeInTheDocument();
   });
 
-  it("shows GatesFailed for merge_failure with quality gates text", () => {
+  it("shows ⛔ FAILED (FINAL) badge for merge-failure-final status", () => {
     const item: PipelineItem = {
       story_id: "74_story_gates",
       name: "Gates Failed Story",
       stage: "merge",
-      blocked: true,
+      pipeline: "merge",
+      status: "merge-failure-final",
       merge_failure: "Quality gates failed: cargo test failed",
     };
     render(<StoryRow item={item} />);
-    expect(screen.getByText("GatesFailed")).toBeInTheDocument();
+    expect(screen.getByText("⛔ FAILED (FINAL)")).toBeInTheDocument();
   });
 
   it("shows RECOVERING badge for merge_failure item with running mergemaster", () => {
@@ -163,4 +168,36 @@ describe("StoryRow", () => {
     render(<StoryRow item={item} />);
     expect(screen.getByText("⊘ BLOCKED")).toBeInTheDocument();
   });
+
+  // Story 1085 AC 4 — Frozen items remain visible in their underlying column
+  // with a frozen indicator. The server hands us `pipeline: "coding"` for a
+  // frozen-while-coding story and the badge is decorated separately.
+  it("shows ❄ FROZEN badge for a frozen item (column stays as underlying pipeline)", () => {
+    const item: PipelineItem = {
+      story_id: "70_story_frozen_coding",
+      name: "Paused Coding Story",
+      stage: "current",
+      pipeline: "coding",
+      status: "frozen",
+    };
+    render(<StoryRow item={item} />);
+    expect(screen.getByText("❄ FROZEN")).toBeInTheDocument();
+  });
+
+  // Story 1085 AC 4 (subsumes 1052) — Done items must never get a
+  // MergeFailure indicator, even if a stale `merge_failure` string is present.
+  it("done items render Done badge, never MergeFailure", () => {
+    const item: PipelineItem = {
+      story_id: "71_story_done",
+      name: "Completed Story",
+      stage: "done",
+      pipeline: "done",
+      status: "done",
+      merge_failure: "ignored stale string",
+    };
+    render(<StoryRow item={item} />);
+    expect(screen.getByText("Done")).toBeInTheDocument();
+    expect(screen.queryByText("✕ FAILED")).not.toBeInTheDocument();
+    expect(screen.queryByText(/FAILED/)).not.toBeInTheDocument();
+  });
 });
+114 -64
@@ -14,9 +14,42 @@ import {
type JoinedAgent, type JoinedAgent,
type GatewayProject, type GatewayProject,
type AllProjectsPipeline, type AllProjectsPipeline,
type Pipeline,
type PipelineItem, type PipelineItem,
type Status,
} from "../api/gateway"; } from "../api/gateway";
/// Resolve an item's pipeline column. Servers running the new (story 1085)
/// backend send `pipeline`; older servers only send `stage` so we fall back to
/// mapping the bucket name onto the new column vocabulary.
function itemPipeline(item: PipelineItem): Pipeline {
if (item.pipeline) return item.pipeline;
switch (item.stage) {
case "current":
return "coding";
case "qa":
return "qa";
case "merge":
return "merge";
case "done":
return "done";
case "archived":
return "archived";
default:
return "backlog";
}
}
/// Resolve an item's badge. Falls back to `merge_failure`/`blocked` on
/// legacy servers that don't yet emit `status`.
function itemStatus(item: PipelineItem): Status {
if (item.status) return item.status;
if (item.merge_failure) return "merge-failure";
if (item.blocked) return "blocked";
if (item.stage === "done") return "done";
return "active";
}
const { useCallback, useEffect, useRef, useState } = React; const { useCallback, useEffect, useRef, useState } = React;
/// Seconds of silence before an agent is considered disconnected. /// Seconds of silence before an agent is considered disconnected.
@@ -48,72 +81,86 @@ const STATUS_LABELS: Record<AgentStatus, string> = {
disconnected: "Disconnected", disconnected: "Disconnected",
}; };
const STAGE_COLORS: Record<string, string> = { const PIPELINE_COLORS: Record<Pipeline, string> = {
backlog: "#8b949e", backlog: "#8b949e",
current: "#3fb950", coding: "#3fb950",
qa: "#d2a679", qa: "#d2a679",
merge: "#79c0ff", merge: "#79c0ff",
done: "#6e7681", done: "#6e7681",
closed: "#6e7681",
archived: "#6e7681", archived: "#6e7681",
}; };
const STAGE_LABELS: Record<string, string> = { const PIPELINE_LABELS: Record<Pipeline, string> = {
backlog: "Backlog", backlog: "Backlog",
current: "In Progress", coding: "In Progress",
qa: "QA", qa: "QA",
merge: "Merging", merge: "Merging",
done: "Done", done: "Done",
closed: "Closed",
archived: "Archived", archived: "Archived",
}; };
/// Derive a short label from a merge failure string based on the failure kind.
function mergeFailureKindLabel(failure: string): string {
if (failure.includes("Merge conflict") || failure.includes("CONFLICT")) {
return "ConflictDetected";
}
if (failure.includes("Quality gates failed") || failure.includes("gates failed")) {
return "GatesFailed";
}
if (failure.includes("no code changes") || failure.includes("empty diff")) {
return "EmptyDiff";
}
if (failure.includes("No commits")) {
return "NoCommits";
}
return "✕ FAILED";
}
/// A single story row inside a project pipeline card. /// A single story row inside a project pipeline card.
/** Render one story row in a gateway-aggregate panel: `#<id> <name>` with stage badge. */ /** Render one story row in a gateway-aggregate panel: `#<id> <name>` with status badge. */
export function StoryRow({ item, mergeQueuePos }: { item: PipelineItem; mergeQueuePos?: number }) { export function StoryRow({ item, mergeQueuePos }: { item: PipelineItem; mergeQueuePos?: number }) {
const isStuck = item.merge_failure != null || item.blocked; const pipeline = itemPipeline(item);
const isMergeActive = item.stage === "merge" && !isStuck && item.agent?.status === "running"; const status = itemStatus(item);
const agentStatus = item.agent?.status;
let color: string; let color: string;
let label: string; let label: string;
let frozenPrefix = "";
if (isMergeActive) { // Frozen items keep their underlying pipeline column but get a ❄️ badge.
color = "#58a6ff"; // (AC 4 — story 1085, subsumes the freeze-hides-item bug.)
label = "▶ MERGING"; if (status === "frozen") {
} else if (isStuck) { color = "#79c0ff";
const agentStatus = item.agent?.status; label = "❄ FROZEN";
frozenPrefix = "❄ ";
} else if (status === "merge-failure" || status === "merge-failure-final") {
// Done items never reach this branch — `Stage::status()` returns
// `Status::Done` for done items (AC 4).
if (agentStatus === "running") { if (agentStatus === "running") {
color = "#e3b341"; color = "#e3b341";
label = "⟳ RECOVERING"; label = "⟳ RECOVERING";
} else if (agentStatus === "pending") { } else if (agentStatus === "pending") {
color = "#e3b341"; color = "#e3b341";
label = "⏳ QUEUED"; label = "⏳ QUEUED";
} else if (item.merge_failure != null) { } else {
color = "#f85149"; color = "#f85149";
label = mergeFailureKindLabel(item.merge_failure); label = status === "merge-failure-final" ? "⛔ FAILED (FINAL)" : "✕ FAILED";
}
} else if (status === "blocked") {
if (agentStatus === "running") {
color = "#e3b341";
label = "⟳ RECOVERING";
} else if (agentStatus === "pending") {
color = "#e3b341";
label = "⏳ QUEUED";
} else { } else {
color = "#f85149"; color = "#f85149";
label = "⊘ BLOCKED"; label = "⊘ BLOCKED";
} }
} else if (item.stage === "merge" && item.agent?.status === "pending") { } else if (status === "review-hold") {
color = "#d2a679";
label = "REVIEW HOLD";
} else if (status === "abandoned") {
color = "#6e7681";
label = "ABANDONED";
} else if (status === "superseded") {
color = "#6e7681";
label = "SUPERSEDED";
} else if (status === "rejected") {
color = "#f85149";
label = "REJECTED";
} else if (pipeline === "merge" && agentStatus === "running") {
color = "#58a6ff";
label = "▶ MERGING";
} else if (pipeline === "merge" && agentStatus === "pending") {
color = "#e3b341"; color = "#e3b341";
label = "⏳ QUEUED"; label = "⏳ QUEUED";
} else if (item.stage === "merge") { } else if (pipeline === "merge") {
color = "#6e7681"; color = "#6e7681";
if (mergeQueuePos === 1) { if (mergeQueuePos === 1) {
label = "NEXT IN QUEUE"; label = "NEXT IN QUEUE";
@@ -123,10 +170,11 @@ export function StoryRow({ item, mergeQueuePos }: { item: PipelineItem; mergeQue
label = "awaiting-slot"; label = "awaiting-slot";
} }
} else { } else {
color = STAGE_COLORS[item.stage] ?? "#8b949e"; color = PIPELINE_COLORS[pipeline] ?? "#8b949e";
label = STAGE_LABELS[item.stage] ?? item.stage; label = PIPELINE_LABELS[pipeline] ?? pipeline;
} }
const isMergeActive = pipeline === "merge" && status === "active" && agentStatus === "running";
const idNum = item.story_id.match(/^(\d+)/)?.[1]; const idNum = item.story_id.match(/^(\d+)/)?.[1];
return ( return (
@@ -158,7 +206,7 @@ export function StoryRow({ item, mergeQueuePos }: { item: PipelineItem; mergeQue
</span> </span>
<span style={{ color: "#e6edf3", overflow: "hidden", textOverflow: "ellipsis", whiteSpace: "nowrap" }}> <span style={{ color: "#e6edf3", overflow: "hidden", textOverflow: "ellipsis", whiteSpace: "nowrap" }}>
{idNum && <span style={{ color: "#8b949e", fontFamily: "monospace" }}>#{idNum}{" "}</span>} {idNum && <span style={{ color: "#8b949e", fontFamily: "monospace" }}>#{idNum}{" "}</span>}
{item.name} {frozenPrefix}{item.name}
</span> </span>
</div> </div>
); );
@@ -388,6 +436,8 @@ function aggregateItems(
story_id: b.story_id, story_id: b.story_id,
name: b.name, name: b.name,
stage: "backlog", stage: "backlog",
pipeline: "backlog" as Pipeline,
status: "active" as Status,
})), })),
}; };
} }
@@ -395,14 +445,14 @@ function aggregateItems(
return { return {
project, project,
items: (status.active ?? []).filter( items: (status.active ?? []).filter(
(i) => i.stage !== "done", (i) => itemPipeline(i) !== "done",
), ),
}; };
} }
if (tab === "done") { if (tab === "done") {
return { return {
project, project,
items: (status.active ?? []).filter((i) => i.stage === "done"), items: (status.active ?? []).filter((i) => itemPipeline(i) === "done"),
}; };
} }
// archived // archived
@@ -419,12 +469,12 @@ function tabCount(pipeline: AllProjectsPipeline, tab: TabKey): number {
if (tab === "in-progress") { if (tab === "in-progress") {
return ( return (
sum + sum +
(status.active ?? []).filter((i) => i.stage !== "done").length (status.active ?? []).filter((i) => itemPipeline(i) !== "done").length
); );
} }
if (tab === "done") { if (tab === "done") {
return ( return (
sum + (status.active ?? []).filter((i) => i.stage === "done").length sum + (status.active ?? []).filter((i) => itemPipeline(i) === "done").length
); );
} }
return sum + (status.archived ?? []).length; return sum + (status.archived ?? []).length;
@@ -518,13 +568,16 @@ function ProjectStoryRow({
); );
} }
const IN_PROGRESS_STAGE_LABELS: Record<string, string> = { const IN_PROGRESS_PIPELINE_LABELS: Record<"coding" | "qa" | "merge", string> = {
current: "Coding", coding: "Coding",
qa: "QA", qa: "QA",
merge: "Merging", merge: "Merging",
}; };
/// In Progress tab content — items grouped by stage (coding / qa / merging). /// In Progress tab content — items grouped by their `pipeline` column.
///
/// Frozen items appear in the column corresponding to their underlying
/// `Stage::resume_to` (server-side), so they always show up in-place.
function InProgressTabContent({ function InProgressTabContent({
groups, groups,
}: { }: {
@@ -535,25 +588,22 @@ function InProgressTabContent({
   );
   const multiProject = new Set(allItems.map((x) => x.project)).size > 1;
-  const byStage = {
-    current: allItems.filter((x) => x.item.stage === "current"),
-    qa: allItems.filter((x) => x.item.stage === "qa"),
-    merge: allItems.filter((x) => x.item.stage === "merge"),
+  const byPipeline = {
+    coding: allItems.filter((x) => itemPipeline(x.item) === "coding"),
+    qa: allItems.filter((x) => itemPipeline(x.item) === "qa"),
+    merge: allItems.filter((x) => itemPipeline(x.item) === "merge"),
   };
-  const stages = (["current", "qa", "merge"] as const).filter(
-    (s) => byStage[s].length > 0,
+  const pipelines = (["coding", "qa", "merge"] as const).filter(
+    (p) => byPipeline[p].length > 0,
   );
-  // Compute queue position among clean awaiting merge items (Stage::Merge, no failure, no running agent).
+  // Compute queue position among "clean" awaiting-merge items: pipeline=merge,
+  // status=active, and no agent currently running.
   const mergeQueuePosMap = new Map<string, number>();
   let queuePos = 0;
-  for (const { project, item } of byStage.merge) {
-    if (
-      !item.blocked &&
-      !item.merge_failure &&
-      item.agent?.status !== "running"
-    ) {
+  for (const { project, item } of byPipeline.merge) {
+    if (itemStatus(item) === "active" && item.agent?.status !== "running") {
       queuePos += 1;
       mergeQueuePosMap.set(`${project}:${item.story_id}`, queuePos);
     }
@@ -569,33 +619,33 @@ function InProgressTabContent({
   return (
     <div>
-      {stages.map((stage) => (
-        <div key={stage} style={{ marginBottom: "20px" }}>
+      {pipelines.map((p) => (
+        <div key={p} style={{ marginBottom: "20px" }}>
           <div
             style={{
               fontSize: "0.8em",
               fontWeight: 600,
-              color: STAGE_COLORS[stage] ?? "#8b949e",
+              color: PIPELINE_COLORS[p] ?? "#8b949e",
               textTransform: "uppercase",
               letterSpacing: "0.06em",
               marginBottom: "8px",
               paddingBottom: "4px",
-              borderBottom: `1px solid ${STAGE_COLORS[stage] ?? "#8b949e"}33`,
+              borderBottom: `1px solid ${PIPELINE_COLORS[p] ?? "#8b949e"}33`,
             }}
           >
-            {IN_PROGRESS_STAGE_LABELS[stage]}{" "}
+            {IN_PROGRESS_PIPELINE_LABELS[p]}{" "}
             <span style={{ color: "#6e7681" }}>
-              ({byStage[stage].length})
+              ({byPipeline[p].length})
             </span>
           </div>
-          {byStage[stage].map(({ project, item }) => (
+          {byPipeline[p].map(({ project, item }) => (
             <ProjectStoryRow
               key={`${project}:${item.story_id}`}
               project={project}
               item={item}
               showProject={multiProject}
               mergeQueuePos={
-                stage === "merge"
+                p === "merge"
                   ? mergeQueuePosMap.get(`${project}:${item.story_id}`)
                   : undefined
               }
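The grouping-plus-queue logic above can be exercised in isolation. The sketch below reimplements just the merge-queue computation; the real `itemPipeline`/`itemStatus` helpers live elsewhere in the frontend, so the flattened `Item` shape here (with a pre-resolved `pipeline`, `status`, and `agentRunning` flag) is an assumption for illustration only.

```typescript
// Hypothetical flattened item shape — the real component reads these via
// itemPipeline(item) / itemStatus(item) helpers not shown in this diff.
type Item = {
  story_id: string;
  pipeline: "coding" | "qa" | "merge";
  status: string;
  agentRunning: boolean;
};

function mergeQueuePositions(items: Item[]): Map<string, number> {
  const pos = new Map<string, number>();
  let queuePos = 0;
  for (const item of items) {
    if (item.pipeline !== "merge") continue;
    // Only "clean" awaiting-merge items get a queue position:
    // status=active and no agent currently running.
    if (item.status === "active" && !item.agentRunning) {
      queuePos += 1;
      pos.set(item.story_id, queuePos);
    }
  }
  return pos;
}

const items: Item[] = [
  { story_id: "1001", pipeline: "merge", status: "active", agentRunning: false },
  { story_id: "1002", pipeline: "merge", status: "merge-failure", agentRunning: false },
  { story_id: "1003", pipeline: "merge", status: "active", agentRunning: true },
  { story_id: "1004", pipeline: "merge", status: "active", agentRunning: false },
];
console.log(mergeQueuePositions(items)); // 1001 → 1, 1004 → 2; 1002/1003 skipped
```

Note that skipped items still render in the merge column — they just get no queue badge, matching the `mergeQueuePos` prop being `undefined`.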
@@ -43,6 +43,7 @@ const DEFAULT_CONTENT = {
   stage: "current",
   name: "Big Title Story",
   agent: null,
+  origin: null,
 };
 beforeEach(() => {
@@ -43,6 +43,7 @@ const DEFAULT_CONTENT = {
   stage: "current",
   name: "Big Title Story",
   agent: null,
+  origin: null,
 };
 const sampleTestResults: TestResultsResponse = {
@@ -42,6 +42,7 @@ const DEFAULT_CONTENT = {
   stage: "current",
   name: "Big Title Story",
   agent: null,
+  origin: null,
 };
 beforeEach(() => {
@@ -127,6 +128,7 @@ describe("WorkItemDetailPanel", () => {
   stage: "current",
   name: "My Story Name",
   agent: null,
+  origin: null,
 });
 render(
   <WorkItemDetailPanel
@@ -146,6 +148,7 @@ describe("WorkItemDetailPanel", () => {
   stage: "current",
   name: "My Story Name",
   agent: null,
+  origin: null,
 });
 render(
   <WorkItemDetailPanel
@@ -164,6 +167,7 @@ describe("WorkItemDetailPanel", () => {
   stage: "current",
   name: "My Story Name",
   agent: null,
+  origin: null,
 });
 render(
   <WorkItemDetailPanel
@@ -186,6 +190,7 @@ describe("WorkItemDetailPanel", () => {
   stage: "current",
   name: "My Story Name",
   agent: null,
+  origin: null,
 });
 render(
   <WorkItemDetailPanel
@@ -20,6 +20,26 @@ import { stripDisplayContent } from "./workItemDetailPanelUtils";
 const { useCallback, useEffect, useRef, useState } = React;
+/** Parse and format an origin JSON string for display. */
+function formatOrigin(origin: string | null): string {
+  if (!origin) return "unknown";
+  try {
+    const obj = JSON.parse(origin) as {
+      kind?: string;
+      id?: string;
+      ts?: number;
+    };
+    const kind = obj.kind ?? "unknown";
+    const id = obj.id ? ` (${obj.id})` : "";
+    const ts = obj.ts
+      ? ` at ${new Date(obj.ts * 1000).toISOString().replace("T", " ").slice(0, 19)}Z`
+      : "";
+    return `${kind}${id}${ts}`;
+  } catch {
+    return origin;
+  }
+}
 interface WorkItemDetailPanelProps {
   storyId: string;
   pipelineVersion: number;
@@ -38,6 +58,7 @@ export function WorkItemDetailPanel({
   const [stage, setStage] = useState<string>("");
   const [name, setName] = useState<string | null>(null);
   const [assignedAgent, setAssignedAgent] = useState<string | null>(null);
+  const [origin, setOrigin] = useState<string | null>(null);
   const [loading, setLoading] = useState(true);
   const [error, setError] = useState<string | null>(null);
   const [agentInfo, setAgentInfo] = useState<AgentInfo | null>(null);
@@ -63,6 +84,7 @@ export function WorkItemDetailPanel({
       setStage(data.stage);
       setName(data.name);
       setAssignedAgent(data.agent);
+      setOrigin(data.origin);
     })
     .catch((err: unknown) => {
       setError(err instanceof Error ? err.message : "Failed to load content");
@@ -289,6 +311,19 @@ export function WorkItemDetailPanel({
       <TestResultsSection testResults={testResults} />
+      {!loading && (
+        <div
+          data-testid="detail-panel-origin"
+          style={{
+            fontSize: "0.75em",
+            color: "#555",
+            fontFamily: "monospace",
+          }}
+        >
+          origin: {formatOrigin(origin)}
+        </div>
+      )}
       <div
         style={{
           display: "flex",
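The new `formatOrigin` helper collapses three cases (missing value, well-formed JSON, unparseable string) into one display string. A quick standalone check of its behaviour, copying the function body from the diff above:

```typescript
function formatOrigin(origin: string | null): string {
  if (!origin) return "unknown";
  try {
    const obj = JSON.parse(origin) as { kind?: string; id?: string; ts?: number };
    const kind = obj.kind ?? "unknown";
    const id = obj.id ? ` (${obj.id})` : "";
    const ts = obj.ts
      ? ` at ${new Date(obj.ts * 1000).toISOString().replace("T", " ").slice(0, 19)}Z`
      : "";
    return `${kind}${id}${ts}`;
  } catch {
    return origin; // unparseable input falls back to the raw string
  }
}

console.log(formatOrigin(null)); // "unknown"
console.log(formatOrigin('{"kind":"agent","id":"qa-1","ts":1700000000}'));
// "agent (qa-1) at 2023-11-14 22:13:20Z"
console.log(formatOrigin("not-json")); // "not-json"
```

Falling back to the raw string on a parse failure is a deliberate choice: a malformed origin is still more useful to surface verbatim than to hide behind "unknown".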
@@ -124,19 +124,43 @@ else
 fi
 # Categorise merged work items and format names.
+# Supports two subject formats (after stripping the "huskies: merge " prefix):
+#   New: "1063 story Human Readable Name"
+#   Old: "1063_story_human_readable_name"
 FEATURES=""
 FIXES=""
 REFACTORS=""
 while IFS= read -r item; do
   [ -z "$item" ] && continue
-  # Strip the numeric prefix and type to get the human name.
-  name=$(echo "$item" | sed -E 's/^[0-9]+_(story|bug|refactor|spike)_//' | tr '_' ' ')
+  # Extract the leading numeric ID (present in both formats).
+  id=$(echo "$item" | grep -oE '^[0-9]+')
+  # Detect format and extract human name + type word.
+  if echo "$item" | grep -qE '^[0-9]+ (story|bug|refactor|spike|epic) '; then
+    # New format: "1063 story Human Name Here"
+    type_word=$(echo "$item" | sed -E 's/^[0-9]+ ([a-z]+) .*/\1/')
+    name=$(echo "$item" | sed -E 's/^[0-9]+ [a-z]+ //')
+  else
+    # Legacy slug format: "1063_story_human_name_here"
+    type_word=$(echo "$item" | sed -E 's/^[0-9]+_([a-z]+)_.*/\1/')
+    name=$(echo "$item" | sed -E 's/^[0-9]+_(story|bug|refactor|spike|epic)_//' | tr '_' ' ')
+  fi
   # Capitalise first letter.
   name="$(echo "${name:0:1}" | tr '[:lower:]' '[:upper:]')${name:1}"
-  case "$item" in
-    *_bug_*) FIXES="${FIXES}- ${name}\n" ;;
-    *_refactor_*) REFACTORS="${REFACTORS}- ${name}\n" ;;
-    *) FEATURES="${FEATURES}- ${name}\n" ;;
+  # Format as "Name (ID)" when a numeric ID was found, plain name otherwise.
+  if [ -n "$id" ]; then
+    entry="${name} (${id})"
+  else
+    entry="${name}"
+  fi
+  case "$type_word" in
+    bug) FIXES="${FIXES}- ${entry}\n" ;;
+    refactor) REFACTORS="${REFACTORS}- ${entry}\n" ;;
+    *) FEATURES="${FEATURES}- ${entry}\n" ;;
   esac
 done <<< "$MERGED_RAW"
@@ -53,7 +53,22 @@ cargo run --manifest-path "$PROJECT_ROOT/Cargo.toml" -p source-map-gen --bin sou
 echo "=== Building frontend ==="
 if [ -d "$PROJECT_ROOT/frontend" ]; then
   cd "$PROJECT_ROOT/frontend"
-  npm install
+  # The merge gate runs in workspaces whose pre-existing `node_modules` was
+  # populated by an earlier `npm install --omit=dev` (or a partial install).
+  # In that state `npm install` reports "up to date, audited N packages"
+  # without actually adding the missing devDependencies, so the subsequent
+  # `tsc && vite build` fails with `sh: 1: tsc: not found`.
+  #
+  # Repair the install when typescript isn't reachable (story 1086 merge gate
+  # regression). We probe the on-disk binary rather than relying on PATH so
+  # this also covers the case where `node_modules/.bin/` is missing.
+  if [ ! -x node_modules/typescript/bin/tsc ]; then
+    echo "[script/test] node_modules missing typescript; performing clean install."
+    rm -rf node_modules
+    npm install --include=dev
+  else
+    npm install --include=dev
+  fi
   npm run build
   cd "$PROJECT_ROOT"
 else
@@ -17,6 +17,20 @@ fn run(cmd: &str, args: &[&str], dir: &Path) {
 fn main() {
     println!("cargo:rerun-if-changed=build.rs");
     println!("cargo:rerun-if-env-changed=PROFILE");
+    // Embed the current git commit hash at compile time so `get_version` always
+    // reflects the binary that is actually running, not a potentially-stale file.
+    println!("cargo:rerun-if-changed=../.git/HEAD");
+    println!("cargo:rerun-if-changed=../.git/refs/");
+    let git_hash = std::process::Command::new("git")
+        .args(["rev-parse", "--short", "HEAD"])
+        .output()
+        .ok()
+        .filter(|o| o.status.success())
+        .and_then(|o| String::from_utf8(o.stdout).ok())
+        .map(|s| s.trim().to_string())
+        .unwrap_or_else(|| "unknown".to_string());
+    println!("cargo:rustc-env=BUILD_GIT_HASH={git_hash}");
     println!("cargo:rerun-if-changed=../frontend/package.json");
     println!("cargo:rerun-if-changed=../frontend/package-lock.json");
     println!("cargo:rerun-if-changed=../frontend/vite.config.ts");
@@ -0,0 +1,56 @@
-- Story 1087: split the legacy `stage` column on `pipeline_items` into a
-- `(pipeline, status)` pair so the read side no longer needs to re-derive the
-- display column and badge from the stage string.
--
-- The migration is additive: `stage` is retained for backwards compatibility
-- while remaining Step E callers are migrated. The backup of `pipeline.db`
-- written by `shadow_write::init` immediately before this migration runs is
-- the recovery path if the backfill produces an unexpected projection.
ALTER TABLE pipeline_items ADD COLUMN pipeline TEXT NOT NULL DEFAULT '';
ALTER TABLE pipeline_items ADD COLUMN status TEXT NOT NULL DEFAULT '';
-- Backfill `pipeline` from the existing `stage` column. Every wire-form
-- stage string emitted by `stage_dir_name` maps to exactly one of the seven
-- Pipeline columns defined in `pipeline_state::types::Pipeline::as_str`.
-- Legacy directory strings (`1_backlog`, `2_current`, ...) are also handled
-- so that databases predating story 934 migrate cleanly.
UPDATE pipeline_items SET pipeline = CASE stage
WHEN 'upcoming' THEN 'backlog'
WHEN 'backlog' THEN 'backlog'
WHEN '1_backlog' THEN 'backlog'
WHEN 'coding' THEN 'coding'
WHEN 'blocked' THEN 'coding'
WHEN '2_current' THEN 'coding'
WHEN 'qa' THEN 'qa'
WHEN 'review_hold' THEN 'qa'
WHEN '3_qa' THEN 'qa'
WHEN 'merge' THEN 'merge'
WHEN 'merge_failure' THEN 'merge'
WHEN 'merge_failure_final' THEN 'merge'
WHEN '4_merge' THEN 'merge'
WHEN 'done' THEN 'done'
WHEN '5_done' THEN 'done'
WHEN 'abandoned' THEN 'closed'
WHEN 'superseded' THEN 'closed'
WHEN 'rejected' THEN 'closed'
WHEN 'archived' THEN 'archived'
WHEN '6_archived' THEN 'archived'
WHEN 'frozen' THEN 'coding'
ELSE ''
END;
-- Backfill `status` (badge) from the existing `stage` column.
UPDATE pipeline_items SET status = CASE stage
WHEN 'frozen' THEN 'frozen'
WHEN 'review_hold' THEN 'review-hold'
WHEN 'blocked' THEN 'blocked'
WHEN 'merge_failure' THEN 'merge-failure'
WHEN 'merge_failure_final' THEN 'merge-failure-final'
WHEN 'abandoned' THEN 'abandoned'
WHEN 'superseded' THEN 'superseded'
WHEN 'rejected' THEN 'rejected'
WHEN 'done' THEN 'done'
WHEN '5_done' THEN 'done'
ELSE 'active'
END;
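The two CASE backfills above define one total mapping from legacy `stage` strings to the new `(pipeline, status)` pair. The mapping can be sketched as a single function — stage names and targets are taken verbatim from the migration, but expressing it in TypeScript (rather than SQL) is purely for illustration:

```typescript
// Mirrors the migration's two CASE expressions: stage string → (pipeline, status).
function splitStage(stage: string): { pipeline: string; status: string } {
  const pipelineMap: Record<string, string> = {
    upcoming: "backlog", backlog: "backlog", "1_backlog": "backlog",
    coding: "coding", blocked: "coding", "2_current": "coding", frozen: "coding",
    qa: "qa", review_hold: "qa", "3_qa": "qa",
    merge: "merge", merge_failure: "merge", merge_failure_final: "merge", "4_merge": "merge",
    done: "done", "5_done": "done",
    abandoned: "closed", superseded: "closed", rejected: "closed",
    archived: "archived", "6_archived": "archived",
  };
  const statusMap: Record<string, string> = {
    frozen: "frozen", review_hold: "review-hold", blocked: "blocked",
    merge_failure: "merge-failure", merge_failure_final: "merge-failure-final",
    abandoned: "abandoned", superseded: "superseded", rejected: "rejected",
    done: "done", "5_done": "done",
  };
  // Unknown stages fall through to '' / 'active', matching the SQL ELSE arms.
  return { pipeline: pipelineMap[stage] ?? "", status: statusMap[stage] ?? "active" };
}

console.log(splitStage("frozen"));        // { pipeline: "coding", status: "frozen" }
console.log(splitStage("merge_failure")); // { pipeline: "merge", status: "merge-failure" }
console.log(splitStage("2_current"));     // { pipeline: "coding", status: "active" }
```

Note how the split makes the "badge" states (`frozen`, `blocked`, `merge_failure`, …) pure statuses layered on a plain pipeline column, which is exactly what lets the read side stop re-deriving them from the stage string.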
@@ -198,10 +198,13 @@ pub async fn run(
     )
 };
-// Replay current pipeline state so subscribers (worktree lifecycle, merge-failure
-// auto-spawn) react to any stories already in active stages, then auto-assign.
-slog!("[agent-mode] Replaying current pipeline state.");
-crate::pipeline_state::replay_current_pipeline_state();
+// Reconcile subscriber side effects for the current CRDT state without
+// flooding the broadcast channel (replaces the former replay_current_pipeline_state call).
+slog!("[agent-mode] Running startup reconcile pass.");
+let done_retention = crate::config::ProjectConfig::load(&project_root)
+    .map(|c| std::time::Duration::from_secs(c.watcher.done_retention_secs))
+    .unwrap_or_else(|_| std::time::Duration::from_secs(4 * 3600));
+crate::startup::tick_loop::run_reconcile_pass(&project_root, &agents, done_retention).await;
 // Run initial auto-assign.
 slog!("[agent-mode] Initial auto-assign scan.");
@@ -10,10 +10,12 @@
 //! - `.huskies/README.md`
 //! - `.huskies/specs/00_CONTEXT.md`
 //! - `.huskies/AGENT.md`
-//! - `.huskies/source-map.json` (up to 200 KB; truncated with a log if larger)
 //!
-//! `STACK.md` is intentionally excluded — it is large and changes often; agents
-//! should grep it on demand.
+//! `STACK.md` and `.huskies/source-map.json` are intentionally excluded — they
+//! are large and change often; agents should grep on demand instead. Earlier
+//! versions of this bundle inlined the source map, which ballooned the orientation
+//! to ~96 KB and drowned out the workflow rules in AGENT.md; the file is still
+//! kept on disk for the merge-time `source-map-check` doc-coverage gate.
 //!
 //! Behaviour contract:
 //! - Files that are missing or empty are skipped silently (no error, no section).
@@ -33,12 +35,6 @@ const ORIENTATION_FILES: &[&str] = &[
     ".huskies/AGENT.md",
 ];
-/// Path to the source map (relative to project root), appended after AGENT.md.
-const SOURCE_MAP_REL: &str = ".huskies/source-map.json";
-/// Maximum bytes of source-map content to embed in the prompt.
-const SOURCE_MAP_BYTE_CAP: usize = 200 * 1024;
 /// Attempt to load the project-local agent prompt by concatenating orientation
 /// files from the project root.
 ///
@@ -60,14 +56,11 @@ pub fn read_project_local_prompt(project_root: &Path) -> Option<String> {
         sections.push((rel_path, trimmed.to_string()));
     }
-    // Read source-map.json (after AGENT.md) with a byte cap.
-    let source_map_content = read_source_map_section(project_root);
-    if sections.is_empty() && source_map_content.is_none() {
+    if sections.is_empty() {
         return None;
     }
-    let mut included_files: Vec<&str> = sections.iter().map(|(name, _)| *name).collect();
+    let included_files: Vec<&str> = sections.iter().map(|(name, _)| *name).collect();
     let mut bundle = String::new();
     for (i, (name, content)) in sections.iter().enumerate() {
         if i > 0 {
@@ -77,15 +70,6 @@ pub fn read_project_local_prompt(project_root: &Path) -> Option<String> {
         bundle.push_str(content);
     }
-    if let Some(sm) = source_map_content {
-        if !bundle.is_empty() {
-            bundle.push('\n');
-        }
-        bundle.push_str(&format!("=== {SOURCE_MAP_REL} ===\n"));
-        bundle.push_str(&sm);
-        included_files.push(SOURCE_MAP_REL);
-    }
     crate::slog!(
         "[agents] orientation bundle: {} bytes, files: [{}]",
         bundle.len(),
@@ -95,39 +79,6 @@ pub fn read_project_local_prompt(project_root: &Path) -> Option<String> {
     Some(bundle)
 }
-/// Read `.huskies/source-map.json` from `project_root`, applying a byte cap.
-///
-/// Returns `None` when the file is absent, unreadable, or empty.
-/// When the content exceeds [`SOURCE_MAP_BYTE_CAP`], truncates at a char
-/// boundary and logs the truncation.
-#[allow(clippy::string_slice)] // cap is walked back to a char boundary before slicing
-fn read_source_map_section(project_root: &Path) -> Option<String> {
-    let path = project_root.join(SOURCE_MAP_REL);
-    let Ok(content) = std::fs::read_to_string(&path) else {
-        return None;
-    };
-    let trimmed = content.trim();
-    if trimmed.is_empty() {
-        return None;
-    }
-    if trimmed.len() > SOURCE_MAP_BYTE_CAP {
-        let mut cap = SOURCE_MAP_BYTE_CAP;
-        while cap > 0 && !trimmed.is_char_boundary(cap) {
-            cap -= 1;
-        }
-        crate::slog!(
-            "[agents] source-map.json truncated: {} bytes > {} byte cap; \
-             including first {} bytes",
-            trimmed.len(),
-            SOURCE_MAP_BYTE_CAP,
-            cap
-        );
-        Some(trimmed[..cap].to_string())
-    } else {
-        Some(trimmed.to_string())
-    }
-}
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -310,10 +261,13 @@ mod tests {
     );
     }
-    // ── source-map.json tests ────────────────────────────────────────────────
+    // ── source-map.json must NOT be inlined into the bundle ──────────────────
+    // The file is kept on disk for the merge-time source-map-check gate, but
+    // inlining it into every agent spawn ballooned the orientation past 96 KB
+    // and drowned out the workflow rules in AGENT.md.
     #[test]
-    fn source_map_included_after_agent_md() {
+    fn source_map_not_included_even_when_present() {
         let tmp = tempfile::tempdir().unwrap();
         write_file(tmp.path(), ".huskies/AGENT.md", "agent content");
         write_file(
@@ -324,92 +278,12 @@ mod tests {
         let result = read_project_local_prompt(tmp.path()).unwrap();
         assert!(
-            result.contains("=== .huskies/source-map.json ==="),
-            "source-map delimiter must be present: {result}"
+            !result.contains("=== .huskies/source-map.json ==="),
+            "source-map must not appear as an orientation section: {result}"
         );
         assert!(
-            result.contains(r#""src/lib.rs""#),
-            "source-map content must be present: {result}"
-        );
-        // source-map section must appear after AGENT.md section
-        let agent_pos = result.find("=== .huskies/AGENT.md ===").unwrap();
-        let sm_pos = result.find("=== .huskies/source-map.json ===").unwrap();
-        assert!(
-            sm_pos > agent_pos,
-            "source-map section must come after AGENT.md section"
-        );
-    }
-    #[test]
-    fn source_map_missing_skipped_silently() {
-        let tmp = tempfile::tempdir().unwrap();
-        write_file(tmp.path(), ".huskies/AGENT.md", "agent content");
-        // source-map.json intentionally absent
-        let result = read_project_local_prompt(tmp.path()).unwrap();
-        assert!(
-            !result.contains("source-map.json"),
-            "absent source-map must not create a section: {result}"
-        );
-    }
-    #[test]
-    fn source_map_empty_skipped_silently() {
-        let tmp = tempfile::tempdir().unwrap();
-        write_file(tmp.path(), ".huskies/AGENT.md", "agent content");
-        write_file(tmp.path(), ".huskies/source-map.json", "");
-        let result = read_project_local_prompt(tmp.path()).unwrap();
-        assert!(
-            !result.contains("source-map.json"),
-            "empty source-map must not create a section: {result}"
-        );
-    }
-    #[test]
-    fn source_map_only_returns_some() {
-        let tmp = tempfile::tempdir().unwrap();
-        // Only source-map.json present; all orientation files absent.
-        write_file(
-            tmp.path(),
-            ".huskies/source-map.json",
-            r#"{"src/main.rs": {}}"#,
-        );
-        let result = read_project_local_prompt(tmp.path());
-        assert!(
-            result.is_some(),
-            "source-map alone must produce Some bundle"
-        );
-        assert!(
-            result.unwrap().contains("=== .huskies/source-map.json ==="),
-            "bundle must contain source-map section"
-        );
-    }
-    #[test]
-    #[allow(clippy::string_slice)] // sm_start is derived from str::find — always a char boundary
-    fn source_map_truncated_at_byte_cap() {
-        let tmp = tempfile::tempdir().unwrap();
-        write_file(tmp.path(), ".huskies/AGENT.md", "agent");
-        // Build content larger than SOURCE_MAP_BYTE_CAP (200 KB).
-        let big = "x".repeat(SOURCE_MAP_BYTE_CAP + 1024);
-        write_file(tmp.path(), ".huskies/source-map.json", &big);
-        let result = read_project_local_prompt(tmp.path()).unwrap();
-        assert!(
-            result.contains("=== .huskies/source-map.json ==="),
-            "truncated source-map must still produce a section: {result}"
-        );
-        // The content length of just the source-map section must be <= SOURCE_MAP_BYTE_CAP.
-        let sm_start = result.find("=== .huskies/source-map.json ===").unwrap()
-            + "=== .huskies/source-map.json ===\n".len();
-        let sm_content = &result[sm_start..];
-        assert!(
-            sm_content.len() <= SOURCE_MAP_BYTE_CAP,
-            "source-map section content must be <= {} bytes, got {}",
-            SOURCE_MAP_BYTE_CAP,
-            sm_content.len()
+            !result.contains("src/lib.rs"),
+            "source-map content must not be inlined: {result}"
         );
     }
 }
@@ -124,7 +124,15 @@ pub(crate) fn run_squash_merge(
 // ── Commit in the temporary worktree ──────────────────────────
 all_output.push_str("=== git commit ===\n");
-let commit_msg = format!("huskies: merge {story_id}");
+// Include human-readable name and item type when the CRDT is available.
+// Falls back to the bare ID when running outside the server (e.g. in tests).
+let story_label = crate::crdt_state::read_item(story_id)
+    .map(|item| {
+        let type_str = item.item_type().map(|t| t.as_str()).unwrap_or("story");
+        format!(" {} {}", type_str, item.name())
+    })
+    .unwrap_or_default();
+let commit_msg = format!("huskies: merge {story_id}{story_label}");
 let commit = Command::new("git")
     .args(["commit", "-m", &commit_msg])
     .current_dir(&merge_wt_path)
@@ -507,3 +515,5 @@ fn run_merge_quality_gates(
 mod tests_advanced;
 #[cfg(test)]
 mod tests_basic;
+#[cfg(test)]
+mod tests_changelog;
@@ -0,0 +1,142 @@
//! Regression tests for changelog entry parsing — both legacy-slug and new-format
//! merge commit subjects must resolve to a human-readable "Name (ID)" entry.
/// Parse a single merge commit subject (after stripping the `huskies: merge ` prefix)
/// into `(id, type_word, human_name)`.
///
/// Returns `None` for subjects that are not recognised merge items.
fn parse_changelog_entry(item: &str) -> Option<(String, String, String)> {
let item = item.trim();
if item.is_empty() {
return None;
}
// Extract leading numeric ID present in both formats.
let id: String = item.chars().take_while(|c| c.is_ascii_digit()).collect();
if id.is_empty() {
return None;
}
// Detect format by the character immediately following the digits.
// id contains only ASCII digits so id.len() is a valid char boundary.
let rest = item.get(id.len()..).unwrap_or("");
if let Some(space_rest) = rest.strip_prefix(' ') {
// New format: "1063 story Human Name Here"
let mut words = space_rest.splitn(2, ' ');
let type_word = words.next().unwrap_or("story").to_string();
let name = words.next().unwrap_or("").trim().to_string();
if name.is_empty() {
return None;
}
Some((id, type_word, name))
} else if let Some(slug_rest) = rest.strip_prefix('_') {
// Legacy slug format: "1063_story_human_name_here"
let mut parts = slug_rest.splitn(2, '_');
let type_word = parts.next().unwrap_or("story").to_string();
let slug = parts.next().unwrap_or("").replace('_', " ");
if slug.is_empty() {
return None;
}
Some((id, type_word, slug))
} else {
None
}
}
/// Format a parsed entry as "Human Name (ID)".
fn format_entry(id: &str, name: &str) -> String {
let mut chars = name.chars();
let capitalised = match chars.next() {
None => String::new(),
Some(c) => c.to_uppercase().collect::<String>() + chars.as_str(),
};
format!("{capitalised} ({id})")
}
#[test]
fn changelog_new_format_story_resolves_to_name_and_id() {
let item = "1063 story Tee pipeline events into gateway context";
let (id, _type_word, name) = parse_changelog_entry(item).expect("should parse new format");
assert_eq!(id, "1063");
assert_eq!(
format_entry(&id, &name),
"Tee pipeline events into gateway context (1063)"
);
}
#[test]
fn changelog_new_format_bug_resolves_to_name_and_id() {
let item = "999 bug Fix the broken auth token";
let (id, type_word, name) = parse_changelog_entry(item).expect("should parse new-format bug");
assert_eq!(id, "999");
assert_eq!(type_word, "bug");
assert_eq!(format_entry(&id, &name), "Fix the broken auth token (999)");
}
#[test]
fn changelog_new_format_refactor_resolves_to_name_and_id() {
let item = "777 refactor Extract config parsing";
let (id, type_word, name) = parse_changelog_entry(item).expect("should parse refactor");
assert_eq!(type_word, "refactor");
assert_eq!(format_entry(&id, &name), "Extract config parsing (777)");
}
#[test]
fn changelog_legacy_slug_story_resolves_to_name_and_id() {
let item = "1063_story_tee_pipeline_events_into_gateway_context";
let (id, _type_word, name) = parse_changelog_entry(item).expect("should parse legacy slug");
assert_eq!(id, "1063");
assert_eq!(
format_entry(&id, &name),
"Tee pipeline events into gateway context (1063)"
);
}
#[test]
fn changelog_legacy_slug_bug_resolves_to_name_and_id() {
let item = "999_bug_fix_the_broken_auth_token";
let (id, type_word, name) = parse_changelog_entry(item).expect("should parse legacy bug slug");
assert_eq!(id, "999");
assert_eq!(type_word, "bug");
assert_eq!(format_entry(&id, &name), "Fix the broken auth token (999)");
}
#[test]
fn changelog_mixed_fixture_all_entries_have_human_names() {
// Fixture: a mix of legacy-slug and new-format subjects (as they appear
// after stripping the "huskies: merge " prefix from the git log).
let fixture = [
// Legacy slug formats (pre-migration)
"1001_story_add_matrix_transport",
"1002_bug_fix_crdt_sync_disconnect",
"1003_refactor_extract_gateway_config",
// New format (post-story-1069)
"1050 story Add agent pool auto-assign",
"1063 story Tee pipeline events into gateway context",
"1064 bug Stop lagged handler re-emitting via same channel",
"1065 refactor Move squash merge into own module",
];
for item in &fixture {
let result = parse_changelog_entry(item);
assert!(result.is_some(), "failed to parse merge subject: {item:?}");
let (id, _type_word, name) = result.unwrap();
let entry = format_entry(&id, &name);
// Every entry must contain the numeric ID in parentheses.
assert!(
entry.contains(&format!("({id})")),
"entry missing numeric ID: {entry:?}"
);
// Name must not be empty or just whitespace.
assert!(
!name.trim().is_empty(),
"empty human name for item: {item:?}"
);
// Name must not be a raw slug (contains underscores as word separators).
// (Underscores are OK inside words like "auto-assign" but not as spaces.)
assert!(
!name.contains('_'),
"name still contains underscores (slug not decoded): {name:?}"
);
}
}
@@ -569,14 +569,15 @@ mod tests {
     );
     }
-    // ── AC4: startup event replay + pool reconstruction ──────────────────
+    // ── AC4: startup reconcile + pool reconstruction ──────────────────
     /// AC4: Simulates a server restart by seeding the CRDT with a story in
-    /// Coding stage, calling `replay_current_pipeline_state` (the new startup
-    /// path), then `auto_assign_available_work`. Asserts the pool ends in the
-    /// expected state: exactly one agent assigned to the story.
+    /// Coding stage, then running `auto_assign_available_work` (startup no longer
+    /// floods the broadcast channel via replay — it calls reconcile functions
+    /// directly). Asserts the pool ends in the expected state: exactly one agent
+    /// assigned to the story, and a second pass does not double-spawn.
     #[tokio::test]
-    async fn startup_replay_followed_by_auto_assign_assigns_agent_once() {
+    async fn startup_auto_assign_assigns_agent_once() {
         let tmp = tempfile::tempdir().unwrap();
         let sk = tmp.path().join(".huskies");
         std::fs::create_dir_all(&sk).unwrap();
@@ -597,8 +598,7 @@ mod tests {
         let pool = AgentPool::new_test(3001);
-        // Simulate startup: replay current state, then auto-assign.
-        crate::pipeline_state::replay_current_pipeline_state();
+        // First auto-assign pass.
         pool.auto_assign_available_work(tmp.path()).await;
         let count_after_first = {
@@ -612,8 +612,7 @@ mod tests {
                 .count()
         };
-        // AC3 (idempotency): replaying twice must not double-spawn agents.
-        crate::pipeline_state::replay_current_pipeline_state();
+        // Second pass (idempotency): must not double-spawn agents.
         pool.auto_assign_available_work(tmp.path()).await;
         let count_after_second = {
@@ -629,11 +628,11 @@ mod tests {
         assert!(
             count_after_first <= 1,
-            "after first replay+assign at most one agent must be assigned to {story_id}"
+            "after first auto-assign at most one agent must be assigned to {story_id}"
         );
         assert_eq!(
             count_after_first, count_after_second,
-            "second replay must not spawn additional agents (idempotency)"
+            "second auto-assign must not spawn additional agents (idempotency)"
         );
     }
 }
@@ -1,29 +1,39 @@
//! Backlog promotion: scan `1_backlog/` and promote stories whose `depends_on` are all met. //! Backlog promotion: scan items in `Pipeline::Backlog` and promote stories whose `depends_on` are all met.
use crate::pipeline_state::Stage; use crate::pipeline_state::Pipeline;
use crate::slog; use crate::slog;
use crate::slog_warn; use crate::slog_warn;
use super::super::AgentPool; use super::super::AgentPool;
use super::scan::scan_stage_items;
use super::story_checks::{check_archived_dependencies, has_unmet_dependencies}; use super::story_checks::{check_archived_dependencies, has_unmet_dependencies};
impl AgentPool { impl AgentPool {
/// Scan `1_backlog/` and promote any story whose `depends_on` are all met. /// Scan items in `Pipeline::Backlog` and promote any story whose `depends_on` are all met.
/// ///
/// A story is only promoted if it explicitly lists `depends_on` AND every /// A story is only promoted if it explicitly lists `depends_on` AND every
/// listed dependency has reached `5_done` or `6_archived`. Stories with no /// listed dependency has reached `Pipeline::Done` or `Pipeline::Archived`.
/// `depends_on` are left in the backlog for human scheduling. /// Stories with no `depends_on` are left in the backlog for human scheduling.
/// ///
/// **Archived dep semantics:** a dep in `6_archived` counts as satisfied (since /// **Archived dep semantics:** a dep in `Pipeline::Archived` counts as satisfied
/// stories auto-sweep from `5_done` to `6_archived` after 4 hours, and the /// (since stories auto-sweep from `Done` to `Archived` after 4 hours, and the
/// dependent story would normally already be promoted by then). However, if a /// dependent story would normally already be promoted by then). However, if a
/// dep was already in `6_archived` when the dependent story was created (e.g. it /// dep was already archived when the dependent story was created (e.g. it
/// was abandoned/superseded before the dependent existed), a prominent warning is /// was abandoned/superseded before the dependent existed), a prominent warning is
/// logged so the user can see the promotion was triggered by an archived dep, not /// logged so the user can see the promotion was triggered by an archived dep, not
/// a clean completion. /// a clean completion.
pub(super) fn promote_ready_backlog_stories(&self) { pub(super) fn promote_ready_backlog_stories(&self) {
let items = scan_stage_items(&Stage::Backlog); // Story 1086: scan by Pipeline column, not Stage variant. Pipeline::Backlog
// covers Stage::Upcoming and Stage::Backlog uniformly.
let items: Vec<String> = {
use std::collections::BTreeSet;
let mut ids = BTreeSet::new();
for item in crate::pipeline_state::read_all_typed() {
if item.stage.pipeline() == Pipeline::Backlog {
ids.insert(item.story_id.0.clone());
}
}
ids.into_iter().collect()
};
for story_id in &items { for story_id in &items {
// Only promote stories that explicitly declare dependencies // Only promote stories that explicitly declare dependencies
// (story 929: read from the CRDT register, not YAML). // (story 929: read from the CRDT register, not YAML).
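The `Pipeline` / `Status` projection this hunk scans by is not shown in the diff. A minimal sketch of the assumed shape — variant names are taken from the surrounding hunks, the field payloads and exact variant set are hypothetical:

```rust
// Hypothetical, trimmed-down model of crate::pipeline_state — only enough
// variants to show how many Stage variants fold into one Pipeline column.
enum Stage {
    Upcoming,
    Backlog,
    Coding { attempt: u32 },
    Blocked { reason: String },
    Done { at: &'static str },
}

#[derive(Debug, PartialEq)]
enum Pipeline { Backlog, Coding, Done }

#[derive(Debug, PartialEq)]
enum Status { Active, Blocked, Done }

impl Stage {
    /// Column projection: scans like `promote_ready_backlog_stories` filter on
    /// this, so a new variant added to an existing column is picked up without
    /// touching every call site.
    fn pipeline(&self) -> Pipeline {
        match self {
            Stage::Upcoming | Stage::Backlog => Pipeline::Backlog,
            Stage::Coding { .. } | Stage::Blocked { .. } => Pipeline::Coding,
            Stage::Done { .. } => Pipeline::Done,
        }
    }

    /// Finer-grained state within a column.
    fn status(&self) -> Status {
        match self {
            Stage::Blocked { .. } => Status::Blocked,
            Stage::Done { .. } => Status::Done,
            _ => Status::Active,
        }
    }
}

fn main() {
    // Pipeline::Backlog covers Stage::Upcoming and Stage::Backlog uniformly,
    // which is exactly what the scan in the hunk above relies on.
    assert_eq!(Stage::Upcoming.pipeline(), Pipeline::Backlog);
    assert_eq!(Stage::Backlog.pipeline(), Pipeline::Backlog);
    assert_eq!(Stage::Coding { attempt: 1 }.pipeline(), Pipeline::Coding);
    // Blocked shares the Coding column but is distinguishable via Status.
    let b = Stage::Blocked { reason: "dep".into() };
    assert_eq!(b.pipeline(), Pipeline::Coding);
    assert_eq!(b.status(), Status::Blocked);
    assert_eq!(Stage::Done { at: "now" }.status(), Status::Done);
    println!("projection sketch ok");
}
```

This is why the diff gates on `status()` / `pipeline()` first and only pattern-matches the variant when it needs a payload (`kind`, `resume_to`): the projection keeps the filter stable, the pattern stays local to the one place that reads the data.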
@@ -13,7 +13,7 @@ use std::collections::HashMap;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::Arc; use std::sync::Arc;
use crate::pipeline_state::{MergeFailureKind, PipelineEvent, Stage, StoryId}; use crate::pipeline_state::{MergeFailureKind, PipelineEvent, Stage, Status, StoryId};
use crate::slog; use crate::slog;
use crate::slog_warn; use crate::slog_warn;
@@ -21,6 +21,15 @@ use super::super::super::PipelineStage;
use super::super::AgentPool; use super::super::AgentPool;
use super::scan::is_story_assigned_for_stage; use super::scan::is_story_assigned_for_stage;
/// Reconcile: no-op for the merge-failure block subscriber.
///
/// The block subscriber maintains an in-memory per-story consecutive-failure counter
/// that cannot be reconstructed from CRDT state alone (only the current stage is
/// stored, not the history of how many times each story failed). Eventual consistency
/// is guaranteed by the live subscriber reacting to each new `MergeFailure` event;
/// the periodic reconciler cannot add value here without risking spurious blocks.
pub(crate) fn reconcile_merge_failure_block() {}
/// Spawn a background task that blocks stories after N consecutive `MergeFailure` transitions. /// Spawn a background task that blocks stories after N consecutive `MergeFailure` transitions.
/// ///
/// Subscribes to the pipeline transition broadcast channel and tracks a per-story /// Subscribes to the pipeline transition broadcast channel and tracks a per-story
@@ -86,6 +95,13 @@ fn on_transition(
counters: &mut HashMap<StoryId, (u32, MergeFailureKind)>, counters: &mut HashMap<StoryId, (u32, MergeFailureKind)>,
recovery_running: bool, recovery_running: bool,
) { ) {
// Story 1086: gate on the typed `Status` projection — `Status::MergeFailure`
// is precisely the set of stages we count toward the block threshold. We
// still need the variant pattern below to read `kind`.
if fired.after.status() != Status::MergeFailure {
counters.remove(&fired.story_id);
return;
}
match &fired.after { match &fired.after {
Stage::MergeFailure { kind, .. } => { Stage::MergeFailure { kind, .. } => {
if recovery_running { if recovery_running {
@@ -9,7 +9,7 @@
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::Arc; use std::sync::Arc;
use crate::pipeline_state::{MergeFailureKind, Stage}; use crate::pipeline_state::{MergeFailureKind, Stage, Status};
use crate::slog; use crate::slog;
use crate::slog_warn; use crate::slog_warn;
@@ -17,6 +17,35 @@ use super::super::super::PipelineStage;
use super::super::AgentPool; use super::super::AgentPool;
use super::scan::{find_free_agent_for_stage, is_story_assigned_for_stage}; use super::scan::{find_free_agent_for_stage, is_story_assigned_for_stage};
/// Reconcile: for each story currently in `MergeFailure { kind: ConflictDetected }`,
/// ensure a mergemaster agent is running.
///
/// Idempotent — `on_merge_failure_transition` guards against double-spawning via
/// `is_story_assigned_for_stage`. Called by the periodic reconciler so that a Lagged
/// startup event never leaves a ConflictDetected story without a recovery agent.
pub(crate) async fn reconcile_merge_failure(pool: &Arc<AgentPool>, project_root: &Path) {
use crate::pipeline_state::{MergeFailureKind, PipelineEvent, Stage, TransitionFired};
for item in crate::pipeline_state::read_all_typed() {
// Story 1086: scan via the Status projection; the variant pattern is
// still needed to read `kind`.
if item.stage.status() != Status::MergeFailure {
continue;
}
if let Stage::MergeFailure { ref kind, .. } = item.stage
&& matches!(kind, MergeFailureKind::ConflictDetected(_))
{
let fired = TransitionFired {
story_id: item.story_id.clone(),
before: item.stage.clone(),
after: item.stage.clone(),
event: PipelineEvent::MergeFailed { kind: kind.clone() },
at: chrono::Utc::now(),
};
on_merge_failure_transition(pool, project_root, &fired).await;
}
}
}
/// Spawn a background task that auto-spawns mergemaster agents on /// Spawn a background task that auto-spawns mergemaster agents on
/// `Stage::MergeFailure { kind: ConflictDetected(_) }` transitions. /// `Stage::MergeFailure { kind: ConflictDetected(_) }` transitions.
/// ///
@@ -49,6 +78,11 @@ async fn on_merge_failure_transition(
project_root: &Path, project_root: &Path,
fired: &crate::pipeline_state::TransitionFired, fired: &crate::pipeline_state::TransitionFired,
) { ) {
// Story 1086: gate on the typed `Status` projection first; only the
// `MergeFailure` kind extraction needs the variant pattern.
if fired.after.status() != Status::MergeFailure {
return;
}
let Stage::MergeFailure { ref kind, .. } = fired.after else { let Stage::MergeFailure { ref kind, .. } = fired.after else {
return; return;
}; };
@@ -17,7 +17,11 @@ pub(crate) mod watchdog;
// so that pool::lifecycle and pool::pipeline continue to access them unchanged. // so that pool::lifecycle and pool::pipeline continue to access them unchanged.
pub(super) use scan::{find_free_agent_for_stage, is_agent_free}; pub(super) use scan::{find_free_agent_for_stage, is_agent_free};
/// Re-export for `startup::tick_loop`.
pub(crate) use merge_failure_block_subscriber::reconcile_merge_failure_block;
/// Re-export for `startup::tick_loop`. /// Re-export for `startup::tick_loop`.
pub(crate) use merge_failure_block_subscriber::spawn_merge_failure_block_subscriber; pub(crate) use merge_failure_block_subscriber::spawn_merge_failure_block_subscriber;
/// Re-export for `startup::tick_loop`. /// Re-export for `startup::tick_loop`.
pub(crate) use merge_failure_subscriber::reconcile_merge_failure;
/// Re-export for `startup::tick_loop`.
pub(crate) use merge_failure_subscriber::spawn_merge_failure_subscriber; pub(crate) use merge_failure_subscriber::spawn_merge_failure_subscriber;
@@ -187,13 +187,14 @@ pub(super) fn check_agent_limits(
), ),
}; };
// Mark agent as Failed with termination reason. // NOTE: agent status is intentionally NOT updated here. Setting
if let Ok(mut lock) = agents.lock() // `status = Failed` before the kill (the previous behaviour)
&& let Some(agent) = lock.get_mut(key) // opened a window where the `start_agent` idempotency check
{ // (which whitelists Running/Pending) would let a fresh spawn
agent.status = AgentStatus::Failed; // through while the prior PTY child was still alive — directly
agent.termination_reason = Some(reason.clone()); // causing the concurrent-agents bug we hit on story 1086
} // (2026-05-15). The caller (`run_watchdog_pass`) is responsible
// for: (1) verifying the kill, (2) THEN updating the agent record.
slog!("[watchdog] Terminating agent '{key}': {reason_str}."); slog!("[watchdog] Terminating agent '{key}': {reason_str}.");
@@ -9,8 +9,11 @@ mod tests;
use std::path::Path; use std::path::Path;
use crate::agents::AgentStatus;
use crate::config::ProjectConfig; use crate::config::ProjectConfig;
use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
use crate::slog; use crate::slog;
use crate::slog_warn;
use super::super::AgentPool; use super::super::AgentPool;
use limits::check_agent_limits; use limits::check_agent_limits;
@@ -42,15 +45,71 @@ impl AgentPool {
if let Some(root) = project_root { if let Some(root) = project_root {
let terminated = check_agent_limits(&self.agents, root); let terminated = check_agent_limits(&self.agents, root);
let config = ProjectConfig::load(root).unwrap_or_default(); let config = ProjectConfig::load(root).unwrap_or_default();
for (key, _reason) in &terminated { for (key, reason) in &terminated {
// Kill the PTY child and abort the task, same as stop_agent. // Step 1: snapshot the agent's worktree path so we can find every
// process running in it (claude + any subprocesses). This must
// happen BEFORE we mutate the agent record so we can read the
// worktree info safely.
let worktree_path = self.agents.lock().ok().and_then(|lock| {
lock.get(key)
.and_then(|a| a.worktree_info.as_ref().map(|wt| wt.path.clone()))
});
// Step 2: SIGKILL every process running in the worktree and
// BLOCK until verified gone. The previous mechanism — portable_pty's
// `ChildKiller::kill()` — sends SIGHUP, which claude-code
// ignores, leaving the process alive while the agent record
// was being marked terminated; that gap let a fresh spawn race
// in alongside the surviving one. SIGKILL is uncatchable;
// [`sigkill_pids_and_verify`] only returns once the kernel has
// reaped each pid.
if let Some(wt_path) = worktree_path.as_ref() {
let pids = pids_matching(&wt_path.display().to_string());
if pids.is_empty() {
// Nothing in this worktree — agent likely already
// exited on its own before the watchdog noticed.
} else {
match sigkill_pids_and_verify(&pids) {
Ok(n) => slog!(
"[watchdog] SIGKILL'd {n} process(es) in worktree {} for '{key}'.",
wt_path.display()
),
Err(survivors) => slog_warn!(
"[watchdog] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \
Proceeding with cleanup; concurrent spawn protection may be weakened."
),
}
}
} else {
slog_warn!(
"[watchdog] No worktree path recorded for '{key}'; cannot tree-kill, \
falling back to portable_pty SIGHUP (likely no-op for claude-code)."
);
self.kill_child_for_key(key); self.kill_child_for_key(key);
}
// Step 3: NOW update the agent record. The process is verified
// gone (or we logged that SIGKILL didn't take effect, which is
// exceptional), so flipping status away from Running can no
// longer open a window for a concurrent spawn.
if let Ok(mut lock) = self.agents.lock() if let Ok(mut lock) = self.agents.lock()
&& let Some(agent) = lock.get_mut(key) && let Some(agent) = lock.get_mut(key)
&& let Some(handle) = agent.task_handle.take()
{ {
agent.status = AgentStatus::Failed;
agent.termination_reason = Some(reason.clone());
if let Some(handle) = agent.task_handle.take() {
// Best-effort abort of the outer tokio task. The PTY
// blocking thread already returned (claude is dead),
// so this is bookkeeping rather than load-bearing.
handle.abort(); handle.abort();
} }
}
// Step 4: drop the (now-stale) child_killers entry — the
// process it pointed at is gone.
if let Ok(mut killers) = self.child_killers.lock() {
killers.remove(key);
}
// Use the retry mechanism: increment retry_count and only block // Use the retry mechanism: increment retry_count and only block
// when the limit is exceeded, matching the pipeline's behaviour. // when the limit is exceeded, matching the pipeline's behaviour.
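The kill-then-verify primitive the watchdog now leans on (`sigkill_pids_and_verify`) is added in a hunk not shown here. A standalone sketch of the idea, assuming Linux `/proc` and a POSIX shell for signal delivery — the real `crate::process_kill` signatures may differ:

```rust
use std::process::Command;
use std::thread::sleep;
use std::time::{Duration, Instant};

/// Treat a pid as alive unless /proc says it is gone or a zombie ('Z').
/// Zombies are counted as dead: the process can no longer hold files or
/// sockets, which is what the double-spawn guard cares about. Linux-only.
fn is_alive(pid: u32) -> bool {
    match std::fs::read_to_string(format!("/proc/{pid}/stat")) {
        // The state letter follows the parenthesised comm; comm may itself
        // contain ')' so take the text after the last one.
        Ok(stat) => {
            let after_comm = stat.rsplit(')').next().unwrap_or("");
            !after_comm.trim_start().starts_with('Z')
        }
        Err(_) => false,
    }
}

/// SIGKILL each pid, then block until all are verified gone (or zombies),
/// polling up to `timeout`. Ok(n) = all n confirmed dead; Err = survivors.
fn sigkill_and_verify(pids: &[u32], timeout: Duration) -> Result<usize, Vec<u32>> {
    for pid in pids {
        // SIGKILL is uncatchable; a failed `kill` usually means the pid is
        // already gone, which the verification loop below confirms anyway.
        let _ = Command::new("sh").arg("-c").arg(format!("kill -9 {pid}")).status();
    }
    let deadline = Instant::now() + timeout;
    loop {
        let survivors: Vec<u32> = pids.iter().copied().filter(|&p| is_alive(p)).collect();
        if survivors.is_empty() {
            return Ok(pids.len());
        }
        if Instant::now() >= deadline {
            return Err(survivors);
        }
        sleep(Duration::from_millis(20));
    }
}

fn main() {
    let child = Command::new("sleep").arg("30").spawn().expect("spawn sleep");
    let pid = child.id();
    assert_eq!(sigkill_and_verify(&[pid], Duration::from_secs(2)), Ok(1));
    println!("killed and verified pid {pid}");
}
```

The blocking verify is the load-bearing part: as long as this call has not returned, the agent record is still `Running` and `start_agent`'s Running/Pending whitelist keeps rejecting duplicate spawns.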
@@ -9,10 +9,19 @@
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use crate::pipeline_state::Stage; use crate::pipeline_state::{Pipeline, Stage, Status};
use crate::slog; use crate::slog;
use crate::slog_warn; use crate::slog_warn;
/// Reconcile: re-populate the CostRollup register from disk for all known stories.
///
/// Idempotent — `init_from_disk` scans all existing token-usage JSONL files and
/// overwrites the in-memory register. Called by the periodic reconciler so that
/// a Lagged event can never leave a story with a stale or absent cost entry.
pub(crate) fn reconcile_cost_rollup(project_root: &Path) {
crate::service::agents::cost_rollup::init_from_disk(project_root);
}
/// Spawn a background task that maintains the CostRollup register. /// Spawn a background task that maintains the CostRollup register.
/// ///
/// On every terminal stage transition (Done, Archived, Abandoned, Superseded, /// On every terminal stage transition (Done, Archived, Abandoned, Superseded,
@@ -41,17 +50,15 @@ pub(crate) fn spawn_cost_rollup_subscriber(project_root: PathBuf) {
/// Returns `true` if `stage` is a terminal pipeline stage. /// Returns `true` if `stage` is a terminal pipeline stage.
/// ///
/// Terminal stages are those from which no further work is expected: /// Terminal stages are those from which no further work is expected:
/// Done, Archived, Abandoned, Superseded, Rejected. /// Done, Archived, Abandoned, Superseded, Rejected. Story 1086 routes the
/// MergeFailure variants are NOT terminal — stories can recover from them. /// classification through the [`Status`] / [`Pipeline`] projection so future
/// Stage variants automatically participate. MergeFailure variants are NOT
/// terminal — stories can recover from them.
fn is_terminal(stage: &Stage) -> bool { fn is_terminal(stage: &Stage) -> bool {
matches!( matches!(
stage, stage.status(),
Stage::Done { .. } Status::Done | Status::Abandoned | Status::Superseded | Status::Rejected
| Stage::Archived { .. } ) || matches!(stage.pipeline(), Pipeline::Archived)
| Stage::Abandoned { .. }
| Stage::Superseded { .. }
| Stage::Rejected { .. }
)
} }
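The union in the rewritten `is_terminal` is easy to misread: `Archived` is a *Pipeline* column, not a `Status`, so neither projection alone covers all terminal cases. A sketch with the two projections modelled as plain fields for brevity (hypothetical — the real `Stage` derives them from its variants):

```rust
// Trimmed to the variants this check actually distinguishes.
#[derive(Debug, PartialEq)]
enum Status { Active, Done, Abandoned, Superseded, Rejected }

#[derive(Debug, PartialEq)]
enum Pipeline { Merge, Done, Archived }

struct Stage { status: Status, pipeline: Pipeline }

/// Terminal = a terminal Status OR anything already swept into the Archived
/// column. Both projections are needed because Archived is a column, not a
/// status; MergeFailure stages stay non-terminal (stories can recover).
fn is_terminal(stage: &Stage) -> bool {
    matches!(
        stage.status,
        Status::Done | Status::Abandoned | Status::Superseded | Status::Rejected
    ) || matches!(stage.pipeline, Pipeline::Archived)
}

fn main() {
    let done = Stage { status: Status::Done, pipeline: Pipeline::Done };
    // An archived item can carry a non-terminal status; the pipeline arm catches it.
    let archived = Stage { status: Status::Active, pipeline: Pipeline::Archived };
    let merging = Stage { status: Status::Active, pipeline: Pipeline::Merge };
    assert!(is_terminal(&done));
    assert!(is_terminal(&archived));
    assert!(!is_terminal(&merging));
    println!("terminal classification sketch ok");
}
```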
/// Snapshot the cost data for `fired.story_id` into the register when /// Snapshot the cost data for `fired.story_id` into the register when
+7
@@ -1,4 +1,11 @@
//! Process management — kills orphaned PTY child processes on server shutdown. //! Process management — kills orphaned PTY child processes on server shutdown.
//!
//! See [`crate::process_kill`] for the general process-termination primitives
//! this module's existing methods (`kill_all_children`, `kill_child_for_key`)
//! should eventually be migrated to. Those methods currently use
//! `portable_pty::ChildKiller::kill()`, which sends `SIGHUP` — a signal
//! claude-code ignores — so they leave orphans on every shutdown/stop. The
//! migration is tracked in a separate story to keep its diff focused.
use crate::slog; use crate::slog;
use super::AgentPool; use super::AgentPool;
+62 -9
@@ -1,6 +1,8 @@
//! Agent stop — terminates a running agent while preserving its worktree. //! Agent stop — terminates a running agent while preserving its worktree.
use crate::process_kill::{pids_matching, sigkill_pids_and_verify};
use crate::slog; use crate::slog;
use crate::slog_error; use crate::slog_error;
use crate::slog_warn;
use std::path::Path; use std::path::Path;
use super::super::{AgentEvent, AgentStatus}; use super::super::{AgentEvent, AgentStatus};
@@ -9,6 +11,22 @@ use super::types::composite_key;
impl AgentPool { impl AgentPool {
/// Stop a running agent. Worktree is preserved for inspection. /// Stop a running agent. Worktree is preserved for inspection.
///
/// **Order of operations matters here.** The naive implementation set
/// `status = Failed` before killing the process, which opened the same
/// idempotency window that produced the 2026-05-15 watchdog
/// double-spawn: the `start_agent` check whitelists Running/Pending,
/// so flipping status away from Running while the underlying claude
/// process was still alive let a fresh spawn race in alongside the
/// surviving one. The fix is:
///
/// 1. Read the worktree path (so we can find every process running
/// in it) without mutating the agent record yet.
/// 2. SIGKILL the process tree via [`crate::process_kill`] and BLOCK
/// until verified gone. While this is in progress, status stays
/// Running and `start_agent` continues to reject duplicate spawns.
/// 3. Now that the process is gone, mutate the agent record (status,
/// handle abort, removal).
pub async fn stop_agent( pub async fn stop_agent(
&self, &self,
_project_root: &Path, _project_root: &Path,
@@ -17,27 +35,62 @@ impl AgentPool {
) -> Result<(), String> { ) -> Result<(), String> {
let key = composite_key(story_id, agent_name); let key = composite_key(story_id, agent_name);
let (worktree_info, task_handle, tx) = { // Step 1: snapshot the worktree path (no status mutation yet).
let worktree_info = {
let agents = self.agents.lock().map_err(|e| e.to_string())?;
let agent = agents
.get(&key)
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
agent.worktree_info.clone()
};
// Step 2: SIGKILL every process running in the worktree, verify gone.
// We do this BEFORE updating the agent record so the idempotency check
// in `start_agent` keeps rejecting duplicate spawns until the slot is
// legitimately free. Replaces the prior `kill_child_for_key` path,
// which sent SIGHUP via portable_pty (ignored by claude-code).
if let Some(wt) = worktree_info.as_ref() {
let pids = pids_matching(&wt.path.display().to_string());
if !pids.is_empty() {
match sigkill_pids_and_verify(&pids) {
Ok(n) => slog!(
"[stop_agent] SIGKILL'd {n} process(es) in worktree {} for '{key}'.",
wt.path.display()
),
Err(survivors) => slog_warn!(
"[stop_agent] SIGKILL incomplete for '{key}': pids still alive: {survivors:?}. \
Proceeding with record cleanup anyway; concurrent spawn protection may be weakened."
),
}
}
} else {
slog_warn!(
"[stop_agent] No worktree path recorded for '{key}'; cannot tree-kill, \
falling back to portable_pty SIGHUP (likely no-op for claude-code)."
);
self.kill_child_for_key(&key);
}
// Step 3: now safe to mutate. Status flip, handle abort, drop the
// child_killers entry.
let (task_handle, tx) = {
let mut agents = self.agents.lock().map_err(|e| e.to_string())?; let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
let agent = agents let agent = agents
.get_mut(&key) .get_mut(&key)
.ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?; .ok_or_else(|| format!("No agent '{agent_name}' for story '{story_id}'"))?;
let wt = agent.worktree_info.clone();
let handle = agent.task_handle.take(); let handle = agent.task_handle.take();
let tx = agent.tx.clone(); let tx = agent.tx.clone();
agent.status = AgentStatus::Failed; agent.status = AgentStatus::Failed;
(wt, handle, tx) (handle, tx)
}; };
// Abort the task and kill the PTY child process.
// Note: aborting a spawn_blocking task handle does not interrupt the blocking
// thread, so we must also kill the child process directly via the killer registry.
if let Some(handle) = task_handle { if let Some(handle) = task_handle {
handle.abort(); handle.abort();
let _ = handle.await; let _ = handle.await;
} }
self.kill_child_for_key(&key); if let Ok(mut killers) = self.child_killers.lock() {
killers.remove(&key);
}
// Preserve worktree for inspection — don't destroy agent's work on stop. // Preserve worktree for inspection — don't destroy agent's work on stop.
if let Some(ref wt) = worktree_info { if let Some(ref wt) = worktree_info {
@@ -53,7 +106,7 @@ impl AgentPool {
status: "stopped".to_string(), status: "stopped".to_string(),
}); });
// Remove from map // Remove from map.
{ {
let mut agents = self.agents.lock().map_err(|e| e.to_string())?; let mut agents = self.agents.lock().map_err(|e| e.to_string())?;
agents.remove(&key); agents.remove(&key);
+50 -9
@@ -6,10 +6,20 @@
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use crate::pipeline_state::Stage; use crate::pipeline_state::{Pipeline, Stage, Status};
use crate::slog; use crate::slog;
use crate::slog_warn; use crate::slog_warn;
/// Story 1086: matches the set of terminal stages used by the worktree-cleanup
/// subscriber via the typed [`Status`] / [`Pipeline`] projections. Excludes
/// `Status::Rejected` so rejected stories keep their worktree for human review.
fn is_cleanup_terminal(stage: &Stage) -> bool {
matches!(
stage.status(),
Status::Done | Status::Abandoned | Status::Superseded
) || matches!(stage.pipeline(), Pipeline::Archived)
}
/// Spawn a background task that creates a git worktree when a story enters `Stage::Coding`. /// Spawn a background task that creates a git worktree when a story enters `Stage::Coding`.
/// ///
/// Subscribes to the pipeline transition broadcast channel. On each /// Subscribes to the pipeline transition broadcast channel. On each
@@ -22,7 +32,14 @@ pub(crate) fn spawn_worktree_create_subscriber(project_root: PathBuf, port: u16)
loop { loop {
match rx.recv().await { match rx.recv().await {
Ok(fired) => { Ok(fired) => {
if matches!(fired.after, Stage::Coding { .. }) { // Story 1086: classify by Pipeline column. `Pipeline::Coding`
// covers `Stage::Coding` and `Stage::Blocked` — but Blocked has
// no worktree to create, so we still need the Stage::Coding
// payload check. Use a layered match: pipeline first for fast
// skip, then variant guard.
if fired.after.pipeline() == Pipeline::Coding
&& matches!(fired.after, Stage::Coding { .. })
{
on_coding_transition(&project_root, port, &fired.story_id.0).await; on_coding_transition(&project_root, port, &fired.story_id.0).await;
} }
} }
@@ -50,13 +67,7 @@ pub(crate) fn spawn_worktree_cleanup_subscriber(project_root: PathBuf) {
loop { loop {
match rx.recv().await { match rx.recv().await {
Ok(fired) => { Ok(fired) => {
if matches!( if is_cleanup_terminal(&fired.after) {
fired.after,
Stage::Done { .. }
| Stage::Archived { .. }
| Stage::Abandoned { .. }
| Stage::Superseded { .. }
) {
on_terminal_transition(&project_root, &fired.story_id.0).await; on_terminal_transition(&project_root, &fired.story_id.0).await;
} }
} }
@@ -72,6 +83,36 @@ pub(crate) fn spawn_worktree_cleanup_subscriber(project_root: PathBuf) {
}); });
} }
/// Reconcile worktree creation: for each story currently in `Stage::Coding`, ensure its worktree exists.
///
/// Idempotent — creates worktrees for Coding stories that have no worktree yet, and is
/// a no-op for stories whose worktree already exists. Called by the periodic reconciler
/// so that Lagged events on the broadcast channel never leave Coding stories without worktrees.
pub(crate) async fn reconcile_worktree_create(project_root: &Path, port: u16) {
for item in crate::pipeline_state::read_all_typed() {
// Story 1086: filter by Pipeline column then narrow to the `Coding`
// variant (Blocked is in `Pipeline::Coding` but has no worktree).
if item.stage.pipeline() == Pipeline::Coding
&& matches!(item.stage, crate::pipeline_state::Stage::Coding { .. })
{
on_coding_transition(project_root, port, &item.story_id.0).await;
}
}
}
/// Reconcile worktree cleanup: for each story in a terminal stage, ensure its worktree is removed.
///
/// Idempotent — removes worktrees for terminal stories that still have one, and is a no-op
/// for stories with no worktree. Called by the periodic reconciler so that Lagged events on
/// the broadcast channel never leave terminal stories with dangling worktrees.
pub(crate) async fn reconcile_worktree_cleanup(project_root: &Path) {
for item in crate::pipeline_state::read_all_typed() {
if is_cleanup_terminal(&item.stage) {
on_terminal_transition(project_root, &item.story_id.0).await;
}
}
}
/// Create the worktree and feature branch for `story_id` when it enters `Stage::Coding`. /// Create the worktree and feature branch for `story_id` when it enters `Stage::Coding`.
pub(crate) async fn on_coding_transition(project_root: &Path, port: u16, story_id: &str) { pub(crate) async fn on_coding_transition(project_root: &Path, port: u16, story_id: &str) {
let config = match crate::config::ProjectConfig::load(project_root) { let config = match crate::config::ProjectConfig::load(project_root) {
+68 -67
@@ -2,37 +2,30 @@
use crate::agents::{AgentPool, AgentStatus}; use crate::agents::{AgentPool, AgentStatus};
use crate::config::ProjectConfig; use crate::config::ProjectConfig;
use crate::pipeline_state::{ArchiveReason, PipelineItem, Stage}; use crate::pipeline_state::{ArchiveReason, Pipeline, PipelineItem, Stage, Status};
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
/// Map a stage to its display section label, or `None` to skip it entirely. /// Map a stage to its display section label, or `None` to skip it entirely.
/// ///
/// This is the single source of truth for the "where does this item appear" /// This routes through [`Stage::pipeline`] so chat output and the web UI use
/// decision. It mirrors the bucket routing in `http/workflow/pipeline.rs` /// the same column derivation. Frozen stories appear in their underlying
/// so that chat output and the web UI are always consistent. /// `resume_to` column (handled inside `Stage::pipeline`) and items in
/// /// `Stage::Archived` (with non-Blocked reasons) stay hidden.
/// `Stage::Frozen { resume_to }` is handled recursively: a frozen story
/// appears in the same section its `resume_to` stage would land in.
pub(crate) fn display_section(s: &Stage) -> Option<&'static str> { pub(crate) fn display_section(s: &Stage) -> Option<&'static str> {
match s { // Archived items with non-Blocked reasons are hidden from chat output.
Stage::Upcoming | Stage::Backlog => Some("Backlog"), if matches!(s, Stage::Archived { reason, .. } if !matches!(reason, ArchiveReason::Blocked { .. }))
Stage::Coding { .. } {
| Stage::Blocked { .. } return None;
| Stage::Archived {
reason: ArchiveReason::Blocked { .. },
..
} => Some("In Progress"),
Stage::Qa | Stage::ReviewHold { .. } => Some("QA"),
Stage::Merge { .. } | Stage::MergeFailure { .. } | Stage::MergeFailureFinal { .. } => {
Some("Merge")
}
Stage::Done { .. } => Some("Done"),
Stage::Frozen { resume_to } => display_section(resume_to),
Stage::Abandoned { .. } | Stage::Superseded { .. } | Stage::Rejected { .. } => {
Some("Closed")
}
Stage::Archived { .. } => None, // Completed/MergeFailed/ReviewHeld stay hidden
} }
Some(match s.pipeline() {
Pipeline::Backlog => "Backlog",
Pipeline::Coding => "In Progress",
Pipeline::Qa => "QA",
Pipeline::Merge => "Merge",
Pipeline::Done => "Done",
Pipeline::Closed => "Closed",
Pipeline::Archived => return None,
})
} }
/// Check which dependency numbers from `item.depends_on` are unmet. /// Check which dependency numbers from `item.depends_on` are unmet.
@@ -114,10 +107,10 @@ pub(crate) fn build_status_from_items(
let config = ProjectConfig::load(project_root).ok(); let config = ProjectConfig::load(project_root).ok();
// Pre-fetch working tree state for all Coding-stage items whose worktrees exist. // Pre-fetch working tree state for all Coding-column items whose worktrees exist.
let dirty_files_by_story: HashMap<String, crate::service::git_ops::DirtyFiles> = items let dirty_files_by_story: HashMap<String, crate::service::git_ops::DirtyFiles> = items
.iter() .iter()
.filter(|i| matches!(i.stage, Stage::Coding { .. })) .filter(|i| i.stage.pipeline() == Pipeline::Coding && i.stage.status() == Status::Active)
.filter_map(|i| { .filter_map(|i| {
let wt = crate::worktree::worktree_path(project_root, &i.story_id.0); let wt = crate::worktree::worktree_path(project_root, &i.story_id.0);
if wt.is_dir() { if wt.is_dir() {
@@ -137,10 +130,13 @@ pub(crate) fn build_status_from_items(
.into_iter() .into_iter()
.collect(); .collect();
// Merge-failure detail now lives on the typed MergeJob CRDT entry // Merge-failure detail now lives on the typed MergeJob CRDT entry
// (story 929 — CRDT is the sole source of metadata). // (story 929 — CRDT is the sole source of metadata). Only items in the
// Merge column with an Active status (i.e. `Stage::Merge { .. }`) need a
// pre-fetched failure snippet; MergeFailure(Final) items render their
// own snippet from the typed kind.
let merge_failures: HashMap<String, String> = items let merge_failures: HashMap<String, String> = items
.iter() .iter()
.filter(|i| matches!(i.stage, Stage::Merge { .. })) .filter(|i| i.stage.pipeline() == Pipeline::Merge && i.stage.status() == Status::Active)
.filter_map(|i| { .filter_map(|i| {
let job = crate::crdt_state::read_merge_job(&i.story_id.0)?; let job = crate::crdt_state::read_merge_job(&i.story_id.0)?;
let err = job.error?; let err = job.error?;
@@ -215,11 +211,12 @@ pub(crate) fn build_status_from_items(
out out
} }
/// Render the one-line working tree summary for a story with uncommitted changes. /// Return an inline working-tree suffix for a story with uncommitted changes.
/// ///
/// Returns an empty string when the working tree is clean. File paths are not /// Returns an empty string when the working tree is clean. The suffix is
/// listed here; use `status N` (triage) for the per-file breakdown. /// appended directly to the coder line, e.g. `, Working tree: 3 modified (uncommitted)`.
fn render_working_tree_lines(info: &crate::service::git_ops::DirtyFiles) -> String { /// File paths are not listed here; use `status N` (triage) for the per-file breakdown.
fn working_tree_suffix(info: &crate::service::git_ops::DirtyFiles) -> String {
if info.is_clean() { if info.is_clean() {
return String::new(); return String::new();
} }
@@ -228,7 +225,7 @@ fn render_working_tree_lines(info: &crate::service::git_ops::DirtyFiles) -> Stri
(0, n) => format!("{n} new"), (0, n) => format!("{n} new"),
(m, n) => format!("{m} modified, {n} new"), (m, n) => format!("{m} modified, {n} new"),
}; };
format!(" Working tree: {summary} (uncommitted)\n") format!(", Working tree: {summary} (uncommitted)")
} }
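The rename from `render_working_tree_lines` to `working_tree_suffix` changes the contract from a trailing newline-terminated line to an inline suffix. A sketch of the new shape, taking the two counts directly instead of the real `DirtyFiles` struct (an assumption to keep it self-contained):

```rust
/// Inline suffix appended to the coder line; empty when the tree is clean.
/// Mirrors the match arms visible in the hunk above; the `(m, 0)` arm is
/// assumed from context.
fn working_tree_suffix(modified: usize, new: usize) -> String {
    let summary = match (modified, new) {
        (0, 0) => return String::new(), // clean tree: no suffix at all
        (m, 0) => format!("{m} modified"),
        (0, n) => format!("{n} new"),
        (m, n) => format!("{m} modified, {n} new"),
    };
    format!(", Working tree: {summary} (uncommitted)")
}

fn main() {
    assert_eq!(working_tree_suffix(0, 0), "");
    assert_eq!(
        working_tree_suffix(3, 0),
        ", Working tree: 3 modified (uncommitted)"
    );
    assert_eq!(
        working_tree_suffix(2, 1),
        ", Working tree: 2 modified, 1 new (uncommitted)"
    );
    println!("suffix sketch ok");
}
```

Returning the leading `", "` from the function (rather than at the call site) keeps the clean-tree case a true no-op for the caller.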
/// Shared lookup tables passed to [`render_item_line`] to keep the argument count manageable. /// Shared lookup tables passed to [`render_item_line`] to keep the argument count manageable.
@@ -259,8 +256,10 @@ fn render_item_line(
} else { } else {
Some(item.name.as_str()) Some(item.name.as_str())
}; };
// Use the typed CRDT stage as the sole source of truth (story 945). // Use the new Pipeline + Status helpers (story 1085).
let frozen = matches!(item.stage, Stage::Frozen { .. }); let pipeline = item.stage.pipeline();
let status = item.stage.status();
let frozen = status == Status::Frozen;
let base_label = super::story_short_label(story_id, name_opt); let base_label = super::story_short_label(story_id, name_opt);
let display = if frozen { let display = if frozen {
format!("\u{2744}\u{FE0F} {base_label}") // ❄️ prefix format!("\u{2744}\u{FE0F} {base_label}") // ❄️ prefix
@@ -281,41 +280,52 @@ fn render_item_line(
format!(" *(waiting on: {})*", nums.join(", "))
};
-// Closed-stage items (abandoned / superseded / rejected) each get a
// Closed-pipeline items (abandoned / superseded / rejected) each get a
// distinct indicator and optionally display their metadata.
-match &item.stage {
match status {
-Stage::Abandoned { .. } => {
Status::Abandoned => {
return format!(" \u{1F5D1}\u{FE0F} {display}{cost_suffix}\n"); // 🗑️
}
-Stage::Superseded { superseded_by, .. } => {
Status::Superseded => {
let superseded_by = match &item.stage {
Stage::Superseded { superseded_by, .. } => superseded_by.0.as_str(),
_ => "",
};
return format!(
-" \u{1F500} {display}{cost_suffix} — superseded by {}\n", // 🔀
-superseded_by.0
" \u{1F500} {display}{cost_suffix} — superseded by {superseded_by}\n", // 🔀
);
}
-Stage::Rejected { reason, .. } => {
Status::Rejected => {
let reason = match &item.stage {
Stage::Rejected { reason, .. } => reason.as_str(),
_ => "",
};
let snippet = first_non_empty_snippet(reason, 120);
return format!(" \u{1F6AB} {display}{cost_suffix}{snippet}\n"); // 🚫
}
_ => {}
}
-// Merge-stage items get dedicated breakdown indicators instead of the
// Merge-column items get dedicated breakdown indicators instead of the
// generic traffic-light dot. MergeFailure / MergeFailureFinal items
-// now also appear in the Merge section (in-place) so they are handled
-// here alongside normal Merge items.
// appear in the Merge column (in-place) and are handled by the same arm.
-if matches!(
-item.stage,
-Stage::Merge { .. } | Stage::MergeFailure { .. } | Stage::MergeFailureFinal { .. }
-) {
if pipeline == Pipeline::Merge {
-match &item.stage {
match status {
// MergeFailureFinal: mergemaster already tried and gave up — always ⛔.
-Stage::MergeFailureFinal { kind } => {
Status::MergeFailureFinal => {
let kind = match &item.stage {
Stage::MergeFailureFinal { kind } => kind,
_ => unreachable!(),
};
let snippet = first_non_empty_snippet(&kind.display_reason(), 120);
return format!(" \u{26D4} {display}{cost_suffix}{dep_suffix}{snippet}\n");
}
// MergeFailure: a recovery agent may be running or queued.
-Stage::MergeFailure { kind, .. } => {
Status::MergeFailure => {
let kind = match &item.stage {
Stage::MergeFailure { kind, .. } => kind,
_ => unreachable!(),
};
return match agent.map(|a| &a.status) {
Some(AgentStatus::Running) => format!(
" \u{1F916} {display}{cost_suffix}{dep_suffix} — mergemaster running\n"
@@ -352,16 +362,7 @@ fn render_item_line(
}
}
-let blocked = matches!(
-item.stage,
-Stage::Blocked { .. }
-| Stage::MergeFailure { .. }
-| Stage::MergeFailureFinal { .. }
-| Stage::Archived {
-reason: ArchiveReason::Blocked { .. },
-..
-}
-);
let blocked = status == Status::Blocked;
// Blocked items with a recovery agent get differentiated indicators.
if blocked {
return match agent.map(|a| &a.status) {
@@ -378,9 +379,9 @@ fn render_item_line(
.and_then(|a| a.throttled)
.is_some_and(|until| until > chrono::Utc::now());
let dot = super::traffic_light_dot(blocked, throttled, agent.is_some());
-let wt_lines = dirty_files_by_story
let wt_suffix = dirty_files_by_story
.get(story_id)
-.map(render_working_tree_lines)
.map(working_tree_suffix)
.unwrap_or_default();
if let Some(agent) = agent {
let model_str = config
@@ -389,10 +390,10 @@ fn render_item_line(
.and_then(|ac| ac.model.as_ref().map(|m| m.as_str()))
.unwrap_or("?");
format!(
-" {dot}{display}{cost_suffix}{dep_suffix} — {} ({model_str})\n{wt_lines}",
" {dot}{display}{cost_suffix}{dep_suffix} — {} ({model_str}){wt_suffix}\n",
agent.agent_name
)
} else {
-format!(" {dot}{display}{cost_suffix}{dep_suffix}\n{wt_lines}")
format!(" {dot}{display}{cost_suffix}{dep_suffix}{wt_suffix}\n")
}
}
@@ -41,7 +41,16 @@ pub(in crate::chat::transport::matrix::bot) async fn handle_message(
let all_lines: Vec<String> = sled_guard.drain(..).chain(gtw_guard.drain(..)).collect();
drop(sled_guard);
drop(gtw_guard);
-format_drained_events(all_lines)
slog!(
"[matrix-bot] drained {} gateway audit lines for LLM context",
all_lines.len()
);
let prefix = format_drained_events(all_lines);
slog!(
"[matrix-bot] format_drained_events output: {} bytes",
prefix.len()
);
prefix
};
// The prompt is just the current message with sender attribution.
+119 -6
@@ -326,21 +326,49 @@ pub async fn run_bot(
}
// Subscribe to gateway-side status events and buffer compact audit lines for
-// the LLM context. A separate resubscribed receiver is used so both the
-// buffer task and the room-forwarder task receive every event independently.
// the LLM context.
//
// Investigation log (story 1078) — hypotheses ruled out:
// (A) gateway_event_rx is None: impossible — spawn_gateway_bot always passes
// Some(state.event_tx.clone()) in gateway mode (gateway/mod.rs:130).
// (B) recv() never returns: buf task uses the ORIGINAL event_rx (subscribed
// before Matrix init) so any events buffered during init are visible;
// future events arrive normally via the shared broadcast channel.
// (C) Different Arc: buf and ctx.pending_gateway_events are both clones of
// the same Arc<TokioMutex<Vec<String>>> — writes in the buf task are
// immediately visible to handle_message.
// (D) format_drained_events empty on non-empty input: the function is
// pure/tested; the drain slog in handle_message now makes the count
// observable so we can confirm it is non-zero when events arrive.
//
// Bug fixed here: previously the buffer task held `event_rx.resubscribe()`,
// which starts at the *current tail* (next unsent message) and silently
// discards every event that arrived during the Matrix login / room-join /
// cross-signing phase (~530 s window). The forwarder now gets the
// resubscribed receiver (only needs live events going forward); the buffer
// task holds the original `event_rx` so it drains the init-window backlog
// on first poll.
let pending_gateway_events: Arc<TokioMutex<Vec<String>>> =
Arc::new(TokioMutex::new(Vec::new()));
let gateway_event_rx_for_forwarder = if let Some(event_rx) = gateway_event_rx {
-// Buffer task: silently accumulate compact audit lines for Timmy's context.
// The forwarder only needs live (future) events — resubscribe is fine.
let forwarder_rx = event_rx.resubscribe();
// Buffer task: hold the *original* receiver so init-window events are
// not lost. Silently accumulate compact audit lines for Timmy's context.
{
use crate::service::gateway::polling::format_gateway_audit_line;
-let buf_rx = event_rx.resubscribe();
let buf = Arc::clone(&pending_gateway_events);
slog!("[matrix-bot] subscribed to gateway events; buffer task starting");
tokio::spawn(async move {
-let mut rx = buf_rx;
let mut rx = event_rx;
loop {
match rx.recv().await {
Ok(event) => {
slog!(
"[matrix-bot] buffered audit line for project={} id={}",
event.project,
event.event.timestamp_ms()
);
let line = format_gateway_audit_line(&event.project, &event.event);
buf.lock().await.push(line);
}
@@ -352,7 +380,7 @@ pub async fn run_bot(
}
});
}
-Some(event_rx)
Some(forwarder_rx)
} else {
None
};
@@ -592,4 +620,89 @@ mod tests {
assert_eq!(steps[2], 20);
assert_eq!(steps[3], 40);
}
/// Regression test (story 1078): gateway broadcast events must reach
/// `pending_gateway_events` and produce an `audit ts=…` line in the
/// `format_drained_events` output that is prepended to Timmy's prompt.
///
/// The test spins up a mock `event_tx` broadcaster, sends one
/// `StageTransition` event, lets the buffer task process it, drains the
/// buffer, and asserts the result contains the expected audit prefix.
#[tokio::test]
async fn gateway_buffer_task_injects_audit_line_into_context() {
use super::super::messages::format_drained_events;
use crate::service::events::StoredEvent;
use crate::service::gateway::GatewayStatusEvent;
use crate::service::gateway::polling::format_gateway_audit_line;
let (event_tx, event_rx) = tokio::sync::broadcast::channel::<GatewayStatusEvent>(16);
// pending_gateway_events shared between buffer task and drain site.
let pending: Arc<TokioMutex<Vec<String>>> = Arc::new(TokioMutex::new(Vec::new()));
// Spawn a minimal buffer task — same logic as run_bot uses.
{
let buf = Arc::clone(&pending);
tokio::spawn(async move {
let mut rx = event_rx;
loop {
match rx.recv().await {
Ok(event) => {
let line = format_gateway_audit_line(&event.project, &event.event);
buf.lock().await.push(line);
}
Err(tokio::sync::broadcast::error::RecvError::Lagged(_)) => {}
Err(tokio::sync::broadcast::error::RecvError::Closed) => break,
}
}
});
}
// Send one stage-transition event, as a project node would.
let evt = GatewayStatusEvent {
project: "huskies".to_string(),
event: StoredEvent::StageTransition {
story_id: "42_story_feat".to_string(),
story_name: String::new(),
from_stage: "2_current".to_string(),
to_stage: "3_qa".to_string(),
timestamp_ms: 1_000_000,
},
};
let receivers = event_tx.send(evt).unwrap_or(0);
assert!(
receivers > 0,
"event must have at least one active receiver"
);
// Wait for the buffer task to process the event.
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(2);
loop {
if !pending.lock().await.is_empty() {
break;
}
assert!(
std::time::Instant::now() < deadline,
"buffer task did not receive the event within 2 s"
);
tokio::time::sleep(std::time::Duration::from_millis(10)).await;
}
// Drain and format — mirrors what handle_message does.
let lines: Vec<String> = pending.lock().await.drain(..).collect();
let prefix = format_drained_events(lines);
assert!(
prefix.contains("audit ts="),
"prompt prefix must contain 'audit ts='; got: {prefix}"
);
assert!(
prefix.contains("project=huskies"),
"prompt prefix must name the project; got: {prefix}"
);
assert!(
prefix.starts_with("<system-reminder>\n"),
"prefix must open with <system-reminder>; got: {prefix}"
);
}
}
+11
@@ -161,6 +161,12 @@ pub struct WatcherConfig {
/// moved to `6_archived/`. Default: 14400 (4 hours).
#[serde(default = "default_done_retention_secs")]
pub done_retention_secs: u64,
/// How often (in seconds) the periodic reconciler runs to converge
/// subscriber side effects. The reconciler calls each subscriber's
/// `reconcile()` entry point so that Lagged events never leave persistent
/// state diverged. Default: 30 seconds.
#[serde(default = "default_reconcile_interval_secs")]
pub reconcile_interval_secs: u64,
}
impl Default for WatcherConfig {
@@ -168,6 +174,7 @@ impl Default for WatcherConfig {
Self {
sweep_interval_secs: default_sweep_interval_secs(),
done_retention_secs: default_done_retention_secs(),
reconcile_interval_secs: default_reconcile_interval_secs(),
}
}
}
@@ -180,6 +187,10 @@ fn default_done_retention_secs() -> u64 {
4 * 60 * 60 // 4 hours
}
fn default_reconcile_interval_secs() -> u64 {
30
}
fn default_qa() -> String {
"server".to_string()
}
+2 -1
@@ -56,7 +56,8 @@ pub use write::{
bump_retry_count, migrate_legacy_stage_strings, migrate_merge_job, migrate_names_from_slugs,
migrate_node_claims_to_agent_claims, migrate_story_ids_to_numeric, name_from_story_id,
purge_done_stage_merge_jobs, set_agent, set_depends_on, set_epic, set_item_type, set_name,
-set_plan_state, set_qa_mode, set_resume_to, set_resume_to_raw, set_retry_count, write_item,
set_origin, set_plan_state, set_qa_mode, set_resume_to, set_resume_to_raw, set_retry_count,
write_item,
};
#[cfg(test)]
+33 -28
@@ -29,6 +29,8 @@ pub struct CrdtItemDump {
/// Hex-encoded OpId of the list insert op — cross-reference with `crdt_ops`.
pub content_index: String,
pub is_deleted: bool,
/// Origin JSON string, or `None` for items that pre-date story 1088.
pub origin: Option<String>,
}
/// Top-level debug dump of the in-memory CRDT state.
@@ -149,6 +151,10 @@ pub fn dump_crdt_state(story_id_filter: Option<&str>) -> CrdtStateDump {
JsonValue::Number(n) if n > 0.0 => Some(n),
_ => None,
};
let origin = match item_crdt.origin.view() {
JsonValue::String(s) if !s.is_empty() => Some(s),
_ => None,
};
let content_index = op.id.iter().map(|b| format!("{b:02x}")).collect::<String>();
@@ -163,6 +169,7 @@ pub fn dump_crdt_state(story_id_filter: Option<&str>) -> CrdtStateDump {
claim_ts,
content_index,
is_deleted: op.is_deleted,
origin,
});
}
@@ -408,6 +415,11 @@ pub(super) fn extract_item_view(item: &PipelineItemCrdt) -> Option<PipelineItemV
_ => None,
};
let origin = match item.origin.view() {
JsonValue::String(s) if !s.is_empty() => Some(s),
_ => None,
};
let stage = project_stage_for_view(
&stage_str,
&story_id,
@@ -429,6 +441,7 @@ pub(super) fn extract_item_view(item: &PipelineItemCrdt) -> Option<PipelineItemV
qa_mode,
item_type,
epic,
origin,
})
}
@@ -585,56 +598,48 @@ fn project_stage_for_view(
}
}
-/// Check whether a dependency (by numeric ID prefix) is in `5_done` or `6_archived`
-/// according to CRDT state.
/// Check whether a dependency (by numeric ID prefix) is in `Pipeline::Done` or
/// `Pipeline::Archived` according to CRDT state.
///
-/// Returns `true` if the dependency is satisfied (item found in a done stage).
-/// Matches both legacy slug-form IDs (`"664_story_foo"`) and numeric-only IDs
-/// (`"664"`) so the check remains correct after the slug→numeric migration.
-/// See `dep_is_archived_crdt` to distinguish archive-satisfied from cleanly-done.
/// Returns `true` if the dependency is satisfied (item found in a Done or
/// Archived pipeline column). Matches both legacy slug-form IDs
/// (`"664_story_foo"`) and numeric-only IDs (`"664"`) so the check remains
/// correct after the slug→numeric migration. Story 1086 routes the check
/// through the `Pipeline` projection so that future Stage variants automatically
/// participate via [`crate::pipeline_state::Stage::pipeline`]. See
/// `dep_is_archived_crdt` to distinguish archive-satisfied from cleanly-done.
pub fn dep_is_done_crdt(dep_number: u32) -> bool {
-use crate::pipeline_state::{Stage, read_all_typed};
use crate::pipeline_state::{Pipeline, read_all_typed};
let exact = dep_number.to_string();
let prefix = format!("{dep_number}_");
read_all_typed().into_iter().any(|item| {
(item.story_id.0 == exact || item.story_id.0.starts_with(&prefix))
-&& matches!(
-item.stage,
-Stage::Done { .. }
-| Stage::Archived { .. }
-| Stage::Abandoned { .. }
-| Stage::Superseded { .. }
-| Stage::Rejected { .. }
-)
&& matches!(item.stage.pipeline(), Pipeline::Done | Pipeline::Archived)
})
}
-/// Check whether a dependency (by numeric ID prefix) is specifically in `6_archived`
-/// according to CRDT state.
/// Check whether a dependency (by numeric ID prefix) is specifically in
/// `Pipeline::Archived` according to CRDT state.
///
/// Used to detect when a dependency is satisfied via archive rather than via a clean
-/// completion through `5_done`. Returns `false` when the CRDT layer is not initialised.
-/// Matches both legacy slug-form IDs (`"664_story_foo"`) and numeric-only IDs (`"664"`).
/// completion through `Pipeline::Done`. Returns `false` when the CRDT layer is not
/// initialised. Matches both legacy slug-form IDs (`"664_story_foo"`) and
/// numeric-only IDs (`"664"`).
pub fn dep_is_archived_crdt(dep_number: u32) -> bool {
-use crate::pipeline_state::{Stage, read_all_typed};
use crate::pipeline_state::{Pipeline, read_all_typed};
let exact = dep_number.to_string();
let prefix = format!("{dep_number}_");
read_all_typed().into_iter().any(|item| {
(item.story_id.0 == exact || item.story_id.0.starts_with(&prefix))
-&& matches!(
-item.stage,
-Stage::Archived { .. }
-| Stage::Abandoned { .. }
-| Stage::Superseded { .. }
-| Stage::Rejected { .. }
-)
&& item.stage.pipeline() == Pipeline::Archived
})
}
/// Check unmet dependencies for a story by reading its `depends_on` from the
/// CRDT document and checking each dependency against CRDT state.
///
-/// Returns the list of dependency numbers that are NOT in `5_done` or `6_archived`.
/// Returns the list of dependency numbers whose stage is NOT in `Pipeline::Done`
/// or `Pipeline::Archived`.
pub fn check_unmet_deps_crdt(story_id: &str) -> Vec<u32> {
let item = match read_item(story_id) {
Some(i) => i,
+30
@@ -105,6 +105,26 @@ pub struct PipelineItemCrdt {
/// means no merge task is in flight. Projected into `Stage::Merge {
/// server_start_time }` so callers never read this register directly.
pub merge_server_start: LwwRegisterCrdt<f64>,
/// Story 1086: kebab-case wire form of the [`crate::pipeline_state::Pipeline`]
/// projection of the current `stage`. Written by `write_item` alongside
/// `stage` so display/scan code on remote peers can route by pipeline column
/// without re-deriving from the stage string. Empty string means "use the
/// value derived from `stage`" (legacy items predating 1086).
pub pipeline: LwwRegisterCrdt<String>,
/// Story 1086: kebab-case wire form of the [`crate::pipeline_state::Status`]
/// projection of the current `stage`. Written alongside `stage` so badge
/// renderers can read the status directly without re-projecting from the
/// stage string. Empty string means "use the value derived from `stage`"
/// (legacy items predating 1086).
pub status: LwwRegisterCrdt<String>,
/// Story 1088: origin of the work item — who or what created it.
///
/// Stored as a compact JSON string, e.g.
/// `{"kind":"user","id":"","ts":1716768000.0}` or
/// `{"kind":"agent","id":"coder-1","ts":1716768000.0}`.
/// Empty string on older items that pre-date this register; the typed
/// read path surfaces those as `None`, which the UI renders as `"unknown"`.
pub origin: LwwRegisterCrdt<String>,
}
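The origin register's wire form is plain JSON carried in a string LWW register. A hypothetical std-only sketch of the encoding and of the empty-string-means-`None` read convention described above (`origin_json` and `origin_view` are illustrative names; real code would likely use a JSON library):

```rust
// Hypothetical encoder for the compact origin wire form shown in the doc
// comment above, e.g. {"kind":"agent","id":"coder-1","ts":1716768000.0}.
// Assumes `kind` and `id` never need JSON escaping (true for the examples).
fn origin_json(kind: &str, id: &str, ts: f64) -> String {
    format!("{{\"kind\":\"{kind}\",\"id\":\"{id}\",\"ts\":{ts:?}}}")
}

// The typed read path surfaces the empty string (pre-1088 items) as None.
fn origin_view(raw: &str) -> Option<&str> {
    if raw.is_empty() { None } else { Some(raw) }
}

fn main() {
    let user = origin_json("user", "", 1716768000.0);
    assert_eq!(user, r#"{"kind":"user","id":"","ts":1716768000.0}"#);
    assert_eq!(origin_view(""), None); // legacy item → UI renders "unknown"
    assert!(origin_view(&user).is_some());
    println!("{user}");
}
```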
/// CRDT node that holds a single peer's presence entry.
@@ -203,6 +223,9 @@ pub struct WorkItem {
pub(super) item_type: Option<crate::io::story_metadata::ItemType>,
/// Epic this item belongs to. `None` when the item has no parent epic.
pub(super) epic: Option<EpicId>,
/// Origin of the work item (story 1088). `None` for items created before
/// the origin register was introduced; those display as `"unknown"`.
pub(super) origin: Option<String>,
}
impl WorkItem {
@@ -261,6 +284,12 @@ impl WorkItem {
self.epic
}
/// Origin of the work item (story 1088), or `None` for items created before
/// the origin register was introduced.
pub fn origin(&self) -> Option<&str> {
self.origin.as_deref()
}
/// Construct a `WorkItem` for use in tests outside `crdt_state::*`.
///
/// Within `crdt_state` use a struct literal directly (fields are `pub(super)`).
@@ -286,6 +315,7 @@ impl WorkItem {
qa_mode,
item_type,
epic,
origin: None,
}
}
}
+46
@@ -235,6 +235,31 @@ pub fn set_plan_state(story_id: &str, state: crate::pipeline_state::PlanState) -
true
}
/// Set the `origin` CRDT register for a pipeline item (story 1088).
///
/// Writes a compact JSON string describing who or what created the item, e.g.
/// `{"kind":"user","id":"","ts":1716768000.0}` or
/// `{"kind":"agent","id":"coder-1","ts":1716768000.0}`.
///
/// Passing an empty string is treated as "no origin set" (equivalent to the
/// pre-1088 state for older items). Returns `true` if the item was found and
/// the op was applied, `false` otherwise.
pub fn set_origin(story_id: &str, origin: &str) -> bool {
let Some(state_mutex) = get_crdt() else {
return false;
};
let Ok(mut state) = state_mutex.lock() else {
return false;
};
let Some(&idx) = state.index.get(story_id) else {
return false;
};
apply_and_persist(&mut state, |s| {
s.crdt.doc.items[idx].origin.set(origin.to_string())
});
true
}
/// Write a pipeline item state through CRDT operations.
///
/// If the item exists, updates its registers. If not, inserts a new item
@@ -256,6 +281,11 @@ pub fn write_item(
merged_at: Option<f64>,
) {
let stage_str = stage_dir_name(stage);
// Story 1086: persist the typed Pipeline + Status projections alongside
// the stage register so subscribers/display code on remote peers can route
// by them without re-deriving from the stage string.
let pipeline_str = stage.pipeline().as_str();
let status_str = stage.status().as_str();
let claim: Option<&AgentClaim> = match stage {
Stage::Coding { claim, .. } => claim.as_ref(),
Stage::Merge { claim, .. } => claim.as_ref(),
@@ -311,6 +341,14 @@ pub fn write_item(
apply_and_persist(&mut state, |s| {
s.crdt.doc.items[idx].stage.set(stage_str.to_string())
});
// Story 1086: keep `pipeline` and `status` registers in lock-step with
// the stage write so subscribers/display can read them directly.
apply_and_persist(&mut state, |s| {
s.crdt.doc.items[idx].pipeline.set(pipeline_str.to_string())
});
apply_and_persist(&mut state, |s| {
s.crdt.doc.items[idx].status.set(status_str.to_string())
});
if let Some(n) = name {
apply_and_persist(&mut state, |s| {
@@ -394,6 +432,10 @@ pub fn write_item(
"resume_to": "",
"plan_state": "",
"merge_server_start": merge_server_start_val,
// Story 1086: typed Pipeline + Status projections written at insert.
"pipeline": pipeline_str,
"status": status_str,
"origin": "",
})
.into();
@@ -424,6 +466,10 @@ pub fn write_item(
item.resume_to.advance_seq(floor);
item.plan_state.advance_seq(floor);
item.merge_server_start.advance_seq(floor);
// Story 1086.
item.pipeline.advance_seq(floor);
item.status.advance_seq(floor);
item.origin.advance_seq(floor);
}
// Broadcast a CrdtEvent for the new item.
+2 -2
@@ -10,8 +10,8 @@ mod migrations;
mod tests;
pub use item::{
-bump_retry_count, set_agent, set_depends_on, set_epic, set_item_type, set_name, set_plan_state,
-set_qa_mode, set_resume_to, set_resume_to_raw, set_retry_count, write_item,
bump_retry_count, set_agent, set_depends_on, set_epic, set_item_type, set_name, set_origin,
set_plan_state, set_qa_mode, set_resume_to, set_resume_to_raw, set_retry_count, write_item,
};
#[cfg(test)]
+1
@@ -434,6 +434,7 @@ async fn handle_work_items_get(params: Value) -> Value {
"stage": c.stage,
"name": c.name,
"agent": c.agent,
"origin": c.origin,
}),
Err(e) => serde_json::json!({"error": e.to_string()}),
}
+11 -9
@@ -12,7 +12,7 @@
//! zombie entries left over from sessions that predate the subscriber.
use crate::db::{ContentKey, all_content_ids, delete_content};
-use crate::pipeline_state::Stage;
use crate::pipeline_state::{Pipeline, Stage, Status};
use crate::slog;
use crate::slog_warn;
@@ -111,16 +111,18 @@ pub(crate) fn sweep_zombie_content_on_startup() {
}
}
-/// Return `true` when `stage` is one of the five terminal pipeline stages.
/// Return `true` when `stage` is one of the terminal pipeline classifications.
///
/// Story 1086: matches via the [`Status`] projection (Done / Abandoned /
/// Superseded / Rejected) plus [`Pipeline::Archived`] for plain archived items
/// (which carry `Status::Active`). Future Stage variants automatically
/// participate by returning the appropriate Status / Pipeline from
/// [`Stage::status`] / [`Stage::pipeline`].
fn is_terminal_stage(stage: &Stage) -> bool {
matches!(
-stage,
-Stage::Done { .. }
-| Stage::Archived { .. }
-| Stage::Abandoned { .. }
-| Stage::Superseded { .. }
-| Stage::Rejected { .. }
-)
stage.status(),
Status::Done | Status::Abandoned | Status::Superseded | Status::Rejected
) || matches!(stage.pipeline(), Pipeline::Archived)
}
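The projection pattern behind `is_terminal_stage` can be sketched in isolation. This is a simplified model with invented variants and no payloads, not the crate's actual `Stage`/`Pipeline`/`Status` definitions:

```rust
// Illustrative sketch of the story-1086 pattern: terminality is decided from
// coarse (pipeline, status) projections instead of enumerating Stage variants.
enum Pipeline { Coding, Done, Closed, Archived }
enum Status { Active, Done, Abandoned, Superseded, Rejected }

enum Stage { Coding, Done, Abandoned, Archived }

impl Stage {
    fn pipeline(&self) -> Pipeline {
        match self {
            Stage::Coding => Pipeline::Coding,
            Stage::Done => Pipeline::Done,
            Stage::Abandoned => Pipeline::Closed,
            Stage::Archived => Pipeline::Archived,
        }
    }
    fn status(&self) -> Status {
        match self {
            Stage::Done => Status::Done,
            Stage::Abandoned => Status::Abandoned,
            // Plain archived items stay Active in the Status projection.
            _ => Status::Active,
        }
    }
}

fn is_terminal_stage(stage: &Stage) -> bool {
    matches!(
        stage.status(),
        Status::Done | Status::Abandoned | Status::Superseded | Status::Rejected
    ) || matches!(stage.pipeline(), Pipeline::Archived)
}

fn main() {
    assert!(is_terminal_stage(&Stage::Done));
    assert!(is_terminal_stage(&Stage::Archived)); // Status::Active, but archived
    assert!(!is_terminal_stage(&Stage::Coding));
    println!("terminal check routes through the projections");
}
```

A new `Stage` variant only needs correct `pipeline()`/`status()` arms to participate; the terminal check itself never changes.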
#[cfg(test)]
+321 -1
@@ -29,7 +29,7 @@ pub mod shadow_write;
pub use content_store::{ContentKey, all_content_ids, delete_content, read_content, write_content};
pub use ops::{ItemMeta, delete_item, move_item_stage, next_item_number, write_item_with_content};
-pub use shadow_write::{get_shared_pool, init};
pub use shadow_write::{check_schema_drift, get_shared_pool, init};
#[cfg(test)]
pub use content_store::ensure_content_store;
@@ -395,6 +395,112 @@ mod tests {
);
}
/// Regression: root cause of the 2026-05-14 21:07 production outage.
///
/// A headless agent on a feature branch (whose binary includes a new
/// sqlx migration) must NEVER apply that migration to the production
/// pipeline.db. Verify that opening an agent-local DB and running
/// migrations on it leaves the production DB's `_sqlx_migrations` table
/// unchanged.
///
/// The enforcement mechanism is in `init_subsystems(is_agent=true)`, which
/// redirects to a temp path. This test validates the SQLite isolation
/// property: migrations applied to one file are confined to that file.
#[tokio::test]
async fn agent_db_isolation_does_not_affect_production_db() {
let tmp = tempfile::tempdir().unwrap();
let prod_db_path = tmp.path().join("production.db");
let agent_db_path = tmp.path().join("agent_temp.db");
// Set up the production DB — apply the current compiled-in migrations.
let prod_opts = sqlx::sqlite::SqliteConnectOptions::new()
.filename(&prod_db_path)
.create_if_missing(true);
let prod_pool = sqlx::SqlitePool::connect_with(prod_opts).await.unwrap();
sqlx::migrate!("./migrations")
.run(&prod_pool)
.await
.unwrap();
// Record the migration versions present in the production DB.
let before: Vec<(i64,)> =
sqlx::query_as("SELECT version FROM _sqlx_migrations ORDER BY version")
.fetch_all(&prod_pool)
.await
.unwrap();
// Simulate the agent opening its own isolated DB and running migrations.
let agent_opts = sqlx::sqlite::SqliteConnectOptions::new()
.filename(&agent_db_path)
.create_if_missing(true);
let agent_pool = sqlx::SqlitePool::connect_with(agent_opts).await.unwrap();
sqlx::migrate!("./migrations")
.run(&agent_pool)
.await
.unwrap();
// Production DB must be completely unaffected by the agent's migration run.
let after: Vec<(i64,)> =
sqlx::query_as("SELECT version FROM _sqlx_migrations ORDER BY version")
.fetch_all(&prod_pool)
.await
.unwrap();
assert_eq!(
before, after,
"agent opening its own DB must not alter the production DB migration table"
);
}
/// Verify that `check_schema_drift` returns an empty list when all
/// migrations in the database are recognised by this binary.
#[tokio::test]
async fn check_schema_drift_empty_when_all_known() {
let tmp = tempfile::tempdir().unwrap();
let db_path = tmp.path().join("drift_test.db");
let opts = sqlx::sqlite::SqliteConnectOptions::new()
.filename(&db_path)
.create_if_missing(true);
let pool = sqlx::SqlitePool::connect_with(opts).await.unwrap();
sqlx::migrate!("./migrations").run(&pool).await.unwrap();
let drift = super::shadow_write::check_schema_drift(&pool).await;
assert!(
drift.is_empty(),
"no drift expected when DB matches the compiled-in migration set"
);
}
/// Verify that `check_schema_drift` identifies a manually-inserted
/// migration row that is not part of the compiled-in set.
#[tokio::test]
async fn check_schema_drift_detects_unknown_migration() {
let tmp = tempfile::tempdir().unwrap();
let db_path = tmp.path().join("drift_future.db");
let opts = sqlx::sqlite::SqliteConnectOptions::new()
.filename(&db_path)
.create_if_missing(true);
let pool = sqlx::SqlitePool::connect_with(opts).await.unwrap();
sqlx::migrate!("./migrations").run(&pool).await.unwrap();
// Inject a fake "future" migration that no binary compiled today would know.
let fake_checksum: Vec<u8> = vec![0u8; 20];
sqlx::query(
"INSERT INTO _sqlx_migrations \
(version, description, installed_on, success, checksum, execution_time) \
VALUES (99999999999999, 'future_migration', '2099-01-01T00:00:00Z', 1, ?1, 0)",
)
.bind(&fake_checksum)
.execute(&pool)
.await
.unwrap();
let drift = super::shadow_write::check_schema_drift(&pool).await;
assert_eq!(drift.len(), 1, "exactly one unknown migration expected");
assert_eq!(drift[0].version, 99999999999999_i64);
assert_eq!(drift[0].description, "future_migration");
}
/// Story 864: passing `ItemMeta::default()` against a content blob that
/// LOOKS like front-matter must NOT silently extract metadata into the
/// CRDT. The whole point of removing the implicit YAML round-trip is
@@ -482,4 +588,218 @@ mod tests {
"retry_count must reset to 0 on stage transition"
);
}
/// Story 1087, AC2: the split-stage migration projects every supported
/// wire-form `stage` string into the canonical `(pipeline, status)` pair.
/// The fixture covers each Stage variant (and the legacy numeric-prefix
/// directory names retained for back-compat).
#[tokio::test]
async fn split_stage_migration_backfills_pipeline_and_status_for_every_variant() {
let tmp = tempfile::tempdir().unwrap();
let db_path = tmp.path().join("pipeline.db");
let opts = sqlx::sqlite::SqliteConnectOptions::new()
.filename(&db_path)
.create_if_missing(true);
let pool = sqlx::SqlitePool::connect_with(opts).await.unwrap();
sqlx::migrate!("./migrations").run(&pool).await.unwrap();
// (stage written by older code, expected pipeline, expected status)
let fixture: &[(&str, &str, &str)] = &[
("upcoming", "backlog", "active"),
("backlog", "backlog", "active"),
("coding", "coding", "active"),
("blocked", "coding", "blocked"),
("qa", "qa", "active"),
("review_hold", "qa", "review-hold"),
("merge", "merge", "active"),
("merge_failure", "merge", "merge-failure"),
("merge_failure_final", "merge", "merge-failure-final"),
("done", "done", "done"),
("abandoned", "closed", "abandoned"),
("superseded", "closed", "superseded"),
("rejected", "closed", "rejected"),
("archived", "archived", "active"),
("frozen", "coding", "frozen"),
// Legacy numeric-prefix directory names.
("1_backlog", "backlog", "active"),
("2_current", "coding", "active"),
("3_qa", "qa", "active"),
("4_merge", "merge", "active"),
("5_done", "done", "done"),
("6_archived", "archived", "active"),
];
let now = chrono::Utc::now().to_rfc3339();
for (idx, (stage, _, _)) in fixture.iter().enumerate() {
let id = format!("1087_fixture_{idx}");
sqlx::query(
"INSERT INTO pipeline_items \
(id, name, stage, agent, retry_count, depends_on, content, created_at, updated_at) \
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?8)",
)
.bind(&id)
.bind("fixture")
.bind(*stage)
.bind(Option::<String>::None)
.bind(Option::<i64>::None)
.bind(Option::<String>::None)
.bind("---\nname: fixture\n---\n")
.bind(&now)
.execute(&pool)
.await
.unwrap();
}
// Force the split-stage backfill to run against the rows we just
// inserted. In production this is `sqlx::migrate!`'s job, but the
// sqlx migrator only runs migrations once per DB and they were already
// applied at the top of the test before any rows existed. Reissuing
// the backfill statements is the migration logic under test.
sqlx::query(
"UPDATE pipeline_items SET pipeline = CASE stage \
WHEN 'upcoming' THEN 'backlog' \
WHEN 'backlog' THEN 'backlog' \
WHEN '1_backlog' THEN 'backlog' \
WHEN 'coding' THEN 'coding' \
WHEN 'blocked' THEN 'coding' \
WHEN '2_current' THEN 'coding' \
WHEN 'qa' THEN 'qa' \
WHEN 'review_hold' THEN 'qa' \
WHEN '3_qa' THEN 'qa' \
WHEN 'merge' THEN 'merge' \
WHEN 'merge_failure' THEN 'merge' \
WHEN 'merge_failure_final' THEN 'merge' \
WHEN '4_merge' THEN 'merge' \
WHEN 'done' THEN 'done' \
WHEN '5_done' THEN 'done' \
WHEN 'abandoned' THEN 'closed' \
WHEN 'superseded' THEN 'closed' \
WHEN 'rejected' THEN 'closed' \
WHEN 'archived' THEN 'archived' \
WHEN '6_archived' THEN 'archived' \
WHEN 'frozen' THEN 'coding' \
ELSE '' END",
)
.execute(&pool)
.await
.unwrap();
sqlx::query(
"UPDATE pipeline_items SET status = CASE stage \
WHEN 'frozen' THEN 'frozen' \
WHEN 'review_hold' THEN 'review-hold' \
WHEN 'blocked' THEN 'blocked' \
WHEN 'merge_failure' THEN 'merge-failure' \
WHEN 'merge_failure_final' THEN 'merge-failure-final' \
WHEN 'abandoned' THEN 'abandoned' \
WHEN 'superseded' THEN 'superseded' \
WHEN 'rejected' THEN 'rejected' \
WHEN 'done' THEN 'done' \
WHEN '5_done' THEN 'done' \
ELSE 'active' END",
)
.execute(&pool)
.await
.unwrap();
for (idx, (stage_input, expect_pipeline, expect_status)) in fixture.iter().enumerate() {
let id = format!("1087_fixture_{idx}");
let row: (String, String) =
sqlx::query_as("SELECT pipeline, status FROM pipeline_items WHERE id = ?1")
.bind(&id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(
row.0, *expect_pipeline,
"stage {stage_input:?} should backfill pipeline to {expect_pipeline:?}, got {:?}",
row.0
);
assert_eq!(
row.1, *expect_status,
"stage {stage_input:?} should backfill status to {expect_status:?}, got {:?}",
row.1
);
}
}
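The fixture's wire-form mapping can be read as a single pure projection. A standalone sketch follows; the `project` function is illustrative only — the production mapping lives in the migration's CASE expressions below and, by assumption, in the `Stage` enum's `pipeline()`/`status()` methods:

```rust
// Illustrative stage -> (pipeline, status) projection mirroring the fixture
// table above; not the crate's actual API.
fn project(stage: &str) -> (&'static str, &'static str) {
    match stage {
        "upcoming" | "backlog" | "1_backlog" => ("backlog", "active"),
        "coding" | "2_current" => ("coding", "active"),
        "blocked" => ("coding", "blocked"),
        "frozen" => ("coding", "frozen"),
        "qa" | "3_qa" => ("qa", "active"),
        "review_hold" => ("qa", "review-hold"),
        "merge" | "4_merge" => ("merge", "active"),
        "merge_failure" => ("merge", "merge-failure"),
        "merge_failure_final" => ("merge", "merge-failure-final"),
        "done" | "5_done" => ("done", "done"),
        "abandoned" => ("closed", "abandoned"),
        "superseded" => ("closed", "superseded"),
        "rejected" => ("closed", "rejected"),
        "archived" | "6_archived" => ("archived", "active"),
        // Mirrors the SQL ELSE branches: empty pipeline, active status.
        _ => ("", "active"),
    }
}
```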
/// Story 1087, AC1: `shadow_write::init` writes a timestamped backup of
/// pipeline.db before the split-stage migration applies, and skips the
/// backup on subsequent restarts (after the migration is recorded).
#[tokio::test]
async fn pre_pipeline_status_backup_only_runs_once() {
let tmp = tempfile::tempdir().unwrap();
let db_path = tmp.path().join("pipeline.db");
// Seed a "pre-1087" DB: the backup branch must see an `_sqlx_migrations`
// table that lists every migration EXCEPT the split-stage one.
let opts = sqlx::sqlite::SqliteConnectOptions::new()
.filename(&db_path)
.create_if_missing(true);
let pool = sqlx::SqlitePool::connect_with(opts).await.unwrap();
// Apply migrations the normal way, then delete the split-stage row so
// the backup branch fires on the next `init`.
sqlx::migrate!("./migrations").run(&pool).await.unwrap();
sqlx::query("DELETE FROM _sqlx_migrations WHERE version = 20260515000000")
.execute(&pool)
.await
.unwrap();
pool.close().await;
// First call: backup branch fires, side-car file appears.
super::shadow_write::backup_pre_pipeline_status(&db_path).await;
let backups: Vec<_> = std::fs::read_dir(tmp.path())
.unwrap()
.filter_map(Result::ok)
.filter(|e| {
e.file_name()
.to_string_lossy()
.contains(".pre-pipeline-status.")
})
.collect();
assert_eq!(
backups.len(),
1,
"expected exactly one .pre-pipeline-status backup, got {}",
backups.len()
);
// Re-apply the migration so the marker row is back, simulating a
// post-migration server restart.
let opts = sqlx::sqlite::SqliteConnectOptions::new()
.filename(&db_path)
.create_if_missing(false);
let pool = sqlx::SqlitePool::connect_with(opts).await.unwrap();
let fake_checksum: Vec<u8> = vec![0u8; 20];
sqlx::query(
"INSERT INTO _sqlx_migrations \
(version, description, installed_on, success, checksum, execution_time) \
VALUES (20260515000000, 'split_stage_into_pipeline_status', '2026-05-15T00:00:00Z', 1, ?1, 0)",
)
.bind(&fake_checksum)
.execute(&pool)
.await
.unwrap();
pool.close().await;
// Second call: no new backup written.
super::shadow_write::backup_pre_pipeline_status(&db_path).await;
let backups_after: Vec<_> = std::fs::read_dir(tmp.path())
.unwrap()
.filter_map(Result::ok)
.filter(|e| {
e.file_name()
.to_string_lossy()
.contains(".pre-pipeline-status.")
})
.collect();
assert_eq!(
backups_after.len(),
1,
"post-migration init must not create another backup; got {} backups",
backups_after.len()
);
}
}
+105
View File
@@ -11,10 +11,23 @@ use crate::slog;
use sqlx::SqlitePool;
use sqlx::sqlite::SqliteConnectOptions;
use std::collections::HashMap;
use std::collections::HashSet;
use std::path::Path;
use std::sync::OnceLock;
use tokio::sync::mpsc;
/// One migration row in the live database that is not in the compiled-in set.
///
/// Returned by [`check_schema_drift`] for each unknown migration.
pub struct UnknownMigration {
/// sqlx migration version number (derived from the filename timestamp).
pub version: i64,
/// Human-readable description from the migration filename.
pub description: String,
/// When the migration was applied, as stored in `_sqlx_migrations.installed_on`.
pub installed_on: String,
}
/// The process-global SQLite pool, set once by [`init`].
///
/// Other modules call [`get_shared_pool`] to access the pool without needing
@@ -56,6 +69,13 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
return Ok(());
}
// Story 1087: before running the migration that splits `stage` into
// (`pipeline`, `status`), take a timestamped side-car copy of the live DB
// so the pre-split state is recoverable. Skip the copy when the file does
// not yet exist (fresh installs) or when the split-stage migration has
// already been applied (subsequent restarts).
backup_pre_pipeline_status(db_path).await;
let options = SqliteConnectOptions::new()
.filename(db_path)
.create_if_missing(true);
@@ -133,3 +153,88 @@ pub async fn init(db_path: &Path) -> Result<(), sqlx::Error> {
let _ = PIPELINE_DB.set(PipelineDb { tx });
Ok(())
}
/// Story 1087: version of the split-stage migration, taken from the
/// timestamp prefix of its filename; the same `i64` sqlx records in the
/// `version` column of `_sqlx_migrations`.
const SPLIT_STAGE_MIGRATION_VERSION: i64 = 20260515000000;
/// Story 1087: take a timestamped side-car copy of `pipeline.db` if and only if
/// the split-stage migration has not yet been applied. This is the AC1 backup
/// — `pipeline.db.pre-pipeline-status.<unix-ts>.bak` next to the live file.
///
/// Failures are logged but never propagated: a missing backup must not block
/// the server from starting (a corrupt source file or a read-only directory
/// will be surfaced by the migration step itself).
pub(crate) async fn backup_pre_pipeline_status(db_path: &Path) {
if !db_path.exists() {
return;
}
// Cheap pre-check: open the DB read-only and see whether the split-stage
// migration version is recorded in `_sqlx_migrations`. If it is, the
// backup has already been taken on a previous start and there is nothing
// to do.
let options = SqliteConnectOptions::new()
.filename(db_path)
.read_only(true)
.create_if_missing(false);
let probe = SqlitePool::connect_with(options).await;
if let Ok(pool) = probe {
let already_split: Result<Option<(i64,)>, _> =
sqlx::query_as("SELECT version FROM _sqlx_migrations WHERE version = ?1 LIMIT 1")
.bind(SPLIT_STAGE_MIGRATION_VERSION)
.fetch_optional(&pool)
.await;
pool.close().await;
if let Ok(Some(_)) = already_split {
return;
}
}
let ts = chrono::Utc::now().timestamp();
let mut backup = db_path.as_os_str().to_owned();
backup.push(format!(".pre-pipeline-status.{ts}.bak"));
let backup_path = std::path::PathBuf::from(backup);
match tokio::fs::copy(db_path, &backup_path).await {
Ok(_) => slog!(
"[db] Wrote pre-pipeline-status backup of {} to {}",
db_path.display(),
backup_path.display(),
),
Err(e) => slog!(
"[db] Failed to write pre-pipeline-status backup of {}: {e}",
db_path.display(),
),
}
}
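The side-car naming scheme above can be isolated into a pure path builder. A minimal sketch (the `backup_path` helper name is illustrative, not part of this module):

```rust
use std::path::{Path, PathBuf};

// Builds `<db_path>.pre-pipeline-status.<unix-ts>.bak` next to the live file,
// matching the OsString-append approach in backup_pre_pipeline_status.
fn backup_path(db_path: &Path, unix_ts: i64) -> PathBuf {
    let mut name = db_path.as_os_str().to_owned();
    name.push(format!(".pre-pipeline-status.{unix_ts}.bak"));
    PathBuf::from(name)
}
```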
/// Compare the live `_sqlx_migrations` table against the compiled-in migration
/// set and return any rows whose version is not known to this binary.
///
/// A non-empty result means the database was previously opened by a newer
/// binary that applied additional migrations. The server must refuse to start
/// in that state because the schema may contain tables or columns that this
/// binary does not understand.
pub async fn check_schema_drift(pool: &SqlitePool) -> Vec<UnknownMigration> {
let migrator = sqlx::migrate!("./migrations");
let known: HashSet<i64> = migrator.migrations.iter().map(|m| m.version).collect();
let rows: Vec<(i64, String, String)> = sqlx::query_as(
"SELECT version, description, installed_on FROM _sqlx_migrations ORDER BY version",
)
.fetch_all(pool)
.await
.unwrap_or_default();
rows.into_iter()
.filter(|(v, _, _)| !known.contains(v))
.map(|(version, description, installed_on)| UnknownMigration {
version,
description,
installed_on,
})
.collect()
}
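A caller-side sketch of how a startup path might turn a non-empty drift report into a refusal; the guard function and its message format are assumptions here — `check_schema_drift` itself only returns the unknown rows:

```rust
// Hypothetical startup guard built on check_schema_drift's output; the
// struct mirrors the UnknownMigration fields defined above.
struct UnknownMigration {
    version: i64,
    description: String,
    installed_on: String,
}

fn refuse_if_drifted(drift: &[UnknownMigration]) -> Result<(), String> {
    if drift.is_empty() {
        return Ok(());
    }
    let listing: Vec<String> = drift
        .iter()
        .map(|m| format!("{} ({}, applied {})", m.version, m.description, m.installed_on))
        .collect();
    Err(format!(
        "refusing to start: {} migration(s) unknown to this binary: {}",
        drift.len(),
        listing.join("; ")
    ))
}
```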
+43 -3
View File
@@ -92,9 +92,20 @@ pub(crate) fn tool_dump_crdt(args: &Value) -> Result<String, String> {
.items
.into_iter()
.map(|item| {
// Story 1087: emit `pipeline` and `status` alongside `stage` so
// crdt-dump consumers can route by column/badge without re-deriving
// the projection from the stage string.
let (pipeline, status) = item
.stage
.as_deref()
.and_then(crate::pipeline_state::Stage::from_dir)
.map(|s| (s.pipeline().as_str(), s.status().as_str()))
.unwrap_or(("", ""));
json!({
"story_id": item.story_id,
"stage": item.stage,
"pipeline": pipeline,
"status": status,
"name": item.name,
"agent": item.agent,
"retry_count": item.retry_count,
@@ -103,6 +114,7 @@ pub(crate) fn tool_dump_crdt(args: &Value) -> Result<String, String> {
"claimed_at": item.claim_ts,
"content_index": item.content_index,
"is_deleted": item.is_deleted,
"origin": item.origin,
})
})
.collect();
@@ -123,11 +135,10 @@ pub(crate) fn tool_dump_crdt(args: &Value) -> Result<String, String> {
/// MCP tool: return the server version, build hash, and running port.
pub(crate) fn tool_get_version(ctx: &AppContext) -> Result<String, String> {
let build_hash = option_env!("BUILD_GIT_HASH").unwrap_or("unknown");
serde_json::to_string_pretty(&json!({
"version": env!("CARGO_PKG_VERSION"),
"build_hash": build_hash,
"port": ctx.services.agents.port(),
}))
.map_err(|e| format!("Serialization error: {e}"))
@@ -312,4 +323,33 @@ mod tests {
let result = tool_get_server_logs(&json!({"lines": 9999})).unwrap();
let _ = result;
}
#[test]
fn tool_get_version_ignores_build_hash_file_and_reports_compile_time_value() {
// Regression: get_version must NOT read .huskies/build_hash at runtime.
// Write a deliberately wrong value to the file and assert get_version
// returns the compile-time hash, not the file content.
let dir = tempfile::tempdir().expect("tempdir");
let huskies_dir = dir.path().join(".huskies");
std::fs::create_dir_all(&huskies_dir).unwrap();
std::fs::write(huskies_dir.join("build_hash"), "wrong_hash_sentinel_xyz").unwrap();
let ctx = crate::http::test_helpers::test_ctx(dir.path());
let result = tool_get_version(&ctx).expect("tool_get_version must not fail");
let parsed: serde_json::Value = serde_json::from_str(&result).expect("must be valid JSON");
let returned_hash = parsed["build_hash"]
.as_str()
.expect("build_hash must be a string");
assert_ne!(
returned_hash, "wrong_hash_sentinel_xyz",
"get_version must not read .huskies/build_hash; got '{returned_hash}'"
);
// The returned hash must equal the compile-time value.
let compile_time_hash = option_env!("BUILD_GIT_HASH").unwrap_or("unknown");
assert_eq!(
returned_hash, compile_time_hash,
"get_version must return compile-time BUILD_GIT_HASH"
);
}
}
+3
View File
@@ -195,6 +195,9 @@ pub(super) async fn tool_status(args: &Value, ctx: &AppContext) -> Result<String
if !deps.is_empty() {
front_matter.insert("depends_on".to_string(), json!(deps));
}
// Story 1088: origin tracking.
let origin_str = view.origin().unwrap_or("unknown");
front_matter.insert("origin".to_string(), json!(origin_str));
let stage_claim = match &typed_item.stage {
crate::pipeline_state::Stage::Coding { claim, .. } => claim.as_ref(),
crate::pipeline_state::Stage::Merge { claim, .. } => claim.as_ref(),
+10
View File
@@ -38,6 +38,16 @@ pub(crate) fn tool_create_bug(args: &Value, ctx: &AppContext) -> Result<String,
depends_on.as_deref(),
)?;
crate::crdt_state::set_origin(&bug_id, &super::build_origin(args));
let _ = ctx
.watcher_tx
.send(crate::io::watcher::WatcherEvent::NewItemCreated {
item_id: bug_id.clone(),
item_type: "bug".to_string(),
name: req.name.as_ref().to_string(),
});
Ok(format!("Created bug: {bug_id}"))
}
+6
View File
@@ -29,6 +29,8 @@ pub(crate) fn tool_create_epic(args: &Value, ctx: &AppContext) -> Result<String,
},
)?;
crate::crdt_state::set_origin(&epic_id, &super::build_origin(args));
Ok(format!("Created epic: {epic_id}"))
}
@@ -127,10 +129,14 @@ pub(crate) fn tool_show_epic(args: &Value, _ctx: &AppContext) -> Result<String,
if matches!(item.stage, Stage::Done { .. }) {
done += 1;
}
// Story 1087: expose pipeline + status alongside the legacy
// stage name so epic-show callers can route by column/badge.
member_items.push(json!({
"story_id": sid,
"name": item.name,
"stage": stage_name,
"pipeline": item.stage.pipeline().as_str(),
"status": item.stage.status().as_str(),
}));
}
}
+27
View File
@@ -12,6 +12,33 @@ mod refactor;
mod spike;
mod story;
/// Build a compact origin JSON string for a newly-created work item (story 1088).
///
/// `args` may contain an `"origin"` object with `kind`, `id`, and `ts` fields
/// supplied by the caller (e.g. a coder agent passing its own identity). When
/// absent the default is `{"kind":"user","id":"","ts":<now>}`.
///
/// Callers that create items on behalf of system automation (e.g. gate-failure
/// auto-filing) should pass `kind = "system"` and `id = "<automation-name>"`.
pub(super) fn build_origin(args: &serde_json::Value) -> String {
let ts = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs_f64();
if let Some(origin_obj) = args.get("origin").and_then(|v| v.as_object()) {
let kind = origin_obj
.get("kind")
.and_then(|v| v.as_str())
.unwrap_or("user");
let id = origin_obj.get("id").and_then(|v| v.as_str()).unwrap_or("");
let ts_val = origin_obj.get("ts").and_then(|v| v.as_f64()).unwrap_or(ts);
serde_json::json!({"kind": kind, "id": id, "ts": ts_val}).to_string()
} else {
serde_json::json!({"kind": "user", "id": "", "ts": ts}).to_string()
}
}
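For reference, the origin string build_origin emits has this shape. A std-only sketch of the serialization — the real code uses `serde_json::json!`, and the helper below is purely illustrative:

```rust
// Formats the same three fields build_origin serializes (story 1088).
// Assumption: kind/id contain no characters needing JSON escaping.
fn origin_json(kind: &str, id: &str, ts: f64) -> String {
    format!(r#"{{"kind":"{kind}","id":"{id}","ts":{ts}}}"#)
}
```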
pub(crate) use bug::{tool_close_bug, tool_create_bug, tool_list_bugs};
pub(crate) use criteria::{
tool_add_criterion, tool_check_criterion, tool_edit_criterion, tool_ensure_acceptance,
@@ -36,6 +36,16 @@ pub(crate) fn tool_create_refactor(args: &Value, ctx: &AppContext) -> Result<Str
depends_on.as_deref(),
)?;
crate::crdt_state::set_origin(&refactor_id, &super::build_origin(args));
let _ = ctx
.watcher_tx
.send(crate::io::watcher::WatcherEvent::NewItemCreated {
item_id: refactor_id.clone(),
item_type: "refactor".to_string(),
name: req.name.as_ref().to_string(),
});
Ok(format!("Created refactor: {refactor_id}"))
}
+10
View File
@@ -36,6 +36,16 @@ pub(crate) fn tool_create_spike(args: &Value, ctx: &AppContext) -> Result<String
depends_on.as_deref(),
)?;
crate::crdt_state::set_origin(&spike_id, &super::build_origin(args));
let _ = ctx
.watcher_tx
.send(crate::io::watcher::WatcherEvent::NewItemCreated {
item_id: spike_id.clone(),
item_type: "spike".to_string(),
name: req.name.as_ref().to_string(),
});
Ok(format!("Created spike: {spike_id}"))
}
@@ -31,6 +31,16 @@ pub(crate) fn tool_create_story(args: &Value, ctx: &AppContext) -> Result<String
false,
)?;
crate::crdt_state::set_origin(&story_id, &super::super::build_origin(args));
let _ = ctx
.watcher_tx
.send(crate::io::watcher::WatcherEvent::NewItemCreated {
item_id: story_id.clone(),
item_type: "story".to_string(),
name: req.name.as_ref().to_string(),
});
// Bug 503: warn at creation time if any depends_on points at an already-archived story.
let archived_deps: Vec<u32> = depends_on_ids
.as_deref()
+93 -17
View File
@@ -39,34 +39,32 @@ pub(crate) fn tool_get_pipeline_status(ctx: &AppContext) -> Result<String, Strin
let state = load_pipeline_state(ctx)?;
let running_merges = ctx.services.agents.list_running_merges()?;
fn slim_name(name: &str) -> &str {
crate::chat::util::truncate_at_char_boundary(name, 120)
}
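slim_name leans on the crate's char-boundary helper. A sketch of what such a helper plausibly does — an assumption, since only the call site is shown: slice to at most `max` bytes without splitting a UTF-8 character:

```rust
// Byte-capped, char-boundary-safe truncation; illustrative stand-in for
// crate::chat::util::truncate_at_char_boundary.
fn truncate_at_char_boundary(s: &str, max: usize) -> &str {
    if s.len() <= max {
        return s;
    }
    // Walk back from `max` until we land on a char boundary.
    let mut end = max;
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    &s[..end]
}
```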
fn map_items(items: &[crate::http::workflow::UpcomingStory], stage: &str) -> Vec<Value> {
items
.iter()
.map(|s| {
let mut item = json!({
"story_id": s.story_id,
"name": slim_name(&s.name),
"stage": stage,
"pipeline": s.pipeline.as_str(),
"status": s.status.as_str(),
"agent": s.agent.as_ref().map(|a| json!({
"agent_name": a.agent_name,
"model": a.model,
"status": a.status,
})),
});
// Include blocked/retry_count when present so callers can
// identify stories stuck in the pipeline.
if let Some(true) = s.blocked {
item["blocked"] = json!(true);
}
if let Some(rc) = s.retry_count {
item["retry_count"] = json!(rc);
}
if let Some(ref mf) = s.merge_failure {
item["merge_failure"] = json!(mf);
}
if let Some(ref epic_id) = s.epic_id {
item["epic_id"] = json!(epic_id);
}
item
})
.collect()
@@ -81,19 +79,21 @@ pub(crate) fn tool_get_pipeline_status(ctx: &AppContext) -> Result<String, Strin
let backlog: Vec<Value> = state
.backlog
.iter()
.map(|s| json!({ "story_id": s.story_id, "name": slim_name(&s.name) }))
.collect();
let archived: Vec<Value> = state
.archived
.iter()
.map(|s| {
json!({
"story_id": s.story_id,
"name": slim_name(&s.name),
"stage": "archived",
"pipeline": s.pipeline.as_str(),
"status": s.status.as_str(),
})
})
.collect();
serde_json::to_string_pretty(&json!({
@@ -248,6 +248,82 @@ mod tests {
assert_eq!(item["valid"], true);
}
#[test]
fn pipeline_status_50_items_under_10kb() {
crate::db::ensure_content_store();
let stages = [
("1_backlog", "backlog"),
("2_current", "current"),
("3_qa", "qa"),
("4_merge", "merge"),
("5_done", "done"),
];
for (i, (dir, _)) in stages.iter().enumerate() {
for j in 0..10 {
let id = format!("99{i}{j}0_story_size_test");
let name = format!("Pipeline Size Test Story {i}-{j}");
crate::db::write_item_with_content(
&id,
dir,
&format!("---\nname: \"{name}\"\n---\n"),
crate::db::ItemMeta {
name: Some(name),
..Default::default()
},
);
}
}
let tmp = tempfile::tempdir().unwrap();
let ctx = test_ctx(tmp.path());
let result = tool_get_pipeline_status(&ctx).unwrap();
assert!(
result.len() < 10 * 1024,
"50-item response must be under 10 KB; got {} bytes",
result.len()
);
}
#[test]
fn pipeline_status_per_item_under_500_bytes() {
crate::db::ensure_content_store();
// Insert one item per active stage with a moderately long name.
let stages = [
("2_current", "9995_story_peritem_current"),
("3_qa", "9996_story_peritem_qa"),
("4_merge", "9997_story_peritem_merge"),
("5_done", "9998_story_peritem_done"),
];
for (dir, id) in &stages {
let name = "A Reasonably Named Story For Size Testing";
crate::db::write_item_with_content(
id,
dir,
&format!("---\nname: \"{name}\"\n---\n"),
crate::db::ItemMeta {
name: Some(name.to_string()),
..Default::default()
},
);
}
let tmp = tempfile::tempdir().unwrap();
let ctx = test_ctx(tmp.path());
let result = tool_get_pipeline_status(&ctx).unwrap();
let parsed: Value = serde_json::from_str(&result).unwrap();
let active = parsed["active"].as_array().unwrap();
for item in active {
if stages.iter().any(|(_, id)| item["story_id"] == *id) {
let item_json = serde_json::to_string(item).unwrap();
assert!(
item_json.len() < 500,
"per-item payload must be under 500 bytes; story_id={} got {} bytes: {}",
item["story_id"],
item_json.len(),
item_json
);
}
}
}
#[test]
fn tool_validate_stories_with_invalid_front_matter() {
let tmp = tempfile::tempdir().unwrap();
@@ -574,7 +574,7 @@ pub(super) fn story_tools() -> Vec<Value> {
}),
json!({
"name": "get_pipeline_status",
"description": "Return a structured snapshot of the full work item pipeline. Each item includes only slim fields: story_id, name (capped at 120 chars), stage, agent (with agent_name/model/status), and the optional blocked flag and retry_count. Active stages (current, qa, merge, done) appear in the 'active' array; backlog items in 'backlog'. For full story details, use status(story_id) or dump_crdt.",
"inputSchema": {
"type": "object",
"properties": {}
+8
View File
@@ -24,6 +24,10 @@ pub struct UpcomingStory {
pub merge_failure: Option<String>,
/// Active agent working on this item, if any.
pub agent: Option<AgentAssignment>,
/// Display column (story 1085) — derived from `Stage::pipeline()`.
pub pipeline: crate::pipeline_state::Pipeline,
/// Display badge/indicator (story 1085) — derived from `Stage::status()`.
pub status: crate::pipeline_state::Status,
/// True when the item is held in QA for human review.
#[serde(skip_serializing_if = "Option::is_none")]
pub review_hold: Option<bool>,
@@ -142,6 +146,8 @@ pub fn load_pipeline_state(ctx: &AppContext) -> Result<PipelineState, String> {
error: None,
merge_failure,
agent,
pipeline: item.stage.pipeline(),
status: item.stage.status(),
review_hold,
qa,
retry_count: if item.retry_count() > 0 {
@@ -278,6 +284,8 @@ pub fn load_upcoming_stories(_ctx: &AppContext) -> Result<Vec<UpcomingStory>, St
error: None,
merge_failure: None,
agent: None,
pipeline: item.stage.pipeline(),
status: item.stage.status(),
review_hold: None,
qa: None,
retry_count: if item_retry_count > 0 {
+10
View File
@@ -90,4 +90,14 @@ pub enum WatcherEvent {
/// `true` if acceptance gates passed; `false` if they failed.
success: bool,
},
/// A new work item was successfully created and added to the backlog.
/// Triggers a creation notification to configured chat rooms.
NewItemCreated {
/// Work item ID (e.g. `"1075_refactor_split_stage_enum"`).
item_id: String,
/// Human-readable item type (`"story"`, `"bug"`, `"refactor"`, `"spike"`).
item_type: String,
/// Human-readable item name.
name: String,
},
}
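The create_* tools above all follow the same fire-and-forget send. A self-contained sketch using a std channel in place of the crate's tokio-based `watcher_tx` (the `notify_created` helper name is hypothetical):

```rust
use std::sync::mpsc;

#[derive(Debug)]
enum WatcherEvent {
    NewItemCreated {
        item_id: String,
        item_type: String,
        name: String,
    },
}

// Mirrors the create_* call sites: build the event, send it, and ignore a
// closed-channel error so item creation never fails on notification.
fn notify_created(tx: &mpsc::Sender<WatcherEvent>, item_id: &str, item_type: &str, name: &str) {
    let _ = tx.send(WatcherEvent::NewItemCreated {
        item_id: item_id.to_string(),
        item_type: item_type.to_string(),
        name: name.to_string(),
    });
}
```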
-1
View File
@@ -21,7 +21,6 @@ mod sweep;
pub use events::WatcherEvent;
pub(crate) use sweep::spawn_done_to_archived_subscriber;
pub(crate) use sweep::sweep_done_to_archived;
use crate::slog;
+29 -5
View File
@@ -29,13 +29,20 @@ use std::time::Duration;
///
/// Replaces the periodic `sweep_done_to_archived` call from the tick loop.
pub(crate) fn spawn_done_to_archived_subscriber(done_retention: Duration) {
use crate::pipeline_state::{
PipelineEvent, Stage, Status, apply_transition, subscribe_transitions,
};
let mut rx = subscribe_transitions();
tokio::spawn(async move {
loop {
match rx.recv().await {
Ok(fired) => {
// Story 1086: gate on the typed `Status::Done` projection;
// the variant pattern is still required to read `merged_at`.
if fired.after.status() != Status::Done {
continue;
}
if let Stage::Done { merged_at, .. } = fired.after {
let story_id = fired.story_id.0.clone();
let retention = done_retention;
@@ -70,7 +77,7 @@ pub(crate) fn spawn_done_to_archived_subscriber(done_retention: Duration) {
});
}
/// Reconcile: sweep items in `Stage::Done` whose `merged_at` timestamp exceeds the
/// retention duration to `Stage::Archived` via the typed transition table.
///
/// Routes through [`crate::pipeline_state::apply_transition`] so the
@@ -78,14 +85,22 @@ pub(crate) fn spawn_done_to_archived_subscriber(done_retention: Duration) {
/// `TransitionFired` event is emitted to subscribers (worktree pruning,
/// matrix notifier, etc.).
///
/// Called at startup and by the periodic reconciler to archive Done stories
/// whose retention has elapsed, even when the `TransitionFired` subscriber
/// lagged and missed their Done event. Production reactive archiving uses
/// [`spawn_done_to_archived_subscriber`] instead.
///
/// Logs a summary INFO line on every call: candidates evaluated and items
/// archived, or "no items past retention" when nothing was swept.
pub(crate) fn sweep_done_to_archived(done_retention: Duration) {
use crate::pipeline_state::{PipelineEvent, Stage, apply_transition, read_all_typed};
let mut candidates: usize = 0;
let mut archived: usize = 0;
for item in read_all_typed() {
if let Stage::Done { merged_at, .. } = &item.stage {
candidates += 1;
let age = chrono::Utc::now()
.signed_duration_since(*merged_at)
.to_std()
@@ -93,7 +108,10 @@ pub(crate) fn sweep_done_to_archived(done_retention: Duration) {
if age >= done_retention {
let story_id = item.story_id.0.clone();
match apply_transition(&story_id, PipelineEvent::Accepted, None) {
Ok(_) => slog!("[watcher] sweep: promoted {story_id} → archived"),
Ok(_) => {
archived += 1;
slog!("[watcher] sweep: promoted {story_id} → archived")
}
Err(e) => {
slog!("[watcher] sweep: transition error for {story_id}: {e}")
}
@@ -101,4 +119,10 @@ pub(crate) fn sweep_done_to_archived(done_retention: Duration) {
}
}
}
if archived > 0 {
slog!("[watcher] sweep: {candidates} candidate(s) evaluated, {archived} archived");
} else {
slog!("[watcher] sweep: {candidates} candidate(s) evaluated, no items past retention");
}
}
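The sweep's age test above goes through chrono; the retention comparison it performs can be sketched std-only. This is an illustrative stand-in, not the crate's code — `past_retention` is a hypothetical helper:

```rust
use std::time::{Duration, SystemTime};

// Illustrative std-only stand-in for the chrono age arithmetic above:
// an item is swept once `now - merged_at` reaches the retention window.
fn past_retention(merged_at: SystemTime, retention: Duration) -> bool {
    // `elapsed` errors if merged_at is in the future (clock skew);
    // treating that as zero age keeps such items unswept.
    merged_at.elapsed().unwrap_or_default() >= retention
}

fn main() {
    let ten_seconds_ago = SystemTime::now() - Duration::from_secs(10);
    // Past a 5-second retention: the sweep would archive this item.
    assert!(past_retention(ten_seconds_ago, Duration::from_secs(5)));
    // A just-merged item stays in Done.
    assert!(!past_retention(SystemTime::now(), Duration::from_secs(60)));
}
```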
+42
@@ -301,6 +301,48 @@ async fn done_to_archived_subscriber_archives_on_transition() {
);
}
/// Regression: simulates a server restart occurring between move-to-done and
/// the configured retention window expiry.
///
/// Before the fix the archive-deadline was held only in the reactive
/// subscriber's volatile sleep task; a restart would lose that task and the
/// item would never be archived. The fix is that `sweep_done_to_archived`
/// reads `merged_at` from the CRDT (durable across restarts) and archives any
/// item whose age exceeds the retention, so the next periodic reconcile tick
/// after restart picks it up regardless of whether a sleep task existed.
#[test]
fn restart_scenario_sweep_archives_past_retention_after_sweep_tick() {
crate::crdt_state::init_for_test();
crate::db::ensure_content_store();
let story_id = "9885_sweep_restart_regression";
// Simulate: item moved to Done 10 seconds before the restart.
// The reactive subscriber would have had a sleep task for the remaining
// retention time; that task is now gone (process restarted).
let ten_seconds_ago = (chrono::Utc::now() - chrono::Duration::seconds(10)).timestamp() as f64;
crate::crdt_state::write_item_str(
story_id,
"5_done",
Some("Restart regression test"),
None,
None,
Some(ten_seconds_ago),
);
// The next periodic reconcile tick after restart calls sweep_done_to_archived
// directly. With 5-second retention and merged_at 10s ago, the item must
// be archived even though no reactive subscriber sleep task exists.
sweep_done_to_archived(Duration::from_secs(5));
let items = crate::pipeline_state::read_all_typed();
let item = items.iter().find(|i| i.story_id.0 == story_id);
assert!(
item.is_some_and(|i| matches!(i.stage, crate::pipeline_state::Stage::Archived { .. })),
"item past retention must be archived on the next sweep tick after a server restart"
);
}
/// Prove that an item with merged_at NEWER than done_retention is NOT swept.
#[test]
fn sweep_keeps_item_newer_than_retention() {
+4 -4
@@ -33,6 +33,8 @@ pub mod mesh;
/// Node identity — Ed25519 keypair generation and stable node ID management.
pub mod node_identity;
pub(crate) mod pipeline_state;
/// Reliable process-termination primitives shared across the server.
pub mod process_kill;
/// Rebuild — process restart and shutdown coordination.
pub mod rebuild;
mod service;
@@ -82,12 +84,10 @@ async fn main() -> Result<(), std::io::Error> {
});
// Log version and build hash so we can verify what's running.
let build_hash =
std::fs::read_to_string(".huskies/build_hash").unwrap_or_else(|_| "unknown".to_string());
slog!(
"[startup] huskies v{} (build {})",
env!("CARGO_PKG_VERSION"),
build_hash.trim()
option_env!("BUILD_GIT_HASH").unwrap_or("unknown")
);
let app_state = Arc::new(SessionState::default());
@@ -151,7 +151,7 @@ async fn main() -> Result<(), std::io::Error> {
startup::project::open_project_root(is_init, explicit_path, &cwd, &app_state, &store, port)
.await;
startup::project::init_subsystems(&app_state, &cwd).await;
startup::project::init_subsystems(&app_state, &cwd, is_agent).await;
let crdt_join_token = cli
.join_token
-80
@@ -36,32 +36,6 @@ pub(super) fn try_broadcast(fired: &TransitionFired) {
let _ = get_or_init_tx().send(fired.clone());
}
/// Replay the current CRDT pipeline state as a burst of synthetic
/// [`TransitionFired`] events at server startup.
///
/// Reads every item from the CRDT and broadcasts a self-transition
/// (`before == after`) for each one so that all existing subscribers
/// (worktree lifecycle, merge-failure auto-spawn, auto-assign) react
/// identically to a live event. This replaces the legacy scan-based
/// `reconcile_on_startup` path.
///
/// Idempotent: a second call produces another burst of events, but every
/// subscriber already guards against duplicate work (e.g.
/// `is_story_assigned_for_stage` returns true once an agent is running,
/// and worktree creation is a no-op when the worktree already exists).
pub fn replay_current_pipeline_state() {
for item in super::read_all_typed() {
let fired = TransitionFired {
story_id: item.story_id.clone(),
before: item.stage.clone(),
after: item.stage,
event: super::PipelineEvent::DepsMet,
at: chrono::Utc::now(),
};
try_broadcast(&fired);
}
}
/// Fired when a pipeline stage transition completes.
#[derive(Debug, Clone)]
pub struct TransitionFired {
@@ -183,58 +157,4 @@ mod tests {
}
// ── TransitionError Display ─────────────────────────────────────────
// ── replay_current_pipeline_state ──────────────────────────────────
/// AC1: replay broadcasts a synthetic event for every item in the CRDT.
#[test]
fn replay_broadcasts_event_for_crdt_item_in_coding_stage() {
crate::crdt_state::init_for_test();
crate::db::ensure_content_store();
let story_id = "9901_replay_coding";
crate::db::write_item_with_content(
story_id,
"2_current",
"---\nname: Replay Coding\n---\n",
crate::db::ItemMeta::named("Replay Coding"),
);
let mut rx = subscribe_transitions();
replay_current_pipeline_state();
let mut found = false;
while let Ok(fired) = rx.try_recv() {
if fired.story_id.0 == story_id && matches!(fired.after, Stage::Coding { .. }) {
found = true;
}
}
assert!(
found,
"replay must broadcast a Coding event for a story in 2_current"
);
}
/// AC3: calling replay_current_pipeline_state twice fires events both times.
///
/// Pool-state idempotency (no duplicate agents) is enforced by subscribers,
/// not by the replay function itself. This test verifies that replay is safe
/// to call multiple times without panicking.
#[test]
fn replay_twice_does_not_panic() {
crate::crdt_state::init_for_test();
crate::db::ensure_content_store();
let story_id = "9902_replay_idem";
crate::db::write_item_with_content(
story_id,
"3_qa",
"---\nname: Replay QA\n---\n",
crate::db::ItemMeta::named("Replay QA"),
);
// Two successive replays must not panic.
replay_current_pipeline_state();
replay_current_pipeline_state();
}
}
+4 -6
@@ -41,8 +41,8 @@ mod tests;
#[allow(unused_imports)]
pub use types::{
AgentClaim, AgentName, ArchiveReason, BranchName, ExecutionState, GitSha, MergeFailureKind,
NodePubkey, PipelineItem, PlanState, Stage, StoryId, TransitionError, stage_dir_name,
stage_label,
NodePubkey, Pipeline, PipelineItem, PlanState, Stage, Status, StoryId, TransitionError,
stage_dir_name, stage_label,
};
#[allow(unused_imports)]
@@ -51,10 +51,7 @@ pub use transition::{
};
#[allow(unused_imports)]
pub use events::{
EventBus, TransitionFired, TransitionSubscriber, replay_current_pipeline_state,
subscribe_transitions,
};
pub use events::{EventBus, TransitionFired, TransitionSubscriber, subscribe_transitions};
#[allow(unused_imports)]
pub use projection::ProjectionError;
@@ -66,6 +63,7 @@ pub use apply::{
transition_to_unfrozen,
};
pub(crate) use subscribers::reconcile_audit_log;
pub use subscribers::spawn_audit_log_subscriber;
#[allow(unused_imports)]
+8
@@ -35,6 +35,14 @@ impl TransitionSubscriber for AuditLogSubscriber {
}
}
/// Reconcile: no-op for the audit log subscriber.
///
/// The audit log records live transitions only. Replaying historical CRDT state at
/// reconcile time would produce misleading entries (wrong timestamps, duplicate lines).
/// Eventual consistency of the audit log is not required — missed events are simply
/// absent from the log, which is acceptable.
pub(crate) fn reconcile_audit_log() {}
/// Spawn a background task that writes a structured audit log entry for every pipeline transition.
///
/// Subscribes to the transition broadcast channel. Every `TransitionFired` event produces
+138
@@ -429,6 +429,144 @@ impl Stage {
}
}
// ── Display split (story 1085): Pipeline column + Status badge ─────────────
/// Column placement for a work item in the UI/chat status display.
///
/// Derived from [`Stage`] via [`Stage::pipeline`]. Display callers route items
/// to columns by this enum instead of pattern-matching `Stage` variants, so
/// new badges (e.g. `Frozen`, `Blocked`) do not produce new columns.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum Pipeline {
/// Items in `Upcoming` or `Backlog` stages.
Backlog,
/// Items being coded (or blocked while in the coding lane).
Coding,
/// Items in QA or `ReviewHold`.
Qa,
/// Items in `Merge`, `MergeFailure`, or `MergeFailureFinal`.
Merge,
/// Items in `Done`.
Done,
/// Abandoned, superseded, or rejected items.
Closed,
/// Items swept into `Archived`.
Archived,
}
impl Pipeline {
/// Stable wire-format identifier (kebab-case).
pub fn as_str(&self) -> &'static str {
match self {
Pipeline::Backlog => "backlog",
Pipeline::Coding => "coding",
Pipeline::Qa => "qa",
Pipeline::Merge => "merge",
Pipeline::Done => "done",
Pipeline::Closed => "closed",
Pipeline::Archived => "archived",
}
}
}
/// Badge/indicator for a work item, orthogonal to its [`Pipeline`] column.
///
/// Derived from [`Stage`] via [`Stage::status`]. A `Frozen` story stays in
/// its underlying `Pipeline` column (e.g. `Coding`) and is decorated with
/// `Status::Frozen` for the display. `Status::Done` is reserved for items in
/// the `Done` column and is never produced for items still in flight, so a
/// done item never carries a `MergeFailure*` badge (story 1052).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case", tag = "kind")]
pub enum Status {
/// No special badge — normal in-progress item.
Active,
/// Item is paused (`Stage::Frozen`).
Frozen,
/// Item is held for human review (`Stage::ReviewHold`).
ReviewHold,
/// Item is blocked (`Stage::Blocked` or legacy `Archived(Blocked)`).
Blocked,
/// Merge failed; mergemaster may still be recovering.
MergeFailure,
/// Merge failed beyond automatic recovery.
MergeFailureFinal,
/// User abandoned the item.
Abandoned,
/// Item was superseded by another work item.
Superseded,
/// Item was permanently rejected.
Rejected,
/// Item completed successfully.
Done,
}
impl Status {
/// Stable wire-format identifier (kebab-case).
pub fn as_str(&self) -> &'static str {
match self {
Status::Active => "active",
Status::Frozen => "frozen",
Status::ReviewHold => "review-hold",
Status::Blocked => "blocked",
Status::MergeFailure => "merge-failure",
Status::MergeFailureFinal => "merge-failure-final",
Status::Abandoned => "abandoned",
Status::Superseded => "superseded",
Status::Rejected => "rejected",
Status::Done => "done",
}
}
}
impl Stage {
/// Display column for this stage. `Frozen { resume_to }` recurses so a
/// paused story keeps its underlying column.
pub fn pipeline(&self) -> Pipeline {
match self {
Stage::Upcoming | Stage::Backlog => Pipeline::Backlog,
Stage::Coding { .. } | Stage::Blocked { .. } => Pipeline::Coding,
Stage::Qa | Stage::ReviewHold { .. } => Pipeline::Qa,
Stage::Merge { .. } | Stage::MergeFailure { .. } | Stage::MergeFailureFinal { .. } => {
Pipeline::Merge
}
Stage::Frozen { resume_to } => resume_to.pipeline(),
Stage::Done { .. } => Pipeline::Done,
Stage::Abandoned { .. } | Stage::Superseded { .. } | Stage::Rejected { .. } => {
Pipeline::Closed
}
Stage::Archived {
reason: ArchiveReason::Blocked { .. },
..
} => Pipeline::Coding,
Stage::Archived { .. } => Pipeline::Archived,
}
}
/// Display badge for this stage. `Frozen { resume_to }` returns
/// `Status::Frozen` regardless of the inner stage; callers wanting the
/// underlying badge inspect `resume_to` directly.
pub fn status(&self) -> Status {
match self {
Stage::Frozen { .. } => Status::Frozen,
Stage::ReviewHold { .. } => Status::ReviewHold,
Stage::Blocked { .. }
| Stage::Archived {
reason: ArchiveReason::Blocked { .. },
..
} => Status::Blocked,
Stage::MergeFailure { .. } => Status::MergeFailure,
Stage::MergeFailureFinal { .. } => Status::MergeFailureFinal,
Stage::Abandoned { .. } => Status::Abandoned,
Stage::Superseded { .. } => Status::Superseded,
Stage::Rejected { .. } => Status::Rejected,
Stage::Done { .. } => Status::Done,
_ => Status::Active,
}
}
}
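The column/badge split above can be illustrated with simplified stand-in types — a minimal sketch, not the crate's real `Stage` (which has many more variants):

```rust
// Simplified stand-ins showing the display split: a Frozen story keeps
// its underlying Pipeline column but is decorated with the Frozen badge.
#[derive(Debug, Clone, Copy, PartialEq)]
enum Pipeline { Backlog, Coding, Qa }

#[derive(Debug, Clone, Copy, PartialEq)]
enum Status { Active, Frozen }

#[derive(Clone)]
enum Stage {
    Backlog,
    Coding,
    Qa,
    Frozen { resume_to: Box<Stage> },
}

impl Stage {
    fn pipeline(&self) -> Pipeline {
        match self {
            Stage::Backlog => Pipeline::Backlog,
            Stage::Coding => Pipeline::Coding,
            Stage::Qa => Pipeline::Qa,
            // Recurse so a paused story keeps its underlying column.
            Stage::Frozen { resume_to } => resume_to.pipeline(),
        }
    }

    fn status(&self) -> Status {
        match self {
            Stage::Frozen { .. } => Status::Frozen,
            _ => Status::Active,
        }
    }
}

fn main() {
    let paused = Stage::Frozen { resume_to: Box::new(Stage::Coding) };
    // Same column as the unfrozen stage, but with the badge.
    assert_eq!(paused.pipeline(), Pipeline::Coding);
    assert_eq!(paused.status(), Status::Frozen);
}
```

Display callers then route on `(pipeline, status)` pairs instead of matching `Stage` variants directly, which is why new badges don't create new columns.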
// ── Per-node execution state ────────────────────────────────────────────────
/// Per-node execution tracking, stored in the CRDT under each node's pubkey.
+322
@@ -0,0 +1,322 @@
//! Reliable process-termination primitives.
//!
//! The huskies server kills child processes in several distinct places:
//! the watchdog terminates agents that have exceeded turn/budget limits,
//! `stop_agent` terminates on operator request, `kill_all_children` runs at
//! server shutdown, the merge-gate completion path kills stale `cargo`
//! processes, and `script/local-release` tears down the gateway during a
//! redeploy. Every one of these used to send a signal that the target was
//! free to ignore (most commonly `portable_pty`'s `SIGHUP`), with no
//! verification that the process actually exited. Agents and bots that
//! ignore `SIGHUP` survived the "kill", which produced concurrent claude
//! processes on the same story — directly the duplicate-spawn bug we hit on
//! 2026-05-15.
//!
//! This module provides one trustworthy way to kill processes: SIGKILL with
//! verification. Build a pid set with the helpers in this module (or your
//! own), then hand it to [`sigkill_pids_and_verify`].
//!
//! All functions in this module are deliberately Unix-only — huskies runs in
//! Linux containers and macOS dev hosts, both POSIX.
use crate::slog_warn;
/// Maximum time we'll wait for SIGKILL'd processes to disappear before
/// declaring failure. SIGKILL is uncatchable, so the kernel normally
/// reaps within tens of milliseconds; anything past 2 s indicates the
/// process is wedged in uninterruptible IO (e.g. waiting on a frozen NFS
/// mount). Callers can decide whether to proceed despite survivors.
const KILL_VERIFY_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(2);
/// Polling interval while waiting for processes to disappear. 100 ms is
/// fine-grained enough that the typical few-ms reap latency is barely
/// observable, but coarse enough that we don't burn CPU spinning.
const KILL_VERIFY_POLL: std::time::Duration = std::time::Duration::from_millis(100);
/// SIGKILL every pid in `pids`, then poll until all of them are gone.
///
/// Returns `Ok(n)` where `n == pids.len()` when every pid is verified
/// reaped within [`KILL_VERIFY_TIMEOUT`]. Returns `Err(survivors)` with the
/// pids still alive after the timeout — extremely rare for SIGKILL but
/// possible if a process is wedged in uninterruptible IO. An empty `pids`
/// slice returns `Ok(0)` immediately.
///
/// **Why SIGKILL and not SIGTERM-first:** several huskies-internal targets
/// (claude-code, the bot itself) either ignore the polite signals or take
/// arbitrarily long to honour them. The watchdog only kills agents that
/// have already misbehaved by definition (exceeded budget/turn limits), so
/// there is no reason to give them a graceful-shutdown grace period.
pub fn sigkill_pids_and_verify(pids: &[u32]) -> Result<usize, Vec<u32>> {
if pids.is_empty() {
return Ok(0);
}
for &pid in pids {
// libc::kill returns -1 on failure (with errno). We deliberately
// ignore the result: the process may already be gone (errno ESRCH),
// and trying again wouldn't help. The verification loop below is
// the source of truth for "did this work".
unsafe { libc::kill(pid as i32, libc::SIGKILL) };
}
let deadline = std::time::Instant::now() + KILL_VERIFY_TIMEOUT;
while std::time::Instant::now() < deadline {
if pids.iter().copied().all(|pid| !pid_is_alive(pid)) {
return Ok(pids.len());
}
std::thread::sleep(KILL_VERIFY_POLL);
}
let survivors: Vec<u32> = pids
.iter()
.copied()
.filter(|&pid| pid_is_alive(pid))
.collect();
if survivors.is_empty() {
Ok(pids.len())
} else {
slog_warn!(
"[process_kill] SIGKILL did not reap pids within {:?}: {survivors:?}. \
They may be wedged in uninterruptible IO.",
KILL_VERIFY_TIMEOUT
);
Err(survivors)
}
}
/// Return every pid whose command line matches `pattern` (passed to
/// `pgrep -f`). Empty when nothing matches or when `pgrep` is unavailable.
///
/// Useful for collecting processes by a path or argument substring — e.g.
/// "every process running in `<worktree>/`" or "every cargo invocation
/// against this `Cargo.toml`".
pub fn pids_matching(pattern: &str) -> Vec<u32> {
let Ok(output) = std::process::Command::new("pgrep")
.args(["-f", pattern])
.output()
else {
return Vec::new();
};
String::from_utf8_lossy(&output.stdout)
.lines()
.filter_map(|l| l.trim().parse::<u32>().ok())
.collect()
}
/// Return every descendant pid of `root_pid`, deepest-first, **excluding**
/// `root_pid` itself. Walks the parent→child relation via `pgrep -P`.
///
/// Deepest-first ordering lets callers signal leaves before their parents
/// when that matters; for SIGKILL it makes no difference.
pub fn descendant_pids(root_pid: u32) -> Vec<u32> {
let mut out: Vec<u32> = Vec::new();
walk_descendants(root_pid, &mut out);
out
}
fn walk_descendants(pid: u32, out: &mut Vec<u32>) {
let Ok(output) = std::process::Command::new("pgrep")
.args(["-P", &pid.to_string()])
.output()
else {
return;
};
let kids: Vec<u32> = String::from_utf8_lossy(&output.stdout)
.lines()
.filter_map(|l| l.trim().parse::<u32>().ok())
.collect();
for kid in kids {
walk_descendants(kid, out);
out.push(kid);
}
}
/// Check whether `pid` currently exists. Implemented via `kill(pid, 0)` —
/// no signal is sent, only existence is probed.
fn pid_is_alive(pid: u32) -> bool {
// signal 0: "is this process around?" Returns 0 if the process exists
// and we have permission to signal it, -1 with errno otherwise.
unsafe { libc::kill(pid as i32, 0) == 0 }
}
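The kill-then-verify protocol can be exercised without the libc crate by shelling out to kill(1). This is a self-contained sketch of the same pattern, not the module's actual implementation (which calls `libc::kill` directly):

```rust
use std::process::{Command, Stdio};
use std::time::{Duration, Instant};

// Probe existence with `kill -s 0` — no signal is delivered, mirroring
// the pid_is_alive check above.
fn pid_is_alive(pid: u32) -> bool {
    Command::new("kill")
        .args(["-s", "0", &pid.to_string()])
        .stderr(Stdio::null())
        .status()
        .map(|s| s.success())
        .unwrap_or(false)
}

fn main() {
    let mut child = Command::new("sleep").arg("60").spawn().expect("spawn sleep");
    let pid = child.id();
    assert!(pid_is_alive(pid));
    // SIGKILL, then poll until the pid is gone — the verify loop, not the
    // kill itself, is the source of truth, as in sigkill_pids_and_verify.
    let _ = Command::new("kill").args(["-s", "KILL", &pid.to_string()]).status();
    let _ = child.wait(); // reap so the pid doesn't linger as a zombie
    let deadline = Instant::now() + Duration::from_secs(2);
    while pid_is_alive(pid) && Instant::now() < deadline {
        std::thread::sleep(Duration::from_millis(100));
    }
    assert!(!pid_is_alive(pid), "sleeper must be verified gone");
}
```

Note the `wait()` call: as the test module's comment explains, existence probes report zombies as alive, so something must reap the child before verification can succeed.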
#[cfg(test)]
mod tests {
use super::*;
use std::process::{Child, Command, Stdio};
use std::thread::JoinHandle;
/// Spawn a sleeper for kill testing, and spawn a background reaper that
/// calls `wait()` as soon as the child exits. Returns the pid plus the
/// reaper join handle so the test can confirm reaping after the kill.
///
/// The reaper is essential because the production code's verify loop
/// uses `kill(pid, 0)` to test existence — which returns 0 for zombies.
/// If no one reaps the test's sleeper, its pid stays occupied (as a
/// zombie) and `sigkill_pids_and_verify` mistakenly reports survivors.
/// In production the PTY blocking thread is always reaping on behalf of
/// portable_pty, so this isn't a concern there.
fn spawn_sleeper_with_reaper(secs: u64) -> (u32, JoinHandle<()>) {
let child: Child = Command::new("sleep")
.arg(secs.to_string())
.stdout(Stdio::null())
.stderr(Stdio::null())
.stdin(Stdio::null())
.spawn()
.expect("failed to spawn sleep");
let pid = child.id();
let reaper = std::thread::spawn(move || {
let mut c = child;
let _ = c.wait();
});
(pid, reaper)
}
#[test]
fn sigkill_empty_slice_is_ok() {
let result = sigkill_pids_and_verify(&[]);
assert!(matches!(result, Ok(0)));
}
#[test]
fn sigkill_real_process_is_verified_gone() {
let (pid, reaper) = spawn_sleeper_with_reaper(60);
assert!(pid_is_alive(pid), "sleeper should be alive before kill");
let result = sigkill_pids_and_verify(&[pid]);
assert!(
matches!(result, Ok(1)),
"sigkill must verify the process is gone: {result:?}"
);
let _ = reaper.join();
assert!(!pid_is_alive(pid), "sleeper must be dead after kill");
}
#[test]
fn sigkill_already_dead_pid_is_ok() {
let (pid, reaper) = spawn_sleeper_with_reaper(0);
let _ = reaper.join();
// Wait briefly for the kernel to recycle the pid.
for _ in 0..20 {
if !pid_is_alive(pid) {
break;
}
std::thread::sleep(std::time::Duration::from_millis(100));
}
// Now SIGKILL a pid that no longer exists. Result must still be Ok.
let result = sigkill_pids_and_verify(&[pid]);
assert!(
result.is_ok(),
"sigkill of already-dead pid must succeed: {result:?}"
);
}
#[test]
fn sigkill_multiple_real_processes() {
let mut handles: Vec<(u32, JoinHandle<()>)> =
(0..3).map(|_| spawn_sleeper_with_reaper(60)).collect();
let pids: Vec<u32> = handles.iter().map(|(p, _)| *p).collect();
for &pid in &pids {
assert!(pid_is_alive(pid));
}
let result = sigkill_pids_and_verify(&pids);
assert!(
matches!(result, Ok(3)),
"all 3 sleepers must die: {result:?}"
);
for (_, reaper) in handles.drain(..) {
let _ = reaper.join();
}
for &pid in &pids {
assert!(!pid_is_alive(pid), "pid {pid} survived sigkill");
}
}
#[test]
fn pids_matching_finds_a_running_process() {
// pgrep -f matches the FULL command line, so the marker has to be
// in argv somewhere. Putting it in a shell comment doesn't work —
// sh strips it. Override argv[0] so the marker is durably visible.
use std::os::unix::process::CommandExt;
let marker = format!("kill-test-marker-{}-{}", std::process::id(), rand_u64());
let argv0 = format!("test-marker-{marker}");
let child: Child = Command::new("sleep")
.arg0(argv0)
.arg("60")
.stdout(Stdio::null())
.stderr(Stdio::null())
.stdin(Stdio::null())
.spawn()
.expect("spawn");
let child_pid = child.id();
let reaper = std::thread::spawn(move || {
let mut c = child;
let _ = c.wait();
});
// pgrep needs a moment to see the new process.
std::thread::sleep(std::time::Duration::from_millis(100));
let found = pids_matching(&marker);
assert!(
found.contains(&child_pid),
"pids_matching should find pid {child_pid} for marker '{marker}'; got {found:?}"
);
// Cleanup so the test doesn't leak a sleeper.
let _ = sigkill_pids_and_verify(&[child_pid]);
let _ = reaper.join();
}
#[test]
fn pids_matching_returns_empty_when_no_match() {
let pattern = format!("nonexistent-pattern-{}-{}", std::process::id(), rand_u64());
let found = pids_matching(&pattern);
assert!(found.is_empty(), "expected empty result, got {found:?}");
}
/// Cheap unique-ish u64 for distinguishing test invocations without a
/// dependency on a randomness crate.
fn rand_u64() -> u64 {
use std::time::{SystemTime, UNIX_EPOCH};
SystemTime::now()
.duration_since(UNIX_EPOCH)
.map(|d| d.as_nanos() as u64)
.unwrap_or(0)
}
#[test]
fn descendant_pids_of_real_process_tree() {
// Build a parent sh that spawns a child sleep. The descendants of
// the parent should include the sleep.
let parent: Child = Command::new("sh")
.args(["-c", "sleep 60"])
.stdout(Stdio::null())
.stderr(Stdio::null())
.stdin(Stdio::null())
.spawn()
.expect("spawn parent");
let parent_pid = parent.id();
let reaper = std::thread::spawn(move || {
let mut c = parent;
let _ = c.wait();
});
// Let the shell get around to fork+execing its child.
std::thread::sleep(std::time::Duration::from_millis(200));
let descendants = descendant_pids(parent_pid);
// On some shells `sh -c "sleep N"` exec-replaces sh with sleep, leaving
// zero descendants. On others it forks. We don't care which; we only
// care that the function doesn't panic and returns a sensible vec.
assert!(
descendants.iter().all(|&pid| pid != parent_pid),
"descendant_pids must not include the root itself: {descendants:?}"
);
// Cleanup: kill the parent and any descendants.
let mut all = descendants;
all.push(parent_pid);
let _ = sigkill_pids_and_verify(&all);
let _ = reaper.join();
}
}
+8
@@ -62,6 +62,9 @@ pub struct WorkItemContent {
pub stage: crate::pipeline_state::Stage,
pub name: String,
pub agent: Option<crate::config::AgentName>,
/// Origin of the work item (story 1088). `None` for items that pre-date
/// the origin register; the web UI renders these as `"unknown"`.
pub origin: Option<String>,
}
/// A single entry in the project's configured agent roster.
@@ -176,6 +179,9 @@ pub fn get_work_item_content(
.map(|v| v.name().to_string())
.unwrap_or_default();
let crdt_agent = crdt_view.as_ref().and_then(|v| v.agent());
let crdt_origin = crdt_view
.as_ref()
.and_then(|v| v.origin().map(str::to_string));
for (stage_dir, stage) in &stages {
if let Some(content) = io::read_work_item_from_stage(&work_dir, stage_dir, &filename)? {
@@ -184,6 +190,7 @@ pub fn get_work_item_content(
stage: stage.clone(),
name: crdt_name.clone(),
agent: crdt_agent,
origin: crdt_origin.clone(),
});
}
}
@@ -201,6 +208,7 @@ pub fn get_work_item_content(
stage,
name: crdt_name,
agent: crdt_agent,
origin: crdt_origin,
});
}
@@ -26,6 +26,8 @@ pub enum EventAction {
/// `true` if acceptance gates passed.
success: bool,
},
/// Post a new-item-created notification.
NewItemCreated,
/// Log server-side only; do not post to chat (e.g. hard rate-limit blocks).
LogOnly,
/// Reload the project configuration.
@@ -51,6 +53,7 @@ pub fn classify(event: &WatcherEvent) -> EventAction {
WatcherEvent::AgentCompleted { success, .. } => {
EventAction::AgentCompleted { success: *success }
}
WatcherEvent::NewItemCreated { .. } => EventAction::NewItemCreated,
_ => EventAction::Skip,
}
}
@@ -178,4 +181,14 @@ mod tests {
EventAction::AgentCompleted { success: false }
);
}
#[test]
fn new_item_created_is_classified_correctly() {
let event = WatcherEvent::NewItemCreated {
item_id: "1075_refactor_split_stage".to_string(),
item_type: "refactor".to_string(),
name: "Split Stage enum".to_string(),
};
assert_eq!(classify(&event), EventAction::NewItemCreated);
}
}
+159 -13
@@ -220,21 +220,48 @@ pub fn format_agent_completed_notification(
(plain, html)
}
/// Extract the first non-empty line from a merge failure reason, truncated to `max_len` chars.
///
/// Used to produce a compact snippet for chat notifications.
pub fn merge_failure_snippet(reason: &str, max_len: usize) -> String {
let line = reason
.lines()
.find(|l| !l.trim().is_empty())
.unwrap_or(reason);
let mut chars = line.chars();
let truncated: String = chars.by_ref().take(max_len).collect();
if chars.next().is_some() {
format!("{truncated}\u{2026}") // append …
} else {
truncated
/// Format a new-work-item creation notification.
///
/// Returns `(plain_text, html)` suitable for `ChatTransport::send_message`.
pub fn format_new_item_notification(
item_id: &str,
item_type: &str,
name: &str,
) -> (String, String) {
let number = extract_item_number(item_id).unwrap_or(item_id);
let emoji = match item_type {
"bug" => "\u{1f41b}", // 🐛
"refactor" => "\u{1f4dd}", // 📝
"spike" => "\u{1f52c}", // 🔬
_ => "\u{1f4d6}", // 📖 (story and unknown)
};
let plain = format!("{emoji} New {item_type} #{number} \u{2014} {name}");
let html = format!("{emoji} New {item_type} <strong>#{number}</strong> \u{2014} {name}");
(plain, html)
}
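`extract_item_number` is not part of this diff; the tests below imply it splits the leading digit run off an id like `42_story_my_feature`. A hypothetical sketch of that behaviour (the helper name and signature here are assumptions, not the crate's code):

```rust
// Hypothetical sketch of the leading-number extraction the tests imply:
// "42_story_my_feature" -> Some("42"); ids without a numeric prefix -> None.
fn extract_item_number(item_id: &str) -> Option<&str> {
    let digits = item_id.split('_').next()?;
    (!digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit())).then_some(digits)
}

fn main() {
    assert_eq!(extract_item_number("42_story_my_feature"), Some("42"));
    assert_eq!(extract_item_number("no_number_here"), None);
}
```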
/// Maximum number of trailing gate-output lines included in a merge-failure
/// chat notification.
///
/// Gate output can be hundreds of lines; only the tail (where errors appear)
/// is useful at a glance. Full output remains available via `get_merge_status`
/// or the web UI — this limit is chat-display-only.
pub const MERGE_FAILURE_TAIL_LINES: usize = 30;
/// Truncate `gate_output` to its last `max_lines` lines for chat notifications.
///
/// If the output contains more than `max_lines` non-empty lines, a leading
/// marker line `[...output truncated, last N lines shown...]` is prepended to
/// the tail so readers know output was cut. If the output fits within the
/// limit it is returned unchanged (no marker added).
pub fn truncate_gate_output(gate_output: &str, max_lines: usize) -> String {
let lines: Vec<&str> = gate_output.lines().collect();
if lines.len() <= max_lines {
return gate_output.to_string();
}
let tail = &lines[lines.len() - max_lines..];
let marker = format!("[...output truncated, last {max_lines} lines shown...]");
format!("{marker}\n{}", tail.join("\n"))
}
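The helper's marker/tail behaviour in isolation — a self-contained copy of the function above with a runnable check of the documented contract:

```rust
// Verbatim restatement of the tail-truncation helper for a standalone check.
fn truncate_gate_output(gate_output: &str, max_lines: usize) -> String {
    let lines: Vec<&str> = gate_output.lines().collect();
    if lines.len() <= max_lines {
        return gate_output.to_string();
    }
    let tail = &lines[lines.len() - max_lines..];
    let marker = format!("[...output truncated, last {max_lines} lines shown...]");
    format!("{marker}\n{}", tail.join("\n"))
}

fn main() {
    let output: String = (1..=35)
        .map(|i| format!("line{i}"))
        .collect::<Vec<_>>()
        .join("\n");
    let result = truncate_gate_output(&output, 30);
    // Marker leads, the last line survives, and line6 is the first kept line.
    assert!(result.starts_with("[...output truncated, last 30 lines shown...]"));
    assert!(result.ends_with("line35"));
    assert!(result.contains("\nline6\n"));
}
```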
#[cfg(test)]
@@ -568,6 +595,64 @@ mod tests {
assert_eq!(plain, "\u{1F916} #42 \u{2014} coder-1 started");
}
// ── truncate_gate_output ──────────────────────────────────────────────────
#[test]
fn truncate_gate_output_short_output_returned_unchanged() {
let output = "line1\nline2\nline3";
assert_eq!(truncate_gate_output(output, 30), output);
}
#[test]
fn truncate_gate_output_exact_limit_returned_unchanged() {
let lines: Vec<String> = (1..=30).map(|i| format!("line{i}")).collect();
let output = lines.join("\n");
assert_eq!(truncate_gate_output(&output, 30), output);
}
#[test]
fn truncate_gate_output_over_limit_prepends_marker() {
let lines: Vec<String> = (1..=35).map(|i| format!("line{i}")).collect();
let output = lines.join("\n");
let result = truncate_gate_output(&output, 30);
assert!(
result.starts_with("[...output truncated, last 30 lines shown...]"),
"must start with truncation marker; got: {result}"
);
}
#[test]
fn truncate_gate_output_over_limit_contains_tail_lines() {
let lines: Vec<String> = (1..=35).map(|i| format!("line{i}")).collect();
let output = lines.join("\n");
let result = truncate_gate_output(&output, 30);
// Last 30 lines are line6..line35.
assert!(result.contains("line35"), "must contain last line");
assert!(result.contains("line6"), "must contain first tail line");
assert!(!result.contains("line5"), "must not contain dropped line");
}
#[test]
fn truncate_gate_output_empty_input_returned_unchanged() {
assert_eq!(truncate_gate_output("", 30), "");
}
#[test]
fn truncate_gate_output_single_line_returned_unchanged() {
assert_eq!(truncate_gate_output("only one line", 30), "only one line");
}
#[test]
fn truncate_gate_output_marker_contains_configured_limit() {
let lines: Vec<String> = (1..=10).map(|i| format!("x{i}")).collect();
let output = lines.join("\n");
let result = truncate_gate_output(&output, 5);
assert!(
result.contains("last 5 lines shown"),
"marker must state configured limit; got: {result}"
);
}
// ── format_agent_completed_notification ───────────────────────────────────
#[test]
@@ -599,6 +684,67 @@ mod tests {
);
}
// ── format_new_item_notification ──────────────────────────────────────────
#[test]
fn format_new_item_notification_story() {
let (plain, html) =
format_new_item_notification("42_story_my_feature", "story", "My Feature");
assert_eq!(plain, "\u{1f4d6} New story #42 \u{2014} My Feature");
assert_eq!(
html,
"\u{1f4d6} New story <strong>#42</strong> \u{2014} My Feature"
);
}
#[test]
fn format_new_item_notification_bug() {
let (plain, html) =
format_new_item_notification("99_bug_login_crash", "bug", "Login Crash");
assert_eq!(plain, "\u{1f41b} New bug #99 \u{2014} Login Crash");
assert_eq!(
html,
"\u{1f41b} New bug <strong>#99</strong> \u{2014} Login Crash"
);
}
#[test]
fn format_new_item_notification_refactor() {
let (plain, html) = format_new_item_notification(
"1075_refactor_split_stage",
"refactor",
"Split Stage enum into Pipeline + Status",
);
assert_eq!(
plain,
"\u{1f4dd} New refactor #1075 \u{2014} Split Stage enum into Pipeline + Status"
);
assert_eq!(
html,
"\u{1f4dd} New refactor <strong>#1075</strong> \u{2014} Split Stage enum into Pipeline + Status"
);
}
#[test]
fn format_new_item_notification_spike() {
let (plain, html) =
format_new_item_notification("7_spike_encoder_comparison", "spike", "Compare Encoders");
assert_eq!(plain, "\u{1f52c} New spike #7 \u{2014} Compare Encoders");
assert_eq!(
html,
"\u{1f52c} New spike <strong>#7</strong> \u{2014} Compare Encoders"
);
}
#[test]
fn format_new_item_notification_non_numeric_id_uses_full_id() {
let (plain, _html) = format_new_item_notification("abc_story_thing", "story", "Some Story");
assert_eq!(
plain,
"\u{1f4d6} New story #abc_story_thing \u{2014} Some Story"
);
}
#[test]
fn format_agent_completed_notification_empty_name_falls_back_to_number() {
let (plain, _html) =
@@ -14,9 +14,10 @@ use tokio::sync::broadcast;
use super::super::events::classify;
use super::super::filter::{AGENT_EVENT_DEBOUNCE, should_send_rate_limit};
use super::super::format::{
-format_agent_completed_notification, format_agent_started_notification,
-format_blocked_notification, format_error_notification, format_oauth_account_swapped,
-format_oauth_accounts_exhausted, format_rate_limit_notification, merge_failure_snippet,
MERGE_FAILURE_TAIL_LINES, format_agent_completed_notification,
format_agent_started_notification, format_blocked_notification, format_error_notification,
format_new_item_notification, format_oauth_account_swapped, format_oauth_accounts_exhausted,
format_rate_limit_notification, truncate_gate_output,
};
use super::super::route::rooms_for_notification;
use super::{find_story_name_any_stage, read_story_name};
@@ -119,9 +120,7 @@ pub fn spawn_notification_listener(
continue;
};
let story_name = read_story_name(&project_root, "4_merge", story_id);
-// AC3: include only the first non-empty line of the failure,
-// truncated to ~120 chars.
-let snippet = merge_failure_snippet(reason, 120);
let snippet = truncate_gate_output(reason, MERGE_FAILURE_TAIL_LINES);
let (plain, html) = format_error_notification(story_id, &story_name, &snippet);
slog!("[bot] Sending error notification: {plain}");
for room_id in &rooms_for_notification(&get_room_ids) {
@@ -276,6 +275,26 @@ pub fn spawn_notification_listener(
pending_agent_events.insert(key, (plain, html));
agent_flush_deadline = Some(tokio::time::Instant::now() + AGENT_EVENT_DEBOUNCE);
}
EventAction::NewItemCreated => {
if !config.status_push_enabled {
continue;
}
let WatcherEvent::NewItemCreated {
ref item_id,
ref item_type,
ref name,
} = event
else {
continue;
};
let (plain, html) = format_new_item_notification(item_id, item_type, name);
slog!("[bot] Sending new-item notification: {plain}");
for room_id in &rooms_for_notification(&get_room_ids) {
if let Err(e) = transport.send_message(room_id, &plain, &html).await {
slog!("[bot] Failed to send new-item notification to {room_id}: {e}");
}
}
}
EventAction::LogOnly => {
// Hard-block: log server-side for debugging; do NOT post to chat.
// Hard-block auto-resume is normal operation — the status command
@@ -5,6 +5,89 @@ use super::spawn_notification_listener;
use crate::io::watcher::WatcherEvent;
use tokio::sync::broadcast;
// ── spawn_notification_listener: MergeFailure ────────────────────────────────
/// Long gate output is truncated to the tail and includes the marker line.
#[tokio::test]
async fn merge_failure_long_output_is_truncated_to_tail() {
let tmp = tempfile::tempdir().unwrap();
let (watcher_tx, watcher_rx) = broadcast::channel::<WatcherEvent>(16);
let (transport, calls) = MockTransport::new();
spawn_notification_listener(
transport,
|| vec!["!room1:example.org".to_string()],
watcher_rx,
tmp.path().to_path_buf(),
);
// Build a reason with 50 lines (more than MERGE_FAILURE_TAIL_LINES = 30).
let long_reason: String = (1..=50).map(|i| format!("gate-line-{i}\n")).collect();
watcher_tx
.send(WatcherEvent::MergeFailure {
story_id: "1077_story_trunc".to_string(),
reason: long_reason,
})
.unwrap();
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
let calls = calls.lock().unwrap();
assert_eq!(calls.len(), 1, "Expected exactly one notification");
let (_, plain, _) = &calls[0];
assert!(
plain.contains("truncated"),
"notification must contain the truncation marker; got: {plain}"
);
assert!(
plain.contains("gate-line-50"),
"notification must contain the last line; got: {plain}"
);
assert!(
!plain.contains("gate-line-1\n"),
"notification must not contain the first (dropped) line; got: {plain}"
);
}
/// Short gate output (within limit) passes through unchanged, no marker added.
#[tokio::test]
async fn merge_failure_short_output_passes_through_unchanged() {
let tmp = tempfile::tempdir().unwrap();
let (watcher_tx, watcher_rx) = broadcast::channel::<WatcherEvent>(16);
let (transport, calls) = MockTransport::new();
spawn_notification_listener(
transport,
|| vec!["!room1:example.org".to_string()],
watcher_rx,
tmp.path().to_path_buf(),
);
let short_reason = "error: type mismatch on line 42\nexpected i32, found &str".to_string();
watcher_tx
.send(WatcherEvent::MergeFailure {
story_id: "1077_story_short".to_string(),
reason: short_reason.clone(),
})
.unwrap();
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
let calls = calls.lock().unwrap();
assert_eq!(calls.len(), 1, "Expected exactly one notification");
let (_, plain, _) = &calls[0];
assert!(
!plain.contains("truncated"),
"short output must not have a truncation marker; got: {plain}"
);
assert!(
plain.contains("type mismatch"),
"short output must be included verbatim; got: {plain}"
);
}
// ── spawn_notification_listener: RateLimitWarning ────────────────────────────
/// AC2 + AC3: when a RateLimitWarning event arrives, send_message is called
@@ -191,6 +191,7 @@ mod tests {
watcher: crate::config::WatcherConfig {
sweep_interval_secs: 30,
done_retention_secs: 7200,
reconcile_interval_secs: 30,
},
..Default::default()
};
@@ -37,6 +37,8 @@ pub fn watcher_event_to_response(e: WatcherEvent) -> Option<WsResponse> {
// Agent lifecycle events are forwarded to chat transports only; no WebSocket message.
WatcherEvent::AgentStarted { .. } => None,
WatcherEvent::AgentCompleted { .. } => None,
// Creation notifications are forwarded to chat transports only; no WebSocket message.
WatcherEvent::NewItemCreated { .. } => None,
}
}
@@ -210,6 +212,8 @@ mod tests {
error: None,
merge_failure: None,
agent: None,
pipeline: crate::pipeline_state::Pipeline::Backlog,
status: crate::pipeline_state::Status::Active,
review_hold: None,
qa: None,
retry_count: None,
@@ -224,6 +228,8 @@ mod tests {
error: None,
merge_failure: None,
agent: None,
pipeline: crate::pipeline_state::Pipeline::Coding,
status: crate::pipeline_state::Status::Active,
review_hold: None,
qa: None,
retry_count: None,
@@ -240,6 +246,8 @@ mod tests {
error: None,
merge_failure: None,
agent: None,
pipeline: crate::pipeline_state::Pipeline::Done,
status: crate::pipeline_state::Status::Done,
review_hold: None,
qa: None,
retry_count: None,
@@ -301,6 +309,8 @@ mod tests {
model: Some(crate::agents::AgentModel::Sonnet),
status: crate::agents::AgentStatus::Running,
}),
pipeline: crate::pipeline_state::Pipeline::Coding,
status: crate::pipeline_state::Status::Active,
review_hold: None,
qa: None,
retry_count: None,
@@ -205,6 +205,8 @@ mod tests {
error: None,
merge_failure: None,
agent: None,
pipeline: crate::pipeline_state::Pipeline::Backlog,
status: crate::pipeline_state::Status::Active,
review_hold: None,
qa: None,
retry_count: None,
@@ -217,7 +217,13 @@ async fn migrate_json_stores_to_sqlite(huskies_dir: &Path) {
}
/// Set up the server log file, node identity keypair, pipeline DB, and CRDT state.
-pub(crate) async fn init_subsystems(app_state: &Arc<SessionState>, cwd: &Path) {
///
/// When `is_agent` is `true` the pipeline database is opened at an isolated
/// temporary path (or at `HUSKIES_DB_PATH` if that env-var is set) so that the
/// headless build agent never touches the production `.huskies/pipeline.db`.
/// This prevents feature-branch migrations from being applied to the shared
/// database and bricking the next server restart.
pub(crate) async fn init_subsystems(app_state: &Arc<SessionState>, cwd: &Path, is_agent: bool) {
// Enable persistent server log file now that the project root is known.
if let Some(ref root) = *app_state.project_root.lock().unwrap() {
let log_dir = root.join(".huskies").join("logs");
@@ -242,20 +248,91 @@ pub(crate) async fn init_subsystems(app_state: &Arc<SessionState>, cwd: &Path) {
}
}
-// Initialise the SQLite pipeline shadow-write database and CRDT state layer.
-// Clone the path out before the await so we don't hold the MutexGuard across
-// an await point.
-let pipeline_db_path = app_state
-.project_root
-.lock()
-.unwrap()
-.as_ref()
-.map(|root| root.join(".huskies").join("pipeline.db"));
// Resolve the pipeline DB path.
//
// Priority order:
// 1. HUSKIES_DB_PATH env var (operator override, any mode)
// 2. Agent mode: process-local temp file so the production DB is never touched
// 3. Default: {project_root}/.huskies/pipeline.db
let pipeline_db_path: Option<PathBuf> = if let Ok(env_path) = std::env::var("HUSKIES_DB_PATH") {
let p = PathBuf::from(&env_path);
crate::slog!("[db] HUSKIES_DB_PATH override: {}", p.display());
Some(p)
} else if is_agent {
// Headless agent: use an isolated temp DB so that any migrations compiled
// into this binary (e.g. from a feature branch) are never applied to the
// production database. The temp file is process-unique and harmless to
// leave behind after the agent exits.
let pid = std::process::id();
let temp_path = std::env::temp_dir().join(format!("huskies-agent-{pid}.db"));
crate::slog!(
"[db] Agent mode: using isolated DB at {} (not touching production pipeline.db)",
temp_path.display()
);
Some(temp_path)
} else {
// Server mode: use the project-local production database.
app_state
.project_root
.lock()
.unwrap()
.as_ref()
.map(|root| root.join(".huskies").join("pipeline.db"))
};
if let Some(ref db_path) = pipeline_db_path {
if let Err(e) = db::init(db_path).await {
crate::slog!("[db] Failed to initialise pipeline.db: {e}");
} else {
// ── Migration drift self-check (server mode only) ─────────────────────
//
// In server mode, detect whether the live database contains migrations
// that were applied by a newer binary (e.g. a feature-branch agent that
// ran before the feature was merged). If so, log each unknown migration
// and exit with a clear actionable message. This is the root cause of
// the 2026-05-14 21:07 production outage where the server came up but
// the CRDT never initialised.
if !is_agent && let Some(pool) = db::get_shared_pool() {
let drift = db::check_schema_drift(pool).await;
if !drift.is_empty() {
for m in &drift {
crate::slog!(
"[db] UNKNOWN migration {} ('{}') applied at {} \
is not in the compiled-in set",
m.version,
m.description,
m.installed_on,
);
}
eprintln!();
eprintln!(
"error: pipeline.db contains {} migration(s) that are not \
recognised by this binary:",
drift.len()
);
for m in &drift {
eprintln!(
" \u{2022} migration {} ('{}') applied at {}",
m.version, m.description, m.installed_on
);
}
eprintln!();
eprintln!(
"This means the database was previously opened by a newer \
version of huskies."
);
eprintln!(
"To fix: rebuild huskies from the latest source (the branch \
that added these migrations) and restart."
);
eprintln!(
"Do NOT start the old binary against this database — it will \
behave incorrectly."
);
std::process::exit(1);
}
}
// One-shot migration: move any existing JSON store files into SQLite.
let huskies_dir = db_path.parent().unwrap_or(db_path);
migrate_json_stores_to_sqlite(huskies_dir).await;
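The three-way resolution above reduces to a small pure function. This sketch is hypothetical (the helper `resolve_db_path` does not exist in the crate; the env lookup, `slog!` calls, and process id are factored out as parameters so the precedence itself is testable):

```rust
use std::path::PathBuf;

// Hypothetical sketch of the DB-path precedence in init_subsystems:
// env override, then agent-mode temp isolation, then the project default.
fn resolve_db_path(
    env_override: Option<&str>,
    is_agent: bool,
    project_root: Option<&str>,
    pid: u32,
) -> Option<PathBuf> {
    if let Some(p) = env_override {
        // 1. HUSKIES_DB_PATH operator override wins in any mode.
        return Some(PathBuf::from(p));
    }
    if is_agent {
        // 2. Agent mode: process-unique temp DB; production pipeline.db untouched.
        return Some(std::env::temp_dir().join(format!("huskies-agent-{pid}.db")));
    }
    // 3. Server mode: {project_root}/.huskies/pipeline.db.
    project_root.map(|r| PathBuf::from(r).join(".huskies").join("pipeline.db"))
}
```

Note the agent path depends only on the pid, which is why a leftover temp file from a previous run is harmless: the next agent process gets a fresh name.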
@@ -156,6 +156,17 @@ pub(crate) fn spawn_tick_loop(
{scheduled_count} scheduled timer(s)"
);
let (reconcile_interval, done_retention) = root
.as_ref()
.and_then(|r| config::ProjectConfig::load(r).ok())
.map(|c| {
(
c.watcher.reconcile_interval_secs,
std::time::Duration::from_secs(c.watcher.done_retention_secs),
)
})
.unwrap_or((30, std::time::Duration::from_secs(4 * 3600)));
tokio::spawn(async move {
let mut interval = tokio::time::interval(std::time::Duration::from_secs(1));
let mut tick_count: u64 = 0;
@@ -190,6 +201,15 @@
}
agents.reap_stale_merge_jobs();
}
// Periodic reconciler: converge subscriber side effects so that
// Lagged broadcast events never leave state permanently diverged.
if tick_count.is_multiple_of(reconcile_interval)
&& let Some(ref r) = root
{
crate::slog!("[reconcile] Running periodic reconcile pass.");
run_reconcile_pass(r, &agents, done_retention).await;
}
}
});
}
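The gate in the tick loop is a plain modulo check against the 1-second tick counter. A hedged sketch (the helper `reconcile_due` is illustrative, not in the crate; it treats a zero interval as "never" rather than firing every tick):

```rust
// Hypothetical helper capturing the tick-loop gate above: with a 1-second
// tick, the reconcile pass fires once every `reconcile_interval` ticks.
// A zero interval means "never", avoiding a modulo-by-zero panic.
fn reconcile_due(tick_count: u64, reconcile_interval: u64) -> bool {
    reconcile_interval != 0 && tick_count % reconcile_interval == 0
}
```

With the default `reconcile_interval_secs: 30`, the pass runs twice a minute, which bounds how long a Lagged subscriber can stay diverged.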
@@ -450,16 +470,50 @@ async fn execute_prompt_action(
}
}
-/// Spawn the startup reconstruction task: replay the current pipeline state
-/// through the [`TransitionFired`][crate::pipeline_state::TransitionFired]
-/// broadcast channel so that all existing subscribers (worktree lifecycle,
-/// merge-failure auto-spawn, auto-assign) react identically to a live
-/// transition, then trigger a full auto-assign pass.
-///
-/// Replaces the legacy scan-based `reconcile_on_startup` approach. The CRDT
-/// is the durable source of truth; replaying it as synthetic self-transitions
-/// is cheaper, simpler, and idempotent: a second replay produces another burst
-/// of events that subscribers safely ignore for already-assigned stories.
/// Run one full reconcile pass: call each subscriber's idempotent `reconcile()`
/// entry point so that side effects converge regardless of whether the
/// broadcast channel lagged during startup or at runtime.
///
/// Safe to call any number of times — every reconcile function is idempotent.
pub(crate) async fn run_reconcile_pass(
root: &std::path::Path,
agents: &Arc<AgentPool>,
done_retention: std::time::Duration,
) {
// Content-GC: purge content-store entries for terminal/tombstoned stories.
crate::db::gc::sweep_zombie_content_on_startup();
// Worktree create: ensure every Coding story has a worktree.
crate::agents::pool::worktree_lifecycle::reconcile_worktree_create(root, agents.port()).await;
// Worktree cleanup: remove worktrees for terminal stories.
crate::agents::pool::worktree_lifecycle::reconcile_worktree_cleanup(root).await;
// Done-archive: archive Done stories whose retention period has elapsed.
crate::io::watcher::sweep_done_to_archived(done_retention);
// Cost-rollup: re-populate the in-memory register from disk.
crate::agents::pool::cost_rollup_subscriber::reconcile_cost_rollup(root);
// Merge-failure: spawn mergemaster for ConflictDetected stories with no active agent.
crate::agents::pool::auto_assign::reconcile_merge_failure(agents, root).await;
// Merge-block: no-op (in-memory counter cannot be reconstructed from CRDT).
crate::agents::pool::auto_assign::reconcile_merge_failure_block();
// Audit-log: no-op (historical replay would produce misleading entries).
crate::pipeline_state::reconcile_audit_log();
}
/// Spawn the startup reconciliation task: run a full reconcile pass so that all
/// side-effect subscribers converge on the current CRDT state without flooding
/// the broadcast channel, then trigger a full auto-assign pass.
///
/// Replaces the former `replay_current_pipeline_state()` approach, which
/// sent one synthetic `TransitionFired` per CRDT item through the broadcast
/// channel. With >256 items that caused `Subscriber lagged` warnings and
/// left subscribers with diverged state. Direct reconcile calls bypass the
/// channel entirely and scale to any CRDT size.
pub(crate) fn spawn_startup_reconciliation(
startup_root: Option<PathBuf>,
startup_agents: Arc<AgentPool>,
@@ -467,20 +521,189 @@
) {
if let Some(root) = startup_root {
tokio::spawn(async move {
-// Purge content-store entries for stories that reached terminal
-// stages in a previous session (before the GC subscriber was active).
-crate::db::gc::sweep_zombie_content_on_startup();
-crate::slog!(
-"[startup] Replaying current pipeline state through TransitionFired channel."
-);
-crate::pipeline_state::replay_current_pipeline_state();
let done_retention = crate::config::ProjectConfig::load(&root)
.map(|c| std::time::Duration::from_secs(c.watcher.done_retention_secs))
.unwrap_or_else(|_| std::time::Duration::from_secs(4 * 3600));
crate::slog!("[startup] Running per-subscriber reconcile pass.");
run_reconcile_pass(&root, &startup_agents, done_retention).await;
crate::slog!("[auto-assign] Scanning pipeline stages for unassigned work.");
startup_agents.auto_assign_available_work(&root).await;
let _ = startup_reconciliation_tx.send(ReconciliationEvent {
story_id: String::new(),
status: "done".to_string(),
-message: "Startup event replay complete.".to_string(),
message: "Startup reconcile pass complete.".to_string(),
});
});
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::db::{
ContentKey, ItemMeta, ensure_content_store, write_content, write_item_with_content,
};
use crate::io::watcher::WatcherEvent;
use tokio::sync::broadcast;
fn make_pool() -> Arc<AgentPool> {
let (tx, _) = broadcast::channel::<WatcherEvent>(16);
Arc::new(AgentPool::new(3099, tx))
}
fn setup_huskies_dir(tmp: &tempfile::TempDir) -> std::path::PathBuf {
let root = tmp.path().to_path_buf();
std::fs::create_dir_all(root.join(".huskies")).unwrap();
std::fs::write(root.join(".huskies/project.toml"), "").unwrap();
root
}
/// AC4 + AC6: seeding >256 CRDT items and running the reconcile pass must not
/// produce any "Subscriber lagged" warnings (structural guarantee — the new
/// path never broadcasts through the channel) and must purge zombie content
/// for all terminal stories after one reconcile tick.
///
/// Distribution: 300 Backlog + 200 Coding + 200 Abandoned (terminal) + 300 QA
/// = 1000 items. Each of the 200 Abandoned stories gets a content-store entry
/// seeded before the reconcile so we can assert it is cleaned up.
#[tokio::test]
async fn reconcile_pass_scales_to_1000_items_without_lagged_divergence() {
crate::crdt_state::init_for_test();
ensure_content_store();
let tmp = tempfile::tempdir().unwrap();
let root = setup_huskies_dir(&tmp);
let pool = make_pool();
// ── Seed 1000 items across several stages ──────────────────────────
for i in 0..300u32 {
let id = format!("1066_backlog_{i:04}");
write_item_with_content(
&id,
"1_backlog",
"---\nname: Backlog\n---\n",
ItemMeta::named("Backlog"),
);
}
for i in 0..200u32 {
let id = format!("1066_coding_{i:04}");
write_item_with_content(
&id,
"2_current",
"---\nname: Coding\n---\n",
ItemMeta::named("Coding"),
);
}
for i in 0..200u32 {
let id = format!("1066_abandoned_{i:04}");
write_item_with_content(
&id,
"2_current",
"---\nname: Abandoned\n---\n",
ItemMeta::named("Abandoned"),
);
// Move to terminal stage (Abandoned).
crate::agents::lifecycle::abandon_story(&id).expect("abandon must succeed");
// Seed a content-store entry to verify GC cleans it up.
write_content(ContentKey::Story(&id), "zombie content");
}
for i in 0..300u32 {
let id = format!("1066_qa_{i:04}");
write_item_with_content(&id, "3_qa", "---\nname: QA\n---\n", ItemMeta::named("QA"));
}
// ── Subscribe BEFORE the reconcile to catch any Lagged events ──────
let mut transition_rx = crate::pipeline_state::subscribe_transitions();
// ── Run one reconcile pass ─────────────────────────────────────────
// Use zero retention so any Done items (none here, but defensive) archive immediately.
run_reconcile_pass(&root, &pool, std::time::Duration::ZERO).await;
// ── Drain the transition channel; must contain zero Lagged events ──
// The reconcile path never broadcasts through TRANSITION_TX, so any
// events here are from the abandon_story calls above (all pre-reconcile).
let mut lagged_count = 0u64;
loop {
match transition_rx.try_recv() {
Ok(_) => {}
Err(tokio::sync::broadcast::error::TryRecvError::Lagged(n)) => {
lagged_count += n;
}
Err(tokio::sync::broadcast::error::TryRecvError::Empty)
| Err(tokio::sync::broadcast::error::TryRecvError::Closed) => break,
}
}
// The reconcile pass itself must not have sent anything through the channel.
// (abandon_story above may have sent some events, but those are pre-reconcile
// lifecycle transitions, not the reconcile itself.)
assert_eq!(
lagged_count, 0,
"run_reconcile_pass must not broadcast through the transition channel (no Lagged)"
);
// ── Assert: zombie content purged for all 200 Abandoned stories ────
for i in 0..200u32 {
let id = format!("1066_abandoned_{i:04}");
assert!(
crate::db::read_content(ContentKey::Story(&id)).is_none(),
"zombie content must be purged for abandoned story {id}"
);
}
}
/// AC4 regression: the subscriber channel (capacity 256) must not lag when
/// 1000 items are seeded — the reconcile path bypasses the channel entirely.
#[tokio::test]
async fn reconcile_never_floods_broadcast_channel() {
crate::crdt_state::init_for_test();
ensure_content_store();
let tmp = tempfile::tempdir().unwrap();
let root = setup_huskies_dir(&tmp);
let pool = make_pool();
// Seed 1000 Backlog items (no lifecycle transitions — clean slate).
for i in 0..1000u32 {
let id = format!("1066_flood_{i:04}");
write_item_with_content(
&id,
"1_backlog",
"---\nname: Flood\n---\n",
ItemMeta::named("Flood"),
);
}
// Subscribe and drain pre-existing channel noise. Note: `TRANSITION_TX`
// is a single process-global broadcast channel shared by every test in
// this binary, so other tests running on parallel threads may write to
// it during our window. We can't assert `msg_count == 0` — that's
// racy by construction. The real "never floods" invariant is captured
// by the Lagged check: 1000 seeded items must not overflow the
// 256-slot channel, which is only possible if the reconcile path
// bypasses the broadcast (which is what AC4 requires).
let mut rx = crate::pipeline_state::subscribe_transitions();
while let Ok(_) | Err(tokio::sync::broadcast::error::TryRecvError::Lagged(_)) =
rx.try_recv()
{}
run_reconcile_pass(&root, &pool, std::time::Duration::ZERO).await;
let mut lagged = false;
loop {
match rx.try_recv() {
Ok(_) => {}
Err(tokio::sync::broadcast::error::TryRecvError::Lagged(_)) => {
lagged = true;
break;
}
Err(_) => break,
}
}
assert!(
!lagged,
"run_reconcile_pass must never cause Lagged on the broadcast channel"
);
}
}
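The tests above share a drain-then-assert shape: empty the channel, run the operation, then check nothing (and no `Lagged`) arrived. An analogous sketch with std's mpsc channel (tokio's broadcast `TryRecvError::Lagged` has no std analog, so this only shows the draining half; `drain_count` is an illustrative helper):

```rust
use std::sync::mpsc;

// Drain everything already queued on the receiver and count it. Used before
// the operation under test so later asserts see only new traffic.
fn drain_count<T>(rx: &mpsc::Receiver<T>) -> usize {
    let mut n = 0;
    while rx.try_recv().is_ok() {
        n += 1;
    }
    n
}
```

A second call right after the first always returns zero, which is the "channel stayed quiet" assertion in miniature.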
@@ -14,8 +14,26 @@ use super::{WorktreeInfo, worktree_path, write_mcp_json};
/// - Creates the worktree at `{project_root}/.huskies/worktrees/{story_id}`
/// on branch `feature/story-{story_id}`.
/// - Writes `.mcp.json` in the worktree pointing to the MCP server at `port`.
-/// - Runs setup commands from the config for each component.
/// - Runs setup commands from the config for each component **only on fresh
/// creation** — see below.
/// - If the worktree/branch already exists, reuses rather than errors.
///
/// **Idempotency on reuse:** when `wt_path` already exists, this function does
/// **not** re-run [`run_setup_commands`]. Setup commands typically include
/// destructive operations like `npm ci` (`rm -rf node_modules` then reinstall)
/// that, if run concurrently with another reuse from a different caller, leave
/// `node_modules` in a half-populated state (broken `.bin/*` symlinks pointing
/// at empty package directories). This used to be rare and tolerable, but
/// after story 1066 added a 30-second periodic reconciler that calls
/// `reconcile_worktree_create` → `create_worktree`, every Coding story got a
/// destructive `npm ci` every 30s — racing the merge-gate's own frontend
/// build and producing the `sh: 1: tsc: not found` failure that bricked
/// story 1086 retries on 2026-05-15.
///
/// The reuse path now matches the documented contract of
/// `reconcile_worktree_create`: "no-op for stories whose worktree already
/// exists." If a worktree is in a bad state and needs re-setup, the caller
/// must explicitly delete it and call `create_worktree` again.
pub async fn create_worktree(
project_root: &Path,
story_id: &str,
@@ -30,14 +48,15 @@ pub async fn create_worktree(
.unwrap_or_else(|| detect_base_branch(project_root));
let root = project_root.to_path_buf();
-// Already exists — reuse (ensure sparse checkout is configured)
// Already exists — reuse without re-running destructive setup commands.
// Sparse checkout is reconfigured (cheap, idempotent) and `.mcp.json` is
// rewritten in case the server port changed across restarts.
if wt_path.exists() {
let wt_clone = wt_path.clone();
tokio::task::spawn_blocking(move || configure_sparse_checkout(&wt_clone))
.await
.map_err(|e| format!("spawn_blocking: {e}"))??;
write_mcp_json(&wt_path, port)?;
-run_setup_commands(&wt_path, config).await;
return Ok(WorktreeInfo {
path: wt_path,
branch,
@@ -374,32 +393,80 @@ mod tests {
}
#[tokio::test]
-async fn create_worktree_reuse_succeeds_despite_setup_failure() {
async fn create_worktree_reuse_does_not_rerun_setup_commands() {
// Regression for the 2026-05-15 1086 outage: the reuse path used to
// re-run setup commands (including destructive `npm ci`). Combined
// with story 1066's 30-second periodic reconciler, this fired
// `npm ci` against every Coding story every 30s and caused
// `tsc: not found` gate failures. The reuse path must now be a
// no-op for setup commands.
let tmp = TempDir::new().unwrap();
let project_root = tmp.path().join("my-project");
fs::create_dir_all(&project_root).unwrap();
init_git_repo(&project_root);
// First creation — no setup commands, should succeed
-create_worktree(&project_root, "173_reuse_fail", &empty_config(), 3001)
create_worktree(&project_root, "173_reuse_no_setup", &empty_config(), 3001)
.await
.unwrap();
-// Second call — worktree exists, setup commands fail, must still succeed
// Second call — worktree exists. Setup commands are configured to
// FAIL (`exit 1`); if the reuse path were still running them, the
// failure log would surface — but more importantly, this test
// documents that the reuse path is expected to NEVER reach
// `run_setup_commands` and therefore can never produce a setup
// failure regardless of how broken the setup config is.
let result = create_worktree(
&project_root,
-"173_reuse_fail",
"173_reuse_no_setup",
&failing_setup_config(),
3002,
)
.await;
assert!(
result.is_ok(),
-"create_worktree reuse must succeed even if setup commands fail: {:?}",
"reuse must succeed and must not run setup commands: {:?}",
result.err()
);
}
#[tokio::test]
async fn create_worktree_reuse_does_not_create_setup_marker_file() {
// Stronger version of the above: assert that on reuse, a setup
// command that would have created a marker file does NOT run.
let tmp = TempDir::new().unwrap();
let project_root = tmp.path().join("my-project");
fs::create_dir_all(&project_root).unwrap();
init_git_repo(&project_root);
// First creation — no setup, so no marker yet.
let info = create_worktree(&project_root, "174_reuse_marker", &empty_config(), 3001)
.await
.unwrap();
let marker = info.path.join("__setup_ran__");
assert!(!marker.exists(), "no marker after empty-setup creation");
// Second call with a setup command that WOULD create the marker if
// run. The reuse path must not run it.
let cfg = ProjectConfig {
component: vec![ComponentConfig {
name: "marker".to_string(),
path: ".".to_string(),
setup: vec!["touch __setup_ran__".to_string()],
teardown: vec![],
}],
..empty_config()
};
create_worktree(&project_root, "174_reuse_marker", &cfg, 3002)
.await
.unwrap();
assert!(
!marker.exists(),
"reuse path must not run setup commands; marker file was created"
);
}
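The contract the two reuse tests pin down collapses to a one-line guard. A hedged sketch (the helper `should_run_setup` is illustrative only; the real reuse path still reconfigures sparse checkout and rewrites `.mcp.json`, skipping only `run_setup_commands`):

```rust
use std::path::Path;

// Illustrative guard for the reuse contract above: setup commands run only
// when the worktree path does not already exist.
fn should_run_setup(worktree_path: &Path) -> bool {
    !worktree_path.exists()
}
```

Putting the decision before any side effects is what makes the 30-second reconciler safe: repeated calls against an existing worktree take the reuse branch and never reach the destructive setup step.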
#[test]
fn install_pre_commit_hook_creates_executable_hook_and_sets_hookspath() {
let tmp = TempDir::new().unwrap();