huskies: merge 1066

This commit is contained in:
dave
2026-05-14 23:39:56 +00:00
parent bf813d910b
commit bb6a6063e8
15 changed files with 361 additions and 120 deletions
@@ -569,14 +569,15 @@ mod tests {
);
}
// ── AC4: startup event replay + pool reconstruction ──────────────────
// ── AC4: startup reconcile + pool reconstruction ──────────────────
/// AC4: Simulates a server restart by seeding the CRDT with a story in
/// Coding stage, calling `replay_current_pipeline_state` (the new startup
/// path), then `auto_assign_available_work`. Asserts the pool ends in the
/// expected state: exactly one agent assigned to the story.
/// Coding stage, then running `auto_assign_available_work` (startup no longer
/// floods the broadcast channel via replay — it calls reconcile functions
/// directly). Asserts the pool ends in the expected state: exactly one agent
/// assigned to the story, and a second pass does not double-spawn.
#[tokio::test]
async fn startup_replay_followed_by_auto_assign_assigns_agent_once() {
async fn startup_auto_assign_assigns_agent_once() {
let tmp = tempfile::tempdir().unwrap();
let sk = tmp.path().join(".huskies");
std::fs::create_dir_all(&sk).unwrap();
@@ -597,8 +598,7 @@ mod tests {
let pool = AgentPool::new_test(3001);
// Simulate startup: replay current state, then auto-assign.
crate::pipeline_state::replay_current_pipeline_state();
// First auto-assign pass.
pool.auto_assign_available_work(tmp.path()).await;
let count_after_first = {
@@ -612,8 +612,7 @@ mod tests {
.count()
};
// AC3 (idempotency): replaying twice must not double-spawn agents.
crate::pipeline_state::replay_current_pipeline_state();
// Second pass (idempotency): must not double-spawn agents.
pool.auto_assign_available_work(tmp.path()).await;
let count_after_second = {
@@ -629,11 +628,11 @@ mod tests {
assert!(
count_after_first <= 1,
"after first replay+assign at most one agent must be assigned to {story_id}"
"after first auto-assign at most one agent must be assigned to {story_id}"
);
assert_eq!(
count_after_first, count_after_second,
"second replay must not spawn additional agents (idempotency)"
"second auto-assign must not spawn additional agents (idempotency)"
);
}
}
@@ -21,6 +21,15 @@ use super::super::super::PipelineStage;
use super::super::AgentPool;
use super::scan::is_story_assigned_for_stage;
/// Reconcile hook for the merge-failure block subscriber — deliberately does nothing.
///
/// The block subscriber keeps its per-story consecutive-failure counter purely in
/// memory. CRDT state records only each story's current stage, not the history of
/// how many times it has failed, so that counter cannot be rebuilt from here.
/// Eventual consistency instead comes from the live subscriber reacting to every
/// new `MergeFailure` event; any work done by the periodic reconciler in this hook
/// would add no value and would risk spurious blocks.
pub(crate) fn reconcile_merge_failure_block() {
    // Intentionally empty: there is no state that can safely be reconstructed.
}
/// Spawn a background task that blocks stories after N consecutive `MergeFailure` transitions.
///
/// Subscribes to the pipeline transition broadcast channel and tracks a per-story
@@ -17,6 +17,30 @@ use super::super::super::PipelineStage;
use super::super::AgentPool;
use super::scan::{find_free_agent_for_stage, is_story_assigned_for_stage};
/// Reconcile pass for the mergemaster auto-spawn subscriber.
///
/// Walks every item returned by `read_all_typed`. For each story whose current
/// stage is `MergeFailure` with a `ConflictDetected` kind, it builds a synthetic
/// `TransitionFired` (with `before` and `after` both set to the current stage) and
/// hands it to `on_merge_failure_transition`, so that a mergemaster agent ends up
/// running for that story.
///
/// Idempotent — `on_merge_failure_transition` guards against double-spawning via
/// `is_story_assigned_for_stage`. The periodic reconciler calls this so that a
/// Lagged startup event never leaves a ConflictDetected story without a recovery
/// agent.
pub(crate) async fn reconcile_merge_failure(pool: &Arc<AgentPool>, project_root: &Path) {
    use crate::pipeline_state::{MergeFailureKind, PipelineEvent, Stage, TransitionFired};

    for item in crate::pipeline_state::read_all_typed() {
        // Only stories currently parked in a merge failure are of interest.
        let Stage::MergeFailure { kind, .. } = &item.stage else {
            continue;
        };
        // Of those, only conflict-detected failures get a recovery agent.
        if !matches!(kind, MergeFailureKind::ConflictDetected(_)) {
            continue;
        }

        // No real transition happened: replay the current stage as both ends of
        // a synthetic one so the regular handler can be reused unchanged.
        let synthetic = TransitionFired {
            story_id: item.story_id.clone(),
            before: item.stage.clone(),
            after: item.stage.clone(),
            event: PipelineEvent::MergeFailed { kind: kind.clone() },
            at: chrono::Utc::now(),
        };
        on_merge_failure_transition(pool, project_root, &synthetic).await;
    }
}
/// Spawn a background task that auto-spawns mergemaster agents on
/// `Stage::MergeFailure { kind: ConflictDetected(_) }` transitions.
///
@@ -17,7 +17,11 @@ pub(crate) mod watchdog;
// so that pool::lifecycle and pool::pipeline continue to access them unchanged.
pub(super) use scan::{find_free_agent_for_stage, is_agent_free};
/// Re-export for `startup::tick_loop`.
pub(crate) use merge_failure_block_subscriber::reconcile_merge_failure_block;
/// Re-export for `startup::tick_loop`.
pub(crate) use merge_failure_block_subscriber::spawn_merge_failure_block_subscriber;
/// Re-export for `startup::tick_loop`.
pub(crate) use merge_failure_subscriber::reconcile_merge_failure;
/// Re-export for `startup::tick_loop`.
pub(crate) use merge_failure_subscriber::spawn_merge_failure_subscriber;
@@ -13,6 +13,15 @@ use crate::pipeline_state::Stage;
use crate::slog;
use crate::slog_warn;
/// Reconcile pass for the CostRollup register: rebuild it from disk for all known stories.
///
/// Delegates entirely to `cost_rollup::init_from_disk`, which scans all existing
/// token-usage JSONL files under `project_root` and overwrites the in-memory
/// register — making this call idempotent. The periodic reconciler invokes it so
/// that a Lagged event can never leave a story with a stale or absent cost entry.
pub(crate) fn reconcile_cost_rollup(project_root: &Path) {
    use crate::service::agents::cost_rollup;

    cost_rollup::init_from_disk(project_root);
}
/// Spawn a background task that maintains the CostRollup register.
///
/// On every terminal stage transition (Done, Archived, Abandoned, Superseded,
@@ -72,6 +72,38 @@ pub(crate) fn spawn_worktree_cleanup_subscriber(project_root: PathBuf) {
});
}
/// Reconcile worktree creation: for each story currently in `Stage::Coding`, ensure its worktree exists.
///
/// Idempotent — creates worktrees for Coding stories that have no worktree yet, and is
/// a no-op for stories whose worktree already exists. Called by the periodic reconciler
/// so that Lagged events on the broadcast channel never leave Coding stories without worktrees.
pub(crate) async fn reconcile_worktree_create(project_root: &Path, port: u16) {
for item in crate::pipeline_state::read_all_typed() {
if matches!(item.stage, crate::pipeline_state::Stage::Coding { .. }) {
on_coding_transition(project_root, port, &item.story_id.0).await;
}
}
}
/// Reconcile worktree cleanup: for each story in a terminal stage, ensure its worktree is removed.
///
/// Idempotent — removes worktrees for terminal stories that still have one, and is a no-op
/// for stories with no worktree. Called by the periodic reconciler so that Lagged events on
/// the broadcast channel never leave terminal stories with dangling worktrees.
pub(crate) async fn reconcile_worktree_cleanup(project_root: &Path) {
for item in crate::pipeline_state::read_all_typed() {
if matches!(
item.stage,
crate::pipeline_state::Stage::Done { .. }
| crate::pipeline_state::Stage::Archived { .. }
| crate::pipeline_state::Stage::Abandoned { .. }
| crate::pipeline_state::Stage::Superseded { .. }
) {
on_terminal_transition(project_root, &item.story_id.0).await;
}
}
}
/// Create the worktree and feature branch for `story_id` when it enters `Stage::Coding`.
pub(crate) async fn on_coding_transition(project_root: &Path, port: u16, story_id: &str) {
let config = match crate::config::ProjectConfig::load(project_root) {