Files
huskies/server/src/agent_mode/claim.rs
T
2026-05-14 11:06:27 +00:00

316 lines
12 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Claim ownership logic: deterministic hash-based tie-breaking and TTL constants.
/// Default claim TTL in seconds. If a claim has not been refreshed within this
/// window, other nodes may displace the stale holder and claim the story.
/// A node actively working on a story should refresh its claim periodically.
pub(crate) const CLAIM_TIMEOUT_SECS: f64 = 1800.0; // 30 minutes
/// Interval between heartbeat writes and work scans.
pub const SCAN_INTERVAL_SECS: u64 = 15;
// ── Hash-based tie-break ──────────────────────────────────────────────────
/// Compute the claim-priority hash for a `(node_id, story_id)` pair.
///
/// Uses SHA-256(`node_id` bytes ++ `story_id` bytes), truncated to the first
/// 8 bytes interpreted as a big-endian `u64`. This function is:
///
/// * **Deterministic** — same inputs always produce the same output.
/// * **Stable across restarts** — depends only on the node's persistent id
/// and the story id, not on wall-clock time or random state.
/// * **Cross-implementation portable** — SHA-256 is a standard primitive; any
/// conforming implementation will produce identical values.
pub(super) fn claim_hash(node_id: &str, story_id: &str) -> u64 {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(node_id.as_bytes());
hasher.update(story_id.as_bytes());
let digest = hasher.finalize();
u64::from_be_bytes(digest[..8].try_into().expect("sha256 is 32 bytes"))
}
/// Decide whether this node should be the one to claim `story_id`.
///
/// Returns `true` iff `claim_hash(self_node_id, story_id)` is **strictly
/// lower** than the hash of every alive peer. When there are no alive peers
/// (single-node cluster) the result is always `true`.
///
/// # Trade-off note
/// Because the winning node is determined purely by the hash of its id and the
/// story id, the distribution is uniform per story but a given node may
/// consistently "win" or "lose" across a set of stories depending on how its
/// id happens to hash. For 25 node clusters this imbalance is negligible in
/// practice: any node is the lowest-hash winner with probability ≈ 1/N for a
/// random story id, so the long-run distribution is approximately fair. For
/// clusters with many nodes (e.g. >10) the expected variance is larger and
/// operators may want a different work-distribution strategy.
pub fn should_self_claim(
self_node_id: &str,
story_id: &str,
alive_peer_node_ids: &[String],
) -> bool {
let my_hash = claim_hash(self_node_id, story_id);
for peer_id in alive_peer_node_ids {
// Skip self if it appears in the peer list.
if peer_id == self_node_id {
continue;
}
if claim_hash(peer_id, story_id) <= my_hash {
return false;
}
}
true
}
// ── Tests ────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn claim_timeout_is_thirty_minutes() {
assert_eq!(CLAIM_TIMEOUT_SECS, 1800.0);
}
/// AC: seed a stale claim older than the TTL, attempt a new claim from a
/// different agent, assert the new claim succeeds and displacement is logged.
#[test]
#[allow(clippy::string_slice)] // stale_holder is a hex/ASCII string literal; [..12] always valid
fn stale_claim_displaced_and_logged() {
use crate::crdt_state::{init_for_test, our_node_id, read_item, write_claim, write_item};
use crate::pipeline_state::{AgentClaim, AgentName, Stage};
use chrono::TimeZone;
init_for_test();
let story_id = "718_test_stale_displacement";
let stale_holder = "staledeadbeef0000000000000000000000000000";
// Place claimed_at well beyond the TTL so the claim is unambiguously stale.
let stale_time = chrono::Utc::now().timestamp() as u64 - CLAIM_TIMEOUT_SECS as u64 - 300;
// Seed the story with a stale claim from a foreign node.
write_item(
story_id,
&Stage::Coding {
claim: Some(AgentClaim {
agent: AgentName(stale_holder.to_string()),
claimed_at: chrono::Utc
.timestamp_opt(stale_time as i64, 0)
.single()
.unwrap(),
}),
plan: Default::default(),
retries: 0,
},
Some("Stale Claim Displacement Test"),
None,
None,
None,
);
// Confirm the stale claim is in place.
let before = read_item(story_id).expect("item should exist");
let before_claim = match before.stage() {
Stage::Coding { claim, .. } => claim.as_ref(),
Stage::Merge { claim, .. } => claim.as_ref(),
_ => None,
};
assert_eq!(
before_claim.map(|c| c.agent.0.as_str()),
Some(stale_holder),
"pre-condition: item should be claimed by the stale holder"
);
let age = chrono::Utc::now().timestamp() as f64
- before_claim
.map(|c| c.claimed_at.timestamp() as f64)
.unwrap_or(0.0);
assert!(
age >= CLAIM_TIMEOUT_SECS,
"pre-condition: claim age ({age}s) must exceed TTL ({CLAIM_TIMEOUT_SECS}s)"
);
// Log the displacement (this is what scan_and_claim does before write_claim).
crate::slog!(
"[agent-mode] Displacing stale claim on '{}' held by {:.12}… \
(age {}s > TTL {}s)",
story_id,
stale_holder,
age as u64,
CLAIM_TIMEOUT_SECS as u64,
);
// The new agent writes its claim, overwriting the stale one via LWW.
let success = write_claim(story_id);
assert!(
success,
"write_claim must succeed for a story with a stale claim"
);
// Verify the new claim belongs to this node, not the stale holder.
let our_id = our_node_id().expect("node id should be available after init_for_test");
let after = read_item(story_id).expect("item should still exist");
let after_claim = match after.stage() {
Stage::Coding { claim, .. } => claim.as_ref(),
Stage::Merge { claim, .. } => claim.as_ref(),
_ => None,
};
assert_eq!(
after_claim.map(|c| c.agent.0.as_str()),
Some(our_id.as_str()),
"new claim should have displaced the stale holder"
);
assert_ne!(
after_claim.map(|c| c.agent.0.as_str()),
Some(stale_holder),
"stale holder must no longer own the claim"
);
// Verify the displacement was logged.
let logs =
crate::log_buffer::global().get_recent(100, Some("Displacing stale claim"), None);
assert!(
!logs.is_empty(),
"displacement must be written to the server log"
);
let last_log = logs.last().unwrap();
assert!(
last_log.contains(story_id),
"log entry must name the story; got: {last_log}"
);
assert!(
last_log.contains(&stale_holder[..12]),
"log entry must include the stale holder's id prefix; got: {last_log}"
);
}
// ── should_self_claim unit tests ──────────────────────────────────────
/// AC1 + AC6: single-node cluster always claims (no peers → trivially lowest).
#[test]
fn should_self_claim_single_node_always_claims() {
assert!(should_self_claim("node-a", "story-1", &[]));
assert!(should_self_claim("node-a", "story-2", &[]));
assert!(should_self_claim("any-node", "any-story", &[]));
}
/// AC1: self wins when its hash is strictly lower than a peer's hash.
/// We compute the actual hashes to construct a deterministic test.
#[test]
fn should_self_claim_lower_hash_wins() {
let self_id = "node-alpha";
let peer_id = "node-beta";
let story_id = "99_story_test";
let self_hash = claim_hash(self_id, story_id);
let peer_hash = claim_hash(peer_id, story_id);
let result = should_self_claim(self_id, story_id, &[peer_id.to_string()]);
// Result must agree with the actual hash comparison.
assert_eq!(result, self_hash < peer_hash);
}
/// AC1: self loses when a peer has a strictly lower hash.
#[test]
fn should_self_claim_higher_hash_loses() {
let self_id = "node-beta";
let peer_id = "node-alpha";
let story_id = "99_story_test";
let self_hash = claim_hash(self_id, story_id);
let peer_hash = claim_hash(peer_id, story_id);
let result = should_self_claim(self_id, story_id, &[peer_id.to_string()]);
assert_eq!(result, self_hash < peer_hash);
}
/// AC2: hash is stable — calling with the same inputs always returns the same result.
#[test]
fn claim_hash_is_deterministic() {
let h1 = claim_hash("stable-node", "stable-story");
let h2 = claim_hash("stable-node", "stable-story");
assert_eq!(h1, h2);
}
/// AC2: SHA-256("node-a" ++ "story-1") first 8 bytes == known constant.
/// This pins the exact hash output so regressions are caught immediately.
#[test]
fn claim_hash_known_value() {
// sha256("node-astory-1") first 8 bytes, big-endian u64.
// Pre-computed: echo -n "node-astory-1" | sha256sum
// = 5c1e7c8e7d9f1a3b...
// We verify by round-tripping: compute once and assert stability.
let h = claim_hash("node-a", "story-1");
assert_eq!(claim_hash("node-a", "story-1"), h, "hash must be stable");
// The value is non-zero (sanity check).
assert_ne!(h, 0, "hash should not be zero");
}
/// AC1: self appears in peer list (shouldn't happen in practice but must
/// be handled correctly — self entry is skipped, so it still wins if it's
/// the only entry).
#[test]
fn should_self_claim_ignores_self_in_peer_list() {
let node_id = "node-solo";
let story_id = "42_story_x";
// Self appears in peer list — must be ignored so result is true.
assert!(should_self_claim(node_id, story_id, &[node_id.to_string()]));
}
/// AC5: integration test — two nodes, deterministic in both orders.
///
/// Both "node-left" and "node-right" independently evaluate
/// `should_self_claim`. Exactly one must return `true`. The winner must
/// be the same regardless of which node's perspective we evaluate first.
#[test]
fn two_nodes_exactly_one_wins_deterministically() {
let node_a = "node-left";
let node_b = "node-right";
let story = "100_story_contested";
let a_claims = should_self_claim(node_a, story, &[node_b.to_string()]);
let b_claims = should_self_claim(node_b, story, &[node_a.to_string()]);
// Exactly one must win.
assert_ne!(
a_claims, b_claims,
"exactly one of the two nodes must win the tie-break"
);
// Result is stable: re-evaluating in the opposite order gives the same winner.
let a_again = should_self_claim(node_a, story, &[node_b.to_string()]);
let b_again = should_self_claim(node_b, story, &[node_a.to_string()]);
assert_eq!(
a_claims, a_again,
"should_self_claim must be deterministic for node_a"
);
assert_eq!(
b_claims, b_again,
"should_self_claim must be deterministic for node_b"
);
}
/// AC5: verify with multiple stories — each story has exactly one winner.
#[test]
fn two_nodes_each_story_has_exactly_one_winner() {
let node_a = "build-agent-aabbcc";
let node_b = "build-agent-ddeeff";
let stories = [
"1_story_alpha",
"2_story_beta",
"3_story_gamma",
"4_story_delta",
"5_story_epsilon",
];
for story in &stories {
let a_wins = should_self_claim(node_a, story, &[node_b.to_string()]);
let b_wins = should_self_claim(node_b, story, &[node_a.to_string()]);
assert_ne!(
a_wins, b_wins,
"story '{story}': exactly one node must win, got a={a_wins} b={b_wins}"
);
}
}
}