0572af2193
The progress-aware no-progress cap (3 consecutive byte-identical diffs) doesn't catch the degenerate pattern where the agent keeps making DIFFERENT file edits each session but never commits — every respawn resets the no-progress counter, infinite loop, budget burns. Adds ContentKey::CommitRecoveryTotalAttempts: an absolute counter that increments on every commit-recovery respawn regardless of progress. TOTAL_ATTEMPTS_CAP = 8; when hit, block with reason 'agent flapped — N respawns without ever committing'. Two caps now bound the recovery loop: - NO_PROGRESS_CAP (3): catches stuck-agent (same diff repeatedly) - TOTAL_ATTEMPTS_CAP (8): catches flapping-agent (different diffs, no commits) Easy to tune the constant lower if we see runaway in practice. All 2936 tests pass.
230 lines
9.2 KiB
Rust
230 lines
9.2 KiB
Rust
//! In-memory content store — fast synchronous reads for story markdown.
|
|
//!
|
|
//! Backed by a `HashMap<story_id, markdown>` wrapped in a `Mutex`. In
|
|
//! non-test builds the store lives in a process-global `OnceLock`; in tests
|
|
//! each thread gets its own isolated copy via a `thread_local!` to avoid
|
|
//! cross-test pollution.
|
|
use std::collections::HashMap;
|
|
use std::sync::{Mutex, OnceLock};
|
|
|
|
/// Typed key for the in-memory content store.
|
|
///
|
|
/// Each variant maps to a distinct raw key namespace so that content written
|
|
/// under one variant is never visible under another — no raw `format!()`
|
|
/// key construction is needed at call sites outside `db/`.
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
pub enum ContentKey<'a> {
|
|
/// Main markdown body of a work item (story, bug, spike, refactor, epic).
|
|
Story(&'a str),
|
|
/// Gate failure output from the last failed agent run.
|
|
GateOutput(&'a str),
|
|
/// Consecutive abort-respawn counter.
|
|
AbortRespawnCount(&'a str),
|
|
/// Mergemaster re-spawn counter.
|
|
MergeMasterSpawnCount(&'a str),
|
|
/// Evidence that `run_tests` passed during an agent session.
|
|
RunTestsOk(&'a str),
|
|
/// Flag indicating a commit-recovery respawn is in progress. Stored as
|
|
/// a decimal string counting consecutive respawns that made NO file-edit
|
|
/// progress (worktree diff byte-identical to the previous attempt). Reset
|
|
/// to "1" whenever a respawn produces a different diff fingerprint.
|
|
CommitRecoveryPending(&'a str),
|
|
/// Worktree diff byte-length captured at the last commit-recovery respawn
|
|
/// trigger. Used to detect whether the agent made any file-edit progress
|
|
/// between consecutive session-boundary-clean exits. Same byte length on
|
|
/// two consecutive attempts → no progress → increment CommitRecoveryPending.
|
|
CommitRecoveryDiffFingerprint(&'a str),
|
|
/// Absolute count of commit-recovery respawns issued for a story since the
|
|
/// last successful commit. Increments every respawn regardless of whether
|
|
/// the diff fingerprint changed. Outer cap that catches the "agent flaps
|
|
/// between different file edits each session but never commits" pattern
|
|
/// where the progress-aware counter would never trigger.
|
|
CommitRecoveryTotalAttempts(&'a str),
|
|
/// Flag indicating a merge gate fixup coder session is in progress.
|
|
///
|
|
/// Set when the merge gate fails with a self-evident-fix class of failure
|
|
/// (fmt drift, clippy warning, missing doc) so the pipeline advance handler
|
|
/// can route the fixup coder's completion directly back to merge instead of
|
|
/// through the normal QA path (story 981).
|
|
MergeFixupPending(&'a str),
|
|
/// JSON-serialised `MergeFailureKind` written alongside `GateOutput` so the
|
|
/// CRDT projection layer can reconstruct the typed kind on server restart
|
|
/// without substring-scanning the gate output string (story 986).
|
|
MergeFailureKind(&'a str),
|
|
/// Flag set by the merge runner when a squash merge succeeds with
|
|
/// `story_archived: true`. Written before the CRDT job status is set to
|
|
/// "completed" so the mergemaster agent exit handler in `spawn.rs` can
|
|
/// distinguish a clean success from a transient crash (bug 1008).
|
|
MergeSuccess(&'a str),
|
|
}
|
|
|
|
impl<'a> ContentKey<'a> {
|
|
/// Lower this typed key to the underlying storage string used by the
|
|
/// CRDT content store (`{story_id}` for the base story, `{story_id}:<kind>`
|
|
/// for per-purpose sub-keys). Internal — callers should use the typed
|
|
/// `read_content` / `write_content` wrappers instead of touching strings.
|
|
pub(super) fn as_raw_key(&self) -> String {
|
|
match self {
|
|
ContentKey::Story(id) => id.to_string(),
|
|
ContentKey::GateOutput(id) => format!("{id}:gate_output"),
|
|
ContentKey::AbortRespawnCount(id) => format!("{id}:abort_respawn_count"),
|
|
ContentKey::MergeMasterSpawnCount(id) => format!("{id}:mergemaster_spawn_count"),
|
|
ContentKey::RunTestsOk(id) => format!("{id}:run_tests_ok"),
|
|
ContentKey::CommitRecoveryPending(id) => format!("{id}:commit_recovery_pending"),
|
|
ContentKey::CommitRecoveryDiffFingerprint(id) => {
|
|
format!("{id}:commit_recovery_diff_fingerprint")
|
|
}
|
|
ContentKey::CommitRecoveryTotalAttempts(id) => {
|
|
format!("{id}:commit_recovery_total_attempts")
|
|
}
|
|
ContentKey::MergeFixupPending(id) => format!("{id}:merge_fixup_pending"),
|
|
ContentKey::MergeFailureKind(id) => format!("{id}:merge_failure_kind"),
|
|
ContentKey::MergeSuccess(id) => format!("{id}:merge_success"),
|
|
}
|
|
}
|
|
}
|
|
|
|
static CONTENT_STORE: OnceLock<Mutex<HashMap<String, String>>> = OnceLock::new();
|
|
|
|
#[cfg(test)]
|
|
thread_local! {
|
|
/// Per-thread isolated content store used in tests to prevent cross-test pollution.
|
|
pub(super) static CONTENT_STORE_TL: OnceLock<Mutex<HashMap<String, String>>> = const { OnceLock::new() };
|
|
}
|
|
|
|
#[cfg(not(test))]
|
|
/// Return a reference to the process-global content store, or `None` if not yet initialised.
|
|
pub(super) fn get_content_store() -> Option<&'static Mutex<HashMap<String, String>>> {
|
|
CONTENT_STORE.get()
|
|
}
|
|
|
|
#[cfg(test)]
|
|
/// Return the thread-local content store for tests, falling back to the global store.
|
|
pub(super) fn get_content_store() -> Option<&'static Mutex<HashMap<String, String>>> {
|
|
let tl = CONTENT_STORE_TL.with(|lock| {
|
|
if lock.get().is_some() {
|
|
Some(lock as *const OnceLock<Mutex<HashMap<String, String>>>)
|
|
} else {
|
|
None
|
|
}
|
|
});
|
|
if let Some(ptr) = tl {
|
|
// SAFETY: The thread-local lives as long as the thread, which outlives
|
|
// any test using it. We only need 'static for the return type.
|
|
let lock = unsafe { &*ptr };
|
|
lock.get()
|
|
} else {
|
|
CONTENT_STORE.get()
|
|
}
|
|
}
|
|
|
|
/// Read content from the in-memory store by typed key.
|
|
pub fn read_content(key: ContentKey<'_>) -> Option<String> {
|
|
let store = get_content_store()?;
|
|
let map = store.lock().ok()?;
|
|
map.get(&key.as_raw_key()).cloned()
|
|
}
|
|
|
|
/// Write (or overwrite) content in the in-memory store by typed key.
|
|
pub fn write_content(key: ContentKey<'_>, content: &str) {
|
|
if let Some(store) = get_content_store()
|
|
&& let Ok(mut map) = store.lock()
|
|
{
|
|
map.insert(key.as_raw_key(), content.to_string());
|
|
}
|
|
}
|
|
|
|
/// Remove an entry from the in-memory store by typed key.
|
|
pub fn delete_content(key: ContentKey<'_>) {
|
|
if let Some(store) = get_content_store()
|
|
&& let Ok(mut map) = store.lock()
|
|
{
|
|
map.remove(&key.as_raw_key());
|
|
}
|
|
}
|
|
|
|
/// Ensure the in-memory content store is initialised.
|
|
///
|
|
/// Safe to call multiple times — the `OnceLock` is set at most once.
|
|
pub fn ensure_content_store() {
|
|
#[cfg(not(test))]
|
|
{
|
|
let _ = CONTENT_STORE.set(Mutex::new(HashMap::new()));
|
|
}
|
|
|
|
#[cfg(test)]
|
|
{
|
|
CONTENT_STORE_TL.with(|lock| {
|
|
if lock.get().is_none() {
|
|
let _ = lock.set(Mutex::new(HashMap::new()));
|
|
}
|
|
});
|
|
crate::crdt_state::init_for_test();
|
|
}
|
|
}
|
|
|
|
/// Return all story IDs present in the content store.
|
|
pub fn all_content_ids() -> Vec<String> {
|
|
match get_content_store() {
|
|
Some(store) => match store.lock() {
|
|
Ok(map) => map.keys().cloned().collect(),
|
|
Err(_) => Vec::new(),
|
|
},
|
|
None => Vec::new(),
|
|
}
|
|
}
|
|
|
|
/// Initialise the content store from a pre-loaded map (used during DB startup).
|
|
pub(super) fn init_content_store(map: HashMap<String, String>) {
|
|
let _ = CONTENT_STORE.set(Mutex::new(map));
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
|
|
/// AC 2 regression: writing under `ContentKey::Story` is not visible under
|
|
/// `ContentKey::GateOutput` (and vice versa). The typed key namespace, not
|
|
/// runtime substring matching, enforces the separation.
|
|
#[test]
|
|
fn wrong_key_variant_is_isolated() {
|
|
ensure_content_store();
|
|
let id = "9961_regression_key_isolation";
|
|
|
|
write_content(ContentKey::Story(id), "story body");
|
|
|
|
// A different variant for the same base id must not surface the story body.
|
|
assert!(
|
|
read_content(ContentKey::GateOutput(id)).is_none(),
|
|
"GateOutput key must not read Story content"
|
|
);
|
|
assert!(
|
|
read_content(ContentKey::RunTestsOk(id)).is_none(),
|
|
"RunTestsOk key must not read Story content"
|
|
);
|
|
|
|
// The Story variant itself must still return the content.
|
|
assert_eq!(
|
|
read_content(ContentKey::Story(id)).as_deref(),
|
|
Some("story body")
|
|
);
|
|
|
|
// Write under a second variant; reading under Story must still return
|
|
// the original body, not the gate output.
|
|
write_content(ContentKey::GateOutput(id), "gate failure text");
|
|
assert_eq!(
|
|
read_content(ContentKey::Story(id)).as_deref(),
|
|
Some("story body"),
|
|
"Story key must not be polluted by GateOutput write"
|
|
);
|
|
assert_eq!(
|
|
read_content(ContentKey::GateOutput(id)).as_deref(),
|
|
Some("gate failure text")
|
|
);
|
|
|
|
// Cleanup.
|
|
delete_content(ContentKey::Story(id));
|
|
delete_content(ContentKey::GateOutput(id));
|
|
}
|
|
}
|