fix(914): thread-local ALL_OPS/VECTOR_CLOCK in cfg(test) so compaction tests don't race

Root cause was not the persist channel (the test-mode channel is unbounded
and its receiver is leaked, so sends never fail). It was that `ALL_OPS` and
`VECTOR_CLOCK` were process-wide `OnceLock` globals while `CRDT_STATE` was
already thread-local — so one test thread's `apply_compaction` would prune
another test thread's freshly-written ops out of the shared journal, and
the subsequent `all_ops_json()` read in `compaction_reduces_ops` would
return fewer than the 5 it had just written.

Mirror the pattern already used for `CRDT_STATE` and `SnapshotState`: in
`cfg(test)` use thread-local `OnceLock<Mutex<...>>`s for the op journal and
vector clock, accessed via new `all_ops_lock()` / `vector_clock_lock()`
helpers. Production code path is unchanged (still the global statics set
during `init()`).

Touches ops/read/snapshot call sites to go through the helpers. Note in
passing that this overlaps backlog story 518; that story is about the
production-side persist path, this is the cfg(test)-only journal-isolation
slice.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Timmy
2026-05-12 16:09:38 +01:00
parent 379ff16d3e
commit 8421104645
6 changed files with 74 additions and 18 deletions
+10 -3
View File
@@ -35,8 +35,8 @@ pub(super) use indices::{
rebuild_index, rebuild_merge_job_index, rebuild_node_index, rebuild_test_job_index,
rebuild_token_index,
};
pub(crate) use statics::{ALL_OPS, VECTOR_CLOCK};
pub(super) use statics::{SYNC_TX, track_op};
pub(crate) use statics::{all_ops_lock, vector_clock_lock};
// ── CrdtState struct ─────────────────────────────────────────────────
@@ -152,6 +152,13 @@ pub fn init_for_test() {
});
let _ = statics::CRDT_EVENT_TX.get_or_init(|| broadcast::channel::<CrdtEvent>(256).0);
let _ = statics::SYNC_TX.get_or_init(|| broadcast::channel::<SignedOp>(1024).0);
let _ = statics::ALL_OPS.get_or_init(|| Mutex::new(Vec::new()));
let _ = statics::VECTOR_CLOCK.get_or_init(|| Mutex::new(VectorClock::new()));
// Per-thread op journal + vector clock — keeps parallel tests' writes
// from corrupting each other's view of ALL_OPS (notably, one thread's
// `apply_compaction` could otherwise prune another thread's ops).
statics::ALL_OPS_TL.with(|lock| {
let _ = lock.set(Mutex::new(Vec::new()));
});
statics::VECTOR_CLOCK_TL.with(|lock| {
let _ = lock.set(Mutex::new(VectorClock::new()));
});
}
+53 -2
View File
@@ -4,6 +4,11 @@
//! channel ([`CRDT_EVENT_TX`]), and the in-memory op journal
//! ([`ALL_OPS`] / [`VECTOR_CLOCK`]) that tracks every applied op for
//! delta-sync.
//!
//! In `cfg(test)`, the op journal and vector clock are stored in
//! thread-local `OnceLock`s (mirroring [`super::CRDT_STATE_TL`]) so parallel
//! tests do not share `ALL_OPS` — preventing one test's `apply_compaction`
//! from pruning another test's freshly-written ops.
use std::sync::{Mutex, OnceLock};
@@ -32,16 +37,62 @@ pub(crate) static ALL_OPS: OnceLock<Mutex<Vec<String>>> = OnceLock::new();
/// re-parsing all ops when a peer requests `our_vector_clock()`.
pub(crate) static VECTOR_CLOCK: OnceLock<Mutex<VectorClock>> = OnceLock::new();
#[cfg(test)]
thread_local! {
/// Per-thread op journal that stands in for the global `ALL_OPS` in test
/// builds.
///
/// Every thread owns a separate instance, so parallel tests cannot prune
/// each other's ops via `apply_compaction`. The cell is const-initialised
/// empty and is filled in by `init_for_test`; on a thread that has not run
/// `init_for_test`, `get()` yields `None`.
pub(in crate::crdt_state) static ALL_OPS_TL: OnceLock<Mutex<Vec<String>>> = const { OnceLock::new() };
/// Per-thread vector clock that stands in for the global `VECTOR_CLOCK` in
/// test builds.
///
/// Same lifecycle as [`ALL_OPS_TL`]: const-initialised empty, filled in by
/// `init_for_test` on the thread that calls it.
pub(in crate::crdt_state) static VECTOR_CLOCK_TL: OnceLock<Mutex<VectorClock>> = const { OnceLock::new() };
}
/// Look up the mutex that guards the op journal.
///
/// Yields `None` while the journal has not been initialised. Non-test builds
/// consult the process-wide `ALL_OPS`; `cfg(test)` builds consult the calling
/// thread's `ALL_OPS_TL`, so parallel tests each see their own journal.
pub(crate) fn all_ops_lock() -> Option<&'static Mutex<Vec<String>>> {
    #[cfg(test)]
    {
        let slot = ALL_OPS_TL.with(|cell| std::ptr::from_ref::<OnceLock<Mutex<Vec<String>>>>(cell));
        // SAFETY: `slot` points at the calling thread's thread-local cell,
        // which stays alive until that thread exits, so dereferencing it here
        // is valid. The `'static` lifetime overstates that and exists only to
        // match the production signature.
        // NOTE(review): soundness relies on callers never keeping (or handing
        // to another thread) the returned reference beyond the owning
        // thread's lifetime — nothing in the type system enforces this.
        let cell = unsafe { &*slot };
        cell.get()
    }
    #[cfg(not(test))]
    {
        ALL_OPS.get()
    }
}
/// Look up the mutex that guards the vector clock.
///
/// Yields `None` while the clock has not been initialised. Non-test builds
/// consult the process-wide `VECTOR_CLOCK`; `cfg(test)` builds consult the
/// calling thread's `VECTOR_CLOCK_TL`.
pub(crate) fn vector_clock_lock() -> Option<&'static Mutex<VectorClock>> {
    #[cfg(test)]
    {
        let slot = VECTOR_CLOCK_TL.with(|cell| std::ptr::from_ref::<OnceLock<Mutex<VectorClock>>>(cell));
        // SAFETY: `slot` points at the calling thread's thread-local cell,
        // which stays alive until that thread exits, so dereferencing it here
        // is valid. The `'static` lifetime overstates that and exists only to
        // match the production signature.
        // NOTE(review): soundness relies on callers never keeping (or handing
        // to another thread) the returned reference beyond the owning
        // thread's lifetime — nothing in the type system enforces this.
        let cell = unsafe { &*slot };
        cell.get()
    }
    #[cfg(not(test))]
    {
        VECTOR_CLOCK.get()
    }
}
/// Append an op's JSON to `ALL_OPS` and bump the author's count in `VECTOR_CLOCK`.
///
/// Centralises the bookkeeping that must stay in sync between the two statics.
pub(in crate::crdt_state) fn track_op(signed: &SignedOp, json: String) {
if let Some(all) = ALL_OPS.get()
if let Some(all) = all_ops_lock()
&& let Ok(mut v) = all.lock()
{
v.push(json);
}
if let Some(vc) = VECTOR_CLOCK.get()
if let Some(vc) = vector_clock_lock()
&& let Ok(mut clock) = vc.lock()
{
let author_hex = hex::encode(&signed.author());