fix(crdt_snapshot): per-thread SNAPSHOT_STATE in cfg(test) instead of shared static (bug 669)
Replaces the test-time GLOBAL_STATE_LOCK approach (which was just disguised single-threading) with proper test isolation: each test thread gets its own SnapshotState via a thread_local!. Pattern matches crdt_state::CRDT_STATE_TL — production keeps the global OnceLock; tests get a per-thread OnceLock that's accessed through a snapshot_state() helper. The unsafe `&*ptr` cast to 'static is safe because the thread_local lives as long as the spawning test thread. The race: latest_snapshot_available_after_compaction captured at_seq from a freshly-generated snapshot, then asserted it equalled SNAPSHOT_STATE's latest.at_seq. With shared SNAPSHOT_STATE, another test thread's apply_compaction could overwrite latest_snapshot between capture and read. Per-thread state eliminates the race at its source. ALL_OPS / VECTOR_CLOCK stay shared — the tests don't assert on absolute counts, only on (this-thread's at_seq) == (this-thread's latest.at_seq). 5 consecutive default-parallel `cargo test --bin huskies` runs all green at 2636/2636.
This commit is contained in:
+60
-17
@@ -122,20 +122,65 @@ struct SnapshotState {
|
||||
pending_at_seq: Option<u64>,
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
static SNAPSHOT_STATE: OnceLock<Mutex<SnapshotState>> = OnceLock::new();
|
||||
|
||||
/// Initialise snapshot state. Safe to call multiple times (OnceLock).
|
||||
#[cfg(test)]
|
||||
thread_local! {
|
||||
/// Per-thread snapshot state for test isolation. Each test thread gets its
|
||||
/// own SnapshotState so parallel tests do not leak into each other's
|
||||
/// snapshot/coordination history.
|
||||
static SNAPSHOT_STATE_TL: OnceLock<Mutex<SnapshotState>> = const { OnceLock::new() };
|
||||
}
|
||||
|
||||
/// Read access to the snapshot state.
|
||||
///
|
||||
/// In production this returns the global `SNAPSHOT_STATE`. In `cfg(test)`
|
||||
/// each thread sees its own thread-local state, so parallel tests do not
|
||||
/// share `SnapshotState`.
|
||||
fn snapshot_state() -> Option<&'static Mutex<SnapshotState>> {
|
||||
#[cfg(not(test))]
|
||||
{
|
||||
SNAPSHOT_STATE.get()
|
||||
}
|
||||
#[cfg(test)]
|
||||
{
|
||||
let ptr = SNAPSHOT_STATE_TL.with(|lock| lock as *const OnceLock<Mutex<SnapshotState>>);
|
||||
// SAFETY: the thread-local lives as long as the thread. We only need
|
||||
// 'static for the return type; consumers never outlive the spawning
|
||||
// test thread.
|
||||
unsafe { &*ptr }.get()
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialise snapshot state. Safe to call multiple times.
|
||||
///
|
||||
/// In production: idempotent via `OnceLock`. In `cfg(test)`: initialises the
|
||||
/// per-thread state on first call; subsequent calls on the same thread are
|
||||
/// no-ops.
|
||||
pub fn init() {
|
||||
let _ = SNAPSHOT_STATE.set(Mutex::new(SnapshotState {
|
||||
latest_snapshot: None,
|
||||
pending_acks: HashMap::new(),
|
||||
pending_at_seq: None,
|
||||
}));
|
||||
let value = || {
|
||||
Mutex::new(SnapshotState {
|
||||
latest_snapshot: None,
|
||||
pending_acks: HashMap::new(),
|
||||
pending_at_seq: None,
|
||||
})
|
||||
};
|
||||
#[cfg(not(test))]
|
||||
{
|
||||
let _ = SNAPSHOT_STATE.set(value());
|
||||
}
|
||||
#[cfg(test)]
|
||||
{
|
||||
SNAPSHOT_STATE_TL.with(|lock| {
|
||||
let _ = lock.set(value());
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the most recent completed snapshot, if any.
|
||||
pub fn latest_snapshot() -> Option<Snapshot> {
|
||||
SNAPSHOT_STATE.get()?.lock().ok()?.latest_snapshot.clone()
|
||||
snapshot_state()?.lock().ok()?.latest_snapshot.clone()
|
||||
}
|
||||
|
||||
// ── Leader selection ───────────────────────────────────────────────────
|
||||
@@ -256,7 +301,7 @@ pub fn begin_coordination(
|
||||
alive_peer_node_ids: &[String],
|
||||
self_node_id: &str,
|
||||
) -> Option<Snapshot> {
|
||||
let state = SNAPSHOT_STATE.get()?;
|
||||
let state = snapshot_state()?;
|
||||
let mut s = state.lock().ok()?;
|
||||
|
||||
let mut pending = HashMap::new();
|
||||
@@ -276,7 +321,7 @@ pub fn begin_coordination(
|
||||
///
|
||||
/// Returns `true` if quorum has been reached (all alive peers have acked).
|
||||
pub fn record_ack(node_id: &str, at_seq: u64) -> bool {
|
||||
let Some(state) = SNAPSHOT_STATE.get() else {
|
||||
let Some(state) = snapshot_state() else {
|
||||
return false;
|
||||
};
|
||||
let Ok(mut s) = state.lock() else {
|
||||
@@ -300,7 +345,7 @@ pub fn record_ack(node_id: &str, at_seq: u64) -> bool {
|
||||
///
|
||||
/// Called when a peer goes offline mid-coordination or quorum times out.
|
||||
pub fn abort_coordination() {
|
||||
if let Some(state) = SNAPSHOT_STATE.get()
|
||||
if let Some(state) = snapshot_state()
|
||||
&& let Ok(mut s) = state.lock()
|
||||
{
|
||||
s.pending_at_seq = None;
|
||||
@@ -310,8 +355,7 @@ pub fn abort_coordination() {
|
||||
|
||||
/// Check whether there is a pending snapshot coordination in progress.
|
||||
pub fn has_pending_coordination() -> bool {
|
||||
SNAPSHOT_STATE
|
||||
.get()
|
||||
snapshot_state()
|
||||
.and_then(|s| s.lock().ok())
|
||||
.map(|s| s.pending_at_seq.is_some())
|
||||
.unwrap_or(false)
|
||||
@@ -319,8 +363,7 @@ pub fn has_pending_coordination() -> bool {
|
||||
|
||||
/// Return the list of peers that have NOT yet acked the pending snapshot.
|
||||
pub fn unacked_peers() -> Vec<String> {
|
||||
SNAPSHOT_STATE
|
||||
.get()
|
||||
snapshot_state()
|
||||
.and_then(|s| s.lock().ok())
|
||||
.map(|s| {
|
||||
s.pending_acks
|
||||
@@ -346,7 +389,7 @@ pub fn unacked_peers() -> Vec<String> {
|
||||
/// Returns `true` if compaction was applied successfully.
|
||||
pub fn apply_compaction(snapshot: Snapshot) -> bool {
|
||||
// Store the snapshot as the latest for future new-node onboarding.
|
||||
if let Some(state) = SNAPSHOT_STATE.get()
|
||||
if let Some(state) = snapshot_state()
|
||||
&& let Ok(mut s) = state.lock()
|
||||
{
|
||||
s.latest_snapshot = Some(snapshot.clone());
|
||||
@@ -724,7 +767,7 @@ mod tests {
|
||||
let snapshot = generate_snapshot().unwrap();
|
||||
|
||||
// Store as latest.
|
||||
if let Some(state) = SNAPSHOT_STATE.get()
|
||||
if let Some(state) = snapshot_state()
|
||||
&& let Ok(mut s) = state.lock()
|
||||
{
|
||||
s.latest_snapshot = Some(snapshot.clone());
|
||||
@@ -957,7 +1000,7 @@ mod tests {
|
||||
// No compaction was applied — state is clean.
|
||||
// The latest_snapshot should NOT be set.
|
||||
// (The snapshot was never committed.)
|
||||
let state = SNAPSHOT_STATE.get().unwrap().lock().unwrap();
|
||||
let state = snapshot_state().unwrap().lock().unwrap();
|
||||
// pending_at_seq is cleared.
|
||||
assert!(state.pending_at_seq.is_none());
|
||||
assert!(state.pending_acks.is_empty());
|
||||
|
||||
Reference in New Issue
Block a user