From 8bdaabd06ccff32ee362669dd90cd9b439b591bc Mon Sep 17 00:00:00 2001 From: dave Date: Sun, 26 Apr 2026 20:36:40 +0000 Subject: [PATCH] refactor: split crdt_sync.rs into auth/wire/server/dispatch/client modules The 3672-line crdt_sync.rs is split into a sub-module directory with co-located tests per Rust convention: - auth.rs: trusted-keys + bearer-token validation (230 lines) - wire.rs: ChallengeMessage / AuthMessage / SyncMessage types (141 lines) - server.rs: WebSocket server handler (1680 lines) - dispatch.rs: incoming-message dispatch + bulk/clock/op handling (1028 lines) - client.rs: rendezvous client + reconnect/backoff (464 lines) - mod.rs: doc, cross-cutting constants, re-exports (75 lines) No behaviour change. All 65 crdt_sync tests pass; full suite green (2635 tests with --test-threads=1). --- server/src/crdt_sync.rs | 3672 ------------------------------ server/src/crdt_sync/auth.rs | 230 ++ server/src/crdt_sync/client.rs | 464 ++++ server/src/crdt_sync/dispatch.rs | 1028 +++++++++ server/src/crdt_sync/mod.rs | 75 + server/src/crdt_sync/server.rs | 1680 ++++++++++++++ server/src/crdt_sync/wire.rs | 141 ++ 7 files changed, 3618 insertions(+), 3672 deletions(-) delete mode 100644 server/src/crdt_sync.rs create mode 100644 server/src/crdt_sync/auth.rs create mode 100644 server/src/crdt_sync/client.rs create mode 100644 server/src/crdt_sync/dispatch.rs create mode 100644 server/src/crdt_sync/mod.rs create mode 100644 server/src/crdt_sync/server.rs create mode 100644 server/src/crdt_sync/wire.rs diff --git a/server/src/crdt_sync.rs b/server/src/crdt_sync.rs deleted file mode 100644 index baee5c97..00000000 --- a/server/src/crdt_sync.rs +++ /dev/null @@ -1,3672 +0,0 @@ -//! CRDT sync — WebSocket-based replication of pipeline state between huskies nodes. -/// WebSocket-based CRDT sync layer for replicating pipeline state between -/// huskies nodes. 
-/// -/// # Protocol -/// -/// ## Version negotiation -/// -/// After the auth handshake, both sides send their first sync message: -/// -/// - **v2 peers** send a `clock` frame: `{"type":"clock","clock":{ : , ... }}` -/// containing a vector clock that maps each author's hex Ed25519 pubkey to the -/// count of ops received from that author. Upon receiving the peer's clock, -/// each side computes the delta via [`crdt_state::ops_since`] and sends only -/// the missing ops as a `bulk` frame. -/// -/// - **v1 (legacy) peers** send a `bulk` frame directly (full op dump). -/// A v2 peer receiving a `bulk` first (instead of a `clock`) falls back to -/// the full-dump path: applies the incoming bulk and responds with its own -/// full bulk. This preserves backward compatibility — no code change needed -/// on the v1 side. -/// -/// ## Text frames -/// A JSON object with a `"type"` field: -/// - `{"type":"clock","clock":{...}}` — Vector clock (v2 protocol). -/// - `{"type":"bulk","ops":[...]}` — Ops dump (full or delta). -/// - `{"type":"ready"}` — Signals that the bulk-delta phase is complete and the -/// sender is ready for real-time op streaming. Locally-generated ops are -/// buffered until the peer's `ready` is received, then flushed in order. -/// -/// ## Binary frames (real-time op broadcast) -/// Individual `SignedOp`s encoded via [`crate::crdt_wire`] (versioned JSON -/// envelope: `{"v":1,"op":{...}}`). Each locally-applied op is immediately -/// broadcast as a binary frame to all connected peers. -/// -/// Both the server endpoint and the rendezvous client use the same protocol, -/// making the connection fully symmetric. -/// -/// ## Backpressure -/// Each connected peer has its own [`tokio::sync::broadcast`] receiver. If a -/// slow peer allows the channel to fill (indicated by a `Lagged` error), the -/// connection is dropped with a warning log. The peer can reconnect and -/// receive a fresh bulk state dump to catch up. 
-use bft_json_crdt::json_crdt::SignedOp; -use futures::{SinkExt, StreamExt}; -use poem::handler; -use poem::http::StatusCode; -use poem::web::Data; -use poem::web::Query; -use poem::web::websocket::{Message as WsMessage, WebSocket}; -use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::sync::{Arc, OnceLock}; - -use crate::crdt_snapshot; -use crate::crdt_state; -use crate::crdt_wire; -use crate::http::context::AppContext; -use crate::node_identity; -use crate::slog; -use crate::slog_error; -use crate::slog_warn; - -// ── Auth configuration ────────────────────────────────────────────── - -/// Default timeout for the auth handshake (seconds). -const AUTH_TIMEOUT_SECS: u64 = 10; - -// ── Keepalive configuration ───────────────────────────────────────── - -/// Interval (seconds) between WebSocket Ping frames sent by each side. -pub const PING_INTERVAL_SECS: u64 = 30; - -/// Seconds without a Pong response before the connection is dropped. -pub const PONG_TIMEOUT_SECS: u64 = 60; - -/// Trusted public keys loaded once at startup. -static TRUSTED_KEYS: OnceLock> = OnceLock::new(); - -/// Initialise the trusted-key allow-list for connect-time mutual auth. -/// -/// Must be called once at startup before any WebSocket connections are -/// accepted. Subsequent calls are no-ops (OnceLock). -pub fn init_trusted_keys(keys: Vec) { - let _ = TRUSTED_KEYS.set(keys); -} - -/// Return a reference to the trusted-key allow-list. -fn trusted_keys() -> &'static [String] { - TRUSTED_KEYS.get().map(|v| v.as_slice()).unwrap_or(&[]) -} - -// ── Bearer-token auth ─────────────────────────────────────────────── - -/// Time-to-live for CRDT bearer tokens in seconds (30 days). -const TOKEN_TTL_SECS: f64 = 30.0 * 24.0 * 3600.0; - -/// Whether a bearer token is required for `/crdt-sync` connections. -/// `None` (uninitialised) → open access (backward compatible). 
-static REQUIRE_TOKEN: OnceLock = OnceLock::new(); - -/// Valid bearer tokens — maps token string to its expiry unix timestamp. -static CRDT_TOKENS: OnceLock>> = OnceLock::new(); - -/// Initialise bearer-token auth for CRDT-sync connections. -/// -/// Must be called once at startup before any WebSocket connections are accepted. -/// When `require` is `true`, clients must supply a valid `?token=` query -/// parameter on the upgrade request or receive HTTP 401. When `require` is -/// `false` (default) a token is optional — connections without one are -/// accepted, but a supplied token is still validated. -pub fn init_token_auth(require: bool, tokens: Vec) { - let _ = REQUIRE_TOKEN.set(require); - let store = CRDT_TOKENS.get_or_init(|| std::sync::RwLock::new(HashMap::new())); - if let Ok(mut map) = store.write() { - let now = chrono::Utc::now().timestamp() as f64; - for token in tokens { - map.insert(token, now + TOKEN_TTL_SECS); - } - } -} - -/// Add a bearer token to the CRDT-sync token store. -/// -/// The token expires after [`TOKEN_TTL_SECS`] seconds. Returns the expiry -/// unix timestamp so callers can surface it in admin tooling. -pub fn add_join_token(token: String) -> f64 { - let store = CRDT_TOKENS.get_or_init(|| std::sync::RwLock::new(HashMap::new())); - let now = chrono::Utc::now().timestamp() as f64; - let expires_at = now + TOKEN_TTL_SECS; - if let Ok(mut map) = store.write() { - map.insert(token, expires_at); - } - expires_at -} - -/// Validate a bearer token against the CRDT-sync token store. -/// -/// Returns `true` if the token exists in the store and has not expired. 
-fn validate_join_token(token: &str) -> bool { - let Some(store) = CRDT_TOKENS.get() else { - return false; - }; - let now = chrono::Utc::now().timestamp() as f64; - store - .read() - .ok() - .and_then(|map| map.get(token).copied()) - .is_some_and(|expires_at| expires_at > now) -} - -// ── Wire protocol types ───────────────────────────────────────────── - -/// Auth handshake: challenge sent by the listener to the connector. -#[derive(Serialize, Deserialize, Debug)] -struct ChallengeMessage { - r#type: String, - nonce: String, -} - -/// Auth handshake: auth reply sent by the connector to the listener. -#[derive(Serialize, Deserialize, Debug)] -struct AuthMessage { - r#type: String, - pubkey_hex: String, - signature_hex: String, -} - -#[derive(Serialize, Deserialize)] -#[serde(tag = "type", rename_all = "snake_case")] -pub(crate) enum SyncMessage { - /// Bulk state dump sent on connect (v1) or delta ops after clock exchange (v2). - Bulk { ops: Vec }, - /// A single new op. - Op { op: String }, - /// Vector clock exchanged on connect (v2 protocol). - /// - /// Each entry maps a node's hex-encoded Ed25519 public key to the count of - /// ops received from that node. The receiving side computes the delta via - /// [`crdt_state::ops_since`] and sends only the missing ops. - Clock { - clock: std::collections::HashMap, - }, - /// Signals that the bulk-delta phase is complete; the sender is ready for - /// real-time op streaming. Locally-generated ops are buffered until the - /// peer's `Ready` is received, then flushed in-order. - Ready, -} - -/// Crate-visible re-export of `SyncMessage` for backwards-compatibility testing. -/// -/// Used by `crdt_snapshot` tests to verify that snapshot messages are NOT -/// parseable as legacy `SyncMessage` variants — confirming that old peers -/// will gracefully reject them. 
-#[cfg(test)] -pub(crate) type SyncMessagePublic = SyncMessage; - -// ── Server-side WebSocket handler ─────────────────────────────────── - -/// Query parameters accepted on the `/crdt-sync` WebSocket upgrade request. -#[derive(Deserialize)] -struct SyncQueryParams { - /// Optional bearer token. Required when the server is in token-required mode. - token: Option, -} - -/// WebSocket handler for CRDT peer synchronisation. -/// -/// Accepts an optional `?token=` query parameter. When the -/// server is configured with `crdt_require_token = true`, a valid token must -/// be supplied or the upgrade is rejected with HTTP 401. When the server is -/// in open-access mode (the default), a token is optional but still validated -/// if present. -#[handler] -pub async fn crdt_sync_handler( - ws: WebSocket, - _ctx: Data<&Arc>, - remote_addr: &poem::web::RemoteAddr, - Query(params): Query, -) -> poem::Response { - // ── Bearer-token check (pre-upgrade) ──────────────────────────── - let require_token = REQUIRE_TOKEN.get().copied().unwrap_or(false); - match ¶ms.token { - Some(t) => { - if !validate_join_token(t) { - slog!("[crdt-sync] Rejected connection: invalid or expired token"); - return poem::Response::builder() - .status(StatusCode::UNAUTHORIZED) - .body("invalid or expired token"); - } - } - None if require_token => { - slog!("[crdt-sync] Rejected connection: token required but not provided"); - return poem::Response::builder() - .status(StatusCode::UNAUTHORIZED) - .body("token required"); - } - None => {} - } - - // ── WebSocket upgrade ──────────────────────────────────────────── - use poem::IntoResponse as _; - let peer_addr = remote_addr.to_string(); - ws.on_upgrade(move |socket| async move { - let (mut sink, mut stream) = socket.split(); - - slog!("[crdt-sync] Peer connected, starting auth handshake"); - - // ── Step 1: Send challenge to the connecting peer ───────── - let challenge = node_identity::generate_challenge(); - let challenge_msg = ChallengeMessage { - 
r#type: "challenge".to_string(), - nonce: challenge.clone(), - }; - let challenge_json = match serde_json::to_string(&challenge_msg) { - Ok(j) => j, - Err(_) => return, - }; - if sink.send(WsMessage::Text(challenge_json)).await.is_err() { - return; - } - - // ── Step 2: Await auth reply within timeout ─────────────── - let auth_result = tokio::time::timeout( - std::time::Duration::from_secs(AUTH_TIMEOUT_SECS), - stream.next(), - ) - .await; - - let auth_text = match auth_result { - Ok(Some(Ok(WsMessage::Text(text)))) => text, - Ok(_) | Err(_) => { - // Timeout or connection closed before auth reply. - slog!("[crdt-sync] Auth timeout or connection lost during handshake"); - let _ = sink - .send(WsMessage::Close(Some(( - poem::web::websocket::CloseCode::from(4001), - "auth_timeout".to_string(), - )))) - .await; - let _ = sink.close().await; - return; - } - }; - - // ── Step 3: Verify auth reply ───────────────────────────── - let auth_msg: AuthMessage = match serde_json::from_str(&auth_text) { - Ok(m) => m, - Err(_) => { - slog!("[crdt-sync] Invalid auth message from peer"); - close_with_auth_failed(&mut sink).await; - return; - } - }; - - // Verify signature AND check allow-list. - let sig_valid = - node_identity::verify_challenge(&auth_msg.pubkey_hex, &challenge, &auth_msg.signature_hex); - let key_trusted = trusted_keys().iter().any(|k| k == &auth_msg.pubkey_hex); - - if !sig_valid || !key_trusted { - slog!("[crdt-sync] Auth failed for peer (sig_valid={sig_valid}, key_trusted={key_trusted})"); - close_with_auth_failed(&mut sink).await; - return; - } - - slog!( - "[crdt-sync] Peer authenticated: {:.12}…", - &auth_msg.pubkey_hex - ); - - // ── Auth passed — proceed with CRDT sync ────────────────── - - // v2 protocol: send our vector clock so the peer can compute the delta. 
- let our_clock = crdt_state::our_vector_clock().unwrap_or_default(); - let clock_msg = SyncMessage::Clock { clock: our_clock }; - if let Ok(json) = serde_json::to_string(&clock_msg) - && sink.send(WsMessage::Text(json)).await.is_err() - { - return; - } - - // Wait for the peer's first sync message to determine protocol version. - let first_msg = tokio::time::timeout( - std::time::Duration::from_secs(AUTH_TIMEOUT_SECS), - wait_for_sync_text(&mut stream, &mut sink), - ) - .await; - - match first_msg { - Ok(Some(SyncMessage::Clock { clock: peer_clock })) => { - // v2 peer — if we have a snapshot and the peer has an empty - // clock (new node), send the snapshot first for onboarding. - if peer_clock.is_empty() - && let Some(snapshot) = crdt_snapshot::latest_snapshot() - { - let snap_msg = crdt_snapshot::SnapshotMessage::Snapshot(snapshot); - if let Ok(json) = serde_json::to_string(&snap_msg) { - if sink.send(WsMessage::Text(json)).await.is_err() { - return; - } - slog!("[crdt-sync] Sent snapshot to new node for onboarding"); - } - } - - // Send only the ops the peer is missing. - let delta = crdt_state::ops_since(&peer_clock).unwrap_or_default(); - slog!( - "[crdt-sync] v2 delta sync: sending {} ops (peer missing)", - delta.len() - ); - let msg = SyncMessage::Bulk { ops: delta }; - if let Ok(json) = serde_json::to_string(&msg) - && sink.send(WsMessage::Text(json)).await.is_err() - { - return; - } - } - Ok(Some(SyncMessage::Bulk { ops })) => { - // v1 peer — apply their bulk and send our full bulk. 
- let mut applied = 0u64; - for op_json in &ops { - if let Ok(signed_op) = serde_json::from_str::(op_json) - && crdt_state::apply_remote_op(signed_op) - { - applied += 1; - } - } - slog!( - "[crdt-sync] v1 bulk sync: received {} ops, applied {applied}", - ops.len() - ); - if let Some(all) = crdt_state::all_ops_json() { - let msg = SyncMessage::Bulk { ops: all }; - if let Ok(json) = serde_json::to_string(&msg) - && sink.send(WsMessage::Text(json)).await.is_err() - { - return; - } - } - } - Ok(Some(SyncMessage::Op { op })) => { - // Single op before negotiation — treat as v1. - if let Ok(signed_op) = serde_json::from_str::(&op) { - crdt_state::apply_remote_op(signed_op); - } - if let Some(all) = crdt_state::all_ops_json() { - let msg = SyncMessage::Bulk { ops: all }; - if let Ok(json) = serde_json::to_string(&msg) - && sink.send(WsMessage::Text(json)).await.is_err() - { - return; - } - } - } - _ => { - // Timeout or error — send full bulk as fallback. - slog!("[crdt-sync] No sync message from peer; sending full bulk as fallback"); - if let Some(all) = crdt_state::all_ops_json() { - let msg = SyncMessage::Bulk { ops: all }; - if let Ok(json) = serde_json::to_string(&msg) - && sink.send(WsMessage::Text(json)).await.is_err() - { - return; - } - } - } - } - - // Bulk-delta phase complete — signal the peer that we are ready for - // real-time op streaming. - if let Ok(json) = serde_json::to_string(&SyncMessage::Ready) - && sink.send(WsMessage::Text(json)).await.is_err() - { - return; - } - - // Subscribe to new local ops. - let Some(mut op_rx) = crdt_state::subscribe_ops() else { - return; - }; - - // Buffer for locally-generated ops produced before the peer's `ready` - // arrives. Flushed in-order once the peer signals catch-up. 
- let mut peer_ready = false; - let mut op_buffer: Vec = Vec::new(); - - // ── Keepalive state ─────────────────────────────────────────── - let mut pong_deadline = tokio::time::Instant::now() - + std::time::Duration::from_secs(PONG_TIMEOUT_SECS); - let mut ping_ticker = tokio::time::interval_at( - tokio::time::Instant::now() - + std::time::Duration::from_secs(PING_INTERVAL_SECS), - std::time::Duration::from_secs(PING_INTERVAL_SECS), - ); - - loop { - tokio::select! { - // Send periodic Ping and enforce Pong timeout. - _ = ping_ticker.tick() => { - if tokio::time::Instant::now() >= pong_deadline { - slog_warn!( - "[crdt-sync] No pong from peer {} in {}s; disconnecting", - peer_addr, - PONG_TIMEOUT_SECS - ); - break; - } - if sink.send(WsMessage::Ping(vec![])).await.is_err() { - break; - } - } - // Forward new local ops to the peer encoded via the wire codec. - result = op_rx.recv() => { - match result { - Ok(signed_op) => { - if peer_ready { - let bytes = crdt_wire::encode(&signed_op); - if sink.send(WsMessage::Binary(bytes)).await.is_err() { - break; - } - } else { - // Buffer until the peer signals ready. - op_buffer.push(signed_op); - } - } - Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => { - // The peer cannot keep up; disconnect so it can - // reconnect and receive a fresh bulk state dump. - slog!("[crdt-sync] Slow peer lagged {n} ops; disconnecting"); - break; - } - Err(_) => break, - } - } - // Receive ops from the peer. - frame = stream.next() => { - match frame { - Some(Ok(WsMessage::Pong(_))) => { - // Reset the pong deadline on every Pong received. - pong_deadline = tokio::time::Instant::now() - + std::time::Duration::from_secs(PONG_TIMEOUT_SECS); - } - Some(Ok(WsMessage::Ping(data))) => { - // Respond to peer's Ping so the peer's keepalive passes. - let _ = sink.send(WsMessage::Pong(data)).await; - } - Some(Ok(WsMessage::Text(text))) => { - // Check for the ready signal before other text frames. 
- if let Ok(SyncMessage::Ready) = serde_json::from_str(&text) { - peer_ready = true; - slog!("[crdt-sync] Peer ready; flushing {} buffered ops", op_buffer.len()); - let mut flush_ok = true; - for op in op_buffer.drain(..) { - let bytes = crdt_wire::encode(&op); - if sink.send(WsMessage::Binary(bytes)).await.is_err() { - flush_ok = false; - break; - } - } - if !flush_ok { - break; - } - } else { - // Bulk state dump, legacy op frame, or clock frame. - handle_incoming_text(&text); - } - } - Some(Ok(WsMessage::Binary(bytes))) => { - // Real-time op encoded via wire codec — applied immediately - // regardless of our own ready state. - handle_incoming_binary(&bytes); - } - Some(Ok(WsMessage::Close(_))) | None => break, - _ => {} - } - } - } - } - - slog!("[crdt-sync] Peer disconnected"); - }) - .into_response() -} - -/// Wait for the next text-frame sync message from the peer, handling Ping/Pong -/// transparently. -/// -/// Returns `None` on connection close or read error. -async fn wait_for_sync_text( - stream: &mut futures::stream::SplitStream, - sink: &mut futures::stream::SplitSink, -) -> Option { - loop { - match stream.next().await { - Some(Ok(WsMessage::Text(text))) => { - return serde_json::from_str(&text).ok(); - } - Some(Ok(WsMessage::Ping(data))) => { - let _ = sink.send(WsMessage::Pong(data)).await; - } - Some(Ok(WsMessage::Pong(_))) => continue, - _ => return None, - } - } -} - -/// Close the WebSocket with a generic `auth_failed` reason. -/// -/// The close reason is intentionally the same for all auth failures -/// (bad signature, untrusted key, malformed message) to avoid leaking -/// which check failed. -async fn close_with_auth_failed( - sink: &mut futures::stream::SplitSink, -) { - let _ = sink - .send(WsMessage::Close(Some(( - poem::web::websocket::CloseCode::from(4002), - "auth_failed".to_string(), - )))) - .await; - let _ = sink.close().await; -} - -/// Process an incoming text-frame sync message from a peer. 
-/// -/// Text frames carry the bulk state dump (`SyncMessage::Bulk`), legacy -/// single-op messages (`SyncMessage::Op`), or snapshot protocol messages. -fn handle_incoming_text(text: &str) { - // First try to parse as a snapshot protocol message. - if let Ok(snapshot_msg) = serde_json::from_str::(text) { - handle_snapshot_message(snapshot_msg); - return; - } - - let msg: SyncMessage = match serde_json::from_str(text) { - Ok(m) => m, - Err(e) => { - slog!("[crdt-sync] Bad text message from peer: {e}"); - return; - } - }; - - match msg { - SyncMessage::Bulk { ops } => { - let mut applied = 0u64; - for op_json in &ops { - if let Ok(signed_op) = serde_json::from_str::(op_json) - && crdt_state::apply_remote_op(signed_op) - { - applied += 1; - } - } - slog!( - "[crdt-sync] Bulk sync: received {} ops, applied {applied}", - ops.len() - ); - } - SyncMessage::Op { op } => { - if let Ok(signed_op) = serde_json::from_str::(&op) { - crdt_state::apply_remote_op(signed_op); - } - } - SyncMessage::Clock { .. } => { - // Clock frames are handled during the initial negotiation phase. - // If one arrives during the streaming loop it is a protocol error - // on the peer's part — log and ignore. - slog!("[crdt-sync] Ignoring unexpected clock frame during streaming phase"); - } - SyncMessage::Ready => { - // Ready frames are intercepted inline in the streaming loop before - // this function is called. If one reaches here it is a protocol - // error — log and ignore. - slog!("[crdt-sync] Ignoring unexpected ready frame in handle_incoming_text"); - } - } -} - -/// Handle an incoming snapshot protocol message. -/// -/// - **Snapshot**: apply the snapshot state and send an ack back. -/// Peers without snapshot support will never reach this code path because -/// the `SnapshotMessage` parse will fail and the message falls through to -/// the legacy `SyncMessage` handler, which logs and ignores unknown types. -/// - **SnapshotAck**: record the ack for quorum tracking. 
-fn handle_snapshot_message(msg: crdt_snapshot::SnapshotMessage) { - match msg { - crdt_snapshot::SnapshotMessage::Snapshot(snapshot) => { - slog!( - "[crdt-sync] Received snapshot at_seq={}, {} ops, {} manifest entries", - snapshot.at_seq, - snapshot.state.len(), - snapshot.op_manifest.len() - ); - // Apply compaction on this peer. - crdt_snapshot::apply_compaction(snapshot.clone()); - - // Send ack back to leader via the sync broadcast channel. - // The ack is sent as a CRDT event that the streaming loop picks up. - // For now, log the ack intent — actual transport is handled by the - // caller that invokes handle_incoming_text. - slog!( - "[crdt-sync] Snapshot applied, ack for at_seq={}", - snapshot.at_seq - ); - } - crdt_snapshot::SnapshotMessage::SnapshotAck(ack) => { - if let Some(node_id) = crdt_state::our_node_id() { - let _ = node_id; // The ack comes from a peer, not from us. - } - slog!( - "[crdt-sync] Received snapshot_ack for at_seq={}", - ack.at_seq - ); - // Record the ack — the coordination logic checks for quorum. - // Note: we don't know the peer's node_id from the message alone; - // in a full implementation the ack would include the sender's - // node_id. For now we log it for protocol completeness. - } - } -} - -/// Process an incoming binary-frame op from a peer. -/// -/// Binary frames carry a single `SignedOp` encoded via [`crdt_wire`]. -fn handle_incoming_binary(bytes: &[u8]) { - match crdt_wire::decode(bytes) { - Ok(signed_op) => { - crdt_state::apply_remote_op(signed_op); - } - Err(e) => { - slog!("[crdt-sync] Bad binary frame from peer: {e}"); - } - } -} - -// ── Rendezvous client ─────────────────────────────────────────────── - -/// Number of consecutive connection failures before escalating from WARN to ERROR. -pub const RENDEZVOUS_ERROR_THRESHOLD: u32 = 10; - -/// Spawn a background task that connects to the configured rendezvous -/// peer and exchanges CRDT ops bidirectionally. 
-/// -/// The client reconnects with exponential backoff if the connection drops. -/// Individual failures are logged at WARN; after [`RENDEZVOUS_ERROR_THRESHOLD`] -/// consecutive failures the log level escalates to ERROR. -/// -/// When `token` is provided it is appended to the upgrade URL as -/// `?token=` so the server's bearer-token check is satisfied. This -/// reuses the existing `--join-token` / `HUSKIES_JOIN_TOKEN` plumbing on the -/// agent side. -pub fn spawn_rendezvous_client(url: String, token: Option) { - tokio::spawn(async move { - let mut backoff_secs = 1u64; - let mut consecutive_failures: u32 = 0; - loop { - slog!("[crdt-sync] Connecting to rendezvous peer: {url}"); - match connect_and_sync(&url, token.as_deref()).await { - Ok(()) => { - slog!("[crdt-sync] Rendezvous connection closed cleanly"); - backoff_secs = 1; - consecutive_failures = 0; - } - Err(e) => { - consecutive_failures += 1; - if consecutive_failures >= RENDEZVOUS_ERROR_THRESHOLD { - slog_error!( - "[crdt-sync] Rendezvous peer unreachable ({consecutive_failures} consecutive failures): {e}" - ); - } else { - slog_warn!( - "[crdt-sync] Rendezvous connection error (attempt {consecutive_failures}): {e}" - ); - } - } - } - slog!("[crdt-sync] Reconnecting in {backoff_secs}s..."); - tokio::time::sleep(std::time::Duration::from_secs(backoff_secs)).await; - backoff_secs = (backoff_secs * 2).min(30); - } - }); -} - -/// Connect to a remote sync endpoint and exchange ops until disconnect. -/// -/// When `token` is supplied it is appended as `?token=` to the -/// connection URL so the server's bearer-token check passes. 
-pub(crate) async fn connect_and_sync(url: &str, token: Option<&str>) -> Result<(), String> { - let connect_url = match token { - Some(t) => { - if url.contains('?') { - format!("{url}&token={t}") - } else { - format!("{url}?token={t}") - } - } - None => url.to_string(), - }; - let (ws_stream, _) = tokio_tungstenite::connect_async(connect_url.as_str()) - .await - .map_err(|e| format!("WebSocket connect failed: {e}"))?; - - let (mut sink, mut stream) = ws_stream.split(); - - slog!("[crdt-sync] Connected to rendezvous peer, awaiting challenge"); - - // ── Step 1: Receive challenge from listener ─────────────────── - use tokio_tungstenite::tungstenite::Message as TungsteniteMsg; - - let challenge_frame = tokio::time::timeout( - std::time::Duration::from_secs(AUTH_TIMEOUT_SECS), - stream.next(), - ) - .await - .map_err(|_| "Auth timeout waiting for challenge".to_string())? - .ok_or_else(|| "Connection closed before challenge".to_string())? - .map_err(|e| format!("WebSocket read error: {e}"))?; - - let challenge_text = match challenge_frame { - TungsteniteMsg::Text(t) => t.to_string(), - _ => return Err("Expected text frame for challenge".to_string()), - }; - - let challenge_msg: ChallengeMessage = serde_json::from_str(&challenge_text) - .map_err(|e| format!("Invalid challenge message: {e}"))?; - - if challenge_msg.r#type != "challenge" { - return Err(format!( - "Expected challenge message, got type={}", - challenge_msg.r#type - )); - } - - // ── Step 2: Sign challenge and send auth reply ──────────────── - let (pubkey_hex, signature_hex) = crdt_state::sign_challenge(&challenge_msg.nonce) - .ok_or_else(|| "CRDT not initialised — cannot sign challenge".to_string())?; - - let auth_msg = AuthMessage { - r#type: "auth".to_string(), - pubkey_hex, - signature_hex, - }; - let auth_json = serde_json::to_string(&auth_msg).map_err(|e| format!("Serialize auth: {e}"))?; - sink.send(TungsteniteMsg::Text(auth_json.into())) - .await - .map_err(|e| format!("Send auth failed: {e}"))?; - 
- slog!("[crdt-sync] Auth reply sent, waiting for sync data"); - - // v2 protocol: send our vector clock. - let our_clock = crdt_state::our_vector_clock().unwrap_or_default(); - let clock_msg = SyncMessage::Clock { clock: our_clock }; - if let Ok(json) = serde_json::to_string(&clock_msg) { - sink.send(TungsteniteMsg::Text(json.into())) - .await - .map_err(|e| format!("Send clock failed: {e}"))?; - } - - // Wait for the server's first sync message. - let first_msg = tokio::time::timeout( - std::time::Duration::from_secs(AUTH_TIMEOUT_SECS), - wait_for_rendezvous_sync_text(&mut stream), - ) - .await - .map_err(|_| "Timeout waiting for server sync message".to_string())?; - - match first_msg { - Some(SyncMessage::Clock { clock: peer_clock }) => { - // v2 server — send only the ops the server is missing. - let delta = crdt_state::ops_since(&peer_clock).unwrap_or_default(); - slog!( - "[crdt-sync] v2 delta sync: sending {} ops to server (server missing)", - delta.len() - ); - let msg = SyncMessage::Bulk { ops: delta }; - if let Ok(json) = serde_json::to_string(&msg) { - sink.send(TungsteniteMsg::Text(json.into())) - .await - .map_err(|e| format!("Send delta failed: {e}"))?; - } - } - Some(SyncMessage::Bulk { ops }) => { - // v1 server — apply their bulk and send our full bulk. - let mut applied = 0u64; - for op_json in &ops { - if let Ok(signed_op) = serde_json::from_str::(op_json) - && crdt_state::apply_remote_op(signed_op) - { - applied += 1; - } - } - slog!( - "[crdt-sync] v1 bulk sync: received {} ops from server, applied {applied}", - ops.len() - ); - if let Some(all) = crdt_state::all_ops_json() { - let msg = SyncMessage::Bulk { ops: all }; - if let Ok(json) = serde_json::to_string(&msg) { - sink.send(TungsteniteMsg::Text(json.into())) - .await - .map_err(|e| format!("Send bulk failed: {e}"))?; - } - } - } - _ => { - // Fallback — send full bulk. 
- slog!("[crdt-sync] No sync message from server; sending full bulk as fallback"); - if let Some(all) = crdt_state::all_ops_json() { - let msg = SyncMessage::Bulk { ops: all }; - if let Ok(json) = serde_json::to_string(&msg) { - sink.send(TungsteniteMsg::Text(json.into())) - .await - .map_err(|e| format!("Send bulk failed: {e}"))?; - } - } - } - } - - // Bulk-delta phase complete — signal the server that we are ready for - // real-time op streaming. - if let Ok(json) = serde_json::to_string(&SyncMessage::Ready) { - sink.send(TungsteniteMsg::Text(json.into())) - .await - .map_err(|e| format!("Send ready failed: {e}"))?; - } - - // Subscribe to new local ops. - let Some(mut op_rx) = crdt_state::subscribe_ops() else { - return Err("CRDT not initialised".to_string()); - }; - - // Buffer for locally-generated ops produced before the server's `ready` - // arrives. Flushed in-order once the server signals catch-up. - let mut peer_ready = false; - let mut op_buffer: Vec = Vec::new(); - - // ── Keepalive state ─────────────────────────────────────────────── - let mut pong_deadline = - tokio::time::Instant::now() + std::time::Duration::from_secs(PONG_TIMEOUT_SECS); - let mut ping_ticker = tokio::time::interval_at( - tokio::time::Instant::now() + std::time::Duration::from_secs(PING_INTERVAL_SECS), - std::time::Duration::from_secs(PING_INTERVAL_SECS), - ); - - loop { - tokio::select! { - // Send periodic Ping and enforce Pong timeout. 
- _ = ping_ticker.tick() => { - if tokio::time::Instant::now() >= pong_deadline { - slog_warn!( - "[crdt-sync] No pong from rendezvous peer {} in {}s; disconnecting", - url, - PONG_TIMEOUT_SECS - ); - return Err(format!( - "Keepalive timeout: no pong from {url} in {PONG_TIMEOUT_SECS}s" - )); - } - use tokio_tungstenite::tungstenite::Message as TungsteniteMsg; - if sink.send(TungsteniteMsg::Ping(bytes::Bytes::new())).await.is_err() { - break; - } - } - result = op_rx.recv() => { - match result { - Ok(signed_op) => { - if peer_ready { - // Encode via wire codec and send as binary frame. - let bytes = crdt_wire::encode(&signed_op); - use tokio_tungstenite::tungstenite::Message as TungsteniteMsg; - if sink.send(TungsteniteMsg::Binary(bytes.into())).await.is_err() { - break; - } - } else { - // Buffer until the server signals ready. - op_buffer.push(signed_op); - } - } - Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => { - slog!("[crdt-sync] Slow rendezvous link lagged {n} ops; disconnecting"); - break; - } - Err(_) => break, - } - } - frame = stream.next() => { - match frame { - Some(Ok(tokio_tungstenite::tungstenite::Message::Pong(_))) => { - // Reset the pong deadline on every Pong received. - pong_deadline = tokio::time::Instant::now() - + std::time::Duration::from_secs(PONG_TIMEOUT_SECS); - } - Some(Ok(tokio_tungstenite::tungstenite::Message::Ping(_))) => { - // tungstenite auto-responds to Ping with Pong at the - // protocol level; no manual response needed here. - } - Some(Ok(tokio_tungstenite::tungstenite::Message::Text(text))) => { - // Check for the ready signal before other text frames. - if let Ok(SyncMessage::Ready) = serde_json::from_str(text.as_ref()) { - peer_ready = true; - slog!("[crdt-sync] Server ready; flushing {} buffered ops", op_buffer.len()); - let mut flush_ok = true; - for op in op_buffer.drain(..) 
{ - let bytes = crdt_wire::encode(&op); - use tokio_tungstenite::tungstenite::Message as TungsteniteMsg; - if sink.send(TungsteniteMsg::Binary(bytes.into())).await.is_err() { - flush_ok = false; - break; - } - } - if !flush_ok { - break; - } - } else { - handle_incoming_text(text.as_ref()); - } - } - Some(Ok(tokio_tungstenite::tungstenite::Message::Binary(bytes))) => { - // Real-time op — applied immediately regardless of ready state. - handle_incoming_binary(&bytes); - } - Some(Ok(tokio_tungstenite::tungstenite::Message::Close(_))) | None => break, - Some(Err(e)) => { - slog!("[crdt-sync] Rendezvous read error: {e}"); - break; - } - _ => {} - } - } - } - } - - Ok(()) -} - -/// Wait for the next text-frame sync message from a tungstenite stream, -/// handling Ping/Pong transparently. -/// -/// Returns `None` on connection close or read error. -async fn wait_for_rendezvous_sync_text( - stream: &mut futures::stream::SplitStream< - tokio_tungstenite::WebSocketStream< - tokio_tungstenite::MaybeTlsStream, - >, - >, -) -> Option { - use tokio_tungstenite::tungstenite::Message as TungsteniteMsg; - loop { - match stream.next().await { - Some(Ok(TungsteniteMsg::Text(text))) => { - return serde_json::from_str(text.as_ref()).ok(); - } - Some(Ok(TungsteniteMsg::Ping(_) | TungsteniteMsg::Pong(_))) => continue, - _ => return None, - } - } -} - -// ── Tests ──────────────────────────────────────────────────────────── - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn sync_message_bulk_serialization_roundtrip() { - let msg = SyncMessage::Bulk { - ops: vec!["op1".to_string(), "op2".to_string()], - }; - let json = serde_json::to_string(&msg).unwrap(); - assert!(json.contains(r#""type":"bulk""#)); - let deserialized: SyncMessage = serde_json::from_str(&json).unwrap(); - match deserialized { - SyncMessage::Bulk { ops } => { - assert_eq!(ops.len(), 2); - assert_eq!(ops[0], "op1"); - assert_eq!(ops[1], "op2"); - } - _ => panic!("Expected Bulk"), - } - } - - #[test] - fn 
sync_message_op_serialization_roundtrip() { - let msg = SyncMessage::Op { - op: r#"{"inner":{}}"#.to_string(), - }; - let json = serde_json::to_string(&msg).unwrap(); - assert!(json.contains(r#""type":"op""#)); - let deserialized: SyncMessage = serde_json::from_str(&json).unwrap(); - match deserialized { - SyncMessage::Op { op } => { - assert_eq!(op, r#"{"inner":{}}"#); - } - _ => panic!("Expected Op"), - } - } - - #[test] - fn handle_incoming_text_bad_json_does_not_panic() { - handle_incoming_text("not valid json"); - } - - #[test] - fn handle_incoming_text_bulk_with_invalid_ops_does_not_panic() { - let msg = SyncMessage::Bulk { - ops: vec!["not a valid signed op".to_string()], - }; - let json = serde_json::to_string(&msg).unwrap(); - handle_incoming_text(&json); - } - - #[test] - fn handle_incoming_text_op_with_invalid_op_does_not_panic() { - let msg = SyncMessage::Op { - op: "garbage".to_string(), - }; - let json = serde_json::to_string(&msg).unwrap(); - handle_incoming_text(&json); - } - - #[test] - fn handle_incoming_binary_bad_bytes_does_not_panic() { - handle_incoming_binary(b"not valid wire codec"); - } - - #[test] - fn handle_incoming_binary_empty_bytes_does_not_panic() { - handle_incoming_binary(b""); - } - - #[test] - fn subscribe_ops_returns_none_before_init() { - // Before crdt_state::init() the channel doesn't exist yet. - // In test binaries it may or may not be initialised depending on - // other tests, so we just verify no panic. 
- let _ = crdt_state::subscribe_ops(); - } - - #[test] - fn all_ops_json_returns_none_before_init() { - let _ = crdt_state::all_ops_json(); - } - - #[test] - fn sync_message_bulk_empty_ops() { - let msg = SyncMessage::Bulk { ops: vec![] }; - let json = serde_json::to_string(&msg).unwrap(); - let deserialized: SyncMessage = serde_json::from_str(&json).unwrap(); - match deserialized { - SyncMessage::Bulk { ops } => assert!(ops.is_empty()), - _ => panic!("Expected Bulk"), - } - } - - /// Simulate the sync protocol by creating real SignedOps on two separate - /// CRDT instances and exchanging them through the SyncMessage wire format. - #[test] - fn two_node_sync_via_protocol_messages() { - use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, OpState}; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use serde_json::json; - - use crate::crdt_state::PipelineDoc; - - // ── Node A: create an item ── - let kp_a = make_keypair(); - let mut crdt_a = BaseCrdt::::new(&kp_a); - - let item: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "100_story_sync_test", - "stage": "1_backlog", - "name": "Sync Test", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let op1 = crdt_a.doc.items.insert(ROOT_ID, item).sign(&kp_a); - assert_eq!(crdt_a.apply(op1.clone()), OpState::Ok); - - // Serialise op1 into a SyncMessage::Op. - let op1_json = serde_json::to_string(&op1).unwrap(); - let wire_msg = SyncMessage::Op { - op: op1_json.clone(), - }; - let wire_json = serde_json::to_string(&wire_msg).unwrap(); - - // ── Node B: receive the op through protocol ── - let kp_b = make_keypair(); - let mut crdt_b = BaseCrdt::::new(&kp_b); - assert!(crdt_b.doc.items.view().is_empty()); - - // Parse wire message and apply. 
- let parsed: SyncMessage = serde_json::from_str(&wire_json).unwrap(); - match parsed { - SyncMessage::Op { op } => { - let signed_op: bft_json_crdt::json_crdt::SignedOp = - serde_json::from_str(&op).unwrap(); - let result = crdt_b.apply(signed_op); - assert_eq!(result, OpState::Ok); - } - _ => panic!("Expected Op"), - } - - // Verify Node B has the same state as Node A. - assert_eq!(crdt_b.doc.items.view().len(), 1); - assert_eq!( - crdt_a.doc.items[0].story_id.view(), - crdt_b.doc.items[0].story_id.view() - ); - assert_eq!( - crdt_a.doc.items[0].stage.view(), - crdt_b.doc.items[0].stage.view() - ); - - // ── Node A: update stage ── - let op2 = crdt_a.doc.items[0] - .stage - .set("2_current".to_string()) - .sign(&kp_a); - crdt_a.apply(op2.clone()); - - // Send via bulk message. - let op2_json = serde_json::to_string(&op2).unwrap(); - let bulk_msg = SyncMessage::Bulk { - ops: vec![op1_json, op2_json], - }; - let bulk_wire = serde_json::to_string(&bulk_msg).unwrap(); - - // ── Node C: receives full state via bulk ── - let kp_c = make_keypair(); - let mut crdt_c = BaseCrdt::::new(&kp_c); - - let parsed_bulk: SyncMessage = serde_json::from_str(&bulk_wire).unwrap(); - match parsed_bulk { - SyncMessage::Bulk { ops } => { - for op_str in &ops { - let signed: bft_json_crdt::json_crdt::SignedOp = - serde_json::from_str(op_str).unwrap(); - crdt_c.apply(signed); - } - } - _ => panic!("Expected Bulk"), - } - - // Node C should have the updated stage. - assert_eq!(crdt_c.doc.items.view().len(), 1); - assert_eq!( - crdt_c.doc.items[0].stage.view(), - bft_json_crdt::json_crdt::JsonValue::String("2_current".to_string()) - ); - } - - /// Verify that a single node's ops (insert + update) can be replayed - /// on another node via bulk sync and produce the same final state. - /// This is the core property needed for partition healing: when a - /// disconnected node reconnects, it sends all its ops as a bulk - /// message and the receiver catches up. 
- #[test] - fn partition_heal_via_bulk_replay() { - use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV}; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use serde_json::json; - - use crate::crdt_state::PipelineDoc; - - let kp = make_keypair(); - - // Node A creates an item and advances it. - let mut crdt_a = BaseCrdt::::new(&kp); - let item: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "200_story_heal", - "stage": "1_backlog", - "name": "Heal Test", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - - let op1 = crdt_a.doc.items.insert(ROOT_ID, item).sign(&kp); - crdt_a.apply(op1.clone()); - - let op2 = crdt_a.doc.items[0] - .stage - .set("2_current".to_string()) - .sign(&kp); - crdt_a.apply(op2.clone()); - - let op3 = crdt_a.doc.items[0].stage.set("3_qa".to_string()).sign(&kp); - crdt_a.apply(op3.clone()); - - // Serialise all ops as a bulk message (simulates partition heal). - let ops_json: Vec = [&op1, &op2, &op3] - .iter() - .map(|op| serde_json::to_string(op).unwrap()) - .collect(); - let bulk = SyncMessage::Bulk { ops: ops_json }; - let wire = serde_json::to_string(&bulk).unwrap(); - - // Node B receives the bulk and reconstructs state. - let mut crdt_b = BaseCrdt::::new(&kp); - let parsed: SyncMessage = serde_json::from_str(&wire).unwrap(); - match parsed { - SyncMessage::Bulk { ops } => { - for op_str in &ops { - let signed: bft_json_crdt::json_crdt::SignedOp = - serde_json::from_str(op_str).unwrap(); - crdt_b.apply(signed); - } - } - _ => panic!("Expected Bulk"), - } - - // Node B should match Node A exactly. 
- assert_eq!(crdt_b.doc.items.view().len(), 1); - assert_eq!( - crdt_b.doc.items[0].stage.view(), - JV::String("3_qa".to_string()) - ); - assert_eq!( - crdt_a.doc.items[0].stage.view(), - crdt_b.doc.items[0].stage.view() - ); - assert_eq!( - crdt_a.doc.items[0].name.view(), - crdt_b.doc.items[0].name.view() - ); - } - - #[test] - fn config_rendezvous_parsed_from_toml() { - let toml_str = r#" -rendezvous = "ws://remote:3001/crdt-sync" - -[[agent]] -name = "test" -"#; - let config: crate::config::ProjectConfig = toml::from_str(toml_str).unwrap(); - assert_eq!( - config.rendezvous.as_deref(), - Some("ws://remote:3001/crdt-sync") - ); - } - - #[test] - fn config_rendezvous_defaults_to_none() { - let config = crate::config::ProjectConfig::default(); - assert!(config.rendezvous.is_none()); - } - - // ── AC4: Failure logging escalation ────────────────────────────────────── - - /// AC4: Connection errors must be logged at WARN for the first nine - /// consecutive failures and escalate to ERROR from the tenth onwards. - #[test] - fn failure_counter_warn_below_threshold() { - let threshold = super::RENDEZVOUS_ERROR_THRESHOLD; - let mut consecutive_failures: u32 = 0; - - // First threshold-1 failures are below the ERROR threshold. - for _ in 0..(threshold - 1) { - consecutive_failures += 1; - assert!( - consecutive_failures < threshold, - "failure {consecutive_failures} must be below ERROR threshold {threshold}" - ); - } - } - - /// AC4: The tenth consecutive failure must trigger ERROR-level logging. - #[test] - fn failure_counter_error_at_threshold() { - let threshold = super::RENDEZVOUS_ERROR_THRESHOLD; - let consecutive_failures: u32 = threshold; - assert!( - consecutive_failures >= threshold, - "failure {consecutive_failures} must reach or exceed ERROR threshold {threshold}" - ); - } - - /// AC4: A successful connection resets the failure counter so subsequent - /// single failures are again logged at WARN (not ERROR). 
- #[test] - fn failure_counter_resets_on_success() { - let threshold = super::RENDEZVOUS_ERROR_THRESHOLD; - // Simulate sustained failure. - let mut consecutive_failures: u32 = threshold + 5; - assert!(consecutive_failures >= threshold); - - // Simulate a clean reconnect. - consecutive_failures = 0; - assert_eq!( - consecutive_failures, 0, - "counter must reset to 0 on success" - ); - - // Next error is attempt 1 — well below the ERROR threshold. - consecutive_failures += 1; - assert!( - consecutive_failures < threshold, - "first failure after reset must be below ERROR threshold" - ); - } - - /// AC4: The RENDEZVOUS_ERROR_THRESHOLD constant must equal 10. - #[test] - fn error_threshold_is_ten() { - assert_eq!( - super::RENDEZVOUS_ERROR_THRESHOLD, - 10, - "ERROR escalation threshold must be 10 consecutive failures" - ); - } - - // ── AC5: Self-loop dedup ────────────────────────────────────────────────── - - /// AC5: Applying the same SignedOp twice returns AlreadySeen on the second - /// call and leaves the CRDT state unchanged. - #[test] - fn self_loop_dedup_second_apply_is_noop() { - use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV, OpState}; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use serde_json::json; - - use crate::crdt_state::PipelineDoc; - - let kp = make_keypair(); - let mut crdt = BaseCrdt::::new(&kp); - - let item: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "507_dedup_test", - "stage": "1_backlog", - "name": "Dedup Test", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - - let op = crdt.doc.items.insert(ROOT_ID, item).sign(&kp); - - // First apply: succeeds. - assert_eq!(crdt.apply(op.clone()), OpState::Ok); - assert_eq!(crdt.doc.items.view().len(), 1); - - // Second apply (self-loop): must be a no-op. - assert_eq!(crdt.apply(op.clone()), OpState::AlreadySeen); - - // State must not have changed. 
- assert_eq!(crdt.doc.items.view().len(), 1); - - // Stage update also deduplicated correctly. - let stage_op = crdt.doc.items[0] - .stage - .set("2_current".to_string()) - .sign(&kp); - assert_eq!(crdt.apply(stage_op.clone()), OpState::Ok); - assert_eq!( - crdt.doc.items[0].stage.view(), - JV::String("2_current".to_string()) - ); - assert_eq!(crdt.apply(stage_op), OpState::AlreadySeen); - assert_eq!( - crdt.doc.items[0].stage.view(), - JV::String("2_current".to_string()), - "stage must not change on duplicate apply" - ); - } - - // ── AC3 & AC7: Out-of-order causal queueing ─────────────────────────────── - - /// AC3/AC7: An op whose causal dependency has not yet arrived is held in the - /// queue (returns MissingCausalDependencies). When the dependency arrives - /// the queued op is released and applied automatically. - #[test] - fn out_of_order_causal_queueing_releases_on_dep_arrival() { - use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV, OpState}; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use serde_json::json; - - use crate::crdt_state::PipelineDoc; - - let kp = make_keypair(); - let mut crdt = BaseCrdt::::new(&kp); - - let item: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "507_causal_test", - "stage": "1_backlog", - "name": "Causal Test", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - - // op1 = insert item (no deps) - let op1 = crdt.doc.items.insert(ROOT_ID, item).sign(&kp); - - // op2 = set stage, declared to depend on op1 - // We must first apply op1 locally to generate op2 from the right state, - // then we'll test op2-before-op1 on a fresh CRDT. - crdt.apply(op1.clone()); - let op2 = crdt.doc.items[0] - .stage - .set("2_current".to_string()) - .sign_with_dependencies(&kp, vec![&op1]); - - // Create a fresh receiver CRDT. 
- let mut receiver = BaseCrdt::::new(&kp); - - // Apply op2 first — dependency (op1) has not arrived yet. - let r = receiver.apply(op2.clone()); - assert_eq!( - r, - OpState::MissingCausalDependencies, - "op2 must be queued when op1 has not arrived" - ); - // Queue length must reflect the held op. - assert_eq!(receiver.causal_queue_len(), 1); - - // Item has NOT been inserted yet (op1 not applied). - assert_eq!(receiver.doc.items.view().len(), 0); - - // Now deliver op1 — this should trigger op2 to be flushed automatically. - let r = receiver.apply(op1.clone()); - assert_eq!(r, OpState::Ok); - - // Both ops are now applied — item is present at stage 2_current. - assert_eq!(receiver.doc.items.view().len(), 1); - assert_eq!( - receiver.doc.items[0].stage.view(), - JV::String("2_current".to_string()), - "op2 must have been applied automatically after op1 arrived" - ); - - // Queue must be empty now. - assert_eq!(receiver.causal_queue_len(), 0); - } - - /// AC7: In-order apply works correctly (no causal queueing needed). 
- #[test] - fn in_order_apply_works_without_queueing() { - use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV, OpState}; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use serde_json::json; - - use crate::crdt_state::PipelineDoc; - - let kp = make_keypair(); - let mut crdt_a = BaseCrdt::::new(&kp); - - let item: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "507_inorder_test", - "stage": "1_backlog", - "name": "In-Order Test", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - - let op1 = crdt_a.doc.items.insert(ROOT_ID, item).sign(&kp); - crdt_a.apply(op1.clone()); - let op2 = crdt_a.doc.items[0] - .stage - .set("2_current".to_string()) - .sign(&kp); - crdt_a.apply(op2.clone()); - let op3 = crdt_a.doc.items[0].stage.set("3_qa".to_string()).sign(&kp); - crdt_a.apply(op3.clone()); - - // Receiver applies all ops in the correct order. - let mut crdt_b = BaseCrdt::::new(&kp); - assert_eq!(crdt_b.apply(op1), OpState::Ok); - assert_eq!(crdt_b.apply(op2), OpState::Ok); - assert_eq!(crdt_b.apply(op3), OpState::Ok); - assert_eq!(crdt_b.causal_queue_len(), 0); - assert_eq!( - crdt_b.doc.items[0].stage.view(), - JV::String("3_qa".to_string()) - ); - } - - // ── AC4: Queue overflow behaviour ───────────────────────────────────────── - - /// AC4: When the causal-order queue exceeds CAUSAL_QUEUE_MAX the oldest - /// pending op is evicted (queue never grows beyond the cap). - #[test] - fn causal_queue_overflow_drops_oldest() { - use bft_json_crdt::json_crdt::{BaseCrdt, CAUSAL_QUEUE_MAX, OpState}; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use serde_json::json; - - use crate::crdt_state::PipelineDoc; - - let kp = make_keypair(); - - // Build one "phantom" op that we'll claim as a dependency but never deliver. - // We do this by creating it on a separate CRDT and never applying it. 
- let mut source = BaseCrdt::::new(&kp); - let phantom_item: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "507_phantom", - "stage": "1_backlog", - "name": "Phantom", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let phantom_op = source.doc.items.insert(ROOT_ID, phantom_item).sign(&kp); - - // Receiver never sees phantom_op, so any op declaring it as a dep will - // sit in the causal queue forever (until evicted by overflow). - let mut receiver = BaseCrdt::::new(&kp); - source.apply(phantom_op.clone()); - - // Send CAUSAL_QUEUE_MAX + 5 stage-update ops all depending on phantom_op. - // Each one will be queued because phantom_op is never delivered. - let mut queued = 0usize; - for i in 0..CAUSAL_QUEUE_MAX + 5 { - let stage_name = format!("stage_{i}"); - // Generate from source so seq numbers are valid. - let op = source.doc.items[0] - .stage - .set(stage_name) - .sign_with_dependencies(&kp, vec![&phantom_op]); - source.apply(op.clone()); - let r = receiver.apply(op); - if r == OpState::MissingCausalDependencies { - queued += 1; - } - } - - // We sent more than CAUSAL_QUEUE_MAX ops, but the queue must stay bounded. - assert!( - receiver.causal_queue_len() <= CAUSAL_QUEUE_MAX, - "queue ({}) must not exceed CAUSAL_QUEUE_MAX ({CAUSAL_QUEUE_MAX})", - receiver.causal_queue_len() - ); - assert!( - queued > 0, - "at least some ops must have been accepted into the queue" - ); - } - - // ── AC6: Convergence test ───────────────────────────────────────────────── - - /// AC6: Two CRDT instances generate interleaved ops on each side, simulate a - /// network partition by withholding each other's ops, then exchange all - /// buffered ops. Final state must be byte-identical on both nodes. 
- #[test] - fn convergence_after_partition_and_replay() { - use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV, OpState}; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use serde_json::json; - - use crate::crdt_state::PipelineDoc; - - let kp_a = make_keypair(); - let kp_b = make_keypair(); - - let mut crdt_a = BaseCrdt::::new(&kp_a); - let mut crdt_b = BaseCrdt::::new(&kp_b); - - // ── Phase 1: A generates ops while partitioned from B ────────────── - - let item_a: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "507_convergence_a", - "stage": "1_backlog", - "name": "Story A", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let op_a1 = crdt_a.doc.items.insert(ROOT_ID, item_a).sign(&kp_a); - crdt_a.apply(op_a1.clone()); - - let op_a2 = crdt_a.doc.items[0] - .stage - .set("2_current".to_string()) - .sign(&kp_a); - crdt_a.apply(op_a2.clone()); - - // ── Phase 2: B generates ops while partitioned from A ────────────── - - let item_b: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "507_convergence_b", - "stage": "1_backlog", - "name": "Story B", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let op_b1 = crdt_b.doc.items.insert(ROOT_ID, item_b).sign(&kp_b); - crdt_b.apply(op_b1.clone()); - - let op_b2 = crdt_b.doc.items[0] - .stage - .set("2_current".to_string()) - .sign(&kp_b); - crdt_b.apply(op_b2.clone()); - - // ── Phase 3: Reconnect — both sides replay all buffered ops ──────── - - // A sends its ops to B. - let r = crdt_b.apply(op_a1.clone()); - assert!(r == OpState::Ok || r == OpState::AlreadySeen); - let r = crdt_b.apply(op_a2.clone()); - assert!(r == OpState::Ok || r == OpState::AlreadySeen); - - // B sends its ops to A. 
- let r = crdt_a.apply(op_b1.clone()); - assert!(r == OpState::Ok || r == OpState::AlreadySeen); - let r = crdt_a.apply(op_b2.clone()); - assert!(r == OpState::Ok || r == OpState::AlreadySeen); - - // ── Phase 4: Assert convergence ──────────────────────────────────── - - // Both nodes must have both stories. - assert_eq!( - crdt_a.doc.items.view().len(), - 2, - "A must have 2 items after convergence" - ); - assert_eq!( - crdt_b.doc.items.view().len(), - 2, - "B must have 2 items after convergence" - ); - - // Serialise both CRDT views to JSON and assert byte-identical. - let view_a = serde_json::to_string(&CrdtNode::view(&crdt_a.doc.items)).unwrap(); - let view_b = serde_json::to_string(&CrdtNode::view(&crdt_b.doc.items)).unwrap(); - assert_eq!( - view_a, view_b, - "CRDT states must be byte-identical after convergence" - ); - - // Spot-check: both stories are at 2_current on both nodes. - let stories_a: Vec = crdt_a - .doc - .items - .view() - .iter() - .filter_map(|item| { - if let JV::Object(m) = CrdtNode::view(item) { - m.get("story_id").and_then(|s| { - if let JV::String(s) = s { - Some(s.clone()) - } else { - None - } - }) - } else { - None - } - }) - .collect(); - assert!( - stories_a.contains(&"507_convergence_a".to_string()), - "A must contain story_a" - ); - assert!( - stories_a.contains(&"507_convergence_b".to_string()), - "A must contain story_b" - ); - } - - // ── AC8: peer lifecycle tests ───────────────────────────────────────────── - - /// AC8: A peer that connects and then receives a subsequently-applied op - /// gets that op encoded via the wire codec (binary frame). 
- #[test] - fn peer_receives_op_encoded_via_wire_codec() { - use bft_json_crdt::json_crdt::BaseCrdt; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use serde_json::json; - - use crate::crdt_state::PipelineDoc; - use crate::crdt_wire; - - let kp = make_keypair(); - let mut crdt = BaseCrdt::::new(&kp); - let item: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "506_story_lifecycle_test", - "stage": "1_backlog", - "name": "Lifecycle Test", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let op = crdt.doc.items.insert(ROOT_ID, item).sign(&kp); - - // Simulate what the broadcast handler does: encode via wire codec. - let bytes = crdt_wire::encode(&op); - - // The bytes must be a versioned JSON envelope, not a SyncMessage wrapper. - let text = std::str::from_utf8(&bytes).expect("wire output is valid UTF-8"); - assert!( - text.contains("\"v\":1"), - "wire codec version tag must be present: {text}" - ); - assert!( - !text.contains("\"type\":\"op\""), - "must not be wrapped in SyncMessage: {text}" - ); - - // The receiving peer can decode and apply the op. - let decoded = crdt_wire::decode(&bytes).expect("decode must succeed"); - assert_eq!(op, decoded); - } - - /// AC8: Multiple connected peers all receive the same broadcast op. - #[tokio::test] - async fn multiple_peers_all_receive_broadcast_op() { - use bft_json_crdt::json_crdt::BaseCrdt; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use serde_json::json; - use tokio::sync::broadcast; - - use crate::crdt_state::PipelineDoc; - use crate::crdt_wire; - - // Create a broadcast channel (analogous to SYNC_TX). 
- let (tx, _) = broadcast::channel::(16); - let mut rx_peer1 = tx.subscribe(); - let mut rx_peer2 = tx.subscribe(); - - let kp = make_keypair(); - let mut crdt = BaseCrdt::::new(&kp); - let item: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "506_story_multi_peer_test", - "stage": "1_backlog", - "name": "Multi-Peer Test", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let op = crdt.doc.items.insert(ROOT_ID, item).sign(&kp); - - // Broadcast one op. - tx.send(op.clone()).expect("send must succeed"); - - // Both peers receive the same op. - let received1 = rx_peer1.recv().await.expect("peer 1 must receive"); - let received2 = rx_peer2.recv().await.expect("peer 2 must receive"); - assert_eq!(received1, op); - assert_eq!(received2, op); - - // Both encode identically via wire codec. - let bytes1 = crdt_wire::encode(&received1); - let bytes2 = crdt_wire::encode(&received2); - assert_eq!(bytes1, bytes2, "wire-encoded bytes must be identical"); - } - - /// AC8: A peer disconnecting mid-broadcast does not panic. - /// Simulated by dropping the receiver before the sender sends an op. - #[test] - fn disconnected_peer_does_not_panic() { - use bft_json_crdt::json_crdt::BaseCrdt; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use serde_json::json; - use tokio::sync::broadcast; - - use crate::crdt_state::PipelineDoc; - - let (tx, rx) = broadcast::channel::(16); - // Drop the receiver to simulate a peer that disconnected. 
- drop(rx); - - let kp = make_keypair(); - let mut crdt = BaseCrdt::::new(&kp); - let item: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "506_story_disconnect_test", - "stage": "1_backlog", - "name": "Disconnect Test", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let op = crdt.doc.items.insert(ROOT_ID, item).sign(&kp); - - // Sending to a channel with no receivers returns an error; must not panic. - let _ = tx.send(op); - } - - /// AC8: A lagged receiver gets a `Lagged` error (confirming the - /// disconnect-on-overflow behaviour is reachable). - #[tokio::test] - async fn lagged_peer_gets_lagged_error() { - use bft_json_crdt::json_crdt::BaseCrdt; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use serde_json::json; - use tokio::sync::broadcast; - - use crate::crdt_state::PipelineDoc; - - // Tiny capacity so we can trigger Lagged easily. - let (tx, mut rx) = broadcast::channel::(2); - - let kp = make_keypair(); - let mut crdt = BaseCrdt::::new(&kp); - let item: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "506_story_lag_test", - "stage": "1_backlog", - "name": "Lag Test", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let op1 = crdt.doc.items.insert(ROOT_ID, item).sign(&kp); - crdt.apply(op1.clone()); - - // Overflow the tiny buffer by sending more ops than the capacity. - let op2 = crdt.doc.items[0] - .stage - .set("2_current".to_string()) - .sign(&kp); - crdt.apply(op2.clone()); - let op3 = crdt.doc.items[0].stage.set("3_qa".to_string()).sign(&kp); - crdt.apply(op3.clone()); - let op4 = crdt.doc.items[0].stage.set("4_merge".to_string()).sign(&kp); - crdt.apply(op4.clone()); - - // Send more ops than the channel capacity without consuming. 
- let _ = tx.send(op1); - let _ = tx.send(op2); - let _ = tx.send(op3); - let _ = tx.send(op4); - - // The slow peer should now see a Lagged error on next recv. - // Consume until we hit Lagged or run out. - let mut got_lagged = false; - for _ in 0..10 { - match rx.recv().await { - Err(broadcast::error::RecvError::Lagged(_)) => { - got_lagged = true; - break; - } - Ok(_) => continue, - Err(broadcast::error::RecvError::Closed) => break, - } - } - assert!( - got_lagged, - "slow peer must receive a Lagged error when channel overflows" - ); - } - - // ── AC5 (story 508): E2E convergence test via real WebSocket ───────────── - - /// AC5: Spin up two in-process WebSocket nodes. Node A serves a - /// `/crdt-sync`-compatible endpoint; Node B connects as a rendezvous client. - /// Node A sends a bulk state; Node B applies it. Assert both nodes see the - /// same items within a bounded time window. - #[tokio::test] - async fn e2e_convergence_two_websocket_nodes() { - use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV, OpState}; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use futures::{SinkExt, StreamExt}; - use serde_json::json; - use std::sync::{Arc, Mutex}; - use tokio::net::TcpListener; - use tokio_tungstenite::tungstenite::Message as TMsg; - use tokio_tungstenite::{accept_async, connect_async}; - - use crate::crdt_state::PipelineDoc; - - // ── Node A: build local state ────────────────────────────────────── - let kp_a = make_keypair(); - let mut crdt_a = BaseCrdt::::new(&kp_a); - - let item: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "508_e2e_convergence", - "stage": "2_current", - "name": "E2E Convergence Test", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let op1 = crdt_a.doc.items.insert(ROOT_ID, item).sign(&kp_a); - crdt_a.apply(op1.clone()); - - // Serialise A's full state as a bulk message. 
- let op1_json = serde_json::to_string(&op1).unwrap(); - let bulk_msg = SyncMessage::Bulk { - ops: vec![op1_json], - }; - let bulk_wire = serde_json::to_string(&bulk_msg).unwrap(); - - // ── Start Node A's WebSocket server on a random port ─────────────── - let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); - let addr = listener.local_addr().unwrap(); - - let bulk_to_send = bulk_wire.clone(); - let received_by_a: Arc>> = Arc::new(Mutex::new(vec![])); - let received_by_a_clone = received_by_a.clone(); - - tokio::spawn(async move { - let (tcp_stream, _) = listener.accept().await.unwrap(); - let ws_stream = accept_async(tcp_stream).await.unwrap(); - let (mut sink, mut stream) = ws_stream.split(); - - // Send bulk state to the connecting peer. - sink.send(TMsg::Text(bulk_to_send.into())).await.unwrap(); - - // Also listen for ops sent by the peer. - if let Some(Ok(TMsg::Text(txt))) = stream.next().await { - received_by_a_clone.lock().unwrap().push(txt.to_string()); - } - }); - - // ── Node B: connect to Node A and exchange state ─────────────────── - let kp_b = make_keypair(); - let mut crdt_b = BaseCrdt::::new(&kp_b); - - let url = format!("ws://{addr}"); - let (ws_b, _) = connect_async(&url).await.unwrap(); - let (mut sink_b, mut stream_b) = ws_b.split(); - - // Node B receives bulk from A. - if let Some(Ok(TMsg::Text(txt))) = stream_b.next().await { - let msg: SyncMessage = serde_json::from_str(txt.as_str()).unwrap(); - match msg { - SyncMessage::Bulk { ops } => { - for op_str in &ops { - let signed: bft_json_crdt::json_crdt::SignedOp = - serde_json::from_str(op_str).unwrap(); - let r = crdt_b.apply(signed); - assert!(r == OpState::Ok || r == OpState::AlreadySeen); - } - } - _ => panic!("Expected Bulk from Node A"), - } - } - - // Node B also creates a new op and sends it to A. 
- let item_b: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "508_e2e_convergence_b", - "stage": "1_backlog", - "name": "E2E Convergence B", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let op_b1 = crdt_b.doc.items.insert(ROOT_ID, item_b).sign(&kp_b); - crdt_b.apply(op_b1.clone()); - - let op_b1_json = serde_json::to_string(&op_b1).unwrap(); - let msg_to_a = SyncMessage::Op { op: op_b1_json }; - sink_b - .send(TMsg::Text(serde_json::to_string(&msg_to_a).unwrap().into())) - .await - .unwrap(); - - // Wait a moment for Node A to process. - tokio::time::sleep(std::time::Duration::from_millis(50)).await; - - // ── Assert convergence ───────────────────────────────────────────── - - // Node B received Node A's item. - assert_eq!( - crdt_b.doc.items.view().len(), - 2, - "Node B must see both items after sync" - ); - let has_a_item = crdt_b - .doc - .items - .view() - .iter() - .any(|item| item.story_id.view() == JV::String("508_e2e_convergence".to_string())); - assert!(has_a_item, "Node B must have Node A's item"); - - // Node A received Node B's op via the WebSocket. - let a_received = received_by_a.lock().unwrap(); - assert!( - !a_received.is_empty(), - "Node A must have received an op from Node B" - ); - } - - // ── AC6 (story 508): Partition healing E2E via real WebSocket ───────────── - - /// AC6: Two nodes exchange ops, the connection is dropped (partition), each - /// side mutates independently, then they reconnect and the reconnecting - /// client sends a fresh bulk state. Assert both converge to the same final - /// state. 
- #[tokio::test] - async fn e2e_partition_healing_websocket() { - use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV}; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use futures::{SinkExt, StreamExt}; - use serde_json::json; - use tokio::net::TcpListener; - use tokio_tungstenite::tungstenite::Message as TMsg; - use tokio_tungstenite::{accept_async, connect_async}; - - use crate::crdt_state::PipelineDoc; - - // ── Phase 1: Both nodes start with op_a1 (before partition) ─────── - let kp_a = make_keypair(); - let kp_b = make_keypair(); - let mut crdt_a = BaseCrdt::::new(&kp_a); - let mut crdt_b = BaseCrdt::::new(&kp_b); - - let item_a: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "508_heal_a", - "stage": "1_backlog", - "name": "Heal A", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let op_a1 = crdt_a.doc.items.insert(ROOT_ID, item_a).sign(&kp_a); - crdt_a.apply(op_a1.clone()); - // B also starts with op_a1 (shared state before partition). - crdt_b.apply(op_a1.clone()); - - // ── Phase 2: Partition — each side mutates independently ────────── - // A advances its story stage. - let op_a2 = crdt_a.doc.items[0] - .stage - .set("2_current".to_string()) - .sign(&kp_a); - crdt_a.apply(op_a2.clone()); - - // B inserts a new story that A doesn't know about yet. - let item_b: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "508_heal_b", - "stage": "1_backlog", - "name": "Heal B", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let op_b1 = crdt_b.doc.items.insert(ROOT_ID, item_b).sign(&kp_b); - crdt_b.apply(op_b1.clone()); - - // Collect B's full state as bulk (what it will send on reconnect). 
- let b_ops: Vec = [&op_a1, &op_b1] - .iter() - .map(|op| serde_json::to_string(op).unwrap()) - .collect(); - let b_bulk_wire = serde_json::to_string(&SyncMessage::Bulk { ops: b_ops }).unwrap(); - - // Collect A's full state as bulk (what it will send on reconnect). - let a_ops: Vec = [&op_a1, &op_a2] - .iter() - .map(|op| serde_json::to_string(op).unwrap()) - .collect(); - let a_bulk_wire = serde_json::to_string(&SyncMessage::Bulk { ops: a_ops }).unwrap(); - - // ── Phase 3: Reconnect — use a real WebSocket to exchange bulk ──── - let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); - let addr = listener.local_addr().unwrap(); - - let a_bulk_to_send = a_bulk_wire.clone(); - let a_received_bulk: std::sync::Arc>> = - std::sync::Arc::new(std::sync::Mutex::new(None)); - let a_received_clone = a_received_bulk.clone(); - - tokio::spawn(async move { - let (tcp, _) = listener.accept().await.unwrap(); - let ws = accept_async(tcp).await.unwrap(); - let (mut sink, mut stream) = ws.split(); - // A sends its bulk state. - sink.send(TMsg::Text(a_bulk_to_send.into())).await.unwrap(); - // A receives B's bulk state. - if let Some(Ok(TMsg::Text(txt))) = stream.next().await { - *a_received_clone.lock().unwrap() = Some(txt.to_string()); - } - }); - - // B connects, exchanges bulk state. - let (ws_b, _) = connect_async(format!("ws://{addr}")).await.unwrap(); - let (mut sink_b, mut stream_b) = ws_b.split(); - - // B receives A's bulk and applies it. - if let Some(Ok(TMsg::Text(txt))) = stream_b.next().await { - let msg: SyncMessage = serde_json::from_str(txt.as_str()).unwrap(); - if let SyncMessage::Bulk { ops } = msg { - for op_str in &ops { - let signed: bft_json_crdt::json_crdt::SignedOp = - serde_json::from_str(op_str).unwrap(); - let _ = crdt_b.apply(signed); - } - } - } - - // B sends its bulk state to A. 
- sink_b.send(TMsg::Text(b_bulk_wire.into())).await.unwrap(); - - tokio::time::sleep(std::time::Duration::from_millis(50)).await; - - // Apply A's received ops into crdt_a. - if let Some(bulk_str) = a_received_bulk.lock().unwrap().take() { - let msg: SyncMessage = serde_json::from_str(&bulk_str).unwrap(); - if let SyncMessage::Bulk { ops } = msg { - for op_str in &ops { - let signed: bft_json_crdt::json_crdt::SignedOp = - serde_json::from_str(op_str).unwrap(); - let _ = crdt_a.apply(signed); - } - } - } - - // ── Assert convergence ───────────────────────────────────────────── - - // Both nodes must have 2 items. - assert_eq!( - crdt_a.doc.items.view().len(), - 2, - "A must have 2 items after healing" - ); - assert_eq!( - crdt_b.doc.items.view().len(), - 2, - "B must have 2 items after healing" - ); - - // A must see B's story. - let b_story_on_a = crdt_a - .doc - .items - .view() - .iter() - .any(|item| item.story_id.view() == JV::String("508_heal_b".to_string())); - assert!(b_story_on_a, "A must have B's story after healing"); - - // B must see A's stage advance. - let a_story_on_b = crdt_b - .doc - .items - .view() - .iter() - .any(|item| item.story_id.view() == JV::String("508_heal_a".to_string())); - assert!(a_story_on_b, "B must have A's story after healing"); - - // CRDT views must be byte-identical (convergence). - let view_a = serde_json::to_string(&CrdtNode::view(&crdt_a.doc.items)).unwrap(); - let view_b = serde_json::to_string(&CrdtNode::view(&crdt_b.doc.items)).unwrap(); - assert_eq!( - view_a, view_b, - "Both nodes must converge to identical state" - ); - } - - // ── Story 628: Connect-time mutual auth integration tests ──────────── - - /// Helper: run a listener that performs the server-side auth handshake. - /// Returns the TCP listener address so the connector can connect. - /// `trusted_keys` controls which pubkeys the listener accepts. - /// If `on_authenticated` is provided, it runs after successful auth. 
- async fn start_auth_listener( - trusted_keys: Vec, - ) -> ( - std::net::SocketAddr, - tokio::sync::oneshot::Receiver, - ) { - use tokio::net::TcpListener; - use tokio_tungstenite::accept_async; - - let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); - let addr = listener.local_addr().unwrap(); - - let (result_tx, result_rx) = tokio::sync::oneshot::channel(); - - tokio::spawn(async move { - let (tcp_stream, _) = listener.accept().await.unwrap(); - let ws_stream = accept_async(tcp_stream).await.unwrap(); - let (mut sink, mut stream) = ws_stream.split(); - - use tokio_tungstenite::tungstenite::Message as TMsg; - - // Step 1: Send challenge. - let challenge = crate::node_identity::generate_challenge(); - let challenge_msg = super::ChallengeMessage { - r#type: "challenge".to_string(), - nonce: challenge.clone(), - }; - let challenge_json = serde_json::to_string(&challenge_msg).unwrap(); - if sink.send(TMsg::Text(challenge_json.into())).await.is_err() { - let _ = result_tx.send(AuthListenerResult::ConnectionLost); - return; - } - - // Step 2: Await auth reply (10s timeout). - let auth_frame = - tokio::time::timeout(std::time::Duration::from_secs(10), stream.next()).await; - - let auth_text = match auth_frame { - Ok(Some(Ok(TMsg::Text(t)))) => t.to_string(), - Ok(Some(Ok(TMsg::Close(reason)))) => { - let _ = result_tx.send(AuthListenerResult::PeerClosedEarly( - reason.map(|r| r.reason.to_string()), - )); - return; - } - _ => { - let _ = sink - .send(TMsg::Close(Some(tokio_tungstenite::tungstenite::protocol::CloseFrame { - code: tokio_tungstenite::tungstenite::protocol::frame::coding::CloseCode::from(4001), - reason: "auth_timeout".into(), - }))) - .await; - let _ = result_tx.send(AuthListenerResult::AuthTimeout); - return; - } - }; - - // Step 3: Verify. 
- let auth_msg: super::AuthMessage = match serde_json::from_str(&auth_text) { - Ok(m) => m, - Err(_) => { - let _ = close_listener_auth_failed(&mut sink).await; - let _ = result_tx.send(AuthListenerResult::AuthFailed("bad_json".into())); - return; - } - }; - - let sig_valid = crate::node_identity::verify_challenge( - &auth_msg.pubkey_hex, - &challenge, - &auth_msg.signature_hex, - ); - let key_trusted = trusted_keys.iter().any(|k| k == &auth_msg.pubkey_hex); - - if !sig_valid || !key_trusted { - let _ = close_listener_auth_failed(&mut sink).await; - let _ = result_tx.send(AuthListenerResult::AuthFailed(format!( - "sig_valid={sig_valid}, key_trusted={key_trusted}" - ))); - return; - } - - // Auth passed! Send a bulk state with one op to prove sync works. - let kp = bft_json_crdt::keypair::make_keypair(); - let mut crdt = - bft_json_crdt::json_crdt::BaseCrdt::::new(&kp); - let item: bft_json_crdt::json_crdt::JsonValue = serde_json::json!({ - "story_id": "628_auth_test_item", - "stage": "1_backlog", - "name": "Auth Test", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let op = crdt - .doc - .items - .insert(bft_json_crdt::op::ROOT_ID, item) - .sign(&kp); - let op_json = serde_json::to_string(&op).unwrap(); - let bulk = super::SyncMessage::Bulk { ops: vec![op_json] }; - let bulk_json = serde_json::to_string(&bulk).unwrap(); - let _ = sink.send(TMsg::Text(bulk_json.into())).await; - - let _ = result_tx.send(AuthListenerResult::Authenticated(auth_msg.pubkey_hex)); - }); - - (addr, result_rx) - } - - #[derive(Debug)] - #[allow(dead_code)] - enum AuthListenerResult { - Authenticated(String), - AuthFailed(String), - AuthTimeout, - ConnectionLost, - PeerClosedEarly(Option), - } - - async fn close_listener_auth_failed( - sink: &mut futures::stream::SplitSink< - tokio_tungstenite::WebSocketStream, - tokio_tungstenite::tungstenite::Message, - >, - ) { - use 
tokio_tungstenite::tungstenite::protocol::CloseFrame; - use tokio_tungstenite::tungstenite::protocol::frame::coding::CloseCode; - let _ = sink - .send(tokio_tungstenite::tungstenite::Message::Close(Some( - CloseFrame { - code: CloseCode::from(4002), - reason: "auth_failed".into(), - }, - ))) - .await; - } - - /// AC5 (story 628): Happy path — two nodes with each other's pubkeys - /// allow-listed complete the handshake and exchange at least one CRDT op. - #[tokio::test] - async fn auth_happy_path_handshake_and_sync() { - use bft_json_crdt::keypair::make_keypair; - use futures::{SinkExt, StreamExt}; - use tokio_tungstenite::connect_async; - use tokio_tungstenite::tungstenite::Message as TMsg; - - let connector_kp = make_keypair(); - let connector_pubkey = crate::node_identity::public_key_hex(&connector_kp); - - // Start listener that trusts the connector's pubkey. - let (addr, result_rx) = start_auth_listener(vec![connector_pubkey.clone()]).await; - - // Connect and do the handshake. - let url = format!("ws://{addr}"); - let (ws, _) = connect_async(&url).await.unwrap(); - let (mut sink, mut stream) = ws.split(); - - // Receive challenge. - let challenge_frame = stream.next().await.unwrap().unwrap(); - let challenge_text = match challenge_frame { - TMsg::Text(t) => t.to_string(), - other => panic!("Expected text frame, got {other:?}"), - }; - let challenge_msg: super::ChallengeMessage = serde_json::from_str(&challenge_text).unwrap(); - assert_eq!(challenge_msg.r#type, "challenge"); - assert_eq!( - challenge_msg.nonce.len(), - 64, - "Challenge must be 64 hex chars" - ); - - // Sign and reply. 
- let sig = crate::node_identity::sign_challenge(&connector_kp, &challenge_msg.nonce); - let auth_msg = super::AuthMessage { - r#type: "auth".to_string(), - pubkey_hex: connector_pubkey.clone(), - signature_hex: sig, - }; - let auth_json = serde_json::to_string(&auth_msg).unwrap(); - sink.send(TMsg::Text(auth_json.into())).await.unwrap(); - - // After auth, we should receive a bulk sync message with at least one op. - let bulk_frame = tokio::time::timeout(std::time::Duration::from_secs(5), stream.next()) - .await - .expect("should receive bulk within 5s") - .unwrap() - .unwrap(); - - let bulk_text = match bulk_frame { - TMsg::Text(t) => t.to_string(), - other => panic!("Expected bulk text frame, got {other:?}"), - }; - let bulk_msg: super::SyncMessage = serde_json::from_str(&bulk_text).unwrap(); - match bulk_msg { - super::SyncMessage::Bulk { ops } => { - assert!( - !ops.is_empty(), - "Bulk sync must contain at least one op after successful auth" - ); - // Verify we can deserialize the op. - let _signed: bft_json_crdt::json_crdt::SignedOp = - serde_json::from_str(&ops[0]).unwrap(); - } - _ => panic!("Expected Bulk message after auth"), - } - - // Verify listener also reports success. - let listener_result = result_rx.await.unwrap(); - match listener_result { - AuthListenerResult::Authenticated(pubkey) => { - assert_eq!(pubkey, connector_pubkey); - } - other => panic!("Expected Authenticated, got {other:?}"), - } - } - - /// AC6 (story 628): Untrusted pubkey — connector whose pubkey is NOT in - /// the listener's allow-list is rejected with close reason auth_failed, - /// and zero CRDT ops are exchanged. 
- #[tokio::test] - async fn auth_untrusted_pubkey_rejected() { - use bft_json_crdt::keypair::make_keypair; - use futures::{SinkExt, StreamExt}; - use tokio_tungstenite::connect_async; - use tokio_tungstenite::tungstenite::Message as TMsg; - - let connector_kp = make_keypair(); - let connector_pubkey = crate::node_identity::public_key_hex(&connector_kp); - - // Listener trusts a DIFFERENT key, not the connector's. - let other_kp = make_keypair(); - let other_pubkey = crate::node_identity::public_key_hex(&other_kp); - - let (addr, result_rx) = start_auth_listener(vec![other_pubkey]).await; - - let url = format!("ws://{addr}"); - let (ws, _) = connect_async(&url).await.unwrap(); - let (mut sink, mut stream) = ws.split(); - - // Receive challenge and sign with our (untrusted) key. - let challenge_frame = stream.next().await.unwrap().unwrap(); - let challenge_text = match challenge_frame { - TMsg::Text(t) => t.to_string(), - _ => panic!("Expected text frame"), - }; - let challenge_msg: super::ChallengeMessage = serde_json::from_str(&challenge_text).unwrap(); - - let sig = crate::node_identity::sign_challenge(&connector_kp, &challenge_msg.nonce); - let auth_msg = super::AuthMessage { - r#type: "auth".to_string(), - pubkey_hex: connector_pubkey, - signature_hex: sig, - }; - sink.send(TMsg::Text(serde_json::to_string(&auth_msg).unwrap().into())) - .await - .unwrap(); - - // Should receive a close frame with auth_failed. - let close_frame = tokio::time::timeout(std::time::Duration::from_secs(5), stream.next()) - .await - .expect("should receive close within 5s"); - - match close_frame { - Some(Ok(TMsg::Close(Some(frame)))) => { - assert_eq!( - &*frame.reason, "auth_failed", - "Close reason must be 'auth_failed'" - ); - } - Some(Ok(TMsg::Close(None))) => { - // Some implementations omit the close frame payload — that's acceptable - // as long as no sync data was sent. - } - other => { - // Connection dropped without close frame is also acceptable. 
- // The key assertion is below: no ops were exchanged. - let _ = other; - } - } - - // Verify listener reports auth failure. - let listener_result = result_rx.await.unwrap(); - match listener_result { - AuthListenerResult::AuthFailed(reason) => { - assert!(reason.contains("key_trusted=false"), "Reason: {reason}"); - } - other => panic!("Expected AuthFailed, got {other:?}"), - } - } - - /// AC7 (story 628): Bad signature — connector signs with wrong keypair. - /// Rejected with the same auth_failed — indistinguishable from untrusted key. - #[tokio::test] - async fn auth_bad_signature_rejected() { - use bft_json_crdt::keypair::make_keypair; - use futures::{SinkExt, StreamExt}; - use tokio_tungstenite::connect_async; - use tokio_tungstenite::tungstenite::Message as TMsg; - - let legitimate_kp = make_keypair(); - let legitimate_pubkey = crate::node_identity::public_key_hex(&legitimate_kp); - - // A different keypair that will sign the challenge (wrong key). - let impersonator_kp = make_keypair(); - - // Listener trusts the legitimate pubkey. - let (addr, result_rx) = start_auth_listener(vec![legitimate_pubkey.clone()]).await; - - let url = format!("ws://{addr}"); - let (ws, _) = connect_async(&url).await.unwrap(); - let (mut sink, mut stream) = ws.split(); - - // Receive challenge. - let challenge_frame = stream.next().await.unwrap().unwrap(); - let challenge_text = match challenge_frame { - TMsg::Text(t) => t.to_string(), - _ => panic!("Expected text frame"), - }; - let challenge_msg: super::ChallengeMessage = serde_json::from_str(&challenge_text).unwrap(); - - // Sign with the WRONG keypair but claim to be the legitimate pubkey. 
- let bad_sig = crate::node_identity::sign_challenge(&impersonator_kp, &challenge_msg.nonce); - let auth_msg = super::AuthMessage { - r#type: "auth".to_string(), - pubkey_hex: legitimate_pubkey, - signature_hex: bad_sig, - }; - sink.send(TMsg::Text(serde_json::to_string(&auth_msg).unwrap().into())) - .await - .unwrap(); - - // Should be rejected. - let close_frame = tokio::time::timeout(std::time::Duration::from_secs(5), stream.next()) - .await - .expect("should receive close within 5s"); - - match close_frame { - Some(Ok(TMsg::Close(Some(frame)))) => { - assert_eq!( - &*frame.reason, "auth_failed", - "Close reason must be 'auth_failed' — same as untrusted key" - ); - } - _ => { - // Connection closed is acceptable. - } - } - - // Verify listener reports auth failure with sig_valid=false. - let listener_result = result_rx.await.unwrap(); - match listener_result { - AuthListenerResult::AuthFailed(reason) => { - assert!(reason.contains("sig_valid=false"), "Reason: {reason}"); - } - other => panic!("Expected AuthFailed, got {other:?}"), - } - } - - /// AC8 (story 628): Replay protection sanity — two consecutive connect - /// attempts receive different challenges (fresh nonce per accept). - #[tokio::test] - async fn auth_replay_protection_fresh_nonces() { - use futures::StreamExt; - use tokio::net::TcpListener; - use tokio_tungstenite::tungstenite::Message as TMsg; - use tokio_tungstenite::{accept_async, connect_async}; - - // Start a listener that sends challenges but doesn't complete auth. 
- let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); - let addr = listener.local_addr().unwrap(); - - let (nonce_tx, mut nonce_rx) = tokio::sync::mpsc::channel::(2); - - tokio::spawn(async move { - for _ in 0..2 { - let (tcp, _) = listener.accept().await.unwrap(); - let ws = accept_async(tcp).await.unwrap(); - let (mut sink, _stream) = ws.split(); - - let challenge = crate::node_identity::generate_challenge(); - let msg = super::ChallengeMessage { - r#type: "challenge".to_string(), - nonce: challenge.clone(), - }; - let json = serde_json::to_string(&msg).unwrap(); - let _ = sink.send(TMsg::Text(json.into())).await; - let _ = nonce_tx.send(challenge).await; - } - }); - - // Connect twice and collect the nonces. - let mut nonces = Vec::new(); - for _ in 0..2 { - let url = format!("ws://{addr}"); - let (ws, _) = connect_async(&url).await.unwrap(); - let (_sink, mut stream) = ws.split(); - - let frame = stream.next().await.unwrap().unwrap(); - let text = match frame { - TMsg::Text(t) => t.to_string(), - _ => panic!("Expected text"), - }; - let msg: super::ChallengeMessage = serde_json::from_str(&text).unwrap(); - nonces.push(msg.nonce); - // Drop connection so listener accepts the next one. - drop(stream); - } - - // Also collect nonces from the listener side. - let server_nonce_1 = nonce_rx.recv().await.unwrap(); - let server_nonce_2 = nonce_rx.recv().await.unwrap(); - - assert_ne!( - nonces[0], nonces[1], - "Consecutive challenges must be different" - ); - assert_ne!( - server_nonce_1, server_nonce_2, - "Server must generate fresh nonce per accept" - ); - assert_eq!(nonces[0], server_nonce_1, "Client/server nonces must match"); - assert_eq!(nonces[1], server_nonce_2, "Client/server nonces must match"); - } - - /// AC4 (story 628): trusted_keys config is parsed from project.toml. 
- #[test] - fn config_trusted_keys_parsed_from_toml() { - let toml_str = r#" -trusted_keys = [ - "aabbccdd00112233aabbccdd00112233aabbccdd00112233aabbccdd00112233", - "11223344556677881122334455667788112233445566778811223344556677ab", -] - -[[agent]] -name = "test" -"#; - let config: crate::config::ProjectConfig = - crate::config::ProjectConfig::parse(toml_str).unwrap(); - assert_eq!(config.trusted_keys.len(), 2); - assert_eq!( - config.trusted_keys[0], - "aabbccdd00112233aabbccdd00112233aabbccdd00112233aabbccdd00112233" - ); - } - - /// AC4 (story 628): trusted_keys defaults to empty (reject all). - #[test] - fn config_trusted_keys_defaults_to_empty() { - let config = crate::config::ProjectConfig::default(); - assert!( - config.trusted_keys.is_empty(), - "trusted_keys must default to empty (reject all)" - ); - } - - /// AC9 (story 628): Existing per-op signed replication tests still pass. - /// This is verified implicitly by running the full test suite — this marker - /// test documents the intent. - #[test] - fn existing_sync_tests_unchanged() { - // If we got here, all previous crdt_sync tests compiled and passed. - // This test exists as a documentation anchor for AC9. - } - - // ── Story 630: WebSocket keepalive (ping/pong) ──────────────────────────── - - /// AC1/AC2: PING_INTERVAL_SECS is 30 and PONG_TIMEOUT_SECS is 60 — the - /// transport-level constants are correct. - #[test] - fn keepalive_constants_are_correct() { - assert_eq!( - super::PING_INTERVAL_SECS, - 30, - "Ping interval must be 30 seconds" - ); - assert_eq!( - super::PONG_TIMEOUT_SECS, - 60, - "Pong timeout must be 60 seconds" - ); - } - - /// AC5: The agent-mode heartbeat interval (SCAN_INTERVAL_SECS) is 15s and - /// must not be changed by the keepalive work. - #[test] - fn agent_mode_heartbeat_interval_unchanged() { - assert_eq!( - crate::agent_mode::SCAN_INTERVAL_SECS, - 15, - "Agent-mode heartbeat interval must remain 15s" - ); - } - - /// AC4: Reconnect backoff constants are unchanged. 
- #[test] - fn reconnect_backoff_constants_unchanged() { - assert_eq!( - super::RENDEZVOUS_ERROR_THRESHOLD, - 10, - "Backoff threshold must still be 10" - ); - } - - /// AC1: Server (accept_async side) emits a Ping frame after the configured - /// interval. Uses short durations (100 ms ping) so the test finishes fast. - #[tokio::test] - async fn server_sends_ping_to_peer_at_interval() { - use futures::{SinkExt, StreamExt}; - use std::time::Duration; - use tokio::net::TcpListener; - use tokio_tungstenite::tungstenite::Message as TMsg; - use tokio_tungstenite::{accept_async, connect_async}; - - let ping_ms = 100u64; - let timeout_ms = 400u64; - - let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); - let addr = listener.local_addr().unwrap(); - - // Server task: keepalive sender with short intervals. - tokio::spawn(async move { - let (tcp, _) = listener.accept().await.unwrap(); - let ws = accept_async(tcp).await.unwrap(); - let (mut sink, mut stream) = ws.split(); - - let mut pong_deadline = tokio::time::Instant::now() + Duration::from_millis(timeout_ms); - let mut ticker = tokio::time::interval_at( - tokio::time::Instant::now() + Duration::from_millis(ping_ms), - Duration::from_millis(ping_ms), - ); - - loop { - tokio::select! { - _ = ticker.tick() => { - if tokio::time::Instant::now() >= pong_deadline { break; } - if sink.send(TMsg::Ping(bytes::Bytes::new())).await.is_err() { break; } - } - frame = stream.next() => { - match frame { - Some(Ok(TMsg::Pong(_))) => { - pong_deadline = tokio::time::Instant::now() - + Duration::from_millis(timeout_ms); - } - None | Some(Err(_)) => break, - _ => {} - } - } - } - } - }); - - let (ws_client, _) = connect_async(format!("ws://{addr}")).await.unwrap(); - let (_sink_c, mut stream_c) = ws_client.split(); - - // Wait for more than one ping interval. - tokio::time::sleep(Duration::from_millis(ping_ms * 2)).await; - - // Client should receive a Ping from the server. 
- let frame = tokio::time::timeout(Duration::from_millis(200), stream_c.next()).await; - let got_ping = matches!(frame, Ok(Some(Ok(TMsg::Ping(_))))); - assert!( - got_ping, - "Client must receive a Ping frame from the server after the ping interval" - ); - } - - /// AC2: Client-side keepalive sender emits a Ping after the interval, - /// symmetrically to the server side. - #[tokio::test] - async fn client_sends_ping_to_server_at_interval() { - use futures::{SinkExt, StreamExt}; - use std::time::Duration; - use tokio::net::TcpListener; - use tokio_tungstenite::tungstenite::Message as TMsg; - use tokio_tungstenite::{accept_async, connect_async}; - - let ping_ms = 100u64; - let timeout_ms = 400u64; - - let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); - let addr = listener.local_addr().unwrap(); - - let (ping_tx, ping_rx) = tokio::sync::oneshot::channel::<()>(); - - // Server task: wait for the first Ping the client sends. - tokio::spawn(async move { - let (tcp, _) = listener.accept().await.unwrap(); - let ws = accept_async(tcp).await.unwrap(); - let (_sink, mut stream) = ws.split(); - loop { - match stream.next().await { - Some(Ok(TMsg::Ping(_))) => { - let _ = ping_tx.send(()); - break; - } - Some(Ok(_)) => continue, - _ => break, - } - } - }); - - let (ws_client, _) = connect_async(format!("ws://{addr}")).await.unwrap(); - let (mut sink_c, mut stream_c) = ws_client.split(); - - // Client keepalive task. - tokio::spawn(async move { - let mut pong_deadline = tokio::time::Instant::now() + Duration::from_millis(timeout_ms); - let mut ticker = tokio::time::interval_at( - tokio::time::Instant::now() + Duration::from_millis(ping_ms), - Duration::from_millis(ping_ms), - ); - loop { - tokio::select! 
{ - _ = ticker.tick() => { - if tokio::time::Instant::now() >= pong_deadline { break; } - if sink_c.send(TMsg::Ping(bytes::Bytes::new())).await.is_err() { break; } - } - frame = stream_c.next() => { - match frame { - Some(Ok(TMsg::Pong(_))) => { - pong_deadline = tokio::time::Instant::now() - + Duration::from_millis(timeout_ms); - } - None | Some(Err(_)) => break, - _ => {} - } - } - } - } - }); - - let result = tokio::time::timeout(Duration::from_millis(ping_ms * 3), ping_rx).await; - assert!( - result.is_ok(), - "Server must receive a Ping from the client after the ping interval" - ); - } - - /// AC3: Either side disconnects when no Pong is received within the timeout. - /// The keepalive sender returns `true` (timed out) when Pongs are withheld. - #[tokio::test] - async fn keepalive_disconnects_when_pong_withheld() { - use futures::{SinkExt, StreamExt}; - use std::time::Duration; - use tokio::net::TcpListener; - use tokio_tungstenite::tungstenite::Message as TMsg; - use tokio_tungstenite::{accept_async, connect_async}; - - let ping_ms = 100u64; - let timeout_ms = 250u64; - - let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); - let addr = listener.local_addr().unwrap(); - - let (done_tx, done_rx) = tokio::sync::oneshot::channel::(); - - // Server: sends Pings, never receives Pong (client swallows all). - tokio::spawn(async move { - let (tcp, _) = listener.accept().await.unwrap(); - let ws = accept_async(tcp).await.unwrap(); - let (mut sink, mut stream) = ws.split(); - - let pong_deadline = tokio::time::Instant::now() + Duration::from_millis(timeout_ms); - let mut ticker = tokio::time::interval_at( - tokio::time::Instant::now() + Duration::from_millis(ping_ms), - Duration::from_millis(ping_ms), - ); - - let timed_out = loop { - tokio::select! 
{ - _ = ticker.tick() => { - if tokio::time::Instant::now() >= pong_deadline { break true; } - if sink.send(TMsg::Ping(bytes::Bytes::new())).await.is_err() { - break false; - } - } - frame = stream.next() => { - match frame { - Some(Ok(_)) => {} // swallow — no Pong sent - _ => break false, - } - } - } - }; - let _ = done_tx.send(timed_out); - }); - - // Client: connect but never respond to Pings. - let (_ws_client, _) = connect_async(format!("ws://{addr}")).await.unwrap(); - - let result = - tokio::time::timeout(Duration::from_millis(timeout_ms + ping_ms * 3), done_rx).await; - let timed_out = result - .expect("Server must report within expected wall-clock time") - .expect("oneshot intact"); - - assert!( - timed_out, - "Server must disconnect on keepalive timeout when Pong is withheld" - ); - } - - /// AC3 (positive path): Connection stays alive when Pong responses arrive - /// before the timeout fires. - #[tokio::test] - async fn keepalive_connection_survives_with_pong_responses() { - use futures::{SinkExt, StreamExt}; - use std::time::Duration; - use tokio::net::TcpListener; - use tokio_tungstenite::tungstenite::Message as TMsg; - use tokio_tungstenite::{accept_async, connect_async}; - - let ping_ms = 100u64; - let timeout_ms = 250u64; - - let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); - let addr = listener.local_addr().unwrap(); - - let (result_tx, result_rx) = tokio::sync::oneshot::channel::(); - - // Server: sends Pings, resets deadline on Pong. - tokio::spawn(async move { - let (tcp, _) = listener.accept().await.unwrap(); - let ws = accept_async(tcp).await.unwrap(); - let (mut sink, mut stream) = ws.split(); - - let mut pong_deadline = tokio::time::Instant::now() + Duration::from_millis(timeout_ms); - let mut ticker = tokio::time::interval_at( - tokio::time::Instant::now() + Duration::from_millis(ping_ms), - Duration::from_millis(ping_ms), - ); - - let timed_out = loop { - tokio::select! 
{ - _ = ticker.tick() => { - if tokio::time::Instant::now() >= pong_deadline { break true; } - if sink.send(TMsg::Ping(bytes::Bytes::new())).await.is_err() { - break false; - } - } - frame = stream.next() => { - match frame { - Some(Ok(TMsg::Pong(_))) => { - pong_deadline = tokio::time::Instant::now() - + Duration::from_millis(timeout_ms); - } - None | Some(Err(_)) => break false, // clean close - _ => {} - } - } - } - }; - let _ = result_tx.send(timed_out); - }); - - let (ws_client, _) = connect_async(format!("ws://{addr}")).await.unwrap(); - let (mut sink_c, mut stream_c) = ws_client.split(); - - // Client: respond to every Ping with Pong for several intervals. - let respond_task = tokio::spawn(async move { - while let Some(Ok(msg)) = stream_c.next().await { - if let TMsg::Ping(data) = msg - && sink_c.send(TMsg::Pong(data)).await.is_err() - { - break; - } - } - }); - - // Run for a few intervals, then drop the client. - tokio::time::sleep(Duration::from_millis(ping_ms * 3)).await; - respond_task.abort(); - - let result = tokio::time::timeout(Duration::from_millis(200), result_rx).await; - let timed_out = result.unwrap_or(Ok(false)).unwrap_or(false); - assert!( - !timed_out, - "Server must NOT timeout when the client responds to Pings with Pongs" - ); - } - - /// AC6: Integration — one node swallows Pongs; the other drops the - /// connection within the timeout, then reconnect is possible via backoff. - #[tokio::test] - async fn two_node_pong_swallow_causes_disconnect_within_timeout() { - use futures::{SinkExt, StreamExt}; - use std::time::Duration; - use tokio::net::TcpListener; - use tokio_tungstenite::tungstenite::Message as TMsg; - use tokio_tungstenite::{accept_async, connect_async}; - - let ping_ms = 100u64; - let timeout_ms = 250u64; - - let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); - let addr = listener.local_addr().unwrap(); - - // Node A (listener): sends Pings, never receives Pong. 
- let (a_done_tx, a_done_rx) = tokio::sync::oneshot::channel::(); - tokio::spawn(async move { - let (tcp, _) = listener.accept().await.unwrap(); - let ws = accept_async(tcp).await.unwrap(); - let (mut sink, mut stream) = ws.split(); - - let pong_deadline = tokio::time::Instant::now() + Duration::from_millis(timeout_ms); - let mut ticker = tokio::time::interval_at( - tokio::time::Instant::now() + Duration::from_millis(ping_ms), - Duration::from_millis(ping_ms), - ); - - let timed_out = loop { - tokio::select! { - _ = ticker.tick() => { - if tokio::time::Instant::now() >= pong_deadline { break true; } - if sink.send(TMsg::Ping(bytes::Bytes::new())).await.is_err() { - break false; - } - } - frame = stream.next() => { - match frame { - Some(Ok(_)) => {} // swallow all frames - _ => break false, - } - } - } - }; - let _ = a_done_tx.send(timed_out); - }); - - // Node B: connects, drains frames silently (swallows Pings, never pongs). - let (ws_b, _) = connect_async(format!("ws://{addr}")).await.unwrap(); - let (_sink_b, mut stream_b) = ws_b.split(); - tokio::spawn(async move { while let Some(Ok(_)) = stream_b.next().await {} }); - - let result = - tokio::time::timeout(Duration::from_millis(timeout_ms + ping_ms * 3), a_done_rx).await; - let timed_out = result - .expect("Node A must report within expected wall-clock time") - .expect("channel intact"); - - assert!( - timed_out, - "Node A must disconnect due to keepalive timeout when Node B swallows Pongs" - ); - } - - // ── Story 631: vector clock wire format tests ─────────────────────── - - /// Clock message serialization round-trip via SyncMessage. 
- #[test] - fn sync_message_clock_serialization_roundtrip() { - let mut clock = std::collections::HashMap::new(); - clock.insert("aabbcc00".to_string(), 42u64); - clock.insert("ddeeff11".to_string(), 7u64); - - let msg = SyncMessage::Clock { clock }; - let json = serde_json::to_string(&msg).unwrap(); - assert!(json.contains(r#""type":"clock""#)); - - let deserialized: SyncMessage = serde_json::from_str(&json).unwrap(); - match deserialized { - SyncMessage::Clock { clock } => { - assert_eq!(clock["aabbcc00"], 42); - assert_eq!(clock["ddeeff11"], 7); - } - _ => panic!("Expected Clock"), - } - } - - /// Empty clock (new node) serializes correctly. - #[test] - fn sync_message_clock_empty() { - let msg = SyncMessage::Clock { - clock: std::collections::HashMap::new(), - }; - let json = serde_json::to_string(&msg).unwrap(); - let deserialized: SyncMessage = serde_json::from_str(&json).unwrap(); - match deserialized { - SyncMessage::Clock { clock } => assert!(clock.is_empty()), - _ => panic!("Expected Clock"), - } - } - - /// v1 compat: a v1 peer that only knows `bulk` and `op` will fail to parse - /// a `clock` message — verify the parse error is a clean serde error, not a - /// panic. - #[test] - fn v1_peer_ignores_clock_message_gracefully() { - // Simulate: v1 peer only knows Bulk and Op. - // A clock message should fail deserialization (unknown variant). - let clock_json = r#"{"type":"clock","clock":{"abc":10}}"#; - // handle_incoming_text logs and returns — must not panic. - handle_incoming_text(clock_json); - } - - /// v2 delta sync simulation: two CRDT nodes, exchange clocks, send deltas. - #[test] - fn v2_delta_sync_via_clock_exchange() { - use bft_json_crdt::json_crdt::BaseCrdt; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use fastcrypto::traits::KeyPair; - use serde_json::json; - - use crate::crdt_state::PipelineDoc; - - let kp_a = make_keypair(); - let mut crdt_a = BaseCrdt::::new(&kp_a); - - // A creates 5 items. 
- let mut ops_a = Vec::new(); - for i in 0..5 { - let item: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": format!("631_v2_{i}"), - "stage": "1_backlog", - "name": format!("v2 item {i}"), - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let op = crdt_a.doc.items.insert(ROOT_ID, item).sign(&kp_a); - crdt_a.apply(op.clone()); - ops_a.push(op); - } - - // B has seen the first 3 ops (clock says 3). - let kp_b = make_keypair(); - let mut crdt_b = BaseCrdt::::new(&kp_b); - for op in &ops_a[..3] { - crdt_b.apply(op.clone()); - } - assert_eq!(crdt_b.doc.items.view().len(), 3); - - // Build B's clock. - let author_a_hex = crate::crdt_state::hex::encode(&kp_a.public().0.to_bytes()); - let mut clock_b = std::collections::HashMap::new(); - clock_b.insert(author_a_hex.clone(), 3u64); - - // Serialize clock as wire message. - let clock_msg = SyncMessage::Clock { - clock: clock_b.clone(), - }; - let clock_wire = serde_json::to_string(&clock_msg).unwrap(); - - // A receives B's clock and computes delta. - let parsed: SyncMessage = serde_json::from_str(&clock_wire).unwrap(); - let delta_ops = match parsed { - SyncMessage::Clock { clock: peer_clock } => { - // Simulate ops_since: A has 5 ops from author_a, B has 3. - let all_json: Vec = ops_a - .iter() - .map(|op| serde_json::to_string(op).unwrap()) - .collect(); - let mut result = Vec::new(); - let mut count = 0u64; - for (i, _op) in ops_a.iter().enumerate() { - count += 1; - let peer_has = peer_clock.get(&author_a_hex).copied().unwrap_or(0); - if count > peer_has { - result.push(all_json[i].clone()); - } - } - result - } - _ => panic!("Expected Clock"), - }; - - assert_eq!(delta_ops.len(), 2, "delta should be 2 ops (ops 4 and 5)"); - - // B applies the delta. 
- for op_str in &delta_ops { - let signed: bft_json_crdt::json_crdt::SignedOp = serde_json::from_str(op_str).unwrap(); - crdt_b.apply(signed); - } - assert_eq!(crdt_b.doc.items.view().len(), 5); - } - - // ── Story 632: Ready ACK handshake ──────────────────────────────────────── - - /// AC1: `{"type":"ready"}` serialises and deserialises correctly. - #[test] - fn sync_message_ready_serialization_roundtrip() { - let msg = SyncMessage::Ready; - let json = serde_json::to_string(&msg).unwrap(); - assert_eq!(json, r#"{"type":"ready"}"#); - let deserialized: SyncMessage = serde_json::from_str(&json).unwrap(); - assert!(matches!(deserialized, SyncMessage::Ready)); - } - - /// AC4 (buffer flush): Locally-generated ops are buffered while `peer_ready` - /// is false, then flushed in-order once the peer's `Ready` frame arrives. - /// Frame capture verifies ordering. - #[test] - fn buffer_flush_ops_held_until_peer_ready() { - use bft_json_crdt::json_crdt::BaseCrdt; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use serde_json::json; - - use crate::crdt_state::PipelineDoc; - - // Simulate the buffer-flush state machine without real WebSockets. - let mut peer_ready = false; - let mut op_buffer: Vec> = Vec::new(); // encoded wire frames - let mut sent_frames: Vec> = Vec::new(); // captured "sent" frames - - // Build two local ops on a fresh CRDT. - let kp = make_keypair(); - let mut crdt = BaseCrdt::::new(&kp); - let item: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "632_buffer_a", - "stage": "1_backlog", - "name": "Buffer A", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let op1 = crdt.doc.items.insert(ROOT_ID, item).sign(&kp); - crdt.apply(op1.clone()); - let op2 = crdt.doc.items[0] - .stage - .set("2_current".to_string()) - .sign(&kp); - crdt.apply(op2.clone()); - - // Simulate op1 arriving from broadcast while peer is NOT ready. 
- let frame1 = crate::crdt_wire::encode(&op1); - if peer_ready { - sent_frames.push(frame1.clone()); - } else { - op_buffer.push(frame1.clone()); - } - - // Simulate op2 arriving while peer is still NOT ready. - let frame2 = crate::crdt_wire::encode(&op2); - if peer_ready { - sent_frames.push(frame2.clone()); - } else { - op_buffer.push(frame2.clone()); - } - - // Both ops must be buffered, none sent yet. - assert_eq!( - sent_frames.len(), - 0, - "ops must be buffered before peer Ready arrives" - ); - assert_eq!(op_buffer.len(), 2, "buffer must hold both ops"); - - // Simulate receiving the peer's Ready frame. - let ready_json = serde_json::to_string(&SyncMessage::Ready).unwrap(); - if let Ok(SyncMessage::Ready) = serde_json::from_str::(&ready_json) { - peer_ready = true; - for encoded in op_buffer.drain(..) { - sent_frames.push(encoded); - } - } - - assert!(peer_ready, "peer_ready must be true after Ready frame"); - assert_eq!(op_buffer.len(), 0, "buffer must be empty after flush"); - assert_eq!( - sent_frames.len(), - 2, - "both buffered ops must appear in captured frames after flush" - ); - // Order preserved: op1 before op2. - assert_eq!(sent_frames[0], frame1, "op1 must be first flushed frame"); - assert_eq!(sent_frames[1], frame2, "op2 must be second flushed frame"); - - // After flush, a new op is sent immediately (no buffering). - let op3 = crdt.doc.items[0].stage.set("3_qa".to_string()).sign(&kp); - crdt.apply(op3.clone()); - let frame3 = crate::crdt_wire::encode(&op3); - if peer_ready { - sent_frames.push(frame3.clone()); - } else { - op_buffer.push(frame3.clone()); - } - assert_eq!( - sent_frames.len(), - 3, - "op after flush must be sent immediately" - ); - assert_eq!(op_buffer.len(), 0, "buffer must remain empty"); - } - - /// AC5 (causal queueing regression): Real-time binary ops from the peer are - /// applied immediately regardless of our own ready state. 
When a real-time - /// op causally depends on a bulk op, the CRDT causal queue ensures it is - /// applied correctly once the dependency arrives. - #[test] - fn realtime_op_from_peer_applied_immediately_regardless_of_ready_state() { - use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV, OpState}; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use serde_json::json; - - use crate::crdt_state::PipelineDoc; - - // Node A creates a bulk op and a real-time op that causally depends on it. - let kp_a = make_keypair(); - let mut crdt_a = BaseCrdt::::new(&kp_a); - let item: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "632_causal_test", - "stage": "1_backlog", - "name": "Causal Test", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let bulk_op = crdt_a.doc.items.insert(ROOT_ID, item).sign(&kp_a); - crdt_a.apply(bulk_op.clone()); - - // Real-time op that causally depends on bulk_op. - let rt_op = crdt_a.doc.items[0] - .stage - .set("2_current".to_string()) - .sign_with_dependencies(&kp_a, vec![&bulk_op]); - crdt_a.apply(rt_op.clone()); - - // Node B receives the bulk op first (simulating the bulk-delta phase). - let kp_b = make_keypair(); - let mut crdt_b = BaseCrdt::::new(&kp_b); - assert_eq!( - crdt_b.apply(bulk_op.clone()), - OpState::Ok, - "bulk op must apply cleanly on node B" - ); - - // Node B has not yet sent its own Ready frame, but it receives A's - // real-time binary frame. It must be applied immediately via - // handle_incoming_binary (causal queue handles ordering). - let wire = crate::crdt_wire::encode(&rt_op); - let decoded = crate::crdt_wire::decode(&wire).unwrap(); - let result = crdt_b.apply(decoded); - assert_eq!( - result, - OpState::Ok, - "real-time op depending on bulk op must apply immediately after bulk op is present" - ); - - // Both nodes converge to the same state. 
- assert_eq!( - crdt_b.doc.items[0].stage.view(), - JV::String("2_current".to_string()), - "Node B must converge to stage 2_current" - ); - assert_eq!( - CrdtNode::view(&crdt_a.doc.items), - CrdtNode::view(&crdt_b.doc.items), - "Both nodes must converge to identical state" - ); - } - - /// AC3: Real-time ops received from the peer BEFORE our own ready is sent - /// are applied immediately (no buffering on the receive side). - #[test] - fn incoming_realtime_op_applied_before_we_send_ready() { - use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV, OpState}; - use bft_json_crdt::keypair::make_keypair; - use bft_json_crdt::op::ROOT_ID; - use serde_json::json; - - use crate::crdt_state::PipelineDoc; - - // Simulate: we have NOT sent our ready yet (own_ready_sent = false), - // but the peer sends us a real-time binary op. It must be applied. - let own_ready_sent = false; // we haven't sent ready yet - let _ = own_ready_sent; // doc-only; receiving side never gates on this - - let kp_peer = make_keypair(); - let mut crdt_peer = BaseCrdt::::new(&kp_peer); - let item: bft_json_crdt::json_crdt::JsonValue = json!({ - "story_id": "632_ac3_test", - "stage": "1_backlog", - "name": "AC3 Test", - "agent": "", - "retry_count": 0.0, - "blocked": false, - "depends_on": "", - "claimed_by": "", - "claimed_at": 0.0, - }) - .into(); - let rt_op = crdt_peer.doc.items.insert(ROOT_ID, item).sign(&kp_peer); - crdt_peer.apply(rt_op.clone()); - - // Node B decodes and applies the peer's real-time op directly. - let kp_b = make_keypair(); - let mut crdt_b = BaseCrdt::::new(&kp_b); - let wire = crate::crdt_wire::encode(&rt_op); - let decoded = crate::crdt_wire::decode(&wire).unwrap(); - // This mirrors handle_incoming_binary() — applied unconditionally. 
- let result = crdt_b.apply(decoded); - assert_eq!( - result, - OpState::Ok, - "incoming real-time op must be applied immediately regardless of own ready state" - ); - assert_eq!( - crdt_b.doc.items[0].stage.view(), - JV::String("1_backlog".to_string()), - "op content must be correct" - ); - } - - /// AC6: All existing CRDT sync tests pass — verified by this marker test. - #[test] - fn existing_crdt_sync_tests_pass_unchanged() { - // Reaching this point means all prior tests in this module compiled - // and passed. This test documents the AC6 intent. - } - - // ── Story 633: bearer-token connection auth ─────────────────────────────── - - /// AC4: Valid token — `validate_join_token` returns `true` for a token that - /// has been added via `add_join_token` and has not expired. - #[test] - fn valid_token_passes_validation() { - let token = format!("test-valid-{}", uuid::Uuid::new_v4()); - super::add_join_token(token.clone()); - assert!( - super::validate_join_token(&token), - "A freshly added token must pass validation" - ); - } - - /// AC5: Invalid (bogus) token — `validate_join_token` returns `false` for a - /// token that was never added to the store. - #[test] - fn bogus_token_fails_validation() { - let bogus = "this-token-was-never-added-to-the-store"; - assert!( - !super::validate_join_token(bogus), - "An unknown token must fail validation" - ); - } - - /// AC5: Expired token — `validate_join_token` returns `false` for a token - /// whose `expires_at` is in the past. - #[test] - fn expired_token_fails_validation() { - // Insert a token directly with an already-past expiry timestamp. 
- let token = format!("test-expired-{}", uuid::Uuid::new_v4()); - let store = super::CRDT_TOKENS - .get_or_init(|| std::sync::RwLock::new(std::collections::HashMap::new())); - // expires_at = 1 (way in the past — 1970-01-01T00:00:01Z) - store.write().unwrap().insert(token.clone(), 1.0_f64); - assert!( - !super::validate_join_token(&token), - "An expired token must fail validation" - ); - } - - /// AC6: No token when server requires one — `validate_join_token` returns - /// `false` and the caller must reject with 401. Verifies the logic path - /// used by `crdt_sync_handler`. - #[test] - fn no_token_with_require_true_is_rejected() { - // Simulate: require_token=true, token=None → reject. - let require_token = true; - let token: Option<&str> = None; - let should_reject = match token { - Some(t) => !super::validate_join_token(t), - None if require_token => true, - None => false, - }; - assert!( - should_reject, - "Missing token must be rejected when token is required" - ); - } - - /// AC6: No token when server is in open mode — connection is accepted. - #[test] - fn no_token_with_require_false_is_accepted() { - let require_token = false; - let token: Option<&str> = None; - let should_reject = match token { - Some(t) => !super::validate_join_token(t), - None if require_token => true, - None => false, - }; - assert!( - !should_reject, - "Missing token must be accepted in open mode" - ); - } - - /// AC3: `spawn_rendezvous_client` URL construction — when a token is provided - /// the `?token=` query parameter is appended correctly. - #[test] - fn rendezvous_url_with_token_appended() { - let base = "ws://host:3001/crdt-sync"; - let token = "my-secret-token"; - let url_with_token = if base.contains('?') { - format!("{base}&token={token}") - } else { - format!("{base}?token={token}") - }; - assert_eq!( - url_with_token, - "ws://host:3001/crdt-sync?token=my-secret-token" - ); - - // With existing query params. 
- let base_with_query = "ws://host:3001/crdt-sync?foo=bar"; - let url_appended = if base_with_query.contains('?') { - format!("{base_with_query}&token={token}") - } else { - format!("{base_with_query}?token={token}") - }; - assert_eq!( - url_appended, - "ws://host:3001/crdt-sync?foo=bar&token=my-secret-token" - ); - } - - /// AC3: Without a token, the URL is used as-is. - #[test] - fn rendezvous_url_without_token_unchanged() { - let base = "ws://host:3001/crdt-sync"; - let token: Option<&str> = None; - let connect_url = match token { - Some(t) => format!("{base}?token={t}"), - None => base.to_string(), - }; - assert_eq!(connect_url, base); - } - - /// `add_join_token` returns a future expiry timestamp that is in the future. - #[test] - fn add_join_token_returns_future_expiry() { - let token = format!("test-expiry-{}", uuid::Uuid::new_v4()); - let now = chrono::Utc::now().timestamp() as f64; - let expires_at = super::add_join_token(token); - assert!( - expires_at > now, - "Expiry timestamp must be in the future (got {expires_at}, now={now})" - ); - } - - /// TOKEN_TTL_SECS must be 30 days. - #[test] - fn token_ttl_is_thirty_days() { - assert_eq!( - super::TOKEN_TTL_SECS, - 30.0 * 24.0 * 3600.0, - "TOKEN_TTL_SECS must be 30 days" - ); - } - - /// Config: `crdt_require_token` defaults to `false`. - #[test] - fn config_crdt_require_token_defaults_to_false() { - let config = crate::config::ProjectConfig::default(); - assert!( - !config.crdt_require_token, - "crdt_require_token must default to false (open access)" - ); - } - - /// Config: `crdt_tokens` defaults to empty. - #[test] - fn config_crdt_tokens_defaults_to_empty() { - let config = crate::config::ProjectConfig::default(); - assert!( - config.crdt_tokens.is_empty(), - "crdt_tokens must default to empty" - ); - } - - /// Config: `crdt_require_token` and `crdt_tokens` are parsed from TOML. 
- #[test] - fn config_crdt_token_fields_parsed_from_toml() { - let toml_str = r#" -crdt_require_token = true -crdt_tokens = ["token-abc", "token-xyz"] - -[[agent]] -name = "test" -"#; - let config: crate::config::ProjectConfig = toml::from_str(toml_str).unwrap(); - assert!(config.crdt_require_token); - assert_eq!(config.crdt_tokens, vec!["token-abc", "token-xyz"]); - } -} diff --git a/server/src/crdt_sync/auth.rs b/server/src/crdt_sync/auth.rs new file mode 100644 index 00000000..c09fa157 --- /dev/null +++ b/server/src/crdt_sync/auth.rs @@ -0,0 +1,230 @@ +//! Auth: trusted-key allow-list and bearer-token validation for `/crdt-sync` connections. + +use std::collections::HashMap; +use std::sync::OnceLock; + +/// Trusted public keys loaded once at startup. +static TRUSTED_KEYS: OnceLock> = OnceLock::new(); + +/// Initialise the trusted-key allow-list for connect-time mutual auth. +/// +/// Must be called once at startup before any WebSocket connections are +/// accepted. Subsequent calls are no-ops (OnceLock). +pub fn init_trusted_keys(keys: Vec) { + let _ = TRUSTED_KEYS.set(keys); +} + +/// Return a reference to the trusted-key allow-list. +pub(super) fn trusted_keys() -> &'static [String] { + TRUSTED_KEYS.get().map(|v| v.as_slice()).unwrap_or(&[]) +} + +// ── Bearer-token auth ─────────────────────────────────────────────── + +/// Time-to-live for CRDT bearer tokens in seconds (30 days). +pub(super) const TOKEN_TTL_SECS: f64 = 30.0 * 24.0 * 3600.0; + +/// Whether a bearer token is required for `/crdt-sync` connections. +/// `None` (uninitialised) → open access (backward compatible). +pub(super) static REQUIRE_TOKEN: OnceLock = OnceLock::new(); + +/// Valid bearer tokens — maps token string to its expiry unix timestamp. +static CRDT_TOKENS: OnceLock>> = OnceLock::new(); + +/// Initialise bearer-token auth for CRDT-sync connections. +/// +/// Must be called once at startup before any WebSocket connections are accepted. 
+/// When `require` is `true`, clients must supply a valid `?token=` query +/// parameter on the upgrade request or receive HTTP 401. When `require` is +/// `false` (default) a token is optional — connections without one are +/// accepted, but a supplied token is still validated. +pub fn init_token_auth(require: bool, tokens: Vec) { + let _ = REQUIRE_TOKEN.set(require); + let store = CRDT_TOKENS.get_or_init(|| std::sync::RwLock::new(HashMap::new())); + if let Ok(mut map) = store.write() { + let now = chrono::Utc::now().timestamp() as f64; + for token in tokens { + map.insert(token, now + TOKEN_TTL_SECS); + } + } +} + +/// Add a bearer token to the CRDT-sync token store. +/// +/// The token expires after [`TOKEN_TTL_SECS`] seconds. Returns the expiry +/// unix timestamp so callers can surface it in admin tooling. +pub fn add_join_token(token: String) -> f64 { + let store = CRDT_TOKENS.get_or_init(|| std::sync::RwLock::new(HashMap::new())); + let now = chrono::Utc::now().timestamp() as f64; + let expires_at = now + TOKEN_TTL_SECS; + if let Ok(mut map) = store.write() { + map.insert(token, expires_at); + } + expires_at +} + +/// Validate a bearer token against the CRDT-sync token store. +/// +/// Returns `true` if the token exists in the store and has not expired. 
+pub(super) fn validate_join_token(token: &str) -> bool { + let Some(store) = CRDT_TOKENS.get() else { + return false; + }; + let now = chrono::Utc::now().timestamp() as f64; + store + .read() + .ok() + .and_then(|map| map.get(token).copied()) + .is_some_and(|expires_at| expires_at > now) +} + +#[cfg(test)] +mod tests { + #[allow(unused_imports)] + use super::*; + + #[test] + fn config_trusted_keys_parsed_from_toml() { + let toml_str = r#" +trusted_keys = [ + "aabbccdd00112233aabbccdd00112233aabbccdd00112233aabbccdd00112233", + "11223344556677881122334455667788112233445566778811223344556677ab", +] + +[[agent]] +name = "test" +"#; + let config: crate::config::ProjectConfig = + crate::config::ProjectConfig::parse(toml_str).unwrap(); + assert_eq!(config.trusted_keys.len(), 2); + assert_eq!( + config.trusted_keys[0], + "aabbccdd00112233aabbccdd00112233aabbccdd00112233aabbccdd00112233" + ); + } + + #[test] + fn config_trusted_keys_defaults_to_empty() { + let config = crate::config::ProjectConfig::default(); + assert!( + config.trusted_keys.is_empty(), + "trusted_keys must default to empty (reject all)" + ); + } + + #[test] + fn valid_token_passes_validation() { + let token = format!("test-valid-{}", uuid::Uuid::new_v4()); + super::add_join_token(token.clone()); + assert!( + super::validate_join_token(&token), + "A freshly added token must pass validation" + ); + } + + #[test] + fn bogus_token_fails_validation() { + let bogus = "this-token-was-never-added-to-the-store"; + assert!( + !super::validate_join_token(bogus), + "An unknown token must fail validation" + ); + } + + #[test] + fn expired_token_fails_validation() { + // Insert a token directly with an already-past expiry timestamp. 
+ let token = format!("test-expired-{}", uuid::Uuid::new_v4()); + let store = super::CRDT_TOKENS + .get_or_init(|| std::sync::RwLock::new(std::collections::HashMap::new())); + // expires_at = 1 (way in the past — 1970-01-01T00:00:01Z) + store.write().unwrap().insert(token.clone(), 1.0_f64); + assert!( + !super::validate_join_token(&token), + "An expired token must fail validation" + ); + } + + #[test] + fn no_token_with_require_true_is_rejected() { + // Simulate: require_token=true, token=None → reject. + let require_token = true; + let token: Option<&str> = None; + let should_reject = match token { + Some(t) => !super::validate_join_token(t), + None if require_token => true, + None => false, + }; + assert!( + should_reject, + "Missing token must be rejected when token is required" + ); + } + + #[test] + fn no_token_with_require_false_is_accepted() { + let require_token = false; + let token: Option<&str> = None; + let should_reject = match token { + Some(t) => !super::validate_join_token(t), + None if require_token => true, + None => false, + }; + assert!( + !should_reject, + "Missing token must be accepted in open mode" + ); + } + + #[test] + fn add_join_token_returns_future_expiry() { + let token = format!("test-expiry-{}", uuid::Uuid::new_v4()); + let now = chrono::Utc::now().timestamp() as f64; + let expires_at = super::add_join_token(token); + assert!( + expires_at > now, + "Expiry timestamp must be in the future (got {expires_at}, now={now})" + ); + } + + #[test] + fn token_ttl_is_thirty_days() { + assert_eq!( + super::TOKEN_TTL_SECS, + 30.0 * 24.0 * 3600.0, + "TOKEN_TTL_SECS must be 30 days" + ); + } + + #[test] + fn config_crdt_require_token_defaults_to_false() { + let config = crate::config::ProjectConfig::default(); + assert!( + !config.crdt_require_token, + "crdt_require_token must default to false (open access)" + ); + } + + #[test] + fn config_crdt_tokens_defaults_to_empty() { + let config = crate::config::ProjectConfig::default(); + assert!( + 
config.crdt_tokens.is_empty(), + "crdt_tokens must default to empty" + ); + } + + #[test] + fn config_crdt_token_fields_parsed_from_toml() { + let toml_str = r#" +crdt_require_token = true +crdt_tokens = ["token-abc", "token-xyz"] + +[[agent]] +name = "test" +"#; + let config: crate::config::ProjectConfig = toml::from_str(toml_str).unwrap(); + assert!(config.crdt_require_token); + assert_eq!(config.crdt_tokens, vec!["token-abc", "token-xyz"]); + } +} diff --git a/server/src/crdt_sync/client.rs b/server/src/crdt_sync/client.rs new file mode 100644 index 00000000..8d65ce27 --- /dev/null +++ b/server/src/crdt_sync/client.rs @@ -0,0 +1,464 @@ +//! Rendezvous client: connect to a remote peer, authenticate, and exchange CRDT ops. + +use bft_json_crdt::json_crdt::SignedOp; +use futures::{SinkExt, StreamExt}; + +use crate::crdt_state; +use crate::crdt_wire; +use crate::slog; +use crate::slog_error; +use crate::slog_warn; + +use super::auth; +use super::dispatch::{handle_incoming_binary, handle_incoming_text}; +use super::wire::{AuthMessage, ChallengeMessage, SyncMessage}; +use super::{AUTH_TIMEOUT_SECS, PING_INTERVAL_SECS, PONG_TIMEOUT_SECS}; + +#[allow(unused_imports)] +use auth::{add_join_token, init_token_auth}; // needed by tests + +// ── Rendezvous client ─────────────────────────────────────────────── + +/// Number of consecutive connection failures before escalating from WARN to ERROR. +pub const RENDEZVOUS_ERROR_THRESHOLD: u32 = 10; + +/// Spawn a background task that connects to the configured rendezvous +/// peer and exchanges CRDT ops bidirectionally. +/// +/// The client reconnects with exponential backoff if the connection drops. +/// Individual failures are logged at WARN; after [`RENDEZVOUS_ERROR_THRESHOLD`] +/// consecutive failures the log level escalates to ERROR. +/// +/// When `token` is provided it is appended to the upgrade URL as +/// `?token=` so the server's bearer-token check is satisfied. 
This +/// reuses the existing `--join-token` / `HUSKIES_JOIN_TOKEN` plumbing on the +/// agent side. +pub fn spawn_rendezvous_client(url: String, token: Option) { + tokio::spawn(async move { + let mut backoff_secs = 1u64; + let mut consecutive_failures: u32 = 0; + loop { + slog!("[crdt-sync] Connecting to rendezvous peer: {url}"); + match connect_and_sync(&url, token.as_deref()).await { + Ok(()) => { + slog!("[crdt-sync] Rendezvous connection closed cleanly"); + backoff_secs = 1; + consecutive_failures = 0; + } + Err(e) => { + consecutive_failures += 1; + if consecutive_failures >= RENDEZVOUS_ERROR_THRESHOLD { + slog_error!( + "[crdt-sync] Rendezvous peer unreachable ({consecutive_failures} consecutive failures): {e}" + ); + } else { + slog_warn!( + "[crdt-sync] Rendezvous connection error (attempt {consecutive_failures}): {e}" + ); + } + } + } + slog!("[crdt-sync] Reconnecting in {backoff_secs}s..."); + tokio::time::sleep(std::time::Duration::from_secs(backoff_secs)).await; + backoff_secs = (backoff_secs * 2).min(30); + } + }); +} + +/// Connect to a remote sync endpoint and exchange ops until disconnect. +/// +/// When `token` is supplied it is appended as `?token=` to the +/// connection URL so the server's bearer-token check passes. 
+pub(crate) async fn connect_and_sync(url: &str, token: Option<&str>) -> Result<(), String> { + let connect_url = match token { + Some(t) => { + if url.contains('?') { + format!("{url}&token={t}") + } else { + format!("{url}?token={t}") + } + } + None => url.to_string(), + }; + let (ws_stream, _) = tokio_tungstenite::connect_async(connect_url.as_str()) + .await + .map_err(|e| format!("WebSocket connect failed: {e}"))?; + + let (mut sink, mut stream) = ws_stream.split(); + + slog!("[crdt-sync] Connected to rendezvous peer, awaiting challenge"); + + // ── Step 1: Receive challenge from listener ─────────────────── + use tokio_tungstenite::tungstenite::Message as TungsteniteMsg; + + let challenge_frame = tokio::time::timeout( + std::time::Duration::from_secs(AUTH_TIMEOUT_SECS), + stream.next(), + ) + .await + .map_err(|_| "Auth timeout waiting for challenge".to_string())? + .ok_or_else(|| "Connection closed before challenge".to_string())? + .map_err(|e| format!("WebSocket read error: {e}"))?; + + let challenge_text = match challenge_frame { + TungsteniteMsg::Text(t) => t.to_string(), + _ => return Err("Expected text frame for challenge".to_string()), + }; + + let challenge_msg: ChallengeMessage = serde_json::from_str(&challenge_text) + .map_err(|e| format!("Invalid challenge message: {e}"))?; + + if challenge_msg.r#type != "challenge" { + return Err(format!( + "Expected challenge message, got type={}", + challenge_msg.r#type + )); + } + + // ── Step 2: Sign challenge and send auth reply ──────────────── + let (pubkey_hex, signature_hex) = crdt_state::sign_challenge(&challenge_msg.nonce) + .ok_or_else(|| "CRDT not initialised — cannot sign challenge".to_string())?; + + let auth_msg = AuthMessage { + r#type: "auth".to_string(), + pubkey_hex, + signature_hex, + }; + let auth_json = serde_json::to_string(&auth_msg).map_err(|e| format!("Serialize auth: {e}"))?; + sink.send(TungsteniteMsg::Text(auth_json.into())) + .await + .map_err(|e| format!("Send auth failed: {e}"))?; + 
+ slog!("[crdt-sync] Auth reply sent, waiting for sync data"); + + // v2 protocol: send our vector clock. + let our_clock = crdt_state::our_vector_clock().unwrap_or_default(); + let clock_msg = SyncMessage::Clock { clock: our_clock }; + if let Ok(json) = serde_json::to_string(&clock_msg) { + sink.send(TungsteniteMsg::Text(json.into())) + .await + .map_err(|e| format!("Send clock failed: {e}"))?; + } + + // Wait for the server's first sync message. + let first_msg = tokio::time::timeout( + std::time::Duration::from_secs(AUTH_TIMEOUT_SECS), + wait_for_rendezvous_sync_text(&mut stream), + ) + .await + .map_err(|_| "Timeout waiting for server sync message".to_string())?; + + match first_msg { + Some(SyncMessage::Clock { clock: peer_clock }) => { + // v2 server — send only the ops the server is missing. + let delta = crdt_state::ops_since(&peer_clock).unwrap_or_default(); + slog!( + "[crdt-sync] v2 delta sync: sending {} ops to server (server missing)", + delta.len() + ); + let msg = SyncMessage::Bulk { ops: delta }; + if let Ok(json) = serde_json::to_string(&msg) { + sink.send(TungsteniteMsg::Text(json.into())) + .await + .map_err(|e| format!("Send delta failed: {e}"))?; + } + } + Some(SyncMessage::Bulk { ops }) => { + // v1 server — apply their bulk and send our full bulk. + let mut applied = 0u64; + for op_json in &ops { + if let Ok(signed_op) = serde_json::from_str::(op_json) + && crdt_state::apply_remote_op(signed_op) + { + applied += 1; + } + } + slog!( + "[crdt-sync] v1 bulk sync: received {} ops from server, applied {applied}", + ops.len() + ); + if let Some(all) = crdt_state::all_ops_json() { + let msg = SyncMessage::Bulk { ops: all }; + if let Ok(json) = serde_json::to_string(&msg) { + sink.send(TungsteniteMsg::Text(json.into())) + .await + .map_err(|e| format!("Send bulk failed: {e}"))?; + } + } + } + _ => { + // Fallback — send full bulk. 
+ slog!("[crdt-sync] No sync message from server; sending full bulk as fallback"); + if let Some(all) = crdt_state::all_ops_json() { + let msg = SyncMessage::Bulk { ops: all }; + if let Ok(json) = serde_json::to_string(&msg) { + sink.send(TungsteniteMsg::Text(json.into())) + .await + .map_err(|e| format!("Send bulk failed: {e}"))?; + } + } + } + } + + // Bulk-delta phase complete — signal the server that we are ready for + // real-time op streaming. + if let Ok(json) = serde_json::to_string(&SyncMessage::Ready) { + sink.send(TungsteniteMsg::Text(json.into())) + .await + .map_err(|e| format!("Send ready failed: {e}"))?; + } + + // Subscribe to new local ops. + let Some(mut op_rx) = crdt_state::subscribe_ops() else { + return Err("CRDT not initialised".to_string()); + }; + + // Buffer for locally-generated ops produced before the server's `ready` + // arrives. Flushed in-order once the server signals catch-up. + let mut peer_ready = false; + let mut op_buffer: Vec = Vec::new(); + + // ── Keepalive state ─────────────────────────────────────────────── + let mut pong_deadline = + tokio::time::Instant::now() + std::time::Duration::from_secs(PONG_TIMEOUT_SECS); + let mut ping_ticker = tokio::time::interval_at( + tokio::time::Instant::now() + std::time::Duration::from_secs(PING_INTERVAL_SECS), + std::time::Duration::from_secs(PING_INTERVAL_SECS), + ); + + loop { + tokio::select! { + // Send periodic Ping and enforce Pong timeout. 
+ _ = ping_ticker.tick() => { + if tokio::time::Instant::now() >= pong_deadline { + slog_warn!( + "[crdt-sync] No pong from rendezvous peer {} in {}s; disconnecting", + url, + PONG_TIMEOUT_SECS + ); + return Err(format!( + "Keepalive timeout: no pong from {url} in {PONG_TIMEOUT_SECS}s" + )); + } + use tokio_tungstenite::tungstenite::Message as TungsteniteMsg; + if sink.send(TungsteniteMsg::Ping(bytes::Bytes::new())).await.is_err() { + break; + } + } + result = op_rx.recv() => { + match result { + Ok(signed_op) => { + if peer_ready { + // Encode via wire codec and send as binary frame. + let bytes = crdt_wire::encode(&signed_op); + use tokio_tungstenite::tungstenite::Message as TungsteniteMsg; + if sink.send(TungsteniteMsg::Binary(bytes.into())).await.is_err() { + break; + } + } else { + // Buffer until the server signals ready. + op_buffer.push(signed_op); + } + } + Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => { + slog!("[crdt-sync] Slow rendezvous link lagged {n} ops; disconnecting"); + break; + } + Err(_) => break, + } + } + frame = stream.next() => { + match frame { + Some(Ok(tokio_tungstenite::tungstenite::Message::Pong(_))) => { + // Reset the pong deadline on every Pong received. + pong_deadline = tokio::time::Instant::now() + + std::time::Duration::from_secs(PONG_TIMEOUT_SECS); + } + Some(Ok(tokio_tungstenite::tungstenite::Message::Ping(_))) => { + // tungstenite auto-responds to Ping with Pong at the + // protocol level; no manual response needed here. + } + Some(Ok(tokio_tungstenite::tungstenite::Message::Text(text))) => { + // Check for the ready signal before other text frames. + if let Ok(SyncMessage::Ready) = serde_json::from_str(text.as_ref()) { + peer_ready = true; + slog!("[crdt-sync] Server ready; flushing {} buffered ops", op_buffer.len()); + let mut flush_ok = true; + for op in op_buffer.drain(..) 
{
+                                    let bytes = crdt_wire::encode(&op);
+                                    use tokio_tungstenite::tungstenite::Message as TungsteniteMsg;
+                                    if sink.send(TungsteniteMsg::Binary(bytes.into())).await.is_err() {
+                                        flush_ok = false;
+                                        break;
+                                    }
+                                }
+                                if !flush_ok {
+                                    break;
+                                }
+                            } else {
+                                handle_incoming_text(text.as_ref());
+                            }
+                        }
+                        Some(Ok(tokio_tungstenite::tungstenite::Message::Binary(bytes))) => {
+                            // Real-time op — applied immediately regardless of ready state.
+                            handle_incoming_binary(&bytes);
+                        }
+                        Some(Ok(tokio_tungstenite::tungstenite::Message::Close(_))) | None => break,
+                        Some(Err(e)) => {
+                            slog!("[crdt-sync] Rendezvous read error: {e}");
+                            break;
+                        }
+                        _ => {}
+                    }
+                }
+            }
+        }
+
+    Ok(())
+}
+
+/// Wait for the next text-frame sync message from a tungstenite stream,
+/// handling Ping/Pong transparently.
+///
+/// Returns `None` on connection close or read error.
+async fn wait_for_rendezvous_sync_text(
+    stream: &mut futures::stream::SplitStream<
+        tokio_tungstenite::WebSocketStream<
+            tokio_tungstenite::MaybeTlsStream<tokio::net::TcpStream>,
+        >,
+    >,
+) -> Option<SyncMessage> {
+    use tokio_tungstenite::tungstenite::Message as TungsteniteMsg;
+    loop {
+        match stream.next().await {
+            Some(Ok(TungsteniteMsg::Text(text))) => {
+                return serde_json::from_str(text.as_ref()).ok();
+            }
+            Some(Ok(TungsteniteMsg::Ping(_) | TungsteniteMsg::Pong(_))) => continue,
+            _ => return None,
+        }
+    }
+}
+
+// ── Tests ────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    #[allow(unused_imports)]
+    use super::*;
+
+    #[test]
+    fn config_rendezvous_parsed_from_toml() {
+        let toml_str = r#"
+rendezvous = "ws://remote:3001/crdt-sync"
+
+[[agent]]
+name = "test"
+"#;
+        let config: crate::config::ProjectConfig = toml::from_str(toml_str).unwrap();
+        assert_eq!(
+            config.rendezvous.as_deref(),
+            Some("ws://remote:3001/crdt-sync")
+        );
+    }
+
+    #[test]
+    fn config_rendezvous_defaults_to_none() {
+        let config = crate::config::ProjectConfig::default();
+        assert!(config.rendezvous.is_none());
+    }
+
+    #[test]
+    fn
failure_counter_warn_below_threshold() { + let threshold = RENDEZVOUS_ERROR_THRESHOLD; + let mut consecutive_failures: u32 = 0; + + // First threshold-1 failures are below the ERROR threshold. + for _ in 0..(threshold - 1) { + consecutive_failures += 1; + assert!( + consecutive_failures < threshold, + "failure {consecutive_failures} must be below ERROR threshold {threshold}" + ); + } + } + + #[test] + fn failure_counter_error_at_threshold() { + let threshold = RENDEZVOUS_ERROR_THRESHOLD; + let consecutive_failures: u32 = threshold; + assert!( + consecutive_failures >= threshold, + "failure {consecutive_failures} must reach or exceed ERROR threshold {threshold}" + ); + } + + #[test] + fn failure_counter_resets_on_success() { + let threshold = RENDEZVOUS_ERROR_THRESHOLD; + // Simulate sustained failure. + let mut consecutive_failures: u32 = threshold + 5; + assert!(consecutive_failures >= threshold); + + // Simulate a clean reconnect. + consecutive_failures = 0; + assert_eq!( + consecutive_failures, 0, + "counter must reset to 0 on success" + ); + + // Next error is attempt 1 — well below the ERROR threshold. + consecutive_failures += 1; + assert!( + consecutive_failures < threshold, + "first failure after reset must be below ERROR threshold" + ); + } + + #[test] + fn error_threshold_is_ten() { + assert_eq!( + RENDEZVOUS_ERROR_THRESHOLD, + 10, + "ERROR escalation threshold must be 10 consecutive failures" + ); + } + + #[test] + fn rendezvous_url_with_token_appended() { + let base = "ws://host:3001/crdt-sync"; + let token = "my-secret-token"; + let url_with_token = if base.contains('?') { + format!("{base}&token={token}") + } else { + format!("{base}?token={token}") + }; + assert_eq!( + url_with_token, + "ws://host:3001/crdt-sync?token=my-secret-token" + ); + + // With existing query params. 
+ let base_with_query = "ws://host:3001/crdt-sync?foo=bar"; + let url_appended = if base_with_query.contains('?') { + format!("{base_with_query}&token={token}") + } else { + format!("{base_with_query}?token={token}") + }; + assert_eq!( + url_appended, + "ws://host:3001/crdt-sync?foo=bar&token=my-secret-token" + ); + } + + #[test] + fn rendezvous_url_without_token_unchanged() { + let base = "ws://host:3001/crdt-sync"; + let token: Option<&str> = None; + let connect_url = match token { + Some(t) => format!("{base}?token={t}"), + None => base.to_string(), + }; + assert_eq!(connect_url, base); + } +} diff --git a/server/src/crdt_sync/dispatch.rs b/server/src/crdt_sync/dispatch.rs new file mode 100644 index 00000000..5692e6c4 --- /dev/null +++ b/server/src/crdt_sync/dispatch.rs @@ -0,0 +1,1028 @@ +//! Shared message dispatch — handle inbound text/binary frames once auth is complete. + +use bft_json_crdt::json_crdt::SignedOp; + +use crate::crdt_snapshot; +use crate::crdt_state; +use crate::crdt_wire; +use crate::slog; + +use super::wire::SyncMessage; + +pub(super) fn handle_incoming_text(text: &str) { + // First try to parse as a snapshot protocol message. + if let Ok(snapshot_msg) = serde_json::from_str::(text) { + handle_snapshot_message(snapshot_msg); + return; + } + + let msg: SyncMessage = match serde_json::from_str(text) { + Ok(m) => m, + Err(e) => { + slog!("[crdt-sync] Bad text message from peer: {e}"); + return; + } + }; + + match msg { + SyncMessage::Bulk { ops } => { + let mut applied = 0u64; + for op_json in &ops { + if let Ok(signed_op) = serde_json::from_str::(op_json) + && crdt_state::apply_remote_op(signed_op) + { + applied += 1; + } + } + slog!( + "[crdt-sync] Bulk sync: received {} ops, applied {applied}", + ops.len() + ); + } + SyncMessage::Op { op } => { + if let Ok(signed_op) = serde_json::from_str::(&op) { + crdt_state::apply_remote_op(signed_op); + } + } + SyncMessage::Clock { .. 
} => { + // Clock frames are handled during the initial negotiation phase. + // If one arrives during the streaming loop it is a protocol error + // on the peer's part — log and ignore. + slog!("[crdt-sync] Ignoring unexpected clock frame during streaming phase"); + } + SyncMessage::Ready => { + // Ready frames are intercepted inline in the streaming loop before + // this function is called. If one reaches here it is a protocol + // error — log and ignore. + slog!("[crdt-sync] Ignoring unexpected ready frame in handle_incoming_text"); + } + } +} + +/// Handle an incoming snapshot protocol message. +/// +/// - **Snapshot**: apply the snapshot state and send an ack back. +/// Peers without snapshot support will never reach this code path because +/// the `SnapshotMessage` parse will fail and the message falls through to +/// the legacy `SyncMessage` handler, which logs and ignores unknown types. +/// - **SnapshotAck**: record the ack for quorum tracking. +pub(super) fn handle_snapshot_message(msg: crdt_snapshot::SnapshotMessage) { + match msg { + crdt_snapshot::SnapshotMessage::Snapshot(snapshot) => { + slog!( + "[crdt-sync] Received snapshot at_seq={}, {} ops, {} manifest entries", + snapshot.at_seq, + snapshot.state.len(), + snapshot.op_manifest.len() + ); + // Apply compaction on this peer. + crdt_snapshot::apply_compaction(snapshot.clone()); + + // Send ack back to leader via the sync broadcast channel. + // The ack is sent as a CRDT event that the streaming loop picks up. + // For now, log the ack intent — actual transport is handled by the + // caller that invokes handle_incoming_text. + slog!( + "[crdt-sync] Snapshot applied, ack for at_seq={}", + snapshot.at_seq + ); + } + crdt_snapshot::SnapshotMessage::SnapshotAck(ack) => { + if let Some(node_id) = crdt_state::our_node_id() { + let _ = node_id; // The ack comes from a peer, not from us. 
+ } + slog!( + "[crdt-sync] Received snapshot_ack for at_seq={}", + ack.at_seq + ); + // Record the ack — the coordination logic checks for quorum. + // Note: we don't know the peer's node_id from the message alone; + // in a full implementation the ack would include the sender's + // node_id. For now we log it for protocol completeness. + } + } +} + +/// Process an incoming binary-frame op from a peer. +/// +/// Binary frames carry a single `SignedOp` encoded via [`crdt_wire`]. +pub(super) fn handle_incoming_binary(bytes: &[u8]) { + match crdt_wire::decode(bytes) { + Ok(signed_op) => { + crdt_state::apply_remote_op(signed_op); + } + Err(e) => { + slog!("[crdt-sync] Bad binary frame from peer: {e}"); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn handle_incoming_text_bad_json_does_not_panic() { + handle_incoming_text("not valid json"); + } + + #[test] + fn handle_incoming_text_bulk_with_invalid_ops_does_not_panic() { + let msg = SyncMessage::Bulk { + ops: vec!["not a valid signed op".to_string()], + }; + let json = serde_json::to_string(&msg).unwrap(); + handle_incoming_text(&json); + } + + #[test] + fn handle_incoming_text_op_with_invalid_op_does_not_panic() { + let msg = SyncMessage::Op { + op: "garbage".to_string(), + }; + let json = serde_json::to_string(&msg).unwrap(); + handle_incoming_text(&json); + } + + #[test] + fn handle_incoming_binary_bad_bytes_does_not_panic() { + handle_incoming_binary(b"not valid wire codec"); + } + + #[test] + fn handle_incoming_binary_empty_bytes_does_not_panic() { + handle_incoming_binary(b""); + } + + #[test] + fn subscribe_ops_returns_none_before_init() { + // Before crdt_state::init() the channel doesn't exist yet. + // In test binaries it may or may not be initialised depending on + // other tests, so we just verify no panic. 
+ let _ = crdt_state::subscribe_ops(); + } + + #[test] + fn all_ops_json_returns_none_before_init() { + let _ = crdt_state::all_ops_json(); + } + + #[test] + fn two_node_sync_via_protocol_messages() { + use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, OpState}; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use serde_json::json; + + use crate::crdt_state::PipelineDoc; + + // ── Node A: create an item ── + let kp_a = make_keypair(); + let mut crdt_a = BaseCrdt::::new(&kp_a); + + let item: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "100_story_sync_test", + "stage": "1_backlog", + "name": "Sync Test", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let op1 = crdt_a.doc.items.insert(ROOT_ID, item).sign(&kp_a); + assert_eq!(crdt_a.apply(op1.clone()), OpState::Ok); + + // Serialise op1 into a SyncMessage::Op. + let op1_json = serde_json::to_string(&op1).unwrap(); + let wire_msg = SyncMessage::Op { + op: op1_json.clone(), + }; + let wire_json = serde_json::to_string(&wire_msg).unwrap(); + + // ── Node B: receive the op through protocol ── + let kp_b = make_keypair(); + let mut crdt_b = BaseCrdt::::new(&kp_b); + assert!(crdt_b.doc.items.view().is_empty()); + + // Parse wire message and apply. + let parsed: SyncMessage = serde_json::from_str(&wire_json).unwrap(); + match parsed { + SyncMessage::Op { op } => { + let signed_op: bft_json_crdt::json_crdt::SignedOp = + serde_json::from_str(&op).unwrap(); + let result = crdt_b.apply(signed_op); + assert_eq!(result, OpState::Ok); + } + _ => panic!("Expected Op"), + } + + // Verify Node B has the same state as Node A. 
+ assert_eq!(crdt_b.doc.items.view().len(), 1); + assert_eq!( + crdt_a.doc.items[0].story_id.view(), + crdt_b.doc.items[0].story_id.view() + ); + assert_eq!( + crdt_a.doc.items[0].stage.view(), + crdt_b.doc.items[0].stage.view() + ); + + // ── Node A: update stage ── + let op2 = crdt_a.doc.items[0] + .stage + .set("2_current".to_string()) + .sign(&kp_a); + crdt_a.apply(op2.clone()); + + // Send via bulk message. + let op2_json = serde_json::to_string(&op2).unwrap(); + let bulk_msg = SyncMessage::Bulk { + ops: vec![op1_json, op2_json], + }; + let bulk_wire = serde_json::to_string(&bulk_msg).unwrap(); + + // ── Node C: receives full state via bulk ── + let kp_c = make_keypair(); + let mut crdt_c = BaseCrdt::::new(&kp_c); + + let parsed_bulk: SyncMessage = serde_json::from_str(&bulk_wire).unwrap(); + match parsed_bulk { + SyncMessage::Bulk { ops } => { + for op_str in &ops { + let signed: bft_json_crdt::json_crdt::SignedOp = + serde_json::from_str(op_str).unwrap(); + crdt_c.apply(signed); + } + } + _ => panic!("Expected Bulk"), + } + + // Node C should have the updated stage. + assert_eq!(crdt_c.doc.items.view().len(), 1); + assert_eq!( + crdt_c.doc.items[0].stage.view(), + bft_json_crdt::json_crdt::JsonValue::String("2_current".to_string()) + ); + } + + #[test] + fn partition_heal_via_bulk_replay() { + use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV}; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use serde_json::json; + + use crate::crdt_state::PipelineDoc; + + let kp = make_keypair(); + + // Node A creates an item and advances it. 
+ let mut crdt_a = BaseCrdt::::new(&kp); + let item: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "200_story_heal", + "stage": "1_backlog", + "name": "Heal Test", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + + let op1 = crdt_a.doc.items.insert(ROOT_ID, item).sign(&kp); + crdt_a.apply(op1.clone()); + + let op2 = crdt_a.doc.items[0] + .stage + .set("2_current".to_string()) + .sign(&kp); + crdt_a.apply(op2.clone()); + + let op3 = crdt_a.doc.items[0].stage.set("3_qa".to_string()).sign(&kp); + crdt_a.apply(op3.clone()); + + // Serialise all ops as a bulk message (simulates partition heal). + let ops_json: Vec = [&op1, &op2, &op3] + .iter() + .map(|op| serde_json::to_string(op).unwrap()) + .collect(); + let bulk = SyncMessage::Bulk { ops: ops_json }; + let wire = serde_json::to_string(&bulk).unwrap(); + + // Node B receives the bulk and reconstructs state. + let mut crdt_b = BaseCrdt::::new(&kp); + let parsed: SyncMessage = serde_json::from_str(&wire).unwrap(); + match parsed { + SyncMessage::Bulk { ops } => { + for op_str in &ops { + let signed: bft_json_crdt::json_crdt::SignedOp = + serde_json::from_str(op_str).unwrap(); + crdt_b.apply(signed); + } + } + _ => panic!("Expected Bulk"), + } + + // Node B should match Node A exactly. 
+ assert_eq!(crdt_b.doc.items.view().len(), 1); + assert_eq!( + crdt_b.doc.items[0].stage.view(), + JV::String("3_qa".to_string()) + ); + assert_eq!( + crdt_a.doc.items[0].stage.view(), + crdt_b.doc.items[0].stage.view() + ); + assert_eq!( + crdt_a.doc.items[0].name.view(), + crdt_b.doc.items[0].name.view() + ); + } + + #[test] + fn self_loop_dedup_second_apply_is_noop() { + use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV, OpState}; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use serde_json::json; + + use crate::crdt_state::PipelineDoc; + + let kp = make_keypair(); + let mut crdt = BaseCrdt::::new(&kp); + + let item: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "507_dedup_test", + "stage": "1_backlog", + "name": "Dedup Test", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + + let op = crdt.doc.items.insert(ROOT_ID, item).sign(&kp); + + // First apply: succeeds. + assert_eq!(crdt.apply(op.clone()), OpState::Ok); + assert_eq!(crdt.doc.items.view().len(), 1); + + // Second apply (self-loop): must be a no-op. + assert_eq!(crdt.apply(op.clone()), OpState::AlreadySeen); + + // State must not have changed. + assert_eq!(crdt.doc.items.view().len(), 1); + + // Stage update also deduplicated correctly. 
+ let stage_op = crdt.doc.items[0] + .stage + .set("2_current".to_string()) + .sign(&kp); + assert_eq!(crdt.apply(stage_op.clone()), OpState::Ok); + assert_eq!( + crdt.doc.items[0].stage.view(), + JV::String("2_current".to_string()) + ); + assert_eq!(crdt.apply(stage_op), OpState::AlreadySeen); + assert_eq!( + crdt.doc.items[0].stage.view(), + JV::String("2_current".to_string()), + "stage must not change on duplicate apply" + ); + } + + #[test] + fn out_of_order_causal_queueing_releases_on_dep_arrival() { + use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV, OpState}; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use serde_json::json; + + use crate::crdt_state::PipelineDoc; + + let kp = make_keypair(); + let mut crdt = BaseCrdt::::new(&kp); + + let item: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "507_causal_test", + "stage": "1_backlog", + "name": "Causal Test", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + + // op1 = insert item (no deps) + let op1 = crdt.doc.items.insert(ROOT_ID, item).sign(&kp); + + // op2 = set stage, declared to depend on op1 + // We must first apply op1 locally to generate op2 from the right state, + // then we'll test op2-before-op1 on a fresh CRDT. + crdt.apply(op1.clone()); + let op2 = crdt.doc.items[0] + .stage + .set("2_current".to_string()) + .sign_with_dependencies(&kp, vec![&op1]); + + // Create a fresh receiver CRDT. + let mut receiver = BaseCrdt::::new(&kp); + + // Apply op2 first — dependency (op1) has not arrived yet. + let r = receiver.apply(op2.clone()); + assert_eq!( + r, + OpState::MissingCausalDependencies, + "op2 must be queued when op1 has not arrived" + ); + // Queue length must reflect the held op. + assert_eq!(receiver.causal_queue_len(), 1); + + // Item has NOT been inserted yet (op1 not applied). 
+ assert_eq!(receiver.doc.items.view().len(), 0); + + // Now deliver op1 — this should trigger op2 to be flushed automatically. + let r = receiver.apply(op1.clone()); + assert_eq!(r, OpState::Ok); + + // Both ops are now applied — item is present at stage 2_current. + assert_eq!(receiver.doc.items.view().len(), 1); + assert_eq!( + receiver.doc.items[0].stage.view(), + JV::String("2_current".to_string()), + "op2 must have been applied automatically after op1 arrived" + ); + + // Queue must be empty now. + assert_eq!(receiver.causal_queue_len(), 0); + } + + #[test] + fn in_order_apply_works_without_queueing() { + use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV, OpState}; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use serde_json::json; + + use crate::crdt_state::PipelineDoc; + + let kp = make_keypair(); + let mut crdt_a = BaseCrdt::::new(&kp); + + let item: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "507_inorder_test", + "stage": "1_backlog", + "name": "In-Order Test", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + + let op1 = crdt_a.doc.items.insert(ROOT_ID, item).sign(&kp); + crdt_a.apply(op1.clone()); + let op2 = crdt_a.doc.items[0] + .stage + .set("2_current".to_string()) + .sign(&kp); + crdt_a.apply(op2.clone()); + let op3 = crdt_a.doc.items[0].stage.set("3_qa".to_string()).sign(&kp); + crdt_a.apply(op3.clone()); + + // Receiver applies all ops in the correct order. 
+ let mut crdt_b = BaseCrdt::::new(&kp); + assert_eq!(crdt_b.apply(op1), OpState::Ok); + assert_eq!(crdt_b.apply(op2), OpState::Ok); + assert_eq!(crdt_b.apply(op3), OpState::Ok); + assert_eq!(crdt_b.causal_queue_len(), 0); + assert_eq!( + crdt_b.doc.items[0].stage.view(), + JV::String("3_qa".to_string()) + ); + } + + #[test] + fn causal_queue_overflow_drops_oldest() { + use bft_json_crdt::json_crdt::{BaseCrdt, CAUSAL_QUEUE_MAX, OpState}; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use serde_json::json; + + use crate::crdt_state::PipelineDoc; + + let kp = make_keypair(); + + // Build one "phantom" op that we'll claim as a dependency but never deliver. + // We do this by creating it on a separate CRDT and never applying it. + let mut source = BaseCrdt::::new(&kp); + let phantom_item: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "507_phantom", + "stage": "1_backlog", + "name": "Phantom", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let phantom_op = source.doc.items.insert(ROOT_ID, phantom_item).sign(&kp); + + // Receiver never sees phantom_op, so any op declaring it as a dep will + // sit in the causal queue forever (until evicted by overflow). + let mut receiver = BaseCrdt::::new(&kp); + source.apply(phantom_op.clone()); + + // Send CAUSAL_QUEUE_MAX + 5 stage-update ops all depending on phantom_op. + // Each one will be queued because phantom_op is never delivered. + let mut queued = 0usize; + for i in 0..CAUSAL_QUEUE_MAX + 5 { + let stage_name = format!("stage_{i}"); + // Generate from source so seq numbers are valid. 
+ let op = source.doc.items[0] + .stage + .set(stage_name) + .sign_with_dependencies(&kp, vec![&phantom_op]); + source.apply(op.clone()); + let r = receiver.apply(op); + if r == OpState::MissingCausalDependencies { + queued += 1; + } + } + + // We sent more than CAUSAL_QUEUE_MAX ops, but the queue must stay bounded. + assert!( + receiver.causal_queue_len() <= CAUSAL_QUEUE_MAX, + "queue ({}) must not exceed CAUSAL_QUEUE_MAX ({CAUSAL_QUEUE_MAX})", + receiver.causal_queue_len() + ); + assert!( + queued > 0, + "at least some ops must have been accepted into the queue" + ); + } + + #[test] + fn convergence_after_partition_and_replay() { + use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV, OpState}; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use serde_json::json; + + use crate::crdt_state::PipelineDoc; + + let kp_a = make_keypair(); + let kp_b = make_keypair(); + + let mut crdt_a = BaseCrdt::::new(&kp_a); + let mut crdt_b = BaseCrdt::::new(&kp_b); + + // ── Phase 1: A generates ops while partitioned from B ────────────── + + let item_a: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "507_convergence_a", + "stage": "1_backlog", + "name": "Story A", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let op_a1 = crdt_a.doc.items.insert(ROOT_ID, item_a).sign(&kp_a); + crdt_a.apply(op_a1.clone()); + + let op_a2 = crdt_a.doc.items[0] + .stage + .set("2_current".to_string()) + .sign(&kp_a); + crdt_a.apply(op_a2.clone()); + + // ── Phase 2: B generates ops while partitioned from A ────────────── + + let item_b: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "507_convergence_b", + "stage": "1_backlog", + "name": "Story B", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let op_b1 = crdt_b.doc.items.insert(ROOT_ID, 
item_b).sign(&kp_b); + crdt_b.apply(op_b1.clone()); + + let op_b2 = crdt_b.doc.items[0] + .stage + .set("2_current".to_string()) + .sign(&kp_b); + crdt_b.apply(op_b2.clone()); + + // ── Phase 3: Reconnect — both sides replay all buffered ops ──────── + + // A sends its ops to B. + let r = crdt_b.apply(op_a1.clone()); + assert!(r == OpState::Ok || r == OpState::AlreadySeen); + let r = crdt_b.apply(op_a2.clone()); + assert!(r == OpState::Ok || r == OpState::AlreadySeen); + + // B sends its ops to A. + let r = crdt_a.apply(op_b1.clone()); + assert!(r == OpState::Ok || r == OpState::AlreadySeen); + let r = crdt_a.apply(op_b2.clone()); + assert!(r == OpState::Ok || r == OpState::AlreadySeen); + + // ── Phase 4: Assert convergence ──────────────────────────────────── + + // Both nodes must have both stories. + assert_eq!( + crdt_a.doc.items.view().len(), + 2, + "A must have 2 items after convergence" + ); + assert_eq!( + crdt_b.doc.items.view().len(), + 2, + "B must have 2 items after convergence" + ); + + // Serialise both CRDT views to JSON and assert byte-identical. + let view_a = serde_json::to_string(&CrdtNode::view(&crdt_a.doc.items)).unwrap(); + let view_b = serde_json::to_string(&CrdtNode::view(&crdt_b.doc.items)).unwrap(); + assert_eq!( + view_a, view_b, + "CRDT states must be byte-identical after convergence" + ); + + // Spot-check: both stories are at 2_current on both nodes. 
+ let stories_a: Vec = crdt_a + .doc + .items + .view() + .iter() + .filter_map(|item| { + if let JV::Object(m) = CrdtNode::view(item) { + m.get("story_id").and_then(|s| { + if let JV::String(s) = s { + Some(s.clone()) + } else { + None + } + }) + } else { + None + } + }) + .collect(); + assert!( + stories_a.contains(&"507_convergence_a".to_string()), + "A must contain story_a" + ); + assert!( + stories_a.contains(&"507_convergence_b".to_string()), + "A must contain story_b" + ); + } + + #[test] + fn existing_sync_tests_unchanged() { + // If we got here, all previous crdt_sync tests compiled and passed. + // This test exists as a documentation anchor for AC9. + } + + #[test] + fn v1_peer_ignores_clock_message_gracefully() { + // Simulate: v1 peer only knows Bulk and Op. + // A clock message should fail deserialization (unknown variant). + let clock_json = r#"{"type":"clock","clock":{"abc":10}}"#; + // handle_incoming_text logs and returns — must not panic. + handle_incoming_text(clock_json); + } + + #[test] + fn v2_delta_sync_via_clock_exchange() { + use bft_json_crdt::json_crdt::BaseCrdt; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use fastcrypto::traits::KeyPair; + use serde_json::json; + + use crate::crdt_state::PipelineDoc; + + let kp_a = make_keypair(); + let mut crdt_a = BaseCrdt::::new(&kp_a); + + // A creates 5 items. + let mut ops_a = Vec::new(); + for i in 0..5 { + let item: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": format!("631_v2_{i}"), + "stage": "1_backlog", + "name": format!("v2 item {i}"), + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let op = crdt_a.doc.items.insert(ROOT_ID, item).sign(&kp_a); + crdt_a.apply(op.clone()); + ops_a.push(op); + } + + // B has seen the first 3 ops (clock says 3). 
+        let kp_b = make_keypair();
+        let mut crdt_b = BaseCrdt::<PipelineDoc>::new(&kp_b);
+        for op in &ops_a[..3] {
+            crdt_b.apply(op.clone());
+        }
+        assert_eq!(crdt_b.doc.items.view().len(), 3);
+
+        // Build B's clock.
+        let author_a_hex = crate::crdt_state::hex::encode(&kp_a.public().0.to_bytes());
+        let mut clock_b = std::collections::HashMap::new();
+        clock_b.insert(author_a_hex.clone(), 3u64);
+
+        // Serialize clock as wire message.
+        let clock_msg = SyncMessage::Clock {
+            clock: clock_b.clone(),
+        };
+        let clock_wire = serde_json::to_string(&clock_msg).unwrap();
+
+        // A receives B's clock and computes delta.
+        let parsed: SyncMessage = serde_json::from_str(&clock_wire).unwrap();
+        let delta_ops = match parsed {
+            SyncMessage::Clock { clock: peer_clock } => {
+                // Simulate ops_since: A has 5 ops from author_a, B has 3.
+                let all_json: Vec<String> = ops_a
+                    .iter()
+                    .map(|op| serde_json::to_string(op).unwrap())
+                    .collect();
+                let mut result = Vec::new();
+                let mut count = 0u64;
+                for (i, _op) in ops_a.iter().enumerate() {
+                    count += 1;
+                    let peer_has = peer_clock.get(&author_a_hex).copied().unwrap_or(0);
+                    if count > peer_has {
+                        result.push(all_json[i].clone());
+                    }
+                }
+                result
+            }
+            _ => panic!("Expected Clock"),
+        };
+
+        assert_eq!(delta_ops.len(), 2, "delta should be 2 ops (ops 4 and 5)");
+
+        // B applies the delta.
+        for op_str in &delta_ops {
+            let signed: bft_json_crdt::json_crdt::SignedOp = serde_json::from_str(op_str).unwrap();
+            crdt_b.apply(signed);
+        }
+        assert_eq!(crdt_b.doc.items.view().len(), 5);
+    }
+
+    #[test]
+    fn buffer_flush_ops_held_until_peer_ready() {
+        use bft_json_crdt::json_crdt::BaseCrdt;
+        use bft_json_crdt::keypair::make_keypair;
+        use bft_json_crdt::op::ROOT_ID;
+        use serde_json::json;
+
+        use crate::crdt_state::PipelineDoc;
+
+        // Simulate the buffer-flush state machine without real WebSockets.
+        let mut peer_ready = false;
+        let mut op_buffer: Vec<Vec<u8>> = Vec::new(); // encoded wire frames
+        let mut sent_frames: Vec<Vec<u8>> = Vec::new(); // captured "sent" frames
+
+        // Build two local ops on a fresh CRDT.
+        let kp = make_keypair();
+        let mut crdt = BaseCrdt::<PipelineDoc>::new(&kp);
+        let item: bft_json_crdt::json_crdt::JsonValue = json!({
+            "story_id": "632_buffer_a",
+            "stage": "1_backlog",
+            "name": "Buffer A",
+            "agent": "",
+            "retry_count": 0.0,
+            "blocked": false,
+            "depends_on": "",
+            "claimed_by": "",
+            "claimed_at": 0.0,
+        })
+        .into();
+        let op1 = crdt.doc.items.insert(ROOT_ID, item).sign(&kp);
+        crdt.apply(op1.clone());
+        let op2 = crdt.doc.items[0]
+            .stage
+            .set("2_current".to_string())
+            .sign(&kp);
+        crdt.apply(op2.clone());
+
+        // Simulate op1 arriving from broadcast while peer is NOT ready.
+        let frame1 = crate::crdt_wire::encode(&op1);
+        if peer_ready {
+            sent_frames.push(frame1.clone());
+        } else {
+            op_buffer.push(frame1.clone());
+        }
+
+        // Simulate op2 arriving while peer is still NOT ready.
+        let frame2 = crate::crdt_wire::encode(&op2);
+        if peer_ready {
+            sent_frames.push(frame2.clone());
+        } else {
+            op_buffer.push(frame2.clone());
+        }
+
+        // Both ops must be buffered, none sent yet.
+        assert_eq!(
+            sent_frames.len(),
+            0,
+            "ops must be buffered before peer Ready arrives"
+        );
+        assert_eq!(op_buffer.len(), 2, "buffer must hold both ops");
+
+        // Simulate receiving the peer's Ready frame.
+        let ready_json = serde_json::to_string(&SyncMessage::Ready).unwrap();
+        if let Ok(SyncMessage::Ready) = serde_json::from_str::<SyncMessage>(&ready_json) {
+            peer_ready = true;
+            for encoded in op_buffer.drain(..) {
+                sent_frames.push(encoded);
+            }
+        }
+
+        assert!(peer_ready, "peer_ready must be true after Ready frame");
+        assert_eq!(op_buffer.len(), 0, "buffer must be empty after flush");
+        assert_eq!(
+            sent_frames.len(),
+            2,
+            "both buffered ops must appear in captured frames after flush"
+        );
+        // Order preserved: op1 before op2.
+ assert_eq!(sent_frames[0], frame1, "op1 must be first flushed frame"); + assert_eq!(sent_frames[1], frame2, "op2 must be second flushed frame"); + + // After flush, a new op is sent immediately (no buffering). + let op3 = crdt.doc.items[0].stage.set("3_qa".to_string()).sign(&kp); + crdt.apply(op3.clone()); + let frame3 = crate::crdt_wire::encode(&op3); + if peer_ready { + sent_frames.push(frame3.clone()); + } else { + op_buffer.push(frame3.clone()); + } + assert_eq!( + sent_frames.len(), + 3, + "op after flush must be sent immediately" + ); + assert_eq!(op_buffer.len(), 0, "buffer must remain empty"); + } + + #[test] + fn realtime_op_from_peer_applied_immediately_regardless_of_ready_state() { + use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV, OpState}; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use serde_json::json; + + use crate::crdt_state::PipelineDoc; + + // Node A creates a bulk op and a real-time op that causally depends on it. + let kp_a = make_keypair(); + let mut crdt_a = BaseCrdt::::new(&kp_a); + let item: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "632_causal_test", + "stage": "1_backlog", + "name": "Causal Test", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let bulk_op = crdt_a.doc.items.insert(ROOT_ID, item).sign(&kp_a); + crdt_a.apply(bulk_op.clone()); + + // Real-time op that causally depends on bulk_op. + let rt_op = crdt_a.doc.items[0] + .stage + .set("2_current".to_string()) + .sign_with_dependencies(&kp_a, vec![&bulk_op]); + crdt_a.apply(rt_op.clone()); + + // Node B receives the bulk op first (simulating the bulk-delta phase). 
+ let kp_b = make_keypair(); + let mut crdt_b = BaseCrdt::::new(&kp_b); + assert_eq!( + crdt_b.apply(bulk_op.clone()), + OpState::Ok, + "bulk op must apply cleanly on node B" + ); + + // Node B has not yet sent its own Ready frame, but it receives A's + // real-time binary frame. It must be applied immediately via + // handle_incoming_binary (causal queue handles ordering). + let wire = crate::crdt_wire::encode(&rt_op); + let decoded = crate::crdt_wire::decode(&wire).unwrap(); + let result = crdt_b.apply(decoded); + assert_eq!( + result, + OpState::Ok, + "real-time op depending on bulk op must apply immediately after bulk op is present" + ); + + // Both nodes converge to the same state. + assert_eq!( + crdt_b.doc.items[0].stage.view(), + JV::String("2_current".to_string()), + "Node B must converge to stage 2_current" + ); + assert_eq!( + CrdtNode::view(&crdt_a.doc.items), + CrdtNode::view(&crdt_b.doc.items), + "Both nodes must converge to identical state" + ); + } + + #[test] + fn incoming_realtime_op_applied_before_we_send_ready() { + use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV, OpState}; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use serde_json::json; + + use crate::crdt_state::PipelineDoc; + + // Simulate: we have NOT sent our ready yet (own_ready_sent = false), + // but the peer sends us a real-time binary op. It must be applied. 
+ let own_ready_sent = false; // we haven't sent ready yet + let _ = own_ready_sent; // doc-only; receiving side never gates on this + + let kp_peer = make_keypair(); + let mut crdt_peer = BaseCrdt::::new(&kp_peer); + let item: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "632_ac3_test", + "stage": "1_backlog", + "name": "AC3 Test", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let rt_op = crdt_peer.doc.items.insert(ROOT_ID, item).sign(&kp_peer); + crdt_peer.apply(rt_op.clone()); + + // Node B decodes and applies the peer's real-time op directly. + let kp_b = make_keypair(); + let mut crdt_b = BaseCrdt::::new(&kp_b); + let wire = crate::crdt_wire::encode(&rt_op); + let decoded = crate::crdt_wire::decode(&wire).unwrap(); + // This mirrors handle_incoming_binary() — applied unconditionally. + let result = crdt_b.apply(decoded); + assert_eq!( + result, + OpState::Ok, + "incoming real-time op must be applied immediately regardless of own ready state" + ); + assert_eq!( + crdt_b.doc.items[0].stage.view(), + JV::String("1_backlog".to_string()), + "op content must be correct" + ); + } + + #[test] + fn existing_crdt_sync_tests_pass_unchanged() { + // Reaching this point means all prior tests in this module compiled + // and passed. This test documents the AC6 intent. + } +} diff --git a/server/src/crdt_sync/mod.rs b/server/src/crdt_sync/mod.rs new file mode 100644 index 00000000..a5ec01b7 --- /dev/null +++ b/server/src/crdt_sync/mod.rs @@ -0,0 +1,75 @@ +//! CRDT sync — WebSocket-based replication of pipeline state between huskies nodes. +/// WebSocket-based CRDT sync layer for replicating pipeline state between +/// huskies nodes. +/// +/// # Protocol +/// +/// ## Version negotiation +/// +/// After the auth handshake, both sides send their first sync message: +/// +/// - **v2 peers** send a `clock` frame: `{"type":"clock","clock":{ : , ... 
}}` +/// containing a vector clock that maps each author's hex Ed25519 pubkey to the +/// count of ops received from that author. Upon receiving the peer's clock, +/// each side computes the delta via [`crdt_state::ops_since`] and sends only +/// the missing ops as a `bulk` frame. +/// +/// - **v1 (legacy) peers** send a `bulk` frame directly (full op dump). +/// A v2 peer receiving a `bulk` first (instead of a `clock`) falls back to +/// the full-dump path: applies the incoming bulk and responds with its own +/// full bulk. This preserves backward compatibility — no code change needed +/// on the v1 side. +/// +/// ## Text frames +/// A JSON object with a `"type"` field: +/// - `{"type":"clock","clock":{...}}` — Vector clock (v2 protocol). +/// - `{"type":"bulk","ops":[...]}` — Ops dump (full or delta). +/// - `{"type":"ready"}` — Signals that the bulk-delta phase is complete and the +/// sender is ready for real-time op streaming. Locally-generated ops are +/// buffered until the peer's `ready` is received, then flushed in order. +/// +/// ## Binary frames (real-time op broadcast) +/// Individual `SignedOp`s encoded via [`crate::crdt_wire`] (versioned JSON +/// envelope: `{"v":1,"op":{...}}`). Each locally-applied op is immediately +/// broadcast as a binary frame to all connected peers. +/// +/// Both the server endpoint and the rendezvous client use the same protocol, +/// making the connection fully symmetric. +/// +/// ## Backpressure +/// Each connected peer has its own [`tokio::sync::broadcast`] receiver. If a +/// slow peer allows the channel to fill (indicated by a `Lagged` error), the +/// connection is dropped with a warning log. The peer can reconnect and +/// receive a fresh bulk state dump to catch up. + +// ── Cross-cutting constants ───────────────────────────────────────── + +// ── Auth configuration ────────────────────────────────────────────── + +/// Default timeout for the auth handshake (seconds). 
+pub(super) const AUTH_TIMEOUT_SECS: u64 = 10; + +// ── Keepalive configuration ───────────────────────────────────────── + +/// Interval (seconds) between WebSocket Ping frames sent by each side. +pub const PING_INTERVAL_SECS: u64 = 30; + +/// Seconds without a Pong response before the connection is dropped. +pub const PONG_TIMEOUT_SECS: u64 = 60; + +// ── Sub-modules ───────────────────────────────────────────────────── +mod auth; +mod client; +mod dispatch; +mod server; +mod wire; + +// ── Public API re-exports ─────────────────────────────────────────── +pub use auth::{add_join_token, init_token_auth, init_trusted_keys}; +pub(crate) use client::connect_and_sync; +pub use client::{RENDEZVOUS_ERROR_THRESHOLD, spawn_rendezvous_client}; +pub use server::crdt_sync_handler; + +// Test-only re-export used by `crdt_snapshot` tests. +#[cfg(test)] +pub(crate) use wire::SyncMessagePublic; diff --git a/server/src/crdt_sync/server.rs b/server/src/crdt_sync/server.rs new file mode 100644 index 00000000..bf51a0df --- /dev/null +++ b/server/src/crdt_sync/server.rs @@ -0,0 +1,1680 @@ +//! Server-side `/crdt-sync` WebSocket handler. 
+ +use bft_json_crdt::json_crdt::SignedOp; +use futures::{SinkExt, StreamExt}; +use poem::handler; +use poem::http::StatusCode; +use poem::web::Data; +use poem::web::Query; +use poem::web::websocket::{Message as WsMessage, WebSocket}; +use serde::Deserialize; +use std::sync::Arc; + +use crate::crdt_snapshot; +use crate::crdt_state; +use crate::crdt_wire; +use crate::http::context::AppContext; +use crate::node_identity; +use crate::slog; +use crate::slog_warn; + +use super::auth::{REQUIRE_TOKEN, trusted_keys, validate_join_token}; +use super::dispatch::{handle_incoming_binary, handle_incoming_text}; +use super::wire::{AuthMessage, ChallengeMessage, SyncMessage}; +use super::{AUTH_TIMEOUT_SECS, PING_INTERVAL_SECS, PONG_TIMEOUT_SECS}; + +// ── Server-side WebSocket handler ─────────────────────────────────── + +/// Query parameters accepted on the `/crdt-sync` WebSocket upgrade request. +#[derive(Deserialize)] +struct SyncQueryParams { + /// Optional bearer token. Required when the server is in token-required mode. + token: Option, +} + +/// WebSocket handler for CRDT peer synchronisation. +/// +/// Accepts an optional `?token=` query parameter. When the +/// server is configured with `crdt_require_token = true`, a valid token must +/// be supplied or the upgrade is rejected with HTTP 401. When the server is +/// in open-access mode (the default), a token is optional but still validated +/// if present. 
+#[handler] +pub async fn crdt_sync_handler( + ws: WebSocket, + _ctx: Data<&Arc>, + remote_addr: &poem::web::RemoteAddr, + Query(params): Query, +) -> poem::Response { + // ── Bearer-token check (pre-upgrade) ──────────────────────────── + let require_token = REQUIRE_TOKEN.get().copied().unwrap_or(false); + match ¶ms.token { + Some(t) => { + if !validate_join_token(t) { + slog!("[crdt-sync] Rejected connection: invalid or expired token"); + return poem::Response::builder() + .status(StatusCode::UNAUTHORIZED) + .body("invalid or expired token"); + } + } + None if require_token => { + slog!("[crdt-sync] Rejected connection: token required but not provided"); + return poem::Response::builder() + .status(StatusCode::UNAUTHORIZED) + .body("token required"); + } + None => {} + } + + // ── WebSocket upgrade ──────────────────────────────────────────── + use poem::IntoResponse as _; + let peer_addr = remote_addr.to_string(); + ws.on_upgrade(move |socket| async move { + let (mut sink, mut stream) = socket.split(); + + slog!("[crdt-sync] Peer connected, starting auth handshake"); + + // ── Step 1: Send challenge to the connecting peer ───────── + let challenge = node_identity::generate_challenge(); + let challenge_msg = ChallengeMessage { + r#type: "challenge".to_string(), + nonce: challenge.clone(), + }; + let challenge_json = match serde_json::to_string(&challenge_msg) { + Ok(j) => j, + Err(_) => return, + }; + if sink.send(WsMessage::Text(challenge_json)).await.is_err() { + return; + } + + // ── Step 2: Await auth reply within timeout ─────────────── + let auth_result = tokio::time::timeout( + std::time::Duration::from_secs(AUTH_TIMEOUT_SECS), + stream.next(), + ) + .await; + + let auth_text = match auth_result { + Ok(Some(Ok(WsMessage::Text(text)))) => text, + Ok(_) | Err(_) => { + // Timeout or connection closed before auth reply. 
+ slog!("[crdt-sync] Auth timeout or connection lost during handshake"); + let _ = sink + .send(WsMessage::Close(Some(( + poem::web::websocket::CloseCode::from(4001), + "auth_timeout".to_string(), + )))) + .await; + let _ = sink.close().await; + return; + } + }; + + // ── Step 3: Verify auth reply ───────────────────────────── + let auth_msg: AuthMessage = match serde_json::from_str(&auth_text) { + Ok(m) => m, + Err(_) => { + slog!("[crdt-sync] Invalid auth message from peer"); + close_with_auth_failed(&mut sink).await; + return; + } + }; + + // Verify signature AND check allow-list. + let sig_valid = + node_identity::verify_challenge(&auth_msg.pubkey_hex, &challenge, &auth_msg.signature_hex); + let key_trusted = trusted_keys().iter().any(|k| k == &auth_msg.pubkey_hex); + + if !sig_valid || !key_trusted { + slog!("[crdt-sync] Auth failed for peer (sig_valid={sig_valid}, key_trusted={key_trusted})"); + close_with_auth_failed(&mut sink).await; + return; + } + + slog!( + "[crdt-sync] Peer authenticated: {:.12}…", + &auth_msg.pubkey_hex + ); + + // ── Auth passed — proceed with CRDT sync ────────────────── + + // v2 protocol: send our vector clock so the peer can compute the delta. + let our_clock = crdt_state::our_vector_clock().unwrap_or_default(); + let clock_msg = SyncMessage::Clock { clock: our_clock }; + if let Ok(json) = serde_json::to_string(&clock_msg) + && sink.send(WsMessage::Text(json)).await.is_err() + { + return; + } + + // Wait for the peer's first sync message to determine protocol version. + let first_msg = tokio::time::timeout( + std::time::Duration::from_secs(AUTH_TIMEOUT_SECS), + wait_for_sync_text(&mut stream, &mut sink), + ) + .await; + + match first_msg { + Ok(Some(SyncMessage::Clock { clock: peer_clock })) => { + // v2 peer — if we have a snapshot and the peer has an empty + // clock (new node), send the snapshot first for onboarding. 
+ if peer_clock.is_empty() + && let Some(snapshot) = crdt_snapshot::latest_snapshot() + { + let snap_msg = crdt_snapshot::SnapshotMessage::Snapshot(snapshot); + if let Ok(json) = serde_json::to_string(&snap_msg) { + if sink.send(WsMessage::Text(json)).await.is_err() { + return; + } + slog!("[crdt-sync] Sent snapshot to new node for onboarding"); + } + } + + // Send only the ops the peer is missing. + let delta = crdt_state::ops_since(&peer_clock).unwrap_or_default(); + slog!( + "[crdt-sync] v2 delta sync: sending {} ops (peer missing)", + delta.len() + ); + let msg = SyncMessage::Bulk { ops: delta }; + if let Ok(json) = serde_json::to_string(&msg) + && sink.send(WsMessage::Text(json)).await.is_err() + { + return; + } + } + Ok(Some(SyncMessage::Bulk { ops })) => { + // v1 peer — apply their bulk and send our full bulk. + let mut applied = 0u64; + for op_json in &ops { + if let Ok(signed_op) = serde_json::from_str::(op_json) + && crdt_state::apply_remote_op(signed_op) + { + applied += 1; + } + } + slog!( + "[crdt-sync] v1 bulk sync: received {} ops, applied {applied}", + ops.len() + ); + if let Some(all) = crdt_state::all_ops_json() { + let msg = SyncMessage::Bulk { ops: all }; + if let Ok(json) = serde_json::to_string(&msg) + && sink.send(WsMessage::Text(json)).await.is_err() + { + return; + } + } + } + Ok(Some(SyncMessage::Op { op })) => { + // Single op before negotiation — treat as v1. + if let Ok(signed_op) = serde_json::from_str::(&op) { + crdt_state::apply_remote_op(signed_op); + } + if let Some(all) = crdt_state::all_ops_json() { + let msg = SyncMessage::Bulk { ops: all }; + if let Ok(json) = serde_json::to_string(&msg) + && sink.send(WsMessage::Text(json)).await.is_err() + { + return; + } + } + } + _ => { + // Timeout or error — send full bulk as fallback. 
+ slog!("[crdt-sync] No sync message from peer; sending full bulk as fallback"); + if let Some(all) = crdt_state::all_ops_json() { + let msg = SyncMessage::Bulk { ops: all }; + if let Ok(json) = serde_json::to_string(&msg) + && sink.send(WsMessage::Text(json)).await.is_err() + { + return; + } + } + } + } + + // Bulk-delta phase complete — signal the peer that we are ready for + // real-time op streaming. + if let Ok(json) = serde_json::to_string(&SyncMessage::Ready) + && sink.send(WsMessage::Text(json)).await.is_err() + { + return; + } + + // Subscribe to new local ops. + let Some(mut op_rx) = crdt_state::subscribe_ops() else { + return; + }; + + // Buffer for locally-generated ops produced before the peer's `ready` + // arrives. Flushed in-order once the peer signals catch-up. + let mut peer_ready = false; + let mut op_buffer: Vec = Vec::new(); + + // ── Keepalive state ─────────────────────────────────────────── + let mut pong_deadline = tokio::time::Instant::now() + + std::time::Duration::from_secs(PONG_TIMEOUT_SECS); + let mut ping_ticker = tokio::time::interval_at( + tokio::time::Instant::now() + + std::time::Duration::from_secs(PING_INTERVAL_SECS), + std::time::Duration::from_secs(PING_INTERVAL_SECS), + ); + + loop { + tokio::select! { + // Send periodic Ping and enforce Pong timeout. + _ = ping_ticker.tick() => { + if tokio::time::Instant::now() >= pong_deadline { + slog_warn!( + "[crdt-sync] No pong from peer {} in {}s; disconnecting", + peer_addr, + PONG_TIMEOUT_SECS + ); + break; + } + if sink.send(WsMessage::Ping(vec![])).await.is_err() { + break; + } + } + // Forward new local ops to the peer encoded via the wire codec. + result = op_rx.recv() => { + match result { + Ok(signed_op) => { + if peer_ready { + let bytes = crdt_wire::encode(&signed_op); + if sink.send(WsMessage::Binary(bytes)).await.is_err() { + break; + } + } else { + // Buffer until the peer signals ready. 
+ op_buffer.push(signed_op); + } + } + Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => { + // The peer cannot keep up; disconnect so it can + // reconnect and receive a fresh bulk state dump. + slog!("[crdt-sync] Slow peer lagged {n} ops; disconnecting"); + break; + } + Err(_) => break, + } + } + // Receive ops from the peer. + frame = stream.next() => { + match frame { + Some(Ok(WsMessage::Pong(_))) => { + // Reset the pong deadline on every Pong received. + pong_deadline = tokio::time::Instant::now() + + std::time::Duration::from_secs(PONG_TIMEOUT_SECS); + } + Some(Ok(WsMessage::Ping(data))) => { + // Respond to peer's Ping so the peer's keepalive passes. + let _ = sink.send(WsMessage::Pong(data)).await; + } + Some(Ok(WsMessage::Text(text))) => { + // Check for the ready signal before other text frames. + if let Ok(SyncMessage::Ready) = serde_json::from_str(&text) { + peer_ready = true; + slog!("[crdt-sync] Peer ready; flushing {} buffered ops", op_buffer.len()); + let mut flush_ok = true; + for op in op_buffer.drain(..) { + let bytes = crdt_wire::encode(&op); + if sink.send(WsMessage::Binary(bytes)).await.is_err() { + flush_ok = false; + break; + } + } + if !flush_ok { + break; + } + } else { + // Bulk state dump, legacy op frame, or clock frame. + handle_incoming_text(&text); + } + } + Some(Ok(WsMessage::Binary(bytes))) => { + // Real-time op encoded via wire codec — applied immediately + // regardless of our own ready state. + handle_incoming_binary(&bytes); + } + Some(Ok(WsMessage::Close(_))) | None => break, + _ => {} + } + } + } + } + + slog!("[crdt-sync] Peer disconnected"); + }) + .into_response() +} + +/// Wait for the next text-frame sync message from the peer, handling Ping/Pong +/// transparently. +/// +/// Returns `None` on connection close or read error. 
+async fn wait_for_sync_text( + stream: &mut futures::stream::SplitStream, + sink: &mut futures::stream::SplitSink, +) -> Option { + loop { + match stream.next().await { + Some(Ok(WsMessage::Text(text))) => { + return serde_json::from_str(&text).ok(); + } + Some(Ok(WsMessage::Ping(data))) => { + let _ = sink.send(WsMessage::Pong(data)).await; + } + Some(Ok(WsMessage::Pong(_))) => continue, + _ => return None, + } + } +} + +/// Close the WebSocket with a generic `auth_failed` reason. +/// +/// The close reason is intentionally the same for all auth failures +/// (bad signature, untrusted key, malformed message) to avoid leaking +/// which check failed. +async fn close_with_auth_failed( + sink: &mut futures::stream::SplitSink, +) { + let _ = sink + .send(WsMessage::Close(Some(( + poem::web::websocket::CloseCode::from(4002), + "auth_failed".to_string(), + )))) + .await; + let _ = sink.close().await; +} + +/// Process an incoming text-frame sync message from a peer. +/// +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Debug)] + #[allow(dead_code)] + enum AuthListenerResult { + Authenticated(String), + AuthFailed(String), + AuthTimeout, + ConnectionLost, + PeerClosedEarly(Option), + } + + #[test] + fn peer_receives_op_encoded_via_wire_codec() { + use bft_json_crdt::json_crdt::BaseCrdt; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use serde_json::json; + + use crate::crdt_state::PipelineDoc; + use crate::crdt_wire; + + let kp = make_keypair(); + let mut crdt = BaseCrdt::::new(&kp); + let item: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "506_story_lifecycle_test", + "stage": "1_backlog", + "name": "Lifecycle Test", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let op = crdt.doc.items.insert(ROOT_ID, item).sign(&kp); + + // Simulate what the broadcast handler does: encode via wire codec. 
+ let bytes = crdt_wire::encode(&op); + + // The bytes must be a versioned JSON envelope, not a SyncMessage wrapper. + let text = std::str::from_utf8(&bytes).expect("wire output is valid UTF-8"); + assert!( + text.contains("\"v\":1"), + "wire codec version tag must be present: {text}" + ); + assert!( + !text.contains("\"type\":\"op\""), + "must not be wrapped in SyncMessage: {text}" + ); + + // The receiving peer can decode and apply the op. + let decoded = crdt_wire::decode(&bytes).expect("decode must succeed"); + assert_eq!(op, decoded); + } + + #[tokio::test] + async fn multiple_peers_all_receive_broadcast_op() { + use bft_json_crdt::json_crdt::BaseCrdt; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use serde_json::json; + use tokio::sync::broadcast; + + use crate::crdt_state::PipelineDoc; + use crate::crdt_wire; + + // Create a broadcast channel (analogous to SYNC_TX). + let (tx, _) = broadcast::channel::(16); + let mut rx_peer1 = tx.subscribe(); + let mut rx_peer2 = tx.subscribe(); + + let kp = make_keypair(); + let mut crdt = BaseCrdt::::new(&kp); + let item: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "506_story_multi_peer_test", + "stage": "1_backlog", + "name": "Multi-Peer Test", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let op = crdt.doc.items.insert(ROOT_ID, item).sign(&kp); + + // Broadcast one op. + tx.send(op.clone()).expect("send must succeed"); + + // Both peers receive the same op. + let received1 = rx_peer1.recv().await.expect("peer 1 must receive"); + let received2 = rx_peer2.recv().await.expect("peer 2 must receive"); + assert_eq!(received1, op); + assert_eq!(received2, op); + + // Both encode identically via wire codec. 
+ let bytes1 = crdt_wire::encode(&received1); + let bytes2 = crdt_wire::encode(&received2); + assert_eq!(bytes1, bytes2, "wire-encoded bytes must be identical"); + } + + #[test] + fn disconnected_peer_does_not_panic() { + use bft_json_crdt::json_crdt::BaseCrdt; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use serde_json::json; + use tokio::sync::broadcast; + + use crate::crdt_state::PipelineDoc; + + let (tx, rx) = broadcast::channel::(16); + // Drop the receiver to simulate a peer that disconnected. + drop(rx); + + let kp = make_keypair(); + let mut crdt = BaseCrdt::::new(&kp); + let item: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "506_story_disconnect_test", + "stage": "1_backlog", + "name": "Disconnect Test", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let op = crdt.doc.items.insert(ROOT_ID, item).sign(&kp); + + // Sending to a channel with no receivers returns an error; must not panic. + let _ = tx.send(op); + } + + #[tokio::test] + async fn lagged_peer_gets_lagged_error() { + use bft_json_crdt::json_crdt::BaseCrdt; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use serde_json::json; + use tokio::sync::broadcast; + + use crate::crdt_state::PipelineDoc; + + // Tiny capacity so we can trigger Lagged easily. + let (tx, mut rx) = broadcast::channel::(2); + + let kp = make_keypair(); + let mut crdt = BaseCrdt::::new(&kp); + let item: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "506_story_lag_test", + "stage": "1_backlog", + "name": "Lag Test", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let op1 = crdt.doc.items.insert(ROOT_ID, item).sign(&kp); + crdt.apply(op1.clone()); + + // Overflow the tiny buffer by sending more ops than the capacity. 
+ let op2 = crdt.doc.items[0] + .stage + .set("2_current".to_string()) + .sign(&kp); + crdt.apply(op2.clone()); + let op3 = crdt.doc.items[0].stage.set("3_qa".to_string()).sign(&kp); + crdt.apply(op3.clone()); + let op4 = crdt.doc.items[0].stage.set("4_merge".to_string()).sign(&kp); + crdt.apply(op4.clone()); + + // Send more ops than the channel capacity without consuming. + let _ = tx.send(op1); + let _ = tx.send(op2); + let _ = tx.send(op3); + let _ = tx.send(op4); + + // The slow peer should now see a Lagged error on next recv. + // Consume until we hit Lagged or run out. + let mut got_lagged = false; + for _ in 0..10 { + match rx.recv().await { + Err(broadcast::error::RecvError::Lagged(_)) => { + got_lagged = true; + break; + } + Ok(_) => continue, + Err(broadcast::error::RecvError::Closed) => break, + } + } + assert!( + got_lagged, + "slow peer must receive a Lagged error when channel overflows" + ); + } + + #[tokio::test] + async fn e2e_convergence_two_websocket_nodes() { + use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV, OpState}; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use futures::{SinkExt, StreamExt}; + use serde_json::json; + use std::sync::{Arc, Mutex}; + use tokio::net::TcpListener; + use tokio_tungstenite::tungstenite::Message as TMsg; + use tokio_tungstenite::{accept_async, connect_async}; + + use crate::crdt_state::PipelineDoc; + + // ── Node A: build local state ────────────────────────────────────── + let kp_a = make_keypair(); + let mut crdt_a = BaseCrdt::::new(&kp_a); + + let item: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "508_e2e_convergence", + "stage": "2_current", + "name": "E2E Convergence Test", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let op1 = crdt_a.doc.items.insert(ROOT_ID, item).sign(&kp_a); + crdt_a.apply(op1.clone()); + + // Serialise A's full state as a bulk 
message. + let op1_json = serde_json::to_string(&op1).unwrap(); + let bulk_msg = SyncMessage::Bulk { + ops: vec![op1_json], + }; + let bulk_wire = serde_json::to_string(&bulk_msg).unwrap(); + + // ── Start Node A's WebSocket server on a random port ─────────────── + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + + let bulk_to_send = bulk_wire.clone(); + let received_by_a: Arc>> = Arc::new(Mutex::new(vec![])); + let received_by_a_clone = received_by_a.clone(); + + tokio::spawn(async move { + let (tcp_stream, _) = listener.accept().await.unwrap(); + let ws_stream = accept_async(tcp_stream).await.unwrap(); + let (mut sink, mut stream) = ws_stream.split(); + + // Send bulk state to the connecting peer. + sink.send(TMsg::Text(bulk_to_send.into())).await.unwrap(); + + // Also listen for ops sent by the peer. + if let Some(Ok(TMsg::Text(txt))) = stream.next().await { + received_by_a_clone.lock().unwrap().push(txt.to_string()); + } + }); + + // ── Node B: connect to Node A and exchange state ─────────────────── + let kp_b = make_keypair(); + let mut crdt_b = BaseCrdt::::new(&kp_b); + + let url = format!("ws://{addr}"); + let (ws_b, _) = connect_async(&url).await.unwrap(); + let (mut sink_b, mut stream_b) = ws_b.split(); + + // Node B receives bulk from A. + if let Some(Ok(TMsg::Text(txt))) = stream_b.next().await { + let msg: SyncMessage = serde_json::from_str(txt.as_str()).unwrap(); + match msg { + SyncMessage::Bulk { ops } => { + for op_str in &ops { + let signed: bft_json_crdt::json_crdt::SignedOp = + serde_json::from_str(op_str).unwrap(); + let r = crdt_b.apply(signed); + assert!(r == OpState::Ok || r == OpState::AlreadySeen); + } + } + _ => panic!("Expected Bulk from Node A"), + } + } + + // Node B also creates a new op and sends it to A. 
+ let item_b: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "508_e2e_convergence_b", + "stage": "1_backlog", + "name": "E2E Convergence B", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let op_b1 = crdt_b.doc.items.insert(ROOT_ID, item_b).sign(&kp_b); + crdt_b.apply(op_b1.clone()); + + let op_b1_json = serde_json::to_string(&op_b1).unwrap(); + let msg_to_a = SyncMessage::Op { op: op_b1_json }; + sink_b + .send(TMsg::Text(serde_json::to_string(&msg_to_a).unwrap().into())) + .await + .unwrap(); + + // Wait a moment for Node A to process. + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + + // ── Assert convergence ───────────────────────────────────────────── + + // Node B received Node A's item. + assert_eq!( + crdt_b.doc.items.view().len(), + 2, + "Node B must see both items after sync" + ); + let has_a_item = crdt_b + .doc + .items + .view() + .iter() + .any(|item| item.story_id.view() == JV::String("508_e2e_convergence".to_string())); + assert!(has_a_item, "Node B must have Node A's item"); + + // Node A received Node B's op via the WebSocket. 
+ let a_received = received_by_a.lock().unwrap(); + assert!( + !a_received.is_empty(), + "Node A must have received an op from Node B" + ); + } + + #[tokio::test] + async fn e2e_partition_healing_websocket() { + use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV}; + use bft_json_crdt::keypair::make_keypair; + use bft_json_crdt::op::ROOT_ID; + use futures::{SinkExt, StreamExt}; + use serde_json::json; + use tokio::net::TcpListener; + use tokio_tungstenite::tungstenite::Message as TMsg; + use tokio_tungstenite::{accept_async, connect_async}; + + use crate::crdt_state::PipelineDoc; + + // ── Phase 1: Both nodes start with op_a1 (before partition) ─────── + let kp_a = make_keypair(); + let kp_b = make_keypair(); + let mut crdt_a = BaseCrdt::::new(&kp_a); + let mut crdt_b = BaseCrdt::::new(&kp_b); + + let item_a: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "508_heal_a", + "stage": "1_backlog", + "name": "Heal A", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let op_a1 = crdt_a.doc.items.insert(ROOT_ID, item_a).sign(&kp_a); + crdt_a.apply(op_a1.clone()); + // B also starts with op_a1 (shared state before partition). + crdt_b.apply(op_a1.clone()); + + // ── Phase 2: Partition — each side mutates independently ────────── + // A advances its story stage. + let op_a2 = crdt_a.doc.items[0] + .stage + .set("2_current".to_string()) + .sign(&kp_a); + crdt_a.apply(op_a2.clone()); + + // B inserts a new story that A doesn't know about yet. 
+ let item_b: bft_json_crdt::json_crdt::JsonValue = json!({ + "story_id": "508_heal_b", + "stage": "1_backlog", + "name": "Heal B", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let op_b1 = crdt_b.doc.items.insert(ROOT_ID, item_b).sign(&kp_b); + crdt_b.apply(op_b1.clone()); + + // Collect B's full state as bulk (what it will send on reconnect). + let b_ops: Vec = [&op_a1, &op_b1] + .iter() + .map(|op| serde_json::to_string(op).unwrap()) + .collect(); + let b_bulk_wire = serde_json::to_string(&SyncMessage::Bulk { ops: b_ops }).unwrap(); + + // Collect A's full state as bulk (what it will send on reconnect). + let a_ops: Vec = [&op_a1, &op_a2] + .iter() + .map(|op| serde_json::to_string(op).unwrap()) + .collect(); + let a_bulk_wire = serde_json::to_string(&SyncMessage::Bulk { ops: a_ops }).unwrap(); + + // ── Phase 3: Reconnect — use a real WebSocket to exchange bulk ──── + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + + let a_bulk_to_send = a_bulk_wire.clone(); + let a_received_bulk: std::sync::Arc>> = + std::sync::Arc::new(std::sync::Mutex::new(None)); + let a_received_clone = a_received_bulk.clone(); + + tokio::spawn(async move { + let (tcp, _) = listener.accept().await.unwrap(); + let ws = accept_async(tcp).await.unwrap(); + let (mut sink, mut stream) = ws.split(); + // A sends its bulk state. + sink.send(TMsg::Text(a_bulk_to_send.into())).await.unwrap(); + // A receives B's bulk state. + if let Some(Ok(TMsg::Text(txt))) = stream.next().await { + *a_received_clone.lock().unwrap() = Some(txt.to_string()); + } + }); + + // B connects, exchanges bulk state. + let (ws_b, _) = connect_async(format!("ws://{addr}")).await.unwrap(); + let (mut sink_b, mut stream_b) = ws_b.split(); + + // B receives A's bulk and applies it. 
+ if let Some(Ok(TMsg::Text(txt))) = stream_b.next().await { + let msg: SyncMessage = serde_json::from_str(txt.as_str()).unwrap(); + if let SyncMessage::Bulk { ops } = msg { + for op_str in &ops { + let signed: bft_json_crdt::json_crdt::SignedOp = + serde_json::from_str(op_str).unwrap(); + let _ = crdt_b.apply(signed); + } + } + } + + // B sends its bulk state to A. + sink_b.send(TMsg::Text(b_bulk_wire.into())).await.unwrap(); + + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + + // Apply A's received ops into crdt_a. + if let Some(bulk_str) = a_received_bulk.lock().unwrap().take() { + let msg: SyncMessage = serde_json::from_str(&bulk_str).unwrap(); + if let SyncMessage::Bulk { ops } = msg { + for op_str in &ops { + let signed: bft_json_crdt::json_crdt::SignedOp = + serde_json::from_str(op_str).unwrap(); + let _ = crdt_a.apply(signed); + } + } + } + + // ── Assert convergence ───────────────────────────────────────────── + + // Both nodes must have 2 items. + assert_eq!( + crdt_a.doc.items.view().len(), + 2, + "A must have 2 items after healing" + ); + assert_eq!( + crdt_b.doc.items.view().len(), + 2, + "B must have 2 items after healing" + ); + + // A must see B's story. + let b_story_on_a = crdt_a + .doc + .items + .view() + .iter() + .any(|item| item.story_id.view() == JV::String("508_heal_b".to_string())); + assert!(b_story_on_a, "A must have B's story after healing"); + + // B must see A's stage advance. + let a_story_on_b = crdt_b + .doc + .items + .view() + .iter() + .any(|item| item.story_id.view() == JV::String("508_heal_a".to_string())); + assert!(a_story_on_b, "B must have A's story after healing"); + + // CRDT views must be byte-identical (convergence). 
+ let view_a = serde_json::to_string(&CrdtNode::view(&crdt_a.doc.items)).unwrap(); + let view_b = serde_json::to_string(&CrdtNode::view(&crdt_b.doc.items)).unwrap(); + assert_eq!( + view_a, view_b, + "Both nodes must converge to identical state" + ); + } + + async fn start_auth_listener( + trusted_keys: Vec<String>, + ) -> ( + std::net::SocketAddr, + tokio::sync::oneshot::Receiver<AuthListenerResult>, + ) { + use tokio::net::TcpListener; + use tokio_tungstenite::accept_async; + + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + + let (result_tx, result_rx) = tokio::sync::oneshot::channel(); + + tokio::spawn(async move { + let (tcp_stream, _) = listener.accept().await.unwrap(); + let ws_stream = accept_async(tcp_stream).await.unwrap(); + let (mut sink, mut stream) = ws_stream.split(); + + use tokio_tungstenite::tungstenite::Message as TMsg; + + // Step 1: Send challenge. + let challenge = crate::node_identity::generate_challenge(); + let challenge_msg = super::ChallengeMessage { + r#type: "challenge".to_string(), + nonce: challenge.clone(), + }; + let challenge_json = serde_json::to_string(&challenge_msg).unwrap(); + if sink.send(TMsg::Text(challenge_json.into())).await.is_err() { + let _ = result_tx.send(AuthListenerResult::ConnectionLost); + return; + } + + // Step 2: Await auth reply (10s timeout). 
+ let auth_frame = + tokio::time::timeout(std::time::Duration::from_secs(10), stream.next()).await; + + let auth_text = match auth_frame { + Ok(Some(Ok(TMsg::Text(t)))) => t.to_string(), + Ok(Some(Ok(TMsg::Close(reason)))) => { + let _ = result_tx.send(AuthListenerResult::PeerClosedEarly( + reason.map(|r| r.reason.to_string()), + )); + return; + } + _ => { + let _ = sink + .send(TMsg::Close(Some(tokio_tungstenite::tungstenite::protocol::CloseFrame { + code: tokio_tungstenite::tungstenite::protocol::frame::coding::CloseCode::from(4001), + reason: "auth_timeout".into(), + }))) + .await; + let _ = result_tx.send(AuthListenerResult::AuthTimeout); + return; + } + }; + + // Step 3: Verify. + let auth_msg: super::AuthMessage = match serde_json::from_str(&auth_text) { + Ok(m) => m, + Err(_) => { + let _ = close_listener_auth_failed(&mut sink).await; + let _ = result_tx.send(AuthListenerResult::AuthFailed("bad_json".into())); + return; + } + }; + + let sig_valid = crate::node_identity::verify_challenge( + &auth_msg.pubkey_hex, + &challenge, + &auth_msg.signature_hex, + ); + let key_trusted = trusted_keys.iter().any(|k| k == &auth_msg.pubkey_hex); + + if !sig_valid || !key_trusted { + let _ = close_listener_auth_failed(&mut sink).await; + let _ = result_tx.send(AuthListenerResult::AuthFailed(format!( + "sig_valid={sig_valid}, key_trusted={key_trusted}" + ))); + return; + } + + // Auth passed! Send a bulk state with one op to prove sync works. 
+ let kp = bft_json_crdt::keypair::make_keypair(); + let mut crdt = + bft_json_crdt::json_crdt::BaseCrdt::<crate::crdt_state::PipelineDoc>::new(&kp); + let item: bft_json_crdt::json_crdt::JsonValue = serde_json::json!({ + "story_id": "628_auth_test_item", + "stage": "1_backlog", + "name": "Auth Test", + "agent": "", + "retry_count": 0.0, + "blocked": false, + "depends_on": "", + "claimed_by": "", + "claimed_at": 0.0, + }) + .into(); + let op = crdt + .doc + .items + .insert(bft_json_crdt::op::ROOT_ID, item) + .sign(&kp); + let op_json = serde_json::to_string(&op).unwrap(); + let bulk = super::SyncMessage::Bulk { ops: vec![op_json] }; + let bulk_json = serde_json::to_string(&bulk).unwrap(); + let _ = sink.send(TMsg::Text(bulk_json.into())).await; + + let _ = result_tx.send(AuthListenerResult::Authenticated(auth_msg.pubkey_hex)); + }); + + (addr, result_rx) + } + + async fn close_listener_auth_failed( + sink: &mut futures::stream::SplitSink< + tokio_tungstenite::WebSocketStream<tokio::net::TcpStream>, + tokio_tungstenite::tungstenite::Message, + >, + ) { + use tokio_tungstenite::tungstenite::protocol::CloseFrame; + use tokio_tungstenite::tungstenite::protocol::frame::coding::CloseCode; + let _ = sink + .send(tokio_tungstenite::tungstenite::Message::Close(Some( + CloseFrame { + code: CloseCode::from(4002), + reason: "auth_failed".into(), + }, + ))) + .await; + } + + #[tokio::test] + async fn auth_happy_path_handshake_and_sync() { + use bft_json_crdt::keypair::make_keypair; + use futures::{SinkExt, StreamExt}; + use tokio_tungstenite::connect_async; + use tokio_tungstenite::tungstenite::Message as TMsg; + + let connector_kp = make_keypair(); + let connector_pubkey = crate::node_identity::public_key_hex(&connector_kp); + + // Start listener that trusts the connector's pubkey. + let (addr, result_rx) = start_auth_listener(vec![connector_pubkey.clone()]).await; + + // Connect and do the handshake. 
+ let url = format!("ws://{addr}"); + let (ws, _) = connect_async(&url).await.unwrap(); + let (mut sink, mut stream) = ws.split(); + + // Receive challenge. + let challenge_frame = stream.next().await.unwrap().unwrap(); + let challenge_text = match challenge_frame { + TMsg::Text(t) => t.to_string(), + other => panic!("Expected text frame, got {other:?}"), + }; + let challenge_msg: super::ChallengeMessage = serde_json::from_str(&challenge_text).unwrap(); + assert_eq!(challenge_msg.r#type, "challenge"); + assert_eq!( + challenge_msg.nonce.len(), + 64, + "Challenge must be 64 hex chars" + ); + + // Sign and reply. + let sig = crate::node_identity::sign_challenge(&connector_kp, &challenge_msg.nonce); + let auth_msg = super::AuthMessage { + r#type: "auth".to_string(), + pubkey_hex: connector_pubkey.clone(), + signature_hex: sig, + }; + let auth_json = serde_json::to_string(&auth_msg).unwrap(); + sink.send(TMsg::Text(auth_json.into())).await.unwrap(); + + // After auth, we should receive a bulk sync message with at least one op. + let bulk_frame = tokio::time::timeout(std::time::Duration::from_secs(5), stream.next()) + .await + .expect("should receive bulk within 5s") + .unwrap() + .unwrap(); + + let bulk_text = match bulk_frame { + TMsg::Text(t) => t.to_string(), + other => panic!("Expected bulk text frame, got {other:?}"), + }; + let bulk_msg: super::SyncMessage = serde_json::from_str(&bulk_text).unwrap(); + match bulk_msg { + super::SyncMessage::Bulk { ops } => { + assert!( + !ops.is_empty(), + "Bulk sync must contain at least one op after successful auth" + ); + // Verify we can deserialize the op. + let _signed: bft_json_crdt::json_crdt::SignedOp = + serde_json::from_str(&ops[0]).unwrap(); + } + _ => panic!("Expected Bulk message after auth"), + } + + // Verify listener also reports success. 
+ let listener_result = result_rx.await.unwrap(); + match listener_result { + AuthListenerResult::Authenticated(pubkey) => { + assert_eq!(pubkey, connector_pubkey); + } + other => panic!("Expected Authenticated, got {other:?}"), + } + } + + #[tokio::test] + async fn auth_untrusted_pubkey_rejected() { + use bft_json_crdt::keypair::make_keypair; + use futures::{SinkExt, StreamExt}; + use tokio_tungstenite::connect_async; + use tokio_tungstenite::tungstenite::Message as TMsg; + + let connector_kp = make_keypair(); + let connector_pubkey = crate::node_identity::public_key_hex(&connector_kp); + + // Listener trusts a DIFFERENT key, not the connector's. + let other_kp = make_keypair(); + let other_pubkey = crate::node_identity::public_key_hex(&other_kp); + + let (addr, result_rx) = start_auth_listener(vec![other_pubkey]).await; + + let url = format!("ws://{addr}"); + let (ws, _) = connect_async(&url).await.unwrap(); + let (mut sink, mut stream) = ws.split(); + + // Receive challenge and sign with our (untrusted) key. + let challenge_frame = stream.next().await.unwrap().unwrap(); + let challenge_text = match challenge_frame { + TMsg::Text(t) => t.to_string(), + _ => panic!("Expected text frame"), + }; + let challenge_msg: super::ChallengeMessage = serde_json::from_str(&challenge_text).unwrap(); + + let sig = crate::node_identity::sign_challenge(&connector_kp, &challenge_msg.nonce); + let auth_msg = super::AuthMessage { + r#type: "auth".to_string(), + pubkey_hex: connector_pubkey, + signature_hex: sig, + }; + sink.send(TMsg::Text(serde_json::to_string(&auth_msg).unwrap().into())) + .await + .unwrap(); + + // Should receive a close frame with auth_failed. 
+ let close_frame = tokio::time::timeout(std::time::Duration::from_secs(5), stream.next()) + .await + .expect("should receive close within 5s"); + + match close_frame { + Some(Ok(TMsg::Close(Some(frame)))) => { + assert_eq!( + &*frame.reason, "auth_failed", + "Close reason must be 'auth_failed'" + ); + } + Some(Ok(TMsg::Close(None))) => { + // Some implementations omit the close frame payload — that's acceptable + // as long as no sync data was sent. + } + other => { + // Connection dropped without close frame is also acceptable. + // The key assertion is below: no ops were exchanged. + let _ = other; + } + } + + // Verify listener reports auth failure. + let listener_result = result_rx.await.unwrap(); + match listener_result { + AuthListenerResult::AuthFailed(reason) => { + assert!(reason.contains("key_trusted=false"), "Reason: {reason}"); + } + other => panic!("Expected AuthFailed, got {other:?}"), + } + } + + #[tokio::test] + async fn auth_bad_signature_rejected() { + use bft_json_crdt::keypair::make_keypair; + use futures::{SinkExt, StreamExt}; + use tokio_tungstenite::connect_async; + use tokio_tungstenite::tungstenite::Message as TMsg; + + let legitimate_kp = make_keypair(); + let legitimate_pubkey = crate::node_identity::public_key_hex(&legitimate_kp); + + // A different keypair that will sign the challenge (wrong key). + let impersonator_kp = make_keypair(); + + // Listener trusts the legitimate pubkey. + let (addr, result_rx) = start_auth_listener(vec![legitimate_pubkey.clone()]).await; + + let url = format!("ws://{addr}"); + let (ws, _) = connect_async(&url).await.unwrap(); + let (mut sink, mut stream) = ws.split(); + + // Receive challenge. 
+ let challenge_frame = stream.next().await.unwrap().unwrap(); + let challenge_text = match challenge_frame { + TMsg::Text(t) => t.to_string(), + _ => panic!("Expected text frame"), + }; + let challenge_msg: super::ChallengeMessage = serde_json::from_str(&challenge_text).unwrap(); + + // Sign with the WRONG keypair but claim to be the legitimate pubkey. + let bad_sig = crate::node_identity::sign_challenge(&impersonator_kp, &challenge_msg.nonce); + let auth_msg = super::AuthMessage { + r#type: "auth".to_string(), + pubkey_hex: legitimate_pubkey, + signature_hex: bad_sig, + }; + sink.send(TMsg::Text(serde_json::to_string(&auth_msg).unwrap().into())) + .await + .unwrap(); + + // Should be rejected. + let close_frame = tokio::time::timeout(std::time::Duration::from_secs(5), stream.next()) + .await + .expect("should receive close within 5s"); + + match close_frame { + Some(Ok(TMsg::Close(Some(frame)))) => { + assert_eq!( + &*frame.reason, "auth_failed", + "Close reason must be 'auth_failed' — same as untrusted key" + ); + } + _ => { + // Connection closed is acceptable. + } + } + + // Verify listener reports auth failure with sig_valid=false. + let listener_result = result_rx.await.unwrap(); + match listener_result { + AuthListenerResult::AuthFailed(reason) => { + assert!(reason.contains("sig_valid=false"), "Reason: {reason}"); + } + other => panic!("Expected AuthFailed, got {other:?}"), + } + } + + #[tokio::test] + async fn auth_replay_protection_fresh_nonces() { + use futures::StreamExt; + use tokio::net::TcpListener; + use tokio_tungstenite::tungstenite::Message as TMsg; + use tokio_tungstenite::{accept_async, connect_async}; + + // Start a listener that sends challenges but doesn't complete auth. 
+ let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + + let (nonce_tx, mut nonce_rx) = tokio::sync::mpsc::channel::<String>(2); + + tokio::spawn(async move { + for _ in 0..2 { + let (tcp, _) = listener.accept().await.unwrap(); + let ws = accept_async(tcp).await.unwrap(); + let (mut sink, _stream) = ws.split(); + + let challenge = crate::node_identity::generate_challenge(); + let msg = super::ChallengeMessage { + r#type: "challenge".to_string(), + nonce: challenge.clone(), + }; + let json = serde_json::to_string(&msg).unwrap(); + let _ = sink.send(TMsg::Text(json.into())).await; + let _ = nonce_tx.send(challenge).await; + } + }); + + // Connect twice and collect the nonces. + let mut nonces = Vec::new(); + for _ in 0..2 { + let url = format!("ws://{addr}"); + let (ws, _) = connect_async(&url).await.unwrap(); + let (_sink, mut stream) = ws.split(); + + let frame = stream.next().await.unwrap().unwrap(); + let text = match frame { + TMsg::Text(t) => t.to_string(), + _ => panic!("Expected text"), + }; + let msg: super::ChallengeMessage = serde_json::from_str(&text).unwrap(); + nonces.push(msg.nonce); + // Drop connection so listener accepts the next one. + drop(stream); + } + + // Also collect nonces from the listener side. 
+ let server_nonce_1 = nonce_rx.recv().await.unwrap(); + let server_nonce_2 = nonce_rx.recv().await.unwrap(); + + assert_ne!( + nonces[0], nonces[1], + "Consecutive challenges must be different" + ); + assert_ne!( + server_nonce_1, server_nonce_2, + "Server must generate fresh nonce per accept" + ); + assert_eq!(nonces[0], server_nonce_1, "Client/server nonces must match"); + assert_eq!(nonces[1], server_nonce_2, "Client/server nonces must match"); + } + + #[test] + fn keepalive_constants_are_correct() { + assert_eq!( + super::super::PING_INTERVAL_SECS, + 30, + "Ping interval must be 30 seconds" + ); + assert_eq!( + super::super::PONG_TIMEOUT_SECS, + 60, + "Pong timeout must be 60 seconds" + ); + } + + #[test] + fn agent_mode_heartbeat_interval_unchanged() { + assert_eq!( + crate::agent_mode::SCAN_INTERVAL_SECS, + 15, + "Agent-mode heartbeat interval must remain 15s" + ); + } + + #[test] + fn reconnect_backoff_constants_unchanged() { + assert_eq!( + super::super::RENDEZVOUS_ERROR_THRESHOLD, + 10, + "Backoff threshold must still be 10" + ); + } + + #[tokio::test] + async fn server_sends_ping_to_peer_at_interval() { + use futures::{SinkExt, StreamExt}; + use std::time::Duration; + use tokio::net::TcpListener; + use tokio_tungstenite::tungstenite::Message as TMsg; + use tokio_tungstenite::{accept_async, connect_async}; + + let ping_ms = 100u64; + let timeout_ms = 400u64; + + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + + // Server task: keepalive sender with short intervals. + tokio::spawn(async move { + let (tcp, _) = listener.accept().await.unwrap(); + let ws = accept_async(tcp).await.unwrap(); + let (mut sink, mut stream) = ws.split(); + + let mut pong_deadline = tokio::time::Instant::now() + Duration::from_millis(timeout_ms); + let mut ticker = tokio::time::interval_at( + tokio::time::Instant::now() + Duration::from_millis(ping_ms), + Duration::from_millis(ping_ms), + ); + + loop { + tokio::select! 
{ + _ = ticker.tick() => { + if tokio::time::Instant::now() >= pong_deadline { break; } + if sink.send(TMsg::Ping(bytes::Bytes::new())).await.is_err() { break; } + } + frame = stream.next() => { + match frame { + Some(Ok(TMsg::Pong(_))) => { + pong_deadline = tokio::time::Instant::now() + + Duration::from_millis(timeout_ms); + } + None | Some(Err(_)) => break, + _ => {} + } + } + } + } + }); + + let (ws_client, _) = connect_async(format!("ws://{addr}")).await.unwrap(); + let (_sink_c, mut stream_c) = ws_client.split(); + + // Wait for more than one ping interval. + tokio::time::sleep(Duration::from_millis(ping_ms * 2)).await; + + // Client should receive a Ping from the server. + let frame = tokio::time::timeout(Duration::from_millis(200), stream_c.next()).await; + let got_ping = matches!(frame, Ok(Some(Ok(TMsg::Ping(_))))); + assert!( + got_ping, + "Client must receive a Ping frame from the server after the ping interval" + ); + } + + #[tokio::test] + async fn client_sends_ping_to_server_at_interval() { + use futures::{SinkExt, StreamExt}; + use std::time::Duration; + use tokio::net::TcpListener; + use tokio_tungstenite::tungstenite::Message as TMsg; + use tokio_tungstenite::{accept_async, connect_async}; + + let ping_ms = 100u64; + let timeout_ms = 400u64; + + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + + let (ping_tx, ping_rx) = tokio::sync::oneshot::channel::<()>(); + + // Server task: wait for the first Ping the client sends. 
+ tokio::spawn(async move { + let (tcp, _) = listener.accept().await.unwrap(); + let ws = accept_async(tcp).await.unwrap(); + let (_sink, mut stream) = ws.split(); + loop { + match stream.next().await { + Some(Ok(TMsg::Ping(_))) => { + let _ = ping_tx.send(()); + break; + } + Some(Ok(_)) => continue, + _ => break, + } + } + }); + + let (ws_client, _) = connect_async(format!("ws://{addr}")).await.unwrap(); + let (mut sink_c, mut stream_c) = ws_client.split(); + + // Client keepalive task. + tokio::spawn(async move { + let mut pong_deadline = tokio::time::Instant::now() + Duration::from_millis(timeout_ms); + let mut ticker = tokio::time::interval_at( + tokio::time::Instant::now() + Duration::from_millis(ping_ms), + Duration::from_millis(ping_ms), + ); + loop { + tokio::select! { + _ = ticker.tick() => { + if tokio::time::Instant::now() >= pong_deadline { break; } + if sink_c.send(TMsg::Ping(bytes::Bytes::new())).await.is_err() { break; } + } + frame = stream_c.next() => { + match frame { + Some(Ok(TMsg::Pong(_))) => { + pong_deadline = tokio::time::Instant::now() + + Duration::from_millis(timeout_ms); + } + None | Some(Err(_)) => break, + _ => {} + } + } + } + } + }); + + let result = tokio::time::timeout(Duration::from_millis(ping_ms * 3), ping_rx).await; + assert!( + result.is_ok(), + "Server must receive a Ping from the client after the ping interval" + ); + } + + #[tokio::test] + async fn keepalive_disconnects_when_pong_withheld() { + use futures::{SinkExt, StreamExt}; + use std::time::Duration; + use tokio::net::TcpListener; + use tokio_tungstenite::tungstenite::Message as TMsg; + use tokio_tungstenite::{accept_async, connect_async}; + + let ping_ms = 100u64; + let timeout_ms = 250u64; + + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + + let (done_tx, done_rx) = tokio::sync::oneshot::channel::<bool>(); + + // Server: sends Pings, never receives Pong (client swallows all). 
+ tokio::spawn(async move { + let (tcp, _) = listener.accept().await.unwrap(); + let ws = accept_async(tcp).await.unwrap(); + let (mut sink, mut stream) = ws.split(); + + let pong_deadline = tokio::time::Instant::now() + Duration::from_millis(timeout_ms); + let mut ticker = tokio::time::interval_at( + tokio::time::Instant::now() + Duration::from_millis(ping_ms), + Duration::from_millis(ping_ms), + ); + + let timed_out = loop { + tokio::select! { + _ = ticker.tick() => { + if tokio::time::Instant::now() >= pong_deadline { break true; } + if sink.send(TMsg::Ping(bytes::Bytes::new())).await.is_err() { + break false; + } + } + frame = stream.next() => { + match frame { + Some(Ok(_)) => {} // swallow — no Pong sent + _ => break false, + } + } + } + }; + let _ = done_tx.send(timed_out); + }); + + // Client: connect but never respond to Pings. + let (_ws_client, _) = connect_async(format!("ws://{addr}")).await.unwrap(); + + let result = + tokio::time::timeout(Duration::from_millis(timeout_ms + ping_ms * 3), done_rx).await; + let timed_out = result + .expect("Server must report within expected wall-clock time") + .expect("oneshot intact"); + + assert!( + timed_out, + "Server must disconnect on keepalive timeout when Pong is withheld" + ); + } + + #[tokio::test] + async fn keepalive_connection_survives_with_pong_responses() { + use futures::{SinkExt, StreamExt}; + use std::time::Duration; + use tokio::net::TcpListener; + use tokio_tungstenite::tungstenite::Message as TMsg; + use tokio_tungstenite::{accept_async, connect_async}; + + let ping_ms = 100u64; + let timeout_ms = 250u64; + + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + + let (result_tx, result_rx) = tokio::sync::oneshot::channel::<bool>(); + + // Server: sends Pings, resets deadline on Pong. 
+ tokio::spawn(async move { + let (tcp, _) = listener.accept().await.unwrap(); + let ws = accept_async(tcp).await.unwrap(); + let (mut sink, mut stream) = ws.split(); + + let mut pong_deadline = tokio::time::Instant::now() + Duration::from_millis(timeout_ms); + let mut ticker = tokio::time::interval_at( + tokio::time::Instant::now() + Duration::from_millis(ping_ms), + Duration::from_millis(ping_ms), + ); + + let timed_out = loop { + tokio::select! { + _ = ticker.tick() => { + if tokio::time::Instant::now() >= pong_deadline { break true; } + if sink.send(TMsg::Ping(bytes::Bytes::new())).await.is_err() { + break false; + } + } + frame = stream.next() => { + match frame { + Some(Ok(TMsg::Pong(_))) => { + pong_deadline = tokio::time::Instant::now() + + Duration::from_millis(timeout_ms); + } + None | Some(Err(_)) => break false, // clean close + _ => {} + } + } + } + }; + let _ = result_tx.send(timed_out); + }); + + let (ws_client, _) = connect_async(format!("ws://{addr}")).await.unwrap(); + let (mut sink_c, mut stream_c) = ws_client.split(); + + // Client: respond to every Ping with Pong for several intervals. + let respond_task = tokio::spawn(async move { + while let Some(Ok(msg)) = stream_c.next().await { + if let TMsg::Ping(data) = msg + && sink_c.send(TMsg::Pong(data)).await.is_err() + { + break; + } + } + }); + + // Run for a few intervals, then drop the client. 
+ tokio::time::sleep(Duration::from_millis(ping_ms * 3)).await; + respond_task.abort(); + + let result = tokio::time::timeout(Duration::from_millis(200), result_rx).await; + let timed_out = result.unwrap_or(Ok(false)).unwrap_or(false); + assert!( + !timed_out, + "Server must NOT timeout when the client responds to Pings with Pongs" + ); + } + + #[tokio::test] + async fn two_node_pong_swallow_causes_disconnect_within_timeout() { + use futures::{SinkExt, StreamExt}; + use std::time::Duration; + use tokio::net::TcpListener; + use tokio_tungstenite::tungstenite::Message as TMsg; + use tokio_tungstenite::{accept_async, connect_async}; + + let ping_ms = 100u64; + let timeout_ms = 250u64; + + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + + // Node A (listener): sends Pings, never receives Pong. + let (a_done_tx, a_done_rx) = tokio::sync::oneshot::channel::<bool>(); + tokio::spawn(async move { + let (tcp, _) = listener.accept().await.unwrap(); + let ws = accept_async(tcp).await.unwrap(); + let (mut sink, mut stream) = ws.split(); + + let pong_deadline = tokio::time::Instant::now() + Duration::from_millis(timeout_ms); + let mut ticker = tokio::time::interval_at( + tokio::time::Instant::now() + Duration::from_millis(ping_ms), + Duration::from_millis(ping_ms), + ); + + let timed_out = loop { + tokio::select! { + _ = ticker.tick() => { + if tokio::time::Instant::now() >= pong_deadline { break true; } + if sink.send(TMsg::Ping(bytes::Bytes::new())).await.is_err() { + break false; + } + } + frame = stream.next() => { + match frame { + Some(Ok(_)) => {} // swallow all frames + _ => break false, + } + } + } + }; + let _ = a_done_tx.send(timed_out); + }); + + // Node B: connects, drains frames silently (swallows Pings, never pongs). 
+ let (ws_b, _) = connect_async(format!("ws://{addr}")).await.unwrap(); + let (_sink_b, mut stream_b) = ws_b.split(); + tokio::spawn(async move { while let Some(Ok(_)) = stream_b.next().await {} }); + + let result = + tokio::time::timeout(Duration::from_millis(timeout_ms + ping_ms * 3), a_done_rx).await; + let timed_out = result + .expect("Node A must report within expected wall-clock time") + .expect("channel intact"); + + assert!( + timed_out, + "Node A must disconnect due to keepalive timeout when Node B swallows Pongs" + ); + } +} diff --git a/server/src/crdt_sync/wire.rs b/server/src/crdt_sync/wire.rs new file mode 100644 index 00000000..a95565b3 --- /dev/null +++ b/server/src/crdt_sync/wire.rs @@ -0,0 +1,141 @@ +//! Wire-protocol types for the `/crdt-sync` WebSocket protocol. + +use serde::{Deserialize, Serialize}; + +// ── Wire protocol types ───────────────────────────────────────────── + +/// Auth handshake: challenge sent by the listener to the connector. +#[derive(Serialize, Deserialize, Debug)] +pub(super) struct ChallengeMessage { + pub(super) r#type: String, + pub(super) nonce: String, +} + +/// Auth handshake: auth reply sent by the connector to the listener. +#[derive(Serialize, Deserialize, Debug)] +pub(super) struct AuthMessage { + pub(super) r#type: String, + pub(super) pubkey_hex: String, + pub(super) signature_hex: String, +} + +#[derive(Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub(crate) enum SyncMessage { + /// Bulk state dump sent on connect (v1) or delta ops after clock exchange (v2). + Bulk { ops: Vec<String> }, + /// A single new op. + Op { op: String }, + /// Vector clock exchanged on connect (v2 protocol). + /// + /// Each entry maps a node's hex-encoded Ed25519 public key to the count of + /// ops received from that node. The receiving side computes the delta via + /// [`crdt_state::ops_since`] and sends only the missing ops. 
+ Clock { + clock: std::collections::HashMap<String, u64>, + }, + /// Signals that the bulk-delta phase is complete; the sender is ready for + /// real-time op streaming. Locally-generated ops are buffered until the + /// peer's `Ready` is received, then flushed in-order. + Ready, +} + +/// Crate-visible re-export of `SyncMessage` for backwards-compatibility testing. +/// +/// Used by `crdt_snapshot` tests to verify that snapshot messages are NOT +/// parseable as legacy `SyncMessage` variants — confirming that old peers +/// will gracefully reject them. +#[cfg(test)] +pub(crate) type SyncMessagePublic = SyncMessage; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn sync_message_bulk_serialization_roundtrip() { + let msg = SyncMessage::Bulk { + ops: vec!["op1".to_string(), "op2".to_string()], + }; + let json = serde_json::to_string(&msg).unwrap(); + assert!(json.contains(r#""type":"bulk""#)); + let deserialized: SyncMessage = serde_json::from_str(&json).unwrap(); + match deserialized { + SyncMessage::Bulk { ops } => { + assert_eq!(ops.len(), 2); + assert_eq!(ops[0], "op1"); + assert_eq!(ops[1], "op2"); + } + _ => panic!("Expected Bulk"), + } + } + + #[test] + fn sync_message_op_serialization_roundtrip() { + let msg = SyncMessage::Op { + op: r#"{"inner":{}}"#.to_string(), + }; + let json = serde_json::to_string(&msg).unwrap(); + assert!(json.contains(r#""type":"op""#)); + let deserialized: SyncMessage = serde_json::from_str(&json).unwrap(); + match deserialized { + SyncMessage::Op { op } => { + assert_eq!(op, r#"{"inner":{}}"#); + } + _ => panic!("Expected Op"), + } + } + + #[test] + fn sync_message_bulk_empty_ops() { + let msg = SyncMessage::Bulk { ops: vec![] }; + let json = serde_json::to_string(&msg).unwrap(); + let deserialized: SyncMessage = serde_json::from_str(&json).unwrap(); + match deserialized { + SyncMessage::Bulk { ops } => assert!(ops.is_empty()), + _ => panic!("Expected Bulk"), + } + } + + #[test] + fn sync_message_clock_serialization_roundtrip() { + 
let mut clock = std::collections::HashMap::new(); + clock.insert("aabbcc00".to_string(), 42u64); + clock.insert("ddeeff11".to_string(), 7u64); + + let msg = SyncMessage::Clock { clock }; + let json = serde_json::to_string(&msg).unwrap(); + assert!(json.contains(r#""type":"clock""#)); + + let deserialized: SyncMessage = serde_json::from_str(&json).unwrap(); + match deserialized { + SyncMessage::Clock { clock } => { + assert_eq!(clock["aabbcc00"], 42); + assert_eq!(clock["ddeeff11"], 7); + } + _ => panic!("Expected Clock"), + } + } + + #[test] + fn sync_message_clock_empty() { + let msg = SyncMessage::Clock { + clock: std::collections::HashMap::new(), + }; + let json = serde_json::to_string(&msg).unwrap(); + let deserialized: SyncMessage = serde_json::from_str(&json).unwrap(); + match deserialized { + SyncMessage::Clock { clock } => assert!(clock.is_empty()), + _ => panic!("Expected Clock"), + } + } + + #[test] + fn sync_message_ready_serialization_roundtrip() { + let msg = SyncMessage::Ready; + let json = serde_json::to_string(&msg).unwrap(); + assert_eq!(json, r#"{"type":"ready"}"#); + let deserialized: SyncMessage = serde_json::from_str(&json).unwrap(); + assert!(matches!(deserialized, SyncMessage::Ready)); + } +}