refactor: split crdt_sync.rs into auth/wire/server/dispatch/client modules
The 3672-line crdt_sync.rs is split into a sub-module directory with co-located tests per Rust convention:

- auth.rs: trusted-keys + bearer-token validation (230 lines)
- wire.rs: ChallengeMessage / AuthMessage / SyncMessage types (141 lines)
- server.rs: WebSocket server handler (1680 lines)
- dispatch.rs: incoming-message dispatch + bulk/clock/op handling (1028 lines)
- client.rs: rendezvous client + reconnect/backoff (464 lines)
- mod.rs: doc, cross-cutting constants, re-exports (75 lines)

No behaviour change. All 65 crdt_sync tests pass; the full suite is green (2635 tests with --test-threads=1).
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,230 @@
|
|||||||
|
//! Auth: trusted-key allow-list and bearer-token validation for `/crdt-sync` connections.
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::OnceLock;
|
||||||
|
|
||||||
|
/// Process-wide allow-list of trusted public keys.
///
/// The cell is written at most once (see [`init_trusted_keys`]); while it is
/// still empty, [`trusted_keys`] reports an empty slice.
static TRUSTED_KEYS: OnceLock<Vec<String>> = OnceLock::new();
|
||||||
|
|
||||||
|
/// Initialise the trusted-key allow-list for connect-time mutual auth.
|
||||||
|
///
|
||||||
|
/// Must be called once at startup before any WebSocket connections are
|
||||||
|
/// accepted. Subsequent calls are no-ops (OnceLock).
|
||||||
|
pub fn init_trusted_keys(keys: Vec<String>) {
|
||||||
|
let _ = TRUSTED_KEYS.set(keys);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return a reference to the trusted-key allow-list.
|
||||||
|
pub(super) fn trusted_keys() -> &'static [String] {
|
||||||
|
TRUSTED_KEYS.get().map(|v| v.as_slice()).unwrap_or(&[])
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Bearer-token auth ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Time-to-live for CRDT bearer tokens in seconds (30 days).
|
||||||
|
pub(super) const TOKEN_TTL_SECS: f64 = 30.0 * 24.0 * 3600.0;
|
||||||
|
|
||||||
|
/// Whether a bearer token is required for `/crdt-sync` connections.
|
||||||
|
/// `None` (uninitialised) → open access (backward compatible).
|
||||||
|
pub(super) static REQUIRE_TOKEN: OnceLock<bool> = OnceLock::new();
|
||||||
|
|
||||||
|
/// Store of accepted bearer tokens.
///
/// Keys are the token strings; each value is that token's expiry as a unix
/// timestamp in seconds. The map is created lazily on first use.
static CRDT_TOKENS: OnceLock<std::sync::RwLock<HashMap<String, f64>>> = OnceLock::new();
|
||||||
|
|
||||||
|
/// Initialise bearer-token auth for CRDT-sync connections.
|
||||||
|
///
|
||||||
|
/// Must be called once at startup before any WebSocket connections are accepted.
|
||||||
|
/// When `require` is `true`, clients must supply a valid `?token=` query
|
||||||
|
/// parameter on the upgrade request or receive HTTP 401. When `require` is
|
||||||
|
/// `false` (default) a token is optional — connections without one are
|
||||||
|
/// accepted, but a supplied token is still validated.
|
||||||
|
pub fn init_token_auth(require: bool, tokens: Vec<String>) {
|
||||||
|
let _ = REQUIRE_TOKEN.set(require);
|
||||||
|
let store = CRDT_TOKENS.get_or_init(|| std::sync::RwLock::new(HashMap::new()));
|
||||||
|
if let Ok(mut map) = store.write() {
|
||||||
|
let now = chrono::Utc::now().timestamp() as f64;
|
||||||
|
for token in tokens {
|
||||||
|
map.insert(token, now + TOKEN_TTL_SECS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a bearer token to the CRDT-sync token store.
|
||||||
|
///
|
||||||
|
/// The token expires after [`TOKEN_TTL_SECS`] seconds. Returns the expiry
|
||||||
|
/// unix timestamp so callers can surface it in admin tooling.
|
||||||
|
pub fn add_join_token(token: String) -> f64 {
|
||||||
|
let store = CRDT_TOKENS.get_or_init(|| std::sync::RwLock::new(HashMap::new()));
|
||||||
|
let now = chrono::Utc::now().timestamp() as f64;
|
||||||
|
let expires_at = now + TOKEN_TTL_SECS;
|
||||||
|
if let Ok(mut map) = store.write() {
|
||||||
|
map.insert(token, expires_at);
|
||||||
|
}
|
||||||
|
expires_at
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Validate a bearer token against the CRDT-sync token store.
|
||||||
|
///
|
||||||
|
/// Returns `true` if the token exists in the store and has not expired.
|
||||||
|
pub(super) fn validate_join_token(token: &str) -> bool {
|
||||||
|
let Some(store) = CRDT_TOKENS.get() else {
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
let now = chrono::Utc::now().timestamp() as f64;
|
||||||
|
store
|
||||||
|
.read()
|
||||||
|
.ok()
|
||||||
|
.and_then(|map| map.get(token).copied())
|
||||||
|
.is_some_and(|expires_at| expires_at > now)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    #[allow(unused_imports)]
    use super::*;

    /// Test-local model of the accept/reject decision for an upgrade request.
    ///
    /// A supplied token is rejected when it fails validation; a missing token
    /// is rejected only when tokens are required.
    /// NOTE(review): this is a copy of the rule used by these tests, not the
    /// production check — confirm it matches the server handler.
    fn should_reject(require_token: bool, token: Option<&str>) -> bool {
        match token {
            Some(t) => !super::validate_join_token(t),
            None => require_token,
        }
    }

    // ── Trusted-key configuration ───────────────────────────────────

    #[test]
    fn config_trusted_keys_parsed_from_toml() {
        let source = r#"
trusted_keys = [
"aabbccdd00112233aabbccdd00112233aabbccdd00112233aabbccdd00112233",
"11223344556677881122334455667788112233445566778811223344556677ab",
]

[[agent]]
name = "test"
"#;
        let parsed: crate::config::ProjectConfig =
            crate::config::ProjectConfig::parse(source).unwrap();
        assert_eq!(parsed.trusted_keys.len(), 2);
        assert_eq!(
            parsed.trusted_keys[0],
            "aabbccdd00112233aabbccdd00112233aabbccdd00112233aabbccdd00112233"
        );
    }

    #[test]
    fn config_trusted_keys_defaults_to_empty() {
        let defaults = crate::config::ProjectConfig::default();
        assert!(
            defaults.trusted_keys.is_empty(),
            "trusted_keys must default to empty (reject all)"
        );
    }

    // ── Token store ─────────────────────────────────────────────────

    #[test]
    fn valid_token_passes_validation() {
        // A random suffix keeps this test independent of the shared store's
        // other contents.
        let fresh = format!("test-valid-{}", uuid::Uuid::new_v4());
        super::add_join_token(fresh.clone());
        let accepted = super::validate_join_token(&fresh);
        assert!(accepted, "A freshly added token must pass validation");
    }

    #[test]
    fn bogus_token_fails_validation() {
        let accepted = super::validate_join_token("this-token-was-never-added-to-the-store");
        assert!(!accepted, "An unknown token must fail validation");
    }

    #[test]
    fn expired_token_fails_validation() {
        // Plant a token whose expiry is already long past
        // (1.0 == 1970-01-01T00:00:01Z), bypassing `add_join_token`.
        let stale = format!("test-expired-{}", uuid::Uuid::new_v4());
        let store = super::CRDT_TOKENS
            .get_or_init(|| std::sync::RwLock::new(std::collections::HashMap::new()));
        {
            let mut map = store.write().unwrap();
            map.insert(stale.clone(), 1.0_f64);
        }
        let accepted = super::validate_join_token(&stale);
        assert!(!accepted, "An expired token must fail validation");
    }

    #[test]
    fn no_token_with_require_true_is_rejected() {
        // require_token = true and no token supplied → reject.
        assert!(
            should_reject(true, None),
            "Missing token must be rejected when token is required"
        );
    }

    #[test]
    fn no_token_with_require_false_is_accepted() {
        // require_token = false and no token supplied → accept.
        assert!(
            !should_reject(false, None),
            "Missing token must be accepted in open mode"
        );
    }

    #[test]
    fn add_join_token_returns_future_expiry() {
        let now = chrono::Utc::now().timestamp() as f64;
        let expires_at = super::add_join_token(format!("test-expiry-{}", uuid::Uuid::new_v4()));
        assert!(
            expires_at > now,
            "Expiry timestamp must be in the future (got {expires_at}, now={now})"
        );
    }

    #[test]
    fn token_ttl_is_thirty_days() {
        let thirty_days = 30.0 * 24.0 * 3600.0;
        assert_eq!(
            super::TOKEN_TTL_SECS,
            thirty_days,
            "TOKEN_TTL_SECS must be 30 days"
        );
    }

    // ── Token configuration ─────────────────────────────────────────

    #[test]
    fn config_crdt_require_token_defaults_to_false() {
        let defaults = crate::config::ProjectConfig::default();
        assert!(
            !defaults.crdt_require_token,
            "crdt_require_token must default to false (open access)"
        );
    }

    #[test]
    fn config_crdt_tokens_defaults_to_empty() {
        let defaults = crate::config::ProjectConfig::default();
        assert!(
            defaults.crdt_tokens.is_empty(),
            "crdt_tokens must default to empty"
        );
    }

    #[test]
    fn config_crdt_token_fields_parsed_from_toml() {
        let source = r#"
crdt_require_token = true
crdt_tokens = ["token-abc", "token-xyz"]

[[agent]]
name = "test"
"#;
        let parsed: crate::config::ProjectConfig = toml::from_str(source).unwrap();
        assert!(parsed.crdt_require_token);
        assert_eq!(parsed.crdt_tokens, vec!["token-abc", "token-xyz"]);
    }
}
|
||||||
@@ -0,0 +1,464 @@
|
|||||||
|
//! Rendezvous client: connect to a remote peer, authenticate, and exchange CRDT ops.
|
||||||
|
|
||||||
|
use bft_json_crdt::json_crdt::SignedOp;
|
||||||
|
use futures::{SinkExt, StreamExt};
|
||||||
|
|
||||||
|
use crate::crdt_state;
|
||||||
|
use crate::crdt_wire;
|
||||||
|
use crate::slog;
|
||||||
|
use crate::slog_error;
|
||||||
|
use crate::slog_warn;
|
||||||
|
|
||||||
|
use super::auth;
|
||||||
|
use super::dispatch::{handle_incoming_binary, handle_incoming_text};
|
||||||
|
use super::wire::{AuthMessage, ChallengeMessage, SyncMessage};
|
||||||
|
use super::{AUTH_TIMEOUT_SECS, PING_INTERVAL_SECS, PONG_TIMEOUT_SECS};
|
||||||
|
|
||||||
|
#[allow(unused_imports)]
|
||||||
|
use auth::{add_join_token, init_token_auth}; // needed by tests
|
||||||
|
|
||||||
|
// ── Rendezvous client ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// How many connection failures in a row it takes before the rendezvous
/// client logs at ERROR instead of WARN.
pub const RENDEZVOUS_ERROR_THRESHOLD: u32 = 10;
|
||||||
|
|
||||||
|
/// Spawn a background task that connects to the configured rendezvous
|
||||||
|
/// peer and exchanges CRDT ops bidirectionally.
|
||||||
|
///
|
||||||
|
/// The client reconnects with exponential backoff if the connection drops.
|
||||||
|
/// Individual failures are logged at WARN; after [`RENDEZVOUS_ERROR_THRESHOLD`]
|
||||||
|
/// consecutive failures the log level escalates to ERROR.
|
||||||
|
///
|
||||||
|
/// When `token` is provided it is appended to the upgrade URL as
|
||||||
|
/// `?token=<token>` so the server's bearer-token check is satisfied. This
|
||||||
|
/// reuses the existing `--join-token` / `HUSKIES_JOIN_TOKEN` plumbing on the
|
||||||
|
/// agent side.
|
||||||
|
pub fn spawn_rendezvous_client(url: String, token: Option<String>) {
|
||||||
|
tokio::spawn(async move {
|
||||||
|
let mut backoff_secs = 1u64;
|
||||||
|
let mut consecutive_failures: u32 = 0;
|
||||||
|
loop {
|
||||||
|
slog!("[crdt-sync] Connecting to rendezvous peer: {url}");
|
||||||
|
match connect_and_sync(&url, token.as_deref()).await {
|
||||||
|
Ok(()) => {
|
||||||
|
slog!("[crdt-sync] Rendezvous connection closed cleanly");
|
||||||
|
backoff_secs = 1;
|
||||||
|
consecutive_failures = 0;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
consecutive_failures += 1;
|
||||||
|
if consecutive_failures >= RENDEZVOUS_ERROR_THRESHOLD {
|
||||||
|
slog_error!(
|
||||||
|
"[crdt-sync] Rendezvous peer unreachable ({consecutive_failures} consecutive failures): {e}"
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
slog_warn!(
|
||||||
|
"[crdt-sync] Rendezvous connection error (attempt {consecutive_failures}): {e}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
slog!("[crdt-sync] Reconnecting in {backoff_secs}s...");
|
||||||
|
tokio::time::sleep(std::time::Duration::from_secs(backoff_secs)).await;
|
||||||
|
backoff_secs = (backoff_secs * 2).min(30);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Connect to a remote sync endpoint and exchange ops until disconnect.
|
||||||
|
///
|
||||||
|
/// When `token` is supplied it is appended as `?token=<token>` to the
|
||||||
|
/// connection URL so the server's bearer-token check passes.
|
||||||
|
pub(crate) async fn connect_and_sync(url: &str, token: Option<&str>) -> Result<(), String> {
|
||||||
|
let connect_url = match token {
|
||||||
|
Some(t) => {
|
||||||
|
if url.contains('?') {
|
||||||
|
format!("{url}&token={t}")
|
||||||
|
} else {
|
||||||
|
format!("{url}?token={t}")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => url.to_string(),
|
||||||
|
};
|
||||||
|
let (ws_stream, _) = tokio_tungstenite::connect_async(connect_url.as_str())
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("WebSocket connect failed: {e}"))?;
|
||||||
|
|
||||||
|
let (mut sink, mut stream) = ws_stream.split();
|
||||||
|
|
||||||
|
slog!("[crdt-sync] Connected to rendezvous peer, awaiting challenge");
|
||||||
|
|
||||||
|
// ── Step 1: Receive challenge from listener ───────────────────
|
||||||
|
use tokio_tungstenite::tungstenite::Message as TungsteniteMsg;
|
||||||
|
|
||||||
|
let challenge_frame = tokio::time::timeout(
|
||||||
|
std::time::Duration::from_secs(AUTH_TIMEOUT_SECS),
|
||||||
|
stream.next(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.map_err(|_| "Auth timeout waiting for challenge".to_string())?
|
||||||
|
.ok_or_else(|| "Connection closed before challenge".to_string())?
|
||||||
|
.map_err(|e| format!("WebSocket read error: {e}"))?;
|
||||||
|
|
||||||
|
let challenge_text = match challenge_frame {
|
||||||
|
TungsteniteMsg::Text(t) => t.to_string(),
|
||||||
|
_ => return Err("Expected text frame for challenge".to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let challenge_msg: ChallengeMessage = serde_json::from_str(&challenge_text)
|
||||||
|
.map_err(|e| format!("Invalid challenge message: {e}"))?;
|
||||||
|
|
||||||
|
if challenge_msg.r#type != "challenge" {
|
||||||
|
return Err(format!(
|
||||||
|
"Expected challenge message, got type={}",
|
||||||
|
challenge_msg.r#type
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Step 2: Sign challenge and send auth reply ────────────────
|
||||||
|
let (pubkey_hex, signature_hex) = crdt_state::sign_challenge(&challenge_msg.nonce)
|
||||||
|
.ok_or_else(|| "CRDT not initialised — cannot sign challenge".to_string())?;
|
||||||
|
|
||||||
|
let auth_msg = AuthMessage {
|
||||||
|
r#type: "auth".to_string(),
|
||||||
|
pubkey_hex,
|
||||||
|
signature_hex,
|
||||||
|
};
|
||||||
|
let auth_json = serde_json::to_string(&auth_msg).map_err(|e| format!("Serialize auth: {e}"))?;
|
||||||
|
sink.send(TungsteniteMsg::Text(auth_json.into()))
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("Send auth failed: {e}"))?;
|
||||||
|
|
||||||
|
slog!("[crdt-sync] Auth reply sent, waiting for sync data");
|
||||||
|
|
||||||
|
// v2 protocol: send our vector clock.
|
||||||
|
let our_clock = crdt_state::our_vector_clock().unwrap_or_default();
|
||||||
|
let clock_msg = SyncMessage::Clock { clock: our_clock };
|
||||||
|
if let Ok(json) = serde_json::to_string(&clock_msg) {
|
||||||
|
sink.send(TungsteniteMsg::Text(json.into()))
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("Send clock failed: {e}"))?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for the server's first sync message.
|
||||||
|
let first_msg = tokio::time::timeout(
|
||||||
|
std::time::Duration::from_secs(AUTH_TIMEOUT_SECS),
|
||||||
|
wait_for_rendezvous_sync_text(&mut stream),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.map_err(|_| "Timeout waiting for server sync message".to_string())?;
|
||||||
|
|
||||||
|
match first_msg {
|
||||||
|
Some(SyncMessage::Clock { clock: peer_clock }) => {
|
||||||
|
// v2 server — send only the ops the server is missing.
|
||||||
|
let delta = crdt_state::ops_since(&peer_clock).unwrap_or_default();
|
||||||
|
slog!(
|
||||||
|
"[crdt-sync] v2 delta sync: sending {} ops to server (server missing)",
|
||||||
|
delta.len()
|
||||||
|
);
|
||||||
|
let msg = SyncMessage::Bulk { ops: delta };
|
||||||
|
if let Ok(json) = serde_json::to_string(&msg) {
|
||||||
|
sink.send(TungsteniteMsg::Text(json.into()))
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("Send delta failed: {e}"))?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some(SyncMessage::Bulk { ops }) => {
|
||||||
|
// v1 server — apply their bulk and send our full bulk.
|
||||||
|
let mut applied = 0u64;
|
||||||
|
for op_json in &ops {
|
||||||
|
if let Ok(signed_op) = serde_json::from_str::<SignedOp>(op_json)
|
||||||
|
&& crdt_state::apply_remote_op(signed_op)
|
||||||
|
{
|
||||||
|
applied += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
slog!(
|
||||||
|
"[crdt-sync] v1 bulk sync: received {} ops from server, applied {applied}",
|
||||||
|
ops.len()
|
||||||
|
);
|
||||||
|
if let Some(all) = crdt_state::all_ops_json() {
|
||||||
|
let msg = SyncMessage::Bulk { ops: all };
|
||||||
|
if let Ok(json) = serde_json::to_string(&msg) {
|
||||||
|
sink.send(TungsteniteMsg::Text(json.into()))
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("Send bulk failed: {e}"))?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
// Fallback — send full bulk.
|
||||||
|
slog!("[crdt-sync] No sync message from server; sending full bulk as fallback");
|
||||||
|
if let Some(all) = crdt_state::all_ops_json() {
|
||||||
|
let msg = SyncMessage::Bulk { ops: all };
|
||||||
|
if let Ok(json) = serde_json::to_string(&msg) {
|
||||||
|
sink.send(TungsteniteMsg::Text(json.into()))
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("Send bulk failed: {e}"))?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bulk-delta phase complete — signal the server that we are ready for
|
||||||
|
// real-time op streaming.
|
||||||
|
if let Ok(json) = serde_json::to_string(&SyncMessage::Ready) {
|
||||||
|
sink.send(TungsteniteMsg::Text(json.into()))
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("Send ready failed: {e}"))?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Subscribe to new local ops.
|
||||||
|
let Some(mut op_rx) = crdt_state::subscribe_ops() else {
|
||||||
|
return Err("CRDT not initialised".to_string());
|
||||||
|
};
|
||||||
|
|
||||||
|
// Buffer for locally-generated ops produced before the server's `ready`
|
||||||
|
// arrives. Flushed in-order once the server signals catch-up.
|
||||||
|
let mut peer_ready = false;
|
||||||
|
let mut op_buffer: Vec<bft_json_crdt::json_crdt::SignedOp> = Vec::new();
|
||||||
|
|
||||||
|
// ── Keepalive state ───────────────────────────────────────────────
|
||||||
|
let mut pong_deadline =
|
||||||
|
tokio::time::Instant::now() + std::time::Duration::from_secs(PONG_TIMEOUT_SECS);
|
||||||
|
let mut ping_ticker = tokio::time::interval_at(
|
||||||
|
tokio::time::Instant::now() + std::time::Duration::from_secs(PING_INTERVAL_SECS),
|
||||||
|
std::time::Duration::from_secs(PING_INTERVAL_SECS),
|
||||||
|
);
|
||||||
|
|
||||||
|
loop {
|
||||||
|
tokio::select! {
|
||||||
|
// Send periodic Ping and enforce Pong timeout.
|
||||||
|
_ = ping_ticker.tick() => {
|
||||||
|
if tokio::time::Instant::now() >= pong_deadline {
|
||||||
|
slog_warn!(
|
||||||
|
"[crdt-sync] No pong from rendezvous peer {} in {}s; disconnecting",
|
||||||
|
url,
|
||||||
|
PONG_TIMEOUT_SECS
|
||||||
|
);
|
||||||
|
return Err(format!(
|
||||||
|
"Keepalive timeout: no pong from {url} in {PONG_TIMEOUT_SECS}s"
|
||||||
|
));
|
||||||
|
}
|
||||||
|
use tokio_tungstenite::tungstenite::Message as TungsteniteMsg;
|
||||||
|
if sink.send(TungsteniteMsg::Ping(bytes::Bytes::new())).await.is_err() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result = op_rx.recv() => {
|
||||||
|
match result {
|
||||||
|
Ok(signed_op) => {
|
||||||
|
if peer_ready {
|
||||||
|
// Encode via wire codec and send as binary frame.
|
||||||
|
let bytes = crdt_wire::encode(&signed_op);
|
||||||
|
use tokio_tungstenite::tungstenite::Message as TungsteniteMsg;
|
||||||
|
if sink.send(TungsteniteMsg::Binary(bytes.into())).await.is_err() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Buffer until the server signals ready.
|
||||||
|
op_buffer.push(signed_op);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => {
|
||||||
|
slog!("[crdt-sync] Slow rendezvous link lagged {n} ops; disconnecting");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
Err(_) => break,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
frame = stream.next() => {
|
||||||
|
match frame {
|
||||||
|
Some(Ok(tokio_tungstenite::tungstenite::Message::Pong(_))) => {
|
||||||
|
// Reset the pong deadline on every Pong received.
|
||||||
|
pong_deadline = tokio::time::Instant::now()
|
||||||
|
+ std::time::Duration::from_secs(PONG_TIMEOUT_SECS);
|
||||||
|
}
|
||||||
|
Some(Ok(tokio_tungstenite::tungstenite::Message::Ping(_))) => {
|
||||||
|
// tungstenite auto-responds to Ping with Pong at the
|
||||||
|
// protocol level; no manual response needed here.
|
||||||
|
}
|
||||||
|
Some(Ok(tokio_tungstenite::tungstenite::Message::Text(text))) => {
|
||||||
|
// Check for the ready signal before other text frames.
|
||||||
|
if let Ok(SyncMessage::Ready) = serde_json::from_str(text.as_ref()) {
|
||||||
|
peer_ready = true;
|
||||||
|
slog!("[crdt-sync] Server ready; flushing {} buffered ops", op_buffer.len());
|
||||||
|
let mut flush_ok = true;
|
||||||
|
for op in op_buffer.drain(..) {
|
||||||
|
let bytes = crdt_wire::encode(&op);
|
||||||
|
use tokio_tungstenite::tungstenite::Message as TungsteniteMsg;
|
||||||
|
if sink.send(TungsteniteMsg::Binary(bytes.into())).await.is_err() {
|
||||||
|
flush_ok = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !flush_ok {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
handle_incoming_text(text.as_ref());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some(Ok(tokio_tungstenite::tungstenite::Message::Binary(bytes))) => {
|
||||||
|
// Real-time op — applied immediately regardless of ready state.
|
||||||
|
handle_incoming_binary(&bytes);
|
||||||
|
}
|
||||||
|
Some(Ok(tokio_tungstenite::tungstenite::Message::Close(_))) | None => break,
|
||||||
|
Some(Err(e)) => {
|
||||||
|
slog!("[crdt-sync] Rendezvous read error: {e}");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Wait for the next text-frame sync message from a tungstenite stream,
|
||||||
|
/// handling Ping/Pong transparently.
|
||||||
|
///
|
||||||
|
/// Returns `None` on connection close or read error.
|
||||||
|
async fn wait_for_rendezvous_sync_text(
|
||||||
|
stream: &mut futures::stream::SplitStream<
|
||||||
|
tokio_tungstenite::WebSocketStream<
|
||||||
|
tokio_tungstenite::MaybeTlsStream<tokio::net::TcpStream>,
|
||||||
|
>,
|
||||||
|
>,
|
||||||
|
) -> Option<SyncMessage> {
|
||||||
|
use tokio_tungstenite::tungstenite::Message as TungsteniteMsg;
|
||||||
|
loop {
|
||||||
|
match stream.next().await {
|
||||||
|
Some(Ok(TungsteniteMsg::Text(text))) => {
|
||||||
|
return serde_json::from_str(text.as_ref()).ok();
|
||||||
|
}
|
||||||
|
Some(Ok(TungsteniteMsg::Ping(_) | TungsteniteMsg::Pong(_))) => continue,
|
||||||
|
_ => return None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Tests ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    #[allow(unused_imports)]
    use super::*;

    /// Test-local copy of the rule for attaching a token to a URL: `&` when
    /// the URL already contains a `?`, `?` otherwise.
    fn with_token(base: &str, token: &str) -> String {
        if base.contains('?') {
            format!("{base}&token={token}")
        } else {
            format!("{base}?token={token}")
        }
    }

    // ── Configuration ───────────────────────────────────────────────

    #[test]
    fn config_rendezvous_parsed_from_toml() {
        let source = r#"
rendezvous = "ws://remote:3001/crdt-sync"

[[agent]]
name = "test"
"#;
        let parsed: crate::config::ProjectConfig = toml::from_str(source).unwrap();
        assert_eq!(
            parsed.rendezvous.as_deref(),
            Some("ws://remote:3001/crdt-sync")
        );
    }

    #[test]
    fn config_rendezvous_defaults_to_none() {
        let defaults = crate::config::ProjectConfig::default();
        assert!(defaults.rendezvous.is_none());
    }

    // ── Failure counter / log escalation ────────────────────────────

    #[test]
    fn failure_counter_warn_below_threshold() {
        let threshold = RENDEZVOUS_ERROR_THRESHOLD;

        // Failure counts 1 through threshold-1 all stay under the ERROR threshold.
        for consecutive_failures in 1..threshold {
            assert!(
                consecutive_failures < threshold,
                "failure {consecutive_failures} must be below ERROR threshold {threshold}"
            );
        }
    }

    #[test]
    fn failure_counter_error_at_threshold() {
        let threshold = RENDEZVOUS_ERROR_THRESHOLD;
        let consecutive_failures: u32 = threshold;
        assert!(
            consecutive_failures >= threshold,
            "failure {consecutive_failures} must reach or exceed ERROR threshold {threshold}"
        );
    }

    #[test]
    fn failure_counter_resets_on_success() {
        let threshold = RENDEZVOUS_ERROR_THRESHOLD;

        // Start from a sustained-failure state.
        let mut consecutive_failures: u32 = threshold + 5;
        assert!(consecutive_failures >= threshold);

        // A clean reconnect zeroes the counter.
        consecutive_failures = 0;
        assert_eq!(
            consecutive_failures, 0,
            "counter must reset to 0 on success"
        );

        // The next error counts as attempt 1, far below the ERROR threshold.
        consecutive_failures += 1;
        assert!(
            consecutive_failures < threshold,
            "first failure after reset must be below ERROR threshold"
        );
    }

    #[test]
    fn error_threshold_is_ten() {
        assert_eq!(
            RENDEZVOUS_ERROR_THRESHOLD,
            10,
            "ERROR escalation threshold must be 10 consecutive failures"
        );
    }

    // ── Connection URL ──────────────────────────────────────────────

    #[test]
    fn rendezvous_url_with_token_appended() {
        let token = "my-secret-token";

        // No query string yet: the token starts one.
        assert_eq!(
            with_token("ws://host:3001/crdt-sync", token),
            "ws://host:3001/crdt-sync?token=my-secret-token"
        );

        // With existing query params.
        assert_eq!(
            with_token("ws://host:3001/crdt-sync?foo=bar", token),
            "ws://host:3001/crdt-sync?foo=bar&token=my-secret-token"
        );
    }

    #[test]
    fn rendezvous_url_without_token_unchanged() {
        let base = "ws://host:3001/crdt-sync";
        let token: Option<&str> = None;
        let connect_url = token.map_or_else(|| base.to_string(), |t| format!("{base}?token={t}"));
        assert_eq!(connect_url, base);
    }
}
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,75 @@
|
|||||||
|
//! CRDT sync — WebSocket-based replication of pipeline state between huskies nodes.
|
||||||
|
//!
//! WebSocket-based CRDT sync layer for replicating pipeline state between
//! huskies nodes.
//!
//! # Protocol
//!
//! ## Version negotiation
//!
//! After the auth handshake, both sides send their first sync message:
//!
//! - **v2 peers** send a `clock` frame: `{"type":"clock","clock":{ <node_id_hex>: <max_count>, ... }}`
//!   containing a vector clock that maps each author's hex Ed25519 pubkey to the
//!   count of ops received from that author. Upon receiving the peer's clock,
//!   each side computes the delta via [`crate::crdt_state::ops_since`] and sends
//!   only the missing ops as a `bulk` frame.
//!
//! - **v1 (legacy) peers** send a `bulk` frame directly (full op dump).
//!   A v2 peer receiving a `bulk` first (instead of a `clock`) falls back to
//!   the full-dump path: it applies the incoming bulk and responds with its own
//!   full bulk. This preserves backward compatibility — no code change is
//!   needed on the v1 side.
//!
//! ## Text frames
//! A JSON object with a `"type"` field:
//! - `{"type":"clock","clock":{...}}` — Vector clock (v2 protocol).
//! - `{"type":"bulk","ops":[...]}` — Ops dump (full or delta).
//! - `{"type":"ready"}` — Signals that the bulk-delta phase is complete and the
//!   sender is ready for real-time op streaming. Locally-generated ops are
//!   buffered until the peer's `ready` is received, then flushed in order.
//!
//! ## Binary frames (real-time op broadcast)
//! Individual `SignedOp`s encoded via [`crate::crdt_wire`] (versioned JSON
//! envelope: `{"v":1,"op":{...}}`). Each locally-applied op is immediately
//! broadcast as a binary frame to all connected peers.
//!
//! Both the server endpoint and the rendezvous client use the same protocol,
//! making the connection fully symmetric.
//!
//! ## Backpressure
//! Each connected peer has its own [`tokio::sync::broadcast`] receiver. If a
//! slow peer allows the channel to fill (indicated by a `Lagged` error), the
//! connection is dropped with a warning log. The peer can reconnect and
//! receive a fresh bulk state dump to catch up.
|
||||||
|
|
||||||
|
// ── Cross-cutting constants ─────────────────────────────────────────
|
||||||
|
|
||||||
|
// ── Auth configuration ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Default timeout for the auth handshake (seconds).
|
||||||
|
pub(super) const AUTH_TIMEOUT_SECS: u64 = 10;
|
||||||
|
|
||||||
|
// ── Keepalive configuration ─────────────────────────────────────────
|
||||||
|
|
||||||
|
/// How often, in seconds, each side of a connection sends a WebSocket Ping frame.
pub const PING_INTERVAL_SECS: u64 = 30;
|
||||||
|
|
||||||
|
/// How long, in seconds, a connection may go without receiving a Pong before
/// it is dropped.
pub const PONG_TIMEOUT_SECS: u64 = 60;
|
||||||
|
|
||||||
|
// ── Sub-modules ─────────────────────────────────────────────────────
|
||||||
|
mod auth;
|
||||||
|
mod client;
|
||||||
|
mod dispatch;
|
||||||
|
mod server;
|
||||||
|
mod wire;
|
||||||
|
|
||||||
|
// ── Public API re-exports ───────────────────────────────────────────
|
||||||
|
pub use auth::{add_join_token, init_token_auth, init_trusted_keys};
|
||||||
|
pub(crate) use client::connect_and_sync;
|
||||||
|
pub use client::{RENDEZVOUS_ERROR_THRESHOLD, spawn_rendezvous_client};
|
||||||
|
pub use server::crdt_sync_handler;
|
||||||
|
|
||||||
|
// Test-only re-export used by `crdt_snapshot` tests.
|
||||||
|
#[cfg(test)]
|
||||||
|
pub(crate) use wire::SyncMessagePublic;
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,141 @@
|
|||||||
|
//! Wire-protocol types for the `/crdt-sync` WebSocket protocol.
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
// ── Wire protocol types ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Auth handshake: challenge sent by the listener to the connector.
|
||||||
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
|
pub(super) struct ChallengeMessage {
|
||||||
|
pub(super) r#type: String,
|
||||||
|
pub(super) nonce: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Auth handshake: auth reply sent by the connector to the listener.
|
||||||
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
|
pub(super) struct AuthMessage {
|
||||||
|
pub(super) r#type: String,
|
||||||
|
pub(super) pubkey_hex: String,
|
||||||
|
pub(super) signature_hex: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
#[serde(tag = "type", rename_all = "snake_case")]
|
||||||
|
pub(crate) enum SyncMessage {
|
||||||
|
/// Bulk state dump sent on connect (v1) or delta ops after clock exchange (v2).
|
||||||
|
Bulk { ops: Vec<String> },
|
||||||
|
/// A single new op.
|
||||||
|
Op { op: String },
|
||||||
|
/// Vector clock exchanged on connect (v2 protocol).
|
||||||
|
///
|
||||||
|
/// Each entry maps a node's hex-encoded Ed25519 public key to the count of
|
||||||
|
/// ops received from that node. The receiving side computes the delta via
|
||||||
|
/// [`crdt_state::ops_since`] and sends only the missing ops.
|
||||||
|
Clock {
|
||||||
|
clock: std::collections::HashMap<String, u64>,
|
||||||
|
},
|
||||||
|
/// Signals that the bulk-delta phase is complete; the sender is ready for
|
||||||
|
/// real-time op streaming. Locally-generated ops are buffered until the
|
||||||
|
/// peer's `Ready` is received, then flushed in-order.
|
||||||
|
Ready,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test-only, crate-visible alias for [`SyncMessage`], kept for
/// backwards-compatibility testing.
///
/// The `crdt_snapshot` tests rely on it to check that snapshot messages do
/// NOT parse as any legacy `SyncMessage` variant, which confirms that old
/// peers will gracefully reject them.
#[cfg(test)]
pub(crate) type SyncMessagePublic = SyncMessage;
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A populated `Bulk` message keeps its tag and its ops, in order, across
    /// a JSON round-trip.
    #[test]
    fn sync_message_bulk_serialization_roundtrip() {
        let original = SyncMessage::Bulk {
            ops: vec!["op1".to_string(), "op2".to_string()],
        };
        let encoded = serde_json::to_string(&original).unwrap();
        assert!(encoded.contains(r#""type":"bulk""#));

        let decoded: SyncMessage = serde_json::from_str(&encoded).unwrap();
        let SyncMessage::Bulk { ops } = decoded else {
            panic!("Expected Bulk");
        };
        assert_eq!(ops, ["op1", "op2"]);
    }

    /// An `Op` message keeps its tag and its payload string across a JSON
    /// round-trip, even when the payload itself looks like JSON.
    #[test]
    fn sync_message_op_serialization_roundtrip() {
        let original = SyncMessage::Op {
            op: r#"{"inner":{}}"#.to_string(),
        };
        let encoded = serde_json::to_string(&original).unwrap();
        assert!(encoded.contains(r#""type":"op""#));

        let decoded: SyncMessage = serde_json::from_str(&encoded).unwrap();
        let SyncMessage::Op { op } = decoded else {
            panic!("Expected Op");
        };
        assert_eq!(op, r#"{"inner":{}}"#);
    }

    /// A `Bulk` message with no ops round-trips to an empty op list.
    #[test]
    fn sync_message_bulk_empty_ops() {
        let original = SyncMessage::Bulk { ops: Vec::new() };
        let encoded = serde_json::to_string(&original).unwrap();

        let decoded: SyncMessage = serde_json::from_str(&encoded).unwrap();
        let SyncMessage::Bulk { ops } = decoded else {
            panic!("Expected Bulk");
        };
        assert!(ops.is_empty());
    }

    /// A `Clock` message keeps its tag and every per-node counter across a
    /// JSON round-trip.
    #[test]
    fn sync_message_clock_serialization_roundtrip() {
        let counters = std::collections::HashMap::from([
            ("aabbcc00".to_string(), 42u64),
            ("ddeeff11".to_string(), 7u64),
        ]);

        let original = SyncMessage::Clock { clock: counters };
        let encoded = serde_json::to_string(&original).unwrap();
        assert!(encoded.contains(r#""type":"clock""#));

        let decoded: SyncMessage = serde_json::from_str(&encoded).unwrap();
        let SyncMessage::Clock { clock } = decoded else {
            panic!("Expected Clock");
        };
        assert_eq!(clock["aabbcc00"], 42);
        assert_eq!(clock["ddeeff11"], 7);
    }

    /// A `Clock` message with no entries round-trips to an empty map.
    #[test]
    fn sync_message_clock_empty() {
        let original = SyncMessage::Clock {
            clock: std::collections::HashMap::new(),
        };
        let encoded = serde_json::to_string(&original).unwrap();

        let decoded: SyncMessage = serde_json::from_str(&encoded).unwrap();
        let SyncMessage::Clock { clock } = decoded else {
            panic!("Expected Clock");
        };
        assert!(clock.is_empty());
    }

    /// `Ready` serializes to exactly `{"type":"ready"}` and parses back to
    /// the same variant.
    #[test]
    fn sync_message_ready_serialization_roundtrip() {
        let original = SyncMessage::Ready;
        let encoded = serde_json::to_string(&original).unwrap();
        assert_eq!(encoded, r#"{"type":"ready"}"#);

        let decoded: SyncMessage = serde_json::from_str(&encoded).unwrap();
        assert!(matches!(decoded, SyncMessage::Ready));
    }
}
|
||||||
Reference in New Issue
Block a user