//! WebSocket-based CRDT sync layer for replicating pipeline state between
//! huskies nodes.
//!
//! # Protocol
//!
//! The sync protocol uses newline-delimited JSON over WebSocket text frames.
//! Each message is a JSON object with a `"type"` field:
//!
//! - `{"type":"bulk","ops":[...]}` — Initial state dump (array of serialised
//!   `SignedOp` JSON strings). Sent by both sides immediately after connect.
//! - `{"type":"op","op":"..."}` — A single new `SignedOp` (serialised JSON
//!   string). Streamed in real-time as local ops are created.
//!
//! Both the server endpoint and the rendezvous client use the same protocol,
//! making the connection fully symmetric.
use bft_json_crdt::json_crdt::SignedOp;
use futures::{SinkExt, StreamExt};
use poem::handler;
use poem::web::Data;
use poem::web::websocket::{Message as WsMessage, WebSocket};
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use crate::crdt_state;
use crate::http::context::AppContext;
use crate::slog;
// ── Wire protocol types ─────────────────────────────────────────────
/// Wire-format message exchanged over the sync WebSocket.
///
/// Serialised with an external `"type"` tag in snake_case, so the two
/// variants appear on the wire as `"bulk"` and `"op"` (see module docs).
#[derive(Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
enum SyncMessage {
    /// Bulk state dump sent on connect. Each entry is itself a serialised
    /// `SignedOp` JSON string (double-encoded by design).
    Bulk { ops: Vec<String> },
    /// A single new op (serialised `SignedOp` JSON string), streamed in
    /// real-time as local ops are created.
    Op { op: String },
}
// ── Server-side WebSocket handler ───────────────────────────────────
/// Server-side WebSocket endpoint for CRDT sync.
///
/// On upgrade, immediately sends a bulk dump of all local ops, then runs a
/// bidirectional loop: local ops (from the broadcast channel) are streamed
/// to the peer, and incoming frames are decoded and applied locally. The
/// loop ends when either direction of the socket fails or closes.
#[handler]
pub async fn crdt_sync_handler(
    ws: WebSocket,
    _ctx: Data<&Arc<AppContext>>,
) -> impl poem::IntoResponse {
    ws.on_upgrade(|socket| async move {
        let (mut tx, mut rx) = socket.split();
        slog!("[crdt-sync] Peer connected");
        // Start by pushing our entire known state to the peer.
        if let Some(ops) = crdt_state::all_ops_json() {
            if let Ok(json) = serde_json::to_string(&SyncMessage::Bulk { ops }) {
                if tx.send(WsMessage::Text(json)).await.is_err() {
                    return;
                }
            }
        }
        // Watch the local op feed; bail out if the CRDT isn't initialised.
        let mut op_rx = match crdt_state::subscribe_ops() {
            Some(feed) => feed,
            None => return,
        };
        loop {
            tokio::select! {
                // A local op was produced: relay it to the peer.
                result = op_rx.recv() => {
                    use tokio::sync::broadcast::error::RecvError;
                    let signed_op = match result {
                        Ok(op) => op,
                        Err(RecvError::Lagged(n)) => {
                            slog!("[crdt-sync] Lagged {n} ops, peer may need re-sync");
                            continue;
                        }
                        Err(_) => break,
                    };
                    // Serialise failures are silently skipped; a failed send
                    // means the peer is gone, so tear down the loop.
                    let Ok(op_json) = serde_json::to_string(&signed_op) else {
                        continue;
                    };
                    let Ok(text) = serde_json::to_string(&SyncMessage::Op { op: op_json }) else {
                        continue;
                    };
                    if tx.send(WsMessage::Text(text)).await.is_err() {
                        break;
                    }
                }
                // The peer sent us something: apply text frames, stop on close.
                frame = rx.next() => {
                    match frame {
                        Some(Ok(WsMessage::Text(text))) => handle_incoming_message(&text),
                        None | Some(Ok(WsMessage::Close(_))) => break,
                        _ => {}
                    }
                }
            }
        }
        slog!("[crdt-sync] Peer disconnected");
    })
}
/// Process an incoming sync message from a peer.
///
/// Malformed wire JSON and unparseable individual ops are logged/skipped
/// rather than propagated — a bad peer must never take the node down.
fn handle_incoming_message(text: &str) {
    let msg = match serde_json::from_str::<SyncMessage>(text) {
        Ok(parsed) => parsed,
        Err(e) => {
            slog!("[crdt-sync] Bad message from peer: {e}");
            return;
        }
    };
    match msg {
        SyncMessage::Bulk { ops } => {
            // Apply each op in order, counting only those the CRDT accepted.
            let applied = ops
                .iter()
                .filter(|raw| {
                    serde_json::from_str::<SignedOp>(raw.as_str())
                        .map(crdt_state::apply_remote_op)
                        .unwrap_or(false)
                })
                .count();
            slog!(
                "[crdt-sync] Bulk sync: received {} ops, applied {applied}",
                ops.len()
            );
        }
        SyncMessage::Op { op } => {
            if let Ok(signed_op) = serde_json::from_str::<SignedOp>(&op) {
                crdt_state::apply_remote_op(signed_op);
            }
        }
    }
}
// ── Rendezvous client ───────────────────────────────────────────────
/// Spawn a background task that connects to the configured rendezvous
/// peer and exchanges CRDT ops bidirectionally.
///
/// The task reconnects forever: after each connection attempt it sleeps,
/// doubling the delay up to 30s after failures and resetting it to 1s
/// after a clean close.
pub fn spawn_rendezvous_client(url: String) {
    tokio::spawn(async move {
        let mut delay_secs = 1u64;
        loop {
            slog!("[crdt-sync] Connecting to rendezvous peer: {url}");
            if let Err(e) = connect_and_sync(&url).await {
                slog!("[crdt-sync] Rendezvous connection error: {e}");
            } else {
                slog!("[crdt-sync] Rendezvous connection closed cleanly");
                delay_secs = 1;
            }
            slog!("[crdt-sync] Reconnecting in {delay_secs}s...");
            tokio::time::sleep(std::time::Duration::from_secs(delay_secs)).await;
            delay_secs = (delay_secs * 2).min(30);
        }
    });
}
/// Connect to a remote sync endpoint and exchange ops until disconnect.
///
/// Mirrors the server handler: sends a bulk dump on connect, then streams
/// local ops out while applying incoming frames. Returns `Ok(())` on a
/// clean close and `Err` with a human-readable description on connect,
/// initialisation, or bulk-send failure.
async fn connect_and_sync(url: &str) -> Result<(), String> {
    use tokio_tungstenite::tungstenite::Message as WireMsg;
    let (ws_stream, _) = tokio_tungstenite::connect_async(url)
        .await
        .map_err(|e| format!("WebSocket connect failed: {e}"))?;
    let (mut tx, mut rx) = ws_stream.split();
    slog!("[crdt-sync] Connected to rendezvous peer");
    // Push our full state first, exactly like the server side does.
    if let Some(ops) = crdt_state::all_ops_json() {
        if let Ok(json) = serde_json::to_string(&SyncMessage::Bulk { ops }) {
            tx.send(WireMsg::Text(json.into()))
                .await
                .map_err(|e| format!("Send bulk failed: {e}"))?;
        }
    }
    // Watch the local op feed so new ops stream out as they are created.
    let mut op_rx = match crdt_state::subscribe_ops() {
        Some(feed) => feed,
        None => return Err("CRDT not initialised".to_string()),
    };
    loop {
        tokio::select! {
            // Forward freshly-created local ops to the remote peer.
            result = op_rx.recv() => {
                use tokio::sync::broadcast::error::RecvError;
                let signed_op = match result {
                    Ok(op) => op,
                    Err(RecvError::Lagged(n)) => {
                        slog!("[crdt-sync] Lagged {n} ops on rendezvous link");
                        continue;
                    }
                    Err(_) => break,
                };
                let Ok(op_json) = serde_json::to_string(&signed_op) else {
                    continue;
                };
                let Ok(text) = serde_json::to_string(&SyncMessage::Op { op: op_json }) else {
                    continue;
                };
                if tx.send(WireMsg::Text(text.into())).await.is_err() {
                    break;
                }
            }
            // Apply whatever the peer streams to us; stop on close or error.
            frame = rx.next() => {
                match frame {
                    Some(Ok(WireMsg::Text(text))) => handle_incoming_message(text.as_ref()),
                    None | Some(Ok(WireMsg::Close(_))) => break,
                    Some(Err(e)) => {
                        slog!("[crdt-sync] Rendezvous read error: {e}");
                        break;
                    }
                    _ => {}
                }
            }
        }
    }
    Ok(())
}
// ── Tests ────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;

    /// `Bulk` survives a JSON round-trip and carries the external
    /// `"type":"bulk"` tag required by the wire protocol.
    #[test]
    fn sync_message_bulk_serialization_roundtrip() {
        let msg = SyncMessage::Bulk {
            ops: vec!["op1".to_string(), "op2".to_string()],
        };
        let json = serde_json::to_string(&msg).unwrap();
        assert!(json.contains(r#""type":"bulk""#));
        let deserialized: SyncMessage = serde_json::from_str(&json).unwrap();
        match deserialized {
            SyncMessage::Bulk { ops } => {
                assert_eq!(ops.len(), 2);
                assert_eq!(ops[0], "op1");
                assert_eq!(ops[1], "op2");
            }
            _ => panic!("Expected Bulk"),
        }
    }

    /// `Op` round-trips with the `"type":"op"` tag; the inner op stays an
    /// opaque (double-encoded) JSON string.
    #[test]
    fn sync_message_op_serialization_roundtrip() {
        let msg = SyncMessage::Op {
            op: r#"{"inner":{}}"#.to_string(),
        };
        let json = serde_json::to_string(&msg).unwrap();
        assert!(json.contains(r#""type":"op""#));
        let deserialized: SyncMessage = serde_json::from_str(&json).unwrap();
        match deserialized {
            SyncMessage::Op { op } => {
                assert_eq!(op, r#"{"inner":{}}"#);
            }
            _ => panic!("Expected Op"),
        }
    }

    /// Garbage wire input must be logged and dropped, never panic.
    #[test]
    fn handle_incoming_message_bad_json_does_not_panic() {
        handle_incoming_message("not valid json");
    }

    /// A well-formed Bulk envelope containing unparseable ops is skipped
    /// entry-by-entry without panicking.
    #[test]
    fn handle_incoming_message_bulk_with_invalid_ops_does_not_panic() {
        let msg = SyncMessage::Bulk {
            ops: vec!["not a valid signed op".to_string()],
        };
        let json = serde_json::to_string(&msg).unwrap();
        handle_incoming_message(&json);
    }

    /// A well-formed Op envelope with an unparseable payload is ignored
    /// without panicking.
    #[test]
    fn handle_incoming_message_op_with_invalid_op_does_not_panic() {
        let msg = SyncMessage::Op {
            op: "garbage".to_string(),
        };
        let json = serde_json::to_string(&msg).unwrap();
        handle_incoming_message(&json);
    }

    /// Calling `subscribe_ops` without init must be a no-op, not a panic.
    #[test]
    fn subscribe_ops_returns_none_before_init() {
        // Before crdt_state::init() the channel doesn't exist yet.
        // In test binaries it may or may not be initialised depending on
        // other tests, so we just verify no panic.
        let _ = crdt_state::subscribe_ops();
    }

    /// Same no-panic guarantee for `all_ops_json` before init.
    #[test]
    fn all_ops_json_returns_none_before_init() {
        let _ = crdt_state::all_ops_json();
    }

    /// An empty Bulk (fresh node with no state) round-trips cleanly.
    #[test]
    fn sync_message_bulk_empty_ops() {
        let msg = SyncMessage::Bulk { ops: vec![] };
        let json = serde_json::to_string(&msg).unwrap();
        let deserialized: SyncMessage = serde_json::from_str(&json).unwrap();
        match deserialized {
            SyncMessage::Bulk { ops } => assert!(ops.is_empty()),
            _ => panic!("Expected Bulk"),
        }
    }

    /// Simulate the sync protocol by creating real SignedOps on two separate
    /// CRDT instances and exchanging them through the SyncMessage wire format.
    #[test]
    fn two_node_sync_via_protocol_messages() {
        use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, OpState};
        use bft_json_crdt::keypair::make_keypair;
        use bft_json_crdt::op::ROOT_ID;
        use serde_json::json;
        use crate::crdt_state::PipelineDoc;
        // ── Node A: create an item ──
        let kp_a = make_keypair();
        let mut crdt_a = BaseCrdt::<PipelineDoc>::new(&kp_a);
        let item: bft_json_crdt::json_crdt::JsonValue = json!({
            "story_id": "100_story_sync_test",
            "stage": "1_backlog",
            "name": "Sync Test",
            "agent": "",
            "retry_count": 0.0,
            "blocked": false,
            "depends_on": "",
        })
        .into();
        let op1 = crdt_a.doc.items.insert(ROOT_ID, item).sign(&kp_a);
        assert_eq!(crdt_a.apply(op1.clone()), OpState::Ok);
        // Serialise op1 into a SyncMessage::Op.
        let op1_json = serde_json::to_string(&op1).unwrap();
        let wire_msg = SyncMessage::Op { op: op1_json.clone() };
        let wire_json = serde_json::to_string(&wire_msg).unwrap();
        // ── Node B: receive the op through protocol ──
        let kp_b = make_keypair();
        let mut crdt_b = BaseCrdt::<PipelineDoc>::new(&kp_b);
        assert!(crdt_b.doc.items.view().is_empty());
        // Parse wire message and apply.
        let parsed: SyncMessage = serde_json::from_str(&wire_json).unwrap();
        match parsed {
            SyncMessage::Op { op } => {
                let signed_op: bft_json_crdt::json_crdt::SignedOp =
                    serde_json::from_str(&op).unwrap();
                let result = crdt_b.apply(signed_op);
                assert_eq!(result, OpState::Ok);
            }
            _ => panic!("Expected Op"),
        }
        // Verify Node B has the same state as Node A.
        assert_eq!(crdt_b.doc.items.view().len(), 1);
        assert_eq!(
            crdt_a.doc.items[0].story_id.view(),
            crdt_b.doc.items[0].story_id.view()
        );
        assert_eq!(
            crdt_a.doc.items[0].stage.view(),
            crdt_b.doc.items[0].stage.view()
        );
        // ── Node A: update stage ──
        let op2 = crdt_a.doc.items[0]
            .stage
            .set("2_current".to_string())
            .sign(&kp_a);
        crdt_a.apply(op2.clone());
        // Send via bulk message.
        let op2_json = serde_json::to_string(&op2).unwrap();
        let bulk_msg = SyncMessage::Bulk {
            ops: vec![op1_json, op2_json],
        };
        let bulk_wire = serde_json::to_string(&bulk_msg).unwrap();
        // ── Node C: receives full state via bulk ──
        let kp_c = make_keypair();
        let mut crdt_c = BaseCrdt::<PipelineDoc>::new(&kp_c);
        let parsed_bulk: SyncMessage = serde_json::from_str(&bulk_wire).unwrap();
        match parsed_bulk {
            SyncMessage::Bulk { ops } => {
                for op_str in &ops {
                    let signed: bft_json_crdt::json_crdt::SignedOp =
                        serde_json::from_str(op_str).unwrap();
                    crdt_c.apply(signed);
                }
            }
            _ => panic!("Expected Bulk"),
        }
        // Node C should have the updated stage.
        assert_eq!(crdt_c.doc.items.view().len(), 1);
        assert_eq!(
            crdt_c.doc.items[0].stage.view(),
            bft_json_crdt::json_crdt::JsonValue::String("2_current".to_string())
        );
    }

    /// Verify that a single node's ops (insert + update) can be replayed
    /// on another node via bulk sync and produce the same final state.
    /// This is the core property needed for partition healing: when a
    /// disconnected node reconnects, it sends all its ops as a bulk
    /// message and the receiver catches up.
    #[test]
    fn partition_heal_via_bulk_replay() {
        use bft_json_crdt::json_crdt::{BaseCrdt, CrdtNode, JsonValue as JV};
        use bft_json_crdt::keypair::make_keypair;
        use bft_json_crdt::op::ROOT_ID;
        use serde_json::json;
        use crate::crdt_state::PipelineDoc;
        let kp = make_keypair();
        // Node A creates an item and advances it.
        let mut crdt_a = BaseCrdt::<PipelineDoc>::new(&kp);
        let item: bft_json_crdt::json_crdt::JsonValue = json!({
            "story_id": "200_story_heal",
            "stage": "1_backlog",
            "name": "Heal Test",
            "agent": "",
            "retry_count": 0.0,
            "blocked": false,
            "depends_on": "",
        })
        .into();
        let op1 = crdt_a.doc.items.insert(ROOT_ID, item).sign(&kp);
        crdt_a.apply(op1.clone());
        let op2 = crdt_a.doc.items[0]
            .stage
            .set("2_current".to_string())
            .sign(&kp);
        crdt_a.apply(op2.clone());
        let op3 = crdt_a.doc.items[0]
            .stage
            .set("3_qa".to_string())
            .sign(&kp);
        crdt_a.apply(op3.clone());
        // Serialise all ops as a bulk message (simulates partition heal).
        let ops_json: Vec<String> = [&op1, &op2, &op3]
            .iter()
            .map(|op| serde_json::to_string(op).unwrap())
            .collect();
        let bulk = SyncMessage::Bulk { ops: ops_json };
        let wire = serde_json::to_string(&bulk).unwrap();
        // Node B receives the bulk and reconstructs state.
        let mut crdt_b = BaseCrdt::<PipelineDoc>::new(&kp);
        let parsed: SyncMessage = serde_json::from_str(&wire).unwrap();
        match parsed {
            SyncMessage::Bulk { ops } => {
                for op_str in &ops {
                    let signed: bft_json_crdt::json_crdt::SignedOp =
                        serde_json::from_str(op_str).unwrap();
                    crdt_b.apply(signed);
                }
            }
            _ => panic!("Expected Bulk"),
        }
        // Node B should match Node A exactly.
        assert_eq!(crdt_b.doc.items.view().len(), 1);
        assert_eq!(
            crdt_b.doc.items[0].stage.view(),
            JV::String("3_qa".to_string())
        );
        assert_eq!(
            crdt_a.doc.items[0].stage.view(),
            crdt_b.doc.items[0].stage.view()
        );
        assert_eq!(
            crdt_a.doc.items[0].name.view(),
            crdt_b.doc.items[0].name.view()
        );
    }

    /// The `rendezvous` URL is read from the top level of the project TOML.
    #[test]
    fn config_rendezvous_parsed_from_toml() {
        let toml_str = r#"
rendezvous = "ws://remote:3001/crdt-sync"
[[agent]]
name = "test"
"#;
        let config: crate::config::ProjectConfig = toml::from_str(toml_str).unwrap();
        assert_eq!(
            config.rendezvous.as_deref(),
            Some("ws://remote:3001/crdt-sync")
        );
    }

    /// Absent from config, `rendezvous` defaults to `None` (sync disabled).
    #[test]
    fn config_rendezvous_defaults_to_none() {
        let config = crate::config::ProjectConfig::default();
        assert!(config.rendezvous.is_none());
    }
}