huskies: merge 899

This commit is contained in:
dave
2026-05-12 23:11:34 +00:00
parent 0f0cf59329
commit cd214d7246
9 changed files with 1105 additions and 218 deletions
+40 -18
View File
@@ -203,14 +203,11 @@ pub async fn gateway_mcp_post_handler(
}
/// Proxy a request to the active project and format the response.
///
/// Prefers the live sled-uplink WebSocket when one is attached (story 899
/// AC 2); falls back to the legacy HTTP proxy otherwise.
async fn proxy_and_respond(state: &GatewayState, bytes: &[u8], id: Option<Value>) -> Response {
let url = match state.active_url().await {
Ok(u) => u,
Err(e) => {
return to_json_response(JsonRpcResponse::error(id, -32603, e.to_string()));
}
};
match gateway::io::proxy_mcp_call(&state.client, &url, bytes).await {
match state.proxy_active_mcp(bytes).await {
Ok(resp_body) => Response::builder()
.status(StatusCode::OK)
.header("Content-Type", "application/json")
@@ -316,13 +313,23 @@ fn handle_initialize(id: Option<Value>) -> JsonRpcResponse {
}
/// Fetch tools/list from the active project and merge in gateway tools.
///
/// Routes via the sled-uplink WS when one is attached (story 899 AC 2);
/// falls back to HTTP otherwise.
async fn handle_tools_list(
state: &GatewayState,
id: Option<Value>,
) -> Result<JsonRpcResponse, String> {
let url = state.active_url().await.map_err(|e| e.to_string())?;
let resp_json = gateway::io::fetch_tools_list(&state.client, &url).await?;
let rpc_body = json!({
"jsonrpc": "2.0",
"id": 1,
"method": "tools/list",
"params": {}
});
let bytes = serde_json::to_vec(&rpc_body).map_err(|e| e.to_string())?;
let resp_bytes = state.proxy_active_mcp(&bytes).await?;
let resp_json: Value =
serde_json::from_slice(&resp_bytes).map_err(|e| format!("invalid tools/list JSON: {e}"))?;
let mut tools: Vec<Value> = resp_json
.get("result")
@@ -414,21 +421,36 @@ async fn handle_gateway_status_tool(state: &GatewayState, id: Option<Value>) ->
async fn handle_gateway_health_tool(state: &GatewayState, id: Option<Value>) -> JsonRpcResponse {
let mut results = BTreeMap::new();
let project_entries: Vec<(String, String)> = state
// Build the project list, preferring the WS-uplink heartbeat as the
// source of truth for liveness (story 899 AC 3). HTTP polls are used
// only as a fallback when no live sled is connected.
let project_names: Vec<(String, Option<String>)> = state
.projects
.read()
.await
.iter()
.map(|(n, e)| (n.clone(), e.url.clone()))
.collect();
for (name, url) in &project_entries {
let status = match gateway::io::check_project_health(&state.client, url).await {
Ok(true) => "healthy".to_string(),
Ok(false) => "unhealthy".to_string(),
Err(e) => e,
let sled_conns = state.sled_connections.read().await;
for (name, url_opt) in &project_names {
let status = if let Some(conn) = sled_conns.get(name) {
if conn.is_alive(crate::service::gateway::HEARTBEAT_MAX_AGE_MS) {
"healthy (ws)".to_string()
} else {
"stale (ws heartbeat overdue)".to_string()
}
} else if let Some(url) = url_opt {
match gateway::io::check_project_health(&state.client, url).await {
Ok(true) => "healthy".to_string(),
Ok(false) => "unhealthy".to_string(),
Err(e) => e,
}
} else {
"no uplink and no url configured".to_string()
};
results.insert(name.clone(), status);
}
drop(sled_conns);
let active = state.active_project.read().await.clone();
JsonRpcResponse::success(
@@ -512,7 +534,7 @@ async fn handle_aggregate_pipeline_status_tool(
.read()
.await
.iter()
.map(|(name, entry)| (name.clone(), entry.url.clone()))
.filter_map(|(name, entry)| entry.url.as_ref().map(|u| (name.clone(), u.clone())))
.collect();
let statuses =
@@ -656,7 +678,7 @@ async fn handle_pipeline_get(state: &GatewayState, id: Option<Value>) -> JsonRpc
.read()
.await
.iter()
.map(|(n, e)| (n.clone(), e.url.clone()))
.filter_map(|(n, e)| e.url.as_ref().map(|u| (n.clone(), u.clone())))
.collect();
let results = gateway::io::fetch_all_project_pipeline_items(&project_urls, &state.client).await;
+213 -75
View File
@@ -155,21 +155,30 @@ struct SledUplinkParams {
token: Option<String>,
}
/// `GET /api/sled-uplink` — gateway-side WebSocket endpoint for sled permission uplinks.
/// `GET /api/sled-uplink` — gateway-side WebSocket endpoint for sled uplinks.
///
/// # Authentication
///
/// The connecting sled must supply a valid shared-secret token via the `token`
/// query parameter. Tokens are configured in `[sled_tokens]` in `projects.toml`
/// as `sled_id = "secret"` entries.
/// query parameter. Tokens are configured either as per-project `auth_token`
/// fields under `[projects.<name>]` in `projects.toml` (preferred, story 899)
/// or, for backwards compatibility, in the deprecated `[sled_tokens]` table.
///
/// # Protocol
///
/// See `sled_uplink.rs` for the wire format ([`UplinkEnvelope`]). The gateway
/// accepts `perm_request` messages, injects them into the local permission
/// pipeline (via `state.perm_tx`), and sends `perm_response` frames back to the
/// sled once the Matrix bot resolves them. Multiple sleds are demuxed by
/// connection: each handler owns exactly one sled's request/response flow.
/// See `sled_uplink.rs` for the wire format ([`UplinkEnvelope`]).
///
/// Phase 2 (story 899) expands the protocol from the original perm-only flow
/// to a full bidirectional MCP transport:
///
/// - **sled → gateway:** `identity` (mandatory first frame after upgrade),
/// `heartbeat`, `perm_request`, `mcp_response`.
/// - **gateway → sled:** `mcp_request`, `perm_response`.
///
/// On `identity`, the connection is published to
/// [`GatewayState::sled_connections`] under the project name, allowing the
/// MCP proxy (`mcp.rs::proxy_and_respond`) to route subsequent calls over
/// the live WS instead of HTTP. The entry is removed on disconnect.
#[handler]
pub async fn gateway_sled_uplink_handler(
ws: WebSocket,
@@ -196,82 +205,211 @@ pub async fn gateway_sled_uplink_handler(
use poem::IntoResponse as _;
let perm_tx = state.perm_tx.clone();
let state = Arc::clone(&state);
ws.on_upgrade(move |socket| async move {
let (mut sink, mut stream) = socket.split();
// Aggregator channel: spawned per-request tasks send (req_id, decision) here
// so the main loop can write perm_response frames back to the sled.
let (agg_tx, mut agg_rx) = tokio::sync::mpsc::unbounded_channel::<(
String,
crate::http::context::PermissionDecision,
)>();
run_sled_uplink_session(socket, state, sled_id, perm_tx).await;
})
.into_response()
}
crate::slog!("[gateway/sled-uplink] Sled '{}' connected", sled_id);
/// Run a single connected sled's request/response flow until it disconnects.
///
/// Performs the identity handshake, registers a [`SledConnection`] in
/// [`GatewayState::sled_connections`] under the published project name, and
/// pumps messages bidirectionally for the lifetime of the WebSocket.
async fn run_sled_uplink_session(
socket: poem::web::websocket::WebSocketStream,
state: Arc<GatewayState>,
token_sled_id: String,
perm_tx: tokio::sync::mpsc::UnboundedSender<crate::http::context::PermissionForward>,
) {
use crate::service::gateway::SledConnection;
use crate::sled_uplink::UplinkEnvelope;
use std::sync::Arc as StdArc;
use std::sync::atomic::AtomicI64;
loop {
tokio::select! {
msg = stream.next() => {
let text = match msg {
Some(Ok(WsMessage::Text(t))) => t,
Some(Ok(WsMessage::Close(_))) | None => break,
_ => continue,
};
let Ok(env) = serde_json::from_str::<crate::sled_uplink::UplinkEnvelope>(&text) else {
continue;
};
if env.msg_type == "perm_request" {
let req_id = env.req_id.clone();
let tool_name = env.payload.get("tool_name")
.and_then(|v| v.as_str())
.unwrap_or("unknown")
.to_string();
let tool_input = env.payload.get("tool_input")
.cloned()
.unwrap_or(serde_json::Value::Null);
let (response_tx, response_rx) = tokio::sync::oneshot::channel();
let fwd = crate::http::context::PermissionForward {
request_id: format!("{sled_id}:{req_id}"),
tool_name,
tool_input,
response_tx,
};
if perm_tx.send(fwd).is_err() {
break;
}
let agg_tx2 = agg_tx.clone();
tokio::spawn(async move {
let decision = response_rx
.await
.unwrap_or(crate::http::context::PermissionDecision::Deny);
let _ = agg_tx2.send((req_id, decision));
});
}
let (mut sink, mut stream) = socket.split();
// ── Identity handshake: expect the first frame to be `identity` ──────
let identity = match tokio::time::timeout(
std::time::Duration::from_secs(10),
wait_for_identity(&mut stream),
)
.await
{
Ok(Some(p)) => p,
_ => {
crate::slog!(
"[gateway/sled-uplink] '{}' missing identity frame; closing",
token_sled_id
);
return;
}
};
// Project name in the identity must match the project resolved from the
// auth token; otherwise the sled is claiming to be a different project.
if identity != token_sled_id {
crate::slog!(
"[gateway/sled-uplink] identity mismatch (token says '{}', sled claims '{}'); closing",
token_sled_id,
identity
);
return;
}
// ── Build SledConnection and publish it ─────────────────────────────
let (out_tx, mut out_rx) = tokio::sync::mpsc::unbounded_channel::<UplinkEnvelope>();
let conn = SledConnection {
tx: out_tx,
last_heartbeat_ms: StdArc::new(AtomicI64::new(chrono::Utc::now().timestamp_millis())),
in_flight: StdArc::new(tokio::sync::Mutex::new(std::collections::HashMap::new())),
};
state
.register_sled_connection(identity.clone(), conn.clone())
.await;
crate::slog!(
"[gateway/sled-uplink] Sled '{}' connected and registered",
identity
);
// Aggregator channel for perm responses produced by spawned per-request tasks.
let (agg_tx, mut agg_rx) =
tokio::sync::mpsc::unbounded_channel::<(String, crate::http::context::PermissionDecision)>(
);
loop {
tokio::select! {
// Outbound: forward queued envelopes (mcp_request, etc.) to the sled.
Some(env) = out_rx.recv() => {
let Ok(text) = serde_json::to_string(&env) else { continue };
if sink.send(WsMessage::Text(text)).await.is_err() {
break;
}
Some((req_id, decision)) = agg_rx.recv() => {
use crate::http::context::PermissionDecision;
let (approved, always_allow) = match decision {
PermissionDecision::AlwaysAllow => (true, true),
PermissionDecision::Approve => (true, false),
PermissionDecision::Deny => (false, false),
};
let resp = crate::sled_uplink::UplinkEnvelope {
msg_type: "perm_response".to_string(),
req_id,
payload: serde_json::json!({
"approved": approved,
"always_allow": always_allow,
}),
};
let Ok(text) = serde_json::to_string(&resp) else { continue };
if sink.send(WsMessage::Text(text)).await.is_err() {
break;
}
// Inbound: messages from the sled.
msg = stream.next() => {
let text = match msg {
Some(Ok(WsMessage::Text(t))) => t,
Some(Ok(WsMessage::Close(_))) | None => break,
_ => continue,
};
let Ok(env) = serde_json::from_str::<UplinkEnvelope>(&text) else {
continue;
};
match env.msg_type.as_str() {
"heartbeat" => {
conn.last_heartbeat_ms.store(
chrono::Utc::now().timestamp_millis(),
std::sync::atomic::Ordering::Relaxed,
);
}
"mcp_response" => {
let tx_opt = conn.in_flight.lock().await.remove(&env.req_id);
if let Some(tx) = tx_opt {
let _ = tx.send(env.payload);
}
}
"perm_request" => {
forward_perm_request(env, &perm_tx, &agg_tx, &identity);
}
_ => {}
}
}
// Aggregator: per-request permission resolutions get formatted as perm_response.
Some((req_id, decision)) = agg_rx.recv() => {
use crate::http::context::PermissionDecision;
let (approved, always_allow) = match decision {
PermissionDecision::AlwaysAllow => (true, true),
PermissionDecision::Approve => (true, false),
PermissionDecision::Deny => (false, false),
};
let resp = UplinkEnvelope {
msg_type: "perm_response".to_string(),
req_id,
payload: serde_json::json!({
"approved": approved,
"always_allow": always_allow,
}),
};
let Ok(text) = serde_json::to_string(&resp) else { continue };
if sink.send(WsMessage::Text(text)).await.is_err() {
break;
}
}
}
}
crate::slog!("[gateway/sled-uplink] Sled '{}' disconnected", sled_id);
})
.into_response()
state.deregister_sled_connection(&identity).await;
crate::slog!("[gateway/sled-uplink] Sled '{}' disconnected", identity);
}
/// Read frames until an `identity` envelope is seen; return the project name
/// from its payload. Returns `None` if the stream ends or an invalid frame
/// arrives where the first frame is expected to be `identity`.
async fn wait_for_identity(
stream: &mut futures::stream::SplitStream<poem::web::websocket::WebSocketStream>,
) -> Option<String> {
while let Some(msg) = stream.next().await {
let text = match msg {
Ok(WsMessage::Text(t)) => t,
Ok(WsMessage::Close(_)) | Err(_) => return None,
_ => continue,
};
let Ok(env) = serde_json::from_str::<crate::sled_uplink::UplinkEnvelope>(&text) else {
return None;
};
if env.msg_type != "identity" {
return None;
}
return env
.payload
.get("project")
.and_then(|v| v.as_str())
.map(str::to_string);
}
None
}
/// Convert an inbound `perm_request` envelope into a [`PermissionForward`] and
/// inject it into the gateway's permission pipeline. The spawned waiter task
/// publishes the resolved decision into the aggregator channel so the WS
/// writer can emit a matching `perm_response` frame.
fn forward_perm_request(
env: crate::sled_uplink::UplinkEnvelope,
perm_tx: &tokio::sync::mpsc::UnboundedSender<crate::http::context::PermissionForward>,
agg_tx: &tokio::sync::mpsc::UnboundedSender<(String, crate::http::context::PermissionDecision)>,
sled_id: &str,
) {
let req_id = env.req_id.clone();
let tool_name = env
.payload
.get("tool_name")
.and_then(|v| v.as_str())
.unwrap_or("unknown")
.to_string();
let tool_input = env
.payload
.get("tool_input")
.cloned()
.unwrap_or(serde_json::Value::Null);
let (response_tx, response_rx) = tokio::sync::oneshot::channel();
let fwd = crate::http::context::PermissionForward {
request_id: format!("{sled_id}:{req_id}"),
tool_name,
tool_input,
response_tx,
};
if perm_tx.send(fwd).is_err() {
return;
}
let agg_tx2 = agg_tx.clone();
tokio::spawn(async move {
let decision = response_rx
.await
.unwrap_or(crate::http::context::PermissionDecision::Deny);
let _ = agg_tx2.send((req_id, decision));
});
}
// ── Event-push WebSocket handler ─────────────────────────────────────────────