huskies: merge 899

This commit is contained in:
dave
2026-05-12 23:11:34 +00:00
parent 0f0cf59329
commit cd214d7246
9 changed files with 1105 additions and 218 deletions
+144 -27
View File
@@ -7,20 +7,56 @@ use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
/// A single project entry in `projects.toml`.
///
/// Phase 2 (story 899): `url` is now optional — a project served exclusively
/// via the sled-uplink WebSocket does not need an HTTP base URL. The `url`
/// field is deprecated for removal in a future release; configure
/// `auth_token` instead and rely on the WS uplink for all traffic.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ProjectEntry {
/// Base URL of the project's huskies container (e.g. `http://localhost:3001`).
pub url: String,
///
/// **Deprecated** (story 899) — when a sled connects via the uplink WS the
/// gateway routes all MCP traffic over that connection instead. The URL is
/// used as a fallback when no live uplink exists. Omit for WS-only projects.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub url: Option<String>,
/// Shared-secret token used to authenticate this project's sled when it
/// connects to `/api/sled-uplink`. Takes precedence over the top-level
/// `[sled_tokens]` table for projects that set this field.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub auth_token: Option<String>,
}
impl ProjectEntry {
    /// Build an entry that carries only an HTTP base URL and no uplink token.
    ///
    /// Handy in tests and for `projects.toml` files that have not yet been
    /// migrated to the WS-uplink model.
    pub fn with_url(url: impl Into<String>) -> Self {
        let base_url: String = url.into();
        Self {
            auth_token: None,
            url: Some(base_url),
        }
    }

    /// Reports whether a non-empty HTTP base URL is configured for this entry.
    ///
    /// Both a missing `url` and an empty string count as "no URL".
    pub fn has_url(&self) -> bool {
        matches!(self.url.as_deref(), Some(base) if !base.is_empty())
    }
}
/// Top-level `projects.toml` config.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct GatewayConfig {
/// Map of project name → container URL.
/// Map of project name → container configuration.
#[serde(default)]
pub projects: BTreeMap<String, ProjectEntry>,
/// Map of sled_id → shared secret token for sled-uplink authentication.
///
/// **Deprecated** (story 899) — move tokens into per-project
/// `auth_token` fields instead. The gateway still honours entries here for
/// one release to provide a smooth migration window.
///
/// Each entry allows a sled identified by `sled_id` to connect to
/// `/api/sled-uplink` using the given secret token as a bearer credential.
#[serde(default)]
@@ -40,18 +76,22 @@ pub fn validate_config(config: &GatewayConfig) -> Result<String, String> {
/// Validate that a project name exists in the given project map.
///
/// Returns the project's URL on success.
/// Returns the project's URL (may be empty for WS-uplink-only projects) on
/// success.
pub fn validate_project_exists(
projects: &BTreeMap<String, ProjectEntry>,
name: &str,
) -> Result<String, String> {
projects.get(name).map(|p| p.url.clone()).ok_or_else(|| {
let available: Vec<&str> = projects.keys().map(|s| s.as_str()).collect();
format!(
"unknown project '{name}'. Available: {}",
available.join(", ")
)
})
projects
.get(name)
.map(|p| p.url.clone().unwrap_or_default())
.ok_or_else(|| {
let available: Vec<&str> = projects.keys().map(|s| s.as_str()).collect();
format!(
"unknown project '{name}'. Available: {}",
available.join(", ")
)
})
}
/// Escape a string as a TOML quoted string.
@@ -104,8 +144,29 @@ url = "http://localhost:3002"
"#;
let config: GatewayConfig = toml::from_str(toml_str).unwrap();
assert_eq!(config.projects.len(), 2);
assert_eq!(config.projects["huskies"].url, "http://localhost:3001");
assert_eq!(config.projects["robot-studio"].url, "http://localhost:3002");
assert_eq!(
config.projects["huskies"].url.as_deref(),
Some("http://localhost:3001")
);
assert_eq!(
config.projects["robot-studio"].url.as_deref(),
Some("http://localhost:3002")
);
}
#[test]
fn parse_project_without_url_is_valid() {
    // A project table that sets only `auth_token` must deserialise, leaving
    // `url` unset.
    let source = r#"
[projects.ws-only]
auth_token = "secret"
"#;
    let parsed: GatewayConfig = toml::from_str(source).unwrap();
    assert_eq!(parsed.projects.len(), 1);

    let entry = &parsed.projects["ws-only"];
    assert!(entry.url.is_none());
    assert_eq!(entry.auth_token.as_deref(), Some("secret"));
}
#[test]
@@ -127,18 +188,8 @@ url = "http://localhost:3002"
#[test]
fn validate_config_returns_first_project_name() {
let mut projects = BTreeMap::new();
projects.insert(
"beta".into(),
ProjectEntry {
url: "http://b".into(),
},
);
projects.insert(
"alpha".into(),
ProjectEntry {
url: "http://a".into(),
},
);
projects.insert("beta".into(), ProjectEntry::with_url("http://b"));
projects.insert("alpha".into(), ProjectEntry::with_url("http://a"));
let config = GatewayConfig {
projects,
sled_tokens: BTreeMap::new(),
@@ -147,14 +198,26 @@ url = "http://localhost:3002"
}
#[test]
fn validate_project_exists_succeeds() {
fn validate_config_accepts_ws_only_project() {
let mut projects = BTreeMap::new();
projects.insert(
"p1".into(),
"ws-only".into(),
ProjectEntry {
url: "http://p1".into(),
url: None,
auth_token: Some("secret".into()),
},
);
let config = GatewayConfig {
projects,
sled_tokens: BTreeMap::new(),
};
assert!(validate_config(&config).is_ok());
}
#[test]
fn validate_project_exists_succeeds() {
let mut projects = BTreeMap::new();
projects.insert("p1".into(), ProjectEntry::with_url("http://p1"));
assert_eq!(
validate_project_exists(&projects, "p1").unwrap(),
"http://p1"
@@ -167,6 +230,36 @@ url = "http://localhost:3002"
assert!(validate_project_exists(&projects, "missing").is_err());
}
#[test]
fn validate_project_exists_ws_only_returns_empty_url() {
    // A project without a URL is still a known project; the lookup yields "".
    let ws_entry = ProjectEntry {
        url: None,
        auth_token: Some("tok".into()),
    };
    let projects = BTreeMap::from([("ws".to_string(), ws_entry)]);

    let url = validate_project_exists(&projects, "ws").unwrap();
    assert_eq!(url, "");
}
#[test]
fn project_entry_with_url_constructor() {
    // `with_url` fills in the URL and leaves the uplink token unset.
    let entry = ProjectEntry::with_url("http://example.com");
    assert_eq!(entry.url, Some("http://example.com".to_string()));
    assert_eq!(entry.auth_token, None);
    assert!(entry.has_url());
}
#[test]
fn project_entry_has_url_false_when_none() {
    // A token-only (WS-uplink) entry reports that it has no URL.
    let ws_only = ProjectEntry {
        auth_token: Some("tok".into()),
        url: None,
    };
    assert!(!ws_only.has_url());
}
#[test]
fn toml_string_escapes_quotes() {
assert_eq!(toml_string(r#"a"b"#), r#""a\"b""#);
@@ -198,4 +291,28 @@ url = "http://localhost:3002"
assert!(content.contains("transport = \"slack\""));
assert!(content.contains("slack_bot_token = \"xoxb-123\""));
}
#[test]
fn roundtrip_project_entry_with_auth_token() {
    // Serialise a config whose project has both fields set, parse it back,
    // and check that neither field was lost.
    let original = GatewayConfig {
        projects: BTreeMap::from([(
            "myproj".to_string(),
            ProjectEntry {
                url: Some("http://a:3001".into()),
                auth_token: Some("mysecret".into()),
            },
        )]),
        sled_tokens: BTreeMap::new(),
    };

    let serialised = toml::to_string_pretty(&original).unwrap();
    let restored: GatewayConfig = toml::from_str(&serialised).unwrap();

    let entry = &restored.projects["myproj"];
    assert_eq!(entry.url.as_deref(), Some("http://a:3001"));
    assert_eq!(entry.auth_token.as_deref(), Some("mysecret"));
}
}
-23
View File
@@ -140,29 +140,6 @@ pub async fn proxy_mcp_call_sse(
.map_err(|e| format!("failed to reach {mcp_url}: {e}"))
}
/// Fetch tools/list from a project's MCP endpoint.
pub async fn fetch_tools_list(client: &Client, base_url: &str) -> Result<Value, String> {
let mcp_url = format!("{}/mcp", base_url.trim_end_matches('/'));
let rpc_body = json!({
"jsonrpc": "2.0",
"id": 1,
"method": "tools/list",
"params": {}
});
let resp = client
.post(&mcp_url)
.json(&rpc_body)
.send()
.await
.map_err(|e| format!("failed to reach {mcp_url}: {e}"))?;
resp.json()
.await
.map_err(|e| format!("invalid JSON from upstream: {e}"))
}
/// Fetch and aggregate pipeline status for a single project URL.
pub async fn fetch_one_project_pipeline_status(url: &str, client: &Client) -> Value {
let mcp_url = format!("{}/mcp", url.trim_end_matches('/'));
+232 -17
View File
@@ -28,6 +28,7 @@ use io::Client;
use std::collections::{BTreeMap, HashMap};
use std::path::PathBuf;
use std::sync::Arc;
use std::sync::atomic::{AtomicI64, Ordering};
use tokio::sync::Mutex as TokioMutex;
use tokio::sync::RwLock;
use tokio::sync::mpsc;
@@ -49,6 +50,95 @@ pub struct GatewayStatusEvent {
pub event: crate::service::events::StoredEvent,
}
// ── Sled connection ─────────────────────────────────────────────────────────
/// Maximum age, in milliseconds, of a sled heartbeat before the gateway
/// considers the connection stale (story 899 AC 3).
///
/// The value is 30 seconds.
pub const HEARTBEAT_MAX_AGE_MS: i64 = 30_000;
/// Default per-request timeout, in milliseconds, for an MCP call proxied over
/// a sled uplink WebSocket.
///
/// The value is 1,200,000 ms, i.e. 20 minutes. The existing reqwest-based path
/// has no explicit cap; the WS path uses a generous bound instead so
/// long-running tools (e.g. `run_tests`) still complete.
pub const MCP_VIA_WS_TIMEOUT_MS: u64 = 1_200_000;
/// Handle to a sled currently connected to the gateway via the uplink WebSocket.
///
/// Created by the `/api/sled-uplink` WS handler on connect and stored in
/// [`GatewayState::sled_connections`]. The gateway's MCP proxy reads this
/// to forward requests over the live connection rather than via HTTP.
///
/// The handle is `Clone`: `last_heartbeat_ms` and `in_flight` are held behind
/// `Arc`s, so every clone observes the same heartbeat timestamp and the same
/// pending-request map.
#[derive(Clone)]
pub struct SledConnection {
/// Sender side of the channel the WS handler reads to forward outgoing
/// frames (e.g. `mcp_request`) to the sled.
pub tx: mpsc::UnboundedSender<crate::sled_uplink::UplinkEnvelope>,
/// Timestamp (ms since Unix epoch) of the last `heartbeat` frame received
/// from this sled. Updated atomically by the WS handler task and compared
/// against the current time by [`SledConnection::is_alive`].
pub last_heartbeat_ms: Arc<AtomicI64>,
/// In-flight MCP requests waiting for a matching `mcp_response` from this
/// sled. Keyed by `req_id`; the oneshot sender is resolved when the
/// response arrives. `proxy_mcp_via_ws` inserts an entry before sending
/// and removes it again if the send fails or the call times out.
pub in_flight:
Arc<TokioMutex<HashMap<String, tokio::sync::oneshot::Sender<serde_json::Value>>>>,
}
impl SledConnection {
    /// Returns `true` if a heartbeat has been received within the last
    /// `max_age_ms` milliseconds.
    ///
    /// The age is the current UTC time minus the stored heartbeat timestamp,
    /// both in milliseconds since the Unix epoch. A timestamp that lies in
    /// the future gives a negative age and therefore counts as alive.
    pub fn is_alive(&self, max_age_ms: i64) -> bool {
        let last = self.last_heartbeat_ms.load(Ordering::Relaxed);
        let now = chrono::Utc::now().timestamp_millis();
        // Saturate instead of overflowing: with an extreme stored value (for
        // example an `i64::MIN` "never seen" sentinel) a plain `now - last`
        // panics in debug builds and wraps to a negative age in release
        // builds, which would report a dead connection as alive.
        now.saturating_sub(last) <= max_age_ms
    }
}
/// Forward a raw MCP request body to a sled over its uplink WebSocket and
/// hand back the response as serialised JSON bytes.
///
/// Steps, in order:
/// 1. generate a fresh correlation id and check the body is valid UTF-8;
/// 2. register a oneshot sender under that id in the connection's in-flight map;
/// 3. send an `mcp_request` envelope carrying the body as a string;
/// 4. wait for the oneshot to resolve, for at most [`MCP_VIA_WS_TIMEOUT_MS`].
///
/// The in-flight entry is removed again when the send fails or the wait
/// times out.
///
/// # Errors
///
/// Returns a descriptive `String` when the body is not UTF-8, the uplink
/// channel is closed, the response sender is dropped without a reply, the
/// call times out, or the response value cannot be serialised.
pub async fn proxy_mcp_via_ws(
    conn: &SledConnection,
    request_bytes: &[u8],
) -> Result<Vec<u8>, String> {
    let correlation_id = uuid::Uuid::new_v4().to_string();
    let body = match std::str::from_utf8(request_bytes) {
        Ok(text) => text.to_owned(),
        Err(e) => return Err(format!("non-utf8 mcp request body: {e}")),
    };

    // Register the reply slot before sending so a fast response cannot
    // arrive ahead of its registration.
    let (reply_tx, reply_rx) = tokio::sync::oneshot::channel();
    {
        let mut pending = conn.in_flight.lock().await;
        pending.insert(correlation_id.clone(), reply_tx);
    }

    let envelope = crate::sled_uplink::UplinkEnvelope {
        msg_type: "mcp_request".to_string(),
        req_id: correlation_id.clone(),
        payload: serde_json::json!({ "body": body }),
    };
    if conn.tx.send(envelope).is_err() {
        conn.in_flight.lock().await.remove(&correlation_id);
        return Err("sled uplink connection closed".to_string());
    }

    let deadline = std::time::Duration::from_millis(MCP_VIA_WS_TIMEOUT_MS);
    let Ok(outcome) = tokio::time::timeout(deadline, reply_rx).await else {
        // Timed out: nobody will consume the reply slot any more.
        conn.in_flight.lock().await.remove(&correlation_id);
        return Err(format!(
            "mcp call to sled timed out after {MCP_VIA_WS_TIMEOUT_MS} ms"
        ));
    };
    let response = outcome.map_err(|_| "sled response channel dropped".to_string())?;
    serde_json::to_vec(&response).map_err(|e| format!("serialise mcp_response: {e}"))
}
// ── Error type ──────────────────────────────────────────────────────────────
/// Typed errors returned by `service::gateway` functions.
@@ -135,11 +225,18 @@ pub struct GatewayState {
/// The Matrix bot's `permission_listener` holds this locked for its lifetime;
/// the sled-uplink WS handler sends requests via `perm_tx`.
pub perm_rx: Arc<TokioMutex<mpsc::UnboundedReceiver<PermissionForward>>>,
/// Reversed sled-token map: token → sled_id.
/// Reversed sled-token map: token → project_name (sled_id).
///
/// Built at startup from [`GatewayConfig::sled_tokens`] (which maps
/// sled_id → token). The handler looks up incoming tokens in O(1).
/// Built at startup from both [`GatewayConfig::sled_tokens`] AND the
/// per-project `auth_token` field (story 899). The handler looks up
/// incoming tokens in O(1) to identify the project the sled represents.
pub sled_tokens: HashMap<String, String>,
/// Live sled connections keyed by project name.
///
/// Populated by the `/api/sled-uplink` WS handler when a sled authenticates
/// and depopulated when it disconnects. MCP proxy functions check here
/// first (WS route), falling back to HTTP when no live connection exists.
pub sled_connections: Arc<RwLock<HashMap<String, SledConnection>>>,
}
impl GatewayState {
@@ -160,11 +257,21 @@ impl GatewayState {
.unwrap_or(first_from_config);
let (event_tx, _) = tokio::sync::broadcast::channel(EVENT_CHANNEL_CAPACITY);
let (perm_tx, perm_rx) = mpsc::unbounded_channel::<PermissionForward>();
let sled_tokens: HashMap<String, String> = gateway_config
// Build token→project_name map from two sources:
// 1. Legacy top-level [sled_tokens] section (sled_id → token, reversed)
// 2. Per-project auth_token fields (project_name → token, reversed)
let mut sled_tokens: HashMap<String, String> = gateway_config
.sled_tokens
.iter()
.map(|(sled_id, token)| (token.clone(), sled_id.clone()))
.collect();
for (project_name, entry) in &gateway_config.projects {
if let Some(ref token) = entry.auth_token {
sled_tokens.insert(token.clone(), project_name.clone());
}
}
Ok(Self {
projects: Arc::new(RwLock::new(gateway_config.projects)),
active_project: Arc::new(RwLock::new(first)),
@@ -178,26 +285,78 @@ impl GatewayState {
perm_tx,
perm_rx: Arc::new(TokioMutex::new(perm_rx)),
sled_tokens,
sled_connections: Arc::new(RwLock::new(HashMap::new())),
})
}
/// Get the URL of the currently active project.
/// Get the URL of the currently active project, if one is configured.
///
/// Returns `Err` when the active project has no URL configured (WS-uplink
/// only) or the project name is not found.
pub async fn active_url(&self) -> Result<String, Error> {
let name = self.active_project.read().await.clone();
self.projects
.read()
.await
.get(&name)
.map(|p| p.url.clone())
.and_then(|p| p.url.clone())
.ok_or_else(|| {
Error::ProjectNotFound(format!("active project '{name}' not found in config"))
Error::ProjectNotFound(format!(
"active project '{name}' has no URL configured \
(use sled-uplink WS or add url to projects.toml)"
))
})
}
/// Record `conn` as the live sled connection for `project_name`.
///
/// Any connection previously stored under the same name is replaced.
pub async fn register_sled_connection(&self, project_name: String, conn: SledConnection) {
    let mut live = self.sled_connections.write().await;
    live.insert(project_name, conn);
}
/// Drop the stored sled connection for `project_name`, e.g. on disconnect.
///
/// Does nothing when no connection is registered under that name.
pub async fn deregister_sled_connection(&self, project_name: &str) {
    let mut live = self.sled_connections.write().await;
    live.remove(project_name);
}
/// Fetch a clone of the live sled connection for the active project,
/// provided its heartbeat is recent enough.
///
/// Yields `None` when no sled is registered for the active project or when
/// the last heartbeat is older than [`HEARTBEAT_MAX_AGE_MS`].
pub async fn active_sled_connection(&self) -> Option<SledConnection> {
    let active = self.active_project.read().await.clone();
    let candidate = {
        let live = self.sled_connections.read().await;
        live.get(&active).cloned()
    };
    candidate.filter(|c| c.is_alive(HEARTBEAT_MAX_AGE_MS))
}
/// Send an MCP request to the active project and return the raw response
/// body bytes, ready to be relayed to the caller.
///
/// A live sled-uplink WebSocket is preferred (story 899 AC 2). When none is
/// available the request goes over HTTP to the project's configured URL, and
/// the fallback is logged.
///
/// # Errors
///
/// Returns the error from the WS proxy, the stringified `active_url` error
/// when no URL is available for the fallback, or the error from the HTTP
/// proxy call.
pub async fn proxy_active_mcp(&self, bytes: &[u8]) -> Result<Vec<u8>, String> {
    match self.active_sled_connection().await {
        Some(uplink) => proxy_mcp_via_ws(&uplink, bytes).await,
        None => {
            let base_url = self.active_url().await.map_err(|e| e.to_string())?;
            crate::slog!(
                "[gateway] MCP proxy: WS uplink unavailable, falling back to HTTP \
                 (deprecated, will be removed once all sleds are WS-only)"
            );
            crate::service::gateway::io::proxy_mcp_call(&self.client, &base_url, bytes).await
        }
    }
}
}
// ── Public API ──────────────────────────────────────────────────────────────
/// Switch the active project. Returns the project's URL on success.
/// Switch the active project. Returns the project's URL (empty for WS-only projects).
///
/// Writes the new active project to the CRDT `gateway_config.active_project`
/// register (LWW — last write wins) so the selection is persisted across
@@ -358,7 +517,7 @@ pub async fn add_project(state: &GatewayState, name: &str, url: &str) -> Result<
"project '{name}' already exists"
)));
}
projects.insert(name.clone(), ProjectEntry { url: url.clone() });
projects.insert(name.clone(), ProjectEntry::with_url(&url));
}
let snapshot = state.projects.read().await.clone();
@@ -441,7 +600,7 @@ pub async fn init_project(
"project '{n}' is already registered. Choose a different name or use switch_project."
)));
}
projects.insert(n.to_string(), ProjectEntry { url: u.to_string() });
projects.insert(n.to_string(), ProjectEntry::with_url(u));
io::save_config(&projects, &state.config_dir).await;
crate::slog!("[gateway] init_project: registered '{n}' ({u})");
Some(n.to_string())
@@ -494,7 +653,7 @@ pub async fn save_bot_config_and_restart(state: &GatewayState, content: &str) ->
.read()
.await
.iter()
.map(|(name, entry)| (name.clone(), entry.url.clone()))
.filter_map(|(name, entry)| entry.url.as_ref().map(|u| (name.clone(), u.clone())))
.collect();
let (new_handle, new_shutdown_tx) = io::spawn_gateway_bot(
@@ -523,12 +682,7 @@ mod tests {
fn make_config(names: &[(&str, &str)]) -> GatewayConfig {
let mut projects = BTreeMap::new();
for (name, url) in names {
projects.insert(
name.to_string(),
ProjectEntry {
url: url.to_string(),
},
);
projects.insert(name.to_string(), ProjectEntry::with_url(*url));
}
GatewayConfig {
projects,
@@ -584,6 +738,24 @@ mod tests {
assert_eq!(url, "http://my:3001");
}
#[tokio::test]
async fn active_url_fails_for_ws_only_project() {
    // With a single project that has a token but no URL, asking for the
    // active URL must be an error.
    let ws_only = ProjectEntry {
        url: None,
        auth_token: Some("tok".into()),
    };
    let config = GatewayConfig {
        projects: BTreeMap::from([("ws-proj".to_string(), ws_only)]),
        sled_tokens: BTreeMap::new(),
    };
    let state = GatewayState::new(config, PathBuf::from("."), 3000).unwrap();

    let lookup = state.active_url().await;
    assert!(lookup.is_err());
}
#[test]
fn error_display_variants() {
assert!(
@@ -682,4 +854,47 @@ mod tests {
let result = init_project(&state, dir.path().to_str().unwrap(), None, None).await;
assert!(result.is_err());
}
#[tokio::test]
async fn sled_connection_registration_and_lookup() {
    // Registering a connection makes it visible in the map; deregistering
    // removes it again.
    let state = GatewayState::new(
        make_config(&[("myproj", "http://myproj:3001")]),
        PathBuf::new(),
        3000,
    )
    .unwrap();

    let (uplink_tx, _uplink_rx) = mpsc::unbounded_channel();
    let now_ms = chrono::Utc::now().timestamp_millis();
    let conn = SledConnection {
        tx: uplink_tx,
        last_heartbeat_ms: Arc::new(AtomicI64::new(now_ms)),
        in_flight: Arc::new(TokioMutex::new(HashMap::new())),
    };

    state
        .register_sled_connection("myproj".to_string(), conn)
        .await;
    let present_after_register = state.sled_connections.read().await.contains_key("myproj");
    assert!(present_after_register);

    state.deregister_sled_connection("myproj").await;
    let present_after_deregister = state.sled_connections.read().await.contains_key("myproj");
    assert!(!present_after_deregister);
}
#[tokio::test]
async fn auth_token_in_project_entry_populates_sled_tokens_map() {
    // A per-project `auth_token` must appear in the reversed token map,
    // pointing back at the project name.
    let entry = ProjectEntry {
        url: Some("http://huskies:3001".into()),
        auth_token: Some("secret-token".into()),
    };
    let config = GatewayConfig {
        projects: BTreeMap::from([("huskies".to_string(), entry)]),
        sled_tokens: BTreeMap::new(),
    };
    let state = GatewayState::new(config, PathBuf::new(), 3000).unwrap();

    let owner = state.sled_tokens.get("secret-token").map(String::as_str);
    assert_eq!(
        owner,
        Some("huskies"),
        "Per-project auth_token must be in reversed sled_tokens map"
    );
}
}