huskies: merge 760

This commit is contained in:
dave
2026-04-28 00:17:44 +00:00
parent 63ce7b9ec3
commit d1a2393b32
4 changed files with 221 additions and 3 deletions
+7 -3
View File
@@ -12,9 +12,9 @@ use std::sync::Arc;
// Re-export public types that callers reference as `crate::gateway::*`.
pub use crate::service::gateway::{
GatewayConfig, GatewayState as GatewayStateType, JoinedAgent, ProjectEntry,
fetch_all_project_pipeline_statuses, format_aggregate_status_compact,
spawn_gateway_notification_poller,
GatewayConfig, GatewayState as GatewayStateType, GatewayStatusEvent, JoinedAgent, ProjectEntry,
broadcast_status_event, fetch_all_project_pipeline_statuses, format_aggregate_status_compact,
spawn_gateway_notification_poller, subscribe_status_events,
};
/// Build the complete gateway route tree.
@@ -70,6 +70,10 @@ pub fn build_gateway_route(state_arc: Arc<GatewayState>) -> impl poem::Endpoint
"/gateway/agents/:id/heartbeat",
poem::post(gateway_heartbeat_handler),
)
.at(
"/gateway/events/push",
poem::get(gateway_event_push_handler),
)
// Serve the embedded React frontend so the gateway has a UI.
.at(
"/assets/*path",
+116
View File
@@ -4,9 +4,12 @@
//! the response. No inline business logic, no `reqwest`, no filesystem access.
use crate::service::gateway::{self, GatewayState};
use futures::StreamExt;
use poem::handler;
use poem::http::StatusCode;
use poem::web::Path as PoemPath;
use poem::web::Query;
use poem::web::websocket::{Message as WsMessage, WebSocket};
use poem::web::{Data, Json};
use poem::{Body, Request, Response};
use serde::{Deserialize, Serialize};
@@ -631,6 +634,119 @@ pub async fn gateway_heartbeat_handler(
}
}
// ── Event-push WebSocket handler ────────────────────────────────────────────
/// Query parameters accepted on the `/gateway/events/push` WebSocket upgrade.
///
/// Both fields deserialize as `Option`; the handler matches on them and
/// answers a missing or empty value with its own status code and message.
#[derive(Deserialize)]
struct EventPushQueryParams {
/// One-time join token generated by `POST /gateway/tokens`.
/// The handler responds `401 Unauthorized` when this is absent or empty.
token: Option<String>,
/// The project name this node represents (e.g. `"huskies"`).
/// The handler responds `400 Bad Request` when this is absent or empty.
project: Option<String>,
}
/// `GET /gateway/events/push` — WebSocket endpoint for project nodes to push
/// [`crate::service::events::StoredEvent`] frames to the gateway.
///
/// # Authentication
///
/// The connecting node must supply a valid one-time join token via the `token`
/// query parameter, obtained from `POST /gateway/tokens`, together with a
/// non-empty `project` query parameter. Everything is checked *before* the
/// WebSocket upgrade, in this order:
///
/// - missing or empty `token` → `401 Unauthorized`
/// - missing or empty `project` → `400 Bad Request`
/// - unknown or already-used token → `401 Unauthorized`
///
/// The token is removed from the pending set as soon as it validates, i.e.
/// immediately before the upgrade is attempted — the connection itself is
/// then kept open indefinitely.
///
/// # Protocol
///
/// Each text message from the project node must be a JSON-encoded
/// [`crate::service::events::StoredEvent`]. The gateway fans the event out
/// (tagged with the project name) to all current local subscribers.
/// A text frame that fails to parse is logged and skipped without closing
/// the connection; non-text frames other than `Close` are ignored. A `Close`
/// frame or a stream error ends the session.
///
/// The server does not send data back; clients should treat any close frame
/// as a signal to reconnect with exponential back-off (see docs/gateway-protocol.html).
///
/// # Reconnect-with-backoff
///
/// Project nodes MUST reconnect on disconnect. Recommended policy:
///
/// - Initial retry delay: **1 s**
/// - Back-off multiplier: **2×** per attempt
/// - Max delay cap: **60 s**
/// - Jitter: add ±10 % to the delay to avoid thundering herds
#[handler]
pub async fn gateway_event_push_handler(
    ws: WebSocket,
    state: Data<&Arc<GatewayState>>,
    Query(params): Query<EventPushQueryParams>,
) -> poem::Response {
    // ── Authentication (pre-upgrade) ─────────────────────────────────────
    let token = match params.token {
        Some(t) if !t.is_empty() => t,
        _ => {
            return poem::Response::builder()
                .status(StatusCode::UNAUTHORIZED)
                .body("token query parameter required");
        }
    };
    let project = match params.project {
        Some(p) if !p.is_empty() => p,
        _ => {
            return poem::Response::builder()
                .status(StatusCode::BAD_REQUEST)
                .body("project query parameter required");
        }
    };

    // Validate and consume the one-time token in a single map operation:
    // `remove` yields `None` exactly when the token was never issued or has
    // already been used. The write guard is a temporary of this statement,
    // so the lock is released before any response is built.
    let token_was_pending = state.pending_tokens.write().await.remove(&token).is_some();
    if !token_was_pending {
        return poem::Response::builder()
            .status(StatusCode::UNAUTHORIZED)
            .body("invalid or already-used join token");
    }

    // ── WebSocket upgrade ────────────────────────────────────────────────
    use poem::IntoResponse as _;
    // The upgrade closure outlives this request, so it needs its own handle
    // on the shared gateway state.
    let state = Arc::clone(&state);
    ws.on_upgrade(move |socket| async move {
        // Only the receiving half is used; the server never sends frames.
        let (_, mut stream) = socket.split();
        crate::slog!(
            "[gateway] Project node '{}' connected to event-push endpoint",
            project
        );
        while let Some(msg) = stream.next().await {
            let text = match msg {
                Ok(WsMessage::Text(t)) => t,
                Ok(WsMessage::Close(_)) | Err(_) => break,
                // Ping/pong/binary frames carry no events.
                _ => continue,
            };
            match serde_json::from_str::<crate::service::events::StoredEvent>(&text) {
                Ok(event) => {
                    gateway::broadcast_status_event(&state, project.clone(), event);
                }
                Err(e) => {
                    // A malformed frame is logged but does not end the session.
                    crate::slog!(
                        "[gateway] event-push: invalid frame from '{}': {e}",
                        project
                    );
                }
            }
        }
        crate::slog!(
            "[gateway] Project node '{}' disconnected from event-push endpoint",
            project
        );
    })
    .into_response()
}
// ── Health handler ──────────────────────────────────────────────────────────
/// HTTP GET `/health` handler for the gateway.
+47
View File
@@ -26,6 +26,21 @@ use std::sync::Arc;
use tokio::sync::Mutex as TokioMutex;
use tokio::sync::RwLock;
// ── Status event broadcaster ────────────────────────────────────────────────
/// Capacity of the gateway status event broadcast channel.
///
/// Passed to `tokio::sync::broadcast::channel` when the gateway state is
/// constructed. A subscriber that falls more than this many events behind
/// observes a `Lagged` error on its next receive instead of the dropped events.
const EVENT_CHANNEL_CAPACITY: usize = 64;
/// A status event pushed by a project node and fanned out to all local
/// subscribers (e.g. the Web UI, notification forwarders).
///
/// This is the item type carried by the gateway's broadcast channel; it pairs
/// the raw event with the name of the project that sent it.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
pub struct GatewayStatusEvent {
/// The project name that emitted this event.
pub project: String,
/// The pipeline event payload.
pub event: crate::service::events::StoredEvent,
}
// ── Error type ──────────────────────────────────────────────────────────────
/// Typed errors returned by `service::gateway` functions.
@@ -93,6 +108,10 @@ pub struct GatewayState {
pub port: u16,
/// Abort handle for the running Matrix bot task (if any).
pub bot_handle: Arc<TokioMutex<Option<tokio::task::AbortHandle>>>,
/// Broadcast sender for [`GatewayStatusEvent`]s pushed by project nodes.
///
/// Call `event_tx.subscribe()` to obtain a receiver for outbound fan-out.
pub event_tx: tokio::sync::broadcast::Sender<GatewayStatusEvent>,
}
impl GatewayState {
@@ -107,6 +126,7 @@ impl GatewayState {
) -> Result<Self, String> {
let first = config::validate_config(&gateway_config)?;
let agents = io::load_agents(&config_dir);
let (event_tx, _) = tokio::sync::broadcast::channel(EVENT_CHANNEL_CAPACITY);
Ok(Self {
projects: Arc::new(RwLock::new(gateway_config.projects)),
active_project: Arc::new(RwLock::new(first)),
@@ -116,6 +136,7 @@ impl GatewayState {
config_dir,
port,
bot_handle: Arc::new(TokioMutex::new(None)),
event_tx,
})
}
@@ -380,6 +401,32 @@ pub async fn health_check_all(state: &GatewayState) -> (bool, BTreeMap<String, &
(all_healthy, statuses)
}
/// Fan a status event received from a project node out to every local subscriber.
///
/// The `event` is wrapped together with its originating `project` name in a
/// [`GatewayStatusEvent`] and sent on `state.event_tx`.
///
/// Returns the number of receivers subscribed at send time. A return value of
/// zero means the send failed because no subscribers are currently connected.
pub fn broadcast_status_event(
    state: &GatewayState,
    project: String,
    event: crate::service::events::StoredEvent,
) -> usize {
    match state.event_tx.send(GatewayStatusEvent { project, event }) {
        Ok(receiver_count) => receiver_count,
        // A failed send is reported as zero deliveries rather than an error.
        Err(_) => 0,
    }
}
/// Subscribe to the gateway's status event stream.
///
/// Returns a new broadcast receiver, obtained from `state.event_tx`, that
/// yields [`GatewayStatusEvent`]s as project nodes push them. Only events
/// sent after this call are delivered to the returned receiver.
///
/// If the receiver falls behind by more than the channel capacity
/// (`EVENT_CHANNEL_CAPACITY` events), its next receive returns
/// [`tokio::sync::broadcast::error::RecvError::Lagged`]. The skipped events
/// are lost; callers should treat that error as non-fatal and keep receiving.
pub fn subscribe_status_events(
state: &GatewayState,
) -> tokio::sync::broadcast::Receiver<GatewayStatusEvent> {
state.event_tx.subscribe()
}
/// Save bot config and restart the bot.
pub async fn save_bot_config_and_restart(state: &GatewayState, content: &str) -> Result<(), Error> {
io::write_bot_config(&state.config_dir, content).map_err(Error::Config)?;
+51
View File
@@ -146,6 +146,57 @@ HUSKIES_PORT=3002 huskies</code></pre>
</tbody>
</table>
<h2>Gateway event-push protocol</h2>
<p>Project nodes can push pipeline status events to the gateway in real time over a WebSocket connection. The gateway fans each event out to all connected local subscribers.</p>
<h3>Connecting</h3>
<ol>
<li>Obtain a one-time join token: <code>POST /gateway/tokens</code> returns <code>{"token":"…"}</code></li>
<li>Open a WebSocket upgrade to <code>GET /gateway/events/push?token=TOKEN&amp;project=PROJECT_NAME</code></li>
<li>The token is consumed on upgrade. The project name is attached to every event the server broadcasts downstream.</li>
</ol>
<h3>Sending events</h3>
<p>Each message must be a JSON-encoded <code>StoredEvent</code> frame:</p>
<pre><code>// Stage transition
{"type":"stage_transition","story_id":"42_story_login","from_stage":"2_current","to_stage":"3_qa","timestamp_ms":1700000000000}
// Merge failure
{"type":"merge_failure","story_id":"42_story_login","reason":"conflict in src/main.rs","timestamp_ms":1700000001000}
// Story blocked
{"type":"story_blocked","story_id":"42_story_login","reason":"retry limit exceeded","timestamp_ms":1700000002000}</code></pre>
<p>The server does not send frames back. Any frame the project node does receive indicates an error or a server restart — treat it as a disconnect signal.</p>
<h3>Reconnect with exponential back-off</h3>
<p>Project nodes <strong>must</strong> reconnect on any disconnect. Use the following policy to avoid thundering herds after a gateway restart:</p>
<table>
<thead><tr><th>Parameter</th><th>Value</th></tr></thead>
<tbody>
<tr><td>Initial delay</td><td>1 s</td></tr>
<tr><td>Back-off multiplier</td><td>2× per attempt</td></tr>
<tr><td>Maximum delay</td><td>60 s</td></tr>
<tr><td>Jitter</td><td>±10 % of the computed delay</td></tr>
</tbody>
</table>
<p>Pseudocode:</p>
<pre><code>delay = 1.0       // seconds
max_delay = 60.0
loop:
    token = POST /gateway/tokens
    connect ws://GATEWAY_HOST:PORT/gateway/events/push?token=TOKEN&amp;project=NAME
    if connected:
        delay = 1.0   // reset the back-off after a successful connection
    while connected:
        send StoredEvent frames
    // disconnected — wait and retry
    jitter = delay * (random(0.9, 1.1))
    sleep(min(jitter, max_delay))
    delay = min(delay * 2, max_delay)</code></pre>
<div class="note">
<strong>New token per connection:</strong> Each WebSocket upgrade consumes the join token. Request a fresh token for every reconnect attempt.
</div>
<h2>Building from source</h2>
<h3>Standard release build</h3>
<pre><code>cargo build --release