feat: add noisebell observability

This commit is contained in:
Jet 2026-05-27 20:09:44 -07:00
parent b57927a395
commit e6c1b82679
No known key found for this signature in database
24 changed files with 2289 additions and 137 deletions

View file

@ -1,12 +1,14 @@
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Duration;
use std::time::{Duration, Instant};
use anyhow::{Context, Result};
use axum::extract::State;
use axum::http::{HeaderMap, StatusCode};
use axum::http::{header, HeaderMap, StatusCode};
use axum::response::{IntoResponse, Response};
use axum::routing::{get, post};
use axum::{Json, Router};
use noisebell_common::{validate_bearer, WebhookPayload};
use noisebell_common::{validate_bearer, WebhookPayload, PROMETHEUS_CONTENT_TYPE};
use tracing::{error, info, warn};
#[derive(Clone)]
@ -17,6 +19,109 @@ struct AppState {
target_secret: Option<String>,
retry_attempts: u32,
retry_base_delay_secs: u64,
metrics: Arc<RelayMetrics>,
}
/// In-memory counters and "last seen" gauges describing the relay's forwarding activity.
///
/// All mutable values are `AtomicU64`s so the struct can be updated through a shared
/// reference. A value of `0` in a timestamp field means the event has not been recorded yet.
#[derive(Debug)]
struct RelayMetrics {
    /// Unix timestamp (seconds) captured when the metrics were constructed.
    process_start_time: u64,

    // Monotonic counters.
    /// Inbound webhooks counted as received.
    received_total: AtomicU64,
    /// Webhooks forwarded successfully.
    forwarded_total: AtomicU64,
    /// Individual forward attempts that failed (incremented on every failed attempt).
    attempt_failure_total: AtomicU64,
    /// Failures that were flagged as final, i.e. no further retry follows.
    failed_total: AtomicU64,

    // State of the most recent forward attempt.
    /// Unix timestamp (seconds) of the most recent forward attempt.
    last_attempt_timestamp: AtomicU64,
    /// Unix timestamp (seconds) of the most recent successful forward.
    last_success_timestamp: AtomicU64,
    /// Unix timestamp (seconds) of the most recent failed forward attempt.
    last_failure_timestamp: AtomicU64,
    /// Duration of the most recent forward attempt, in milliseconds.
    last_duration_millis: AtomicU64,
    /// HTTP status of the most recent attempt, or `0` when no HTTP response was received.
    last_http_status: AtomicU64,
    /// Numeric code of the most recent result (a `RelayResultKind` discriminant).
    last_result: AtomicU64,
}
/// Outcome of the most recent forward attempt.
///
/// The explicit discriminants are the numeric codes stored in the metrics' `last_result`
/// atomic, so they must stay stable and distinct.
#[derive(Copy, Clone, Eq, PartialEq)]
enum RelayResultKind {
    /// No forward attempt has completed yet.
    Never = 0,
    /// The target answered with a successful HTTP status.
    Success = 1,
    /// The target answered, but with a non-success HTTP status.
    HttpError = 2,
    /// The request failed without producing an HTTP response.
    RequestError = 3,
}
impl RelayResultKind {
const ALL: [Self; 4] = [Self::Never, Self::Success, Self::HttpError, Self::RequestError];
const fn as_str(self) -> &'static str {
match self {
Self::Never => "never",
Self::Success => "success",
Self::HttpError => "http_error",
Self::RequestError => "request_error",
}
}
const fn from_code(code: u64) -> Self {
match code {
1 => Self::Success,
2 => Self::HttpError,
3 => Self::RequestError,
_ => Self::Never,
}
}
}
impl RelayMetrics {
    /// Builds a fresh metrics set: the start time is taken from `unix_timestamp()`, every
    /// counter and gauge starts at zero, and the last result is `Never`.
    fn new() -> Self {
        let zeroed = || AtomicU64::new(0);
        Self {
            process_start_time: unix_timestamp(),
            received_total: zeroed(),
            forwarded_total: zeroed(),
            attempt_failure_total: zeroed(),
            failed_total: zeroed(),
            last_attempt_timestamp: zeroed(),
            last_success_timestamp: zeroed(),
            last_failure_timestamp: zeroed(),
            last_duration_millis: zeroed(),
            last_http_status: zeroed(),
            last_result: AtomicU64::new(RelayResultKind::Never as u64),
        }
    }

    /// Notes that a forward attempt started at `timestamp`.
    fn record_attempt(&self, timestamp: u64) {
        Self::put(&self.last_attempt_timestamp, timestamp);
    }

    /// Records a successful forward: bumps the forwarded counter and updates the
    /// last-success timestamp, duration, HTTP status, and result.
    fn record_success(&self, timestamp: u64, duration_millis: u64, status: u16) {
        Self::bump(&self.forwarded_total);
        Self::put(&self.last_success_timestamp, timestamp);
        Self::put(&self.last_duration_millis, duration_millis);
        Self::put(&self.last_http_status, u64::from(status));
        Self::put(&self.last_result, RelayResultKind::Success as u64);
    }

    /// Records a failed forward attempt.
    ///
    /// The per-attempt failure counter is always incremented; `failed_total` is incremented
    /// only when `final_failure` is set. A missing `status` is stored as `0`.
    fn record_failure(
        &self,
        kind: RelayResultKind,
        timestamp: u64,
        duration_millis: u64,
        status: Option<u16>,
        final_failure: bool,
    ) {
        Self::bump(&self.attempt_failure_total);
        if final_failure {
            Self::bump(&self.failed_total);
        }
        Self::put(&self.last_failure_timestamp, timestamp);
        Self::put(&self.last_duration_millis, duration_millis);
        Self::put(&self.last_http_status, status.map_or(0, u64::from));
        Self::put(&self.last_result, kind as u64);
    }

    /// Adds one to `counter` with relaxed ordering.
    fn bump(counter: &AtomicU64) {
        counter.fetch_add(1, Ordering::Relaxed);
    }

    /// Overwrites `gauge` with `value` using relaxed ordering.
    fn put(gauge: &AtomicU64, value: u64) {
        gauge.store(value, Ordering::Relaxed);
    }
}
/// Returns the current wall-clock time as whole seconds since the Unix epoch.
///
/// `SystemTime::duration_since` fails when the system clock is set earlier than the epoch.
/// Rather than panicking in that case, this returns `0`, which the metrics in this file
/// already use as the "not recorded" value.
fn unix_timestamp() -> u64 {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_or(0, |since_epoch| since_epoch.as_secs())
}
/// Returns the whole milliseconds elapsed since `started_at`, saturating at `u64::MAX`
/// if the value does not fit in a `u64`.
fn duration_millis(started_at: Instant) -> u64 {
    u64::try_from(started_at.elapsed().as_millis()).unwrap_or(u64::MAX)
}
async fn post_webhook(
@ -25,12 +130,20 @@ async fn post_webhook(
Json(payload): Json<WebhookPayload>,
) -> StatusCode {
if !validate_bearer(&headers, &state.inbound_api_key) {
warn!(
status = %payload.status,
timestamp = payload.timestamp,
"unauthorized relay webhook rejected"
);
return StatusCode::UNAUTHORIZED;
}
state.metrics.received_total.fetch_add(1, Ordering::Relaxed);
info!(status = %payload.status, timestamp = payload.timestamp, "relay received webhook");
for attempt in 0..=state.retry_attempts {
let forward_started_at = Instant::now();
state.metrics.record_attempt(unix_timestamp());
let mut req = state.client.post(&state.target_url).json(&payload);
if let Some(secret) = &state.target_secret {
req = req.bearer_auth(secret);
@ -38,7 +151,17 @@ async fn post_webhook(
match req.send().await {
Ok(resp) if resp.status().is_success() => {
info!(status = %payload.status, "relay forwarded webhook");
let duration_ms = duration_millis(forward_started_at);
let http_status = resp.status().as_u16();
state.metrics.record_success(unix_timestamp(), duration_ms, http_status);
info!(
status = %payload.status,
timestamp = payload.timestamp,
http_status,
duration_ms,
attempts = attempt + 1,
"relay forwarded webhook"
);
return StatusCode::OK;
}
result => {
@ -46,13 +169,47 @@ async fn post_webhook(
Ok(resp) => format!("HTTP {}", resp.status()),
Err(err) => err.to_string(),
};
let http_status = result.as_ref().ok().map(|resp| resp.status().as_u16());
let kind = if http_status.is_some() {
RelayResultKind::HttpError
} else {
RelayResultKind::RequestError
};
let duration_ms = duration_millis(forward_started_at);
state.metrics.record_failure(
kind,
unix_timestamp(),
duration_ms,
http_status,
attempt == state.retry_attempts,
);
if attempt == state.retry_attempts {
error!(error = %err_msg, "relay failed to forward webhook after {} attempts", state.retry_attempts + 1);
error!(
status = %payload.status,
timestamp = payload.timestamp,
error = %err_msg,
kind = kind.as_str(),
http_status = http_status.unwrap_or(0),
duration_ms,
attempts = state.retry_attempts + 1,
"relay failed to forward webhook after retries"
);
return StatusCode::BAD_GATEWAY;
}
let delay = Duration::from_secs(state.retry_base_delay_secs * 2u64.pow(attempt));
warn!(error = %err_msg, attempt = attempt + 1, "relay forward failed, retrying in {:?}", delay);
warn!(
status = %payload.status,
timestamp = payload.timestamp,
error = %err_msg,
kind = kind.as_str(),
http_status = http_status.unwrap_or(0),
duration_ms,
attempt = attempt + 1,
total_attempts = state.retry_attempts + 1,
delay_seconds = delay.as_secs(),
"relay forward failed, retrying"
);
tokio::time::sleep(delay).await;
}
}
@ -65,6 +222,84 @@ async fn health() -> StatusCode {
StatusCode::OK
}
async fn get_metrics(State(state): State<Arc<AppState>>) -> Response {
let mut body = String::new();
body.push_str("# HELP noisebell_relay_process_start_time_seconds Unix timestamp when the relay service started.\n");
body.push_str("# TYPE noisebell_relay_process_start_time_seconds gauge\n");
body.push_str(&format!(
"noisebell_relay_process_start_time_seconds {}\n",
state.metrics.process_start_time
));
body.push_str(
"# HELP noisebell_relay_received_total Authenticated inbound webhooks received.\n",
);
body.push_str("# TYPE noisebell_relay_received_total counter\n");
body.push_str(&format!(
"noisebell_relay_received_total {}\n",
state.metrics.received_total.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_relay_forwarded_total Webhooks forwarded to Home Assistant successfully.\n");
body.push_str("# TYPE noisebell_relay_forwarded_total counter\n");
body.push_str(&format!(
"noisebell_relay_forwarded_total {}\n",
state.metrics.forwarded_total.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_relay_attempt_failure_total Failed forward attempts before retry or final failure.\n");
body.push_str("# TYPE noisebell_relay_attempt_failure_total counter\n");
body.push_str(&format!(
"noisebell_relay_attempt_failure_total {}\n",
state.metrics.attempt_failure_total.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_relay_failed_total Webhooks that failed after all retries.\n");
body.push_str("# TYPE noisebell_relay_failed_total counter\n");
body.push_str(&format!(
"noisebell_relay_failed_total {}\n",
state.metrics.failed_total.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_relay_last_attempt_timestamp_seconds Unix timestamp of the last Home Assistant forward attempt.\n");
body.push_str("# TYPE noisebell_relay_last_attempt_timestamp_seconds gauge\n");
body.push_str(&format!(
"noisebell_relay_last_attempt_timestamp_seconds {}\n",
state.metrics.last_attempt_timestamp.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_relay_last_success_timestamp_seconds Unix timestamp of the last successful Home Assistant forward.\n");
body.push_str("# TYPE noisebell_relay_last_success_timestamp_seconds gauge\n");
body.push_str(&format!(
"noisebell_relay_last_success_timestamp_seconds {}\n",
state.metrics.last_success_timestamp.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_relay_last_failure_timestamp_seconds Unix timestamp of the last failed Home Assistant forward attempt.\n");
body.push_str("# TYPE noisebell_relay_last_failure_timestamp_seconds gauge\n");
body.push_str(&format!(
"noisebell_relay_last_failure_timestamp_seconds {}\n",
state.metrics.last_failure_timestamp.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_relay_last_duration_seconds Duration of the most recent Home Assistant forward attempt.\n");
body.push_str("# TYPE noisebell_relay_last_duration_seconds gauge\n");
body.push_str(&format!(
"noisebell_relay_last_duration_seconds {}\n",
state.metrics.last_duration_millis.load(Ordering::Relaxed) as f64 / 1000.0
));
body.push_str("# HELP noisebell_relay_last_http_status HTTP status from the most recent Home Assistant forward attempt, or 0 when no HTTP response was received.\n");
body.push_str("# TYPE noisebell_relay_last_http_status gauge\n");
body.push_str(&format!(
"noisebell_relay_last_http_status {}\n",
state.metrics.last_http_status.load(Ordering::Relaxed)
));
let last_result = RelayResultKind::from_code(state.metrics.last_result.load(Ordering::Relaxed));
body.push_str("# HELP noisebell_relay_last_result Last Home Assistant forward result as one-hot labels.\n");
body.push_str("# TYPE noisebell_relay_last_result gauge\n");
for result in RelayResultKind::ALL {
let value = u8::from(result == last_result);
body.push_str(&format!(
"noisebell_relay_last_result{{result=\"{}\"}} {value}\n",
result.as_str()
));
}
([(header::CONTENT_TYPE, PROMETHEUS_CONTENT_TYPE)], body).into_response()
}
#[tokio::main]
async fn main() -> Result<()> {
tracing_subscriber::fmt()
@ -122,10 +357,12 @@ async fn main() -> Result<()> {
target_secret,
retry_attempts,
retry_base_delay_secs,
metrics: Arc::new(RelayMetrics::new()),
});
let app = Router::new()
.route("/health", get(health))
.route("/metrics", get(get_metrics))
.route("/webhook", post(post_webhook))
.with_state(state);