feat: add noisebell observability
This commit is contained in:
parent
b57927a395
commit
e6c1b82679
24 changed files with 2289 additions and 137 deletions
|
|
@ -1,12 +1,14 @@
|
|||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use axum::extract::State;
|
||||
use axum::http::{HeaderMap, StatusCode};
|
||||
use axum::http::{header, HeaderMap, StatusCode};
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum::routing::{get, post};
|
||||
use axum::{Json, Router};
|
||||
use noisebell_common::{validate_bearer, WebhookPayload};
|
||||
use noisebell_common::{validate_bearer, WebhookPayload, PROMETHEUS_CONTENT_TYPE};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
#[derive(Clone)]
|
||||
|
|
@ -17,6 +19,109 @@ struct AppState {
|
|||
target_secret: Option<String>,
|
||||
retry_attempts: u32,
|
||||
retry_base_delay_secs: u64,
|
||||
metrics: Arc<RelayMetrics>,
|
||||
}
|
||||
|
||||
/// Relay counters and "most recent event" gauges.
///
/// `AppState` holds this behind an `Arc`; every field that changes after
/// construction is an atomic so it can be updated through `&self`.
#[derive(Debug)]
struct RelayMetrics {
    /// Value of `unix_timestamp()` captured when the metrics were created.
    process_start_time: u64,

    // --- monotonically increasing counters ---
    /// Inbound webhooks that passed bearer validation.
    received_total: AtomicU64,
    /// Webhooks whose forward attempt succeeded.
    forwarded_total: AtomicU64,
    /// Individual forward attempts that failed (retried or final).
    attempt_failure_total: AtomicU64,
    /// Webhooks that still failed on their last allowed attempt.
    failed_total: AtomicU64,

    // --- "last seen" gauges, 0 until first written ---
    /// Unix seconds at which the most recent forward attempt started.
    last_attempt_timestamp: AtomicU64,
    /// Unix seconds of the most recent successful forward.
    last_success_timestamp: AtomicU64,
    /// Unix seconds of the most recent failed forward attempt.
    last_failure_timestamp: AtomicU64,
    /// Duration of the most recent forward attempt, in milliseconds.
    last_duration_millis: AtomicU64,
    /// HTTP status of the most recent attempt; 0 when no response was received.
    last_http_status: AtomicU64,
    /// Numeric code of the most recent `RelayResultKind`.
    last_result: AtomicU64,
}
|
||||
|
||||
/// Outcome of the most recent forward attempt.
///
/// The explicit discriminants are the numeric codes stored in the
/// `last_result` atomic of `RelayMetrics`.
#[derive(Clone, Copy, PartialEq, Eq)]
enum RelayResultKind {
    /// No forward attempt has recorded a result yet.
    Never = 0,
    /// The target answered with a successful HTTP status.
    Success = 1,
    /// The target answered, but with a non-success HTTP status.
    HttpError = 2,
    /// The request produced no HTTP response at all.
    RequestError = 3,
}

impl RelayResultKind {
    /// Every variant, listed in discriminant order.
    const ALL: [Self; 4] = [
        Self::Never,
        Self::Success,
        Self::HttpError,
        Self::RequestError,
    ];

    /// Returns the snake_case label used for this outcome.
    const fn as_str(self) -> &'static str {
        match self {
            Self::RequestError => "request_error",
            Self::HttpError => "http_error",
            Self::Success => "success",
            Self::Never => "never",
        }
    }

    /// Decodes a stored numeric code back into a variant.
    ///
    /// Any code that does not belong to `Success`, `HttpError` or
    /// `RequestError` (including `0`) decodes to `Never`.
    const fn from_code(code: u64) -> Self {
        if Self::Success as u64 == code {
            Self::Success
        } else if Self::HttpError as u64 == code {
            Self::HttpError
        } else if Self::RequestError as u64 == code {
            Self::RequestError
        } else {
            Self::Never
        }
    }
}
|
||||
|
||||
impl RelayMetrics {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
process_start_time: unix_timestamp(),
|
||||
received_total: AtomicU64::new(0),
|
||||
forwarded_total: AtomicU64::new(0),
|
||||
attempt_failure_total: AtomicU64::new(0),
|
||||
failed_total: AtomicU64::new(0),
|
||||
last_attempt_timestamp: AtomicU64::new(0),
|
||||
last_success_timestamp: AtomicU64::new(0),
|
||||
last_failure_timestamp: AtomicU64::new(0),
|
||||
last_duration_millis: AtomicU64::new(0),
|
||||
last_http_status: AtomicU64::new(0),
|
||||
last_result: AtomicU64::new(RelayResultKind::Never as u64),
|
||||
}
|
||||
}
|
||||
|
||||
fn record_attempt(&self, timestamp: u64) {
|
||||
self.last_attempt_timestamp.store(timestamp, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn record_success(&self, timestamp: u64, duration_millis: u64, status: u16) {
|
||||
self.forwarded_total.fetch_add(1, Ordering::Relaxed);
|
||||
self.last_success_timestamp.store(timestamp, Ordering::Relaxed);
|
||||
self.last_duration_millis.store(duration_millis, Ordering::Relaxed);
|
||||
self.last_http_status.store(u64::from(status), Ordering::Relaxed);
|
||||
self.last_result.store(RelayResultKind::Success as u64, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn record_failure(
|
||||
&self,
|
||||
kind: RelayResultKind,
|
||||
timestamp: u64,
|
||||
duration_millis: u64,
|
||||
status: Option<u16>,
|
||||
final_failure: bool,
|
||||
) {
|
||||
self.attempt_failure_total.fetch_add(1, Ordering::Relaxed);
|
||||
if final_failure {
|
||||
self.failed_total.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
self.last_failure_timestamp.store(timestamp, Ordering::Relaxed);
|
||||
self.last_duration_millis.store(duration_millis, Ordering::Relaxed);
|
||||
self.last_http_status.store(status.map(u64::from).unwrap_or(0), Ordering::Relaxed);
|
||||
self.last_result.store(kind as u64, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the current wall-clock time as whole seconds since the Unix epoch.
///
/// `duration_since` fails when the system clock is set earlier than the epoch.
/// That case is reported as `0` instead of panicking, so a misconfigured clock
/// cannot take down a request handler that only wants a metrics timestamp.
fn unix_timestamp() -> u64 {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_or(0, |since_epoch| since_epoch.as_secs())
}
|
||||
|
||||
/// Returns the whole milliseconds elapsed since `started_at`.
///
/// `Duration::as_millis` yields a `u128`; a value too large for `u64`
/// saturates to `u64::MAX`.
fn duration_millis(started_at: Instant) -> u64 {
    u64::try_from(started_at.elapsed().as_millis()).unwrap_or(u64::MAX)
}
|
||||
|
||||
async fn post_webhook(
|
||||
|
|
@ -25,12 +130,20 @@ async fn post_webhook(
|
|||
Json(payload): Json<WebhookPayload>,
|
||||
) -> StatusCode {
|
||||
if !validate_bearer(&headers, &state.inbound_api_key) {
|
||||
warn!(
|
||||
status = %payload.status,
|
||||
timestamp = payload.timestamp,
|
||||
"unauthorized relay webhook rejected"
|
||||
);
|
||||
return StatusCode::UNAUTHORIZED;
|
||||
}
|
||||
state.metrics.received_total.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
info!(status = %payload.status, timestamp = payload.timestamp, "relay received webhook");
|
||||
|
||||
for attempt in 0..=state.retry_attempts {
|
||||
let forward_started_at = Instant::now();
|
||||
state.metrics.record_attempt(unix_timestamp());
|
||||
let mut req = state.client.post(&state.target_url).json(&payload);
|
||||
if let Some(secret) = &state.target_secret {
|
||||
req = req.bearer_auth(secret);
|
||||
|
|
@ -38,7 +151,17 @@ async fn post_webhook(
|
|||
|
||||
match req.send().await {
|
||||
Ok(resp) if resp.status().is_success() => {
|
||||
info!(status = %payload.status, "relay forwarded webhook");
|
||||
let duration_ms = duration_millis(forward_started_at);
|
||||
let http_status = resp.status().as_u16();
|
||||
state.metrics.record_success(unix_timestamp(), duration_ms, http_status);
|
||||
info!(
|
||||
status = %payload.status,
|
||||
timestamp = payload.timestamp,
|
||||
http_status,
|
||||
duration_ms,
|
||||
attempts = attempt + 1,
|
||||
"relay forwarded webhook"
|
||||
);
|
||||
return StatusCode::OK;
|
||||
}
|
||||
result => {
|
||||
|
|
@ -46,13 +169,47 @@ async fn post_webhook(
|
|||
Ok(resp) => format!("HTTP {}", resp.status()),
|
||||
Err(err) => err.to_string(),
|
||||
};
|
||||
let http_status = result.as_ref().ok().map(|resp| resp.status().as_u16());
|
||||
let kind = if http_status.is_some() {
|
||||
RelayResultKind::HttpError
|
||||
} else {
|
||||
RelayResultKind::RequestError
|
||||
};
|
||||
let duration_ms = duration_millis(forward_started_at);
|
||||
state.metrics.record_failure(
|
||||
kind,
|
||||
unix_timestamp(),
|
||||
duration_ms,
|
||||
http_status,
|
||||
attempt == state.retry_attempts,
|
||||
);
|
||||
if attempt == state.retry_attempts {
|
||||
error!(error = %err_msg, "relay failed to forward webhook after {} attempts", state.retry_attempts + 1);
|
||||
error!(
|
||||
status = %payload.status,
|
||||
timestamp = payload.timestamp,
|
||||
error = %err_msg,
|
||||
kind = kind.as_str(),
|
||||
http_status = http_status.unwrap_or(0),
|
||||
duration_ms,
|
||||
attempts = state.retry_attempts + 1,
|
||||
"relay failed to forward webhook after retries"
|
||||
);
|
||||
return StatusCode::BAD_GATEWAY;
|
||||
}
|
||||
|
||||
let delay = Duration::from_secs(state.retry_base_delay_secs * 2u64.pow(attempt));
|
||||
warn!(error = %err_msg, attempt = attempt + 1, "relay forward failed, retrying in {:?}", delay);
|
||||
warn!(
|
||||
status = %payload.status,
|
||||
timestamp = payload.timestamp,
|
||||
error = %err_msg,
|
||||
kind = kind.as_str(),
|
||||
http_status = http_status.unwrap_or(0),
|
||||
duration_ms,
|
||||
attempt = attempt + 1,
|
||||
total_attempts = state.retry_attempts + 1,
|
||||
delay_seconds = delay.as_secs(),
|
||||
"relay forward failed, retrying"
|
||||
);
|
||||
tokio::time::sleep(delay).await;
|
||||
}
|
||||
}
|
||||
|
|
@ -65,6 +222,84 @@ async fn health() -> StatusCode {
|
|||
StatusCode::OK
|
||||
}
|
||||
|
||||
async fn get_metrics(State(state): State<Arc<AppState>>) -> Response {
|
||||
let mut body = String::new();
|
||||
body.push_str("# HELP noisebell_relay_process_start_time_seconds Unix timestamp when the relay service started.\n");
|
||||
body.push_str("# TYPE noisebell_relay_process_start_time_seconds gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_relay_process_start_time_seconds {}\n",
|
||||
state.metrics.process_start_time
|
||||
));
|
||||
body.push_str(
|
||||
"# HELP noisebell_relay_received_total Authenticated inbound webhooks received.\n",
|
||||
);
|
||||
body.push_str("# TYPE noisebell_relay_received_total counter\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_relay_received_total {}\n",
|
||||
state.metrics.received_total.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_relay_forwarded_total Webhooks forwarded to Home Assistant successfully.\n");
|
||||
body.push_str("# TYPE noisebell_relay_forwarded_total counter\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_relay_forwarded_total {}\n",
|
||||
state.metrics.forwarded_total.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_relay_attempt_failure_total Failed forward attempts before retry or final failure.\n");
|
||||
body.push_str("# TYPE noisebell_relay_attempt_failure_total counter\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_relay_attempt_failure_total {}\n",
|
||||
state.metrics.attempt_failure_total.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_relay_failed_total Webhooks that failed after all retries.\n");
|
||||
body.push_str("# TYPE noisebell_relay_failed_total counter\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_relay_failed_total {}\n",
|
||||
state.metrics.failed_total.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_relay_last_attempt_timestamp_seconds Unix timestamp of the last Home Assistant forward attempt.\n");
|
||||
body.push_str("# TYPE noisebell_relay_last_attempt_timestamp_seconds gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_relay_last_attempt_timestamp_seconds {}\n",
|
||||
state.metrics.last_attempt_timestamp.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_relay_last_success_timestamp_seconds Unix timestamp of the last successful Home Assistant forward.\n");
|
||||
body.push_str("# TYPE noisebell_relay_last_success_timestamp_seconds gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_relay_last_success_timestamp_seconds {}\n",
|
||||
state.metrics.last_success_timestamp.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_relay_last_failure_timestamp_seconds Unix timestamp of the last failed Home Assistant forward attempt.\n");
|
||||
body.push_str("# TYPE noisebell_relay_last_failure_timestamp_seconds gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_relay_last_failure_timestamp_seconds {}\n",
|
||||
state.metrics.last_failure_timestamp.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_relay_last_duration_seconds Duration of the most recent Home Assistant forward attempt.\n");
|
||||
body.push_str("# TYPE noisebell_relay_last_duration_seconds gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_relay_last_duration_seconds {}\n",
|
||||
state.metrics.last_duration_millis.load(Ordering::Relaxed) as f64 / 1000.0
|
||||
));
|
||||
body.push_str("# HELP noisebell_relay_last_http_status HTTP status from the most recent Home Assistant forward attempt, or 0 when no HTTP response was received.\n");
|
||||
body.push_str("# TYPE noisebell_relay_last_http_status gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_relay_last_http_status {}\n",
|
||||
state.metrics.last_http_status.load(Ordering::Relaxed)
|
||||
));
|
||||
let last_result = RelayResultKind::from_code(state.metrics.last_result.load(Ordering::Relaxed));
|
||||
body.push_str("# HELP noisebell_relay_last_result Last Home Assistant forward result as one-hot labels.\n");
|
||||
body.push_str("# TYPE noisebell_relay_last_result gauge\n");
|
||||
for result in RelayResultKind::ALL {
|
||||
let value = u8::from(result == last_result);
|
||||
body.push_str(&format!(
|
||||
"noisebell_relay_last_result{{result=\"{}\"}} {value}\n",
|
||||
result.as_str()
|
||||
));
|
||||
}
|
||||
|
||||
([(header::CONTENT_TYPE, PROMETHEUS_CONTENT_TYPE)], body).into_response()
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
tracing_subscriber::fmt()
|
||||
|
|
@ -122,10 +357,12 @@ async fn main() -> Result<()> {
|
|||
target_secret,
|
||||
retry_attempts,
|
||||
retry_base_delay_secs,
|
||||
metrics: Arc::new(RelayMetrics::new()),
|
||||
});
|
||||
|
||||
let app = Router::new()
|
||||
.route("/health", get(health))
|
||||
.route("/metrics", get(get_metrics))
|
||||
.route("/webhook", post(post_webhook))
|
||||
.with_state(state);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue