feat: add noisebell observability

This commit is contained in:
Jet 2026-05-27 20:09:44 -07:00
parent b57927a395
commit e6c1b82679
No known key found for this signature in database
24 changed files with 2289 additions and 137 deletions

View file

@ -124,8 +124,11 @@ That script:
6. writes `/etc/noisebell/noisebell.env`
7. writes `/etc/noisebell/noisebell-relay.env`
8. installs `noisebell.service` and `noisebell-relay.service`
9. enables and starts both services
10. runs `tailscale up` with the decrypted auth key
9. enables persistent journald with a 30-day retention target
10. installs and enables `prometheus-node-exporter`
11. installs `noisebell-loki-journal.service` to ship Pi logs to Loki on `noisebell-do`
12. enables and starts the Noisebell services
13. runs `tailscale up` with the decrypted auth key
## Files written on the Pi
@ -143,6 +146,9 @@ The deploy script creates:
- `/etc/noisebell/noisebell-relay.env`
- `/etc/systemd/system/noisebell.service`
- `/etc/systemd/system/noisebell-relay.service`
- `/etc/systemd/system/noisebell-loki-journal.service`
- `/usr/local/bin/noisebell-loki-journal`
- `/etc/systemd/journald.conf.d/noisebell-persistent.conf`
All secret files are root-only.
@ -275,10 +281,18 @@ Important: Home Assistant webhook IDs are exact. If the automation shows a leadi
## API
All endpoints require `Authorization: Bearer <token>`.
`GET /` requires `Authorization: Bearer <token>`.
**`GET /`**
```json
{"status": "open", "timestamp": 1710000000}
```
**`GET /metrics`**
Prometheus metrics for local door state, raw GPIO level, debounced state-change counters, webhook delivery counters, last webhook result/status/duration, boot identity, uptime, temperature, throttling flags, Wi-Fi signal, and Tailscale state. This endpoint is unauthenticated and intended for Tailscale-only scraping by the DO Prometheus.
`noisebell-relay` also exposes unauthenticated Prometheus metrics at `GET /metrics` on port `8090`, including inbound webhook count, Home Assistant forwarding counters, and last forward result/status/duration.
Routine sampled values belong in Prometheus, not logs: GPIO level, Wi-Fi signal, temperature, uptime, Tailscale state, scrape health, and webhook counters are graphed from `/metrics`. Journald/Loki logs are intended to stay event-oriented: startup/shutdown, initial state sync, debounced door state changes, successful state deliveries, delivery retries/failures, unauthorized requests, relay forwards, and GPIO read error/recovery events.

View file

@ -1,12 +1,14 @@
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Duration;
use std::time::{Duration, Instant};
use anyhow::{Context, Result};
use axum::extract::State;
use axum::http::{HeaderMap, StatusCode};
use axum::http::{header, HeaderMap, StatusCode};
use axum::response::{IntoResponse, Response};
use axum::routing::{get, post};
use axum::{Json, Router};
use noisebell_common::{validate_bearer, WebhookPayload};
use noisebell_common::{validate_bearer, WebhookPayload, PROMETHEUS_CONTENT_TYPE};
use tracing::{error, info, warn};
#[derive(Clone)]
@ -17,6 +19,109 @@ struct AppState {
target_secret: Option<String>,
retry_attempts: u32,
retry_base_delay_secs: u64,
metrics: Arc<RelayMetrics>,
}
/// Counters and last-value gauges that the relay serves from `GET /metrics`.
///
/// Apart from `process_start_time`, every field is an atomic that is read and
/// written with relaxed ordering, so a single scrape may combine values that
/// were recorded by different forward attempts.
#[derive(Debug)]
struct RelayMetrics {
/// Unix timestamp (seconds) captured when this struct was constructed.
process_start_time: u64,
/// Inbound webhooks that passed bearer validation.
received_total: AtomicU64,
/// Forwards that received a successful HTTP response.
forwarded_total: AtomicU64,
/// Individual forward attempts that failed, whether or not a retry followed.
attempt_failure_total: AtomicU64,
/// Webhooks that were given up on after the final retry failed.
failed_total: AtomicU64,
/// Unix timestamp (seconds) of the most recent forward attempt.
last_attempt_timestamp: AtomicU64,
/// Unix timestamp (seconds) of the most recent successful forward.
last_success_timestamp: AtomicU64,
/// Unix timestamp (seconds) of the most recent failed forward attempt.
last_failure_timestamp: AtomicU64,
/// Duration of the most recent forward attempt in milliseconds
/// (divided by 1000 when exported).
last_duration_millis: AtomicU64,
/// HTTP status of the most recent attempt, or 0 when no response was received.
last_http_status: AtomicU64,
/// `RelayResultKind` discriminant describing the most recent attempt.
last_result: AtomicU64,
}
/// Outcome of the most recent attempt to forward a webhook.
///
/// The explicit discriminants are the values stored in
/// `RelayMetrics::last_result`, so they must stay stable.
#[derive(Clone, Copy, PartialEq, Eq)]
enum RelayResultKind {
    /// No forward attempt has been recorded yet.
    Never = 0,
    /// The target answered with a successful HTTP status.
    Success = 1,
    /// The target answered, but with a non-success HTTP status.
    HttpError = 2,
    /// No HTTP response was obtained at all.
    RequestError = 3,
}

impl RelayResultKind {
    /// Every variant, in discriminant order; used to emit one-hot label series.
    const ALL: [Self; 4] = [Self::Never, Self::Success, Self::HttpError, Self::RequestError];

    /// Label value used for the `result` label and in structured log fields.
    const fn as_str(self) -> &'static str {
        match self {
            Self::RequestError => "request_error",
            Self::HttpError => "http_error",
            Self::Success => "success",
            Self::Never => "never",
        }
    }

    /// Decodes a stored discriminant; any unknown code maps to `Never`.
    const fn from_code(code: u64) -> Self {
        // A `while` loop is used because iterators are not available in `const fn`.
        let mut index = 0;
        while index < Self::ALL.len() {
            let candidate = Self::ALL[index];
            if candidate as u64 == code {
                return candidate;
            }
            index += 1;
        }
        Self::Never
    }
}
impl RelayMetrics {
    /// Builds a metrics set with all counters at zero, the last result set to
    /// `Never`, and the start time taken from the current wall clock.
    fn new() -> Self {
        let zeroed = || AtomicU64::new(0);
        Self {
            process_start_time: unix_timestamp(),
            received_total: zeroed(),
            forwarded_total: zeroed(),
            attempt_failure_total: zeroed(),
            failed_total: zeroed(),
            last_attempt_timestamp: zeroed(),
            last_success_timestamp: zeroed(),
            last_failure_timestamp: zeroed(),
            last_duration_millis: zeroed(),
            last_http_status: zeroed(),
            last_result: AtomicU64::new(RelayResultKind::Never as u64),
        }
    }

    /// Notes that a forward attempt started at `timestamp` (Unix seconds).
    fn record_attempt(&self, timestamp: u64) {
        self.last_attempt_timestamp.store(timestamp, Ordering::Relaxed);
    }

    /// Records a forward that completed with a successful HTTP `status`.
    fn record_success(&self, timestamp: u64, duration_millis: u64, status: u16) {
        self.forwarded_total.fetch_add(1, Ordering::Relaxed);
        self.last_success_timestamp.store(timestamp, Ordering::Relaxed);
        self.store_last_outcome(RelayResultKind::Success, duration_millis, u64::from(status));
    }

    /// Records one failed forward attempt.
    ///
    /// `status` is `None` when no HTTP response was received (exported as 0).
    /// `final_failure` marks the attempt after which the webhook is dropped,
    /// and additionally bumps `failed_total`.
    fn record_failure(
        &self,
        kind: RelayResultKind,
        timestamp: u64,
        duration_millis: u64,
        status: Option<u16>,
        final_failure: bool,
    ) {
        self.attempt_failure_total.fetch_add(1, Ordering::Relaxed);
        if final_failure {
            self.failed_total.fetch_add(1, Ordering::Relaxed);
        }
        self.last_failure_timestamp.store(timestamp, Ordering::Relaxed);
        self.store_last_outcome(kind, duration_millis, status.map_or(0, u64::from));
    }

    /// Updates the "most recent attempt" gauges shared by success and failure.
    fn store_last_outcome(&self, kind: RelayResultKind, duration_millis: u64, http_status: u64) {
        self.last_duration_millis.store(duration_millis, Ordering::Relaxed);
        self.last_http_status.store(http_status, Ordering::Relaxed);
        self.last_result.store(kind as u64, Ordering::Relaxed);
    }
}
/// Returns the current wall-clock time as whole seconds since the Unix epoch.
///
/// If the system clock reads earlier than the epoch, this returns `0` rather
/// than panicking, so a bad clock cannot take down the request handler or the
/// metrics endpoint.
fn unix_timestamp() -> u64 {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_or(0, |since_epoch| since_epoch.as_secs())
}
/// Milliseconds elapsed since `started_at`, saturating at `u64::MAX` if the
/// 128-bit millisecond count does not fit in a `u64`.
fn duration_millis(started_at: Instant) -> u64 {
    u64::try_from(started_at.elapsed().as_millis()).unwrap_or(u64::MAX)
}
async fn post_webhook(
@ -25,12 +130,20 @@ async fn post_webhook(
Json(payload): Json<WebhookPayload>,
) -> StatusCode {
if !validate_bearer(&headers, &state.inbound_api_key) {
warn!(
status = %payload.status,
timestamp = payload.timestamp,
"unauthorized relay webhook rejected"
);
return StatusCode::UNAUTHORIZED;
}
state.metrics.received_total.fetch_add(1, Ordering::Relaxed);
info!(status = %payload.status, timestamp = payload.timestamp, "relay received webhook");
for attempt in 0..=state.retry_attempts {
let forward_started_at = Instant::now();
state.metrics.record_attempt(unix_timestamp());
let mut req = state.client.post(&state.target_url).json(&payload);
if let Some(secret) = &state.target_secret {
req = req.bearer_auth(secret);
@ -38,7 +151,17 @@ async fn post_webhook(
match req.send().await {
Ok(resp) if resp.status().is_success() => {
info!(status = %payload.status, "relay forwarded webhook");
let duration_ms = duration_millis(forward_started_at);
let http_status = resp.status().as_u16();
state.metrics.record_success(unix_timestamp(), duration_ms, http_status);
info!(
status = %payload.status,
timestamp = payload.timestamp,
http_status,
duration_ms,
attempts = attempt + 1,
"relay forwarded webhook"
);
return StatusCode::OK;
}
result => {
@ -46,13 +169,47 @@ async fn post_webhook(
Ok(resp) => format!("HTTP {}", resp.status()),
Err(err) => err.to_string(),
};
let http_status = result.as_ref().ok().map(|resp| resp.status().as_u16());
let kind = if http_status.is_some() {
RelayResultKind::HttpError
} else {
RelayResultKind::RequestError
};
let duration_ms = duration_millis(forward_started_at);
state.metrics.record_failure(
kind,
unix_timestamp(),
duration_ms,
http_status,
attempt == state.retry_attempts,
);
if attempt == state.retry_attempts {
error!(error = %err_msg, "relay failed to forward webhook after {} attempts", state.retry_attempts + 1);
error!(
status = %payload.status,
timestamp = payload.timestamp,
error = %err_msg,
kind = kind.as_str(),
http_status = http_status.unwrap_or(0),
duration_ms,
attempts = state.retry_attempts + 1,
"relay failed to forward webhook after retries"
);
return StatusCode::BAD_GATEWAY;
}
let delay = Duration::from_secs(state.retry_base_delay_secs * 2u64.pow(attempt));
warn!(error = %err_msg, attempt = attempt + 1, "relay forward failed, retrying in {:?}", delay);
warn!(
status = %payload.status,
timestamp = payload.timestamp,
error = %err_msg,
kind = kind.as_str(),
http_status = http_status.unwrap_or(0),
duration_ms,
attempt = attempt + 1,
total_attempts = state.retry_attempts + 1,
delay_seconds = delay.as_secs(),
"relay forward failed, retrying"
);
tokio::time::sleep(delay).await;
}
}
@ -65,6 +222,84 @@ async fn health() -> StatusCode {
StatusCode::OK
}
async fn get_metrics(State(state): State<Arc<AppState>>) -> Response {
let mut body = String::new();
body.push_str("# HELP noisebell_relay_process_start_time_seconds Unix timestamp when the relay service started.\n");
body.push_str("# TYPE noisebell_relay_process_start_time_seconds gauge\n");
body.push_str(&format!(
"noisebell_relay_process_start_time_seconds {}\n",
state.metrics.process_start_time
));
body.push_str(
"# HELP noisebell_relay_received_total Authenticated inbound webhooks received.\n",
);
body.push_str("# TYPE noisebell_relay_received_total counter\n");
body.push_str(&format!(
"noisebell_relay_received_total {}\n",
state.metrics.received_total.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_relay_forwarded_total Webhooks forwarded to Home Assistant successfully.\n");
body.push_str("# TYPE noisebell_relay_forwarded_total counter\n");
body.push_str(&format!(
"noisebell_relay_forwarded_total {}\n",
state.metrics.forwarded_total.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_relay_attempt_failure_total Failed forward attempts before retry or final failure.\n");
body.push_str("# TYPE noisebell_relay_attempt_failure_total counter\n");
body.push_str(&format!(
"noisebell_relay_attempt_failure_total {}\n",
state.metrics.attempt_failure_total.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_relay_failed_total Webhooks that failed after all retries.\n");
body.push_str("# TYPE noisebell_relay_failed_total counter\n");
body.push_str(&format!(
"noisebell_relay_failed_total {}\n",
state.metrics.failed_total.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_relay_last_attempt_timestamp_seconds Unix timestamp of the last Home Assistant forward attempt.\n");
body.push_str("# TYPE noisebell_relay_last_attempt_timestamp_seconds gauge\n");
body.push_str(&format!(
"noisebell_relay_last_attempt_timestamp_seconds {}\n",
state.metrics.last_attempt_timestamp.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_relay_last_success_timestamp_seconds Unix timestamp of the last successful Home Assistant forward.\n");
body.push_str("# TYPE noisebell_relay_last_success_timestamp_seconds gauge\n");
body.push_str(&format!(
"noisebell_relay_last_success_timestamp_seconds {}\n",
state.metrics.last_success_timestamp.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_relay_last_failure_timestamp_seconds Unix timestamp of the last failed Home Assistant forward attempt.\n");
body.push_str("# TYPE noisebell_relay_last_failure_timestamp_seconds gauge\n");
body.push_str(&format!(
"noisebell_relay_last_failure_timestamp_seconds {}\n",
state.metrics.last_failure_timestamp.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_relay_last_duration_seconds Duration of the most recent Home Assistant forward attempt.\n");
body.push_str("# TYPE noisebell_relay_last_duration_seconds gauge\n");
body.push_str(&format!(
"noisebell_relay_last_duration_seconds {}\n",
state.metrics.last_duration_millis.load(Ordering::Relaxed) as f64 / 1000.0
));
body.push_str("# HELP noisebell_relay_last_http_status HTTP status from the most recent Home Assistant forward attempt, or 0 when no HTTP response was received.\n");
body.push_str("# TYPE noisebell_relay_last_http_status gauge\n");
body.push_str(&format!(
"noisebell_relay_last_http_status {}\n",
state.metrics.last_http_status.load(Ordering::Relaxed)
));
let last_result = RelayResultKind::from_code(state.metrics.last_result.load(Ordering::Relaxed));
body.push_str("# HELP noisebell_relay_last_result Last Home Assistant forward result as one-hot labels.\n");
body.push_str("# TYPE noisebell_relay_last_result gauge\n");
for result in RelayResultKind::ALL {
let value = u8::from(result == last_result);
body.push_str(&format!(
"noisebell_relay_last_result{{result=\"{}\"}} {value}\n",
result.as_str()
));
}
([(header::CONTENT_TYPE, PROMETHEUS_CONTENT_TYPE)], body).into_response()
}
#[tokio::main]
async fn main() -> Result<()> {
tracing_subscriber::fmt()
@ -122,10 +357,12 @@ async fn main() -> Result<()> {
target_secret,
retry_attempts,
retry_base_delay_secs,
metrics: Arc::new(RelayMetrics::new()),
});
let app = Router::new()
.route("/health", get(health))
.route("/metrics", get(get_metrics))
.route("/webhook", post(post_webhook))
.with_state(state);

View file

@ -1,15 +1,18 @@
use std::sync::atomic::{AtomicU64, AtomicU8, Ordering};
use std::sync::Arc;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use std::{fs, process::Command};
use anyhow::{Context, Result};
use axum::extract::State;
use axum::http::{HeaderMap, StatusCode};
use axum::http::{header, HeaderMap, StatusCode};
use axum::response::{IntoResponse, Response};
use axum::routing::get;
use axum::{Json, Router};
use gpiod::{Bias, Chip, Options};
use noisebell_common::{
validate_bearer, DoorStatus, PiStatusResponse, SignalLevel, WebhookPayload,
prometheus_escape_label_value, validate_bearer, DoorStatus, PiStatusResponse, SignalLevel,
WebhookPayload, PROMETHEUS_CONTENT_TYPE,
};
use tracing::{error, info, warn};
@ -44,10 +47,126 @@ impl LocalDoorState {
}
}
/// Why a door-state event was queued for delivery to the cache.
#[derive(Clone, Copy)]
enum StateEventKind {
    /// The initial state sync queued once when the service starts.
    Startup,
    /// A debounced door state change detected by the GPIO polling loop.
    StateChange,
}

impl StateEventKind {
    /// Label recorded in the `event` field of structured log lines.
    const fn as_str(self) -> &'static str {
        match self {
            Self::StateChange => "state_change",
            Self::Startup => "startup",
        }
    }
}
/// State shared between the HTTP handlers, the GPIO polling thread and the
/// notify task.
struct AppState {
/// Current debounced door state, stored as a `LocalDoorState` discriminant.
door_state: AtomicU8,
/// Unix timestamp (seconds) of the last door state change; set to the
/// startup time until the first change is observed.
last_changed: AtomicU64,
/// Bearer token that `GET /` requests must present.
inbound_api_key: String,
/// Counters and gauges served from `GET /metrics`.
metrics: AppMetrics,
}
/// Counters and last-value gauges that the Pi service serves from `GET /metrics`.
///
/// Apart from `process_start_time`, every field is an atomic that is read and
/// written with relaxed ordering, so a single scrape may combine values that
/// were recorded by different events.
struct AppMetrics {
/// Unix timestamp (seconds) captured when this struct was constructed.
process_start_time: u64,
/// State webhooks delivered with a successful HTTP response.
notify_success_total: AtomicU64,
/// Individual webhook attempts that failed, whether or not a retry followed.
notify_attempt_failure_total: AtomicU64,
/// State events that were given up on after the final retry failed.
notify_failure_total: AtomicU64,
/// Unix timestamp (seconds) of the most recent webhook attempt.
notify_last_attempt_timestamp: AtomicU64,
/// Unix timestamp (seconds) of the most recent successful webhook.
notify_last_success_timestamp: AtomicU64,
/// Unix timestamp (seconds) of the most recent failed webhook attempt.
notify_last_failure_timestamp: AtomicU64,
/// Duration of the most recent webhook attempt in milliseconds
/// (divided by 1000 when exported).
notify_last_duration_millis: AtomicU64,
/// HTTP status of the most recent attempt, or 0 when no response was received.
notify_last_http_status: AtomicU64,
/// `NotifyResultKind` discriminant describing the most recent attempt.
notify_last_result: AtomicU64,
/// Debounced state changes whose resulting state was open.
state_change_open_total: AtomicU64,
/// Debounced state changes whose resulting state was closed.
state_change_closed_total: AtomicU64,
/// Unix timestamp (seconds) of the most recent successful GPIO read.
gpio_last_read_timestamp: AtomicU64,
/// Last raw GPIO level: 1 when the line read high, 0 when it read low.
gpio_raw_level: AtomicU8,
/// GPIO reads that returned an error.
gpio_read_error_total: AtomicU64,
}
/// Outcome of the most recent attempt to deliver a state webhook.
///
/// The explicit discriminants are the values stored in
/// `AppMetrics::notify_last_result`, so they must stay stable.
#[derive(Clone, Copy, PartialEq, Eq)]
enum NotifyResultKind {
    /// No delivery attempt has been recorded yet.
    Never = 0,
    /// The endpoint answered with a successful HTTP status.
    Success = 1,
    /// The endpoint answered, but with a non-success HTTP status.
    HttpError = 2,
    /// No HTTP response was obtained at all.
    RequestError = 3,
}

impl NotifyResultKind {
    /// Every variant, in discriminant order; used to emit one-hot label series.
    const ALL: [Self; 4] = [Self::Never, Self::Success, Self::HttpError, Self::RequestError];

    /// Label value used for the `result` label and in structured log fields.
    const fn as_str(self) -> &'static str {
        match self {
            Self::RequestError => "request_error",
            Self::HttpError => "http_error",
            Self::Success => "success",
            Self::Never => "never",
        }
    }

    /// Decodes a stored discriminant; any unknown code maps to `Never`.
    const fn from_code(code: u64) -> Self {
        // A `while` loop is used because iterators are not available in `const fn`.
        let mut index = 0;
        while index < Self::ALL.len() {
            let candidate = Self::ALL[index];
            if candidate as u64 == code {
                return candidate;
            }
            index += 1;
        }
        Self::Never
    }
}
impl AppMetrics {
    /// Builds a metrics set with all counters at zero, the last notify result
    /// set to `Never`, and the start time taken from the current wall clock.
    fn new() -> Self {
        let zeroed = || AtomicU64::new(0);
        Self {
            process_start_time: unix_timestamp(),
            notify_success_total: zeroed(),
            notify_attempt_failure_total: zeroed(),
            notify_failure_total: zeroed(),
            notify_last_attempt_timestamp: zeroed(),
            notify_last_success_timestamp: zeroed(),
            notify_last_failure_timestamp: zeroed(),
            notify_last_duration_millis: zeroed(),
            notify_last_http_status: zeroed(),
            notify_last_result: AtomicU64::new(NotifyResultKind::Never as u64),
            state_change_open_total: zeroed(),
            state_change_closed_total: zeroed(),
            gpio_last_read_timestamp: zeroed(),
            gpio_raw_level: AtomicU8::new(0),
            gpio_read_error_total: zeroed(),
        }
    }

    /// Notes that a webhook attempt started at `timestamp` (Unix seconds).
    fn record_notify_attempt(&self, timestamp: u64) {
        self.notify_last_attempt_timestamp.store(timestamp, Ordering::Relaxed);
    }

    /// Records a webhook that completed with a successful HTTP `status`.
    fn record_notify_success(&self, timestamp: u64, duration_millis: u64, status: u16) {
        self.notify_success_total.fetch_add(1, Ordering::Relaxed);
        self.notify_last_success_timestamp.store(timestamp, Ordering::Relaxed);
        self.store_last_notify_outcome(
            NotifyResultKind::Success,
            duration_millis,
            u64::from(status),
        );
    }

    /// Records one failed webhook attempt.
    ///
    /// `status` is `None` when no HTTP response was received (exported as 0).
    /// `final_failure` marks the attempt after which the event is dropped,
    /// and additionally bumps `notify_failure_total`.
    fn record_notify_failure(
        &self,
        kind: NotifyResultKind,
        timestamp: u64,
        duration_millis: u64,
        status: Option<u16>,
        final_failure: bool,
    ) {
        self.notify_attempt_failure_total.fetch_add(1, Ordering::Relaxed);
        if final_failure {
            self.notify_failure_total.fetch_add(1, Ordering::Relaxed);
        }
        self.notify_last_failure_timestamp.store(timestamp, Ordering::Relaxed);
        self.store_last_notify_outcome(kind, duration_millis, status.map_or(0, u64::from));
    }

    /// Updates the "most recent attempt" gauges shared by success and failure.
    fn store_last_notify_outcome(
        &self,
        kind: NotifyResultKind,
        duration_millis: u64,
        http_status: u64,
    ) {
        self.notify_last_duration_millis.store(duration_millis, Ordering::Relaxed);
        self.notify_last_http_status.store(http_status, Ordering::Relaxed);
        self.notify_last_result.store(kind as u64, Ordering::Relaxed);
    }
}
impl AppState {
@ -60,11 +179,17 @@ fn unix_timestamp() -> u64 {
SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs()
}
/// Milliseconds elapsed since `started_at`, saturating at `u64::MAX` if the
/// 128-bit millisecond count does not fit in a `u64`.
fn duration_millis(started_at: Instant) -> u64 {
    u64::try_from(started_at.elapsed().as_millis()).unwrap_or(u64::MAX)
}
async fn get_status(
State(state): State<Arc<AppState>>,
headers: HeaderMap,
) -> Result<Json<PiStatusResponse>, StatusCode> {
if !validate_bearer(&headers, &state.inbound_api_key) {
warn!("unauthorized status request rejected");
return Err(StatusCode::UNAUTHORIZED);
}
Ok(Json(PiStatusResponse {
@ -73,6 +198,211 @@ async fn get_status(
}))
}
async fn get_metrics(State(state): State<Arc<AppState>>) -> Response {
let mut body = String::new();
let current_status = state.current_door_state().as_door_status();
body.push_str("# HELP noisebell_pi_door_status Current local Pi door status.\n");
body.push_str("# TYPE noisebell_pi_door_status gauge\n");
for status in [DoorStatus::Open, DoorStatus::Closed] {
let value = u8::from(current_status == status);
let status = prometheus_escape_label_value(status.as_str());
body.push_str(&format!("noisebell_pi_door_status{{status=\"{status}\"}} {value}\n"));
}
body.push_str("# HELP noisebell_pi_last_changed_timestamp_seconds Unix timestamp for the last local door state change.\n");
body.push_str("# TYPE noisebell_pi_last_changed_timestamp_seconds gauge\n");
body.push_str(&format!(
"noisebell_pi_last_changed_timestamp_seconds {}\n",
state.last_changed.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_pi_process_start_time_seconds Unix timestamp when the Pi service started.\n");
body.push_str("# TYPE noisebell_pi_process_start_time_seconds gauge\n");
body.push_str(&format!(
"noisebell_pi_process_start_time_seconds {}\n",
state.metrics.process_start_time
));
body.push_str(
"# HELP noisebell_pi_notify_success_total Successful state webhooks sent to the cache.\n",
);
body.push_str("# TYPE noisebell_pi_notify_success_total counter\n");
body.push_str(&format!(
"noisebell_pi_notify_success_total {}\n",
state.metrics.notify_success_total.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_pi_notify_attempt_failure_total Failed state webhook attempts before retry or final failure.\n");
body.push_str("# TYPE noisebell_pi_notify_attempt_failure_total counter\n");
body.push_str(&format!(
"noisebell_pi_notify_attempt_failure_total {}\n",
state.metrics.notify_attempt_failure_total.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_pi_notify_failure_total State changes that failed to reach the cache after all retries.\n");
body.push_str("# TYPE noisebell_pi_notify_failure_total counter\n");
body.push_str(&format!(
"noisebell_pi_notify_failure_total {}\n",
state.metrics.notify_failure_total.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_pi_notify_last_attempt_timestamp_seconds Unix timestamp of the last cache webhook attempt.\n");
body.push_str("# TYPE noisebell_pi_notify_last_attempt_timestamp_seconds gauge\n");
body.push_str(&format!(
"noisebell_pi_notify_last_attempt_timestamp_seconds {}\n",
state.metrics.notify_last_attempt_timestamp.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_pi_notify_last_success_timestamp_seconds Unix timestamp of the last successful cache webhook.\n");
body.push_str("# TYPE noisebell_pi_notify_last_success_timestamp_seconds gauge\n");
body.push_str(&format!(
"noisebell_pi_notify_last_success_timestamp_seconds {}\n",
state.metrics.notify_last_success_timestamp.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_pi_notify_last_failure_timestamp_seconds Unix timestamp of the last failed cache webhook attempt.\n");
body.push_str("# TYPE noisebell_pi_notify_last_failure_timestamp_seconds gauge\n");
body.push_str(&format!(
"noisebell_pi_notify_last_failure_timestamp_seconds {}\n",
state.metrics.notify_last_failure_timestamp.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_pi_notify_last_duration_seconds Duration of the most recent cache webhook attempt.\n");
body.push_str("# TYPE noisebell_pi_notify_last_duration_seconds gauge\n");
body.push_str(&format!(
"noisebell_pi_notify_last_duration_seconds {}\n",
state.metrics.notify_last_duration_millis.load(Ordering::Relaxed) as f64 / 1000.0
));
body.push_str("# HELP noisebell_pi_notify_last_http_status HTTP status from the most recent cache webhook attempt, or 0 when no HTTP response was received.\n");
body.push_str("# TYPE noisebell_pi_notify_last_http_status gauge\n");
body.push_str(&format!(
"noisebell_pi_notify_last_http_status {}\n",
state.metrics.notify_last_http_status.load(Ordering::Relaxed)
));
let last_notify =
NotifyResultKind::from_code(state.metrics.notify_last_result.load(Ordering::Relaxed));
body.push_str(
"# HELP noisebell_pi_notify_last_result Last cache webhook result as one-hot labels.\n",
);
body.push_str("# TYPE noisebell_pi_notify_last_result gauge\n");
for result in NotifyResultKind::ALL {
let value = u8::from(result == last_notify);
body.push_str(&format!(
"noisebell_pi_notify_last_result{{result=\"{}\"}} {value}\n",
result.as_str()
));
}
body.push_str("# HELP noisebell_pi_state_change_total Local debounced door state changes by resulting status.\n");
body.push_str("# TYPE noisebell_pi_state_change_total counter\n");
body.push_str(&format!(
"noisebell_pi_state_change_total{{status=\"open\"}} {}\n",
state.metrics.state_change_open_total.load(Ordering::Relaxed)
));
body.push_str(&format!(
"noisebell_pi_state_change_total{{status=\"closed\"}} {}\n",
state.metrics.state_change_closed_total.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_pi_gpio_raw_level Last GPIO raw signal level, 0 for low and 1 for high.\n");
body.push_str("# TYPE noisebell_pi_gpio_raw_level gauge\n");
body.push_str(&format!(
"noisebell_pi_gpio_raw_level {}\n",
state.metrics.gpio_raw_level.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_pi_gpio_last_read_timestamp_seconds Unix timestamp of the last successful GPIO read.\n");
body.push_str("# TYPE noisebell_pi_gpio_last_read_timestamp_seconds gauge\n");
body.push_str(&format!(
"noisebell_pi_gpio_last_read_timestamp_seconds {}\n",
state.metrics.gpio_last_read_timestamp.load(Ordering::Relaxed)
));
body.push_str("# HELP noisebell_pi_gpio_read_error_total GPIO read errors.\n");
body.push_str("# TYPE noisebell_pi_gpio_read_error_total counter\n");
body.push_str(&format!(
"noisebell_pi_gpio_read_error_total {}\n",
state.metrics.gpio_read_error_total.load(Ordering::Relaxed)
));
if let Some(boot_id) = read_trimmed("/proc/sys/kernel/random/boot_id") {
let boot_id = prometheus_escape_label_value(&boot_id);
body.push_str("# HELP noisebell_pi_boot_info Pi boot identity. Changes on reboot.\n");
body.push_str("# TYPE noisebell_pi_boot_info gauge\n");
body.push_str(&format!("noisebell_pi_boot_info{{boot_id=\"{boot_id}\"}} 1\n"));
}
if let Some(uptime) = read_uptime_seconds() {
body.push_str("# HELP noisebell_pi_uptime_seconds Pi system uptime in seconds.\n");
body.push_str("# TYPE noisebell_pi_uptime_seconds gauge\n");
body.push_str(&format!("noisebell_pi_uptime_seconds {uptime}\n"));
}
if let Some(temp) = read_temperature_celsius() {
body.push_str("# HELP noisebell_pi_temperature_celsius Pi CPU temperature in Celsius.\n");
body.push_str("# TYPE noisebell_pi_temperature_celsius gauge\n");
body.push_str(&format!("noisebell_pi_temperature_celsius {temp}\n"));
}
if let Some(throttled) = read_throttled_flags() {
body.push_str("# HELP noisebell_pi_throttled_flags Raspberry Pi throttling bitfield from vcgencmd get_throttled.\n");
body.push_str("# TYPE noisebell_pi_throttled_flags gauge\n");
body.push_str(&format!("noisebell_pi_throttled_flags {throttled}\n"));
}
if let Some((interface, link, level)) = read_wifi_metrics() {
let interface = prometheus_escape_label_value(&interface);
body.push_str("# HELP noisebell_pi_wifi_link_quality Wireless link quality from /proc/net/wireless.\n");
body.push_str("# TYPE noisebell_pi_wifi_link_quality gauge\n");
body.push_str(&format!(
"noisebell_pi_wifi_link_quality{{interface=\"{interface}\"}} {link}\n"
));
body.push_str("# HELP noisebell_pi_wifi_signal_dbm Wireless signal level in dBm from /proc/net/wireless.\n");
body.push_str("# TYPE noisebell_pi_wifi_signal_dbm gauge\n");
body.push_str(&format!(
"noisebell_pi_wifi_signal_dbm{{interface=\"{interface}\"}} {level}\n"
));
}
body.push_str("# HELP noisebell_pi_tailscale_running Whether tailscale status reports BackendState Running.\n");
body.push_str("# TYPE noisebell_pi_tailscale_running gauge\n");
body.push_str(&format!("noisebell_pi_tailscale_running {}\n", u8::from(tailscale_running())));
([(header::CONTENT_TYPE, PROMETHEUS_CONTENT_TYPE)], body).into_response()
}
/// Reads `path` as UTF-8 text and returns it with surrounding whitespace
/// removed.
///
/// Returns `None` when the file cannot be read or when nothing is left after
/// trimming.
fn read_trimmed(path: &str) -> Option<String> {
    let contents = fs::read_to_string(path).ok()?;
    let trimmed = contents.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
/// Parses the first whitespace-separated field of `/proc/uptime` as seconds.
///
/// Returns `None` if the file is unreadable, empty, or the field is not a number.
fn read_uptime_seconds() -> Option<f64> {
    let contents = read_trimmed("/proc/uptime")?;
    let first_field = contents.split_whitespace().next()?;
    first_field.parse().ok()
}
/// Reads `/sys/class/thermal/thermal_zone0/temp` and divides the value by 1000
/// to obtain degrees Celsius.
///
/// Returns `None` if the file is unreadable, empty, or not a number.
fn read_temperature_celsius() -> Option<f64> {
    read_trimmed("/sys/class/thermal/thermal_zone0/temp")?
        .parse::<f64>()
        .ok()
        .map(|raw| raw / 1000.0)
}
/// Runs `vcgencmd get_throttled` and parses the hexadecimal bitfield from its
/// `throttled=0x…` output.
///
/// Returns `None` if the command cannot be spawned, exits unsuccessfully, or
/// prints something that does not match the expected format.
fn read_throttled_flags() -> Option<u64> {
    let result = Command::new("vcgencmd").arg("get_throttled").output().ok()?;
    if result.status.success() {
        let stdout = String::from_utf8_lossy(&result.stdout);
        let hex_digits = stdout.trim().strip_prefix("throttled=0x")?;
        u64::from_str_radix(hex_digits, 16).ok()
    } else {
        None
    }
}
/// Returns `(interface, link quality, signal level)` for the first interface
/// row of `/proc/net/wireless` that parses.
///
/// Rows that lack a `:` separator or whose numeric fields do not parse are
/// skipped, so one malformed row does not hide a later valid interface.
/// Returns `None` when the file is unreadable or no row parses.
fn read_wifi_metrics() -> Option<(String, f64, f64)> {
    let text = read_trimmed("/proc/net/wireless")?;
    // The first two lines are skipped as column headers; every remaining line
    // is expected to look like `iface: status link. level. ...`.
    text.lines().skip(2).find_map(|line| {
        let (interface, values) = line.split_once(':')?;
        let mut fields = values.split_whitespace();
        let _status = fields.next()?;
        // The numeric columns may carry a trailing '.', which is stripped first.
        let link: f64 = fields.next()?.trim_end_matches('.').parse().ok()?;
        let level: f64 = fields.next()?.trim_end_matches('.').parse().ok()?;
        Some((interface.trim().to_string(), link, level))
    })
}
/// Reports whether `tailscale status --json` exits successfully and its output
/// contains `BackendState` set to `Running`.
///
/// Returns `false` when the CLI cannot be spawned or exits with an error.
fn tailscale_running() -> bool {
    let output = match Command::new("tailscale").args(["status", "--json"]).output() {
        Ok(output) => output,
        Err(_) => return false,
    };
    output.status.success()
        && backend_state_is_running(&String::from_utf8_lossy(&output.stdout))
}

/// Checks status JSON text for a `"BackendState"` key whose value is `"Running"`.
///
/// Whitespace around the `:` is tolerated, so both indented output
/// (`"BackendState": "Running"`) and compact output (`"BackendState":"Running"`)
/// match. An exact-substring check for the compact form misses the indented one.
fn backend_state_is_running(status_json: &str) -> bool {
    const KEY: &str = "\"BackendState\"";
    const RUNNING: &str = "\"Running\"";

    status_json.match_indices(KEY).any(|(start, _)| {
        // KEY is ASCII, so `start + KEY.len()` is always a char boundary.
        status_json[start + KEY.len()..]
            .trim_start()
            .strip_prefix(':')
            .map_or(false, |value| value.trim_start().starts_with(RUNNING))
    })
}
#[tokio::main]
async fn main() -> Result<()> {
tracing_subscriber::fmt()
@ -151,17 +481,23 @@ async fn main() -> Result<()> {
door_state: AtomicU8::new(initial_state as u8),
last_changed: AtomicU64::new(now),
inbound_api_key,
metrics: AppMetrics::new(),
});
state
.metrics
.gpio_raw_level
.store(u8::from(initial_raw_level == SignalLevel::High), Ordering::Relaxed);
state.metrics.gpio_last_read_timestamp.store(now, Ordering::Relaxed);
info!(
initial_status = %initial_state.as_door_status(),
"GPIO initialized"
);
let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::<(DoorStatus, u64)>();
let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::<(DoorStatus, u64, StateEventKind)>();
// Sync initial state with the cache on startup
let _ = tx.send((initial_state.as_door_status(), now));
let _ = tx.send((initial_state.as_door_status(), now, StateEventKind::Startup));
// Poll the input level and debounce in software. This is less elegant than
// edge-triggered reads, but it is robust on Raspberry Pi OS.
@ -173,18 +509,46 @@ async fn main() -> Result<()> {
let mut current_state = initial_state;
let mut pending_state = current_state;
let mut pending_since = std::time::Instant::now();
let mut gpio_read_error_count = 0u64;
let mut last_gpio_error_log: Option<Instant> = None;
loop {
let values = match inputs.get_values([false]) {
Ok(values) => values,
Err(e) => {
error!(error = %e, "failed to read GPIO value");
state_for_edges.metrics.gpio_read_error_total.fetch_add(1, Ordering::Relaxed);
gpio_read_error_count = gpio_read_error_count.saturating_add(1);
let should_log = last_gpio_error_log
.map(|last| last.elapsed() >= Duration::from_secs(60))
.unwrap_or(true);
if should_log {
error!(
error = %e,
consecutive_errors = gpio_read_error_count,
"failed to read GPIO value"
);
last_gpio_error_log = Some(Instant::now());
}
std::thread::sleep(Duration::from_secs(1));
continue;
}
};
if gpio_read_error_count > 0 {
info!(recovered_after_errors = gpio_read_error_count, "GPIO reads recovered");
gpio_read_error_count = 0;
last_gpio_error_log = None;
}
let new_raw_level = if values[0] { SignalLevel::High } else { SignalLevel::Low };
state_for_edges
.metrics
.gpio_raw_level
.store(u8::from(new_raw_level == SignalLevel::High), Ordering::Relaxed);
state_for_edges
.metrics
.gpio_last_read_timestamp
.store(unix_timestamp(), Ordering::Relaxed);
let new_state = LocalDoorState::from_raw_level(new_raw_level, active_level);
if new_state != pending_state {
@ -203,7 +567,25 @@ async fn main() -> Result<()> {
let timestamp = unix_timestamp();
state_for_edges.last_changed.store(timestamp, Ordering::Relaxed);
let _ = edge_tx.send((new_state.as_door_status(), timestamp));
match new_state {
LocalDoorState::Open => {
state_for_edges
.metrics
.state_change_open_total
.fetch_add(1, Ordering::Relaxed);
}
LocalDoorState::Closed => {
state_for_edges
.metrics
.state_change_closed_total
.fetch_add(1, Ordering::Relaxed);
}
}
let _ = edge_tx.send((
new_state.as_door_status(),
timestamp,
StateEventKind::StateChange,
));
}
std::thread::sleep(poll_interval);
@ -211,33 +593,97 @@ async fn main() -> Result<()> {
});
drop(tx); // Drop original sender so rx closes when edge_handle is dropped
let state_for_notify = state.clone();
let notify_handle = tokio::spawn(async move {
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(http_timeout_secs))
.build()
.expect("failed to build HTTP client");
while let Some((status, timestamp)) = rx.recv().await {
info!(status = %status, timestamp, "state changed");
while let Some((status, timestamp, event_kind)) = rx.recv().await {
match event_kind {
StateEventKind::Startup => {
info!(status = %status, timestamp, event = event_kind.as_str(), "syncing initial door state");
}
StateEventKind::StateChange => {
info!(status = %status, timestamp, event = event_kind.as_str(), "door state changed");
}
}
let payload = WebhookPayload { status, timestamp };
for attempt in 0..=retry_attempts {
let notify_started_at = Instant::now();
state_for_notify.metrics.record_notify_attempt(unix_timestamp());
let result =
client.post(&endpoint_url).bearer_auth(&api_key).json(&payload).send().await;
match result {
Ok(resp) if resp.status().is_success() => break,
_ => {
Ok(resp) if resp.status().is_success() => {
let duration_ms = duration_millis(notify_started_at);
let http_status = resp.status().as_u16();
state_for_notify.metrics.record_notify_success(
unix_timestamp(),
duration_ms,
http_status,
);
info!(
status = %payload.status,
timestamp = payload.timestamp,
event = event_kind.as_str(),
http_status,
duration_ms,
attempts = attempt + 1,
"notified cache of door state"
);
break;
}
result => {
let err_msg = match &result {
Ok(resp) => format!("HTTP {}", resp.status()),
Err(e) => e.to_string(),
};
let http_status = result.as_ref().ok().map(|resp| resp.status().as_u16());
let kind = if http_status.is_some() {
NotifyResultKind::HttpError
} else {
NotifyResultKind::RequestError
};
let duration_ms = duration_millis(notify_started_at);
state_for_notify.metrics.record_notify_failure(
kind,
unix_timestamp(),
duration_ms,
http_status,
attempt == retry_attempts,
);
if attempt == retry_attempts {
error!(error = %err_msg, "failed to notify endpoint after {} attempts", retry_attempts + 1);
error!(
status = %payload.status,
timestamp = payload.timestamp,
event = event_kind.as_str(),
error = %err_msg,
kind = kind.as_str(),
http_status = http_status.unwrap_or(0),
duration_ms,
attempts = retry_attempts + 1,
"failed to notify cache after retries"
);
} else {
let delay =
Duration::from_secs(retry_base_delay_secs * 2u64.pow(attempt));
warn!(error = %err_msg, attempt = attempt + 1, "notify failed, retrying in {:?}", delay);
warn!(
status = %payload.status,
timestamp = payload.timestamp,
event = event_kind.as_str(),
error = %err_msg,
kind = kind.as_str(),
http_status = http_status.unwrap_or(0),
duration_ms,
attempt = attempt + 1,
total_attempts = retry_attempts + 1,
delay_seconds = delay.as_secs(),
"notify cache failed, retrying"
);
tokio::time::sleep(delay).await;
}
}
@ -246,7 +692,10 @@ async fn main() -> Result<()> {
}
});
let app = Router::new().route("/", get(get_status)).with_state(state);
let app = Router::new()
.route("/", get(get_status))
.route("/metrics", get(get_metrics))
.with_state(state);
let listener = tokio::net::TcpListener::bind((&*bind_address, port))
.await