feat: add noisebell observability
This commit is contained in:
parent
b57927a395
commit
e6c1b82679
24 changed files with 2289 additions and 137 deletions
|
|
@ -1,15 +1,18 @@
|
|||
use std::sync::atomic::{AtomicU64, AtomicU8, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
|
||||
use std::{fs, process::Command};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use axum::extract::State;
|
||||
use axum::http::{HeaderMap, StatusCode};
|
||||
use axum::http::{header, HeaderMap, StatusCode};
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum::routing::get;
|
||||
use axum::{Json, Router};
|
||||
use gpiod::{Bias, Chip, Options};
|
||||
use noisebell_common::{
|
||||
validate_bearer, DoorStatus, PiStatusResponse, SignalLevel, WebhookPayload,
|
||||
prometheus_escape_label_value, validate_bearer, DoorStatus, PiStatusResponse, SignalLevel,
|
||||
WebhookPayload, PROMETHEUS_CONTENT_TYPE,
|
||||
};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
|
|
@ -44,10 +47,126 @@ impl LocalDoorState {
|
|||
}
|
||||
}
|
||||
|
||||
/// Reason a door-state event was queued for delivery to the cache.
#[derive(Clone, Copy)]
enum StateEventKind {
    /// Initial state sent once at startup to sync the cache.
    Startup,
    /// State sent after the polling loop observed a change.
    StateChange,
}

impl StateEventKind {
    /// Returns the snake_case label used for the `event` field in log lines.
    const fn as_str(self) -> &'static str {
        match self {
            StateEventKind::Startup => "startup",
            StateEventKind::StateChange => "state_change",
        }
    }
}
|
||||
|
||||
struct AppState {
|
||||
door_state: AtomicU8,
|
||||
last_changed: AtomicU64,
|
||||
inbound_api_key: String,
|
||||
metrics: AppMetrics,
|
||||
}
|
||||
|
||||
struct AppMetrics {
|
||||
process_start_time: u64,
|
||||
notify_success_total: AtomicU64,
|
||||
notify_attempt_failure_total: AtomicU64,
|
||||
notify_failure_total: AtomicU64,
|
||||
notify_last_attempt_timestamp: AtomicU64,
|
||||
notify_last_success_timestamp: AtomicU64,
|
||||
notify_last_failure_timestamp: AtomicU64,
|
||||
notify_last_duration_millis: AtomicU64,
|
||||
notify_last_http_status: AtomicU64,
|
||||
notify_last_result: AtomicU64,
|
||||
state_change_open_total: AtomicU64,
|
||||
state_change_closed_total: AtomicU64,
|
||||
gpio_last_read_timestamp: AtomicU64,
|
||||
gpio_raw_level: AtomicU8,
|
||||
gpio_read_error_total: AtomicU64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||
enum NotifyResultKind {
|
||||
Never = 0,
|
||||
Success = 1,
|
||||
HttpError = 2,
|
||||
RequestError = 3,
|
||||
}
|
||||
|
||||
impl NotifyResultKind {
|
||||
const ALL: [Self; 4] = [Self::Never, Self::Success, Self::HttpError, Self::RequestError];
|
||||
|
||||
const fn as_str(self) -> &'static str {
|
||||
match self {
|
||||
Self::Never => "never",
|
||||
Self::Success => "success",
|
||||
Self::HttpError => "http_error",
|
||||
Self::RequestError => "request_error",
|
||||
}
|
||||
}
|
||||
|
||||
const fn from_code(code: u64) -> Self {
|
||||
match code {
|
||||
1 => Self::Success,
|
||||
2 => Self::HttpError,
|
||||
3 => Self::RequestError,
|
||||
_ => Self::Never,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AppMetrics {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
process_start_time: unix_timestamp(),
|
||||
notify_success_total: AtomicU64::new(0),
|
||||
notify_attempt_failure_total: AtomicU64::new(0),
|
||||
notify_failure_total: AtomicU64::new(0),
|
||||
notify_last_attempt_timestamp: AtomicU64::new(0),
|
||||
notify_last_success_timestamp: AtomicU64::new(0),
|
||||
notify_last_failure_timestamp: AtomicU64::new(0),
|
||||
notify_last_duration_millis: AtomicU64::new(0),
|
||||
notify_last_http_status: AtomicU64::new(0),
|
||||
notify_last_result: AtomicU64::new(NotifyResultKind::Never as u64),
|
||||
state_change_open_total: AtomicU64::new(0),
|
||||
state_change_closed_total: AtomicU64::new(0),
|
||||
gpio_last_read_timestamp: AtomicU64::new(0),
|
||||
gpio_raw_level: AtomicU8::new(0),
|
||||
gpio_read_error_total: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
fn record_notify_attempt(&self, timestamp: u64) {
|
||||
self.notify_last_attempt_timestamp.store(timestamp, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn record_notify_success(&self, timestamp: u64, duration_millis: u64, status: u16) {
|
||||
self.notify_success_total.fetch_add(1, Ordering::Relaxed);
|
||||
self.notify_last_success_timestamp.store(timestamp, Ordering::Relaxed);
|
||||
self.notify_last_duration_millis.store(duration_millis, Ordering::Relaxed);
|
||||
self.notify_last_http_status.store(u64::from(status), Ordering::Relaxed);
|
||||
self.notify_last_result.store(NotifyResultKind::Success as u64, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn record_notify_failure(
|
||||
&self,
|
||||
kind: NotifyResultKind,
|
||||
timestamp: u64,
|
||||
duration_millis: u64,
|
||||
status: Option<u16>,
|
||||
final_failure: bool,
|
||||
) {
|
||||
self.notify_attempt_failure_total.fetch_add(1, Ordering::Relaxed);
|
||||
if final_failure {
|
||||
self.notify_failure_total.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
self.notify_last_failure_timestamp.store(timestamp, Ordering::Relaxed);
|
||||
self.notify_last_duration_millis.store(duration_millis, Ordering::Relaxed);
|
||||
self.notify_last_http_status.store(status.map(u64::from).unwrap_or(0), Ordering::Relaxed);
|
||||
self.notify_last_result.store(kind as u64, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
impl AppState {
|
||||
|
|
@ -60,11 +179,17 @@ fn unix_timestamp() -> u64 {
|
|||
SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs()
|
||||
}
|
||||
|
||||
/// Returns the whole milliseconds elapsed since `started_at`, saturating at
/// `u64::MAX` if the value does not fit in a `u64`.
fn duration_millis(started_at: Instant) -> u64 {
    u64::try_from(started_at.elapsed().as_millis()).unwrap_or(u64::MAX)
}
|
||||
|
||||
async fn get_status(
|
||||
State(state): State<Arc<AppState>>,
|
||||
headers: HeaderMap,
|
||||
) -> Result<Json<PiStatusResponse>, StatusCode> {
|
||||
if !validate_bearer(&headers, &state.inbound_api_key) {
|
||||
warn!("unauthorized status request rejected");
|
||||
return Err(StatusCode::UNAUTHORIZED);
|
||||
}
|
||||
Ok(Json(PiStatusResponse {
|
||||
|
|
@ -73,6 +198,211 @@ async fn get_status(
|
|||
}))
|
||||
}
|
||||
|
||||
async fn get_metrics(State(state): State<Arc<AppState>>) -> Response {
|
||||
let mut body = String::new();
|
||||
let current_status = state.current_door_state().as_door_status();
|
||||
|
||||
body.push_str("# HELP noisebell_pi_door_status Current local Pi door status.\n");
|
||||
body.push_str("# TYPE noisebell_pi_door_status gauge\n");
|
||||
for status in [DoorStatus::Open, DoorStatus::Closed] {
|
||||
let value = u8::from(current_status == status);
|
||||
let status = prometheus_escape_label_value(status.as_str());
|
||||
body.push_str(&format!("noisebell_pi_door_status{{status=\"{status}\"}} {value}\n"));
|
||||
}
|
||||
body.push_str("# HELP noisebell_pi_last_changed_timestamp_seconds Unix timestamp for the last local door state change.\n");
|
||||
body.push_str("# TYPE noisebell_pi_last_changed_timestamp_seconds gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_last_changed_timestamp_seconds {}\n",
|
||||
state.last_changed.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_pi_process_start_time_seconds Unix timestamp when the Pi service started.\n");
|
||||
body.push_str("# TYPE noisebell_pi_process_start_time_seconds gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_process_start_time_seconds {}\n",
|
||||
state.metrics.process_start_time
|
||||
));
|
||||
body.push_str(
|
||||
"# HELP noisebell_pi_notify_success_total Successful state webhooks sent to the cache.\n",
|
||||
);
|
||||
body.push_str("# TYPE noisebell_pi_notify_success_total counter\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_notify_success_total {}\n",
|
||||
state.metrics.notify_success_total.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_pi_notify_attempt_failure_total Failed state webhook attempts before retry or final failure.\n");
|
||||
body.push_str("# TYPE noisebell_pi_notify_attempt_failure_total counter\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_notify_attempt_failure_total {}\n",
|
||||
state.metrics.notify_attempt_failure_total.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_pi_notify_failure_total State changes that failed to reach the cache after all retries.\n");
|
||||
body.push_str("# TYPE noisebell_pi_notify_failure_total counter\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_notify_failure_total {}\n",
|
||||
state.metrics.notify_failure_total.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_pi_notify_last_attempt_timestamp_seconds Unix timestamp of the last cache webhook attempt.\n");
|
||||
body.push_str("# TYPE noisebell_pi_notify_last_attempt_timestamp_seconds gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_notify_last_attempt_timestamp_seconds {}\n",
|
||||
state.metrics.notify_last_attempt_timestamp.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_pi_notify_last_success_timestamp_seconds Unix timestamp of the last successful cache webhook.\n");
|
||||
body.push_str("# TYPE noisebell_pi_notify_last_success_timestamp_seconds gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_notify_last_success_timestamp_seconds {}\n",
|
||||
state.metrics.notify_last_success_timestamp.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_pi_notify_last_failure_timestamp_seconds Unix timestamp of the last failed cache webhook attempt.\n");
|
||||
body.push_str("# TYPE noisebell_pi_notify_last_failure_timestamp_seconds gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_notify_last_failure_timestamp_seconds {}\n",
|
||||
state.metrics.notify_last_failure_timestamp.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_pi_notify_last_duration_seconds Duration of the most recent cache webhook attempt.\n");
|
||||
body.push_str("# TYPE noisebell_pi_notify_last_duration_seconds gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_notify_last_duration_seconds {}\n",
|
||||
state.metrics.notify_last_duration_millis.load(Ordering::Relaxed) as f64 / 1000.0
|
||||
));
|
||||
body.push_str("# HELP noisebell_pi_notify_last_http_status HTTP status from the most recent cache webhook attempt, or 0 when no HTTP response was received.\n");
|
||||
body.push_str("# TYPE noisebell_pi_notify_last_http_status gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_notify_last_http_status {}\n",
|
||||
state.metrics.notify_last_http_status.load(Ordering::Relaxed)
|
||||
));
|
||||
let last_notify =
|
||||
NotifyResultKind::from_code(state.metrics.notify_last_result.load(Ordering::Relaxed));
|
||||
body.push_str(
|
||||
"# HELP noisebell_pi_notify_last_result Last cache webhook result as one-hot labels.\n",
|
||||
);
|
||||
body.push_str("# TYPE noisebell_pi_notify_last_result gauge\n");
|
||||
for result in NotifyResultKind::ALL {
|
||||
let value = u8::from(result == last_notify);
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_notify_last_result{{result=\"{}\"}} {value}\n",
|
||||
result.as_str()
|
||||
));
|
||||
}
|
||||
body.push_str("# HELP noisebell_pi_state_change_total Local debounced door state changes by resulting status.\n");
|
||||
body.push_str("# TYPE noisebell_pi_state_change_total counter\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_state_change_total{{status=\"open\"}} {}\n",
|
||||
state.metrics.state_change_open_total.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_state_change_total{{status=\"closed\"}} {}\n",
|
||||
state.metrics.state_change_closed_total.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_pi_gpio_raw_level Last GPIO raw signal level, 0 for low and 1 for high.\n");
|
||||
body.push_str("# TYPE noisebell_pi_gpio_raw_level gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_gpio_raw_level {}\n",
|
||||
state.metrics.gpio_raw_level.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_pi_gpio_last_read_timestamp_seconds Unix timestamp of the last successful GPIO read.\n");
|
||||
body.push_str("# TYPE noisebell_pi_gpio_last_read_timestamp_seconds gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_gpio_last_read_timestamp_seconds {}\n",
|
||||
state.metrics.gpio_last_read_timestamp.load(Ordering::Relaxed)
|
||||
));
|
||||
body.push_str("# HELP noisebell_pi_gpio_read_error_total GPIO read errors.\n");
|
||||
body.push_str("# TYPE noisebell_pi_gpio_read_error_total counter\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_gpio_read_error_total {}\n",
|
||||
state.metrics.gpio_read_error_total.load(Ordering::Relaxed)
|
||||
));
|
||||
|
||||
if let Some(boot_id) = read_trimmed("/proc/sys/kernel/random/boot_id") {
|
||||
let boot_id = prometheus_escape_label_value(&boot_id);
|
||||
body.push_str("# HELP noisebell_pi_boot_info Pi boot identity. Changes on reboot.\n");
|
||||
body.push_str("# TYPE noisebell_pi_boot_info gauge\n");
|
||||
body.push_str(&format!("noisebell_pi_boot_info{{boot_id=\"{boot_id}\"}} 1\n"));
|
||||
}
|
||||
if let Some(uptime) = read_uptime_seconds() {
|
||||
body.push_str("# HELP noisebell_pi_uptime_seconds Pi system uptime in seconds.\n");
|
||||
body.push_str("# TYPE noisebell_pi_uptime_seconds gauge\n");
|
||||
body.push_str(&format!("noisebell_pi_uptime_seconds {uptime}\n"));
|
||||
}
|
||||
if let Some(temp) = read_temperature_celsius() {
|
||||
body.push_str("# HELP noisebell_pi_temperature_celsius Pi CPU temperature in Celsius.\n");
|
||||
body.push_str("# TYPE noisebell_pi_temperature_celsius gauge\n");
|
||||
body.push_str(&format!("noisebell_pi_temperature_celsius {temp}\n"));
|
||||
}
|
||||
if let Some(throttled) = read_throttled_flags() {
|
||||
body.push_str("# HELP noisebell_pi_throttled_flags Raspberry Pi throttling bitfield from vcgencmd get_throttled.\n");
|
||||
body.push_str("# TYPE noisebell_pi_throttled_flags gauge\n");
|
||||
body.push_str(&format!("noisebell_pi_throttled_flags {throttled}\n"));
|
||||
}
|
||||
if let Some((interface, link, level)) = read_wifi_metrics() {
|
||||
let interface = prometheus_escape_label_value(&interface);
|
||||
body.push_str("# HELP noisebell_pi_wifi_link_quality Wireless link quality from /proc/net/wireless.\n");
|
||||
body.push_str("# TYPE noisebell_pi_wifi_link_quality gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_wifi_link_quality{{interface=\"{interface}\"}} {link}\n"
|
||||
));
|
||||
body.push_str("# HELP noisebell_pi_wifi_signal_dbm Wireless signal level in dBm from /proc/net/wireless.\n");
|
||||
body.push_str("# TYPE noisebell_pi_wifi_signal_dbm gauge\n");
|
||||
body.push_str(&format!(
|
||||
"noisebell_pi_wifi_signal_dbm{{interface=\"{interface}\"}} {level}\n"
|
||||
));
|
||||
}
|
||||
|
||||
body.push_str("# HELP noisebell_pi_tailscale_running Whether tailscale status reports BackendState Running.\n");
|
||||
body.push_str("# TYPE noisebell_pi_tailscale_running gauge\n");
|
||||
body.push_str(&format!("noisebell_pi_tailscale_running {}\n", u8::from(tailscale_running())));
|
||||
|
||||
([(header::CONTENT_TYPE, PROMETHEUS_CONTENT_TYPE)], body).into_response()
|
||||
}
|
||||
|
||||
/// Reads the file at `path` and returns its contents with surrounding
/// whitespace removed.
///
/// Returns `None` when the file cannot be read as a string or when nothing
/// remains after trimming.
fn read_trimmed(path: &str) -> Option<String> {
    let contents = fs::read_to_string(path).ok()?;
    let trimmed = contents.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
|
||||
|
||||
fn read_uptime_seconds() -> Option<f64> {
|
||||
read_trimmed("/proc/uptime")?.split_whitespace().next()?.parse().ok()
|
||||
}
|
||||
|
||||
fn read_temperature_celsius() -> Option<f64> {
|
||||
let raw: f64 = read_trimmed("/sys/class/thermal/thermal_zone0/temp")?.parse().ok()?;
|
||||
Some(raw / 1000.0)
|
||||
}
|
||||
|
||||
/// Runs `vcgencmd get_throttled` and returns the hexadecimal bitfield it
/// prints after the `throttled=0x` prefix.
///
/// Returns `None` if the command cannot be run, exits unsuccessfully, or
/// prints output in any other shape.
fn read_throttled_flags() -> Option<u64> {
    let output = Command::new("vcgencmd")
        .arg("get_throttled")
        .output()
        .ok()
        .filter(|output| output.status.success())?;
    let stdout = String::from_utf8_lossy(&output.stdout);
    stdout
        .trim()
        .strip_prefix("throttled=0x")
        .and_then(|hex| u64::from_str_radix(hex, 16).ok())
}
|
||||
|
||||
fn read_wifi_metrics() -> Option<(String, f64, f64)> {
|
||||
let text = read_trimmed("/proc/net/wireless")?;
|
||||
for line in text.lines().skip(2) {
|
||||
let (interface, values) = line.split_once(':')?;
|
||||
let mut parts = values.split_whitespace();
|
||||
let _status = parts.next()?;
|
||||
let link: f64 = parts.next()?.trim_end_matches('.').parse().ok()?;
|
||||
let level: f64 = parts.next()?.trim_end_matches('.').parse().ok()?;
|
||||
return Some((interface.trim().to_string(), link, level));
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Reports whether `tailscale status --json` succeeds and its output has a
/// `BackendState` of `Running`.
///
/// Returns `false` if the command cannot be run or exits unsuccessfully.
fn tailscale_running() -> bool {
    match Command::new("tailscale").args(["status", "--json"]).output() {
        Ok(output) if output.status.success() => {
            backend_state_is_running(&String::from_utf8_lossy(&output.stdout))
        }
        _ => false,
    }
}

/// Returns true when `status_json` contains a `"BackendState"` key whose
/// value is the string `"Running"`.
///
/// Whitespace around the `:` is ignored: the CLI prints indented JSON with a
/// space after the colon, so an exact `"BackendState":"Running"` substring
/// match would never succeed on real output.
fn backend_state_is_running(status_json: &str) -> bool {
    const KEY: &str = "\"BackendState\"";
    status_json.match_indices(KEY).any(|(start, _)| {
        // KEY is ASCII, so `start + KEY.len()` is always a char boundary.
        status_json[start + KEY.len()..]
            .trim_start()
            .strip_prefix(':')
            .map_or(false, |value| value.trim_start().starts_with("\"Running\""))
    })
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
tracing_subscriber::fmt()
|
||||
|
|
@ -151,17 +481,23 @@ async fn main() -> Result<()> {
|
|||
door_state: AtomicU8::new(initial_state as u8),
|
||||
last_changed: AtomicU64::new(now),
|
||||
inbound_api_key,
|
||||
metrics: AppMetrics::new(),
|
||||
});
|
||||
state
|
||||
.metrics
|
||||
.gpio_raw_level
|
||||
.store(u8::from(initial_raw_level == SignalLevel::High), Ordering::Relaxed);
|
||||
state.metrics.gpio_last_read_timestamp.store(now, Ordering::Relaxed);
|
||||
|
||||
info!(
|
||||
initial_status = %initial_state.as_door_status(),
|
||||
"GPIO initialized"
|
||||
);
|
||||
|
||||
let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::<(DoorStatus, u64)>();
|
||||
let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::<(DoorStatus, u64, StateEventKind)>();
|
||||
|
||||
// Sync initial state with the cache on startup
|
||||
let _ = tx.send((initial_state.as_door_status(), now));
|
||||
let _ = tx.send((initial_state.as_door_status(), now, StateEventKind::Startup));
|
||||
|
||||
// Poll the input level and debounce in software. This is less elegant than
|
||||
// edge-triggered reads, but it is robust on Raspberry Pi OS.
|
||||
|
|
@ -173,18 +509,46 @@ async fn main() -> Result<()> {
|
|||
let mut current_state = initial_state;
|
||||
let mut pending_state = current_state;
|
||||
let mut pending_since = std::time::Instant::now();
|
||||
let mut gpio_read_error_count = 0u64;
|
||||
let mut last_gpio_error_log: Option<Instant> = None;
|
||||
|
||||
loop {
|
||||
let values = match inputs.get_values([false]) {
|
||||
Ok(values) => values,
|
||||
Err(e) => {
|
||||
error!(error = %e, "failed to read GPIO value");
|
||||
state_for_edges.metrics.gpio_read_error_total.fetch_add(1, Ordering::Relaxed);
|
||||
gpio_read_error_count = gpio_read_error_count.saturating_add(1);
|
||||
let should_log = last_gpio_error_log
|
||||
.map(|last| last.elapsed() >= Duration::from_secs(60))
|
||||
.unwrap_or(true);
|
||||
if should_log {
|
||||
error!(
|
||||
error = %e,
|
||||
consecutive_errors = gpio_read_error_count,
|
||||
"failed to read GPIO value"
|
||||
);
|
||||
last_gpio_error_log = Some(Instant::now());
|
||||
}
|
||||
std::thread::sleep(Duration::from_secs(1));
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if gpio_read_error_count > 0 {
|
||||
info!(recovered_after_errors = gpio_read_error_count, "GPIO reads recovered");
|
||||
gpio_read_error_count = 0;
|
||||
last_gpio_error_log = None;
|
||||
}
|
||||
|
||||
let new_raw_level = if values[0] { SignalLevel::High } else { SignalLevel::Low };
|
||||
state_for_edges
|
||||
.metrics
|
||||
.gpio_raw_level
|
||||
.store(u8::from(new_raw_level == SignalLevel::High), Ordering::Relaxed);
|
||||
state_for_edges
|
||||
.metrics
|
||||
.gpio_last_read_timestamp
|
||||
.store(unix_timestamp(), Ordering::Relaxed);
|
||||
let new_state = LocalDoorState::from_raw_level(new_raw_level, active_level);
|
||||
|
||||
if new_state != pending_state {
|
||||
|
|
@ -203,7 +567,25 @@ async fn main() -> Result<()> {
|
|||
|
||||
let timestamp = unix_timestamp();
|
||||
state_for_edges.last_changed.store(timestamp, Ordering::Relaxed);
|
||||
let _ = edge_tx.send((new_state.as_door_status(), timestamp));
|
||||
match new_state {
|
||||
LocalDoorState::Open => {
|
||||
state_for_edges
|
||||
.metrics
|
||||
.state_change_open_total
|
||||
.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
LocalDoorState::Closed => {
|
||||
state_for_edges
|
||||
.metrics
|
||||
.state_change_closed_total
|
||||
.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
let _ = edge_tx.send((
|
||||
new_state.as_door_status(),
|
||||
timestamp,
|
||||
StateEventKind::StateChange,
|
||||
));
|
||||
}
|
||||
|
||||
std::thread::sleep(poll_interval);
|
||||
|
|
@ -211,33 +593,97 @@ async fn main() -> Result<()> {
|
|||
});
|
||||
drop(tx); // Drop original sender so rx closes when edge_handle is dropped
|
||||
|
||||
let state_for_notify = state.clone();
|
||||
let notify_handle = tokio::spawn(async move {
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(http_timeout_secs))
|
||||
.build()
|
||||
.expect("failed to build HTTP client");
|
||||
|
||||
while let Some((status, timestamp)) = rx.recv().await {
|
||||
info!(status = %status, timestamp, "state changed");
|
||||
while let Some((status, timestamp, event_kind)) = rx.recv().await {
|
||||
match event_kind {
|
||||
StateEventKind::Startup => {
|
||||
info!(status = %status, timestamp, event = event_kind.as_str(), "syncing initial door state");
|
||||
}
|
||||
StateEventKind::StateChange => {
|
||||
info!(status = %status, timestamp, event = event_kind.as_str(), "door state changed");
|
||||
}
|
||||
}
|
||||
|
||||
let payload = WebhookPayload { status, timestamp };
|
||||
|
||||
for attempt in 0..=retry_attempts {
|
||||
let notify_started_at = Instant::now();
|
||||
state_for_notify.metrics.record_notify_attempt(unix_timestamp());
|
||||
let result =
|
||||
client.post(&endpoint_url).bearer_auth(&api_key).json(&payload).send().await;
|
||||
match result {
|
||||
Ok(resp) if resp.status().is_success() => break,
|
||||
_ => {
|
||||
Ok(resp) if resp.status().is_success() => {
|
||||
let duration_ms = duration_millis(notify_started_at);
|
||||
let http_status = resp.status().as_u16();
|
||||
state_for_notify.metrics.record_notify_success(
|
||||
unix_timestamp(),
|
||||
duration_ms,
|
||||
http_status,
|
||||
);
|
||||
info!(
|
||||
status = %payload.status,
|
||||
timestamp = payload.timestamp,
|
||||
event = event_kind.as_str(),
|
||||
http_status,
|
||||
duration_ms,
|
||||
attempts = attempt + 1,
|
||||
"notified cache of door state"
|
||||
);
|
||||
break;
|
||||
}
|
||||
result => {
|
||||
let err_msg = match &result {
|
||||
Ok(resp) => format!("HTTP {}", resp.status()),
|
||||
Err(e) => e.to_string(),
|
||||
};
|
||||
let http_status = result.as_ref().ok().map(|resp| resp.status().as_u16());
|
||||
let kind = if http_status.is_some() {
|
||||
NotifyResultKind::HttpError
|
||||
} else {
|
||||
NotifyResultKind::RequestError
|
||||
};
|
||||
let duration_ms = duration_millis(notify_started_at);
|
||||
state_for_notify.metrics.record_notify_failure(
|
||||
kind,
|
||||
unix_timestamp(),
|
||||
duration_ms,
|
||||
http_status,
|
||||
attempt == retry_attempts,
|
||||
);
|
||||
if attempt == retry_attempts {
|
||||
error!(error = %err_msg, "failed to notify endpoint after {} attempts", retry_attempts + 1);
|
||||
error!(
|
||||
status = %payload.status,
|
||||
timestamp = payload.timestamp,
|
||||
event = event_kind.as_str(),
|
||||
error = %err_msg,
|
||||
kind = kind.as_str(),
|
||||
http_status = http_status.unwrap_or(0),
|
||||
duration_ms,
|
||||
attempts = retry_attempts + 1,
|
||||
"failed to notify cache after retries"
|
||||
);
|
||||
} else {
|
||||
let delay =
|
||||
Duration::from_secs(retry_base_delay_secs * 2u64.pow(attempt));
|
||||
warn!(error = %err_msg, attempt = attempt + 1, "notify failed, retrying in {:?}", delay);
|
||||
warn!(
|
||||
status = %payload.status,
|
||||
timestamp = payload.timestamp,
|
||||
event = event_kind.as_str(),
|
||||
error = %err_msg,
|
||||
kind = kind.as_str(),
|
||||
http_status = http_status.unwrap_or(0),
|
||||
duration_ms,
|
||||
attempt = attempt + 1,
|
||||
total_attempts = retry_attempts + 1,
|
||||
delay_seconds = delay.as_secs(),
|
||||
"notify cache failed, retrying"
|
||||
);
|
||||
tokio::time::sleep(delay).await;
|
||||
}
|
||||
}
|
||||
|
|
@ -246,7 +692,10 @@ async fn main() -> Result<()> {
|
|||
}
|
||||
});
|
||||
|
||||
let app = Router::new().route("/", get(get_status)).with_state(state);
|
||||
let app = Router::new()
|
||||
.route("/", get(get_status))
|
||||
.route("/metrics", get(get_metrics))
|
||||
.with_state(state);
|
||||
|
||||
let listener = tokio::net::TcpListener::bind((&*bind_address, port))
|
||||
.await
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue