feat: add noisebell observability
This commit is contained in:
parent
b57927a395
commit
e6c1b82679
24 changed files with 2289 additions and 137 deletions
5
Cargo.lock
generated
5
Cargo.lock
generated
|
|
@ -886,7 +886,6 @@ dependencies = [
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tower-http",
|
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
]
|
]
|
||||||
|
|
@ -912,7 +911,6 @@ dependencies = [
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"serenity",
|
"serenity",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tower-http",
|
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
]
|
]
|
||||||
|
|
@ -940,7 +938,6 @@ dependencies = [
|
||||||
"noisebell-common",
|
"noisebell-common",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tower-http",
|
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
]
|
]
|
||||||
|
|
@ -955,7 +952,6 @@ dependencies = [
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"serde",
|
"serde",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tower-http",
|
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
]
|
]
|
||||||
|
|
@ -1808,7 +1804,6 @@ dependencies = [
|
||||||
"tower",
|
"tower",
|
||||||
"tower-layer",
|
"tower-layer",
|
||||||
"tower-service",
|
"tower-service",
|
||||||
"tracing",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
|
||||||
12
README.md
12
README.md
|
|
@ -34,3 +34,15 @@ Useful commands:
|
||||||
- `scripts/deploy-pios-pi.sh pi@100.66.45.36` redeploys the Raspberry Pi OS machine
|
- `scripts/deploy-pios-pi.sh pi@100.66.45.36` redeploys the Raspberry Pi OS machine
|
||||||
|
|
||||||
The full Home Assistant relay workflow is documented in `pi/README.md`.
|
The full Home Assistant relay workflow is documented in `pi/README.md`.
|
||||||
|
|
||||||
|
## Observability
|
||||||
|
|
||||||
|
The DigitalOcean host runs Prometheus, Loki, Grafana, Alloy, node_exporter, and blackbox_exporter via `hosts/noisebell-do/observability.nix`. Grafana provisions the `Noisebell DO + Pi` dashboard from code, with Prometheus panels for both hosts, detailed DO-to-Pi poll health, and Loki journal panels for both hosts.
|
||||||
|
|
||||||
|
- Grafana: `http://noisebell-do:3030/` over Tailscale
|
||||||
|
- Prometheus: `http://noisebell-do:9090/` over Tailscale
|
||||||
|
- Loki: `http://noisebell-do:3100/` over Tailscale
|
||||||
|
|
||||||
|
The Pi deploy script enables persistent journald, installs `prometheus-node-exporter`, and installs `noisebell-loki-journal.service` to ship Pi journal logs to Loki on the DO host.
|
||||||
|
|
||||||
|
Prometheus is the source of truth for regular time-based data: scrape health, host CPU/memory/disk/uptime, DO-to-Pi poll counts and last results, GPIO state, Pi hardware readings, webhook counters, and retry counters. Loki/journald is reserved for sparse event logs that should be readable in chronological order: service start/stop, door state changes, cache state changes, Pi offline/online transitions, auth or rate-limit rejections, webhook retries/failures, stale events, and GPIO read errors. Routine successful polls, unchanged poll results, metrics scrapes, and badge/image/status reads are intentionally not logged at `INFO`.
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,10 @@
|
||||||
}:
|
}:
|
||||||
|
|
||||||
{
|
{
|
||||||
imports = [ (modulesPath + "/virtualisation/digital-ocean-config.nix") ];
|
imports = [
|
||||||
|
(modulesPath + "/virtualisation/digital-ocean-config.nix")
|
||||||
|
./observability.nix
|
||||||
|
];
|
||||||
|
|
||||||
system.stateVersion = "26.05";
|
system.stateVersion = "26.05";
|
||||||
|
|
||||||
|
|
|
||||||
674
hosts/noisebell-do/observability.nix
Normal file
674
hosts/noisebell-do/observability.nix
Normal file
|
|
@ -0,0 +1,674 @@
|
||||||
|
{ lib, pkgs, ... }:
|
||||||
|
|
||||||
|
let
|
||||||
|
prometheusPort = 9090;
|
||||||
|
lokiPort = 3100;
|
||||||
|
grafanaPort = 3030;
|
||||||
|
nodeExporterPort = 9100;
|
||||||
|
blackboxExporterPort = 9115;
|
||||||
|
|
||||||
|
blackboxConfig = pkgs.writeText "noisebell-blackbox.yml" ''
|
||||||
|
modules:
|
||||||
|
http_2xx:
|
||||||
|
prober: http
|
||||||
|
timeout: 5s
|
||||||
|
http:
|
||||||
|
follow_redirects: true
|
||||||
|
preferred_ip_protocol: ip4
|
||||||
|
tcp_connect:
|
||||||
|
prober: tcp
|
||||||
|
timeout: 5s
|
||||||
|
'';
|
||||||
|
|
||||||
|
prometheusDatasource = {
|
||||||
|
type = "prometheus";
|
||||||
|
uid = "prometheus";
|
||||||
|
};
|
||||||
|
|
||||||
|
lokiDatasource = {
|
||||||
|
type = "loki";
|
||||||
|
uid = "loki";
|
||||||
|
};
|
||||||
|
|
||||||
|
prometheusPanel =
|
||||||
|
{
|
||||||
|
id,
|
||||||
|
title,
|
||||||
|
type ? "timeseries",
|
||||||
|
x,
|
||||||
|
y,
|
||||||
|
w ? 12,
|
||||||
|
h ? 8,
|
||||||
|
targets,
|
||||||
|
}:
|
||||||
|
{
|
||||||
|
inherit id title type targets;
|
||||||
|
datasource = prometheusDatasource;
|
||||||
|
gridPos = {
|
||||||
|
inherit h w x y;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
lokiPanel =
|
||||||
|
{
|
||||||
|
id,
|
||||||
|
title,
|
||||||
|
type ? "logs",
|
||||||
|
x,
|
||||||
|
y,
|
||||||
|
w ? 12,
|
||||||
|
h ? 8,
|
||||||
|
targets,
|
||||||
|
}:
|
||||||
|
{
|
||||||
|
inherit id title type targets;
|
||||||
|
datasource = lokiDatasource;
|
||||||
|
gridPos = {
|
||||||
|
inherit h w x y;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
promTarget = refId: expr: legendFormat: {
|
||||||
|
inherit refId expr legendFormat;
|
||||||
|
};
|
||||||
|
|
||||||
|
lokiTarget = refId: expr: {
|
||||||
|
inherit refId expr;
|
||||||
|
};
|
||||||
|
|
||||||
|
dashboard = pkgs.writeText "noisebell-dashboard.json" (builtins.toJSON {
|
||||||
|
uid = "noisebell";
|
||||||
|
title = "Noisebell DO + Pi";
|
||||||
|
tags = [
|
||||||
|
"noisebell"
|
||||||
|
"prometheus"
|
||||||
|
"loki"
|
||||||
|
];
|
||||||
|
timezone = "browser";
|
||||||
|
schemaVersion = 39;
|
||||||
|
version = 1;
|
||||||
|
refresh = "30s";
|
||||||
|
time = {
|
||||||
|
from = "now-6h";
|
||||||
|
to = "now";
|
||||||
|
};
|
||||||
|
panels = [
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 1;
|
||||||
|
title = "Host Scrape Health (Prometheus)";
|
||||||
|
type = "stat";
|
||||||
|
x = 0;
|
||||||
|
y = 0;
|
||||||
|
w = 6;
|
||||||
|
h = 6;
|
||||||
|
targets = [ (promTarget "A" "up{job=~\"noisebell-(do|pi)-node\"}" "{{host}}") ];
|
||||||
|
})
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 2;
|
||||||
|
title = "Noisebell Service Health (Prometheus)";
|
||||||
|
type = "stat";
|
||||||
|
x = 6;
|
||||||
|
y = 0;
|
||||||
|
w = 6;
|
||||||
|
h = 6;
|
||||||
|
targets = [ (promTarget "A" "up{job=~\"noisebell-(cache|pi-app|pi-relay)\"}" "{{job}}") ];
|
||||||
|
})
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 3;
|
||||||
|
title = "Probe Health (Prometheus)";
|
||||||
|
type = "stat";
|
||||||
|
x = 12;
|
||||||
|
y = 0;
|
||||||
|
w = 6;
|
||||||
|
h = 6;
|
||||||
|
targets = [ (promTarget "A" "probe_success{job=~\"noisebell-.*-probes\"}" "{{instance}}") ];
|
||||||
|
})
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 4;
|
||||||
|
title = "Door State (Prometheus)";
|
||||||
|
type = "stat";
|
||||||
|
x = 18;
|
||||||
|
y = 0;
|
||||||
|
w = 6;
|
||||||
|
h = 6;
|
||||||
|
targets = [ (promTarget "A" "noisebell_cache_status" "{{status}}") ];
|
||||||
|
})
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 5;
|
||||||
|
title = "CPU Used % (DO + Pi)";
|
||||||
|
x = 0;
|
||||||
|
y = 6;
|
||||||
|
w = 8;
|
||||||
|
targets = [ (promTarget "A" "100 - (avg by (host) (rate(node_cpu_seconds_total{job=~\"noisebell-(do|pi)-node\",mode=\"idle\"}[5m])) * 100)" "{{host}}") ];
|
||||||
|
})
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 6;
|
||||||
|
title = "Memory Used % (DO + Pi)";
|
||||||
|
x = 8;
|
||||||
|
y = 6;
|
||||||
|
w = 8;
|
||||||
|
targets = [ (promTarget "A" "100 * (1 - (node_memory_MemAvailable_bytes{job=~\"noisebell-(do|pi)-node\"} / node_memory_MemTotal_bytes{job=~\"noisebell-(do|pi)-node\"}))" "{{host}}") ];
|
||||||
|
})
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 7;
|
||||||
|
title = "Root Disk Used % (DO + Pi)";
|
||||||
|
x = 16;
|
||||||
|
y = 6;
|
||||||
|
w = 8;
|
||||||
|
targets = [ (promTarget "A" "100 * (1 - (node_filesystem_avail_bytes{job=~\"noisebell-(do|pi)-node\",mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{job=~\"noisebell-(do|pi)-node\",mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}))" "{{host}}") ];
|
||||||
|
})
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 8;
|
||||||
|
title = "Host Uptime Hours (DO + Pi)";
|
||||||
|
type = "stat";
|
||||||
|
x = 0;
|
||||||
|
y = 14;
|
||||||
|
w = 6;
|
||||||
|
h = 6;
|
||||||
|
targets = [ (promTarget "A" "(time() - node_boot_time_seconds{job=~\"noisebell-(do|pi)-node\"}) / 3600" "{{host}}") ];
|
||||||
|
})
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 9;
|
||||||
|
title = "Observability Stack Health (Prometheus)";
|
||||||
|
type = "stat";
|
||||||
|
x = 6;
|
||||||
|
y = 14;
|
||||||
|
w = 6;
|
||||||
|
h = 6;
|
||||||
|
targets = [ (promTarget "A" "up{job=~\"observability-.*\"}" "{{service}}") ];
|
||||||
|
})
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 10;
|
||||||
|
title = "Pi Hardware (Prometheus)";
|
||||||
|
x = 12;
|
||||||
|
y = 14;
|
||||||
|
w = 6;
|
||||||
|
h = 6;
|
||||||
|
targets = [
|
||||||
|
(promTarget "A" "noisebell_pi_temperature_celsius" "temp C")
|
||||||
|
(promTarget "B" "noisebell_pi_throttled_flags" "throttled flags")
|
||||||
|
(promTarget "C" "noisebell_pi_tailscale_running" "tailscale running")
|
||||||
|
];
|
||||||
|
})
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 11;
|
||||||
|
title = "Pi Wi-Fi Signal (Prometheus)";
|
||||||
|
x = 18;
|
||||||
|
y = 14;
|
||||||
|
w = 6;
|
||||||
|
h = 6;
|
||||||
|
targets = [
|
||||||
|
(promTarget "A" "noisebell_pi_wifi_signal_dbm" "{{interface}} dBm")
|
||||||
|
(promTarget "B" "noisebell_pi_wifi_link_quality" "{{interface}} link")
|
||||||
|
];
|
||||||
|
})
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 12;
|
||||||
|
title = "Cache Poll Health (Prometheus)";
|
||||||
|
x = 0;
|
||||||
|
y = 20;
|
||||||
|
targets = [
|
||||||
|
(promTarget "A" "noisebell_cache_poll_consecutive_failures" "consecutive failures")
|
||||||
|
(promTarget "B" "rate(noisebell_cache_poll_failure_total[5m])" "failure rate")
|
||||||
|
(promTarget "C" "rate(noisebell_cache_poll_success_total[5m])" "success rate")
|
||||||
|
(promTarget "D" "noisebell_cache_poll_last_duration_seconds" "last duration")
|
||||||
|
];
|
||||||
|
})
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 13;
|
||||||
|
title = "Pi App Webhook Delivery (Prometheus)";
|
||||||
|
x = 12;
|
||||||
|
y = 20;
|
||||||
|
targets = [
|
||||||
|
(promTarget "A" "rate(noisebell_pi_notify_success_total[5m])" "success")
|
||||||
|
(promTarget "B" "rate(noisebell_pi_notify_attempt_failure_total[5m])" "attempt failures")
|
||||||
|
(promTarget "C" "rate(noisebell_pi_notify_failure_total[5m])" "final failures")
|
||||||
|
];
|
||||||
|
})
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 14;
|
||||||
|
title = "DO -> Pi Last Poll Details (Prometheus)";
|
||||||
|
type = "stat";
|
||||||
|
x = 0;
|
||||||
|
y = 28;
|
||||||
|
w = 12;
|
||||||
|
targets = [
|
||||||
|
(promTarget "A" "noisebell_cache_poll_last_result" "result {{result}}")
|
||||||
|
(promTarget "B" "noisebell_cache_poll_last_http_status" "last HTTP status")
|
||||||
|
(promTarget "C" "noisebell_cache_poll_last_duration_seconds" "last duration sec")
|
||||||
|
(promTarget "D" "time() - noisebell_cache_poll_last_attempt_timestamp_seconds" "seconds since attempt")
|
||||||
|
(promTarget "E" "time() - noisebell_cache_poll_last_success_timestamp_seconds" "seconds since success")
|
||||||
|
(promTarget "F" "time() - noisebell_cache_poll_last_failure_timestamp_seconds" "seconds since failure")
|
||||||
|
];
|
||||||
|
})
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 15;
|
||||||
|
title = "DO -> Pi Poll Failure Types (Prometheus)";
|
||||||
|
x = 12;
|
||||||
|
y = 28;
|
||||||
|
targets = [
|
||||||
|
(promTarget "A" "rate(noisebell_cache_poll_http_error_total[5m])" "http error")
|
||||||
|
(promTarget "B" "rate(noisebell_cache_poll_request_timeout_total[5m])" "timeout")
|
||||||
|
(promTarget "C" "rate(noisebell_cache_poll_request_connect_total[5m])" "connect")
|
||||||
|
(promTarget "D" "rate(noisebell_cache_poll_request_other_total[5m])" "request other")
|
||||||
|
(promTarget "E" "rate(noisebell_cache_poll_parse_failure_total[5m])" "parse")
|
||||||
|
];
|
||||||
|
})
|
||||||
|
(prometheusPanel {
|
||||||
|
id = 16;
|
||||||
|
title = "Relay Delivery (Prometheus)";
|
||||||
|
x = 0;
|
||||||
|
y = 36;
|
||||||
|
targets = [
|
||||||
|
(promTarget "A" "rate(noisebell_relay_forwarded_total[5m])" "forwarded")
|
||||||
|
(promTarget "B" "rate(noisebell_relay_attempt_failure_total[5m])" "attempt failures")
|
||||||
|
(promTarget "C" "rate(noisebell_relay_failed_total[5m])" "final failures")
|
||||||
|
(promTarget "D" "noisebell_relay_last_duration_seconds" "last duration")
|
||||||
|
];
|
||||||
|
})
|
||||||
|
(lokiPanel {
|
||||||
|
id = 17;
|
||||||
|
title = "Journal Log Rate (Loki, DO + Pi)";
|
||||||
|
type = "timeseries";
|
||||||
|
x = 12;
|
||||||
|
y = 36;
|
||||||
|
targets = [
|
||||||
|
{
|
||||||
|
refId = "A";
|
||||||
|
expr = "sum by (host) (rate({job=\"journal\"}[5m]))";
|
||||||
|
legendFormat = "{{host}}";
|
||||||
|
}
|
||||||
|
];
|
||||||
|
})
|
||||||
|
(lokiPanel {
|
||||||
|
id = 18;
|
||||||
|
title = "DO Journal Logs (Loki)";
|
||||||
|
x = 0;
|
||||||
|
y = 44;
|
||||||
|
targets = [ (lokiTarget "A" "{job=\"journal\", host=\"noisebell-do\"}") ];
|
||||||
|
})
|
||||||
|
(lokiPanel {
|
||||||
|
id = 19;
|
||||||
|
title = "Pi Journal Logs (Loki)";
|
||||||
|
x = 12;
|
||||||
|
y = 44;
|
||||||
|
targets = [ (lokiTarget "A" "{job=\"journal\", host=\"noisebell-pi\"}") ];
|
||||||
|
})
|
||||||
|
];
|
||||||
|
});
|
||||||
|
|
||||||
|
dashboardDir = pkgs.runCommand "noisebell-grafana-dashboards" { } ''
|
||||||
|
mkdir -p "$out"
|
||||||
|
cp ${dashboard} "$out/noisebell.json"
|
||||||
|
'';
|
||||||
|
|
||||||
|
blackboxRelabels = [
|
||||||
|
{
|
||||||
|
source_labels = [ "__address__" ];
|
||||||
|
target_label = "__param_target";
|
||||||
|
}
|
||||||
|
{
|
||||||
|
source_labels = [ "__param_target" ];
|
||||||
|
target_label = "instance";
|
||||||
|
}
|
||||||
|
{
|
||||||
|
target_label = "__address__";
|
||||||
|
replacement = "127.0.0.1:${toString blackboxExporterPort}";
|
||||||
|
}
|
||||||
|
];
|
||||||
|
in
|
||||||
|
{
|
||||||
|
services.prometheus = {
|
||||||
|
enable = true;
|
||||||
|
listenAddress = "0.0.0.0";
|
||||||
|
port = prometheusPort;
|
||||||
|
retentionTime = "7d";
|
||||||
|
globalConfig = {
|
||||||
|
scrape_interval = "15s";
|
||||||
|
evaluation_interval = "15s";
|
||||||
|
};
|
||||||
|
exporters = {
|
||||||
|
node = {
|
||||||
|
enable = true;
|
||||||
|
port = nodeExporterPort;
|
||||||
|
enabledCollectors = [ "systemd" ];
|
||||||
|
};
|
||||||
|
blackbox = {
|
||||||
|
enable = true;
|
||||||
|
port = blackboxExporterPort;
|
||||||
|
configFile = blackboxConfig;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
scrapeConfigs = [
|
||||||
|
{
|
||||||
|
job_name = "noisebell-do-node";
|
||||||
|
static_configs = [
|
||||||
|
{
|
||||||
|
targets = [ "127.0.0.1:${toString nodeExporterPort}" ];
|
||||||
|
labels.host = "noisebell-do";
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "noisebell-pi-node";
|
||||||
|
static_configs = [
|
||||||
|
{
|
||||||
|
targets = [ "noisebell-pi:${toString nodeExporterPort}" ];
|
||||||
|
labels.host = "noisebell-pi";
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "noisebell-cache";
|
||||||
|
metrics_path = "/metrics";
|
||||||
|
static_configs = [
|
||||||
|
{
|
||||||
|
targets = [ "127.0.0.1:3000" ];
|
||||||
|
labels.host = "noisebell-do";
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "noisebell-pi-app";
|
||||||
|
metrics_path = "/metrics";
|
||||||
|
static_configs = [
|
||||||
|
{
|
||||||
|
targets = [ "noisebell-pi:80" ];
|
||||||
|
labels.host = "noisebell-pi";
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "noisebell-pi-relay";
|
||||||
|
metrics_path = "/metrics";
|
||||||
|
static_configs = [
|
||||||
|
{
|
||||||
|
targets = [ "noisebell-pi:8090" ];
|
||||||
|
labels.host = "noisebell-pi";
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "observability-prometheus";
|
||||||
|
static_configs = [
|
||||||
|
{
|
||||||
|
targets = [ "127.0.0.1:${toString prometheusPort}" ];
|
||||||
|
labels = {
|
||||||
|
host = "noisebell-do";
|
||||||
|
service = "prometheus";
|
||||||
|
};
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "observability-loki";
|
||||||
|
static_configs = [
|
||||||
|
{
|
||||||
|
targets = [ "127.0.0.1:${toString lokiPort}" ];
|
||||||
|
labels = {
|
||||||
|
host = "noisebell-do";
|
||||||
|
service = "loki";
|
||||||
|
};
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "observability-grafana";
|
||||||
|
static_configs = [
|
||||||
|
{
|
||||||
|
targets = [ "127.0.0.1:${toString grafanaPort}" ];
|
||||||
|
labels = {
|
||||||
|
host = "noisebell-do";
|
||||||
|
service = "grafana";
|
||||||
|
};
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "observability-alloy";
|
||||||
|
static_configs = [
|
||||||
|
{
|
||||||
|
targets = [ "127.0.0.1:12345" ];
|
||||||
|
labels = {
|
||||||
|
host = "noisebell-do";
|
||||||
|
service = "alloy";
|
||||||
|
};
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "observability-blackbox";
|
||||||
|
static_configs = [
|
||||||
|
{
|
||||||
|
targets = [ "127.0.0.1:${toString blackboxExporterPort}" ];
|
||||||
|
labels = {
|
||||||
|
host = "noisebell-do";
|
||||||
|
service = "blackbox";
|
||||||
|
};
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "noisebell-http-probes";
|
||||||
|
metrics_path = "/probe";
|
||||||
|
params.module = [ "http_2xx" ];
|
||||||
|
static_configs = [
|
||||||
|
{
|
||||||
|
targets = [
|
||||||
|
"http://noisebell-pi/metrics"
|
||||||
|
"http://noisebell-pi:8090/health"
|
||||||
|
"https://noisebell.extremist.software/status"
|
||||||
|
];
|
||||||
|
}
|
||||||
|
];
|
||||||
|
relabel_configs = blackboxRelabels;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "noisebell-tcp-probes";
|
||||||
|
metrics_path = "/probe";
|
||||||
|
params.module = [ "tcp_connect" ];
|
||||||
|
static_configs = [
|
||||||
|
{
|
||||||
|
targets = [
|
||||||
|
"noisebell-pi:22"
|
||||||
|
"noisebell-pi:80"
|
||||||
|
"noisebell-pi:8090"
|
||||||
|
];
|
||||||
|
}
|
||||||
|
];
|
||||||
|
relabel_configs = blackboxRelabels;
|
||||||
|
}
|
||||||
|
];
|
||||||
|
rules = [
|
||||||
|
''
|
||||||
|
groups:
|
||||||
|
- name: noisebell
|
||||||
|
rules:
|
||||||
|
- alert: NoisebellPiAppDown
|
||||||
|
expr: up{job="noisebell-pi-app"} == 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: page
|
||||||
|
annotations:
|
||||||
|
summary: Noisebell Pi app metrics are down
|
||||||
|
- alert: NoisebellPiNodeExporterDown
|
||||||
|
expr: up{job="noisebell-pi-node"} == 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: page
|
||||||
|
annotations:
|
||||||
|
summary: Noisebell Pi node exporter is down
|
||||||
|
- alert: NoisebellProbeFailed
|
||||||
|
expr: probe_success{job=~"noisebell-.*-probes"} == 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: page
|
||||||
|
annotations:
|
||||||
|
summary: Noisebell probe failed for {{ $labels.instance }}
|
||||||
|
- alert: NoisebellCachePollFailures
|
||||||
|
expr: noisebell_cache_poll_consecutive_failures >= 3
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: page
|
||||||
|
annotations:
|
||||||
|
summary: Noisebell cache cannot poll the Pi
|
||||||
|
- alert: NoisebellPiRecentlyRebooted
|
||||||
|
expr: noisebell_pi_uptime_seconds < 300
|
||||||
|
for: 30s
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
summary: Noisebell Pi rebooted recently
|
||||||
|
- alert: NoisebellPiThrottled
|
||||||
|
expr: noisebell_pi_throttled_flags > 0
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Noisebell Pi reports throttling flags
|
||||||
|
''
|
||||||
|
];
|
||||||
|
};
|
||||||
|
|
||||||
|
services.loki = {
|
||||||
|
enable = true;
|
||||||
|
configuration = {
|
||||||
|
auth_enabled = false;
|
||||||
|
server = {
|
||||||
|
http_listen_address = "0.0.0.0";
|
||||||
|
http_listen_port = lokiPort;
|
||||||
|
grpc_listen_address = "127.0.0.1";
|
||||||
|
grpc_listen_port = 9096;
|
||||||
|
};
|
||||||
|
common = {
|
||||||
|
path_prefix = "/var/lib/loki";
|
||||||
|
replication_factor = 1;
|
||||||
|
ring.kvstore.store = "inmemory";
|
||||||
|
storage.filesystem = {
|
||||||
|
chunks_directory = "/var/lib/loki/chunks";
|
||||||
|
rules_directory = "/var/lib/loki/rules";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
schema_config.configs = [
|
||||||
|
{
|
||||||
|
from = "2024-01-01";
|
||||||
|
store = "tsdb";
|
||||||
|
object_store = "filesystem";
|
||||||
|
schema = "v13";
|
||||||
|
index = {
|
||||||
|
prefix = "index_";
|
||||||
|
period = "24h";
|
||||||
|
};
|
||||||
|
}
|
||||||
|
];
|
||||||
|
limits_config = {
|
||||||
|
retention_period = "168h";
|
||||||
|
reject_old_samples = true;
|
||||||
|
reject_old_samples_max_age = "168h";
|
||||||
|
};
|
||||||
|
compactor = {
|
||||||
|
working_directory = "/var/lib/loki/compactor";
|
||||||
|
compaction_interval = "10m";
|
||||||
|
retention_enabled = true;
|
||||||
|
retention_delete_delay = "2h";
|
||||||
|
retention_delete_worker_count = 1;
|
||||||
|
delete_request_store = "filesystem";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
services.alloy = {
|
||||||
|
enable = true;
|
||||||
|
extraFlags = [ "--server.http.listen-addr=127.0.0.1:12345" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
environment.etc."alloy/config.alloy".text = ''
|
||||||
|
loki.write "local" {
|
||||||
|
endpoint {
|
||||||
|
url = "http://127.0.0.1:${toString lokiPort}/loki/api/v1/push"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
loki.source.journal "system" {
|
||||||
|
max_age = "12h"
|
||||||
|
labels = {
|
||||||
|
job = "journal",
|
||||||
|
host = "noisebell-do",
|
||||||
|
}
|
||||||
|
forward_to = [loki.write.local.receiver]
|
||||||
|
}
|
||||||
|
'';
|
||||||
|
|
||||||
|
systemd.services.alloy = {
|
||||||
|
after = [ "loki.service" ];
|
||||||
|
wants = [ "loki.service" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
services.grafana = {
|
||||||
|
enable = true;
|
||||||
|
settings = {
|
||||||
|
server = {
|
||||||
|
http_addr = "0.0.0.0";
|
||||||
|
http_port = grafanaPort;
|
||||||
|
domain = "noisebell-do";
|
||||||
|
root_url = "http://noisebell-do:${toString grafanaPort}/";
|
||||||
|
};
|
||||||
|
analytics.reporting_enabled = false;
|
||||||
|
metrics.enabled = true;
|
||||||
|
security = {
|
||||||
|
secret_key = "$__file{/var/lib/grafana/secret_key}";
|
||||||
|
disable_initial_admin_creation = true;
|
||||||
|
};
|
||||||
|
auth.disable_login_form = true;
|
||||||
|
users.allow_sign_up = false;
|
||||||
|
"auth.anonymous" = {
|
||||||
|
enabled = true;
|
||||||
|
org_role = "Viewer";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
provision = {
|
||||||
|
enable = true;
|
||||||
|
datasources.settings = {
|
||||||
|
apiVersion = 1;
|
||||||
|
prune = true;
|
||||||
|
datasources = [
|
||||||
|
{
|
||||||
|
name = "Prometheus";
|
||||||
|
uid = "prometheus";
|
||||||
|
type = "prometheus";
|
||||||
|
access = "proxy";
|
||||||
|
url = "http://127.0.0.1:${toString prometheusPort}";
|
||||||
|
isDefault = true;
|
||||||
|
editable = false;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
name = "Loki";
|
||||||
|
uid = "loki";
|
||||||
|
type = "loki";
|
||||||
|
access = "proxy";
|
||||||
|
url = "http://127.0.0.1:${toString lokiPort}";
|
||||||
|
editable = false;
|
||||||
|
}
|
||||||
|
];
|
||||||
|
};
|
||||||
|
dashboards.settings = {
|
||||||
|
apiVersion = 1;
|
||||||
|
providers = [
|
||||||
|
{
|
||||||
|
name = "Noisebell";
|
||||||
|
type = "file";
|
||||||
|
allowUiUpdates = false;
|
||||||
|
options.path = dashboardDir;
|
||||||
|
}
|
||||||
|
];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
systemd.services.grafana.preStart = lib.mkBefore ''
|
||||||
|
if [ ! -s /var/lib/grafana/secret_key ]; then
|
||||||
|
umask 077
|
||||||
|
${pkgs.coreutils}/bin/head -c 64 /dev/urandom | ${pkgs.coreutils}/bin/base64 --wrap=0 > /var/lib/grafana/secret_key
|
||||||
|
fi
|
||||||
|
'';
|
||||||
|
}
|
||||||
20
pi/README.md
20
pi/README.md
|
|
@ -124,8 +124,11 @@ That script:
|
||||||
6. writes `/etc/noisebell/noisebell.env`
|
6. writes `/etc/noisebell/noisebell.env`
|
||||||
7. writes `/etc/noisebell/noisebell-relay.env`
|
7. writes `/etc/noisebell/noisebell-relay.env`
|
||||||
8. installs `noisebell.service` and `noisebell-relay.service`
|
8. installs `noisebell.service` and `noisebell-relay.service`
|
||||||
9. enables and starts both services
|
9. enables persistent journald with a 30 day retention target
|
||||||
10. runs `tailscale up` with the decrypted auth key
|
10. installs and enables `prometheus-node-exporter`
|
||||||
|
11. installs `noisebell-loki-journal.service` to ship Pi logs to Loki on `noisebell-do`
|
||||||
|
12. enables and starts the Noisebell services
|
||||||
|
13. runs `tailscale up` with the decrypted auth key
|
||||||
|
|
||||||
## Files written on the Pi
|
## Files written on the Pi
|
||||||
|
|
||||||
|
|
@ -143,6 +146,9 @@ The deploy script creates:
|
||||||
- `/etc/noisebell/noisebell-relay.env`
|
- `/etc/noisebell/noisebell-relay.env`
|
||||||
- `/etc/systemd/system/noisebell.service`
|
- `/etc/systemd/system/noisebell.service`
|
||||||
- `/etc/systemd/system/noisebell-relay.service`
|
- `/etc/systemd/system/noisebell-relay.service`
|
||||||
|
- `/etc/systemd/system/noisebell-loki-journal.service`
|
||||||
|
- `/usr/local/bin/noisebell-loki-journal`
|
||||||
|
- `/etc/systemd/journald.conf.d/noisebell-persistent.conf`
|
||||||
|
|
||||||
All secret files are root-only.
|
All secret files are root-only.
|
||||||
|
|
||||||
|
|
@ -275,10 +281,18 @@ Important: Home Assistant webhook IDs are exact. If the automation shows a leadi
|
||||||
|
|
||||||
## API
|
## API
|
||||||
|
|
||||||
All endpoints require `Authorization: Bearer <token>`.
|
`GET /` requires `Authorization: Bearer <token>`.
|
||||||
|
|
||||||
**`GET /`**
|
**`GET /`**
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{"status": "open", "timestamp": 1710000000}
|
{"status": "open", "timestamp": 1710000000}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**`GET /metrics`**
|
||||||
|
|
||||||
|
Prometheus metrics for local door state, raw GPIO level, debounced state-change counters, webhook delivery counters, last webhook result/status/duration, boot identity, uptime, temperature, throttling flags, Wi-Fi signal, and Tailscale state. This endpoint is unauthenticated and intended for Tailscale-only scraping by the DO Prometheus.
|
||||||
|
|
||||||
|
`noisebell-relay` also exposes unauthenticated Prometheus metrics at `GET /metrics` on port `8090`, including inbound webhook count, Home Assistant forwarding counters, and last forward result/status/duration.
|
||||||
|
|
||||||
|
Routine sampled values belong in Prometheus, not logs: GPIO level, Wi-Fi signal, temperature, uptime, Tailscale state, scrape health, and webhook counters are graphed from `/metrics`. Journald/Loki logs are intended to stay event-oriented: startup/shutdown, initial state sync, debounced door state changes, successful state deliveries, delivery retries/failures, unauthorized requests, relay forwards, and GPIO read error/recovery events.
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,14 @@
|
||||||
|
use std::sync::atomic::{AtomicU64, Ordering};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use axum::extract::State;
|
use axum::extract::State;
|
||||||
use axum::http::{HeaderMap, StatusCode};
|
use axum::http::{header, HeaderMap, StatusCode};
|
||||||
|
use axum::response::{IntoResponse, Response};
|
||||||
use axum::routing::{get, post};
|
use axum::routing::{get, post};
|
||||||
use axum::{Json, Router};
|
use axum::{Json, Router};
|
||||||
use noisebell_common::{validate_bearer, WebhookPayload};
|
use noisebell_common::{validate_bearer, WebhookPayload, PROMETHEUS_CONTENT_TYPE};
|
||||||
use tracing::{error, info, warn};
|
use tracing::{error, info, warn};
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
|
|
@ -17,6 +19,109 @@ struct AppState {
|
||||||
target_secret: Option<String>,
|
target_secret: Option<String>,
|
||||||
retry_attempts: u32,
|
retry_attempts: u32,
|
||||||
retry_base_delay_secs: u64,
|
retry_base_delay_secs: u64,
|
||||||
|
metrics: Arc<RelayMetrics>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
struct RelayMetrics {
|
||||||
|
process_start_time: u64,
|
||||||
|
received_total: AtomicU64,
|
||||||
|
forwarded_total: AtomicU64,
|
||||||
|
attempt_failure_total: AtomicU64,
|
||||||
|
failed_total: AtomicU64,
|
||||||
|
last_attempt_timestamp: AtomicU64,
|
||||||
|
last_success_timestamp: AtomicU64,
|
||||||
|
last_failure_timestamp: AtomicU64,
|
||||||
|
last_duration_millis: AtomicU64,
|
||||||
|
last_http_status: AtomicU64,
|
||||||
|
last_result: AtomicU64,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||||
|
enum RelayResultKind {
|
||||||
|
Never = 0,
|
||||||
|
Success = 1,
|
||||||
|
HttpError = 2,
|
||||||
|
RequestError = 3,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RelayResultKind {
|
||||||
|
const ALL: [Self; 4] = [Self::Never, Self::Success, Self::HttpError, Self::RequestError];
|
||||||
|
|
||||||
|
const fn as_str(self) -> &'static str {
|
||||||
|
match self {
|
||||||
|
Self::Never => "never",
|
||||||
|
Self::Success => "success",
|
||||||
|
Self::HttpError => "http_error",
|
||||||
|
Self::RequestError => "request_error",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const fn from_code(code: u64) -> Self {
|
||||||
|
match code {
|
||||||
|
1 => Self::Success,
|
||||||
|
2 => Self::HttpError,
|
||||||
|
3 => Self::RequestError,
|
||||||
|
_ => Self::Never,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RelayMetrics {
|
||||||
|
fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
process_start_time: unix_timestamp(),
|
||||||
|
received_total: AtomicU64::new(0),
|
||||||
|
forwarded_total: AtomicU64::new(0),
|
||||||
|
attempt_failure_total: AtomicU64::new(0),
|
||||||
|
failed_total: AtomicU64::new(0),
|
||||||
|
last_attempt_timestamp: AtomicU64::new(0),
|
||||||
|
last_success_timestamp: AtomicU64::new(0),
|
||||||
|
last_failure_timestamp: AtomicU64::new(0),
|
||||||
|
last_duration_millis: AtomicU64::new(0),
|
||||||
|
last_http_status: AtomicU64::new(0),
|
||||||
|
last_result: AtomicU64::new(RelayResultKind::Never as u64),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn record_attempt(&self, timestamp: u64) {
|
||||||
|
self.last_attempt_timestamp.store(timestamp, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn record_success(&self, timestamp: u64, duration_millis: u64, status: u16) {
|
||||||
|
self.forwarded_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
self.last_success_timestamp.store(timestamp, Ordering::Relaxed);
|
||||||
|
self.last_duration_millis.store(duration_millis, Ordering::Relaxed);
|
||||||
|
self.last_http_status.store(u64::from(status), Ordering::Relaxed);
|
||||||
|
self.last_result.store(RelayResultKind::Success as u64, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn record_failure(
|
||||||
|
&self,
|
||||||
|
kind: RelayResultKind,
|
||||||
|
timestamp: u64,
|
||||||
|
duration_millis: u64,
|
||||||
|
status: Option<u16>,
|
||||||
|
final_failure: bool,
|
||||||
|
) {
|
||||||
|
self.attempt_failure_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
if final_failure {
|
||||||
|
self.failed_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
self.last_failure_timestamp.store(timestamp, Ordering::Relaxed);
|
||||||
|
self.last_duration_millis.store(duration_millis, Ordering::Relaxed);
|
||||||
|
self.last_http_status.store(status.map(u64::from).unwrap_or(0), Ordering::Relaxed);
|
||||||
|
self.last_result.store(kind as u64, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn unix_timestamp() -> u64 {
|
||||||
|
std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).unwrap().as_secs()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn duration_millis(started_at: Instant) -> u64 {
|
||||||
|
let millis = started_at.elapsed().as_millis();
|
||||||
|
millis.try_into().unwrap_or(u64::MAX)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn post_webhook(
|
async fn post_webhook(
|
||||||
|
|
@ -25,12 +130,20 @@ async fn post_webhook(
|
||||||
Json(payload): Json<WebhookPayload>,
|
Json(payload): Json<WebhookPayload>,
|
||||||
) -> StatusCode {
|
) -> StatusCode {
|
||||||
if !validate_bearer(&headers, &state.inbound_api_key) {
|
if !validate_bearer(&headers, &state.inbound_api_key) {
|
||||||
|
warn!(
|
||||||
|
status = %payload.status,
|
||||||
|
timestamp = payload.timestamp,
|
||||||
|
"unauthorized relay webhook rejected"
|
||||||
|
);
|
||||||
return StatusCode::UNAUTHORIZED;
|
return StatusCode::UNAUTHORIZED;
|
||||||
}
|
}
|
||||||
|
state.metrics.received_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
|
||||||
info!(status = %payload.status, timestamp = payload.timestamp, "relay received webhook");
|
info!(status = %payload.status, timestamp = payload.timestamp, "relay received webhook");
|
||||||
|
|
||||||
for attempt in 0..=state.retry_attempts {
|
for attempt in 0..=state.retry_attempts {
|
||||||
|
let forward_started_at = Instant::now();
|
||||||
|
state.metrics.record_attempt(unix_timestamp());
|
||||||
let mut req = state.client.post(&state.target_url).json(&payload);
|
let mut req = state.client.post(&state.target_url).json(&payload);
|
||||||
if let Some(secret) = &state.target_secret {
|
if let Some(secret) = &state.target_secret {
|
||||||
req = req.bearer_auth(secret);
|
req = req.bearer_auth(secret);
|
||||||
|
|
@ -38,7 +151,17 @@ async fn post_webhook(
|
||||||
|
|
||||||
match req.send().await {
|
match req.send().await {
|
||||||
Ok(resp) if resp.status().is_success() => {
|
Ok(resp) if resp.status().is_success() => {
|
||||||
info!(status = %payload.status, "relay forwarded webhook");
|
let duration_ms = duration_millis(forward_started_at);
|
||||||
|
let http_status = resp.status().as_u16();
|
||||||
|
state.metrics.record_success(unix_timestamp(), duration_ms, http_status);
|
||||||
|
info!(
|
||||||
|
status = %payload.status,
|
||||||
|
timestamp = payload.timestamp,
|
||||||
|
http_status,
|
||||||
|
duration_ms,
|
||||||
|
attempts = attempt + 1,
|
||||||
|
"relay forwarded webhook"
|
||||||
|
);
|
||||||
return StatusCode::OK;
|
return StatusCode::OK;
|
||||||
}
|
}
|
||||||
result => {
|
result => {
|
||||||
|
|
@ -46,13 +169,47 @@ async fn post_webhook(
|
||||||
Ok(resp) => format!("HTTP {}", resp.status()),
|
Ok(resp) => format!("HTTP {}", resp.status()),
|
||||||
Err(err) => err.to_string(),
|
Err(err) => err.to_string(),
|
||||||
};
|
};
|
||||||
|
let http_status = result.as_ref().ok().map(|resp| resp.status().as_u16());
|
||||||
|
let kind = if http_status.is_some() {
|
||||||
|
RelayResultKind::HttpError
|
||||||
|
} else {
|
||||||
|
RelayResultKind::RequestError
|
||||||
|
};
|
||||||
|
let duration_ms = duration_millis(forward_started_at);
|
||||||
|
state.metrics.record_failure(
|
||||||
|
kind,
|
||||||
|
unix_timestamp(),
|
||||||
|
duration_ms,
|
||||||
|
http_status,
|
||||||
|
attempt == state.retry_attempts,
|
||||||
|
);
|
||||||
if attempt == state.retry_attempts {
|
if attempt == state.retry_attempts {
|
||||||
error!(error = %err_msg, "relay failed to forward webhook after {} attempts", state.retry_attempts + 1);
|
error!(
|
||||||
|
status = %payload.status,
|
||||||
|
timestamp = payload.timestamp,
|
||||||
|
error = %err_msg,
|
||||||
|
kind = kind.as_str(),
|
||||||
|
http_status = http_status.unwrap_or(0),
|
||||||
|
duration_ms,
|
||||||
|
attempts = state.retry_attempts + 1,
|
||||||
|
"relay failed to forward webhook after retries"
|
||||||
|
);
|
||||||
return StatusCode::BAD_GATEWAY;
|
return StatusCode::BAD_GATEWAY;
|
||||||
}
|
}
|
||||||
|
|
||||||
let delay = Duration::from_secs(state.retry_base_delay_secs * 2u64.pow(attempt));
|
let delay = Duration::from_secs(state.retry_base_delay_secs * 2u64.pow(attempt));
|
||||||
warn!(error = %err_msg, attempt = attempt + 1, "relay forward failed, retrying in {:?}", delay);
|
warn!(
|
||||||
|
status = %payload.status,
|
||||||
|
timestamp = payload.timestamp,
|
||||||
|
error = %err_msg,
|
||||||
|
kind = kind.as_str(),
|
||||||
|
http_status = http_status.unwrap_or(0),
|
||||||
|
duration_ms,
|
||||||
|
attempt = attempt + 1,
|
||||||
|
total_attempts = state.retry_attempts + 1,
|
||||||
|
delay_seconds = delay.as_secs(),
|
||||||
|
"relay forward failed, retrying"
|
||||||
|
);
|
||||||
tokio::time::sleep(delay).await;
|
tokio::time::sleep(delay).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -65,6 +222,84 @@ async fn health() -> StatusCode {
|
||||||
StatusCode::OK
|
StatusCode::OK
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn get_metrics(State(state): State<Arc<AppState>>) -> Response {
|
||||||
|
let mut body = String::new();
|
||||||
|
body.push_str("# HELP noisebell_relay_process_start_time_seconds Unix timestamp when the relay service started.\n");
|
||||||
|
body.push_str("# TYPE noisebell_relay_process_start_time_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_relay_process_start_time_seconds {}\n",
|
||||||
|
state.metrics.process_start_time
|
||||||
|
));
|
||||||
|
body.push_str(
|
||||||
|
"# HELP noisebell_relay_received_total Authenticated inbound webhooks received.\n",
|
||||||
|
);
|
||||||
|
body.push_str("# TYPE noisebell_relay_received_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_relay_received_total {}\n",
|
||||||
|
state.metrics.received_total.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_relay_forwarded_total Webhooks forwarded to Home Assistant successfully.\n");
|
||||||
|
body.push_str("# TYPE noisebell_relay_forwarded_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_relay_forwarded_total {}\n",
|
||||||
|
state.metrics.forwarded_total.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_relay_attempt_failure_total Failed forward attempts before retry or final failure.\n");
|
||||||
|
body.push_str("# TYPE noisebell_relay_attempt_failure_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_relay_attempt_failure_total {}\n",
|
||||||
|
state.metrics.attempt_failure_total.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_relay_failed_total Webhooks that failed after all retries.\n");
|
||||||
|
body.push_str("# TYPE noisebell_relay_failed_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_relay_failed_total {}\n",
|
||||||
|
state.metrics.failed_total.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_relay_last_attempt_timestamp_seconds Unix timestamp of the last Home Assistant forward attempt.\n");
|
||||||
|
body.push_str("# TYPE noisebell_relay_last_attempt_timestamp_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_relay_last_attempt_timestamp_seconds {}\n",
|
||||||
|
state.metrics.last_attempt_timestamp.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_relay_last_success_timestamp_seconds Unix timestamp of the last successful Home Assistant forward.\n");
|
||||||
|
body.push_str("# TYPE noisebell_relay_last_success_timestamp_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_relay_last_success_timestamp_seconds {}\n",
|
||||||
|
state.metrics.last_success_timestamp.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_relay_last_failure_timestamp_seconds Unix timestamp of the last failed Home Assistant forward attempt.\n");
|
||||||
|
body.push_str("# TYPE noisebell_relay_last_failure_timestamp_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_relay_last_failure_timestamp_seconds {}\n",
|
||||||
|
state.metrics.last_failure_timestamp.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_relay_last_duration_seconds Duration of the most recent Home Assistant forward attempt.\n");
|
||||||
|
body.push_str("# TYPE noisebell_relay_last_duration_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_relay_last_duration_seconds {}\n",
|
||||||
|
state.metrics.last_duration_millis.load(Ordering::Relaxed) as f64 / 1000.0
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_relay_last_http_status HTTP status from the most recent Home Assistant forward attempt, or 0 when no HTTP response was received.\n");
|
||||||
|
body.push_str("# TYPE noisebell_relay_last_http_status gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_relay_last_http_status {}\n",
|
||||||
|
state.metrics.last_http_status.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
let last_result = RelayResultKind::from_code(state.metrics.last_result.load(Ordering::Relaxed));
|
||||||
|
body.push_str("# HELP noisebell_relay_last_result Last Home Assistant forward result as one-hot labels.\n");
|
||||||
|
body.push_str("# TYPE noisebell_relay_last_result gauge\n");
|
||||||
|
for result in RelayResultKind::ALL {
|
||||||
|
let value = u8::from(result == last_result);
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_relay_last_result{{result=\"{}\"}} {value}\n",
|
||||||
|
result.as_str()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
([(header::CONTENT_TYPE, PROMETHEUS_CONTENT_TYPE)], body).into_response()
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> Result<()> {
|
async fn main() -> Result<()> {
|
||||||
tracing_subscriber::fmt()
|
tracing_subscriber::fmt()
|
||||||
|
|
@ -122,10 +357,12 @@ async fn main() -> Result<()> {
|
||||||
target_secret,
|
target_secret,
|
||||||
retry_attempts,
|
retry_attempts,
|
||||||
retry_base_delay_secs,
|
retry_base_delay_secs,
|
||||||
|
metrics: Arc::new(RelayMetrics::new()),
|
||||||
});
|
});
|
||||||
|
|
||||||
let app = Router::new()
|
let app = Router::new()
|
||||||
.route("/health", get(health))
|
.route("/health", get(health))
|
||||||
|
.route("/metrics", get(get_metrics))
|
||||||
.route("/webhook", post(post_webhook))
|
.route("/webhook", post(post_webhook))
|
||||||
.with_state(state);
|
.with_state(state);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,15 +1,18 @@
|
||||||
use std::sync::atomic::{AtomicU64, AtomicU8, Ordering};
|
use std::sync::atomic::{AtomicU64, AtomicU8, Ordering};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
|
||||||
|
use std::{fs, process::Command};
|
||||||
|
|
||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use axum::extract::State;
|
use axum::extract::State;
|
||||||
use axum::http::{HeaderMap, StatusCode};
|
use axum::http::{header, HeaderMap, StatusCode};
|
||||||
|
use axum::response::{IntoResponse, Response};
|
||||||
use axum::routing::get;
|
use axum::routing::get;
|
||||||
use axum::{Json, Router};
|
use axum::{Json, Router};
|
||||||
use gpiod::{Bias, Chip, Options};
|
use gpiod::{Bias, Chip, Options};
|
||||||
use noisebell_common::{
|
use noisebell_common::{
|
||||||
validate_bearer, DoorStatus, PiStatusResponse, SignalLevel, WebhookPayload,
|
prometheus_escape_label_value, validate_bearer, DoorStatus, PiStatusResponse, SignalLevel,
|
||||||
|
WebhookPayload, PROMETHEUS_CONTENT_TYPE,
|
||||||
};
|
};
|
||||||
use tracing::{error, info, warn};
|
use tracing::{error, info, warn};
|
||||||
|
|
||||||
|
|
@ -44,10 +47,126 @@ impl LocalDoorState {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
enum StateEventKind {
|
||||||
|
Startup,
|
||||||
|
StateChange,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl StateEventKind {
|
||||||
|
const fn as_str(self) -> &'static str {
|
||||||
|
match self {
|
||||||
|
Self::Startup => "startup",
|
||||||
|
Self::StateChange => "state_change",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct AppState {
|
struct AppState {
|
||||||
door_state: AtomicU8,
|
door_state: AtomicU8,
|
||||||
last_changed: AtomicU64,
|
last_changed: AtomicU64,
|
||||||
inbound_api_key: String,
|
inbound_api_key: String,
|
||||||
|
metrics: AppMetrics,
|
||||||
|
}
|
||||||
|
|
||||||
|
struct AppMetrics {
|
||||||
|
process_start_time: u64,
|
||||||
|
notify_success_total: AtomicU64,
|
||||||
|
notify_attempt_failure_total: AtomicU64,
|
||||||
|
notify_failure_total: AtomicU64,
|
||||||
|
notify_last_attempt_timestamp: AtomicU64,
|
||||||
|
notify_last_success_timestamp: AtomicU64,
|
||||||
|
notify_last_failure_timestamp: AtomicU64,
|
||||||
|
notify_last_duration_millis: AtomicU64,
|
||||||
|
notify_last_http_status: AtomicU64,
|
||||||
|
notify_last_result: AtomicU64,
|
||||||
|
state_change_open_total: AtomicU64,
|
||||||
|
state_change_closed_total: AtomicU64,
|
||||||
|
gpio_last_read_timestamp: AtomicU64,
|
||||||
|
gpio_raw_level: AtomicU8,
|
||||||
|
gpio_read_error_total: AtomicU64,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||||
|
enum NotifyResultKind {
|
||||||
|
Never = 0,
|
||||||
|
Success = 1,
|
||||||
|
HttpError = 2,
|
||||||
|
RequestError = 3,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl NotifyResultKind {
|
||||||
|
const ALL: [Self; 4] = [Self::Never, Self::Success, Self::HttpError, Self::RequestError];
|
||||||
|
|
||||||
|
const fn as_str(self) -> &'static str {
|
||||||
|
match self {
|
||||||
|
Self::Never => "never",
|
||||||
|
Self::Success => "success",
|
||||||
|
Self::HttpError => "http_error",
|
||||||
|
Self::RequestError => "request_error",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const fn from_code(code: u64) -> Self {
|
||||||
|
match code {
|
||||||
|
1 => Self::Success,
|
||||||
|
2 => Self::HttpError,
|
||||||
|
3 => Self::RequestError,
|
||||||
|
_ => Self::Never,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AppMetrics {
|
||||||
|
fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
process_start_time: unix_timestamp(),
|
||||||
|
notify_success_total: AtomicU64::new(0),
|
||||||
|
notify_attempt_failure_total: AtomicU64::new(0),
|
||||||
|
notify_failure_total: AtomicU64::new(0),
|
||||||
|
notify_last_attempt_timestamp: AtomicU64::new(0),
|
||||||
|
notify_last_success_timestamp: AtomicU64::new(0),
|
||||||
|
notify_last_failure_timestamp: AtomicU64::new(0),
|
||||||
|
notify_last_duration_millis: AtomicU64::new(0),
|
||||||
|
notify_last_http_status: AtomicU64::new(0),
|
||||||
|
notify_last_result: AtomicU64::new(NotifyResultKind::Never as u64),
|
||||||
|
state_change_open_total: AtomicU64::new(0),
|
||||||
|
state_change_closed_total: AtomicU64::new(0),
|
||||||
|
gpio_last_read_timestamp: AtomicU64::new(0),
|
||||||
|
gpio_raw_level: AtomicU8::new(0),
|
||||||
|
gpio_read_error_total: AtomicU64::new(0),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn record_notify_attempt(&self, timestamp: u64) {
|
||||||
|
self.notify_last_attempt_timestamp.store(timestamp, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn record_notify_success(&self, timestamp: u64, duration_millis: u64, status: u16) {
|
||||||
|
self.notify_success_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
self.notify_last_success_timestamp.store(timestamp, Ordering::Relaxed);
|
||||||
|
self.notify_last_duration_millis.store(duration_millis, Ordering::Relaxed);
|
||||||
|
self.notify_last_http_status.store(u64::from(status), Ordering::Relaxed);
|
||||||
|
self.notify_last_result.store(NotifyResultKind::Success as u64, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn record_notify_failure(
|
||||||
|
&self,
|
||||||
|
kind: NotifyResultKind,
|
||||||
|
timestamp: u64,
|
||||||
|
duration_millis: u64,
|
||||||
|
status: Option<u16>,
|
||||||
|
final_failure: bool,
|
||||||
|
) {
|
||||||
|
self.notify_attempt_failure_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
if final_failure {
|
||||||
|
self.notify_failure_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
self.notify_last_failure_timestamp.store(timestamp, Ordering::Relaxed);
|
||||||
|
self.notify_last_duration_millis.store(duration_millis, Ordering::Relaxed);
|
||||||
|
self.notify_last_http_status.store(status.map(u64::from).unwrap_or(0), Ordering::Relaxed);
|
||||||
|
self.notify_last_result.store(kind as u64, Ordering::Relaxed);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AppState {
|
impl AppState {
|
||||||
|
|
@ -60,11 +179,17 @@ fn unix_timestamp() -> u64 {
|
||||||
SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs()
|
SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn duration_millis(started_at: Instant) -> u64 {
|
||||||
|
let millis = started_at.elapsed().as_millis();
|
||||||
|
millis.try_into().unwrap_or(u64::MAX)
|
||||||
|
}
|
||||||
|
|
||||||
async fn get_status(
|
async fn get_status(
|
||||||
State(state): State<Arc<AppState>>,
|
State(state): State<Arc<AppState>>,
|
||||||
headers: HeaderMap,
|
headers: HeaderMap,
|
||||||
) -> Result<Json<PiStatusResponse>, StatusCode> {
|
) -> Result<Json<PiStatusResponse>, StatusCode> {
|
||||||
if !validate_bearer(&headers, &state.inbound_api_key) {
|
if !validate_bearer(&headers, &state.inbound_api_key) {
|
||||||
|
warn!("unauthorized status request rejected");
|
||||||
return Err(StatusCode::UNAUTHORIZED);
|
return Err(StatusCode::UNAUTHORIZED);
|
||||||
}
|
}
|
||||||
Ok(Json(PiStatusResponse {
|
Ok(Json(PiStatusResponse {
|
||||||
|
|
@ -73,6 +198,211 @@ async fn get_status(
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn get_metrics(State(state): State<Arc<AppState>>) -> Response {
|
||||||
|
let mut body = String::new();
|
||||||
|
let current_status = state.current_door_state().as_door_status();
|
||||||
|
|
||||||
|
body.push_str("# HELP noisebell_pi_door_status Current local Pi door status.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_door_status gauge\n");
|
||||||
|
for status in [DoorStatus::Open, DoorStatus::Closed] {
|
||||||
|
let value = u8::from(current_status == status);
|
||||||
|
let status = prometheus_escape_label_value(status.as_str());
|
||||||
|
body.push_str(&format!("noisebell_pi_door_status{{status=\"{status}\"}} {value}\n"));
|
||||||
|
}
|
||||||
|
body.push_str("# HELP noisebell_pi_last_changed_timestamp_seconds Unix timestamp for the last local door state change.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_last_changed_timestamp_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_last_changed_timestamp_seconds {}\n",
|
||||||
|
state.last_changed.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_pi_process_start_time_seconds Unix timestamp when the Pi service started.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_process_start_time_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_process_start_time_seconds {}\n",
|
||||||
|
state.metrics.process_start_time
|
||||||
|
));
|
||||||
|
body.push_str(
|
||||||
|
"# HELP noisebell_pi_notify_success_total Successful state webhooks sent to the cache.\n",
|
||||||
|
);
|
||||||
|
body.push_str("# TYPE noisebell_pi_notify_success_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_notify_success_total {}\n",
|
||||||
|
state.metrics.notify_success_total.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_pi_notify_attempt_failure_total Failed state webhook attempts before retry or final failure.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_notify_attempt_failure_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_notify_attempt_failure_total {}\n",
|
||||||
|
state.metrics.notify_attempt_failure_total.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_pi_notify_failure_total State changes that failed to reach the cache after all retries.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_notify_failure_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_notify_failure_total {}\n",
|
||||||
|
state.metrics.notify_failure_total.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_pi_notify_last_attempt_timestamp_seconds Unix timestamp of the last cache webhook attempt.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_notify_last_attempt_timestamp_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_notify_last_attempt_timestamp_seconds {}\n",
|
||||||
|
state.metrics.notify_last_attempt_timestamp.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_pi_notify_last_success_timestamp_seconds Unix timestamp of the last successful cache webhook.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_notify_last_success_timestamp_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_notify_last_success_timestamp_seconds {}\n",
|
||||||
|
state.metrics.notify_last_success_timestamp.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_pi_notify_last_failure_timestamp_seconds Unix timestamp of the last failed cache webhook attempt.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_notify_last_failure_timestamp_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_notify_last_failure_timestamp_seconds {}\n",
|
||||||
|
state.metrics.notify_last_failure_timestamp.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_pi_notify_last_duration_seconds Duration of the most recent cache webhook attempt.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_notify_last_duration_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_notify_last_duration_seconds {}\n",
|
||||||
|
state.metrics.notify_last_duration_millis.load(Ordering::Relaxed) as f64 / 1000.0
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_pi_notify_last_http_status HTTP status from the most recent cache webhook attempt, or 0 when no HTTP response was received.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_notify_last_http_status gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_notify_last_http_status {}\n",
|
||||||
|
state.metrics.notify_last_http_status.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
let last_notify =
|
||||||
|
NotifyResultKind::from_code(state.metrics.notify_last_result.load(Ordering::Relaxed));
|
||||||
|
body.push_str(
|
||||||
|
"# HELP noisebell_pi_notify_last_result Last cache webhook result as one-hot labels.\n",
|
||||||
|
);
|
||||||
|
body.push_str("# TYPE noisebell_pi_notify_last_result gauge\n");
|
||||||
|
for result in NotifyResultKind::ALL {
|
||||||
|
let value = u8::from(result == last_notify);
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_notify_last_result{{result=\"{}\"}} {value}\n",
|
||||||
|
result.as_str()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
body.push_str("# HELP noisebell_pi_state_change_total Local debounced door state changes by resulting status.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_state_change_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_state_change_total{{status=\"open\"}} {}\n",
|
||||||
|
state.metrics.state_change_open_total.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_state_change_total{{status=\"closed\"}} {}\n",
|
||||||
|
state.metrics.state_change_closed_total.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_pi_gpio_raw_level Last GPIO raw signal level, 0 for low and 1 for high.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_gpio_raw_level gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_gpio_raw_level {}\n",
|
||||||
|
state.metrics.gpio_raw_level.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_pi_gpio_last_read_timestamp_seconds Unix timestamp of the last successful GPIO read.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_gpio_last_read_timestamp_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_gpio_last_read_timestamp_seconds {}\n",
|
||||||
|
state.metrics.gpio_last_read_timestamp.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_pi_gpio_read_error_total GPIO read errors.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_gpio_read_error_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_gpio_read_error_total {}\n",
|
||||||
|
state.metrics.gpio_read_error_total.load(Ordering::Relaxed)
|
||||||
|
));
|
||||||
|
|
||||||
|
if let Some(boot_id) = read_trimmed("/proc/sys/kernel/random/boot_id") {
|
||||||
|
let boot_id = prometheus_escape_label_value(&boot_id);
|
||||||
|
body.push_str("# HELP noisebell_pi_boot_info Pi boot identity. Changes on reboot.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_boot_info gauge\n");
|
||||||
|
body.push_str(&format!("noisebell_pi_boot_info{{boot_id=\"{boot_id}\"}} 1\n"));
|
||||||
|
}
|
||||||
|
if let Some(uptime) = read_uptime_seconds() {
|
||||||
|
body.push_str("# HELP noisebell_pi_uptime_seconds Pi system uptime in seconds.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_uptime_seconds gauge\n");
|
||||||
|
body.push_str(&format!("noisebell_pi_uptime_seconds {uptime}\n"));
|
||||||
|
}
|
||||||
|
if let Some(temp) = read_temperature_celsius() {
|
||||||
|
body.push_str("# HELP noisebell_pi_temperature_celsius Pi CPU temperature in Celsius.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_temperature_celsius gauge\n");
|
||||||
|
body.push_str(&format!("noisebell_pi_temperature_celsius {temp}\n"));
|
||||||
|
}
|
||||||
|
if let Some(throttled) = read_throttled_flags() {
|
||||||
|
body.push_str("# HELP noisebell_pi_throttled_flags Raspberry Pi throttling bitfield from vcgencmd get_throttled.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_throttled_flags gauge\n");
|
||||||
|
body.push_str(&format!("noisebell_pi_throttled_flags {throttled}\n"));
|
||||||
|
}
|
||||||
|
if let Some((interface, link, level)) = read_wifi_metrics() {
|
||||||
|
let interface = prometheus_escape_label_value(&interface);
|
||||||
|
body.push_str("# HELP noisebell_pi_wifi_link_quality Wireless link quality from /proc/net/wireless.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_wifi_link_quality gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_wifi_link_quality{{interface=\"{interface}\"}} {link}\n"
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_pi_wifi_signal_dbm Wireless signal level in dBm from /proc/net/wireless.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_wifi_signal_dbm gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_pi_wifi_signal_dbm{{interface=\"{interface}\"}} {level}\n"
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
body.push_str("# HELP noisebell_pi_tailscale_running Whether tailscale status reports BackendState Running.\n");
|
||||||
|
body.push_str("# TYPE noisebell_pi_tailscale_running gauge\n");
|
||||||
|
body.push_str(&format!("noisebell_pi_tailscale_running {}\n", u8::from(tailscale_running())));
|
||||||
|
|
||||||
|
([(header::CONTENT_TYPE, PROMETHEUS_CONTENT_TYPE)], body).into_response()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_trimmed(path: &str) -> Option<String> {
|
||||||
|
fs::read_to_string(path)
|
||||||
|
.ok()
|
||||||
|
.map(|value| value.trim().to_string())
|
||||||
|
.filter(|value| !value.is_empty())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_uptime_seconds() -> Option<f64> {
|
||||||
|
read_trimmed("/proc/uptime")?.split_whitespace().next()?.parse().ok()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_temperature_celsius() -> Option<f64> {
|
||||||
|
let raw: f64 = read_trimmed("/sys/class/thermal/thermal_zone0/temp")?.parse().ok()?;
|
||||||
|
Some(raw / 1000.0)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_throttled_flags() -> Option<u64> {
|
||||||
|
let output = Command::new("vcgencmd").arg("get_throttled").output().ok()?;
|
||||||
|
if !output.status.success() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let text = String::from_utf8_lossy(&output.stdout);
|
||||||
|
let value = text.trim().strip_prefix("throttled=0x")?;
|
||||||
|
u64::from_str_radix(value, 16).ok()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_wifi_metrics() -> Option<(String, f64, f64)> {
|
||||||
|
let text = read_trimmed("/proc/net/wireless")?;
|
||||||
|
for line in text.lines().skip(2) {
|
||||||
|
let (interface, values) = line.split_once(':')?;
|
||||||
|
let mut parts = values.split_whitespace();
|
||||||
|
let _status = parts.next()?;
|
||||||
|
let link: f64 = parts.next()?.trim_end_matches('.').parse().ok()?;
|
||||||
|
let level: f64 = parts.next()?.trim_end_matches('.').parse().ok()?;
|
||||||
|
return Some((interface.trim().to_string(), link, level));
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
fn tailscale_running() -> bool {
|
||||||
|
let output = match Command::new("tailscale").args(["status", "--json"]).output() {
|
||||||
|
Ok(output) => output,
|
||||||
|
Err(_) => return false,
|
||||||
|
};
|
||||||
|
output.status.success()
|
||||||
|
&& String::from_utf8_lossy(&output.stdout).contains("\"BackendState\":\"Running\"")
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> Result<()> {
|
async fn main() -> Result<()> {
|
||||||
tracing_subscriber::fmt()
|
tracing_subscriber::fmt()
|
||||||
|
|
@ -151,17 +481,23 @@ async fn main() -> Result<()> {
|
||||||
door_state: AtomicU8::new(initial_state as u8),
|
door_state: AtomicU8::new(initial_state as u8),
|
||||||
last_changed: AtomicU64::new(now),
|
last_changed: AtomicU64::new(now),
|
||||||
inbound_api_key,
|
inbound_api_key,
|
||||||
|
metrics: AppMetrics::new(),
|
||||||
});
|
});
|
||||||
|
state
|
||||||
|
.metrics
|
||||||
|
.gpio_raw_level
|
||||||
|
.store(u8::from(initial_raw_level == SignalLevel::High), Ordering::Relaxed);
|
||||||
|
state.metrics.gpio_last_read_timestamp.store(now, Ordering::Relaxed);
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
initial_status = %initial_state.as_door_status(),
|
initial_status = %initial_state.as_door_status(),
|
||||||
"GPIO initialized"
|
"GPIO initialized"
|
||||||
);
|
);
|
||||||
|
|
||||||
let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::<(DoorStatus, u64)>();
|
let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::<(DoorStatus, u64, StateEventKind)>();
|
||||||
|
|
||||||
// Sync initial state with the cache on startup
|
// Sync initial state with the cache on startup
|
||||||
let _ = tx.send((initial_state.as_door_status(), now));
|
let _ = tx.send((initial_state.as_door_status(), now, StateEventKind::Startup));
|
||||||
|
|
||||||
// Poll the input level and debounce in software. This is less elegant than
|
// Poll the input level and debounce in software. This is less elegant than
|
||||||
// edge-triggered reads, but it is robust on Raspberry Pi OS.
|
// edge-triggered reads, but it is robust on Raspberry Pi OS.
|
||||||
|
|
@ -173,18 +509,46 @@ async fn main() -> Result<()> {
|
||||||
let mut current_state = initial_state;
|
let mut current_state = initial_state;
|
||||||
let mut pending_state = current_state;
|
let mut pending_state = current_state;
|
||||||
let mut pending_since = std::time::Instant::now();
|
let mut pending_since = std::time::Instant::now();
|
||||||
|
let mut gpio_read_error_count = 0u64;
|
||||||
|
let mut last_gpio_error_log: Option<Instant> = None;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
let values = match inputs.get_values([false]) {
|
let values = match inputs.get_values([false]) {
|
||||||
Ok(values) => values,
|
Ok(values) => values,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!(error = %e, "failed to read GPIO value");
|
state_for_edges.metrics.gpio_read_error_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
gpio_read_error_count = gpio_read_error_count.saturating_add(1);
|
||||||
|
let should_log = last_gpio_error_log
|
||||||
|
.map(|last| last.elapsed() >= Duration::from_secs(60))
|
||||||
|
.unwrap_or(true);
|
||||||
|
if should_log {
|
||||||
|
error!(
|
||||||
|
error = %e,
|
||||||
|
consecutive_errors = gpio_read_error_count,
|
||||||
|
"failed to read GPIO value"
|
||||||
|
);
|
||||||
|
last_gpio_error_log = Some(Instant::now());
|
||||||
|
}
|
||||||
std::thread::sleep(Duration::from_secs(1));
|
std::thread::sleep(Duration::from_secs(1));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if gpio_read_error_count > 0 {
|
||||||
|
info!(recovered_after_errors = gpio_read_error_count, "GPIO reads recovered");
|
||||||
|
gpio_read_error_count = 0;
|
||||||
|
last_gpio_error_log = None;
|
||||||
|
}
|
||||||
|
|
||||||
let new_raw_level = if values[0] { SignalLevel::High } else { SignalLevel::Low };
|
let new_raw_level = if values[0] { SignalLevel::High } else { SignalLevel::Low };
|
||||||
|
state_for_edges
|
||||||
|
.metrics
|
||||||
|
.gpio_raw_level
|
||||||
|
.store(u8::from(new_raw_level == SignalLevel::High), Ordering::Relaxed);
|
||||||
|
state_for_edges
|
||||||
|
.metrics
|
||||||
|
.gpio_last_read_timestamp
|
||||||
|
.store(unix_timestamp(), Ordering::Relaxed);
|
||||||
let new_state = LocalDoorState::from_raw_level(new_raw_level, active_level);
|
let new_state = LocalDoorState::from_raw_level(new_raw_level, active_level);
|
||||||
|
|
||||||
if new_state != pending_state {
|
if new_state != pending_state {
|
||||||
|
|
@ -203,7 +567,25 @@ async fn main() -> Result<()> {
|
||||||
|
|
||||||
let timestamp = unix_timestamp();
|
let timestamp = unix_timestamp();
|
||||||
state_for_edges.last_changed.store(timestamp, Ordering::Relaxed);
|
state_for_edges.last_changed.store(timestamp, Ordering::Relaxed);
|
||||||
let _ = edge_tx.send((new_state.as_door_status(), timestamp));
|
match new_state {
|
||||||
|
LocalDoorState::Open => {
|
||||||
|
state_for_edges
|
||||||
|
.metrics
|
||||||
|
.state_change_open_total
|
||||||
|
.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
LocalDoorState::Closed => {
|
||||||
|
state_for_edges
|
||||||
|
.metrics
|
||||||
|
.state_change_closed_total
|
||||||
|
.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let _ = edge_tx.send((
|
||||||
|
new_state.as_door_status(),
|
||||||
|
timestamp,
|
||||||
|
StateEventKind::StateChange,
|
||||||
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::thread::sleep(poll_interval);
|
std::thread::sleep(poll_interval);
|
||||||
|
|
@ -211,33 +593,97 @@ async fn main() -> Result<()> {
|
||||||
});
|
});
|
||||||
drop(tx); // Drop original sender so rx closes when edge_handle is dropped
|
drop(tx); // Drop original sender so rx closes when edge_handle is dropped
|
||||||
|
|
||||||
|
let state_for_notify = state.clone();
|
||||||
let notify_handle = tokio::spawn(async move {
|
let notify_handle = tokio::spawn(async move {
|
||||||
let client = reqwest::Client::builder()
|
let client = reqwest::Client::builder()
|
||||||
.timeout(Duration::from_secs(http_timeout_secs))
|
.timeout(Duration::from_secs(http_timeout_secs))
|
||||||
.build()
|
.build()
|
||||||
.expect("failed to build HTTP client");
|
.expect("failed to build HTTP client");
|
||||||
|
|
||||||
while let Some((status, timestamp)) = rx.recv().await {
|
while let Some((status, timestamp, event_kind)) = rx.recv().await {
|
||||||
info!(status = %status, timestamp, "state changed");
|
match event_kind {
|
||||||
|
StateEventKind::Startup => {
|
||||||
|
info!(status = %status, timestamp, event = event_kind.as_str(), "syncing initial door state");
|
||||||
|
}
|
||||||
|
StateEventKind::StateChange => {
|
||||||
|
info!(status = %status, timestamp, event = event_kind.as_str(), "door state changed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let payload = WebhookPayload { status, timestamp };
|
let payload = WebhookPayload { status, timestamp };
|
||||||
|
|
||||||
for attempt in 0..=retry_attempts {
|
for attempt in 0..=retry_attempts {
|
||||||
|
let notify_started_at = Instant::now();
|
||||||
|
state_for_notify.metrics.record_notify_attempt(unix_timestamp());
|
||||||
let result =
|
let result =
|
||||||
client.post(&endpoint_url).bearer_auth(&api_key).json(&payload).send().await;
|
client.post(&endpoint_url).bearer_auth(&api_key).json(&payload).send().await;
|
||||||
match result {
|
match result {
|
||||||
Ok(resp) if resp.status().is_success() => break,
|
Ok(resp) if resp.status().is_success() => {
|
||||||
_ => {
|
let duration_ms = duration_millis(notify_started_at);
|
||||||
|
let http_status = resp.status().as_u16();
|
||||||
|
state_for_notify.metrics.record_notify_success(
|
||||||
|
unix_timestamp(),
|
||||||
|
duration_ms,
|
||||||
|
http_status,
|
||||||
|
);
|
||||||
|
info!(
|
||||||
|
status = %payload.status,
|
||||||
|
timestamp = payload.timestamp,
|
||||||
|
event = event_kind.as_str(),
|
||||||
|
http_status,
|
||||||
|
duration_ms,
|
||||||
|
attempts = attempt + 1,
|
||||||
|
"notified cache of door state"
|
||||||
|
);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
result => {
|
||||||
let err_msg = match &result {
|
let err_msg = match &result {
|
||||||
Ok(resp) => format!("HTTP {}", resp.status()),
|
Ok(resp) => format!("HTTP {}", resp.status()),
|
||||||
Err(e) => e.to_string(),
|
Err(e) => e.to_string(),
|
||||||
};
|
};
|
||||||
|
let http_status = result.as_ref().ok().map(|resp| resp.status().as_u16());
|
||||||
|
let kind = if http_status.is_some() {
|
||||||
|
NotifyResultKind::HttpError
|
||||||
|
} else {
|
||||||
|
NotifyResultKind::RequestError
|
||||||
|
};
|
||||||
|
let duration_ms = duration_millis(notify_started_at);
|
||||||
|
state_for_notify.metrics.record_notify_failure(
|
||||||
|
kind,
|
||||||
|
unix_timestamp(),
|
||||||
|
duration_ms,
|
||||||
|
http_status,
|
||||||
|
attempt == retry_attempts,
|
||||||
|
);
|
||||||
if attempt == retry_attempts {
|
if attempt == retry_attempts {
|
||||||
error!(error = %err_msg, "failed to notify endpoint after {} attempts", retry_attempts + 1);
|
error!(
|
||||||
|
status = %payload.status,
|
||||||
|
timestamp = payload.timestamp,
|
||||||
|
event = event_kind.as_str(),
|
||||||
|
error = %err_msg,
|
||||||
|
kind = kind.as_str(),
|
||||||
|
http_status = http_status.unwrap_or(0),
|
||||||
|
duration_ms,
|
||||||
|
attempts = retry_attempts + 1,
|
||||||
|
"failed to notify cache after retries"
|
||||||
|
);
|
||||||
} else {
|
} else {
|
||||||
let delay =
|
let delay =
|
||||||
Duration::from_secs(retry_base_delay_secs * 2u64.pow(attempt));
|
Duration::from_secs(retry_base_delay_secs * 2u64.pow(attempt));
|
||||||
warn!(error = %err_msg, attempt = attempt + 1, "notify failed, retrying in {:?}", delay);
|
warn!(
|
||||||
|
status = %payload.status,
|
||||||
|
timestamp = payload.timestamp,
|
||||||
|
event = event_kind.as_str(),
|
||||||
|
error = %err_msg,
|
||||||
|
kind = kind.as_str(),
|
||||||
|
http_status = http_status.unwrap_or(0),
|
||||||
|
duration_ms,
|
||||||
|
attempt = attempt + 1,
|
||||||
|
total_attempts = retry_attempts + 1,
|
||||||
|
delay_seconds = delay.as_secs(),
|
||||||
|
"notify cache failed, retrying"
|
||||||
|
);
|
||||||
tokio::time::sleep(delay).await;
|
tokio::time::sleep(delay).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -246,7 +692,10 @@ async fn main() -> Result<()> {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
let app = Router::new().route("/", get(get_status)).with_state(state);
|
let app = Router::new()
|
||||||
|
.route("/", get(get_status))
|
||||||
|
.route("/metrics", get(get_metrics))
|
||||||
|
.with_state(state);
|
||||||
|
|
||||||
let listener = tokio::net::TcpListener::bind((&*bind_address, port))
|
let listener = tokio::net::TcpListener::bind((&*bind_address, port))
|
||||||
.await
|
.await
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,5 @@ rusqlite = { version = "0.33", features = ["bundled"] }
|
||||||
serde = { version = "1.0", features = ["derive"] }
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
serde_json = "1.0"
|
serde_json = "1.0"
|
||||||
tokio = { version = "1", features = ["rt-multi-thread", "macros", "net", "sync", "signal", "time"] }
|
tokio = { version = "1", features = ["rt-multi-thread", "macros", "net", "sync", "signal", "time"] }
|
||||||
tower-http = { version = "0.6", features = ["trace"] }
|
|
||||||
tracing = "0.1"
|
tracing = "0.1"
|
||||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||||
|
|
|
||||||
|
|
@ -12,11 +12,16 @@ If the Pi stops responding to polls (configurable threshold, default 3 misses),
|
||||||
|--------|------|------|-------------|
|
|--------|------|------|-------------|
|
||||||
| `GET` | `/status` | — | Current door status (`status`, `since`, `last_checked`) |
|
| `GET` | `/status` | — | Current door status (`status`, `since`, `last_checked`) |
|
||||||
| `GET` | `/badge.svg` | — | Live README badge with Noisebridge logo |
|
| `GET` | `/badge.svg` | — | Live README badge with Noisebridge logo |
|
||||||
|
| `GET` | `/metrics` | — | Prometheus metrics, scraped locally by the DO Prometheus |
|
||||||
| `POST` | `/webhook` | Bearer | Inbound webhook from the Pi |
|
| `POST` | `/webhook` | Bearer | Inbound webhook from the Pi |
|
||||||
| `GET` | `/health` | — | Health check |
|
| `GET` | `/health` | — | Health check |
|
||||||
|
|
||||||
`since` is the Pi-reported time when the current state began. `last_checked` is when the cache most recently attempted a poll.
|
`since` is the Pi-reported time when the current state began. `last_checked` is when the cache most recently attempted a poll.
|
||||||
|
|
||||||
|
The public Caddy vhost returns `404` for `/metrics`; Prometheus scrapes the cache directly on localhost. Metrics include the configured Pi target, poll interval, offline threshold, last poll result, last HTTP status, last poll duration, last poll attempt/success/failure timestamps, and failure counters split into HTTP, timeout, connect, request-other, and parse failures.
|
||||||
|
|
||||||
|
Regular timer-driven poll data should be debugged from Prometheus and Grafana, not by scanning logs. The cache logs sparse events instead: state changes applied from the Pi, offline/online transitions, first or changed poll failures in a failure streak, stale events, auth/rate-limit rejections, outbound webhook deliveries, retries, and final failures. Successful unchanged polls, badge/image/status reads, and metrics scrapes are intentionally quiet at `INFO`.
|
||||||
|
|
||||||
## Badge
|
## Badge
|
||||||
|
|
||||||
`/badge.svg` serves a classic shields.io-style SVG badge with the Noisebridge logo and the current cache status (`open`, `closed`, or `offline`).
|
`/badge.svg` serves a classic shields.io-style SVG badge with the Noisebridge logo and the current cache status (`open`, `closed`, or `offline`).
|
||||||
|
|
|
||||||
|
|
@ -89,6 +89,7 @@ in
|
||||||
|
|
||||||
services.caddy.virtualHosts.${cfg.domain}.extraConfig = ''
|
services.caddy.virtualHosts.${cfg.domain}.extraConfig = ''
|
||||||
redir / https://git.extremist.software/jet/noisebell 302
|
redir / https://git.extremist.software/jet/noisebell 302
|
||||||
|
respond /metrics 404
|
||||||
reverse_proxy localhost:${toString cfg.port}
|
reverse_proxy localhost:${toString cfg.port}
|
||||||
'';
|
'';
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
use std::sync::atomic::Ordering;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use axum::extract::State;
|
use axum::extract::State;
|
||||||
|
|
@ -6,12 +7,16 @@ use axum::response::{IntoResponse, Response};
|
||||||
use axum::Json;
|
use axum::Json;
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use chrono_tz::America::Los_Angeles;
|
use chrono_tz::America::Los_Angeles;
|
||||||
use noisebell_common::{validate_bearer, CacheStatusResponse, DoorStatus, WebhookPayload};
|
use noisebell_common::{
|
||||||
|
prometheus_escape_label_value, validate_bearer, CacheStatusResponse, DoorStatus,
|
||||||
|
WebhookPayload, PROMETHEUS_CONTENT_TYPE,
|
||||||
|
};
|
||||||
use tokio::sync::Mutex;
|
use tokio::sync::Mutex;
|
||||||
use tracing::{error, info};
|
use tracing::{error, info, warn};
|
||||||
|
|
||||||
use crate::db;
|
use crate::db;
|
||||||
use crate::db::ApplyStateOutcome;
|
use crate::db::ApplyStateOutcome;
|
||||||
|
use crate::metrics::{atomic_seconds_from_millis, atomic_value, CacheMetrics, PollResultKind};
|
||||||
use crate::types::WebhookTarget;
|
use crate::types::WebhookTarget;
|
||||||
use crate::webhook;
|
use crate::webhook;
|
||||||
|
|
||||||
|
|
@ -108,6 +113,7 @@ pub struct AppState {
|
||||||
pub retry_base_delay_secs: u64,
|
pub retry_base_delay_secs: u64,
|
||||||
pub webhook_last_request: std::sync::atomic::AtomicU64,
|
pub webhook_last_request: std::sync::atomic::AtomicU64,
|
||||||
pub webhook_tokens: std::sync::atomic::AtomicU32,
|
pub webhook_tokens: std::sync::atomic::AtomicU32,
|
||||||
|
pub metrics: Arc<CacheMetrics>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn unix_now() -> u64 {
|
fn unix_now() -> u64 {
|
||||||
|
|
@ -186,8 +192,14 @@ pub async fn post_webhook(
|
||||||
Json(body): Json<WebhookPayload>,
|
Json(body): Json<WebhookPayload>,
|
||||||
) -> StatusCode {
|
) -> StatusCode {
|
||||||
if !validate_bearer(&headers, &state.inbound_api_key) {
|
if !validate_bearer(&headers, &state.inbound_api_key) {
|
||||||
|
warn!(
|
||||||
|
status = %body.status,
|
||||||
|
timestamp = body.timestamp,
|
||||||
|
"unauthorized webhook rejected"
|
||||||
|
);
|
||||||
return StatusCode::UNAUTHORIZED;
|
return StatusCode::UNAUTHORIZED;
|
||||||
}
|
}
|
||||||
|
state.metrics.webhook_received_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
|
||||||
// Simple rate limiting: reset tokens every window, reject if exhausted.
|
// Simple rate limiting: reset tokens every window, reject if exhausted.
|
||||||
let now = unix_now();
|
let now = unix_now();
|
||||||
|
|
@ -202,6 +214,14 @@ pub async fn post_webhook(
|
||||||
|n| if n > 0 { Some(n - 1) } else { None },
|
|n| if n > 0 { Some(n - 1) } else { None },
|
||||||
);
|
);
|
||||||
if remaining.is_err() {
|
if remaining.is_err() {
|
||||||
|
state.metrics.webhook_rate_limited_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
warn!(
|
||||||
|
status = %body.status,
|
||||||
|
timestamp = body.timestamp,
|
||||||
|
limit = WEBHOOK_RATE_LIMIT,
|
||||||
|
window_seconds = WEBHOOK_RATE_WINDOW_SECS,
|
||||||
|
"webhook rate limit exceeded"
|
||||||
|
);
|
||||||
return StatusCode::TOO_MANY_REQUESTS;
|
return StatusCode::TOO_MANY_REQUESTS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -224,7 +244,7 @@ pub async fn post_webhook(
|
||||||
"state updated via webhook"
|
"state updated via webhook"
|
||||||
);
|
);
|
||||||
|
|
||||||
webhook::forward(
|
let summary = webhook::forward(
|
||||||
&state.client,
|
&state.client,
|
||||||
&state.webhooks,
|
&state.webhooks,
|
||||||
&WebhookPayload { status, timestamp: body.timestamp },
|
&WebhookPayload { status, timestamp: body.timestamp },
|
||||||
|
|
@ -232,16 +252,11 @@ pub async fn post_webhook(
|
||||||
state.retry_base_delay_secs,
|
state.retry_base_delay_secs,
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
state.metrics.add_outbound(summary.delivered, summary.failed);
|
||||||
}
|
}
|
||||||
Ok(ApplyStateOutcome::Duplicate) => {
|
Ok(ApplyStateOutcome::Duplicate) => {}
|
||||||
info!(
|
|
||||||
status = %status,
|
|
||||||
timestamp = body.timestamp,
|
|
||||||
"duplicate webhook ignored"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Ok(ApplyStateOutcome::Stale) => {
|
Ok(ApplyStateOutcome::Stale) => {
|
||||||
info!(
|
warn!(
|
||||||
status = %status,
|
status = %status,
|
||||||
timestamp = body.timestamp,
|
timestamp = body.timestamp,
|
||||||
"stale webhook ignored"
|
"stale webhook ignored"
|
||||||
|
|
@ -281,6 +296,231 @@ pub async fn health() -> StatusCode {
|
||||||
StatusCode::OK
|
StatusCode::OK
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn get_metrics(State(state): State<Arc<AppState>>) -> Response {
|
||||||
|
let db = state.db.clone();
|
||||||
|
let snapshot = match tokio::task::spawn_blocking(move || {
|
||||||
|
let conn = db.blocking_lock();
|
||||||
|
db::get_metrics_snapshot(&conn)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.expect("db task panicked")
|
||||||
|
{
|
||||||
|
Ok(snapshot) => snapshot,
|
||||||
|
Err(e) => {
|
||||||
|
error!(error = %e, "failed to get metrics snapshot");
|
||||||
|
return StatusCode::INTERNAL_SERVER_ERROR.into_response();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut body = String::new();
|
||||||
|
body.push_str("# HELP noisebell_cache_status Current cached door status.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_status gauge\n");
|
||||||
|
for status in DoorStatus::ALL {
|
||||||
|
let value = u8::from(snapshot.status == status);
|
||||||
|
let status = prometheus_escape_label_value(status.as_str());
|
||||||
|
body.push_str(&format!("noisebell_cache_status{{status=\"{status}\"}} {value}\n"));
|
||||||
|
}
|
||||||
|
body.push_str("# HELP noisebell_cache_status_since_timestamp_seconds Unix timestamp for when the current cache state began.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_status_since_timestamp_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_status_since_timestamp_seconds {}\n",
|
||||||
|
snapshot.since.unwrap_or(0)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_last_seen_timestamp_seconds Unix timestamp for the last successful Pi state update.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_last_seen_timestamp_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_last_seen_timestamp_seconds {}\n",
|
||||||
|
snapshot.last_seen.unwrap_or(0)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_last_checked_timestamp_seconds Unix timestamp for the last Pi poll attempt.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_last_checked_timestamp_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_last_checked_timestamp_seconds {}\n",
|
||||||
|
snapshot.last_checked.unwrap_or(0)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_process_start_time_seconds Unix timestamp when the cache service started.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_process_start_time_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_process_start_time_seconds {}\n",
|
||||||
|
state.metrics.process_start_time
|
||||||
|
));
|
||||||
|
let pi_address = prometheus_escape_label_value(&state.metrics.pi_address);
|
||||||
|
body.push_str("# HELP noisebell_cache_pi_target_info Configured Pi polling target.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_pi_target_info gauge\n");
|
||||||
|
body.push_str(&format!("noisebell_cache_pi_target_info{{address=\"{pi_address}\"}} 1\n"));
|
||||||
|
body.push_str("# HELP noisebell_cache_poll_interval_seconds Configured Pi poll interval.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_interval_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_interval_seconds {}\n",
|
||||||
|
state.metrics.poll_interval_secs
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_poll_offline_threshold Configured consecutive failure threshold before marking the Pi offline.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_offline_threshold gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_offline_threshold {}\n",
|
||||||
|
state.metrics.offline_threshold
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_retry_attempts Configured outbound retry attempts.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_retry_attempts gauge\n");
|
||||||
|
body.push_str(&format!("noisebell_cache_retry_attempts {}\n", state.metrics.retry_attempts));
|
||||||
|
body.push_str("# HELP noisebell_cache_http_timeout_seconds Configured HTTP timeout for cache HTTP clients.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_http_timeout_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_http_timeout_seconds {}\n",
|
||||||
|
state.metrics.http_timeout_secs
|
||||||
|
));
|
||||||
|
body.push_str(
|
||||||
|
"# HELP noisebell_cache_webhook_received_total Authenticated inbound webhooks received.\n",
|
||||||
|
);
|
||||||
|
body.push_str("# TYPE noisebell_cache_webhook_received_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_webhook_received_total {}\n",
|
||||||
|
atomic_value(&state.metrics.webhook_received_total)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_webhook_rate_limited_total Inbound webhooks rejected by rate limiting.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_webhook_rate_limited_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_webhook_rate_limited_total {}\n",
|
||||||
|
atomic_value(&state.metrics.webhook_rate_limited_total)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_outbound_webhook_success_total Outbound webhook deliveries that succeeded.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_outbound_webhook_success_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_outbound_webhook_success_total {}\n",
|
||||||
|
atomic_value(&state.metrics.outbound_success_total)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_outbound_webhook_failure_total Outbound webhook deliveries that failed after retries.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_outbound_webhook_failure_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_outbound_webhook_failure_total {}\n",
|
||||||
|
atomic_value(&state.metrics.outbound_failure_total)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_poll_success_total Successful Pi status polls.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_success_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_success_total {}\n",
|
||||||
|
atomic_value(&state.metrics.poll_success_total)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_poll_attempt_total Pi status poll attempts.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_attempt_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_attempt_total {}\n",
|
||||||
|
atomic_value(&state.metrics.poll_attempt_total)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_poll_failure_total Failed Pi status polls.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_failure_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_failure_total {}\n",
|
||||||
|
atomic_value(&state.metrics.poll_failure_total)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_poll_http_error_total Pi poll responses with non-success HTTP status.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_http_error_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_http_error_total {}\n",
|
||||||
|
atomic_value(&state.metrics.poll_http_error_total)
|
||||||
|
));
|
||||||
|
body.push_str(
|
||||||
|
"# HELP noisebell_cache_poll_request_timeout_total Pi poll request timeout failures.\n",
|
||||||
|
);
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_request_timeout_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_request_timeout_total {}\n",
|
||||||
|
atomic_value(&state.metrics.poll_request_timeout_total)
|
||||||
|
));
|
||||||
|
body.push_str(
|
||||||
|
"# HELP noisebell_cache_poll_request_connect_total Pi poll connection failures.\n",
|
||||||
|
);
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_request_connect_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_request_connect_total {}\n",
|
||||||
|
atomic_value(&state.metrics.poll_request_connect_total)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_poll_request_other_total Pi poll request failures that were not timeout/connect failures.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_request_other_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_request_other_total {}\n",
|
||||||
|
atomic_value(&state.metrics.poll_request_other_total)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_poll_parse_failure_total Successful Pi HTTP responses that could not be parsed.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_parse_failure_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_parse_failure_total {}\n",
|
||||||
|
atomic_value(&state.metrics.poll_parse_failure_total)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_poll_offline_transition_total Times the cache marked the Pi offline.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_offline_transition_total counter\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_offline_transition_total {}\n",
|
||||||
|
atomic_value(&state.metrics.poll_offline_transition_total)
|
||||||
|
));
|
||||||
|
body.push_str(
|
||||||
|
"# HELP noisebell_cache_poll_consecutive_failures Current consecutive Pi poll failures.\n",
|
||||||
|
);
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_consecutive_failures gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_consecutive_failures {}\n",
|
||||||
|
atomic_value(&state.metrics.poll_consecutive_failures)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_poll_last_attempt_timestamp_seconds Unix timestamp of the last Pi poll attempt.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_last_attempt_timestamp_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_last_attempt_timestamp_seconds {}\n",
|
||||||
|
atomic_value(&state.metrics.poll_last_attempt_timestamp)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_poll_last_success_timestamp_seconds Unix timestamp of the last successful Pi poll.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_last_success_timestamp_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_last_success_timestamp_seconds {}\n",
|
||||||
|
atomic_value(&state.metrics.poll_last_success_timestamp)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_poll_last_failure_timestamp_seconds Unix timestamp of the last failed Pi poll.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_last_failure_timestamp_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_last_failure_timestamp_seconds {}\n",
|
||||||
|
atomic_value(&state.metrics.poll_last_failure_timestamp)
|
||||||
|
));
|
||||||
|
body.push_str(
|
||||||
|
"# HELP noisebell_cache_poll_last_duration_seconds Duration of the most recent Pi poll.\n",
|
||||||
|
);
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_last_duration_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_last_duration_seconds {}\n",
|
||||||
|
atomic_seconds_from_millis(&state.metrics.poll_last_duration_millis)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_poll_last_success_duration_seconds Duration of the most recent successful Pi poll.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_last_success_duration_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_last_success_duration_seconds {}\n",
|
||||||
|
atomic_seconds_from_millis(&state.metrics.poll_last_success_duration_millis)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_poll_last_failure_duration_seconds Duration of the most recent failed Pi poll.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_last_failure_duration_seconds gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_last_failure_duration_seconds {}\n",
|
||||||
|
atomic_seconds_from_millis(&state.metrics.poll_last_failure_duration_millis)
|
||||||
|
));
|
||||||
|
body.push_str("# HELP noisebell_cache_poll_last_http_status HTTP status from the most recent Pi poll, or 0 when no HTTP response was received.\n");
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_last_http_status gauge\n");
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_last_http_status {}\n",
|
||||||
|
atomic_value(&state.metrics.poll_last_http_status)
|
||||||
|
));
|
||||||
|
let last_result = PollResultKind::from_code(atomic_value(&state.metrics.poll_last_result));
|
||||||
|
body.push_str(
|
||||||
|
"# HELP noisebell_cache_poll_last_result Last Pi poll result as one-hot labels.\n",
|
||||||
|
);
|
||||||
|
body.push_str("# TYPE noisebell_cache_poll_last_result gauge\n");
|
||||||
|
for result in PollResultKind::ALL {
|
||||||
|
let value = u8::from(result == last_result);
|
||||||
|
body.push_str(&format!(
|
||||||
|
"noisebell_cache_poll_last_result{{result=\"{}\"}} {value}\n",
|
||||||
|
result.as_str()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
([(header::CONTENT_TYPE, PROMETHEUS_CONTENT_TYPE)], body).into_response()
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn get_image_open() -> impl IntoResponse {
|
pub async fn get_image_open() -> impl IntoResponse {
|
||||||
(
|
(
|
||||||
[(header::CONTENT_TYPE, "image/png"), (header::CACHE_CONTROL, "public, max-age=86400")],
|
[(header::CONTENT_TYPE, "image/png"), (header::CACHE_CONTROL, "public, max-age=86400")],
|
||||||
|
|
|
||||||
|
|
@ -59,6 +59,14 @@ struct CurrentStateRow {
|
||||||
last_checked: Option<u64>,
|
last_checked: Option<u64>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
|
pub struct CacheMetricsSnapshot {
|
||||||
|
pub status: DoorStatus,
|
||||||
|
pub since: Option<u64>,
|
||||||
|
pub last_seen: Option<u64>,
|
||||||
|
pub last_checked: Option<u64>,
|
||||||
|
}
|
||||||
|
|
||||||
fn parse_status(status: &str, location: &str) -> Result<DoorStatus> {
|
fn parse_status(status: &str, location: &str) -> Result<DoorStatus> {
|
||||||
status.parse().with_context(|| format!("invalid door status {status:?} in {location}"))
|
status.parse().with_context(|| format!("invalid door status {status:?} in {location}"))
|
||||||
}
|
}
|
||||||
|
|
@ -183,6 +191,16 @@ pub fn get_status(conn: &Connection) -> Result<CacheStatusResponse> {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_metrics_snapshot(conn: &Connection) -> Result<CacheMetricsSnapshot> {
|
||||||
|
let row = current_state_row(conn)?;
|
||||||
|
Ok(CacheMetricsSnapshot {
|
||||||
|
status: row.state.status_for_api(),
|
||||||
|
since: row.state.since_for_api(),
|
||||||
|
last_seen: row.last_seen,
|
||||||
|
last_checked: row.last_checked,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
fn write_state_change(
|
fn write_state_change(
|
||||||
conn: &Connection,
|
conn: &Connection,
|
||||||
status: DoorStatus,
|
status: DoorStatus,
|
||||||
|
|
|
||||||
|
|
@ -6,11 +6,11 @@ use axum::routing::{get, post};
|
||||||
use axum::Router;
|
use axum::Router;
|
||||||
use std::sync::atomic::AtomicU64;
|
use std::sync::atomic::AtomicU64;
|
||||||
use tokio::sync::Mutex;
|
use tokio::sync::Mutex;
|
||||||
use tower_http::trace::TraceLayer;
|
use tracing::info;
|
||||||
use tracing::{info, Level};
|
|
||||||
|
|
||||||
mod api;
|
mod api;
|
||||||
mod db;
|
mod db;
|
||||||
|
mod metrics;
|
||||||
mod poller;
|
mod poller;
|
||||||
mod types;
|
mod types;
|
||||||
mod webhook;
|
mod webhook;
|
||||||
|
|
@ -100,6 +100,14 @@ async fn main() -> Result<()> {
|
||||||
.build()
|
.build()
|
||||||
.context("failed to build HTTP client")?;
|
.context("failed to build HTTP client")?;
|
||||||
|
|
||||||
|
let metrics = Arc::new(metrics::CacheMetrics::new(
|
||||||
|
pi_address.clone(),
|
||||||
|
status_poll_interval_secs,
|
||||||
|
offline_threshold,
|
||||||
|
retry_attempts,
|
||||||
|
http_timeout_secs,
|
||||||
|
));
|
||||||
|
|
||||||
let poller_config = Arc::new(poller::PollerConfig {
|
let poller_config = Arc::new(poller::PollerConfig {
|
||||||
pi_address,
|
pi_address,
|
||||||
pi_api_key,
|
pi_api_key,
|
||||||
|
|
@ -108,6 +116,7 @@ async fn main() -> Result<()> {
|
||||||
retry_attempts,
|
retry_attempts,
|
||||||
retry_base_delay_secs,
|
retry_base_delay_secs,
|
||||||
webhooks: webhooks.clone(),
|
webhooks: webhooks.clone(),
|
||||||
|
metrics: metrics.clone(),
|
||||||
});
|
});
|
||||||
|
|
||||||
poller::spawn_status_poller(poller_config.clone(), db.clone(), client.clone());
|
poller::spawn_status_poller(poller_config.clone(), db.clone(), client.clone());
|
||||||
|
|
@ -121,22 +130,19 @@ async fn main() -> Result<()> {
|
||||||
retry_base_delay_secs,
|
retry_base_delay_secs,
|
||||||
webhook_last_request: AtomicU64::new(0),
|
webhook_last_request: AtomicU64::new(0),
|
||||||
webhook_tokens: std::sync::atomic::AtomicU32::new(10),
|
webhook_tokens: std::sync::atomic::AtomicU32::new(10),
|
||||||
|
metrics,
|
||||||
});
|
});
|
||||||
|
|
||||||
let app = Router::new()
|
let app = Router::new()
|
||||||
.route("/health", get(api::health))
|
.route("/health", get(api::health))
|
||||||
.route("/webhook", post(api::post_webhook))
|
.route("/webhook", post(api::post_webhook))
|
||||||
.route("/status", get(api::get_status))
|
.route("/status", get(api::get_status))
|
||||||
|
.route("/metrics", get(api::get_metrics))
|
||||||
.route("/badge.svg", get(api::get_badge))
|
.route("/badge.svg", get(api::get_badge))
|
||||||
.route("/image", get(api::get_image))
|
.route("/image", get(api::get_image))
|
||||||
.route("/image/open.png", get(api::get_image_open))
|
.route("/image/open.png", get(api::get_image_open))
|
||||||
.route("/image/closed.png", get(api::get_image_closed))
|
.route("/image/closed.png", get(api::get_image_closed))
|
||||||
.route("/image/offline.png", get(api::get_image_offline))
|
.route("/image/offline.png", get(api::get_image_offline))
|
||||||
.layer(
|
|
||||||
TraceLayer::new_for_http()
|
|
||||||
.make_span_with(tower_http::trace::DefaultMakeSpan::new().level(Level::INFO))
|
|
||||||
.on_response(tower_http::trace::DefaultOnResponse::new().level(Level::INFO)),
|
|
||||||
)
|
|
||||||
.with_state(app_state);
|
.with_state(app_state);
|
||||||
|
|
||||||
let listener = tokio::net::TcpListener::bind(("0.0.0.0", port))
|
let listener = tokio::net::TcpListener::bind(("0.0.0.0", port))
|
||||||
|
|
|
||||||
186
remote/cache-service/src/metrics.rs
Normal file
186
remote/cache-service/src/metrics.rs
Normal file
|
|
@ -0,0 +1,186 @@
|
||||||
|
use std::sync::atomic::{AtomicU64, Ordering};
|
||||||
|
use std::time::{SystemTime, UNIX_EPOCH};
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
|
pub enum PollResultKind {
|
||||||
|
Never = 0,
|
||||||
|
Success = 1,
|
||||||
|
HttpError = 2,
|
||||||
|
RequestTimeout = 3,
|
||||||
|
RequestConnect = 4,
|
||||||
|
RequestOther = 5,
|
||||||
|
ParseError = 6,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PollResultKind {
|
||||||
|
pub const ALL: [Self; 7] = [
|
||||||
|
Self::Never,
|
||||||
|
Self::Success,
|
||||||
|
Self::HttpError,
|
||||||
|
Self::RequestTimeout,
|
||||||
|
Self::RequestConnect,
|
||||||
|
Self::RequestOther,
|
||||||
|
Self::ParseError,
|
||||||
|
];
|
||||||
|
|
||||||
|
pub const fn as_str(self) -> &'static str {
|
||||||
|
match self {
|
||||||
|
Self::Never => "never",
|
||||||
|
Self::Success => "success",
|
||||||
|
Self::HttpError => "http_error",
|
||||||
|
Self::RequestTimeout => "request_timeout",
|
||||||
|
Self::RequestConnect => "request_connect",
|
||||||
|
Self::RequestOther => "request_other",
|
||||||
|
Self::ParseError => "parse_error",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const fn from_code(code: u64) -> Self {
|
||||||
|
match code {
|
||||||
|
1 => Self::Success,
|
||||||
|
2 => Self::HttpError,
|
||||||
|
3 => Self::RequestTimeout,
|
||||||
|
4 => Self::RequestConnect,
|
||||||
|
5 => Self::RequestOther,
|
||||||
|
6 => Self::ParseError,
|
||||||
|
_ => Self::Never,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct CacheMetrics {
|
||||||
|
pub process_start_time: u64,
|
||||||
|
pub pi_address: String,
|
||||||
|
pub poll_interval_secs: u64,
|
||||||
|
pub offline_threshold: u32,
|
||||||
|
pub retry_attempts: u32,
|
||||||
|
pub http_timeout_secs: u64,
|
||||||
|
pub webhook_received_total: AtomicU64,
|
||||||
|
pub webhook_rate_limited_total: AtomicU64,
|
||||||
|
pub outbound_success_total: AtomicU64,
|
||||||
|
pub outbound_failure_total: AtomicU64,
|
||||||
|
pub poll_attempt_total: AtomicU64,
|
||||||
|
pub poll_success_total: AtomicU64,
|
||||||
|
pub poll_failure_total: AtomicU64,
|
||||||
|
pub poll_http_error_total: AtomicU64,
|
||||||
|
pub poll_request_timeout_total: AtomicU64,
|
||||||
|
pub poll_request_connect_total: AtomicU64,
|
||||||
|
pub poll_request_other_total: AtomicU64,
|
||||||
|
pub poll_parse_failure_total: AtomicU64,
|
||||||
|
pub poll_offline_transition_total: AtomicU64,
|
||||||
|
pub poll_consecutive_failures: AtomicU64,
|
||||||
|
pub poll_last_attempt_timestamp: AtomicU64,
|
||||||
|
pub poll_last_success_timestamp: AtomicU64,
|
||||||
|
pub poll_last_failure_timestamp: AtomicU64,
|
||||||
|
pub poll_last_duration_millis: AtomicU64,
|
||||||
|
pub poll_last_success_duration_millis: AtomicU64,
|
||||||
|
pub poll_last_failure_duration_millis: AtomicU64,
|
||||||
|
pub poll_last_http_status: AtomicU64,
|
||||||
|
pub poll_last_result: AtomicU64,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CacheMetrics {
|
||||||
|
pub fn new(
|
||||||
|
pi_address: String,
|
||||||
|
poll_interval_secs: u64,
|
||||||
|
offline_threshold: u32,
|
||||||
|
retry_attempts: u32,
|
||||||
|
http_timeout_secs: u64,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
process_start_time: unix_now(),
|
||||||
|
pi_address,
|
||||||
|
poll_interval_secs,
|
||||||
|
offline_threshold,
|
||||||
|
retry_attempts,
|
||||||
|
http_timeout_secs,
|
||||||
|
webhook_received_total: AtomicU64::new(0),
|
||||||
|
webhook_rate_limited_total: AtomicU64::new(0),
|
||||||
|
outbound_success_total: AtomicU64::new(0),
|
||||||
|
outbound_failure_total: AtomicU64::new(0),
|
||||||
|
poll_attempt_total: AtomicU64::new(0),
|
||||||
|
poll_success_total: AtomicU64::new(0),
|
||||||
|
poll_failure_total: AtomicU64::new(0),
|
||||||
|
poll_http_error_total: AtomicU64::new(0),
|
||||||
|
poll_request_timeout_total: AtomicU64::new(0),
|
||||||
|
poll_request_connect_total: AtomicU64::new(0),
|
||||||
|
poll_request_other_total: AtomicU64::new(0),
|
||||||
|
poll_parse_failure_total: AtomicU64::new(0),
|
||||||
|
poll_offline_transition_total: AtomicU64::new(0),
|
||||||
|
poll_consecutive_failures: AtomicU64::new(0),
|
||||||
|
poll_last_attempt_timestamp: AtomicU64::new(0),
|
||||||
|
poll_last_success_timestamp: AtomicU64::new(0),
|
||||||
|
poll_last_failure_timestamp: AtomicU64::new(0),
|
||||||
|
poll_last_duration_millis: AtomicU64::new(0),
|
||||||
|
poll_last_success_duration_millis: AtomicU64::new(0),
|
||||||
|
poll_last_failure_duration_millis: AtomicU64::new(0),
|
||||||
|
poll_last_http_status: AtomicU64::new(0),
|
||||||
|
poll_last_result: AtomicU64::new(PollResultKind::Never as u64),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_outbound(&self, delivered: u64, failed: u64) {
|
||||||
|
self.outbound_success_total.fetch_add(delivered, Ordering::Relaxed);
|
||||||
|
self.outbound_failure_total.fetch_add(failed, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn record_poll_attempt(&self, timestamp: u64) {
|
||||||
|
self.poll_attempt_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
self.poll_last_attempt_timestamp.store(timestamp, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn record_poll_success(&self, timestamp: u64, duration_millis: u64, status: u16) {
|
||||||
|
self.poll_success_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
self.poll_last_success_timestamp.store(timestamp, Ordering::Relaxed);
|
||||||
|
self.poll_last_duration_millis.store(duration_millis, Ordering::Relaxed);
|
||||||
|
self.poll_last_success_duration_millis.store(duration_millis, Ordering::Relaxed);
|
||||||
|
self.poll_last_http_status.store(u64::from(status), Ordering::Relaxed);
|
||||||
|
self.poll_last_result.store(PollResultKind::Success as u64, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn record_poll_failure(
|
||||||
|
&self,
|
||||||
|
kind: PollResultKind,
|
||||||
|
timestamp: u64,
|
||||||
|
duration_millis: u64,
|
||||||
|
status: Option<u16>,
|
||||||
|
) {
|
||||||
|
self.poll_failure_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
match kind {
|
||||||
|
PollResultKind::HttpError => {
|
||||||
|
self.poll_http_error_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
PollResultKind::RequestTimeout => {
|
||||||
|
self.poll_request_timeout_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
PollResultKind::RequestConnect => {
|
||||||
|
self.poll_request_connect_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
PollResultKind::RequestOther => {
|
||||||
|
self.poll_request_other_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
PollResultKind::ParseError => {
|
||||||
|
self.poll_parse_failure_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
PollResultKind::Never | PollResultKind::Success => {}
|
||||||
|
}
|
||||||
|
self.poll_last_failure_timestamp.store(timestamp, Ordering::Relaxed);
|
||||||
|
self.poll_last_duration_millis.store(duration_millis, Ordering::Relaxed);
|
||||||
|
self.poll_last_failure_duration_millis.store(duration_millis, Ordering::Relaxed);
|
||||||
|
self.poll_last_http_status.store(status.map(u64::from).unwrap_or(0), Ordering::Relaxed);
|
||||||
|
self.poll_last_result.store(kind as u64, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn unix_now() -> u64 {
|
||||||
|
SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn atomic_value(value: &AtomicU64) -> u64 {
|
||||||
|
value.load(Ordering::Relaxed)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn atomic_seconds_from_millis(value: &AtomicU64) -> f64 {
|
||||||
|
value.load(Ordering::Relaxed) as f64 / 1000.0
|
||||||
|
}
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
|
use std::sync::atomic::Ordering;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
use noisebell_common::{DoorStatus, PiStatusResponse, WebhookPayload};
|
use noisebell_common::{DoorStatus, PiStatusResponse, WebhookPayload};
|
||||||
use tokio::sync::Mutex;
|
use tokio::sync::Mutex;
|
||||||
|
|
@ -7,6 +8,7 @@ use tracing::{error, info, warn};
|
||||||
|
|
||||||
use crate::db;
|
use crate::db;
|
||||||
use crate::db::ApplyStateOutcome;
|
use crate::db::ApplyStateOutcome;
|
||||||
|
use crate::metrics::{CacheMetrics, PollResultKind};
|
||||||
use crate::types::WebhookTarget;
|
use crate::types::WebhookTarget;
|
||||||
use crate::webhook;
|
use crate::webhook;
|
||||||
|
|
||||||
|
|
@ -18,12 +20,81 @@ pub struct PollerConfig {
|
||||||
pub retry_attempts: u32,
|
pub retry_attempts: u32,
|
||||||
pub retry_base_delay_secs: u64,
|
pub retry_base_delay_secs: u64,
|
||||||
pub webhooks: Vec<WebhookTarget>,
|
pub webhooks: Vec<WebhookTarget>,
|
||||||
|
pub metrics: Arc<CacheMetrics>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn unix_now() -> u64 {
|
fn unix_now() -> u64 {
|
||||||
std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).unwrap().as_secs()
|
std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).unwrap().as_secs()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn duration_millis(started_at: Instant) -> u64 {
|
||||||
|
let millis = started_at.elapsed().as_millis();
|
||||||
|
millis.try_into().unwrap_or(u64::MAX)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn classify_request_error(error: &reqwest::Error) -> PollResultKind {
|
||||||
|
if error.is_timeout() {
|
||||||
|
PollResultKind::RequestTimeout
|
||||||
|
} else if error.is_connect() {
|
||||||
|
PollResultKind::RequestConnect
|
||||||
|
} else {
|
||||||
|
PollResultKind::RequestOther
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn should_log_poll_failure(
|
||||||
|
consecutive_failures: u32,
|
||||||
|
kind: PollResultKind,
|
||||||
|
status_code: Option<u16>,
|
||||||
|
last_failure_log: &mut Option<(PollResultKind, Option<u16>)>,
|
||||||
|
) -> bool {
|
||||||
|
let key = (kind, status_code);
|
||||||
|
let changed = match *last_failure_log {
|
||||||
|
Some(previous) => previous != key,
|
||||||
|
None => true,
|
||||||
|
};
|
||||||
|
*last_failure_log = Some(key);
|
||||||
|
consecutive_failures == 1 || changed
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn mark_pi_offline(
|
||||||
|
config: &PollerConfig,
|
||||||
|
database: Arc<Mutex<rusqlite::Connection>>,
|
||||||
|
client: &reqwest::Client,
|
||||||
|
now: u64,
|
||||||
|
consecutive_failures: u32,
|
||||||
|
) {
|
||||||
|
let marked = tokio::task::spawn_blocking(move || {
|
||||||
|
let conn = database.blocking_lock();
|
||||||
|
db::mark_offline(&conn, now)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.expect("db task panicked");
|
||||||
|
|
||||||
|
match marked {
|
||||||
|
Ok(()) => {
|
||||||
|
config.metrics.poll_offline_transition_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
warn!(
|
||||||
|
consecutive_failures,
|
||||||
|
threshold = config.offline_threshold,
|
||||||
|
"Pi marked offline after poll failures"
|
||||||
|
);
|
||||||
|
let summary = webhook::forward(
|
||||||
|
client,
|
||||||
|
&config.webhooks,
|
||||||
|
&WebhookPayload { status: DoorStatus::Offline, timestamp: now },
|
||||||
|
config.retry_attempts,
|
||||||
|
config.retry_base_delay_secs,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
config.metrics.add_outbound(summary.delivered, summary.failed);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!(error = %e, "failed to mark Pi offline");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn spawn_status_poller(
|
pub fn spawn_status_poller(
|
||||||
config: Arc<PollerConfig>,
|
config: Arc<PollerConfig>,
|
||||||
db: Arc<Mutex<rusqlite::Connection>>,
|
db: Arc<Mutex<rusqlite::Connection>>,
|
||||||
|
|
@ -32,10 +103,15 @@ pub fn spawn_status_poller(
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
let mut consecutive_failures: u32 = 0;
|
let mut consecutive_failures: u32 = 0;
|
||||||
let mut was_offline = false;
|
let mut was_offline = false;
|
||||||
|
let mut last_failure_log: Option<(PollResultKind, Option<u16>)> = None;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
|
let poll_started_at = Instant::now();
|
||||||
|
let poll_started_timestamp = unix_now();
|
||||||
|
config.metrics.record_poll_attempt(poll_started_timestamp);
|
||||||
|
|
||||||
{
|
{
|
||||||
let now = unix_now();
|
let now = poll_started_timestamp;
|
||||||
let db = db.clone();
|
let db = db.clone();
|
||||||
let _ = tokio::task::spawn_blocking(move || {
|
let _ = tokio::task::spawn_blocking(move || {
|
||||||
let conn = db.blocking_lock();
|
let conn = db.blocking_lock();
|
||||||
|
|
@ -53,16 +129,77 @@ pub fn spawn_status_poller(
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
match result {
|
match result {
|
||||||
Ok(resp) if resp.status().is_success() => {
|
Ok(resp) => {
|
||||||
consecutive_failures = 0;
|
let status_code = resp.status().as_u16();
|
||||||
if was_offline {
|
if !resp.status().is_success() {
|
||||||
info!("Pi is back online");
|
consecutive_failures += 1;
|
||||||
was_offline = false;
|
let kind = PollResultKind::HttpError;
|
||||||
|
config.metrics.record_poll_failure(
|
||||||
|
kind,
|
||||||
|
unix_now(),
|
||||||
|
duration_millis(poll_started_at),
|
||||||
|
Some(status_code),
|
||||||
|
);
|
||||||
|
config
|
||||||
|
.metrics
|
||||||
|
.poll_consecutive_failures
|
||||||
|
.store(consecutive_failures.into(), Ordering::Relaxed);
|
||||||
|
|
||||||
|
if should_log_poll_failure(
|
||||||
|
consecutive_failures,
|
||||||
|
kind,
|
||||||
|
Some(status_code),
|
||||||
|
&mut last_failure_log,
|
||||||
|
) {
|
||||||
|
warn!(
|
||||||
|
kind = kind.as_str(),
|
||||||
|
http_status = status_code,
|
||||||
|
consecutive_failures,
|
||||||
|
"Pi status poll failed"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if consecutive_failures >= config.offline_threshold && !was_offline {
|
||||||
|
was_offline = true;
|
||||||
|
let now = unix_now();
|
||||||
|
mark_pi_offline(
|
||||||
|
&config,
|
||||||
|
db.clone(),
|
||||||
|
&client,
|
||||||
|
now,
|
||||||
|
consecutive_failures,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
|
||||||
|
tokio::time::sleep(config.status_poll_interval).await;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
let now = unix_now();
|
let now = unix_now();
|
||||||
match resp.json::<PiStatusResponse>().await {
|
match resp.json::<PiStatusResponse>().await {
|
||||||
Ok(body) => {
|
Ok(body) => {
|
||||||
|
if was_offline {
|
||||||
|
info!(
|
||||||
|
previous_consecutive_failures = consecutive_failures,
|
||||||
|
"Pi is back online"
|
||||||
|
);
|
||||||
|
was_offline = false;
|
||||||
|
} else if consecutive_failures > 0 {
|
||||||
|
info!(
|
||||||
|
previous_consecutive_failures = consecutive_failures,
|
||||||
|
"Pi status poll recovered"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
consecutive_failures = 0;
|
||||||
|
last_failure_log = None;
|
||||||
|
config.metrics.poll_consecutive_failures.store(0, Ordering::Relaxed);
|
||||||
|
|
||||||
|
config.metrics.record_poll_success(
|
||||||
|
now,
|
||||||
|
duration_millis(poll_started_at),
|
||||||
|
status_code,
|
||||||
|
);
|
||||||
let status = body.status;
|
let status = body.status;
|
||||||
let event_timestamp = body.timestamp;
|
let event_timestamp = body.timestamp;
|
||||||
|
|
||||||
|
|
@ -102,7 +239,7 @@ pub fn spawn_status_poller(
|
||||||
timestamp,
|
timestamp,
|
||||||
"state updated from poll"
|
"state updated from poll"
|
||||||
);
|
);
|
||||||
webhook::forward(
|
let summary = webhook::forward(
|
||||||
&client,
|
&client,
|
||||||
&config.webhooks,
|
&config.webhooks,
|
||||||
&WebhookPayload { status, timestamp },
|
&WebhookPayload { status, timestamp },
|
||||||
|
|
@ -110,14 +247,11 @@ pub fn spawn_status_poller(
|
||||||
config.retry_base_delay_secs,
|
config.retry_base_delay_secs,
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
config
|
||||||
|
.metrics
|
||||||
|
.add_outbound(summary.delivered, summary.failed);
|
||||||
}
|
}
|
||||||
ApplyStateOutcome::Duplicate => {
|
ApplyStateOutcome::Duplicate => {}
|
||||||
info!(
|
|
||||||
status = %status,
|
|
||||||
timestamp,
|
|
||||||
"duplicate poll state ignored"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
ApplyStateOutcome::Stale => {
|
ApplyStateOutcome::Stale => {
|
||||||
warn!(
|
warn!(
|
||||||
status = %status,
|
status = %status,
|
||||||
|
|
@ -129,52 +263,80 @@ pub fn spawn_status_poller(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!(error = %e, "failed to parse status poll response");
|
consecutive_failures += 1;
|
||||||
|
let kind = PollResultKind::ParseError;
|
||||||
|
config.metrics.record_poll_failure(
|
||||||
|
kind,
|
||||||
|
unix_now(),
|
||||||
|
duration_millis(poll_started_at),
|
||||||
|
Some(status_code),
|
||||||
|
);
|
||||||
|
config
|
||||||
|
.metrics
|
||||||
|
.poll_consecutive_failures
|
||||||
|
.store(consecutive_failures.into(), Ordering::Relaxed);
|
||||||
|
if should_log_poll_failure(
|
||||||
|
consecutive_failures,
|
||||||
|
kind,
|
||||||
|
Some(status_code),
|
||||||
|
&mut last_failure_log,
|
||||||
|
) {
|
||||||
|
error!(
|
||||||
|
error = %e,
|
||||||
|
kind = kind.as_str(),
|
||||||
|
http_status = status_code,
|
||||||
|
consecutive_failures,
|
||||||
|
"failed to parse Pi status poll response"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if consecutive_failures >= config.offline_threshold && !was_offline {
|
||||||
|
was_offline = true;
|
||||||
|
let now = unix_now();
|
||||||
|
mark_pi_offline(
|
||||||
|
&config,
|
||||||
|
db.clone(),
|
||||||
|
&client,
|
||||||
|
now,
|
||||||
|
consecutive_failures,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_ => {
|
Err(e) => {
|
||||||
consecutive_failures += 1;
|
consecutive_failures += 1;
|
||||||
let err_msg = match &result {
|
let kind = classify_request_error(&e);
|
||||||
Ok(resp) => format!("HTTP {}", resp.status()),
|
config.metrics.record_poll_failure(
|
||||||
Err(e) => e.to_string(),
|
kind,
|
||||||
};
|
unix_now(),
|
||||||
warn!(
|
duration_millis(poll_started_at),
|
||||||
error = %err_msg,
|
None,
|
||||||
consecutive_failures,
|
|
||||||
"status poll failed"
|
|
||||||
);
|
);
|
||||||
|
config
|
||||||
|
.metrics
|
||||||
|
.poll_consecutive_failures
|
||||||
|
.store(consecutive_failures.into(), Ordering::Relaxed);
|
||||||
|
if should_log_poll_failure(
|
||||||
|
consecutive_failures,
|
||||||
|
kind,
|
||||||
|
None,
|
||||||
|
&mut last_failure_log,
|
||||||
|
) {
|
||||||
|
warn!(
|
||||||
|
error = %e,
|
||||||
|
kind = kind.as_str(),
|
||||||
|
consecutive_failures,
|
||||||
|
"Pi status poll failed"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
if consecutive_failures >= config.offline_threshold && !was_offline {
|
if consecutive_failures >= config.offline_threshold && !was_offline {
|
||||||
was_offline = true;
|
was_offline = true;
|
||||||
let now = unix_now();
|
let now = unix_now();
|
||||||
let db = db.clone();
|
mark_pi_offline(&config, db.clone(), &client, now, consecutive_failures)
|
||||||
let marked = tokio::task::spawn_blocking(move || {
|
.await;
|
||||||
let conn = db.blocking_lock();
|
|
||||||
db::mark_offline(&conn, now)
|
|
||||||
})
|
|
||||||
.await
|
|
||||||
.expect("db task panicked");
|
|
||||||
|
|
||||||
match marked {
|
|
||||||
Ok(()) => {
|
|
||||||
info!(
|
|
||||||
"Pi marked offline after {} consecutive failures",
|
|
||||||
consecutive_failures
|
|
||||||
);
|
|
||||||
webhook::forward(
|
|
||||||
&client,
|
|
||||||
&config.webhooks,
|
|
||||||
&WebhookPayload { status: DoorStatus::Offline, timestamp: now },
|
|
||||||
config.retry_attempts,
|
|
||||||
config.retry_base_delay_secs,
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
error!(error = %e, "failed to mark Pi offline");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,17 +1,28 @@
|
||||||
use std::time::Duration;
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
use noisebell_common::WebhookPayload;
|
use noisebell_common::WebhookPayload;
|
||||||
use tracing::{error, info, warn};
|
use tracing::{error, info, warn};
|
||||||
|
|
||||||
use crate::types::{WebhookAuth, WebhookTarget};
|
use crate::types::{WebhookAuth, WebhookTarget};
|
||||||
|
|
||||||
|
#[derive(Debug, Default)]
|
||||||
|
pub struct ForwardSummary {
|
||||||
|
pub delivered: u64,
|
||||||
|
pub failed: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn duration_millis(started_at: Instant) -> u64 {
|
||||||
|
let millis = started_at.elapsed().as_millis();
|
||||||
|
millis.try_into().unwrap_or(u64::MAX)
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn forward(
|
pub async fn forward(
|
||||||
client: &reqwest::Client,
|
client: &reqwest::Client,
|
||||||
targets: &[WebhookTarget],
|
targets: &[WebhookTarget],
|
||||||
payload: &WebhookPayload,
|
payload: &WebhookPayload,
|
||||||
retry_attempts: u32,
|
retry_attempts: u32,
|
||||||
retry_base_delay_secs: u64,
|
retry_base_delay_secs: u64,
|
||||||
) {
|
) -> ForwardSummary {
|
||||||
let mut set = tokio::task::JoinSet::new();
|
let mut set = tokio::task::JoinSet::new();
|
||||||
|
|
||||||
for target in targets {
|
for target in targets {
|
||||||
|
|
@ -21,9 +32,8 @@ pub async fn forward(
|
||||||
let auth = target.auth.clone();
|
let auth = target.auth.clone();
|
||||||
|
|
||||||
set.spawn(async move {
|
set.spawn(async move {
|
||||||
info!(url = %url, status = %payload.status, "forwarding to outbound webhook");
|
|
||||||
|
|
||||||
for attempt in 0..=retry_attempts {
|
for attempt in 0..=retry_attempts {
|
||||||
|
let attempt_started_at = Instant::now();
|
||||||
let mut req = client.post(&url).json(&payload);
|
let mut req = client.post(&url).json(&payload);
|
||||||
if let WebhookAuth::Bearer(secret) = &auth {
|
if let WebhookAuth::Bearer(secret) = &auth {
|
||||||
req = req.bearer_auth(secret);
|
req = req.bearer_auth(secret);
|
||||||
|
|
@ -31,30 +41,75 @@ pub async fn forward(
|
||||||
|
|
||||||
match req.send().await {
|
match req.send().await {
|
||||||
Ok(resp) if resp.status().is_success() => {
|
Ok(resp) if resp.status().is_success() => {
|
||||||
info!(url = %url, "outbound webhook delivered");
|
info!(
|
||||||
return;
|
url = %url,
|
||||||
|
status = %payload.status,
|
||||||
|
timestamp = payload.timestamp,
|
||||||
|
http_status = resp.status().as_u16(),
|
||||||
|
duration_ms = duration_millis(attempt_started_at),
|
||||||
|
attempts = attempt + 1,
|
||||||
|
"outbound webhook delivered"
|
||||||
|
);
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
result => {
|
result => {
|
||||||
let err_msg = match &result {
|
let err_msg = match &result {
|
||||||
Ok(resp) => format!("HTTP {}", resp.status()),
|
Ok(resp) => format!("HTTP {}", resp.status()),
|
||||||
Err(e) => e.to_string(),
|
Err(e) => e.to_string(),
|
||||||
};
|
};
|
||||||
|
let http_status = result.as_ref().ok().map(|resp| resp.status().as_u16());
|
||||||
|
let kind =
|
||||||
|
if http_status.is_some() { "http_error" } else { "request_error" };
|
||||||
|
let duration_ms = duration_millis(attempt_started_at);
|
||||||
if attempt == retry_attempts {
|
if attempt == retry_attempts {
|
||||||
error!(url = %url, error = %err_msg, "outbound webhook failed after {} attempts", retry_attempts + 1);
|
error!(
|
||||||
|
url = %url,
|
||||||
|
status = %payload.status,
|
||||||
|
timestamp = payload.timestamp,
|
||||||
|
error = %err_msg,
|
||||||
|
kind,
|
||||||
|
http_status = http_status.unwrap_or(0),
|
||||||
|
duration_ms,
|
||||||
|
attempts = retry_attempts + 1,
|
||||||
|
"outbound webhook failed after retries"
|
||||||
|
);
|
||||||
} else {
|
} else {
|
||||||
let delay = Duration::from_secs(retry_base_delay_secs * 2u64.pow(attempt));
|
let delay =
|
||||||
warn!(url = %url, error = %err_msg, attempt = attempt + 1, "outbound webhook failed, retrying in {:?}", delay);
|
Duration::from_secs(retry_base_delay_secs * 2u64.pow(attempt));
|
||||||
|
warn!(
|
||||||
|
url = %url,
|
||||||
|
status = %payload.status,
|
||||||
|
timestamp = payload.timestamp,
|
||||||
|
error = %err_msg,
|
||||||
|
kind,
|
||||||
|
http_status = http_status.unwrap_or(0),
|
||||||
|
duration_ms,
|
||||||
|
attempt = attempt + 1,
|
||||||
|
total_attempts = retry_attempts + 1,
|
||||||
|
delay_seconds = delay.as_secs(),
|
||||||
|
"outbound webhook failed, retrying"
|
||||||
|
);
|
||||||
tokio::time::sleep(delay).await;
|
tokio::time::sleep(delay).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
false
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let mut summary = ForwardSummary::default();
|
||||||
while let Some(result) = set.join_next().await {
|
while let Some(result) = set.join_next().await {
|
||||||
if let Err(e) = result {
|
match result {
|
||||||
error!(error = %e, "webhook task panicked");
|
Ok(true) => summary.delivered += 1,
|
||||||
|
Ok(false) => summary.failed += 1,
|
||||||
|
Err(e) => {
|
||||||
|
summary.failed += 1;
|
||||||
|
error!(error = %e, "webhook task panicked");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
summary
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,5 @@ serde = { version = "1.0", features = ["derive"] }
|
||||||
serde_json = "1.0"
|
serde_json = "1.0"
|
||||||
serenity = { version = "0.12", default-features = false, features = ["client", "gateway", "model", "rustls_backend"] }
|
serenity = { version = "0.12", default-features = false, features = ["client", "gateway", "model", "rustls_backend"] }
|
||||||
tokio = { version = "1", features = ["rt-multi-thread", "macros", "net", "sync", "signal", "time"] }
|
tokio = { version = "1", features = ["rt-multi-thread", "macros", "net", "sync", "signal", "time"] }
|
||||||
tower-http = { version = "0.6", features = ["trace"] }
|
|
||||||
tracing = "0.1"
|
tracing = "0.1"
|
||||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||||
|
|
|
||||||
|
|
@ -12,8 +12,7 @@ use serenity::all::{
|
||||||
CreateInteractionResponseMessage, CreateMessage, GatewayIntents, Interaction,
|
CreateInteractionResponseMessage, CreateMessage, GatewayIntents, Interaction,
|
||||||
};
|
};
|
||||||
use serenity::async_trait;
|
use serenity::async_trait;
|
||||||
use tower_http::trace::TraceLayer;
|
use tracing::{error, info, warn};
|
||||||
use tracing::{error, info, warn, Level};
|
|
||||||
|
|
||||||
struct AppState {
|
struct AppState {
|
||||||
http: Arc<serenity::all::Http>,
|
http: Arc<serenity::all::Http>,
|
||||||
|
|
@ -66,6 +65,11 @@ async fn post_webhook(
|
||||||
Json(body): Json<WebhookPayload>,
|
Json(body): Json<WebhookPayload>,
|
||||||
) -> StatusCode {
|
) -> StatusCode {
|
||||||
if !validate_bearer(&headers, &state.webhook_secret) {
|
if !validate_bearer(&headers, &state.webhook_secret) {
|
||||||
|
warn!(
|
||||||
|
status = %body.status,
|
||||||
|
timestamp = body.timestamp,
|
||||||
|
"unauthorized Discord webhook rejected"
|
||||||
|
);
|
||||||
return StatusCode::UNAUTHORIZED;
|
return StatusCode::UNAUTHORIZED;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -226,11 +230,6 @@ async fn main() -> Result<()> {
|
||||||
let app = Router::new()
|
let app = Router::new()
|
||||||
.route("/health", get(|| async { StatusCode::OK }))
|
.route("/health", get(|| async { StatusCode::OK }))
|
||||||
.route("/webhook", post(post_webhook))
|
.route("/webhook", post(post_webhook))
|
||||||
.layer(
|
|
||||||
TraceLayer::new_for_http()
|
|
||||||
.make_span_with(tower_http::trace::DefaultMakeSpan::new().level(Level::INFO))
|
|
||||||
.on_response(tower_http::trace::DefaultOnResponse::new().level(Level::INFO)),
|
|
||||||
)
|
|
||||||
.with_state(app_state);
|
.with_state(app_state);
|
||||||
|
|
||||||
let listener = tokio::net::TcpListener::bind(("0.0.0.0", port))
|
let listener = tokio::net::TcpListener::bind(("0.0.0.0", port))
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,21 @@ pub fn validate_bearer(headers: &HeaderMap, expected: &str) -> bool {
|
||||||
.unwrap_or(false)
|
.unwrap_or(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn prometheus_escape_label_value(value: &str) -> String {
|
||||||
|
let mut escaped = String::with_capacity(value.len());
|
||||||
|
for ch in value.chars() {
|
||||||
|
match ch {
|
||||||
|
'\\' => escaped.push_str("\\\\"),
|
||||||
|
'"' => escaped.push_str("\\\""),
|
||||||
|
'\n' => escaped.push_str("\\n"),
|
||||||
|
_ => escaped.push(ch),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
escaped
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const PROMETHEUS_CONTENT_TYPE: &str = "text/plain; version=0.0.4; charset=utf-8";
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
#[serde(rename_all = "lowercase")]
|
#[serde(rename_all = "lowercase")]
|
||||||
pub enum DoorStatus {
|
pub enum DoorStatus {
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,5 @@ chrono = "0.4"
|
||||||
noisebell-common = { path = "../noisebell-common" }
|
noisebell-common = { path = "../noisebell-common" }
|
||||||
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
|
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
|
||||||
tokio = { version = "1", features = ["rt-multi-thread", "macros", "net", "signal", "time"] }
|
tokio = { version = "1", features = ["rt-multi-thread", "macros", "net", "signal", "time"] }
|
||||||
tower-http = { version = "0.6", features = ["trace"] }
|
|
||||||
tracing = "0.1"
|
tracing = "0.1"
|
||||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||||
|
|
|
||||||
|
|
@ -9,8 +9,7 @@ use axum::routing::get;
|
||||||
use axum::{Json, Router};
|
use axum::{Json, Router};
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use noisebell_common::{CacheStatusResponse, DoorStatus};
|
use noisebell_common::{CacheStatusResponse, DoorStatus};
|
||||||
use tower_http::trace::TraceLayer;
|
use tracing::{error, info};
|
||||||
use tracing::{error, info, Level};
|
|
||||||
|
|
||||||
const FEED_TTL_MINUTES: u32 = 1;
|
const FEED_TTL_MINUTES: u32 = 1;
|
||||||
const README_URL: &str =
|
const README_URL: &str =
|
||||||
|
|
@ -596,11 +595,6 @@ async fn main() -> Result<()> {
|
||||||
.route("/open/", get(get_open_rss))
|
.route("/open/", get(get_open_rss))
|
||||||
.route("/open/rss.xml", get(get_open_rss))
|
.route("/open/rss.xml", get(get_open_rss))
|
||||||
.route("/open/atom.xml", get(get_open_atom))
|
.route("/open/atom.xml", get(get_open_atom))
|
||||||
.layer(
|
|
||||||
TraceLayer::new_for_http()
|
|
||||||
.make_span_with(tower_http::trace::DefaultMakeSpan::new().level(Level::INFO))
|
|
||||||
.on_response(tower_http::trace::DefaultOnResponse::new().level(Level::INFO)),
|
|
||||||
)
|
|
||||||
.with_state(app_state);
|
.with_state(app_state);
|
||||||
|
|
||||||
let listener = tokio::net::TcpListener::bind(("0.0.0.0", port))
|
let listener = tokio::net::TcpListener::bind(("0.0.0.0", port))
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,5 @@ noisebell-common = { path = "../noisebell-common" }
|
||||||
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
|
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
|
||||||
serde = { version = "1.0", features = ["derive"] }
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
tokio = { version = "1", features = ["rt-multi-thread", "macros", "net", "signal"] }
|
tokio = { version = "1", features = ["rt-multi-thread", "macros", "net", "signal"] }
|
||||||
tower-http = { version = "0.6", features = ["trace"] }
|
|
||||||
tracing = "0.1"
|
tracing = "0.1"
|
||||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,7 @@ use axum::routing::{get, post};
|
||||||
use axum::{Json, Router};
|
use axum::{Json, Router};
|
||||||
use noisebell_common::{validate_bearer, DoorStatus, WebhookPayload};
|
use noisebell_common::{validate_bearer, DoorStatus, WebhookPayload};
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use tower_http::trace::TraceLayer;
|
use tracing::{error, info, warn};
|
||||||
use tracing::{error, info, Level};
|
|
||||||
|
|
||||||
struct AppState {
|
struct AppState {
|
||||||
client: reqwest::Client,
|
client: reqwest::Client,
|
||||||
|
|
@ -54,6 +53,11 @@ async fn post_webhook(
|
||||||
Json(body): Json<WebhookPayload>,
|
Json(body): Json<WebhookPayload>,
|
||||||
) -> StatusCode {
|
) -> StatusCode {
|
||||||
if !validate_bearer(&headers, &state.webhook_secret) {
|
if !validate_bearer(&headers, &state.webhook_secret) {
|
||||||
|
warn!(
|
||||||
|
status = %body.status,
|
||||||
|
timestamp = body.timestamp,
|
||||||
|
"unauthorized Zulip webhook rejected"
|
||||||
|
);
|
||||||
return StatusCode::UNAUTHORIZED;
|
return StatusCode::UNAUTHORIZED;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -131,11 +135,6 @@ async fn main() -> Result<()> {
|
||||||
let app = Router::new()
|
let app = Router::new()
|
||||||
.route("/health", get(|| async { StatusCode::OK }))
|
.route("/health", get(|| async { StatusCode::OK }))
|
||||||
.route("/webhook", post(post_webhook))
|
.route("/webhook", post(post_webhook))
|
||||||
.layer(
|
|
||||||
TraceLayer::new_for_http()
|
|
||||||
.make_span_with(tower_http::trace::DefaultMakeSpan::new().level(Level::INFO))
|
|
||||||
.on_response(tower_http::trace::DefaultOnResponse::new().level(Level::INFO)),
|
|
||||||
)
|
|
||||||
.with_state(Arc::new(AppState {
|
.with_state(Arc::new(AppState {
|
||||||
client,
|
client,
|
||||||
webhook_secret,
|
webhook_secret,
|
||||||
|
|
|
||||||
|
|
@ -70,7 +70,7 @@ ssh "${SSH_OPTS[@]}" "$TARGET_HOST" "DEPLOY_HOSTNAME='$DEPLOY_HOSTNAME' HOME_ASS
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install -y curl rsync avahi-daemon
|
sudo apt-get install -y curl jq rsync avahi-daemon prometheus-node-exporter
|
||||||
|
|
||||||
sudo hostnamectl set-hostname "$DEPLOY_HOSTNAME"
|
sudo hostnamectl set-hostname "$DEPLOY_HOSTNAME"
|
||||||
sudo tee /etc/hostname >/dev/null <<<"$DEPLOY_HOSTNAME"
|
sudo tee /etc/hostname >/dev/null <<<"$DEPLOY_HOSTNAME"
|
||||||
|
|
@ -86,7 +86,16 @@ HOSTSEOF
|
||||||
if ! command -v tailscale >/dev/null 2>&1; then
|
if ! command -v tailscale >/dev/null 2>&1; then
|
||||||
curl -fsSL https://tailscale.com/install.sh | sh
|
curl -fsSL https://tailscale.com/install.sh | sh
|
||||||
fi
|
fi
|
||||||
sudo systemctl enable --now ssh avahi-daemon tailscaled
|
sudo mkdir -p /etc/systemd/journald.conf.d /var/log/journal
|
||||||
|
sudo tee /etc/systemd/journald.conf.d/noisebell-persistent.conf >/dev/null <<'JOURNALCONF'
|
||||||
|
[Journal]
|
||||||
|
Storage=persistent
|
||||||
|
SystemMaxUse=200M
|
||||||
|
MaxRetentionSec=30day
|
||||||
|
JOURNALCONF
|
||||||
|
sudo systemctl restart systemd-journald
|
||||||
|
|
||||||
|
sudo systemctl enable --now ssh avahi-daemon tailscaled prometheus-node-exporter
|
||||||
|
|
||||||
sudo install -m 755 "$REMOTE_TMP_DIR/noisebell" "$REMOTE_RELEASE_DIR/noisebell"
|
sudo install -m 755 "$REMOTE_TMP_DIR/noisebell" "$REMOTE_RELEASE_DIR/noisebell"
|
||||||
sudo install -m 755 "$REMOTE_TMP_DIR/noisebell-relay" "$REMOTE_RELEASE_DIR/noisebell-relay"
|
sudo install -m 755 "$REMOTE_TMP_DIR/noisebell-relay" "$REMOTE_RELEASE_DIR/noisebell-relay"
|
||||||
|
|
@ -159,12 +168,95 @@ RestartSec=5
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
UNITEOF
|
UNITEOF
|
||||||
|
|
||||||
|
sudo tee /usr/local/bin/noisebell-loki-journal >/dev/null <<'SCRIPTEOF'
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
set -uo pipefail
|
||||||
|
|
||||||
|
LOKI_URL=${LOKI_URL:-http://noisebell-do:3100/loki/api/v1/push}
|
||||||
|
HOST_LABEL=${HOST_LABEL:-$(hostname)}
|
||||||
|
CURSOR_DIR=/var/lib/noisebell-loki-journal
|
||||||
|
CURSOR_FILE=$CURSOR_DIR/cursor
|
||||||
|
|
||||||
|
mkdir -p "$CURSOR_DIR"
|
||||||
|
|
||||||
|
while true; do
|
||||||
|
args=(--output=json --no-pager --lines=100)
|
||||||
|
if [ -s "$CURSOR_FILE" ]; then
|
||||||
|
args+=(--after-cursor="$(cat "$CURSOR_FILE")")
|
||||||
|
else
|
||||||
|
args+=(--since=-5min)
|
||||||
|
fi
|
||||||
|
|
||||||
|
saw_entry=0
|
||||||
|
hit_error=0
|
||||||
|
|
||||||
|
while IFS= read -r entry; do
|
||||||
|
saw_entry=1
|
||||||
|
cursor=$(jq -r '.__CURSOR // empty' <<<"$entry")
|
||||||
|
timestamp=$(jq -r '.__REALTIME_TIMESTAMP // empty' <<<"$entry")
|
||||||
|
if [ -n "$timestamp" ] && [ "$timestamp" != "null" ]; then
|
||||||
|
timestamp="${timestamp}000"
|
||||||
|
else
|
||||||
|
timestamp=$(date +%s%N)
|
||||||
|
fi
|
||||||
|
|
||||||
|
unit=$(jq -r '._SYSTEMD_UNIT // .SYSLOG_IDENTIFIER // "journal"' <<<"$entry")
|
||||||
|
message=$(jq -r '.MESSAGE // .' <<<"$entry")
|
||||||
|
|
||||||
|
payload=$(jq -cn \
|
||||||
|
--arg host "$HOST_LABEL" \
|
||||||
|
--arg unit "$unit" \
|
||||||
|
--arg ts "$timestamp" \
|
||||||
|
--arg line "$message" \
|
||||||
|
'{streams:[{stream:{job:"journal",host:$host,unit:$unit},values:[[$ts,$line]]}]}')
|
||||||
|
|
||||||
|
if curl -fsS --max-time 5 \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-X POST \
|
||||||
|
--data "$payload" \
|
||||||
|
"$LOKI_URL" >/dev/null 2>&1; then
|
||||||
|
if [ -n "$cursor" ]; then
|
||||||
|
printf '%s\n' "$cursor" > "$CURSOR_FILE"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
hit_error=1
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done < <(journalctl "${args[@]}" 2>/dev/null)
|
||||||
|
|
||||||
|
if [ "$hit_error" -eq 1 ] || [ "$saw_entry" -eq 0 ]; then
|
||||||
|
sleep 5
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
SCRIPTEOF
|
||||||
|
sudo chmod 755 /usr/local/bin/noisebell-loki-journal
|
||||||
|
|
||||||
|
sudo tee /etc/systemd/system/noisebell-loki-journal.service >/dev/null <<'UNITEOF'
|
||||||
|
[Unit]
|
||||||
|
Description=Noisebell journal shipper to Loki
|
||||||
|
After=network-online.target tailscaled.service
|
||||||
|
Wants=network-online.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
Environment=LOKI_URL=http://noisebell-do:3100/loki/api/v1/push
|
||||||
|
Environment=HOST_LABEL=noisebell-pi
|
||||||
|
ExecStart=/usr/local/bin/noisebell-loki-journal
|
||||||
|
Restart=always
|
||||||
|
RestartSec=5
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
UNITEOF
|
||||||
|
|
||||||
sudo ln -sfn "$REMOTE_RELEASE_DIR" "$REMOTE_CURRENT_LINK"
|
sudo ln -sfn "$REMOTE_RELEASE_DIR" "$REMOTE_CURRENT_LINK"
|
||||||
sudo systemctl daemon-reload
|
sudo systemctl daemon-reload
|
||||||
sudo systemctl enable noisebell.service
|
sudo systemctl enable noisebell.service
|
||||||
sudo systemctl enable noisebell-relay.service
|
sudo systemctl enable noisebell-relay.service
|
||||||
|
sudo systemctl enable noisebell-loki-journal.service
|
||||||
sudo systemctl restart noisebell.service
|
sudo systemctl restart noisebell.service
|
||||||
sudo systemctl restart noisebell-relay.service
|
sudo systemctl restart noisebell-relay.service
|
||||||
|
sudo systemctl restart noisebell-loki-journal.service
|
||||||
sudo systemctl restart avahi-daemon
|
sudo systemctl restart avahi-daemon
|
||||||
|
|
||||||
sudo tailscale up --auth-key="$(sudo cat /etc/noisebell/tailscale-auth-key)" --hostname=noisebell-pi || true
|
sudo tailscale up --auth-key="$(sudo cat /etc/noisebell/tailscale-auth-key)" --hostname=noisebell-pi || true
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue