# noisebell/hosts/noisebell-do/observability.nix
#
# 902 lines
# 26 KiB
# Nix

{ lib, pkgs, ... }:
let
# TCP ports used by the observability services configured in this module.
prometheusPort = 9090;
nodeExporterPort = 9100;
blackboxExporterPort = 9115;
lokiPort = 3100;
grafanaPort = 3030;

# Public hostname Grafana is told it is served under (see `root_url`).
grafanaDomain = "grafana-noisebell.extremist.software";

# Fixed identifier used as BOTH the uid and the access token of the public
# dashboard share, so the share is the same after every deploy.
# The value is the hex encoding of the ASCII text "noisebellpublic0".
publicDashboardToken = "6e6f69736562656c6c7075626c696330";
# Script that makes sure the provisioned "noisebell-public" dashboard has a
# public share whose uid and access token both equal `publicDashboardToken`.
#
# Steps:
#   1. wait (up to 60 tries, 1s apart) for Grafana's health endpoint,
#   2. wait (same budget) for the dashboard to exist,
#   3. look up the current share, if any,
#   4. if a share exists with the expected uid and token, PATCH its settings;
#      otherwise delete any mismatching share and POST a fresh one.
#
# The admin password is read from /var/lib/grafana/admin_password.
sharePublicDashboard = pkgs.writeShellApplication {
  name = "noisebell-grafana-share-public-dashboard";
  runtimeInputs = with pkgs; [
    coreutils
    curl
    jq
  ];
  text = ''
    set -euo pipefail

    grafana_url=http://127.0.0.1:${toString grafanaPort}
    dash_uid=noisebell-public
    wanted_uid=${publicDashboardToken}
    wanted_token=${publicDashboardToken}
    admin_pw=$(tr -d '\r\n' < /var/lib/grafana/admin_password)
    share_api="$grafana_url/api/dashboards/uid/$dash_uid/public-dashboards"

    # Authenticated request that fails on HTTP errors; all arguments are
    # handed to curl unchanged.
    grafana_curl() {
      curl -fsS -u "admin:$admin_pw" "$@"
    }

    # Request the given URL once per second, at most 60 times; succeed as
    # soon as one request succeeds.
    wait_for() {
      for _ in $(seq 1 60); do
        if grafana_curl "$1" >/dev/null; then
          return 0
        fi
        sleep 1
      done
      return 1
    }

    if ! wait_for "$grafana_url/api/health"; then
      echo "Grafana did not become ready at $grafana_url" >&2
      exit 1
    fi

    if ! wait_for "$grafana_url/api/dashboards/uid/$dash_uid"; then
      echo "Grafana dashboard '$dash_uid' was not provisioned" >&2
      exit 1
    fi

    # A failed lookup is treated the same as "no share yet".
    current=$(grafana_curl "$share_api/" 2>/dev/null || true)
    current_uid=""
    current_token=""
    if [ -n "$current" ]; then
      current_uid=$(jq -r '.uid // empty' <<<"$current")
      current_token=$(jq -r '.accessToken // empty' <<<"$current")
    fi

    if [ -n "$current_uid" ] && [ "$current_uid" = "$wanted_uid" ] && [ "$current_token" = "$wanted_token" ]; then
      # The share already has the expected identity: only refresh settings.
      settings=$(jq -cn '{timeSelectionEnabled:true,isEnabled:true,annotationsEnabled:false,share:"public"}')
      grafana_curl \
        -H 'Content-Type: application/json' \
        -X PATCH \
        --data "$settings" \
        "$share_api/$current_uid" >/dev/null
    else
      # Drop a share with the wrong uid/token before creating the right one.
      if [ -n "$current_uid" ]; then
        grafana_curl \
          -X DELETE \
          "$share_api/$current_uid" >/dev/null
      fi
      settings=$(jq -cn \
        --arg uid "$wanted_uid" \
        --arg accessToken "$wanted_token" \
        '{uid:$uid,accessToken:$accessToken,timeSelectionEnabled:true,isEnabled:true,annotationsEnabled:false,share:"public"}')
      grafana_curl \
        -H 'Content-Type: application/json' \
        -X POST \
        --data "$settings" \
        "$share_api/" >/dev/null
    fi
  '';
};
# Blackbox exporter configuration with two probe modules:
#   - http_2xx:    HTTP probe, 5s timeout, follows redirects, prefers IPv4
#   - tcp_connect: plain TCP connect probe, 5s timeout
#
# YAML nesting is significant: each module must be a child of `modules`, and
# its settings children of the module. Written flat, `prober` and `timeout`
# become duplicated top-level keys and no module is defined at all.
# (Nix strips the common leading indentation of this string, so only the
# relative indentation below ends up in the file.)
blackboxConfig = pkgs.writeText "noisebell-blackbox.yml" ''
  modules:
    http_2xx:
      prober: http
      timeout: 5s
      http:
        follow_redirects: true
        preferred_ip_protocol: ip4
    tcp_connect:
      prober: tcp
      timeout: 5s
'';
# Datasource references embedded in dashboard panels. The uids are the same
# ones given to the datasources provisioned under `services.grafana` below.
prometheusDatasource = { type = "prometheus"; uid = "prometheus"; };
lokiDatasource = { type = "loki"; uid = "loki"; };
# Shared panel constructor.
#
#   datasource  - datasource reference attached to the panel
#   defaultType - panel type used when the caller does not pass `type`
#
# The third argument is the panel description: `id`, `title`, `targets` and
# the grid position `x`/`y` are required; `w` defaults to 12 and `h` to 8.
# The result is the panel attribute set with its `gridPos` filled in.
mkPanel =
  datasource: defaultType:
  {
    id,
    title,
    type ? defaultType,
    x,
    y,
    w ? 12,
    h ? 8,
    targets,
  }:
  {
    inherit id title type targets datasource;
    gridPos = { inherit h w x y; };
  };

# Panel backed by the Prometheus datasource; a time series unless `type` says otherwise.
prometheusPanel = mkPanel prometheusDatasource "timeseries";

# Panel backed by the Loki datasource; a log view unless `type` says otherwise.
lokiPanel = mkPanel lokiDatasource "logs";
# Query target with a legend: reference id, query expression, legend template.
promTarget =
  refId: expr: legendFormat:
  { inherit refId expr legendFormat; };

# Query target without a legend: reference id and query expression only.
lokiTarget =
  refId: expr:
  { inherit refId expr; };
# Internal "full debug" dashboard (uid "noisebell"), rendered to JSON.
# Rows, top to bottom: health stats, host resources, uptime / stack health /
# Pi hardware, cache polling and webhook delivery, poll details, relay
# delivery and log rate, raw journal logs.
dashboard =
  let
    # Label matcher shared by every node-exporter query.
    nodeJobs = "job=~\"noisebell-(do|pi)-node\"";
    # Same matcher, narrowed to the root mount and excluding tmpfs/overlay.
    rootFs = "${nodeJobs},mountpoint=\"/\",fstype!~\"tmpfs|overlay\"";
  in
  pkgs.writeText "noisebell-dashboard.json" (
    builtins.toJSON {
      uid = "noisebell";
      title = "Noisebell Full Debug";
      tags = [ "noisebell" "prometheus" "loki" ];
      timezone = "browser";
      schemaVersion = 39;
      version = 1;
      refresh = "30s";
      time = { from = "now-6h"; to = "now"; };
      panels = [
        # --- y = 0: health stats -------------------------------------------
        (prometheusPanel {
          id = 1;
          title = "Host Scrape Health (Prometheus)";
          type = "stat";
          x = 0; y = 0; w = 6; h = 6;
          targets = [ (promTarget "A" "up{${nodeJobs}}" "{{host}}") ];
        })
        (prometheusPanel {
          id = 2;
          title = "Noisebell Service Health (Prometheus)";
          type = "stat";
          x = 6; y = 0; w = 6; h = 6;
          targets = [ (promTarget "A" ''up{job=~"noisebell-(cache|pi-app|pi-relay)"}'' "{{job}}") ];
        })
        (prometheusPanel {
          id = 3;
          title = "Probe Health (Prometheus)";
          type = "stat";
          x = 12; y = 0; w = 6; h = 6;
          targets = [ (promTarget "A" ''probe_success{job=~"noisebell-.*-probes"}'' "{{instance}}") ];
        })
        (prometheusPanel {
          id = 4;
          title = "Door State (Prometheus)";
          type = "stat";
          x = 18; y = 0; w = 6; h = 6;
          targets = [ (promTarget "A" "noisebell_cache_status" "{{status}}") ];
        })

        # --- y = 6: host resources -----------------------------------------
        (prometheusPanel {
          id = 5;
          title = "CPU Used % (DO + Pi)";
          x = 0; y = 6; w = 8;
          targets = [
            (promTarget "A" "100 - (avg by (host) (rate(node_cpu_seconds_total{${nodeJobs},mode=\"idle\"}[5m])) * 100)" "{{host}}")
          ];
        })
        (prometheusPanel {
          id = 6;
          title = "Memory Used % (DO + Pi)";
          x = 8; y = 6; w = 8;
          targets = [
            (promTarget "A" "100 * (1 - (node_memory_MemAvailable_bytes{${nodeJobs}} / node_memory_MemTotal_bytes{${nodeJobs}}))" "{{host}}")
          ];
        })
        (prometheusPanel {
          id = 7;
          title = "Root Disk Used % (DO + Pi)";
          x = 16; y = 6; w = 8;
          targets = [
            (promTarget "A" "100 * (1 - (node_filesystem_avail_bytes{${rootFs}} / node_filesystem_size_bytes{${rootFs}}))" "{{host}}")
          ];
        })

        # --- y = 14: uptime, stack health, Pi hardware ---------------------
        (prometheusPanel {
          id = 8;
          title = "Host Uptime Hours (DO + Pi)";
          type = "stat";
          x = 0; y = 14; w = 6; h = 6;
          targets = [ (promTarget "A" "(time() - node_boot_time_seconds{${nodeJobs}}) / 3600" "{{host}}") ];
        })
        (prometheusPanel {
          id = 9;
          title = "Observability Stack Health (Prometheus)";
          type = "stat";
          x = 6; y = 14; w = 6; h = 6;
          targets = [ (promTarget "A" ''up{job=~"observability-.*"}'' "{{service}}") ];
        })
        (prometheusPanel {
          id = 10;
          title = "Pi Hardware (Prometheus)";
          x = 12; y = 14; w = 6; h = 6;
          targets = [
            (promTarget "A" "noisebell_pi_temperature_celsius" "temp C")
            (promTarget "B" "noisebell_pi_throttled_flags" "throttled flags")
            (promTarget "C" "noisebell_pi_tailscale_running" "tailscale running")
          ];
        })
        (prometheusPanel {
          id = 11;
          title = "Pi Wi-Fi Signal (Prometheus)";
          x = 18; y = 14; w = 6; h = 6;
          targets = [
            (promTarget "A" "noisebell_pi_wifi_signal_dbm" "{{interface}} dBm")
            (promTarget "B" "noisebell_pi_wifi_link_quality" "{{interface}} link")
          ];
        })

        # --- y = 20: cache polling and webhook delivery --------------------
        (prometheusPanel {
          id = 12;
          title = "Cache Poll Health (Prometheus)";
          x = 0; y = 20;
          targets = [
            (promTarget "A" "noisebell_cache_poll_consecutive_failures" "consecutive failures")
            (promTarget "B" "rate(noisebell_cache_poll_failure_total[5m])" "failure rate")
            (promTarget "C" "rate(noisebell_cache_poll_success_total[5m])" "success rate")
            (promTarget "D" "noisebell_cache_poll_last_duration_seconds" "last duration")
          ];
        })
        (prometheusPanel {
          id = 13;
          title = "Pi App Webhook Delivery (Prometheus)";
          x = 12; y = 20;
          targets = [
            (promTarget "A" "rate(noisebell_pi_notify_success_total[5m])" "success")
            (promTarget "B" "rate(noisebell_pi_notify_attempt_failure_total[5m])" "attempt failures")
            (promTarget "C" "rate(noisebell_pi_notify_failure_total[5m])" "final failures")
          ];
        })

        # --- y = 28: poll details ------------------------------------------
        (prometheusPanel {
          id = 14;
          title = "DO -> Pi Last Poll Details (Prometheus)";
          type = "stat";
          x = 0; y = 28; w = 12;
          targets = [
            (promTarget "A" "noisebell_cache_poll_last_result" "result {{result}}")
            (promTarget "B" "noisebell_cache_poll_last_http_status" "last HTTP status")
            (promTarget "C" "noisebell_cache_poll_last_duration_seconds" "last duration sec")
            (promTarget "D" "time() - noisebell_cache_poll_last_attempt_timestamp_seconds" "seconds since attempt")
            (promTarget "E" "time() - noisebell_cache_poll_last_success_timestamp_seconds" "seconds since success")
            (promTarget "F" "time() - noisebell_cache_poll_last_failure_timestamp_seconds" "seconds since failure")
          ];
        })
        (prometheusPanel {
          id = 15;
          title = "DO -> Pi Poll Failure Types (Prometheus)";
          x = 12; y = 28;
          targets = [
            (promTarget "A" "rate(noisebell_cache_poll_http_error_total[5m])" "http error")
            (promTarget "B" "rate(noisebell_cache_poll_request_timeout_total[5m])" "timeout")
            (promTarget "C" "rate(noisebell_cache_poll_request_connect_total[5m])" "connect")
            (promTarget "D" "rate(noisebell_cache_poll_request_other_total[5m])" "request other")
            (promTarget "E" "rate(noisebell_cache_poll_parse_failure_total[5m])" "parse")
          ];
        })

        # --- y = 36: relay delivery and log rate ---------------------------
        (prometheusPanel {
          id = 16;
          title = "Relay Delivery (Prometheus)";
          x = 0; y = 36;
          targets = [
            (promTarget "A" "rate(noisebell_relay_forwarded_total[5m])" "forwarded")
            (promTarget "B" "rate(noisebell_relay_attempt_failure_total[5m])" "attempt failures")
            (promTarget "C" "rate(noisebell_relay_failed_total[5m])" "final failures")
            (promTarget "D" "noisebell_relay_last_duration_seconds" "last duration")
          ];
        })
        (lokiPanel {
          id = 17;
          title = "Journal Log Rate (Loki, DO + Pi)";
          type = "timeseries";
          x = 12; y = 36;
          targets = [
            # Written out by hand because this Loki query also carries a legend.
            {
              refId = "A";
              expr = ''sum by (host) (rate({job="journal"}[5m]))'';
              legendFormat = "{{host}}";
            }
          ];
        })

        # --- y = 44: raw journal logs --------------------------------------
        (lokiPanel {
          id = 18;
          title = "DO Journal Logs (Loki)";
          x = 0; y = 44;
          targets = [ (lokiTarget "A" ''{job="journal", host="noisebell-do"}'') ];
        })
        (lokiPanel {
          id = 19;
          title = "Pi Journal Logs (Loki)";
          x = 12; y = 44;
          targets = [ (lokiTarget "A" ''{job="journal", host="noisebell-pi"}'') ];
        })
      ];
    }
  );
# Dashboard with uid "noisebell-public", rendered to JSON. This is the
# dashboard the share script (`sharePublicDashboard`) publishes. It contains
# Prometheus panels only.
publicDashboard = pkgs.writeText "noisebell-public-dashboard.json" (
  builtins.toJSON {
    uid = "noisebell-public";
    title = "Noisebell Public";
    tags = [ "noisebell" "public" "prometheus" ];
    timezone = "browser";
    schemaVersion = 39;
    version = 1;
    refresh = "30s";
    time = { from = "now-6h"; to = "now"; };
    panels = [
      # --- y = 0: headline stats -------------------------------------------
      (prometheusPanel {
        id = 1;
        title = "Door State";
        type = "stat";
        x = 0; y = 0; w = 6; h = 6;
        targets = [ (promTarget "A" "noisebell_cache_status" "{{status}}") ];
      })
      (prometheusPanel {
        id = 2;
        title = "Public Status Endpoint";
        type = "stat";
        x = 6; y = 0; w = 6; h = 6;
        targets = [
          (promTarget "A" ''probe_success{job="noisebell-http-probes",instance="https://noisebell.extremist.software/status"}'' "status endpoint")
        ];
      })
      (prometheusPanel {
        id = 3;
        title = "Noisebell Service Health";
        type = "stat";
        x = 12; y = 0; w = 6; h = 6;
        targets = [ (promTarget "A" ''up{job=~"noisebell-(cache|pi-app|pi-relay)"}'' "{{job}}") ];
      })

      # --- y = 6: cache polling --------------------------------------------
      (prometheusPanel {
        id = 4;
        title = "Cache Poll Health";
        x = 0; y = 6; w = 12;
        targets = [
          (promTarget "A" "noisebell_cache_poll_consecutive_failures" "consecutive failures")
          (promTarget "B" "rate(noisebell_cache_poll_failure_total[5m])" "failure rate")
          (promTarget "C" "rate(noisebell_cache_poll_success_total[5m])" "success rate")
          (promTarget "D" "noisebell_cache_poll_last_duration_seconds" "last duration")
        ];
      })
      (prometheusPanel {
        id = 5;
        title = "Last Poll Result";
        type = "stat";
        x = 12; y = 6; w = 12;
        targets = [
          (promTarget "A" "noisebell_cache_poll_last_result" "result {{result}}")
          (promTarget "B" "time() - noisebell_cache_poll_last_attempt_timestamp_seconds" "seconds since attempt")
          (promTarget "C" "time() - noisebell_cache_poll_last_success_timestamp_seconds" "seconds since success")
        ];
      })

      # --- y = 14: failure breakdown and Pi delivery -----------------------
      (prometheusPanel {
        id = 6;
        title = "Poll Failure Types";
        x = 0; y = 14; w = 12;
        targets = [
          (promTarget "A" "rate(noisebell_cache_poll_http_error_total[5m])" "http error")
          (promTarget "B" "rate(noisebell_cache_poll_request_timeout_total[5m])" "timeout")
          (promTarget "C" "rate(noisebell_cache_poll_request_connect_total[5m])" "connect")
          (promTarget "D" "rate(noisebell_cache_poll_request_other_total[5m])" "request other")
          (promTarget "E" "rate(noisebell_cache_poll_parse_failure_total[5m])" "parse")
        ];
      })
      (prometheusPanel {
        id = 7;
        title = "Pi App Delivery";
        x = 12; y = 14; w = 12;
        targets = [
          (promTarget "A" "rate(noisebell_pi_notify_success_total[5m])" "success")
          (promTarget "B" "rate(noisebell_pi_notify_attempt_failure_total[5m])" "attempt failures")
          (promTarget "C" "rate(noisebell_pi_notify_failure_total[5m])" "final failures")
        ];
      })

      # --- y = 22: relay and hardware --------------------------------------
      (prometheusPanel {
        id = 8;
        title = "Relay Delivery";
        x = 0; y = 22; w = 12;
        targets = [
          (promTarget "A" "rate(noisebell_relay_forwarded_total[5m])" "forwarded")
          (promTarget "B" "rate(noisebell_relay_attempt_failure_total[5m])" "attempt failures")
          (promTarget "C" "rate(noisebell_relay_failed_total[5m])" "final failures")
          (promTarget "D" "noisebell_relay_last_duration_seconds" "last duration")
        ];
      })
      (prometheusPanel {
        id = 9;
        title = "Pi Hardware Summary";
        x = 12; y = 22; w = 12;
        targets = [
          (promTarget "A" "noisebell_pi_temperature_celsius" "temperature C")
          (promTarget "B" "noisebell_pi_throttled_flags" "throttled flags")
        ];
      })
    ];
  }
);
# Directory holding both dashboard JSON files under fixed names. It is used
# as the Grafana file-provider path and as the restart trigger of the
# public-dashboard share service.
dashboardDir =
  pkgs.runCommand "noisebell-grafana-dashboards" { } ''
    mkdir -p "$out"
    cp ${dashboard} "$out/noisebell.json"
    cp ${publicDashboard} "$out/noisebell-public.json"
  '';
# Relabelling shared by the probe scrape jobs, applied in order:
#   1. copy the configured target into the `target` URL parameter,
#   2. copy that parameter into the `instance` label,
#   3. point the scrape address at the local blackbox exporter.
blackboxRelabels =
  let
    exporterAddress = "127.0.0.1:${toString blackboxExporterPort}";
  in
  [
    { source_labels = [ "__address__" ]; target_label = "__param_target"; }
    { source_labels = [ "__param_target" ]; target_label = "instance"; }
    { target_label = "__address__"; replacement = exporterAddress; }
  ];
in
{
# Prometheus server, its local exporters, all scrape jobs and alert rules.
#
# Scrape jobs fall into four groups:
#   - noisebell-{do,pi}-node      node exporter on each host
#   - noisebell-{cache,pi-app,pi-relay}  application /metrics endpoints
#   - observability-*             the monitoring stack itself
#   - noisebell-{http,tcp}-probes blackbox probes via the local exporter
services.prometheus = {
  enable = true;
  # NOTE(review): binds to every interface; confirm access is restricted
  # elsewhere (firewall / private network) if that is not intended.
  listenAddress = "0.0.0.0";
  port = prometheusPort;
  retentionTime = "7d";
  globalConfig = {
    scrape_interval = "15s";
    evaluation_interval = "15s";
  };
  exporters = {
    node = {
      enable = true;
      port = nodeExporterPort;
      enabledCollectors = [ "systemd" ];
    };
    blackbox = {
      enable = true;
      port = blackboxExporterPort;
      configFile = blackboxConfig;
    };
  };
  scrapeConfigs = [
    # --- host metrics ------------------------------------------------------
    {
      job_name = "noisebell-do-node";
      static_configs = [
        {
          targets = [ "127.0.0.1:${toString nodeExporterPort}" ];
          labels.host = "noisebell-do";
        }
      ];
    }
    {
      job_name = "noisebell-pi-node";
      static_configs = [
        {
          targets = [ "noisebell-pi:${toString nodeExporterPort}" ];
          labels.host = "noisebell-pi";
        }
      ];
    }

    # --- application metrics -----------------------------------------------
    {
      job_name = "noisebell-cache";
      metrics_path = "/metrics";
      static_configs = [
        {
          targets = [ "127.0.0.1:3000" ];
          labels.host = "noisebell-do";
        }
      ];
    }
    {
      job_name = "noisebell-pi-app";
      metrics_path = "/metrics";
      static_configs = [
        {
          targets = [ "noisebell-pi:80" ];
          labels.host = "noisebell-pi";
        }
      ];
    }
    {
      job_name = "noisebell-pi-relay";
      metrics_path = "/metrics";
      static_configs = [
        {
          targets = [ "noisebell-pi:8090" ];
          labels.host = "noisebell-pi";
        }
      ];
    }

    # --- the observability stack itself ------------------------------------
    {
      job_name = "observability-prometheus";
      static_configs = [
        {
          targets = [ "127.0.0.1:${toString prometheusPort}" ];
          labels = {
            host = "noisebell-do";
            service = "prometheus";
          };
        }
      ];
    }
    {
      job_name = "observability-loki";
      static_configs = [
        {
          targets = [ "127.0.0.1:${toString lokiPort}" ];
          labels = {
            host = "noisebell-do";
            service = "loki";
          };
        }
      ];
    }
    {
      job_name = "observability-grafana";
      static_configs = [
        {
          targets = [ "127.0.0.1:${toString grafanaPort}" ];
          labels = {
            host = "noisebell-do";
            service = "grafana";
          };
        }
      ];
    }
    {
      job_name = "observability-alloy";
      static_configs = [
        {
          # Same address Alloy's HTTP server is started on via `extraFlags`.
          targets = [ "127.0.0.1:12345" ];
          labels = {
            host = "noisebell-do";
            service = "alloy";
          };
        }
      ];
    }
    {
      job_name = "observability-blackbox";
      static_configs = [
        {
          targets = [ "127.0.0.1:${toString blackboxExporterPort}" ];
          labels = {
            host = "noisebell-do";
            service = "blackbox";
          };
        }
      ];
    }

    # --- blackbox probes ---------------------------------------------------
    # The listed targets are what gets probed; `blackboxRelabels` redirects
    # the actual scrape to the local blackbox exporter.
    {
      job_name = "noisebell-http-probes";
      metrics_path = "/probe";
      params.module = [ "http_2xx" ];
      static_configs = [
        {
          targets = [
            "http://noisebell-pi/metrics"
            "http://noisebell-pi:8090/health"
            "https://noisebell.extremist.software/status"
          ];
        }
      ];
      relabel_configs = blackboxRelabels;
    }
    {
      job_name = "noisebell-tcp-probes";
      metrics_path = "/probe";
      params.module = [ "tcp_connect" ];
      static_configs = [
        {
          targets = [
            "noisebell-pi:22"
            "noisebell-pi:80"
            "noisebell-pi:8090"
          ];
        }
      ];
      relabel_configs = blackboxRelabels;
    }
  ];
  # Alert rules as one YAML rule file. The nesting is significant: every rule
  # must sit under its group's `rules:` list and carry its own `expr`, `for`,
  # `labels` and `annotations`. Written flat, those keys collapse into
  # top-level mapping keys and the file is not a valid rule file.
  # (Nix strips the common leading indentation of this string.)
  rules = [
    ''
      groups:
        - name: noisebell
          rules:
            - alert: NoisebellPiAppDown
              expr: up{job="noisebell-pi-app"} == 0
              for: 2m
              labels:
                severity: page
              annotations:
                summary: Noisebell Pi app metrics are down
            - alert: NoisebellPiNodeExporterDown
              expr: up{job="noisebell-pi-node"} == 0
              for: 2m
              labels:
                severity: page
              annotations:
                summary: Noisebell Pi node exporter is down
            - alert: NoisebellProbeFailed
              expr: probe_success{job=~"noisebell-.*-probes"} == 0
              for: 2m
              labels:
                severity: page
              annotations:
                summary: Noisebell probe failed for {{ $labels.instance }}
            - alert: NoisebellCachePollFailures
              expr: noisebell_cache_poll_consecutive_failures >= 3
              for: 1m
              labels:
                severity: page
              annotations:
                summary: Noisebell cache cannot poll the Pi
            - alert: NoisebellPiRecentlyRebooted
              expr: noisebell_pi_uptime_seconds < 300
              for: 30s
              labels:
                severity: info
              annotations:
                summary: Noisebell Pi rebooted recently
            - alert: NoisebellPiThrottled
              expr: noisebell_pi_throttled_flags > 0
              for: 1m
              labels:
                severity: warning
              annotations:
                summary: Noisebell Pi reports throttling flags
    ''
  ];
};
# Single-instance Loki: no auth, in-memory ring, filesystem storage under
# /var/lib/loki, TSDB schema v13, and compactor-driven retention.
services.loki =
  let
    stateDir = "/var/lib/loki";
    # One value for both how long logs are kept and how old an incoming
    # sample may be before it is rejected.
    retention = "168h";
  in
  {
    enable = true;
    configuration = {
      auth_enabled = false;

      server = {
        # NOTE(review): HTTP listens on every interface while gRPC stays on
        # loopback; confirm the HTTP exposure is intended.
        http_listen_address = "0.0.0.0";
        http_listen_port = lokiPort;
        grpc_listen_address = "127.0.0.1";
        grpc_listen_port = 9096;
      };

      common = {
        path_prefix = stateDir;
        replication_factor = 1;
        ring.kvstore.store = "inmemory";
        storage.filesystem = {
          chunks_directory = "${stateDir}/chunks";
          rules_directory = "${stateDir}/rules";
        };
      };

      schema_config = {
        configs = [
          {
            from = "2024-01-01";
            store = "tsdb";
            object_store = "filesystem";
            schema = "v13";
            index.prefix = "index_";
            index.period = "24h";
          }
        ];
      };

      limits_config = {
        retention_period = retention;
        reject_old_samples = true;
        reject_old_samples_max_age = retention;
      };

      compactor = {
        working_directory = "${stateDir}/compactor";
        compaction_interval = "10m";
        retention_enabled = true;
        retention_delete_delay = "2h";
        retention_delete_worker_count = 1;
        delete_request_store = "filesystem";
      };
    };
  };
# Grafana Alloy with its HTTP server bound to loopback port 12345, the
# address the "observability-alloy" scrape job targets.
services.alloy.enable = true;
services.alloy.extraFlags = [ "--server.http.listen-addr=127.0.0.1:12345" ];
# Alloy pipeline: read the systemd journal (entries up to 12h old), label
# them job="journal" / host="noisebell-do", and push them to the local Loki.
environment.etc."alloy/config.alloy".text = ''
  loki.write "local" {
    endpoint {
      url = "http://127.0.0.1:${toString lokiPort}/loki/api/v1/push"
    }
  }

  loki.source.journal "system" {
    max_age = "12h"
    labels = {
      job = "journal",
      host = "noisebell-do",
    }
    forward_to = [loki.write.local.receiver]
  }
'';
# Pull Loki in alongside Alloy and order Alloy after it.
systemd.services.alloy =
  let
    lokiUnit = [ "loki.service" ];
  in
  {
    after = lokiUnit;
    wants = lokiUnit;
  };
# Grafana on loopback, announced under https://<grafanaDomain>/, with login
# form enabled, sign-up and anonymous access disabled, and both datasources
# and dashboards provisioned from this module.
services.grafana =
  let
    # Grafana-side file reference for a secret kept in its state directory.
    secretFile = name: "$__file{/var/lib/grafana/${name}}";
    # Base URL of a service listening on this host's loopback interface.
    localUrl = port: "http://127.0.0.1:${toString port}";
  in
  {
    enable = true;

    settings = {
      server = {
        http_addr = "127.0.0.1";
        http_port = grafanaPort;
        domain = grafanaDomain;
        root_url = "https://${grafanaDomain}/";
      };

      analytics.reporting_enabled = false;
      # Exposes the metrics endpoint scraped by "observability-grafana".
      metrics.enabled = true;

      security = {
        admin_user = "admin";
        # Both files are created by the grafana preStart hook when missing.
        admin_password = secretFile "admin_password";
        secret_key = secretFile "secret_key";
        disable_initial_admin_creation = false;
        cookie_secure = true;
        strict_transport_security = true;
        strict_transport_security_max_age_seconds = 31536000;
      };

      auth.disable_login_form = false;
      "auth.anonymous".enabled = false;
      users.allow_sign_up = false;
    };

    provision = {
      enable = true;

      datasources.settings = {
        apiVersion = 1;
        prune = true;
        datasources = [
          {
            name = "Prometheus";
            uid = "prometheus";
            type = "prometheus";
            access = "proxy";
            url = localUrl prometheusPort;
            isDefault = true;
            editable = false;
          }
          {
            name = "Loki";
            uid = "loki";
            type = "loki";
            access = "proxy";
            url = localUrl lokiPort;
            editable = false;
          }
        ];
      };

      dashboards.settings = {
        apiVersion = 1;
        providers = [
          {
            name = "Noisebell";
            type = "file";
            allowUiUpdates = false;
            options.path = dashboardDir;
          }
        ];
      };
    };
  };
# Create Grafana's secret key and admin password on first start.
#
# Each file is generated only when it is missing or empty: random bytes from
# /dev/urandom (64 for the key, 36 for the password), base64-encoded on a
# single line.
#
# The restrictive umask is set inside a subshell on purpose. This snippet is
# prepended (`lib.mkBefore`) to whatever else makes up the unit's preStart
# script, and a bare `umask 077` would stay in effect for everything that
# runs after it in the same shell.
systemd.services.grafana.preStart = lib.mkBefore ''
  if [ ! -s /var/lib/grafana/secret_key ]; then
    (
      umask 077
      ${pkgs.coreutils}/bin/head -c 64 /dev/urandom | ${pkgs.coreutils}/bin/base64 --wrap=0 > /var/lib/grafana/secret_key
    )
  fi
  if [ ! -s /var/lib/grafana/admin_password ]; then
    (
      umask 077
      ${pkgs.coreutils}/bin/head -c 36 /dev/urandom | ${pkgs.coreutils}/bin/base64 --wrap=0 > /var/lib/grafana/admin_password
    )
  fi
'';
# One-shot unit that runs the public-dashboard share script. It is ordered
# after Grafana (and pulls it in), and is re-triggered when the generated
# dashboard directory changes.
systemd.services.noisebell-grafana-public-dashboard =
  let
    grafanaUnit = [ "grafana.service" ];
  in
  {
    description = "Ensure deterministic Noisebell public Grafana dashboard share";
    wantedBy = [ "multi-user.target" ];
    wants = grafanaUnit;
    after = grafanaUnit;
    restartTriggers = [ dashboardDir ];
    serviceConfig.Type = "oneshot";
    serviceConfig.ExecStart = "${sharePublicDashboard}/bin/noisebell-grafana-share-public-dashboard";
  };
}