674 lines
18 KiB
Nix
674 lines
18 KiB
Nix
{ lib, pkgs, ... }:
|
|
|
|
let
|
|
# Port numbers shared by the service definitions, scrape targets and
# datasource URLs in this module.
prometheusPort = 9090; # Prometheus server
lokiPort = 3100; # Loki HTTP listener
grafanaPort = 3030; # Grafana HTTP listener
nodeExporterPort = 9100; # node exporter (used for both the local and the Pi target)
blackboxExporterPort = 9115; # blackbox exporter
|
|
|
|
# Blackbox exporter configuration, written to the store as a YAML file.
#   http_2xx    - HTTP prober, 5s timeout, follows redirects, prefers IPv4
#   tcp_connect - TCP prober, 5s timeout
blackboxConfig =
  let
    modulesYaml = ''
      modules:
        http_2xx:
          prober: http
          timeout: 5s
          http:
            follow_redirects: true
            preferred_ip_protocol: ip4
        tcp_connect:
          prober: tcp
          timeout: 5s
    '';
  in
  pkgs.writeText "noisebell-blackbox.yml" modulesYaml;
|
|
|
|
# Datasource reference attached to every panel built by `prometheusPanel`.
prometheusDatasource = { uid = "prometheus"; type = "prometheus"; };
|
|
|
|
# Datasource reference attached to every panel built by `lokiPanel`.
lokiDatasource = { uid = "loki"; type = "loki"; };
|
|
|
|
# Build one dashboard panel backed by the Prometheus datasource.
#   id, title - panel identity
#   targets   - list of query targets
#   x, y      - grid position
#   w, h      - grid size (defaults: 12 wide, 8 high)
#   type      - panel type (default: "timeseries")
prometheusPanel =
  { id, title, targets, x, y, w ? 12, h ? 8, type ? "timeseries" }:
  {
    datasource = prometheusDatasource;
    gridPos = { inherit x y w h; };
    inherit id title type targets;
  };
|
|
|
|
# Build one dashboard panel backed by the Loki datasource.
#   id, title - panel identity
#   targets   - list of query targets
#   x, y      - grid position
#   w, h      - grid size (defaults: 12 wide, 8 high)
#   type      - panel type (default: "logs")
lokiPanel =
  { id, title, targets, x, y, w ? 12, h ? 8, type ? "logs" }:
  {
    datasource = lokiDatasource;
    gridPos = { inherit x y w h; };
    inherit id title type targets;
  };
|
|
|
|
# Query target with a legend: reference id, query expression, legend template.
promTarget = refId: expr: legendFormat: { inherit expr legendFormat refId; };
|
|
|
|
# Query target without a legend: reference id and query expression only.
lokiTarget = refId: expr: { inherit expr refId; };
|
|
|
|
# The Noisebell Grafana dashboard, serialised to a JSON file in the store.
# Panel rows start at y = 0, 6, 14, 20, 28, 36 and 44.
# Queries containing double quotes are written as ''...'' strings so they
# read like plain PromQL/LogQL; the evaluated strings carry no escapes.
dashboard =
  let
    spec = {
      uid = "noisebell";
      title = "Noisebell DO + Pi";
      tags = [ "noisebell" "prometheus" "loki" ];
      schemaVersion = 39;
      version = 1;
      timezone = "browser";
      refresh = "30s";
      time.from = "now-6h";
      time.to = "now";

      panels = [
        # Row y = 0: health overview stats.
        (prometheusPanel {
          id = 1;
          title = "Host Scrape Health (Prometheus)";
          type = "stat";
          x = 0; y = 0; w = 6; h = 6;
          targets = [ (promTarget "A" ''up{job=~"noisebell-(do|pi)-node"}'' "{{host}}") ];
        })
        (prometheusPanel {
          id = 2;
          title = "Noisebell Service Health (Prometheus)";
          type = "stat";
          x = 6; y = 0; w = 6; h = 6;
          targets = [ (promTarget "A" ''up{job=~"noisebell-(cache|pi-app|pi-relay)"}'' "{{job}}") ];
        })
        (prometheusPanel {
          id = 3;
          title = "Probe Health (Prometheus)";
          type = "stat";
          x = 12; y = 0; w = 6; h = 6;
          targets = [ (promTarget "A" ''probe_success{job=~"noisebell-.*-probes"}'' "{{instance}}") ];
        })
        (prometheusPanel {
          id = 4;
          title = "Door State (Prometheus)";
          type = "stat";
          x = 18; y = 0; w = 6; h = 6;
          targets = [ (promTarget "A" "noisebell_cache_status" "{{status}}") ];
        })

        # Row y = 6: host resource usage.
        (prometheusPanel {
          id = 5;
          title = "CPU Used % (DO + Pi)";
          x = 0; y = 6; w = 8;
          targets = [
            (promTarget "A"
              ''100 - (avg by (host) (rate(node_cpu_seconds_total{job=~"noisebell-(do|pi)-node",mode="idle"}[5m])) * 100)''
              "{{host}}")
          ];
        })
        (prometheusPanel {
          id = 6;
          title = "Memory Used % (DO + Pi)";
          x = 8; y = 6; w = 8;
          targets = [
            (promTarget "A"
              ''100 * (1 - (node_memory_MemAvailable_bytes{job=~"noisebell-(do|pi)-node"} / node_memory_MemTotal_bytes{job=~"noisebell-(do|pi)-node"}))''
              "{{host}}")
          ];
        })
        (prometheusPanel {
          id = 7;
          title = "Root Disk Used % (DO + Pi)";
          x = 16; y = 6; w = 8;
          targets = [
            (promTarget "A"
              ''100 * (1 - (node_filesystem_avail_bytes{job=~"noisebell-(do|pi)-node",mountpoint="/",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{job=~"noisebell-(do|pi)-node",mountpoint="/",fstype!~"tmpfs|overlay"}))''
              "{{host}}")
          ];
        })

        # Row y = 14: uptime, stack health and Pi hardware.
        (prometheusPanel {
          id = 8;
          title = "Host Uptime Hours (DO + Pi)";
          type = "stat";
          x = 0; y = 14; w = 6; h = 6;
          targets = [
            (promTarget "A"
              ''(time() - node_boot_time_seconds{job=~"noisebell-(do|pi)-node"}) / 3600''
              "{{host}}")
          ];
        })
        (prometheusPanel {
          id = 9;
          title = "Observability Stack Health (Prometheus)";
          type = "stat";
          x = 6; y = 14; w = 6; h = 6;
          targets = [ (promTarget "A" ''up{job=~"observability-.*"}'' "{{service}}") ];
        })
        (prometheusPanel {
          id = 10;
          title = "Pi Hardware (Prometheus)";
          x = 12; y = 14; w = 6; h = 6;
          targets = [
            (promTarget "A" "noisebell_pi_temperature_celsius" "temp C")
            (promTarget "B" "noisebell_pi_throttled_flags" "throttled flags")
            (promTarget "C" "noisebell_pi_tailscale_running" "tailscale running")
          ];
        })
        (prometheusPanel {
          id = 11;
          title = "Pi Wi-Fi Signal (Prometheus)";
          x = 18; y = 14; w = 6; h = 6;
          targets = [
            (promTarget "A" "noisebell_pi_wifi_signal_dbm" "{{interface}} dBm")
            (promTarget "B" "noisebell_pi_wifi_link_quality" "{{interface}} link")
          ];
        })

        # Row y = 20: cache polling and Pi webhook delivery.
        (prometheusPanel {
          id = 12;
          title = "Cache Poll Health (Prometheus)";
          x = 0; y = 20;
          targets = [
            (promTarget "A" "noisebell_cache_poll_consecutive_failures" "consecutive failures")
            (promTarget "B" "rate(noisebell_cache_poll_failure_total[5m])" "failure rate")
            (promTarget "C" "rate(noisebell_cache_poll_success_total[5m])" "success rate")
            (promTarget "D" "noisebell_cache_poll_last_duration_seconds" "last duration")
          ];
        })
        (prometheusPanel {
          id = 13;
          title = "Pi App Webhook Delivery (Prometheus)";
          x = 12; y = 20;
          targets = [
            (promTarget "A" "rate(noisebell_pi_notify_success_total[5m])" "success")
            (promTarget "B" "rate(noisebell_pi_notify_attempt_failure_total[5m])" "attempt failures")
            (promTarget "C" "rate(noisebell_pi_notify_failure_total[5m])" "final failures")
          ];
        })

        # Row y = 28: details of the DO -> Pi poll.
        (prometheusPanel {
          id = 14;
          title = "DO -> Pi Last Poll Details (Prometheus)";
          type = "stat";
          x = 0; y = 28; w = 12;
          targets = [
            (promTarget "A" "noisebell_cache_poll_last_result" "result {{result}}")
            (promTarget "B" "noisebell_cache_poll_last_http_status" "last HTTP status")
            (promTarget "C" "noisebell_cache_poll_last_duration_seconds" "last duration sec")
            (promTarget "D" "time() - noisebell_cache_poll_last_attempt_timestamp_seconds" "seconds since attempt")
            (promTarget "E" "time() - noisebell_cache_poll_last_success_timestamp_seconds" "seconds since success")
            (promTarget "F" "time() - noisebell_cache_poll_last_failure_timestamp_seconds" "seconds since failure")
          ];
        })
        (prometheusPanel {
          id = 15;
          title = "DO -> Pi Poll Failure Types (Prometheus)";
          x = 12; y = 28;
          targets = [
            (promTarget "A" "rate(noisebell_cache_poll_http_error_total[5m])" "http error")
            (promTarget "B" "rate(noisebell_cache_poll_request_timeout_total[5m])" "timeout")
            (promTarget "C" "rate(noisebell_cache_poll_request_connect_total[5m])" "connect")
            (promTarget "D" "rate(noisebell_cache_poll_request_other_total[5m])" "request other")
            (promTarget "E" "rate(noisebell_cache_poll_parse_failure_total[5m])" "parse")
          ];
        })

        # Row y = 36: relay delivery and journal log volume.
        (prometheusPanel {
          id = 16;
          title = "Relay Delivery (Prometheus)";
          x = 0; y = 36;
          targets = [
            (promTarget "A" "rate(noisebell_relay_forwarded_total[5m])" "forwarded")
            (promTarget "B" "rate(noisebell_relay_attempt_failure_total[5m])" "attempt failures")
            (promTarget "C" "rate(noisebell_relay_failed_total[5m])" "final failures")
            (promTarget "D" "noisebell_relay_last_duration_seconds" "last duration")
          ];
        })
        (lokiPanel {
          id = 17;
          title = "Journal Log Rate (Loki, DO + Pi)";
          type = "timeseries";
          x = 12; y = 36;
          targets = [
            {
              refId = "A";
              expr = ''sum by (host) (rate({job="journal"}[5m]))'';
              legendFormat = "{{host}}";
            }
          ];
        })

        # Row y = 44: raw journal logs per host.
        (lokiPanel {
          id = 18;
          title = "DO Journal Logs (Loki)";
          x = 0; y = 44;
          targets = [ (lokiTarget "A" ''{job="journal", host="noisebell-do"}'') ];
        })
        (lokiPanel {
          id = 19;
          title = "Pi Journal Logs (Loki)";
          x = 12; y = 44;
          targets = [ (lokiTarget "A" ''{job="journal", host="noisebell-pi"}'') ];
        })
      ];
    };
  in
  pkgs.writeText "noisebell-dashboard.json" (builtins.toJSON spec);
|
|
|
|
# Store directory containing the dashboard JSON as `noisebell.json`; used as
# the path of Grafana's file-based dashboard provider.
dashboardDir =
  pkgs.runCommand "noisebell-grafana-dashboards" { } ''
    mkdir -p "$out"
    cp ${dashboard} "$out/noisebell.json"
  '';
|
|
|
|
# Relabelling rules shared by the blackbox probe jobs, applied in order:
#   1. copy the configured target address into `__param_target`,
#   2. copy `__param_target` into the `instance` label,
#   3. overwrite `__address__` with the local blackbox exporter address.
blackboxRelabels =
  let
    exporterAddress = "127.0.0.1:${toString blackboxExporterPort}";
  in
  [
    { target_label = "__param_target"; source_labels = [ "__address__" ]; }
    { target_label = "instance"; source_labels = [ "__param_target" ]; }
    { target_label = "__address__"; replacement = exporterAddress; }
  ];
|
|
in
|
|
{
|
|
# Prometheus server: the node and blackbox exporters, scrape jobs for both
# hosts, the Noisebell services and the observability stack itself, plus the
# alerting rules.
services.prometheus =
  let
    # "127.0.0.1:<port>" for a service listening on this machine.
    loopback = port: "127.0.0.1:${toString port}";

    # Scrape job for one component of the observability stack on this host.
    # Yields job "observability-<service>" with host/service labels.
    stackJob = service: port: {
      job_name = "observability-${service}";
      static_configs = [
        {
          targets = [ (loopback port) ];
          labels = {
            host = "noisebell-do";
            inherit service;
          };
        }
      ];
    };

    # Scrape job "noisebell-<kind>-probes" that sends each target through the
    # given blackbox exporter module.
    probeJob = kind: module: targets: {
      job_name = "noisebell-${kind}-probes";
      metrics_path = "/probe";
      params = { module = [ module ]; };
      static_configs = [ { inherit targets; } ];
      relabel_configs = blackboxRelabels;
    };
  in
  {
    enable = true;
    listenAddress = "0.0.0.0";
    port = prometheusPort;
    retentionTime = "7d";

    globalConfig = {
      scrape_interval = "15s";
      evaluation_interval = "15s";
    };

    exporters.node = {
      enable = true;
      port = nodeExporterPort;
      enabledCollectors = [ "systemd" ];
    };
    exporters.blackbox = {
      enable = true;
      port = blackboxExporterPort;
      configFile = blackboxConfig;
    };

    scrapeConfigs = [
      # Host metrics (node exporter) for both machines.
      {
        job_name = "noisebell-do-node";
        static_configs = [
          {
            targets = [ (loopback nodeExporterPort) ];
            labels = { host = "noisebell-do"; };
          }
        ];
      }
      {
        job_name = "noisebell-pi-node";
        static_configs = [
          {
            targets = [ "noisebell-pi:${toString nodeExporterPort}" ];
            labels = { host = "noisebell-pi"; };
          }
        ];
      }

      # Noisebell application metrics.
      {
        job_name = "noisebell-cache";
        metrics_path = "/metrics";
        static_configs = [
          {
            targets = [ "127.0.0.1:3000" ];
            labels = { host = "noisebell-do"; };
          }
        ];
      }
      {
        job_name = "noisebell-pi-app";
        metrics_path = "/metrics";
        static_configs = [
          {
            targets = [ "noisebell-pi:80" ];
            labels = { host = "noisebell-pi"; };
          }
        ];
      }
      {
        job_name = "noisebell-pi-relay";
        metrics_path = "/metrics";
        static_configs = [
          {
            targets = [ "noisebell-pi:8090" ];
            labels = { host = "noisebell-pi"; };
          }
        ];
      }

      # The observability stack scraping itself.
      (stackJob "prometheus" prometheusPort)
      (stackJob "loki" lokiPort)
      (stackJob "grafana" grafanaPort)
      (stackJob "alloy" 12345)
      (stackJob "blackbox" blackboxExporterPort)

      # Blackbox probes.
      (probeJob "http" "http_2xx" [
        "http://noisebell-pi/metrics"
        "http://noisebell-pi:8090/health"
        "https://noisebell.extremist.software/status"
      ])
      (probeJob "tcp" "tcp_connect" [
        "noisebell-pi:22"
        "noisebell-pi:80"
        "noisebell-pi:8090"
      ])
    ];

    rules = [
      ''
        groups:
          - name: noisebell
            rules:
              - alert: NoisebellPiAppDown
                expr: up{job="noisebell-pi-app"} == 0
                for: 2m
                labels:
                  severity: page
                annotations:
                  summary: Noisebell Pi app metrics are down
              - alert: NoisebellPiNodeExporterDown
                expr: up{job="noisebell-pi-node"} == 0
                for: 2m
                labels:
                  severity: page
                annotations:
                  summary: Noisebell Pi node exporter is down
              - alert: NoisebellProbeFailed
                expr: probe_success{job=~"noisebell-.*-probes"} == 0
                for: 2m
                labels:
                  severity: page
                annotations:
                  summary: Noisebell probe failed for {{ $labels.instance }}
              - alert: NoisebellCachePollFailures
                expr: noisebell_cache_poll_consecutive_failures >= 3
                for: 1m
                labels:
                  severity: page
                annotations:
                  summary: Noisebell cache cannot poll the Pi
              - alert: NoisebellPiRecentlyRebooted
                expr: noisebell_pi_uptime_seconds < 300
                for: 30s
                labels:
                  severity: info
                annotations:
                  summary: Noisebell Pi rebooted recently
              - alert: NoisebellPiThrottled
                expr: noisebell_pi_throttled_flags > 0
                for: 1m
                labels:
                  severity: warning
                annotations:
                  summary: Noisebell Pi reports throttling flags
      ''
    ];
  };
|
|
|
|
# Loki with authentication disabled, an in-memory ring, replication factor 1
# and filesystem storage under /var/lib/loki. Retention is enabled in the
# compactor with a 168h retention period.
services.loki =
  let
    dataDir = "/var/lib/loki";
  in
  {
    enable = true;

    configuration = {
      auth_enabled = false;

      server = {
        # HTTP listens on all interfaces; gRPC stays on loopback.
        http_listen_address = "0.0.0.0";
        http_listen_port = lokiPort;
        grpc_listen_address = "127.0.0.1";
        grpc_listen_port = 9096;
      };

      common = {
        path_prefix = dataDir;
        replication_factor = 1;
        ring = { kvstore = { store = "inmemory"; }; };
        storage = {
          filesystem = {
            chunks_directory = "${dataDir}/chunks";
            rules_directory = "${dataDir}/rules";
          };
        };
      };

      schema_config = {
        configs = [
          {
            from = "2024-01-01";
            schema = "v13";
            store = "tsdb";
            object_store = "filesystem";
            index = { prefix = "index_"; period = "24h"; };
          }
        ];
      };

      limits_config = {
        retention_period = "168h";
        reject_old_samples = true;
        reject_old_samples_max_age = "168h";
      };

      compactor = {
        working_directory = "${dataDir}/compactor";
        compaction_interval = "10m";
        retention_enabled = true;
        retention_delete_delay = "2h";
        retention_delete_worker_count = 1;
        delete_request_store = "filesystem";
      };
    };
  };
|
|
|
|
# Alloy agent with its HTTP server bound to loopback; this is the address
# the "observability-alloy" scrape job targets.
services.alloy =
  let
    httpListenAddr = "127.0.0.1:12345";
  in
  {
    enable = true;
    extraFlags = [ "--server.http.listen-addr=${httpListenAddr}" ];
  };
|
|
|
|
# Alloy pipeline installed at /etc/alloy/config.alloy: read the systemd
# journal, label entries with job "journal" and host "noisebell-do", and
# push them to the local Loki instance.
environment.etc."alloy/config.alloy".text =
  let
    lokiPushUrl = "http://127.0.0.1:${toString lokiPort}/loki/api/v1/push";
  in
  ''
    loki.write "local" {
      endpoint {
        url = "${lokiPushUrl}"
      }
    }

    loki.source.journal "system" {
      max_age = "12h"
      labels = {
        job = "journal",
        host = "noisebell-do",
      }
      forward_to = [loki.write.local.receiver]
    }
  '';
|
|
|
|
# Order the Alloy unit after loki.service and declare a `wants` dependency
# on it.
systemd.services.alloy =
  let
    lokiUnits = [ "loki.service" ];
  in
  {
    wants = lokiUnits;
    after = lokiUnits;
  };
|
|
|
|
# Grafana configured for anonymous access: visitors get the Viewer role, the
# login form and sign-up are disabled, and no initial admin account is
# created. Datasources and the dashboard are provisioned from this module
# and cannot be edited in the UI.
services.grafana =
  let
    # Base URL of a backend listening on this machine's loopback interface.
    loopbackUrl = port: "http://127.0.0.1:${toString port}";
  in
  {
    enable = true;

    settings = {
      server = {
        http_addr = "0.0.0.0";
        http_port = grafanaPort;
        domain = "noisebell-do";
        root_url = "http://noisebell-do:${toString grafanaPort}/";
      };

      analytics = { reporting_enabled = false; };
      # Expose Grafana's own metrics (scraped by "observability-grafana").
      metrics = { enabled = true; };

      security = {
        # Grafana reads the key from this file when it loads its config.
        secret_key = "$__file{/var/lib/grafana/secret_key}";
        disable_initial_admin_creation = true;
      };

      users = { allow_sign_up = false; };
      auth = { disable_login_form = true; };
      "auth.anonymous" = {
        enabled = true;
        org_role = "Viewer";
      };
    };

    provision = {
      enable = true;

      datasources.settings = {
        apiVersion = 1;
        prune = true;
        datasources = [
          {
            uid = "prometheus";
            name = "Prometheus";
            type = "prometheus";
            access = "proxy";
            url = loopbackUrl prometheusPort;
            isDefault = true;
            editable = false;
          }
          {
            uid = "loki";
            name = "Loki";
            type = "loki";
            access = "proxy";
            url = loopbackUrl lokiPort;
            editable = false;
          }
        ];
      };

      dashboards.settings = {
        apiVersion = 1;
        providers = [
          {
            name = "Noisebell";
            type = "file";
            allowUiUpdates = false;
            options = { path = dashboardDir; };
          }
        ];
      };
    };
  };
|
|
|
|
# Generate Grafana's secret key the first time the service starts: when
# /var/lib/grafana/secret_key is missing or empty, fill it with 64 random
# bytes, base64-encoded on a single line.
#
# The restrictive umask is applied inside a subshell. `lib.mkBefore` puts
# this fragment ahead of the rest of the preStart script, so a bare
# `umask 077` would stay in effect for every later preStart command, and only
# on the first start, when the key is generated.
systemd.services.grafana.preStart = lib.mkBefore ''
  if [ ! -s /var/lib/grafana/secret_key ]; then
    (
      umask 077
      ${pkgs.coreutils}/bin/head -c 64 /dev/urandom | ${pkgs.coreutils}/bin/base64 --wrap=0 > /var/lib/grafana/secret_key
    )
  fi
'';
|
|
}
|