# NixOS module for the Noisebell observability stack on the "noisebell-do" host:
#   * Prometheus, with the node and blackbox exporters, scraping this host and
#     the "noisebell-pi" host and evaluating a small set of alert rules
#   * Loki for log storage, fed by a local Alloy instance that ships the journal
#   * Grafana with provisioned datasources and one generated dashboard
#
# Arguments:
#   lib  - used for lib.mkBefore (ordering of the Grafana preStart snippet)
#   pkgs - used for writeText / runCommand / coreutils
{ lib, pkgs, ... }:
let
  # Listening ports. Each one is referenced by both the service that binds it
  # and the Prometheus scrape target (or datasource URL) that reads from it.
  prometheusPort = 9090;
  lokiPort = 3100;
  grafanaPort = 3030;
  nodeExporterPort = 9100;
  blackboxExporterPort = 9115;
  # Used by the Alloy --server.http.listen-addr flag and by its scrape job;
  # keeping a single definition stops the two from drifting apart.
  alloyPort = 12345;

  # Blackbox exporter modules referenced via `params.module` in the probe jobs.
  blackboxConfig = pkgs.writeText "noisebell-blackbox.yml" ''
    modules:
      http_2xx:
        prober: http
        timeout: 5s
        http:
          follow_redirects: true
          preferred_ip_protocol: ip4
      tcp_connect:
        prober: tcp
        timeout: 5s
  '';

  # Datasource references embedded in panels; the uids match the datasources
  # provisioned under services.grafana.provision below.
  prometheusDatasource = {
    type = "prometheus";
    uid = "prometheus";
  };

  lokiDatasource = {
    type = "loki";
    uid = "loki";
  };

  # Builds a panel constructor bound to `datasource`, whose `type` argument
  # defaults to `defaultType`. The constructor takes:
  #   id, title - panel identity
  #   type      - panel type (optional)
  #   x, y      - grid position
  #   w, h      - grid size, defaulting to 12 x 8
  #   targets   - list of query targets
  mkPanel =
    datasource: defaultType:
    {
      id,
      title,
      type ? defaultType,
      x,
      y,
      w ? 12,
      h ? 8,
      targets,
    }:
    {
      inherit
        id
        title
        type
        targets
        datasource
        ;
      gridPos = {
        inherit
          h
          w
          x
          y
          ;
      };
    };

  prometheusPanel = mkPanel prometheusDatasource "timeseries";
  lokiPanel = mkPanel lokiDatasource "logs";

  # Query target helpers (curried): refId, expression, and for Prometheus a
  # legend format string.
  promTarget = refId: expr: legendFormat: { inherit refId expr legendFormat; };
  lokiTarget = refId: expr: { inherit refId expr; };

  # The provisioned dashboard, serialised to JSON at evaluation time.
  dashboard = pkgs.writeText "noisebell-dashboard.json" (
    builtins.toJSON {
      uid = "noisebell";
      title = "Noisebell DO + Pi";
      tags = [
        "noisebell"
        "prometheus"
        "loki"
      ];
      timezone = "browser";
      schemaVersion = 39;
      version = 1;
      refresh = "30s";
      time = {
        from = "now-6h";
        to = "now";
      };
      panels = [
        # Row y=0: four stat panels.
        (prometheusPanel {
          id = 1;
          title = "Host Scrape Health (Prometheus)";
          type = "stat";
          x = 0;
          y = 0;
          w = 6;
          h = 6;
          targets = [
            (promTarget "A" "up{job=~\"noisebell-(do|pi)-node\"}" "{{host}}")
          ];
        })
        (prometheusPanel {
          id = 2;
          title = "Noisebell Service Health (Prometheus)";
          type = "stat";
          x = 6;
          y = 0;
          w = 6;
          h = 6;
          targets = [
            (promTarget "A" "up{job=~\"noisebell-(cache|pi-app|pi-relay)\"}" "{{job}}")
          ];
        })
        (prometheusPanel {
          id = 3;
          title = "Probe Health (Prometheus)";
          type = "stat";
          x = 12;
          y = 0;
          w = 6;
          h = 6;
          targets = [
            (promTarget "A" "probe_success{job=~\"noisebell-.*-probes\"}" "{{instance}}")
          ];
        })
        (prometheusPanel {
          id = 4;
          title = "Door State (Prometheus)";
          type = "stat";
          x = 18;
          y = 0;
          w = 6;
          h = 6;
          targets = [
            (promTarget "A" "noisebell_cache_status" "{{status}}")
          ];
        })

        # Row y=6: host resource usage.
        (prometheusPanel {
          id = 5;
          title = "CPU Used % (DO + Pi)";
          x = 0;
          y = 6;
          w = 8;
          targets = [
            (promTarget "A"
              "100 - (avg by (host) (rate(node_cpu_seconds_total{job=~\"noisebell-(do|pi)-node\",mode=\"idle\"}[5m])) * 100)"
              "{{host}}"
            )
          ];
        })
        (prometheusPanel {
          id = 6;
          title = "Memory Used % (DO + Pi)";
          x = 8;
          y = 6;
          w = 8;
          targets = [
            (promTarget "A"
              "100 * (1 - (node_memory_MemAvailable_bytes{job=~\"noisebell-(do|pi)-node\"} / node_memory_MemTotal_bytes{job=~\"noisebell-(do|pi)-node\"}))"
              "{{host}}"
            )
          ];
        })
        (prometheusPanel {
          id = 7;
          title = "Root Disk Used % (DO + Pi)";
          x = 16;
          y = 6;
          w = 8;
          targets = [
            (promTarget "A"
              "100 * (1 - (node_filesystem_avail_bytes{job=~\"noisebell-(do|pi)-node\",mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{job=~\"noisebell-(do|pi)-node\",mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}))"
              "{{host}}"
            )
          ];
        })

        # Row y=14: uptime, stack health, Pi hardware and Wi-Fi.
        (prometheusPanel {
          id = 8;
          title = "Host Uptime Hours (DO + Pi)";
          type = "stat";
          x = 0;
          y = 14;
          w = 6;
          h = 6;
          targets = [
            (promTarget "A"
              "(time() - node_boot_time_seconds{job=~\"noisebell-(do|pi)-node\"}) / 3600"
              "{{host}}"
            )
          ];
        })
        (prometheusPanel {
          id = 9;
          title = "Observability Stack Health (Prometheus)";
          type = "stat";
          x = 6;
          y = 14;
          w = 6;
          h = 6;
          targets = [
            (promTarget "A" "up{job=~\"observability-.*\"}" "{{service}}")
          ];
        })
        (prometheusPanel {
          id = 10;
          title = "Pi Hardware (Prometheus)";
          x = 12;
          y = 14;
          w = 6;
          h = 6;
          targets = [
            (promTarget "A" "noisebell_pi_temperature_celsius" "temp C")
            (promTarget "B" "noisebell_pi_throttled_flags" "throttled flags")
            (promTarget "C" "noisebell_pi_tailscale_running" "tailscale running")
          ];
        })
        (prometheusPanel {
          id = 11;
          title = "Pi Wi-Fi Signal (Prometheus)";
          x = 18;
          y = 14;
          w = 6;
          h = 6;
          targets = [
            (promTarget "A" "noisebell_pi_wifi_signal_dbm" "{{interface}} dBm")
            (promTarget "B" "noisebell_pi_wifi_link_quality" "{{interface}} link")
          ];
        })

        # Row y=20: cache polling and Pi webhook delivery.
        (prometheusPanel {
          id = 12;
          title = "Cache Poll Health (Prometheus)";
          x = 0;
          y = 20;
          targets = [
            (promTarget "A" "noisebell_cache_poll_consecutive_failures" "consecutive failures")
            (promTarget "B" "rate(noisebell_cache_poll_failure_total[5m])" "failure rate")
            (promTarget "C" "rate(noisebell_cache_poll_success_total[5m])" "success rate")
            (promTarget "D" "noisebell_cache_poll_last_duration_seconds" "last duration")
          ];
        })
        (prometheusPanel {
          id = 13;
          title = "Pi App Webhook Delivery (Prometheus)";
          x = 12;
          y = 20;
          targets = [
            (promTarget "A" "rate(noisebell_pi_notify_success_total[5m])" "success")
            (promTarget "B" "rate(noisebell_pi_notify_attempt_failure_total[5m])" "attempt failures")
            (promTarget "C" "rate(noisebell_pi_notify_failure_total[5m])" "final failures")
          ];
        })

        # Row y=28: details of the most recent poll and failure breakdown.
        (prometheusPanel {
          id = 14;
          title = "DO -> Pi Last Poll Details (Prometheus)";
          type = "stat";
          x = 0;
          y = 28;
          w = 12;
          targets = [
            (promTarget "A" "noisebell_cache_poll_last_result" "result {{result}}")
            (promTarget "B" "noisebell_cache_poll_last_http_status" "last HTTP status")
            (promTarget "C" "noisebell_cache_poll_last_duration_seconds" "last duration sec")
            (promTarget "D" "time() - noisebell_cache_poll_last_attempt_timestamp_seconds" "seconds since attempt")
            (promTarget "E" "time() - noisebell_cache_poll_last_success_timestamp_seconds" "seconds since success")
            (promTarget "F" "time() - noisebell_cache_poll_last_failure_timestamp_seconds" "seconds since failure")
          ];
        })
        (prometheusPanel {
          id = 15;
          title = "DO -> Pi Poll Failure Types (Prometheus)";
          x = 12;
          y = 28;
          targets = [
            (promTarget "A" "rate(noisebell_cache_poll_http_error_total[5m])" "http error")
            (promTarget "B" "rate(noisebell_cache_poll_request_timeout_total[5m])" "timeout")
            (promTarget "C" "rate(noisebell_cache_poll_request_connect_total[5m])" "connect")
            (promTarget "D" "rate(noisebell_cache_poll_request_other_total[5m])" "request other")
            (promTarget "E" "rate(noisebell_cache_poll_parse_failure_total[5m])" "parse")
          ];
        })

        # Row y=36: relay delivery and journal log rate.
        (prometheusPanel {
          id = 16;
          title = "Relay Delivery (Prometheus)";
          x = 0;
          y = 36;
          targets = [
            (promTarget "A" "rate(noisebell_relay_forwarded_total[5m])" "forwarded")
            (promTarget "B" "rate(noisebell_relay_attempt_failure_total[5m])" "attempt failures")
            (promTarget "C" "rate(noisebell_relay_failed_total[5m])" "final failures")
            (promTarget "D" "noisebell_relay_last_duration_seconds" "last duration")
          ];
        })
        (lokiPanel {
          id = 17;
          title = "Journal Log Rate (Loki, DO + Pi)";
          # Overrides the Loki default of "logs": this query yields a rate.
          type = "timeseries";
          x = 12;
          y = 36;
          targets = [
            # Written out by hand because lokiTarget carries no legendFormat.
            {
              refId = "A";
              expr = "sum by (host) (rate({job=\"journal\"}[5m]))";
              legendFormat = "{{host}}";
            }
          ];
        })

        # Row y=44: raw journal logs per host.
        (lokiPanel {
          id = 18;
          title = "DO Journal Logs (Loki)";
          x = 0;
          y = 44;
          targets = [
            (lokiTarget "A" "{job=\"journal\", host=\"noisebell-do\"}")
          ];
        })
        (lokiPanel {
          id = 19;
          title = "Pi Journal Logs (Loki)";
          x = 12;
          y = 44;
          targets = [
            (lokiTarget "A" "{job=\"journal\", host=\"noisebell-pi\"}")
          ];
        })
      ];
    }
  );

  # Directory handed to the Grafana file provider; contains only the dashboard.
  dashboardDir = pkgs.runCommand "noisebell-grafana-dashboards" { } ''
    mkdir -p "$out"
    cp ${dashboard} "$out/noisebell.json"
  '';

  # Relabelling for blackbox probe jobs: the configured target becomes the
  # `target` URL parameter and the `instance` label, and the scrape itself is
  # redirected to the local blackbox exporter.
  blackboxRelabels = [
    {
      source_labels = [ "__address__" ];
      target_label = "__param_target";
    }
    {
      source_labels = [ "__param_target" ];
      target_label = "instance";
    }
    {
      target_label = "__address__";
      replacement = "127.0.0.1:${toString blackboxExporterPort}";
    }
  ];

  # Scrape job for one component of the stack running on this host. The
  # `service` label is what the "Observability Stack Health" panel displays.
  selfScrape = service: port: {
    job_name = "observability-${service}";
    static_configs = [
      {
        targets = [ "127.0.0.1:${toString port}" ];
        labels = {
          host = "noisebell-do";
          inherit service;
        };
      }
    ];
  };
in
{
  services.prometheus = {
    enable = true;
    # NOTE(review): binds every interface; presumably reachability is limited
    # by a firewall or private network configured elsewhere - confirm.
    listenAddress = "0.0.0.0";
    port = prometheusPort;
    retentionTime = "7d";

    globalConfig = {
      scrape_interval = "15s";
      evaluation_interval = "15s";
    };

    exporters = {
      node = {
        enable = true;
        port = nodeExporterPort;
        enabledCollectors = [ "systemd" ];
      };
      blackbox = {
        enable = true;
        port = blackboxExporterPort;
        configFile = blackboxConfig;
      };
    };

    scrapeConfigs = [
      # Node exporters on both hosts.
      {
        job_name = "noisebell-do-node";
        static_configs = [
          {
            targets = [ "127.0.0.1:${toString nodeExporterPort}" ];
            labels.host = "noisebell-do";
          }
        ];
      }
      {
        job_name = "noisebell-pi-node";
        static_configs = [
          {
            targets = [ "noisebell-pi:${toString nodeExporterPort}" ];
            labels.host = "noisebell-pi";
          }
        ];
      }

      # Noisebell application metrics.
      {
        job_name = "noisebell-cache";
        metrics_path = "/metrics";
        static_configs = [
          {
            targets = [ "127.0.0.1:3000" ];
            labels.host = "noisebell-do";
          }
        ];
      }
      {
        job_name = "noisebell-pi-app";
        metrics_path = "/metrics";
        static_configs = [
          {
            targets = [ "noisebell-pi:80" ];
            labels.host = "noisebell-pi";
          }
        ];
      }
      {
        job_name = "noisebell-pi-relay";
        metrics_path = "/metrics";
        static_configs = [
          {
            targets = [ "noisebell-pi:8090" ];
            labels.host = "noisebell-pi";
          }
        ];
      }

      # The observability stack monitoring itself.
      (selfScrape "prometheus" prometheusPort)
      (selfScrape "loki" lokiPort)
      (selfScrape "grafana" grafanaPort)
      (selfScrape "alloy" alloyPort)
      (selfScrape "blackbox" blackboxExporterPort)

      # Blackbox probes; the targets are rewritten by blackboxRelabels.
      {
        job_name = "noisebell-http-probes";
        metrics_path = "/probe";
        params.module = [ "http_2xx" ];
        static_configs = [
          {
            targets = [
              "http://noisebell-pi/metrics"
              "http://noisebell-pi:8090/health"
              "https://noisebell.extremist.software/status"
            ];
          }
        ];
        relabel_configs = blackboxRelabels;
      }
      {
        job_name = "noisebell-tcp-probes";
        metrics_path = "/probe";
        params.module = [ "tcp_connect" ];
        static_configs = [
          {
            targets = [
              "noisebell-pi:22"
              "noisebell-pi:80"
              "noisebell-pi:8090"
            ];
          }
        ];
        relabel_configs = blackboxRelabels;
      }
    ];

    # Alert rules, as one YAML rule-file document.
    rules = [
      ''
        groups:
          - name: noisebell
            rules:
              - alert: NoisebellPiAppDown
                expr: up{job="noisebell-pi-app"} == 0
                for: 2m
                labels:
                  severity: page
                annotations:
                  summary: Noisebell Pi app metrics are down
              - alert: NoisebellPiNodeExporterDown
                expr: up{job="noisebell-pi-node"} == 0
                for: 2m
                labels:
                  severity: page
                annotations:
                  summary: Noisebell Pi node exporter is down
              - alert: NoisebellProbeFailed
                expr: probe_success{job=~"noisebell-.*-probes"} == 0
                for: 2m
                labels:
                  severity: page
                annotations:
                  summary: Noisebell probe failed for {{ $labels.instance }}
              - alert: NoisebellCachePollFailures
                expr: noisebell_cache_poll_consecutive_failures >= 3
                for: 1m
                labels:
                  severity: page
                annotations:
                  summary: Noisebell cache cannot poll the Pi
              - alert: NoisebellPiRecentlyRebooted
                expr: noisebell_pi_uptime_seconds < 300
                for: 30s
                labels:
                  severity: info
                annotations:
                  summary: Noisebell Pi rebooted recently
              - alert: NoisebellPiThrottled
                expr: noisebell_pi_throttled_flags > 0
                for: 1m
                labels:
                  severity: warning
                annotations:
                  summary: Noisebell Pi reports throttling flags
      ''
    ];
  };

  services.loki = {
    enable = true;
    configuration = {
      auth_enabled = false;

      server = {
        # NOTE(review): HTTP is bound on all interfaces with auth disabled,
        # presumably so another host can push logs - confirm that network
        # access is restricted elsewhere.
        http_listen_address = "0.0.0.0";
        http_listen_port = lokiPort;
        grpc_listen_address = "127.0.0.1";
        grpc_listen_port = 9096;
      };

      # Single-instance setup: in-memory ring and filesystem storage.
      common = {
        path_prefix = "/var/lib/loki";
        replication_factor = 1;
        ring.kvstore.store = "inmemory";
        storage.filesystem = {
          chunks_directory = "/var/lib/loki/chunks";
          rules_directory = "/var/lib/loki/rules";
        };
      };

      schema_config.configs = [
        {
          from = "2024-01-01";
          store = "tsdb";
          object_store = "filesystem";
          schema = "v13";
          index = {
            prefix = "index_";
            period = "24h";
          };
        }
      ];

      # 168h = 7 days, the same span as the Prometheus retentionTime above.
      limits_config = {
        retention_period = "168h";
        reject_old_samples = true;
        reject_old_samples_max_age = "168h";
      };

      compactor = {
        working_directory = "/var/lib/loki/compactor";
        compaction_interval = "10m";
        retention_enabled = true;
        retention_delete_delay = "2h";
        retention_delete_worker_count = 1;
        delete_request_store = "filesystem";
      };
    };
  };

  services.alloy = {
    enable = true;
    extraFlags = [
      "--server.http.listen-addr=127.0.0.1:${toString alloyPort}"
    ];
  };

  # Alloy pipeline: read the systemd journal, label it, push it to local Loki.
  environment.etc."alloy/config.alloy".text = ''
    loki.write "local" {
      endpoint {
        url = "http://127.0.0.1:${toString lokiPort}/loki/api/v1/push"
      }
    }

    loki.source.journal "system" {
      max_age = "12h"
      labels = {
        job = "journal",
        host = "noisebell-do",
      }
      forward_to = [loki.write.local.receiver]
    }
  '';

  # Order Alloy after Loki and pull Loki in when Alloy is started.
  systemd.services.alloy = {
    after = [ "loki.service" ];
    wants = [ "loki.service" ];
  };

  services.grafana = {
    enable = true;

    settings = {
      server = {
        http_addr = "0.0.0.0";
        http_port = grafanaPort;
        domain = "noisebell-do";
        root_url = "http://noisebell-do:${toString grafanaPort}/";
      };

      analytics.reporting_enabled = false;
      # Exposes Grafana's own metrics for the observability-grafana job.
      metrics.enabled = true;

      security = {
        # Read from a file at runtime; the file is created by the preStart
        # snippet at the bottom of this module.
        secret_key = "$__file{/var/lib/grafana/secret_key}";
        disable_initial_admin_creation = true;
      };

      # Login form off, sign-up off, anonymous users get the Viewer role.
      auth.disable_login_form = true;
      users.allow_sign_up = false;
      "auth.anonymous" = {
        enabled = true;
        org_role = "Viewer";
      };
    };

    provision = {
      enable = true;

      datasources.settings = {
        apiVersion = 1;
        prune = true;
        datasources = [
          {
            name = "Prometheus";
            uid = "prometheus";
            type = "prometheus";
            access = "proxy";
            url = "http://127.0.0.1:${toString prometheusPort}";
            isDefault = true;
            editable = false;
          }
          {
            name = "Loki";
            uid = "loki";
            type = "loki";
            access = "proxy";
            url = "http://127.0.0.1:${toString lokiPort}";
            editable = false;
          }
        ];
      };

      dashboards.settings = {
        apiVersion = 1;
        providers = [
          {
            name = "Noisebell";
            type = "file";
            allowUiUpdates = false;
            options.path = dashboardDir;
          }
        ];
      };
    };
  };

  # Create the Grafana secret key if the file is missing or empty (`-s`).
  # umask 077 makes the new file owner-only; mkBefore places this snippet
  # ahead of other preStart definitions.
  systemd.services.grafana.preStart = lib.mkBefore ''
    if [ ! -s /var/lib/grafana/secret_key ]; then
      umask 077
      ${pkgs.coreutils}/bin/head -c 64 /dev/urandom | ${pkgs.coreutils}/bin/base64 --wrap=0 > /var/lib/grafana/secret_key
    fi
  '';
}