# NixOS module: Prometheus server plus the exporters it scrapes
# (node, blackbox, memcached, PHP-FPM) for the primary wiki host, and scrape
# jobs for the replica host's exporters.
{ config, pkgs, lib, ... }:

let
  # PHP-FPM exporter wrapper to handle the semicolon-in-URI escaping issue.
  # The scrape URI carries ";/fpm-status" after the socket path; putting the
  # command in a shell script lets the URI be single-quoted as one argument.
  #
  # The executable is resolved with lib.getExe rather than a hard-coded
  # "bin/php-fpm-exporter" path: the packaged binary is named with an
  # underscore (php-fpm_exporter), so the hyphenated path does not exist.
  phpfpmExporterScript = pkgs.writeShellScript "phpfpm-exporter-wrapper" ''
    exec ${lib.getExe pkgs.prometheus-php-fpm-exporter} server \
      --phpfpm.scrape-uri 'unix:///run/phpfpm/mediawiki.sock;/fpm-status' \
      --web.listen-address ':9253'
  '';
in
{
  services.prometheus = {
    enable = true;
    port = 9090;
    listenAddress = "127.0.0.1";
    retentionTime = "90d";

    extraFlags = [
      "--storage.tsdb.max-block-duration=2h"
      "--storage.tsdb.retention.size=5GB"
    ];

    globalConfig = {
      scrape_interval = "15s";
      evaluation_interval = "15s";
    };

    scrapeConfigs = [
      # ── Primary system metrics ──
      {
        job_name = "node";
        static_configs = [{
          targets = [ "localhost:9100" ];
          labels = { instance = "wiki"; };
        }];
      }

      # ── Replica system metrics (over Tailscale) ──
      {
        job_name = "node-replica";
        static_configs = [{
          targets = [ "wiki-replica:9100" ];
          labels = { instance = "wiki-replica"; };
        }];
      }

      # ── Primary MariaDB ──
      # Queries/s, connections, buffer pool hit ratio, slow queries,
      # binlog position, table locks, InnoDB row operations
      {
        job_name = "mysqld";
        static_configs = [{
          targets = [ "localhost:9104" ];
          labels = { instance = "wiki"; };
        }];
      }

      # ── Replica MariaDB (over Tailscale) ──
      # Replication lag (Seconds_Behind_Master), IO/SQL thread status,
      # relay log position, read-only query volume
      {
        job_name = "mysqld-replica";
        static_configs = [{
          targets = [ "wiki-replica:9104" ];
          labels = { instance = "wiki-replica"; };
        }];
      }

      # ── Primary Caddy ──
      # Requests/s by status code (2xx, 3xx, 4xx, 5xx), response latency
      # histograms, active connections, bytes in/out
      {
        job_name = "caddy";
        static_configs = [{
          targets = [ "localhost:2019" ];
          labels = { instance = "wiki"; };
        }];
      }

      # ── Replica Caddy (over Tailscale) ──
      {
        job_name = "caddy-replica";
        static_configs = [{
          targets = [ "wiki-replica:2019" ];
          labels = { instance = "wiki-replica"; };
        }];
      }

      # ── Primary PHP-FPM ──
      # Active/idle/total workers, accepted connections, request duration,
      # slow requests, max_children reached count
      {
        job_name = "phpfpm";
        static_configs = [{
          targets = [ "localhost:9253" ];
          labels = { instance = "wiki"; };
        }];
      }

      # ── Primary memcached ──
      # Hit rate, miss rate, evictions, current items, bytes used/limit,
      # connections, get/set/delete rates
      {
        job_name = "memcached";
        static_configs = [{
          targets = [ "localhost:9150" ];
          labels = { instance = "wiki"; };
        }];
      }

      # ── Replica memcached (over Tailscale) ──
      {
        job_name = "memcached-replica";
        static_configs = [{
          targets = [ "wiki-replica:9150" ];
          labels = { instance = "wiki-replica"; };
        }];
      }

      # ── Blackbox HTTP probes ──
      # End-to-end: DNS resolution time, TCP connect, TLS handshake,
      # HTTP response time, status code, TLS cert expiry
      {
        job_name = "blackbox-http";
        metrics_path = "/probe";
        params = { module = [ "http_2xx" ]; };
        static_configs = [{
          targets = [
            # Primary wiki
            "https://www.noisebridge.net"
            "https://www.noisebridge.net/wiki/Main_Page"
            "https://www.noisebridge.net/health"
            # Replica wiki
            "https://readonly.noisebridge.net"
            "https://readonly.noisebridge.net/wiki/Main_Page"
            "https://readonly.noisebridge.net/health"
            # Grafana
            "https://grafana.noisebridge.net"
          ];
        }];
        relabel_configs = [
          # Pass the listed URL to the exporter as the ?target= parameter.
          {
            source_labels = [ "__address__" ];
            target_label = "__param_target";
          }
          # Keep the probed URL as the series' instance label.
          {
            source_labels = [ "__param_target" ];
            target_label = "instance";
          }
          # Send the actual scrape to the local blackbox exporter (port 9115
          # below) instead of to the probed URL itself.
          {
            target_label = "__address__";
            replacement = "localhost:9115";
          }
        ];
      }

      # ── Grafana internal metrics ──
      {
        job_name = "grafana";
        static_configs = [{
          targets = [ "localhost:3000" ];
        }];
      }
    ];
  };

  # ── Node exporter ──
  # System-level: CPU, RAM, disk I/O, filesystem usage, network traffic,
  # systemd unit states, plus custom textfile metrics from the backup script
  services.prometheus.exporters.node = {
    enable = true;
    port = 9100;
    enabledCollectors = [
      "cpu"
      "diskstats"
      "filesystem"
      "loadavg"
      "meminfo"
      "netdev"
      "stat"
      "time"
      "vmstat"
      "systemd"
      "textfile"
    ];
    extraFlags = [
      "--collector.textfile.directory=/var/lib/prometheus-node-exporter/textfile"
    ];
  };

  # ── Blackbox exporter ──
  # Makes actual HTTP requests and reports: probe success/failure, response
  # time broken into phases (DNS, connect, TLS, processing, transfer),
  # HTTP status code, TLS certificate expiry date
  services.prometheus.exporters.blackbox = {
    enable = true;
    port = 9115;
    # The module set is serialized with builtins.toJSON and written to a file
    # named blackbox.yml; the http_2xx module name matches the `module` param
    # of the blackbox-http scrape job above.
    configFile = pkgs.writeText "blackbox.yml" (builtins.toJSON {
      modules = {
        http_2xx = {
          prober = "http";
          timeout = "10s";
          http = {
            valid_http_versions = [ "HTTP/1.1" "HTTP/2.0" ];
            valid_status_codes = [ 200 ];
            method = "GET";
            follow_redirects = true;
            preferred_ip_protocol = "ip4";
          };
        };
      };
    });
  };

  # ── Memcached exporter ──
  # Exposes: cmd_get, cmd_set, get_hits, get_misses (→ hit ratio),
  # evictions, curr_items, bytes (used), limit_maxbytes,
  # curr_connections, total_connections
  services.prometheus.exporters.memcached = {
    enable = true;
    port = 9150;
    extraFlags = [
      "--memcached.address=localhost:11211"
    ];
  };

  # ── PHP-FPM exporter ──
  # Exposes: active_processes, idle_processes, total_processes,
  # accepted_conn, listen_queue, max_listen_queue,
  # slow_requests, max_children_reached
  # Uses a wrapper script to handle the semicolon in the scrape URI
  systemd.services.prometheus-phpfpm-exporter = {
    description = "Prometheus PHP-FPM exporter";
    after = [ "phpfpm-mediawiki.service" ];
    wantedBy = [ "multi-user.target" ];
    serviceConfig = {
      ExecStart = phpfpmExporterScript;
      # NOTE(review): presumably runs as mediawiki so it can open the
      # PHP-FPM socket under /run/phpfpm — confirm against the pool's
      # listen.owner / listen.group settings.
      User = "mediawiki";
      Group = "mediawiki";
      Restart = "always";
      RestartSec = "5s";
    };
  };

  # Textfile collector directory for backup and sync metrics
  systemd.tmpfiles.rules = [
    "d /var/lib/prometheus-node-exporter/textfile 0755 root root -"
  ];
}