# NixOS module: Prometheus alerting rules for the wiki stack plus the local
# Alertmanager instance that routes every alert to the "discord" receiver.
#
# The rule groups are written as a Nix attrset and serialised with
# builtins.toJSON into the single string entry of services.prometheus.rules.
{ config, pkgs, lib, ... }:
{
  services.prometheus = {
    # Prometheus pushes alerts to the Alertmanager configured further down;
    # the port here must stay in sync with services.prometheus.alertmanager.port.
    alertmanagers = [
      { static_configs = [ { targets = [ "localhost:9093" ]; } ]; }
    ];

    rules = [
      (builtins.toJSON {
        groups = [
          # ---- External reachability, HTTP health and TLS ------------------
          {
            name = "wiki-availability";
            rules = [
              {
                alert = "WikiDown";
                expr = ''probe_success{job="blackbox-http",instance=~".*www.noisebridge.net.*"} == 0'';
                "for" = "2m";
                labels.severity = "critical";
                annotations = {
                  summary = "Primary wiki is unreachable";
                  description = "{{ $labels.instance }} has been down for more than 2 minutes.";
                };
              }
              {
                alert = "ReplicaDown";
                expr = ''probe_success{job="blackbox-http",instance=~".*readonly.noisebridge.net.*"} == 0'';
                "for" = "5m";
                labels.severity = "warning";
                annotations = {
                  summary = "Replica wiki is unreachable";
                  description = "{{ $labels.instance }} has been down for more than 5 minutes.";
                };
              }
              {
                alert = "HighErrorRate";
                # Share of 5xx responses per instance over the last 5 minutes.
                # NOTE(review): confirm caddy_http_responses_total is actually
                # exported by the running Caddy version.
                expr = ''sum(rate(caddy_http_responses_total{code=~"5.."}[5m])) by (instance) / sum(rate(caddy_http_responses_total[5m])) by (instance) > 0.05'';
                "for" = "5m";
                labels.severity = "critical";
                annotations = {
                  summary = "High HTTP 5xx error rate on {{ $labels.instance }}";
                  description = "More than 5% of requests are returning server errors.";
                };
              }
              {
                alert = "HighLatency";
                expr = ''histogram_quantile(0.95, sum(rate(caddy_http_request_duration_seconds_bucket[5m])) by (le, instance)) > 2'';
                "for" = "5m";
                labels.severity = "warning";
                annotations = {
                  summary = "High p95 latency on {{ $labels.instance }}";
                  description = "95th percentile response time is {{ $value | humanizeDuration }}.";
                };
              }
              {
                alert = "TLSCertExpiringSoon";
                # $value is the number of seconds left until expiry.
                expr = ''probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time() < 7 * 86400'';
                "for" = "1h";
                labels.severity = "warning";
                annotations = {
                  summary = "TLS certificate expiring within 7 days";
                  description = "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}.";
                };
              }
            ];
          }

          # ---- Host-level resources (node_exporter metrics) ----------------
          {
            name = "wiki-infrastructure";
            rules = [
              {
                alert = "DiskFull";
                # $value is the FREE fraction of the root filesystem.
                # `''\n` is the indented-string escape for a newline; it keeps
                # the rendered expression identical to the previous revision.
                expr = ''(node_filesystem_avail_bytes{mountpoint="/"} / ''\nnode_filesystem_size_bytes{mountpoint="/"}) < 0.15'';
                "for" = "5m";
                labels.severity = "warning";
                annotations = {
                  summary = "Disk usage above 85% on {{ $labels.instance }}";
                  description = "Root filesystem is {{ $value | humanizePercentage }} free.";
                };
              }
              {
                alert = "DiskCritical";
                expr = ''(node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.05'';
                "for" = "2m";
                labels.severity = "critical";
                annotations = {
                  summary = "Disk almost full on {{ $labels.instance }}";
                  description = "Root filesystem is {{ $value | humanizePercentage }} free. Immediate action required.";
                };
              }
              {
                alert = "HighMemoryUsage";
                # $value is the USED fraction (1 - available/total), so the
                # description must report usage, not available memory.
                expr = ''(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.9'';
                "for" = "5m";
                labels.severity = "warning";
                annotations = {
                  summary = "Memory usage above 90% on {{ $labels.instance }}";
                  description = "Memory usage is {{ $value | humanizePercentage }} of total.";
                };
              }
              {
                alert = "HighCPU";
                expr = ''1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.85'';
                "for" = "10m";
                labels.severity = "warning";
                annotations = {
                  summary = "Sustained high CPU on {{ $labels.instance }}";
                  description = "CPU usage has been above 85% for 10 minutes.";
                };
              }
              {
                alert = "SystemdUnitFailed";
                expr = ''node_systemd_unit_state{state="failed"} == 1'';
                "for" = "5m";
                labels.severity = "warning";
                annotations = {
                  summary = "Systemd unit failed on {{ $labels.instance }}";
                  description = "Unit {{ $labels.name }} is in failed state.";
                };
              }
            ];
          }

          # ---- MySQL primary / replica --------------------------------------
          {
            name = "wiki-database";
            rules = [
              {
                alert = "ReplicationBroken";
                expr = ''mysql_slave_status_slave_io_running{instance="wiki-replica"} == 0 or mysql_slave_status_slave_sql_running{instance="wiki-replica"} == 0'';
                "for" = "2m";
                labels.severity = "critical";
                annotations = {
                  summary = "MySQL replication thread stopped";
                  description = "Replication IO or SQL thread is not running on the replica.";
                };
              }
              {
                alert = "ReplicationLagging";
                expr = ''mysql_slave_status_seconds_behind_master{instance="wiki-replica"} > 300'';
                "for" = "5m";
                labels.severity = "warning";
                annotations = {
                  summary = "MySQL replication lagging";
                  description = "Replica is {{ $value }}s behind the primary.";
                };
              }
              {
                alert = "MySQLConnectionsExhausted";
                expr = ''mysql_global_status_threads_connected / mysql_global_variables_max_connections > 0.8'';
                "for" = "5m";
                labels.severity = "warning";
                annotations = {
                  summary = "MySQL connections above 80% on {{ $labels.instance }}";
                  description = "{{ $value | humanizePercentage }} of max connections in use.";
                };
              }
              {
                alert = "MySQLSlowQueries";
                expr = ''rate(mysql_global_status_slow_queries[5m]) > 0.1'';
                "for" = "10m";
                labels.severity = "warning";
                annotations = {
                  summary = "Elevated slow queries on {{ $labels.instance }}";
                  description = "{{ $value }} slow queries per second over the last 5 minutes.";
                };
              }
            ];
          }

          # ---- PHP-FPM and memcached ----------------------------------------
          {
            name = "wiki-application";
            rules = [
              {
                alert = "PHPFPMExhausted";
                expr = ''phpfpm_active_processes >= phpfpm_total_processes'';
                "for" = "1m";
                labels.severity = "warning";
                annotations = {
                  summary = "PHP-FPM workers exhausted";
                  description = "All PHP-FPM workers are active — requests may be queuing.";
                };
              }
              {
                alert = "PHPFPMDown";
                expr = ''up{job="phpfpm"} == 0'';
                "for" = "1m";
                labels.severity = "critical";
                annotations = {
                  summary = "PHP-FPM exporter is down";
                  description = "Cannot scrape PHP-FPM metrics — the PHP-FPM process may be dead.";
                };
              }
              {
                alert = "MemcachedDown";
                expr = ''up{job=~"memcached.*"} == 0'';
                "for" = "2m";
                labels.severity = "critical";
                annotations = {
                  summary = "Memcached is down on {{ $labels.instance }}";
                  description = "The memcached exporter is unreachable. \nMediaWiki will fall back to database queries and be slow.";
                };
              }
              {
                alert = "MemcachedEvictions";
                expr = ''rate(memcached_items_evicted_total[5m]) > 10'';
                "for" = "10m";
                labels.severity = "warning";
                annotations = {
                  summary = "High memcached eviction rate on {{ $labels.instance }}";
                  description = "{{ $value }} evictions/sec — cache is too small, consider increasing maxMemory.";
                };
              }
              {
                alert = "MemcachedHitRateLow";
                # Both sides are aggregated per instance before dividing.
                # Without the aggregation, one-to-one vector matching pairs
                # series on every label (including `status`), so only the
                # status="hit" series match each other and the ratio is
                # always 1 — the alert could never fire.
                expr = ''sum by (instance) (rate(memcached_commands_total{command="get",status="hit"}[5m])) / sum by (instance) (rate(memcached_commands_total{command="get"}[5m])) < 0.8'';
                "for" = "15m";
                labels.severity = "warning";
                annotations = {
                  summary = "Low memcached hit rate on {{ $labels.instance }}";
                  description = "Cache hit rate is {{ $value | humanizePercentage }}. Pages may be slow.";
                };
              }
            ];
          }

          # ---- Backups and image sync freshness -----------------------------
          {
            name = "wiki-backups";
            rules = [
              {
                alert = "BackupStale";
                expr = ''(time() - backup_latest_timestamp_seconds) > 86400'';
                "for" = "1h";
                labels.severity = "warning";
                annotations = {
                  summary = "Wiki backup is stale";
                  description = "Last successful backup was more than 24 hours ago.";
                };
              }
              {
                alert = "BackupFailed";
                expr = ''backup_b2_sync_success != 1'';
                "for" = "10m";
                labels.severity = "critical";
                annotations = {
                  summary = "B2 backup sync failed";
                  description = "The last rclone sync to Backblaze B2 did not succeed.";
                };
              }
              {
                alert = "ImageSyncStale";
                expr = ''(time() - imagesync_latest_timestamp_seconds) > 7200'';
                "for" = "30m";
                labels.severity = "warning";
                annotations = {
                  summary = "Image sync to replica is stale";
                  description = "Last successful image sync was more than 2 hours ago. \nReplica may have broken image links.";
                };
              }
            ];
          }
        ];
      })
    ];
  };

  # Local Alertmanager, bound to loopback only; Prometheus reaches it through
  # the `alertmanagers` target above.
  services.prometheus.alertmanager = {
    enable = true;
    port = 9093;
    listenAddress = "127.0.0.1";
    configuration = {
      route = {
        receiver = "discord";
        group_by = [ "alertname" "instance" ];
        group_wait = "30s";
        group_interval = "5m";
        repeat_interval = "4h";
      };
      receivers = [
        {
          name = "discord";
          # The webhook URL is read from the decrypted agenix secret at
          # runtime instead of being embedded in the configuration.
          # NOTE(review): webhook_configs posts Alertmanager's generic webhook
          # JSON; confirm the endpoint stored in the secret accepts that
          # payload (a native Discord webhook URL presumably needs
          # discord_configs instead).
          webhook_configs = [
            { url_file = config.age.secrets.discord-webhook.path; }
          ];
        }
      ];
    };
  };

  # Encrypted webhook URL, handed to the "alertmanager" user and group.
  # NOTE(review): assumes an "alertmanager" user/group exists on the host —
  # it is not declared in the visible part of this module; confirm.
  age.secrets.discord-webhook = {
    file = ../../secrets/discord-webhook.age;
    owner = "alertmanager";
    group = "alertmanager";
  };
}