noisebridge-wiki/modules/wiki-primary/alerting.nix
Jet 8cfede9f57
Some checks failed
CI / check (push) Has been cancelled
CI / deploy (push) Has been cancelled
feat: init
2026-03-17 04:07:44 -07:00

291 lines
12 KiB
Nix

{ config, pkgs, lib, ... }:
{
services.prometheus = {
# Alertmanager endpoints that Prometheus delivers fired alerts to. The single
# static target is port 9093 on localhost, the same port that
# services.prometheus.alertmanager is configured with further down this file.
alertmanagers = [
  {
    static_configs = [
      { targets = [ "localhost:9093" ]; }
    ];
  }
];
rules = [
(builtins.toJSON {
groups = [
{
  # Rule group covering endpoint reachability, HTTP error rate, latency and
  # TLS certificate lifetime.
  name = "wiki-availability";
  rules = [
    # Fires when the blackbox HTTP probe for the www host has reported
    # probe_success == 0 continuously for 2 minutes.
    {
      alert = "WikiDown";
      expr = ''probe_success{job="blackbox-http",instance=~".*www.noisebridge.net.*"} == 0'';
      "for" = "2m";
      labels = { severity = "critical"; };
      annotations.summary = "Primary wiki is unreachable";
      annotations.description = "{{ $labels.instance }} has been down for more than 2 minutes.";
    }

    # Same probe check for the readonly host, with a longer 5 minute hold and
    # a lower severity.
    {
      alert = "ReplicaDown";
      expr = ''probe_success{job="blackbox-http",instance=~".*readonly.noisebridge.net.*"} == 0'';
      "for" = "5m";
      labels = { severity = "warning"; };
      annotations.summary = "Replica wiki is unreachable";
      annotations.description = "{{ $labels.instance }} has been down for more than 5 minutes.";
    }

    # Per-instance ratio of 5xx responses to all responses over a 5 minute
    # rate window; fires above 5%.
    # NOTE(review): confirm the scraped Caddy version exposes
    # caddy_http_responses_total with a `code` label.
    {
      alert = "HighErrorRate";
      expr = ''sum(rate(caddy_http_responses_total{code=~"5.."}[5m])) by (instance) / sum(rate(caddy_http_responses_total[5m])) by (instance) > 0.05'';
      "for" = "5m";
      labels = { severity = "critical"; };
      annotations.summary = "High HTTP 5xx error rate on {{ $labels.instance }}";
      annotations.description = "More than 5% of requests are returning server errors.";
    }

    # Per-instance 95th percentile of the request-duration histogram; fires
    # when it exceeds 2 (the metric name indicates seconds).
    {
      alert = "HighLatency";
      expr = ''histogram_quantile(0.95, sum(rate(caddy_http_request_duration_seconds_bucket[5m])) by (le, instance)) > 2'';
      "for" = "5m";
      labels = { severity = "warning"; };
      annotations.summary = "High p95 latency on {{ $labels.instance }}";
      annotations.description = "95th percentile response time is {{ $value | humanizeDuration }}.";
    }

    # Remaining certificate lifetime (expiry timestamp minus now) below
    # 7 * 86400 seconds, sustained for an hour.
    {
      alert = "TLSCertExpiringSoon";
      expr = ''probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time() < 7 * 86400'';
      "for" = "1h";
      labels = { severity = "warning"; };
      annotations.summary = "TLS certificate expiring within 7 days";
      annotations.description = "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}.";
    }
  ];
}
{
  # Rule group for host-level resources: root filesystem space, memory, CPU
  # and failed systemd units.
  name = "wiki-infrastructure";
  rules = [
    # Free fraction of the "/" filesystem below 15%. $value is the FREE
    # fraction, which is why the description reports it as "free".
    {
      alert = "DiskFull";
      expr = ''(node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15'';
      "for" = "5m";
      labels.severity = "warning";
      annotations = {
        summary = "Disk usage above 85% on {{ $labels.instance }}";
        description = "Root filesystem is {{ $value | humanizePercentage }} free.";
      };
    }

    # Same ratio with a tighter 5% threshold, a shorter hold and critical
    # severity.
    {
      alert = "DiskCritical";
      expr = ''(node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.05'';
      "for" = "2m";
      labels.severity = "critical";
      annotations = {
        summary = "Disk almost full on {{ $labels.instance }}";
        description = "Root filesystem is {{ $value | humanizePercentage }} free. Immediate action required.";
      };
    }

    # The expression evaluates to 1 - available/total, i.e. the USED fraction,
    # so $value must be described as usage. The previous wording called it
    # "available memory", which reported the inverse of the real situation.
    {
      alert = "HighMemoryUsage";
      expr = ''(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.9'';
      "for" = "5m";
      labels.severity = "warning";
      annotations = {
        summary = "Memory usage above 90% on {{ $labels.instance }}";
        description = "Memory usage is {{ $value | humanizePercentage }} of total.";
      };
    }

    # 1 minus the per-instance average idle rate across CPUs; fires when that
    # busy fraction stays above 0.85 for 10 minutes.
    {
      alert = "HighCPU";
      expr = ''1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.85'';
      "for" = "10m";
      labels.severity = "warning";
      annotations = {
        summary = "Sustained high CPU on {{ $labels.instance }}";
        description = "CPU usage has been above 85% for 10 minutes.";
      };
    }

    # Any unit whose state="failed" series equals 1 for 5 minutes; the unit is
    # taken from the `name` label.
    {
      alert = "SystemdUnitFailed";
      expr = ''node_systemd_unit_state{state="failed"} == 1'';
      "for" = "5m";
      labels.severity = "warning";
      annotations = {
        summary = "Systemd unit failed on {{ $labels.instance }}";
        description = "Unit {{ $labels.name }} is in failed state.";
      };
    }
  ];
}
{
  # Rule group for MySQL: replication thread state and lag on the
  # instance labelled "wiki-replica", plus connection and slow-query pressure.
  name = "wiki-database";
  rules = [
    # Either the IO-running or the SQL-running gauge for wiki-replica is 0.
    {
      alert = "ReplicationBroken";
      expr = ''mysql_slave_status_slave_io_running{instance="wiki-replica"} == 0 or mysql_slave_status_slave_sql_running{instance="wiki-replica"} == 0'';
      "for" = "2m";
      labels = { severity = "critical"; };
      annotations.summary = "MySQL replication thread stopped";
      annotations.description = "Replication IO or SQL thread is not running on the replica.";
    }

    # seconds_behind_master on wiki-replica above 300 for 5 minutes.
    {
      alert = "ReplicationLagging";
      expr = ''mysql_slave_status_seconds_behind_master{instance="wiki-replica"} > 300'';
      "for" = "5m";
      labels = { severity = "warning"; };
      annotations.summary = "MySQL replication lagging";
      annotations.description = "Replica is {{ $value }}s behind the primary.";
    }

    # Connected threads divided by max_connections above 0.8; despite the
    # alert name this is an early warning at 80%, not full exhaustion.
    {
      alert = "MySQLConnectionsExhausted";
      expr = ''mysql_global_status_threads_connected / mysql_global_variables_max_connections > 0.8'';
      "for" = "5m";
      labels = { severity = "warning"; };
      annotations.summary = "MySQL connections above 80% on {{ $labels.instance }}";
      annotations.description = "{{ $value | humanizePercentage }} of max connections in use.";
    }

    # Per-second rate of the slow-query counter (5 minute window) above 0.1
    # for 10 minutes.
    {
      alert = "MySQLSlowQueries";
      expr = ''rate(mysql_global_status_slow_queries[5m]) > 0.1'';
      "for" = "10m";
      labels = { severity = "warning"; };
      annotations.summary = "Elevated slow queries on {{ $labels.instance }}";
      annotations.description = "{{ $value }} slow queries per second over the last 5 minutes.";
    }
  ];
}
{
  # Rule group for the application tier: PHP-FPM worker saturation and
  # exporter health, and memcached availability, evictions and hit rate.
  name = "wiki-application";
  rules = [
    # Active worker count has reached the total worker count for 1 minute.
    {
      alert = "PHPFPMExhausted";
      expr = ''phpfpm_active_processes >= phpfpm_total_processes'';
      "for" = "1m";
      labels.severity = "warning";
      annotations = {
        summary = "PHP-FPM workers exhausted";
        description = "All PHP-FPM workers are active; requests may be queuing.";
      };
    }

    # The scrape target of job "phpfpm" reports up == 0 for 1 minute.
    {
      alert = "PHPFPMDown";
      expr = ''up{job="phpfpm"} == 0'';
      "for" = "1m";
      labels.severity = "critical";
      annotations = {
        summary = "PHP-FPM exporter is down";
        description = "Cannot scrape PHP-FPM metrics; the PHP-FPM process may be dead.";
      };
    }

    # Any scrape job whose name starts with "memcached" reports up == 0.
    {
      alert = "MemcachedDown";
      expr = ''up{job=~"memcached.*"} == 0'';
      "for" = "2m";
      labels.severity = "critical";
      annotations = {
        summary = "Memcached is down on {{ $labels.instance }}";
        description = "The memcached exporter is unreachable. MediaWiki will fall back to database queries and be slow.";
      };
    }

    # Eviction counter rate above 10 per second for 10 minutes.
    {
      alert = "MemcachedEvictions";
      expr = ''rate(memcached_items_evicted_total[5m]) > 10'';
      "for" = "10m";
      labels.severity = "warning";
      annotations = {
        summary = "High memcached eviction rate on {{ $labels.instance }}";
        description = "{{ $value }} evictions/sec; cache is too small, consider increasing maxMemory.";
      };
    }

    # Hit ratio = get hits / all gets. Both sides are aggregated per
    # (instance, job) before dividing: without aggregation the division uses
    # one-to-one label matching, which pairs the status="hit" series only with
    # itself on the right-hand side, so the ratio is always 1 and the alert
    # can never fire.
    {
      alert = "MemcachedHitRateLow";
      expr = ''sum by (instance, job) (rate(memcached_commands_total{command="get",status="hit"}[5m])) / sum by (instance, job) (rate(memcached_commands_total{command="get"}[5m])) < 0.8'';
      "for" = "15m";
      labels.severity = "warning";
      annotations = {
        summary = "Low memcached hit rate on {{ $labels.instance }}";
        description = "Cache hit rate is {{ $value | humanizePercentage }}. Pages may be slow.";
      };
    }
  ];
}
{
  # Rule group for backup and image-sync freshness, driven by timestamp and
  # success gauges.
  # NOTE(review): these comparisons produce no result when the metric is
  # absent, so a job that never reported will not alert — confirm that is
  # acceptable.
  name = "wiki-backups";
  rules = [
    # Age of backup_latest_timestamp_seconds above 86400 seconds, held 1 hour.
    {
      alert = "BackupStale";
      expr = ''(time() - backup_latest_timestamp_seconds) > 86400'';
      "for" = "1h";
      labels = { severity = "warning"; };
      annotations.summary = "Wiki backup is stale";
      annotations.description = "Last successful backup was more than 24 hours ago.";
    }

    # backup_b2_sync_success is anything other than 1 for 10 minutes.
    {
      alert = "BackupFailed";
      expr = ''backup_b2_sync_success != 1'';
      "for" = "10m";
      labels = { severity = "critical"; };
      annotations.summary = "B2 backup sync failed";
      annotations.description = "The last rclone sync to Backblaze B2 did not succeed.";
    }

    # Age of imagesync_latest_timestamp_seconds above 7200 seconds, held
    # 30 minutes.
    {
      alert = "ImageSyncStale";
      expr = ''(time() - imagesync_latest_timestamp_seconds) > 7200'';
      "for" = "30m";
      labels = { severity = "warning"; };
      annotations.summary = "Image sync to replica is stale";
      annotations.description = "Last successful image sync was more than 2 hours ago. Replica may have broken image links.";
    }
  ];
}
];
})
];
};
# Alertmanager instance bound to the loopback address on port 9093. All alerts
# go to one receiver; the notification URL is read from a secret file rather
# than being written into the configuration.
services.prometheus.alertmanager = {
  enable = true;
  listenAddress = "127.0.0.1";
  port = 9093;

  # Single catch-all route: alerts are grouped by name and instance.
  configuration.route = {
    receiver = "discord";
    group_by = [ "alertname" "instance" ];
    group_wait = "30s";
    group_interval = "5m";
    repeat_interval = "4h";
  };

  # NOTE(review): this receiver is named "discord" but uses the generic
  # webhook integration, which posts Alertmanager's own JSON payload. Confirm
  # the URL in the secret points at something that accepts that format;
  # otherwise the dedicated discord_configs integration may be what is wanted.
  configuration.receivers = [
    {
      name = "discord";
      webhook_configs = [
        { url_file = config.age.secrets.discord-webhook.path; }
      ];
    }
  ];
};
# Encrypted secret holding the webhook URL; its decrypted path is what the
# Alertmanager receiver's url_file points at. Ownership is assigned to the
# "alertmanager" user and group.
# NOTE(review): confirm an "alertmanager" user/group actually exists on this
# host (e.g. the service may run under a systemd dynamic user) — verify.
age.secrets.discord-webhook =
  let
    secretOwner = "alertmanager";
  in
  {
    file = ../../secrets/discord-webhook.age;
    owner = secretOwner;
    group = secretOwner;
  };
}