feat: init
This commit is contained in:
commit
8cfede9f57
28 changed files with 2129 additions and 0 deletions
291
modules/wiki-primary/alerting.nix
Normal file
291
modules/wiki-primary/alerting.nix
Normal file
|
|
@ -0,0 +1,291 @@
|
|||
{ config, pkgs, lib, ... }:
|
||||
{
|
||||
services.prometheus = {
|
||||
# Where Prometheus delivers firing alerts: a single statically configured
# Alertmanager on this host. The port matches the 9093 set under
# services.prometheus.alertmanager in this module.
alertmanagers = [
  {
    static_configs = [
      { targets = [ "localhost:9093" ]; }
    ];
  }
];
|
||||
|
||||
rules = [
|
||||
(builtins.toJSON {
|
||||
groups = [
|
||||
{
  # Rule group: outside-in reachability (blackbox probes), HTTP error rate,
  # latency and TLS certificate lifetime for the wiki front ends.
  name = "wiki-availability";
  rules = [
    # Primary site: HTTP probe has failed continuously for 2 minutes.
    {
      alert = "WikiDown";
      expr = ''probe_success{job="blackbox-http",instance=~".*www.noisebridge.net.*"} == 0'';
      "for" = "2m";
      annotations.summary = "Primary wiki is unreachable";
      annotations.description = "{{ $labels.instance }} has been down for more than 2 minutes.";
      labels = { severity = "critical"; };
    }

    # Read-only replica: same probe, longer grace period and lower severity.
    {
      alert = "ReplicaDown";
      expr = ''probe_success{job="blackbox-http",instance=~".*readonly.noisebridge.net.*"} == 0'';
      "for" = "5m";
      annotations.summary = "Replica wiki is unreachable";
      annotations.description = "{{ $labels.instance }} has been down for more than 5 minutes.";
      labels = { severity = "warning"; };
    }

    # Share of 5xx responses per instance over the last 5 minutes exceeds 5%.
    # NOTE(review): confirm that `caddy_http_responses_total` (with a `code`
    # label) is really present in this Caddy's /metrics output; if the series
    # does not exist the expression is empty and the alert never fires.
    {
      alert = "HighErrorRate";
      expr = ''sum(rate(caddy_http_responses_total{code=~"5.."}[5m])) by (instance) / sum(rate(caddy_http_responses_total[5m])) by (instance) > 0.05'';
      "for" = "5m";
      annotations.summary = "High HTTP 5xx error rate on {{ $labels.instance }}";
      annotations.description = "More than 5% of requests are returning server errors.";
      labels = { severity = "critical"; };
    }

    # p95 of the request-duration histogram is above 2 (seconds, per the
    # metric name) for 5 minutes.
    {
      alert = "HighLatency";
      expr = ''histogram_quantile(0.95, sum(rate(caddy_http_request_duration_seconds_bucket[5m])) by (le, instance)) > 2'';
      "for" = "5m";
      annotations.summary = "High p95 latency on {{ $labels.instance }}";
      annotations.description = "95th percentile response time is {{ $value | humanizeDuration }}.";
      labels = { severity = "warning"; };
    }

    # Earliest certificate expiry is less than 7 days (7 * 86400 s) away.
    # $value is the left-hand side, i.e. the remaining lifetime in seconds.
    {
      alert = "TLSCertExpiringSoon";
      expr = ''probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time() < 7 * 86400'';
      "for" = "1h";
      annotations.summary = "TLS certificate expiring within 7 days";
      annotations.description = "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}.";
      labels = { severity = "warning"; };
    }
  ];
}
|
||||
{
  # Rule group: host-level resources (root filesystem, memory, CPU) and
  # failed systemd units, based on node_* metrics.
  name = "wiki-infrastructure";
  rules = [
    # Less than 15% of the root filesystem is available.
    # $value is the free fraction, which is what the description prints.
    {
      alert = "DiskFull";
      expr = ''(node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15'';
      "for" = "5m";
      labels.severity = "warning";
      annotations = {
        summary = "Disk usage above 85% on {{ $labels.instance }}";
        description = "Root filesystem is {{ $value | humanizePercentage }} free.";
      };
    }

    # Less than 5% of the root filesystem is available: escalate to critical.
    {
      alert = "DiskCritical";
      expr = ''(node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.05'';
      "for" = "2m";
      labels.severity = "critical";
      annotations = {
        summary = "Disk almost full on {{ $labels.instance }}";
        description = "Root filesystem is {{ $value | humanizePercentage }} free. Immediate action required.";
      };
    }

    # More than 90% of memory is in use.
    # The expression evaluates to the USED fraction (1 - available/total), so
    # the description must report usage, not available memory.
    {
      alert = "HighMemoryUsage";
      expr = ''(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.9'';
      "for" = "5m";
      labels.severity = "warning";
      annotations = {
        summary = "Memory usage above 90% on {{ $labels.instance }}";
        description = "Memory usage is {{ $value | humanizePercentage }} of total.";
      };
    }

    # Non-idle CPU share, averaged per instance, above 85% for 10 minutes.
    {
      alert = "HighCPU";
      expr = ''1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.85'';
      "for" = "10m";
      labels.severity = "warning";
      annotations = {
        summary = "Sustained high CPU on {{ $labels.instance }}";
        description = "CPU usage has been above 85% for 10 minutes.";
      };
    }

    # Any systemd unit reported in the "failed" state for 5 minutes.
    {
      alert = "SystemdUnitFailed";
      expr = ''node_systemd_unit_state{state="failed"} == 1'';
      "for" = "5m";
      labels.severity = "warning";
      annotations = {
        summary = "Systemd unit failed on {{ $labels.instance }}";
        description = "Unit {{ $labels.name }} is in failed state.";
      };
    }
  ];
}
|
||||
{
  # Rule group: MySQL replication health on the replica plus connection and
  # slow-query pressure on any scraped MySQL instance.
  name = "wiki-database";
  rules = [
    # Either replication thread (IO or SQL) reports "not running" on the
    # instance labelled wiki-replica.
    {
      alert = "ReplicationBroken";
      expr = ''mysql_slave_status_slave_io_running{instance="wiki-replica"} == 0 or mysql_slave_status_slave_sql_running{instance="wiki-replica"} == 0'';
      "for" = "2m";
      annotations.summary = "MySQL replication thread stopped";
      annotations.description = "Replication IO or SQL thread is not running on the replica.";
      labels = { severity = "critical"; };
    }

    # Replica is more than 300 seconds behind for 5 minutes.
    {
      alert = "ReplicationLagging";
      expr = ''mysql_slave_status_seconds_behind_master{instance="wiki-replica"} > 300'';
      "for" = "5m";
      annotations.summary = "MySQL replication lagging";
      annotations.description = "Replica is {{ $value }}s behind the primary.";
      labels = { severity = "warning"; };
    }

    # Connected threads exceed 80% of max_connections.
    {
      alert = "MySQLConnectionsExhausted";
      expr = ''mysql_global_status_threads_connected / mysql_global_variables_max_connections > 0.8'';
      "for" = "5m";
      annotations.summary = "MySQL connections above 80% on {{ $labels.instance }}";
      annotations.description = "{{ $value | humanizePercentage }} of max connections in use.";
      labels = { severity = "warning"; };
    }

    # Slow-query counter grows faster than 0.1/s for 10 minutes.
    {
      alert = "MySQLSlowQueries";
      expr = ''rate(mysql_global_status_slow_queries[5m]) > 0.1'';
      "for" = "10m";
      annotations.summary = "Elevated slow queries on {{ $labels.instance }}";
      annotations.description = "{{ $value }} slow queries per second over the last 5 minutes.";
      labels = { severity = "warning"; };
    }
  ];
}
|
||||
{
  # Rule group: application tier, covering PHP-FPM worker saturation and
  # exporter health, and memcached availability and cache efficiency.
  name = "wiki-application";
  rules = [
    # Active workers have reached the total number of workers.
    {
      alert = "PHPFPMExhausted";
      expr = ''phpfpm_active_processes >= phpfpm_total_processes'';
      "for" = "1m";
      labels.severity = "warning";
      annotations = {
        summary = "PHP-FPM workers exhausted";
        description = "All PHP-FPM workers are active — requests may be queuing.";
      };
    }

    # The scrape target of job "phpfpm" is down.
    {
      alert = "PHPFPMDown";
      expr = ''up{job="phpfpm"} == 0'';
      "for" = "1m";
      labels.severity = "critical";
      annotations = {
        summary = "PHP-FPM exporter is down";
        description = "Cannot scrape PHP-FPM metrics — the PHP-FPM process may be dead.";
      };
    }

    # Any scrape target whose job name starts with "memcached" is down.
    {
      alert = "MemcachedDown";
      expr = ''up{job=~"memcached.*"} == 0'';
      "for" = "2m";
      labels.severity = "critical";
      annotations = {
        summary = "Memcached is down on {{ $labels.instance }}";
        description = "The memcached exporter is unreachable. MediaWiki will fall back to database queries and be slow.";
      };
    }

    # More than 10 evictions per second sustained for 10 minutes.
    {
      alert = "MemcachedEvictions";
      expr = ''rate(memcached_items_evicted_total[5m]) > 10'';
      "for" = "10m";
      labels.severity = "warning";
      annotations = {
        summary = "High memcached eviction rate on {{ $labels.instance }}";
        description = "{{ $value }} evictions/sec — cache is too small, consider increasing maxMemory.";
      };
    }

    # GET hit ratio below 80% for 15 minutes.
    # Both sides aggregate the `status` label away: the numerator carries
    # status="hit" while the unfiltered denominator carries every status
    # value, and a plain `/` matches series on identical label sets. Without
    # the aggregation only hit/hit pairs match, the ratio is always 1 and the
    # alert can never fire.
    {
      alert = "MemcachedHitRateLow";
      expr = ''sum without (status) (rate(memcached_commands_total{command="get",status="hit"}[5m])) / sum without (status) (rate(memcached_commands_total{command="get"}[5m])) < 0.8'';
      "for" = "15m";
      labels.severity = "warning";
      annotations = {
        summary = "Low memcached hit rate on {{ $labels.instance }}";
        description = "Cache hit rate is {{ $value | humanizePercentage }}. Pages may be slow.";
      };
    }
  ];
}
|
||||
{
  # Rule group: freshness and success of the backup and image-sync jobs,
  # based on timestamp/status gauges those jobs are expected to publish.
  name = "wiki-backups";
  rules = [
    # Newest backup timestamp is more than 86400 s (24 h) old.
    {
      alert = "BackupStale";
      expr = ''(time() - backup_latest_timestamp_seconds) > 86400'';
      "for" = "1h";
      annotations.summary = "Wiki backup is stale";
      annotations.description = "Last successful backup was more than 24 hours ago.";
      labels = { severity = "warning"; };
    }

    # The B2 sync status gauge holds anything other than 1.
    {
      alert = "BackupFailed";
      expr = ''backup_b2_sync_success != 1'';
      "for" = "10m";
      annotations.summary = "B2 backup sync failed";
      annotations.description = "The last rclone sync to Backblaze B2 did not succeed.";
      labels = { severity = "critical"; };
    }

    # Newest image-sync timestamp is more than 7200 s (2 h) old.
    {
      alert = "ImageSyncStale";
      expr = ''(time() - imagesync_latest_timestamp_seconds) > 7200'';
      "for" = "30m";
      annotations.summary = "Image sync to replica is stale";
      annotations.description = "Last successful image sync was more than 2 hours ago. Replica may have broken image links.";
      labels = { severity = "warning"; };
    }
  ];
}
|
||||
];
|
||||
})
|
||||
];
|
||||
};
|
||||
|
||||
# Local Alertmanager: receives alerts from the Prometheus configured in this
# module and forwards every alert to a single "discord" receiver.
services.prometheus.alertmanager = {
  enable = true;
  port = 9093;
  # Loopback only; Prometheus reaches it via localhost:9093.
  listenAddress = "127.0.0.1";

  configuration = {
    route = {
      # Default (and only) route: everything goes to the receiver below.
      receiver = "discord";
      # One notification per alert name and instance.
      group_by = [ "alertname" "instance" ];
      group_wait = "30s";
      group_interval = "5m";
      repeat_interval = "4h";
    };

    receivers = [
      {
        name = "discord";
        # Use Alertmanager's native Discord integration. The generic
        # `webhook_configs` receiver posts Alertmanager's own JSON payload,
        # which a Discord webhook endpoint does not accept, so notifications
        # would be rejected. The URL is read from the secret file at runtime
        # and never enters the Nix store.
        # NOTE(review): assumes the secret holds a raw Discord webhook URL
        # (not the URL of a webhook-to-Discord bridge) - confirm.
        discord_configs = [
          { webhook_url_file = config.age.secrets.discord-webhook.path; }
        ];
      }
    ];
  };
};
|
||||
|
||||
# Encrypted secret whose decrypted path is handed to the Alertmanager
# receiver in this module; ownership is set to alertmanager:alertmanager.
# NOTE(review): confirm an `alertmanager` user and group actually exist on
# this host. If the Alertmanager unit runs under a systemd dynamic user,
# this ownership cannot be applied as written - TODO verify.
age.secrets.discord-webhook = {
  owner = "alertmanager";
  group = "alertmanager";
  file = ../../secrets/discord-webhook.age;
};
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue