# Fires when the per-second rate of failed Alertmanager notifications over the
# last 5 minutes is above zero, i.e. at least one send failed in that window.
- alert: ALERT_MANAGER_FAILURES
  expr: rate(alertmanager_notifications_failed_total[5m]) > 0
  # NOTE(review): the other rules in this file use `summary` rather than
  # `title` — confirm the consumer of this file reads `title`.
  title: "Alertmanager is failing to send notifications"
  # Quoted so the template braces can never be misread as YAML flow syntax.
  description: "Alertmanager is seeing errors {{$labels.integration}}"
# NOTE(review): this rule had no `- alert:` line, so its `expr` and
# `description` keys duplicated those of the rule above it. The name
# HOST_UNREACHABLE is a placeholder derived from the summary text — confirm
# the intended alert name.
#
# Fires when the `up` series of a target in the "Hosts" or "Containers" job
# averaged exactly 0 over the last 2 minutes (every sample in the window was 0).
- alert: HOST_UNREACHABLE
  expr: avg_over_time(up{job=~"Hosts|Containers"}[2m]) == 0
  summary: >-
    {{$labels.instance}}: Host is unreachable. Host could be down.
    The Collectors are not accessible. If the host is up, make sure
    collectors are running.
  description: >-
    {{$labels.instance}}: Host is unreachable. Host could be down.
    The Collectors are not accessible. If the host is up, make sure
    collectors are running.
# Fires when used memory, computed as (MemTotal - MemFree - Cached) / MemTotal
# from 5-minute averages, exceeds 80 percent.
# NOTE(review): Buffers are not subtracted, so buffer memory is counted as
# used — confirm this is intended.
- alert: HOST_HIGH_MEMORY_USAGE
  expr: >-
    (((avg_over_time(node_memory_MemTotal_bytes[5m])
    - avg_over_time(node_memory_MemFree_bytes[5m])
    - avg_over_time(node_memory_Cached_bytes[5m]))
    / (avg_over_time(node_memory_MemTotal_bytes[5m])) * 100)) > 80
  summary: "{{$labels.instance}}: Memory Usage detected above 80"
  # $value is the used-memory percentage produced by the expression above.
  description: "{{$labels.instance}}: Memory usage is above 80% (Current Used Memory % is: {{ $value }})"
# Fires when the used share of an ext*/xfs filesystem, computed as
# (size - free) * 100 / size from 5-minute averages, exceeds 70 percent.
- alert: HOST_HIGH_DISK_USAGE
  expr: >-
    ((avg_over_time(node_filesystem_size_bytes{fstype=~"(ext.|xfs)"}[5m])
    - avg_over_time(node_filesystem_free_bytes{fstype=~"(ext.|xfs)"}[5m]))
    * 100 / avg_over_time(node_filesystem_size_bytes{fstype=~"(ext.|xfs)"}[5m])) > 70
  summary: "{{$labels.instance}}: Disk {{$labels.device}} Usage detected above 70"
  # $value is the used-space percentage produced by the expression above.
  description: "{{$labels.instance}}: Disk {{$labels.device}} usage is above 70% (Current Disk Used % is: {{ $value }})"
# Fires when the non-idle CPU percentage of an instance (100 minus the idle
# share averaged across its CPUs) exceeds 70 percent.
- alert: HOST_HIGH_CPU_USAGE
  # rate() averages over the whole 5m window. The previous irate() looked only
  # at the last two samples, so a momentary spike could trigger the alert;
  # Prometheus recommends rate() for alerting expressions.
  expr: (100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) > 70
  summary: "{{$labels.instance}}: CPU Usage detected above 70"
  # $value is the non-idle CPU percentage produced by the expression above.
  description: "{{$labels.instance}}: CPU usage is above 70% (Current CPU % is: {{ $value }})"
# Fires when a disk spent more than 90 percent of the last 5 minutes doing I/O.
- alert: HOST_HIGH_DISK_UTILIZATION
  # rate() of a seconds counter yields seconds of I/O per second (0..1), so it
  # is multiplied by 100 to get a percentage. The previous `/ 10` capped the
  # value at roughly 0.1, which meant the `> 90` threshold could never be met.
  expr: rate(node_disk_io_time_seconds_total[5m]) * 100 > 90
  summary: "{{$labels.instance}}: Disk ( {{ $labels.device }} ) utilization is very high."
  # $value is the utilization percentage produced by the expression above.
  description: "{{$labels.instance}}: Disk ( {{ $labels.device }} ) utilization is very high. (Current Utilization is: {{ $value }})"
# Fires when the free-inode share of an ext*/xfs filesystem, computed as
# files_free / files * 100 from 5-minute averages, is 20 percent or lower.
- alert: HOST_HIGH_DISK_INODE
  # Folded scalar: the line breaks below collapse to single spaces.
  expr: >-
    avg_over_time(node_filesystem_files_free{fstype=~"(ext.|xfs)"}[5m])
    / avg_over_time(node_filesystem_files{fstype=~"(ext.|xfs)"}[5m])
    * 100 <= 20
  summary: '{{$labels.instance}}: Disk ( {{ $labels.device }} ) High number of inode usage'
  # $value here is the percentage of inodes still free, not the percentage used.
  description: '{{$labels.instance}}: Disk ( {{ $labels.device }} ) High number of inode usage. (Current value is: {{ $value }})'