groups:
  - name: influence_app_alerts
    interval: 30s
    rules:
      # Application availability
      - alert: ApplicationDown
        expr: up{job="influence-app"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Influence application is down"
          description: "The Influence application has been down for more than 2 minutes."

      # High error rate
      - alert: HighErrorRate
        expr: rate(influence_http_requests_total{status_code=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Application is experiencing {{ $value }} errors per second."

      # Email queue backing up
      - alert: EmailQueueBacklog
        expr: influence_email_queue_size > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Email queue has significant backlog"
          description: "Email queue size is {{ $value }}; emails may be delayed."

      # High email failure rate
      - alert: HighEmailFailureRate
        expr: rate(influence_emails_failed_total[5m]) / rate(influence_emails_sent_total[5m]) > 0.2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High email failure rate"
          description: "{{ $value | humanizePercentage }} of emails are failing to send."

      # Rate limiting being hit frequently
      - alert: FrequentRateLimiting
        expr: rate(influence_rate_limit_hits_total[5m]) > 1
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "Rate limiting triggered frequently"
          description: "Rate limits are being hit {{ $value }} times per second."

      # Memory usage high
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is above 85% ({{ $value | humanizePercentage }})."

      # Failed login attempts spike
      - alert: SuspiciousLoginActivity
        expr: rate(influence_login_attempts_total{status="failed"}[5m]) > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Suspicious login activity detected"
          description: "{{ $value }} failed login attempts per second detected."

      # External service failures
      - alert: ExternalServiceFailures
        expr: rate(influence_external_service_requests_total{status="failed"}[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "External service failures detected"
          description: "{{ $labels.service }} is failing at {{ $value }} requests per second."

      # High API latency
      - alert: HighAPILatency
        expr: histogram_quantile(0.95, rate(influence_http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High API latency"
          description: "95th percentile latency is {{ $value }}s for {{ $labels.route }}."

  # System health alerts
  - name: system_alerts
    interval: 30s
    rules:
      # NocoDB unreachable
      - alert: NocoDBUnreachable
        expr: up{job="nocodb"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "NocoDB database is unreachable"
          description: "NocoDB has been unreachable for more than 2 minutes. All database operations will fail."

      # Redis down
      - alert: RedisDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis cache is down"
          description: "Redis has been down for more than 1 minute. Caching and session management will fail."

      # Disk space running low
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space is running low"
          description: "Only {{ $value | humanizePercentage }} disk space remaining on root filesystem."

      # Disk space critical
      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "CRITICAL: Disk space nearly exhausted"
          description: "Only {{ $value | humanizePercentage }} disk space remaining! System may fail soon."

      # High CPU usage
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is {{ $value }}% on {{ $labels.instance }}."

      # Container CPU throttling
      - alert: ContainerCPUThrottling
        expr: rate(container_cpu_cfs_throttled_seconds_total[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container is being CPU throttled"
          description: "Container {{ $labels.name }} is experiencing CPU throttling."

      # Container memory usage high
      - alert: ContainerMemoryHigh
        expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes) > 0.90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container memory usage is high"
          description: "Container {{ $labels.name }} is using {{ $value | humanizePercentage }} of its memory limit."

  # Infrastructure alerts
  - name: infrastructure_alerts
    interval: 30s
    rules:
      # Prometheus scrape failures
      - alert: PrometheusScrapeFailures
        expr: rate(prometheus_target_scrapes_failed_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus scrape failures detected"
          description: "Prometheus is failing to scrape the {{ $labels.job }} target."

      # Prometheus configuration reload failure
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus configuration reload failed"
          description: "Prometheus failed to reload its configuration. Check the Prometheus logs."

      # Alertmanager down
      - alert: AlertmanagerDown
        expr: up{job="alertmanager"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Alertmanager is down"
          description: "Alertmanager has been down for 2 minutes. Alerts will not be delivered!"

  # Security alerts
  - name: security_alerts
    interval: 15s
    rules:
      # Possible DDoS attack
      - alert: PossibleDDoSAttack
        expr: rate(influence_http_requests_total[1m]) > 1000
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Possible DDoS attack detected"
          description: "Receiving {{ $value }} requests per second for 2 minutes. This may be a DDoS attack."

      # Sustained high traffic
      - alert: SustainedHighTraffic
        expr: rate(influence_http_requests_total[5m]) > 500
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Sustained high traffic detected"
          description: "Receiving {{ $value }} requests per second for 10 minutes. Monitor for performance issues."

      # Too many 4xx errors
      - alert: HighClientErrorRate
        expr: rate(influence_http_requests_total{status_code=~"4.."}[5m]) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High rate of 4xx client errors"
          description: "Receiving {{ $value }} client errors per second. Check for broken links or API misuse."