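# Prometheus alerting rules for the Influence application stack.
# Groups: application alerts, system health, monitoring infrastructure, and security/traffic.
# Annotation templates: {{ $value }} is the value of the alert expression at firing time,
# and humanizePercentage formats a 0-1 ratio as a percentage.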
groups:
  - name: influence_app_alerts
    interval: 30s
    rules:
      # Application availability
      - alert: ApplicationDown
        expr: up{job="influence-app"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Influence application is down"
          description: "The Influence application has been down for more than 2 minutes."

      # High error rate
      - alert: HighErrorRate
        expr: rate(influence_http_requests_total{status_code=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Application is experiencing {{ $value }} errors per second."

      # Email queue backing up
      - alert: EmailQueueBacklog
        expr: influence_email_queue_size > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Email queue has significant backlog"
          description: "Email queue size is {{ $value }}; emails may be delayed."

      # High email failure rate
      - alert: HighEmailFailureRate
        expr: rate(influence_emails_failed_total[5m]) / rate(influence_emails_sent_total[5m]) > 0.2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High email failure rate"
          description: "{{ $value | humanizePercentage }} of emails are failing to send."

      # Rate limiting being hit frequently
      - alert: FrequentRateLimiting
        expr: rate(influence_rate_limit_hits_total[5m]) > 1
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "Rate limiting triggered frequently"
          description: "Rate limits are being hit {{ $value }} times per second."

      # Memory usage high
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is above 85% ({{ $value | humanizePercentage }})."

      # Failed login attempts spike
      - alert: SuspiciousLoginActivity
        expr: rate(influence_login_attempts_total{status="failed"}[5m]) > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Suspicious login activity detected"
          description: "{{ $value }} failed login attempts per second detected."

      # External service failures
      - alert: ExternalServiceFailures
        expr: rate(influence_external_service_requests_total{status="failed"}[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "External service failures detected"
          description: "{{ $labels.service }} is failing at {{ $value }} requests per second."

      # High API latency
      - alert: HighAPILatency
        expr: histogram_quantile(0.95, rate(influence_http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High API latency"
          description: "95th percentile latency is {{ $value }}s for {{ $labels.route }}."

  # System health alerts
  - name: system_alerts
    interval: 30s
    rules:
      # NocoDB unreachable
      - alert: NocoDBUnreachable
        expr: up{job="nocodb"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "NocoDB database is unreachable"
          description: "NocoDB has been unreachable for more than 2 minutes. All database operations will fail."

      # Redis down
      - alert: RedisDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis cache is down"
          description: "Redis has been down for more than 1 minute. Caching and session management will fail."

      # Disk space running low
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space is running low"
          description: "Only {{ $value | humanizePercentage }} disk space remaining on root filesystem."

      # Disk space critical
      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "CRITICAL: Disk space nearly exhausted"
          description: "Only {{ $value | humanizePercentage }} disk space remaining! System may fail soon."

      # High CPU usage
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is {{ $value }}% on {{ $labels.instance }}."

      # Container CPU throttling
      - alert: ContainerCPUThrottling
        expr: rate(container_cpu_cfs_throttled_seconds_total[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container is being CPU throttled"
          description: "Container {{ $labels.name }} is experiencing CPU throttling."

      # Container memory usage high
      - alert: ContainerMemoryHigh
        # Containers without a configured memory limit can report a limit of 0; the != 0
        # filter drops those series so the division cannot produce a spurious +Inf.
        expr: (container_memory_usage_bytes / (container_spec_memory_limit_bytes != 0)) > 0.90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container memory usage is high"
          description: "Container {{ $labels.name }} is using {{ $value | humanizePercentage }} of its memory limit."

  # Infrastructure alerts
  - name: infrastructure_alerts
    interval: 30s
    rules:
      # Prometheus scrape failures
      - alert: PrometheusScrapeFailures
        expr: rate(prometheus_target_scrapes_failed_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus scrape failures detected"
          description: "Prometheus is failing to scrape the {{ $labels.job }} target."

      # Prometheus configuration reload failure
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus configuration reload failed"
          description: "Prometheus failed to reload its configuration. Check the Prometheus logs."

      # Alertmanager down
      - alert: AlertmanagerDown
        expr: up{job="alertmanager"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Alertmanager is down"
          description: "Alertmanager has been down for more than 2 minutes. Alerts will not be delivered!"

  # Security alerts
  - name: security_alerts
    interval: 15s
    rules:
      # Possible DDoS attack
      - alert: PossibleDDoSAttack
        expr: rate(influence_http_requests_total[1m]) > 1000
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Possible DDoS attack detected"
          description: "Receiving {{ $value }} requests per second for 2 minutes. This may be a DDoS attack."

      # Sustained high traffic
      - alert: SustainedHighTraffic
        expr: rate(influence_http_requests_total[5m]) > 500
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Sustained high traffic detected"
          description: "Receiving {{ $value }} requests per second for 10 minutes. Monitor for performance issues."

      # Too many 4xx errors
      - alert: HighClientErrorRate
        expr: rate(influence_http_requests_total{status_code=~"4.."}[5m]) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High rate of 4xx client errors"
          description: "Receiving {{ $value }} client errors per second. Check for broken links or API misuse."
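
# To activate these rules, reference this file from prometheus.yml (the filename below is
# only an example; use this file's actual path):
#   rule_files:
#     - "alert_rules.yml"
# Validate the syntax with `promtool check rules <file>` before reloading Prometheus.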