groups:
  - name: influence_app_alerts
    interval: 30s
    rules:
      # Application availability
      - alert: ApplicationDown
        expr: up{job="influence-app"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Influence application is down"
          description: "The Influence application has been down for more than 2 minutes."

      # High error rate
      - alert: HighErrorRate
        expr: rate(influence_http_requests_total{status_code=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Application is experiencing {{ $value }} errors per second."

      # Email queue backing up
      - alert: EmailQueueBacklog
        expr: influence_email_queue_size > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Email queue has significant backlog"
          description: "Email queue size is {{ $value }}; emails may be delayed."

      # High email failure rate
      - alert: HighEmailFailureRate
        expr: rate(influence_emails_failed_total[5m]) / rate(influence_emails_sent_total[5m]) > 0.2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High email failure rate"
          description: "{{ $value | humanizePercentage }} of emails are failing to send."

      # Rate limiting being hit frequently
      - alert: FrequentRateLimiting
        expr: rate(influence_rate_limit_hits_total[5m]) > 1
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "Rate limiting triggered frequently"
          description: "Rate limits are being hit {{ $value }} times per second."

      # Memory usage high
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is above 85% ({{ $value | humanizePercentage }})."

      # Failed login attempts spike
      - alert: SuspiciousLoginActivity
        expr: rate(influence_login_attempts_total{status="failed"}[5m]) > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Suspicious login activity detected"
          description: "{{ $value }} failed login attempts per second detected."

      # External service failures
      - alert: ExternalServiceFailures
        expr: rate(influence_external_service_requests_total{status="failed"}[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "External service failures detected"
          description: "{{ $labels.service }} is failing at {{ $value }} requests per second."

      # High API latency
      - alert: HighAPILatency
        expr: histogram_quantile(0.95, rate(influence_http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High API latency"
          description: "95th percentile latency is {{ $value }}s for {{ $labels.route }}."

  # System health alerts
  - name: system_alerts
    interval: 30s
    rules:
      # NocoDB unreachable
      - alert: NocoDBUnreachable
        expr: up{job="nocodb"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "NocoDB database is unreachable"
          description: "NocoDB has been unreachable for more than 2 minutes. All database operations will fail."

      # Redis down
      - alert: RedisDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis cache is down"
          description: "Redis has been down for more than 1 minute. Caching and session management will fail."

      # Disk space running low
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space is running low"
          description: "Only {{ $value | humanizePercentage }} disk space remaining on root filesystem."
      # Disk space critical
      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "CRITICAL: Disk space nearly exhausted"
          description: "Only {{ $value | humanizePercentage }} disk space remaining! System may fail soon."

      # High CPU usage
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is {{ $value }}% on {{ $labels.instance }}."

      # Container CPU throttling
      - alert: ContainerCPUThrottling
        expr: rate(container_cpu_cfs_throttled_seconds_total[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container is being CPU throttled"
          description: "Container {{ $labels.name }} is experiencing CPU throttling."

      # Container memory usage high
      - alert: ContainerMemoryHigh
        expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes) > 0.90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container memory usage is high"
          description: "Container {{ $labels.name }} is using {{ $value | humanizePercentage }} of its memory limit."

  # Infrastructure alerts
  - name: infrastructure_alerts
    interval: 30s
    rules:
      # Prometheus scrape failures
      - alert: PrometheusScrapeFailures
        expr: rate(prometheus_target_scrapes_failed_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus scrape failures detected"
          description: "Prometheus is failing to scrape the {{ $labels.job }} target."

      # Prometheus configuration reload failure
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus configuration reload failed"
          description: "Prometheus failed to reload its configuration. Check the Prometheus logs."

      # Alertmanager down
      - alert: AlertmanagerDown
        expr: up{job="alertmanager"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Alertmanager is down"
          description: "Alertmanager has been down for 2 minutes. Alerts will not be delivered!"

  # Security alerts
  - name: security_alerts
    interval: 15s
    rules:
      # Possible DDoS attack
      - alert: PossibleDDoSAttack
        expr: rate(influence_http_requests_total[1m]) > 1000
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Possible DDoS attack detected"
          description: "Receiving {{ $value }} requests per second for 2 minutes. This may be a DDoS attack."

      # Sustained high traffic
      - alert: SustainedHighTraffic
        expr: rate(influence_http_requests_total[5m]) > 500
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Sustained high traffic detected"
          description: "Receiving {{ $value }} requests per second for 10 minutes. Monitor for performance issues."

      # Too many 4xx errors
      - alert: HighClientErrorRate
        expr: rate(influence_http_requests_total{status_code=~"4.."}[5m]) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High rate of 4xx client errors"
          description: "Receiving {{ $value }} client errors per second. Check for broken links or API misuse."
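# ---------------------------------------------------------------------------
# Usage note (sketch): these groups only fire once the file is loaded by
# Prometheus and an Alertmanager is configured. A minimal prometheus.yml
# fragment might look like the following, assuming this file is saved as
# alert_rules.yml next to the Prometheus config and that Alertmanager is
# reachable at alertmanager:9093 (both the filename and the target address
# are assumptions, not taken from this file):
#
#   rule_files:
#     - alert_rules.yml
#
#   alerting:
#     alertmanagers:
#       - static_configs:
#           - targets: ["alertmanager:9093"]
#
# The rules can be validated before reloading Prometheus with:
#   promtool check rules alert_rules.yml
# ---------------------------------------------------------------------------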