# Prometheus alerting rules for the Influence application.
# A single rule group, evaluated every 30 seconds.
groups:
  - name: influence_app_alerts
    interval: 30s
    rules:
      # Application availability.
      # Fires when the scrape target reports down (up == 0) OR when no `up`
      # series exists at all for the job; a bare `up == 0` returns nothing
      # in the second case, so the alert would otherwise stay silent.
      - alert: ApplicationDown
        expr: >-
          up{job="influence-app"} == 0
          or absent(up{job="influence-app"})
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Influence application is down"
          description: "The Influence application has been down for more than 2 minutes."

      # High error rate.
      # Per-second rate of requests whose status_code matches 5xx, averaged
      # over 5 minutes. There is no aggregation, so the 0.1/s threshold is
      # applied to each matching series separately.
      - alert: HighErrorRate
        expr: rate(influence_http_requests_total{status_code=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Application is experiencing {{ $value }} errors per second."

      # Email queue backing up.
      # Fires once the queue-size metric has stayed above 100 for 10 minutes.
      - alert: EmailQueueBacklog
        expr: influence_email_queue_size > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Email queue has significant backlog"
          description: "Email queue size is {{ $value }}, emails may be delayed."

      # High email failure rate.
      # Ratio of the failed-email rate to the sent-email rate over 5 minutes;
      # fires above 0.2. When both rates are zero the ratio is NaN and the
      # comparison yields no alert.
      # NOTE(review): if influence_emails_sent_total counts only successful
      # sends, this is failures per success rather than a share of all
      # attempts - confirm against the instrumentation.
      - alert: HighEmailFailureRate
        expr: >-
          rate(influence_emails_failed_total[5m])
          / rate(influence_emails_sent_total[5m]) > 0.2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High email failure rate"
          description: "{{ $value | humanizePercentage }} of emails are failing to send."

      # Rate limiting being hit frequently.
      # Informational only: more than one rate-limit hit per second,
      # sustained for 5 minutes.
      - alert: FrequentRateLimiting
        expr: rate(influence_rate_limit_hits_total[5m]) > 1
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "Rate limiting triggered frequently"
          description: "Rate limits are being hit {{ $value }} times per second."

      # Memory usage high.
      # Used fraction = 1 - available / total; fires above 0.85 for 10 minutes.
      # The expression has no label selector, so it is evaluated for every
      # series carrying these metric names.
      - alert: HighMemoryUsage
        expr: >-
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
          > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is above 85% ({{ $value | humanizePercentage }})."
# Security, external-dependency, and latency alerts.
# The entries below are items of the enclosing `rules:` list.

      # Failed login attempts spike.
      # More than 5 failed logins per second (5-minute rate), sustained
      # for 2 minutes.
      - alert: SuspiciousLoginActivity
        expr: rate(influence_login_attempts_total{status="failed"}[5m]) > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Suspicious login activity detected"
          description: "{{ $value }} failed login attempts per second detected."

      # External service failures.
      # More than 0.5 failed external requests per second, sustained for
      # 5 minutes. The description reads the `service` label of the firing
      # series.
      - alert: ExternalServiceFailures
        expr: >-
          rate(influence_external_service_requests_total{status="failed"}[5m])
          > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "External service failures detected"
          description: "{{ $labels.service }} is failing at {{ $value }} requests per second."

      # High API latency.
      # 95th-percentile request duration over 5 minutes above 2 seconds.
      # NOTE(review): the bucket rates are not aggregated, so a separate
      # quantile is computed for every label combination, not just per
      # `route`. Consider `sum by (le, route) (...)` if per-route latency is
      # the intent - this would change the alert's label set, so confirm
      # Alertmanager routing first.
      - alert: HighAPILatency
        expr: >-
          histogram_quantile(0.95,
          rate(influence_http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High API latency"
          description: "95th percentile latency is {{ $value }}s for {{ $labels.route }}."