---
# Changelog context for this monitoring stack (preserved from original header):
# - Implemented a comprehensive health check utility to monitor system
#   dependencies including NocoDB, SMTP, Represent API, disk space, and
#   memory usage.
# - Created a logger utility using Winston for structured logging with
#   daily rotation and various log levels.
# - Developed a metrics utility using Prometheus client to track application
#   performance metrics such as email sends, HTTP requests, and user activity.
# - Added a backup script for automated backups of NocoDB data, uploaded
#   files, and environment configurations with optional S3 support.
# - Introduced a toggle script to switch between development (MailHog) and
#   production (ProtonMail) SMTP configurations.

# Prometheus alerting rules for the Influence application.
# Load via the `rule_files` section of prometheus.yml; rules are evaluated
# every 30s. Metric names prefixed `influence_` come from the app's own
# Prometheus client; `node_memory_*` requires node_exporter to be scraped.
groups:
  - name: influence_app_alerts
    interval: 30s
    rules:
      # Application availability
      - alert: ApplicationDown
        expr: up{job="influence-app"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Influence application is down"
          description: "The Influence application has been down for more than 2 minutes."

      # High error rate
      - alert: HighErrorRate
        expr: rate(influence_http_requests_total{status_code=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Application is experiencing {{ $value }} errors per second."

      # Email queue backing up
      - alert: EmailQueueBacklog
        expr: influence_email_queue_size > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Email queue has significant backlog"
          description: "Email queue size is {{ $value }}, emails may be delayed."

      # High email failure rate
      - alert: HighEmailFailureRate
        expr: rate(influence_emails_failed_total[5m]) / rate(influence_emails_sent_total[5m]) > 0.2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High email failure rate"
          description: "{{ $value | humanizePercentage }} of emails are failing to send."

      # Rate limiting being hit frequently
      - alert: FrequentRateLimiting
        expr: rate(influence_rate_limit_hits_total[5m]) > 1
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "Rate limiting triggered frequently"
          description: "Rate limits are being hit {{ $value }} times per second."

      # Memory usage high (node_exporter metrics)
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is above 85% ({{ $value | humanizePercentage }})."

      # Failed login attempts spike
      - alert: SuspiciousLoginActivity
        expr: rate(influence_login_attempts_total{status="failed"}[5m]) > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Suspicious login activity detected"
          description: "{{ $value }} failed login attempts per second detected."

      # External service failures
      - alert: ExternalServiceFailures
        expr: rate(influence_external_service_requests_total{status="failed"}[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "External service failures detected"
          description: "{{ $labels.service }} is failing at {{ $value }} requests per second."

      # High API latency
      - alert: HighAPILatency
        expr: histogram_quantile(0.95, rate(influence_http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High API latency"
          description: "95th percentile latency is {{ $value }}s for {{ $labels.route }}."