admin e5c32ad25a Add health check utility, logger, metrics, backup, and SMTP toggle scripts
- Implemented a comprehensive health check utility that monitors external dependencies (NocoDB, SMTP, the Represent API) as well as disk space and memory usage.
- Created a logger utility using Winston for structured logging with daily rotation and various log levels.
- Developed a metrics utility using the Prometheus client to track application performance metrics such as email sends, HTTP requests, and user activity (see the scrape-wiring sketch below).
- Added a backup script for automated backups of NocoDB data, uploaded files, and environment configurations with optional S3 support.
- Introduced a toggle script to switch between development (MailHog) and production (ProtonMail) SMTP configurations.
2025-10-23 11:33:00 -06:00
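
The metrics utility exposes the influence_* series referenced in the alert rules below, and the HighMemoryUsage rule relies on node_exporter's node_memory_* series. The following prometheus.yml sketch shows one way Prometheus could scrape those targets and load this rules file; the job name influence-app matches the rules, but the rule file path, hostnames, ports, and metrics endpoint are assumptions, not values taken from this repository.

# prometheus.yml (sketch; paths, hosts, and ports are assumptions)
global:
  scrape_interval: 15s

rule_files:
  - /etc/prometheus/rules/influence_app_alerts.yml   # the rules file shown below

alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

scrape_configs:
  # Application metrics exposed by the metrics utility (influence_* series)
  - job_name: "influence-app"       # must match up{job="influence-app"} in ApplicationDown
    metrics_path: /metrics          # assumed endpoint
    static_configs:
      - targets: ["app:3000"]       # assumed host:port
  # node_exporter provides the node_memory_* series used by HighMemoryUsage
  - job_name: "node"
    static_configs:
      - targets: ["node-exporter:9100"]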


groups:
  - name: influence_app_alerts
    interval: 30s
    rules:
      # Application availability
      - alert: ApplicationDown
        expr: up{job="influence-app"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Influence application is down"
          description: "The Influence application has been down for more than 2 minutes."

      # High error rate
      - alert: HighErrorRate
        expr: rate(influence_http_requests_total{status_code=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Application is experiencing {{ $value }} errors per second."

      # Email queue backing up
      - alert: EmailQueueBacklog
        expr: influence_email_queue_size > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Email queue has significant backlog"
          description: "Email queue size is {{ $value }}, emails may be delayed."

      # High email failure rate
      - alert: HighEmailFailureRate
        expr: rate(influence_emails_failed_total[5m]) / rate(influence_emails_sent_total[5m]) > 0.2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High email failure rate"
          description: "{{ $value | humanizePercentage }} of emails are failing to send."

      # Rate limiting being hit frequently
      - alert: FrequentRateLimiting
        expr: rate(influence_rate_limit_hits_total[5m]) > 1
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "Rate limiting triggered frequently"
          description: "Rate limits are being hit {{ $value }} times per second."

      # Memory usage high
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is above 85% ({{ $value | humanizePercentage }})."

      # Failed login attempts spike
      - alert: SuspiciousLoginActivity
        expr: rate(influence_login_attempts_total{status="failed"}[5m]) > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Suspicious login activity detected"
          description: "{{ $value }} failed login attempts per second detected."

      # External service failures
      - alert: ExternalServiceFailures
        expr: rate(influence_external_service_requests_total{status="failed"}[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "External service failures detected"
          description: "{{ $labels.service }} is failing at {{ $value }} requests per second."

      # High API latency
      - alert: HighAPILatency
        expr: histogram_quantile(0.95, rate(influence_http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High API latency"
          description: "95th percentile latency is {{ $value }}s for {{ $labels.route }}."