diff --git a/docker-compose.yml b/docker-compose.yml
index 692b419..8576a99 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -101,6 +101,7 @@ services:
       # Telegram Configuration
       - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN}
       - TELEGRAM_CHAT_ID=${TELEGRAM_CHAT_ID}
+      - ROCKET_WEBHOOK_URL=${ROCKET_WEBHOOK_URL}
     volumes:
       - grafana-data:/var/lib/grafana
       - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
diff --git a/env.example b/env.example
index d6e8b4b..677c9ee 100644
--- a/env.example
+++ b/env.example
@@ -62,3 +62,8 @@ TELEGRAM_CHAT_ID=123456789
 # TELEGRAM_BOT_TOKEN=
 # TELEGRAM_CHAT_ID=
 
+# ============================================
+# ROCKET.CHAT WEBHOOK
+# ============================================
+# Full Rocket.Chat Incoming Webhook URL
+ROCKET_WEBHOOK_URL=
diff --git a/grafana/provisioning/alerting/contactpoints.yml b/grafana/provisioning/alerting/contactpoints.yml
index 18b2387..cc113d1 100644
--- a/grafana/provisioning/alerting/contactpoints.yml
+++ b/grafana/provisioning/alerting/contactpoints.yml
@@ -39,3 +39,38 @@ contactPoints:
           parse_mode: Markdown
         disableResolveMessage: false
 
+  - orgId: 1
+    name: Rocket Notifications
+    receivers:
+      - uid: rocket-contact-point
+        type: slack
+        settings:
+          url: ${ROCKET_WEBHOOK_URL}
+          username: Grafana Monitor
+          text: |
+            {{ range .Alerts -}}
+            🚨 *{{ .Labels.alertname }}* - {{ .Status | toUpper }}
+            {{- if .Labels.severity }}
+            *Severity:* {{ .Labels.severity }}
+            {{- end }}
+            {{- if .Labels.chain }}
+            *Chain:* {{ .Labels.chain }}
+            {{- end }}
+            {{- if .Labels.instance }}
+            *Instance:* {{ .Labels.instance }}
+            {{- end }}
+            {{- if .Annotations.summary }}
+
+            📋 {{ .Annotations.summary }}
+            {{- end }}
+            {{- if .Annotations.description }}
+            {{ .Annotations.description }}
+            {{- end }}
+
+            {{- if .GeneratorURL }}
+            🔗 <{{ .GeneratorURL }}|View in Grafana>{{ if .SilenceURL }} | <{{ .SilenceURL }}|Silence>{{ end }}
+            {{- else if .SilenceURL }}
+            🔗 <{{ .SilenceURL }}|Silence>
+            {{- end }}
+            {{ end -}}
+        disableResolveMessage: false
diff --git a/grafana/provisioning/alerting/policies.yml b/grafana/provisioning/alerting/policies.yml
index 1dd4a63..b9b60be 100644
--- a/grafana/provisioning/alerting/policies.yml
+++ b/grafana/provisioning/alerting/policies.yml
@@ -9,16 +9,37 @@ policies:
     repeat_interval: 4h
     routes:
       # 🔴 DIRAC - HIGHEST PRIORITY
-      # Critical alerts (Faucet/Explorer/Task Master DOWN) → Telegram + Email
-      - receiver: Telegram Notifications
+      # Business critical alerts (No New Blocks) → Rocket + Email, grouped by alert only
+      - receiver: Rocket Notifications
         matchers:
           - chain = dirac
           - severity = critical
+          - alertname = "No New Blocks"
+        group_by: ['alertname', 'chain']  # Group by alert name and chain, ignore instance
         group_wait: 2m          # First notification after 2 minutes
         repeat_interval: 30m    # Repeat every 30 minutes
         continue: true          # Continue to email
 
-      # All Dirac alerts → Email
+      - receiver: Email Notifications
+        matchers:
+          - chain = dirac
+          - severity = critical
+          - alertname = "No New Blocks"
+        group_by: ['alertname', 'chain']  # Same grouping as Rocket
+        group_wait: 2m
+        repeat_interval: 30m
+        continue: false
+
+      # Infrastructure critical alerts (Node Down, Low Peers, etc.) → Rocket + Email, per instance
+      - receiver: Rocket Notifications
+        matchers:
+          - chain = dirac
+          - severity = critical
+        group_wait: 2m
+        repeat_interval: 30m
+        continue: true
+
+      # All other Dirac alerts → Email
       - receiver: Email Notifications
         matchers:
           - chain = dirac
@@ -36,8 +57,27 @@ policies:
         continue: false
 
       # GENERIC ALERTS (no chain label)
-      # Critical (Node Down, No Blocks, Disk Full, etc.) → Telegram + Email
-      - receiver: Telegram Notifications
+      # Business critical alerts (No New Blocks) → Rocket + Email, grouped
+      - receiver: Rocket Notifications
+        matchers:
+          - severity = critical
+          - alertname = "No New Blocks"
+        group_by: ['alertname']  # Group by alert name only, ignore instance
+        group_wait: 10s
+        repeat_interval: 1h
+        continue: true
+
+      - receiver: Email Notifications
+        matchers:
+          - severity = critical
+          - alertname = "No New Blocks"
+        group_by: ['alertname']
+        group_wait: 10s
+        repeat_interval: 1h
+        continue: false
+
+      # Infrastructure critical alerts (Node Down, Disk Full, etc.) → Rocket + Email, per instance
+      - receiver: Rocket Notifications
         matchers:
           - severity = critical
         group_wait: 10s
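
Note (not part of the patch): a quick way to sanity-check the new contact point before Grafana fires real alerts is to POST a Slack-style payload straight to the webhook. This is a minimal sketch, assuming ROCKET_WEBHOOK_URL is exported in the current shell (e.g. sourced from the env file) and points at a Rocket.Chat incoming webhook; Rocket.Chat incoming webhooks accept Slack-compatible JSON bodies such as {"text": ...}, which is why the contact point above can reuse the slack type.

    # Hypothetical smoke test, not part of the provisioning files:
    # a 2xx response and a message in the target channel confirm the URL works.
    curl -sS -X POST "$ROCKET_WEBHOOK_URL" \
      -H 'Content-Type: application/json' \
      -d '{"text": "Grafana → Rocket.Chat smoke test"}'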