Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ services:
# Notification configuration (Telegram, Rocket.Chat)
- TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN}
- TELEGRAM_CHAT_ID=${TELEGRAM_CHAT_ID}
- ROCKET_WEBHOOK_URL=${ROCKET_WEBHOOK_URL}
volumes:
- grafana-data:/var/lib/grafana
- ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
Expand Down
5 changes: 5 additions & 0 deletions env.example
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,8 @@ TELEGRAM_CHAT_ID=123456789
# TELEGRAM_BOT_TOKEN=
# TELEGRAM_CHAT_ID=

# ============================================
# ROCKET.CHAT WEBHOOK
# ============================================
# Full Rocket.Chat incoming webhook URL, e.g. https://chat.example.com/hooks/<id>/<token>
ROCKET_WEBHOOK_URL=
35 changes: 35 additions & 0 deletions grafana/provisioning/alerting/contactpoints.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,38 @@ contactPoints:
parse_mode: Markdown
disableResolveMessage: false

# Rocket.Chat contact point.
# Uses Grafana's `slack` integration type with the webhook URL taken from the
# ROCKET_WEBHOOK_URL environment variable.
# NOTE(review): presumably relies on the Rocket.Chat incoming webhook accepting
# Slack-style payloads and <url|label> links - confirm against the target server.
- orgId: 1
  name: Rocket Notifications
  receivers:
    - uid: rocket-contact-point
      type: slack
      settings:
        # Quoted so the placeholder is always read as a plain string.
        url: '${ROCKET_WEBHOOK_URL}'
        username: Grafana Monitor
        # Message body, rendered once per alert: name and status, then the
        # severity/chain/instance labels when present, the summary and
        # description annotations when present, and finally links to the
        # alert source and/or silence page.
        text: |
          {{ range .Alerts -}}
          🚨 *{{ .Labels.alertname }}* - {{ .Status | toUpper }}
          {{- if .Labels.severity }}
          *Severity:* {{ .Labels.severity }}
          {{- end }}
          {{- if .Labels.chain }}
          *Chain:* {{ .Labels.chain }}
          {{- end }}
          {{- if .Labels.instance }}
          *Instance:* {{ .Labels.instance }}
          {{- end }}
          {{- if .Annotations.summary }}

          📋 {{ .Annotations.summary }}
          {{- end }}
          {{- if .Annotations.description }}
          {{ .Annotations.description }}
          {{- end }}

          {{- if .GeneratorURL }}
          🔗 <{{ .GeneratorURL }}|View in Grafana>{{ if .SilenceURL }} | <{{ .SilenceURL }}|Silence>{{ end }}
          {{- else if .SilenceURL }}
          🔗 <{{ .SilenceURL }}|Silence>
          {{- end }}
          {{ end -}}
      # Resolved notifications are sent as well.
      disableResolveMessage: false
50 changes: 45 additions & 5 deletions grafana/provisioning/alerting/policies.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,37 @@ policies:
repeat_interval: 4h
routes:
# 🔴 DIRAC - HIGHEST PRIORITY
# Critical alerts (Faucet/Explorer/Task Master DOWN) → Telegram + Email
- receiver: Telegram Notifications
# Dirac business-critical alert ("No New Blocks") → Rocket.Chat, then on to
# the matching email route that follows.
- receiver: Rocket Notifications
  matchers:
    - chain = dirac
    - severity = critical
    - 'alertname = "No New Blocks"'
  # One notification group per alert name and chain; instance is not a
  # grouping key.
  group_by:
    - alertname
    - chain
  # First notification after 2 minutes.
  group_wait: 2m
  # Repeat every 30 minutes.
  repeat_interval: 30m
  # Keep evaluating sibling routes so the email route also matches.
  continue: true

# Dirac critical "No New Blocks" alerts → Email.
# Matchers, grouping and timings mirror the Rocket route for the same alert;
# routing stops here for these alerts.
- receiver: Email Notifications
  matchers:
    - chain = dirac
    - severity = critical
    - 'alertname = "No New Blocks"'
  group_by:
    - alertname
    - chain
  group_wait: 2m
  repeat_interval: 30m
  continue: false

# Remaining Dirac critical alerts (infrastructure: Node Down, Low Peers, etc.)
# → Rocket.Chat, then on to the following routes.
# No group_by is set here, so grouping comes from the parent policy.
# NOTE(review): the earlier "per instance" wording depends on the parent
# policy's group_by - confirm.
- receiver: Rocket Notifications
  matchers:
    - chain = dirac
    - severity = critical
  group_wait: 2m
  repeat_interval: 30m
  # Keep evaluating sibling routes after this one matches.
  continue: true

# All other Dirac alerts → Email
- receiver: Email Notifications
matchers:
- chain = dirac
Expand All @@ -36,8 +57,27 @@ policies:
continue: false

# GENERIC ALERTS (no chain label)
# Critical (Node Down, No Blocks, Disk Full, etc.) → Telegram + Email
- receiver: Telegram Notifications
# Generic business-critical alert ("No New Blocks", no chain matcher)
# → Rocket.Chat, then on to the matching email route that follows.
- receiver: Rocket Notifications
  matchers:
    - severity = critical
    - 'alertname = "No New Blocks"'
  # Single notification group per alert name; instance is not a grouping key.
  group_by:
    - alertname
  group_wait: 10s
  repeat_interval: 1h
  # Keep evaluating sibling routes so the email route also matches.
  continue: true

# Generic critical "No New Blocks" alerts → Email.
# Matchers, grouping and timings mirror the Rocket route for the same alert;
# routing stops here for these alerts.
- receiver: Email Notifications
  matchers:
    - severity = critical
    - 'alertname = "No New Blocks"'
  group_by:
    - alertname
  group_wait: 10s
  repeat_interval: 1h
  continue: false

# Infrastructure critical alerts (Node Down, Disk Full, etc.) → Rocket + Email, per instance
- receiver: Rocket Notifications
matchers:
- severity = critical
group_wait: 10s
Expand Down