WIP: monitoring improvements
This commit is contained in:
@@ -10,7 +10,7 @@
|
||||
file:
|
||||
path: "{{ current_stack_dest }}/{{ item.path }}"
|
||||
state: directory
|
||||
mode: "700"
|
||||
mode: "755"
|
||||
loop: "{{ lookup('community.general.filetree', current_stack_source) }}"
|
||||
when: item.state == "directory"
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
template:
|
||||
src: "{{ item.src }}"
|
||||
dest: "{{ current_stack_dest }}/{{ item.path | regex_replace('\\.j2$', '') }}"
|
||||
mode: "600"
|
||||
mode: "644"
|
||||
loop: "{{ lookup('community.general.filetree', current_stack_source) }}"
|
||||
when: item.state == "file"
|
||||
|
||||
@@ -30,5 +30,5 @@
|
||||
remove_orphans: yes
|
||||
register: docker_compose_output
|
||||
|
||||
- debug:
|
||||
var: docker_compose_output
|
||||
# - debug:
|
||||
# var: docker_compose_output
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
DISCORD_WEBHOOK={{ alertmanager_discord_webhook }}
|
||||
@@ -0,0 +1,68 @@
|
||||
# The root route on which each incoming alert enters.
|
||||
route:
|
||||
group_by: ["alertname", "job"]
|
||||
group_wait: 20s
|
||||
group_interval: 5m
|
||||
repeat_interval: 3h
|
||||
receiver: discord_webhook
|
||||
|
||||
receivers:
|
||||
- name: "discord_webhook"
|
||||
discord_configs:
|
||||
- webhook_url: "{{ alertmanager_discord_webhook }}"
|
||||
{# - send_resolved: true#}
|
||||
{# username: 'Alertmanager'#}
|
||||
{# webhook_configs:#}
|
||||
{# - send_resolved: true#}
|
||||
{# url: '{{ alertmanager_discord_webhook }}'#}
|
||||
{# username: 'Alertmanager'#}
|
||||
{# icon_url: 'https://prometheus.io/assets/icon.png'#}
|
||||
{# icon_emoji: ':alert:'#}
|
||||
{# send_resolved: true#}
|
||||
{# text: "{{ .CommonAnnotations.summary }}"#}
|
||||
{# title: "{{ .CommonLabels.alertname }}"#}
|
||||
{# color: '{{ if eq .Status "firing" }}#FF0000{{ else }}#00FF00{{ end }}'#}
|
||||
{# footer: '{{ .CommonLabels.monitor }}'#}
|
||||
{# footer_icon: 'https://prometheus.io/assets/icon.png'#}
|
||||
{# actions:#}
|
||||
{# - type: 'button'#}
|
||||
{# text: 'Open in Grafana'#}
|
||||
{# url: '{{ .ExternalURL }}'#}
|
||||
{# style: 'primary'#}
|
||||
{# send_resolved: true#}
|
||||
{# confirm:#}
|
||||
{# title: 'Are you sure?'#}
|
||||
{# text: 'This will open Grafana in a new tab.'#}
|
||||
{# ok_text: 'Yes'#}
|
||||
{# dismiss_text: 'No'#}
|
||||
{# fields:#}
|
||||
{# - title: 'Description'#}
|
||||
{# value: "{{ .CommonAnnotations.description }}"#}
|
||||
{# short: false#}
|
||||
{# - title: 'Details'#}
|
||||
{# value: "{{ .CommonAnnotations.details }}"#}
|
||||
{# short: false#}
|
||||
{# - title: 'Severity'#}
|
||||
{# value: '{{ if eq .Labels.severity "critical" }}Critical{{ else if eq .Labels.severity "warning" }}Warning{{ else }}Info{{ end }}'#}
|
||||
{# short: true#}
|
||||
{# - title: 'Host'#}
|
||||
{# value: '{{ .CommonLabels.monitor }}'#}
|
||||
{# short: true#}
|
||||
{# - title: 'Starts At'#}
|
||||
{# value: '{{ .StartsAt.Format "2006-01-02 15:04:05" }}'#}
|
||||
{# short: true#}
|
||||
{# - title: 'Ends At'#}
|
||||
{# value: '{{ .EndsAt.Format "2006-01-02 15:04:05" }}'#}
|
||||
{# short: true#}
|
||||
{# - title: 'Runbook'#}
|
||||
{# value: '{{ .CommonAnnotations.runbook_url }}'#}
|
||||
{# short: true#}
|
||||
{# - title: 'Dashboard'#}
|
||||
{# value: '{{ .CommonAnnotations.dashboard_url }}'#}
|
||||
{# short: true#}
|
||||
{# - title: 'Alerting Rule'#}
|
||||
{# value: '{{ .CommonLabels.alertname }}'#}
|
||||
{# short: true#}
|
||||
{# - title: 'Alerting Rule Description'#}
|
||||
{# value: '{{ .CommonLabels.alertname }}'#}
|
||||
{# short: true#}
|
||||
@@ -66,6 +66,24 @@ services:
|
||||
- {{ base_volume_path }}/monitoring/prometheus_configs:/etc/prometheus/extra:ro
|
||||
- {{ base_volume_path }}/monitoring/prometheus:/prometheus
|
||||
|
||||
alertmanager:
|
||||
image: prom/alertmanager:latest
|
||||
container_name: alertmanager
|
||||
restart: unless-stopped
|
||||
# Needed to make config files readable
|
||||
# user: "{{ remote_uid }}"
|
||||
command:
|
||||
- --config.file=/etc/alertmanager/alertmanager.yml
|
||||
volumes:
|
||||
- ./alertmanager_config:/etc/alertmanager:ro
|
||||
|
||||
# alerts-discord:
|
||||
# image: rogerrum/alertmanager-discord:1.0.6
|
||||
# container_name: alerts-discord
|
||||
# restart: unless-stopped
|
||||
# env_file:
|
||||
# - .env.alertmanager
|
||||
|
||||
node-exporter:
|
||||
image: prom/node-exporter:latest
|
||||
container_name: node-exporter
|
||||
|
||||
@@ -15,6 +15,19 @@ datasources:
|
||||
url: http://prometheus:9090
|
||||
editable: false
|
||||
|
||||
- name: Alertmanager
|
||||
type: alertmanager
|
||||
access: proxy
|
||||
uid: alertmanager
|
||||
url: http://alertmanager:9093
|
||||
jsonData:
|
||||
# Valid options for implementation include mimir, cortex and prometheus
|
||||
implementation: prometheus
|
||||
# Whether Grafana should send alert instances to this Alertmanager
|
||||
handleGrafanaManagedAlerts: false
|
||||
editable: false
|
||||
|
||||
- name: InfluxDB
|
||||
type: influxdb
|
||||
access: proxy
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
groups:
|
||||
- name: demo-service-alerts
|
||||
rules:
|
||||
- alert: DemoServiceHighErrorRate
|
||||
expr: |
|
||||
(
|
||||
sum without(status, instance) (
|
||||
rate(demo_api_request_duration_seconds_count{status=~"5..",job="demo"}[1m])
|
||||
)
|
||||
/
|
||||
sum without(status, instance) (
|
||||
rate(demo_api_request_duration_seconds_count{job="demo"}[1m])
|
||||
) * 100 > 0.5
|
||||
)
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
title: 'High 5xx rate for {{'{{ $labels.method }}'}} on {{'{{ $labels.path }}'}}'
|
||||
description: 'The 5xx error rate for path {{'{{ $labels.path }}'}} with method {{'{{ $labels.method }}'}} in {{'{{ $labels.job }}'}} is {{'{{ printf "%.2f" $value }}'}}%.'
|
||||
@@ -5,6 +5,11 @@ global:
|
||||
external_labels:
|
||||
monitor: "{{ ansible_host }}"
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: ["alertmanager:9093"]
|
||||
|
||||
scrape_configs:
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
@@ -30,7 +35,15 @@ scrape_configs:
|
||||
static_configs:
|
||||
- targets: ["promtail:9080"]
|
||||
|
||||
- job_name: 'demo'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'demo.promlabs.com:10000'
|
||||
- 'demo.promlabs.com:10001'
|
||||
- 'demo.promlabs.com:10002'
|
||||
|
||||
rule_files:
|
||||
- "/etc/prometheus/demo-alerts.yml"
|
||||
- "/etc/prometheus/extra/rules/*.yml"
|
||||
- "/etc/prometheus/extra/rules/*.json"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user