WIP: monitoring improvements

This commit is contained in:
2024-10-12 02:28:52 -07:00
parent dd0330c85a
commit 6504100cd5
17 changed files with 650 additions and 314 deletions

View File

@@ -10,7 +10,7 @@
file:
path: "{{ current_stack_dest }}/{{ item.path }}"
state: directory
mode: "700"
mode: "755"
loop: "{{ lookup('community.general.filetree', current_stack_source) }}"
when: item.state == "directory"
@@ -18,9 +18,17 @@
template:
src: "{{ item.src }}"
dest: "{{ current_stack_dest }}/{{ item.path | regex_replace('\\.j2$', '') }}"
mode: "600"
mode: "644"
loop: "{{ lookup('community.general.filetree', current_stack_source) }}"
when: item.state == "file"
when: item.state == "file" and item.path | regex_search('\\.j2$')
- name: Generate {{ current_stack_name }} deployment from static files
copy:
src: "{{ item.src }}"
dest: "{{ current_stack_dest }}/{{ item.path }}"
mode: "644"
loop: "{{ lookup('community.general.filetree', current_stack_source) }}"
when: item.state == "file" and not item.path | regex_search('\\.j2$')
- name: Deploy docker-compose for {{ current_stack_name }}
community.docker.docker_compose_v2:
@@ -30,5 +38,5 @@
remove_orphans: yes
register: docker_compose_output
- debug:
var: docker_compose_output
# - debug:
# var: docker_compose_output

View File

@@ -0,0 +1 @@
DISCORD_WEBHOOK={{ alertmanager_discord_webhook }}

View File

@@ -0,0 +1,13 @@
FROM python:3-alpine AS builder
RUN pip install grafanalib
COPY ./grafana_config/dashboards /dashboards
RUN generate-dashboards /dashboards/*.dashboard.py
FROM grafana/grafana:latest
#COPY ./grafana_config /etc/grafana
COPY ./grafana_config/dashboards/*.yaml /etc/grafana/provisioning/dashboards
COPY --from=builder /dashboards/*.json /etc/grafana/provisioning/dashboards

View File

@@ -0,0 +1,68 @@
# The root route on which each incoming alert enters.
route:
group_by: ["alertname", "job"]
group_wait: 20s
group_interval: 5m
repeat_interval: 3h
receiver: discord_webhook
receivers:
- name: "discord_webhook"
discord_configs:
- webhook_url: "{{ alertmanager_discord_webhook }}"
{# - send_resolved: true#}
{# username: 'Alertmanager'#}
{# webhook_configs:#}
{# - send_resolved: true#}
{# url: '{{ alertmanager_discord_webhook }}'#}
{# username: 'Alertmanager'#}
{# icon_url: 'https://prometheus.io/assets/icon.png'#}
{# icon_emoji: ':alert:'#}
{# send_resolved: true#}
{# text: "{{ .CommonAnnotations.summary }}"#}
{# title: "{{ .CommonLabels.alertname }}"#}
{# color: '{{ if eq .Status "firing" }}#FF0000{{ else }}#00FF00{{ end }}'#}
{# footer: '{{ .CommonLabels.monitor }}'#}
{# footer_icon: 'https://prometheus.io/assets/icon.png'#}
{# actions:#}
{# - type: 'button'#}
{# text: 'Open in Grafana'#}
{# url: '{{ .ExternalURL }}'#}
{# style: 'primary'#}
{# send_resolved: true#}
{# confirm:#}
{# title: 'Are you sure?'#}
{# text: 'This will open Grafana in a new tab.'#}
{# ok_text: 'Yes'#}
{# dismiss_text: 'No'#}
{# fields:#}
{# - title: 'Description'#}
{# value: "{{ .CommonAnnotations.description }}"#}
{# short: false#}
{# - title: 'Details'#}
{# value: "{{ .CommonAnnotations.details }}"#}
{# short: false#}
{# - title: 'Severity'#}
{# value: '{{ if eq .Labels.severity "critical" }}Critical{{ else if eq .Labels.severity "warning" }}Warning{{ else }}Info{{ end }}'#}
{# short: true#}
{# - title: 'Host'#}
{# value: '{{ .CommonLabels.monitor }}'#}
{# short: true#}
{# - title: 'Starts At'#}
{# value: '{{ .StartsAt.Format "2006-01-02 15:04:05" }}'#}
{# short: true#}
{# - title: 'Ends At'#}
{# value: '{{ .EndsAt.Format "2006-01-02 15:04:05" }}'#}
{# short: true#}
{# - title: 'Runbook'#}
{# value: '{{ .CommonAnnotations.runbook_url }}'#}
{# short: true#}
{# - title: 'Dashboard'#}
{# value: '{{ .CommonAnnotations.dashboard_url }}'#}
{# short: true#}
{# - title: 'Alerting Rule'#}
{# value: '{{ .CommonLabels.alertname }}'#}
{# short: true#}
{# - title: 'Alerting Rule Description'#}
{# value: '{{ .CommonLabels.alertname }}'#}
{# short: true#}

View File

@@ -7,25 +7,30 @@ networks:
services:
grafana:
image: grafana/grafana:latest
{# image: grafana/grafana:latest#}
build:
context: .
dockerfile: Dockerfile
container_name: grafana
labels:
- {{ helpers.traefik_labels('grafana', port='3000') | indent(6) }}
restart: unless-stopped
# Needed to make config files readable
# Needed to make config files readable (not anymore, TODO: remove)
user: "{{ remote_uid }}"
networks:
- default
- traefik_traefik
volumes:
- {{ base_volume_path }}/monitoring/grafana:/var/lib/grafana
- ./grafana_config:/etc/grafana:ro
- ./grafana_config/grafana.ini:/etc/grafana/grafana.ini:ro
- ./grafana_config/datasources:/etc/grafana/provisioning/datasources:ro
{# - ./grafana_config:/etc/grafana:ro#}
loki:
image: grafana/loki:latest
container_name: loki
restart: unless-stopped
# Needed to make config files readable
# Needed to make config files readable (not anymore, TODO: remove)
user: "{{ remote_uid }}"
command:
- -config.file=/etc/loki/loki-config.yaml
@@ -56,7 +61,7 @@ services:
image: prom/prometheus:latest
container_name: prometheus
restart: unless-stopped
# Needed to make config files readable
# Needed to make config files readable (not anymore, TODO: remove)
user: "{{ remote_uid }}"
command:
- --config.file=/etc/prometheus/prometheus.yml
@@ -66,6 +71,16 @@ services:
- {{ base_volume_path }}/monitoring/prometheus_configs:/etc/prometheus/extra:ro
- {{ base_volume_path }}/monitoring/prometheus:/prometheus
alertmanager:
image: prom/alertmanager:latest
container_name: alertmanager
restart: unless-stopped
command:
- --config.file=/etc/alertmanager/alertmanager.yml
volumes:
- ./alertmanager_config:/etc/alertmanager:ro
# TODO: add volume for alertmanager data
node-exporter:
image: prom/node-exporter:latest
container_name: node-exporter

View File

@@ -0,0 +1,9 @@
apiVersion: 1
providers:
- name: "Grafana"
org_id: 1
folder: "Services"
type: "file"
options:
path: "/etc/grafana/provisioning/dashboards"

View File

@@ -0,0 +1,51 @@
from grafanalib.core import (
Dashboard, TimeSeries, GaugePanel,
Target, GridPos,
OPS_FORMAT
)
dashboard = Dashboard(
title="Python generated example dashboard",
description="Example dashboard using the Random Walk and default Prometheus datasource",
tags=[
'example'
],
timezone="browser",
panels=[
TimeSeries(
title="Random Walk",
dataSource='default',
targets=[
Target(
datasource='grafana',
expr='example',
),
],
gridPos=GridPos(h=8, w=16, x=0, y=0),
),
GaugePanel(
title="Random Walk",
dataSource='default',
targets=[
Target(
datasource='grafana',
expr='example',
),
],
gridPos=GridPos(h=4, w=4, x=17, y=0),
),
TimeSeries(
title="Prometheus http requests",
dataSource='prometheus',
targets=[
Target(
expr='rate(prometheus_http_requests_total[5m])',
legendFormat="{{ handler }}",
refId='A',
),
],
unit=OPS_FORMAT,
gridPos=GridPos(h=8, w=16, x=0, y=10),
),
],
).auto_panel_ids()

View File

@@ -0,0 +1,59 @@
from grafanalib.core import (
Dashboard, TimeSeries, GaugePanel,
Target, GridPos,
OPS_FORMAT, Templating, Template, REFRESH_ON_TIME_RANGE_CHANGE
)
from grafanalib.formatunits import BYTES_IEC
dashboard = Dashboard(
title="Containers",
description="Data for compose projects from default Prometheus datasource collected by Cadvisor",
tags=[
'example'
],
templating=Templating(list=[
# TODO: test how much of this is actually necessary
Template(
name="compose_project",
label="compose_project",
dataSource="prometheus",
query='label_values({__name__=~"container.*"}, container_label_com_docker_compose_project)',
includeAll=True,
multi=True,
hide=0,
sort=1,
type="query",
refresh=REFRESH_ON_TIME_RANGE_CHANGE,
),
Template(
name="container_name",
label="container_name",
dataSource="prometheus",
query='label_values({__name__=~"container.*", container_label_com_docker_compose_project=~"$compose_project"}, name)',
includeAll=True,
multi=True,
hide=0,
sort=1,
type="query",
refresh=REFRESH_ON_TIME_RANGE_CHANGE,
),
]),
timezone="browser",
panels=[
TimeSeries(
title="Container Memory Usage",
# dataSource='prometheus',
targets=[
Target(
datasource='prometheus',
expr='max by (name) (container_memory_usage_bytes{name=~"$container_name", container_label_com_docker_compose_project=~"$compose_project"})',
legendFormat="{{ name }}",
refId='A',
),
],
unit=BYTES_IEC,
gridPos=GridPos(h=8, w=16, x=0, y=0),
),
],
).auto_panel_ids()

View File

@@ -15,6 +15,18 @@ datasources:
url: http://prometheus:9090
editable: false
- name: Alertmanager
type: alertmanager
access: proxy
uid: alertmanager
url: http://alertmanager:9093
jsonData:
# Valid options for implementation include mimir, cortex and prometheus
implementation: prometheus
# Whether Grafana should send alert instances to this Alertmanager
handleGrafanaManagedAlerts: true
editable: false
- name: InfluxDB
type: influxdb
access: proxy

View File

@@ -0,0 +1,20 @@
groups:
- name: demo-service-alerts
rules:
- alert: DemoServiceHighErrorRate
expr: |
(
sum without(status, instance) (
rate(demo_api_request_duration_seconds_count{status=~"5..",job="demo"}[1m])
)
/
sum without(status, instance) (
rate(demo_api_request_duration_seconds_count{job="demo"}[1m])
) * 100 > 0.5
)
for: 1m
labels:
severity: critical
annotations:
title: 'High 5xx rate for {{'{{ $labels.method }}'}} on {{'{{ $labels.path }}'}}'
description: 'The 5xx error rate for path {{'{{ $labels.path }}'}} with method {{'{{ $labels.method }}'}} in {{'{{ $labels.job }}'}} is {{'{{ printf "%.2f" $value }}'}}%.'

View File

@@ -5,6 +5,11 @@ global:
external_labels:
monitor: "{{ ansible_host }}"
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager:9093"]
scrape_configs:
- job_name: "prometheus"
static_configs:
@@ -30,7 +35,15 @@ scrape_configs:
static_configs:
- targets: ["promtail:9080"]
- job_name: 'demo'
static_configs:
- targets:
- 'demo.promlabs.com:10000'
- 'demo.promlabs.com:10001'
- 'demo.promlabs.com:10002'
rule_files:
- "/etc/prometheus/demo-alerts.yml"
- "/etc/prometheus/extra/rules/*.yml"
- "/etc/prometheus/extra/rules/*.json"