WIP: monitoring improvements
This commit is contained in:
@@ -10,7 +10,7 @@
|
||||
file:
|
||||
path: "{{ current_stack_dest }}/{{ item.path }}"
|
||||
state: directory
|
||||
mode: "700"
|
||||
mode: "755"
|
||||
loop: "{{ lookup('community.general.filetree', current_stack_source) }}"
|
||||
when: item.state == "directory"
|
||||
|
||||
@@ -18,9 +18,17 @@
|
||||
template:
|
||||
src: "{{ item.src }}"
|
||||
dest: "{{ current_stack_dest }}/{{ item.path | regex_replace('\\.j2$', '') }}"
|
||||
mode: "600"
|
||||
mode: "644"
|
||||
loop: "{{ lookup('community.general.filetree', current_stack_source) }}"
|
||||
when: item.state == "file"
|
||||
when: item.state == "file" and item.path | regex_search('\\.j2$')
|
||||
|
||||
- name: Generate {{ current_stack_name }} deployment from static files
|
||||
copy:
|
||||
src: "{{ item.src }}"
|
||||
dest: "{{ current_stack_dest }}/{{ item.path }}"
|
||||
mode: "644"
|
||||
loop: "{{ lookup('community.general.filetree', current_stack_source) }}"
|
||||
when: item.state == "file" and not item.path | regex_search('\\.j2$')
|
||||
|
||||
- name: Deploy docker-compose for {{ current_stack_name }}
|
||||
community.docker.docker_compose_v2:
|
||||
@@ -30,5 +38,5 @@
|
||||
remove_orphans: yes
|
||||
register: docker_compose_output
|
||||
|
||||
- debug:
|
||||
var: docker_compose_output
|
||||
# - debug:
|
||||
# var: docker_compose_output
|
||||
|
@@ -0,0 +1 @@
|
||||
DISCORD_WEBHOOK={{ alertmanager_discord_webhook }}
|
13
roles/alpina/templates/services/monitoring/Dockerfile
Normal file
13
roles/alpina/templates/services/monitoring/Dockerfile
Normal file
@@ -0,0 +1,13 @@
|
||||
FROM python:3-alpine AS builder
|
||||
|
||||
RUN pip install grafanalib
|
||||
|
||||
COPY ./grafana_config/dashboards /dashboards
|
||||
|
||||
RUN generate-dashboards /dashboards/*.dashboard.py
|
||||
|
||||
FROM grafana/grafana:latest
|
||||
|
||||
#COPY ./grafana_config /etc/grafana
|
||||
COPY ./grafana_config/dashboards/*.yaml /etc/grafana/provisioning/dashboards
|
||||
COPY --from=builder /dashboards/*.json /etc/grafana/provisioning/dashboards
|
@@ -0,0 +1,68 @@
|
||||
# The root route on which each incoming alert enters.
|
||||
route:
|
||||
group_by: ["alertname", "job"]
|
||||
group_wait: 20s
|
||||
group_interval: 5m
|
||||
repeat_interval: 3h
|
||||
receiver: discord_webhook
|
||||
|
||||
receivers:
|
||||
- name: "discord_webhook"
|
||||
discord_configs:
|
||||
- webhook_url: "{{ alertmanager_discord_webhook }}"
|
||||
{# - send_resolved: true#}
|
||||
{# username: 'Alertmanager'#}
|
||||
{# webhook_configs:#}
|
||||
{# - send_resolved: true#}
|
||||
{# url: '{{ alertmanager_discord_webhook }}'#}
|
||||
{# username: 'Alertmanager'#}
|
||||
{# icon_url: 'https://prometheus.io/assets/icon.png'#}
|
||||
{# icon_emoji: ':alert:'#}
|
||||
{# send_resolved: true#}
|
||||
{# text: "{{ .CommonAnnotations.summary }}"#}
|
||||
{# title: "{{ .CommonLabels.alertname }}"#}
|
||||
{# color: '{{ if eq .Status "firing" }}#FF0000{{ else }}#00FF00{{ end }}'#}
|
||||
{# footer: '{{ .CommonLabels.monitor }}'#}
|
||||
{# footer_icon: 'https://prometheus.io/assets/icon.png'#}
|
||||
{# actions:#}
|
||||
{# - type: 'button'#}
|
||||
{# text: 'Open in Grafana'#}
|
||||
{# url: '{{ .ExternalURL }}'#}
|
||||
{# style: 'primary'#}
|
||||
{# send_resolved: true#}
|
||||
{# confirm:#}
|
||||
{# title: 'Are you sure?'#}
|
||||
{# text: 'This will open Grafana in a new tab.'#}
|
||||
{# ok_text: 'Yes'#}
|
||||
{# dismiss_text: 'No'#}
|
||||
{# fields:#}
|
||||
{# - title: 'Description'#}
|
||||
{# value: "{{ .CommonAnnotations.description }}"#}
|
||||
{# short: false#}
|
||||
{# - title: 'Details'#}
|
||||
{# value: "{{ .CommonAnnotations.details }}"#}
|
||||
{# short: false#}
|
||||
{# - title: 'Severity'#}
|
||||
{# value: '{{ if eq .Labels.severity "critical" }}Critical{{ else if eq .Labels.severity "warning" }}Warning{{ else }}Info{{ end }}'#}
|
||||
{# short: true#}
|
||||
{# - title: 'Host'#}
|
||||
{# value: '{{ .CommonLabels.monitor }}'#}
|
||||
{# short: true#}
|
||||
{# - title: 'Starts At'#}
|
||||
{# value: '{{ .StartsAt.Format "2006-01-02 15:04:05" }}'#}
|
||||
{# short: true#}
|
||||
{# - title: 'Ends At'#}
|
||||
{# value: '{{ .EndsAt.Format "2006-01-02 15:04:05" }}'#}
|
||||
{# short: true#}
|
||||
{# - title: 'Runbook'#}
|
||||
{# value: '{{ .CommonAnnotations.runbook_url }}'#}
|
||||
{# short: true#}
|
||||
{# - title: 'Dashboard'#}
|
||||
{# value: '{{ .CommonAnnotations.dashboard_url }}'#}
|
||||
{# short: true#}
|
||||
{# - title: 'Alerting Rule'#}
|
||||
{# value: '{{ .CommonLabels.alertname }}'#}
|
||||
{# short: true#}
|
||||
{# - title: 'Alerting Rule Description'#}
|
||||
{# value: '{{ .CommonLabels.alertname }}'#}
|
||||
{# short: true#}
|
@@ -7,25 +7,30 @@ networks:
|
||||
|
||||
services:
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
{# image: grafana/grafana:latest#}
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
container_name: grafana
|
||||
labels:
|
||||
- {{ helpers.traefik_labels('grafana', port='3000') | indent(6) }}
|
||||
restart: unless-stopped
|
||||
# Needed to make config files readable
|
||||
# Needed to make config files readable (not anymore, TODO: remove)
|
||||
user: "{{ remote_uid }}"
|
||||
networks:
|
||||
- default
|
||||
- traefik_traefik
|
||||
volumes:
|
||||
- {{ base_volume_path }}/monitoring/grafana:/var/lib/grafana
|
||||
- ./grafana_config:/etc/grafana:ro
|
||||
- ./grafana_config/grafana.ini:/etc/grafana/grafana.ini:ro
|
||||
- ./grafana_config/datasources:/etc/grafana/provisioning/datasources:ro
|
||||
{# - ./grafana_config:/etc/grafana:ro#}
|
||||
|
||||
loki:
|
||||
image: grafana/loki:latest
|
||||
container_name: loki
|
||||
restart: unless-stopped
|
||||
# Needed to make config files readable
|
||||
# Needed to make config files readable (not anymore, TODO: remove)
|
||||
user: "{{ remote_uid }}"
|
||||
command:
|
||||
- -config.file=/etc/loki/loki-config.yaml
|
||||
@@ -56,7 +61,7 @@ services:
|
||||
image: prom/prometheus:latest
|
||||
container_name: prometheus
|
||||
restart: unless-stopped
|
||||
# Needed to make config files readable
|
||||
# Needed to make config files readable (not anymore, TODO: remove)
|
||||
user: "{{ remote_uid }}"
|
||||
command:
|
||||
- --config.file=/etc/prometheus/prometheus.yml
|
||||
@@ -66,6 +71,16 @@ services:
|
||||
- {{ base_volume_path }}/monitoring/prometheus_configs:/etc/prometheus/extra:ro
|
||||
- {{ base_volume_path }}/monitoring/prometheus:/prometheus
|
||||
|
||||
alertmanager:
|
||||
image: prom/alertmanager:latest
|
||||
container_name: alertmanager
|
||||
restart: unless-stopped
|
||||
command:
|
||||
- --config.file=/etc/alertmanager/alertmanager.yml
|
||||
volumes:
|
||||
- ./alertmanager_config:/etc/alertmanager:ro
|
||||
# TODO: add volume for alertmanager data
|
||||
|
||||
node-exporter:
|
||||
image: prom/node-exporter:latest
|
||||
container_name: node-exporter
|
||||
|
@@ -0,0 +1,9 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: "Grafana"
|
||||
org_id: 1
|
||||
folder: "Services"
|
||||
type: "file"
|
||||
options:
|
||||
path: "/etc/grafana/provisioning/dashboards"
|
@@ -0,0 +1,51 @@
|
||||
from grafanalib.core import (
|
||||
Dashboard, TimeSeries, GaugePanel,
|
||||
Target, GridPos,
|
||||
OPS_FORMAT
|
||||
)
|
||||
|
||||
dashboard = Dashboard(
|
||||
title="Python generated example dashboard",
|
||||
description="Example dashboard using the Random Walk and default Prometheus datasource",
|
||||
tags=[
|
||||
'example'
|
||||
],
|
||||
timezone="browser",
|
||||
panels=[
|
||||
TimeSeries(
|
||||
title="Random Walk",
|
||||
dataSource='default',
|
||||
targets=[
|
||||
Target(
|
||||
datasource='grafana',
|
||||
expr='example',
|
||||
),
|
||||
],
|
||||
gridPos=GridPos(h=8, w=16, x=0, y=0),
|
||||
),
|
||||
GaugePanel(
|
||||
title="Random Walk",
|
||||
dataSource='default',
|
||||
targets=[
|
||||
Target(
|
||||
datasource='grafana',
|
||||
expr='example',
|
||||
),
|
||||
],
|
||||
gridPos=GridPos(h=4, w=4, x=17, y=0),
|
||||
),
|
||||
TimeSeries(
|
||||
title="Prometheus http requests",
|
||||
dataSource='prometheus',
|
||||
targets=[
|
||||
Target(
|
||||
expr='rate(prometheus_http_requests_total[5m])',
|
||||
legendFormat="{{ handler }}",
|
||||
refId='A',
|
||||
),
|
||||
],
|
||||
unit=OPS_FORMAT,
|
||||
gridPos=GridPos(h=8, w=16, x=0, y=10),
|
||||
),
|
||||
],
|
||||
).auto_panel_ids()
|
@@ -0,0 +1,59 @@
|
||||
from grafanalib.core import (
|
||||
Dashboard, TimeSeries, GaugePanel,
|
||||
Target, GridPos,
|
||||
OPS_FORMAT, Templating, Template, REFRESH_ON_TIME_RANGE_CHANGE
|
||||
)
|
||||
from grafanalib.formatunits import BYTES_IEC
|
||||
|
||||
dashboard = Dashboard(
|
||||
title="Containers",
|
||||
description="Data for compose projects from default Prometheus datasource collected by Cadvisor",
|
||||
tags=[
|
||||
'example'
|
||||
],
|
||||
templating=Templating(list=[
|
||||
# TODO: test how much of this is actually necessary
|
||||
Template(
|
||||
name="compose_project",
|
||||
label="compose_project",
|
||||
dataSource="prometheus",
|
||||
query='label_values({__name__=~"container.*"}, container_label_com_docker_compose_project)',
|
||||
includeAll=True,
|
||||
multi=True,
|
||||
hide=0,
|
||||
sort=1,
|
||||
type="query",
|
||||
refresh=REFRESH_ON_TIME_RANGE_CHANGE,
|
||||
),
|
||||
Template(
|
||||
name="container_name",
|
||||
label="container_name",
|
||||
dataSource="prometheus",
|
||||
query='label_values({__name__=~"container.*", container_label_com_docker_compose_project=~"$compose_project"}, name)',
|
||||
includeAll=True,
|
||||
multi=True,
|
||||
hide=0,
|
||||
sort=1,
|
||||
type="query",
|
||||
refresh=REFRESH_ON_TIME_RANGE_CHANGE,
|
||||
|
||||
),
|
||||
]),
|
||||
timezone="browser",
|
||||
panels=[
|
||||
TimeSeries(
|
||||
title="Container Memory Usage",
|
||||
# dataSource='prometheus',
|
||||
targets=[
|
||||
Target(
|
||||
datasource='prometheus',
|
||||
expr='max by (name) (container_memory_usage_bytes{name=~"$container_name", container_label_com_docker_compose_project=~"$compose_project"})',
|
||||
legendFormat="{{ name }}",
|
||||
refId='A',
|
||||
),
|
||||
],
|
||||
unit=BYTES_IEC,
|
||||
gridPos=GridPos(h=8, w=16, x=0, y=0),
|
||||
),
|
||||
],
|
||||
).auto_panel_ids()
|
@@ -15,6 +15,18 @@ datasources:
|
||||
url: http://prometheus:9090
|
||||
editable: false
|
||||
|
||||
- name: Alertmanager
|
||||
type: alertmanager
|
||||
access: proxy
|
||||
uid: alertmanager
|
||||
url: http://alertmanager:9093
|
||||
jsonData:
|
||||
# Valid options for implementation include mimir, cortex and prometheus
|
||||
implementation: prometheus
|
||||
# Whether Grafana should send alert instances to this Alertmanager
|
||||
handleGrafanaManagedAlerts: true
|
||||
editable: false
|
||||
|
||||
- name: InfluxDB
|
||||
type: influxdb
|
||||
access: proxy
|
@@ -0,0 +1,20 @@
|
||||
groups:
|
||||
- name: demo-service-alerts
|
||||
rules:
|
||||
- alert: DemoServiceHighErrorRate
|
||||
expr: |
|
||||
(
|
||||
sum without(status, instance) (
|
||||
rate(demo_api_request_duration_seconds_count{status=~"5..",job="demo"}[1m])
|
||||
)
|
||||
/
|
||||
sum without(status, instance) (
|
||||
rate(demo_api_request_duration_seconds_count{job="demo"}[1m])
|
||||
) * 100 > 0.5
|
||||
)
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
title: 'High 5xx rate for {{'{{ $labels.method }}'}} on {{'{{ $labels.path }}'}}'
|
||||
description: 'The 5xx error rate for path {{'{{ $labels.path }}'}} with method {{'{{ $labels.method }}'}} in {{'{{ $labels.job }}'}} is {{'{{ printf "%.2f" $value }}'}}%.'
|
@@ -5,6 +5,11 @@ global:
|
||||
external_labels:
|
||||
monitor: "{{ ansible_host }}"
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: ["alertmanager:9093"]
|
||||
|
||||
scrape_configs:
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
@@ -30,7 +35,15 @@ scrape_configs:
|
||||
static_configs:
|
||||
- targets: ["promtail:9080"]
|
||||
|
||||
- job_name: 'demo'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'demo.promlabs.com:10000'
|
||||
- 'demo.promlabs.com:10001'
|
||||
- 'demo.promlabs.com:10002'
|
||||
|
||||
rule_files:
|
||||
- "/etc/prometheus/demo-alerts.yml"
|
||||
- "/etc/prometheus/extra/rules/*.yml"
|
||||
- "/etc/prometheus/extra/rules/*.json"
|
||||
|
||||
|
Reference in New Issue
Block a user