Compare commits

...

2 Commits

11 changed files with 24 additions and 153 deletions

View File

@@ -8,6 +8,22 @@ running on top of TrueNAS SCALE, separating all the docker stuff from the appliance
 # Notes
+## Monitoring
+The monitoring stack is set up to monitor all the containers and the host.
+This is a work in progress: Grafana is set up with grafanalib, a Python library that generates Grafana dashboards.
+The dashboards are generated from Python scripts in
+[grafana_config/dashboards](roles/alpina/templates/services/monitoring/grafana_config/dashboards).
+This requires a custom Grafana image, which is built from the
+[Dockerfile](roles/alpina/templates/services/monitoring/Dockerfile).
+This also means the image has to be rebuilt manually whenever the dashboards are updated.
+From the services/monitoring directory, run:
+```bash
+docker compose up -d --build --force-recreate grafana
+```
 ## IPv6
 The current configuration is designed to work with IPv6.
 However, because of how (not properly) I'm doing the subnetting
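
For context on the dashboards-as-code workflow described in the added README section: grafanalib scripts conventionally expose a `dashboard` object, which the library's `generate-dashboard` tool serializes to Grafana's JSON format. A minimal sketch of such a script (illustrative only — the titles and the PromQL query below are made up, not taken from the actual scripts in grafana_config/dashboards):

```python
# containers.dashboard.py -- hypothetical example, not a file from this repo.
# grafanalib's generate-dashboard tool imports this module and serializes
# the `dashboard` object to Grafana's JSON dashboard format.
from grafanalib.core import Dashboard, Graph, Row, Target

dashboard = Dashboard(
    title="Containers (example)",
    rows=[
        Row(panels=[
            Graph(
                title="Container CPU usage",
                dataSource="Prometheus",
                targets=[
                    Target(
                        # Per-container CPU rate over the last 5 minutes
                        expr='rate(container_cpu_usage_seconds_total{name!=""}[5m])',
                        refId="A",
                    ),
                ],
            ),
        ]),
    ],
).auto_panel_ids()  # assign stable panel ids automatically
```

Rendering would then be something like `generate-dashboard -o containers.json containers.dashboard.py`, presumably invoked from the custom image's Dockerfile.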

View File

@@ -1 +0,0 @@
-DISCORD_WEBHOOK={{ alertmanager_discord_webhook }}

View File

@@ -1,68 +0,0 @@
-# The root route on which each incoming alert enters.
-route:
-  group_by: ["alertname", "job"]
-  group_wait: 20s
-  group_interval: 5m
-  repeat_interval: 3h
-  receiver: discord_webhook
-receivers:
-  - name: "discord_webhook"
-    discord_configs:
-      - webhook_url: "{{ alertmanager_discord_webhook }}"
-{# - send_resolved: true#}
-{# username: 'Alertmanager'#}
-{# webhook_configs:#}
-{# - send_resolved: true#}
-{# url: '{{ alertmanager_discord_webhook }}'#}
-{# username: 'Alertmanager'#}
-{# icon_url: 'https://prometheus.io/assets/icon.png'#}
-{# icon_emoji: ':alert:'#}
-{# send_resolved: true#}
-{# text: "{{ .CommonAnnotations.summary }}"#}
-{# title: "{{ .CommonLabels.alertname }}"#}
-{# color: '{{ if eq .Status "firing" }}#FF0000{{ else }}#00FF00{{ end }}'#}
-{# footer: '{{ .CommonLabels.monitor }}'#}
-{# footer_icon: 'https://prometheus.io/assets/icon.png'#}
-{# actions:#}
-{# - type: 'button'#}
-{# text: 'Open in Grafana'#}
-{# url: '{{ .ExternalURL }}'#}
-{# style: 'primary'#}
-{# send_resolved: true#}
-{# confirm:#}
-{# title: 'Are you sure?'#}
-{# text: 'This will open Grafana in a new tab.'#}
-{# ok_text: 'Yes'#}
-{# dismiss_text: 'No'#}
-{# fields:#}
-{# - title: 'Description'#}
-{# value: "{{ .CommonAnnotations.description }}"#}
-{# short: false#}
-{# - title: 'Details'#}
-{# value: "{{ .CommonAnnotations.details }}"#}
-{# short: false#}
-{# - title: 'Severity'#}
-{# value: '{{ if eq .Labels.severity "critical" }}Critical{{ else if eq .Labels.severity "warning" }}Warning{{ else }}Info{{ end }}'#}
-{# short: true#}
-{# - title: 'Host'#}
-{# value: '{{ .CommonLabels.monitor }}'#}
-{# short: true#}
-{# - title: 'Starts At'#}
-{# value: '{{ .StartsAt.Format "2006-01-02 15:04:05" }}'#}
-{# short: true#}
-{# - title: 'Ends At'#}
-{# value: '{{ .EndsAt.Format "2006-01-02 15:04:05" }}'#}
-{# short: true#}
-{# - title: 'Runbook'#}
-{# value: '{{ .CommonAnnotations.runbook_url }}'#}
-{# short: true#}
-{# - title: 'Dashboard'#}
-{# value: '{{ .CommonAnnotations.dashboard_url }}'#}
-{# short: true#}
-{# - title: 'Alerting Rule'#}
-{# value: '{{ .CommonLabels.alertname }}'#}
-{# short: true#}
-{# - title: 'Alerting Rule Description'#}
-{# value: '{{ .CommonLabels.alertname }}'#}
-{# short: true#}

View File

@@ -60,8 +60,6 @@ services:
   prometheus:
     image: prom/prometheus:latest
     container_name: prometheus
-    labels:
-      - {{ helpers.traefik_labels('prom', port='9090') | indent(6) }}
     restart: unless-stopped
     # Needed to make config files readable (not anymore, TODO: remove)
     user: "{{ remote_uid }}"
@@ -74,19 +72,6 @@ services:
       - {{ base_volume_path }}/monitoring/prometheus_configs:/etc/prometheus/extra:ro
       - {{ base_volume_path }}/monitoring/prometheus:/prometheus
-  alertmanager:
-    image: prom/alertmanager:latest
-    container_name: alertmanager
-    labels:
-      - {{ helpers.traefik_labels('alert', port='9093') | indent(6) }}
-    restart: unless-stopped
-    command:
-      - --config.file=/etc/alertmanager/alertmanager.yml
-      - --web.external-url=https://alert.{{ domain }}/
-    volumes:
-      - ./alertmanager_config:/etc/alertmanager:ro
-      - {{ base_volume_path }}/monitoring/alertmanager:/alertmanager
   node-exporter:
     image: prom/node-exporter:latest
    container_name: node-exporter
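
An aside on the `helpers.traefik_labels` macro used (and removed) above: it is project-local and not shown in this diff, but given its arguments it presumably expands to the standard Traefik v2 router/service labels, roughly along these lines (a guess with a placeholder hostname, not the macro's actual output):

```yaml
# Hypothetical expansion of helpers.traefik_labels('prom', port='9090');
# the label names are standard Traefik v2, the domain is a placeholder.
- traefik.enable=true
- traefik.http.routers.prom.rule=Host(`prom.example.com`)
- traefik.http.services.prom.loadbalancer.server.port=9090
```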

View File

@@ -3,7 +3,7 @@ apiVersion: 1
 providers:
   - name: "Grafana"
     org_id: 1
-    folder: "Services"
+    folder: "Alpina"
     type: "file"
     options:
       path: "/etc/grafana/provisioning/dashboards"

View File

@@ -15,18 +15,6 @@ datasources:
     url: http://prometheus:9090
     editable: false
-  - name: Alertmanager
-    type: alertmanager
-    access: proxy
-    uid: alertmanager
-    url: http://alertmanager:9093
-    jsonData:
-      # Valid options for implementation include mimir, cortex and prometheus
-      implementation: prometheus
-      # Whether Grafana should send alert instances to this Alertmanager
-      handleGrafanaManagedAlerts: true
-    editable: false
   - name: InfluxDB
     type: influxdb
     access: proxy

View File

@@ -26,5 +26,5 @@ schema_config:
       store: tsdb
 # TODO: Figure this out
-ruler:
-  alertmanager_url: http://localhost:9093
+# ruler:
+#   alertmanager_url: http://localhost:9093
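
On the `# TODO: Figure this out` left in the Loki config: if the ruler is re-enabled later, it needs rule storage configured as well, not just an Alertmanager URL. A sketch using field names from Loki's configuration reference (untested here, and the `alertmanager:9093` target no longer exists after this commit):

```yaml
# Sketch of a fuller Loki ruler block; field names per Loki's config
# reference, values are placeholders.
ruler:
  storage:
    type: local
    local:
      directory: /loki/rules
  rule_path: /tmp/loki/rules-temp
  alertmanager_url: http://alertmanager:9093
  enable_api: true
```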

View File

@@ -1,23 +0,0 @@
-groups:
-  - name: qbit-low-traffic
-    interval: 1m
-    rules:
-      - alert: QbitLowTraffic
-        expr: |
-          rate(container_network_transmit_bytes_total{name=~"gluetun"}[1m]) < 1024
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          title: 'Low traffic on qBit'
-          description: |
-            The traffic on qBittorrent is lower than 1KiB/s for 2 minutes.
-            Last value was x bytes/s.
-            [Grafana Dashboard](https://grafana.{{ domain }}/d/containers?orgId=1)
-            [View in Grafana](https://grafana.{{ domain }}/d/containers?orgId=1&viewPanel=3)
-          __dashboard__uid: 'containers'
-          __orgId__: 1
-          __panelId__: 3

View File

@@ -1,20 +0,0 @@
-groups:
-  - name: demo-service-alerts
-    rules:
-      - alert: DemoServiceHighErrorRate
-        expr: |
-          (
-            sum without(status, instance) (
-              rate(demo_api_request_duration_seconds_count{status=~"5..",job="demo"}[1m])
-            )
-          /
-            sum without(status, instance) (
-              rate(demo_api_request_duration_seconds_count{job="demo"}[1m])
-            ) * 100 > 0.5
-          )
-        for: 1m
-        labels:
-          severity: critical
-        annotations:
-          title: 'High 5xx rate for {{'{{ $labels.method }}'}} on {{'{{ $labels.path }}'}}'
-          description: 'The 5xx error rate for path {{'{{ $labels.path }}'}} with method {{'{{ $labels.method }}'}} in {{'{{ $labels.job }}'}} is {{'{{ printf "%.2f" $value }}'}}%.'
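
A note on the `{{'{{ ... }}'}}` idiom in these annotations: because the rule file is itself an Ansible/Jinja template, Go-template syntax has to be smuggled through as a Jinja string literal — Jinja evaluates `{{ '{{ $labels.path }}' }}` to the literal text `{{ $labels.path }}`. After templating, Prometheus receives plain alert-template markup:

```yaml
# Rendered output of the title annotation above, as Prometheus sees it:
title: 'High 5xx rate for {{ $labels.method }} on {{ $labels.path }}'
```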

View File

@@ -5,11 +5,6 @@ global:
   external_labels:
     monitor: "{{ ansible_host }}"
-alerting:
-  alertmanagers:
-    - static_configs:
-        - targets: ["alertmanager:9093"]
 scrape_configs:
   - job_name: "prometheus"
     static_configs:
@@ -43,7 +38,6 @@ scrape_configs:
       - 'demo.promlabs.com:10002'
 rule_files:
-{# - "/etc/prometheus/container.alerts.yml"#}
   - "/etc/prometheus/extra/rules/*.yml"
   - "/etc/prometheus/extra/rules/*.json"

View File

@@ -5,11 +5,11 @@
   post_tasks:
     - name: Docker prune objects
      docker_prune:
-        containers: yes
-        images: yes
+        containers: true
+        # Keep images for building grafana
+        images: true
         images_filters:
-          dangling: false
+          until: "720h"
         networks: true
         volumes: true
-        builder_cache: true
-      when: false
+        builder_cache: false
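
For reference, `until: "720h"` (≈ 30 days) is Docker's standard prune filter, and with `when: false` removed the task actually runs now. It behaves roughly like the CLI sequence below (approximate — exact semantics, e.g. whether non-dangling images are included, follow the docker_prune module's defaults):

```bash
# Approximate CLI equivalent of the updated docker_prune task.
docker container prune -f
docker image prune -f --filter "until=720h"   # keep images newer than ~30 days
docker network prune -f
docker volume prune -f
# builder cache is no longer pruned (builder_cache: false)
```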