Compare commits

...

2 Commits

11 changed files with 24 additions and 153 deletions

View File

@@ -8,6 +8,22 @@ running on top of TrueNAS SCALE, separating all the docker stuff from the appliance
 # Notes
+## Monitoring
+The monitoring stack is set up to monitor all the containers and the host.
+This is a work in progress: Grafana is set up with grafanalib, a Python library that generates Grafana dashboards.
+The dashboards are generated from Python scripts in
+[grafana_config/dashboards](roles/alpina/templates/services/monitoring/grafana_config/dashboards).
+This requires a custom Grafana image, which is built from the
+[Dockerfile](roles/alpina/templates/services/monitoring/Dockerfile).
+This also means the image has to be rebuilt manually whenever the dashboards are updated.
+From the services/monitoring directory, run:
+```bash
+docker compose up -d --build --force-recreate grafana
+```
 ## IPv6
 The current configuration is designed to work with IPv6.
 However, because of how (not properly) I'm doing the subnetting
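
For context on the dashboards-as-code workflow described in the added README section: grafanalib scripts conventionally expose a `dashboard` object, which the library's `generate-dashboard` tool serializes to Grafana's JSON format. A minimal sketch of such a script (illustrative only — the titles and the PromQL query below are made up, not taken from the actual scripts in grafana_config/dashboards):

```python
# containers.dashboard.py -- hypothetical example, not a file from this repo.
# grafanalib's generate-dashboard tool imports this module and serializes
# the `dashboard` object to Grafana's JSON dashboard format.
from grafanalib.core import Dashboard, Graph, Row, Target

dashboard = Dashboard(
    title="Containers (example)",
    rows=[
        Row(panels=[
            Graph(
                title="Container CPU usage",
                dataSource="Prometheus",
                targets=[
                    Target(
                        # Per-container CPU rate over the last 5 minutes
                        expr='rate(container_cpu_usage_seconds_total{name!=""}[5m])',
                        refId="A",
                    ),
                ],
            ),
        ]),
    ],
).auto_panel_ids()  # assign stable panel ids automatically
```

Rendering would then be something like `generate-dashboard -o containers.json containers.dashboard.py`, presumably invoked from the custom image's Dockerfile.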

View File

@@ -1 +0,0 @@
-DISCORD_WEBHOOK={{ alertmanager_discord_webhook }}

View File

@@ -1,68 +0,0 @@
-# The root route on which each incoming alert enters.
-route:
-  group_by: ["alertname", "job"]
-  group_wait: 20s
-  group_interval: 5m
-  repeat_interval: 3h
-  receiver: discord_webhook
-receivers:
-  - name: "discord_webhook"
-    discord_configs:
-      - webhook_url: "{{ alertmanager_discord_webhook }}"
-{# - send_resolved: true#}
-{# username: 'Alertmanager'#}
-{# webhook_configs:#}
-{# - send_resolved: true#}
-{# url: '{{ alertmanager_discord_webhook }}'#}
-{# username: 'Alertmanager'#}
-{# icon_url: 'https://prometheus.io/assets/icon.png'#}
-{# icon_emoji: ':alert:'#}
-{# send_resolved: true#}
-{# text: "{{ .CommonAnnotations.summary }}"#}
-{# title: "{{ .CommonLabels.alertname }}"#}
-{# color: '{{ if eq .Status "firing" }}#FF0000{{ else }}#00FF00{{ end }}'#}
-{# footer: '{{ .CommonLabels.monitor }}'#}
-{# footer_icon: 'https://prometheus.io/assets/icon.png'#}
-{# actions:#}
-{# - type: 'button'#}
-{# text: 'Open in Grafana'#}
-{# url: '{{ .ExternalURL }}'#}
-{# style: 'primary'#}
-{# send_resolved: true#}
-{# confirm:#}
-{# title: 'Are you sure?'#}
-{# text: 'This will open Grafana in a new tab.'#}
-{# ok_text: 'Yes'#}
-{# dismiss_text: 'No'#}
-{# fields:#}
-{# - title: 'Description'#}
-{# value: "{{ .CommonAnnotations.description }}"#}
-{# short: false#}
-{# - title: 'Details'#}
-{# value: "{{ .CommonAnnotations.details }}"#}
-{# short: false#}
-{# - title: 'Severity'#}
-{# value: '{{ if eq .Labels.severity "critical" }}Critical{{ else if eq .Labels.severity "warning" }}Warning{{ else }}Info{{ end }}'#}
-{# short: true#}
-{# - title: 'Host'#}
-{# value: '{{ .CommonLabels.monitor }}'#}
-{# short: true#}
-{# - title: 'Starts At'#}
-{# value: '{{ .StartsAt.Format "2006-01-02 15:04:05" }}'#}
-{# short: true#}
-{# - title: 'Ends At'#}
-{# value: '{{ .EndsAt.Format "2006-01-02 15:04:05" }}'#}
-{# short: true#}
-{# - title: 'Runbook'#}
-{# value: '{{ .CommonAnnotations.runbook_url }}'#}
-{# short: true#}
-{# - title: 'Dashboard'#}
-{# value: '{{ .CommonAnnotations.dashboard_url }}'#}
-{# short: true#}
-{# - title: 'Alerting Rule'#}
-{# value: '{{ .CommonLabels.alertname }}'#}
-{# short: true#}
-{# - title: 'Alerting Rule Description'#}
-{# value: '{{ .CommonLabels.alertname }}'#}
-{# short: true#}

View File

@@ -60,8 +60,6 @@ services:
   prometheus:
     image: prom/prometheus:latest
     container_name: prometheus
-    labels:
-      - {{ helpers.traefik_labels('prom', port='9090') | indent(6) }}
     restart: unless-stopped
     # Needed to make config files readable (not anymore, TODO: remove)
     user: "{{ remote_uid }}"
@@ -74,19 +72,6 @@ services:
       - {{ base_volume_path }}/monitoring/prometheus_configs:/etc/prometheus/extra:ro
       - {{ base_volume_path }}/monitoring/prometheus:/prometheus
-  alertmanager:
-    image: prom/alertmanager:latest
-    container_name: alertmanager
-    labels:
-      - {{ helpers.traefik_labels('alert', port='9093') | indent(6) }}
-    restart: unless-stopped
-    command:
-      - --config.file=/etc/alertmanager/alertmanager.yml
-      - --web.external-url=https://alert.{{ domain }}/
-    volumes:
-      - ./alertmanager_config:/etc/alertmanager:ro
-      - {{ base_volume_path }}/monitoring/alertmanager:/alertmanager
   node-exporter:
     image: prom/node-exporter:latest
    container_name: node-exporter
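
An aside on the `helpers.traefik_labels` macro used (and removed) above: it is project-local and not shown in this diff, but given its arguments it presumably expands to the standard Traefik v2 router/service labels, roughly along these lines (a guess with a placeholder hostname, not the macro's actual output):

```yaml
# Hypothetical expansion of helpers.traefik_labels('prom', port='9090');
# the label names are standard Traefik v2, the domain is a placeholder.
- traefik.enable=true
- traefik.http.routers.prom.rule=Host(`prom.example.com`)
- traefik.http.services.prom.loadbalancer.server.port=9090
```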

View File

@@ -3,7 +3,7 @@ apiVersion: 1
 providers:
   - name: "Grafana"
     org_id: 1
-    folder: "Services"
+    folder: "Alpina"
     type: "file"
     options:
       path: "/etc/grafana/provisioning/dashboards"

View File

@@ -15,18 +15,6 @@ datasources:
     url: http://prometheus:9090
     editable: false
-  - name: Alertmanager
-    type: alertmanager
-    access: proxy
-    uid: alertmanager
-    url: http://alertmanager:9093
-    jsonData:
-      # Valid options for implementation include mimir, cortex and prometheus
-      implementation: prometheus
-      # Whether Grafana should send alert instances to this Alertmanager
-      handleGrafanaManagedAlerts: true
-    editable: false
   - name: InfluxDB
     type: influxdb
     access: proxy

View File

@@ -26,5 +26,5 @@ schema_config:
       store: tsdb
 # TODO: Figure this out
-ruler:
-  alertmanager_url: http://localhost:9093
+# ruler:
+#   alertmanager_url: http://localhost:9093
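
On the `# TODO: Figure this out` left in the Loki config: if the ruler is re-enabled later, it needs rule storage configured as well, not just an Alertmanager URL. A sketch using field names from Loki's configuration reference (untested here, and the `alertmanager:9093` target no longer exists after this commit):

```yaml
# Sketch of a fuller Loki ruler block; field names per Loki's config
# reference, values are placeholders.
ruler:
  storage:
    type: local
    local:
      directory: /loki/rules
  rule_path: /tmp/loki/rules-temp
  alertmanager_url: http://alertmanager:9093
  enable_api: true
```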

View File

@@ -1,23 +0,0 @@
-groups:
-  - name: qbit-low-traffic
-    interval: 1m
-    rules:
-      - alert: QbitLowTraffic
-        expr: |
-          rate(container_network_transmit_bytes_total{name=~"gluetun"}[1m]) < 1024
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          title: 'Low traffic on qBit'
-          description: |
-            The traffic on qBittorrent is lower than 1KiB/s for 2 minutes.
-            Last value was x bytes/s.
-            [Grafana Dashboard](https://grafana.{{ domain }}/d/containers?orgId=1)
-            [View in Grafana](https://grafana.{{ domain }}/d/containers?orgId=1&viewPanel=3)
-          __dashboard__uid: 'containers'
-          __orgId__: 1
-          __panelId__: 3

View File

@@ -1,20 +0,0 @@
-groups:
-  - name: demo-service-alerts
-    rules:
-      - alert: DemoServiceHighErrorRate
-        expr: |
-          (
-            sum without(status, instance) (
-              rate(demo_api_request_duration_seconds_count{status=~"5..",job="demo"}[1m])
-            )
-          /
-            sum without(status, instance) (
-              rate(demo_api_request_duration_seconds_count{job="demo"}[1m])
-            ) * 100 > 0.5
-          )
-        for: 1m
-        labels:
-          severity: critical
-        annotations:
-          title: 'High 5xx rate for {{'{{ $labels.method }}'}} on {{'{{ $labels.path }}'}}'
-          description: 'The 5xx error rate for path {{'{{ $labels.path }}'}} with method {{'{{ $labels.method }}'}} in {{'{{ $labels.job }}'}} is {{'{{ printf "%.2f" $value }}'}}%.'
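
A note on the `{{'{{ ... }}'}}` idiom in these annotations: because the rule file is itself an Ansible/Jinja template, Go-template syntax has to be smuggled through as a Jinja string literal — Jinja evaluates `{{ '{{ $labels.path }}' }}` to the literal text `{{ $labels.path }}`. After templating, Prometheus receives plain alert-template markup:

```yaml
# Rendered output of the title annotation above, as Prometheus sees it:
title: 'High 5xx rate for {{ $labels.method }} on {{ $labels.path }}'
```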

View File

@@ -5,11 +5,6 @@ global:
   external_labels:
     monitor: "{{ ansible_host }}"
-alerting:
-  alertmanagers:
-    - static_configs:
-        - targets: ["alertmanager:9093"]
 scrape_configs:
   - job_name: "prometheus"
     static_configs:
@@ -43,7 +38,6 @@ scrape_configs:
       - 'demo.promlabs.com:10002'
 rule_files:
-{# - "/etc/prometheus/container.alerts.yml"#}
   - "/etc/prometheus/extra/rules/*.yml"
   - "/etc/prometheus/extra/rules/*.json"

View File

@@ -5,11 +5,11 @@
   post_tasks:
     - name: Docker prune objects
      docker_prune:
-        containers: yes
-        images: yes
+        containers: true
+        # Keep images for building grafana
+        images: true
         images_filters:
-          dangling: false
+          until: "720h"
         networks: true
         volumes: true
-        builder_cache: true
-      when: false
+        builder_cache: false
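
For reference, `until: "720h"` (≈ 30 days) is Docker's standard prune filter, and with `when: false` removed the task actually runs now. It behaves roughly like the CLI sequence below (approximate — exact semantics, e.g. whether non-dangling images are included, follow the docker_prune module's defaults):

```bash
# Approximate CLI equivalent of the updated docker_prune task.
docker container prune -f
docker image prune -f --filter "until=720h"   # keep images newer than ~30 days
docker network prune -f
docker volume prune -f
# builder cache is no longer pruned (builder_cache: false)
```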