11 Commits

19 changed files with 718 additions and 186 deletions

2
.idea/alpina.iml generated
View File

@@ -4,7 +4,7 @@
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="jdk" jdkName="Poetry (alpina) (4)" jdkType="Python SDK" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">

16
.idea/jsonSchemas.xml generated
View File

@@ -39,6 +39,22 @@
</SchemaInfo>
</value>
</entry>
<entry key="Loki">
<value>
<SchemaInfo>
<option name="name" value="Loki" />
<option name="relativePathToSchema" value="https://json.schemastore.org/loki.json" />
<option name="applicationDefined" value="true" />
<option name="patterns">
<list>
<Item>
<option name="path" value="roles/alpina/templates/services/monitoring/loki_config/loki-config.yaml.j2" />
</Item>
</list>
</option>
</SchemaInfo>
</value>
</entry>
<entry key="Traefik v2">
<value>
<SchemaInfo>

2
.idea/misc.xml generated
View File

@@ -3,5 +3,5 @@
<component name="Black">
<option name="sdkName" value="Poetry (alpina) (2)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Poetry (alpina) (4)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Poetry (alpina)" project-jdk-type="Python SDK" />
</project>

View File

@@ -8,6 +8,22 @@ running on top of TrueNAS SCALE, separating all the docker stuff from the applia
# Notes
## Monitoring
The monitoring stack is set up to monitor all the containers and the host.
This is a work in progress, Grafana is set up with grafanalib, a Python library that generates Grafana dashboards.
The dashboards are generated from Python scripts in
[grafana_config/dashboards](roles/alpina/templates/services/monitoring/grafana_config/dashboards).
This requires a custom grafana image, which is built from the
[Dockerfile](roles/alpina/templates/services/monitoring/Dockerfile).
This also means it has to be manually rebuilt whenever the dashboards are updated.
From the services/monitoring directory, run:
```bash
docker compose up -d --build --force-recreate grafana
```
## IPv6
The current configuration is designed to work with IPv6.
However, because of how (not properly) I'm doing the subnetting

View File

@@ -1 +0,0 @@
DISCORD_WEBHOOK={{ alertmanager_discord_webhook }}

View File

@@ -4,6 +4,10 @@ RUN pip install grafanalib
COPY ./grafana_config/dashboards /dashboards
# Required for grafanalib to find the shared python files like common.py
# https://github.com/weaveworks/grafanalib/issues/58
ENV PYTHONPATH=/dashboards
RUN generate-dashboards /dashboards/*.dashboard.py
FROM grafana/grafana:latest

View File

@@ -1,68 +0,0 @@
# The root route on which each incoming alert enters.
route:
group_by: ["alertname", "job"]
group_wait: 20s
group_interval: 5m
repeat_interval: 3h
receiver: discord_webhook
receivers:
- name: "discord_webhook"
discord_configs:
- webhook_url: "{{ alertmanager_discord_webhook }}"
{# - send_resolved: true#}
{# username: 'Alertmanager'#}
{# webhook_configs:#}
{# - send_resolved: true#}
{# url: '{{ alertmanager_discord_webhook }}'#}
{# username: 'Alertmanager'#}
{# icon_url: 'https://prometheus.io/assets/icon.png'#}
{# icon_emoji: ':alert:'#}
{# send_resolved: true#}
{# text: "{{ .CommonAnnotations.summary }}"#}
{# title: "{{ .CommonLabels.alertname }}"#}
{# color: '{{ if eq .Status "firing" }}#FF0000{{ else }}#00FF00{{ end }}'#}
{# footer: '{{ .CommonLabels.monitor }}'#}
{# footer_icon: 'https://prometheus.io/assets/icon.png'#}
{# actions:#}
{# - type: 'button'#}
{# text: 'Open in Grafana'#}
{# url: '{{ .ExternalURL }}'#}
{# style: 'primary'#}
{# send_resolved: true#}
{# confirm:#}
{# title: 'Are you sure?'#}
{# text: 'This will open Grafana in a new tab.'#}
{# ok_text: 'Yes'#}
{# dismiss_text: 'No'#}
{# fields:#}
{# - title: 'Description'#}
{# value: "{{ .CommonAnnotations.description }}"#}
{# short: false#}
{# - title: 'Details'#}
{# value: "{{ .CommonAnnotations.details }}"#}
{# short: false#}
{# - title: 'Severity'#}
{# value: '{{ if eq .Labels.severity "critical" }}Critical{{ else if eq .Labels.severity "warning" }}Warning{{ else }}Info{{ end }}'#}
{# short: true#}
{# - title: 'Host'#}
{# value: '{{ .CommonLabels.monitor }}'#}
{# short: true#}
{# - title: 'Starts At'#}
{# value: '{{ .StartsAt.Format "2006-01-02 15:04:05" }}'#}
{# short: true#}
{# - title: 'Ends At'#}
{# value: '{{ .EndsAt.Format "2006-01-02 15:04:05" }}'#}
{# short: true#}
{# - title: 'Runbook'#}
{# value: '{{ .CommonAnnotations.runbook_url }}'#}
{# short: true#}
{# - title: 'Dashboard'#}
{# value: '{{ .CommonAnnotations.dashboard_url }}'#}
{# short: true#}
{# - title: 'Alerting Rule'#}
{# value: '{{ .CommonLabels.alertname }}'#}
{# short: true#}
{# - title: 'Alerting Rule Description'#}
{# value: '{{ .CommonLabels.alertname }}'#}
{# short: true#}

View File

@@ -60,8 +60,6 @@ services:
prometheus:
image: prom/prometheus:latest
container_name: prometheus
labels:
- {{ helpers.traefik_labels('prom', port='9090') | indent(6) }}
restart: unless-stopped
# Needed to make config files readable (not anymore, TODO: remove)
user: "{{ remote_uid }}"
@@ -74,19 +72,6 @@ services:
- {{ base_volume_path }}/monitoring/prometheus_configs:/etc/prometheus/extra:ro
- {{ base_volume_path }}/monitoring/prometheus:/prometheus
alertmanager:
image: prom/alertmanager:latest
container_name: alertmanager
labels:
- {{ helpers.traefik_labels('alert', port='9093') | indent(6) }}
restart: unless-stopped
command:
- --config.file=/etc/alertmanager/alertmanager.yml
- --web.external-url=https://alert.{{ domain }}/
volumes:
- ./alertmanager_config:/etc/alertmanager:ro
- {{ base_volume_path }}/monitoring/alertmanager:/alertmanager
node-exporter:
image: prom/node-exporter:latest
container_name: node-exporter
@@ -100,6 +85,11 @@ services:
image: gcr.io/cadvisor/cadvisor:latest
container_name: cadvisor
restart: unless-stopped
command:
- --docker_only=true
- --store_container_labels=false
- --whitelisted_container_labels=com.docker.compose.project,com.docker.compose.service
- --enable_metrics=cpu,cpuLoad,diskIO,memory,network,oom_event,process
volumes:
- /:/rootfs:ro
- /var/run:/var/run:rw

View File

@@ -3,7 +3,7 @@ apiVersion: 1
providers:
- name: "Grafana"
org_id: 1
folder: "Services"
folder: "Alpina"
type: "file"
options:
path: "/etc/grafana/provisioning/dashboards"

View File

@@ -0,0 +1,27 @@
from grafanalib.core import Template
# TODO: consider default params for common params like line width, show points, tooltip
PrometheusTemplate = Template(
name='datasource',
type='datasource',
label='Prometheus',
query='prometheus',
)
# TODO: this slightly less (clown emoji), normal Target gave me errors in grafana
class LokiTarget(object):
def __init__(self, loki_datasource, expr, legendFormat, refId):
self.loki_datasource = loki_datasource
self.expr = expr
self.legendFormat = legendFormat
self.refId = refId
def to_json_data(self):
return {
'datasource': self.loki_datasource,
'expr': self.expr,
'legendFormat': self.legendFormat,
'refId': self.refId,
'queryType': 'range',
}

View File

@@ -5,28 +5,21 @@ from grafanalib.core import (
)
from grafanalib.formatunits import BYTES_IEC, SECONDS, BYTES_SEC_IEC
prom_datasource='prometheus'
loki_datasource='loki'
from common import LokiTarget, PrometheusTemplate
# TODO: this is (clown emoji), normal Target gave me errors in grafana
class LokiTarget(object):
def to_json_data(self):
return {
'datasource': loki_datasource,
'expr': '{compose_project=~"$compose_project", container_name=~"$container_name"} |= `$logs_query`',
'legendFormat': '{{ container_name }}',
'refId': 'A',
'queryType': 'range',
}
prom_datasource='${datasource}'
loki_datasource='loki'
dashboard = Dashboard(
title='Containers',
uid='containers',
description='Data for compose projects from default Prometheus datasource collected by Cadvisor',
tags=[
'example'
'linux',
'docker',
],
templating=Templating(list=[
PrometheusTemplate,
Template(
name='compose_project',
label='Compose Project',
@@ -44,7 +37,6 @@ dashboard = Dashboard(
includeAll=True,
multi=True,
refresh=REFRESH_ON_TIME_RANGE_CHANGE,
),
Template(
name='logs_query',
@@ -56,7 +48,6 @@ dashboard = Dashboard(
timezone='browser',
panels=[
TimeSeries(
id=1,
title='Container Memory Usage',
unit=BYTES_IEC,
gridPos=GridPos(h=8, w=12, x=0, y=0),
@@ -76,13 +67,14 @@ dashboard = Dashboard(
],
),
TimeSeries(
id=2,
title='Container CPU Usage',
unit=SECONDS,
gridPos=GridPos(h=8, w=12, x=12, y=0),
lineWidth=2,
fillOpacity=10,
showPoints='never',
tooltipMode='all',
tooltipSort='desc',
targets=[
Target(
datasource=prom_datasource,
@@ -93,7 +85,6 @@ dashboard = Dashboard(
],
),
TimeSeries(
id=3,
title='Container Network Traffic',
unit=BYTES_SEC_IEC,
gridPos=GridPos(h=8, w=12, x=0, y=8),
@@ -118,7 +109,6 @@ dashboard = Dashboard(
],
),
Logs(
id=4,
title='',
gridPos=GridPos(h=8, w=12, x=12, y=8),
showLabels=True,
@@ -127,13 +117,12 @@ dashboard = Dashboard(
prettifyLogMessage=True,
dedupStrategy='numbers',
targets=[
LokiTarget(),
# Target(
# datasource=loki_datasource,
# expr='{compose_project=~"$compose_project", container_name=~"$container_name"} |= `$logs_query`',
# legendFormat='{{ container_name }}',
# refId='A',
# ),
LokiTarget(
loki_datasource=loki_datasource,
expr='{compose_project=~"$compose_project", container_name=~"$container_name"} |= `$logs_query`',
legendFormat='{{ container_name }}',
refId='A',
),
],
),
],

View File

@@ -0,0 +1,139 @@
from grafanalib.core import Dashboard, Templating, Template, TimeSeries, PERCENT_UNIT_FORMAT, GridPos, Target
from grafanalib.formatunits import BYTES_IEC
from common import PrometheusTemplate
from node_consts import CPU_BASIC_COLORS, MEMORY_BASIC_COLORS
dashboard = Dashboard(
title='Node Exporter',
uid='node',
description='Node Exporter (not quite full)',
tags=[
'linux',
],
timezone='browser',
templating=Templating(list=[
# Datasource
PrometheusTemplate,
# Job
Template(
name='job',
label='Job',
dataSource='${datasource}',
query='label_values(node_uname_info, job)',
),
# Instance
Template(
name='instance',
label='Instance',
dataSource='${datasource}',
query='label_values(node_uname_info{job="$job"}, instance)',
),
]),
panels=[
# CPU Basic
TimeSeries(
title='CPU Basic',
description='Basic CPU usage info',
unit=PERCENT_UNIT_FORMAT,
gridPos=GridPos(h=8, w=12, x=0, y=0),
lineWidth=1,
fillOpacity=30,
showPoints='never',
stacking={'mode': 'percent', 'group': 'A'},
tooltipMode='all',
tooltipSort='desc',
targets=[
Target(
datasource='${datasource}',
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="system"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Busy System',
refId='A',
),
Target(
datasource='${datasource}',
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="user"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Busy User',
refId='B',
),
Target(
datasource='${datasource}',
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="iowait"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Busy Iowait',
refId='C',
),
Target(
datasource='${datasource}',
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode=~".*irq"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Busy IRQs',
refId='D',
),
Target(
datasource='${datasource}',
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode!="idle",mode!="user",mode!="system",mode!="iowait",mode!="irq",mode!="softirq"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Busy Other',
refId='E',
),
Target(
datasource='${datasource}',
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="idle"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Idle',
refId='F',
),
],
# Extra JSON for the colors
extraJson=CPU_BASIC_COLORS,
),
# Memory Basic
TimeSeries(
title='Memory Basic',
description='Basic memory usage',
unit=BYTES_IEC,
gridPos=GridPos(h=8, w=12, x=12, y=0),
lineWidth=1,
fillOpacity=30,
showPoints='never',
stacking={'mode': 'normal', 'group': 'A'},
tooltipMode='all',
tooltipSort='desc',
targets=[
Target(
datasource='${datasource}',
expr='node_memory_MemTotal_bytes{instance="$instance",job="$job"}',
format='time_series',
legendFormat='RAM Total',
refId='A',
),
Target(
datasource='${datasource}',
expr='node_memory_MemTotal_bytes{instance="$instance",job="$job"} - node_memory_MemFree_bytes{instance="$instance",job="$job"} - (node_memory_Cached_bytes{instance="$instance",job="$job"} + node_memory_Buffers_bytes{instance="$instance",job="$job"} + node_memory_SReclaimable_bytes{instance="$instance",job="$job"})',
format='time_series',
legendFormat='RAM Used',
refId='B',
),
Target(
datasource='${datasource}',
expr='node_memory_Cached_bytes{instance="$instance",job="$job"} + node_memory_Buffers_bytes{instance="$instance",job="$job"} + node_memory_SReclaimable_bytes{instance="$instance",job="$job"}',
legendFormat='RAM Cache + Buffer',
refId='C',
),
Target(
datasource='${datasource}',
expr='node_memory_MemFree_bytes{instance="$instance",job="$job"}',
legendFormat='RAM Free',
refId='D',
),
Target(
datasource='${datasource}',
expr='(node_memory_SwapTotal_bytes{instance="$instance",job="$job"} - node_memory_SwapFree_bytes{instance="$instance",job="$job"})',
legendFormat='SWAP Used',
refId='E',
),
],
# Extra JSON for the colors
extraJson=MEMORY_BASIC_COLORS,
),
# TODO: Network Basic
# TODO: Disk Basic
],
).auto_panel_ids()

View File

@@ -0,0 +1,487 @@
# TODO: Question life decisions (I'm not sure if this is good)
CPU_BASIC_COLORS = {
"fieldConfig": {
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Busy Iowait"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#890F02",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Idle"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#052B51",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Busy Iowait"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#890F02",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Idle"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#7EB26D",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Busy System"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#EAB839",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Busy User"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#0A437C",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Busy Other"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#6D1F62",
"mode": "fixed"
}
}
]
}
]
},
}
MEMORY_BASIC_COLORS = {
"fieldConfig": {
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Apps"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#629E51",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Buffers"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#614D93",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Cache"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#6D1F62",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Cached"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#511749",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Committed"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#508642",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Free"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#0A437C",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#CFFAFF",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Inactive"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#584477",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "PageTables"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#0A50A1",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Page_Tables"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#0A50A1",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "RAM_Free"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#E0F9D7",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "SWAP Used"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#BF1B00",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Slab"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#806EB7",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Slab_Cache"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#E0752D",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Swap"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#BF1B00",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Swap Used"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#BF1B00",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Swap_Cache"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#C15C17",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Swap_Free"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#2F575E",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Unused"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#EAB839",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "RAM Total"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#E0F9D7",
"mode": "fixed"
}
},
{
"id": "custom.fillOpacity",
"value": 0
},
{
"id": "custom.stacking",
"value": {
"group": False,
"mode": "normal"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "RAM Cache + Buffer"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#052B51",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "RAM Free"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#7EB26D",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Available"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#DEDAF7",
"mode": "fixed"
}
},
{
"id": "custom.fillOpacity",
"value": 0
},
{
"id": "custom.stacking",
"value": {
"group": False,
"mode": "normal"
}
}
]
}
]
}
}

View File

@@ -15,18 +15,6 @@ datasources:
url: http://prometheus:9090
editable: false
- name: Alertmanager
type: alertmanager
access: proxy
uid: alertmanager
url: http://alertmanager:9093
jsonData:
# Valid options for implementation include mimir, cortex and prometheus
implementation: prometheus
# Whether Grafana should send alert instances to this Alertmanager
handleGrafanaManagedAlerts: true
editable: false
- name: InfluxDB
type: influxdb
access: proxy

View File

@@ -17,13 +17,6 @@ common:
schema_config:
configs:
- from: 2020-10-24
store: boltdb-shipper
object_store: filesystem
schema: v12
index:
prefix: index_
period: 24h
- from: 2024-10-18
index:
period: 24h
@@ -33,5 +26,5 @@ schema_config:
store: tsdb
# TODO: Figure this out
ruler:
alertmanager_url: http://localhost:9093
# ruler:
# alertmanager_url: http://localhost:9093

View File

@@ -1,23 +0,0 @@
groups:
- name: qbit-low-traffic
interval: 1m
rules:
- alert: QbitLowTraffic
expr: |
rate(container_network_transmit_bytes_total{name=~"gluetun"}[1m]) < 1024
for: 2m
labels:
severity: warning
annotations:
title: 'Low traffic on qBit'
description: |
The traffic on qBittorrent is lower than 1KiB/s for 2 minutes.
Last value was x bytes/s.
[Grafana Dashboard](https://grafana.{{ domain }}/d/containers?orgId=1)
[View in Grafana](https://grafana.{{ domain }}/d/containers?orgId=1&viewPanel=3)
__dashboard__uid: 'containers'
__orgId__: 1
__panelId__: 3

View File

@@ -1,20 +0,0 @@
groups:
- name: demo-service-alerts
rules:
- alert: DemoServiceHighErrorRate
expr: |
(
sum without(status, instance) (
rate(demo_api_request_duration_seconds_count{status=~"5..",job="demo"}[1m])
)
/
sum without(status, instance) (
rate(demo_api_request_duration_seconds_count{job="demo"}[1m])
) * 100 > 0.5
)
for: 1m
labels:
severity: critical
annotations:
title: 'High 5xx rate for {{'{{ $labels.method }}'}} on {{'{{ $labels.path }}'}}'
description: 'The 5xx error rate for path {{'{{ $labels.path }}'}} with method {{'{{ $labels.method }}'}} in {{'{{ $labels.job }}'}} is {{'{{ printf "%.2f" $value }}'}}%.'

View File

@@ -5,11 +5,6 @@ global:
external_labels:
monitor: "{{ ansible_host }}"
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager:9093"]
scrape_configs:
- job_name: "prometheus"
static_configs:
@@ -43,7 +38,6 @@ scrape_configs:
- 'demo.promlabs.com:10002'
rule_files:
- "/etc/prometheus/container.alerts.yml"
- "/etc/prometheus/extra/rules/*.yml"
- "/etc/prometheus/extra/rules/*.json"

View File

@@ -5,10 +5,11 @@
post_tasks:
- name: Docker prune objects
docker_prune:
containers: yes
images: yes
containers: true
# Keep images for building grafana
images: true
images_filters:
dangling: false
until: "720h"
networks: true
volumes: true
builder_cache: true
builder_cache: false