4 Commits

8 changed files with 201 additions and 62 deletions

22
.idea/jsonSchemas.xml generated
View File

@@ -116,6 +116,28 @@
<Item> <Item>
<option name="path" value="roles/alpina/collections/services/monitoring/templates/prometheus_config/prometheus.yml.j2" /> <option name="path" value="roles/alpina/collections/services/monitoring/templates/prometheus_config/prometheus.yml.j2" />
</Item> </Item>
<Item>
<option name="path" value="roles/alpina/templates/services/monitoring/prometheus_config/prometheus.yml.j2" />
</Item>
</list>
</option>
</SchemaInfo>
</value>
</entry>
<entry key="prometheus.rules.json">
<value>
<SchemaInfo>
<option name="name" value="prometheus.rules.json" />
<option name="relativePathToSchema" value="https://json.schemastore.org/prometheus.rules.json" />
<option name="applicationDefined" value="true" />
<option name="patterns">
<list>
<Item>
<option name="path" value="roles/alpina/templates/services/monitoring/prometheus_config/container-alerts.yml" />
</Item>
<Item>
<option name="path" value="roles/alpina/templates/services/monitoring/prometheus_config/container.alerts.yml" />
</Item>
</list> </list>
</option> </option>
</SchemaInfo> </SchemaInfo>

View File

@@ -60,12 +60,15 @@ services:
prometheus: prometheus:
image: prom/prometheus:latest image: prom/prometheus:latest
container_name: prometheus container_name: prometheus
labels:
- {{ helpers.traefik_labels('prom', port='9090') | indent(6) }}
restart: unless-stopped restart: unless-stopped
# Needed to make config files readable (not anymore, TODO: remove) # Needed to make config files readable (not anymore, TODO: remove)
user: "{{ remote_uid }}" user: "{{ remote_uid }}"
command: command:
- --config.file=/etc/prometheus/prometheus.yml - --config.file=/etc/prometheus/prometheus.yml
- --storage.tsdb.retention.time=30d - --storage.tsdb.retention.time=30d
- --web.external-url=https://prom.{{ domain }}/
volumes: volumes:
- ./prometheus_config:/etc/prometheus:ro - ./prometheus_config:/etc/prometheus:ro
- {{ base_volume_path }}/monitoring/prometheus_configs:/etc/prometheus/extra:ro - {{ base_volume_path }}/monitoring/prometheus_configs:/etc/prometheus/extra:ro
@@ -74,12 +77,15 @@ services:
alertmanager: alertmanager:
image: prom/alertmanager:latest image: prom/alertmanager:latest
container_name: alertmanager container_name: alertmanager
labels:
- {{ helpers.traefik_labels('alert', port='9093') | indent(6) }}
restart: unless-stopped restart: unless-stopped
command: command:
- --config.file=/etc/alertmanager/alertmanager.yml - --config.file=/etc/alertmanager/alertmanager.yml
- --web.external-url=https://alert.{{ domain }}/
volumes: volumes:
- ./alertmanager_config:/etc/alertmanager:ro - ./alertmanager_config:/etc/alertmanager:ro
# TODO: add volume for alertmanager data - {{ base_volume_path }}/monitoring/alertmanager:/alertmanager
node-exporter: node-exporter:
image: prom/node-exporter:latest image: prom/node-exporter:latest

View File

@@ -0,0 +1,140 @@
from grafanalib.core import (
Dashboard, TimeSeries,
Target, GridPos,
Templating, Template, REFRESH_ON_TIME_RANGE_CHANGE, Logs
)
from grafanalib.formatunits import BYTES_IEC, SECONDS, BYTES_SEC_IEC
prom_datasource='prometheus'
loki_datasource='loki'
# TODO: this is (clown emoji), normal Target gave me errors in grafana
class LokiTarget(object):
def to_json_data(self):
return {
'datasource': loki_datasource,
'expr': '{compose_project=~"$compose_project", container_name=~"$container_name"} |= `$logs_query`',
'legendFormat': '{{ container_name }}',
'refId': 'A',
'queryType': 'range',
}
dashboard = Dashboard(
title='Containers',
uid='containers',
description='Data for compose projects from default Prometheus datasource collected by Cadvisor',
tags=[
'example'
],
templating=Templating(list=[
Template(
name='compose_project',
label='Compose Project',
dataSource=prom_datasource,
query='label_values({__name__=~"container.*"}, container_label_com_docker_compose_project)',
includeAll=True,
multi=True,
refresh=REFRESH_ON_TIME_RANGE_CHANGE,
),
Template(
name='container_name',
label='Container',
dataSource=prom_datasource,
query='label_values({__name__=~"container.*", container_label_com_docker_compose_project=~"$compose_project"}, name)',
includeAll=True,
multi=True,
refresh=REFRESH_ON_TIME_RANGE_CHANGE,
),
Template(
name='logs_query',
label='Log Search',
query='',
type='textbox',
),
]),
timezone='browser',
panels=[
TimeSeries(
id=1,
title='Container Memory Usage',
unit=BYTES_IEC,
gridPos=GridPos(h=8, w=12, x=0, y=0),
lineWidth=2,
fillOpacity=10,
showPoints='never',
stacking={'mode': 'normal'},
tooltipMode='all',
tooltipSort='desc',
targets=[
Target(
datasource=prom_datasource,
expr='max by (name) (container_memory_usage_bytes{name=~"$container_name", container_label_com_docker_compose_project=~"$compose_project"})',
legendFormat='{{ name }}',
refId='A',
),
],
),
TimeSeries(
id=2,
title='Container CPU Usage',
unit=SECONDS,
gridPos=GridPos(h=8, w=12, x=12, y=0),
lineWidth=2,
fillOpacity=10,
showPoints='never',
targets=[
Target(
datasource=prom_datasource,
expr='max by (name) (rate(container_cpu_usage_seconds_total{name=~"$container_name", container_label_com_docker_compose_project=~"$compose_project"}[$__rate_interval]))',
legendFormat='{{ name }}',
refId='A',
),
],
),
TimeSeries(
id=3,
title='Container Network Traffic',
unit=BYTES_SEC_IEC,
gridPos=GridPos(h=8, w=12, x=0, y=8),
lineWidth=2,
fillOpacity=10,
showPoints='never',
tooltipMode='all',
tooltipSort='desc',
targets=[
Target(
datasource=prom_datasource,
expr='max by (name) (rate(container_network_receive_bytes_total{name=~"$container_name", container_label_com_docker_compose_project=~"$compose_project"}[$__rate_interval]))',
legendFormat="rx {{ name }}",
refId='A',
),
Target(
datasource=prom_datasource,
expr='-max by (name) (rate(container_network_transmit_bytes_total{name=~"$container_name", container_label_com_docker_compose_project=~"$compose_project"}[$__rate_interval]))',
legendFormat="tx {{ name }}",
refId='B',
),
],
),
Logs(
id=4,
title='',
gridPos=GridPos(h=8, w=12, x=12, y=8),
showLabels=True,
showCommonLabels=True,
wrapLogMessages=True,
prettifyLogMessage=True,
dedupStrategy='numbers',
targets=[
LokiTarget(),
# Target(
# datasource=loki_datasource,
# expr='{compose_project=~"$compose_project", container_name=~"$container_name"} |= `$logs_query`',
# legendFormat='{{ container_name }}',
# refId='A',
# ),
],
),
],
).auto_panel_ids()

View File

@@ -1,59 +0,0 @@
from grafanalib.core import (
Dashboard, TimeSeries, GaugePanel,
Target, GridPos,
OPS_FORMAT, Templating, Template, REFRESH_ON_TIME_RANGE_CHANGE
)
from grafanalib.formatunits import BYTES_IEC
dashboard = Dashboard(
title="Containers",
description="Data for compose projects from default Prometheus datasource collected by Cadvisor",
tags=[
'example'
],
templating=Templating(list=[
# TODO: test how much of this is actually necessary
Template(
name="compose_project",
label="compose_project",
dataSource="prometheus",
query='label_values({__name__=~"container.*"}, container_label_com_docker_compose_project)',
includeAll=True,
multi=True,
hide=0,
sort=1,
type="query",
refresh=REFRESH_ON_TIME_RANGE_CHANGE,
),
Template(
name="container_name",
label="container_name",
dataSource="prometheus",
query='label_values({__name__=~"container.*", container_label_com_docker_compose_project=~"$compose_project"}, name)',
includeAll=True,
multi=True,
hide=0,
sort=1,
type="query",
refresh=REFRESH_ON_TIME_RANGE_CHANGE,
),
]),
timezone="browser",
panels=[
TimeSeries(
title="Container Memory Usage",
# dataSource='prometheus',
targets=[
Target(
datasource='prometheus',
expr='max by (name) (container_memory_usage_bytes{name=~"$container_name", container_label_com_docker_compose_project=~"$compose_project"})',
legendFormat="{{ name }}",
refId='A',
),
],
unit=BYTES_IEC,
gridPos=GridPos(h=8, w=16, x=0, y=0),
),
],
).auto_panel_ids()

View File

@@ -20,10 +20,17 @@ schema_config:
- from: 2020-10-24 - from: 2020-10-24
store: boltdb-shipper store: boltdb-shipper
object_store: filesystem object_store: filesystem
schema: v11 schema: v12
index: index:
prefix: index_ prefix: index_
period: 24h period: 24h
- from: 2024-10-18
index:
period: 24h
prefix: index_
object_store: filesystem
schema: v13
store: tsdb
# TODO: Figure this out # TODO: Figure this out
ruler: ruler:

View File

@@ -0,0 +1,23 @@
groups:
- name: qbit-low-traffic
interval: 1m
rules:
- alert: QbitLowTraffic
expr: |
rate(container_network_transmit_bytes_total{name=~"gluetun"}[1m]) < 1024
for: 2m
labels:
severity: warning
annotations:
title: 'Low traffic on qBit'
description: |
The traffic on qBittorrent is lower than 1KiB/s for 2 minutes.
Last value was x bytes/s.
[Grafana Dashboard](https://grafana.{{ domain }}/d/containers?orgId=1)
[View in Grafana](https://grafana.{{ domain }}/d/containers?orgId=1&viewPanel=3)
__dashboard__uid: 'containers'
__orgId__: 1
__panelId__: 3

View File

@@ -43,7 +43,7 @@ scrape_configs:
- 'demo.promlabs.com:10002' - 'demo.promlabs.com:10002'
rule_files: rule_files:
- "/etc/prometheus/demo-alerts.yml" - "/etc/prometheus/container.alerts.yml"
- "/etc/prometheus/extra/rules/*.yml" - "/etc/prometheus/extra/rules/*.yml"
- "/etc/prometheus/extra/rules/*.json" - "/etc/prometheus/extra/rules/*.json"