monitoring: dashboard improvements, add network and disc panels to node exporter

This commit is contained in:
Yuri Tatishchev 2025-01-01 13:54:11 -08:00
parent 278839fdba
commit b03628f8de
Signed by: CaZzzer
GPG Key ID: E0EBF441EA424369
5 changed files with 174 additions and 615 deletions

2
.idea/alpina.iml generated
View File

@ -4,7 +4,7 @@
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="jdk" jdkName="Poetry (alpina)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">

View File

@ -1,27 +1,81 @@
from grafanalib.core import Template
from attrs import define
from grafanalib.core import Template, TimeSeries, Dashboard, HIDE_VARIABLE, Target
# TODO: consider default params for common params like line width, show points, tooltip
CONF_SUPPORT_LOKI = True
CONF_SUPPORT_ZFS = True
PrometheusTemplate = Template(
name='datasource',
CONF_DATASOURCE_VAR_PROM = 'prom_datasource'
CONF_DATASOURCE_VAR_LOKI = 'loki_datasource'
prom_datasource = f'${{{CONF_DATASOURCE_VAR_PROM}}}'
loki_datasource = f'${{{CONF_DATASOURCE_VAR_LOKI}}}'
prom_template = Template(
name=CONF_DATASOURCE_VAR_PROM,
type='datasource',
label='Prometheus',
query='prometheus',
hide=HIDE_VARIABLE,
)
# TODO: this slightly less (clown emoji), normal Target gave me errors in grafana
loki_template = Template(
name=CONF_DATASOURCE_VAR_LOKI,
type='datasource',
label='Loki',
query='loki',
hide=HIDE_VARIABLE,
)
@define
class MyDashboard(Dashboard):
"""Wrapper class for Dashboard with some default values"""
timezone: str = 'browser'
sharedCrosshair: bool = True
@define
class MyTimeSeries(TimeSeries):
"""Wrapper class for TimeSeries with some default values and custom fields"""
fillOpacity: int = 10
lineWidth: int = 1
showPoints: str = 'never'
tooltipMode: str = 'multi'
maxDataPoints: int = None
# new fields
axisCenteredZero: bool = False
def to_json_data(self):
data = super().to_json_data()
data['fieldConfig']['defaults']['custom']['axisCenteredZero'] = self.axisCenteredZero
return data
@define
class PromTarget(Target):
"""Wrapper class for Target with default prometheus datasource"""
datasource: str = prom_datasource
@define
class LokiTarget(object):
def __init__(self, loki_datasource, expr, legendFormat, refId):
self.loki_datasource = loki_datasource
self.expr = expr
self.legendFormat = legendFormat
self.refId = refId
"""Custom class for Loki Target, because normal Target gave errors in grafana"""
expr: str
legendFormat: str
datasource: str = loki_datasource
refId: str = None
queryType: str = 'range'
def to_json_data(self):
return {
'datasource': self.loki_datasource,
'datasource': self.datasource,
'expr': self.expr,
'legendFormat': self.legendFormat,
'refId': self.refId,
'queryType': 'range',
'queryType': self.queryType,
}
def filter_none(l: list):
return [i for i in l if i is not None]

View File

@ -1,16 +1,10 @@
from grafanalib.core import (
Dashboard, TimeSeries,
Target, GridPos,
Templating, Template, REFRESH_ON_TIME_RANGE_CHANGE, Logs
)
from grafanalib.core import GridPos, Templating, Template, Logs
from grafanalib.formatunits import BYTES_IEC, SECONDS, BYTES_SEC_IEC
from common import LokiTarget, PrometheusTemplate
from common import LokiTarget, prom_template, loki_template, MyTimeSeries, MyDashboard, CONF_SUPPORT_LOKI, filter_none, \
prom_datasource, PromTarget
prom_datasource='${datasource}'
loki_datasource='loki'
dashboard = Dashboard(
dashboard = MyDashboard(
title='Containers',
uid='containers',
description='Data for compose projects from default Prometheus datasource collected by Cadvisor',
@ -18,8 +12,9 @@ dashboard = Dashboard(
'linux',
'docker',
],
templating=Templating(list=[
PrometheusTemplate,
templating=Templating(list=filter_none([
prom_template,
loki_template if CONF_SUPPORT_LOKI else None,
Template(
name='compose_project',
label='Compose Project',
@ -27,7 +22,6 @@ dashboard = Dashboard(
query='label_values({__name__=~"container.*"}, container_label_com_docker_compose_project)',
includeAll=True,
multi=True,
refresh=REFRESH_ON_TIME_RANGE_CHANGE,
),
Template(
name='container_name',
@ -36,7 +30,6 @@ dashboard = Dashboard(
query='label_values({__name__=~"container.*", container_label_com_docker_compose_project=~"$compose_project"}, name)',
includeAll=True,
multi=True,
refresh=REFRESH_ON_TIME_RANGE_CHANGE,
),
Template(
name='logs_query',
@ -44,67 +37,48 @@ dashboard = Dashboard(
query='',
type='textbox',
),
]),
timezone='browser',
panels=[
TimeSeries(
])),
panels=filter_none([
MyTimeSeries(
title='Container Memory Usage',
unit=BYTES_IEC,
gridPos=GridPos(h=8, w=12, x=0, y=0),
lineWidth=2,
fillOpacity=10,
showPoints='never',
stacking={'mode': 'normal'},
tooltipMode='all',
tooltipSort='desc',
stacking={'mode': 'normal'},
targets=[
Target(
datasource=prom_datasource,
PromTarget(
expr='max by (name) (container_memory_usage_bytes{name=~"$container_name", container_label_com_docker_compose_project=~"$compose_project"})',
legendFormat='{{ name }}',
refId='A',
),
],
),
TimeSeries(
MyTimeSeries(
title='Container CPU Usage',
unit=SECONDS,
gridPos=GridPos(h=8, w=12, x=12, y=0),
lineWidth=2,
fillOpacity=10,
showPoints='never',
tooltipMode='all',
tooltipSort='desc',
stacking={'mode': 'normal'},
targets=[
Target(
datasource=prom_datasource,
expr='max by (name) (rate(container_cpu_usage_seconds_total{name=~"$container_name", container_label_com_docker_compose_project=~"$compose_project"}[$__rate_interval]))',
PromTarget(
expr='max by (name) (irate(container_cpu_usage_seconds_total{name=~"$container_name", container_label_com_docker_compose_project=~"$compose_project"}[$__rate_interval]))',
legendFormat='{{ name }}',
refId='A',
),
],
),
TimeSeries(
MyTimeSeries(
title='Container Network Traffic',
unit=BYTES_SEC_IEC,
gridPos=GridPos(h=8, w=12, x=0, y=8),
lineWidth=2,
fillOpacity=10,
showPoints='never',
tooltipMode='all',
tooltipSort='desc',
axisCenteredZero=True,
targets=[
Target(
datasource=prom_datasource,
expr='max by (name) (rate(container_network_receive_bytes_total{name=~"$container_name", container_label_com_docker_compose_project=~"$compose_project"}[$__rate_interval]))',
PromTarget(
expr='max by (name) (irate(container_network_receive_bytes_total{name=~"$container_name", container_label_com_docker_compose_project=~"$compose_project"}[$__rate_interval]))',
legendFormat="rx {{ name }}",
refId='A',
),
Target(
datasource=prom_datasource,
expr='-max by (name) (rate(container_network_transmit_bytes_total{name=~"$container_name", container_label_com_docker_compose_project=~"$compose_project"}[$__rate_interval]))',
PromTarget(
expr='-max by (name) (irate(container_network_transmit_bytes_total{name=~"$container_name", container_label_com_docker_compose_project=~"$compose_project"}[$__rate_interval]))',
legendFormat="tx {{ name }}",
refId='B',
),
],
),
@ -118,12 +92,10 @@ dashboard = Dashboard(
dedupStrategy='numbers',
targets=[
LokiTarget(
loki_datasource=loki_datasource,
expr='{compose_project=~"$compose_project", container_name=~"$container_name"} |= `$logs_query`',
legendFormat='{{ container_name }}',
refId='A',
),
],
),
],
) if CONF_SUPPORT_LOKI else None,
]),
).auto_panel_ids()

View File

@ -1,139 +1,159 @@
from grafanalib.core import Dashboard, Templating, Template, TimeSeries, PERCENT_UNIT_FORMAT, GridPos, Target
from grafanalib.formatunits import BYTES_IEC
from grafanalib.core import Templating, Template, GridPos
from grafanalib.formatunits import BYTES_IEC, BITS_SEC, PERCENT_UNIT
from common import PrometheusTemplate
from node_consts import CPU_BASIC_COLORS, MEMORY_BASIC_COLORS
from common import prom_template, MyTimeSeries, MyDashboard, CONF_SUPPORT_ZFS, PromTarget, prom_datasource
dashboard = Dashboard(
dashboard = MyDashboard(
title='Node Exporter',
uid='node',
description='Node Exporter (not quite full)',
tags=[
'linux',
],
timezone='browser',
templating=Templating(list=[
# Datasource
PrometheusTemplate,
prom_template,
# Job
Template(
name='job',
label='Job',
dataSource='${datasource}',
dataSource=prom_datasource,
query='label_values(node_uname_info, job)',
),
# Instance
Template(
name='instance',
label='Instance',
dataSource='${datasource}',
dataSource=prom_datasource,
query='label_values(node_uname_info{job="$job"}, instance)',
),
]),
panels=[
# CPU Basic
TimeSeries(
MyTimeSeries(
title='CPU Basic',
description='Basic CPU usage info',
unit=PERCENT_UNIT_FORMAT,
unit=PERCENT_UNIT,
gridPos=GridPos(h=8, w=12, x=0, y=0),
lineWidth=1,
fillOpacity=30,
showPoints='never',
stacking={'mode': 'percent', 'group': 'A'},
tooltipMode='all',
tooltipSort='desc',
stacking={'mode': 'percent'},
targets=[
Target(
datasource='${datasource}',
PromTarget(
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="system"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Busy System',
refId='A',
),
Target(
datasource='${datasource}',
PromTarget(
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="user"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Busy User',
refId='B',
),
Target(
datasource='${datasource}',
PromTarget(
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="iowait"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Busy Iowait',
refId='C',
),
Target(
datasource='${datasource}',
PromTarget(
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode=~".*irq"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Busy IRQs',
refId='D',
),
Target(
datasource='${datasource}',
PromTarget(
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode!="idle",mode!="user",mode!="system",mode!="iowait",mode!="irq",mode!="softirq"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Busy Other',
refId='E',
),
Target(
datasource='${datasource}',
PromTarget(
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="idle"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Idle',
refId='F',
),
],
# Extra JSON for the colors
extraJson=CPU_BASIC_COLORS,
),
# Memory Basic
TimeSeries(
MyTimeSeries(
title='Memory Basic',
description='Basic memory usage',
unit=BYTES_IEC,
gridPos=GridPos(h=8, w=12, x=12, y=0),
lineWidth=1,
fillOpacity=30,
showPoints='never',
stacking={'mode': 'normal', 'group': 'A'},
tooltipMode='all',
tooltipSort='desc',
stacking={'mode': 'normal'},
valueMin=0,
targets=[
Target(
datasource='${datasource}',
PromTarget(
expr='node_memory_MemTotal_bytes{instance="$instance",job="$job"}',
format='time_series',
legendFormat='RAM Total',
refId='A',
),
Target(
datasource='${datasource}',
PromTarget(
expr='node_memory_MemTotal_bytes{instance="$instance",job="$job"} - node_memory_MemFree_bytes{instance="$instance",job="$job"} - (node_memory_Cached_bytes{instance="$instance",job="$job"} + node_memory_Buffers_bytes{instance="$instance",job="$job"} + node_memory_SReclaimable_bytes{instance="$instance",job="$job"})',
format='time_series',
legendFormat='RAM Used',
refId='B',
hide=CONF_SUPPORT_ZFS,
),
Target(
datasource='${datasource}',
PromTarget(
expr='node_memory_MemTotal_bytes{instance="$instance",job="$job"} - node_memory_MemFree_bytes{instance="$instance",job="$job"} - (node_memory_Cached_bytes{instance="$instance",job="$job"} + node_memory_Buffers_bytes{instance="$instance",job="$job"} + node_memory_SReclaimable_bytes{instance="$instance",job="$job"}) - node_zfs_arc_size{instance="$instance",job="$job"}',
format='time_series',
legendFormat='RAM Used',
hide=not CONF_SUPPORT_ZFS,
),
PromTarget(
expr='node_memory_Cached_bytes{instance="$instance",job="$job"} + node_memory_Buffers_bytes{instance="$instance",job="$job"} + node_memory_SReclaimable_bytes{instance="$instance",job="$job"}',
legendFormat='RAM Cache + Buffer',
refId='C',
),
Target(
datasource='${datasource}',
PromTarget(
expr='node_zfs_arc_size{instance="$instance",job="$job"}',
legendFormat='ZFS Arc',
hide=not CONF_SUPPORT_ZFS,
),
PromTarget(
expr='node_memory_MemFree_bytes{instance="$instance",job="$job"}',
legendFormat='RAM Free',
refId='D',
),
Target(
datasource='${datasource}',
PromTarget(
expr='(node_memory_SwapTotal_bytes{instance="$instance",job="$job"} - node_memory_SwapFree_bytes{instance="$instance",job="$job"})',
legendFormat='SWAP Used',
refId='E',
),
],
# Extra JSON for the colors
extraJson=MEMORY_BASIC_COLORS,
overrides=[
# Prevent total memory from being stacked
{
'matcher': {
'id': 'byName',
'options': 'RAM Total'
},
'properties': [
{
'id': 'custom.stacking',
'value': {'mode': 'none'}
}
]
},
],
),
# Network Traffic Basic
MyTimeSeries(
title='Network Traffic Basic',
description='Basic network usage info per interface',
unit=BITS_SEC,
gridPos=GridPos(h=8, w=12, x=0, y=8),
tooltipSort='desc',
axisCenteredZero=True,
targets=[
PromTarget(
expr='irate(node_network_receive_bytes_total{instance="$instance",job="$job"}[$__rate_interval]) * 8',
legendFormat='rx {{ device }}',
),
PromTarget(
expr='-irate(node_network_transmit_bytes_total{instance="$instance",job="$job"}[$__rate_interval]) * 8',
legendFormat='tx {{ device }}',
),
],
),
# Disk Space Basic
MyTimeSeries(
title='Disk Space Used Basic',
description='Disk space used of all filesystems mounted',
unit=PERCENT_UNIT,
gridPos=GridPos(h=8, w=12, x=12, y=8),
targets=[
PromTarget(
expr='1 - (node_filesystem_avail_bytes{instance="$instance",job="$job",device!~"rootfs"} / node_filesystem_size_bytes{instance="$instance",job="$job",device!~"rootfs"})',
legendFormat='{{ mountpoint }}',
),
],
),
# TODO: Network Basic
# TODO: Disk Basic
],
).auto_panel_ids()

View File

@ -1,487 +0,0 @@
# TODO: Question life decisions (I'm not sure if this is good)
CPU_BASIC_COLORS = {
"fieldConfig": {
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Busy Iowait"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#890F02",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Idle"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#052B51",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Busy Iowait"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#890F02",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Idle"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#7EB26D",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Busy System"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#EAB839",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Busy User"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#0A437C",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Busy Other"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#6D1F62",
"mode": "fixed"
}
}
]
}
]
},
}
MEMORY_BASIC_COLORS = {
"fieldConfig": {
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Apps"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#629E51",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Buffers"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#614D93",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Cache"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#6D1F62",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Cached"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#511749",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Committed"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#508642",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Free"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#0A437C",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#CFFAFF",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Inactive"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#584477",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "PageTables"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#0A50A1",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Page_Tables"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#0A50A1",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "RAM_Free"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#E0F9D7",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "SWAP Used"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#BF1B00",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Slab"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#806EB7",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Slab_Cache"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#E0752D",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Swap"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#BF1B00",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Swap Used"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#BF1B00",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Swap_Cache"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#C15C17",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Swap_Free"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#2F575E",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Unused"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#EAB839",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "RAM Total"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#E0F9D7",
"mode": "fixed"
}
},
{
"id": "custom.fillOpacity",
"value": 0
},
{
"id": "custom.stacking",
"value": {
"group": False,
"mode": "normal"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "RAM Cache + Buffer"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#052B51",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "RAM Free"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#7EB26D",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Available"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#DEDAF7",
"mode": "fixed"
}
},
{
"id": "custom.fillOpacity",
"value": 0
},
{
"id": "custom.stacking",
"value": {
"group": False,
"mode": "normal"
}
}
]
}
]
}
}