monitoring: container, node dashboard improvements; separate common.py logic

This commit is contained in:
Yuri Tatishchev 2024-10-23 21:36:13 -07:00
parent 0e43a68754
commit 3006e3e424
Signed by: CaZzzer
GPG Key ID: 28BE602058C08557
7 changed files with 673 additions and 27 deletions

2
.idea/alpina.iml generated
View File

@ -4,7 +4,7 @@
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="jdk" jdkName="Poetry (alpina) (4)" jdkType="Python SDK" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">

2
.idea/misc.xml generated
View File

@ -3,5 +3,5 @@
<component name="Black">
<option name="sdkName" value="Poetry (alpina) (2)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Poetry (alpina) (4)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Poetry (alpina)" project-jdk-type="Python SDK" />
</project>

View File

@ -4,6 +4,10 @@ RUN pip install grafanalib
COPY ./grafana_config/dashboards /dashboards
# Required for grafanalib to find the shared python files like common.py
# https://github.com/weaveworks/grafanalib/issues/58
ENV PYTHONPATH=/dashboards
RUN generate-dashboards /dashboards/*.dashboard.py
FROM grafana/grafana:latest

View File

@ -0,0 +1,27 @@
from grafanalib.core import Template
# TODO: consider default params for common params like line width, show points, tooltip
PrometheusTemplate = Template(
name='datasource',
type='datasource',
label='Prometheus',
query='prometheus',
)
# TODO: this slightly less (clown emoji), normal Target gave me errors in grafana
class LokiTarget(object):
def __init__(self, loki_datasource, expr, legendFormat, refId):
self.loki_datasource = loki_datasource
self.expr = expr
self.legendFormat = legendFormat
self.refId = refId
def to_json_data(self):
return {
'datasource': self.loki_datasource,
'expr': self.expr,
'legendFormat': self.legendFormat,
'refId': self.refId,
'queryType': 'range',
}

View File

@ -5,28 +5,21 @@ from grafanalib.core import (
)
from grafanalib.formatunits import BYTES_IEC, SECONDS, BYTES_SEC_IEC
prom_datasource='prometheus'
loki_datasource='loki'
from common import LokiTarget, PrometheusTemplate
# TODO: this is (clown emoji), normal Target gave me errors in grafana
class LokiTarget(object):
def to_json_data(self):
return {
'datasource': loki_datasource,
'expr': '{compose_project=~"$compose_project", container_name=~"$container_name"} |= `$logs_query`',
'legendFormat': '{{ container_name }}',
'refId': 'A',
'queryType': 'range',
}
prom_datasource='${datasource}'
loki_datasource='loki'
dashboard = Dashboard(
title='Containers',
uid='containers',
description='Data for compose projects from default Prometheus datasource collected by Cadvisor',
tags=[
'example'
'linux',
'docker',
],
templating=Templating(list=[
PrometheusTemplate,
Template(
name='compose_project',
label='Compose Project',
@ -44,7 +37,6 @@ dashboard = Dashboard(
includeAll=True,
multi=True,
refresh=REFRESH_ON_TIME_RANGE_CHANGE,
),
Template(
name='logs_query',
@ -56,7 +48,6 @@ dashboard = Dashboard(
timezone='browser',
panels=[
TimeSeries(
id=1,
title='Container Memory Usage',
unit=BYTES_IEC,
gridPos=GridPos(h=8, w=12, x=0, y=0),
@ -76,13 +67,14 @@ dashboard = Dashboard(
],
),
TimeSeries(
id=2,
title='Container CPU Usage',
unit=SECONDS,
gridPos=GridPos(h=8, w=12, x=12, y=0),
lineWidth=2,
fillOpacity=10,
showPoints='never',
tooltipMode='all',
tooltipSort='desc',
targets=[
Target(
datasource=prom_datasource,
@ -93,7 +85,6 @@ dashboard = Dashboard(
],
),
TimeSeries(
id=3,
title='Container Network Traffic',
unit=BYTES_SEC_IEC,
gridPos=GridPos(h=8, w=12, x=0, y=8),
@ -118,7 +109,6 @@ dashboard = Dashboard(
],
),
Logs(
id=4,
title='',
gridPos=GridPos(h=8, w=12, x=12, y=8),
showLabels=True,
@ -127,13 +117,12 @@ dashboard = Dashboard(
prettifyLogMessage=True,
dedupStrategy='numbers',
targets=[
LokiTarget(),
# Target(
# datasource=loki_datasource,
# expr='{compose_project=~"$compose_project", container_name=~"$container_name"} |= `$logs_query`',
# legendFormat='{{ container_name }}',
# refId='A',
# ),
LokiTarget(
loki_datasource=loki_datasource,
expr='{compose_project=~"$compose_project", container_name=~"$container_name"} |= `$logs_query`',
legendFormat='{{ container_name }}',
refId='A',
),
],
),
],

View File

@ -0,0 +1,139 @@
from grafanalib.core import Dashboard, Templating, Template, TimeSeries, PERCENT_UNIT_FORMAT, GridPos, Target
from grafanalib.formatunits import BYTES_IEC
from common import PrometheusTemplate
from node_consts import CPU_BASIC_COLORS, MEMORY_BASIC_COLORS
dashboard = Dashboard(
title='Node Exporter',
uid='node',
description='Node Exporter (not quite full)',
tags=[
'linux',
],
timezone='browser',
templating=Templating(list=[
# Datasource
PrometheusTemplate,
# Job
Template(
name='job',
label='Job',
dataSource='${datasource}',
query='label_values(node_uname_info, job)',
),
# Instance
Template(
name='instance',
label='Instance',
dataSource='${datasource}',
query='label_values(node_uname_info{job="$job"}, instance)',
),
]),
panels=[
# CPU Basic
TimeSeries(
title='CPU Basic',
description='Basic CPU usage info',
unit=PERCENT_UNIT_FORMAT,
gridPos=GridPos(h=8, w=12, x=0, y=0),
lineWidth=1,
fillOpacity=30,
showPoints='never',
stacking={'mode': 'percent', 'group': 'A'},
tooltipMode='all',
tooltipSort='desc',
targets=[
Target(
datasource='${datasource}',
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="system"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Busy System',
refId='A',
),
Target(
datasource='${datasource}',
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="user"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Busy User',
refId='B',
),
Target(
datasource='${datasource}',
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="iowait"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Busy Iowait',
refId='C',
),
Target(
datasource='${datasource}',
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode=~".*irq"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Busy IRQs',
refId='D',
),
Target(
datasource='${datasource}',
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode!="idle",mode!="user",mode!="system",mode!="iowait",mode!="irq",mode!="softirq"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Busy Other',
refId='E',
),
Target(
datasource='${datasource}',
expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="idle"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
legendFormat='Idle',
refId='F',
),
],
# Extra JSON for the colors
extraJson=CPU_BASIC_COLORS,
),
# Memory Basic
TimeSeries(
title='Memory Basic',
description='Basic memory usage',
unit=BYTES_IEC,
gridPos=GridPos(h=8, w=12, x=12, y=0),
lineWidth=1,
fillOpacity=30,
showPoints='never',
stacking={'mode': 'normal', 'group': 'A'},
tooltipMode='all',
tooltipSort='desc',
targets=[
Target(
datasource='${datasource}',
expr='node_memory_MemTotal_bytes{instance="$instance",job="$job"}',
format='time_series',
legendFormat='RAM Total',
refId='A',
),
Target(
datasource='${datasource}',
expr='node_memory_MemTotal_bytes{instance="$instance",job="$job"} - node_memory_MemFree_bytes{instance="$instance",job="$job"} - (node_memory_Cached_bytes{instance="$instance",job="$job"} + node_memory_Buffers_bytes{instance="$instance",job="$job"} + node_memory_SReclaimable_bytes{instance="$instance",job="$job"})',
format='time_series',
legendFormat='RAM Used',
refId='B',
),
Target(
datasource='${datasource}',
expr='node_memory_Cached_bytes{instance="$instance",job="$job"} + node_memory_Buffers_bytes{instance="$instance",job="$job"} + node_memory_SReclaimable_bytes{instance="$instance",job="$job"}',
legendFormat='RAM Cache + Buffer',
refId='C',
),
Target(
datasource='${datasource}',
expr='node_memory_MemFree_bytes{instance="$instance",job="$job"}',
legendFormat='RAM Free',
refId='D',
),
Target(
datasource='${datasource}',
expr='(node_memory_SwapTotal_bytes{instance="$instance",job="$job"} - node_memory_SwapFree_bytes{instance="$instance",job="$job"})',
legendFormat='SWAP Used',
refId='E',
),
],
# Extra JSON for the colors
extraJson=MEMORY_BASIC_COLORS,
),
# TODO: Network Basic
# TODO: Disk Basic
],
).auto_panel_ids()

View File

@ -0,0 +1,487 @@
# TODO: Question life decisions (I'm not sure if this is good)
CPU_BASIC_COLORS = {
"fieldConfig": {
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Busy Iowait"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#890F02",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Idle"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#052B51",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Busy Iowait"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#890F02",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Idle"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#7EB26D",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Busy System"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#EAB839",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Busy User"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#0A437C",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Busy Other"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#6D1F62",
"mode": "fixed"
}
}
]
}
]
},
}
MEMORY_BASIC_COLORS = {
"fieldConfig": {
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Apps"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#629E51",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Buffers"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#614D93",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Cache"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#6D1F62",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Cached"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#511749",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Committed"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#508642",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Free"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#0A437C",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#CFFAFF",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Inactive"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#584477",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "PageTables"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#0A50A1",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Page_Tables"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#0A50A1",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "RAM_Free"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#E0F9D7",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "SWAP Used"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#BF1B00",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Slab"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#806EB7",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Slab_Cache"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#E0752D",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Swap"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#BF1B00",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Swap Used"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#BF1B00",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Swap_Cache"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#C15C17",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Swap_Free"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#2F575E",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Unused"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#EAB839",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "RAM Total"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#E0F9D7",
"mode": "fixed"
}
},
{
"id": "custom.fillOpacity",
"value": 0
},
{
"id": "custom.stacking",
"value": {
"group": False,
"mode": "normal"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "RAM Cache + Buffer"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#052B51",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "RAM Free"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#7EB26D",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Available"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#DEDAF7",
"mode": "fixed"
}
},
{
"id": "custom.fillOpacity",
"value": 0
},
{
"id": "custom.stacking",
"value": {
"group": False,
"mode": "normal"
}
}
]
}
]
}
}