From 3006e3e4241d3b45be9c9882c01aa4a8d5052633 Mon Sep 17 00:00:00 2001
From: Yuri Tatishchev
Date: Wed, 23 Oct 2024 21:36:13 -0700
Subject: [PATCH] monitoring: container, node dashboard improvements; separate common.py logic

---
Review notes (commentary only; nothing in this section belongs to a hunk):

  * Layout. This copy of the patch arrived with its line breaks and leading
    whitespace collapsed. The line breaks and indentation below are
    reconstructed from the hunk headers and from Python syntax, and blank
    context lines are inferred from the hunk line counts. TODO: confirm
    against the original commit before applying.

  * .idea/alpina.iml, .idea/misc.xml. The changed lines carry no content in
    this copy (only the bare "-" / "+" markers), so these two hunks cannot
    be reviewed or applied as shown.

  * Dockerfile. PYTHONPATH=/dashboards is exported before
    generate-dashboards runs, so that the dashboard modules can do
    "from common import ..." and "from node_consts import ...".

  * common.py. PrometheusTemplate declares a template variable named
    'datasource' (type 'datasource', query 'prometheus'); the dashboards
    refer to it as '${datasource}'. LokiTarget is a plain object whose
    to_json_data() returns the datasource / expr / legendFormat / refId it
    was constructed with plus a fixed 'queryType': 'range'. All four
    constructor arguments are required.

  * containers.dashboard.py.
      - prom_datasource changes from 'prometheus' to '${datasource}';
        loki_datasource is still the literal 'loki'.
      - The description (unchanged context line) still reads "default
        Prometheus datasource" although the datasource is now a variable.
      - The explicit id=1..4 panel arguments are removed. The hunks shown
        stop before the end of the file, so whether this dashboard ends
        with .auto_panel_ids() (as node.dashboard.py does) is not visible
        here. TODO: confirm.

  * node.dashboard.py. Two TimeSeries panels: 'CPU Basic' (legends
    Busy System, Busy User, Busy Iowait, Busy IRQs, Busy Other, Idle) and
    'Memory Basic' (legends RAM Total, RAM Used, RAM Cache + Buffer,
    RAM Free, SWAP Used). Only the first two memory targets pass
    format='time_series'.

  * node_consts.py.
      - CPU_BASIC_COLORS lists 'Busy Iowait' twice (same colour) and 'Idle'
        twice with different colours (#052B51, then #7EB26D). Presumably
        the later entry wins in Grafana; verify, then drop the duplicates.
      - CPU_BASIC_COLORS has no entry for 'Busy IRQs', which the CPU panel
        emits (refId D).
      - MEMORY_BASIC_COLORS has entries for 'RAM Total',
        'RAM Cache + Buffer', 'RAM Free' and 'SWAP Used', but none for
        'RAM Used'. The remaining entries name series that the Memory Basic
        panel in this patch does not emit.
      - The 'custom.stacking' overrides use "group": False (a Python bool),
        while the panel-level stacking uses 'group': 'A'.

 .idea/alpina.iml | 2 +-
 .idea/misc.xml | 2 +-
 .../templates/services/monitoring/Dockerfile | 4 +
 .../grafana_config/dashboards/common.py | 27 +
 .../dashboards/containers.dashboard.py | 39 +-
 .../dashboards/node.dashboard.py | 139 +++++
 .../grafana_config/dashboards/node_consts.py | 487 ++++++++++++++++++
 7 files changed, 673 insertions(+), 27 deletions(-)
 create mode 100644 roles/alpina/templates/services/monitoring/grafana_config/dashboards/common.py
 create mode 100644 roles/alpina/templates/services/monitoring/grafana_config/dashboards/node.dashboard.py
 create mode 100644 roles/alpina/templates/services/monitoring/grafana_config/dashboards/node_consts.py

diff --git a/.idea/alpina.iml b/.idea/alpina.iml
index d07cdd8..bf75220 100644
--- a/.idea/alpina.iml
+++ b/.idea/alpina.iml
@@ -4,7 +4,7 @@
-
+
diff --git a/.idea/misc.xml b/.idea/misc.xml
index aa18f0d..4fdd347 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,5 +3,5 @@
-
+
\ No newline at end of file
diff --git a/roles/alpina/templates/services/monitoring/Dockerfile b/roles/alpina/templates/services/monitoring/Dockerfile
index 4604147..eebe9b2 100644
--- a/roles/alpina/templates/services/monitoring/Dockerfile
+++ b/roles/alpina/templates/services/monitoring/Dockerfile
@@ -4,6 +4,10 @@ RUN pip install grafanalib
 
 COPY ./grafana_config/dashboards /dashboards
 
+# Required for grafanalib to find the shared python files like common.py
+# https://github.com/weaveworks/grafanalib/issues/58
+ENV PYTHONPATH=/dashboards
+
 RUN generate-dashboards /dashboards/*.dashboard.py
 
 FROM grafana/grafana:latest
diff --git a/roles/alpina/templates/services/monitoring/grafana_config/dashboards/common.py b/roles/alpina/templates/services/monitoring/grafana_config/dashboards/common.py
new file mode 100644
index 0000000..571b3da
--- /dev/null
+++ b/roles/alpina/templates/services/monitoring/grafana_config/dashboards/common.py
@@ -0,0 +1,27 @@
+from grafanalib.core import Template
+
+# TODO: consider default params for common params like line width, show points, tooltip
+
+PrometheusTemplate = Template(
+    name='datasource',
+    type='datasource',
+    label='Prometheus',
+    query='prometheus',
+)
+
+# TODO: this slightly less (clown emoji), normal Target gave me errors in grafana
+class LokiTarget(object):
+    def __init__(self, loki_datasource, expr, legendFormat, refId):
+        self.loki_datasource = loki_datasource
+        self.expr = expr
+        self.legendFormat = legendFormat
+        self.refId = refId
+
+    def to_json_data(self):
+        return {
+            'datasource': self.loki_datasource,
+            'expr': self.expr,
+            'legendFormat': self.legendFormat,
+            'refId': self.refId,
+            'queryType': 'range',
+        }
diff --git a/roles/alpina/templates/services/monitoring/grafana_config/dashboards/containers.dashboard.py b/roles/alpina/templates/services/monitoring/grafana_config/dashboards/containers.dashboard.py
index 484e21c..6de2cc3 100644
--- a/roles/alpina/templates/services/monitoring/grafana_config/dashboards/containers.dashboard.py
+++ b/roles/alpina/templates/services/monitoring/grafana_config/dashboards/containers.dashboard.py
@@ -5,28 +5,21 @@ from grafanalib.core import (
 )
 from grafanalib.formatunits import BYTES_IEC, SECONDS, BYTES_SEC_IEC
 
-prom_datasource='prometheus'
-loki_datasource='loki'
+from common import LokiTarget, PrometheusTemplate
 
-# TODO: this is (clown emoji), normal Target gave me errors in grafana
-class LokiTarget(object):
-    def to_json_data(self):
-        return {
-            'datasource': loki_datasource,
-            'expr': '{compose_project=~"$compose_project", container_name=~"$container_name"} |= `$logs_query`',
-            'legendFormat': '{{ container_name }}',
-            'refId': 'A',
-            'queryType': 'range',
-        }
+prom_datasource='${datasource}'
+loki_datasource='loki'
 
 dashboard = Dashboard(
     title='Containers',
     uid='containers',
     description='Data for compose projects from default Prometheus datasource collected by Cadvisor',
     tags=[
-        'example'
+        'linux',
+        'docker',
     ],
     templating=Templating(list=[
+        PrometheusTemplate,
         Template(
             name='compose_project',
             label='Compose Project',
@@ -44,7 +37,6 @@ dashboard = Dashboard(
             includeAll=True,
             multi=True,
             refresh=REFRESH_ON_TIME_RANGE_CHANGE,
-
         ),
         Template(
             name='logs_query',
@@ -56,7 +48,6 @@ dashboard = Dashboard(
     timezone='browser',
     panels=[
         TimeSeries(
-            id=1,
             title='Container Memory Usage',
             unit=BYTES_IEC,
             gridPos=GridPos(h=8, w=12, x=0, y=0),
@@ -76,13 +67,14 @@ dashboard = Dashboard(
             ],
         ),
         TimeSeries(
-            id=2,
             title='Container CPU Usage',
             unit=SECONDS,
             gridPos=GridPos(h=8, w=12, x=12, y=0),
             lineWidth=2,
             fillOpacity=10,
             showPoints='never',
+            tooltipMode='all',
+            tooltipSort='desc',
             targets=[
                 Target(
                     datasource=prom_datasource,
@@ -93,7 +85,6 @@ dashboard = Dashboard(
             ],
         ),
         TimeSeries(
-            id=3,
             title='Container Network Traffic',
             unit=BYTES_SEC_IEC,
             gridPos=GridPos(h=8, w=12, x=0, y=8),
@@ -118,7 +109,6 @@ dashboard = Dashboard(
             ],
         ),
         Logs(
-            id=4,
             title='',
             gridPos=GridPos(h=8, w=12, x=12, y=8),
             showLabels=True,
@@ -127,13 +117,12 @@ dashboard = Dashboard(
             prettifyLogMessage=True,
             dedupStrategy='numbers',
             targets=[
-                LokiTarget(),
-                # Target(
-                #     datasource=loki_datasource,
-                #     expr='{compose_project=~"$compose_project", container_name=~"$container_name"} |= `$logs_query`',
-                #     legendFormat='{{ container_name }}',
-                #     refId='A',
-                # ),
+                LokiTarget(
+                    loki_datasource=loki_datasource,
+                    expr='{compose_project=~"$compose_project", container_name=~"$container_name"} |= `$logs_query`',
+                    legendFormat='{{ container_name }}',
+                    refId='A',
+                ),
             ],
         ),
     ],
diff --git a/roles/alpina/templates/services/monitoring/grafana_config/dashboards/node.dashboard.py b/roles/alpina/templates/services/monitoring/grafana_config/dashboards/node.dashboard.py
new file mode 100644
index 0000000..d5c108b
--- /dev/null
+++ b/roles/alpina/templates/services/monitoring/grafana_config/dashboards/node.dashboard.py
@@ -0,0 +1,139 @@
+from grafanalib.core import Dashboard, Templating, Template, TimeSeries, PERCENT_UNIT_FORMAT, GridPos, Target
+from grafanalib.formatunits import BYTES_IEC
+
+from common import PrometheusTemplate
+from node_consts import CPU_BASIC_COLORS, MEMORY_BASIC_COLORS
+
+dashboard = Dashboard(
+    title='Node Exporter',
+    uid='node',
+    description='Node Exporter (not quite full)',
+    tags=[
+        'linux',
+    ],
+    timezone='browser',
+    templating=Templating(list=[
+        # Datasource
+        PrometheusTemplate,
+        # Job
+        Template(
+            name='job',
+            label='Job',
+            dataSource='${datasource}',
+            query='label_values(node_uname_info, job)',
+        ),
+        # Instance
+        Template(
+            name='instance',
+            label='Instance',
+            dataSource='${datasource}',
+            query='label_values(node_uname_info{job="$job"}, instance)',
+        ),
+    ]),
+    panels=[
+        # CPU Basic
+        TimeSeries(
+            title='CPU Basic',
+            description='Basic CPU usage info',
+            unit=PERCENT_UNIT_FORMAT,
+            gridPos=GridPos(h=8, w=12, x=0, y=0),
+            lineWidth=1,
+            fillOpacity=30,
+            showPoints='never',
+            stacking={'mode': 'percent', 'group': 'A'},
+            tooltipMode='all',
+            tooltipSort='desc',
+            targets=[
+                Target(
+                    datasource='${datasource}',
+                    expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="system"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
+                    legendFormat='Busy System',
+                    refId='A',
+                ),
+                Target(
+                    datasource='${datasource}',
+                    expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="user"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
+                    legendFormat='Busy User',
+                    refId='B',
+                ),
+                Target(
+                    datasource='${datasource}',
+                    expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="iowait"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
+                    legendFormat='Busy Iowait',
+                    refId='C',
+                ),
+                Target(
+                    datasource='${datasource}',
+                    expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode=~".*irq"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
+                    legendFormat='Busy IRQs',
+                    refId='D',
+                ),
+                Target(
+                    datasource='${datasource}',
+                    expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode!="idle",mode!="user",mode!="system",mode!="iowait",mode!="irq",mode!="softirq"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
+                    legendFormat='Busy Other',
+                    refId='E',
+                ),
+                Target(
+                    datasource='${datasource}',
+                    expr='sum(irate(node_cpu_seconds_total{instance="$instance",job="$job", mode="idle"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance="$instance",job="$job"}) by (cpu)))',
+                    legendFormat='Idle',
+                    refId='F',
+                ),
+            ],
+            # Extra JSON for the colors
+            extraJson=CPU_BASIC_COLORS,
+        ),
+        # Memory Basic
+        TimeSeries(
+            title='Memory Basic',
+            description='Basic memory usage',
+            unit=BYTES_IEC,
+            gridPos=GridPos(h=8, w=12, x=12, y=0),
+            lineWidth=1,
+            fillOpacity=30,
+            showPoints='never',
+            stacking={'mode': 'normal', 'group': 'A'},
+            tooltipMode='all',
+            tooltipSort='desc',
+            targets=[
+                Target(
+                    datasource='${datasource}',
+                    expr='node_memory_MemTotal_bytes{instance="$instance",job="$job"}',
+                    format='time_series',
+                    legendFormat='RAM Total',
+                    refId='A',
+                ),
+                Target(
+                    datasource='${datasource}',
+                    expr='node_memory_MemTotal_bytes{instance="$instance",job="$job"} - node_memory_MemFree_bytes{instance="$instance",job="$job"} - (node_memory_Cached_bytes{instance="$instance",job="$job"} + node_memory_Buffers_bytes{instance="$instance",job="$job"} + node_memory_SReclaimable_bytes{instance="$instance",job="$job"})',
+                    format='time_series',
+                    legendFormat='RAM Used',
+                    refId='B',
+                ),
+                Target(
+                    datasource='${datasource}',
+                    expr='node_memory_Cached_bytes{instance="$instance",job="$job"} + node_memory_Buffers_bytes{instance="$instance",job="$job"} + node_memory_SReclaimable_bytes{instance="$instance",job="$job"}',
+                    legendFormat='RAM Cache + Buffer',
+                    refId='C',
+                ),
+                Target(
+                    datasource='${datasource}',
+                    expr='node_memory_MemFree_bytes{instance="$instance",job="$job"}',
+                    legendFormat='RAM Free',
+                    refId='D',
+                ),
+                Target(
+                    datasource='${datasource}',
+                    expr='(node_memory_SwapTotal_bytes{instance="$instance",job="$job"} - node_memory_SwapFree_bytes{instance="$instance",job="$job"})',
+                    legendFormat='SWAP Used',
+                    refId='E',
+                ),
+            ],
+            # Extra JSON for the colors
+            extraJson=MEMORY_BASIC_COLORS,
+        ),
+        # TODO: Network Basic
+        # TODO: Disk Basic
+    ],
+).auto_panel_ids()
diff --git a/roles/alpina/templates/services/monitoring/grafana_config/dashboards/node_consts.py b/roles/alpina/templates/services/monitoring/grafana_config/dashboards/node_consts.py
new file mode 100644
index 0000000..a527781
--- /dev/null
+++ b/roles/alpina/templates/services/monitoring/grafana_config/dashboards/node_consts.py
@@ -0,0 +1,487 @@
+# TODO: Question life decisions (I'm not sure if this is good)
+
+CPU_BASIC_COLORS = {
+    "fieldConfig": {
+        "overrides": [
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Busy Iowait"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#890F02",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Idle"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#052B51",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Busy Iowait"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#890F02",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Idle"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#7EB26D",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Busy System"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#EAB839",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Busy User"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#0A437C",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Busy Other"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#6D1F62",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            }
+        ]
+    },
+}
+
+MEMORY_BASIC_COLORS = {
+    "fieldConfig": {
+        "overrides": [
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Apps"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#629E51",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Buffers"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#614D93",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Cache"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#6D1F62",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Cached"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#511749",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Committed"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#508642",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Free"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#0A437C",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#CFFAFF",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Inactive"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#584477",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "PageTables"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#0A50A1",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Page_Tables"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#0A50A1",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "RAM_Free"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#E0F9D7",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "SWAP Used"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#BF1B00",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Slab"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#806EB7",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Slab_Cache"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#E0752D",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Swap"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#BF1B00",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Swap Used"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#BF1B00",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Swap_Cache"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#C15C17",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Swap_Free"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#2F575E",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Unused"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#EAB839",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "RAM Total"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#E0F9D7",
+                            "mode": "fixed"
+                        }
+                    },
+                    {
+                        "id": "custom.fillOpacity",
+                        "value": 0
+                    },
+                    {
+                        "id": "custom.stacking",
+                        "value": {
+                            "group": False,
+                            "mode": "normal"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "RAM Cache + Buffer"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#052B51",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "RAM Free"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#7EB26D",
+                            "mode": "fixed"
+                        }
+                    }
+                ]
+            },
+            {
+                "matcher": {
+                    "id": "byName",
+                    "options": "Available"
+                },
+                "properties": [
+                    {
+                        "id": "color",
+                        "value": {
+                            "fixedColor": "#DEDAF7",
+                            "mode": "fixed"
+                        }
+                    },
+                    {
+                        "id": "custom.fillOpacity",
+                        "value": 0
+                    },
+                    {
+                        "id": "custom.stacking",
+                        "value": {
+                            "group": False,
+                            "mode": "normal"
+                        }
+                    }
+                ]
+            }
+        ]
+    }
+}