From b1aa6c07e33cf17f65ea2ec0ede874083e3ff00f Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 6 Nov 2024 14:27:29 +0000 Subject: [PATCH 01/16] installed loki and added grafana datasource --- .../kube_prometheus_stack/tasks/main.yml | 19 +++++++++++++++++++ .../inventory/group_vars/all/grafana.yml | 8 +++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 13488de58..aefbe9679 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -176,6 +176,25 @@ ansible.builtin.import_role: name: grafana-dashboards +- name: Install loki stack helm chart + kubernetes.core.helm: + chart_ref: loki-stack + chart_repo_url: https://grafana.github.io/helm-charts + chart_version: 2.10.2 + release_name: loki + release_namespace: "{{ kube_prometheus_stack_release_namespace }}" + release_values: + loki: + nodeSelector: + clusterrole: server + image: + tag: 2.9.3 + grafana: + sidecar: + grafana_datasources: + enabled: false + wait: yes + - name: Install kube-prometheus-stack on target Kubernetes cluster kubernetes.core.helm: chart_ref: "{{ kube_prometheus_stack_chart_name }}" diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index ee874d2ed..e2fbab79c 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -73,7 +73,13 @@ grafana_datasources: version: '7.10.2' flavor: elasticsearch editable: true - # readOnly: false + readOnly: false + - name: Loki + url: http://loki:3100 + type: loki + access: proxy + version: 1 + isDefault: false grafana_plugins: - grafana-opensearch-datasource 2.8.1 From 7d5488bd965113df59fb47f99f04ed942d18c99b Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 6 Nov 2024 14:42:07 +0000 Subject: [PATCH 02/16] added 
loki pod dashboard and fixed datasource provisioning --- .../files/loki-pod-logs-dashboard.json | 228 ++++++++++++++++++ .../kube_prometheus_stack/tasks/main.yml | 2 +- .../inventory/group_vars/all/grafana.yml | 2 + 3 files changed, 231 insertions(+), 1 deletion(-) create mode 100644 ansible/roles/grafana-dashboards/files/loki-pod-logs-dashboard.json diff --git a/ansible/roles/grafana-dashboards/files/loki-pod-logs-dashboard.json b/ansible/roles/grafana-dashboards/files/loki-pod-logs-dashboard.json new file mode 100644 index 000000000..c2008ba84 --- /dev/null +++ b/ansible/roles/grafana-dashboards/files/loki-pod-logs-dashboard.json @@ -0,0 +1,228 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Search pod logs stored in Loki", + "editable": false, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 31, + "links": [], + "panels": [ + { + "datasource": "Loki", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "hidden", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": 
"short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": "Loki", + "expr": "sum(count_over_time({namespace=\"$namespace\", pod=~\"$pod\"} |~ \"$search\"[$__interval]))", + "refId": "A" + } + ], + "type": "timeseries" + }, + { + "datasource": "Loki", + "gridPos": { + "h": 25, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 2, + "maxDataPoints": "", + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": "Loki", + "expr": "{namespace=\"$namespace\", pod=~\"$pod\"} |~ \"$search\"", + "refId": "A" + } + ], + "title": "Logs Panel", + "type": "logs" + } + ], + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "allValue": ".+", + "current": { + "selected": true, + "text": "ingress-nginx", + "value": "ingress-nginx" + }, + "datasource": "Loki", + "definition": "label_values(namespace)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(namespace)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "Loki", + "definition": "label_values({namespace=~\"$namespace\"}, pod)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "pod", + "options": [], + "query": "label_values({namespace=~\"$namespace\"}, pod)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", 
+ "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": {}, + "hide": 0, + "name": "search", + "options": [], + "query": "", + "skipUrlSync": false, + "type": "textbox" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Loki / Pod Logs", + "uid": "209fd89b771c318dd442225414a50b59", + "version": 1, + "weekStart": "" +} diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index aefbe9679..3f506e0d5 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -191,7 +191,7 @@ tag: 2.9.3 grafana: sidecar: - grafana_datasources: + datasources: enabled: false wait: yes diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index e2fbab79c..e6f796740 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -38,6 +38,8 @@ grafana_dashboards_default: - placeholder: DS_PROMETHEUS replacement: prometheus revision_id: 3 + - dashboard_file: loki-pod-logs-dashboard.json + replacements: [] grafana_dashboards: "{{ grafana_dashboards_default + (openondemand_dashboard if groups.get('openondemand') else []) }}" # Configmap names of kube prometheus stack's default dashboards to exclude From a4a73edbef722ef86ecf37af3711af28d34c9bb5 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 6 Nov 2024 15:02:43 +0000 Subject: [PATCH 03/16] added host logs + dashboard --- .../files/loki-systemd-logs-dashboard.json | 232 ++++++++++++++++++ .../kube_prometheus_stack/tasks/main.yml | 31 +++ .../inventory/group_vars/all/grafana.yml | 2 + 3 files changed, 265 insertions(+) create mode 100644 
ansible/roles/grafana-dashboards/files/loki-systemd-logs-dashboard.json diff --git a/ansible/roles/grafana-dashboards/files/loki-systemd-logs-dashboard.json b/ansible/roles/grafana-dashboards/files/loki-systemd-logs-dashboard.json new file mode 100644 index 000000000..88bf8987a --- /dev/null +++ b/ansible/roles/grafana-dashboards/files/loki-systemd-logs-dashboard.json @@ -0,0 +1,232 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Search systemd logs stored in Loki", + "editable": false, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 33, + "links": [], + "panels": [ + { + "datasource": "Loki", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "hidden", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + 
"sort": "none" + } + }, + "targets": [ + { + "datasource": "Loki", + "expr": "sum(count_over_time({unit=~\"$unit\", hostname=~\"$hostname\"} |~ \"$search\"[$__interval]))", + "refId": "A" + } + ], + "type": "timeseries" + }, + { + "datasource": "Loki", + "gridPos": { + "h": 25, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 2, + "maxDataPoints": "", + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": "Loki", + "expr": "{unit=~\"$unit\", hostname=~\"$hostname\"} |~ \"$search\"", + "refId": "A" + } + ], + "title": "Logs Panel", + "type": "logs" + } + ], + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "allValue": ".+", + "current": { + "selected": true, + "text": "ansible-init.service", + "value": "ansible-init.service" + }, + "datasource": "Loki", + "definition": "label_values(unit)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "unit", + "options": [], + "query": "label_values(unit)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "Loki", + "definition": "label_values({unit=~\"$unit\"}, hostname)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "hostname", + "options": [], + "query": "label_values({unit=~\"$unit\"}, hostname)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": {}, + "hide": 0, + "name": "search", + "options": [], + "query": "", + "skipUrlSync": false, + "type": "textbox" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + 
}, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Loki / Systemd Logs", + "uid": "fa1bd43aed803111be9cc923cada9811", + "version": 1, + "weekStart": "" + } diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 3f506e0d5..c958a6b44 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -193,6 +193,37 @@ sidecar: datasources: enabled: false + promtail: + config: + snippets: + extraScrapeConfigs: | + - job_name: journal + journal: + path: /var/log/journal + max_age: 12h + labels: + job: systemd-journal + relabel_configs: + - source_labels: ['__journal__systemd_unit'] + target_label: 'unit' + - source_labels: ['__journal__hostname'] + target_label: 'hostname' + - source_labels: ['__journal_priority_keyword'] + target_label: level + extraVolumes: + - name: journal + hostPath: + path: /var/log/journal + - name: machine-id + hostPath: + path: /etc/machine-id + extraVolumeMounts: + - name: journal + mountPath: /var/log/journal + readOnly: true + - name: machine-id + mountPath: /etc/machine-id + readOnly: true wait: yes - name: Install kube-prometheus-stack on target Kubernetes cluster diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index e6f796740..7da47e113 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -40,6 +40,8 @@ grafana_dashboards_default: revision_id: 3 - dashboard_file: loki-pod-logs-dashboard.json replacements: [] + - dashboard_file: loki-systemd-logs-dashboard.json + replacements: [] grafana_dashboards: "{{ grafana_dashboards_default + (openondemand_dashboard if groups.get('openondemand') else []) }}" # Configmap names of kube prometheus stack's 
default dashboards to exclude From 6244c24e42c1922b66c08a914f83352380217bb5 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 6 Nov 2024 15:30:56 +0000 Subject: [PATCH 04/16] refactored loki play --- .../files/loki-pod-logs-dashboard.json | 2 +- .../files/loki-systemd-logs-dashboard.json | 2 +- .../defaults/main/helm.yml | 42 ++++++++++++++++++ .../defaults/main/main.yml | 3 ++ .../kube_prometheus_stack/tasks/main.yml | 44 +------------------ 5 files changed, 49 insertions(+), 44 deletions(-) diff --git a/ansible/roles/grafana-dashboards/files/loki-pod-logs-dashboard.json b/ansible/roles/grafana-dashboards/files/loki-pod-logs-dashboard.json index c2008ba84..7b594079a 100644 --- a/ansible/roles/grafana-dashboards/files/loki-pod-logs-dashboard.json +++ b/ansible/roles/grafana-dashboards/files/loki-pod-logs-dashboard.json @@ -16,7 +16,7 @@ ] }, "description": "Search pod logs stored in Loki", - "editable": false, + "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, "id": 31, diff --git a/ansible/roles/grafana-dashboards/files/loki-systemd-logs-dashboard.json b/ansible/roles/grafana-dashboards/files/loki-systemd-logs-dashboard.json index 88bf8987a..9b19c358c 100644 --- a/ansible/roles/grafana-dashboards/files/loki-systemd-logs-dashboard.json +++ b/ansible/roles/grafana-dashboards/files/loki-systemd-logs-dashboard.json @@ -16,7 +16,7 @@ ] }, "description": "Search systemd logs stored in Loki", - "editable": false, + "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, "id": 33, diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index e9c04632f..19729a631 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -193,3 +193,45 @@ kube_prometheus_stack_release_values: >- kube_prometheus_stack_release_defaults | combine(kube_prometheus_stack_release_overrides, recursive = True) }} + 
+kube_prometheus_stack_loki_release_values: + loki: + nodeSelector: + clusterrole: server + image: + tag: "{{ kube_prometheus_stack_loki_image_tag }}" + grafana: + sidecar: + datasources: + enabled: false + promtail: + config: + snippets: + extraScrapeConfigs: | + - job_name: journal + journal: + path: /var/log/journal + max_age: 12h + labels: + job: systemd-journal + relabel_configs: + - source_labels: ['__journal__systemd_unit'] + target_label: 'unit' + - source_labels: ['__journal__hostname'] + target_label: 'hostname' + - source_labels: ['__journal_priority_keyword'] + target_label: level + extraVolumes: + - name: journal + hostPath: + path: /var/log/journal + - name: machine-id + hostPath: + path: /etc/machine-id + extraVolumeMounts: + - name: journal + mountPath: /var/log/journal + readOnly: true + - name: machine-id + mountPath: /etc/machine-id + readOnly: true diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index bf6097089..bc199f2ec 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -20,6 +20,9 @@ kube_prometheus_stack_wait_timeout: 5m kube_prometheus_stack_metrics_image_tag: v2.12.0 kube_prometheus_stack_patch_image_tag: v20221220-controller-v1.5.1-58-g787ea74b6 +kube_prometheus_stack_loki_chart_version: 2.10.2 +kube_prometheus_stack_loki_image_tag: 2.9.3 + control_ip: "{{ hostvars[groups['control'].0].ansible_host }}" grafana_auth_anonymous: false diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index c958a6b44..82b210875 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -180,50 +180,10 @@ kubernetes.core.helm: chart_ref: loki-stack chart_repo_url: https://grafana.github.io/helm-charts - chart_version: 2.10.2 + chart_version: "{{ 
kube_prometheus_stack_loki_chart_version }}" release_name: loki release_namespace: "{{ kube_prometheus_stack_release_namespace }}" - release_values: - loki: - nodeSelector: - clusterrole: server - image: - tag: 2.9.3 - grafana: - sidecar: - datasources: - enabled: false - promtail: - config: - snippets: - extraScrapeConfigs: | - - job_name: journal - journal: - path: /var/log/journal - max_age: 12h - labels: - job: systemd-journal - relabel_configs: - - source_labels: ['__journal__systemd_unit'] - target_label: 'unit' - - source_labels: ['__journal__hostname'] - target_label: 'hostname' - - source_labels: ['__journal_priority_keyword'] - target_label: level - extraVolumes: - - name: journal - hostPath: - path: /var/log/journal - - name: machine-id - hostPath: - path: /etc/machine-id - extraVolumeMounts: - - name: journal - mountPath: /var/log/journal - readOnly: true - - name: machine-id - mountPath: /etc/machine-id - readOnly: true + release_values: "{{ kube_prometheus_stack_loki_release_values }}" wait: yes - name: Install kube-prometheus-stack on target Kubernetes cluster From 6c5c1445b4d7ea5f83a5aa0fa4b5cd8957aca64b Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 6 Nov 2024 15:36:50 +0000 Subject: [PATCH 05/16] added loki images to pre-pull --- ansible/roles/kube_prometheus_stack/defaults/main/install.yml | 2 ++ ansible/roles/kube_prometheus_stack/defaults/main/main.yml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/install.yml b/ansible/roles/kube_prometheus_stack/defaults/main/install.yml index b1fcdc7c1..f9bda1a67 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/install.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/install.yml @@ -9,3 +9,5 @@ image_list: - { name: "quay.io/kiwigrid/k8s-sidecar", tag: "{{ grafana_sidecar_image_tag }}" } - { name: "registry.k8s.io/kube-state-metrics/kube-state-metrics", tag: "{{ kube_prometheus_stack_metrics_image_tag }}" 
} - { name: "registry.k8s.io/ingress-nginx/kube-webhook-certgen", tag: "{{ kube_prometheus_stack_patch_image_tag }}" } +- { name: "docker.io/grafana/loki", tag: "{{ kube_prometheus_stack_loki_image_tag }}" } +- { name: "docker.io/grafana/promtail", tag: "{{ kube_prometheus_stack_loki_image_tag }}" } diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index bc199f2ec..0fa95a198 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -21,7 +21,7 @@ kube_prometheus_stack_metrics_image_tag: v2.12.0 kube_prometheus_stack_patch_image_tag: v20221220-controller-v1.5.1-58-g787ea74b6 kube_prometheus_stack_loki_chart_version: 2.10.2 -kube_prometheus_stack_loki_image_tag: 2.9.3 +kube_prometheus_stack_loki_image_tag: 2.9.3 # also promtail tag control_ip: "{{ hostvars[groups['control'].0].ansible_host }}" From 54572c53cb8c4411613fb02b7ba2cc6387c2560f Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 6 Nov 2024 16:18:06 +0000 Subject: [PATCH 06/16] bump fatimage volume sizes --- packer/openstack.pkr.hcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index fe922c78e..42e578054 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -125,9 +125,9 @@ variable "volume_size" { type = map(number) default = { # fat image builds, GB: - rocky-latest = 15 + rocky-latest = 20 rocky-latest-cuda = 30 - openhpc = 15 + openhpc = 20 openhpc-cuda = 30 } } From 0780a3728f6947b9dd944c074a596f78572d0eb3 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 6 Nov 2024 17:07:50 +0000 Subject: [PATCH 07/16] added loki test pod to image pre-pulls --- ansible/roles/kube_prometheus_stack/defaults/main/helm.yml | 2 ++ ansible/roles/kube_prometheus_stack/defaults/main/install.yml | 1 + 
ansible/roles/kube_prometheus_stack/defaults/main/main.yml | 1 + 3 files changed, 4 insertions(+) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index 19729a631..f1c4fa7be 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -195,6 +195,8 @@ kube_prometheus_stack_release_values: >- }} kube_prometheus_stack_loki_release_values: + test_pod: + image: "bats/bats:{{ kube_prometheus_stack_loki_test_pod_tag }}" loki: nodeSelector: clusterrole: server diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/install.yml b/ansible/roles/kube_prometheus_stack/defaults/main/install.yml index f9bda1a67..d4088751f 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/install.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/install.yml @@ -11,3 +11,4 @@ image_list: - { name: "registry.k8s.io/ingress-nginx/kube-webhook-certgen", tag: "{{ kube_prometheus_stack_patch_image_tag }}" } - { name: "docker.io/grafana/loki", tag: "{{ kube_prometheus_stack_loki_image_tag }}" } - { name: "docker.io/grafana/promtail", tag: "{{ kube_prometheus_stack_loki_image_tag }}" } +- { name: "docker.io/bats/bats", tag: "{{ kube_prometheus_stack_loki_test_pod_tag }}" } diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 0fa95a198..145bdc61d 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -22,6 +22,7 @@ kube_prometheus_stack_patch_image_tag: v20221220-controller-v1.5.1-58-g787ea74b6 kube_prometheus_stack_loki_chart_version: 2.10.2 kube_prometheus_stack_loki_image_tag: 2.9.3 # also promtail tag +kube_prometheus_stack_loki_test_pod_tag: 1.8.2 control_ip: "{{ hostvars[groups['control'].0].ansible_host }}" From 
c026f9564d019c3e64ce7a8c16fd58ca1cd4fedf Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 7 Nov 2024 10:28:26 +0000 Subject: [PATCH 08/16] bump images --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 6b440865e..5d5caf729 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,7 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241029-0905-f23c2fca", - "RL9": "openhpc-RL9-241029-0949-f23c2fca", - "RL9-cuda": "openhpc-cuda-RL9-241029-0905-f23c2fca" + "RL8": "openhpc-RL8-241106-1719-0780a372", + "RL9": "openhpc-RL9-241106-1719-0780a372", + "RL9-cuda": "openhpc-cuda-RL9-241107-0924-0780a372" } } From 199be8312260f26492cce35c592e19d3e372728c Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 14 Nov 2024 10:57:55 +0000 Subject: [PATCH 09/16] added loki persistent storage --- .../defaults/main/helm.yml | 11 +++++ .../defaults/main/main.yml | 3 ++ .../kube_prometheus_stack/tasks/main.yml | 40 +++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index f1c4fa7be..1f416efc9 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -198,6 +198,17 @@ kube_prometheus_stack_loki_release_values: test_pod: image: "bats/bats:{{ kube_prometheus_stack_loki_test_pod_tag }}" loki: + config: + compactor: + retention_enabled: true + limits_config: + retention_period: 72h + persistence: + enabled: true + selector: + matchLabels: + app.kubernetes.io/name: loki-dir + size: "{{ 
kube_prometheus_stack_loki_persistence_size }}" nodeSelector: clusterrole: server image: diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 145bdc61d..c84c397bb 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -24,6 +24,9 @@ kube_prometheus_stack_loki_chart_version: 2.10.2 kube_prometheus_stack_loki_image_tag: 2.9.3 # also promtail tag kube_prometheus_stack_loki_test_pod_tag: 1.8.2 +kube_prometheus_stack_loki_data_dir: "{{ appliances_state_dir }}/loki" +kube_prometheus_stack_loki_persistence_size: "10Gi" + control_ip: "{{ hostvars[groups['control'].0].ansible_host }}" grafana_auth_anonymous: false diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 82b210875..0a55e5edd 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -82,6 +82,46 @@ path: "{{ prometheus_db_dir }}" type: DirectoryOrCreate +# Loki image seems to be hardcoded to use this uid/gid even when changed in +# podSecurityContext +- name: Create Loki group + ansible.builtin.group: + name: loki + gid: 10001 + +- name: Create Loki user + ansible.builtin.user: + name: loki + uid: 10001 + group: loki + +- name: Create Loki data directory + ansible.builtin.file: + state: directory + path: "{{ kube_prometheus_stack_loki_data_dir }}" + owner: 10001 + group: 10001 + mode: '775' + +- name: Create Loki hostPath volume in /var/lib/state + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + definition: + apiVersion: v1 + kind: PersistentVolume + metadata: + name: loki-dir + labels: + app.kubernetes.io/name: loki-dir + spec: + capacity: + storage: "{{ kube_prometheus_stack_loki_persistence_size }}" # not enforced but may be internally by loki? 
+ accessModes: + - ReadWriteOnce + hostPath: + path: "{{ kube_prometheus_stack_loki_data_dir }}" + type: Directory + - name: Setting up k3s services for OnDemand Exporter when: groups['openondemand'] | count > 0 block: From 09a0373b5190c78145dac6c3f3a5fd4534edecdb Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 15 Nov 2024 16:06:02 +0000 Subject: [PATCH 10/16] added ci check for persistence --- .github/workflows/stackhpc.yml | 13 +++++++++++ ansible/ci/check_loki.yml | 42 ++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 ansible/ci/check_loki.yml diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index b08854adb..e6ac352a6 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -170,6 +170,11 @@ jobs: env: TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} + - name: Get timestamp in milliseconds before rebuild + id: timestamp + run: | + echo "TIME_BEFORE_REBUILD=$(date +%s%N | cut -b1-13)" >> $GITHUB_OUTPUT + # - name: Build environment-specific compute image # id: packer_build # run: | @@ -209,6 +214,14 @@ jobs: . environments/.stackhpc/activate ansible-playbook -vv ansible/ci/check_grafana.yml + - name: Check Loki Slurm logs persisted through rebuild + run: | + . venv/bin/activate + . environments/.stackhpc/activate + ansible-playbook -vv ansible/ci/check_loki.yml -e "end_timestamp=${{ steps.timestamp.outputs.TIME_BEFORE_REBUILD }} testuser_password=$TESTUSER_PASSWORD" + env: + TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} + - name: Delete infrastructure run: | . 
venv/bin/activate diff --git a/ansible/ci/check_loki.yml b/ansible/ci/check_loki.yml new file mode 100644 index 000000000..05bcdc86d --- /dev/null +++ b/ansible/ci/check_loki.yml @@ -0,0 +1,42 @@ +- hosts: control + gather_facts: no + become: no + tasks: + - name: Get Loki datasource uid + ansible.builtin.uri: + url: http://localhost:{{ grafana_port }}/api/datasources/name/Loki + url_username: 'testuser' + url_password: "{{ testuser_password }}" + follow_redirects: all + register: uid_response + + - name: Get Loki logs from before rebuild + vars: + testuser_password: mypassword + ansible.builtin.uri: + url: http://localhost:{{ grafana_port }}/api/ds/query + follow_redirects: all + url_username: testuser + url_password: "{{ testuser_password }}" + method: POST + body_format: json + headers: + Accept: application/json + Content-Type: application/json + # Queries from 20 mins before timestamp to timestamp + body: | + { + "queries":[ + { + "expr":"{unit=\"slurmd.service\"} |= ``", + "datasource":{"uid":"{{ uid_response.json.uid }}"}, + "format":"time_series" + }], + "from":"{{ end_timestamp | int - 1200000 }}", + "to":"{{ end_timestamp }}" + } + register: log_query_content + + - name: Check that logs exist + ansible.builtin.assert: + that: log_query_content.json.results.A.frames[0].data['values'][2] | length > 0 From e0812d7ab798dd7f5efaa23576724b8bd4f00e74 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 15 Nov 2024 16:08:16 +0000 Subject: [PATCH 11/16] exposed loki retention period --- ansible/roles/kube_prometheus_stack/defaults/main/helm.yml | 2 +- ansible/roles/kube_prometheus_stack/defaults/main/main.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index 1f416efc9..9b5579c54 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ 
-202,7 +202,7 @@ kube_prometheus_stack_loki_release_values: compactor: retention_enabled: true limits_config: - retention_period: 72h + retention_period: "{{ kube_prometheus_stack_loki_retention_period }}" persistence: enabled: true selector: diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index c84c397bb..d428a8672 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -26,6 +26,7 @@ kube_prometheus_stack_loki_test_pod_tag: 1.8.2 kube_prometheus_stack_loki_data_dir: "{{ appliances_state_dir }}/loki" kube_prometheus_stack_loki_persistence_size: "10Gi" +kube_prometheus_stack_loki_retention_period: "72h" control_ip: "{{ hostvars[groups['control'].0].ansible_host }}" From 1b3e94d297209b53505c0754ab7e42538c079063 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 19 Nov 2024 13:15:01 +0000 Subject: [PATCH 12/16] testing bumping leafcloud compute nodes --- environments/.stackhpc/terraform/LEAFCLOUD.tfvars | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/.stackhpc/terraform/LEAFCLOUD.tfvars b/environments/.stackhpc/terraform/LEAFCLOUD.tfvars index 3954a34a8..9e1d24eaf 100644 --- a/environments/.stackhpc/terraform/LEAFCLOUD.tfvars +++ b/environments/.stackhpc/terraform/LEAFCLOUD.tfvars @@ -1,6 +1,6 @@ cluster_net = "slurmapp-ci" cluster_subnet = "slurmapp-ci" control_node_flavor = "en1.medium" # min 8GB RAM -other_node_flavor = "en1.xsmall" +other_node_flavor = "en1.small" state_volume_type = "unencrypted" home_volume_type = "unencrypted" From e5b0e84ee7baa663f09b626caa7df572f9623fd8 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 19 Nov 2024 14:03:24 +0000 Subject: [PATCH 13/16] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 1bf44d240..d29aa5a9e 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241119-0915-6f164927", - "RL9": "openhpc-RL9-241119-0914-6f164927" + "RL8": "openhpc-RL8-241119-1321-30456375", + "RL9": "openhpc-RL9-241119-1321-30456375" } } From 2ff2117ad6e615f628cdf5b48e0dcb4f9f80a850 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:01:54 +0000 Subject: [PATCH 14/16] bumped arcus ci compute memory --- environments/.stackhpc/terraform/ARCUS.tfvars | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/.stackhpc/terraform/ARCUS.tfvars b/environments/.stackhpc/terraform/ARCUS.tfvars index 6aec599ef..ba2bab262 100644 --- a/environments/.stackhpc/terraform/ARCUS.tfvars +++ b/environments/.stackhpc/terraform/ARCUS.tfvars @@ -1,4 +1,4 @@ cluster_net = "portal-internal" cluster_subnet = "portal-internal" control_node_flavor = "vm.ska.cpu.general.eighth" -other_node_flavor = "vm.ska.cpu.general.small" +other_node_flavor = "vm.ska.cpu.general.eight" From 21b59632719067013721e15fa1f1a97884505f77 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 20 Nov 2024 09:00:21 +0000 Subject: [PATCH 15/16] removed overridden debug var --- ansible/ci/check_loki.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/ansible/ci/check_loki.yml b/ansible/ci/check_loki.yml index 05bcdc86d..ad3ce60be 100644 --- a/ansible/ci/check_loki.yml +++ b/ansible/ci/check_loki.yml @@ -11,8 +11,6 @@ register: uid_response - name: Get Loki logs from before rebuild - vars: - testuser_password: mypassword ansible.builtin.uri: url:
http://localhost:{{ grafana_port }}/api/ds/query follow_redirects: all From 60875fc37bb90e33852edc8ea6be189dac9d1844 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 20 Nov 2024 10:40:03 +0000 Subject: [PATCH 16/16] testing with grafana admin credentials --- ansible/ci/check_loki.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ansible/ci/check_loki.yml b/ansible/ci/check_loki.yml index ad3ce60be..6a9d8293b 100644 --- a/ansible/ci/check_loki.yml +++ b/ansible/ci/check_loki.yml @@ -4,18 +4,18 @@ tasks: - name: Get Loki datasource uid ansible.builtin.uri: - url: http://localhost:{{ grafana_port }}/api/datasources/name/Loki - url_username: 'testuser' - url_password: "{{ testuser_password }}" + url: http://{{ grafana_api_address }}:{{ grafana_port }}/api/datasources/name/Loki + url_username: grafana + url_password: "{{ vault_grafana_admin_password }}" follow_redirects: all register: uid_response - name: Get Loki logs from before rebuild ansible.builtin.uri: - url: http://localhost:{{ grafana_port }}/api/ds/query + url: http://{{ grafana_api_address }}:{{ grafana_port }}/api/ds/query follow_redirects: all - url_username: testuser - url_password: "{{ testuser_password }}" + url_username: grafana + url_password: "{{ vault_grafana_admin_password }}" method: POST body_format: json headers: