From 5f9e20ada8d81971871a117b0e42792cecace9d5 Mon Sep 17 00:00:00 2001 From: Dustin Kaiser <8209087+mrnicegyu11@users.noreply.github.com> Date: Tue, 22 Aug 2023 10:43:03 +0200 Subject: [PATCH] Monitoring / Prometheus fixes: Ceph scraping and simcore container labels (#322) * Remove ceph scrape (was broken) on osparc-public * Address DevOps Changes https://github.com/ITISFoundation/osparc-simcore/pull/4453 --------- Co-authored-by: kaiser --- services/monitoring/Makefile | 8 ++- services/monitoring/docker-compose.public.yml | 26 ++++++++ .../simcore/s4l-lite-admin-overview.json | 2 +- .../simcore/s4l-lite-admin-overview.json | 4 +- .../dashboards/simcore/admin-overview.json | 6 +- .../monitoring/prometheus/prometheus-base.yml | 59 +++++++------------ 6 files changed, 58 insertions(+), 47 deletions(-) create mode 100644 services/monitoring/docker-compose.public.yml diff --git a/services/monitoring/Makefile b/services/monitoring/Makefile index 14906c14..6e6a56d6 100644 --- a/services/monitoring/Makefile +++ b/services/monitoring/Makefile @@ -35,7 +35,7 @@ up-letsencrypt-http: .init .env config.monitoring config.prometheus ${TEMP_COMP @docker stack deploy --with-registry-auth --prune --compose-file ${TEMP_COMPOSE}-letsencrypt-http ${STACK_NAME} .PHONY: up-letsencrypt-dns -up-letsencrypt-dns: .init .env config.monitoring config.prometheus ${TEMP_COMPOSE}-letsencrypt-dns ## Deploys or updates current stack "$(STACK_NAME)" using let's encrypt dns challenge +up-letsencrypt-dns: .init .env config.monitoring config.prometheus ${TEMP_COMPOSE}-letsencrypt-dns ## Deploys or updates current stack "$(STACK_NAME)" using let's encrypt dns challenge @docker stack deploy --with-registry-auth --prune --compose-file ${TEMP_COMPOSE}-letsencrypt-dns ${STACK_NAME} .PHONY: up-dalco @@ -43,7 +43,8 @@ up-dalco: .init .env config.monitoring config.prometheus.ceph.simcore ${TEMP_CO @docker stack deploy --with-registry-auth --prune --compose-file ${TEMP_COMPOSE}-dalco ${STACK_NAME} .PHONY: 
up-public -up-public: up-dalco +up-public: .init .env config.monitoring config.prometheus ${TEMP_COMPOSE}-public ## Deploys monitoring stack for Public Cluster + @docker stack deploy --with-registry-auth --prune --compose-file ${TEMP_COMPOSE}-public ${STACK_NAME} .PHONY: up-aws up-aws: .init .env config.monitoring config.prometheus.simcore ${TEMP_COMPOSE}-aws ## Deploys or updates current stack "$(STACK_NAME)" in AWS @@ -65,6 +66,9 @@ ${TEMP_COMPOSE}-letsencrypt-dns: docker-compose.yml docker-compose.letsencrypt.d ${TEMP_COMPOSE}-dalco: docker-compose.yml docker-compose.dalco.yml docker-compose.letsencrypt.dns.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml @${REPO_BASE_DIR}/scripts/docker-compose-config.bash -e .env $< docker-compose.letsencrypt.dns.yml docker-compose.dalco.yml > $@ +${TEMP_COMPOSE}-public: docker-compose.yml docker-compose.public.yml docker-compose.letsencrypt.dns.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml + @${REPO_BASE_DIR}/scripts/docker-compose-config.bash -e .env $< docker-compose.letsencrypt.dns.yml docker-compose.public.yml > $@ + ${TEMP_COMPOSE}-aws: docker-compose.yml docker-compose.aws.yml docker-compose.letsencrypt.dns.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml @${REPO_BASE_DIR}/scripts/docker-compose-config.bash -e .env $< docker-compose.aws.yml docker-compose.letsencrypt.dns.yml > $@ diff --git a/services/monitoring/docker-compose.public.yml b/services/monitoring/docker-compose.public.yml new file mode 100644 index 00000000..a03aa03c --- /dev/null +++ b/services/monitoring/docker-compose.public.yml @@ -0,0 +1,26 @@ +version: '3.7' +services: + cadvisor-exporter: + volumes: + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + - /dev/disk/:/dev/disk:ro + grafana: + dns: 8.8.8.8 + deploy: + placement: + constraints: + - node.labels.grafana==true + + 
prometheuscatchall: + deploy: + placement: + constraints: + - node.labels.prometheus==true + prometheuscadvisor: + deploy: + placement: + constraints: + - node.labels.prometheus==true diff --git a/services/monitoring/grafana/provisioning/osparc.io/dashboards/simcore/s4l-lite-admin-overview.json b/services/monitoring/grafana/provisioning/osparc.io/dashboards/simcore/s4l-lite-admin-overview.json index 0237e28b..54c86e1f 100644 --- a/services/monitoring/grafana/provisioning/osparc.io/dashboards/simcore/s4l-lite-admin-overview.json +++ b/services/monitoring/grafana/provisioning/osparc.io/dashboards/simcore/s4l-lite-admin-overview.json @@ -383,7 +383,7 @@ "uid": "RmZEr52nz" }, "editorMode": "code", - "expr": "count(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_simcore_user_agent!=\"puppeteer\"}),0)", + "expr": "count(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}),0)", "legendFormat": "__auto", "range": true, "refId": "A" diff --git a/services/monitoring/grafana/provisioning/staging.osparc.io/dashboards/simcore/s4l-lite-admin-overview.json b/services/monitoring/grafana/provisioning/staging.osparc.io/dashboards/simcore/s4l-lite-admin-overview.json index f9f12488..9f254684 100644 --- a/services/monitoring/grafana/provisioning/staging.osparc.io/dashboards/simcore/s4l-lite-admin-overview.json +++ b/services/monitoring/grafana/provisioning/staging.osparc.io/dashboards/simcore/s4l-lite-admin-overview.json @@ -383,13 +383,13 @@ 
"uid": "RmZEr52nz" }, "editorMode": "code", - "expr": "count(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_simcore_user_agent!=\"puppeteer\"}),0)", + "expr": "count(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}),0)", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Number of s4l-lite studies running (excluding puppeteer from v1.52.0)", + "title": "Number of s4l-lite studies running", "type": "timeseries" }, { diff --git a/services/monitoring/grafana/provisioning/tip.itis.swiss/dashboards/simcore/admin-overview.json b/services/monitoring/grafana/provisioning/tip.itis.swiss/dashboards/simcore/admin-overview.json index de584a04..3a8cc228 100644 --- a/services/monitoring/grafana/provisioning/tip.itis.swiss/dashboards/simcore/admin-overview.json +++ b/services/monitoring/grafana/provisioning/tip.itis.swiss/dashboards/simcore/admin-overview.json @@ -180,7 +180,7 @@ "uid": "RmZEr52nz" }, "editorMode": "code", - "expr": "count(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/electrode-selector.*$\",container_label_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/electrode-selector.*$\",container_label_simcore_user_agent!=\"puppeteer\"}),0)", + "expr": 
"count(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/electrode-selector.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/electrode-selector.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}),0)", "legendFormat": "__auto", "range": true, "refId": "A" @@ -273,7 +273,7 @@ "uid": "RmZEr52nz" }, "editorMode": "code", - "expr": "count(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/sim4life-postpro.*$\",container_label_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/sim4life-postpro.*$\",container_label_simcore_user_agent!=\"puppeteer\"}),0)", + "expr": "count(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/sim4life-postpro.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/sim4life-postpro.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}),0)", "legendFormat": "__auto", "range": true, "refId": "A" @@ -366,7 +366,7 @@ "uid": "RmZEr52nz" }, "editorMode": "code", - "expr": "count(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/ti-postpro.*$\",container_label_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/ti-postpro.*$\",container_label_simcore_user_agent!=\"puppeteer\"}),0)", + "expr": "count(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/ti-postpro.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/ti-postpro.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}),0)", "legendFormat": "__auto", "range": true, "refId": "A" diff 
--git a/services/monitoring/prometheus/prometheus-base.yml b/services/monitoring/prometheus/prometheus-base.yml index eb266383..bc358be6 100644 --- a/services/monitoring/prometheus/prometheus-base.yml +++ b/services/monitoring/prometheus/prometheus-base.yml @@ -97,48 +97,10 @@ scrape_configs: # this would require soffisticated regex parsing # To use these: # KEEP: container_label_simcore_service_settings # - #- source_labels: [container_label_simcore_service_settings] - # separator: ; - # regex: '^\"(.*)?(\"type\": \"Resources\", \"value\": \{\"Limits\": (\{".*\})?\{\"NanoCPUs\": )(\d*)([,\}])' - # target_label: docker_service_resources_nanocpu_limit - # replacement: ${4} - # action: replace - #- source_labels: [container_label_simcore_service_settings] - # separator: ; - # regex: '^\"(.*)?(\"type\": \"Resources\", \"value\": \{\"Limits\": \{)(\{".*\})?(\".*\"\: \d+[,\}] )?(\"MemoryBytes\": )(\d*)([\}])' - # target_label: docker_service_resources_memorybytes_limit - # replacement: ${6} - # action: replace - #- source_labels: [container_label_simcore_service_settings] - # separator: ; - # regex: '^\"(.*)?(\"type\": \"Resources\", \"value\": \{)("Limits".*, )?(\"Reservations\": (\{".*\})?\{\"NanoCPUs\": )(\d*)([,\}])' - # target_label: docker_service_resources_nanocpu_reservation - # replacement: ${6} - # action: replace - #- source_labels: [container_label_simcore_service_settings] - # separator: ; - # regex: '^\"(.*)?(\"type\": \"Resources\", \"value\": \{)(.*, )?(\"Reservations\": )(\{".*\})?\{(".*": \d+[,\}] )?(\"MemoryBytes\": )(\d*)([,\}])' - # target_label: docker_service_resources_nanocpu_reservation - # replacement: ${8} - # action: replace - #- source_labels: [container_label_simcore_service_settings] - # separator: ; - # regex: '^"(.*)?("GenericResources": \[\{"DiscreteResourceSpec": )(.*)(\{"Kind": "VRAM", "Value": )(\d+)' - # target_label: docker_service_resources_vram_reservation - # replacement: ${5} - # action: replace - #- source_labels: 
[container_label_simcore_service_settings] - # separator: ; - # regex: '^"(.*)?("GenericResources": \[\{"DiscreteResourceSpec": )(.*)(\{"Kind": "AIRAM", "Value": )(\d+)' - # target_label: docker_service_resources_vram_reservation - # replacement: ${5} - # action: replace - regex: "container_label_com_docker_compose_config_hash" action: labeldrop # cAdvisor pruning - regex: "container_label_com_docker_compose_container_number" action: labeldrop # cAdvisor pruning - - regex: "container_label_io_simcore_.*" - action: labeldrop # cAdvisor pruning - regex: "container_label_simcore_service_compose_spec" action: labeldrop # cAdvisor pruning - regex: "container_label_simcore_service_container_http_entrypoint" @@ -157,7 +119,26 @@ scrape_configs: action: labeldrop # cAdvisor pruning - regex: "container_label_maintainer" action: labeldrop # cAdvisor pruning - + - regex: "container_label_com_docker_compose_project_working_dir" + action: labeldrop + - regex: "container_label_io_simcore_contact" + action: labeldrop + - regex: "container_label_io_simcore_authors" + action: labeldrop + - regex: "container_label_io_simcore_description" + action: labeldrop + - regex: "container_label_io_simcore_inputs" + action: labeldrop + - regex: "container_label_io_simcore_key" + action: labeldrop + - regex: "container_label_io_simcore_outputs" + action: labeldrop + - regex: "container_label_io_simcore_runtime_swarm_stack_name" + action: labeldrop + - regex: "container_label_io_simcore_thumbnail" + action: labeldrop + - regex: "container_label_simcore_service_settings" + action: labeldrop ### Used: # container_label_com_docker_swarm_node_id