diff --git a/playbooks/files/rax-maas/plugins/conntrack_count.py b/playbooks/files/rax-maas/plugins/conntrack_count.py index 5d1e6a60a..8cec92339 100755 --- a/playbooks/files/rax-maas/plugins/conntrack_count.py +++ b/playbooks/files/rax-maas/plugins/conntrack_count.py @@ -24,6 +24,8 @@ pass import maas_common +import os +import subprocess import tempfile @@ -63,9 +65,29 @@ def get_metrics(): 'nf_conntrack_max': { 'path': '/proc/sys/net/netfilter/nf_conntrack_max'}} + # Retrieve root namespace count for data in metrics.viewvalues(): data['value'] = get_value(data['path']) + # Retrieve conntrack count per namespace + # and report the namespace with the highest count. + # This is necessary to limit the number of metrics to report to MAAS, + # as we can not report a metric per namespace, which by nature are + # also volatile. + try: + namespaces = os.listdir('/var/run/netns') + for ns in namespaces: + ps = subprocess.check_output(['ip', 'netns', 'exec', + ns, 'cat', + '/proc/sys/net/netfilter/' + 'nf_conntrack_count']) + nscount = int(ps.strip(os.linesep)) + + if nscount > metrics['nf_conntrack_count']['value']: + metrics['nf_conntrack_count']['value'] = nscount + except (OSError): + pass + return metrics @@ -89,11 +111,13 @@ def get_metrics_lxc_container(container_name=''): # Check if container is even running try: with tempfile.TemporaryFile() as tmpfile: + # Retrieve root namespace count if cont.attach_wait(lxc.attach_run_command, ['cat', '/proc/sys/net/netfilter/nf_conntrack_count', '/proc/sys/net/netfilter/nf_conntrack_max'], - stdout=tmpfile) > -1: + stdout=tmpfile, + stderr=tempfile.TemporaryFile()) > -1: tmpfile.seek(0) output = tmpfile.read() @@ -101,7 +125,39 @@ def get_metrics_lxc_container(container_name=''): 'nf_conntrack_count': {'value': output.split('\n')[0]}, 'nf_conntrack_max': {'value': output.split('\n')[1]}} - return metrics + # Retrieve conntrack count per namespace + # and report the namespace with the highest count. 
+ # This is necessary to limit the number of metrics to report to MAAS, + # as we can not report a metric per namespace, which by nature are + # also volatile. + with tempfile.TemporaryFile() as nsfile: + if cont.attach_wait(lxc.attach_run_command, + ['ls', + '-1', + '/var/run/netns'], + stdout=nsfile, + stderr=tempfile.TemporaryFile()) > -1: + nsfile.seek(0) + + for line in nsfile.readlines(): + ns = line.strip(os.linesep) + nscountfile = tempfile.TemporaryFile() + + if cont.attach_wait(lxc.attach_run_command, + ['ip', 'netns', 'exec', + ns, 'cat', + '/proc/sys/net/netfilter/' + 'nf_conntrack_count'], + stdout=nscountfile, + stderr=tempfile.TemporaryFile()) > -1: + + nscountfile.seek(0) + nscount = int(nscountfile.read().strip(os.linesep)) + + if nscount > metrics['nf_conntrack_count']['value']: + metrics['nf_conntrack_count']['value'] = nscount + + return metrics except maas_common.MaaSException as e: maas_common.status_err(str(e), m_name='maas_conntrack') diff --git a/playbooks/templates/rax-maas/conntrack_count.yaml.j2 b/playbooks/templates/rax-maas/conntrack_count.yaml.j2 index f296b566c..2ac6f928b 100644 --- a/playbooks/templates/rax-maas/conntrack_count.yaml.j2 +++ b/playbooks/templates/rax-maas/conntrack_count.yaml.j2 @@ -20,8 +20,8 @@ alarms : criteria : | :set consecutiveCount={{ maas_alarm_local_consecutive_count }} if (percentage(metric["nf_conntrack_count"] , metric["nf_conntrack_max"]) > {{ maas_nf_conntrack_critical_threshold }}) { - return new AlarmStatus(CRITICAL, "Connection count is > {{ maas_nf_conntrack_critical_threshold }}% of maximum allowed."); + return new AlarmStatus(CRITICAL, "Connection tracking count is > {{ maas_nf_conntrack_critical_threshold }}% of the critical threshold. 
Please check all namespaces listed at /var/run/netns including the host."); } if (percentage(metric["nf_conntrack_count"] , metric["nf_conntrack_max"]) > {{ maas_nf_conntrack_warning_threshold }}) { - return new AlarmStatus(WARNING, "Connection count is > {{ maas_nf_conntrack_warning_threshold }}% of maximum allowed."); + return new AlarmStatus(WARNING, "Connection tracking count is > {{ maas_nf_conntrack_warning_threshold }}% of the warning threshold. Please check all namespaces inside listed at /var/run/netns including the host."); } diff --git a/releasenotes/notes/TURTLES-1006-add-ns-checks-conntack-plugin-5dcd0ff5de96a3b2.yaml b/releasenotes/notes/TURTLES-1006-add-ns-checks-conntack-plugin-5dcd0ff5de96a3b2.yaml new file mode 100644 index 000000000..e76e6bb25 --- /dev/null +++ b/releasenotes/notes/TURTLES-1006-add-ns-checks-conntack-plugin-5dcd0ff5de96a3b2.yaml @@ -0,0 +1,10 @@ +--- +fixes: + - | + * The `conntrack_count.py` plugin now checks for network namespaces + listed at `/var/run/netns` and retrieves the iptables connection + tracking information for each namespace. + This ensures that embedded network namespaces are alerted in case + connection tracking hashes are about to exceed a configurable threshold. + Due to the limited availability of MAAS metrics per alarm, only the + namespace with the highest connection tracking count is reported.