Skip to content
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 66 additions & 55 deletions python/pacemaker/_cts/audits.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ def __call__(self):
"""Perform the audit action."""
max_attempts = 3
attempt = 0
passed = True

self._cm.ns.wait_for_all_nodes(self._cm.env["nodes"])
while attempt <= max_attempts and not self._test_logging():
Expand All @@ -170,9 +171,9 @@ def __call__(self):

if attempt > max_attempts:
self._cm.log("ERROR: Cluster logging unrecoverable.")
return False
passed = False

return True
return passed

def is_applicable(self):
"""Return True if this audit is applicable in the current test configuration."""
Expand Down Expand Up @@ -205,7 +206,7 @@ def __init__(self, cm):

def __call__(self):
"""Perform the audit action."""
result = True
passed = True

# @TODO Use directory of PCMK_logfile if set on host
dfcmd = "df -BM %s | tail -1 | awk '{print $(NF-1)\" \"$(NF-2)}' | tr -d 'M%%'" % BuildOptions.LOG_DIR
Expand All @@ -228,15 +229,15 @@ def __call__(self):
else:
if remaining_mb < 10 or used_percent > 95:
self._cm.log(f"CRIT: Out of log disk space on {node} ({used_percent}% / {remaining_mb}MB)")
result = False
passed = False

if not should_continue(self._cm.env):
raise ValueError(f"Disk full on {node}")

elif remaining_mb < 100 or used_percent > 90:
self._cm.log(f"WARN: Low on log disk space ({remaining_mb}MB) on {node}")

return result
return passed

def is_applicable(self):
"""Return True if this audit is applicable in the current test configuration."""
Expand All @@ -263,37 +264,47 @@ def __init__(self, cm):
self.known = []
self.name = "FileAudit"

def _output_has_core(self, output, node):
found = False

for line in output:
line = line.strip()

if line in self.known:
continue

found = True
self.known.append(line)
self._cm.log(f"Warning: core file on {node}: {line}")

return found

def _find_core_on_fs(self, node, path):
"""Check for core dumps on the given node, under the given path."""
(_, lsout) = self._cm.rsh(node, f"ls -al {path} | grep core.[0-9]", verbose=1)
return self._output_has_core(lsout, node)

def __call__(self):
"""Perform the audit action."""
result = True
passed = True

self._cm.ns.wait_for_all_nodes(self._cm.env["nodes"])
for node in self._cm.env["nodes"]:

(_, lsout) = self._cm.rsh(node, "ls -al /var/lib/pacemaker/cores/* | grep core.[0-9]", verbose=1)
for line in lsout:
line = line.strip()

if line not in self.known:
result = False
self.known.append(line)
self._cm.log(f"Warning: Pacemaker core file on {node}: {line}")

(_, lsout) = self._cm.rsh(node, "ls -al /var/lib/corosync | grep core.[0-9]", verbose=1)
for line in lsout:
line = line.strip()
for node in self._cm.env["nodes"]:
found = self._find_core_on_fs(node, "/var/lib/pacemaker/cores/*", "Pacemaker")
if found:
passed = False

if line not in self.known:
result = False
self.known.append(line)
self._cm.log(f"Warning: Corosync core file on {node}: {line}")
found = self._find_core_on_fs(node, "/var/lib/corosync", "Corosync")
if found:
passed = False

if self._cm.expected_status.get(node) == "down":
clean = False
(_, lsout) = self._cm.rsh(node, "ls -al /dev/shm | grep qb-", verbose=1)

for line in lsout:
result = False
passed = False
clean = True
self._cm.log(f"Warning: Stale IPC file on {node}: {line}")

Expand All @@ -308,7 +319,7 @@ def __call__(self):
else:
self._cm.debug(f"Skipping {node}")

return result
return passed

def is_applicable(self):
"""Return True if this audit is applicable in the current test configuration."""
Expand Down Expand Up @@ -504,17 +515,17 @@ def _setup(self):

def __call__(self):
"""Perform the audit action."""
result = True
passed = True

if not self._setup():
return result
return passed

quorum = self._cm.has_quorum(None)
for resource in self._resources:
if resource.type == "primitive" and not self._audit_resource(resource, quorum):
result = False
passed = False

return result
return passed

def is_applicable(self):
"""Return True if this audit is applicable in the current test configuration."""
Expand Down Expand Up @@ -546,10 +557,10 @@ def __init__(self, cm):
self.name = "GroupAudit"

def __call__(self):
result = True
passed = True

if not self._setup():
return result
return passed

for group in self._resources:
if group.type != "group":
Expand All @@ -570,7 +581,7 @@ def __call__(self):
first_match = False

if len(nodes) > 1:
result = False
passed = False
self._cm.log(f"Child {child.id} of {group.id} is active more than once: {nodes!r}")

elif not nodes:
Expand All @@ -580,13 +591,13 @@ def __call__(self):
self.debug(f"Child {child.id} of {group.id} is stopped")

elif nodes[0] != group_location:
result = False
passed = False
self._cm.log(f"Child {child.id} of {group.id} is active on the wrong "
f"node ({nodes[0]}) expected {group_location}")
else:
self.debug(f"Child {child.id} of {group.id} is active on {nodes[0]}")

return result
return passed


class CloneAudit(PrimitiveAudit):
Expand All @@ -607,10 +618,10 @@ def __init__(self, cm):
self.name = "CloneAudit"

def __call__(self):
result = True
passed = True

if not self._setup():
return result
return passed

for clone in self._resources:
if clone.type != "clone":
Expand All @@ -624,7 +635,7 @@ def __call__(self):
# crm_resource -g clone_max --meta -r child.id
# crm_resource -g clone_node_max --meta -r child.id

return result
return passed


class ColocationAudit(PrimitiveAudit):
Expand Down Expand Up @@ -661,10 +672,10 @@ def _crm_location(self, resource):
return hosts

def __call__(self):
result = True
passed = True

if not self._setup():
return result
return passed

for coloc in self._constraints:
if coloc.type != "rsc_colocation":
Expand All @@ -678,14 +689,14 @@ def __call__(self):
else:
for node in source:
if node not in target:
result = False
passed = False
self._cm.log(f"Colocation audit ({coloc.id}): {coloc.rsc} running "
f"on {node} (not in {target!r})")
else:
self.debug(f"Colocation audit ({coloc.id}): {coloc.rsc} running "
f"on {node} (in {target!r})")

return result
return passed


class ControllerStateAudit(ClusterAudit):
Expand All @@ -702,7 +713,7 @@ def __init__(self, cm):
self.name = "ControllerStateAudit"

def __call__(self):
result = True
passed = True
up_are_down = 0
down_are_up = 0
unstable_list = []
Expand All @@ -722,21 +733,21 @@ def __call__(self):
up_are_down += 1

if len(unstable_list) > 0:
result = False
passed = False
self._cm.log(f"Cluster is not stable: {len(unstable_list)} (of "
f"{self._cm.upcount()}): {unstable_list!r}")

if up_are_down > 0:
result = False
passed = False
self._cm.log(f"{up_are_down} (of {len(self._cm.env['nodes'])}) nodes "
"expected to be up were down.")

if down_are_up > 0:
result = False
passed = False
self._cm.log(f"{down_are_up} (of {len(self._cm.env['nodes'])}) nodes "
"expected to be down were up.")

return result
return passed

def is_applicable(self):
"""Return True if this audit is applicable in the current test configuration."""
Expand All @@ -763,20 +774,20 @@ def __init__(self, cm):
self.name = "CibAudit"

def __call__(self):
result = True
passed = True
ccm_partitions = self._cm.find_partitions()

if not ccm_partitions:
self.debug("\tNo partitions to audit")
return result
return passed

for partition in ccm_partitions:
self.debug(f"\tAuditing CIB consistency for: {partition}")

if self._audit_cib_contents(partition) == 0:
result = False
passed = False

return result
return passed

def _audit_cib_contents(self, hostlist):
"""Perform the CIB audit on the given hosts."""
Expand Down Expand Up @@ -883,26 +894,26 @@ def __init__(self, cm):
self._node_quorum = {}

def __call__(self):
result = True
passed = True
ccm_partitions = self._cm.find_partitions()

if not ccm_partitions:
return result
return passed

self._cm.cluster_stable(double_check=True)

if len(ccm_partitions) != self._cm.partitions_expected:
self._cm.log(f"ERROR: {len(ccm_partitions)} cluster partitions detected:")
result = False
passed = False

for partition in ccm_partitions:
self._cm.log(f"\t {partition}")

for partition in ccm_partitions:
if self._audit_partition(partition) == 0:
result = False
passed = False

return result
return passed

def _trim_string(self, avalue):
"""Remove the last character from a multi-character string."""
Expand Down