Merge pull request #3809 from clumens/fileaudit

nrwahl2 · web-flow · commit 0f7d244b2d1c · 2025-02-03T11:42:33.000-08:00
Fix the FileAudit test in CTS for systemd systems
diff --git a/python/pacemaker/_cts/audits.py b/python/pacemaker/_cts/audits.py
@@ -161,6 +161,7 @@ def __call__(self):
         """Perform the audit action."""
         max_attempts = 3
         attempt = 0
+        passed = True
 
         self._cm.ns.wait_for_all_nodes(self._cm.env["nodes"])
         while attempt <= max_attempts and not self._test_logging():
@@ -170,9 +171,9 @@ def __call__(self):
 
         if attempt > max_attempts:
             self._cm.log("ERROR: Cluster logging unrecoverable.")
-            return False
+            passed = False
 
-        return True
+        return passed
 
     def is_applicable(self):
         """Return True if this audit is applicable in the current test configuration."""
@@ -205,7 +206,7 @@ def __init__(self, cm):
 
     def __call__(self):
         """Perform the audit action."""
-        result = True
+        passed = True
 
         # @TODO Use directory of PCMK_logfile if set on host
         dfcmd = "df -BM %s | tail -1 | awk '{print $(NF-1)\" \"$(NF-2)}' | tr -d 'M%%'" % BuildOptions.LOG_DIR
@@ -228,15 +229,15 @@ def __call__(self):
             else:
                 if remaining_mb < 10 or used_percent > 95:
                     self._cm.log(f"CRIT: Out of log disk space on {node} ({used_percent}% / {remaining_mb}MB)")
-                    result = False
+                    passed = False
 
                     if not should_continue(self._cm.env):
                         raise ValueError(f"Disk full on {node}")
 
                 elif remaining_mb < 100 or used_percent > 90:
                     self._cm.log(f"WARN: Low on log disk space ({remaining_mb}MB) on {node}")
 
-        return result
+        return passed
 
     def is_applicable(self):
         """Return True if this audit is applicable in the current test configuration."""
@@ -263,37 +264,66 @@ def __init__(self, cm):
         self.known = []
         self.name = "FileAudit"
 
+    def _output_has_core(self, output, node):
+        """Check output for any lines that would indicate the presence of a core dump."""
+        found = False
+
+        for line in output:
+            line = line.strip()
+
+            if line in self.known:
+                continue
+
+            found = True
+            self.known.append(line)
+            self._cm.log(f"Warning: core file on {node}: {line}")
+
+        return found
+
+    def _find_core_with_coredumpctl(self, node):
+        """Use coredumpctl to find core dumps on the given node."""
+        (_, lsout) = self._cm.rsh(node, "coredumpctl --no-legend --no-pager")
+        return self._output_has_core(lsout, node)
+
+    def _find_core_on_fs(self, node, paths):
+        """Check for core dumps on the given node, under any of the given paths."""
+        (_, lsout) = self._cm.rsh(node, f"ls -al {' '.join(paths)} | grep core.[0-9]",
+                                  verbose=1)
+        return self._output_has_core(lsout, node)
+
     def __call__(self):
         """Perform the audit action."""
-        result = True
+        passed = True
 
         self._cm.ns.wait_for_all_nodes(self._cm.env["nodes"])
-        for node in self._cm.env["nodes"]:
 
-            (_, lsout) = self._cm.rsh(node, "ls -al /var/lib/pacemaker/cores/* | grep core.[0-9]", verbose=1)
-            for line in lsout:
-                line = line.strip()
-
-                if line not in self.known:
-                    result = False
-                    self.known.append(line)
-                    self._cm.log(f"Warning: Pacemaker core file on {node}: {line}")
+        for node in self._cm.env["nodes"]:
+            found = False
 
-            (_, lsout) = self._cm.rsh(node, "ls -al /var/lib/corosync | grep core.[0-9]", verbose=1)
-            for line in lsout:
-                line = line.strip()
+            # If systemd is present, first see if coredumpctl logged any core dumps.
+            if self._cm.env["have_systemd"]:
+                found = self._find_core_with_coredumpctl(node)
+                if found:
+                    passed = False
 
-                if line not in self.known:
-                    result = False
-                    self.known.append(line)
-                    self._cm.log(f"Warning: Corosync core file on {node}: {line}")
+            # If we didn't find any core dumps, it's for one of three reasons:
+            # (1) Nothing crashed
+            # (2) systemd is not present
+            # (3) systemd is present but coredumpctl is not enabled
+            #
+            # To handle the last two cases, check the other filesystem locations.
+            if not found:
+                found = self._find_core_on_fs(node, ["/var/lib/pacemaker/cores/*",
+                                                     "/var/lib/corosync"])
+                if found:
+                    passed = False
 
             if self._cm.expected_status.get(node) == "down":
                 clean = False
                 (_, lsout) = self._cm.rsh(node, "ls -al /dev/shm | grep qb-", verbose=1)
 
                 for line in lsout:
-                    result = False
+                    passed = False
                     clean = True
                     self._cm.log(f"Warning: Stale IPC file on {node}: {line}")
 
@@ -308,7 +338,7 @@ def __call__(self):
             else:
                 self._cm.debug(f"Skipping {node}")
 
-        return result
+        return passed
 
     def is_applicable(self):
         """Return True if this audit is applicable in the current test configuration."""
@@ -504,17 +534,17 @@ def _setup(self):
 
     def __call__(self):
         """Perform the audit action."""
-        result = True
+        passed = True
 
         if not self._setup():
-            return result
+            return passed
 
         quorum = self._cm.has_quorum(None)
         for resource in self._resources:
             if resource.type == "primitive" and not self._audit_resource(resource, quorum):
-                result = False
+                passed = False
 
-        return result
+        return passed
 
     def is_applicable(self):
         """Return True if this audit is applicable in the current test configuration."""
@@ -546,10 +576,10 @@ def __init__(self, cm):
         self.name = "GroupAudit"
 
     def __call__(self):
-        result = True
+        passed = True
 
         if not self._setup():
-            return result
+            return passed
 
         for group in self._resources:
             if group.type != "group":
@@ -570,7 +600,7 @@ def __call__(self):
                 first_match = False
 
                 if len(nodes) > 1:
-                    result = False
+                    passed = False
                     self._cm.log(f"Child {child.id} of {group.id} is active more than once: {nodes!r}")
 
                 elif not nodes:
@@ -580,13 +610,13 @@ def __call__(self):
                     self.debug(f"Child {child.id} of {group.id} is stopped")
 
                 elif nodes[0] != group_location:
-                    result = False
+                    passed = False
                     self._cm.log(f"Child {child.id} of {group.id} is active on the wrong "
                                  f"node ({nodes[0]}) expected {group_location}")
                 else:
                     self.debug(f"Child {child.id} of {group.id} is active on {nodes[0]}")
 
-        return result
+        return passed
 
 
 class CloneAudit(PrimitiveAudit):
@@ -607,10 +637,10 @@ def __init__(self, cm):
         self.name = "CloneAudit"
 
     def __call__(self):
-        result = True
+        passed = True
 
         if not self._setup():
-            return result
+            return passed
 
         for clone in self._resources:
             if clone.type != "clone":
@@ -624,7 +654,7 @@ def __call__(self):
                     #    crm_resource -g clone_max --meta -r child.id
                     #    crm_resource -g clone_node_max --meta -r child.id
 
-        return result
+        return passed
 
 
 class ColocationAudit(PrimitiveAudit):
@@ -661,10 +691,10 @@ def _crm_location(self, resource):
         return hosts
 
     def __call__(self):
-        result = True
+        passed = True
 
         if not self._setup():
-            return result
+            return passed
 
         for coloc in self._constraints:
             if coloc.type != "rsc_colocation":
@@ -678,14 +708,14 @@ def __call__(self):
             else:
                 for node in source:
                     if node not in target:
-                        result = False
+                        passed = False
                         self._cm.log(f"Colocation audit ({coloc.id}): {coloc.rsc} running "
                                      f"on {node} (not in {target!r})")
                     else:
                         self.debug(f"Colocation audit ({coloc.id}): {coloc.rsc} running "
                                    f"on {node} (in {target!r})")
 
-        return result
+        return passed
 
 
 class ControllerStateAudit(ClusterAudit):
@@ -702,7 +732,7 @@ def __init__(self, cm):
         self.name = "ControllerStateAudit"
 
     def __call__(self):
-        result = True
+        passed = True
         up_are_down = 0
         down_are_up = 0
         unstable_list = []
@@ -722,21 +752,21 @@ def __call__(self):
                 up_are_down += 1
 
         if len(unstable_list) > 0:
-            result = False
+            passed = False
             self._cm.log(f"Cluster is not stable: {len(unstable_list)} (of "
                          f"{self._cm.upcount()}): {unstable_list!r}")
 
         if up_are_down > 0:
-            result = False
+            passed = False
             self._cm.log(f"{up_are_down} (of {len(self._cm.env['nodes'])}) nodes "
                          "expected to be up were down.")
 
         if down_are_up > 0:
-            result = False
+            passed = False
             self._cm.log(f"{down_are_up} (of {len(self._cm.env['nodes'])}) nodes "
                          "expected to be down were up.")
 
-        return result
+        return passed
 
     def is_applicable(self):
         """Return True if this audit is applicable in the current test configuration."""
@@ -763,20 +793,20 @@ def __init__(self, cm):
         self.name = "CibAudit"
 
     def __call__(self):
-        result = True
+        passed = True
         ccm_partitions = self._cm.find_partitions()
 
         if not ccm_partitions:
             self.debug("\tNo partitions to audit")
-            return result
+            return passed
 
         for partition in ccm_partitions:
             self.debug(f"\tAuditing CIB consistency for: {partition}")
 
             if self._audit_cib_contents(partition) == 0:
-                result = False
+                passed = False
 
-        return result
+        return passed
 
     def _audit_cib_contents(self, hostlist):
         """Perform the CIB audit on the given hosts."""
@@ -883,26 +913,26 @@ def __init__(self, cm):
         self._node_quorum = {}
 
     def __call__(self):
-        result = True
+        passed = True
         ccm_partitions = self._cm.find_partitions()
 
         if not ccm_partitions:
-            return result
+            return passed
 
         self._cm.cluster_stable(double_check=True)
 
         if len(ccm_partitions) != self._cm.partitions_expected:
             self._cm.log(f"ERROR: {len(ccm_partitions)} cluster partitions detected:")
-            result = False
+            passed = False
 
             for partition in ccm_partitions:
                 self._cm.log(f"\t {partition}")
 
         for partition in ccm_partitions:
             if self._audit_partition(partition) == 0:
-                result = False
+                passed = False
 
-        return result
+        return passed
 
     def _trim_string(self, avalue):
         """Remove the last character from a multi-character string."""