Skip to content

Commit 0f7d244

Browse files
authored
Merge pull request #3809 from clumens/fileaudit
Fix the FileAudit test in CTS for systemd systems
2 parents 5ccb2fa + 35d5850 commit 0f7d244

File tree

1 file changed

+84
-54
lines changed

1 file changed

+84
-54
lines changed

python/pacemaker/_cts/audits.py

Lines changed: 84 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ def __call__(self):
161161
"""Perform the audit action."""
162162
max_attempts = 3
163163
attempt = 0
164+
passed = True
164165

165166
self._cm.ns.wait_for_all_nodes(self._cm.env["nodes"])
166167
while attempt <= max_attempts and not self._test_logging():
@@ -170,9 +171,9 @@ def __call__(self):
170171

171172
if attempt > max_attempts:
172173
self._cm.log("ERROR: Cluster logging unrecoverable.")
173-
return False
174+
passed = False
174175

175-
return True
176+
return passed
176177

177178
def is_applicable(self):
178179
"""Return True if this audit is applicable in the current test configuration."""
@@ -205,7 +206,7 @@ def __init__(self, cm):
205206

206207
def __call__(self):
207208
"""Perform the audit action."""
208-
result = True
209+
passed = True
209210

210211
# @TODO Use directory of PCMK_logfile if set on host
211212
dfcmd = "df -BM %s | tail -1 | awk '{print $(NF-1)\" \"$(NF-2)}' | tr -d 'M%%'" % BuildOptions.LOG_DIR
@@ -228,15 +229,15 @@ def __call__(self):
228229
else:
229230
if remaining_mb < 10 or used_percent > 95:
230231
self._cm.log(f"CRIT: Out of log disk space on {node} ({used_percent}% / {remaining_mb}MB)")
231-
result = False
232+
passed = False
232233

233234
if not should_continue(self._cm.env):
234235
raise ValueError(f"Disk full on {node}")
235236

236237
elif remaining_mb < 100 or used_percent > 90:
237238
self._cm.log(f"WARN: Low on log disk space ({remaining_mb}MB) on {node}")
238239

239-
return result
240+
return passed
240241

241242
def is_applicable(self):
242243
"""Return True if this audit is applicable in the current test configuration."""
@@ -263,37 +264,66 @@ def __init__(self, cm):
263264
self.known = []
264265
self.name = "FileAudit"
265266

267+
def _output_has_core(self, output, node):
    """
    Check output for any lines that would indicate the presence of a core dump.

    Each line is stripped of surrounding whitespace.  Blank lines and lines
    already recorded in self.known are ignored.  Every other line is added to
    self.known and logged as a warning.

    Arguments:
    output -- Iterable of output lines from a command run on node
    node   -- Name of the node the output came from (used in the log message)

    Return True if at least one previously unseen, non-blank line was found,
    otherwise False.
    """
    found = False

    for line in output:
        line = line.strip()

        # A blank line says nothing about a core dump, so stray whitespace in
        # the command output must not be reported (or remembered) as one.
        if not line or line in self.known:
            continue

        # NOTE(review): self.known is not keyed by node, so an identical line
        # seen on a second node is treated as already known -- confirm that
        # this is intended.
        found = True
        self.known.append(line)
        self._cm.log(f"Warning: core file on {node}: {line}")

    return found
282+
283+
def _find_core_with_coredumpctl(self, node):
    """
    Use coredumpctl to find core dumps on the given node.

    Runs coredumpctl on the node with the legend and pager disabled, then
    returns whatever self._output_has_core() reports for the lines produced.
    """
    command = "coredumpctl --no-legend --no-pager"
    _rc, core_lines = self._cm.rsh(node, command)

    return self._output_has_core(core_lines, node)
287+
288+
def _find_core_on_fs(self, node, paths):
    """
    Check for core dumps on the given node, under any of the given paths.

    Arguments:
    node  -- Name of the node to check
    paths -- List of paths to list on the node (shell globs are expanded
             by the remote shell)

    Return the result of self._output_has_core() on the matching "ls" lines,
    or False without running anything if paths is empty.
    """
    # With nothing to list, "ls -al" would fall back to the remote working
    # directory, which is not a location this audit was asked to inspect.
    if not paths:
        return False

    # The paths are deliberately left unquoted so that globs expand on the
    # node.  The grep pattern is quoted so the remote shell cannot expand it
    # as a glob against files in its working directory, which would change
    # the pattern or make grep read those files instead of the ls output.
    targets = " ".join(paths)
    (_, lsout) = self._cm.rsh(node, f"ls -al {targets} | grep 'core.[0-9]'",
                              verbose=1)
    return self._output_has_core(lsout, node)
293+
266294
def __call__(self):
267295
"""Perform the audit action."""
268-
result = True
296+
passed = True
269297

270298
self._cm.ns.wait_for_all_nodes(self._cm.env["nodes"])
271-
for node in self._cm.env["nodes"]:
272299

273-
(_, lsout) = self._cm.rsh(node, "ls -al /var/lib/pacemaker/cores/* | grep core.[0-9]", verbose=1)
274-
for line in lsout:
275-
line = line.strip()
276-
277-
if line not in self.known:
278-
result = False
279-
self.known.append(line)
280-
self._cm.log(f"Warning: Pacemaker core file on {node}: {line}")
300+
for node in self._cm.env["nodes"]:
301+
found = False
281302

282-
(_, lsout) = self._cm.rsh(node, "ls -al /var/lib/corosync | grep core.[0-9]", verbose=1)
283-
for line in lsout:
284-
line = line.strip()
303+
# If systemd is present, first see if coredumpctl logged any core dumps.
304+
if self._cm.env["have_systemd"]:
305+
found = self._find_core_with_coredumpctl(node)
306+
if found:
307+
passed = False
285308

286-
if line not in self.known:
287-
result = False
288-
self.known.append(line)
289-
self._cm.log(f"Warning: Corosync core file on {node}: {line}")
309+
# If we didn't find any core dumps, it's for one of three reasons:
310+
# (1) Nothing crashed
311+
# (2) systemd is not present
312+
# (3) systemd is present but coredumpctl is not enabled
313+
#
314+
# To handle the last two cases, check the other filesystem locations.
315+
if not found:
316+
found = self._find_core_on_fs(node, ["/var/lib/pacemaker/cores/*",
317+
"/var/lib/corosync"])
318+
if found:
319+
passed = False
290320

291321
if self._cm.expected_status.get(node) == "down":
292322
clean = False
293323
(_, lsout) = self._cm.rsh(node, "ls -al /dev/shm | grep qb-", verbose=1)
294324

295325
for line in lsout:
296-
result = False
326+
passed = False
297327
clean = True
298328
self._cm.log(f"Warning: Stale IPC file on {node}: {line}")
299329

@@ -308,7 +338,7 @@ def __call__(self):
308338
else:
309339
self._cm.debug(f"Skipping {node}")
310340

311-
return result
341+
return passed
312342

313343
def is_applicable(self):
314344
"""Return True if this audit is applicable in the current test configuration."""
@@ -504,17 +534,17 @@ def _setup(self):
504534

505535
def __call__(self):
506536
"""Perform the audit action."""
507-
result = True
537+
passed = True
508538

509539
if not self._setup():
510-
return result
540+
return passed
511541

512542
quorum = self._cm.has_quorum(None)
513543
for resource in self._resources:
514544
if resource.type == "primitive" and not self._audit_resource(resource, quorum):
515-
result = False
545+
passed = False
516546

517-
return result
547+
return passed
518548

519549
def is_applicable(self):
520550
"""Return True if this audit is applicable in the current test configuration."""
@@ -546,10 +576,10 @@ def __init__(self, cm):
546576
self.name = "GroupAudit"
547577

548578
def __call__(self):
549-
result = True
579+
passed = True
550580

551581
if not self._setup():
552-
return result
582+
return passed
553583

554584
for group in self._resources:
555585
if group.type != "group":
@@ -570,7 +600,7 @@ def __call__(self):
570600
first_match = False
571601

572602
if len(nodes) > 1:
573-
result = False
603+
passed = False
574604
self._cm.log(f"Child {child.id} of {group.id} is active more than once: {nodes!r}")
575605

576606
elif not nodes:
@@ -580,13 +610,13 @@ def __call__(self):
580610
self.debug(f"Child {child.id} of {group.id} is stopped")
581611

582612
elif nodes[0] != group_location:
583-
result = False
613+
passed = False
584614
self._cm.log(f"Child {child.id} of {group.id} is active on the wrong "
585615
f"node ({nodes[0]}) expected {group_location}")
586616
else:
587617
self.debug(f"Child {child.id} of {group.id} is active on {nodes[0]}")
588618

589-
return result
619+
return passed
590620

591621

592622
class CloneAudit(PrimitiveAudit):
@@ -607,10 +637,10 @@ def __init__(self, cm):
607637
self.name = "CloneAudit"
608638

609639
def __call__(self):
610-
result = True
640+
passed = True
611641

612642
if not self._setup():
613-
return result
643+
return passed
614644

615645
for clone in self._resources:
616646
if clone.type != "clone":
@@ -624,7 +654,7 @@ def __call__(self):
624654
# crm_resource -g clone_max --meta -r child.id
625655
# crm_resource -g clone_node_max --meta -r child.id
626656

627-
return result
657+
return passed
628658

629659

630660
class ColocationAudit(PrimitiveAudit):
@@ -661,10 +691,10 @@ def _crm_location(self, resource):
661691
return hosts
662692

663693
def __call__(self):
664-
result = True
694+
passed = True
665695

666696
if not self._setup():
667-
return result
697+
return passed
668698

669699
for coloc in self._constraints:
670700
if coloc.type != "rsc_colocation":
@@ -678,14 +708,14 @@ def __call__(self):
678708
else:
679709
for node in source:
680710
if node not in target:
681-
result = False
711+
passed = False
682712
self._cm.log(f"Colocation audit ({coloc.id}): {coloc.rsc} running "
683713
f"on {node} (not in {target!r})")
684714
else:
685715
self.debug(f"Colocation audit ({coloc.id}): {coloc.rsc} running "
686716
f"on {node} (in {target!r})")
687717

688-
return result
718+
return passed
689719

690720

691721
class ControllerStateAudit(ClusterAudit):
@@ -702,7 +732,7 @@ def __init__(self, cm):
702732
self.name = "ControllerStateAudit"
703733

704734
def __call__(self):
705-
result = True
735+
passed = True
706736
up_are_down = 0
707737
down_are_up = 0
708738
unstable_list = []
@@ -722,21 +752,21 @@ def __call__(self):
722752
up_are_down += 1
723753

724754
if len(unstable_list) > 0:
725-
result = False
755+
passed = False
726756
self._cm.log(f"Cluster is not stable: {len(unstable_list)} (of "
727757
f"{self._cm.upcount()}): {unstable_list!r}")
728758

729759
if up_are_down > 0:
730-
result = False
760+
passed = False
731761
self._cm.log(f"{up_are_down} (of {len(self._cm.env['nodes'])}) nodes "
732762
"expected to be up were down.")
733763

734764
if down_are_up > 0:
735-
result = False
765+
passed = False
736766
self._cm.log(f"{down_are_up} (of {len(self._cm.env['nodes'])}) nodes "
737767
"expected to be down were up.")
738768

739-
return result
769+
return passed
740770

741771
def is_applicable(self):
742772
"""Return True if this audit is applicable in the current test configuration."""
@@ -763,20 +793,20 @@ def __init__(self, cm):
763793
self.name = "CibAudit"
764794

765795
def __call__(self):
766-
result = True
796+
passed = True
767797
ccm_partitions = self._cm.find_partitions()
768798

769799
if not ccm_partitions:
770800
self.debug("\tNo partitions to audit")
771-
return result
801+
return passed
772802

773803
for partition in ccm_partitions:
774804
self.debug(f"\tAuditing CIB consistency for: {partition}")
775805

776806
if self._audit_cib_contents(partition) == 0:
777-
result = False
807+
passed = False
778808

779-
return result
809+
return passed
780810

781811
def _audit_cib_contents(self, hostlist):
782812
"""Perform the CIB audit on the given hosts."""
@@ -883,26 +913,26 @@ def __init__(self, cm):
883913
self._node_quorum = {}
884914

885915
def __call__(self):
886-
result = True
916+
passed = True
887917
ccm_partitions = self._cm.find_partitions()
888918

889919
if not ccm_partitions:
890-
return result
920+
return passed
891921

892922
self._cm.cluster_stable(double_check=True)
893923

894924
if len(ccm_partitions) != self._cm.partitions_expected:
895925
self._cm.log(f"ERROR: {len(ccm_partitions)} cluster partitions detected:")
896-
result = False
926+
passed = False
897927

898928
for partition in ccm_partitions:
899929
self._cm.log(f"\t {partition}")
900930

901931
for partition in ccm_partitions:
902932
if self._audit_partition(partition) == 0:
903-
result = False
933+
passed = False
904934

905-
return result
935+
return passed
906936

907937
def _trim_string(self, avalue):
908938
"""Remove the last character from a multi-character string."""

0 commit comments

Comments
 (0)