Skip to content

Commit 5335b8f

Browse files
committed
Low: daemons: Fix removing transient attrs on rolling upgrades.
The previous patches to fix race conditions in removing transient attributes rely on this sequence of events: * First, attrd notices that a peer has left the cluster at the corosync level. This causes it to set the values of all transient attributes to NULL. * Second, based updates the CIB to remove transient attributes, which triggers a callback in attrd. This callback removes all attributes whose value is NULL. This seems to work reliably in same-version clusters. However, in mixed version clusters (like if you're doing a rolling upgrade), this doesn't always happen. In fact in my testing, almost every time, the order of those two operations is reversed. What's happening here is that in the mixed version cluster, the DC (which will still be the old, un-upgraded cluster version) still has a controld that deletes transient attributes. It will be sending out requests that based performs this change to the CIB, which conflicts with the entire point of these patches - that controld is no longer in the business of deleting transient attributes. I think there's no way around this problem. No matter how you update cluster nodes, you'll always have a DC that's the old version with an old controld. If you take the existing DC out of the cluster to update it, another existing node will become the DC and it will be the problem node. The fix here is to bump the attrd protocol version so we know if a node has been updated or not. This is probably reasonable to do for the change in the first place. Then, attrd can inspect the protocol version of the peer that is telling it to remove a transient attribute. If the peer version is old enough, attrd can simply remove the transient attribute immediately instead of doing the NULL-then-delete process.
1 parent 0dac64b commit 5335b8f

File tree

4 files changed

+48
-12
lines changed

4 files changed

+48
-12
lines changed

daemons/attrd/attrd_attributes.c

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -312,16 +312,23 @@ attrd_for_cib(const attribute_t *a)
312312
* combination
313313
*/
314314
static void
315-
drop_removed_values(const char *cib_id, const char *set_type,
315+
drop_removed_values(const char *cib_id, const char *set_type, int peer_attrd_ver,
316316
bool (*func)(const attribute_t *, const char *,
317317
const char *))
318318
{
319+
int our_attrd_ver;
320+
bool drop_immediately = false;
319321
attribute_t *a = NULL;
320322
GHashTableIter attr_iter;
321323
const char *entry_type = pcmk__s(set_type, "status entry"); // for log
322324

323325
CRM_CHECK((cib_id != NULL) && (func != NULL), return);
324326

327+
pcmk__scan_min_int(ATTRD_PROTOCOL_VERSION, &our_attrd_ver, -1);
328+
drop_immediately = (peer_attrd_ver != -1)
329+
&& ATTRD_SUPPORTS_CLEARING_CIB(our_attrd_ver)
330+
&& !ATTRD_SUPPORTS_CLEARING_CIB(peer_attrd_ver);
331+
325332
// Check every attribute ...
326333
g_hash_table_iter_init(&attr_iter, attributes);
327334
while (g_hash_table_iter_next(&attr_iter, NULL, (gpointer *) &a)) {
@@ -338,7 +345,7 @@ drop_removed_values(const char *cib_id, const char *set_type,
338345
while (g_hash_table_iter_next(&value_iter, NULL, (gpointer *) &v)) {
339346
const char *id = NULL;
340347

341-
if (v->current != NULL) {
348+
if ((v->current != NULL) && !drop_immediately) {
342349
continue;
343350
}
344351

@@ -355,11 +362,17 @@ drop_removed_values(const char *cib_id, const char *set_type,
355362
}
356363

357364
if (!func(a, id, cib_id)) {
365+
crm_trace("%s != %s", id, cib_id);
358366
continue;
359367
}
360368

361-
crm_debug("Dropping %s[%s] after CIB erasure of %s %s",
362-
a->id, v->nodename, entry_type, cib_id);
369+
if (drop_immediately) {
370+
crm_debug("Dropping %s[%s] immediately", a->id, v->nodename);
371+
} else {
372+
crm_debug("Dropping %s[%s] after CIB erasure of %s %s",
373+
a->id, v->nodename, entry_type, cib_id);
374+
}
375+
363376
g_hash_table_iter_remove(&value_iter);
364377
}
365378
}
@@ -394,7 +407,7 @@ nvpair_matches(const attribute_t *a, const char *xml_id, const char *cib_id)
394407
void
395408
attrd_drop_removed_value(const char *set_type, const char *cib_id)
396409
{
397-
drop_removed_values(cib_id, set_type, nvpair_matches);
410+
drop_removed_values(cib_id, set_type, -1, nvpair_matches);
398411
}
399412

400413
/*!
@@ -431,7 +444,7 @@ set_id_matches(const attribute_t *a, const char *xml_id, const char *cib_id)
431444
void
432445
attrd_drop_removed_set(const char *set_type, const char *cib_id)
433446
{
434-
drop_removed_values(cib_id, set_type, set_id_matches);
447+
drop_removed_values(cib_id, set_type, -1, set_id_matches);
435448
}
436449

437450
/*!
@@ -457,7 +470,7 @@ node_matches(const attribute_t *a, const char *xml_id, const char *cib_id)
457470
* \param[in] cib_id ID of node state that was removed from CIB
458471
*/
459472
void
460-
attrd_drop_removed_values(const char *cib_id)
473+
attrd_drop_removed_values(const char *cib_id, int peer_attrd_ver)
461474
{
462-
drop_removed_values(cib_id, NULL, node_matches);
475+
drop_removed_values(cib_id, NULL, peer_attrd_ver, node_matches);
463476
}

daemons/attrd/attrd_cib.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ drop_values_in_deletion(xmlNode *xml, void *data)
113113
attrd_drop_removed_set(set_type, set_id);
114114

115115
} else if (!pcmk__str_empty(node_id)) {
116-
attrd_drop_removed_values(node_id);
116+
attrd_drop_removed_values(node_id, *(int *) data);
117117
}
118118

119119
done:
@@ -149,6 +149,9 @@ attrd_cib_updated_cb(const char *event, xmlNode *msg)
149149

150150
client_name = pcmk__xe_get(msg, PCMK__XA_CIB_CLIENTNAME);
151151
if (!cib__client_triggers_refresh(client_name)) {
152+
const char *peer = pcmk__xe_get(msg, PCMK__XA_SRC);
153+
int peer_attrd_ver = attrd_get_peer_protocol_ver(peer);
154+
152155
/* This change came from a source that ensured the CIB is consistent
153156
* with our attributes table, so we don't need to write anything out.
154157
* If a removed attribute has been erased, we can forget it now.
@@ -165,7 +168,7 @@ attrd_cib_updated_cb(const char *event, xmlNode *msg)
165168
* function signature.
166169
*/
167170
pcmk__xe_foreach_child((xmlNode *) patchset, PCMK_XE_CHANGE,
168-
drop_values_in_deletion, NULL);
171+
drop_values_in_deletion, &peer_attrd_ver);
169172
return;
170173
}
171174

daemons/attrd/attrd_utils.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,23 @@ attrd_remove_peer_protocol_ver(const char *host)
238238
}
239239
}
240240

241+
int
242+
attrd_get_peer_protocol_ver(const char *host)
243+
{
244+
gpointer key;
245+
246+
if (peer_protocol_vers == NULL) {
247+
return -1;
248+
}
249+
250+
key = g_hash_table_lookup(peer_protocol_vers, host);
251+
if (key == NULL) {
252+
return -1;
253+
}
254+
255+
return GPOINTER_TO_INT(key);
256+
}
257+
241258
/*!
242259
* \internal
243260
* \brief When a peer node broadcasts a message with its protocol version, keep

daemons/attrd/pacemaker-attrd.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,13 @@
4444
* 5 2.1.5 Peers can request confirmation of a sent message
4545
* 6 2.1.7 PCMK__ATTRD_CMD_PEER_REMOVE supports PCMK__XA_REAP
4646
* 7 3.0.0 "flush" support dropped
47+
* 8 3.0.2 attrd clears transient attributes from the CIB as well
4748
*/
48-
#define ATTRD_PROTOCOL_VERSION "7"
49+
#define ATTRD_PROTOCOL_VERSION "8"
4950

5051
#define ATTRD_SUPPORTS_MULTI_MESSAGE(x) ((x) >= 4)
5152
#define ATTRD_SUPPORTS_CONFIRMATION(x) ((x) >= 5)
53+
#define ATTRD_SUPPORTS_CLEARING_CIB(x) ((x) >= 8)
5254

5355
#define attrd_send_ack(client, id, flags) \
5456
pcmk__ipc_send_ack((client), (id), (flags), PCMK__XE_ACK, \
@@ -217,7 +219,7 @@ char *attrd_nvpair_id(const attribute_t *attr, const char *node_state_id);
217219
bool attrd_for_cib(const attribute_t *a);
218220
void attrd_drop_removed_value(const char *set_type, const char *cib_id);
219221
void attrd_drop_removed_set(const char *set_type, const char *cib_id);
220-
void attrd_drop_removed_values(const char *cib_id);
222+
void attrd_drop_removed_values(const char *cib_id, int peer_attrd_ver);
221223

222224
enum attrd_write_options {
223225
attrd_write_changed = 0,
@@ -231,6 +233,7 @@ void attrd_write_or_elect_attribute(attribute_t *a);
231233
extern int minimum_protocol_version;
232234
void attrd_remove_peer_protocol_ver(const char *host);
233235
void attrd_update_minimum_protocol_ver(const char *host, const char *value);
236+
int attrd_get_peer_protocol_ver(const char *host);
234237

235238
mainloop_timer_t *attrd_add_timer(const char *id, int timeout_ms, attribute_t *attr);
236239

0 commit comments

Comments
 (0)