Skip to content

Commit 406f1ef

Browse files
committed
PG-1604: Improve last key LSN calculation logic
Previosly we simply set the LSN for the new key to the first write location. This is however not correct, as there are many corner cases around this: * recovery / replication might write old LSNs * we can't handle multiple keys with the same TLI/LSN, which can happen with quick restarts without writes To support this in this commit we modify the following: * We only activate new keys outside crash recovery, or immediately if encryption is turned off * We also take the already existing last key into account (if exists), and only activate a new key if we progressed past its start location The remaining changes are just support infrastructure for this: * Since we might rewrite old records, we use the already existing keys for those writes, not the active last keys * We prefetch existing keys during initialization, so it doesn't accidentally happen in the critical section during a write There is a remaining bug with stopping wal encryption, also mentioned in a TODO message in the code. This will be addressed in a later PR as this fix already took too long.
1 parent 937ac6f commit 406f1ef

File tree

8 files changed

+1398
-15
lines changed

8 files changed

+1398
-15
lines changed

contrib/pg_tde/meson.build

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,8 @@ tap_tests = [
128128
't/wal_archiving.pl',
129129
't/wal_encrypt.pl',
130130
't/wal_key_tli.pl',
131+
't/recovery/2pc_replication.pl',
132+
't/recovery/stream_rep.pl',
131133
]
132134

133135
tests += {

contrib/pg_tde/src/access/pg_tde_xlog_smgr.c

Lines changed: 92 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,16 @@ TDEXLogSmgrInitWrite(bool encrypt_xlog)
246246
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
247247
}
248248

249+
keys = pg_tde_get_wal_cache_keys();
250+
251+
if (keys == NULL)
252+
{
253+
WalLocation start = {.tli = 1,.lsn = 0};
254+
255+
/* cache is empty, prefetch keys from disk */
256+
pg_tde_fetch_wal_keys(start);
257+
}
258+
249259
if (key)
250260
pfree(key);
251261
}
@@ -263,6 +273,32 @@ TDEXLogSmgrInitWriteReuseKey()
263273
}
264274
}
265275

276+
/*
277+
* Encrypt XLog page(s) from the buf and write to the segment file.
278+
*/
279+
static ssize_t
280+
TDEXLogWriteEncryptedPagesOldKeys(int fd, const void *buf, size_t count, off_t offset,
281+
TimeLineID tli, XLogSegNo segno, int segSize)
282+
{
283+
char *enc_buff = EncryptionBuf;
284+
285+
#ifndef FRONTEND
286+
Assert(count <= TDEXLogEncryptBuffSize());
287+
#endif
288+
289+
/* Copy the data as-is, as we might have unencrypted parts */
290+
memcpy(enc_buff, buf, count);
291+
292+
/*
293+
* This method potentially allocates, but only in very early execution
294+
* Shouldn't happen in a write, where we are in a critical section
295+
*/
296+
TDEXLogCryptBuffer(buf, enc_buff, count, offset, tli, segno, segSize);
297+
298+
return pg_pwrite(fd, enc_buff, count, offset);
299+
}
300+
301+
266302
/*
267303
* Encrypt XLog page(s) from the buf and write to the segment file.
268304
*/
@@ -284,6 +320,7 @@ TDEXLogWriteEncryptedPages(int fd, const void *buf, size_t count, off_t offset,
284320
#endif
285321

286322
CalcXLogPageIVPrefix(tli, segno, key->base_iv, iv_prefix);
323+
287324
pg_tde_stream_crypt(iv_prefix,
288325
offset,
289326
(char *) buf,
@@ -299,26 +336,66 @@ static ssize_t
299336
tdeheap_xlog_seg_write(int fd, const void *buf, size_t count, off_t offset,
300337
TimeLineID tli, XLogSegNo segno, int segSize)
301338
{
339+
bool lastKeyUsable;
340+
bool afterLastKey;
341+
#ifdef FRONTEND
342+
bool crashRecovery = false;
343+
#else
344+
bool crashRecovery = GetRecoveryState() == RECOVERY_STATE_CRASH;
345+
#endif
346+
347+
WalLocation loc = {.tli = tli};
348+
WalLocation last_key_loc;
349+
350+
XLogSegNoOffsetToRecPtr(segno, offset, segSize, loc.lsn);
351+
302352
/*
303353
* Set the last (most recent) key's start LSN if not set.
304354
*
305355
* This func called with WALWriteLock held, so no need in any extra sync.
306356
*/
307-
if (EncryptionKey.type != WAL_KEY_TYPE_INVALID && TDEXLogGetEncKeyLsn() == 0)
308-
{
309-
WalLocation loc = {.tli = tli};
310357

311-
XLogSegNoOffsetToRecPtr(segno, offset, segSize, loc.lsn);
358+
last_key_loc.lsn = TDEXLogGetEncKeyLsn();
359+
pg_read_barrier();
360+
last_key_loc.tli = TDEXLogGetEncKeyTli();
361+
362+
lastKeyUsable = (TDEXLogGetEncKeyLsn() != 0);
363+
afterLastKey = wal_location_cmp(last_key_loc, loc) <= 0;
312364

313-
pg_tde_wal_last_key_set_location(loc);
314-
EncryptionKey.wal_start = loc;
315-
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
365+
Assert(EncryptionKey.type != WAL_KEY_TYPE_INVALID);
366+
367+
if (!lastKeyUsable)
368+
{
369+
WALKeyCacheRec *last_key = pg_tde_get_last_wal_key();
370+
371+
if (!crashRecovery || EncryptionKey.type == WAL_KEY_TYPE_UNENCRYPTED)
372+
{
373+
/*
374+
* TODO: the unencrypted case is still not perfect, we need to
375+
* report an error in some cornercases
376+
*/
377+
if (last_key == NULL || last_key->start.lsn < loc.lsn)
378+
{
379+
pg_tde_wal_last_key_set_location(loc);
380+
EncryptionKey.wal_start = loc;
381+
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
382+
lastKeyUsable = true;
383+
}
384+
}
316385
}
317386

318-
if (EncryptionKey.type == WAL_KEY_TYPE_ENCRYPTED)
387+
if ((!afterLastKey || !lastKeyUsable) && EncryptionKey.type == WAL_KEY_TYPE_ENCRYPTED)
388+
{
389+
return TDEXLogWriteEncryptedPagesOldKeys(fd, buf, count, offset, tli, segno, segSize);
390+
}
391+
else if (EncryptionKey.type == WAL_KEY_TYPE_ENCRYPTED)
392+
{
319393
return TDEXLogWriteEncryptedPages(fd, buf, count, offset, tli, segno);
394+
}
320395
else
396+
{
321397
return pg_pwrite(fd, buf, count, offset);
398+
}
322399
}
323400

324401
/*
@@ -340,7 +417,7 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
340417
if (readsz <= 0)
341418
return readsz;
342419

343-
TDEXLogCryptBuffer(buf, count, offset, tli, segno, segSize);
420+
TDEXLogCryptBuffer(buf, buf, count, offset, tli, segno, segSize);
344421

345422
return readsz;
346423
}
@@ -349,15 +426,15 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
349426
* [De]Crypt buffer if needed based on provided segment offset, number and TLI
350427
*/
351428
void
352-
TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
429+
TDEXLogCryptBuffer(const void *buf, void *out_buf, size_t count, off_t offset,
353430
TimeLineID tli, XLogSegNo segno, int segSize)
354431
{
355432
WALKeyCacheRec *keys = pg_tde_get_wal_cache_keys();
356433
XLogRecPtr write_key_lsn;
357434
WalLocation data_end = {.tli = tli};
358435
WalLocation data_start = {.tli = tli};
359436

360-
if (!keys)
437+
if (keys == NULL)
361438
{
362439
WalLocation start = {.tli = 1,.lsn = 0};
363440

@@ -454,6 +531,7 @@ TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
454531
XLogSegmentOffset(end_lsn, segSize);
455532
size_t dec_sz;
456533
char *dec_buf = (char *) buf + (dec_off - offset);
534+
char *o_buf = (char *) out_buf + (dec_off - offset);
457535

458536
Assert(dec_off >= offset);
459537

@@ -468,17 +546,19 @@ TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
468546
dec_end = offset + count;
469547
}
470548

549+
Assert(dec_end > dec_off);
471550
dec_sz = dec_end - dec_off;
472551

473552
#ifdef TDE_XLOG_DEBUG
474553
elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %u_%X/%X",
475554
dec_off, dec_off - offset, dec_sz, curr_key->key.wal_start.tli, LSN_FORMAT_ARGS(curr_key->key.wal_start.lsn));
476555
#endif
556+
477557
pg_tde_stream_crypt(iv_prefix,
478558
dec_off,
479559
dec_buf,
480560
dec_sz,
481-
dec_buf,
561+
o_buf,
482562
curr_key->key.key,
483563
&curr_key->crypt_ctx);
484564
}

contrib/pg_tde/src/include/access/pg_tde_xlog_smgr.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ extern void TDEXLogSmgrInit(void);
1313
extern void TDEXLogSmgrInitWrite(bool encrypt_xlog);
1414
extern void TDEXLogSmgrInitWriteReuseKey(void);
1515

16-
extern void TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
16+
extern void TDEXLogCryptBuffer(const void *buf, void *out_buf, size_t count, off_t offset,
1717
TimeLineID tli, XLogSegNo segno, int segSize);
1818

1919
#endif /* PG_TDE_XLOGSMGR_H */

0 commit comments

Comments
 (0)