@@ -381,6 +381,62 @@ fn generate_chrony_config(
381
381
args : ChronySetupArgs ,
382
382
log : & Logger ,
383
383
) -> anyhow:: Result < ( ) > {
384
+ // Rack Time Synchronisation
385
+ // -------------------------
386
+ //
387
+ // Within an Oxide rack, every sled in the cluster runs an NTP server zone.
388
+ // Two of these zones are nominated to be "Boundary NTP Servers" which
389
+ // means that they have external connectivity via boundary networking
390
+ // services and are configured with the NTP server address(es)/name(s)
391
+ // provided during RSS. The other zones are "Internal NTP Servers", do not
392
+ // have external connectivity, and synchronise with the boundary servers
393
+ // across the rack's underlay network.
394
+ //
395
+ // Every sled initially starts up with the notion that it is late December
396
+ // 1986, and there are a number of challenges in order to reach consensus
397
+ // around time in the rack, particularly in situations where one or more
398
+ // boundary servers lacks external connectivity, either at startup or
399
+ // later. A number of strategies are employed in the configurations
400
+ // below.
401
+ //
402
+ // - Each boundary server can authoritatively advertise time at stratum
403
+ // 10 based on its local clock and will do this when there are no
404
+ // "selectable" upstream servers. However, to avoid the situation
405
+ // where December 1986 is advertised with authority, they will not use
406
+ // this local source until the clock has been successfully
407
+ // synchronised to an upstream source at least once. In the event that
408
+ // a rack starts up with no external NTP connectivity everything
409
+ // stops, waiting for time synchronisation to occur (that is, for the
410
+ // networking issue to be resolved).
411
+ //
412
+ // - Each boundary server has its upstream sources configured with:
413
+ // - maximum poll interval 2^5 (32 seconds). When a time source is
414
+ // considered trustworthy and relatively stable over time, the rate
415
+ // at which it is queried is reduced. We set a ceiling on the
416
+ // polling interval so that we can still react relatively quickly to
417
+ // events such as loss of external connectivity. Note that if
418
+ // an update fails, the poll interval will rapidly decrease back
419
+ // down towards one second.
420
+ // - maximum number of retained samples is 8. This sets an upper limit
421
+ // on the number of samples so that trust degrades more quickly in
422
+ // the event the source is not contactable.
423
+ // - The "failfast" flag causes the source to be immediately marked as
424
+ // "unselectable" if it has not been contactable for several
425
+ // consecutive attempts. Without this flag, the source would remain
426
+ // selected and its root dispersion (and therefore its distance)
427
+ // would increase fairly slowly. The source would become
428
+ // unselectable after around an hour given the rest of the
429
+ // configuration, which is far too slow.
430
+ //
431
+ // - The boundary servers include each other in their list of sources.
432
+ // While they will see themselves in their source list, they will
433
+ // automatically discount that to prevent a loop. Due to the "orphan"
434
+ // tag on the local source mentioned earlier, when both boundary
435
+ // servers fall back to their local clock source, the one with the
436
+ // lowest reference ID will be preferred, protecting against a split
437
+ // brain scenario when neither server has upstream connectivity and
438
+ // both are advertising their local clock with authority.
439
+
384
440
let internal_ntp_tpl = "#
385
441
# Configuration file for an internal NTP server - one which communicates with
386
442
# boundary NTP servers within the rack.
@@ -432,18 +488,25 @@ allow @ALLOW@
432
488
# appears synchronised even if there are currently no active upstreams. When
433
489
# in this mode, we report as stratum 10 to clients. The `distance' parameter
434
490
# controls when we will decide to abandon the upstreams and switch to the local
435
- # reference. By setting `activate`, we prevent the server from ever activating
436
- # its local reference until it has synchronised with upstream at least once and
437
- # the root distance has dropped below the provided threshold. This prevents
438
- # a boundary server in a cold booted rack from authoritatively advertising a
439
- # time from the 1980s prior to gaining external connectivity.
491
+ # reference, although this is largely redundant due to the upstream sources
492
+ # being flagged as 'failfast'. By setting `activate`, we prevent the server
493
+ # from ever activating its local reference until it has synchronised with
494
+ # upstream at least once and the root distance has dropped below the
495
+ # provided threshold. This prevents a boundary server in a cold booted rack
496
+ # from authoritatively advertising a time from the 1980s prior to gaining
497
+ # external connectivity.
440
498
#
441
499
# distance: Distance from root above which we use the local reference, opting
442
500
# to ignore the upstream.
443
501
# activate: Distance from root below which we must fall once to ever consider
444
502
# the local reference.
503
+ # orphan: This option enables orphan mode, where sources with the same
504
+ # stratum as our local are ignored unless no other source is
505
+ # selectable and their reference IDs are smaller than ours. This
506
+ # protects against a split brain situation when neither boundary
507
+ # server has connectivity.
445
508
#
446
- local stratum 10 distance 0.4 activate 0.5
509
+ local stratum 10 orphan distance 0.4 activate 0.5
447
510
448
511
# makestep <threshold> <limit>
449
512
# We allow chrony to step the system clock during the first three time updates
@@ -454,6 +517,13 @@ makestep 0.1 3
454
517
leapsecmode slew
455
518
maxslewrate 2708.333
456
519
520
+ # Refresh boundary NTP servers every two minutes instead of every two weeks
521
+ refresh 120
522
+
523
+ # When a source is unreachable, increase its dispersion by 60 microseconds/s
524
+ # instead of the default of 1.
525
+ maxclockerror 60
526
+
457
527
" ;
458
528
459
529
let ChronySetupArgs {
@@ -481,17 +551,17 @@ maxslewrate 2708.333
481
551
for s in servers {
482
552
writeln ! (
483
553
& mut new_config,
484
- "pool {s} iburst maxdelay 0.1 minpoll 0 maxpoll 3 maxsources 16"
554
+ "pool {s} iburst maxdelay 0.1 maxsources 16 \
555
+ minpoll 0 maxpoll 5 maxsamples 8 failfast"
485
556
)
486
557
. expect ( "write to String is infallible" ) ;
487
558
}
488
- } else {
489
- writeln ! (
490
- & mut new_config,
491
- "pool {boundary_pool} iburst maxdelay 0.1 maxsources 16" ,
492
- )
493
- . expect ( "write to String is infallible" ) ;
494
559
}
560
+ writeln ! (
561
+ & mut new_config,
562
+ "pool {boundary_pool} iburst maxdelay 0.1 maxsources 16" ,
563
+ )
564
+ . expect ( "write to String is infallible" ) ;
495
565
496
566
// We read the contents from the old configuration file if it existed
497
567
// so that we can verify if it changed.
0 commit comments