From 326e32268e8eda925e9954c80051c16422b19e83 Mon Sep 17 00:00:00 2001 From: Lucas RAVAGNIER Date: Wed, 3 Jun 2026 12:11:35 +0200 Subject: [PATCH] xapi: Improve error reporting when pool join fails on TLS verification When a host joins a pool (pool.join_force), the process has two phases: 1. An unverified TLS connection is used to run pre-join checks and exchange host certificates. The joiner imports the pool bundle. 2. A verified TLS connection (verifyPeer=yes, SNI=pool) is opened using the freshly-generated pool bundle. Previously, any failure at Phase 2 surfaced as: INTERNAL_ERROR(Stunnel.Stunnel_verify_error(...)) This error is opaque and gives no actionable information to the administrator. This patch adds dedicated API errors and checks around the certificate import (coordinator certificate present in the pool bundle, bundle non-empty after import) so that the specific cause is reported instead. Signed-off-by: Lucas RAVAGNIER --- ocaml/idl/datamodel_errors.ml | 19 +++++++++++++++++++ ocaml/xapi-consts/api_errors.ml | 6 ++++++ ocaml/xapi/cert_distrib.ml | 24 +++++++++++++++++++++++- ocaml/xapi/xapi_pool.ml | 22 ++++++++++++++++++++++ 4 files changed, 70 insertions(+), 1 deletion(-) diff --git a/ocaml/idl/datamodel_errors.ml b/ocaml/idl/datamodel_errors.ml index b334b00181a..5ad5ee3618e 100644 --- a/ocaml/idl/datamodel_errors.ml +++ b/ocaml/idl/datamodel_errors.ml @@ -904,6 +904,25 @@ let _ = "The host joining the pool must have one and only one IP on the \ clustering network" () ; + error Api_errors.pool_joining_host_tls_verification_mismatch [] + ~doc: + "The TLS verification check failed when the joining host attempted to \ + open a verified connection to the pool coordinator using the imported \ + pool certificate bundle." + () ; + error Api_errors.pool_joining_master_certificate_not_in_pool_bundle + ["master_uuid"] + ~doc: + "The pool coordinator's own certificate is absent from the pool \ + certificate bundle sent to the joining host. Run 'xe \ + pool-certificate-sync' on the coordinator and retry." 
+ () ; + error Api_errors.pool_joining_pool_bundle_empty_after_import ["bundle_path"] + ~doc: + "The pool certificate bundle is empty or missing after import on the \ + joining host. The bundle generation script (update-ca-bundle.sh) likely \ + failed silently." + () ; (* External directory service *) error Api_errors.subject_cannot_be_resolved [] diff --git a/ocaml/xapi-consts/api_errors.ml b/ocaml/xapi-consts/api_errors.ml index 026faadbbd6..270f7f05924 100644 --- a/ocaml/xapi-consts/api_errors.ml +++ b/ocaml/xapi-consts/api_errors.ml @@ -748,6 +748,12 @@ let pool_joining_host_has_network_sriovs = let pool_joining_host_tls_verification_mismatch = add_error "POOL_JOINING_HOST_TLS_VERIFICATION_MISMATCH" +let pool_joining_master_certificate_not_in_pool_bundle = + add_error "POOL_JOINING_MASTER_CERTIFICATE_NOT_IN_POOL_BUNDLE" + +let pool_joining_pool_bundle_empty_after_import = + add_error "POOL_JOINING_POOL_BUNDLE_EMPTY_AFTER_IMPORT" + let pool_joining_host_ca_certificates_conflict = add_error "POOL_JOINING_HOST_CA_CERTIFICATES_CONFLICT" diff --git a/ocaml/xapi/cert_distrib.ml b/ocaml/xapi/cert_distrib.ml index b5f9f923b29..4d3891dffb3 100644 --- a/ocaml/xapi/cert_distrib.ml +++ b/ocaml/xapi/cert_distrib.ml @@ -704,7 +704,29 @@ let exchange_certificates_with_joiner ~__context ~uuid ~certificate = let import_joining_pool_certs ~__context ~pool_certs = let pool_certs = List.map WireProtocol.certificate_file_of_pair pool_certs in Worker.local_write_cert_fs ~__context HostPoolCert Merge pool_certs ; - Worker.local_regen_bundle ~__context + Worker.local_regen_bundle ~__context ; + (* update-ca-bundle.sh can fail silently, leaving an empty bundle that would + cause an opaque Stunnel_verify_error when the verified connection is + opened in Phase 2 of the join. 
*) + let bundle_path = !Xapi_globs.pool_bundle_path in + let bundle_empty_or_missing = + match Unix.stat bundle_path with + | exception Unix.Unix_error (Unix.ENOENT, _, _) -> + true + | stats -> + stats.Unix.st_size = 0 + in + if bundle_empty_or_missing then ( + D.error + "import_joining_pool_certs: pool bundle '%s' is empty or missing after \ certificate import. The bundle generation script \ (/opt/xensource/bin/update-ca-bundle.sh) likely failed silently." + bundle_path ; + raise + Api_errors.( + Server_error (pool_joining_pool_bundle_empty_after_import, [bundle_path]) + ) + ) let collect_ca_certs ~__context ~names = Worker.local_collect_certs LegacyRootCert ~__context names diff --git a/ocaml/xapi/xapi_pool.ml b/ocaml/xapi/xapi_pool.ml index 8ac3b31a8d8..cdd1eec2173 100644 --- a/ocaml/xapi/xapi_pool.ml +++ b/ocaml/xapi/xapi_pool.ml @@ -1871,6 +1871,28 @@ let join_common ~__context ~master_address ~master_username ~master_password Client.Pool.exchange_certificates_on_join ~rpc:unverified_rpc ~session_id ~uuid:my_uuid ~certificate:my_certificate in + (* Verify the master included its own certificate in the pool bundle + before importing. If it is absent the verified connection in Phase 2 + will fail with an opaque Stunnel_verify_error. The filename convention + is "<uuid>.pem" (see Cert_distrib.HostPoolProvider). *) + let master_uuid = + Client.Host.get_uuid ~rpc:unverified_rpc ~session_id + ~self:(get_master ~rpc:unverified_rpc ~session_id) + in + let expected_cert_filename = master_uuid ^ ".pem" in + if not (List.mem_assoc expected_cert_filename pool_certs) then ( + error + "join_common: master certificate file '%s' is absent from the pool's \ certificate store (/etc/stunnel/certs-pool/). The pool bundle sent \ to the joiner does not contain the master's own certificate. Run \ 'xe pool-certificate-sync' on the master and retry." 
+ expected_cert_filename ; + raise + Api_errors.( + Server_error + (pool_joining_master_certificate_not_in_pool_bundle, [master_uuid]) + ) + ) ; Cert_distrib.import_joining_pool_certs ~__context ~pool_certs ) (fun () -> Client.Session.logout ~rpc:unverified_rpc ~session_id) ;