From 77c1aa9c9ae35d3ddad1938a5fbb9f8da7db8919 Mon Sep 17 00:00:00 2001 From: Peter Pitterling Date: Fri, 11 Nov 2022 09:11:19 +0100 Subject: [PATCH 01/60] saphana_validate | sht_validate - get rid of external grep calls --- ra/SAPHana | 4 ++-- ra/SAPHanaTopology | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ra/SAPHana b/ra/SAPHana index cec1753d..600ee295 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -1598,7 +1598,7 @@ function saphana_validate() { # # SID is Alpha-AlphaNumeric-Alphanumeric? # - if [ $(echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$') -ne 1 ] + if [[ ! "$SID" =~ ^[A-Z][A-Z0-9][A-Z0-9]$ ]] then super_ocf_log err "ACT: Parsing instance profile name: '$SID' is not a valid SID!" rc=$OCF_ERR_ARGS @@ -1606,7 +1606,7 @@ function saphana_validate() { # # InstanceNr is a two-Digit? # - if [ $(echo "$InstanceNr" | grep -c '^[0-9][0-9]$') -ne 1 ] + if [[ ! "$InstanceNr" =~ ^[0-9][0-9]$ ]] then super_ocf_log err "ACT: Parsing instance profile name: '$InstanceNr' is not a valid instance number!" rc=$OCF_ERR_ARGS diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index f3f4c098..6d4d5985 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -893,13 +893,13 @@ function sht_status() { function sht_validate() { super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)" local rc=$OCF_SUCCESS - if [ $(echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$') -ne 1 ] + if [[ ! "$SID" =~ ^[A-Z][A-Z0-9][A-Z0-9]$ ]] then super_ocf_log err "ACT: Parsing instance profile name: '$SID' is not a valid SID!" rc=$OCF_ERR_ARGS fi - if [ $(echo "$InstanceNr" | grep -c '^[0-9][0-9]$') -ne 1 ] + if [[ ! "$InstanceNr" =~ ^[0-9][0-9]$ ]] then super_ocf_log err "ACT: Parsing instance profile name: '$InstanceNr' is not a valid instance number!" rc=$OCF_ERR_ARGS From 0f0bac39ff1bb10174a9d48441202991fa45dfc7 Mon Sep 17 00:00:00 2001 From: Peter Pitterling Date: Fri, 11 Nov 2022 16:31:01 +0100 Subject: [PATCH 02/60] convert SID to sid - get rid of external call add sid to GLOBALS - was anyway used like a GLOBAL variable --- ra/SAPHana | 7 ++++--- ra/SAPHanaTopology | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/ra/SAPHana b/ra/SAPHana index cec1753d..afca9989 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -798,7 +798,7 @@ function saphana_init() { export SAPSYSTEMNAME=$SID HANA_CALL_TIMEOUT="${OCF_RESKEY_HANA_CALL_TIMEOUT:-60}" super_ocf_log debug "DBG: Used new method to get SID ($SID) and InstanceNr ($InstanceNr)" - sid=$(echo "$SID" | tr [:upper:] [:lower:]) + sid="$SID" sidadm="${sid}adm" # TODO PRIO3: Do we need a parameter for the RA to be able to adjust hdbSrQueryTimeout? 
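# Illustration only, not part of the diff above: patch 09 further below replaces this
# plain assignment with the bash 4 case-conversion expansion. A minimal sketch of that
# expansion, assuming SID="HA1" as an example value:
#
#   SID="HA1"
#   sid="${SID,,}"        # lowercased by the shell itself -> "ha1", no external tr call
#   sidadm="${sid}adm"    # -> "ha1adm"
#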
hdbSrQueryTimeout=180 @@ -2863,8 +2863,9 @@ function saphana_demote_clone() { # ## GLOBALS -SID="" -sidadm="" +SID='' +declare -l sid='' +sidadm='' InstanceName="" InstanceNr="" SAPVIRHOST="" diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index f3f4c098..19c7962b 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -492,7 +492,7 @@ function sht_init() { InstanceName="HDB${InstanceNr}" HANA_CALL_TIMEOUT="${OCF_RESKEY_HANA_CALL_TIMEOUT:-60}" super_ocf_log debug "DBG2: Used new method to get SID ($SID) and InstanceNr ($InstanceNr)" - sid=$(echo "$SID" | tr [:upper:] [:lower:]) + sid="$SID" sidadm="${sid}adm" ocf_env=$(env | grep 'OCF_RESKEY_CRM') super_ocf_log debug "DBG3: OCF: $ocf_env" @@ -1165,8 +1165,9 @@ function sht_notify() { # ## GLOBALS -SID="" -sidadm="" +SID='' +declare -l sid='' +sidadm='' InstanceName="" InstanceNr="" DIR_EXECUTABLE="" From de762829afc6b6d6088bdf80484be3e0026f63a2 Mon Sep 17 00:00:00 2001 From: Peter Pitterling Date: Sat, 12 Nov 2022 18:47:30 +0100 Subject: [PATCH 03/60] scoring_crm_master - remove skip variable and use break; avoid unnecessary further looping --- ra/SAPHana | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ra/SAPHana b/ra/SAPHana index cec1753d..12910c50 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -588,19 +588,16 @@ scoring_crm_master() super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)" local roles="$1" local sync="$2" - local skip=0 local myScore="" for scan in "${SCORING_TABLE[@]}"; do - if [ $skip -eq 0 ]; then read rolePatt syncPatt score <<< $scan if grep "$rolePatt" <<< "$roles"; then if grep "$syncPatt" <<< "$sync"; then super_ocf_log info "DEC: scoring_crm_master: roles($roles) are matching pattern ($rolePatt)" super_ocf_log info "DEC: scoring_crm_master: sync($sync) is matching syncPattern ($syncPatt)" super_ocf_log info "DEC: scoring_crm_master: set score $score" - skip=1 myScore=$score - fi + break fi fi done From 7a96d8c8881628c7d5dc64062790b12882805a02 Mon Sep 17 00:00:00 2001 From: Peter Pitterling Date: Sat, 12 Nov 2022 18:49:29 +0100 Subject: [PATCH 04/60] scoring_crm_master - fix block indentation after removing skip-if --- ra/SAPHana | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ra/SAPHana b/ra/SAPHana index 12910c50..98e1368f 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -588,15 +588,15 @@ scoring_crm_master() super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)" local roles="$1" local sync="$2" - local myScore="" + local myScore='' for scan in "${SCORING_TABLE[@]}"; do - read rolePatt syncPatt score <<< $scan if grep "$rolePatt" <<< "$roles"; then if grep "$syncPatt" <<< "$sync"; then - super_ocf_log info "DEC: scoring_crm_master: roles($roles) are matching pattern ($rolePatt)" - super_ocf_log info "DEC: scoring_crm_master: sync($sync) is matching syncPattern ($syncPatt)" - super_ocf_log info "DEC: scoring_crm_master: set score $score" - myScore=$score + read rolePatt syncPatt score <<< $scan + super_ocf_log info "DEC: scoring_crm_master: roles($roles) are matching pattern ($rolePatt)" + super_ocf_log info "DEC: scoring_crm_master: sync($sync) is matching syncPattern ($syncPatt)" + super_ocf_log info "DEC: scoring_crm_master: set score $score" + myScore=$score break fi fi From c74aa559ddde8b3177af753a97abdf0ea4717636 Mon Sep 17 00:00:00 2001 From: Peter Pitterling Date: Sat, 12 Nov 2022 18:50:27 +0100 Subject: [PATCH 05/60] scoring_crm_master - get rid of external grep and use bash regex (grep was anyway missing -q option) --- ra/SAPHana | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/ra/SAPHana b/ra/SAPHana index 98e1368f..b2e088fd 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -590,9 +590,9 @@ scoring_crm_master() local sync="$2" local myScore='' for scan in "${SCORING_TABLE[@]}"; do - if grep "$rolePatt" <<< "$roles"; then - if grep "$syncPatt" <<< "$sync"; then read rolePatt syncPatt score <<< $scan + if [[ "$roles" =~ $rolePatt ]] ; then + if [[ "$sync" =~ $syncPatt ]] ; then super_ocf_log info "DEC: scoring_crm_master: roles($roles) are matching pattern ($rolePatt)" super_ocf_log info "DEC: scoring_crm_master: sync($sync) is matching syncPattern ($syncPatt)" super_ocf_log info "DEC: scoring_crm_master: set score $score" From f55a06707927994a561ba315fb56eebcea10a9f2 Mon Sep 17 00:00:00 2001 From: Peter Pitterling Date: Tue, 15 Nov 2022 13:15:53 +0100 Subject: [PATCH 06/60] analyze_hana_sync_statusSRS - get rid of external grep - used bash regex --- ra/SAPHana | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ra/SAPHana b/ra/SAPHana index cec1753d..a0a3077c 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -1268,7 +1268,7 @@ function check_for_primary() { # function: analyze_hana_sync_statusSRS # params: - -# globals: DIR_EXECUTABLE(r), FULL_SR_STATUS(w), remoteNode +# globals: DIR_EXECUTABLE(r), FULL_SR_STATUS(w), remoteNode, remSR_name # # systemReplicationStatus.py return-codes: # NoHSR = 10 @@ -1290,7 +1290,7 @@ function analyze_hana_sync_statusSRS() # TODO: Limit the runtime of systemReplicationStatus.py # SAP_CALL FULL_SR_STATUS=$(HANA_CALL --timeout 5 --cmd "python systemReplicationStatus.py $siteParam" 2>/dev/null); srRc=$? - super_ocf_log info "DEC $FUNCNAME systemReplicationStatus.py (to site '$remSR_name')-> $srRc" + super_ocf_log info "DEC ${FUNCNAME[0]} systemReplicationStatus.py (to site '$remSR_name')-> $srRc" super_ocf_log info "FLOW ${FUNCNAME[0]} systemReplicationStatus.py (to site '$remSR_name')-> $srRc" # # TODO: PRIO2: Here we might also need to filter additional sites (if multi tier should be supported) @@ -1298,7 +1298,7 @@ function analyze_hana_sync_statusSRS() # if [ $srRc -eq 15 ]; then # Fix for a HANA BUG, where a non-working SR resulted in RC 15: - if grep -q "ACTIVE" <<< "$FULL_SR_STATUS"; then + if [[ $FULL_SR_STATUS == *ACTIVE* ]]; then super_ocf_log info "FLOW ${FUNCNAME[0]} SOK" set_hana_attribute "$remoteNode" "SOK" "${ATTR_NAME_HANA_SYNC_STATUS[@]}" super_ocf_log info "ACT site=$sr_name, setting SOK for secondary (1)" From d2eee5c612191e3f20238aad1282aa49643573a3 Mon Sep 17 00:00:00 2001 From: Peter Pitterling Date: Fri, 25 Nov 2022 14:52:45 +0100 Subject: [PATCH 07/60] Runtime - get rid of external calls to date and use bash internal SECONDS --- ra/SAPHana | 4 ++-- ra/SAPHanaTopology | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ra/SAPHana b/ra/SAPHana index dcce2370..9c691274 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -44,7 +44,7 @@ SAPHanaVersion="0.162.0" # # Initialization: -timeB=$(date '+%s') +timeB=${SECONDS} : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs @@ -2954,7 +2954,7 @@ case "$ACTION" in ra_rc=$OCF_ERR_UNIMPLEMENTED ;; esac -timeE=$(date '+%s') +timeE=${SECONDS} (( timeR = timeE - timeB )) super_ocf_log debug "DBG: ==== SAPHanaFilter=$SAPHanaFilter" super_ocf_log info "RA ==== end action $ACTION$CLACT with rc=${ra_rc} ($SAPHanaVersion) (${timeR}s)====" diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index f3f4c098..0316932c 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -30,7 +30,7 @@ SAPHanaTopologyVersion="0.162.0" # # Initialization: -timeB=$(date '+%s') +timeB=${SECONDS} : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs @@ -1247,7 +1247,7 @@ case "$ACTION" in ra_rc=$OCF_ERR_UNIMPLEMENTED ;; esac -timeE=$(date '+%s') +timeE=${SECONDS} (( timeR = timeE - timeB )) super_ocf_log info "RA ==== end action $ACTION$CLACT with rc=${ra_rc} ($SAPHanaTopologyVersion) (${timeR}s)====" exit ${ra_rc} From 705ed00e82d0c8c427be23f32b5084064c6ccd11 Mon Sep 17 00:00:00 2001 From: Peter Pitterling Date: Fri, 16 Dec 2022 18:13:43 +0100 Subject: [PATCH 08/60] add missing FLOW messages returning from functions --- ra/SAPHana | 9 +++++++++ ra/SAPHanaTopology | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/ra/SAPHana b/ra/SAPHana index 91bc8379..0b4244f1 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -292,6 +292,7 @@ The resource agent uses the following four interfaces provided by SAP: END +super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc } @@ -307,6 +308,7 @@ function saphana_methods() { for m in start stop status monitor promote demote notify validate-all methods meta-data usage reload; do echo "$m" done + super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc } @@ -575,6 +577,7 @@ function set_crm_master() super_ocf_log debug "DBG: LET crm master: $score" rc=0 fi + super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc } @@ -621,6 +624,8 @@ function get_crm_master() super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)" local rc=0 ${HA_SBIN_DIR}/crm_master -G -q -l reboot; rc=$? + + super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc } @@ -1065,6 +1070,7 @@ function chk4systemdsupport() { rc=1 fi fi + super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc } @@ -1207,6 +1213,7 @@ function check_sapstartsrv() { fi fi fi + super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc } @@ -1459,6 +1466,7 @@ function get_hana_landscape_status() rc=0 fi fi + super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc; } @@ -2280,6 +2288,7 @@ function saphana_stop_clone() { lpa_set_lpt 10 "$NODENAME" fi saphana_stop; rc="$?" + super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return "$rc" } diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index f3f4c098..444eb8c0 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -198,6 +198,7 @@ SAPHanaTopology scans the output table of landscapeHostConfiguration.py to ident END +super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc } @@ -715,6 +716,7 @@ function chk4systemdsupport() { rc=1 fi fi + super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc } @@ -880,6 +882,8 @@ function sht_status() { local rc=0 sht_monitor; rc=$? + + super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc } @@ -919,6 +923,8 @@ function sht_start_clone() { super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)" local rc=$OCF_NOT_RUNNING sht_start; rc=$? 
+ + super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc } @@ -990,6 +996,8 @@ function sht_stop_clone() { # timeout of landscapeHostConfiguration.py - let stop fail rc=$OCF_ERR_GENERIC fi + + super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc } From 2b1f367d609afd8e06a31ac30b1d1afc764f5584 Mon Sep 17 00:00:00 2001 From: Peter Pitterling Date: Wed, 1 Feb 2023 15:06:30 +0100 Subject: [PATCH 09/60] use variable substitution instead of declare lower - both are only available with bash4 --- ra/SAPHana | 4 ++-- ra/SAPHanaTopology | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ra/SAPHana b/ra/SAPHana index afca9989..8f0793a1 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -798,7 +798,7 @@ function saphana_init() { export SAPSYSTEMNAME=$SID HANA_CALL_TIMEOUT="${OCF_RESKEY_HANA_CALL_TIMEOUT:-60}" super_ocf_log debug "DBG: Used new method to get SID ($SID) and InstanceNr ($InstanceNr)" - sid="$SID" + sid="${SID,,}" sidadm="${sid}adm" # TODO PRIO3: Do we need a parameter for the RA to be able to adjust hdbSrQueryTimeout? hdbSrQueryTimeout=180 @@ -2864,7 +2864,7 @@ function saphana_demote_clone() { ## GLOBALS SID='' -declare -l sid='' +sid='' sidadm='' InstanceName="" InstanceNr="" diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index 19c7962b..34de6096 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -492,7 +492,7 @@ function sht_init() { InstanceName="HDB${InstanceNr}" HANA_CALL_TIMEOUT="${OCF_RESKEY_HANA_CALL_TIMEOUT:-60}" super_ocf_log debug "DBG2: Used new method to get SID ($SID) and InstanceNr ($InstanceNr)" - sid="$SID" + sid="${SID,,}" sidadm="${sid}adm" ocf_env=$(env | grep 'OCF_RESKEY_CRM') super_ocf_log debug "DBG3: OCF: $ocf_env" @@ -1166,7 +1166,7 @@ function sht_notify() { ## GLOBALS SID='' -declare -l sid='' +sid='' sidadm='' InstanceName="" InstanceNr="" From c15d48b9b4c0fba9cfda208d7bfa1652eb085917 Mon Sep 17 00:00:00 2001 From: Emiliano Langella Date: Mon, 3 Apr 2023 17:19:10 +0200 Subject: [PATCH 10/60] Update SAPHanaSR_maintenance_examples.7 Corrected some typos. --- man/SAPHanaSR_maintenance_examples.7 | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/man/SAPHanaSR_maintenance_examples.7 b/man/SAPHanaSR_maintenance_examples.7 index ce673a5b..2534348b 100644 --- a/man/SAPHanaSR_maintenance_examples.7 +++ b/man/SAPHanaSR_maintenance_examples.7 @@ -38,7 +38,7 @@ REQUIREMENTS below. .PP \fB*\fR Watch status of HANA cluster resources and system replication. -This might be conveniant when performing administrative actions or cluster tests. It does not replace the afore mentioned checks. See also cs_show_saphanasr_status(8). +This might be convenient when performing administrative actions or cluster tests. It does not replace the afore mentioned checks. See also cs_show_saphanasr_status(8). .PP .RS 4 # watch -n 9 "(crm_mon -1r;SAPHanaSR-showAttr;cs_clusterstate -i)|egrep -v'(^$|configured|###)'" @@ -84,9 +84,9 @@ The status of HANA databases, system replication and Linux cluster has to be checked. The SAP HANA resources are set into maintenance, an sr_takeover is performed, the old primary is registered as new secondary. -Therefor the correct secondary site name has to be used, see later example. +Therefore the correct secondary site name has to be used, see later example. Finally the SAP HANA resources are given back to the Linux cluster. -See also section REQUIREMENTS below and later example on determing the correct site name. 
+See also section REQUIREMENTS below and later example on determining the correct site name. .PP .RS 2 1. On either node @@ -254,7 +254,7 @@ See also section REQUIREMENTS below. # crm configure property maintenance-mode=false .RE .RS 2 -6. Cleanup the SAPHanaController m/s resource. +6. Clean up the SAPHanaController m/s resource. .RE .RS 4 # crm resource cleanup node @@ -362,8 +362,8 @@ HANA gets fully stopped. This procedure can be used to update HANA, OS or hardware. HANA roles and resource status remains unchanged. -It is neccessary to wait for each step to complete and to check the result. -It also is neccessary to test and document the whole procedure before applying in production. +It is necessary to wait for each step to complete and to check the result. +It also is necessary to test and document the whole procedure before applying in production. .PP .RS 2 1. disabling pacemaker on HANA primary @@ -531,7 +531,7 @@ l. Finally check if everything looks fine. .PP \fB*\fR Be patient. For detecting the overall HANA status, the Linux cluster needs a certain amount of time, depending on the HANA and the configured -intervalls and timeouts. +intervals and timeouts. .PP \fB*\fR Before doing anything, always check for the Linux cluster's idle status, left-over migration constraints, and resource failures as well as the HANA From b5ab6476236c456950cc2e9917fd79fbe02b699c Mon Sep 17 00:00:00 2001 From: Fabian Herschel Date: Wed, 31 May 2023 15:20:47 +0200 Subject: [PATCH 11/60] susTkOver.py: correted description --- srHook/susTkOver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/srHook/susTkOver.py b/srHook/susTkOver.py index 75e1cca3..7b67ca5f 100755 --- a/srHook/susTkOver.py +++ b/srHook/susTkOver.py @@ -56,7 +56,7 @@ def about(self): self.tracer.info("{0}.{1}() version {2}".format(self.__class__.__name__, method, fhSRHookVersion)) return {"provider_company": "SUSE", "provider_name": "susTkOver", # class name - "provider_description": "Inform Cluster about SR state", + "provider_description": "Block manual takeover, if cluster is active", "provider_version": "1.0"} def preTakeover(self, isForce, **kwargs): From a2ce9dff5498f4e8cd8e93c57587268ea1e941b5 Mon Sep 17 00:00:00 2001 From: Lars Pinne Date: Tue, 13 Jun 2023 13:25:13 +0200 Subject: [PATCH 12/60] SAPHanaSR_maintenance_examples.7: example hdbnsutil -sr_takeover --suspendPrimary --- man/SAPHanaSR_maintenance_examples.7 | 58 ++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/man/SAPHanaSR_maintenance_examples.7 b/man/SAPHanaSR_maintenance_examples.7 index 2534348b..59172801 100644 --- a/man/SAPHanaSR_maintenance_examples.7 +++ b/man/SAPHanaSR_maintenance_examples.7 @@ -1,6 +1,6 @@ .\" Version: 0.160.1 .\" -.TH SAPHanaSR_maintenance_examples 7 "10 Feb 2023" "" "SAPHanaSR" +.TH SAPHanaSR_maintenance_examples 7 "12 Jun 2023" "" "SAPHanaSR" .\" .SH NAME SAPHanaSR_maintenance_examples \- maintenance examples for SAPHana and SAPHanaController. @@ -38,7 +38,7 @@ REQUIREMENTS below. .PP \fB*\fR Watch status of HANA cluster resources and system replication. -This might be convenient when performing administrative actions or cluster tests. It does not replace the afore mentioned checks. See also cs_show_saphanasr_status(8). +This might be conveniant when performing administrative actions or cluster tests. It does not replace the afore mentioned checks. See also cs_show_saphanasr_status(8). 
.PP .RS 4 # watch -n 9 "(crm_mon -1r;SAPHanaSR-showAttr;cs_clusterstate -i)|egrep -v'(^$|configured|###)'" @@ -46,7 +46,9 @@ This might be convenient when performing administrative actions or cluster tests .PP \fB*\fR Initiate an administrative takeover of the HANA primary from one node to the other by using the Linux cluster. -This procedure does not work for scale-out. +This procedure does not work for scale-out. On scale-up, it will stop the HANA primary. +This might take a while. If you want to avoid waiting for the stopped primary, +use the below procedure which suspends the primary. If the cluster should also register the former primary as secondary, AUTOMATED_REGISTER="true" is needed. Before the takeover will be initiated, the status of the Linux cluster and the HANA system replication has to be checked. The takeover should be initiated as forced migration of the multi-state SAPHana resource. .br Not working: Regular migration, migration of IP address, migration of primitive SAPHana resource, setting primary node standby. @@ -79,14 +81,17 @@ Note: Former versions of the Linux cluster used "migrate" instead of "move" and .PP \fB*\fR Perform an SAP HANA takeover by using SAP tools. -The procedures is described here for scale-out. It works for scale-up as well. +The procedure is described here for scale-out. It works for scale-up as well. +The procedure will stop the HANA primary. This might take a while. If you want +to avoid waiting for the stopped primary, use the below procedure which suspends +the primary. The status of HANA databases, system replication and Linux cluster has to be checked. The SAP HANA resources are set into maintenance, an sr_takeover is performed, the old primary is registered as new secondary. -Therefore the correct secondary site name has to be used, see later example. +Therefor the correct secondary site name has to be used, see later example. Finally the SAP HANA resources are given back to the Linux cluster. -See also section REQUIREMENTS below and later example on determining the correct site name. +See also section REQUIREMENTS below and later example on determing the correct site name. .PP .RS 2 1. On either node @@ -179,6 +184,45 @@ If everything looks fine, proceed. # cs_clusterstate -i .RE .PP +\fB*\fR Overview on SAP HANA takeover using SAP tools and suspend primary feature. + +The procedure works for scale-up and scale-out. +The status of HANA databases, system replication and Linux cluster has to be +checked. +The SAP HANA resources are set into maintenance, an sr_takeover is performed +with suspending the primary, the old primary is registered as new secondary. +Therefore the correct secondary site name has to be used. +Finally the SAP HANA resources are given back to the Linux cluster. +See also section REQUIREMENTS below and later example on determining the correct site name. +.PP +.RS 2 +1. Check status of Linux cluster and HANA, show current site names. +.br +2. Set SAPHanaController multi-state resource into maintenance. +.br +3. Perform the takeover, make sure to use the suspend primary feature: +.RE +.RS 4 +~> hdbnsutil -sr_takeover --suspendPrimary +.RE +.RS 2 +4. Check if the new primary is working. +.br +5. Stop suspended old primary. +.br +6. Register old primary as new secondary, make sure to use the correct site name. +.br +7. Start the new secondary. +.br +8. Check new secondary and its system replication. +.br +9. Refresh SAPHanaController multi-state resource. +.br +10. Set SAPHanaController multi-state resource to managed. 
+.br +11. Finally check status of Linux cluster and HANA. +.RE +.PP \fB*\fR Check the two site names that are known to the Linux cluster. This is useful in case AUTOMATED_REGISTER is not yet set. In that case a former primary needs to be registered manually with the former site name as new secondary. The point is finding the site name that already is in use by the Linux cluster. That exact site name has to be used for registration of the new secondary. See also REQUIREMENTS of SAPHanaSR(7) and SAPHanaSR-ScaleOut(7). @@ -531,7 +575,7 @@ l. Finally check if everything looks fine. .PP \fB*\fR Be patient. For detecting the overall HANA status, the Linux cluster needs a certain amount of time, depending on the HANA and the configured -intervals and timeouts. +intervalls and timeouts. .PP \fB*\fR Before doing anything, always check for the Linux cluster's idle status, left-over migration constraints, and resource failures as well as the HANA From 76f547a0e08a32f2827684ad97108689ff6c12dd Mon Sep 17 00:00:00 2001 From: Lars Pinne Date: Tue, 13 Jun 2023 13:27:39 +0200 Subject: [PATCH 13/60] SAPHanaSR_maintenance_examples.7: example hdbnsutil -sr_takeover --suspendPrimary --- man/SAPHanaSR_maintenance_examples.7 | 58 ++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/man/SAPHanaSR_maintenance_examples.7 b/man/SAPHanaSR_maintenance_examples.7 index 2534348b..59172801 100644 --- a/man/SAPHanaSR_maintenance_examples.7 +++ b/man/SAPHanaSR_maintenance_examples.7 @@ -1,6 +1,6 @@ .\" Version: 0.160.1 .\" -.TH SAPHanaSR_maintenance_examples 7 "10 Feb 2023" "" "SAPHanaSR" +.TH SAPHanaSR_maintenance_examples 7 "12 Jun 2023" "" "SAPHanaSR" .\" .SH NAME SAPHanaSR_maintenance_examples \- maintenance examples for SAPHana and SAPHanaController. @@ -38,7 +38,7 @@ REQUIREMENTS below. .PP \fB*\fR Watch status of HANA cluster resources and system replication. -This might be convenient when performing administrative actions or cluster tests. It does not replace the afore mentioned checks. See also cs_show_saphanasr_status(8). +This might be conveniant when performing administrative actions or cluster tests. It does not replace the afore mentioned checks. See also cs_show_saphanasr_status(8). .PP .RS 4 # watch -n 9 "(crm_mon -1r;SAPHanaSR-showAttr;cs_clusterstate -i)|egrep -v'(^$|configured|###)'" @@ -46,7 +46,9 @@ This might be convenient when performing administrative actions or cluster tests .PP \fB*\fR Initiate an administrative takeover of the HANA primary from one node to the other by using the Linux cluster. -This procedure does not work for scale-out. +This procedure does not work for scale-out. On scale-up, it will stop the HANA primary. +This might take a while. If you want to avoid waiting for the stopped primary, +use the below procedure which suspends the primary. If the cluster should also register the former primary as secondary, AUTOMATED_REGISTER="true" is needed. Before the takeover will be initiated, the status of the Linux cluster and the HANA system replication has to be checked. The takeover should be initiated as forced migration of the multi-state SAPHana resource. .br Not working: Regular migration, migration of IP address, migration of primitive SAPHana resource, setting primary node standby. @@ -79,14 +81,17 @@ Note: Former versions of the Linux cluster used "migrate" instead of "move" and .PP \fB*\fR Perform an SAP HANA takeover by using SAP tools. -The procedures is described here for scale-out. It works for scale-up as well. 
+The procedure is described here for scale-out. It works for scale-up as well. +The procedure will stop the HANA primary. This might take a while. If you want +to avoid waiting for the stopped primary, use the below procedure which suspends +the primary. The status of HANA databases, system replication and Linux cluster has to be checked. The SAP HANA resources are set into maintenance, an sr_takeover is performed, the old primary is registered as new secondary. -Therefore the correct secondary site name has to be used, see later example. +Therefor the correct secondary site name has to be used, see later example. Finally the SAP HANA resources are given back to the Linux cluster. -See also section REQUIREMENTS below and later example on determining the correct site name. +See also section REQUIREMENTS below and later example on determing the correct site name. .PP .RS 2 1. On either node @@ -179,6 +184,45 @@ If everything looks fine, proceed. # cs_clusterstate -i .RE .PP +\fB*\fR Overview on SAP HANA takeover using SAP tools and suspend primary feature. + +The procedure works for scale-up and scale-out. +The status of HANA databases, system replication and Linux cluster has to be +checked. +The SAP HANA resources are set into maintenance, an sr_takeover is performed +with suspending the primary, the old primary is registered as new secondary. +Therefore the correct secondary site name has to be used. +Finally the SAP HANA resources are given back to the Linux cluster. +See also section REQUIREMENTS below and later example on determining the correct site name. +.PP +.RS 2 +1. Check status of Linux cluster and HANA, show current site names. +.br +2. Set SAPHanaController multi-state resource into maintenance. +.br +3. Perform the takeover, make sure to use the suspend primary feature: +.RE +.RS 4 +~> hdbnsutil -sr_takeover --suspendPrimary +.RE +.RS 2 +4. Check if the new primary is working. +.br +5. Stop suspended old primary. +.br +6. Register old primary as new secondary, make sure to use the correct site name. +.br +7. Start the new secondary. +.br +8. Check new secondary and its system replication. +.br +9. Refresh SAPHanaController multi-state resource. +.br +10. Set SAPHanaController multi-state resource to managed. +.br +11. Finally check status of Linux cluster and HANA. +.RE +.PP \fB*\fR Check the two site names that are known to the Linux cluster. This is useful in case AUTOMATED_REGISTER is not yet set. In that case a former primary needs to be registered manually with the former site name as new secondary. The point is finding the site name that already is in use by the Linux cluster. That exact site name has to be used for registration of the new secondary. See also REQUIREMENTS of SAPHanaSR(7) and SAPHanaSR-ScaleOut(7). @@ -531,7 +575,7 @@ l. Finally check if everything looks fine. .PP \fB*\fR Be patient. For detecting the overall HANA status, the Linux cluster needs a certain amount of time, depending on the HANA and the configured -intervals and timeouts. +intervalls and timeouts. 
.PP \fB*\fR Before doing anything, always check for the Linux cluster's idle status, left-over migration constraints, and resource failures as well as the HANA From ea383118d4cd783df5523384338cf42d06f0e38d Mon Sep 17 00:00:00 2001 From: Lars Pinne Date: Tue, 13 Jun 2023 15:07:28 +0200 Subject: [PATCH 14/60] SAPHanaSR-monitor.8 SAPHanaSR_maintenance_examples.7 SAPHanaSR.7 --- man/SAPHanaSR-monitor.8 | 2 +- man/SAPHanaSR.7 | 2 +- man/SAPHanaSR_maintenance_examples.7 | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/man/SAPHanaSR-monitor.8 b/man/SAPHanaSR-monitor.8 index 39533f22..4a5ee262 100644 --- a/man/SAPHanaSR-monitor.8 +++ b/man/SAPHanaSR-monitor.8 @@ -33,7 +33,7 @@ The overall system replication (SR) state is shown in an summarized manner. .HP \fB --intervall \fISEC\fR .br - repeat with intervall of \fISEC\fR seconds, default is 10 seconds. + repeat with interval of \fISEC\fR seconds, default is 10 seconds. .HP \fB --nodes \fINUM\fR .br diff --git a/man/SAPHanaSR.7 b/man/SAPHanaSR.7 index df2e2910..bf43c18e 100644 --- a/man/SAPHanaSR.7 +++ b/man/SAPHanaSR.7 @@ -271,7 +271,7 @@ Linux cluster. 17. HANA feature Secondary Time Travel is not supported. .PP 18. In MDC configurations the HANA database is treated as a single system -including all database containers. Therefore, cluster takeover decisions are +including all database containers. Therefor, cluster takeover decisions are based on the complete status independent of the status of individual containers. .PP 19. If a third HANA site is connected by system replication, that HANA is not diff --git a/man/SAPHanaSR_maintenance_examples.7 b/man/SAPHanaSR_maintenance_examples.7 index 59172801..91e698e1 100644 --- a/man/SAPHanaSR_maintenance_examples.7 +++ b/man/SAPHanaSR_maintenance_examples.7 @@ -38,7 +38,7 @@ REQUIREMENTS below. .PP \fB*\fR Watch status of HANA cluster resources and system replication. -This might be conveniant when performing administrative actions or cluster tests. It does not replace the afore mentioned checks. See also cs_show_saphanasr_status(8). +This might be convenient when performing administrative actions or cluster tests. It does not replace the afore mentioned checks. See also cs_show_saphanasr_status(8). .PP .RS 4 # watch -n 9 "(crm_mon -1r;SAPHanaSR-showAttr;cs_clusterstate -i)|egrep -v'(^$|configured|###)'" @@ -191,7 +191,7 @@ The status of HANA databases, system replication and Linux cluster has to be checked. The SAP HANA resources are set into maintenance, an sr_takeover is performed with suspending the primary, the old primary is registered as new secondary. -Therefore the correct secondary site name has to be used. +Therefor the correct secondary site name has to be used. Finally the SAP HANA resources are given back to the Linux cluster. See also section REQUIREMENTS below and later example on determining the correct site name. .PP @@ -575,7 +575,7 @@ l. Finally check if everything looks fine. .PP \fB*\fR Be patient. For detecting the overall HANA status, the Linux cluster needs a certain amount of time, depending on the HANA and the configured -intervalls and timeouts. +intervals and timeouts. 
.PP \fB*\fR Before doing anything, always check for the Linux cluster's idle status, left-over migration constraints, and resource failures as well as the HANA From 4439b6b1da07a7e60fde24647aa9e015841ee3c4 Mon Sep 17 00:00:00 2001 From: Lars Pinne Date: Tue, 13 Jun 2023 15:37:01 +0200 Subject: [PATCH 15/60] SAPHanaSR_maintenance_examples.7: determing --- man/SAPHanaSR_maintenance_examples.7 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/SAPHanaSR_maintenance_examples.7 b/man/SAPHanaSR_maintenance_examples.7 index 91e698e1..282f6f93 100644 --- a/man/SAPHanaSR_maintenance_examples.7 +++ b/man/SAPHanaSR_maintenance_examples.7 @@ -193,7 +193,7 @@ The SAP HANA resources are set into maintenance, an sr_takeover is performed with suspending the primary, the old primary is registered as new secondary. Therefor the correct secondary site name has to be used. Finally the SAP HANA resources are given back to the Linux cluster. -See also section REQUIREMENTS below and later example on determining the correct site name. +See also section REQUIREMENTS below and later example on determing the correct site name. .PP .RS 2 1. Check status of Linux cluster and HANA, show current site names. From 6607f0daa2fa2b7873abcfcbe045ade9d26611fb Mon Sep 17 00:00:00 2001 From: Lars Pinne Date: Tue, 13 Jun 2023 16:55:51 +0200 Subject: [PATCH 16/60] SAPHanaSR_maintenance_examples.7 SAPHanaSR-showAttr.8: determining --- man/SAPHanaSR-showAttr.8 | 4 ++-- man/SAPHanaSR_maintenance_examples.7 | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/man/SAPHanaSR-showAttr.8 b/man/SAPHanaSR-showAttr.8 index bce18776..b13cff57 100644 --- a/man/SAPHanaSR-showAttr.8 +++ b/man/SAPHanaSR-showAttr.8 @@ -140,7 +140,7 @@ ocf_suse_SAPHanaController(7). Value: [ \fIgeneration\fR ] The RA generation attribute identifies which generation of the RA is running. -It helps determing RA's capabilities and performing cluster-wide upgrades of +It helps determining RA's capabilities and performing cluster-wide upgrades of RA and srHook. The generation should be same for both on all nodes of the Linux cluster after successful upgrade. See also gsh below and SAPHanaSR-manageAttr(8). @@ -151,7 +151,7 @@ See also gsh below and SAPHanaSR-manageAttr(8). Value: [ \fIgeneration\fR ] The srHook generation attribute identifies which generation of the srHook is running. -It helps determing srHook's capabilities and performing cluster-wide upgrades of +It helps determining srHook's capabilities and performing cluster-wide upgrades of RA and srHook. E.g. starting with generation 2.0 the RA supports scale-out multi-target system replication, which needs replacement of the old SAPHanaSR.py by new SAPHanaSrMultiTarget.py. diff --git a/man/SAPHanaSR_maintenance_examples.7 b/man/SAPHanaSR_maintenance_examples.7 index 282f6f93..6f44c5b9 100644 --- a/man/SAPHanaSR_maintenance_examples.7 +++ b/man/SAPHanaSR_maintenance_examples.7 @@ -91,7 +91,7 @@ The SAP HANA resources are set into maintenance, an sr_takeover is performed, the old primary is registered as new secondary. Therefor the correct secondary site name has to be used, see later example. Finally the SAP HANA resources are given back to the Linux cluster. -See also section REQUIREMENTS below and later example on determing the correct site name. +See also section REQUIREMENTS below and later example on determining the correct site name. .PP .RS 2 1. 
On either node @@ -193,7 +193,7 @@ The SAP HANA resources are set into maintenance, an sr_takeover is performed with suspending the primary, the old primary is registered as new secondary. Therefor the correct secondary site name has to be used. Finally the SAP HANA resources are given back to the Linux cluster. -See also section REQUIREMENTS below and later example on determing the correct site name. +See also section REQUIREMENTS below and later example on determining the correct site name. .PP .RS 2 1. Check status of Linux cluster and HANA, show current site names. From 0f9ea9803b0f3dad253d910bf9b9fcc6572769e9 Mon Sep 17 00:00:00 2001 From: Janine Fuchs Date: Tue, 11 Jul 2023 11:18:39 +0200 Subject: [PATCH 17/60] typo fixed to enable RemoveSAPSockets functionality --- ra/SAPHana | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ra/SAPHana b/ra/SAPHana index a5202133..2b273d6c 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -845,7 +845,7 @@ function saphana_init() { SAPVIRHOST=${vName} PreferSiteTakeover="$OCF_RESKEY_PREFER_SITE_TAKEOVER" AUTOMATED_REGISTER="${OCF_RESKEY_AUTOMATED_REGISTER:-false}" - RemoveSAPSockets="${CF_RESKEY_REMOVE_SAP_SOCKETS:-true}" + RemoveSAPSockets="${OCF_RESKEY_REMOVE_SAP_SOCKETS:-true}" LPA_DIRECTORY=/var/lib/SAPHanaRA LPA_ATTR=("lpa_${sid}_lpt" "forever") super_ocf_log debug "DBG: SID=$SID, sid=$sid, SIDInstanceName=$SIDInstanceName, InstanceName=$InstanceName, InstanceNr=$InstanceNr, SAPVIRHOST=$SAPVIRHOST" From cefd1269adb98e3929e8acfc59030a3a7dde1f81 Mon Sep 17 00:00:00 2001 From: AngelaBriel Date: Mon, 21 Aug 2023 17:56:45 +0200 Subject: [PATCH 18/60] rework variable settings, avoid external calls (like awk) in favor of bash features. see discussion in PR#158 and #159 --- ra/SAPHana | 66 ++++++++++++++++++++++------------------------ ra/SAPHanaTopology | 25 ++++++++---------- 2 files changed, 43 insertions(+), 48 deletions(-) diff --git a/ra/SAPHana b/ra/SAPHana index 03d118ac..ffbb6540 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -1021,10 +1021,9 @@ function saphana_init() { # since rev 112.03 the old option is changed and we should use -sr_stateConfiguration where ever possible # hdbState="hdbnsutil -sr_stateConfiguration" - hdbMap="hdbnsutil -sr_stateHostMapping" + # DONE: PRIO1: Beginning from SAP HANA rev 112.03 -sr_state is not longer supported if version "$hdbver" "<" "1.00.111"; then hdbState="hdbnsutil -sr_state" - hdbMap="hdbnsutil -sr_state" fi super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$OCF_SUCCESS" return $OCF_SUCCESS @@ -1241,50 +1240,49 @@ function check_for_primary() { # DONE: PRIO2: Maybe we need to use a fallback interface when hdbnsutil does not answer properly -> lookup in config files? # TODO: This might also solve some problems when we could not figure-out the local or remote site name (site_name,site_id from global.ini) local chkMethod="" - local ini_mode="" + local node_status="" + local node_full_status="" for chkMethod in hU hU hU gP; do - case "$chkMethod" in - gP ) + case "$chkMethod" in + gP ) # fallback for 'hdbnsutil' failing 3 times. 
local gpKeys="" gpKeys=$(echo --key=global.ini/system_replication/{actual_mode,mode}) node_full_status=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "python getParameter.py $gpKeys --sapcontrol=1" 2>&1 | awk -F/ 'BEGIN {out=0} /^SAPCONTROL-OK: / { out=1 } /^SAPCONTROL-OK: / { out=0 } /=/ {if (out==1) {print $3} }') # first try to get the value of 'actual_mode' from the global.ini - ini_mode=$(echo "$node_full_status" | awk -F= '$1=="actual_mode" {print $2}') + [[ "$node_full_status" =~ actual_mode=([^$'\n']+) ]] && node_status=${BASH_REMATCH[1]} # if 'actual_mode' is not available, fallback to 'mode' - if [ -z "$ini_mode" ]; then - ini_mode=$(echo "$node_full_status" | awk -F= '$1=="mode" {print $2}') + if [ -z "$node_status" ]; then + [[ "$node_full_status" =~ mode=([^$'\n']+) ]] && node_status=${BASH_REMATCH[1]} fi - node_status="$ini_mode" super_ocf_log info "ACT: Using getParameter.py as fallback - node_status=$node_status" ;; - hU | * ) - # DONE: PRIO1: Beginning from SAP HANA rev 112.03 -sr_state is not longer supported - node_full_status=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "$hdbState" 2>/dev/null ) - node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') + hU | * ) + node_full_status=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "$hdbState --sapcontrol=1" 2>/dev/null ) + [[ "$node_full_status" =~ mode=([^$'\n']+) ]] && node_status=${BASH_REMATCH[1]} super_ocf_log debug "DBG: check_for_primary: node_status=$node_status" ;; - esac - case "$node_status" in - primary ) - rc=$HANA_STATE_PRIMARY - break;; - syncmem | sync | async ) - rc=$HANA_STATE_SECONDARY - break;; - none ) # have seen that mode on second side BEFEORE we registered it as replica - rc=$HANA_STATE_STANDALONE - break;; - * ) - super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: <$node_status>" - dump=$( echo $node_status | hexdump -C ); - super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP <$dump>" - # TODO: Limit the runtime of hdbnsutil and use getParameter.py as fallback - # SAP_CALL - super_ocf_log debug "DEC: check_for_primary: loop=$i: node_status=$node_status" - # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes - esac; - sleep 2 + esac + case "$node_status" in + primary ) + rc=$HANA_STATE_PRIMARY + break;; + syncmem | sync | async ) + rc=$HANA_STATE_SECONDARY + break;; + none ) # have seen that mode on second side BEFEORE we registered it as replica + rc=$HANA_STATE_STANDALONE + break;; + * ) + super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: <$node_status>" + dump=$( echo $node_status | hexdump -C ); + super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP <$dump>" + ((i++)) + super_ocf_log debug "DEC: check_for_primary: loop=$i: node_status=$node_status" + # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes + # lets pause a bit to give hdbnsutil a chance to answer next time + sleep 2 + esac; done super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index f9687cb8..3408a941 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -475,8 +475,7 @@ function sht_init() { local myInstanceName="" local rc=$OCF_SUCCESS local hdbANSWER="" - local siteID - local siteNAME + local site="" local chkMethod="" SYSTEMCTL="/usr/bin/systemctl" systemd_unit_name="saphostagent.service" @@ -562,11 +561,10 @@ function sht_init() { # since rev 112.03 
the old option is changed and we should use -sr_stateConfiguration where ever possible # hdbState="hdbnsutil -sr_stateConfiguration" - hdbMap="hdbnsutil -sr_stateHostMapping" if version "$hdbver" "<" "1.00.111"; then hdbState="hdbnsutil -sr_state" - hdbMap="hdbnsutil -sr_state" fi + srmode="" #### SAP-CALL # hdbnsutil was a bit unstable in some tests so we recall the tool, if it fails to report the srmode for chkMethod in hU hU hU gP ; do @@ -576,21 +574,21 @@ function sht_init() { #super_ocf_log debug "DBG2: hdbANSWER=$hdbANSWER" #srmode=$(echo "$hdbANSWER" | awk -F= '/mode/ {print $2}') case "$chkMethod" in - gP ) # call getParameter (gP) + gP ) # call getParameter (gP) - fallback for 'hdbnsutil' failing 3 times. local gpKeys="" gpKeys=$(echo --key=global.ini/system_replication/{actual_mode,mode,site_name,site_id}) hdbANSWER=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "python getParameter.py $gpKeys --sapcontrol=1" 2>&1 | awk -F/ 'BEGIN {out=0} /^SAPCONTROL-OK: / { out=1 } /^SAPCONTROL-OK: / { out=0 } /=/ {if (out==1) {print $3} }') - srmode=$(echo "$hdbANSWER" | awk -F= '$1=="actual_mode" {print $2}') + [[ "$hdbANSWER" =~ actual_mode=([^$'\n']+) ]] && srmode=${BASH_REMATCH[1]} # if 'actual_mode' is not available, fallback to 'mode' if [ -z "$srmode" ]; then - srmode=$(echo "$hdbANSWER" | awk -F= '$1=="mode" {print $2}') + [[ "$hdbANSWER" =~ mode=([^$'\n']+) ]] && srmode=${BASH_REMATCH[1]} fi super_ocf_log info "ACT: hdbnsutil not answering - using global.ini as fallback - srmode=$srmode" ;; hU | * ) # call hdbnsUtil (hU) ( also for unknown chkMethod ) # DONE: PRIO1: Beginning from SAP HANA rev 112.03 -sr_state is not longer supported hdbANSWER=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "$hdbState --sapcontrol=1" 2>/dev/null) - srmode=$(echo "$hdbANSWER" | awk -F= '$1=="mode" {print $2}') + [[ "$hdbANSWER" =~ mode=([^$'\n']+) ]] && srmode=${BASH_REMATCH[1]} ;; esac case "$srmode" in @@ -605,23 +603,22 @@ function sht_init() { esac done # TODO PRIO3: Implement a file lookup, if we did not get a result - siteID=$(echo "$hdbANSWER" | awk -F= '/site.id/ {print $2}') # allow 'site_id' AND 'site id' - siteNAME=$(echo "$hdbANSWER" | awk -F= '/site.name/ {print $2}') - site=$siteNAME + [[ "$hdbANSWER" =~ site.name=([^$'\n']+) ]] && site=${BASH_REMATCH[1]} if [ -z "$srmode" ]; then - srmode=$(echo "$hdbANSWER" | awk -F= '$1=="actual_mode" {print $2}') + [[ "$hdbANSWER" =~ actual_mode=([^$'\n']+) ]] && srmode=${BASH_REMATCH[1]} # if 'actual_mode' is not available, fallback to 'mode' if [ -z "$srmode" ]; then - srmode=$(echo "$hdbANSWER" | awk -F= '$1=="mode" {print $2}') + [[ "$hdbANSWER" =~ mode=([^$'\n']+) ]] && srmode=${BASH_REMATCH[1]} fi fi # # for rev >= 111 we use the new mapping query # if version "$hdbver" ">=" "1.00.111"; then + hdbMap="hdbnsutil -sr_stateHostMapping" hdbANSWER=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "$hdbMap --sapcontrol=1" 2>/dev/null) fi - MAPPING=$(echo "$hdbANSWER" | awk -F[=/] '$1 == "mapping" && $3 != site { print $4 }' site=$site) + MAPPING=$(echo "$hdbANSWER" | awk -F[=/] '$1 == "mapping" && $3 != site { print $4 }' site="$site") super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING" if [ -n "$MAPPING" ]; then # we have a mapping from HANA, lets use it From 8174ecf69a005978fff8320bdbe1d665fddc1c5b Mon Sep 17 00:00:00 2001 From: AngelaBriel Date: Mon, 21 Aug 2023 19:15:15 +0200 Subject: [PATCH 19/60] avoid explicid and implicid usage of /tmp filesystem to keep the SAPHanaSR resource agents working even in situations with /tmp 
filesystem full. (bsc#1210728) fix identation - exchange tabs with blanks --- SAPHanaSR.changes_12 | 16 ++++++++ SAPHanaSR.changes_15 | 16 ++++++++ ra/SAPHana | 59 +++++++++++++-------------- ra/SAPHanaTopology | 95 +++++++++++++++++++++----------------------- 4 files changed, 105 insertions(+), 81 deletions(-) diff --git a/SAPHanaSR.changes_12 b/SAPHanaSR.changes_12 index 1ddc4c10..ab0e2318 100644 --- a/SAPHanaSR.changes_12 +++ b/SAPHanaSR.changes_12 @@ -1,3 +1,19 @@ +------------------------------------------------------------------- +Mon Aug 21 16:24:32 UTC 2023 - abriel@suse.com + +- Version bump to 0.162.2 + * avoid explicid and implicid usage of /tmp filesystem to keep + the SAPHanaSR resource agents working even in situations with + /tmp filesystem full. + (bsc#1210728) + * update update man pages: + SAPHanaSR_basic_cluster.7 + susCostOpt.py.7 + SAPHanaSR_maintenance_examples.7 + SAPHanaSR-monitor.8 + * add improvements from SAP to the RA scripts, part II + (jsc#PED-1739, jsc#PED-2608) + ------------------------------------------------------------------- Tue Jan 24 15:27:27 UTC 2023 - abriel@suse.com diff --git a/SAPHanaSR.changes_15 b/SAPHanaSR.changes_15 index 7c70281c..c3a40e5e 100644 --- a/SAPHanaSR.changes_15 +++ b/SAPHanaSR.changes_15 @@ -1,3 +1,19 @@ +------------------------------------------------------------------- +Mon Aug 21 16:24:32 UTC 2023 - abriel@suse.com + +- Version bump to 0.162.2 + * avoid explicid and implicid usage of /tmp filesystem to keep + the SAPHanaSR resource agents working even in situations with + /tmp filesystem full. + (bsc#1210728) + * update update man pages: + SAPHanaSR_basic_cluster.7 + susCostOpt.py.7 + SAPHanaSR_maintenance_examples.7 + SAPHanaSR-monitor.8 + * add improvements from SAP to the RA scripts, part II + (jsc#PED-1739, jsc#PED-2608) + ------------------------------------------------------------------- Tue Jan 24 15:27:27 UTC 2023 - abriel@suse.com diff --git a/ra/SAPHana b/ra/SAPHana index ffbb6540..5a0183de 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -114,12 +114,11 @@ function saphana_usage() { local rc=0 methods=$(saphana_methods) methods=$(echo $methods | tr ' ' '|') - cat <<-EOF - usage: $0 ($methods) + echo " usage: $0 ($methods) $0 manages two SAP HANA databases (scale-up) in system replication. - The 'start' operation starts the HANA instance or bring the "clone instance" to a WAITING status + The 'start' operation starts the HANA instance or bring the \"clone instance\" to a WAITING status The 'stop' operation stops the HANA instance The 'status' operation reports whether the HANA instance is running The 'monitor' operation reports whether the HANA instance seems to be working in multi-state it also needs to check the system replication status @@ -129,8 +128,7 @@ function saphana_usage() { The 'validate-all' operation reports whether the parameters are valid The 'methods' operation reports on the methods $0 supports The 'reload' operation allows to adapt resource parameters - -EOF +" return $rc } @@ -151,11 +149,9 @@ function backup_global_and_nameserver() { function saphana_meta_data() { super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)" local rc=0 -# - cat < + echo ' - + 1.0 Manages two SAP HANA database systems in system replication (SR). @@ -192,7 +188,7 @@ The resource agent uses the following four interfaces provided by SAP: Interface is SQL query into HANA (system replication table). The hdbsql query will be replaced by a python script "systemReplicationStatus.py" in SAP HANA SPS8 or 9. 
As long as we need to use hdbsql you need to set up secure store users for linux user root to be able to - access the SAP HANA database. You need to configure a secure store user key "SAPHANA${SID}SR" which can connect the SAP + access the SAP HANA database. You need to configure a secure store user key "SAPHANA'${SID}'SR" which can connect the SAP HANA database: 5. saphostctrl @@ -290,8 +286,7 @@ The resource agent uses the following four interfaces provided by SAP: - -END +' super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc } @@ -593,8 +588,10 @@ scoring_crm_master() local sync="$2" local myScore='' for scan in "${SCORING_TABLE[@]}"; do - read rolePatt syncPatt score <<< $scan + # $scan needs "" to prevent globbing, as the scoring tables include '.*' + read rolePatt syncPatt score < <(echo "$scan") if [[ "$roles" =~ $rolePatt ]] ; then + # $syncPatt must NOT have "" for correct matching if [[ "$sync" =~ $syncPatt ]] ; then super_ocf_log info "DEC: scoring_crm_master: roles($roles) are matching pattern ($rolePatt)" super_ocf_log info "DEC: scoring_crm_master: sync($sync) is matching syncPattern ($syncPatt)" @@ -675,9 +672,9 @@ function HANA_CALL() ;; * ) errExt=$(date '+%s%N')_${sid}adm - su_err_log=/tmp/HANA_CALL_SU_RA_${errExt} - cmd_out_log=/tmp/HANA_CALL_CMD_RA_OUT_${errExt} - cmd_err_log=/tmp/HANA_CALL_CMD_RA_${errExt} + su_err_log=/run/HANA_CALL_SU_RA_${errExt} + cmd_out_log=/run/HANA_CALL_CMD_RA_OUT_${errExt} + cmd_err_log=/run/HANA_CALL_CMD_RA_${errExt} output=$(timeout --foreground -s 9 "$timeOut" $pre_cmd "($pre_script; timeout -s 9 $timeOut $cmd > $cmd_out_log) >& $cmd_err_log" 2>"$su_err_log"); rc=$? @@ -1673,8 +1670,8 @@ function saphana_start_primary() if ocf_is_true "${PreferSiteTakeover}"; then remoteStatus="$remoteRole:$remoteSync" case "$remoteStatus" in - [234]:S:*:SOK | [234]:S:*:PRIM ) - lpa_advice="wait" + [234]:S:*:SOK | [234]:S:*:PRIM ) + lpa_advice="wait" # TODO: PRIO3: Split WAIT into WAIT4TAKEOVER super_ocf_log info "DEC: saphana_primary - waiting for secondary to takeover (SOK, PreferSiteTakover)" ;; @@ -1835,7 +1832,7 @@ check_for_primary_master() super_ocf_log debug "DBG: check_for_primary_master (3) ch_role=$ch_role" awk -F: 'BEGIN { rc=1 } $1 ~ "[34]" && $2 == "P" && $4 == "master" { rc=0 } - END { exit rc }' <<< $ch_role ; rc=$? + END { exit rc }' < <(echo $ch_role) ; rc=$? super_ocf_log debug "DBG: check_for_primary_master (4) rc=$rc" fi done @@ -2056,7 +2053,7 @@ function lpa_pull_lpt() { local readrest=0 local lpa_file="$LPA_DIRECTORY/lpa_${sid}_${NODENAME}" if [ -f $lpa_file ]; then - read lpt readrest <<<$(cat $lpa_file) # exactly load first word from file to lpt + read lpt readrest < <(cat $lpa_file) # exactly load first word from file to lpt fi if [ -n "$lpt" ]; then rc=0 @@ -2499,7 +2496,7 @@ function saphana_monitor_primary() else rc=$OCF_SUCCESS fi - set_SRHOOK_PRIM + set_SRHOOK_PRIM my_role=$(get_hana_attribute "${NODENAME}" "${ATTR_NAME_HANA_ROLES[@]}") super_ocf_log info "DEC: saphana_monitor_primary: scoring_crm_master($my_role,$my_sync)" scoring_crm_master "$my_role" "$my_sync" @@ -2729,15 +2726,15 @@ function saphana_monitor_clone() { my_sync=$(get_SRHOOK "$sr_name" "$NODENAME") set_hana_attribute "$NODENAME" "-" "${ATTR_NAME_HANA_SRACTION_HISTORY[@]}" - if ocf_is_probe; then - super_ocf_log debug "DBG: PROBE ONLY" - else - super_ocf_log debug "DBG: REGULAR MONITOR" - fi - # - # First check, if we are PRIMARY or SECONDARY - # - check_for_primary; primary_status=$? 
+ if ocf_is_probe; then + super_ocf_log debug "DBG: PROBE ONLY" + else + super_ocf_log debug "DBG: REGULAR MONITOR" + fi + # + # First check, if we are PRIMARY or SECONDARY + # + check_for_primary; primary_status=$? if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then # FIX: bsc#919925 Leaving Node Maintenance stops HANA Resource Agent # TODO: PRIO1: Maybe we need a lpa-check here to @@ -2843,7 +2840,7 @@ function saphana_promote_clone() { fi fi if [ $rc -eq $OCF_SUCCESS ]; then - set_SRHOOK_PRIM + set_SRHOOK_PRIM fi super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index 3408a941..4d9c3b09 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -2,7 +2,7 @@ # # SAPHanaTopology # -# Description: Clone resource to analyze SAPHana-Topology +# Description: Clone resource to analyze SAPHana-Topology # ################################################################################################################### # @@ -22,7 +22,7 @@ # OCF instance parameters: # OCF_RESKEY_SID (LNX, NDB, SLE) # OCF_RESKEY_InstanceNumber (00..99) -# OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) +# OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) # OCF_RESKEY_SAPHanaFilter (outdated, replaced by cluster property hana_${sid}_glob_filter) # ####################################################################### @@ -96,12 +96,11 @@ function sht_usage() { local rc=0 methods=$(sht_methods) methods=$(echo $methods | tr ' ' '|') - cat <<-! - usage: $0 ($methods) + echo "usage: $0 ($methods) $0 manages a SAP HANA Instance as an HA resource. - The 'start' operation starts the HANA instance or bring the "instance" to a WAITING (for primary) status + The 'start' operation starts the HANA instance or bring the \"instance\" to a WAITING (for primary) status The 'stop' operation stops the HANA instance The 'status' operation reports whether the HANA instance is running The 'monitor' operation reports whether the HANA instance seems to be working in multi-state it also needs to check the system replication status @@ -109,9 +108,8 @@ function sht_usage() { The 'validate-all' operation reports whether the parameters are valid The 'methods' operation reports on the methods $0 supports The 'reload' operation allows to change parameters like HANA_CALL_TIMEOUT without forcing a recover of all instances - - ! - return $rc +" + return $rc } # @@ -122,10 +120,9 @@ function sht_usage() { function sht_meta_data() { super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)" local rc=0 - cat < + echo ' - + 1.0 Analyzes SAP HANA System Replication Topology. This RA analyzes the SAP HANA topology and "sends" all findings via the node status attributes to @@ -174,8 +171,8 @@ SAPHanaTopology scans the output table of landscapeHostConfiguration.py to ident - Path to the SAP Hana Instance executable directory. If not set the RA tries /usr/sap/\$SID/\$InstanceName/exe. - While InstanceName is the string of "HDB" and \$InstanceNumber for SAP Hana databases. + Path to the SAP Hana Instance executable directory. If not set the RA tries /usr/sap/$SID/$InstanceName/exe. + While InstanceName is the string of "HDB" and $InstanceNumber for SAP Hana databases. Path to the SAP Hana Instance executable directory. 
@@ -196,10 +193,9 @@ SAPHanaTopology scans the output table of landscapeHostConfiguration.py to ident - -END -super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" -return $rc +' + super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" + return $rc } # @@ -276,22 +272,21 @@ function set_hana_attribute() # methods: What methods/operations do we support? # function sht_methods() { - super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)" - local rc=0 - cat <<-! - start - stop - status - monitor - notify - validate-all - methods - meta-data - usage - admin-setup - reload - ! - return $rc + super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)" + local rc=0 + echo " + start + stop + status + monitor + notify + validate-all + methods + meta-data + usage + admin-setup + reload" + return $rc } # @@ -422,9 +417,9 @@ function HANA_CALL() ;; * ) errExt=$(date '+%s%N')_${sid}adm - su_err_log=/tmp/HANA_CALL_SU_TOP_${errExt} - cmd_out_log=/tmp/HANA_CALL_CMD_TOP_OUT_${errExt} - cmd_err_log=/tmp/HANA_CALL_CMD_TOP_${errExt} + su_err_log=/run/HANA_CALL_SU_TOP_${errExt} + cmd_out_log=/run/HANA_CALL_CMD_TOP_OUT_${errExt} + cmd_err_log=/run/HANA_CALL_CMD_TOP_${errExt} output=$(timeout "$timeOut" $pre_cmd "($pre_script; $cmd > $cmd_out_log) >& $cmd_err_log" 2>"$su_err_log"); rc=$? @@ -991,7 +986,7 @@ function sht_stop_clone() { sht_stop; rc=$? # till now it returns everytime $OCF_SUCCESS if [ "$tout" -eq 1 ]; then # timeout of landscapeHostConfiguration.py - let stop fail - rc=$OCF_ERR_GENERIC + rc=$OCF_ERR_GENERIC fi super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" @@ -1007,13 +1002,13 @@ function sht_stop_clone() { function sht_monitor_clone() { super_ocf_log info "FLOW ${FUNCNAME[0]} ($*)" # - local rc=$OCF_ERR_GENERIC - local promoted=0 + local rc=$OCF_ERR_GENERIC + local promoted=0 local init_attribute=0 - if ocf_is_probe; then - super_ocf_log debug "DBG: PROBE ONLY" + if ocf_is_probe; then + super_ocf_log debug "DBG: PROBE ONLY" sht_monitor; rc=$? local hana_version hana_version=$(grep -s -m1 -oP 'fullversion: \K.+(?= Build)' "/usr/sap/$SID/$InstanceName/exe/manifest") @@ -1022,18 +1017,18 @@ function sht_monitor_clone() { if [[ -n $hana_version ]]; then set_hana_attribute "${NODENAME}" "$hana_version" ${ATTR_NAME_HANA_VERSION[@]} fi - else - super_ocf_log debug "DBG: REGULAR MONITOR" + else + super_ocf_log debug "DBG: REGULAR MONITOR" if ! check_saphostagent; then start_saphostagent fi - # - # First check, if we are PRIMARY or SECONDARY - # - super_ocf_log debug "DBG: HANA SID $SID" - super_ocf_log debug "DBG: HANA InstanceName $InstanceName" - super_ocf_log debug "DBG: HANA InstanceNr $InstanceNr" - check_for_primary; primary_status=$? + # + # First check, if we are PRIMARY or SECONDARY + # + super_ocf_log debug "DBG: HANA SID $SID" + super_ocf_log debug "DBG: HANA InstanceName $InstanceName" + super_ocf_log debug "DBG: HANA InstanceNr $InstanceNr" + check_for_primary; primary_status=$? if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then hanaPrim="P" super_ocf_log debug "DBG: HANA IS PRIMARY" From ba70d3be8919602e0401819f50088f718e564e60 Mon Sep 17 00:00:00 2001 From: AngelaBriel Date: Wed, 23 Aug 2023 12:00:20 +0200 Subject: [PATCH 20/60] fix problem with /run location rework getParameter.py output matching to match 'exactly' the needed field even that new, similar field names may be introduced in the future. 
--- ra/SAPHana | 19 ++++++++++++------- ra/SAPHanaTopology | 24 ++++++++++++++---------- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/ra/SAPHana b/ra/SAPHana index 5a0183de..bdef1a0c 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -672,9 +672,9 @@ function HANA_CALL() ;; * ) errExt=$(date '+%s%N')_${sid}adm - su_err_log=/run/HANA_CALL_SU_RA_${errExt} - cmd_out_log=/run/HANA_CALL_CMD_RA_OUT_${errExt} - cmd_err_log=/run/HANA_CALL_CMD_RA_${errExt} + su_err_log=${SAPHanaSR_RUN}/HANA_CALL_SU_RA_${errExt} + cmd_out_log=${SAPHanaSR_RUN}/HANA_CALL_CMD_RA_OUT_${errExt} + cmd_err_log=${SAPHanaSR_RUN}/HANA_CALL_CMD_RA_${errExt} output=$(timeout --foreground -s 9 "$timeOut" $pre_cmd "($pre_script; timeout -s 9 $timeOut $cmd > $cmd_out_log) >& $cmd_err_log" 2>"$su_err_log"); rc=$? @@ -819,6 +819,10 @@ function saphana_init() { super_ocf_log debug "DBG: Used new method to get SID ($SID) and InstanceNr ($InstanceNr)" sid="${SID,,}" sidadm="${sid}adm" + # create subdirectory in /run + SAPHanaSR_RUN="/run/SAPHanaSR_${sid}" + mkdir -p $SAPHanaSR_RUN + chown $sidadm $SAPHanaSR_RUN # TODO PRIO3: Do we need a parameter for the RA to be able to adjust hdbSrQueryTimeout? hdbSrQueryTimeout=180 # DONE: PRIO4: SAPVIRHOST might be different to NODENAME @@ -1239,24 +1243,25 @@ function check_for_primary() { local chkMethod="" local node_status="" local node_full_status="" + local i=0 for chkMethod in hU hU hU gP; do case "$chkMethod" in gP ) # fallback for 'hdbnsutil' failing 3 times. local gpKeys="" gpKeys=$(echo --key=global.ini/system_replication/{actual_mode,mode}) - node_full_status=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "python getParameter.py $gpKeys --sapcontrol=1" 2>&1 | awk -F/ 'BEGIN {out=0} /^SAPCONTROL-OK: / { out=1 } /^SAPCONTROL-OK: / { out=0 } /=/ {if (out==1) {print $3} }') + node_full_status=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "python getParameter.py $gpKeys --sapcontrol=1" 2>&1) # first try to get the value of 'actual_mode' from the global.ini - [[ "$node_full_status" =~ actual_mode=([^$'\n']+) ]] && node_status=${BASH_REMATCH[1]} + [[ "$node_full_status" =~ "SAPCONTROL-OK: ".*"/actual_mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && node_status=${BASH_REMATCH[1]} # if 'actual_mode' is not available, fallback to 'mode' if [ -z "$node_status" ]; then - [[ "$node_full_status" =~ mode=([^$'\n']+) ]] && node_status=${BASH_REMATCH[1]} + [[ "$node_full_status" =~ "SAPCONTROL-OK: ".*"/mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && node_status=${BASH_REMATCH[1]} fi super_ocf_log info "ACT: Using getParameter.py as fallback - node_status=$node_status" ;; hU | * ) node_full_status=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "$hdbState --sapcontrol=1" 2>/dev/null ) - [[ "$node_full_status" =~ mode=([^$'\n']+) ]] && node_status=${BASH_REMATCH[1]} + [[ "$node_full_status" =~ "SAPCONTROL-OK: ".*"/mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && node_status=${BASH_REMATCH[1]} super_ocf_log debug "DBG: check_for_primary: node_status=$node_status" ;; esac diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index 4d9c3b09..8c161137 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -417,9 +417,9 @@ function HANA_CALL() ;; * ) errExt=$(date '+%s%N')_${sid}adm - su_err_log=/run/HANA_CALL_SU_TOP_${errExt} - cmd_out_log=/run/HANA_CALL_CMD_TOP_OUT_${errExt} - cmd_err_log=/run/HANA_CALL_CMD_TOP_${errExt} + su_err_log=${SAPHanaSR_RUN}/HANA_CALL_SU_TOP_${errExt} + cmd_out_log=${SAPHanaSR_RUN}/HANA_CALL_CMD_TOP_OUT_${errExt} + cmd_err_log=${SAPHanaSR_RUN}/HANA_CALL_CMD_TOP_${errExt} 
output=$(timeout "$timeOut" $pre_cmd "($pre_script; $cmd > $cmd_out_log) >& $cmd_err_log" 2>"$su_err_log"); rc=$? @@ -489,6 +489,10 @@ function sht_init() { super_ocf_log debug "DBG2: Used new method to get SID ($SID) and InstanceNr ($InstanceNr)" sid="${SID,,}" sidadm="${sid}adm" + # create subdirectory in /run + SAPHanaSR_RUN="/run/SAPHanaSR_${sid}" + mkdir -p $SAPHanaSR_RUN + chown $sidadm $SAPHanaSR_RUN ocf_env=$(env | grep 'OCF_RESKEY_CRM') super_ocf_log debug "DBG3: OCF: $ocf_env" ATTR_NAME_HANA_SYNC_STATUS=("hana_${sid}_sync_state" "reboot") # SOK, SFAIL, UNKNOWN? @@ -572,18 +576,18 @@ function sht_init() { gP ) # call getParameter (gP) - fallback for 'hdbnsutil' failing 3 times. local gpKeys="" gpKeys=$(echo --key=global.ini/system_replication/{actual_mode,mode,site_name,site_id}) - hdbANSWER=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "python getParameter.py $gpKeys --sapcontrol=1" 2>&1 | awk -F/ 'BEGIN {out=0} /^SAPCONTROL-OK: / { out=1 } /^SAPCONTROL-OK: / { out=0 } /=/ {if (out==1) {print $3} }') - [[ "$hdbANSWER" =~ actual_mode=([^$'\n']+) ]] && srmode=${BASH_REMATCH[1]} + hdbANSWER=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "python getParameter.py $gpKeys --sapcontrol=1" 2>&1) + [[ "$hdbANSWER" =~ "SAPCONTROL-OK: ".*"/actual_mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && srmode=${BASH_REMATCH[1]} # if 'actual_mode' is not available, fallback to 'mode' if [ -z "$srmode" ]; then - [[ "$hdbANSWER" =~ mode=([^$'\n']+) ]] && srmode=${BASH_REMATCH[1]} + [[ "$hdbANSWER" =~ "SAPCONTROL-OK: ".*"/mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && srmode=${BASH_REMATCH[1]} fi super_ocf_log info "ACT: hdbnsutil not answering - using global.ini as fallback - srmode=$srmode" ;; hU | * ) # call hdbnsUtil (hU) ( also for unknown chkMethod ) # DONE: PRIO1: Beginning from SAP HANA rev 112.03 -sr_state is not longer supported hdbANSWER=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "$hdbState --sapcontrol=1" 2>/dev/null) - [[ "$hdbANSWER" =~ mode=([^$'\n']+) ]] && srmode=${BASH_REMATCH[1]} + [[ "$hdbANSWER" =~ "SAPCONTROL-OK: ".*"/mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && srmode=${BASH_REMATCH[1]} ;; esac case "$srmode" in @@ -598,12 +602,12 @@ function sht_init() { esac done # TODO PRIO3: Implement a file lookup, if we did not get a result - [[ "$hdbANSWER" =~ site.name=([^$'\n']+) ]] && site=${BASH_REMATCH[1]} + [[ "$hdbANSWER" =~ "SAPCONTROL-OK: ".*"/site_name="([^$'\n']*).*"SAPCONTROL-OK: " ]] && site=${BASH_REMATCH[1]} if [ -z "$srmode" ]; then - [[ "$hdbANSWER" =~ actual_mode=([^$'\n']+) ]] && srmode=${BASH_REMATCH[1]} + [[ "$hdbANSWER" =~ "SAPCONTROL-OK: ".*"/actual_mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && srmode=${BASH_REMATCH[1]} # if 'actual_mode' is not available, fallback to 'mode' if [ -z "$srmode" ]; then - [[ "$hdbANSWER" =~ mode=([^$'\n']+) ]] && srmode=${BASH_REMATCH[1]} + [[ "$hdbANSWER" =~ "SAPCONTROL-OK: ".*"/mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && srmode=${BASH_REMATCH[1]} fi fi # From 1b4969a88f126f02c30a299a4d578da511e1a3ba Mon Sep 17 00:00:00 2001 From: AngelaBriel Date: Wed, 23 Aug 2023 13:37:59 +0200 Subject: [PATCH 21/60] fix matching problems with hdbnsutil output --- ra/SAPHana | 2 +- ra/SAPHanaTopology | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/ra/SAPHana b/ra/SAPHana index bdef1a0c..b9e966f7 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -1261,7 +1261,7 @@ function check_for_primary() { ;; hU | * ) node_full_status=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "$hdbState --sapcontrol=1" 2>/dev/null ) - [[ 
"$node_full_status" =~ "SAPCONTROL-OK: ".*"/mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && node_status=${BASH_REMATCH[1]} + [[ "$node_full_status" =~ "SAPCONTROL-OK: ".*(^|$'\n')"mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && node_status=${BASH_REMATCH[2]} super_ocf_log debug "DBG: check_for_primary: node_status=$node_status" ;; esac diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index 8c161137..ea7e534c 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -587,7 +587,7 @@ function sht_init() { hU | * ) # call hdbnsUtil (hU) ( also for unknown chkMethod ) # DONE: PRIO1: Beginning from SAP HANA rev 112.03 -sr_state is not longer supported hdbANSWER=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "$hdbState --sapcontrol=1" 2>/dev/null) - [[ "$hdbANSWER" =~ "SAPCONTROL-OK: ".*"/mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && srmode=${BASH_REMATCH[1]} + [[ "$hdbANSWER" =~ "SAPCONTROL-OK: ".*(^|$'\n')"mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && srmode=${BASH_REMATCH[2]} ;; esac case "$srmode" in @@ -602,12 +602,15 @@ function sht_init() { esac done # TODO PRIO3: Implement a file lookup, if we did not get a result - [[ "$hdbANSWER" =~ "SAPCONTROL-OK: ".*"/site_name="([^$'\n']*).*"SAPCONTROL-OK: " ]] && site=${BASH_REMATCH[1]} + # following pattern matching needs to work the output of getParameter.py and + # hdbnsutil as we do not know which of these commands succeeded in the loop + # above. + [[ "$hdbANSWER" =~ "SAPCONTROL-OK: ".*/?"site"."name="([^$'\n']*).*"SAPCONTROL-OK: " ]] && site=${BASH_REMATCH[1]} if [ -z "$srmode" ]; then - [[ "$hdbANSWER" =~ "SAPCONTROL-OK: ".*"/actual_mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && srmode=${BASH_REMATCH[1]} + [[ "$hdbANSWER" =~ "SAPCONTROL-OK: ".*/?"actual"."mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && srmode=${BASH_REMATCH[1]} # if 'actual_mode' is not available, fallback to 'mode' if [ -z "$srmode" ]; then - [[ "$hdbANSWER" =~ "SAPCONTROL-OK: ".*"/mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && srmode=${BASH_REMATCH[1]} + [[ "$hdbANSWER" =~ "SAPCONTROL-OK: ".*/?"mode="([^$'\n']*).*"SAPCONTROL-OK: " ]] && srmode=${BASH_REMATCH[1]} fi fi # From bc1087e3bea149d5298da2890157603a2d5571d1 Mon Sep 17 00:00:00 2001 From: Peter Pitterling Date: Fri, 25 Aug 2023 09:42:41 +0200 Subject: [PATCH 22/60] add Shell PID to HANA_CALL - this will be logged by HANA tracefiles and allows Mapping of the Calls from RA to HANA trace entries --- ra/SAPHana | 4 ++-- ra/SAPHanaTopology | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ra/SAPHana b/ra/SAPHana index b9e966f7..cd8ddc45 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -641,7 +641,7 @@ function HANA_CALL() local use_su=1 # Default to be changed later (see TODO above) local pre_cmd="" local cmd="" - local pre_script="" + local pre_script='' local output="" while [ $# -gt 0 ]; do case "$1" in @@ -655,7 +655,7 @@ function HANA_CALL() if [ $use_su -eq 1 ]; then pre_cmd="su - ${sid}adm -c" - [[ $cmd == python* ]] && pre_script="cd $DIR_EXECUTABLE/python_support" || pre_script='true' + [[ $cmd == python* ]] && pre_script=": [$$]; cd $DIR_EXECUTABLE/python_support" || pre_script=": [$$]" else # as root user we need the library path to the SAP kernel to be able to call sapcontrol # check, if we already added DIR_EXECUTABLE at the beginning of LD_LIBRARY_PATH diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index ea7e534c..2fb7388a 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -386,7 +386,7 @@ function HANA_CALL() local use_su=1 # Default to be changed later (see TODO above) local pre_cmd="" local 
cmd="" - local pre_script="" + local pre_script='' local output="" while [ $# -gt 0 ]; do case "$1" in @@ -400,7 +400,7 @@ function HANA_CALL() if [ $use_su -eq 1 ]; then pre_cmd="su - ${sid}adm -c" - [[ $cmd == python* ]] && pre_script="cd $DIR_EXECUTABLE/python_support" || pre_script='true' + [[ $cmd == python* ]] && pre_script=": [$$]; cd $DIR_EXECUTABLE/python_support" || pre_script=": [$$]" else # as root user we need the library path to the SAP kernel to be able to call sapcontrol # check, if we already added DIR_EXECUTABLE at the beginning of LD_LIBRARY_PATH From 90d6748bf9ed71e6b278d1c469dd2f54b720b939 Mon Sep 17 00:00:00 2001 From: Peter Pitterling Date: Fri, 25 Aug 2023 13:30:44 +0200 Subject: [PATCH 23/60] Runtime - remove timeE-timeB calculation as bash SECONDS is already passed time, remove variables --- ra/SAPHana | 6 +----- ra/SAPHanaTopology | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/ra/SAPHana b/ra/SAPHana index b9e966f7..5d62e579 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -44,8 +44,6 @@ SAPHanaVersion="0.162.1" # # Initialization: -timeB=${SECONDS} - : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs @@ -2976,9 +2974,7 @@ case "$ACTION" in ra_rc=$OCF_ERR_UNIMPLEMENTED ;; esac -timeE=${SECONDS} -(( timeR = timeE - timeB )) super_ocf_log debug "DBG: ==== SAPHanaFilter=$SAPHanaFilter" -super_ocf_log info "RA ==== end action $ACTION$CLACT with rc=${ra_rc} ($SAPHanaVersion) (${timeR}s)====" +super_ocf_log info "RA ==== end action $ACTION$CLACT with rc=${ra_rc} ($SAPHanaVersion) (${SECONDS}s)====" exit ${ra_rc} # set ts=4 sw=4 sts=4 et diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index ea7e534c..05433b49 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -30,8 +30,6 @@ SAPHanaTopologyVersion="0.162.1" # # Initialization: -timeB=${SECONDS} - : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs @@ -1255,7 +1253,5 @@ case "$ACTION" in ra_rc=$OCF_ERR_UNIMPLEMENTED ;; esac -timeE=${SECONDS} -(( timeR = timeE - timeB )) -super_ocf_log info "RA ==== end action $ACTION$CLACT with rc=${ra_rc} ($SAPHanaTopologyVersion) (${timeR}s)====" +super_ocf_log info "RA ==== end action $ACTION$CLACT with rc=${ra_rc} ($SAPHanaTopologyVersion) (${SECONDS}s)====" exit ${ra_rc} From 7491657905269ee3b17795c263ef93e703124bce Mon Sep 17 00:00:00 2001 From: Peter Pitterling Date: Fri, 25 Aug 2023 13:31:24 +0200 Subject: [PATCH 24/60] add runtime to begin action (as this is not really the beginning) - most time is spent within initialization of RA --- ra/SAPHana | 2 +- ra/SAPHanaTopology | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ra/SAPHana b/ra/SAPHana index 5d62e579..cd9d71d9 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -2951,7 +2951,7 @@ else fi # What kind of method was invoked? 
-super_ocf_log info "RA ==== begin action $ACTION$CLACT ($SAPHanaVersion) ====" +super_ocf_log info "RA ==== begin action $ACTION$CLACT ($SAPHanaVersion) (${SECONDS}s)====" ra_rc=$OCF_ERR_UNIMPLEMENTED case "$ACTION" in start|stop|monitor|promote|demote) # Standard controlling actions diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index 05433b49..01f48c3c 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -1234,7 +1234,7 @@ else fi fi -super_ocf_log info "RA ==== begin action $ACTION$CLACT ($SAPHanaTopologyVersion) ====" +super_ocf_log info "RA ==== begin action $ACTION$CLACT ($SAPHanaTopologyVersion) (${SECONDS}s)====" ra_rc=$OCF_ERR_UNIMPLEMENTED case "$ACTION" in start|stop|monitor) # Standard controlling actions From 48444cf1b5847f41503aae857faf7b0f8f3c4fb0 Mon Sep 17 00:00:00 2001 From: AngelaBriel Date: Fri, 25 Aug 2023 18:33:15 +0200 Subject: [PATCH 25/60] set RA version to 0.162.2 add package info for next release 0.162.2 --- SAPHanaSR.changes_12 | 11 +++++++++-- SAPHanaSR.changes_15 | 11 +++++++++-- SAPHanaSR.spec | 2 +- ra/SAPHana | 2 +- ra/SAPHanaTopology | 2 +- 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/SAPHanaSR.changes_12 b/SAPHanaSR.changes_12 index ab0e2318..42dd9fc5 100644 --- a/SAPHanaSR.changes_12 +++ b/SAPHanaSR.changes_12 @@ -1,12 +1,19 @@ ------------------------------------------------------------------- -Mon Aug 21 16:24:32 UTC 2023 - abriel@suse.com +Mon Aug 25 16:24:32 UTC 2023 - abriel@suse.com - Version bump to 0.162.2 + * improve supportability by providing the current process ID of + the RA, which is logged in the RA outputs, to HANA tracefiles + too. + This allows a mapping of the SAP related command invocations + from the RA and the HANA executions which might have a delay + in between. + (bsc#1214613) * avoid explicid and implicid usage of /tmp filesystem to keep the SAPHanaSR resource agents working even in situations with /tmp filesystem full. (bsc#1210728) - * update update man pages: + * update man pages: SAPHanaSR_basic_cluster.7 susCostOpt.py.7 SAPHanaSR_maintenance_examples.7 diff --git a/SAPHanaSR.changes_15 b/SAPHanaSR.changes_15 index c3a40e5e..f14c408f 100644 --- a/SAPHanaSR.changes_15 +++ b/SAPHanaSR.changes_15 @@ -1,12 +1,19 @@ ------------------------------------------------------------------- -Mon Aug 21 16:24:32 UTC 2023 - abriel@suse.com +Mon Aug 25 16:24:32 UTC 2023 - abriel@suse.com - Version bump to 0.162.2 + * improve supportability by providing the current process ID of + the RA, which is logged in the RA outputs, to HANA tracefiles + too. + This allows a mapping of the SAP related command invocations + from the RA and the HANA executions which might have a delay + in between. + (bsc#1214613) * avoid explicid and implicid usage of /tmp filesystem to keep the SAPHanaSR resource agents working even in situations with /tmp filesystem full. 
(bsc#1210728) - * update update man pages: + * update man pages: SAPHanaSR_basic_cluster.7 susCostOpt.py.7 SAPHanaSR_maintenance_examples.7 diff --git a/SAPHanaSR.spec b/SAPHanaSR.spec index 345bef13..b96817b3 100644 --- a/SAPHanaSR.spec +++ b/SAPHanaSR.spec @@ -23,7 +23,7 @@ License: GPL-2.0 Group: Productivity/Clustering/HA AutoReqProv: on Summary: Resource agents to control the HANA database in system replication setup -Version: 0.162.1 +Version: 0.162.2 Release: 0 Url: http://scn.sap.com/community/hana-in-memory/blog/2014/04/04/fail-safe-operation-of-sap-hana-suse-extends-its-high-availability-solution diff --git a/ra/SAPHana b/ra/SAPHana index cd8ddc45..040cd2cb 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -41,7 +41,7 @@ # systemReplicationStatus.py (>= SPS090) # ####################################################################### -SAPHanaVersion="0.162.1" +SAPHanaVersion="0.162.2" # # Initialization: timeB=${SECONDS} diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index 2fb7388a..fb3fa6c3 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -27,7 +27,7 @@ # ####################################################################### # DONE PRIO 1: AFTER(!) SAP HANA SPS12 is available we could use hdbnsutil --sr_stateConfiguration -SAPHanaTopologyVersion="0.162.1" +SAPHanaTopologyVersion="0.162.2" # # Initialization: timeB=${SECONDS} From 1a8e4902a476582bc08b541cdf7aa814c37bc4ba Mon Sep 17 00:00:00 2001 From: lpinne Date: Tue, 19 Sep 2023 11:18:30 +0200 Subject: [PATCH 26/60] SAPHanaSR.7: mentioned /tmp space and NSE support --- man/SAPHanaSR.7 | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/man/SAPHanaSR.7 b/man/SAPHanaSR.7 index bf43c18e..1043d9aa 100644 --- a/man/SAPHanaSR.7 +++ b/man/SAPHanaSR.7 @@ -288,6 +288,15 @@ memory can be used, as long as they are transparent to SUSE HA. 22. The SAP HANA site name is from 2 up to 32 characters long. It starts with a character or number. Subsequent characters may contain dash and underscore. .PP +23. The SAPHanaController RA, the SUSE HA cluster and several SAP components +need read/write access and sufficient space in the Linux /tmp filesystem. +.PP +24. SAP HANA Native Storage Extension (NSE) is supported. +Important is that this feature does not change the HANA topology or interfaces. +In opposite to Native Storage Extension, the HANA Extension Nodes are changing +the topology and thus currently are not supported. +Please refer to SAP documentation for details. +.PP .\" .SH BUGS .\" TODO From 9d1e3a80bb9bdcb23ae86a799751d4712bd02502 Mon Sep 17 00:00:00 2001 From: lpinne Date: Mon, 25 Sep 2023 17:25:48 +0200 Subject: [PATCH 27/60] SAPHanaSR_maintenance_examples.7: whith -> with --- man/SAPHanaSR_maintenance_examples.7 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/man/SAPHanaSR_maintenance_examples.7 b/man/SAPHanaSR_maintenance_examples.7 index 6f44c5b9..f9505aea 100644 --- a/man/SAPHanaSR_maintenance_examples.7 +++ b/man/SAPHanaSR_maintenance_examples.7 @@ -467,7 +467,7 @@ This procedure can be used to update RAs, HANA HADR provider hook scripts and re \fB*\fR Remove left-over maintenance attribute from overall Linux cluster. This could be done to avoid confusion caused by different maintenance procedures. -See above overview on maintenance procedures whith running Linux cluster. +See above overview on maintenance procedures with running Linux cluster. Before doing so, check for cluster attribute maintenance-mode="false". 
.PP .RS 4 @@ -483,7 +483,7 @@ Before doing so, check for cluster attribute maintenance-mode="false". \fB*\fR Remove left-over standby attribute from Linux cluster nodes. This could be done to avoid confusion caused by different maintenance procedures. -See above overview on maintenance procedures whith running Linux cluster. +See above overview on maintenance procedures with running Linux cluster. Before doing so for all nodes, check for node attribute standby="off" on all nodes. .PP .RS 4 @@ -499,7 +499,7 @@ Before doing so for all nodes, check for node attribute standby="off" on all nod \fB*\fR Remove left-over maintenance attribute from resource. This should usually not be needed. -See above overview on maintenance procedures whith running Linux cluster. +See above overview on maintenance procedures with running Linux cluster. .PP .RS 4 # SAPHanaSR-showAttr From 5834c25a1053681073492965bed860d726b8a18c Mon Sep 17 00:00:00 2001 From: lpinne Date: Mon, 25 Sep 2023 17:30:30 +0200 Subject: [PATCH 28/60] SAPHanaSR_maintenance_examples.7 SAPHanaSR-ScaleOut_basic_cluster.7: whith -> with --- man/SAPHanaSR_basic_cluster.7 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/SAPHanaSR_basic_cluster.7 b/man/SAPHanaSR_basic_cluster.7 index ee2b50e7..d6e3bcf0 100644 --- a/man/SAPHanaSR_basic_cluster.7 +++ b/man/SAPHanaSR_basic_cluster.7 @@ -22,7 +22,7 @@ configurations might match specific needs. The crm basic parameter default-resource-stickiness defines the 'stickiness' score a resource gets on the node where it is currently running. This prevents -the cluster from moving resources around whithout an urgent need during a +the cluster from moving resources around without an urgent need during a cluster transition. The correct value depends on number of resources, colocation rules and resource groups. Particularly additional groups colocated to the HANA primary master resource can affect cluster decisions. From 42a02bd2addceee3ced8faadad4d8f2295f8e6ae Mon Sep 17 00:00:00 2001 From: lpinne Date: Wed, 4 Oct 2023 11:30:19 +0200 Subject: [PATCH 29/60] ocf_suse_SAPHanaTopology.7 ocf_suse_SAPHana.7 SAPHanaSR-showAttr.8: rc details --- man/SAPHanaSR-showAttr.8 | 10 ++++++---- man/ocf_suse_SAPHana.7 | 6 +++--- man/ocf_suse_SAPHanaTopology.7 | 11 +++++------ 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/man/SAPHanaSR-showAttr.8 b/man/SAPHanaSR-showAttr.8 index b13cff57..32d3ead3 100644 --- a/man/SAPHanaSR-showAttr.8 +++ b/man/SAPHanaSR-showAttr.8 @@ -1,6 +1,6 @@ .\" Version: 0.160.1 .\" -.TH SAPHanaSR-showAttr 8 "11 Nov 2022" "" "SAPHanaSR" +.TH SAPHanaSR-showAttr 8 "04 Oct 2023" "" "SAPHanaSR" .\" .SH NAME SAPHanaSR-showAttr \- Shows Linux cluster attributes for SAP HANA system replication. @@ -303,12 +303,14 @@ Value: [ 4 | 3 | 2 | 1 | 0 ] This field contains the return code of landscapHostConfiguration.py. The parameter does not tell you if the secondary system is ready for a takeover. The meaning is different from common Linux return codes. +The SAPHanaSR and SAPHanaSR-ScaleOut RAs will interpret return code 1 as +NOT-RUNNING (or ERROR) and return codes 2+3+4 as RUNNING. .br 4 = OK - Everything looks perfect on the HANA primary. .br -3 = WARNING - A HANA Host Auto-Failover is taking place. +3 = INFO - The landscape is completely functional, but the actual host role differs from the configured role. .br -2 = INFO - The landscape is completely functional, but the actual role of the host differs from the configured role. 
+2 = WARNING - An internal HANA action is ongoing, e.g. host auto-failover. .br 1 = DOWN - There are not enough active hosts. .br @@ -577,7 +579,7 @@ F.Herschel, L.Pinne. .br (c) 2015-2017 SUSE Linux GmbH, Germany. .br -(c) 2018-2022 SUSE LLC +(c) 2018-2023 SUSE LLC .br SAPHanaSR-showAttr comes with ABSOLUTELY NO WARRANTY. .br diff --git a/man/ocf_suse_SAPHana.7 b/man/ocf_suse_SAPHana.7 index 8cb8d2ae..30eb83e0 100644 --- a/man/ocf_suse_SAPHana.7 +++ b/man/ocf_suse_SAPHana.7 @@ -1,6 +1,6 @@ .\" Version: 0.160.1 .\" -.TH ocf_suse_SAPHana 7 "27 Jun 2022" "" "OCF resource agents" +.TH ocf_suse_SAPHana 7 "04 Oct 2023" "" "OCF resource agents" .\" .SH NAME SAPHana \- Manages takeover between two SAP HANA databases with system replication. @@ -46,7 +46,7 @@ landscapeHostConfiguration.py has some detailed output about HANA system status and node roles. For our monitor the overall status is relevant. This overall status is reported by the return code of the script: 0: Internal Fatal, 1: ERROR, 2: WARNING, 3: INFO, 4: OK -The SAPHana resource agent will interpret return code 0 as FATAL, 1 as not-running +The SAPHana resource agent will interpret return code 0 as FATAL, 1 as NOT-RUNNING (or ERROR) and return codes 2+3+4 as RUNNING. .PP 3. \fBhdbnsutil\fR @@ -612,7 +612,7 @@ F.Herschel, L.Pinne. .br (c) 2015-2018 SUSE Linux GmbH, Germany. .br -(c) 2019-2022 SUSE LLC +(c) 2019-2023 SUSE LLC .br The resource agent SAPHana comes with ABSOLUTELY NO WARRANTY. .br diff --git a/man/ocf_suse_SAPHanaTopology.7 b/man/ocf_suse_SAPHanaTopology.7 index 30597c54..63dc9c19 100644 --- a/man/ocf_suse_SAPHanaTopology.7 +++ b/man/ocf_suse_SAPHanaTopology.7 @@ -1,6 +1,6 @@ .\" Version: 0.180.0 .\" -.TH ocf_suse_SAPHanaTopology 7 "30 Jun 2022" "" "OCF resource agents" +.TH ocf_suse_SAPHanaTopology 7 "04 Oct 2023" "" "OCF resource agents" .\" .SH NAME SAPHanaTopology \- Helps to manage two SAP HANA databases with system replication. @@ -23,11 +23,10 @@ The resource agent uses the following interfaces provided by SAP: landscapeHostConfiguration.py has some detailed output about HANA system status and node roles. For our monitor the overall status is relevant. This overall status is reported by the return code of the script: -0: Internal Fatal 1: ERROR 2: WARNING 3: INFO (maybe a switch of the resource -running) 4: OK +0: Internal Fatal, 1: ERROR, 2: WARNING, 3: INFO (e.g. host auto-failover happened), 4: OK .br -The SAPHanaTopology resource agent will interpret return codes 1 as -NOT-RUNNING (or 1 failure) and return codes 2+3+4 as RUNNING. +The SAPHanaTopology resource agent will interpret return code 1 as +NOT-RUNNING (or ERROR) and return codes 2+3+4 as RUNNING. SAPHanaTopology scans the output table of landscapeHostConfiguration.py to identify the roles of the cluster node. Roles means configured and current role of the nameserver as well as the indexserver. @@ -268,7 +267,7 @@ F.Herschel, L.Pinne. .br (c) 2015-2017 SUSE Linux GmbH, Germany. .br -(c) 2018-2022 SUSE LLC +(c) 2018-2023 SUSE LLC .br SAPHanaTopology comes with ABSOLUTELY NO WARRANTY. 
.br From 5e69ea7b9aabacae97450cae8140395094bd9e10 Mon Sep 17 00:00:00 2001 From: Fabian Herschel Date: Thu, 12 Oct 2023 16:25:05 +0200 Subject: [PATCH 30/60] SAPHanaSR.py: As SAPHanaSR-ScaleOut handle pending fallback file during new events (bsc1215693) --- srHook/SAPHanaSR.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/srHook/SAPHanaSR.py b/srHook/SAPHanaSR.py index 498a6498..2c8fbebc 100755 --- a/srHook/SAPHanaSR.py +++ b/srHook/SAPHanaSR.py @@ -30,7 +30,7 @@ class SAPHanaSR(HADRBase): def __init__(self, *args, **kwargs): # delegate construction to base class super(SAPHanaSR, self).__init__(*args, **kwargs) - self.tracer.info("SAPHanaSR init()") + self.tracer.info("SAPHanaSR init()") def about(self): return {"provider_company": "SUSE", @@ -72,7 +72,16 @@ def srConnectionChanged(self, ParamDict, **kwargs): rc = os.system(myCMD) myMSG = "CALLING CRM: <{0}> rc={1}".format(myCMD, rc) self.tracer.info("{0}.{1}() {2}\n".format(self.__class__.__name__, method, myMSG)) - if rc != 0: + fallback_file_name = "../.crm_attribute.{0}".format(mySite) + fallback_stage_file_name = "../.crm_attribute.stage.{0}".format(mySite) + if rc == 0: + # cluster attribute set was successfull - delete pending fallback file, if existing + try: + os.remove(fallback_file_name) + self.tracer.info("new event - pending fallback file {0} deleted".format(fallback_file_name)) + except FileNotFoundError: + pass + else: # # FALLBACK # sending attribute to the cluster failed - using fallback method and write status to a file - RA to pick-up the value during next SAPHanaController monitor operation @@ -83,14 +92,13 @@ def srConnectionChanged(self, ParamDict, **kwargs): # cwd of hana is /hana/shared//HDB00/ we use a relative path to cwd this gives us a adm permitted directory # however we go one level up (..) to have the file accessible for all SAP HANA swarm nodes # - fallbackFileObject = open("../.crm_attribute.stage.{0}".format(mySite), "w") - fallbackFileObject.write("hana_{0}_site_srHook_{1} = {2}".format(mysid, mySite, mySRS)) - fallbackFileObject.close() + with open(fallback_stage_file_name, "w", encoding='utf-8') as fallbackFileObject: + fallbackFileObject.write("hana_{0}_site_srHook_{1} = {2}".format(mysid, mySite, mySRS)) # # release the stage file to the original name (move is used to be atomic) # .crm_attribute.stage. is renamed to .crm_attribute. # - os.rename("../.crm_attribute.stage.{0}".format(mySite), "../.crm_attribute.{0}".format(mySite)) + os.rename(fallback_stage_file_name, fallback_file_name) return 0 except NameError as e: print("Could not find base class ({0})".format(e)) From 36793813c897570e66c5fad9d734e634debfbda5 Mon Sep 17 00:00:00 2001 From: lpinne Date: Wed, 1 Nov 2023 08:37:39 +0100 Subject: [PATCH 31/60] SAPHanaSR.7: requirements --- man/SAPHanaSR.7 | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/man/SAPHanaSR.7 b/man/SAPHanaSR.7 index 1043d9aa..8b8551c7 100644 --- a/man/SAPHanaSR.7 +++ b/man/SAPHanaSR.7 @@ -200,7 +200,8 @@ Linux system. If users are resolved by remote service, local caching is neccessary. Substitute user (su) to sidadm needs to work reliable and without customized actions or messages. Supported shell is bash. .PP -3. Strict time synchronization between the cluster nodes, e.g. NTP. +3. Strict time synchronization between the cluster nodes, e.g. NTP. All nodes of +a cluster have configured the same timezone. .PP 4. 
For scale-up the following SAP HANA SR scenarios are supported with the SAPHanaSR package: @@ -219,7 +220,7 @@ SAPHanaSR package: .br Note: For MCOS, there must be no constraints between HANA SR pairs. .PP -5. Only one system replication between the two SAP HANA database in the Linux cluster. +5. Only one system replication between the two SAP HANA databases in the Linux cluster. Maximum one system replication to an HANA database outside the Linux cluster. See also item 12 below. .PP @@ -263,7 +264,8 @@ being written into CIB attributes. The current HANA SR status might differ from CIB srHook attribute after cluster maintenance. .PP 15. Once an HANA system replication site is known to the Linux cluster, that -exact site name has to be used whenever the site is registered manually. +exact site name has to be used whenever the site is registered manually. At any +time only one site is configured as primary replication source. .PP 16. Reliable access to the /hana/shared/ filesystem is crucial for HANA and the Linux cluster. From 36225d83a10612901a2a65f32b6c181c0ff656e8 Mon Sep 17 00:00:00 2001 From: lpinne Date: Wed, 1 Nov 2023 08:50:21 +0100 Subject: [PATCH 32/60] SAPHanaSR.7: requirements --- man/SAPHanaSR.7 | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/man/SAPHanaSR.7 b/man/SAPHanaSR.7 index 8b8551c7..f0495d70 100644 --- a/man/SAPHanaSR.7 +++ b/man/SAPHanaSR.7 @@ -201,7 +201,7 @@ neccessary. Substitute user (su) to sidadm needs to work reliable and without customized actions or messages. Supported shell is bash. .PP 3. Strict time synchronization between the cluster nodes, e.g. NTP. All nodes of -a cluster have configured the same timezone. +the Linux cluster have configured the same timezone. .PP 4. For scale-up the following SAP HANA SR scenarios are supported with the SAPHanaSR package: @@ -226,7 +226,8 @@ See also item 12 below. .PP 6. The replication mode is either sync or syncmem for the controlled replication. Replication mode async is not supported. The operation modes delta_datashipping, -logreplay and logreplay_readaccess are supported. +logreplay and logreplay_readaccess are supported. The operation mode logreplay +is default. .PP 7. Both SAP HANA database systems have the same SAP Identifier (SID) and Instance Number (INO). From bf1a7847214599b8c9f73eb1d1e33603a2d90795 Mon Sep 17 00:00:00 2001 From: lpinne Date: Wed, 1 Nov 2023 09:13:58 +0100 Subject: [PATCH 33/60] SAPHanaSR.7: requirements --- man/SAPHanaSR.7 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/man/SAPHanaSR.7 b/man/SAPHanaSR.7 index f0495d70..567a6d67 100644 --- a/man/SAPHanaSR.7 +++ b/man/SAPHanaSR.7 @@ -1,6 +1,6 @@ .\" Version: 0.160.1 .\" -.TH SAPHanaSR 7 "22 Jun 2022" "" "SAPHanaSR" +.TH SAPHanaSR 7 "31 Oct 2023" "" "SAPHanaSR" .\" .SH NAME SAPHanaSR \- Tools for automating SAP HANA system replication in scale-up setups. @@ -181,7 +181,7 @@ left-over migration constraints, and resource failures as well as the HANA landscape status, and the HANA SR status. .PP \fB*\fR Manually activating an HANA primary creates risk of a dual-primary situation. -The user is responsible for data integrity. +The user is responsible for data integrity. See also susTkOver.py(7). .PP .\" .SH REQUIREMENTS @@ -241,7 +241,7 @@ However, all nodes in one Linux cluster have to use the same style. 9. Automated start of SAP HANA database systems during system boot must be switched off. .PP -10. The RA's monitoring operations have to be active. +10. 
The RAs' monitoring operations have to be active. .PP 11. Using HADR provider hook for srConnectionChanged() by enabling SAPHanaSR.py is strongly recommended. This might become mandatory in furture versions. @@ -348,7 +348,7 @@ A.Briel, F.Herschel, L.Pinne. .SH COPYRIGHT (c) 2015-2018 SUSE Linux GmbH, Germany. .br -(c) 2019-2022 SUSE LLC +(c) 2019-2023 SUSE LLC .br The package SAPHanaSR comes with ABSOLUTELY NO WARRANTY. .br From f2dfda1b179cb1f6f870268c248bcecb56af6e3c Mon Sep 17 00:00:00 2001 From: lpinne Date: Wed, 1 Nov 2023 10:04:08 +0100 Subject: [PATCH 34/60] SAPHanaSR_maintenance_examples.7: requirements, typos --- man/SAPHanaSR_maintenance_examples.7 | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/man/SAPHanaSR_maintenance_examples.7 b/man/SAPHanaSR_maintenance_examples.7 index f9505aea..63c74ba4 100644 --- a/man/SAPHanaSR_maintenance_examples.7 +++ b/man/SAPHanaSR_maintenance_examples.7 @@ -1,6 +1,6 @@ .\" Version: 0.160.1 .\" -.TH SAPHanaSR_maintenance_examples 7 "12 Jun 2023" "" "SAPHanaSR" +.TH SAPHanaSR_maintenance_examples 7 "31 Oct 2023" "" "SAPHanaSR" .\" .SH NAME SAPHanaSR_maintenance_examples \- maintenance examples for SAPHana and SAPHanaController. @@ -583,12 +583,17 @@ landscape status, and the HANA SR status. .PP \fB*\fR Maintenance attributes for cluster, nodes and resources must not be mixed. .PP -\fB*\fR The Linux cluster needs to be up and running to allow HA/DR provider events being written into CIB attributes. -The current HANA SR status might differ from CIB srHook attribute after Linux cluster maintenance. +\fB*\fR The Linux cluster needs to be up and running to allow HA/DR provider events +being written into CIB attributes. The current HANA SR status might differ from CIB +srHook attribute after Linux cluster maintenance. .PP -\fB*\fR Manually activating an HANA primary, like start of HANA primary or takeover outside -the cluster creates risk of a duplicate-primary situation. The user is responsible for data -integrity, particularly when activating an HANA primary. +\fB*\fR Manually activating an HANA primary, like start of HANA primary or takeover +outside the cluster creates risk of a duplicate-primary situation. The user is +responsible for data integrity, particularly when activating an HANA primary. See +also susTkOver.py(7). +.PP +\fB*\fR HANA site names are discovered automatically when the RAs are activated the +very first time. That exact site names have to be used later for all manual tasks. .PP .\" .SH BUGS From f9cc97dfdd4f547fe767f31a88f982e13e225df8 Mon Sep 17 00:00:00 2001 From: lpinne Date: Wed, 1 Nov 2023 14:06:38 +0100 Subject: [PATCH 35/60] SAPHanaSR_maintenance_examples.7: bug 1216671, misc. examples --- man/SAPHanaSR_maintenance_examples.7 | 45 +++++++++++++++++----------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/man/SAPHanaSR_maintenance_examples.7 b/man/SAPHanaSR_maintenance_examples.7 index 63c74ba4..cfb39ee3 100644 --- a/man/SAPHanaSR_maintenance_examples.7 +++ b/man/SAPHanaSR_maintenance_examples.7 @@ -41,7 +41,7 @@ REQUIREMENTS below. This might be convenient when performing administrative actions or cluster tests. It does not replace the afore mentioned checks. See also cs_show_saphanasr_status(8). 
.PP .RS 4 -# watch -n 9 "(crm_mon -1r;SAPHanaSR-showAttr;cs_clusterstate -i)|egrep -v'(^$|configured|###)'" +# watch -n9 "crm_mon -1r --include=none,nodes,resources,failures;echo;SAPHanaSR-showAttr;cs_clusterstate -i|grep -v '#'" .RE .PP \fB*\fR Initiate an administrative takeover of the HANA primary from one node to the other by using the Linux cluster. @@ -357,48 +357,59 @@ before proceeding with the new procedure for pacemaker-2.0. .PP \fB*\fR Overview on maintenance procedure for Linux, HANA remains running, on pacemaker-2.0. +It is necessary to wait for each step to complete and to check the result. It +also is necessary to test and document the whole procedure before applying in production. See also section REQUIREMENTS below and example on checking status of HANA and cluster above. -.\" TODO details .PP .RS 2 1. Check status of Linux cluster and HANA, see above. .br -2. Set the Linux cluster into maintenance mode, on either node. +2. Set HANA multistate resource into maintenance mode. .RE .RS 4 -# crm maintenance on +# crm resource maintenance msl_... on .RE .RS 2 -3. Stop Linux Cluster on all nodes. Make sure to do that on all nodes. +3. Set the Linux cluster into maintenance mode, on either node. .RE -.RS 3 -# crm cluster stop +.RS 4 +# crm maintenance on .RE .RS 2 -4. Perform Linux maintenance. -.br -5. Start Linux cluster on all nodes. Make sure to do that on all nodes. +4. Stop Linux Cluster on all nodes. Make sure to do that on all nodes. .RE .RS 4 -# crm cluster start +# crm cluster run "crm cluster stop" .RE .RS 2 -6. Let Linux cluster detect status of HANA resource, on either node. +.PP +5. Perform Linux maintenance. +.PP +6. Start Linux cluster on all nodes. Make sure to do that on all nodes. .RE .RS 4 -# crm resource refresh cln_... -.br -# crm resource refresh msl_... +# crm cluster run "crm cluster start" .RE .RS 2 7. Set cluster ready for operations, on either node. .RE .RS 4 # crm maintenance off -.\" TODO delete property, optional? .RE .RS 2 -8. Check status of Linux cluster and HANA, see above. +8. Let Linux cluster detect status of HANA multistate resource, on either node. +.RE +.RS 4 +# crm resource refresh msl_... +.RE +.RS 2 +9. Set HANA multistate resource ready for operations, on either node. +.RE +.RS 4 +# crm maintenance msl_... off +.RE +.RS 2 +10. Check status of Linux cluster and HANA, see above. .RE .PP \fB*\fR Overview on simple procedure for stopping and temporarily disabling the Linux cluster, From 9c7f0354d1552d5cde6db71e9fea0198dc6e7666 Mon Sep 17 00:00:00 2001 From: lpinne Date: Wed, 1 Nov 2023 15:00:47 +0100 Subject: [PATCH 36/60] ocf_suse_SAPHanaTopology.7: examples, typos --- man/ocf_suse_SAPHanaTopology.7 | 36 +++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/man/ocf_suse_SAPHanaTopology.7 b/man/ocf_suse_SAPHanaTopology.7 index 63dc9c19..b1b362b8 100644 --- a/man/ocf_suse_SAPHanaTopology.7 +++ b/man/ocf_suse_SAPHanaTopology.7 @@ -1,6 +1,6 @@ .\" Version: 0.180.0 .\" -.TH ocf_suse_SAPHanaTopology 7 "04 Oct 2023" "" "OCF resource agents" +.TH ocf_suse_SAPHanaTopology 7 "31 Oct 2023" "" "OCF resource agents" .\" .SH NAME SAPHanaTopology \- Helps to manage two SAP HANA databases with system replication. @@ -160,7 +160,7 @@ Please refer to the OCF definition on the website mentioned below. .PP .\" .SH EXAMPLES -* This is an example configuration for a SAPHanaTopology resource for HANA scale-up. +* Example configuration of SAPHanaTopology resource for HANA scale-up. 
.br In addition, a SAPHana resource is needed to make this work. .RE @@ -181,7 +181,7 @@ clone cln_SAPHanaTop_SLE_HDB00 rsc_SAPHanaTop_SLE_HDB00 \\ notify="true" interleave="true" clone-node-max="1" .RE .PP -* Below is an example configuration for a SAPHanaTopology resource for HANA scale-out. +* Example configuration of SAPHanaTopology resource for HANA scale-out. .br The HANA consists of two sites with five nodes each. An additional cluster node is used as majority maker for split brain situations. @@ -208,6 +208,36 @@ location SAPHanaTop_not_on_majority_maker cln_SAPHanaTop_HAE_HDB00 -inf: vm-majo order SAPHanaTop_first Optional: cln_SAPHanaTop_SLE_HDB00 msl_SAPHC_SLE_HDB00 .RE .PP +* Showing the current SAPHanaTopology resource configuration on scale-up. +.br +The primitive is "rsc_SAPHanaTop_SLE_HDB00" and clone is "cln_SAPHanaTop_SLE_HDB00". +The constraints´ names are starting with "SAPHanaTop". +.RE +.PP +.RS 4 +# crm configure show | grep SAPHanaTop +.br +# crm configure show rsc_SAPHanaTop_SLE_HDB00 +.br +# crm configure show cln_SAPHanaTop_SLE_HDB00 +.br +# crm configure show SAPHanaTop_first +.RE +.PP +* Correlating SAPHanaTopology syslog entry with HANA nameserver trace entry. +.br +.RS 4 +# grep "SAPHanaTop.*16933.*monitor_clone" /var/log/messages | awk '{print $1,$3,$7,$11,$12,$13}' +.br +# su - hxeadm +.br +~> cdtrace +.br +~> grep "2023-11-01.*parent.*16933" nameserver_*trc | awk '{print $2,$3,$22,$23,$24,$25}' +.br +~> exit +.RE +.PP .\" .SH FILES .TP From fd92dc024205290d2d7cf0550cf5a4124f6af7aa Mon Sep 17 00:00:00 2001 From: Fabian Herschel Date: Thu, 2 Nov 2023 12:17:47 +0100 Subject: [PATCH 37/60] bsc1216484: test/SAPHanaSR-hookHelper - use full path for cibadmin to also support non root users in special user environments --- test/SAPHanaSR-hookHelper | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/SAPHanaSR-hookHelper b/test/SAPHanaSR-hookHelper index 3bffd453..838d5de7 100755 --- a/test/SAPHanaSR-hookHelper +++ b/test/SAPHanaSR-hookHelper @@ -195,7 +195,7 @@ case "$USECASE" in # query CIB and write content to a temporary file to limit the # cluster calls to a minimum. cibtmp=$(mktemp /tmp/SAPHanaSR_SRHhelper.XXXXXX) - cibadmin -Q > "$cibtmp"; rc=$? + /usr/sbin/cibadmin -Q > "$cibtmp"; rc=$? if [ "$rc" != 0 ]; then case "$rc" in 102) From 706322757bbe658bfe351bc800fda13a99de952b Mon Sep 17 00:00:00 2001 From: lpinne Date: Fri, 3 Nov 2023 09:43:18 +0100 Subject: [PATCH 38/60] /SAPHanaSR.py.7: exmaple test --- man/SAPHanaSR.py.7 | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/man/SAPHanaSR.py.7 b/man/SAPHanaSR.py.7 index 7826d94a..f2a89353 100644 --- a/man/SAPHanaSR.py.7 +++ b/man/SAPHanaSR.py.7 @@ -1,6 +1,6 @@ .\" Version: 0.160.1 .\" -.TH SAPHanaSR.py 7 "02 Nov 2022" "" "SAPHanaSR" +.TH SAPHanaSR.py 7 "03 Nov 2023" "" "SAPHanaSR" .\" .SH NAME SAPHanaSR.py \- Provider for SAP HANA srHook method srConnectionChanged(). @@ -213,6 +213,46 @@ SID is HA1. # SAPHanaSR-showAttr .RE .PP +\fB*\fR Example for temporarily blocking HANA system replication. +.br +This could be done for testing the HA/DR provider hook script integration. +Blocking the HANA system replication is dangerous. This test should not be done +on production systems. +Instance number is 00, the resulting network ports are depending on the HANA +setup. Please refer to SAP HANA documentation. See also manual page iptables(8). +.br +Note: Understand the impact before trying. +.PP +1. 
Check HANA and Linux cluster for clean idle state. +.PP +2. On secondary, check for used network ports. +.RS 2 +# ss -tulpan | grep hdb.*server | grep -v 127.0.0.1 |\\ +.br +grep -v "\*" | awk '{print $6}' | awk -F: '{print $2}' +.RE +.PP +3. On secondary, block HANA system replication, example ports are 4000[123]. +.RS 2 +# iptables -I INPUT -p tcp -m multiport --ports 40001,40002,40003 -j ACCEPT +.RE +Note: The ACCEPT needs to be replaced by appropriate action. +.PP +4. Check the nameserver tracefile for srConnectionChanged() events, check the +system log for crm_attribute calls, check the Linux cluster attributes for srHook +status. +.PP +5. On secondary, unblock HANA system replication. +.RS 2 +# iptables -D INPUT -p tcp -m multiport --ports 40001,40002,40003 -j DROP +.RE +.PP +6. Check the nameserver tracefile for srConnectionChanged() events, check the +system log for crm_attribute calls, check the Linux cluster attributes for srHook +status. +.PP +7. Check HANA and Linux cluster for clean idle state. +.PP .\" .SH FILES .TP @@ -269,6 +309,7 @@ Please report any other feedback and suggestions to feedback@suse.com. \fBSAPHanaSR-monitor\fP(8) , \fBSAPHanaSR-showAttr\fP(8) , \fBSAPHanaSR-manageProvider\fP(8) , \fBSAPHanaSR-hookHelper\fP(8) , \fBcrm_attribute\fP(8) , \fBsudo\fP(8) , \fBsudoers\fP(5), \fBpython\fP(8) , +\fBitables\fP(8) , \fBss\fP(8) , .br https://help.sap.com/docs/SAP_HANA_PLATFORM?locale=en-US .br @@ -282,7 +323,7 @@ F.Herschel, L.Pinne. .SH COPYRIGHT (c) 2015-2018 SUSE Linux GmbH, Germany. .br -(c) 2019-2022 SUSE LLC +(c) 2019-2023 SUSE LLC .br SAPHanaSR.py comes with ABSOLUTELY NO WARRANTY. .br From b5a88e9c422e9a77982b51c0699e5ca8a5cb466d Mon Sep 17 00:00:00 2001 From: lpinne Date: Fri, 3 Nov 2023 10:42:32 +0100 Subject: [PATCH 39/60] /SAPHanaSR.py.7: typo --- man/SAPHanaSR.py.7 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/SAPHanaSR.py.7 b/man/SAPHanaSR.py.7 index f2a89353..a168fce4 100644 --- a/man/SAPHanaSR.py.7 +++ b/man/SAPHanaSR.py.7 @@ -309,7 +309,7 @@ Please report any other feedback and suggestions to feedback@suse.com. 
\fBSAPHanaSR-monitor\fP(8) , \fBSAPHanaSR-showAttr\fP(8) , \fBSAPHanaSR-manageProvider\fP(8) , \fBSAPHanaSR-hookHelper\fP(8) , \fBcrm_attribute\fP(8) , \fBsudo\fP(8) , \fBsudoers\fP(5), \fBpython\fP(8) , -\fBitables\fP(8) , \fBss\fP(8) , +\fBiptables\fP(8) , \fBss\fP(8) , .br https://help.sap.com/docs/SAP_HANA_PLATFORM?locale=en-US .br From 1c88daa8b3e559da52d350d57b5b5a7eca31e71d Mon Sep 17 00:00:00 2001 From: AngelaBriel Date: Fri, 3 Nov 2023 13:29:22 +0100 Subject: [PATCH 40/60] changelog update for next version 0.162.2 add additional logging for landscapeHostConfiguration.py timeout to SAPHana RA --- SAPHanaSR.changes_12 | 16 ++++++++++++++-- SAPHanaSR.changes_15 | 16 ++++++++++++++-- ra/SAPHana | 2 ++ 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/SAPHanaSR.changes_12 b/SAPHanaSR.changes_12 index 42dd9fc5..d292c5d4 100644 --- a/SAPHanaSR.changes_12 +++ b/SAPHanaSR.changes_12 @@ -1,7 +1,15 @@ ------------------------------------------------------------------- -Mon Aug 25 16:24:32 UTC 2023 - abriel@suse.com +Thu Nov 2 17:50:47 UTC 2023 - abriel@suse.com - Version bump to 0.162.2 + * inside SAPHanaSR-hookHelper use the full path for the cibadmin + command to support non root users in special user environments + (bsc#1216484) + * if the SAPHanaSR.py hook has successfully reported a SR event + to the cluster a still existing fall-back state file will be + removed to prevent an override of an already reported + SR state. + (bsc#1215693) * improve supportability by providing the current process ID of the RA, which is logged in the RA outputs, to HANA tracefiles too. @@ -14,10 +22,14 @@ Mon Aug 25 16:24:32 UTC 2023 - abriel@suse.com /tmp filesystem full. (bsc#1210728) * update man pages: + SAPHanaSR.7 SAPHanaSR_basic_cluster.7 - susCostOpt.py.7 SAPHanaSR_maintenance_examples.7 + ocf_suse_SAPHana.7 + ocf_suse_SAPHanaTopology.7 + susCostOpt.py.7 SAPHanaSR-monitor.8 + SAPHanaSR-showAttr.8 * add improvements from SAP to the RA scripts, part II (jsc#PED-1739, jsc#PED-2608) diff --git a/SAPHanaSR.changes_15 b/SAPHanaSR.changes_15 index f14c408f..2f6813cb 100644 --- a/SAPHanaSR.changes_15 +++ b/SAPHanaSR.changes_15 @@ -1,7 +1,15 @@ ------------------------------------------------------------------- -Mon Aug 25 16:24:32 UTC 2023 - abriel@suse.com +Thu Nov 2 17:49:47 UTC 2023 - abriel@suse.com - Version bump to 0.162.2 + * inside SAPHanaSR-hookHelper use the full path for the cibadmin + command to support non root users in special user environments + (bsc#1216484) + * if the SAPHanaSR.py hook has successfully reported a SR event + to the cluster a still existing fall-back state file will be + removed to prevent an override of an already reported + SR state. + (bsc#1215693) * improve supportability by providing the current process ID of the RA, which is logged in the RA outputs, to HANA tracefiles too. @@ -14,10 +22,14 @@ Mon Aug 25 16:24:32 UTC 2023 - abriel@suse.com /tmp filesystem full. (bsc#1210728) * update man pages: + SAPHanaSR.7 SAPHanaSR_basic_cluster.7 - susCostOpt.py.7 SAPHanaSR_maintenance_examples.7 + ocf_suse_SAPHana.7 + ocf_suse_SAPHanaTopology.7 + susCostOpt.py.7 SAPHanaSR-monitor.8 + SAPHanaSR-showAttr.8 * add improvements from SAP to the RA scripts, part II (jsc#PED-1739, jsc#PED-2608) diff --git a/ra/SAPHana b/ra/SAPHana index d0c28c8d..894d73be 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -1454,9 +1454,11 @@ function get_hana_landscape_status() if [ $rc -eq 124 ]; then # TODO: PRIO 1: Check, if we should loop here like 'for i in 1 2 3 ...' ? 
# landscape timeout + super_ocf_log warn "RA: landscapeHostConfiguration.py TIMEOUT after $HANA_CALL_TIMEOUT seconds" sleep 20 HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "python landscapeHostConfiguration.py" 1>/dev/null 2>/dev/null; rc=$? if [ $rc -eq 124 ]; then + super_ocf_log warn "RA: landscapeHostConfiguration.py second TIMEOUT after $HANA_CALL_TIMEOUT seconds" # TODO PRIO2: How to handle still hanging lss - current solution is to say "FATAL" rc=0 fi From 2747087c17a382d1759a9117cb66b1971a0210c1 Mon Sep 17 00:00:00 2001 From: lpinne Date: Thu, 14 Dec 2023 14:08:23 +0100 Subject: [PATCH 41/60] ocf_suse_SAPHana.7: action details --- man/ocf_suse_SAPHana.7 | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/man/ocf_suse_SAPHana.7 b/man/ocf_suse_SAPHana.7 index 30eb83e0..36073fbd 100644 --- a/man/ocf_suse_SAPHana.7 +++ b/man/ocf_suse_SAPHana.7 @@ -1,13 +1,13 @@ .\" Version: 0.160.1 .\" -.TH ocf_suse_SAPHana 7 "04 Oct 2023" "" "OCF resource agents" +.TH ocf_suse_SAPHana 7 "13 Dec 2023" "" "OCF resource agents" .\" .SH NAME SAPHana \- Manages takeover between two SAP HANA databases with system replication. .PP .\" .SH SYNOPSIS -\fBSAPHana\fP [start | stop | status | monitor | promote | demote | notify | meta\-data | validate\-all | methods | usage ] +\fBSAPHana\fP [ start | stop | status | monitor | promote | demote | reload | meta\-data | validate\-all | methods | usage ] .PP .\" .SH DESCRIPTION @@ -198,14 +198,17 @@ This resource agent supports the following actions (operations): .PP \fBstart\fR .RS 4 -Starts the HANA instance or bring the "clone instance" to a WAITING status. +Starts the HANA instance or brings the "clone instance" to a WAITING status. The correct timeout depends on factors like database size and storage performance. Large databases might require higher start timeouts, use of persistent memory might reduce the timeout needed. Suggested minimum timeout: 3600\&. .RE .PP \fBstop\fR .RS 4 -Stops the HANA instance. -Suggested minimum timeout: 3600\&. +Stops the HANA instance. +The correct timeout depends on factors like database size. +If HANA database internal timeouts have been tuned for fast shutdown, the RA timeout might be reduced. +.\" TODO point to HANA parameters +Suggested minimum timeout: 600\&. .RE .PP \fBpromote\fR @@ -220,12 +223,6 @@ Nearly does nothing and just mark the instance as demoted. Suggested minimum timeout: 320\&. .RE .PP -\fBnotify\fR -.RS 4 -Always returns SUCCESS. -Suggested minimum timeout: 10\&. -.RE -.PP \fBstatus\fR .RS 4 Reports whether the HANA instance is running. @@ -256,9 +253,15 @@ Suggested minimum timeout: 5\&. .PP \fBmethods\fR .RS 4 +Reports which methods (operations) the resource agent supports. Suggested minimum timeout: 5\&. .RE .PP +\fBreload\fR +.RS 4 +Changes parameters without forcing a recover of the resource. Suggested minimum timeout: 5. 
+.RE +.PP .\" .SH RETURN CODES .br From 6feec6e91f2f4d9ed997dcaca383bbef8e99544f Mon Sep 17 00:00:00 2001 From: lpinne Date: Thu, 4 Jan 2024 11:50:48 +0100 Subject: [PATCH 42/60] SAPHanaSR.py.7: requirements --- man/SAPHanaSR.py.7 | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/man/SAPHanaSR.py.7 b/man/SAPHanaSR.py.7 index a168fce4..d481983b 100644 --- a/man/SAPHanaSR.py.7 +++ b/man/SAPHanaSR.py.7 @@ -1,6 +1,6 @@ .\" Version: 0.160.1 .\" -.TH SAPHanaSR.py 7 "03 Nov 2023" "" "SAPHanaSR" +.TH SAPHanaSR.py 7 "04 Jan 2024" "" "SAPHanaSR" .\" .SH NAME SAPHanaSR.py \- Provider for SAP HANA srHook method srConnectionChanged(). @@ -272,7 +272,7 @@ the sudo permissions configuration path to HANA tracefiles .TP /usr/sap/$SID/HDB$nr/.crm_attribute.$SITE -the internal cache for srHook status changes while Linux cluster is down, file is owned by ${SID}adm and must never be touched +the internal cache for srHook status changes while Linux cluster is down, file is owned and r/w by ${SID}adm and must never be touched .PP .\" .SH REQUIREMENTS @@ -290,11 +290,17 @@ contradictingly. 3. The user ${sid}adm needs execution permission as user root for the command crm_attribute. .PP -4. The hook provider needs to be added to the HANA global configuration, +4. The user ${sid}adm needs ownership and read/write permission on the internal +cache file /usr/sap/$SID/HDB$nr/.crm_attribute.$SITE . +.PP +5. The hook provider needs to be added to the HANA global configuration, in memory and on disk (in persistence). .PP -5. If the hook provider should be pre-compiled, the particular Python version +6. If the hook provider should be pre-compiled, the particular Python version that comes with SAP HANA has to be used. +.PP +7. The srHook script runtime almost completely depends on call-outs to OS and +Linux cluster. .\" .SH BUGS In case of any problem, please use your favourite SAP support process to open @@ -323,7 +329,7 @@ F.Herschel, L.Pinne. .SH COPYRIGHT (c) 2015-2018 SUSE Linux GmbH, Germany. .br -(c) 2019-2023 SUSE LLC +(c) 2019-2024 SUSE LLC .br SAPHanaSR.py comes with ABSOLUTELY NO WARRANTY. .br From 0330dded79f730d644e466666b042a52dcaf7e25 Mon Sep 17 00:00:00 2001 From: lpinne Date: Thu, 4 Jan 2024 12:04:24 +0100 Subject: [PATCH 43/60] SAPHanaSR.7: requirements --- man/SAPHanaSR.7 | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/man/SAPHanaSR.7 b/man/SAPHanaSR.7 index 567a6d67..074381ed 100644 --- a/man/SAPHanaSR.7 +++ b/man/SAPHanaSR.7 @@ -1,6 +1,6 @@ .\" Version: 0.160.1 .\" -.TH SAPHanaSR 7 "31 Oct 2023" "" "SAPHanaSR" +.TH SAPHanaSR 7 "04 Jan 2024" "" "SAPHanaSR" .\" .SH NAME SAPHanaSR \- Tools for automating SAP HANA system replication in scale-up setups. @@ -300,6 +300,8 @@ In opposite to Native Storage Extension, the HANA Extension Nodes are changing the topology and thus currently are not supported. Please refer to SAP documentation for details. .PP +25. The Linux user root´s shell is /bin/bash, or completely compatible. +.PP .\" .SH BUGS .\" TODO @@ -348,7 +350,7 @@ A.Briel, F.Herschel, L.Pinne. .SH COPYRIGHT (c) 2015-2018 SUSE Linux GmbH, Germany. .br -(c) 2019-2023 SUSE LLC +(c) 2019-2024 SUSE LLC .br The package SAPHanaSR comes with ABSOLUTELY NO WARRANTY. 
.br From 2632f6d35b61f4b28f472373683b15ccb2bbd9dd Mon Sep 17 00:00:00 2001 From: lpinne Date: Thu, 4 Jan 2024 12:47:44 +0100 Subject: [PATCH 44/60] SAPHanaSR.py.7: requirements --- man/SAPHanaSR.py.7 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/SAPHanaSR.py.7 b/man/SAPHanaSR.py.7 index d481983b..ff4673d0 100644 --- a/man/SAPHanaSR.py.7 +++ b/man/SAPHanaSR.py.7 @@ -290,8 +290,8 @@ contradictingly. 3. The user ${sid}adm needs execution permission as user root for the command crm_attribute. .PP -4. The user ${sid}adm needs ownership and read/write permission on the internal -cache file /usr/sap/$SID/HDB$nr/.crm_attribute.$SITE . +4. The user ${sid}adm needs ownership and permission for reading/writing/creating +on the internal cache file /usr/sap/$SID/HDB$nr/.crm_attribute.$SITE . .PP 5. The hook provider needs to be added to the HANA global configuration, in memory and on disk (in persistence). From cf0b723ff38dd106d7d7b8a6479c10f5e7889231 Mon Sep 17 00:00:00 2001 From: lpinne Date: Thu, 4 Jan 2024 12:52:03 +0100 Subject: [PATCH 45/60] SAPHanaSR.py.7: requirements --- man/SAPHanaSR.py.7 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/SAPHanaSR.py.7 b/man/SAPHanaSR.py.7 index ff4673d0..d9149eae 100644 --- a/man/SAPHanaSR.py.7 +++ b/man/SAPHanaSR.py.7 @@ -290,8 +290,8 @@ contradictingly. 3. The user ${sid}adm needs execution permission as user root for the command crm_attribute. .PP -4. The user ${sid}adm needs ownership and permission for reading/writing/creating -on the internal cache file /usr/sap/$SID/HDB$nr/.crm_attribute.$SITE . +4. The user ${sid}adm needs ownership and read/write permission on the internal +cache file /usr/sap/$SID/HDB$nr/.crm_attribute.$SITE and its parent directory. .PP 5. The hook provider needs to be added to the HANA global configuration, in memory and on disk (in persistence). 
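The requirement refined in [PATCH 44/60] and [PATCH 45/60] above, that the ${sid}adm user owns and can write the internal cache file as well as its parent directory, can be verified on a node with a short bash sketch like the one below. It is not part of the SAPHanaSR package; the SID, instance number and site name are placeholders that must be adapted:

    #!/bin/bash
    # Rough permission check for the srHook fallback cache file (sketch only,
    # not shipped with the package). Run as root; example values are placeholders.
    SID="HA1"; nr="00"; SITE="WDF"
    sidadm="${SID,,}adm"
    cache="/usr/sap/${SID}/HDB${nr}/.crm_attribute.${SITE}"
    parentdir=$(dirname "$cache")
    # <sid>adm must be able to create and rename the stage file in the directory ...
    su - "$sidadm" -c "test -w $parentdir" || echo "parent directory $parentdir not writable by $sidadm"
    # ... and, if the cache file already exists, it must be writable by <sid>adm.
    if [ -e "$cache" ]; then
        su - "$sidadm" -c "test -w $cache" || echo "$cache not writable by $sidadm"
    fi

If such a check fails, srHook status changes that happen while the Linux cluster is down could be lost, because the hook script cannot persist them in the fallback cache file.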
From 59b5040739d6e761a4ceb80ec8dab8c4b8785f36 Mon Sep 17 00:00:00 2001 From: lpinne Date: Wed, 10 Jan 2024 09:24:11 +0100 Subject: [PATCH 46/60] SAPHanaSR.py.7: r/w -> read/write --- man/SAPHanaSR.py.7 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/SAPHanaSR.py.7 b/man/SAPHanaSR.py.7 index d9149eae..aad182bb 100644 --- a/man/SAPHanaSR.py.7 +++ b/man/SAPHanaSR.py.7 @@ -272,7 +272,7 @@ the sudo permissions configuration path to HANA tracefiles .TP /usr/sap/$SID/HDB$nr/.crm_attribute.$SITE -the internal cache for srHook status changes while Linux cluster is down, file is owned and r/w by ${SID}adm and must never be touched +the internal cache for srHook status changes while Linux cluster is down, file is owned and read/write by ${SID}adm and must never be touched .PP .\" .SH REQUIREMENTS From 7b19905aed55b116d541c8a3c43f94ba6eb967b0 Mon Sep 17 00:00:00 2001 From: lpinne Date: Wed, 10 Jan 2024 09:33:30 +0100 Subject: [PATCH 47/60] SAPHanaSR.py.7: r/w -> read/write --- man/SAPHanaSR.py.7 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/SAPHanaSR.py.7 b/man/SAPHanaSR.py.7 index aad182bb..8f8685e4 100644 --- a/man/SAPHanaSR.py.7 +++ b/man/SAPHanaSR.py.7 @@ -272,7 +272,7 @@ the sudo permissions configuration path to HANA tracefiles .TP /usr/sap/$SID/HDB$nr/.crm_attribute.$SITE -the internal cache for srHook status changes while Linux cluster is down, file is owned and read/write by ${SID}adm and must never be touched +the internal cache for srHook status changes while Linux cluster is down, file is owned and read/write by ${sid}adm and must never be touched .PP .\" .SH REQUIREMENTS From df993e0c3fb5e02bdc1d536ec21fd667a3e4c0ed Mon Sep 17 00:00:00 2001 From: AngelaBriel Date: Fri, 19 Jan 2024 19:19:35 +0100 Subject: [PATCH 48/60] HA/DR provider hooks - make function logTimestamp() part of the class adapt copyright date and version strings prepare changelog and spec file for next mainteance update prevent trailing newline in the input of the hexdump log do not log a hexdump of an empty node status --- SAPHanaSR.changes_12 | 11 ++++++ SAPHanaSR.changes_15 | 11 ++++++ SAPHanaSR.spec | 4 +-- ra/SAPHana | 13 ++++--- ra/SAPHanaTopology | 14 +++++--- srHook/susChkSrv.py | 80 ++++++++++++++++++++++---------------------- 6 files changed, 81 insertions(+), 52 deletions(-) diff --git a/SAPHanaSR.changes_12 b/SAPHanaSR.changes_12 index d292c5d4..2c13cca9 100644 --- a/SAPHanaSR.changes_12 +++ b/SAPHanaSR.changes_12 @@ -1,3 +1,14 @@ +------------------------------------------------------------------- +Wed Jan 19 18:10:10 UTC 2024 - abriel@suse.com + +- Version bump to 0.162.3 + * Fix the hexdump log for empty node states + * susChkSrv.py - relocate function logTimestamp() + * update man pages: + SAPHanaSR.7 + ocf_suse_SAPHana.7 + SAPHanaSR.py.7 + ------------------------------------------------------------------- Thu Nov 2 17:50:47 UTC 2023 - abriel@suse.com diff --git a/SAPHanaSR.changes_15 b/SAPHanaSR.changes_15 index 2f6813cb..fa3e7081 100644 --- a/SAPHanaSR.changes_15 +++ b/SAPHanaSR.changes_15 @@ -1,3 +1,14 @@ +------------------------------------------------------------------- +Wed Jan 19 18:11:10 UTC 2024 - abriel@suse.com + +- Version bump to 0.162.3 + * Fix the hexdump log for empty node states + * susChkSrv.py - relocate function logTimestamp() + * update man pages: + SAPHanaSR.7 + ocf_suse_SAPHana.7 + SAPHanaSR.py.7 + ------------------------------------------------------------------- Thu Nov 2 17:49:47 UTC 2023 - abriel@suse.com diff --git a/SAPHanaSR.spec 
b/SAPHanaSR.spec index b96817b3..f64151e7 100644 --- a/SAPHanaSR.spec +++ b/SAPHanaSR.spec @@ -3,7 +3,7 @@ # # Copyright (c) 2013-2014 SUSE Linux Products GmbH, Nuernberg, Germany. # Copyright (c) 2014-2016 SUSE Linux GmbH, Nuernberg, Germany. -# Copyright (c) 2017-2023 SUSE LLC. +# Copyright (c) 2017-2024 SUSE LLC. # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -23,7 +23,7 @@ License: GPL-2.0 Group: Productivity/Clustering/HA AutoReqProv: on Summary: Resource agents to control the HANA database in system replication setup -Version: 0.162.2 +Version: 0.162.3 Release: 0 Url: http://scn.sap.com/community/hana-in-memory/blog/2014/04/04/fail-safe-operation-of-sap-hana-suse-extends-its-high-availability-solution diff --git a/ra/SAPHana b/ra/SAPHana index 894d73be..e447bf2c 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -16,7 +16,7 @@ # Support: linux@sap.com # License: GNU General Public License (GPL) # Copyright: (c) 2013,2014 SUSE Linux Products GmbH -# Copyright: (c) 2015-2022 SUSE LLC +# Copyright: (c) 2015-2024 SUSE LLC # # An example usage: # See usage() function below for more details... @@ -41,7 +41,7 @@ # systemReplicationStatus.py (>= SPS090) # ####################################################################### -SAPHanaVersion="0.162.2" +SAPHanaVersion="0.162.3" # # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} @@ -1275,14 +1275,17 @@ function check_for_primary() { break;; * ) super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: <$node_status>" - dump=$( echo $node_status | hexdump -C ); - super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP <$dump>" + if [ -n "$node_status" ]; then + dump=$( echo -n $node_status | hexdump -C ); + super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP <$dump>" + fi ((i++)) super_ocf_log debug "DEC: check_for_primary: loop=$i: node_status=$node_status" # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes # lets pause a bit to give hdbnsutil a chance to answer next time sleep 2 - esac; + ;; + esac done super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index a0a59f73..99f9fcfe 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -14,7 +14,7 @@ # License: GNU General Public License (GPL) # Copyright: (c) 2014 SUSE Linux Products GmbH # (c) 2015-2018 SUSE Linux GmbH -# (c) 2019-2021 SUSE LLC +# (c) 2019-2024 SUSE LLC # # An example usage: # See usage() function below for more details... @@ -27,7 +27,7 @@ # ####################################################################### # DONE PRIO 1: AFTER(!) 
SAP HANA SPS12 is available we could use hdbnsutil --sr_stateConfiguration -SAPHanaTopologyVersion="0.162.2" +SAPHanaTopologyVersion="0.162.3" # # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} @@ -688,10 +688,14 @@ function check_for_primary() { super_ocf_log info "FLOW ${FUNCNAME[0]} rc=HANA_STATE_STANDALONE" rc=$HANA_STATE_STANDALONE;; * ) - dump=$( echo $srmode | hexdump -C ); - super_ocf_log err "ACT: check_for_primary: we didn't expect srmode to be: DUMP: <$dump>" + super_ocf_log err "ACT: check_for_primary: we didn't expect srmode to be: <$srmode>" + if [ -n "$srmode" ]; then + dump=$( echo -n $srmode | hexdump -C ); + super_ocf_log err "ACT: check_for_primary: we didn't expect srmode to be: DUMP: <$dump>" + fi rc=$HANA_STATE_DEFECT - esac; + ;; + esac super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return $rc } diff --git a/srHook/susChkSrv.py b/srHook/susChkSrv.py index 80566250..e3308269 100755 --- a/srHook/susChkSrv.py +++ b/srHook/susChkSrv.py @@ -2,7 +2,7 @@ # susChkSrv.py # Author: Fabian Herschel, June 2022 # License: GNU General Public License (GPL) -# Copyright: (c) 2022 SUSE LLC +# Copyright: (c) 2022-2024 SUSE LLC susChkSrv needs SAP HANA 2.0 SPS4 (2.00.040.00) as minimum version @@ -41,7 +41,7 @@ # hook section SRHookName = "susChkSrv" -SRHookVersion = "0.7.7" +SRHookVersion = "0.7.8" # parameter section TIME_OUT_DFLT = 20 @@ -51,29 +51,29 @@ def getEpisode(): return episode -def logTimestamp(self, method, episode, outputMessage): - traceFilepath = os.path.join(os.environ['SAP_RETRIEVAL_PATH'], 'trace', 'nameserver_suschksrv.trc') - try: - with open(traceFilepath, "a") as saphanasr_multitarget_file: - currentTimeStr = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f ') - outputMessage = "{0} [{2}] {1}".format(currentTimeStr, outputMessage, episode) - saphanasr_multitarget_file.write(outputMessage + "\n") - saphanasr_multitarget_file.flush() - - except (RuntimeError, TypeError, NameError, OSError) as err: - self.tracer.info("{0}.{1}() logTimestamp error {2}".format(self.__class__.__name__, method, err)) - print("Error in logTimestamp(): {0}".format(err)) - try: class susChkSrv(HADRBase): + def logTimestamp(self, method, episode, outputMessage): + traceFilepath = os.path.join(os.environ['SAP_RETRIEVAL_PATH'], 'trace', 'nameserver_suschksrv.trc') + try: + with open(traceFilepath, "a") as saphanasr_multitarget_file: + currentTimeStr = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f ') + outputMessage = "{0} [{2}] {1}".format(currentTimeStr, outputMessage, episode) + saphanasr_multitarget_file.write(outputMessage + "\n") + saphanasr_multitarget_file.flush() + + except (RuntimeError, TypeError, NameError, OSError) as err: + self.tracer.info("{0}.{1}() logTimestamp error {2}".format(self.__class__.__name__, method, err)) + print("Error in logTimestamp(): {0}".format(err)) + def __init__(self, *args, **kwargs): # delegate construction to base class super(susChkSrv, self).__init__(*args, **kwargs) method = "init" episode = getEpisode() - logTimestamp(self, method, episode, "init called") + self.logTimestamp(self, method, episode, "init called") # read settings from global.ini # read sustkover_timeout @@ -87,12 +87,12 @@ def __init__(self, *args, **kwargs): isValidAction = (self.action_on_lost in ["ignore", "fence", "kill", "stop", "firstStopThenKill"]) if not isValidAction: msg = "Invalid action_on_lost {}. 
Fallback to 'ignore'".format(self.action_on_lost) - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) self.action_on_lost = "ignore_fallback" else: msg = "action_on_lost not configured. Fallback to 'ignore'" - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) self.action_on_lost = "ignore_default" if self.config.hasKey("kill_signal"): @@ -101,7 +101,7 @@ def __init__(self, *args, **kwargs): self.killSignal = "9" # TODO: logging the signal parameter, but only if it is the kill action msg = "{}.{}() version {}, parameter info: action_on_lost={} stop_timeout={} kill_signal={}".format(self.__class__.__name__, method, SRHookVersion, self.action_on_lost, self.stop_timeout, self.killSignal) - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) # TODO: use action specific init messages (e.g. for stop also report stop_timeout) self.takeover_active = False @@ -133,9 +133,9 @@ def srServiceStateChanged(self, ParamDict, **kwargs): msg1 = "{0} version {1}. Method {2} method called.".format(SRHookName, SRHookVersion, method) msg2 = "{0} {1} method called with Dict={2}".format(SRHookName, method, ParamDict) msg3 = "{0} {1} method called with SAPSYSTEMNAME={2}".format(SRHookName, method, mySID) - logTimestamp(self, method, episode, msg1) - logTimestamp(self, method, episode, msg2) - logTimestamp(self, method, episode, msg3) + self.logTimestamp(self, method, episode, msg1) + self.logTimestamp(self, method, episode, msg2) + self.logTimestamp(self, method, episode, msg3) self.tracer.info(msg1) self.tracer.info(msg2) self.tracer.info(msg3) @@ -153,7 +153,7 @@ def srServiceStateChanged(self, ParamDict, **kwargs): # log service_name, service_port, service_status, service_previous_status, database_id, database_name, database_status, daemon_status msg = "srv:{0}-{1}-{2}-{3} db:{4}-{5}-{6} daem:{7}".format(service, port, status, previousStatus, databaseName, databaseId, databaseStatus, daemonStatus) - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) # analysis, if the event looks like an dying indexserver (LOST) @@ -177,54 +177,54 @@ def srServiceStateChanged(self, ParamDict, **kwargs): # if (isIndexserver and serviceRestart and daemonActive and databaseActive): msg = "LOST: indexserver event looks like a lost indexserver (status={})".format(status) - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) isLostIndexserver = True eventKnown = True if (isIndexserver and serviceActive and daemonActive and databaseActive): if self.takeover_active: msg = "TAKEOVER: indexserver event looks like a takeover event" - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) else: msg = "LOST: indexserver event looks like a lost indexserver (indexserver started)" - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) eventKnown = True # TODO: this event (LOST/started) seems also to come, if a sr_takeover is been processed (using preTakeover() and postTakeover() to mark this event?) 
if (isIndexserver and serviceStopping and daemonStop): msg = "STOP: indexserver event looks like graceful instance stop" - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) eventKnown = True if (isIndexserver and serviceDown and daemonStop): msg = "STOP: indexserver event looks like graceful instance stop (indexserver stopped)" self.tracer.info(msg) - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) eventKnown = True if (isIndexserver and serviceStopping and daemonActive and databaseStop): msg = "STOP: indexserver event looks like graceful tenant stop" - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) eventKnown = True if (isIndexserver and serviceDown and daemonActive and databaseStop): msg = "STOP: indexserver event looks like graceful tenant stop (indexserver stopped)" - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) eventKnown = True if (isIndexserver and serviceRestart and daemonStarting and databaseActive): msg = "START: indexserver event looks like graceful tenant start" - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) eventKnown = True if (isIndexserver and serviceActive and daemonStarting and databaseActive): msg = "START: indexserver event looks like graceful tenant start (indexserver started)" - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) eventKnown = True if (isIndexserver and not eventKnown): msg = "DBG: version={},serviceRestart={}, serviceStop={}, serviceDown={}, daemonActive={}, daemonStop={}, daemonStarting={}, databaseActive={}, databaseStop={}".format(SRHookVersion, serviceRestart, serviceStop, serviceDown, daemonActive, daemonStop, daemonStarting, databaseActive, databaseStop) - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) # event on secondary, if HA1 tenant is stopped on primary # DBG: version=0.2.7,serviceRestart=True, serviceStop=True, serviceDown=False, daemonActive=True, daemonStop=False, daemonStarting=False, databaseActive=False, databaseStop=False @@ -235,11 +235,11 @@ def srServiceStateChanged(self, ParamDict, **kwargs): # if (isLostIndexserver and (self.action_on_lost in ["ignore", "ignore_fallback", "ignore_default"])): msg = "LOST: event ignored. action_on_lost={}".format(self.action_on_lost) - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) if (isLostIndexserver and self.action_on_lost == "fence"): msg = "LOST: fence node. action_on_lost={}".format(self.action_on_lost) - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) tout_cmd = "" action_cmd = "sudo /usr/sbin/SAPHanaSR-hookHelper --sid={0} --case=fenceMe".format(mySID) @@ -247,7 +247,7 @@ def srServiceStateChanged(self, ParamDict, **kwargs): # DONE add fence code here if (isLostIndexserver and self.action_on_lost == "kill"): msg = "LOST: kill instance. 
action_on_lost={} signal={}".format(self.action_on_lost, self.killSignal) - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) tout_cmd = "" action_cmd = "HDB kill-{}".format(self.killSignal) @@ -255,11 +255,11 @@ def srServiceStateChanged(self, ParamDict, **kwargs): cmdrc = os.WEXITSTATUS(os.system("sleep {}; {} {}".format("5", tout_cmd, action_cmd))) # the following message will most-likely also be lost, if we use signal 9 msg = "LOST: killed instance. action_on_lost={}".format(self.action_on_lost) - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) # DONE: hardcoded 5 here to be moved to a self.sleep_before_action (or however it will be named) if (isLostIndexserver and self.action_on_lost == "stop"): msg = "LOST: stop instance. action_on_lost={}".format(self.action_on_lost) - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) tout_cmd = "timeout {}".format(self.stop_timeout) # action_cmd = "HDB stop" @@ -271,7 +271,7 @@ def srServiceStateChanged(self, ParamDict, **kwargs): # this code could be removed at any time without notice # the code does not promise that it will be part of any product later msg = "LOST: firstStopThenKill instance. action_on_lost={}".format(self.action_on_lost) - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) action_cmd = "/usr/sbin/SAPHanaSR-hookHelper --sid={} --ino={} --case=firstStopThenKill".format(mySID, self.ino) cmdrc = os.WEXITSTATUS(os.system("sleep {}; {}".format("5", action_cmd))) @@ -280,7 +280,7 @@ def srServiceStateChanged(self, ParamDict, **kwargs): # this code could be removed at any time without notice # the code does not promise that it will be part of any product later msg = "LOST: set cluster attribute. action_on_lost={} is currently not implemented".format(self.action_on_lost) - logTimestamp(self, method, episode, msg) + self.logTimestamp(self, method, episode, msg) self.tracer.info(msg) # TODO add attribute code here return 0 From 9dfca5fe4273d96f54bf3d83a1beff78030db766 Mon Sep 17 00:00:00 2001 From: lpinne Date: Tue, 23 Jan 2024 11:53:53 +0100 Subject: [PATCH 49/60] SAPHanaSR.py.7: python 3 --- man/SAPHanaSR.py.7 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/man/SAPHanaSR.py.7 b/man/SAPHanaSR.py.7 index 8f8685e4..f4f68b26 100644 --- a/man/SAPHanaSR.py.7 +++ b/man/SAPHanaSR.py.7 @@ -1,6 +1,6 @@ -.\" Version: 0.160.1 +.\" Version: 0.162.0 .\" -.TH SAPHanaSR.py 7 "04 Jan 2024" "" "SAPHanaSR" +.TH SAPHanaSR.py 7 "23 Jan 2024" "" "SAPHanaSR" .\" .SH NAME SAPHanaSR.py \- Provider for SAP HANA srHook method srConnectionChanged(). @@ -276,11 +276,11 @@ the internal cache for srHook status changes while Linux cluster is down, file i .PP .\" .SH REQUIREMENTS -1. SAP HANA 2.0 SPS04 or later provides the HA/DR provider hook method -srConnectionChanged() with multi-target aware parameters. +1. SAP HANA 2.0 SPS05 rev.059 or later provides Python 3 as well as the HA/DR +provider hook method srConnectionChanged() with multi-target aware parameters. SAP HANA 1.0 does not provide them. -The multi-target aware parameters are needed for the SAPHanaSR scale-up -package. +The Python 3 and multi-target aware parameters are needed for the SAPHanaSR +scale-up package. .PP 2. No other HADR provider hook script should be configured for the srConnectionChanged() method. 
Hook scripts for other methods, provided in From 0c4899786c48ede84e6c051038b704e7c2fd9e94 Mon Sep 17 00:00:00 2001 From: lpinne Date: Tue, 23 Jan 2024 14:22:56 +0100 Subject: [PATCH 50/60] SAPHanaSR_maintenance_examples.7 --- man/SAPHanaSR_maintenance_examples.7 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/man/SAPHanaSR_maintenance_examples.7 b/man/SAPHanaSR_maintenance_examples.7 index cfb39ee3..5514660d 100644 --- a/man/SAPHanaSR_maintenance_examples.7 +++ b/man/SAPHanaSR_maintenance_examples.7 @@ -1,6 +1,6 @@ .\" Version: 0.160.1 .\" -.TH SAPHanaSR_maintenance_examples 7 "31 Oct 2023" "" "SAPHanaSR" +.TH SAPHanaSR_maintenance_examples 7 "23 Jan 2024" "" "SAPHanaSR" .\" .SH NAME SAPHanaSR_maintenance_examples \- maintenance examples for SAPHana and SAPHanaController. @@ -158,9 +158,9 @@ If everything looks fine, proceed. .RE .RS 4 .br -~> HDBsettings.sh systemReplicationStatus.py; echo RC:$? +~> cdpy; python ./systemReplicationStatus.py; echo RC:$? .br -~> HDBsettings.sh ./landscapeConfigurationStatus.py; echo RC:$? +~> cdpy; python ./landscapeConfigurationStatus.py; echo RC:$? .br ~> exit .br @@ -649,7 +649,7 @@ F.Herschel, L.Pinne. .SH COPYRIGHT (c) 2017-2018 SUSE Linux GmbH, Germany. .br -(c) 2019-2023 SUSE LLC +(c) 2019-2024 SUSE LLC .br This maintenance examples are coming with ABSOLUTELY NO WARRANTY. .br From 029080cedf6d7b4d424408802cb4f0ff81c31643 Mon Sep 17 00:00:00 2001 From: AngelaBriel Date: Fri, 26 Jan 2024 14:50:07 +0100 Subject: [PATCH 51/60] variable 'site' must be a gloabl variable as the value found in the 'init' function will be used in 'sht_monitor_clone' to set the attribute in the CIB (bsc#1219194) --- ra/SAPHanaTopology | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index 99f9fcfe..d728fe04 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -468,7 +468,6 @@ function sht_init() { local myInstanceName="" local rc=$OCF_SUCCESS local hdbANSWER="" - local site="" local chkMethod="" SYSTEMCTL="/usr/bin/systemctl" systemd_unit_name="saphostagent.service" @@ -478,6 +477,9 @@ function sht_init() { SAPHOSTCTRL_PATH=${USRSAP}/hostctrl/exe HOSTEXEC_PATH=${SAPHOSTCTRL_PATH}/${HOSTEXECNAME} HOSTEXEC_PROFILE_PATH=${SAPHOSTCTRL_PATH}/host_profile + # site must be global as value from init function is used in + # function sht_monitor_clone to set the attribute + site="" NODENAME=$(crm_node -n) SID=$OCF_RESKEY_SID InstanceNr=$OCF_RESKEY_InstanceNumber From f2e3760e14b5bcfa1bc5bbc3605340ae5e495ca4 Mon Sep 17 00:00:00 2001 From: lpinne Date: Fri, 26 Jan 2024 15:41:23 +0100 Subject: [PATCH 52/60] SAPHanaSR-showAttr.8: detail on hana...op_mode and hana...srmode in scale-out vs. scale-up --- man/SAPHanaSR-showAttr.8 | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/man/SAPHanaSR-showAttr.8 b/man/SAPHanaSR-showAttr.8 index 32d3ead3..493d701c 100644 --- a/man/SAPHanaSR-showAttr.8 +++ b/man/SAPHanaSR-showAttr.8 @@ -1,6 +1,6 @@ -.\" Version: 0.160.1 +.\" Version: 0.162.1 .\" -.TH SAPHanaSR-showAttr 8 "04 Oct 2023" "" "SAPHanaSR" +.TH SAPHanaSR-showAttr 8 "24 Jan 2024" "" "SAPHanaSR" .\" .SH NAME SAPHanaSR-showAttr \- Shows Linux cluster attributes for SAP HANA system replication. @@ -266,7 +266,7 @@ Value: [ online | offline ] Value: [ logreplay | delta_datashipping | logreplay_readaccess ] -The node attribute hana__op_mode is set by SAPHanaTopology, according to the running HANA. 
The attribute is used by the SAPHanaController or SAPHana resource agent for setting up system replication. delta_datashipping is not recommended in the context of Linux clusters. +The node attribute hana__op_mode or cluster attribute hana__glob_op_mode is set by SAPHanaTopology, according to the running HANA. The attribute is used by the SAPHanaController or SAPHana resource agent for setting up system replication. In SAPHanaSR-ScaleOut, the cluster attribute is written only if it differs from the RA default "logreplay". In any case, delta_datashipping is not recommended in the context of Linux clusters. .PP .B remoteHost - HANA SR remote host @@ -381,7 +381,8 @@ Value: [ sync | syncmem ] The node attribute hana__srmode or cluster attribute hana__glob_srmode is set by SAPHanaTopology, according to the running HANA. The attribute is used by the SAPHanaController or SAPHana resource agent for setting up system -replication. SAP HANA knows also async and fullsync (see URLs below). +replication. In SAPHanaSR-ScaleOut, the cluster attribute is written only if it differs from the RA default "sync". +SAP HANA knows also async and fullsync (see URLs below). Those do not make sense for automating HANA system replication by an Linux cluster. .PP .B standby @@ -579,7 +580,7 @@ F.Herschel, L.Pinne. .br (c) 2015-2017 SUSE Linux GmbH, Germany. .br -(c) 2018-2023 SUSE LLC +(c) 2018-2024 SUSE LLC .br SAPHanaSR-showAttr comes with ABSOLUTELY NO WARRANTY. .br From 4e30045f499f37cdd547f234570c13faf12c656c Mon Sep 17 00:00:00 2001 From: lpinne Date: Fri, 26 Jan 2024 16:24:42 +0100 Subject: [PATCH 53/60] SAPHanaSR-showAttr.8: detail on hana...op_mode and hana...srmode in scale-out vs. scale-up --- man/SAPHanaSR-showAttr.8 | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/man/SAPHanaSR-showAttr.8 b/man/SAPHanaSR-showAttr.8 index 493d701c..b248c93a 100644 --- a/man/SAPHanaSR-showAttr.8 +++ b/man/SAPHanaSR-showAttr.8 @@ -266,7 +266,7 @@ Value: [ online | offline ] Value: [ logreplay | delta_datashipping | logreplay_readaccess ] -The node attribute hana__op_mode or cluster attribute hana__glob_op_mode is set by SAPHanaTopology, according to the running HANA. The attribute is used by the SAPHanaController or SAPHana resource agent for setting up system replication. In SAPHanaSR-ScaleOut, the cluster attribute is written only if it differs from the RA default "logreplay". In any case, delta_datashipping is not recommended in the context of Linux clusters. +The node attribute hana__op_mode or cluster attribute hana__glob_op_mode is set by SAPHanaTopology, according to the running HANA. The attribute is used by the SAPHanaController or SAPHana resource agent for setting up system replication. In SAPHanaSR-ScaleOut, the cluster attribute is written only if it differs from current value or from RA default "logreplay". In any case, delta_datashipping is not recommended in the context of Linux clusters. .PP .B remoteHost - HANA SR remote host @@ -381,7 +381,8 @@ Value: [ sync | syncmem ] The node attribute hana__srmode or cluster attribute hana__glob_srmode is set by SAPHanaTopology, according to the running HANA. The attribute is used by the SAPHanaController or SAPHana resource agent for setting up system -replication. In SAPHanaSR-ScaleOut, the cluster attribute is written only if it differs from the RA default "sync". +replication. In SAPHanaSR-ScaleOut, the cluster attribute is written only if it differs from current +value or from RA default "sync". 
SAP HANA knows also async and fullsync (see URLs below). Those do not make sense for automating HANA system replication by an Linux cluster. .PP From 5c937c6f9e3f2c94a5b84875771ae973216ea017 Mon Sep 17 00:00:00 2001 From: lpinne Date: Fri, 26 Jan 2024 16:28:04 +0100 Subject: [PATCH 54/60] SAPHanaSR-showAttr.8: detail on hana...op_mode and hana...srmode in scale-out vs. scale-up --- man/SAPHanaSR-showAttr.8 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/man/SAPHanaSR-showAttr.8 b/man/SAPHanaSR-showAttr.8 index b248c93a..8869e172 100644 --- a/man/SAPHanaSR-showAttr.8 +++ b/man/SAPHanaSR-showAttr.8 @@ -43,6 +43,8 @@ HANA replication channel state, indicated by srConnectionChanged (\fBsrHook\fP) .br HANA replication state of secondary site, indicated by systemReplicationStatus.py (\fBsync_state\fP) .br +HANA replication mode (\fBsrmode\fP) +.br Linux cluster update status (\fBupd\fP) .TP Resource section @@ -52,6 +54,7 @@ name of Linux cluster resource (\fBResource\fP) maintenance status of Linux cluster resource (\fBmaintenance\fP) .TP Site section +.br HANA site name (\fBSite\fP) .br SAPHanaSR last primary timestamp (\fBlpt\fP) From a6cf83059afe46fb797a191a3377a2cde74a7925 Mon Sep 17 00:00:00 2001 From: AngelaBriel Date: Mon, 29 Jan 2024 16:38:49 +0100 Subject: [PATCH 55/60] catch monitor calls for non-cloned resources and report them as unsupported instead of 'command not found' (bsc#1218333) update changelog files --- SAPHanaSR.changes_12 | 9 ++++++++- SAPHanaSR.changes_15 | 9 ++++++++- ra/SAPHana | 16 ++++++++++++++++ ra/SAPHanaTopology | 16 ++++++++++++++++ 4 files changed, 48 insertions(+), 2 deletions(-) diff --git a/SAPHanaSR.changes_12 b/SAPHanaSR.changes_12 index 2c13cca9..c23e0f55 100644 --- a/SAPHanaSR.changes_12 +++ b/SAPHanaSR.changes_12 @@ -1,13 +1,20 @@ ------------------------------------------------------------------- -Wed Jan 19 18:10:10 UTC 2024 - abriel@suse.com +Wed Jan 29 14:24:15 UTC 2024 - abriel@suse.com - Version bump to 0.162.3 * Fix the hexdump log for empty node states + * catch monitor calls for non-cloned resources and report them as + unsupported instead of 'command not found' + (bsc#1218333) + * fix scope of variable 'site' to be global + (bsc#1219194) * susChkSrv.py - relocate function logTimestamp() * update man pages: SAPHanaSR.7 ocf_suse_SAPHana.7 + SAPHanaSR_maintenance_examples.7 SAPHanaSR.py.7 + SAPHanaSR-showAttr.8 ------------------------------------------------------------------- Thu Nov 2 17:50:47 UTC 2023 - abriel@suse.com diff --git a/SAPHanaSR.changes_15 b/SAPHanaSR.changes_15 index fa3e7081..efe971a3 100644 --- a/SAPHanaSR.changes_15 +++ b/SAPHanaSR.changes_15 @@ -1,13 +1,20 @@ ------------------------------------------------------------------- -Wed Jan 19 18:11:10 UTC 2024 - abriel@suse.com +Wed Jan 29 14:31:20 UTC 2024 - abriel@suse.com - Version bump to 0.162.3 * Fix the hexdump log for empty node states + * catch monitor calls for non-cloned resources and report them as + unsupported instead of 'command not found' + (bsc#1218333) + * fix scope of variable 'site' to be global + (bsc#1219194) * susChkSrv.py - relocate function logTimestamp() * update man pages: SAPHanaSR.7 ocf_suse_SAPHana.7 + SAPHanaSR_maintenance_examples.7 SAPHanaSR.py.7 + SAPHanaSR-showAttr.8 ------------------------------------------------------------------- Thu Nov 2 17:49:47 UTC 2023 - abriel@suse.com diff --git a/ra/SAPHana b/ra/SAPHana index e447bf2c..3c5c4e82 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -2776,6 +2776,22 @@ function saphana_monitor_clone() { 
return $rc } +# +# function: saphana_monitor +# this function should never be called currently. +# it is intended for future releases which might support un-cloned resources +# for now it is only used to print a reasonable error message in case of +# non-cloned resources instead of 'command not found' +# +function saphana_monitor() { + if ! is_clone; then + super_ocf_log error "RA: resource is not defined as clone. This is not supported (OCF_ERR_UNIMPLEMENTED)" + return "$OCF_ERR_UNIMPLEMENTED" + else + return "$OCF_SUCCESS" + fi +} + # # function: saphana_promote_clone - promote a hana clone # params: - diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index d728fe04..5c4f6838 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -1004,6 +1004,22 @@ function sht_stop_clone() { return $rc } +# +# function: sht_monitor +# this function should never be called currently. +# it is intended for future releases which might support un-cloned resources +# for now it is only used to print a reasonable error message in case of +# non-cloned resources instead of 'command not found' +# +function sht_monitor() { + if ! is_clone; then + super_ocf_log error "RA: resource is not defined as clone. This is not supported (OCF_ERR_UNIMPLEMENTED)" + return "$OCF_ERR_UNIMPLEMENTED" + else + return "$OCF_SUCCESS" + fi +} + # # function: sht_monitor_clone - monitor a hana clone instance # params: - From aa4bb4040a0a7924e8d86a28ce230a16bb9aa7a8 Mon Sep 17 00:00:00 2001 From: Angela Briel Date: Thu, 1 Feb 2024 11:57:06 +0100 Subject: [PATCH 56/60] fix load error of susChkSrv.py --- srHook/susChkSrv.py | 50 ++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/srHook/susChkSrv.py b/srHook/susChkSrv.py index e3308269..14905d46 100755 --- a/srHook/susChkSrv.py +++ b/srHook/susChkSrv.py @@ -73,7 +73,7 @@ def __init__(self, *args, **kwargs): super(susChkSrv, self).__init__(*args, **kwargs) method = "init" episode = getEpisode() - self.logTimestamp(self, method, episode, "init called") + self.logTimestamp(method, episode, "init called") # read settings from global.ini # read sustkover_timeout @@ -87,12 +87,12 @@ def __init__(self, *args, **kwargs): isValidAction = (self.action_on_lost in ["ignore", "fence", "kill", "stop", "firstStopThenKill"]) if not isValidAction: msg = "Invalid action_on_lost {}. Fallback to 'ignore'".format(self.action_on_lost) - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) self.action_on_lost = "ignore_fallback" else: msg = "action_on_lost not configured. Fallback to 'ignore'" - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) self.action_on_lost = "ignore_default" if self.config.hasKey("kill_signal"): @@ -101,7 +101,7 @@ def __init__(self, *args, **kwargs): self.killSignal = "9" # TODO: logging the signal parameter, but only if it is the kill action msg = "{}.{}() version {}, parameter info: action_on_lost={} stop_timeout={} kill_signal={}".format(self.__class__.__name__, method, SRHookVersion, self.action_on_lost, self.stop_timeout, self.killSignal) - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) # TODO: use action specific init messages (e.g. for stop also report stop_timeout) self.takeover_active = False @@ -133,9 +133,9 @@ def srServiceStateChanged(self, ParamDict, **kwargs): msg1 = "{0} version {1}. 
Method {2} method called.".format(SRHookName, SRHookVersion, method) msg2 = "{0} {1} method called with Dict={2}".format(SRHookName, method, ParamDict) msg3 = "{0} {1} method called with SAPSYSTEMNAME={2}".format(SRHookName, method, mySID) - self.logTimestamp(self, method, episode, msg1) - self.logTimestamp(self, method, episode, msg2) - self.logTimestamp(self, method, episode, msg3) + self.logTimestamp(method, episode, msg1) + self.logTimestamp(method, episode, msg2) + self.logTimestamp(method, episode, msg3) self.tracer.info(msg1) self.tracer.info(msg2) self.tracer.info(msg3) @@ -153,7 +153,7 @@ def srServiceStateChanged(self, ParamDict, **kwargs): # log service_name, service_port, service_status, service_previous_status, database_id, database_name, database_status, daemon_status msg = "srv:{0}-{1}-{2}-{3} db:{4}-{5}-{6} daem:{7}".format(service, port, status, previousStatus, databaseName, databaseId, databaseStatus, daemonStatus) - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) # analysis, if the event looks like an dying indexserver (LOST) @@ -177,54 +177,54 @@ def srServiceStateChanged(self, ParamDict, **kwargs): # if (isIndexserver and serviceRestart and daemonActive and databaseActive): msg = "LOST: indexserver event looks like a lost indexserver (status={})".format(status) - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) isLostIndexserver = True eventKnown = True if (isIndexserver and serviceActive and daemonActive and databaseActive): if self.takeover_active: msg = "TAKEOVER: indexserver event looks like a takeover event" - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) else: msg = "LOST: indexserver event looks like a lost indexserver (indexserver started)" - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) eventKnown = True # TODO: this event (LOST/started) seems also to come, if a sr_takeover is been processed (using preTakeover() and postTakeover() to mark this event?) 
if (isIndexserver and serviceStopping and daemonStop): msg = "STOP: indexserver event looks like graceful instance stop" - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) eventKnown = True if (isIndexserver and serviceDown and daemonStop): msg = "STOP: indexserver event looks like graceful instance stop (indexserver stopped)" self.tracer.info(msg) - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) eventKnown = True if (isIndexserver and serviceStopping and daemonActive and databaseStop): msg = "STOP: indexserver event looks like graceful tenant stop" - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) eventKnown = True if (isIndexserver and serviceDown and daemonActive and databaseStop): msg = "STOP: indexserver event looks like graceful tenant stop (indexserver stopped)" - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) eventKnown = True if (isIndexserver and serviceRestart and daemonStarting and databaseActive): msg = "START: indexserver event looks like graceful tenant start" - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) eventKnown = True if (isIndexserver and serviceActive and daemonStarting and databaseActive): msg = "START: indexserver event looks like graceful tenant start (indexserver started)" - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) eventKnown = True if (isIndexserver and not eventKnown): msg = "DBG: version={},serviceRestart={}, serviceStop={}, serviceDown={}, daemonActive={}, daemonStop={}, daemonStarting={}, databaseActive={}, databaseStop={}".format(SRHookVersion, serviceRestart, serviceStop, serviceDown, daemonActive, daemonStop, daemonStarting, databaseActive, databaseStop) - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) # event on secondary, if HA1 tenant is stopped on primary # DBG: version=0.2.7,serviceRestart=True, serviceStop=True, serviceDown=False, daemonActive=True, daemonStop=False, daemonStarting=False, databaseActive=False, databaseStop=False @@ -235,11 +235,11 @@ def srServiceStateChanged(self, ParamDict, **kwargs): # if (isLostIndexserver and (self.action_on_lost in ["ignore", "ignore_fallback", "ignore_default"])): msg = "LOST: event ignored. action_on_lost={}".format(self.action_on_lost) - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) if (isLostIndexserver and self.action_on_lost == "fence"): msg = "LOST: fence node. action_on_lost={}".format(self.action_on_lost) - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) tout_cmd = "" action_cmd = "sudo /usr/sbin/SAPHanaSR-hookHelper --sid={0} --case=fenceMe".format(mySID) @@ -247,7 +247,7 @@ def srServiceStateChanged(self, ParamDict, **kwargs): # DONE add fence code here if (isLostIndexserver and self.action_on_lost == "kill"): msg = "LOST: kill instance. 
action_on_lost={} signal={}".format(self.action_on_lost, self.killSignal) - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) tout_cmd = "" action_cmd = "HDB kill-{}".format(self.killSignal) @@ -255,11 +255,11 @@ def srServiceStateChanged(self, ParamDict, **kwargs): cmdrc = os.WEXITSTATUS(os.system("sleep {}; {} {}".format("5", tout_cmd, action_cmd))) # the following message will most-likely also be lost, if we use signal 9 msg = "LOST: killed instance. action_on_lost={}".format(self.action_on_lost) - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) # DONE: hardcoded 5 here to be moved to a self.sleep_before_action (or however it will be named) if (isLostIndexserver and self.action_on_lost == "stop"): msg = "LOST: stop instance. action_on_lost={}".format(self.action_on_lost) - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) tout_cmd = "timeout {}".format(self.stop_timeout) # action_cmd = "HDB stop" @@ -271,7 +271,7 @@ def srServiceStateChanged(self, ParamDict, **kwargs): # this code could be removed at any time without notice # the code does not promise that it will be part of any product later msg = "LOST: firstStopThenKill instance. action_on_lost={}".format(self.action_on_lost) - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) action_cmd = "/usr/sbin/SAPHanaSR-hookHelper --sid={} --ino={} --case=firstStopThenKill".format(mySID, self.ino) cmdrc = os.WEXITSTATUS(os.system("sleep {}; {}".format("5", action_cmd))) @@ -280,7 +280,7 @@ def srServiceStateChanged(self, ParamDict, **kwargs): # this code could be removed at any time without notice # the code does not promise that it will be part of any product later msg = "LOST: set cluster attribute. action_on_lost={} is currently not implemented".format(self.action_on_lost) - self.logTimestamp(self, method, episode, msg) + self.logTimestamp(method, episode, msg) self.tracer.info(msg) # TODO add attribute code here return 0 From 6062afd4f2da3972e3469b2ab8491e3314296efe Mon Sep 17 00:00:00 2001 From: AngelaBriel Date: Thu, 22 Feb 2024 15:38:11 +0100 Subject: [PATCH 57/60] fix regression in topology (bsc#1219785) --- ra/SAPHanaTopology | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/ra/SAPHanaTopology b/ra/SAPHanaTopology index 5c4f6838..8630dd8c 100755 --- a/ra/SAPHanaTopology +++ b/ra/SAPHanaTopology @@ -873,7 +873,6 @@ function sht_monitor() { return $rc } - # # function: sht_status - get status of a hana instance (os tools only) # params: - @@ -1004,21 +1003,6 @@ function sht_stop_clone() { return $rc } -# -# function: sht_monitor -# this function should never be called currently. -# it is intended for future releases which might support un-cloned resources -# for now it is only used to print a reasonable error message in case of -# non-cloned resources instead of 'command not found' -# -function sht_monitor() { - if ! is_clone; then - super_ocf_log error "RA: resource is not defined as clone. 
This is not supported (OCF_ERR_UNIMPLEMENTED)" - return "$OCF_ERR_UNIMPLEMENTED" - else - return "$OCF_SUCCESS" - fi -} # # function: sht_monitor_clone - monitor a hana clone instance From 2308b70a4ba752869d9e75d58b64aac8bb0318ba Mon Sep 17 00:00:00 2001 From: lpinne Date: Fri, 1 Mar 2024 09:09:54 +0100 Subject: [PATCH 58/60] SAPHanaSR.py.7: example checking HANA python --- man/SAPHanaSR.py.7 | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/man/SAPHanaSR.py.7 b/man/SAPHanaSR.py.7 index f4f68b26..a6c5f983 100644 --- a/man/SAPHanaSR.py.7 +++ b/man/SAPHanaSR.py.7 @@ -1,6 +1,6 @@ .\" Version: 0.162.0 .\" -.TH SAPHanaSR.py 7 "23 Jan 2024" "" "SAPHanaSR" +.TH SAPHanaSR.py 7 "01 Mar 2024" "" "SAPHanaSR" .\" .SH NAME SAPHanaSR.py \- Provider for SAP HANA srHook method srConnectionChanged(). @@ -66,7 +66,7 @@ Usage, syntax or execution errors. .\" .SH EXAMPLES .PP -* Example for entry in sudo permissions /etc/sudoers +* Example for entry in sudo permissions /etc/sudoers.d/SAPHanaSR .PP .RS 2 # SAPHanaSR (Scale-Up) needs for srHook @@ -83,6 +83,14 @@ Example SID is HA1. # sudo -U ha1adm -l | grep "NOPASSWD.*crm_attribute.*hana_ha1" .RE .PP +* Example for checking HANA´s python version. +.br +This might be done before installing HADR provider hook scripts. SID is HA1. +.PP +.RS 2 +# su - ha1adm -c "python --version" +.RE +.PP * Example for entry in SAP HANA global configuration /hana/shared/$SID/global/hdb/custom/config/global.ini .br From 7dd11845b0e556e7cd820d53c56c64521422957a Mon Sep 17 00:00:00 2001 From: lpinne Date: Mon, 4 Mar 2024 10:46:13 +0100 Subject: [PATCH 59/60] SAPHanaSR_basic_cluster.7: fixed sbd example pcmk_delay_max --- man/SAPHanaSR_basic_cluster.7 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/SAPHanaSR_basic_cluster.7 b/man/SAPHanaSR_basic_cluster.7 index d6e3bcf0..fac0aa01 100644 --- a/man/SAPHanaSR_basic_cluster.7 +++ b/man/SAPHanaSR_basic_cluster.7 @@ -177,7 +177,7 @@ Example for a priority fencing disk-based SBD resource. .br primitive rsc_stonith_sbd stonith:external/sbd \\ .br - params pcmk_delay_base=15 \\ + params pcmk_delay_max=15 \\ .br property cib-bootstrap-options: \\ .br From 732dd8df62c905195a62619da06ef80df76e1b09 Mon Sep 17 00:00:00 2001 From: Peter Pitterling Date: Wed, 20 Mar 2024 10:10:33 +0100 Subject: [PATCH 60/60] HANA_CALL - remove inner timeout (has no meaning) which prevents proper hdbnsutil logging within nameserver.*.00000.*.trc --- ra/SAPHana | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ra/SAPHana b/ra/SAPHana index 3c5c4e82..a83f3ae1 100755 --- a/ra/SAPHana +++ b/ra/SAPHana @@ -674,7 +674,7 @@ function HANA_CALL() cmd_out_log=${SAPHanaSR_RUN}/HANA_CALL_CMD_RA_OUT_${errExt} cmd_err_log=${SAPHanaSR_RUN}/HANA_CALL_CMD_RA_${errExt} - output=$(timeout --foreground -s 9 "$timeOut" $pre_cmd "($pre_script; timeout -s 9 $timeOut $cmd > $cmd_out_log) >& $cmd_err_log" 2>"$su_err_log"); rc=$? + output=$(timeout --foreground -s 9 "$timeOut" $pre_cmd "($pre_script; $cmd > $cmd_out_log) >& $cmd_err_log" 2>"$su_err_log"); rc=$? output=$(if [ -f "$cmd_out_log" ]; then cat "$cmd_out_log"; rm -f "$cmd_out_log"; fi) suErr=$(if [ -f "$su_err_log" ]; then cat "$su_err_log"; rm -f "$su_err_log"; else echo "NA"; fi) @@ -686,7 +686,7 @@ function HANA_CALL() if [ "$cmdErr" == "NA" ]; then # seems something was going wrong with the 'pre_cmd' (su) super_ocf_log warn "DEC: HANA_CALL returned '1' for command '$pre_cmd'. Retry once." 
- output=$(timeout --foreground -s 9 "$timeOut" $pre_cmd "$pre_script; timeout -s 9 $timeOut $cmd"); rc=$? + output=$(timeout --foreground -s 9 "$timeOut" $pre_cmd "$pre_script; $cmd"); rc=$? fi fi #
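For illustration only, a condensed sketch of the HANA_CALL timeout pattern that PATCH 60/60 changes. The su command and the HANA call are placeholders (the RA assembles pre_cmd, pre_script and cmd at runtime; pre_script is omitted here):

    timeOut=60
    pre_cmd='su - ha1adm -c'      # placeholder; built dynamically by the RA
    cmd='hdbnsutil -sr_state'     # placeholder for the actual HANA call
    # before: an additional inner 'timeout -s 9 $timeOut' wrapped $cmd itself;
    # per the commit message it added nothing, since the outer timeout already
    # bounds the whole call, and it prevented proper hdbnsutil trace logging
    # after: only the outer timeout remains
    output=$(timeout --foreground -s 9 "$timeOut" $pre_cmd "($cmd > /tmp/out) >& /tmp/err"); rc=$?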