Skip to content

Commit 538d2d4

Browse files
authored
Fix shutdown issues (#1117)
* Add kernel_id to message
* Relocate cleanup into start_ipython
* Clean up error handling of listener shutdown
* Set max start attempts to avoid YARN RM auto-restart race condition
1 parent c38180d commit 538d2d4

File tree

8 files changed

+24
-23
lines changed

8 files changed

+24
-23
lines changed

enterprise_gateway/services/processproxies/processproxy.py

Lines changed: 9 additions & 10 deletions
Original file line number · Diff line number · Diff line change
@@ -377,7 +377,9 @@ def _post_connection(self, connection_info: dict) -> None:
377377
self.log.error("No kernel id found in response! Kernel launch will fail.")
378378
return
379379
if kernel_id not in self._response_registry:
380-
self.log.error("Kernel id '{}' has not been registered and will not be processed!")
380+
self.log.error(
381+
f"Kernel id '{kernel_id}' has not been registered and will not be processed!"
382+
)
381383
return
382384

383385
self.log.debug(f"Connection info received for kernel '{kernel_id}': {connection_info}")
@@ -1482,22 +1484,19 @@ def _send_listener_request(self, request, shutdown_socket=False):
14821484
if isinstance(e2, OSError) and e2.errno == errno.ENOTCONN:
14831485
# Listener is not connected. This is probably a follow-on to ECONNREFUSED on connect
14841486
self.log.debug(
1485-
"ERROR: OSError(ENOTCONN) raised on socket shutdown, "
1486-
f"listener likely not connected. Cannot send {request}"
1487+
f"OSError(ENOTCONN) raised on socket shutdown, listener "
1488+
f"has likely already exited. Cannot send '{request}'"
14871489
)
14881490
else:
14891491
self.log.warning(
1490-
"Exception occurred attempting to shutdown communication socket to {}:{} "
1491-
"for KernelID '{}' (ignored): {}".format(
1492-
self.comm_ip, self.comm_port, self.kernel_id, str(e2)
1493-
)
1492+
f"Exception occurred attempting to shutdown communication "
1493+
f"socket to {self.comm_ip}:{self.comm_port} "
1494+
f"for KernelID '{self.kernel_id}' (ignored): {str(e2)}"
14941495
)
14951496
sock.close()
14961497
else:
14971498
self.log.debug(
1498-
"Invalid comm port, not sending request '{}' to comm_port '{}'.",
1499-
request,
1500-
self.comm_port,
1499+
f"Invalid comm port, not sending request '{request}' to comm_port '{self.comm_port}'."
15011500
)
15021501

15031502
def send_signal(self, signum):

etc/kernel-launchers/python/scripts/launch_ipykernel.py

Lines changed: 9 additions & 7 deletions
Original file line number · Diff line number · Diff line change
@@ -435,6 +435,15 @@ def start_ipython(
435435
app.initialize([])
436436
app.start()
437437

438+
# cleanup
439+
conn_file = kwargs["connection_file"]
440+
try:
441+
import os # re-import os since it's removed during namespace manipulation during startup
442+
443+
os.remove(conn_file)
444+
except Exception as e:
445+
print(f"Could not delete connection file '{conn_file}' at exit due to error: {e}")
446+
438447

439448
if __name__ == "__main__":
440449
parser = argparse.ArgumentParser()
@@ -598,10 +607,3 @@ def start_ipython(
598607
ip=ip,
599608
kernel_class_name=kernel_class_name,
600609
)
601-
602-
try:
603-
os.remove(connection_file)
604-
except Exception as e:
605-
logger.warning(
606-
f"Could not delete connection file '{connection_file}' at exit due to error: {e}"
607-
)

etc/kernelspecs/spark_R_conductor_cluster/kernel.json

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -7,7 +7,7 @@
77
}
88
},
99
"env": {
10-
"SPARK_OPTS": "--name ${KERNEL_ID:-ERROR__NO__KERNEL_ID} ${KERNEL_EXTRA_SPARK_OPTS}",
10+
"SPARK_OPTS": "--name ${KERNEL_ID:-ERROR__NO__KERNEL_ID} --conf spark.yarn.maxAppAttempts=1 ${KERNEL_EXTRA_SPARK_OPTS}",
1111
"LAUNCH_OPTS": "--customAppName ${KERNEL_ID}"
1212
},
1313
"argv": [

etc/kernelspecs/spark_R_yarn_cluster/kernel.json

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -8,7 +8,7 @@
88
},
99
"env": {
1010
"SPARK_HOME": "/usr/hdp/current/spark2-client",
11-
"SPARK_OPTS": "--master yarn --deploy-mode cluster --name ${KERNEL_ID:-ERROR__NO__KERNEL_ID} --conf spark.yarn.submit.waitAppCompletion=false --conf spark.yarn.am.waitTime=1d --conf spark.yarn.appMasterEnv.PATH=/opt/conda/bin:$PATH --conf spark.sparkr.r.command=/opt/conda/lib/R/bin/Rscript ${KERNEL_EXTRA_SPARK_OPTS}",
11+
"SPARK_OPTS": "--master yarn --deploy-mode cluster --name ${KERNEL_ID:-ERROR__NO__KERNEL_ID} --conf spark.yarn.submit.waitAppCompletion=false --conf spark.yarn.am.waitTime=1d --conf spark.yarn.appMasterEnv.PATH=/opt/conda/bin:$PATH --conf spark.sparkr.r.command=/opt/conda/lib/R/bin/Rscript --conf spark.yarn.maxAppAttempts=1 ${KERNEL_EXTRA_SPARK_OPTS}",
1212
"LAUNCH_OPTS": ""
1313
},
1414
"argv": [

etc/kernelspecs/spark_python_conductor_cluster/kernel.json

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -8,7 +8,7 @@
88
"debugger": true
99
},
1010
"env": {
11-
"SPARK_OPTS": "--name ${KERNEL_ID:-ERROR__NO__KERNEL_ID} ${KERNEL_EXTRA_SPARK_OPTS}",
11+
"SPARK_OPTS": "--name ${KERNEL_ID:-ERROR__NO__KERNEL_ID} --conf spark.yarn.maxAppAttempts=1 ${KERNEL_EXTRA_SPARK_OPTS}",
1212
"LAUNCH_OPTS": ""
1313
},
1414
"argv": [

etc/kernelspecs/spark_python_yarn_cluster/kernel.json

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -11,7 +11,7 @@
1111
"SPARK_HOME": "/usr/hdp/current/spark2-client",
1212
"PYSPARK_PYTHON": "/opt/conda/bin/python",
1313
"PYTHONPATH": "${HOME}/.local/lib/python3.7/site-packages:/usr/hdp/current/spark2-client/python:/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip",
14-
"SPARK_OPTS": "--master yarn --deploy-mode cluster --name ${KERNEL_ID:-ERROR__NO__KERNEL_ID} --conf spark.yarn.submit.waitAppCompletion=false --conf spark.yarn.appMasterEnv.PYTHONUSERBASE=/home/${KERNEL_USERNAME}/.local --conf spark.yarn.appMasterEnv.PYTHONPATH=${HOME}/.local/lib/python3.7/site-packages:/usr/hdp/current/spark2-client/python:/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip --conf spark.yarn.appMasterEnv.PATH=/opt/conda/bin:$PATH ${KERNEL_EXTRA_SPARK_OPTS}",
14+
"SPARK_OPTS": "--master yarn --deploy-mode cluster --name ${KERNEL_ID:-ERROR__NO__KERNEL_ID} --conf spark.yarn.submit.waitAppCompletion=false --conf spark.yarn.appMasterEnv.PYTHONUSERBASE=/home/${KERNEL_USERNAME}/.local --conf spark.yarn.appMasterEnv.PYTHONPATH=${HOME}/.local/lib/python3.7/site-packages:/usr/hdp/current/spark2-client/python:/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip --conf spark.yarn.appMasterEnv.PATH=/opt/conda/bin:$PATH --conf spark.yarn.maxAppAttempts=1 ${KERNEL_EXTRA_SPARK_OPTS}",
1515
"LAUNCH_OPTS": ""
1616
},
1717
"argv": [

etc/kernelspecs/spark_scala_conductor_cluster/kernel.json

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -7,7 +7,7 @@
77
}
88
},
99
"env": {
10-
"SPARK_OPTS": "--name ${KERNEL_ID:-ERROR__NO__KERNEL_ID} ${KERNEL_EXTRA_SPARK_OPTS}",
10+
"SPARK_OPTS": "--name ${KERNEL_ID:-ERROR__NO__KERNEL_ID} --conf spark.yarn.maxAppAttempts=1 ${KERNEL_EXTRA_SPARK_OPTS}",
1111
"__TOREE_OPTS__": "--alternate-sigint USR2 --spark-context-initialization-mode eager",
1212
"LAUNCH_OPTS": "",
1313
"DEFAULT_INTERPRETER": "Scala"

etc/kernelspecs/spark_scala_yarn_cluster/kernel.json

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -8,7 +8,7 @@
88
},
99
"env": {
1010
"SPARK_HOME": "/usr/hdp/current/spark2-client",
11-
"__TOREE_SPARK_OPTS__": "--master yarn --deploy-mode cluster --name ${KERNEL_ID:-ERROR__NO__KERNEL_ID} --conf spark.yarn.submit.waitAppCompletion=false --conf spark.yarn.am.waitTime=1d ${KERNEL_EXTRA_SPARK_OPTS}",
11+
"__TOREE_SPARK_OPTS__": "--master yarn --deploy-mode cluster --name ${KERNEL_ID:-ERROR__NO__KERNEL_ID} --conf spark.yarn.submit.waitAppCompletion=false --conf spark.yarn.am.waitTime=1d --conf spark.yarn.maxAppAttempts=1 ${KERNEL_EXTRA_SPARK_OPTS}",
1212
"__TOREE_OPTS__": "--alternate-sigint USR2",
1313
"LAUNCH_OPTS": "",
1414
"DEFAULT_INTERPRETER": "Scala"

0 commit comments

Comments (0)