@@ -274,6 +274,48 @@ env_vars:
274274 To (WithTransform (RayJobReason , Equal (rayv1 .DeadlineExceeded )))
275275 })
276276
277+ test .T ().Run ("RayJob fails when head Pod is deleted when job is running" , func (_ * testing.T ) {
278+ rayJobAC := rayv1ac .RayJob ("delete-head-after-submit" , namespace .Name ).
279+ WithSpec (rayv1ac .RayJobSpec ().
280+ WithRayClusterSpec (NewRayClusterSpec ()).
281+ WithEntrypoint ("python -c \" import time; time.sleep(60)\" " ).
282+ WithShutdownAfterJobFinishes (true ))
283+
284+ rayJob , err := test .Client ().Ray ().RayV1 ().RayJobs (namespace .Name ).Apply (test .Ctx (), rayJobAC , TestApplyOptions )
285+ g .Expect (err ).NotTo (HaveOccurred ())
286+ LogWithTimestamp (test .T (), "Created RayJob %s/%s successfully" , rayJob .Namespace , rayJob .Name )
287+
288+ // Wait until the RayJob's job status transitions to Running
289+ LogWithTimestamp (test .T (), "Waiting for RayJob %s/%s to be 'Running'" , rayJob .Namespace , rayJob .Name )
290+ g .Eventually (RayJob (test , rayJob .Namespace , rayJob .Name ), TestTimeoutMedium ).
291+ Should (WithTransform (RayJobStatus , Equal (rayv1 .JobStatusRunning )))
292+
293+ // Fetch RayCluster and delete the head Pod
294+ rayJob , err = GetRayJob (test , rayJob .Namespace , rayJob .Name )
295+ g .Expect (err ).NotTo (HaveOccurred ())
296+ rayCluster , err := GetRayCluster (test , rayJob .Namespace , rayJob .Status .RayClusterName )
297+ g .Expect (err ).NotTo (HaveOccurred ())
298+ headPod , err := GetHeadPod (test , rayCluster )
299+ g .Expect (err ).NotTo (HaveOccurred ())
300+ LogWithTimestamp (test .T (), "Deleting head Pod %s/%s for RayCluster %s" , headPod .Namespace , headPod .Name , rayCluster .Name )
301+ err = test .Client ().Core ().CoreV1 ().Pods (headPod .Namespace ).Delete (test .Ctx (), headPod .Name , metav1.DeleteOptions {})
302+ g .Expect (err ).NotTo (HaveOccurred ())
303+
304+ // After head pod deletion, controller should mark RayJob as Failed with a specific message
305+ g .Eventually (RayJob (test , rayJob .Namespace , rayJob .Name ), TestTimeoutMedium ).
306+ Should (WithTransform (RayJobDeploymentStatus , Equal (rayv1 .JobDeploymentStatusFailed )))
307+ g .Eventually (RayJob (test , rayJob .Namespace , rayJob .Name ), TestTimeoutMedium ).
308+ Should (WithTransform (RayJobReason , Equal (rayv1 .AppFailed )))
309+ g .Eventually (RayJob (test , rayJob .Namespace , rayJob .Name ), TestTimeoutMedium ).
310+ Should (WithTransform (func (job * rayv1.RayJob ) string { return job .Status .Message },
311+ Equal ("Submitter completed but Ray job not found in RayCluster." )))
312+
313+ // Cleanup
314+ err = test .Client ().Ray ().RayV1 ().RayJobs (namespace .Name ).Delete (test .Ctx (), rayJob .Name , metav1.DeleteOptions {})
315+ g .Expect (err ).NotTo (HaveOccurred ())
316+ LogWithTimestamp (test .T (), "Deleted RayJob %s/%s successfully" , rayJob .Namespace , rayJob .Name )
317+ })
318+
277319 test .T ().Run ("RayJob should be created, but not updated when managed externally" , func (_ * testing.T ) {
278320 // RayJob
279321 rayJobAC := rayv1ac .RayJob ("managed-externally" , namespace .Name ).
0 commit comments