fix(workflow): Clearly differentiate WFT failures resulting from unhandled Promise rejection (#1606)

mjameswh · web-flow · commit eb67d04fca02 · 2025-02-27T18:00:14.000-05:00
diff --git a/packages/test/src/test-integration-split-two.ts b/packages/test/src/test-integration-split-two.ts
@@ -374,9 +374,9 @@ test('unhandledRejection causes WFT to fail', configMacro, async (t, config) =>
           t.fail();
           return;
         }
-        t.is(failure.message, 'unhandled rejection');
+        t.is(failure.message, 'Unhandled Promise rejection: Error: unhandled rejection');
         t.true(failure.stackTrace?.includes(`Error: unhandled rejection`));
-        t.is(failure.cause?.message, 'root failure');
+        t.is(failure.cause?.cause?.message, 'root failure');
       },
       { minTimeout: 300, factor: 1, retries: 100 }
     )
diff --git a/packages/test/src/test-payload-converter.ts b/packages/test/src/test-payload-converter.ts
@@ -193,12 +193,18 @@ if (RUN_INTEGRATION_TESTS) {
     const expectedErrorWasThrown = new Promise<void>((resolve) => {
       markErrorThrown = resolve;
     });
-    const logger = new DefaultLogger('ERROR', (entry) => {
-      if (entry.meta?.error.message === 'Unknown encoding: json/protobuf') {
-        markErrorThrown();
-      }
+
+    Runtime.install({
+      logger: new DefaultLogger('WARN', (entry) => {
+        if (
+          entry.message.includes('Failing workflow task') &&
+          entry.meta?.failure?.includes('Unknown encoding: json/protobuf')
+        ) {
+          markErrorThrown();
+        }
+      }),
+      telemetryOptions: { logging: { forward: {}, filter: 'WARN' } },
     });
-    Runtime.install({ logger });
 
     const taskQueue = `${__filename}/${t.title}`;
     const worker = await Worker.create({
diff --git a/packages/test/src/test-workflows.ts b/packages/test/src/test-workflows.ts
@@ -1645,9 +1645,16 @@ test('globalOverrides', async (t) => {
 
 test('logAndTimeout', async (t) => {
   const { workflowType, workflow } = t.context;
-  await t.throwsAsync(activate(t, makeStartWorkflow(workflowType)), {
-    code: 'ERR_SCRIPT_EXECUTION_TIMEOUT',
-    message: 'Script execution timed out after 400ms',
+  const completion = await activate(t, makeStartWorkflow(workflowType));
+  compareCompletion(t, completion, {
+    failed: {
+      failure: {
+        message: 'Script execution timed out after 400ms',
+        source: 'TypeScriptSDK',
+        stackTrace: 'Error: Script execution timed out after 400ms',
+        cause: undefined,
+      },
+    },
   });
   const calls = await workflow.getAndResetSinkCalls();
   // Ignore LogTimestamp and workflowInfo for the purpose of this comparison
diff --git a/packages/worker/src/errors.ts b/packages/worker/src/errors.ts
@@ -16,11 +16,6 @@ export class GracefulShutdownPeriodExpiredError extends Error {}
  * a Promise. To silent rejections on a specific Promise, use `promise.catch(funcThatCantThrow)`
  * (e.g. `promise.catch(() => void 0)` or `promise.catch((e) => logger.error(e))`).
  */
-// FIXME: At this time, this wrapper is only used for errors that could not be associated with a
-//        specific workflow run; it should also be used for unhandled rejections in workflow code,
-//        but this is not possible at the moment as we intentionally "unhandle" non-TemporalFailure
-//        errors happening in workflow code (i.e. ALL non-TemporalFailure errors thrown from
-//        workflow code becomes Unhandled Rejection at some point in our own logic)
 @SymbolBasedInstanceOfError('UnhandledRejectionError')
 export class UnhandledRejectionError extends Error {
   constructor(
diff --git a/packages/worker/src/workflow/vm-shared.ts b/packages/worker/src/workflow/vm-shared.ts
@@ -26,7 +26,7 @@ export function setUnhandledRejectionHandler(getWorkflowByRunId: (runId: string)
     if (runId !== undefined) {
       const workflow = getWorkflowByRunId(runId);
       if (workflow !== undefined) {
-        workflow.setUnhandledRejection(err);
+        workflow.setUnhandledRejection(new UnhandledRejectionError(`Unhandled Promise rejection: ${err}`, err));
         return;
       }
     }
@@ -323,97 +323,100 @@ export abstract class BaseVMWorkflow implements Workflow {
   public async activate(
     activation: coresdk.workflow_activation.IWorkflowActivation
   ): Promise<coresdk.workflow_completion.IWorkflowActivationCompletion> {
-    if (this.context === undefined) throw new IllegalStateError('Workflow isolate context uninitialized');
-    activation = coresdk.workflow_activation.WorkflowActivation.fromObject(activation);
-    if (!activation.jobs) throw new TypeError('Expected workflow activation jobs to be defined');
-
-    // Queries are particular in many ways, and Core guarantees that a single activation will not
-    // contain both queries and other jobs. So let's handle them separately.
-    const [queries, nonQueries] = partition(activation.jobs, ({ queryWorkflow }) => queryWorkflow != null);
-    if (queries.length > 0) {
-      if (nonQueries.length > 0) throw new TypeError('Got both queries and other jobs in a single activation');
-      return this.activateQueries(activation);
-    }
+    try {
+      if (this.context === undefined) throw new IllegalStateError('Workflow isolate context uninitialized');
+      activation = coresdk.workflow_activation.WorkflowActivation.fromObject(activation);
+      if (!activation.jobs) throw new TypeError('Expected workflow activation jobs to be defined');
+
+      // Queries are particular in many ways, and Core guarantees that a single activation will not
+      // contain both queries and other jobs. So let's handle them separately.
+      const [queries, nonQueries] = partition(activation.jobs, ({ queryWorkflow }) => queryWorkflow != null);
+      if (queries.length > 0) {
+        if (nonQueries.length > 0) throw new TypeError('Got both queries and other jobs in a single activation');
+        return this.activateQueries(activation);
+      }
 
-    // Update the activator's state in preparation for a non-query activation.
-    // This is done early, so that we can then rely on the activator while processing the activation.
-    if (activation.timestamp == null)
-      throw new TypeError('Expected activation.timestamp to be set for non-query activation');
-    this.activator.now = tsToMs(activation.timestamp);
-    this.activator.mutateWorkflowInfo((info) => ({
-      ...info,
-      historyLength: activation.historyLength as number,
-      // Exact truncation for multi-petabyte histories
-      // historySize === 0 means WFT was generated by pre-1.20.0 server, and the history size is unknown
-      historySize: activation.historySizeBytes?.toNumber() ?? 0,
-      continueAsNewSuggested: activation.continueAsNewSuggested ?? false,
-      currentBuildId: activation.buildIdForCurrentTask ?? undefined,
-      unsafe: {
-        ...info.unsafe,
-        isReplaying: activation.isReplaying ?? false,
-      },
-    }));
-    this.activator.addKnownFlags(activation.availableInternalFlags ?? []);
+      // Update the activator's state in preparation for a non-query activation.
+      // This is done early, so that we can then rely on the activator while processing the activation.
+      if (activation.timestamp == null)
+        throw new TypeError('Expected activation.timestamp to be set for non-query activation');
+      this.activator.now = tsToMs(activation.timestamp);
+      this.activator.mutateWorkflowInfo((info) => ({
+        ...info,
+        historyLength: activation.historyLength as number,
+        // Exact truncation for multi-petabyte histories
+        // historySize === 0 means WFT was generated by pre-1.20.0 server, and the history size is unknown
+        historySize: activation.historySizeBytes?.toNumber() ?? 0,
+        continueAsNewSuggested: activation.continueAsNewSuggested ?? false,
+        currentBuildId: activation.buildIdForCurrentTask ?? undefined,
+        unsafe: {
+          ...info.unsafe,
+          isReplaying: activation.isReplaying ?? false,
+        },
+      }));
+      this.activator.addKnownFlags(activation.availableInternalFlags ?? []);
 
-    // Initialization of the workflow must happen before anything else. Yet, keep the init job in
-    // place in the list as we'll use it as a marker to know when to start the workflow function.
-    const initWorkflowJob = activation.jobs.find((job) => job.initializeWorkflow != null)?.initializeWorkflow;
-    if (initWorkflowJob) this.workflowModule.initialize(initWorkflowJob);
+      // Initialization of the workflow must happen before anything else. Yet, keep the init job in
+      // place in the list as we'll use it as a marker to know when to start the workflow function.
+      const initWorkflowJob = activation.jobs.find((job) => job.initializeWorkflow != null)?.initializeWorkflow;
+      if (initWorkflowJob) this.workflowModule.initialize(initWorkflowJob);
 
-    const hasSignals = activation.jobs.some(({ signalWorkflow }) => signalWorkflow != null);
-    const doSingleBatch = !hasSignals || this.activator.hasFlag(SdkFlags.ProcessWorkflowActivationJobsAsSingleBatch);
+      const hasSignals = activation.jobs.some(({ signalWorkflow }) => signalWorkflow != null);
+      const doSingleBatch = !hasSignals || this.activator.hasFlag(SdkFlags.ProcessWorkflowActivationJobsAsSingleBatch);
 
-    const [patches, nonPatches] = partition(activation.jobs, ({ notifyHasPatch }) => notifyHasPatch != null);
-    for (const { notifyHasPatch } of patches) {
-      if (notifyHasPatch == null) throw new TypeError('Expected notifyHasPatch to be set');
-      this.activator.notifyHasPatch(notifyHasPatch);
-    }
+      const [patches, nonPatches] = partition(activation.jobs, ({ notifyHasPatch }) => notifyHasPatch != null);
+      for (const { notifyHasPatch } of patches) {
+        if (notifyHasPatch == null) throw new TypeError('Expected notifyHasPatch to be set');
+        this.activator.notifyHasPatch(notifyHasPatch);
+      }
 
-    if (doSingleBatch) {
-      // updateRandomSeed requires the same special handling as patches (before anything else, and don't
-      // unblock conditions after each job). Unfortunately, prior to ProcessWorkflowActivationJobsAsSingleBatch,
-      // they were handled as regular jobs, making it unsafe to properly handle that job above, with patches.
-      const [updateRandomSeed, rest] = partition(nonPatches, ({ updateRandomSeed }) => updateRandomSeed != null);
-      if (updateRandomSeed.length > 0)
-        this.activator.updateRandomSeed(updateRandomSeed[updateRandomSeed.length - 1].updateRandomSeed!);
-      this.workflowModule.activate(
-        coresdk.workflow_activation.WorkflowActivation.fromObject({ ...activation, jobs: rest })
-      );
-      this.tryUnblockConditionsAndMicrotasks();
-    } else {
-      const [signals, nonSignals] = partition(
-        nonPatches,
-        // Move signals to a first batch; all the rest goes in a second batch.
-        ({ signalWorkflow }) => signalWorkflow != null
-      );
-
-      // Loop and invoke each batch, waiting for microtasks to complete after each batch.
-      let batchIndex = 0;
-      for (const jobs of [signals, nonSignals]) {
-        if (jobs.length === 0) continue;
+      if (doSingleBatch) {
+        // updateRandomSeed requires the same special handling as patches (before anything else, and don't
+        // unblock conditions after each job). Unfortunately, prior to ProcessWorkflowActivationJobsAsSingleBatch,
+        // they were handled as regular jobs, making it unsafe to properly handle that job above, with patches.
+        const [updateRandomSeed, rest] = partition(nonPatches, ({ updateRandomSeed }) => updateRandomSeed != null);
+        if (updateRandomSeed.length > 0)
+          this.activator.updateRandomSeed(updateRandomSeed[updateRandomSeed.length - 1].updateRandomSeed!);
         this.workflowModule.activate(
-          coresdk.workflow_activation.WorkflowActivation.fromObject({ ...activation, jobs }),
-          batchIndex++
+          coresdk.workflow_activation.WorkflowActivation.fromObject({ ...activation, jobs: rest })
         );
         this.tryUnblockConditionsAndMicrotasks();
+      } else {
+        const [signals, nonSignals] = partition(
+          nonPatches,
+          // Move signals to a first batch; all the rest goes in a second batch.
+          ({ signalWorkflow }) => signalWorkflow != null
+        );
+
+        // Loop and invoke each batch, waiting for microtasks to complete after each batch.
+        let batchIndex = 0;
+        for (const jobs of [signals, nonSignals]) {
+          if (jobs.length === 0) continue;
+          this.workflowModule.activate(
+            coresdk.workflow_activation.WorkflowActivation.fromObject({ ...activation, jobs }),
+            batchIndex++
+          );
+          this.tryUnblockConditionsAndMicrotasks();
+        }
       }
-    }
 
-    const completion = this.workflowModule.concludeActivation();
+      const completion = this.workflowModule.concludeActivation();
 
-    // Give unhandledRejection handler a chance to be triggered.
-    await new Promise(setImmediate);
-    if (this.unhandledRejection) {
+      // Give unhandledRejection handler a chance to be triggered.
+      await new Promise(setImmediate);
+      if (this.unhandledRejection) throw this.unhandledRejection;
+
+      return completion;
+    } catch (err) {
       return {
         runId: this.activator.info.runId,
         // FIXME: Calling `activator.errorToFailure()` directly from outside the VM is unsafe, as it
         // depends on the `failureConverter` and `payloadConverter`, which may be customized and
         // therefore aren't guaranteed not to access `global` or to cause scheduling microtasks.
         // Admitingly, the risk is very low, so we're leaving it as is for now.
-        failed: { failure: this.activator.errorToFailure(this.unhandledRejection) },
+        failed: { failure: this.activator.errorToFailure(err) },
       };
     }
-    return completion;
   }
 
   private activateQueries(
@@ -434,14 +437,22 @@ export abstract class BaseVMWorkflow implements Workflow {
    * If called (by an external unhandledRejection handler), activations will fail with provided error.
    */
   public setUnhandledRejection(err: unknown): void {
+    if (this.activator) {
+      // This is very unlikely to make a difference, as unhandled rejections should be reported
+      // on the next macro task of the outer execution context (i.e. not inside the VM), at which
+      // point we are done handling the workflow activation anyway. But just in case, copying the
+      // error to the activator will ensure that any attempt to make progress in the workflow
+      // VM will immediately fail.
+      this.activator.workflowTaskError = err;
+    }
     this.unhandledRejection = err;
   }
 
   /**
    * Call into the Workflow context to attempt to unblock any blocked conditions and microtasks.
    *
-   * This is performed in a loop allowing microtasks to be processed between
-   * each iteration until there are no more conditions to unblock.
+   * This is performed in a loop, going in and out of the VM, allowing microtasks to be processed
+   * between each iteration of the outer loop, until there are no more conditions to unblock.
    */
   protected tryUnblockConditionsAndMicrotasks(): void {
     for (;;) {
diff --git a/packages/workflow/src/internals.ts b/packages/workflow/src/internals.ts
@@ -204,6 +204,38 @@ export class Activator implements ActivationHandler {
     childToParent: new Map(),
   };
 
+  /**
+   * The error that caused the current Workflow Task to fail. Set if a non-`TemporalFailure`
+   * error bubbles up out of the Workflow function, or out of a Signal or Update handler. We
+   * capture errors this way because those functions are not technically awaited when started,
+   * but left to run asynchronously. There is therefore no real "parent" function that can
+   * directly handle those errors, and not capturing it would result in an Unhandled Promise
+   * Rejection. So instead, we buffer the error here, to then be processed in the context
+   * of our own synchronous Activation handling event loop.
+   *
+   * Our code does a best effort to stop processing the current activation as soon as possible
+   * after this field is set:
+   *  - If an error is thrown while executing code synchronously (e.g. anything before the
+   *    first `await` statement in a Workflow function or a signal/update handler), the error
+   *    will be _immediately_ rethrown, which will prevent execution of further jobs in the
+   *    current activation. We know we're currently running code synchronously thanks to the
+   *    `rethrowSynchronously` flag below.
+   *  - If an error is thrown while executing microtasks, then the error will be rethrown on
+   *    the next call to `tryUnblockConditions()`.
+   *
+   * Unfortunately, there's no way for us to prevent further execution of microtasks that have
+   * already been scheduled, nor those that will be recursively scheduled from those microtasks.
+   * Should more errors get thrown while settling microtasks, those will be ignored (i.e. only
+   * the first captured error is preserved).
+   */
+  public workflowTaskError: unknown;
+
+  /**
+   * Set to true when running synchronous code (e.g. while processing activation jobs and when calling
+   * `tryUnblockConditions()`). While this flag is set, it is safe to let errors bubble up.
+   */
+  public rethrowSynchronously = false;
+
   public readonly rootScope = new RootCancellationScope();
 
   /**
@@ -703,7 +735,7 @@ export class Activator implements ActivationHandler {
           if (error instanceof TemporalFailure) {
             this.rejectUpdate(protocolInstanceId, error);
           } else {
-            throw error;
+            this.handleWorkflowFailure(error);
           }
         })
         .finally(() => this.inProgressUpdates.delete(updateId));
@@ -830,6 +862,8 @@ export class Activator implements ActivationHandler {
   }
 
   public warnIfUnfinishedHandlers(): void {
+    if (this.workflowTaskError) return;
+
     const getWarnable = (handlerExecutions: Iterable<MessageHandlerExecution>): MessageHandlerExecution[] => {
       return Array.from(handlerExecutions).filter(
         (ex) => ex.unfinishedPolicy === HandlerUnfinishedPolicy.WARN_AND_ABANDON
@@ -941,17 +975,12 @@ export class Activator implements ActivationHandler {
    * Transforms failures into a command to be sent to the server.
    * Used to handle any failure emitted by the Workflow.
    */
-  async handleWorkflowFailure(error: unknown): Promise<void> {
+  handleWorkflowFailure(error: unknown): void {
     if (this.cancelled && isCancellation(error)) {
       this.pushCommand({ cancelWorkflowExecution: {} }, true);
     } else if (error instanceof ContinueAsNew) {
       this.pushCommand({ continueAsNewWorkflowExecution: error.command }, true);
-    } else {
-      if (!(error instanceof TemporalFailure)) {
-        // This results in an unhandled rejection which will fail the activation
-        // preventing it from completing.
-        throw error;
-      }
+    } else if (error instanceof TemporalFailure) {
       // Fail the workflow. We do not want to issue unfinishedHandlers warnings. To achieve that, we
       // mark all handlers as completed now.
       this.inProgressSignals.clear();
@@ -964,9 +993,29 @@ export class Activator implements ActivationHandler {
         },
         true
       );
+    } else {
+      this.recordWorkflowTaskError(error);
     }
   }
 
+  recordWorkflowTaskError(error: unknown): void {
+    // Only keep the first error that bubbles up; subsequent errors will be ignored.
+    if (this.workflowTaskError === undefined) this.workflowTaskError = error;
+
+    // Immediately rethrow the error if we know it is safe to do so (i.e. we are not running async
+    // microtasks). Otherwise, the error will be rethrown whenever we get an opportunity to do so,
+    // e.g. the next time `tryUnblockConditions()` is called.
+    if (this.rethrowSynchronously) this.maybeRethrowWorkflowTaskError();
+  }
+
+  /**
+   * If a Workflow Task error was captured, bubble it up now; this method does not itself check
+   * `rethrowSynchronously`. This is safe to call even if there is no error to rethrow.
+   */
+  maybeRethrowWorkflowTaskError(): void {
+    if (this.workflowTaskError) throw this.workflowTaskError;
+  }
+
   private completeQuery(queryId: string, result: unknown): void {
     this.pushCommand({
       respondToQuery: { queryId, succeeded: { response: this.payloadConverter.toPayload(result) } },
diff --git a/packages/workflow/src/worker-interface.ts b/packages/workflow/src/worker-interface.ts

Original file line number	Diff line number	Diff line change
`@@ -374,9 +374,9 @@ test('unhandledRejection causes WFT to fail', configMacro, async (t, config) =>`
`374`	`374`	`t.fail();`
`375`	`375`	`return;`
`376`	`376`	`}`
`377`		`- t.is(failure.message, 'unhandled rejection');`
	`377`	`+ t.is(failure.message, 'Unhandled Promise rejection: Error: unhandled rejection');`
`378`	`378`	t.true(failure.stackTrace?.includes(`Error: unhandled rejection`));
`379`		`- t.is(failure.cause?.message, 'root failure');`
	`379`	`+ t.is(failure.cause?.cause?.message, 'root failure');`
`380`	`380`	`},`
`381`	`381`	`{ minTimeout: 300, factor: 1, retries: 100 }`
`382`	`382`	`)`