From 095724603de93799da4c3b57ed286a795a6d310d Mon Sep 17 00:00:00 2001
From: Theodore Li <theo@sim.ai>
Date: Tue, 2 Jun 2026 20:17:10 -0700
Subject: [PATCH 1/3] fix(webhook): don't fault trigger run on user/workflow
 execution errors

Webhook-triggered executions re-threw every error, so trigger.dev marked
the run failed and fired #eng-errors alerts. The vast majority of these are
user-caused workflow failures (missing required fields, invalid field
references, bad URLs, provider 4xx, expired models, low credit) that are
already recorded in the execution logs.

Distinguish fault vs error in executeWebhookJobInternal: when the failure
was finalized by core (the workflow ran and its failure is logged), complete
the run with { success: false } instead of throwing. Errors that were not
finalized came from the webhook pipeline itself and still re-throw to fault
the run. Await waitForPostExecution first so the finalized flag is reliable.

The error is still recorded on the run's OTel span via recordException (no
ERROR status, so the run isn't faulted) and remains in the execution logs,
so these stay investigable in Tempo/Loki without false alerts.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 apps/sim/background/webhook-execution.test.ts | 156 +++++++++++++++++-
 apps/sim/background/webhook-execution.ts      |  22 ++-
 2 files changed, 175 insertions(+), 3 deletions(-)
diff --git a/apps/sim/background/webhook-execution.test.ts b/apps/sim/background/webhook-execution.test.ts
index 620c073ac0d..79ed92eca65 100644
--- a/apps/sim/background/webhook-execution.test.ts
+++ b/apps/sim/background/webhook-execution.test.ts
@@ -2,17 +2,110 @@
  * @vitest-environment node
  */
 
+import {
+  dbChainMock,
+  dbChainMockFns,
+  executionPreprocessingMock,
+  executionPreprocessingMockFns,
+  loggingSessionMock,
+  loggingSessionMockFns,
+} from '@sim/testing'
 import { beforeEach, describe, expect, it, vi } from 'vitest'
 
-const { mockResolveWebhookRecordProviderConfig } = vi.hoisted(() => ({
+const {
+  mockResolveWebhookRecordProviderConfig,
+  mockExecuteWorkflowCore,
+  mockWasExecutionFinalizedByCore,
+  mockRecordException,
+  mockGetActiveSpan,
+} = vi.hoisted(() => ({
   mockResolveWebhookRecordProviderConfig: vi.fn(),
+  mockExecuteWorkflowCore: vi.fn(),
+  mockWasExecutionFinalizedByCore: vi.fn(),
+  mockRecordException: vi.fn(),
+  mockGetActiveSpan: vi.fn(),
 }))
 
+vi.mock('@opentelemetry/api', () => ({
+  trace: { getActiveSpan: mockGetActiveSpan },
+}))
+
+vi.mock('@sim/db', () => dbChainMock)
+vi.mock('@/lib/execution/preprocessing', () => executionPreprocessingMock)
+vi.mock('@/lib/logs/execution/logging-session', () => loggingSessionMock)
+
 vi.mock('@/lib/webhooks/env-resolver', () => ({
   resolveWebhookRecordProviderConfig: mockResolveWebhookRecordProviderConfig,
 }))
 
-import { resolveWebhookExecutionProviderConfig } from './webhook-execution'
+vi.mock('@/lib/workflows/executor/execution-core', () => ({
+  executeWorkflowCore: mockExecuteWorkflowCore,
+  wasExecutionFinalizedByCore: mockWasExecutionFinalizedByCore,
+}))
+
+vi.mock('@/lib/core/idempotency', () => ({
+  IdempotencyService: { createWebhookIdempotencyKey: vi.fn(() => 'idempotency-key') },
+  webhookIdempotency: {
+    executeWithIdempotency: vi.fn(
+      (_provider: string, _key: string, operation: () => Promise<unknown>) => operation()
+    ),
+  },
+}))
+
+vi.mock('@/lib/workflows/persistence/utils', () => ({
+  loadDeployedWorkflowState: vi.fn(async () => ({
+    blocks: {},
+    edges: [],
+    loops: {},
+    parallels: {},
+    deploymentVersionId: 'deployment-1',
+  })),
+}))
+
+vi.mock('@/lib/webhooks/providers', () => ({
+  getProviderHandler: vi.fn(() => ({})),
+}))
+
+vi.mock('@/lib/logs/execution/trace-spans/trace-spans', () => ({
+  buildTraceSpans: vi.fn(() => ({ traceSpans: [] })),
+}))
+
+vi.mock('@/lib/core/execution-limits', () => ({
+  createTimeoutAbortController: vi.fn(() => ({
+    signal: new AbortController().signal,
+    cleanup: vi.fn(),
+    isTimedOut: () => false,
+    timeoutMs: 120_000,
+  })),
+  getTimeoutErrorMessage: vi.fn(() => 'timed out'),
+}))
+
+vi.mock('@/lib/workflows/executor/pause-persistence', () => ({
+  handlePostExecutionPauseState: vi.fn(),
+}))
+
+vi.mock('@/lib/webhooks/attachment-processor', () => ({
+  WebhookAttachmentProcessor: class {},
+}))
+
+vi.mock('@/app/api/auth/oauth/utils', () => ({
+  resolveOAuthAccountId: vi.fn(),
+}))
+
+vi.mock('@/executor/execution/snapshot', () => ({
+  ExecutionSnapshot: class {},
+}))
+
+vi.mock('@/tools/safe-assign', () => ({ safeAssign: vi.fn() }))
+
+vi.mock('@/blocks', () => ({ getBlock: vi.fn(() => null) }))
+
+vi.mock('@/triggers', () => ({
+  getTrigger: vi.fn(),
+  isTriggerValid: vi.fn(() => false),
+}))
+
+import { executeWebhookJob, resolveWebhookExecutionProviderConfig } from './webhook-execution'
 
 describe('resolveWebhookExecutionProviderConfig', () => {
   beforeEach(() => {
@@ -66,3 +159,62 @@ describe('resolveWebhookExecutionProviderConfig', () => {
     )
   })
 })
+
+describe('executeWebhookJob fault vs error handling', () => {
+  const payload = {
+    webhookId: 'webhook-1',
+    workflowId: 'workflow-1',
+    userId: 'user-1',
+    executionId: 'execution-1',
+    requestId: 'request-1',
+    provider: 'gmail',
+    body: { message: 'hello' },
+    headers: {},
+    path: '/webhook',
+    workspaceId: 'workspace-1',
+  }
+
+  beforeEach(() => {
+    vi.clearAllMocks()
+    executionPreprocessingMockFns.mockPreprocessExecution.mockResolvedValue({
+      success: true,
+      workflowRecord: { workspaceId: 'workspace-1', userId: 'user-1', variables: {} },
+      executionTimeout: { async: 120_000 },
+    })
+    mockResolveWebhookRecordProviderConfig.mockImplementation(async (record) => record)
+    dbChainMockFns.limit.mockResolvedValue([{ id: 'webhook-1' }])
+    mockGetActiveSpan.mockReturnValue({ recordException: mockRecordException })
+  })
+
+  it('completes the run (does not throw) when the failure was finalized by core', async () => {
+    mockExecuteWorkflowCore.mockRejectedValue(
+      new Error('Gmail 2 is missing required fields: Label')
+    )
+    mockWasExecutionFinalizedByCore.mockReturnValue(true)
+
+    const result = await executeWebhookJob(payload)
+
+    expect(result).toMatchObject({
+      success: false,
+      workflowId: 'workflow-1',
+      executionId: 'execution-1',
+      provider: 'gmail',
+    })
+    expect(loggingSessionMockFns.mockWaitForPostExecution).toHaveBeenCalled()
+    // User/workflow errors are already recorded by core — the catch must not re-log them.
+    expect(loggingSessionMockFns.mockSafeCompleteWithError).not.toHaveBeenCalled()
+    // The error is still recorded on the run span so it stays visible in traces.
+    expect(mockRecordException).toHaveBeenCalledWith(
+      expect.objectContaining({ message: 'Gmail 2 is missing required fields: Label' })
+    )
+  })
+
+  it('faults the run (re-throws) when the failure was not finalized by core', async () => {
+    mockExecuteWorkflowCore.mockRejectedValue(new Error('Workflow state not found'))
+    mockWasExecutionFinalizedByCore.mockReturnValue(false)
+
+    await expect(executeWebhookJob(payload)).rejects.toThrow('Workflow state not found')
+    // Pipeline/infra errors are recorded here before re-throwing to fault the trigger.dev run.
+    expect(loggingSessionMockFns.mockSafeCompleteWithError).toHaveBeenCalled()
+  })
+})
diff --git a/apps/sim/background/webhook-execution.ts b/apps/sim/background/webhook-execution.ts
index 1753813d849..41048f9208b 100644
--- a/apps/sim/background/webhook-execution.ts
+++ b/apps/sim/background/webhook-execution.ts
@@ -1,3 +1,4 @@
+import { trace } from '@opentelemetry/api'
 import { db } from '@sim/db'
 import { account, webhook } from '@sim/db/schema'
 import { createLogger, runWithRequestContext } from '@sim/logger'
@@ -616,8 +617,27 @@ async function executeWebhookJobInternal(
       provider: payload.provider,
     })
 
+    // The finalized flag is set inside a fire-and-forget post-execution promise; await it so the
+    // signal is reliable and the failure is fully persisted before we decide fault vs error.
+    await loggingSession.waitForPostExecution()
+
+    // A failure inside workflow execution (block error, provider 4xx, missing required field, etc.)
+    // is finalized by core and already recorded in the execution logs. That is a user/workflow error,
+    // not a trigger.dev job fault — complete the run normally so we don't fire a false alert. Errors
+    // that were not finalized came from the webhook pipeline itself, so we re-throw to fault below.
     if (wasExecutionFinalizedByCore(error, executionId)) {
-      throw error
+      // Record the exception on the run span so it stays visible in traces without
+      // marking the span as ERROR — that status is what faults the trigger.dev run.
+      trace.getActiveSpan()?.recordException(toError(error))
+
+      return {
+        success: false,
+        workflowId: payload.workflowId,
+        executionId,
+        output: hasExecutionResult(error) ? error.executionResult.output : {},
+        executedAt: new Date().toISOString(),
+        provider: payload.provider,
+      }
     }
 
     try {

From 682477a434cebfc99ffc575104c3add845e6ea73 Mon Sep 17 00:00:00 2001
From: Theodore Li <theo@sim.ai>
Date: Tue, 2 Jun 2026 22:37:24 -0700
Subject: [PATCH 2/3] fix(schedule): don't fault trigger run on error-recovery
 failures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The schedule task already treats workflow-execution failures as recorded
errors rather than trigger faults, but the outermost catch's own recovery
code (the infra-retry and releaseClaim calls) was unguarded. A secondary DB
blip while releasing the claim re-threw and escaped run(), faulting the
trigger.dev run and firing an alert — a double-fault during cleanup.

Wrap the recovery path in a try/catch: log and record the exception on the
span without re-throwing. The claim expires on its TTL and the next tick
re-claims the schedule, so swallowing the cleanup failure is safe.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 apps/sim/background/schedule-execution.ts | 31 ++++++++++++++++-------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/apps/sim/background/schedule-execution.ts b/apps/sim/background/schedule-execution.ts
index 4c240e5f1e7..0ee2b07f2f3 100644
--- a/apps/sim/background/schedule-execution.ts
+++ b/apps/sim/background/schedule-execution.ts
@@ -1,3 +1,4 @@
+import { trace } from '@opentelemetry/api'
 import {
   db,
   jobExecutionLogs,
@@ -947,16 +948,28 @@ export async function executeScheduleJob(payload: ScheduleExecutionPayload) {
         )
       }
     } catch (error: unknown) {
-      if (isRetryableInfrastructureError(error)) {
-        await retryScheduleAfterInfraFailure({ payload, requestId, claimedAt, error })
-        return
-      }
+      try {
+        if (isRetryableInfrastructureError(error)) {
+          await retryScheduleAfterInfraFailure({ payload, requestId, claimedAt, error })
+          return
+        }
 
-      logger.error(`[${requestId}] Error processing schedule ${payload.scheduleId}`, error)
-      await releaseClaim(
-        now,
-        `Failed to release schedule ${payload.scheduleId} after unhandled error`
-      )
+        logger.error(`[${requestId}] Error processing schedule ${payload.scheduleId}`, error)
+        await releaseClaim(
+          now,
+          `Failed to release schedule ${payload.scheduleId} after unhandled error`
+        )
+      } catch (recoveryError: unknown) {
+        // A secondary failure during error recovery (e.g. a transient DB blip while
+        // releasing the claim or scheduling an infra retry) must not fault the run. The
+        // claim expires on its TTL and the next tick re-claims the schedule. Record the
+        // exception on the span so it stays visible in traces without faulting the run.
+        logger.error(
+          `[${requestId}] Failed to recover schedule ${payload.scheduleId} after error`,
+          recoveryError
+        )
+        trace.getActiveSpan()?.recordException(toError(recoveryError))
+      }
     }
   })
 }

From 3fbd65fbb6b94c3819c82d0301b6553925bb7e53 Mon Sep 17 00:00:00 2001
From: Theodore Li <theo@sim.ai>
Date: Tue, 2 Jun 2026 22:44:17 -0700
Subject: [PATCH 3/3] test(webhook): assert waitForPostExecution runs on the
 non-finalized path

Guards the race fix on the infra-error path so a future refactor can't
silently drop the await. Addresses Greptile review feedback.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 apps/sim/background/webhook-execution.test.ts | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/apps/sim/background/webhook-execution.test.ts b/apps/sim/background/webhook-execution.test.ts
index 79ed92eca65..7a75dc3e4f9 100644
--- a/apps/sim/background/webhook-execution.test.ts
+++ b/apps/sim/background/webhook-execution.test.ts
@@ -214,6 +214,8 @@ describe('executeWebhookJob fault vs error handling', () => {
     mockWasExecutionFinalizedByCore.mockReturnValue(false)
 
     await expect(executeWebhookJob(payload)).rejects.toThrow('Workflow state not found')
+    // waitForPostExecution must run on every path so the finalized-by-core signal is always reliable.
+    expect(loggingSessionMockFns.mockWaitForPostExecution).toHaveBeenCalled()
     // Pipeline/infra errors are recorded here before re-throwing to fault the trigger.dev run.
     expect(loggingSessionMockFns.mockSafeCompleteWithError).toHaveBeenCalled()
   })