From f25e12de878d445ee48008d1e4951349abc773fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Braulio=20Gonz=C3=A1lez=20Valido?=
 <jose.gonzalez@n8n.io>
Date: Thu, 4 Jun 2026 10:57:08 +0100
Subject: [PATCH] ci(ai-builder): Add iterations + experiment-name inputs to
 instance-ai eval dispatch (no-changelog) (#31631)

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci-instance-ai-evals.yml    | 10 +++
 .github/workflows/test-evals-instance-ai.yml  | 42 ++++++-----
 .../@n8n/instance-ai/evaluations/cli/index.ts | 74 ++++++++++++-------
 3 files changed, 83 insertions(+), 43 deletions(-)

diff --git a/.github/workflows/ci-instance-ai-evals.yml b/.github/workflows/ci-instance-ai-evals.yml
index d0df898f915..89f6c367ada 100644
--- a/.github/workflows/ci-instance-ai-evals.yml
+++ b/.github/workflows/ci-instance-ai-evals.yml
@@ -12,6 +12,14 @@ on:
         description: 'Sandbox provider (n8n-sandbox or daytona)'
         required: false
         default: 'n8n-sandbox'
+      iterations:
+        description: 'Iterations per test case (use 10 for a baseline)'
+        required: false
+        default: '3'
+      experiment-name:
+        description: 'LangSmith experiment name (set to instance-ai-baseline to refresh the baseline)'
+        required: false
+        default: ''
 
 concurrency:
   group: instance-ai-evals-${{ github.ref }}
@@ -25,4 +33,6 @@ jobs:
     with:
       branch: ${{ inputs.branch }}
       sandbox-provider: ${{ inputs.sandbox-provider }}
+      iterations: ${{ inputs.iterations }}
+      experiment-name: ${{ inputs.experiment-name }}
     secrets: inherit
diff --git a/.github/workflows/test-evals-instance-ai.yml b/.github/workflows/test-evals-instance-ai.yml
index 9356b26013f..a13ada506df 100644
--- a/.github/workflows/test-evals-instance-ai.yml
+++ b/.github/workflows/test-evals-instance-ai.yml
@@ -18,6 +18,16 @@ on:
         required: false
         type: string
         default: 'n8n-sandbox'
+      iterations:
+        description: 'Iterations per test case (use 10 for a baseline)'
+        required: false
+        type: string
+        default: '3'
+      experiment-name:
+        description: 'LangSmith experiment name (instance-ai-baseline refreshes the baseline)'
+        required: false
+        type: string
+        default: ''
   workflow_dispatch:
     inputs:
       branch:
@@ -32,17 +42,23 @@ on:
         description: 'Sandbox provider (n8n-sandbox or daytona)'
         required: false
         default: 'n8n-sandbox'
+      iterations:
+        description: 'Iterations per test case (use 10 for a baseline)'
+        required: false
+        default: '3'
+      experiment-name:
+        description: 'LangSmith experiment name (instance-ai-baseline refreshes the baseline)'
+        required: false
+        default: ''
 
 jobs:
   run-evals:
     name: 'Run Evals'
     runs-on: blacksmith-4vcpu-ubuntu-2204
-    timeout-minutes: 45
+    timeout-minutes: 90
     env:
       # Each port hosts an independent n8n container. The eval CLI's
       # work-stealing allocator dispatches builds across them, capped per-lane.
-      # 11 lanes on 4vcpu — builds are LLM-bound so CPU headroom is sufficient;
-      # bump back to 8vcpu if contention shows up.
       LANE_PORTS: '5678,5679,5680,5681,5682,5683,5684,5685,5686,5687,5688'
     permissions:
       contents: read
@@ -207,6 +223,8 @@ jobs:
           LANGSMITH_REVISION_ID: ${{ github.sha }}
           LANGSMITH_BRANCH: ${{ github.event.pull_request.head.ref || github.head_ref || github.ref_name }}
           FILTER: ${{ inputs.filter }}
+          ITERATIONS: ${{ inputs.iterations }}
+          EXPERIMENT_NAME: ${{ inputs.experiment-name }}
         run: |
           IFS=',' read -ra PORTS <<< "$LANE_PORTS"
           URLS=()
@@ -214,20 +232,10 @@ jobs:
             URLS+=("http://localhost:$port")
           done
           BASE_URLS=$(IFS=,; printf '%s' "${URLS[*]}")
-          if [ -n "$FILTER" ]; then
-            pnpm eval:instance-ai \
-              --base-url "$BASE_URLS" \
-              --concurrency 32 \
-              --verbose \
-              --iterations 3 \
-              --filter "$FILTER"
-          else
-            pnpm eval:instance-ai \
-              --base-url "$BASE_URLS" \
-              --concurrency 32 \
-              --verbose \
-              --iterations 3
-          fi
+          ARGS=(--base-url "$BASE_URLS" --concurrency 32 --verbose --iterations "${ITERATIONS:-3}")
+          [ -n "$FILTER" ] && ARGS+=(--filter "$FILTER")
+          [ -n "$EXPERIMENT_NAME" ] && ARGS+=(--experiment-name "$EXPERIMENT_NAME")
+          pnpm eval:instance-ai "${ARGS[@]}"
 
       # Captures sandbox/builder diagnostic signals that surface during the
       # eval (after migrations finish). Two layers of secret-leak defense:
diff --git a/packages/@n8n/instance-ai/evaluations/cli/index.ts b/packages/@n8n/instance-ai/evaluations/cli/index.ts
index a4cd90820fd..06b6c874423 100644
--- a/packages/@n8n/instance-ai/evaluations/cli/index.ts
+++ b/packages/@n8n/instance-ai/evaluations/cli/index.ts
@@ -478,33 +478,55 @@ async function runWithLangSmith(config: RunConfig): Promise<{
 
 		const execStart = Date.now();
 		const nodeCount = build.workflowJsons[0]?.nodes.length ?? 0;
+		const maxExecAttempts = 5;
 		let result;
-		try {
-			result = await builtOnLane.tracedExecute({
-				workflowId: build.workflowId,
-				scenario,
-				workflowJsons: build.workflowJsons,
-			});
-		} catch (error: unknown) {
-			// Mirror direct mode's per-scenario guard — without this, n8n API errors
-			// or verifier timeouts from executeWithLlmMock / verifyChecklist would
-			// escape to LangSmith, come back as a Run with null outputs, and be
-			// misclassified as builder regressions by the feedback extractor.
-			const errorMessage = error instanceof Error ? error.message : String(error);
-			logger.error(`    ERROR [${scenario.name}]: ${errorMessage}`);
-			return {
-				buildSuccess: true,
-				workflowId: build.workflowId,
-				passed: false,
-				score: 0,
-				reasoning: `Scenario execution error: ${errorMessage}`,
-				failureCategory: 'framework_issue',
-				execErrors: [errorMessage],
-				buildDurationMs,
-				execDurationMs: Date.now() - execStart,
-				nodeCount,
-				workflowChecks: build.workflowChecks,
-			};
+		for (let attempt = 1; ; attempt++) {
+			try {
+				result = await builtOnLane.tracedExecute({
+					workflowId: build.workflowId,
+					scenario,
+					workflowJsons: build.workflowJsons,
+				});
+				break;
+			} catch (error: unknown) {
+				const baseError = error instanceof Error ? error : new Error(String(error));
+				const cause = baseError.cause;
+				const causeText =
+					cause instanceof Error ? cause.message : typeof cause === 'string' ? cause : undefined;
+				const errorMessage =
+					causeText && causeText !== baseError.message
+						? `${baseError.message}: ${causeText}`
+						: baseError.message;
+				const isTransient =
+					/fetch failed|ECONNRESET|ECONNREFUSED|ETIMEDOUT|EAI_AGAIN|socket hang up/i.test(
+						errorMessage,
+					);
+				if (isTransient && attempt < maxExecAttempts) {
+					logger.warn(
+						`    [${scenario.name}] execution attempt ${attempt}/${maxExecAttempts} failed (${errorMessage}); retrying`,
+					);
+					await new Promise((resolve) => setTimeout(resolve, 500 * attempt));
+					continue;
+				}
+				// Non-transient error, or retries exhausted. Mirror direct mode's per-scenario
+				// guard: without this, n8n API errors or verifier timeouts from executeWithLlmMock /
+				// verifyChecklist would escape to LangSmith, come back as a Run with null outputs,
+				// and be misclassified as builder regressions by the feedback extractor.
+				logger.error(`    ERROR [${scenario.name}]: ${errorMessage}`);
+				return {
+					buildSuccess: true,
+					workflowId: build.workflowId,
+					passed: false,
+					score: 0,
+					reasoning: `Scenario execution error: ${errorMessage}`,
+					failureCategory: 'framework_issue',
+					execErrors: [errorMessage],
+					buildDurationMs,
+					execDurationMs: Date.now() - execStart,
+					nodeCount,
+					workflowChecks: build.workflowChecks,
+				};
+			}
 		}
 		const execDurationMs = Date.now() - execStart;