ci(ai-builder): Add iterations + experiment-name inputs to instance-ai eval dispatch (no-changelog) (#31631)

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-05 02:59:27 +02:00 · 2026-06-04 10:57:08 +01:00 · 2026-06-04 10:57:08 +01:00 · f25e12de87
commit f25e12de87
parent b858fbe91f
3 changed files with 83 additions and 43 deletions
--- a/.github/workflows/ci-instance-ai-evals.yml
+++ b/.github/workflows/ci-instance-ai-evals.yml
@@ -12,6 +12,14 @@ on:
        description: 'Sandbox provider (n8n-sandbox or daytona)'
        required: false
        default: 'n8n-sandbox'
+      iterations:
+        description: 'Iterations per test case (use 10 for a baseline)'
+        required: false
+        default: '3'
+      experiment-name:
+        description: 'LangSmith experiment name (set to instance-ai-baseline to refresh the baseline)'
+        required: false
+        default: ''

 concurrency:
  group: instance-ai-evals-${{ github.ref }}
@@ -25,4 +33,6 @@ jobs:
    with:
      branch: ${{ inputs.branch }}
      sandbox-provider: ${{ inputs.sandbox-provider }}
+      iterations: ${{ inputs.iterations }}
+      experiment-name: ${{ inputs.experiment-name }}
    secrets: inherit
--- a/.github/workflows/test-evals-instance-ai.yml
+++ b/.github/workflows/test-evals-instance-ai.yml
@@ -18,6 +18,16 @@ on:
        required: false
        type: string
        default: 'n8n-sandbox'
+      iterations:
+        description: 'Iterations per test case'
+        required: false
+        type: string
+        default: '3'
+      experiment-name:
+        description: 'LangSmith experiment name (instance-ai-baseline refreshes the baseline)'
+        required: false
+        type: string
+        default: ''
  workflow_dispatch:
    inputs:
      branch:
@@ -32,17 +42,23 @@ on:
        description: 'Sandbox provider (n8n-sandbox or daytona)'
        required: false
        default: 'n8n-sandbox'
+      iterations:
+        description: 'Iterations per test case (use 10 for a baseline)'
+        required: false
+        default: '3'
+      experiment-name:
+        description: 'LangSmith experiment name (instance-ai-baseline refreshes the baseline)'
+        required: false
+        default: ''

 jobs:
  run-evals:
    name: 'Run Evals'
    runs-on: blacksmith-4vcpu-ubuntu-2204
-    timeout-minutes: 45
+    timeout-minutes: 90
    env:
      # Each port hosts an independent n8n container. The eval CLI's
      # work-stealing allocator dispatches builds across them, capped per-lane.
-      # 11 lanes on 4vcpu — builds are LLM-bound so CPU headroom is sufficient;
-      # bump back to 8vcpu if contention shows up.
      LANE_PORTS: '5678,5679,5680,5681,5682,5683,5684,5685,5686,5687,5688'
    permissions:
      contents: read
@@ -207,6 +223,8 @@ jobs:
          LANGSMITH_REVISION_ID: ${{ github.sha }}
          LANGSMITH_BRANCH: ${{ github.event.pull_request.head.ref || github.head_ref || github.ref_name }}
          FILTER: ${{ inputs.filter }}
+          ITERATIONS: ${{ inputs.iterations }}
+          EXPERIMENT_NAME: ${{ inputs.experiment-name }}
        run: |
          IFS=',' read -ra PORTS <<< "$LANE_PORTS"
          URLS=()
@@ -214,20 +232,10 @@ jobs:
            URLS+=("http://localhost:$port")
          done
          BASE_URLS=$(IFS=,; printf '%s' "${URLS[*]}")
-          if [ -n "$FILTER" ]; then
-            pnpm eval:instance-ai \
-              --base-url "$BASE_URLS" \
-              --concurrency 32 \
-              --verbose \
-              --iterations 3 \
-              --filter "$FILTER"
-          else
-            pnpm eval:instance-ai \
-              --base-url "$BASE_URLS" \
-              --concurrency 32 \
-              --verbose \
-              --iterations 3
-          fi
+          ARGS=(--base-url "$BASE_URLS" --concurrency 32 --verbose --iterations "${ITERATIONS:-3}")
+          [ -n "$FILTER" ] && ARGS+=(--filter "$FILTER")
+          [ -n "$EXPERIMENT_NAME" ] && ARGS+=(--experiment-name "$EXPERIMENT_NAME")
+          pnpm eval:instance-ai "${ARGS[@]}"

      # Captures sandbox/builder diagnostic signals that surface during the
      # eval (after migrations finish). Two layers of secret-leak defense:
--- a/packages/@n8n/instance-ai/evaluations/cli/index.ts
+++ b/packages/@n8n/instance-ai/evaluations/cli/index.ts
@@ -478,33 +478,55 @@ async function runWithLangSmith(config: RunConfig): Promise<{

 		const execStart = Date.now();
 		const nodeCount = build.workflowJsons[0]?.nodes.length ?? 0;
+		const maxExecAttempts = 5;
 		let result;
-		try {
-			result = await builtOnLane.tracedExecute({
-				workflowId: build.workflowId,
-				scenario,
-				workflowJsons: build.workflowJsons,
-			});
-		} catch (error: unknown) {
-			// Mirror direct mode's per-scenario guard — without this, n8n API errors
-			// or verifier timeouts from executeWithLlmMock / verifyChecklist would
-			// escape to LangSmith, come back as a Run with null outputs, and be
-			// misclassified as builder regressions by the feedback extractor.
-			const errorMessage = error instanceof Error ? error.message : String(error);
-			logger.error(`    ERROR [${scenario.name}]: ${errorMessage}`);
-			return {
-				buildSuccess: true,
-				workflowId: build.workflowId,
-				passed: false,
-				score: 0,
-				reasoning: `Scenario execution error: ${errorMessage}`,
-				failureCategory: 'framework_issue',
-				execErrors: [errorMessage],
-				buildDurationMs,
-				execDurationMs: Date.now() - execStart,
-				nodeCount,
-				workflowChecks: build.workflowChecks,
-			};
+		for (let attempt = 1; ; attempt++) {
+			try {
+				result = await builtOnLane.tracedExecute({
+					workflowId: build.workflowId,
+					scenario,
+					workflowJsons: build.workflowJsons,
+				});
+				break;
+			} catch (error: unknown) {
+				const baseError = error instanceof Error ? error : new Error(String(error));
+				const cause = baseError.cause;
+				const causeText =
+					cause instanceof Error ? cause.message : typeof cause === 'string' ? cause : undefined;
+				const errorMessage =
+					causeText && causeText !== baseError.message
+						? `${baseError.message}: ${causeText}`
+						: baseError.message;
+				const isTransient =
+					/fetch failed|ECONNRESET|ECONNREFUSED|ETIMEDOUT|EAI_AGAIN|socket hang up/i.test(
+						errorMessage,
+					);
+				if (isTransient && attempt < maxExecAttempts) {
+					logger.warn(
+						`    [${scenario.name}] execution attempt ${attempt}/${maxExecAttempts} failed (${errorMessage}); retrying`,
+					);
+					await new Promise((resolve) => setTimeout(resolve, 500 * attempt));
+					continue;
+				}
+				// Mirror direct mode's per-scenario guard — without this, n8n API errors
+				// or verifier timeouts from executeWithLlmMock / verifyChecklist would
+				// escape to LangSmith, come back as a Run with null outputs, and be
+				// misclassified as builder regressions by the feedback extractor.
+				logger.error(`    ERROR [${scenario.name}]: ${errorMessage}`);
+				return {
+					buildSuccess: true,
+					workflowId: build.workflowId,
+					passed: false,
+					score: 0,
+					reasoning: `Scenario execution error: ${errorMessage}`,
+					failureCategory: 'framework_issue',
+					execErrors: [errorMessage],
+					buildDurationMs,
+					execDurationMs: Date.now() - execStart,
+					nodeCount,
+					workflowChecks: build.workflowChecks,
+				};
+			}
 		}
 		const execDurationMs = Date.now() - execStart;