From f25e12de878d445ee48008d1e4951349abc773fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Braulio=20Gonz=C3=A1lez=20Valido?= Date: Thu, 4 Jun 2026 10:57:08 +0100 Subject: [PATCH] ci(ai-builder): Add iterations + experiment-name inputs to instance-ai eval dispatch (no-changelog) (#31631) Co-authored-by: Claude Opus 4.8 (1M context) --- .github/workflows/ci-instance-ai-evals.yml | 10 +++ .github/workflows/test-evals-instance-ai.yml | 42 ++++++----- .../@n8n/instance-ai/evaluations/cli/index.ts | 74 ++++++++++++------- 3 files changed, 83 insertions(+), 43 deletions(-) diff --git a/.github/workflows/ci-instance-ai-evals.yml b/.github/workflows/ci-instance-ai-evals.yml index d0df898f915..89f6c367ada 100644 --- a/.github/workflows/ci-instance-ai-evals.yml +++ b/.github/workflows/ci-instance-ai-evals.yml @@ -12,6 +12,14 @@ on: description: 'Sandbox provider (n8n-sandbox or daytona)' required: false default: 'n8n-sandbox' + iterations: + description: 'Iterations per test case (use 10 for a baseline)' + required: false + default: '3' + experiment-name: + description: 'LangSmith experiment name (set to instance-ai-baseline to refresh the baseline)' + required: false + default: '' concurrency: group: instance-ai-evals-${{ github.ref }} @@ -25,4 +33,6 @@ jobs: with: branch: ${{ inputs.branch }} sandbox-provider: ${{ inputs.sandbox-provider }} + iterations: ${{ inputs.iterations }} + experiment-name: ${{ inputs.experiment-name }} secrets: inherit diff --git a/.github/workflows/test-evals-instance-ai.yml b/.github/workflows/test-evals-instance-ai.yml index 9356b26013f..a13ada506df 100644 --- a/.github/workflows/test-evals-instance-ai.yml +++ b/.github/workflows/test-evals-instance-ai.yml @@ -18,6 +18,16 @@ on: required: false type: string default: 'n8n-sandbox' + iterations: + description: 'Iterations per test case' + required: false + type: string + default: '3' + experiment-name: + description: 'LangSmith experiment name (instance-ai-baseline refreshes the baseline)' + required: false + type: string + default: '' workflow_dispatch: inputs: branch: @@ -32,17 +42,23 @@ on: description: 'Sandbox provider (n8n-sandbox or daytona)' required: false default: 'n8n-sandbox' + iterations: + description: 'Iterations per test case (use 10 for a baseline)' + required: false + default: '3' + experiment-name: + description: 'LangSmith experiment name (instance-ai-baseline refreshes the baseline)' + required: false + default: '' jobs: run-evals: name: 'Run Evals' runs-on: blacksmith-4vcpu-ubuntu-2204 - timeout-minutes: 45 + timeout-minutes: 90 env: # Each port hosts an independent n8n container. The eval CLI's # work-stealing allocator dispatches builds across them, capped per-lane. - # 11 lanes on 4vcpu — builds are LLM-bound so CPU headroom is sufficient; - # bump back to 8vcpu if contention shows up. LANE_PORTS: '5678,5679,5680,5681,5682,5683,5684,5685,5686,5687,5688' permissions: contents: read @@ -207,6 +223,8 @@ jobs: LANGSMITH_REVISION_ID: ${{ github.sha }} LANGSMITH_BRANCH: ${{ github.event.pull_request.head.ref || github.head_ref || github.ref_name }} FILTER: ${{ inputs.filter }} + ITERATIONS: ${{ inputs.iterations }} + EXPERIMENT_NAME: ${{ inputs.experiment-name }} run: | IFS=',' read -ra PORTS <<< "$LANE_PORTS" URLS=() @@ -214,20 +232,10 @@ jobs: URLS+=("http://localhost:$port") done BASE_URLS=$(IFS=,; printf '%s' "${URLS[*]}") - if [ -n "$FILTER" ]; then - pnpm eval:instance-ai \ - --base-url "$BASE_URLS" \ - --concurrency 32 \ - --verbose \ - --iterations 3 \ - --filter "$FILTER" - else - pnpm eval:instance-ai \ - --base-url "$BASE_URLS" \ - --concurrency 32 \ - --verbose \ - --iterations 3 - fi + ARGS=(--base-url "$BASE_URLS" --concurrency 32 --verbose --iterations "${ITERATIONS:-3}") + [ -n "$FILTER" ] && ARGS+=(--filter "$FILTER") + [ -n "$EXPERIMENT_NAME" ] && ARGS+=(--experiment-name "$EXPERIMENT_NAME") + pnpm eval:instance-ai "${ARGS[@]}" # Captures sandbox/builder diagnostic signals that surface during the # eval (after migrations finish). Two layers of secret-leak defense: diff --git a/packages/@n8n/instance-ai/evaluations/cli/index.ts b/packages/@n8n/instance-ai/evaluations/cli/index.ts index a4cd90820fd..06b6c874423 100644 --- a/packages/@n8n/instance-ai/evaluations/cli/index.ts +++ b/packages/@n8n/instance-ai/evaluations/cli/index.ts @@ -478,33 +478,55 @@ async function runWithLangSmith(config: RunConfig): Promise<{ const execStart = Date.now(); const nodeCount = build.workflowJsons[0]?.nodes.length ?? 0; + const maxExecAttempts = 5; let result; - try { - result = await builtOnLane.tracedExecute({ - workflowId: build.workflowId, - scenario, - workflowJsons: build.workflowJsons, - }); - } catch (error: unknown) { - // Mirror direct mode's per-scenario guard — without this, n8n API errors - // or verifier timeouts from executeWithLlmMock / verifyChecklist would - // escape to LangSmith, come back as a Run with null outputs, and be - // misclassified as builder regressions by the feedback extractor. - const errorMessage = error instanceof Error ? error.message : String(error); - logger.error(` ERROR [${scenario.name}]: ${errorMessage}`); - return { - buildSuccess: true, - workflowId: build.workflowId, - passed: false, - score: 0, - reasoning: `Scenario execution error: ${errorMessage}`, - failureCategory: 'framework_issue', - execErrors: [errorMessage], - buildDurationMs, - execDurationMs: Date.now() - execStart, - nodeCount, - workflowChecks: build.workflowChecks, - }; + for (let attempt = 1; ; attempt++) { + try { + result = await builtOnLane.tracedExecute({ + workflowId: build.workflowId, + scenario, + workflowJsons: build.workflowJsons, + }); + break; + } catch (error: unknown) { + const baseError = error instanceof Error ? error : new Error(String(error)); + const cause = baseError.cause; + const causeText = + cause instanceof Error ? cause.message : typeof cause === 'string' ? cause : undefined; + const errorMessage = + causeText && causeText !== baseError.message + ? `${baseError.message}: ${causeText}` + : baseError.message; + const isTransient = + /fetch failed|ECONNRESET|ECONNREFUSED|ETIMEDOUT|EAI_AGAIN|socket hang up/i.test( + errorMessage, + ); + if (isTransient && attempt < maxExecAttempts) { + logger.warn( + ` [${scenario.name}] execution attempt ${attempt}/${maxExecAttempts} failed (${errorMessage}); retrying`, + ); + await new Promise((resolve) => setTimeout(resolve, 500 * attempt)); + continue; + } + // Mirror direct mode's per-scenario guard — without this, n8n API errors + // or verifier timeouts from executeWithLlmMock / verifyChecklist would + // escape to LangSmith, come back as a Run with null outputs, and be + // misclassified as builder regressions by the feedback extractor. + logger.error(` ERROR [${scenario.name}]: ${errorMessage}`); + return { + buildSuccess: true, + workflowId: build.workflowId, + passed: false, + score: 0, + reasoning: `Scenario execution error: ${errorMessage}`, + failureCategory: 'framework_issue', + execErrors: [errorMessage], + buildDurationMs, + execDurationMs: Date.now() - execStart, + nodeCount, + workflowChecks: build.workflowChecks, + }; + } } const execDurationMs = Date.now() - execStart;