mirror of
https://github.com/n8n-io/n8n.git
synced 2026-06-05 02:59:27 +02:00
ci(ai-builder): Add iterations + experiment-name inputs to instance-ai eval dispatch (no-changelog) (#31631)
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b858fbe91f
commit
f25e12de87
10
.github/workflows/ci-instance-ai-evals.yml
vendored
10
.github/workflows/ci-instance-ai-evals.yml
vendored
|
|
@ -12,6 +12,14 @@ on:
|
|||
description: 'Sandbox provider (n8n-sandbox or daytona)'
|
||||
required: false
|
||||
default: 'n8n-sandbox'
|
||||
iterations:
|
||||
description: 'Iterations per test case (use 10 for a baseline)'
|
||||
required: false
|
||||
default: '3'
|
||||
experiment-name:
|
||||
description: 'LangSmith experiment name (set to instance-ai-baseline to refresh the baseline)'
|
||||
required: false
|
||||
default: ''
|
||||
|
||||
concurrency:
|
||||
group: instance-ai-evals-${{ github.ref }}
|
||||
|
|
@ -25,4 +33,6 @@ jobs:
|
|||
with:
|
||||
branch: ${{ inputs.branch }}
|
||||
sandbox-provider: ${{ inputs.sandbox-provider }}
|
||||
iterations: ${{ inputs.iterations }}
|
||||
experiment-name: ${{ inputs.experiment-name }}
|
||||
secrets: inherit
|
||||
|
|
|
|||
42
.github/workflows/test-evals-instance-ai.yml
vendored
42
.github/workflows/test-evals-instance-ai.yml
vendored
|
|
@ -18,6 +18,16 @@ on:
|
|||
required: false
|
||||
type: string
|
||||
default: 'n8n-sandbox'
|
||||
iterations:
|
||||
description: 'Iterations per test case'
|
||||
required: false
|
||||
type: string
|
||||
default: '3'
|
||||
experiment-name:
|
||||
description: 'LangSmith experiment name (instance-ai-baseline refreshes the baseline)'
|
||||
required: false
|
||||
type: string
|
||||
default: ''
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
branch:
|
||||
|
|
@ -32,17 +42,23 @@ on:
|
|||
description: 'Sandbox provider (n8n-sandbox or daytona)'
|
||||
required: false
|
||||
default: 'n8n-sandbox'
|
||||
iterations:
|
||||
description: 'Iterations per test case (use 10 for a baseline)'
|
||||
required: false
|
||||
default: '3'
|
||||
experiment-name:
|
||||
description: 'LangSmith experiment name (instance-ai-baseline refreshes the baseline)'
|
||||
required: false
|
||||
default: ''
|
||||
|
||||
jobs:
|
||||
run-evals:
|
||||
name: 'Run Evals'
|
||||
runs-on: blacksmith-4vcpu-ubuntu-2204
|
||||
timeout-minutes: 45
|
||||
timeout-minutes: 90
|
||||
env:
|
||||
# Each port hosts an independent n8n container. The eval CLI's
|
||||
# work-stealing allocator dispatches builds across them, capped per-lane.
|
||||
# 11 lanes on 4vcpu — builds are LLM-bound so CPU headroom is sufficient;
|
||||
# bump back to 8vcpu if contention shows up.
|
||||
LANE_PORTS: '5678,5679,5680,5681,5682,5683,5684,5685,5686,5687,5688'
|
||||
permissions:
|
||||
contents: read
|
||||
|
|
@ -207,6 +223,8 @@ jobs:
|
|||
LANGSMITH_REVISION_ID: ${{ github.sha }}
|
||||
LANGSMITH_BRANCH: ${{ github.event.pull_request.head.ref || github.head_ref || github.ref_name }}
|
||||
FILTER: ${{ inputs.filter }}
|
||||
ITERATIONS: ${{ inputs.iterations }}
|
||||
EXPERIMENT_NAME: ${{ inputs.experiment-name }}
|
||||
run: |
|
||||
IFS=',' read -ra PORTS <<< "$LANE_PORTS"
|
||||
URLS=()
|
||||
|
|
@ -214,20 +232,10 @@ jobs:
|
|||
URLS+=("http://localhost:$port")
|
||||
done
|
||||
BASE_URLS=$(IFS=,; printf '%s' "${URLS[*]}")
|
||||
if [ -n "$FILTER" ]; then
|
||||
pnpm eval:instance-ai \
|
||||
--base-url "$BASE_URLS" \
|
||||
--concurrency 32 \
|
||||
--verbose \
|
||||
--iterations 3 \
|
||||
--filter "$FILTER"
|
||||
else
|
||||
pnpm eval:instance-ai \
|
||||
--base-url "$BASE_URLS" \
|
||||
--concurrency 32 \
|
||||
--verbose \
|
||||
--iterations 3
|
||||
fi
|
||||
ARGS=(--base-url "$BASE_URLS" --concurrency 32 --verbose --iterations "${ITERATIONS:-3}")
|
||||
[ -n "$FILTER" ] && ARGS+=(--filter "$FILTER")
|
||||
[ -n "$EXPERIMENT_NAME" ] && ARGS+=(--experiment-name "$EXPERIMENT_NAME")
|
||||
pnpm eval:instance-ai "${ARGS[@]}"
|
||||
|
||||
# Captures sandbox/builder diagnostic signals that surface during the
|
||||
# eval (after migrations finish). Two layers of secret-leak defense:
|
||||
|
|
|
|||
|
|
@ -478,33 +478,55 @@ async function runWithLangSmith(config: RunConfig): Promise<{
|
|||
|
||||
const execStart = Date.now();
|
||||
const nodeCount = build.workflowJsons[0]?.nodes.length ?? 0;
|
||||
const maxExecAttempts = 5;
|
||||
let result;
|
||||
try {
|
||||
result = await builtOnLane.tracedExecute({
|
||||
workflowId: build.workflowId,
|
||||
scenario,
|
||||
workflowJsons: build.workflowJsons,
|
||||
});
|
||||
} catch (error: unknown) {
|
||||
// Mirror direct mode's per-scenario guard — without this, n8n API errors
|
||||
// or verifier timeouts from executeWithLlmMock / verifyChecklist would
|
||||
// escape to LangSmith, come back as a Run with null outputs, and be
|
||||
// misclassified as builder regressions by the feedback extractor.
|
||||
const errorMessage = error instanceof Error ? error.message : String(error);
|
||||
logger.error(` ERROR [${scenario.name}]: ${errorMessage}`);
|
||||
return {
|
||||
buildSuccess: true,
|
||||
workflowId: build.workflowId,
|
||||
passed: false,
|
||||
score: 0,
|
||||
reasoning: `Scenario execution error: ${errorMessage}`,
|
||||
failureCategory: 'framework_issue',
|
||||
execErrors: [errorMessage],
|
||||
buildDurationMs,
|
||||
execDurationMs: Date.now() - execStart,
|
||||
nodeCount,
|
||||
workflowChecks: build.workflowChecks,
|
||||
};
|
||||
for (let attempt = 1; ; attempt++) {
|
||||
try {
|
||||
result = await builtOnLane.tracedExecute({
|
||||
workflowId: build.workflowId,
|
||||
scenario,
|
||||
workflowJsons: build.workflowJsons,
|
||||
});
|
||||
break;
|
||||
} catch (error: unknown) {
|
||||
const baseError = error instanceof Error ? error : new Error(String(error));
|
||||
const cause = baseError.cause;
|
||||
const causeText =
|
||||
cause instanceof Error ? cause.message : typeof cause === 'string' ? cause : undefined;
|
||||
const errorMessage =
|
||||
causeText && causeText !== baseError.message
|
||||
? `${baseError.message}: ${causeText}`
|
||||
: baseError.message;
|
||||
const isTransient =
|
||||
/fetch failed|ECONNRESET|ECONNREFUSED|ETIMEDOUT|EAI_AGAIN|socket hang up/i.test(
|
||||
errorMessage,
|
||||
);
|
||||
if (isTransient && attempt < maxExecAttempts) {
|
||||
logger.warn(
|
||||
` [${scenario.name}] execution attempt ${attempt}/${maxExecAttempts} failed (${errorMessage}); retrying`,
|
||||
);
|
||||
await new Promise((resolve) => setTimeout(resolve, 500 * attempt));
|
||||
continue;
|
||||
}
|
||||
// Mirror direct mode's per-scenario guard — without this, n8n API errors
|
||||
// or verifier timeouts from executeWithLlmMock / verifyChecklist would
|
||||
// escape to LangSmith, come back as a Run with null outputs, and be
|
||||
// misclassified as builder regressions by the feedback extractor.
|
||||
logger.error(` ERROR [${scenario.name}]: ${errorMessage}`);
|
||||
return {
|
||||
buildSuccess: true,
|
||||
workflowId: build.workflowId,
|
||||
passed: false,
|
||||
score: 0,
|
||||
reasoning: `Scenario execution error: ${errorMessage}`,
|
||||
failureCategory: 'framework_issue',
|
||||
execErrors: [errorMessage],
|
||||
buildDurationMs,
|
||||
execDurationMs: Date.now() - execStart,
|
||||
nodeCount,
|
||||
workflowChecks: build.workflowChecks,
|
||||
};
|
||||
}
|
||||
}
|
||||
const execDurationMs = Date.now() - execStart;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user