ci(ai-builder): Add iterations + experiment-name inputs to instance-ai eval dispatch (no-changelog) (#31631)

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
José Braulio González Valido 2026-06-04 10:57:08 +01:00 committed by GitHub
parent b858fbe91f
commit f25e12de87
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 83 additions and 43 deletions

View File

@ -12,6 +12,14 @@ on:
description: 'Sandbox provider (n8n-sandbox or daytona)'
required: false
default: 'n8n-sandbox'
iterations:
description: 'Iterations per test case (use 10 for a baseline)'
required: false
default: '3'
experiment-name:
description: 'LangSmith experiment name (set to instance-ai-baseline to refresh the baseline)'
required: false
default: ''
concurrency:
group: instance-ai-evals-${{ github.ref }}
@ -25,4 +33,6 @@ jobs:
with:
branch: ${{ inputs.branch }}
sandbox-provider: ${{ inputs.sandbox-provider }}
iterations: ${{ inputs.iterations }}
experiment-name: ${{ inputs.experiment-name }}
secrets: inherit

View File

@ -18,6 +18,16 @@ on:
required: false
type: string
default: 'n8n-sandbox'
iterations:
description: 'Iterations per test case'
required: false
type: string
default: '3'
experiment-name:
description: 'LangSmith experiment name (instance-ai-baseline refreshes the baseline)'
required: false
type: string
default: ''
workflow_dispatch:
inputs:
branch:
@ -32,17 +42,23 @@ on:
description: 'Sandbox provider (n8n-sandbox or daytona)'
required: false
default: 'n8n-sandbox'
iterations:
description: 'Iterations per test case (use 10 for a baseline)'
required: false
default: '3'
experiment-name:
description: 'LangSmith experiment name (instance-ai-baseline refreshes the baseline)'
required: false
default: ''
jobs:
run-evals:
name: 'Run Evals'
runs-on: blacksmith-4vcpu-ubuntu-2204
timeout-minutes: 45
timeout-minutes: 90
env:
# Each port hosts an independent n8n container. The eval CLI's
# work-stealing allocator dispatches builds across them, capped per-lane.
# 11 lanes on 4vcpu — builds are LLM-bound so CPU headroom is sufficient;
# bump back to 8vcpu if contention shows up.
LANE_PORTS: '5678,5679,5680,5681,5682,5683,5684,5685,5686,5687,5688'
permissions:
contents: read
@ -207,6 +223,8 @@ jobs:
LANGSMITH_REVISION_ID: ${{ github.sha }}
LANGSMITH_BRANCH: ${{ github.event.pull_request.head.ref || github.head_ref || github.ref_name }}
FILTER: ${{ inputs.filter }}
ITERATIONS: ${{ inputs.iterations }}
EXPERIMENT_NAME: ${{ inputs.experiment-name }}
run: |
IFS=',' read -ra PORTS <<< "$LANE_PORTS"
URLS=()
@ -214,20 +232,10 @@ jobs:
URLS+=("http://localhost:$port")
done
BASE_URLS=$(IFS=,; printf '%s' "${URLS[*]}")
if [ -n "$FILTER" ]; then
pnpm eval:instance-ai \
--base-url "$BASE_URLS" \
--concurrency 32 \
--verbose \
--iterations 3 \
--filter "$FILTER"
else
pnpm eval:instance-ai \
--base-url "$BASE_URLS" \
--concurrency 32 \
--verbose \
--iterations 3
fi
ARGS=(--base-url "$BASE_URLS" --concurrency 32 --verbose --iterations "${ITERATIONS:-3}")
[ -n "$FILTER" ] && ARGS+=(--filter "$FILTER")
[ -n "$EXPERIMENT_NAME" ] && ARGS+=(--experiment-name "$EXPERIMENT_NAME")
pnpm eval:instance-ai "${ARGS[@]}"
# Captures sandbox/builder diagnostic signals that surface during the
# eval (after migrations finish). Two layers of secret-leak defense:

View File

@ -478,33 +478,55 @@ async function runWithLangSmith(config: RunConfig): Promise<{
const execStart = Date.now();
const nodeCount = build.workflowJsons[0]?.nodes.length ?? 0;
const maxExecAttempts = 5;
let result;
try {
result = await builtOnLane.tracedExecute({
workflowId: build.workflowId,
scenario,
workflowJsons: build.workflowJsons,
});
} catch (error: unknown) {
// Mirror direct mode's per-scenario guard — without this, n8n API errors
// or verifier timeouts from executeWithLlmMock / verifyChecklist would
// escape to LangSmith, come back as a Run with null outputs, and be
// misclassified as builder regressions by the feedback extractor.
const errorMessage = error instanceof Error ? error.message : String(error);
logger.error(` ERROR [${scenario.name}]: ${errorMessage}`);
return {
buildSuccess: true,
workflowId: build.workflowId,
passed: false,
score: 0,
reasoning: `Scenario execution error: ${errorMessage}`,
failureCategory: 'framework_issue',
execErrors: [errorMessage],
buildDurationMs,
execDurationMs: Date.now() - execStart,
nodeCount,
workflowChecks: build.workflowChecks,
};
// Execute the built workflow for this scenario, retrying errors that look like
// transient network failures. The loop has no condition of its own: it ends via
// `break` on success, `continue` after a retryable failure, or `return` with a
// failed scenario result once the error is non-transient or attempts run out.
for (let attempt = 1; ; attempt++) {
try {
result = await builtOnLane.tracedExecute({
workflowId: build.workflowId,
scenario,
workflowJsons: build.workflowJsons,
});
break;
} catch (error: unknown) {
// Normalize whatever was thrown into an Error so `.message` / `.cause` are safe to read.
const baseError = error instanceof Error ? error : new Error(String(error));
const cause = baseError.cause;
// Only an Error or string cause contributes text; any other cause shape is ignored.
const causeText =
cause instanceof Error ? cause.message : typeof cause === 'string' ? cause : undefined;
// Fold the cause into the message (unless it merely repeats it) so the
// transient-error check below, the logs, and the returned result all see it.
const errorMessage =
causeText && causeText !== baseError.message
? `${baseError.message}: ${causeText}`
: baseError.message;
// Classification is a case-insensitive text match against the combined message.
const isTransient =
/fetch failed|ECONNRESET|ECONNREFUSED|ETIMEDOUT|EAI_AGAIN|socket hang up/i.test(
errorMessage,
);
if (isTransient && attempt < maxExecAttempts) {
logger.warn(
` [${scenario.name}] execution attempt ${attempt}/${maxExecAttempts} failed (${errorMessage}); retrying`,
);
// Linear backoff: wait 500 ms multiplied by the attempt number before retrying.
await new Promise((resolve) => setTimeout(resolve, 500 * attempt));
continue;
}
// Reached for non-transient errors, or when the final attempt also failed.
// Mirror direct mode's per-scenario guard — without this, n8n API errors
// or verifier timeouts from executeWithLlmMock / verifyChecklist would
// escape to LangSmith, come back as a Run with null outputs, and be
// misclassified as builder regressions by the feedback extractor.
logger.error(` ERROR [${scenario.name}]: ${errorMessage}`);
// Report a failed scenario (score 0) categorized as 'framework_issue',
// carrying the combined error message and the timings measured so far.
return {
buildSuccess: true,
workflowId: build.workflowId,
passed: false,
score: 0,
reasoning: `Scenario execution error: ${errorMessage}`,
failureCategory: 'framework_issue',
execErrors: [errorMessage],
buildDurationMs,
execDurationMs: Date.now() - execStart,
nodeCount,
workflowChecks: build.workflowChecks,
};
}
}
const execDurationMs = Date.now() - execStart;