fix(core): Preserve execution output fidelity in eval verifier artifact (no-changelog) (#30989)

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-04 02:37:46 +02:00 · 2026-06-02 09:55:01 +01:00 · 2026-06-02 09:55:01 +01:00 · a1369e7736
commit a1369e7736
parent c33a772cc0
9 changed files with 709 additions and 138 deletions
--- a/packages/@n8n/api-types/src/schemas/instance-ai.schema.ts
+++ b/packages/@n8n/api-types/src/schemas/instance-ai.schema.ts
@ -1076,9 +1076,16 @@ export interface InstanceAiEvalInterceptedRequest {
 }

 export interface InstanceAiEvalNodeResult {
-	output: unknown;
-	/** Full count of output items (`output` is truncated for artifact size) */
-	outputCount?: number;
+	/** Outputs by connection type → per-branch items. Empty when pinned, errored, or didn't run. */
+	outputs: Record<string, unknown[][]>;
+	/** Total items across all branches (full untruncated count). */
+	outputCount: number;
+	/** True when any branch in `outputs` was truncated for size. */
+	truncated?: boolean;
+	/** Number of times this node ran (>1 inside loops). `outputs` captures the LAST iteration. */
+	iterationCount: number;
+	/** 0-based index of the first iteration that errored, if any. */
+	firstErrorIteration?: number;
 	interceptedRequests: InstanceAiEvalInterceptedRequest[];
 	executionMode: InstanceAiEvalNodeExecutionMode;
 	/** Missing required parameters detected before execution (empty = fully configured) */
--- a/packages/@n8n/instance-ai/evaluations/tests/verification-artifact.test.ts
+++ b/packages/@n8n/instance-ai/evaluations/tests/verification-artifact.test.ts
@ -0,0 +1,317 @@
+import type { InstanceAiEvalExecutionResult, InstanceAiEvalNodeResult } from '@n8n/api-types';
+
+import type { WorkflowResponse } from '../clients/n8n-client';
+import { buildVerificationArtifact } from '../harness/runner';
+import type { ExecutionScenario } from '../types';
+
+function makeNodeResult(
+	overrides: Partial<InstanceAiEvalNodeResult> = {},
+): InstanceAiEvalNodeResult {
+	return {
+		outputs: {},
+		outputCount: 0,
+		iterationCount: 0,
+		interceptedRequests: [],
+		executionMode: 'real',
+		...overrides,
+	};
+}
+
+const scenario: ExecutionScenario = {
+	name: 'happy-path',
+	description: 'baseline',
+	dataSetup: 'three posts, two match the filter',
+	successCriteria: 'two posts forwarded downstream',
+};
+
+function makeEvalResult(
+	nodeResults: Record<string, InstanceAiEvalNodeResult>,
+): InstanceAiEvalExecutionResult {
+	return {
+		executionId: 'exec-1',
+		success: true,
+		nodeResults,
+		errors: [],
+		hints: {
+			globalContext: '',
+			triggerContent: { foo: 1 },
+			nodeHints: {},
+			warnings: [],
+			bypassPinData: {},
+		},
+		mockedCredentials: [],
+	};
+}
+
+describe('buildVerificationArtifact', () => {
+	it('splits artifact into a workflow block (cacheable) and a scenario block (fresh)', () => {
+		const wf: WorkflowResponse = {
+			id: 'w1',
+			name: 'pipeline',
+			active: false,
+			versionId: 'v1',
+			nodes: [
+				{
+					id: 'a',
+					name: 'Trigger',
+					type: 'n8n-nodes-base.scheduleTrigger',
+					typeVersion: 1,
+					position: [0, 0],
+					parameters: {},
+				},
+			],
+			connections: { Trigger: { main: [[]] } },
+		};
+		const artifact = buildVerificationArtifact(scenario, makeEvalResult({}), [wf]);
+		expect(artifact.workflowContext).toContain('Workflow structure');
+		expect(artifact.workflowContext).toContain('Trigger');
+		expect(artifact.workflowContext).toContain('Connections');
+		expect(artifact.scenarioContext).toContain('happy-path');
+		expect(artifact.scenarioContext).toContain('Execution trace');
+		expect(artifact.scenarioContext).not.toContain('Workflow structure');
+	});
+
+	it('labels Filter branches with downstream node names so verifier can tell where items went', () => {
+		const wf: WorkflowResponse = {
+			id: 'w1',
+			name: 'pipeline',
+			active: false,
+			versionId: 'v1',
+			nodes: [
+				{
+					id: 'a',
+					name: 'Filter',
+					type: 'n8n-nodes-base.filter',
+					typeVersion: 1,
+					position: [0, 0],
+					parameters: {},
+				},
+				{
+					id: 'b',
+					name: 'Aggregate Posts',
+					type: 'n8n-nodes-base.aggregate',
+					typeVersion: 1,
+					position: [0, 0],
+					parameters: {},
+				},
+			],
+			connections: {
+				Filter: {
+					main: [[{ node: 'Aggregate Posts', type: 'main', index: 0 }], []],
+				},
+			},
+		};
+		const evalResult = makeEvalResult({
+			Filter: makeNodeResult({
+				outputs: {
+					main: [[{ json: { id: 1, kept: true } }], [{ json: { id: 2, dropped: true } }]],
+				},
+				outputCount: 2,
+				iterationCount: 1,
+			}),
+		});
+
+		const artifact = buildVerificationArtifact(scenario, evalResult, [wf]);
+
+		expect(artifact.scenarioContext).toContain('Output [main branch 0] → Aggregate Posts');
+		expect(artifact.scenarioContext).toContain(
+			'Output [main branch 1] → (no downstream connection)',
+		);
+		expect(artifact.scenarioContext).toContain('"id": 1');
+		expect(artifact.scenarioContext).toContain('"id": 2');
+	});
+
+	it('renders Switch branches per route with their downstream targets', () => {
+		const wf: WorkflowResponse = {
+			id: 'w1',
+			name: 'router',
+			active: false,
+			versionId: 'v1',
+			nodes: [
+				{
+					id: 'a',
+					name: 'Switch',
+					type: 'n8n-nodes-base.switch',
+					typeVersion: 1,
+					position: [0, 0],
+					parameters: {},
+				},
+				{
+					id: 'b',
+					name: 'Slack',
+					type: 'n8n-nodes-base.slack',
+					typeVersion: 1,
+					position: [0, 0],
+					parameters: {},
+				},
+				{
+					id: 'c',
+					name: 'Email',
+					type: 'n8n-nodes-base.emailSend',
+					typeVersion: 1,
+					position: [0, 0],
+					parameters: {},
+				},
+			],
+			connections: {
+				Switch: {
+					main: [
+						[{ node: 'Slack', type: 'main', index: 0 }],
+						[{ node: 'Email', type: 'main', index: 0 }],
+					],
+				},
+			},
+		};
+		const evalResult = makeEvalResult({
+			Switch: makeNodeResult({
+				outputs: { main: [[{ json: { route: 'a' } }], [{ json: { route: 'b' } }]] },
+				outputCount: 2,
+				iterationCount: 1,
+			}),
+		});
+
+		const artifact = buildVerificationArtifact(scenario, evalResult, [wf]);
+
+		expect(artifact.scenarioContext).toContain('Output [main branch 0] → Slack');
+		expect(artifact.scenarioContext).toContain('Output [main branch 1] → Email');
+	});
+
+	it('renders AI sub-node outputs under their non-main connection type', () => {
+		const wf: WorkflowResponse = {
+			id: 'w1',
+			name: 'agent',
+			active: false,
+			versionId: 'v1',
+			nodes: [
+				{
+					id: 'a',
+					name: 'OpenAI Chat Model',
+					type: '@n8n/n8n-nodes-langchain.lmChatOpenAi',
+					typeVersion: 1,
+					position: [0, 0],
+					parameters: {},
+				},
+				{
+					id: 'b',
+					name: 'AI Agent',
+					type: '@n8n/n8n-nodes-langchain.agent',
+					typeVersion: 1,
+					position: [0, 0],
+					parameters: {},
+				},
+			],
+			connections: {
+				'OpenAI Chat Model': {
+					ai_languageModel: [[{ node: 'AI Agent', type: 'ai_languageModel', index: 0 }]],
+				},
+			},
+		};
+		const evalResult = makeEvalResult({
+			'OpenAI Chat Model': makeNodeResult({
+				outputs: { ai_languageModel: [[{ json: { reply: 'hi' } }]] },
+				outputCount: 1,
+				iterationCount: 1,
+			}),
+		});
+
+		const artifact = buildVerificationArtifact(scenario, evalResult, [wf]);
+
+		expect(artifact.scenarioContext).toContain('Output [ai_languageModel branch 0] → AI Agent');
+		expect(artifact.scenarioContext).toContain('"reply": "hi"');
+	});
+
+	it('flags truncation and reports the full count', () => {
+		const wf: WorkflowResponse = {
+			id: 'w1',
+			name: 'big-output',
+			active: false,
+			versionId: 'v1',
+			nodes: [
+				{
+					id: 'a',
+					name: 'HTTP',
+					type: 'n8n-nodes-base.httpRequest',
+					typeVersion: 1,
+					position: [0, 0],
+					parameters: {},
+				},
+			],
+			connections: { HTTP: { main: [[]] } },
+		};
+		const evalResult = makeEvalResult({
+			HTTP: makeNodeResult({
+				outputs: { main: [Array.from({ length: 10 }, (_, i) => ({ json: { i } }))] },
+				outputCount: 42,
+				truncated: true,
+				iterationCount: 1,
+			}),
+		});
+
+		const artifact = buildVerificationArtifact(scenario, evalResult, [wf]);
+
+		expect(artifact.scenarioContext).toContain('full count across all branches: 42');
+	});
+
+	it('keeps pinned trigger nodes out of "Did not run" (they have synthetic input, no runData)', () => {
+		const wf: WorkflowResponse = {
+			id: 'w1',
+			name: 'scheduled-pipeline',
+			active: false,
+			versionId: 'v1',
+			nodes: [
+				{
+					id: 'a',
+					name: 'Schedule Trigger',
+					type: 'n8n-nodes-base.scheduleTrigger',
+					typeVersion: 1,
+					position: [0, 0],
+					parameters: {},
+				},
+			],
+			connections: { 'Schedule Trigger': { main: [[]] } },
+		};
+		const evalResult = makeEvalResult({
+			'Schedule Trigger': makeNodeResult({ executionMode: 'pinned' }),
+		});
+
+		const artifact = buildVerificationArtifact(scenario, evalResult, [wf]);
+
+		expect(artifact.scenarioContext).toContain(
+			'**Pinned nodes** (synthetic input): Schedule Trigger',
+		);
+		expect(artifact.scenarioContext).toContain('**Did not run** (no execution data): none');
+	});
+
+	it('tags loop iterations and first-error iteration in the trace header', () => {
+		const wf: WorkflowResponse = {
+			id: 'w1',
+			name: 'loop',
+			active: false,
+			versionId: 'v1',
+			nodes: [
+				{
+					id: 'a',
+					name: 'Loop',
+					type: 'n8n-nodes-base.splitInBatches',
+					typeVersion: 1,
+					position: [0, 0],
+					parameters: {},
+				},
+			],
+			connections: {},
+		};
+		const evalResult = makeEvalResult({
+			Loop: makeNodeResult({
+				outputs: { main: [[{ json: { i: 4 } }]] },
+				outputCount: 1,
+				iterationCount: 5,
+				firstErrorIteration: 3,
+			}),
+		});
+
+		const artifact = buildVerificationArtifact(scenario, evalResult, [wf]);
+
+		expect(artifact.scenarioContext).toContain('ran 5×');
+		expect(artifact.scenarioContext).toContain('first error at iter 3');
+	});
+});
--- a/packages/@n8n/instance-ai/evaluations/checklist/verifier.ts
+++ b/packages/@n8n/instance-ai/evaluations/checklist/verifier.ts
@ -1,7 +1,8 @@
+import type { Message } from '@n8n/agents';
 import { z } from 'zod';

-import { createEvalAgent } from '../../src/utils/eval-agents';
-import type { WorkflowResponse } from '../clients/n8n-client';
+import { EPHEMERAL_CACHE, createEvalAgent } from '../../src/utils/eval-agents';
+import type { VerificationArtifact } from '../harness/runner';
 import { MOCK_EXECUTION_VERIFY_PROMPT } from '../system-prompts/mock-execution-verify';
 import type { ChecklistItem, ChecklistResult } from '../types';

@ -30,21 +31,29 @@ const VERIFY_ATTEMPT_TIMEOUT_MS = 120_000;

 export async function verifyChecklist(
 	checklist: ChecklistItem[],
-	verificationArtifact: string,
-	_workflowJsons: WorkflowResponse[],
+	artifact: VerificationArtifact,
 ): Promise<ChecklistResult[]> {
 	const llmItems = checklist.filter((i) => i.strategy === 'llm');
 	if (llmItems.length === 0) return [];

-	const userMessage = `## Checklist
-
-${JSON.stringify(llmItems, null, 2)}
-
-## Verification Artifact
-
-${verificationArtifact}
-
-Verify each checklist item against the artifact above.`;
+	// Multi-block user message: the workflow context is stable across scenarios of
+	// the same build, so we mark it as a cache breakpoint for Anthropic prompt caching.
+	const messages: Message[] = [
+		{
+			role: 'user',
+			content: [
+				{
+					type: 'text',
+					text: artifact.workflowContext,
+					providerOptions: EPHEMERAL_CACHE,
+				},
+				{
+					type: 'text',
+					text: `## Checklist\n\n${JSON.stringify(llmItems, null, 2)}\n\n${artifact.scenarioContext}\n\nVerify each checklist item against the workflow + scenario artifact above.`,
+				},
+			],
+		},
+	];

 	const validIds = new Set(llmItems.map((i) => i.id));

@ -62,7 +71,7 @@ Verify each checklist item against the artifact above.`;
 		);
 		let result;
 		try {
-			result = await agent.generate(userMessage, { abortSignal: abortController.signal });
+			result = await agent.generate(messages, { abortSignal: abortController.signal });
 		} catch (error: unknown) {
 			const msg = error instanceof Error ? error.message : String(error);
 			console.warn(`[verifier] attempt ${attempt}/${MAX_VERIFY_ATTEMPTS} failed: ${msg}`);
--- a/packages/@n8n/instance-ai/evaluations/harness/runner.ts
+++ b/packages/@n8n/instance-ai/evaluations/harness/runner.ts
@ -560,7 +560,7 @@ async function runScenario(
 	);

 	const verifyStart = Date.now();
-	const verificationArtifact = buildVerificationArtifact(scenario, evalResult, workflowJsons);
+	const artifact = buildVerificationArtifact(scenario, evalResult, workflowJsons);

 	const scenarioChecklist: ChecklistItem[] = [
 		{
@ -571,11 +571,7 @@ async function runScenario(
 		},
 	];

-	const verificationResults = await verifyChecklist(
-		scenarioChecklist,
-		verificationArtifact,
-		workflowJsons,
-	);
+	const verificationResults = await verifyChecklist(scenarioChecklist, artifact);

 	const verifyMs = Date.now() - verifyStart;
 	const passed = verificationResults.length > 0 && verificationResults[0].pass;
@ -607,19 +603,123 @@ async function runScenario(
 // Verification artifact builder
 // ---------------------------------------------------------------------------

-/**
- * Build a rich verification artifact from the execution result.
- * Includes execution trace with mock responses, config issues,
- * and pre-analysis flags so the verifier can diagnose root causes.
- */
-function buildVerificationArtifact(
+export interface VerificationArtifact {
+	/** Workflow structure + connections + node configs. Stable across scenarios of the same build (cacheable). */
+	workflowContext: string;
+	/** Scenario + execution trace + errors. Fresh per scenario. */
+	scenarioContext: string;
+}
+
+/** Render the per-build workflow structure: nodes, connections, all configs. */
+function buildWorkflowContextBlock(wf: WorkflowResponse | undefined): string {
+	if (!wf) return '## Workflow structure\n\n(no workflow built)';
+	const lines: string[] = ['## Workflow structure', ''];
+	for (const node of wf.nodes) {
+		lines.push(`- **${node.name ?? '(unnamed)'}** (${node.type})`);
+	}
+	lines.push('');
+	lines.push('**All node configs:**');
+	lines.push(
+		'```json',
+		JSON.stringify(
+			wf.nodes.map((node) => ({
+				name: node.name ?? '(unnamed)',
+				type: node.type,
+				typeVersion: node.typeVersion,
+				...(node.disabled !== undefined ? { disabled: node.disabled } : {}),
+				parameters: node.parameters ?? {},
+			})),
+			null,
+			2,
+		),
+		'```',
+		'',
+	);
+	lines.push('**Connections:**');
+	lines.push('```json', JSON.stringify(wf.connections, null, 2), '```');
+	return lines.join('\n');
+}
+
+function isObjectRecord(v: unknown): v is Record<string, unknown> {
+	return typeof v === 'object' && v !== null && !Array.isArray(v);
+}
+
+/** For a given node + connection type, return downstream node names per output port. */
+function getDownstreamsByBranch(
+	nodeName: string,
+	connectionType: string,
+	connections: Record<string, unknown> | undefined,
+): string[][] {
+	if (!connections) return [];
+	const nodeConns = connections[nodeName];
+	if (!isObjectRecord(nodeConns)) return [];
+	const typeConns = nodeConns[connectionType];
+	if (!Array.isArray(typeConns)) return [];
+	return typeConns.map((branch) => {
+		if (!Array.isArray(branch)) return [];
+		const targets: string[] = [];
+		for (const c of branch) {
+			if (isObjectRecord(c) && typeof c.node === 'string') targets.push(c.node);
+		}
+		return targets;
+	});
+}
+
+/** Render per-node outputs grouped by connection type + branch, with downstream labels. */
+function renderNodeOutputs(
+	nodeName: string,
+	outputs: Record<string, unknown[][]>,
+	outputCount: number,
+	truncated: boolean | undefined,
+	connections: Record<string, unknown> | undefined,
+): string[] {
+	const lines: string[] = [];
+	const connTypes = Object.keys(outputs);
+	// "Output: none" only when no branches exist on any port — distinct from "branches exist but all empty".
+	// An `outputs.main = [[]]` (one connected branch, zero items) falls through and renders as `Output [main]: 0 items`.
+	if (connTypes.length === 0 || connTypes.every((k) => outputs[k].length === 0)) {
+		lines.push('**Output:** none');
+		return lines;
+	}
+	for (const connType of connTypes) {
+		const branches = outputs[connType];
+		if (branches.length === 0) continue;
+		const downstreams = getDownstreamsByBranch(nodeName, connType, connections);
+		const isMultiBranch = branches.length > 1 || connType !== 'main';
+		if (!isMultiBranch) {
+			lines.push(`**Output [${connType}]:** ${String(branches[0].length)} items`);
+			lines.push('```json', JSON.stringify(branches[0], null, 2), '```');
+			continue;
+		}
+		for (let i = 0; i < branches.length; i++) {
+			const branch = branches[i];
+			const targets = downstreams[i] ?? [];
+			const targetLabel =
+				targets.length > 0 ? `→ ${targets.join(', ')}` : '→ (no downstream connection)';
+			lines.push(
+				`**Output [${connType} branch ${String(i)}] ${targetLabel}:** ${String(branch.length)} items`,
+			);
+			if (branch.length > 0) {
+				lines.push('```json', JSON.stringify(branch, null, 2), '```');
+			}
+		}
+	}
+	if (truncated) {
+		lines.push(
+			`_(items truncated for size; full count across all branches: ${String(outputCount)})_`,
+		);
+	}
+	return lines;
+}
+
+/** Render the per-scenario context: scenario, pre-analysis, execution summary, errors, per-node trace. */
+function buildScenarioContextBlock(
 	scenario: ExecutionScenario,
 	evalResult: InstanceAiEvalExecutionResult,
-	workflowJsons: WorkflowResponse[],
+	wf: WorkflowResponse | undefined,
 ): string {
 	const sections: string[] = [];

-	// --- Scenario context ---
 	sections.push(
 		'## Scenario',
 		'',
@ -628,10 +728,8 @@ function buildVerificationArtifact(
 		'',
 	);

-	// --- Pre-analysis: flag known issues programmatically ---
+	// Pre-analysis: programmatic flags
 	const preAnalysis: string[] = [];
-
-	// Flag Phase 1 failures — these cause empty trigger data and cascade failures
 	if (evalResult.hints.warnings.length > 0) {
 		for (const warning of evalResult.hints.warnings) {
 			preAnalysis.push(`⚠ FRAMEWORK ISSUE: ${warning}`);
@ -642,7 +740,6 @@ function buildVerificationArtifact(
 			'⚠ FRAMEWORK ISSUE: Trigger content is empty — the start node received no input data. All downstream failures are likely caused by this, not by the workflow builder.',
 		);
 	}
-
 	for (const [nodeName, nr] of Object.entries(evalResult.nodeResults)) {
 		if (nr.configIssues && Object.keys(nr.configIssues).length > 0) {
 			preAnalysis.push(
@ -663,29 +760,35 @@ function buildVerificationArtifact(
 			}
 		}
 	}
-
 	if (preAnalysis.length > 0) {
 		sections.push('## Pre-analysis (automated flags)', '', ...preAnalysis, '');
 	}

-	// --- Execution summary ---
+	// Execution summary
 	const mockedNodes: string[] = [];
 	const pinnedNodes: string[] = [];
 	const realNodes: string[] = [];
-
+	const ranNodes = new Set<string>();
 	for (const [nodeName, nr] of Object.entries(evalResult.nodeResults)) {
 		if (nr.executionMode === 'mocked') mockedNodes.push(nodeName);
 		else if (nr.executionMode === 'pinned') pinnedNodes.push(nodeName);
 		else realNodes.push(nodeName);
+		// Pinned nodes (trigger / bypass) get their data from pin data and never appear in runData,
+		// so `iterationCount` stays 0 — treat every non-real node (pinned or mocked) as "ran" anyway to keep it out of `didNotRun`.
+		if (nr.iterationCount > 0 || nr.executionMode !== 'real') ranNodes.add(nodeName);
 	}
-
+	const didNotRun: string[] =
+		wf?.nodes
+			.map((n) => n.name)
+			.filter((name): name is string => typeof name === 'string' && !ranNodes.has(name)) ?? [];
 	sections.push(
 		'## Execution summary',
 		'',
 		`**Status:** ${evalResult.success ? 'success' : 'failed'}`,
-		`**Mocked nodes** (HTTP intercepted, responses generated by LLM): ${mockedNodes.join(', ') || 'none'}`,
-		`**Pinned nodes** (trigger data provided, not executed): ${pinnedNodes.join(', ') || 'none'}`,
-		`**Real nodes** (executed with actual logic on mock/pinned data): ${realNodes.join(', ') || 'none'}`,
+		`**Mocked nodes** (HTTP intercepted): ${mockedNodes.join(', ') || 'none'}`,
+		`**Pinned nodes** (synthetic input): ${pinnedNodes.join(', ') || 'none'}`,
+		`**Real nodes** (executed with actual logic): ${realNodes.join(', ') || 'none'}`,
+		`**Did not run** (no execution data): ${didNotRun.join(', ') || 'none'}`,
 		'',
 	);

@ -693,74 +796,23 @@ function buildVerificationArtifact(
 		sections.push('## Errors', '', ...evalResult.errors.map((e) => `- ${e}`), '');
 	}

-	// --- Build a node config lookup from workflow JSON ---
-	const nodeConfigs = new Map<string, Record<string, unknown>>();
-	const wf = workflowJsons[0];
-	if (wf) {
-		for (const node of wf.nodes) {
-			if (node.name && node.parameters) {
-				nodeConfigs.set(node.name, { type: node.type, parameters: node.parameters });
-			}
-		}
-	}
-
-	// --- Workflow structure: ALL nodes and connections ---
-	const executedNodeNames = new Set(Object.keys(evalResult.nodeResults));
-	if (wf) {
-		sections.push('## Workflow structure (all nodes)', '');
-		for (const node of wf.nodes) {
-			const ran = node.name ? executedNodeNames.has(node.name) : false;
-			const status = ran ? 'EXECUTED' : 'DID NOT RUN';
-			sections.push(`- **${node.name ?? '(unnamed)'}** (${node.type}) — ${status}`);
-		}
-		sections.push('');
-		sections.push(
-			'**All node configs** (from saved workflow JSON, including nodes that did not run):',
-		);
-		sections.push(
-			'```json',
-			JSON.stringify(
-				wf.nodes.map((node) => ({
-					name: node.name ?? '(unnamed)',
-					type: node.type,
-					typeVersion: node.typeVersion,
-					...(node.disabled !== undefined ? { disabled: node.disabled } : {}),
-					parameters: node.parameters ?? {},
-				})),
-				null,
-				2,
-			),
-			'```',
-		);
-		sections.push('');
-		sections.push('**Connections:**');
-		sections.push('```json', JSON.stringify(wf.connections, null, 2), '```');
-		sections.push('');
-	}
-
-	// --- Execution trace: per-node detail (sorted by execution order) ---
+	// Per-node execution trace, sorted by start time
 	sections.push('## Execution trace', '');
-
 	const sortedNodeResults = Object.entries(evalResult.nodeResults).sort(
 		([, a], [, b]) => (a.startTime ?? 0) - (b.startTime ?? 0),
 	);
-
 	for (const [nodeName, nr] of sortedNodeResults) {
-		sections.push(`### ${nodeName} [${nr.executionMode}]`);
+		const iterTag = nr.iterationCount > 1 ? ` · ran ${String(nr.iterationCount)}×` : '';
+		const errTag =
+			nr.firstErrorIteration !== undefined
+				? ` · first error at iter ${String(nr.firstErrorIteration)}`
+				: '';
+		sections.push(`### ${nodeName} [${nr.executionMode}${iterTag}${errTag}]`);

-		// Node configuration (from workflow JSON)
-		const nodeConfig = nodeConfigs.get(nodeName);
-		if (nodeConfig) {
-			sections.push('**Node config:**');
-			sections.push('```json', JSON.stringify(nodeConfig, null, 2), '```');
-		}
-
-		// Config issues
 		if (nr.configIssues && Object.keys(nr.configIssues).length > 0) {
 			sections.push(`**Config issues:** ${Object.values(nr.configIssues).flat().join('; ')}`);
 		}

-		// Intercepted requests + mock responses (for mocked nodes)
 		for (const req of nr.interceptedRequests) {
 			sections.push(`**Request:** ${req.method} ${req.url}`);
 			if (req.requestBody) {
@ -772,13 +824,9 @@ function buildVerificationArtifact(
 			}
 		}

-		// Node output
-		if (nr.output !== null && nr.output !== undefined) {
-			sections.push('**Output:**');
-			sections.push('```json', JSON.stringify(nr.output, null, 2), '```');
-		} else {
-			sections.push('**Output:** none');
-		}
+		sections.push(
+			...renderNodeOutputs(nodeName, nr.outputs, nr.outputCount, nr.truncated, wf?.connections),
+		);

 		sections.push('');
 	}
@ -786,6 +834,19 @@ function buildVerificationArtifact(
 	return sections.join('\n');
 }

+/** Build a verification artifact split into a cacheable workflow block + a fresh scenario block. */
+export function buildVerificationArtifact(
+	scenario: ExecutionScenario,
+	evalResult: InstanceAiEvalExecutionResult,
+	workflowJsons: WorkflowResponse[],
+): VerificationArtifact {
+	const wf = workflowJsons[0];
+	return {
+		workflowContext: buildWorkflowContextBlock(wf),
+		scenarioContext: buildScenarioContextBlock(scenario, evalResult, wf),
+	};
+}
+
 // ---------------------------------------------------------------------------
 // Concurrency control
 // ---------------------------------------------------------------------------
--- a/packages/@n8n/instance-ai/evaluations/report/workflow-report.ts
+++ b/packages/@n8n/instance-ai/evaluations/report/workflow-report.ts
@ -178,10 +178,23 @@ function renderScenarioDetail(sr: ExecutionScenarioResult): string {
 				html += '</div>';
 			}

-			// Node output
-			if (nr.output !== null && nr.output !== undefined) {
+			const outputEntries = Object.entries(nr.outputs);
+			const hasOutput = outputEntries.some(([, branches]) => branches.length > 0);
+			if (hasOutput) {
 				html += '<details class="node-output-toggle"><summary>Node output</summary>';
-				html += `<pre class="json-block"><code>${escapeHtml(JSON.stringify(nr.output, null, 2))}</code></pre>`;
+				for (const [connType, branches] of outputEntries) {
+					for (let i = 0; i < branches.length; i++) {
+						const label =
+							branches.length > 1 || connType !== 'main'
+								? `${connType} branch ${String(i)} (${String(branches[i].length)} items)`
+								: `${connType} (${String(branches[i].length)} items)`;
+						html += `<div class="node-output-branch"><strong>${escapeHtml(label)}</strong>`;
+						html += `<pre class="json-block"><code>${escapeHtml(JSON.stringify(branches[i], null, 2))}</code></pre></div>`;
+					}
+				}
+				if (nr.truncated) {
+					html += `<div class="muted">truncated; full count: ${String(nr.outputCount)}</div>`;
+				}
 				html += '</details>';
 			} else {
 				html += '<div class="muted">no output</div>';
--- a/packages/@n8n/instance-ai/evaluations/system-prompts/mock-execution-verify.ts
+++ b/packages/@n8n/instance-ai/evaluations/system-prompts/mock-execution-verify.ts
@ -15,13 +15,30 @@ Credential ID values in the workflow JSON (real, placeholder strings, or stale r

 ## What you receive

-The verification artifact contains:
+The artifact is split into two blocks:
+1. **Workflow structure** (stable across scenarios for the same build): all nodes, their saved configs, and the connections JSON.
+2. **Scenario context** (fresh per scenario): pre-analysis flags, execution summary, errors, and per-node execution trace.
+
+The full layout:
 - **Pre-analysis**: Automated flags for known issues (builder config problems, mock generation failures)
- **Execution summary**: Which nodes were mocked, pinned, or real
+- **Execution summary**: Which nodes were mocked, pinned, real, or did not run
 - **Errors**: Any runtime errors from the execution
- **Workflow structure**: ALL nodes that were built, whether they executed or not, the saved config for every node, plus the full connections JSON showing how nodes are wired. Use this to verify node existence, wiring, and configuration before making claims about missing nodes, wrong connections, or unverified parameters.
- **Execution trace**: Per-node detail including HTTP requests sent, mock responses returned, and node output. Only includes nodes that actually ran. **IMPORTANT: The trace is NOT in chronological order.** Do not infer execution sequence from the order nodes appear in the trace. Use the connections JSON in the workflow structure to determine execution flow.
- **Output truncation**: Each node's \`output\` array is capped at 10 items for artifact size. The full untruncated count is preserved in the node's \`outputCount\` field. **Do not treat a smaller \`output\` array as a bug.** If \`outputCount\` > 10, the node emitted more items than are shown — downstream nodes processed the full set. Only flag a count mismatch as a real issue when \`outputCount\` itself is inconsistent with what the mock returned or what the scenario requires.
+- **Workflow structure**: ALL nodes that were built, the saved config for every node, plus the full connections JSON. Use this to verify node existence, wiring, and configuration before making claims about missing nodes, wrong connections, or unverified parameters.
+- **Execution trace**: Per-node detail with HTTP requests sent, mock responses, and node outputs. **NOT in chronological order** — use the connections JSON to determine flow. Per-node header tags include \`ran N×\` when a node ran more than once (loop iterations; the outputs shown are from the last iteration) and \`first error at iter K\`, where K is the 0-based index of the first iteration that errored.
+
+### Reading per-node outputs
+
+Each node's outputs are grouped by **connection type** (\`main\`, \`ai_languageModel\`, \`ai_memory\`, \`ai_tool\`, …) and then by **output port (branch)**:
+- Most nodes have a single \`main\` port: \`Output [main]\`.
+- **Filter / IF**: two \`main\` branches — \`Output [main branch 0]\` (matched / true) and \`Output [main branch 1]\` (unmatched / false). Items go to one branch OR the other, never both.
+- **Switch**: one branch per route — \`Output [main branch 0]\`, \`Output [main branch 1]\`, etc.
+- **AI sub-nodes**: emit via non-main connections such as \`ai_languageModel\`.
+
+Each branch is labelled with the downstream node it connects to (e.g. \`→ Aggregate Posts\`) or \`(no downstream connection)\` when the branch isn't wired up. **Only items in connected branches reach downstream nodes** — items in unconnected branches are correctly excluded from the flow, not a bug.
+
+### Output truncation
+
+Each branch's items are capped at 10 for artifact size, so the \`N items\` count in a branch header reflects the items shown, not necessarily everything the node emitted. When any branch was sliced, the node's trace ends with the note \`items truncated for size; full count across all branches: N\`, where N is the full untruncated total (the node's \`outputCount\`). **Do not treat a smaller items array as a bug.** Downstream nodes processed the full set; only flag a count mismatch if that full count itself contradicts the scenario.

 ## How to evaluate

@ -42,11 +59,12 @@ The verification artifact contains:
 6. **Workflows can branch.** Not every node runs in every execution. A crashed or misconfigured node prevents all downstream branches from running. When diagnosing, identify the single root cause (the first node that crashed) rather than listing each unexecuted downstream node as a separate issue.
 7. Check the **success criteria** against the execution trace and node outputs
 8. For scenarios with no errors and no output beyond the trigger: this usually means the workflow handled empty data gracefully (no crash = success for empty-input scenarios)
+9. **0 items flowing into a downstream node = that node doesn't run.** This is n8n's default branching behavior, not a defect. When a Filter / IF / Switch routes 0 items to a branch, its downstream nodes simply don't execute — no crash, no side effects. **Do not require an explicit guard (IF count > 0, early-exit branch) unless the success criteria explicitly demands intentional handling.** Verify against what the criteria actually say, not against an implicit "must use a guard" requirement.

 ## Failure categories

 When a checklist item fails, categorize the root cause:
- **builder_issue**: The AI agent that built the workflow misconfigured a node (missing parameters, wrong settings, incomplete config, wrong routing logic, missing nodes). Evidence: configIssues flags, nodes crashing before making HTTP requests, Switch/IF nodes missing required options, workflow structure doesn't match what the prompt asked for. Also applies when a filter/code node receives correct input data but produces wrong output — this means the node logic is wrong, not the mock data. **Also applies when the builder produced an empty or trivial workflow (0 nodes, or only a trigger and no action nodes) — even if the build phase appears to have completed.** A "No trigger or start node found" execution error caused by zero nodes in the saved workflow is a builder failure, not a framework failure: the builder is responsible for committing at least a trigger.
+- **builder_issue**: The AI agent that built the workflow misconfigured a node (missing parameters, wrong settings, incomplete config, wrong routing logic, missing nodes). Evidence: configIssues flags, nodes crashing before making HTTP requests, Switch/IF nodes missing required options, workflow structure doesn't match what the prompt asked for. Also applies when a Code node receives correct input data but its connected downstream branch produces wrong output — that's wrong node logic. **For Filter / IF / Switch: an item appearing in the unmatched branch is NOT wrong output — it's correctly routed there by the predicate. Only flag a builder_issue when items that should have matched the predicate end up in the wrong branch.** **Also applies when the builder produced an empty or trivial workflow (0 nodes, or only a trigger and no action nodes) — even if the build phase appears to have completed.** A "No trigger or start node found" execution error caused by zero nodes in the saved workflow is a builder failure, not a framework failure: the builder is responsible for committing at least a trigger.
 - **mock_issue**: The LLM mock handler returned incorrect or missing data. Evidence: _evalMockError in responses, mock response shape doesn't match what the node expects, mock data missing fields that downstream nodes reference. IMPORTANT: Trace the data flow carefully — if the mock returned correct data but a downstream filter or code node transformed it incorrectly, that is a builder_issue, not a mock_issue.
 - **legitimate_failure**: The workflow genuinely doesn't meet the success criteria and neither the builder nor mock is at fault. The test is working as designed — for example, the workflow lacks error handling that the scenario tests for.
 - **framework_issue**: The evaluation framework itself failed delivering input to an otherwise-built workflow. Evidence: a built workflow with at least a trigger node exists, but Phase 1 returned an error or the trigger output is empty (empty JSON object), causing cascading failures. Pre-analysis flags starting with "FRAMEWORK ISSUE", "Phase 1 error" warnings. DOES NOT apply when the workflow is empty (0 nodes) — that is a builder_issue, see above.
--- a/packages/@n8n/instance-ai/src/utils/eval-agents.ts
+++ b/packages/@n8n/instance-ai/src/utils/eval-agents.ts
@ -32,10 +32,13 @@ function getApiKey(): string {
 // Agent factory
 // ---------------------------------------------------------------------------

+/** Anthropic `providerOptions` payload that marks the content it is attached to as an ephemeral cache breakpoint. */
+export const EPHEMERAL_CACHE = {
+	anthropic: { cacheControl: { type: 'ephemeral' as const } },
+};
+
 const CACHE_PROVIDER_OPTS = {
-	providerOptions: {
-		anthropic: { cacheControl: { type: 'ephemeral' as const } },
-	},
+	providerOptions: EPHEMERAL_CACHE,
 };

 export function createEvalAgent(
--- a/packages/cli/src/modules/instance-ai/eval/tests/execution.service.test.ts
+++ b/packages/cli/src/modules/instance-ai/eval/tests/execution.service.test.ts
@ -874,7 +874,7 @@ describe('EvalExecutionService', () => {
 			expect(result.nodeResults['HTTP Request'].startTime).toBe(1710000000);
 		});

-		it('captures output limited to MAX_OUTPUT_ITEMS_PER_NODE and reports full outputCount', async () => {
+		it('truncates per-branch items to MAX_OUTPUT_ITEMS_PER_BRANCH and reports full outputCount', async () => {
 			const items = Array.from({ length: 15 }, (_, i) => ({ json: { idx: i } }));
 			mockProcessRunExecutionData.mockResolvedValue(
 				makeIRun({
@ -898,8 +898,121 @@ describe('EvalExecutionService', () => {

 			const result = await service.executeWithLlmMock('wf-1', makeUser());

-			expect(result.nodeResults['HTTP Request'].output).toHaveLength(10);
-			expect(result.nodeResults['HTTP Request'].outputCount).toBe(15);
+			const entry = result.nodeResults['HTTP Request'];
+			expect(entry.outputs.main[0]).toHaveLength(10);
+			expect(entry.outputCount).toBe(15);
+			expect(entry.truncated).toBe(true);
+			expect(entry.iterationCount).toBe(1);
+		});
+
+		it('preserves per-branch structure for Filter/IF/Switch nodes', async () => {
+			mockProcessRunExecutionData.mockResolvedValue(
+				makeIRun({
+					data: {
+						resultData: {
+							runData: {
+								Filter: [
+									{
+										startTime: 1000,
+										executionTime: 200,
+										executionIndex: 0,
+										source: [],
+										data: {
+											main: [
+												[{ json: { id: 1, kept: true } }, { json: { id: 2, kept: true } }],
+												[{ json: { id: 3, dropped: true } }],
+											],
+										},
+									},
+								],
+							},
+						},
+					} as unknown as IRunExecutionData,
+				}),
+			);
+
+			const result = await service.executeWithLlmMock('wf-1', makeUser());
+
+			const entry = result.nodeResults['Filter'];
+			expect(entry.outputs.main).toHaveLength(2);
+			expect(entry.outputs.main[0]).toHaveLength(2);
+			expect(entry.outputs.main[1]).toHaveLength(1);
+			expect(entry.outputCount).toBe(3);
+		});
+
+		it('captures non-main connection outputs (AI sub-nodes)', async () => {
+			mockProcessRunExecutionData.mockResolvedValue(
+				makeIRun({
+					data: {
+						resultData: {
+							runData: {
+								'OpenAI Chat Model': [
+									{
+										startTime: 1000,
+										executionTime: 200,
+										executionIndex: 0,
+										source: [],
+										data: {
+											ai_languageModel: [[{ json: { response: 'hi' } }]],
+										},
+									},
+								],
+							},
+						},
+					} as unknown as IRunExecutionData,
+				}),
+			);
+
+			const result = await service.executeWithLlmMock('wf-1', makeUser());
+
+			const entry = result.nodeResults['OpenAI Chat Model'];
+			expect(entry.outputs.ai_languageModel).toBeDefined();
+			expect(entry.outputs.ai_languageModel[0]).toHaveLength(1);
+			expect(entry.outputs.main).toBeUndefined();
+			expect(entry.outputCount).toBe(1);
+		});
+
+		it('records iterationCount and firstErrorIteration for nodes that ran multiple times', async () => {
+			mockProcessRunExecutionData.mockResolvedValue(
+				makeIRun({
+					data: {
+						resultData: {
+							runData: {
+								'Code (in loop)': [
+									{
+										startTime: 1000,
+										executionTime: 50,
+										executionIndex: 0,
+										source: [],
+										data: { main: [[{ json: { iter: 0 } }]] },
+									},
+									{
+										startTime: 1100,
+										executionTime: 50,
+										executionIndex: 1,
+										source: [],
+										data: { main: [[{ json: { iter: 1 } }]] },
+										error: { message: 'boom' } as unknown as Error,
+									},
+									{
+										startTime: 1200,
+										executionTime: 50,
+										executionIndex: 2,
+										source: [],
+										data: { main: [[{ json: { iter: 2 } }]] },
+									},
+								],
+							},
+						},
+					} as unknown as IRunExecutionData,
+				}),
+			);
+
+			const result = await service.executeWithLlmMock('wf-1', makeUser());
+
+			const entry = result.nodeResults['Code (in loop)'];
+			expect(entry.iterationCount).toBe(3);
+			expect(entry.firstErrorIteration).toBe(1);
 		});
 	});

--- a/packages/cli/src/modules/instance-ai/eval/execution.service.ts
+++ b/packages/cli/src/modules/instance-ai/eval/execution.service.ts
@ -62,8 +62,8 @@ import {
 // Constants
 // ---------------------------------------------------------------------------

-/** Max output items per node kept in the artifact. The full count lives in `outputCount`. */
-const MAX_OUTPUT_ITEMS_PER_NODE = 10;
+/** Max output items kept per output branch in the artifact. `outputCount` still reports the untruncated total across all branches. */
+const MAX_OUTPUT_ITEMS_PER_BRANCH = 10;

 // ---------------------------------------------------------------------------
 // Service
@ -410,7 +410,9 @@ export class EvalExecutionService {

 			if (issues?.parameters && Object.keys(issues.parameters).length > 0) {
 				const entry = (nodeResults[node.name] ??= {
-					output: null,
+					outputs: {},
+					outputCount: 0,
+					iterationCount: 0,
 					interceptedRequests: [],
 					executionMode: 'real',
 				});
@ -519,7 +521,9 @@ export class EvalExecutionService {
 		nodeResults: Record<string, InstanceAiEvalNodeResult>,
 	): void {
 		const entry = (nodeResults[turn.rootName] ??= {
-			output: null,
+			outputs: {},
+			outputCount: 0,
+			iterationCount: 0,
 			interceptedRequests: [],
 			executionMode: 'mocked',
 		});
@ -556,7 +560,9 @@ export class EvalExecutionService {
 			// A node may make multiple HTTP requests — ensure it's marked as mocked.
 			// checkNodeConfig may have pre-created the entry as 'real', so always override.
 			const entry = (nodeResults[node.name] ??= {
-				output: null,
+				outputs: {},
+				outputCount: 0,
+				iterationCount: 0,
 				interceptedRequests: [],
 				executionMode: 'mocked',
 			});
@ -592,7 +598,9 @@ export class EvalExecutionService {
 	): void {
 		const existing = nodeResults[nodeName];
 		nodeResults[nodeName] = {
-			output: null,
+			outputs: {},
+			outputCount: 0,
+			iterationCount: 0,
 			interceptedRequests: [],
 			executionMode: 'pinned',
 			...(existing?.configIssues ? { configIssues: existing.configIssues } : {}),
@ -672,24 +680,46 @@ export class EvalExecutionService {
 			// Nodes already in nodeResults were intercepted (mocked) or pinned.
 			// Nodes appearing here for the first time executed for real (logic nodes).
 			const entry = (nodeResults[nodeName] ??= {
-				output: null,
+				outputs: {},
+				outputCount: 0,
+				iterationCount: 0,
 				interceptedRequests: [],
 				executionMode: 'real',
 			});
+			entry.iterationCount = nodeRuns.length;
+			const firstErrorIdx = nodeRuns.findIndex((run) => run?.error !== undefined);
+			if (firstErrorIdx !== -1) {
+				entry.firstErrorIteration = firstErrorIdx;
+			}
+
 			const lastRun = nodeRuns[nodeRuns.length - 1];
 			if (lastRun?.startTime) {
 				entry.startTime = lastRun.startTime;
 			}
-			if (lastRun?.data?.main) {
-				// Capture output from all branches (Switch/IF nodes have multiple outputs)
-				const flattened = lastRun.data.main
-					.flat()
-					.filter((item): item is INodeExecutionData => item !== null);
-				entry.outputCount = flattened.length;
-				const allOutputs = flattened.slice(0, MAX_OUTPUT_ITEMS_PER_NODE);
-				if (allOutputs.length > 0) {
-					entry.output = await this.hydrateBinaryData(allOutputs);
+			if (lastRun?.data) {
+				// Capture the last run's outputs keyed by connection type, then by output port, so
+				// verifiers can tell Filter/IF/Switch branches and AI sub-node outputs apart.
+				let totalCount = 0;
+				let truncated = false;
+				const outputs: Record<string, unknown[][]> = {};
+				for (const [connectionType, branches] of Object.entries(lastRun.data)) {
+					if (!Array.isArray(branches)) continue;
+					outputs[connectionType] = await Promise.all(
+						branches.map(async (branch) => {
+							if (!Array.isArray(branch)) return [];
+							totalCount += branch.length;
+							let kept = branch;
+							if (branch.length > MAX_OUTPUT_ITEMS_PER_BRANCH) {
+								truncated = true;
+								kept = branch.slice(0, MAX_OUTPUT_ITEMS_PER_BRANCH);
+							}
+							return await this.hydrateBinaryData(kept);
+						}),
+					);
 				}
+				entry.outputs = outputs;
+				entry.outputCount = totalCount;
+				if (truncated) entry.truncated = true;
 			}
 			if (lastRun?.error) {
 				errors.push(`Node "${nodeName}": ${lastRun.error.message}`);