feat(core): Stream tool calls and ship M3 fixtures from LLM eval wire server (no-changelog) (#30983)

2026-05-30 16:26:59 +02:00 · 2026-05-27 15:53:43 +02:00 · 2026-05-27 15:53:43 +02:00 · 55d8b59a48
commit 55d8b59a48
parent fabacb64f3
25 changed files with 3346 additions and 669 deletions
--- a/packages/@n8n/ai-workflow-builder.ee/evaluations/support/pin-data-generator.ts
+++ b/packages/@n8n/ai-workflow-builder.ee/evaluations/support/pin-data-generator.ts
@ -10,6 +10,7 @@ import type { BaseChatModel } from '@langchain/core/language_models/chat_models'
 import { HumanMessage, SystemMessage } from '@langchain/core/messages';
 import { existsSync, readFileSync, readdirSync } from 'fs';
 import {
+	findAiRootNodeNames,
 	jsonParse,
 	type IDataObject,
 	type INode,
@ -63,29 +64,6 @@ const NON_SERVICE_NODES_WITH_CREDENTIALS = new Set([
 // Node identification
 // ---------------------------------------------------------------------------

-/**
- * Build a set of node names that are targets of AI-type connections
- * (ai_languageModel, ai_tool, ai_memory, etc.). These are root AI nodes
- * (e.g. Agent, Chain) whose sub-nodes can't be individually pinned.
- * Pinning the root prevents sub-node execution entirely.
- */
-function findAiRootNodeNames(workflow: SimpleWorkflow): Set<string> {
-	const roots = new Set<string>();
-	for (const nodeConns of Object.values(workflow.connections)) {
-		for (const [connType, outputs] of Object.entries(nodeConns)) {
-			if (!connType.startsWith('ai_')) continue;
-			if (!Array.isArray(outputs)) continue;
-			for (const group of outputs) {
-				if (!Array.isArray(group)) continue;
-				for (const conn of group) {
-					if (conn?.node) roots.add(conn.node);
-				}
-			}
-		}
-	}
-	return roots;
-}
-
 /**
 * Identify which nodes in a workflow need pin data.
 * In eval context, we pin all service/API nodes since none have real credentials.
@ -95,7 +73,7 @@ export function identifyPinDataNodes(
 	nodeTypes: INodeTypeDescription[],
 ): INode[] {
 	const nodeTypeMap = new Map(nodeTypes.map((nt) => [nt.name, nt]));
-	const aiRootNodes = findAiRootNodeNames(workflow);
+	const aiRootNodes = findAiRootNodeNames(workflow.connections);

 	return workflow.nodes.filter((node) => {
 		// Skip disabled nodes
--- a/packages/@n8n/api-types/src/schemas/instance-ai.schema.ts
+++ b/packages/@n8n/api-types/src/schemas/instance-ai.schema.ts
@ -1127,10 +1127,8 @@ export const EVAL_VENDOR_SDK_INTERCEPTION_FLAG = '085_eval_vendor_sdk_intercepti

 /**
 * Records a credential field that was rewritten (e.g. routed to the eval wire
- * server) during evaluation. Populated when the caller opts into the unpin
- * path via `InstanceAiEvalExecutionRequest.unpinNodes`. Field added in the
- * foundation PR; the rewrite path itself is wired up in a later PR and stays
- * empty until then.
+ * server) during evaluation. Populated for every AI root the server intercepts;
+ * empty when the kill-switch is off or every root was auto-pinned or explicitly pinned.
 */
 export interface InstanceAiEvalRewrittenCredential {
 	nodeName: string;
@ -1152,29 +1150,20 @@ export interface InstanceAiEvalExecutionResult {
 export class InstanceAiEvalExecutionRequest extends Z.class({
 	scenarioHints: z.string().max(2000).optional(),
 	/**
-	 * AI root node names (Agent, Chain, etc.) whose sub-nodes should run their
-	 * real vendor SDK code instead of being pinned. The eval pipeline rewrites
-	 * matching credentials so vendor traffic lands on the eval wire server.
+	 * AI root nodes (Agent, Chain) that should stay pinned — opt-out from the
+	 * default-on wire-server interception path. Useful when the caller wants
+	 * to keep a specific root on the pinned baseline (e.g. for A/B comparison)
+	 * even though its sub-nodes are interceptable.
 	 *
-	 * The compatibility guard refuses the request up front (no execution
-	 * attempted) when any inbound `ai_*` sub-node of a requested root falls
-	 * into one of these categories:
-	 *   - **Protocol-binary client**: Postgres/Redis/MongoDB memory, native
-	 *     vector stores (PGVector / Mongo / Redis / Milvus). These don't
-	 *     speak HTTP and can't be intercepted by the wire server.
-	 *   - **Unsupported vendor LLM**: any `@n8n/n8n-nodes-langchain.lm*` node
-	 *     not yet on the supported list (currently `lmChatOpenAi` only).
-	 *     These would call the real provider with real credentials because
-	 *     there's no eval URL-rewrite mapping for them.
-	 *   - **Unsafe `options.baseURL` override**: a supported vendor LLM
-	 *     configured with a non-empty `options.baseURL` parameter. The SDK
-	 *     prefers that over the rewritten credential URL, so the override
-	 *     would bypass the wire server.
+	 * The server auto-pins AI roots whose inbound `ai_*` sub-nodes are
+	 * incompatible (protocol-binary memory/vector store, unsupported vendor
+	 * LLM, configured `options.baseURL` override, shared with another root)
+	 * — callers do not need to list those here.
 	 *
-	 * Refused requests come back as an error-shaped `InstanceAiEvalExecutionResult`
-	 * with the offending root → sub-node pairs listed in `errors`.
+	 * Validated up front: a request naming an unknown, disabled, or non-AI-root
+	 * node comes back as an error-shaped `InstanceAiEvalExecutionResult`.
 	 */
-	unpinNodes: z.array(z.string().min(1)).max(50).optional(),
+	pinNodes: z.array(z.string().min(1)).max(50).optional(),
 }) {}

 // ---------------------------------------------------------------------------
--- a/packages/@n8n/instance-ai/evaluations/cli/args.ts
+++ b/packages/@n8n/instance-ai/evaluations/cli/args.ts
@ -47,6 +47,10 @@ export interface CliArgs {
 	/** Number of iterations to run each test case (default: 1). Each iteration
 	 *  gets a fresh build so pass@k / pass^k capture real builder variance. */
 	iterations: number;
+	/** AI root nodes (Agent, Chain) to keep pinned — opt-out from the default-on
+	 *  wire-server interception path. Useful for A/B comparison or when a
+	 *  specific root needs to stay on the pinned baseline. Parsed from the comma-separated `--pin-ai-roots` flag. */
+	pinAiRoots?: string[];
 }

 // ---------------------------------------------------------------------------
@ -68,6 +72,7 @@ const cliArgsSchema = z.object({
 	concurrency: z.number().int().positive().default(16),
 	experimentName: z.string().optional(),
 	iterations: z.number().int().positive().default(1),
+	pinAiRoots: z.array(z.string().min(1)).optional(),
 });

 // ---------------------------------------------------------------------------
@ -93,6 +98,7 @@ export function parseCliArgs(argv: string[]): CliArgs {
 		concurrency: validated.concurrency,
 		experimentName: validated.experimentName,
 		iterations: validated.iterations,
+		pinAiRoots: validated.pinAiRoots,
 	};
 }

@ -115,6 +121,7 @@ interface RawArgs {
 	concurrency: number;
 	experimentName?: string;
 	iterations: number;
+	pinAiRoots?: string[];
 }

 function parseRawArgs(argv: string[]): RawArgs {
@ -128,6 +135,7 @@ function parseRawArgs(argv: string[]): RawArgs {
 		concurrency: 16,
 		experimentName: undefined,
 		iterations: 1,
+		pinAiRoots: undefined,
 	};

 	for (let i = 0; i < argv.length; i++) {
@ -207,6 +215,16 @@ function parseRawArgs(argv: string[]): RawArgs {
 				i++;
 				break;

+			case '--pin-ai-roots': {
+				const raw = nextArg(argv, i, '--pin-ai-roots');
+				result.pinAiRoots = raw
+					.split(',')
+					.map((s) => s.trim())
+					.filter((s) => s.length > 0);
+				i++;
+				break;
+			}
+
 			default:
 				// Fail loudly on unknown flags. Strip any =value payload before
 				// echoing and drop positional values entirely — raw CLI input
--- a/packages/@n8n/instance-ai/evaluations/cli/index.ts
+++ b/packages/@n8n/instance-ai/evaluations/cli/index.ts
@ -360,6 +360,7 @@ async function runWithLangSmith(config: RunConfig): Promise<{
 						execArgs.workflowJsons,
 						logger,
 						args.timeoutMs,
+						args.pinAiRoots,
 					),
 				{
 					name: 'scenario_execution',
@ -940,6 +941,7 @@ async function runDirectLoop(config: RunConfig): Promise<MultiRunEvaluation> {
 								keepWorkflows: args.keepWorkflows,
 								laneTag,
 								prebuiltWorkflowId: pickPrebuiltWorkflowId(prebuiltManifest, tc.fileSlug, iter),
+								pinAiRoots: args.pinAiRoots,
 							}),
 						MAX_CONCURRENT_BUILDS,
 					);
--- a/packages/@n8n/instance-ai/evaluations/clients/n8n-client.ts
+++ b/packages/@n8n/instance-ai/evaluations/clients/n8n-client.ts
@ -495,15 +495,26 @@ export class N8nClient {
 	/**
 	 * Execute a workflow with LLM-based HTTP mocking.
 	 * The server handles hint generation and mock execution in a single synchronous call.
+	 *
+	 * AI root nodes (Agent, Chain) default to wire-server interception so their
+	 * sub-nodes actually run instead of being short-circuited by pin data;
+	 * pass `pinNodes` to keep specific roots on the pinned baseline (e.g. for
+	 * A/B comparison). Gated server-side behind the
+	 * `085_eval_vendor_sdk_interception` PostHog flag.
 	 */
 	async executeWithLlmMock(
 		workflowId: string,
 		scenarioHints?: string,
 		timeoutMs: number = 120_000,
+		pinNodes?: string[],
 	): Promise<InstanceAiEvalExecutionResult> {
+		const body: { scenarioHints?: string; pinNodes?: string[] } = {};
+		if (scenarioHints) body.scenarioHints = scenarioHints;
+		if (pinNodes && pinNodes.length > 0) body.pinNodes = pinNodes;
+
 		const result = (await this.fetch(`/rest/instance-ai/eval/execute-with-llm-mock/${workflowId}`, {
 			method: 'POST',
-			body: scenarioHints ? { scenarioHints } : {},
+			body,
 			timeoutMs,
 		})) as { data: InstanceAiEvalExecutionResult };
 		return result.data;
--- a/packages/@n8n/instance-ai/evaluations/harness/runner.ts
+++ b/packages/@n8n/instance-ai/evaluations/harness/runner.ts
@ -68,6 +68,11 @@ interface WorkflowTestCaseConfig {
 	/** When set, skip the orchestrator build and verify this existing workflow
 	 *  instead. The harness leaves it in place — caller owns its lifecycle. */
 	prebuiltWorkflowId?: string;
+	/** AI root nodes (Agent, Chain) to keep pinned — opt-out from the default-on
+	 *  wire-server interception path. Omit (or pass empty) to intercept every
+	 *  interceptable AI root the workflow contains. Server-side gated by the
+	 *  `085_eval_vendor_sdk_interception` PostHog flag. */
+	pinAiRoots?: string[];
 }

 /**
@ -144,6 +149,7 @@ export async function runWorkflowTestCase(
 					build.workflowJsons,
 					logger,
 					timeoutMs,
+					config.pinAiRoots,
 				);
 			} catch (error: unknown) {
 				const errorMessage = error instanceof Error ? error.message : String(error);
@ -478,8 +484,17 @@ export async function executeScenario(
 	workflowJsons: WorkflowResponse[],
 	logger: EvalLogger,
 	timeoutMs?: number,
+	pinAiRoots?: string[],
 ): Promise<ExecutionScenarioResult> {
-	return await runScenario(client, scenario, workflowId, workflowJsons, logger, timeoutMs);
+	return await runScenario(
+		client,
+		scenario,
+		workflowId,
+		workflowJsons,
+		logger,
+		timeoutMs,
+		pinAiRoots,
+	);
 }

 /**
@ -526,13 +541,22 @@ async function runScenario(
 	workflowJsons: WorkflowResponse[],
 	logger: EvalLogger,
 	timeoutMs?: number,
+	pinAiRoots?: string[],
 ): Promise<ExecutionScenarioResult> {
+	const pinNodes = pinAiRoots && pinAiRoots.length > 0 ? pinAiRoots : undefined;
+
 	const execStart = Date.now();
-	const evalResult = await client.executeWithLlmMock(workflowId, scenario.dataSetup, timeoutMs);
+	const evalResult = await client.executeWithLlmMock(
+		workflowId,
+		scenario.dataSetup,
+		timeoutMs,
+		pinNodes,
+	);
 	const execMs = Date.now() - execStart;

+	const pinTag = pinNodes ? ` pinned=${pinNodes.join(',')}` : '';
 	logger.info(
-		`    [${scenario.name}] exec=${String(Math.round(execMs / 1000))}s (${Object.keys(evalResult.nodeResults).length} nodes)`,
+		`    [${scenario.name}] exec=${String(Math.round(execMs / 1000))}s (${Object.keys(evalResult.nodeResults).length} nodes)${pinTag}`,
 	);

 	const verifyStart = Date.now();
--- a/packages/cli/package.json
+++ b/packages/cli/package.json
@ -90,6 +90,7 @@
    "ioredis-mock": "^8.8.1",
    "mjml": "^4.15.3",
    "n8n-containers": "workspace:*",
+    "openai": "catalog:",
    "openapi-types": "^12.1.3",
    "ts-essentials": "^7.0.3"
  },
--- a/packages/cli/src/modules/instance-ai/eval/tests/eval-mocked-credentials-helper.test.ts
+++ b/packages/cli/src/modules/instance-ai/eval/tests/eval-mocked-credentials-helper.test.ts
@ -414,6 +414,161 @@ describe('EvalMockedCredentialsHelper', () => {
 		});
 	});

+	describe('getDecrypted — schema synthesis when id is null', () => {
+		// Core's eval-mode bypass passes `{ id: null, name: type }` when a node
+		// has no credentials configured at all. The inner helper throws
+		// CredentialNotFoundError on a null id; the catch below schema-synthesizes
+		// (and applies the URL rewrite) so vendor SDK traffic stays inside the
+		// wire server instead of escaping to the real provider with 401.
+		const propsSchema = [
+			{
+				name: 'apiKey',
+				displayName: 'API Key',
+				type: 'string' as const,
+				default: '',
+				typeOptions: { password: true },
+			},
+			{
+				name: 'url',
+				displayName: 'Base URL',
+				type: 'string' as const,
+				default: 'https://api.openai.com/v1',
+			},
+		];
+
+		const nullNodeCreds: INodeCredentialsDetails = { id: null, name: 'openAiApi' };
+
+		function makeSynthesizingInner(): ICredentialsHelper {
+			return makeInner({
+				getCredentialsProperties: jest.fn().mockReturnValue(propsSchema),
+				// Inner throws on a null-id lookup → catch fires → schema synthesis.
+				getDecrypted: jest.fn().mockRejectedValue(new CredentialNotFoundError('null', 'openAiApi')),
+			});
+		}
+
+		it('synthesizes a credential from the schema and applies the URL rewrite', async () => {
+			const subNodeToRoot = new Map<string, string>([['OpenAI', 'Agent']]);
+			const helper = new EvalMockedCredentialsHelper(
+				makeSynthesizingInner(),
+				'http://127.0.0.1:54321',
+				undefined,
+				subNodeToRoot,
+			);
+
+			const result = await helper.getDecrypted(
+				fakeAdditionalData,
+				nullNodeCreds,
+				'openAiApi',
+				'manual',
+				{ node: { name: 'OpenAI' } as INode } as IExecuteData,
+			);
+
+			// Schema default for `url` is rewritten to the wire-server path.
+			expect(result.url).toBe('http://127.0.0.1:54321/eval/Agent/v1');
+			// Secret field (apiKey) is filled by `buildEvalMockCredentials` —
+			// the placeholder doesn't matter, only that it's not undefined.
+			expect(typeof result.apiKey).toBe('string');
+		});
+
+		it('records the synthesized credential on `mockedCredentials`', async () => {
+			const helper = new EvalMockedCredentialsHelper(
+				makeSynthesizingInner(),
+				'http://127.0.0.1:1',
+				undefined,
+			);
+
+			await helper.getDecrypted(fakeAdditionalData, nullNodeCreds, 'openAiApi', 'manual', {
+				node: { name: 'OpenAI GPT-4' } as INode,
+			} as IExecuteData);
+
+			expect(helper.mockedCredentials).toEqual([
+				{
+					nodeName: 'OpenAI GPT-4',
+					credentialType: 'openAiApi',
+					credentialId: undefined,
+				},
+			]);
+		});
+
+		it('records the rewrite on `rewrittenCredentials`', async () => {
+			const subNodeToRoot = new Map<string, string>([['OpenAI', 'Agent']]);
+			const helper = new EvalMockedCredentialsHelper(
+				makeSynthesizingInner(),
+				'http://127.0.0.1:1',
+				undefined,
+				subNodeToRoot,
+			);
+
+			await helper.getDecrypted(fakeAdditionalData, nullNodeCreds, 'openAiApi', 'manual', {
+				node: { name: 'OpenAI' } as INode,
+			} as IExecuteData);
+
+			expect(helper.rewrittenCredentials).toEqual([
+				{
+					nodeName: 'OpenAI',
+					credentialType: 'openAiApi',
+					credentialId: undefined,
+					field: 'url',
+				},
+			]);
+		});
+
+		it('brands the synthetic credential with __evalMockedCredential so authenticate short-circuits', async () => {
+			// Regression: without the marker, `authenticate` / `preAuthentication`
+			// / `runPreAuthentication` would delegate the synthetic credential
+			// through the inner helper's real-auth flow (OAuth refresh, PreSend
+			// hooks). Those flows would either crash on placeholder values or
+			// leak real-auth side effects from a fake credential.
+			const inner = makeInner({
+				getCredentialsProperties: jest.fn().mockReturnValue(propsSchema),
+				getDecrypted: jest.fn().mockRejectedValue(new CredentialNotFoundError('null', 'openAiApi')),
+				authenticate: jest.fn().mockResolvedValue({ url: 'http://should-not-be-called' }),
+			});
+			const helper = new EvalMockedCredentialsHelper(inner);
+
+			const synthetic = await helper.getDecrypted(
+				fakeAdditionalData,
+				nullNodeCreds,
+				'openAiApi',
+				'manual',
+				{ node: { name: 'OpenAI' } as INode } as IExecuteData,
+			);
+
+			expect(synthetic.__evalMockedCredential).toBe(true);
+
+			// Round-trip through `authenticate` confirms the marker actually
+			// short-circuits — the inner helper must not be invoked.
+			const requestOptions: IHttpRequestOptions = { url: 'http://example.com' };
+			const result = await helper.authenticate(
+				synthetic,
+				'openAiApi',
+				requestOptions,
+				fakeWorkflow,
+				fakeNode,
+			);
+			expect(result).toBe(requestOptions);
+			expect(inner.authenticate).not.toHaveBeenCalled();
+		});
+
+		it('still returns the synthetic credential when no serverUrl is configured', async () => {
+			// The helper may be used in eval mode without the wire server
+			// (e.g. HTTP-helper-only workflows). Without `serverUrl` we just
+			// pass the synthetic through — matches the pre-hook behaviour.
+			const helper = new EvalMockedCredentialsHelper(makeSynthesizingInner());
+
+			const result = await helper.getDecrypted(
+				fakeAdditionalData,
+				nullNodeCreds,
+				'openAiApi',
+				'manual',
+				{ node: { name: 'OpenAI' } as INode } as IExecuteData,
+			);
+
+			expect(result.url).toBe('https://api.openai.com/v1');
+			expect(helper.rewrittenCredentials).toEqual([]);
+		});
+	});
+
 	describe('authenticate', () => {
 		it('passes the request through unchanged for marker payloads', async () => {
 			const inner = makeInner();
--- a/packages/cli/src/modules/instance-ai/eval/tests/execution.service.test.ts
+++ b/packages/cli/src/modules/instance-ai/eval/tests/execution.service.test.ts
@ -1,6 +1,6 @@
-import { mock } from 'jest-mock-extended';
-import type { User } from '@n8n/db';
 import type { Logger } from '@n8n/backend-common';
+import type { User } from '@n8n/db';
+import { mock } from 'jest-mock-extended';
 import type {
 	INode,
 	IRunExecutionData,
@ -8,10 +8,11 @@ import type {
 	IWorkflowBase,
 	INodeTypeDescription,
 } from 'n8n-workflow';
+import { UserError } from 'n8n-workflow';

-import type { WorkflowFinderService } from '@/workflows/workflow-finder.service';
 import type { NodeTypes } from '@/node-types';
 import type { PostHogClient } from '@/posthog';
+import type { WorkflowFinderService } from '@/workflows/workflow-finder.service';

 // ---------------------------------------------------------------------------
 // Mocks — must be before the import of the class under test
@ -28,7 +29,7 @@ jest.mock('../mock-handler', () => ({
 	createLlmMockHandler: jest.fn(),
 }));
 jest.mock('../workflow-analysis', () => ({
-	assertUnpinCompatibility: jest.fn(),
+	partitionAiRoots: jest.fn(),
 	buildVendorLlmRouting: jest.fn().mockReturnValue({
 		subNodeToRoot: new Map(),
 		rootToSubNode: new Map(),
@ -96,15 +97,14 @@ jest.mock('n8n-workflow', () => {
 // ---------------------------------------------------------------------------

 import { EvalExecutionService } from '../execution.service';
+import { createLlmMockHandler } from '../mock-handler';
 import {
-	assertUnpinCompatibility,
 	generateMockHints,
 	identifyNodesForHints,
 	identifyNodesForPinData,
+	partitionAiRoots,
 } from '../workflow-analysis';
-import { createLlmMockHandler } from '../mock-handler';
 import type { MockHints } from '../workflow-analysis';
-import { UserError } from 'n8n-workflow';

 // ---------------------------------------------------------------------------
 // Helpers
@ -113,7 +113,7 @@ import { UserError } from 'n8n-workflow';
 const generateMockHintsMock = jest.mocked(generateMockHints);
 const identifyNodesForHintsMock = jest.mocked(identifyNodesForHints);
 const identifyNodesForPinDataMock = jest.mocked(identifyNodesForPinData);
-const assertUnpinCompatibilityMock = jest.mocked(assertUnpinCompatibility);
+const partitionAiRootsMock = jest.mocked(partitionAiRoots);
 const createLlmMockHandlerMock = jest.mocked(createLlmMockHandler);

 function makeWorkflowEntity(overrides: Partial<IWorkflowBase> = {}) {
@ -201,10 +201,12 @@ describe('EvalExecutionService', () => {

 		service = new EvalExecutionService(workflowFinderService, nodeTypes, logger, postHogClient);

-		// Default mock returns — happy path
+		// Default mock returns — happy path. partitionAiRoots returns an empty
+		// partition (no AI roots in the test workflow) so the kill-switch
+		// short-circuits and the wire server stays off unless a test overrides.
 		identifyNodesForHintsMock.mockReturnValue([]);
 		identifyNodesForPinDataMock.mockReturnValue([]);
-		assertUnpinCompatibilityMock.mockImplementation(() => undefined);
+		partitionAiRootsMock.mockReturnValue({ unpinNodes: [], pinNodes: [], autoPinned: [] });
 		generateMockHintsMock.mockResolvedValue(makeEmptyHints());
 		createLlmMockHandlerMock.mockReturnValue(jest.fn());
 		mockGetStartNode.mockReturnValue(makeStartNode());
@ -311,21 +313,30 @@ describe('EvalExecutionService', () => {
 		});
 	});

-	// ── unpinNodes handling ──────────────────────────────────────────
+	// ── pinNodes / interception partition ────────────────────────────

-	describe('unpinNodes', () => {
+	describe('interception partition', () => {
 		beforeEach(() => {
 			workflowFinderService.findWorkflowForUser.mockResolvedValue(makeWorkflowEntity() as never);
 		});

-		it('calls assertUnpinCompatibility with an empty list when unpinNodes is omitted', async () => {
+		it('calls partitionAiRoots with an empty explicit pin list when pinNodes is omitted', async () => {
 			await service.executeWithLlmMock('wf-1', makeUser());

-			expect(assertUnpinCompatibilityMock).toHaveBeenCalledWith(expect.anything(), []);
+			expect(partitionAiRootsMock).toHaveBeenCalledWith(expect.anything(), []);
 		});

-		it('omits the exclusion set when unpinNodes is empty', async () => {
-			await service.executeWithLlmMock('wf-1', makeUser(), { unpinNodes: [] });
+		it('forwards explicit pinNodes from the request to partitionAiRoots', async () => {
+			await service.executeWithLlmMock('wf-1', makeUser(), { pinNodes: ['Agent'] });
+
+			expect(partitionAiRootsMock).toHaveBeenCalledWith(expect.objectContaining({ id: 'wf-1' }), [
+				'Agent',
+			]);
+		});
+
+		it('omits the exclusion set when the partition returns no unpinNodes', async () => {
+			// Default mock returns empty unpinNodes → no AI roots intercepted.
+			await service.executeWithLlmMock('wf-1', makeUser());

 			expect(identifyNodesForPinDataMock).toHaveBeenCalledWith(
 				expect.objectContaining({ id: 'wf-1' }),
@ -333,78 +344,82 @@ describe('EvalExecutionService', () => {
 			);
 		});

-		// PostHog kill-switch: non-empty unpinNodes only runs when the flag
-		// resolves to ON. Flag OFF refuses the request before any other work
-		// so vendor traffic can never reach the real provider.
+		it("surfaces the partition's typo-guard error when an explicit pin name is invalid", async () => {
+			partitionAiRootsMock.mockImplementation(() => {
+				throw new UserError('Cannot pin — not found in workflow: "Ghost".');
+			});
+
+			const result = await service.executeWithLlmMock('wf-1', makeUser(), {
+				pinNodes: ['Ghost'],
+			});
+
+			expect(result.success).toBe(false);
+			expect(result.errors).toEqual([expect.stringContaining('not found in workflow')]);
+			expect(mockProcessRunExecutionData).not.toHaveBeenCalled();
+			expect(mockWireServerStart).not.toHaveBeenCalled();
+		});
+
+		// PostHog kill-switch: when partitionAiRoots wants to intercept any
+		// roots, the flag is consulted. Flag OFF silently degrades to the
+		// pinned baseline so the eval still produces a result — no error,
+		// just today's baseline behaviour. This is the right default once
+		// interception is the default-on path.
 		describe('PostHog kill-switch (flag off)', () => {
 			beforeEach(() => {
+				partitionAiRootsMock.mockReturnValue({
+					unpinNodes: ['Agent'],
+					pinNodes: [],
+					autoPinned: [],
+				});
 				postHogClient.getFeatureFlags.mockResolvedValue({
 					'085_eval_vendor_sdk_interception': false,
 				});
 			});

-			it('runs the compatibility guard first, then refuses with the gate error when the guard passes', async () => {
-				const result = await service.executeWithLlmMock('wf-1', makeUser(), {
-					unpinNodes: ['Agent'],
-				});
+			it('silently degrades to the pinned baseline (no wire server, no error)', async () => {
+				const result = await service.executeWithLlmMock('wf-1', makeUser());

-				expect(result.success).toBe(false);
-				expect(result.errors).toEqual([expect.stringContaining('currently disabled')]);
-				// Guard runs first so the user gets actionable diagnostics when their
-				// workflow has a permanent compatibility issue. When the guard passes,
-				// the gate fires with the generic "currently disabled" message.
-				expect(assertUnpinCompatibilityMock).toHaveBeenCalledWith(
-					expect.objectContaining({ id: 'wf-1' }),
-					['Agent'],
-				);
-				expect(generateMockHintsMock).not.toHaveBeenCalled();
-				expect(mockProcessRunExecutionData).not.toHaveBeenCalled();
+				// No refusal — the eval still completes through the pinned path.
+				expect(result.errors).toEqual([]);
+				expect(mockWireServerStart).not.toHaveBeenCalled();
+				expect(mockProcessRunExecutionData).toHaveBeenCalledTimes(1);
 			});

-			it("surfaces the guard's error when the workflow has a permanent compatibility issue", async () => {
-				assertUnpinCompatibilityMock.mockImplementation(() => {
-					throw new UserError(
-						'Cannot unpin AI root nodes — protocol-binary sub-nodes ' +
-							'(cannot be intercepted via HTTP): "Mem" (memoryPostgresChat) → "Agent"',
-					);
+			it('does not consult PostHog when the partition has nothing to intercept', async () => {
+				partitionAiRootsMock.mockReturnValue({
+					unpinNodes: [],
+					pinNodes: [],
+					autoPinned: [],
 				});

-				const result = await service.executeWithLlmMock('wf-1', makeUser(), {
-					unpinNodes: ['Agent'],
-				});
-
-				expect(result.success).toBe(false);
-				// Guard's protocol-binary message wins over the generic gate message —
-				// the user needs to fix the workflow regardless of when the feature ships.
-				expect(result.errors).toEqual([expect.stringContaining('memoryPostgresChat')]);
-				expect(result.errors[0]).not.toContain('currently disabled');
-				// Guard refused before the PostHog check fires.
-				expect(postHogClient.getFeatureFlags).not.toHaveBeenCalled();
-			});
-
-			it('still runs the normal pinned path when unpinNodes is omitted (no flag check)', async () => {
 				await service.executeWithLlmMock('wf-1', makeUser());

 				expect(postHogClient.getFeatureFlags).not.toHaveBeenCalled();
-				expect(generateMockHintsMock).toHaveBeenCalled();
-				expect(mockProcessRunExecutionData).toHaveBeenCalled();
+			});
+
+			it('also degrades silently when PostHog itself rejects (fail-closed)', async () => {
+				postHogClient.getFeatureFlags.mockRejectedValue(new Error('PostHog down'));
+
+				const result = await service.executeWithLlmMock('wf-1', makeUser());
+
+				expect(result.errors).toEqual([]);
+				expect(mockWireServerStart).not.toHaveBeenCalled();
 			});
 		});

-		// Flag ON (or unset — fail-open default): non-empty unpinNodes proceeds
-		// into the rewrite path and boots the wire server.
+		// Flag ON (or unset — fail-open default): the partition's unpinNodes
+		// drive the rewrite path and boot the wire server.
 		describe('PostHog kill-switch (flag on)', () => {
-			it('forwards unpinNodes to assertUnpinCompatibility', async () => {
-				await service.executeWithLlmMock('wf-1', makeUser(), { unpinNodes: ['Agent'] });
-
-				expect(assertUnpinCompatibilityMock).toHaveBeenCalledWith(
-					expect.objectContaining({ id: 'wf-1' }),
-					['Agent'],
-				);
+			beforeEach(() => {
+				partitionAiRootsMock.mockReturnValue({
+					unpinNodes: ['Agent'],
+					pinNodes: [],
+					autoPinned: [],
+				});
 			});

-			it('forwards the exclusion set to identifyNodesForPinData', async () => {
-				await service.executeWithLlmMock('wf-1', makeUser(), { unpinNodes: ['Agent'] });
+			it('forwards the exclusion set to identifyNodesForPinData when interception is enabled', async () => {
+				await service.executeWithLlmMock('wf-1', makeUser());

 				expect(identifyNodesForPinDataMock).toHaveBeenCalledWith(
 					expect.objectContaining({ id: 'wf-1' }),
@ -413,7 +428,7 @@ describe('EvalExecutionService', () => {
 			});

 			it('boots and tears down the wire server around the workflow run', async () => {
-				await service.executeWithLlmMock('wf-1', makeUser(), { unpinNodes: ['Agent'] });
+				await service.executeWithLlmMock('wf-1', makeUser());

 				expect(mockWireServerStart).toHaveBeenCalledTimes(1);
 				expect(mockProcessRunExecutionData).toHaveBeenCalledTimes(1);
@ -424,43 +439,33 @@ describe('EvalExecutionService', () => {
 			it('tears down the wire server even if the workflow run throws', async () => {
 				mockProcessRunExecutionData.mockRejectedValue(new Error('explode'));

-				const result = await service.executeWithLlmMock('wf-1', makeUser(), {
-					unpinNodes: ['Agent'],
-				});
+				const result = await service.executeWithLlmMock('wf-1', makeUser());

 				expect(result.success).toBe(false);
 				expect(mockWireServerStop).toHaveBeenCalledTimes(1);
 				expect(mockRestoreNoProxy).toHaveBeenCalledTimes(1);
 			});

-			it('does not boot the wire server when unpinNodes is empty', async () => {
-				await service.executeWithLlmMock('wf-1', makeUser(), { unpinNodes: [] });
+			it('does not boot the wire server when the partition has no unpinNodes', async () => {
+				partitionAiRootsMock.mockReturnValue({
+					unpinNodes: [],
+					pinNodes: [],
+					autoPinned: [],
+				});
+
+				await service.executeWithLlmMock('wf-1', makeUser());

 				expect(mockWireServerStart).not.toHaveBeenCalled();
 				expect(mockWireServerStop).not.toHaveBeenCalled();
 			});

-			it('fails closed when PostHog rejects (treats flag as off and refuses the request)', async () => {
-				postHogClient.getFeatureFlags.mockRejectedValue(new Error('PostHog down'));
-
-				const result = await service.executeWithLlmMock('wf-1', makeUser(), {
-					unpinNodes: ['Agent'],
-				});
-
-				expect(result.success).toBe(false);
-				expect(result.errors).toEqual([expect.stringContaining('currently disabled')]);
-				expect(mockWireServerStart).not.toHaveBeenCalled();
-			});
-
 			it('tears down the wire server when NO_PROXY patching throws after boot', async () => {
 				const proxyLoopback = require('../proxy-loopback');
 				proxyLoopback.patchNoProxyForLoopback.mockImplementationOnce(() => {
 					throw new Error('env mutation blocked');
 				});

-				const result = await service.executeWithLlmMock('wf-1', makeUser(), {
-					unpinNodes: ['Agent'],
-				});
+				const result = await service.executeWithLlmMock('wf-1', makeUser());

 				expect(result.success).toBe(false);
 				expect(result.errors).toEqual([expect.stringContaining('env mutation blocked')]);
@ -468,24 +473,6 @@ describe('EvalExecutionService', () => {
 				expect(mockWireServerStop).toHaveBeenCalledTimes(1);
 			});

-			it('returns an error result and skips workflow execution when the compatibility guard refuses', async () => {
-				assertUnpinCompatibilityMock.mockImplementation(() => {
-					throw new (require('n8n-workflow').UserError)(
-						'Cannot unpin "Agent" — incompatible memory backend',
-					);
-				});
-
-				const result = await service.executeWithLlmMock('wf-1', makeUser(), {
-					unpinNodes: ['Agent'],
-				});
-
-				expect(result.success).toBe(false);
-				expect(result.errors).toEqual([expect.stringContaining('Cannot unpin "Agent"')]);
-				expect(mockProcessRunExecutionData).not.toHaveBeenCalled();
-				// Server was never started — guard runs before boot.
-				expect(mockWireServerStart).not.toHaveBeenCalled();
-			});
-
 			it('records a wire-server turn against the AI root in nodeResults via onIntercept', async () => {
 				// Simulate the wire server firing onIntercept mid-execution by
 				// invoking the captured callback before processRunExecutionData
@ -506,9 +493,7 @@ describe('EvalExecutionService', () => {
 					return makeIRun();
 				});

-				const result = await service.executeWithLlmMock('wf-1', makeUser(), {
-					unpinNodes: ['Agent'],
-				});
+				const result = await service.executeWithLlmMock('wf-1', makeUser());

 				expect(result.nodeResults['Agent']).toBeDefined();
 				expect(result.nodeResults['Agent'].executionMode).toBe('mocked');
@ -552,9 +537,7 @@ describe('EvalExecutionService', () => {
 					return makeIRun();
 				});

-				const result = await service.executeWithLlmMock('wf-1', makeUser(), {
-					unpinNodes: ['Agent'],
-				});
+				const result = await service.executeWithLlmMock('wf-1', makeUser());

 				// 'pinned' from the bypass pass survives — preservation rule.
 				expect(result.nodeResults['Agent'].executionMode).toBe('pinned');
@ -562,6 +545,99 @@ describe('EvalExecutionService', () => {
 				expect(result.nodeResults['Agent'].interceptedRequests).toHaveLength(1);
 			});

+			// Headline ledger-attribution rule for M3: a single eval run produces
+			// two kinds of traffic — vendor-SDK model turns (attributed to the AI
+			// root via the wire server's URL path) and tool HTTP traffic
+			// (attributed to the tool node via the existing helpers.httpRequest
+			// interceptor in `request-helper-functions.ts:1147`). The two must
+			// land in separate `nodeResults` entries; tools whose HTTP traffic
+			// gets folded into the Agent's ledger would mask real bugs.
+			it('splits the ledger: model turns to the Agent root, tool HTTP to the tool node', async () => {
+				// Handler returned by the mocked `createLlmMockHandler`; it answers the
+				// tool's HTTP request with a canned JSON payload.
+				const innerMockHandler = jest.fn().mockResolvedValue({
+					body: { content: 'tool result' },
+					headers: { 'content-type': 'application/json' },
+					statusCode: 200,
+				});
+				createLlmMockHandlerMock.mockReturnValue(innerMockHandler);
+
+				mockProcessRunExecutionData.mockImplementation(async () => {
+					const opts = capturedWireServerOptions.last as {
+						onIntercept?: (turn: unknown) => void;
+					};
+					// Model turn — wire server's onIntercept fires with the root name.
+					opts.onIntercept?.({
+						rootName: 'Agent',
+						url: 'https://api.openai.com/v1/chat/completions',
+						method: 'POST',
+						nodeType: '@n8n/n8n-nodes-langchain.lmChatOpenAi',
+						requestBody: { model: 'gpt-4o', messages: [] },
+						mockResponse: {
+							tool_calls: [{ id: 'c1', function: { name: 'getOrder', arguments: '{}' } }],
+						},
+					});
+
+					// Tool HTTP — `evalLlmMockHandler` is invoked from
+					// `request-helper-functions.ts` with the tool node's
+					// identity. The SUT passes `additionalData` as the first
+					// positional argument to the `WorkflowExecute` constructor
+					// (see `runWorkflow()` in `execution.service.ts`). If that
+					// contract ever changes, the explicit guard below fails
+					// loudly with an actionable message instead of silently
+					// reading the wrong argument slot.
+					const wfExecuteCtor = jest.mocked(
+						(await import('n8n-core')).WorkflowExecute,
+					) as unknown as jest.Mock;
+					// Optional-chain the call lookup: when the constructor was never
+					// invoked `mock.calls[0]` is undefined, and a bare `[0][0]` would
+					// throw an opaque TypeError before the guard below could report
+					// the actionable message.
+					const additionalData = wfExecuteCtor.mock.calls[0]?.[0] as
+						| { evalLlmMockHandler?: (req: unknown, node: unknown) => Promise<unknown> }
+						| undefined;
+					if (!additionalData?.evalLlmMockHandler) {
+						throw new Error(
+							'WorkflowExecute(additionalData, ...) contract changed — ' +
+								'arg 0 no longer carries evalLlmMockHandler. Update the ledger-split test.',
+						);
+					}
+					await additionalData.evalLlmMockHandler(
+						{ url: 'https://orders.example.com/v1/orders/42', method: 'GET' },
+						{
+							id: 'tool-node',
+							name: 'Get Order Tool',
+							type: 'n8n-nodes-base.httpRequestTool',
+							typeVersion: 1,
+							position: [0, 0],
+							parameters: {},
+						},
+					);
+
+					return makeIRun();
+				});
+
+				const result = await service.executeWithLlmMock('wf-1', makeUser());
+
+				// Model turn attributed to Agent only.
+				expect(result.nodeResults['Agent']).toBeDefined();
+				expect(result.nodeResults['Agent'].interceptedRequests).toHaveLength(1);
+				expect(result.nodeResults['Agent'].interceptedRequests[0].nodeType).toBe(
+					'@n8n/n8n-nodes-langchain.lmChatOpenAi',
+				);
+
+				// Tool HTTP attributed to the tool node, NOT to the Agent.
+				expect(result.nodeResults['Get Order Tool']).toBeDefined();
+				expect(result.nodeResults['Get Order Tool'].interceptedRequests).toHaveLength(1);
+				expect(result.nodeResults['Get Order Tool'].interceptedRequests[0].url).toBe(
+					'https://orders.example.com/v1/orders/42',
+				);
+				expect(result.nodeResults['Get Order Tool'].interceptedRequests[0].nodeType).toBe(
+					'n8n-nodes-base.httpRequestTool',
+				);
+				expect(result.nodeResults['Get Order Tool'].executionMode).toBe('mocked');
+
+				// Cross-check: neither side's ledger contains the other side's URL.
+				const agentUrls = result.nodeResults['Agent'].interceptedRequests.map((r) => r.url);
+				const toolUrls = result.nodeResults['Get Order Tool'].interceptedRequests.map((r) => r.url);
+				expect(agentUrls).not.toContain('https://orders.example.com/v1/orders/42');
+				expect(toolUrls).not.toContain('https://api.openai.com/v1/chat/completions');
+			});
+
 			it('upgrades a pre-marked "real" entry to "mocked" when a wire-server turn fires', async () => {
 				// checkNodeConfig() pre-marks any node with a config-issue as
 				// `executionMode: 'real'` BEFORE runWorkflow runs. If a wire-
@ -597,9 +673,7 @@ describe('EvalExecutionService', () => {
 					return makeIRun();
 				});

-				const result = await service.executeWithLlmMock('wf-1', makeUser(), {
-					unpinNodes: ['Agent'],
-				});
+				const result = await service.executeWithLlmMock('wf-1', makeUser());

 				// 'real' (from config-issue pre-marking) gets upgraded to 'mocked'.
 				expect(result.nodeResults['HTTP Request']).toBeDefined();
--- a/packages/cli/src/modules/instance-ai/eval/tests/llm-wire-server.test.ts
+++ b/packages/cli/src/modules/instance-ai/eval/tests/llm-wire-server.test.ts
@ -1,6 +1,7 @@
 import type { Logger } from '@n8n/backend-common';
 import type { EvalLlmMockHandler } from 'n8n-core';
 import type { INode } from 'n8n-workflow';
+import OpenAI from 'openai';

 import { type InterceptedTurn, LlmWireServer } from '../llm-wire-server';

@ -65,6 +66,19 @@ describe('LlmWireServer', () => {
 				await second.stop();
 			}
 		});
+
+		it('accepts requests after start() → stop() → start() — shutdown latch resets', async () => {
+			await server.start();
+			await server.stop();
+			const url = await server.start();
+			const response = await postChatCompletion(url, '/eval/Agent/v1/chat/completions', {
+				model: 'gpt-4o-mini',
+				messages: [],
+			});
+			// Post-restart the route must hand back a 200 envelope, NOT the
+			// 503 the in-flight shutdown latch would emit if it weren't reset.
+			expect(response.status).toBe(200);
+		});
 	});

 	describe('POST /eval/:root/v1/chat/completions — stub fallback', () => {
@ -223,7 +237,7 @@ describe('LlmWireServer', () => {
 			expect(warn.mock.calls[0][0]).toContain('ledger disk full');
 		});

-		it('records an isolated deep copy of the request body in the ledger', async () => {
+		it('records a per-request body in the ledger that does not bleed across requests', async () => {
 			const intercepts: InterceptedTurn[] = [];
 			const mockHandler = jest.fn().mockResolvedValue({
 				body: { content: 'reply' },
@ -388,4 +402,552 @@ describe('LlmWireServer', () => {
 			expect(body.error.message).toContain('/eval/<root>/');
 		});
 	});
+
+	// SSE branch — switches when the inbound body has `stream: true`. The spec
+	// is strict on chunk semantics; the openai SDK throws opaque `BadStream`
+	// errors when the envelope is malformed, so the assertions here mirror
+	// what the SDK validates internally.
+	describe('POST /eval/:root/v1/chat/completions — SSE branch (stream: true)', () => {
+		const subNode = makeSubNode({ name: 'OpenAI Chat Model' });
+
+		/**
+		 * POSTs `body` as JSON to `${url}${path}` and returns the raw response
+		 * together with the payload of every `data: ` frame in the reply, in
+		 * arrival order. The reply text is split on blank lines; a block that
+		 * does not start with `data: ` once trimmed is discarded.
+		 */
+		async function readSseChunks(url: string, path: string, body: unknown) {
+			const dataPrefix = 'data: ';
+			const response = await fetch(`${url}${path}`, {
+				method: 'POST',
+				headers: { 'Content-Type': 'application/json', Accept: 'text/event-stream' },
+				body: JSON.stringify(body),
+			});
+			const raw = await response.text();
+			const frames: string[] = [];
+			for (const block of raw.split('\n\n')) {
+				const frame = block.trim();
+				if (frame.startsWith(dataPrefix)) {
+					// Keep only the payload that follows the `data: ` field name.
+					frames.push(frame.slice(dataPrefix.length));
+				}
+			}
+			return { response, frames };
+		}
+
+		it('returns Content-Type: text/event-stream and a [DONE] terminator', async () => {
+			const mockHandler = jest.fn().mockResolvedValue({
+				body: { content: 'streamed reply' },
+				headers: {},
+				statusCode: 200,
+			}) as unknown as EvalLlmMockHandler;
+			server = new LlmWireServer({
+				mockHandler,
+				rootToSubNode: new Map([['Agent', subNode]]),
+			});
+			const url = await server.start();
+
+			const { response, frames } = await readSseChunks(url, '/eval/Agent/v1/chat/completions', {
+				model: 'gpt-4o',
+				stream: true,
+				messages: [{ role: 'user', content: 'hi' }],
+			});
+
+			expect(response.status).toBe(200);
+			expect(response.headers.get('content-type')).toMatch(/text\/event-stream/);
+			expect(frames[frames.length - 1]).toBe('[DONE]');
+		});
+
+		it('emits chat.completion.chunk frames terminated with a stop finish_reason', async () => {
+			const mockHandler = jest.fn().mockResolvedValue({
+				body: { content: 'hello via SSE' },
+				headers: {},
+				statusCode: 200,
+			}) as unknown as EvalLlmMockHandler;
+			server = new LlmWireServer({
+				mockHandler,
+				rootToSubNode: new Map([['Agent', subNode]]),
+			});
+			const url = await server.start();
+
+			const { frames } = await readSseChunks(url, '/eval/Agent/v1/chat/completions', {
+				model: 'gpt-4o',
+				stream: true,
+				messages: [{ role: 'user', content: 'hi' }],
+			});
+
+			const dataFrames = frames.filter((f) => f !== '[DONE]').map((f) => JSON.parse(f));
+			expect(dataFrames.every((f) => f.object === 'chat.completion.chunk')).toBe(true);
+
+			const ids = new Set(dataFrames.map((f) => f.id));
+			expect(ids.size).toBe(1);
+
+			const contentChunk = dataFrames.find((f) => f.choices[0].delta.content === 'hello via SSE');
+			expect(contentChunk).toBeDefined();
+
+			const terminal = dataFrames[dataFrames.length - 1];
+			expect(terminal.choices[0].finish_reason).toBe('stop');
+		});
+
+		it('streams tool_calls with first-chunk id+name and a terminal tool_calls finish_reason', async () => {
+			const mockHandler = jest.fn().mockResolvedValue({
+				body: {
+					tool_calls: [
+						{ id: 'call_1', function: { name: 'get_weather', arguments: '{"city":"NYC"}' } },
+					],
+				},
+				headers: {},
+				statusCode: 200,
+			}) as unknown as EvalLlmMockHandler;
+			server = new LlmWireServer({
+				mockHandler,
+				rootToSubNode: new Map([['Agent', subNode]]),
+			});
+			const url = await server.start();
+
+			const { frames } = await readSseChunks(url, '/eval/Agent/v1/chat/completions', {
+				model: 'gpt-4o',
+				stream: true,
+				messages: [{ role: 'user', content: 'weather in NYC?' }],
+				tools: [
+					{
+						type: 'function',
+						function: { name: 'get_weather', parameters: { type: 'object' } },
+					},
+				],
+			});
+
+			const dataFrames = frames.filter((f) => f !== '[DONE]').map((f) => JSON.parse(f));
+
+			const firstToolFrame = dataFrames.find(
+				(f) => f.choices[0].delta.tool_calls?.[0]?.id === 'call_1',
+			);
+			expect(firstToolFrame).toBeDefined();
+			expect(firstToolFrame.choices[0].delta.tool_calls[0].function.name).toBe('get_weather');
+
+			const argsFrame = dataFrames.find(
+				(f) => f.choices[0].delta.tool_calls?.[0]?.function?.arguments === '{"city":"NYC"}',
+			);
+			expect(argsFrame).toBeDefined();
+			// Args frame MUST NOT repeat id or name.
+			expect(argsFrame.choices[0].delta.tool_calls[0].id).toBeUndefined();
+			expect(argsFrame.choices[0].delta.tool_calls[0].function.name).toBeUndefined();
+
+			const terminal = dataFrames[dataFrames.length - 1];
+			expect(terminal.choices[0].finish_reason).toBe('tool_calls');
+		});
+
+		it('attributes the streamed turn against the requested root in onIntercept', async () => {
+			const intercepts: InterceptedTurn[] = [];
+			const mockHandler = jest.fn().mockResolvedValue({
+				body: { content: 'streamed' },
+				headers: {},
+				statusCode: 200,
+			}) as unknown as EvalLlmMockHandler;
+			server = new LlmWireServer({
+				mockHandler,
+				rootToSubNode: new Map([['Agent', subNode]]),
+				onIntercept: (t) => intercepts.push(t),
+			});
+			const url = await server.start();
+
+			await readSseChunks(url, '/eval/Agent/v1/chat/completions', {
+				model: 'gpt-4o',
+				stream: true,
+				messages: [],
+			});
+
+			expect(intercepts).toHaveLength(1);
+			expect(intercepts[0].rootName).toBe('Agent');
+		});
+
+		it('uses the no-handler stub for streaming when no mock handler is attached', async () => {
+			server = new LlmWireServer();
+			const url = await server.start();
+
+			const { response, frames } = await readSseChunks(url, '/eval/Agent/v1/chat/completions', {
+				model: 'gpt-4o',
+				stream: true,
+				messages: [],
+			});
+
+			expect(response.headers.get('content-type')).toMatch(/text\/event-stream/);
+			const dataFrames = frames.filter((f) => f !== '[DONE]').map((f) => JSON.parse(f));
+			const stubFrame = dataFrames.find(
+				(f) =>
+					typeof f.choices[0].delta.content === 'string' &&
+					f.choices[0].delta.content.includes('eval wire server stub'),
+			);
+			expect(stubFrame).toBeDefined();
+		});
+
+		// Live SDK round-trip — the master spec mandates this: "Test against
+		// the live `openai` v5 SDK — do not hand-roll envelope shape against
+		// documentation alone." The hand-rolled `readSseChunks` frame splitter
+		// above proves our wire shape against the spec; this test proves it
+		// against the *actual SDK parser*. If our `delta.tool_calls` chunks
+		// drift from what `openai`'s reducer expects, this test will throw a
+		// typed BadStream error before any of the per-frame asserts above
+		// would notice.
+		describe('live `openai` SDK round-trip (catches SDK-strict envelope drift)', () => {
+			/**
+			 * Builds an `OpenAI` client whose base URL targets the wire server's
+			 * `/eval/<root>/v1` prefix, with `rootName` URL-encoded into the path.
+			 */
+			function makeClient(serverUrl: string, rootName: string) {
+				const encodedRoot = encodeURIComponent(rootName);
+				const client = new OpenAI({
+					apiKey: 'sk-eval-test',
+					baseURL: `${serverUrl}/eval/${encodedRoot}/v1`,
+					// No retries: a parse failure should fail the test on the first
+					// attempt rather than being re-attempted by the client.
+					maxRetries: 0,
+				});
+				return client;
+			}
+
+			it('non-streaming chat completion parses through the SDK reducer', async () => {
+				const mockHandler = jest.fn().mockResolvedValue({
+					body: { content: 'hello via SDK' },
+					headers: {},
+					statusCode: 200,
+				}) as unknown as EvalLlmMockHandler;
+				server = new LlmWireServer({
+					mockHandler,
+					rootToSubNode: new Map([['Agent', subNode]]),
+				});
+				const url = await server.start();
+				const client = makeClient(url, 'Agent');
+
+				const completion = await client.chat.completions.create({
+					model: 'gpt-4o',
+					messages: [{ role: 'user', content: 'hi' }],
+				});
+
+				expect(completion.object).toBe('chat.completion');
+				expect(completion.choices[0].message.content).toBe('hello via SDK');
+				expect(completion.choices[0].finish_reason).toBe('stop');
+			});
+
+			it('streaming content yields chunks through the SDK async iterator', async () => {
+				const mockHandler = jest.fn().mockResolvedValue({
+					body: { content: 'streamed via SDK' },
+					headers: {},
+					statusCode: 200,
+				}) as unknown as EvalLlmMockHandler;
+				server = new LlmWireServer({
+					mockHandler,
+					rootToSubNode: new Map([['Agent', subNode]]),
+				});
+				const url = await server.start();
+				const client = makeClient(url, 'Agent');
+
+				const stream = await client.chat.completions.create({
+					model: 'gpt-4o',
+					stream: true,
+					messages: [{ role: 'user', content: 'hi' }],
+				});
+
+				let assembled = '';
+				let lastFinishReason: string | null | undefined;
+				for await (const chunk of stream) {
+					expect(chunk.object).toBe('chat.completion.chunk');
+					const delta = chunk.choices[0]?.delta;
+					if (typeof delta?.content === 'string') {
+						assembled += delta.content;
+					}
+					if (chunk.choices[0]?.finish_reason !== undefined) {
+						lastFinishReason = chunk.choices[0].finish_reason;
+					}
+				}
+
+				expect(assembled).toBe('streamed via SDK');
+				expect(lastFinishReason).toBe('stop');
+			});
+
+			it('streaming tool_calls accumulate through the SDK reducer with the correct final shape', async () => {
+				// The strictest test of the wire format. The SDK accumulates
+				// `delta.tool_calls` slices into a single tool call — first chunk
+				// owns `id` + `function.name`, later chunks contribute
+				// `function.arguments`. A drift here (e.g. repeating `id` on
+				// later chunks) throws a `BadStream` error, not a soft skip.
+				const mockHandler = jest.fn().mockResolvedValue({
+					body: {
+						tool_calls: [
+							{
+								id: 'call_live',
+								function: { name: 'get_weather', arguments: '{"city":"NYC"}' },
+							},
+						],
+					},
+					headers: {},
+					statusCode: 200,
+				}) as unknown as EvalLlmMockHandler;
+				server = new LlmWireServer({
+					mockHandler,
+					rootToSubNode: new Map([['Agent', subNode]]),
+				});
+				const url = await server.start();
+				const client = makeClient(url, 'Agent');
+
+				const stream = await client.chat.completions.create({
+					model: 'gpt-4o',
+					stream: true,
+					messages: [{ role: 'user', content: 'weather' }],
+					tools: [
+						{
+							type: 'function',
+							function: { name: 'get_weather', parameters: { type: 'object' } },
+						},
+					],
+				});
+
+				const accumulated: Record<number, { id?: string; name?: string; args: string }> = {};
+				let lastFinishReason: string | null | undefined;
+				for await (const chunk of stream) {
+					const toolDeltas = chunk.choices[0]?.delta?.tool_calls ?? [];
+					for (const td of toolDeltas) {
+						const slot = (accumulated[td.index] ??= { args: '' });
+						if (td.id) slot.id = td.id;
+						if (td.function?.name) slot.name = td.function.name;
+						if (typeof td.function?.arguments === 'string') {
+							slot.args += td.function.arguments;
+						}
+					}
+					if (chunk.choices[0]?.finish_reason !== undefined) {
+						lastFinishReason = chunk.choices[0].finish_reason;
+					}
+				}
+
+				// Deltas yielded by the SDK stream, folded by the loop above, rebuild the full call.
+				expect(accumulated[0]).toEqual({
+					id: 'call_live',
+					name: 'get_weather',
+					args: '{"city":"NYC"}',
+				});
+				expect(lastFinishReason).toBe('tool_calls');
+			});
+		});
+
+		it('returns a JSON error envelope (not SSE) when the mock handler throws on a streaming request', async () => {
+			// Handler that always rejects, simulating an unavailable mock LLM.
+			const mockHandler = jest
+				.fn()
+				.mockRejectedValue(new Error('LLM offline')) as unknown as EvalLlmMockHandler;
+			server = new LlmWireServer({
+				mockHandler,
+				rootToSubNode: new Map([['Agent', subNode]]),
+			});
+			const url = await server.start();
+
+			const response = await fetch(`${url}/eval/Agent/v1/chat/completions`, {
+				method: 'POST',
+				headers: { 'Content-Type': 'application/json' },
+				body: JSON.stringify({ model: 'gpt-4o', stream: true, messages: [] }),
+			});
+			// SDK clients on a 500 short-circuit before iterating the stream, so
+			// returning a JSON error envelope here keeps both streaming and
+			// non-streaming code paths happy.
+			expect(response.status).toBe(500);
+			// Pin the "(not SSE)" half of the test title: the error reply must not
+			// be labelled as an event stream. `?? ''` keeps the matcher valid when
+			// the header is absent.
+			expect(response.headers.get('content-type') ?? '').not.toMatch(/text\/event-stream/);
+			const body = (await response.json()) as { error: { message: string } };
+			expect(body.error.message).toContain('LLM offline');
+		});
+	});
+
+	// Non-streaming tool_calls: the same envelope shape the agent-side SDK
+	// expects when stream:false. SDKs use `finish_reason: 'tool_calls'` to
+	// branch into tool-execution; we must set it whenever tool_calls is present.
+	describe('POST /eval/:root/v1/chat/completions — tool_calls (non-streaming)', () => {
+		const subNode = makeSubNode({ name: 'OpenAI Chat Model' });
+
+		it('emits tool_calls + content:null + finish_reason: tool_calls on the message', async () => {
+			const mockHandler = jest.fn().mockResolvedValue({
+				body: {
+					tool_calls: [{ id: 'call_1', function: { name: 'lookup', arguments: '{"q":"hi"}' } }],
+				},
+				headers: {},
+				statusCode: 200,
+			}) as unknown as EvalLlmMockHandler;
+			server = new LlmWireServer({
+				mockHandler,
+				rootToSubNode: new Map([['Agent', subNode]]),
+			});
+			const url = await server.start();
+
+			const response = await postChatCompletion(url, '/eval/Agent/v1/chat/completions', {
+				model: 'gpt-4o',
+				messages: [{ role: 'user', content: 'lookup hi' }],
+				tools: [{ type: 'function', function: { name: 'lookup', parameters: { type: 'object' } } }],
+			});
+
+			expect(response.status).toBe(200);
+			const body = (await response.json()) as {
+				choices: Array<{
+					message: {
+						role: string;
+						content: string | null;
+						tool_calls: Array<{
+							id: string;
+							type: string;
+							function: { name: string; arguments: string };
+						}>;
+					};
+					finish_reason: string;
+				}>;
+			};
+			const choice = body.choices[0];
+			expect(choice.message.role).toBe('assistant');
+			expect(choice.message.content).toBeNull();
+			expect(choice.message.tool_calls[0]).toMatchObject({
+				id: 'call_1',
+				type: 'function',
+				function: { name: 'lookup', arguments: '{"q":"hi"}' },
+			});
+			expect(choice.finish_reason).toBe('tool_calls');
+		});
+	});
+
+	// `@langchain/openai` v1.3+ auto-routes Agent v3.1+ calls to /v1/responses
+	// instead of /v1/chat/completions. Verified empirically against a real
+	// LangChain Agent — without this route the SDK 404s.
+	describe('POST /eval/:root/v1/responses — Responses API', () => {
+		const subNode = makeSubNode({ name: 'OpenAI Chat Model' });
+
+		it('returns a `response` envelope with annotations:[] on output_text content', async () => {
+			const mockHandler = jest.fn().mockResolvedValue({
+				body: { output_text: 'hello via responses' },
+				headers: {},
+				statusCode: 200,
+			}) as unknown as EvalLlmMockHandler;
+			server = new LlmWireServer({
+				mockHandler,
+				rootToSubNode: new Map([['Agent', subNode]]),
+			});
+			const url = await server.start();
+
+			const response = await postChatCompletion(url, '/eval/Agent/v1/responses', {
+				model: 'gpt-4o',
+				input: [{ role: 'user', content: 'hi' }],
+			});
+
+			expect(response.status).toBe(200);
+			const body = (await response.json()) as {
+				object: string;
+				status: string;
+				output: Array<{
+					type: string;
+					content: Array<{ type: string; text: string; annotations: unknown[] }>;
+				}>;
+			};
+			expect(body.object).toBe('response');
+			expect(body.status).toBe('completed');
+			expect(body.output[0].type).toBe('message');
+			expect(body.output[0].content[0].text).toBe('hello via responses');
+			// Without `annotations: []`, the LangChain extractor throws
+			// "Cannot read properties of undefined (reading 'map')".
+			expect(body.output[0].content[0].annotations).toEqual([]);
+		});
+
+		it('emits a function_call output item when the mock handler returns tool_calls', async () => {
+			const mockHandler = jest.fn().mockResolvedValue({
+				body: {
+					tool_calls: [{ id: 'call_1', function: { name: 'lookup', arguments: '{"q":"x"}' } }],
+				},
+				headers: {},
+				statusCode: 200,
+			}) as unknown as EvalLlmMockHandler;
+			server = new LlmWireServer({
+				mockHandler,
+				rootToSubNode: new Map([['Agent', subNode]]),
+			});
+			const url = await server.start();
+
+			const response = await postChatCompletion(url, '/eval/Agent/v1/responses', {
+				model: 'gpt-4o',
+				input: [{ role: 'user', content: 'x' }],
+				tools: [{ type: 'function', name: 'lookup' }],
+			});
+
+			const body = (await response.json()) as {
+				output: Array<{ type: string; name?: string; call_id?: string; arguments?: string }>;
+			};
+			expect(body.output[0].type).toBe('function_call');
+			expect(body.output[0].name).toBe('lookup');
+			expect(body.output[0].call_id).toBe('call_1');
+			expect(body.output[0].arguments).toBe('{"q":"x"}');
+		});
+
+		it('streams response.* SSE events when stream:true', async () => {
+			const mockHandler = jest.fn().mockResolvedValue({
+				body: { output_text: 'streamed reply' },
+				headers: {},
+				statusCode: 200,
+			}) as unknown as EvalLlmMockHandler;
+			server = new LlmWireServer({
+				mockHandler,
+				rootToSubNode: new Map([['Agent', subNode]]),
+			});
+			const url = await server.start();
+
+			const response = await fetch(`${url}/eval/Agent/v1/responses`, {
+				method: 'POST',
+				headers: { 'Content-Type': 'application/json', Accept: 'text/event-stream' },
+				body: JSON.stringify({
+					model: 'gpt-4o',
+					stream: true,
+					input: [{ role: 'user', content: 'hi' }],
+				}),
+			});
+
+			expect(response.headers.get('content-type')).toMatch(/text\/event-stream/);
+			const text = await response.text();
+
+			// Responses API doesn't use `data: [DONE]` — the terminal is
+			// `response.completed`. Parse the event frames and assert ordering.
+			const events: string[] = [];
+			for (const block of text.split('\n\n')) {
+				const eventLine = block.split('\n').find((l) => l.startsWith('event: '));
+				if (eventLine) events.push(eventLine.slice('event: '.length));
+			}
+			expect(events[0]).toBe('response.created');
+			expect(events[events.length - 1]).toBe('response.completed');
+			expect(events).toContain('response.output_text.delta');
+		});
+
+		it('attributes the turn via onIntercept with the parsed root', async () => {
+			const intercepts: InterceptedTurn[] = [];
+			const mockHandler = jest.fn().mockResolvedValue({
+				body: { output_text: 'ok' },
+				headers: {},
+				statusCode: 200,
+			}) as unknown as EvalLlmMockHandler;
+			server = new LlmWireServer({
+				mockHandler,
+				rootToSubNode: new Map([['My Agent', subNode]]),
+				onIntercept: (t) => intercepts.push(t),
+			});
+			const url = await server.start();
+
+			await postChatCompletion(url, '/eval/My%20Agent/v1/responses', {
+				model: 'gpt-4o',
+				input: [],
+			});
+
+			expect(intercepts).toHaveLength(1);
+			expect(intercepts[0].rootName).toBe('My Agent');
+			// Reverse translator uses the canonical OpenAI URL so mock-handler's
+			// service/endpoint extraction derives `/v1/responses` correctly.
+			expect(intercepts[0].url).toBe('https://api.openai.com/v1/responses');
+		});
+
+		it('returns the loud-fail error envelope when no /eval/<root>/ prefix is used', async () => {
+			server = new LlmWireServer();
+			const url = await server.start();
+
+			const response = await postChatCompletion(url, '/v1/responses', {
+				model: 'gpt-4o',
+				input: [],
+			});
+			const body = (await response.json()) as { error: { message: string } };
+			expect(response.status).toBe(500);
+			expect(body.error.message).toContain('/eval/<root>/');
+		});
+
+		it('uses the stub envelope when no mock handler is attached', async () => {
+			server = new LlmWireServer();
+			const url = await server.start();
+
+			const response = await postChatCompletion(url, '/eval/Agent/v1/responses', {
+				model: 'gpt-4o',
+				input: [],
+			});
+			const body = (await response.json()) as {
+				output: Array<{ content: Array<{ text: string }> }>;
+			};
+			expect(body.output[0].content[0].text).toContain('eval wire server stub');
+		});
+	});
 });
--- a/packages/cli/src/modules/instance-ai/eval/tests/m3-fixtures.test.ts
+++ b/packages/cli/src/modules/instance-ai/eval/tests/m3-fixtures.test.ts
@ -0,0 +1,496 @@
+import type { EvalLlmMockHandler, EvalMockHttpResponse } from 'n8n-core';
+import type {
+	ICredentialDataDecryptedObject,
+	ICredentialsHelper,
+	IExecuteData,
+	IHttpRequestOptions,
+	INode,
+	INodeCredentialsDetails,
+	IWorkflowExecuteAdditionalData,
+} from 'n8n-workflow';
+
+import { EvalMockedCredentialsHelper } from '../eval-mocked-credentials-helper';
+import { type InterceptedTurn, LlmWireServer } from '../llm-wire-server';
+
+/**
+ * Integration-shaped unit test exercising credential rewrite + path-based
+ * root attribution + envelope correctness end-to-end. Boots a real
+ * `LlmWireServer` on a loopback port, instantiates a real
+ * `EvalMockedCredentialsHelper`, scripts mock-handler responses turn-by-turn,
+ * and drives the Agent loop with raw `fetch`. Envelope shape is locked down
+ * separately in `llm-wire-server.test.ts` and `openai-envelope.test.ts`.
+ *
+ *  - **Mechanism** — tool IS connected. Asserts the ledger ends with model
+ *    turns attributed to the Agent root and tool HTTP attributed to the tool
+ *    node, with no cross-contamination.
+ *  - **Regression-catch** — tool is disconnected. With un-pinning the eval
+ *    must fail because the Agent's mocked output can't produce the tool-
+ *    shaped result the grader expects. A counterfactual passes when the
+ *    tool IS connected, proving the check is meaningful.
+ */
+describe('M3 fixtures — Agent + Chat Model + HTTP tool + MemoryBufferWindow', () => {
+	// Chat-model sub-node fixture. The harness registers it with the wire server
+	// as the sub-node behind `rootName`, and its name is the calling node passed
+	// when credentials are resolved.
+	const llmSubNode: INode = {
+		id: 'sub-1',
+		name: 'OpenAI Chat Model',
+		type: '@n8n/n8n-nodes-langchain.lmChatOpenAi',
+		typeVersion: 1,
+		position: [0, 0],
+		parameters: { model: 'gpt-4o-mini' },
+	};
+	// HTTP tool fixture: the node identity handed to `toolHttpInterceptor` so
+	// tool traffic is recorded under the tool's own name.
+	const toolNode: INode = {
+		id: 'tool-1',
+		name: 'Get Order Status Tool',
+		type: 'n8n-nodes-base.httpRequestTool',
+		typeVersion: 1,
+		position: [200, 0],
+		parameters: { url: 'https://orders.example.com/v1/orders/{{ $fromAI("orderId") }}' },
+	};
+	// Root node name; the harness maps it to `llmSubNode` and `llmSubNode.name` back to it.
+	const rootName = 'Agent';
+
+	/**
+	 * Build a stand-in for the wrapped ("inner") credentials helper. Every
+	 * method is a jest mock; `getDecrypted` resolves to the supplied
+	 * `credentials`, and the two list-returning methods return `[]`.
+	 */
+	function makeInnerHelper(credentials: ICredentialDataDecryptedObject): ICredentialsHelper {
+		const emptyListMock = () => jest.fn().mockReturnValue([]);
+		const innerHelper = {
+			getParentTypes: emptyListMock(),
+			authenticate: jest.fn(),
+			preAuthentication: jest.fn(),
+			runPreAuthentication: jest.fn(),
+			getCredentials: jest.fn(),
+			getDecrypted: jest.fn().mockResolvedValue(credentials),
+			updateCredentials: jest.fn(),
+			updateCredentialsOauthTokenData: jest.fn(),
+			getCredentialsProperties: emptyListMock(),
+		};
+		return innerHelper as ICredentialsHelper;
+	}
+
+	/**
+	 * Resolve the `openAiApi` credential through `helper` on behalf of
+	 * `callingSubNodeName`, then POST `requestBody` as JSON to
+	 * `<resolved credential url>/chat/completions`.
+	 *
+	 * The resolved URL is checked against `serverBaseUrl` BEFORE the request is
+	 * sent. If the credential was not rewritten, the test fails here instead of
+	 * first issuing a request to whatever host the credential still points at.
+	 *
+	 * @returns the resolved base URL, the raw `Response`, and its parsed JSON body.
+	 */
+	async function postViaRewrittenCredentials(
+		helper: EvalMockedCredentialsHelper,
+		serverBaseUrl: string,
+		requestBody: unknown,
+		callingSubNodeName: string,
+	): Promise<{ rewrittenUrl: string; response: Response; body: Record<string, unknown> }> {
+		const cred = await helper.getDecrypted(
+			{} as IWorkflowExecuteAdditionalData,
+			{ id: 'cred-1', name: 'OpenAI' } as INodeCredentialsDetails,
+			'openAiApi',
+			'manual',
+			{ node: { name: callingSubNodeName, id: 'n' } as INode } as IExecuteData,
+		);
+
+		const baseUrl = String(cred.url);
+		// Guard first: an un-rewritten credential must never reach the network.
+		expect(baseUrl.startsWith(serverBaseUrl)).toBe(true);
+
+		const response = await fetch(`${baseUrl}/chat/completions`, {
+			method: 'POST',
+			headers: { 'Content-Type': 'application/json' },
+			body: JSON.stringify(requestBody),
+		});
+		const body = (await response.json()) as Record<string, unknown>;
+		return { rewrittenUrl: baseUrl, response, body };
+	}
+
+	/**
+	 * Assemble the eval-side wiring that every M3 fixture runs against:
+	 *   - a real LlmWireServer whose mockHandler replays `scriptedResponses` in order
+	 *   - a real EvalMockedCredentialsHelper wired to the sub-node → root map
+	 *   - two ledgers: `modelTurns` (wire-server intercepts) and `toolHttpCalls`
+	 *
+	 * The model-turn ledger mirrors what `execution.service.ts`'s
+	 * `recordWireServerTurn` writes; the tool-HTTP ledger mirrors what its
+	 * `createInterceptingHandler` writes. Keeping the two apart is what the
+	 * M3 mechanism fixture proves.
+	 *
+	 * The wire server is already started when this resolves; each test stops
+	 * it in its own `finally`.
+	 */
+	async function bootM3Harness() {
+		const modelTurns: InterceptedTurn[] = [];
+		const toolHttpCalls: Array<{ nodeName: string; url: string; mockResponse: unknown }> = [];
+
+		// FIFO script consumed by the mock handler, one entry per call. The
+		// mechanism case queues a sequence of returns; the value/regression
+		// case queues a single "plain content" return that lacks the
+		// tool-shaped output the grader looks for.
+		const scriptedResponses: EvalMockHttpResponse[] = [];
+		const mockHandler = jest
+			.fn<Promise<EvalMockHttpResponse>, Parameters<EvalLlmMockHandler>>()
+			.mockImplementation(async () => {
+				const upcoming = scriptedResponses.shift();
+				if (upcoming) return upcoming;
+				throw new Error(
+					'M3 fixture mock handler ran out of scripted responses — fixture script is wrong',
+				);
+			});
+
+		const subNodeByRoot = new Map([[rootName, llmSubNode]]);
+		const wireServer = new LlmWireServer({
+			mockHandler,
+			rootToSubNode: subNodeByRoot,
+			onIntercept: (turn) => modelTurns.push(turn),
+		});
+		await wireServer.start();
+
+		const innerHelper = makeInnerHelper({ apiKey: 'sk-real', url: 'https://api.openai.com/v1' });
+		const rootBySubNodeName = new Map([[llmSubNode.name, rootName]]);
+		const helper = new EvalMockedCredentialsHelper(
+			innerHelper,
+			wireServer.url,
+			undefined,
+			rootBySubNodeName,
+		);
+
+		// Mirror of `execution.service.ts:createInterceptingHandler` for the
+		// tool side — records each call under the identity of the node passed in
+		// and answers with a fixed order payload.
+		const toolHttpInterceptor = async (
+			request: IHttpRequestOptions,
+			node: INode,
+		): Promise<EvalMockHttpResponse> => {
+			const body = {
+				orderId: 'ORD-42',
+				status: 'shipped',
+				eta: '2026-05-25T00:00:00Z',
+			};
+			toolHttpCalls.push({ nodeName: node.name, url: request.url, mockResponse: body });
+			return {
+				body,
+				headers: { 'content-type': 'application/json' },
+				statusCode: 200,
+			};
+		};
+
+		return {
+			wireServer,
+			helper,
+			scriptedResponses,
+			modelTurns,
+			toolHttpCalls,
+			toolHttpInterceptor,
+			mockHandler,
+		};
+	}
+
+	// ── M3 mechanism ────────────────────────────────────────────────────
+
+	describe('mechanism (tool connected to Agent)', () => {
+		it('drives a full Agent loop: tool_calls turn → tool HTTP → follow-up turn → final answer', async () => {
+			// Shapes of the parts of the chat-completion envelope this test reads.
+			type ToolCall = { id: string; function: { name: string; arguments: string } };
+			type Choice = {
+				message: { content: string | null; tool_calls?: ToolCall[] };
+				finish_reason: string;
+			};
+
+			const harness = await bootM3Harness();
+			try {
+				// Turn 1: Agent posts with tools array; wire server's mock handler
+				// returns a tool_calls envelope.
+				harness.scriptedResponses.push({
+					body: {
+						tool_calls: [
+							{
+								id: 'call_1',
+								function: { name: 'get_order_status', arguments: '{"orderId":"ORD-42"}' },
+							},
+						],
+					},
+					headers: { 'content-type': 'application/json' },
+					statusCode: 200,
+				});
+				// Turn 2: Agent re-posts with the tool result; mock returns the
+				// final natural-language answer.
+				harness.scriptedResponses.push({
+					body: {
+						content: 'Your order ORD-42 has shipped and arrives 2026-05-25.',
+					},
+					headers: { 'content-type': 'application/json' },
+					statusCode: 200,
+				});
+
+				const turn1 = await postViaRewrittenCredentials(
+					harness.helper,
+					harness.wireServer.url,
+					{
+						model: 'gpt-4o-mini',
+						messages: [{ role: 'user', content: 'Where is my order ORD-42?' }],
+						tools: [
+							{
+								type: 'function',
+								function: {
+									name: 'get_order_status',
+									description: 'Look up an order by id',
+									parameters: { type: 'object' },
+								},
+							},
+						],
+					},
+					llmSubNode.name,
+				);
+
+				const choice1 = (turn1.body.choices as Choice[])[0];
+				expect(choice1.finish_reason).toBe('tool_calls');
+				expect(choice1.message.tool_calls?.[0].function.name).toBe('get_order_status');
+				// Typed parse: keeps `any` out of the assertion and the URL template below.
+				const toolCallArgs = JSON.parse(
+					choice1.message.tool_calls?.[0].function.arguments ?? '{}',
+				) as { orderId: string };
+				expect(toolCallArgs).toEqual({ orderId: 'ORD-42' });
+
+				// Tool runs — `helpers.httpRequest` interception fires. The
+				// nodeType is the tool's `httpRequestTool`, not the Agent.
+				const toolResult = await harness.toolHttpInterceptor(
+					{
+						url: `https://orders.example.com/v1/orders/${toolCallArgs.orderId}`,
+						method: 'GET',
+					},
+					toolNode,
+				);
+
+				// Turn 2: Agent threads the tool result back into messages and
+				// asks the model for a final answer. This mirrors what
+				// `AgentExecutor` does between tool calls and final response.
+				const turn2 = await postViaRewrittenCredentials(
+					harness.helper,
+					harness.wireServer.url,
+					{
+						model: 'gpt-4o-mini',
+						messages: [
+							{ role: 'user', content: 'Where is my order ORD-42?' },
+							{
+								role: 'assistant',
+								content: null,
+								tool_calls: choice1.message.tool_calls,
+							},
+							{
+								role: 'tool',
+								tool_call_id: 'call_1',
+								content: JSON.stringify(toolResult.body),
+							},
+						],
+					},
+					llmSubNode.name,
+				);
+
+				const choice2 = (turn2.body.choices as Choice[])[0];
+				expect(choice2.finish_reason).toBe('stop');
+				expect(choice2.message.content).toContain('ORD-42');
+				expect(choice2.message.content).toContain('shipped');
+
+				// Ledger assertions — the headline M3 split.
+				expect(harness.modelTurns).toHaveLength(2);
+				expect(harness.modelTurns.every((t) => t.rootName === rootName)).toBe(true);
+				expect(harness.modelTurns.every((t) => t.nodeType === llmSubNode.type)).toBe(true);
+
+				expect(harness.toolHttpCalls).toHaveLength(1);
+				expect(harness.toolHttpCalls[0].nodeName).toBe(toolNode.name);
+				expect(harness.toolHttpCalls[0].url).toContain('orders.example.com');
+
+				// Cross-check: tool HTTP didn't leak into model-turn attribution.
+				const modelUrls = harness.modelTurns.map((t) => t.url);
+				expect(modelUrls.every((u) => u.includes('api.openai.com'))).toBe(true);
+			} finally {
+				// Always release the loopback port, even when an assertion throws.
+				await harness.wireServer.stop();
+			}
+		});
+
+		it('passes the connected tools array through to the mock handler', async () => {
+			// Tool-list awareness: the wire server forwards the inbound body as-is,
+			// so the mock handler can read the request's `tools` array from
+			// `req.body.tools` and emit a realistic tool_calls block. This is the
+			// "hard-coded tool-list awareness in the wire-server prompt"
+			// behaviour from the spec.
+			const advertisedTools = [
+				{
+					type: 'function',
+					function: { name: 'get_order_status', parameters: { type: 'object' } },
+				},
+			];
+			const harness = await bootM3Harness();
+			try {
+				harness.scriptedResponses.push({ body: { content: 'ok' }, headers: {}, statusCode: 200 });
+
+				await postViaRewrittenCredentials(
+					harness.helper,
+					harness.wireServer.url,
+					{
+						model: 'gpt-4o-mini',
+						messages: [{ role: 'user', content: 'hi' }],
+						tools: advertisedTools,
+					},
+					llmSubNode.name,
+				);
+
+				expect(harness.mockHandler).toHaveBeenCalledTimes(1);
+				// First positional argument of the only call is the request options.
+				const firstCallArgs = harness.mockHandler.mock.calls[0];
+				const forwardedBody = firstCallArgs[0].body as {
+					tools?: Array<{ function: { name: string } }>;
+				};
+				expect(forwardedBody.tools).toBeDefined();
+				expect(forwardedBody.tools?.[0].function.name).toBe('get_order_status');
+			} finally {
+				await harness.wireServer.stop();
+			}
+		});
+	});
+
+	// ── M3 value (regression-catch fixture) ─────────────────────────────
+
+	describe('value / regression-catch (tool disconnected from Agent)', () => {
+		// Substring grader — a deliberately lightweight stand-in for whatever the
+		// real eval grader does downstream. A final answer passes only when it
+		// is a string containing `ORD-42` and (case-insensitively) `shipped`.
+		// Both together can only appear when the Agent (a) saw the user's order
+		// id AND (b) saw the tool's HTTP response (`{ status: 'shipped' }`);
+		// plain-text content without the tool result fails. The substring shape
+		// is intentionally simple — a more structural schema check would be a
+		// Tier 5 follow-up (`MockHints.toolHints` quality work). The contract
+		// this fixture proves is "the spike makes the grader fail when pinning
+		// would have hidden the regression", not "this is a production-grade
+		// grader".
+		function graderCheck(finalAnswer: unknown): { passed: boolean; reason?: string } {
+			if (typeof finalAnswer !== 'string') {
+				return { passed: false, reason: 'final answer was not a string' };
+			}
+			const mentionsOrderAndStatus =
+				finalAnswer.includes('ORD-42') && finalAnswer.toLowerCase().includes('shipped');
+			return mentionsOrderAndStatus
+				? { passed: true }
+				: {
+						passed: false,
+						reason: `grader expected order id + status substrings; got: ${JSON.stringify(finalAnswer)}`,
+					};
+		}
+
+		it('the grader fails when the Agent has no tool connection — only the spike catches this', async () => {
+			const harness = await bootM3Harness();
+			try {
+				// Mock handler returns plain content WITHOUT a tool_calls block
+				// (because the disconnected workflow has no tools to call).
+				// The Agent gives up and emits an apology — the grader sees
+				// none of the tool-derived fields and reports failure.
+				const apology =
+					"I'd love to help, but I don't have an order-lookup tool available right now.";
+				harness.scriptedResponses.push({
+					body: { content: apology },
+					headers: { 'content-type': 'application/json' },
+					statusCode: 200,
+				});
+
+				// IMPORTANT: the request carries no `tools` array — the tool is disconnected.
+				const requestWithoutTools = {
+					model: 'gpt-4o-mini',
+					messages: [{ role: 'user', content: 'Where is my order ORD-42?' }],
+				};
+				const turn = await postViaRewrittenCredentials(
+					harness.helper,
+					harness.wireServer.url,
+					requestWithoutTools,
+					llmSubNode.name,
+				);
+
+				const [choice] = turn.body.choices as Array<{
+					message: { content: string };
+					finish_reason: string;
+				}>;
+				expect(choice.finish_reason).toBe('stop');
+
+				// This is the M3 value assertion — pinning today would pass;
+				// the spike must fail because the Agent's mocked output can't
+				// produce the substrings the grader expects (which only
+				// appear once the tool's HTTP response threads back through
+				// turn 2 — see the counterfactual test below).
+				const { passed, reason } = graderCheck(choice.message.content);
+				expect(passed).toBe(false);
+				expect(reason).toContain('order id + status');
+
+				// No tool HTTP fired — confirms the tool was actually disconnected.
+				expect(harness.toolHttpCalls).toHaveLength(0);
+
+				// Model turn ran (this is the headline behavioural delta vs.
+				// today's pinned path, where no model turn would fire at all).
+				expect(harness.modelTurns).toHaveLength(1);
+			} finally {
+				await harness.wireServer.stop();
+			}
+		});
+
+		// Counterfactual: the same grader passes for the connected fixture.
+		// Without this assertion, the regression-catch could be a false
+		// negative (a perpetually-failing grader proves nothing). The flow
+		// matches the mechanism test: turn 1 must surface a tool call, the tool
+		// HTTP must fire, and turn 2 carries the tool's actual mocked payload.
+		it('the grader passes when the tool IS connected — confirms the check is meaningful', async () => {
+			const harness = await bootM3Harness();
+			try {
+				harness.scriptedResponses.push({
+					body: {
+						tool_calls: [
+							{
+								id: 'call_1',
+								function: { name: 'get_order_status', arguments: '{"orderId":"ORD-42"}' },
+							},
+						],
+					},
+					headers: { 'content-type': 'application/json' },
+					statusCode: 200,
+				});
+				harness.scriptedResponses.push({
+					body: { content: 'Your order ORD-42 has shipped — eta 2026-05-25.' },
+					headers: { 'content-type': 'application/json' },
+					statusCode: 200,
+				});
+
+				// Turn 1.
+				const turn1 = await postViaRewrittenCredentials(
+					harness.helper,
+					harness.wireServer.url,
+					{
+						model: 'gpt-4o-mini',
+						messages: [{ role: 'user', content: 'Where is my order ORD-42?' }],
+						tools: [
+							{
+								type: 'function',
+								function: { name: 'get_order_status', parameters: { type: 'object' } },
+							},
+						],
+					},
+					llmSubNode.name,
+				);
+
+				const choice1 = (
+					turn1.body.choices as Array<{
+						message: { tool_calls?: Array<{ id: string }> };
+					}>
+				)[0];
+				// Turn 2 replays these tool calls; make sure there is something to replay.
+				expect(choice1.message.tool_calls).toBeDefined();
+
+				const toolResult = await harness.toolHttpInterceptor(
+					{ url: 'https://orders.example.com/v1/orders/ORD-42', method: 'GET' },
+					toolNode,
+				);
+				// Mirror image of the disconnected case, which asserts zero tool calls.
+				expect(harness.toolHttpCalls).toHaveLength(1);
+
+				// Turn 2 — thread the tool's mocked payload back, as the mechanism test does.
+				const turn2 = await postViaRewrittenCredentials(
+					harness.helper,
+					harness.wireServer.url,
+					{
+						model: 'gpt-4o-mini',
+						messages: [
+							{ role: 'user', content: 'Where is my order ORD-42?' },
+							{
+								role: 'assistant',
+								content: null,
+								tool_calls: choice1.message.tool_calls,
+							},
+							{
+								role: 'tool',
+								tool_call_id: 'call_1',
+								content: JSON.stringify(toolResult.body),
+							},
+						],
+					},
+					llmSubNode.name,
+				);
+
+				const choice2 = (turn2.body.choices as Array<{ message: { content: string } }>)[0];
+
+				expect(graderCheck(choice2.message.content).passed).toBe(true);
+			} finally {
+				await harness.wireServer.stop();
+			}
+		});
+	});
+});
--- a/packages/cli/src/modules/instance-ai/eval/tests/openai-envelope.test.ts
+++ b/packages/cli/src/modules/instance-ai/eval/tests/openai-envelope.test.ts
@ -3,7 +3,10 @@ import type { EvalMockHttpResponse } from 'n8n-core';
 import {
 	buildOpenAiErrorEnvelope,
 	extractRequestModel,
+	extractToolCalls,
 	forwardTranslateToChatCompletion,
+	forwardTranslateToSseChunks,
+	isStreamRequested,
 	reverseTranslateOpenAiRequest,
 } from '../openai-envelope';

@ -63,6 +66,122 @@ describe('extractRequestModel', () => {
 	});
 });

+describe('isStreamRequested', () => {
+	it('returns true only when stream === true', () => {
+		const verdict = isStreamRequested({ stream: true });
+		expect(verdict).toBe(true);
+	});
+
+	it('returns false for missing, false, or truthy-non-true values', () => {
+		// Flag absent or explicitly off.
+		expect(isStreamRequested({})).toBe(false);
+		expect(isStreamRequested({ stream: false })).toBe(false);
+		// Truthy, but not the boolean `true`.
+		expect(isStreamRequested({ stream: 1 })).toBe(false);
+		expect(isStreamRequested({ stream: 'true' })).toBe(false);
+		// No body at all.
+		expect(isStreamRequested(undefined)).toBe(false);
+		expect(isStreamRequested(null)).toBe(false);
+	});
+});
+
+describe('extractToolCalls', () => {
+	it('returns an empty list when no tool calls are present', () => {
+		expect(extractToolCalls(undefined)).toEqual([]);
+		expect(extractToolCalls(null)).toEqual([]);
+		expect(extractToolCalls({})).toEqual([]);
+		expect(extractToolCalls({ content: 'just text' })).toEqual([]);
+	});
+
+	it('normalizes the OpenAI-native tool_calls shape', () => {
+		const nativeCall = {
+			id: 'call_1',
+			function: { name: 'get_weather', arguments: '{"city":"Paris"}' },
+		};
+
+		const calls = extractToolCalls({ tool_calls: [nativeCall] });
+
+		// The nested `function` wrapper is flattened into `{ id, name, arguments }`.
+		expect(calls).toEqual([{ id: 'call_1', name: 'get_weather', arguments: '{"city":"Paris"}' }]);
+	});
+
+	it('generates a synthetic id when none is provided', () => {
+		const calls = extractToolCalls({
+			tool_calls: [{ function: { name: 'foo', arguments: '{}' } }],
+		});
+
+		expect(calls).toHaveLength(1);
+		const [generated] = calls;
+		expect(generated.id).toMatch(/^call_[a-f0-9]+$/);
+		expect(generated.name).toBe('foo');
+	});
+
+	it('coerces object arguments to JSON strings (SDKs require strings)', () => {
+		const objectArguments = { city: 'Paris' };
+
+		const calls = extractToolCalls({
+			tool_calls: [{ function: { name: 'foo', arguments: objectArguments } }],
+		});
+
+		expect(calls[0].arguments).toBe('{"city":"Paris"}');
+	});
+
+	it('defaults arguments to "{}" when missing or null', () => {
+		const withoutArguments = { function: { name: 'foo' } };
+		const withNullArguments = { function: { name: 'bar', arguments: null } };
+
+		const calls = extractToolCalls({ tool_calls: [withoutArguments, withNullArguments] });
+
+		expect(calls[0].arguments).toBe('{}');
+		expect(calls[1].arguments).toBe('{}');
+	});
+
+	it('accepts the `{ name, arguments }` shorthand', () => {
+		const calls = extractToolCalls({
+			tool_calls: [{ name: 'shorthand', arguments: '{"a":1}' }],
+		});
+
+		// `objectContaining` leaves the generated id unconstrained.
+		const expectedCall = expect.objectContaining({ name: 'shorthand', arguments: '{"a":1}' });
+		expect(calls).toEqual([expectedCall]);
+	});
+
+	it('unwraps tool calls nested under a choices envelope', () => {
+		const nestedCall = { id: 'call_2', function: { name: 'lookup', arguments: '{}' } };
+
+		const calls = extractToolCalls({
+			choices: [{ message: { tool_calls: [nestedCall] } }],
+		});
+
+		expect(calls).toHaveLength(1);
+		expect(calls[0].name).toBe('lookup');
+	});
+
+	it('extracts a single-tool shorthand under `tool`', () => {
+		const calls = extractToolCalls({
+			tool: { name: 'single', arguments: '{"x":1}' },
+		});
+
+		const expectedCall = expect.objectContaining({ name: 'single', arguments: '{"x":1}' });
+		expect(calls).toEqual([expectedCall]);
+	});
+
+	it('handles multiple tool calls', () => {
+		const calls = extractToolCalls({
+			tool_calls: [
+				{ id: 'a', function: { name: 'one', arguments: '{}' } },
+				{ id: 'b', function: { name: 'two', arguments: '{}' } },
+			],
+		});
+
+		// Input order is preserved for both names and ids.
+		const names = calls.map((call) => call.name);
+		const ids = calls.map((call) => call.id);
+		expect(names).toEqual(['one', 'two']);
+		expect(ids).toEqual(['a', 'b']);
+	});
+
+	it('skips entries without a function name', () => {
+		const nameless = { id: 'a', function: { arguments: '{}' } };
+		const named = { id: 'b', function: { name: 'kept', arguments: '{}' } };
+
+		const calls = extractToolCalls({ tool_calls: [nameless, named] });
+
+		expect(calls).toHaveLength(1);
+		expect(calls[0].name).toBe('kept');
+	});
+});
+
 describe('forwardTranslateToChatCompletion', () => {
 	function mockResponse(body: unknown): EvalMockHttpResponse {
 		return {
@ -180,6 +299,231 @@ describe('forwardTranslateToChatCompletion', () => {

 		expect(envelope.model).toBe('gpt-5');
 	});
+
+	it('emits tool_calls on the assistant message when the body contains them', () => {
+		// Shape of `choices[0]` as read by the assertions below.
+		type AssistantChoice = {
+			message: {
+				role: string;
+				content: string | null;
+				tool_calls?: Array<{
+					id: string;
+					type: string;
+					function: { name: string; arguments: string };
+				}>;
+			};
+			finish_reason: string;
+		};
+		const scripted = mockResponse({
+			tool_calls: [
+				{ id: 'call_1', function: { name: 'get_weather', arguments: '{"city":"Paris"}' } },
+			],
+		});
+
+		const envelope = forwardTranslateToChatCompletion(scripted, 'gpt-4o');
+
+		const [choice] = envelope.choices as AssistantChoice[];
+		const { message } = choice;
+		expect(message.role).toBe('assistant');
+		// Tool-call envelopes require content === null — SDKs reject content + tool_calls.
+		expect(message.content).toBeNull();
+		expect(message.tool_calls).toEqual([
+			{
+				id: 'call_1',
+				type: 'function',
+				function: { name: 'get_weather', arguments: '{"city":"Paris"}' },
+			},
+		]);
+		expect(choice.finish_reason).toBe('tool_calls');
+	});
+
+	it('emits multiple tool_calls when several are present', () => {
+		const scripted = mockResponse({
+			tool_calls: [
+				{ id: 'a', function: { name: 'one', arguments: '{}' } },
+				{ id: 'b', function: { name: 'two', arguments: '{}' } },
+			],
+		});
+
+		const envelope = forwardTranslateToChatCompletion(scripted, 'gpt-4o');
+
+		const [choice] = envelope.choices as Array<{
+			message: { tool_calls?: Array<{ id: string }> };
+			finish_reason: string;
+		}>;
+		// Both scripted calls land on the single assistant message.
+		expect(choice.message.tool_calls).toHaveLength(2);
+		expect(choice.finish_reason).toBe('tool_calls');
+	});
+});
+
+describe('forwardTranslateToSseChunks', () => {
+	/** Wrap `body` in a 200 JSON mock response, the input shape the translator takes. */
+	function mockResponse(body: unknown): EvalMockHttpResponse {
+		const headers = { 'content-type': 'application/json' };
+		return { body, headers, statusCode: 200 };
+	}
+
+	it('emits an opening role chunk, a content chunk, and a finish_reason chunk', () => {
+		const chunks = forwardTranslateToSseChunks(mockResponse({ content: 'hello' }), 'gpt-4o');
+
+		expect(chunks.length).toBeGreaterThanOrEqual(3);
+
+		// First chunk announces the assistant role.
+		const [opener] = chunks;
+		const openerDelta = (opener.choices as Array<{ delta: { role?: string } }>)[0].delta;
+		expect(openerDelta.role).toBe('assistant');
+
+		// Some chunk carries the full text as its content delta.
+		const carriesHello = (chunk: (typeof chunks)[number]) =>
+			(chunk.choices as Array<{ delta: { content?: string } }>)[0].delta.content === 'hello';
+		expect(chunks.find(carriesHello)).toBeDefined();
+
+		// Last chunk closes the turn.
+		const lastChunk = chunks[chunks.length - 1];
+		const [lastChoice] = lastChunk.choices as Array<{ finish_reason: string }>;
+		expect(lastChoice.finish_reason).toBe('stop');
+	});
+
+	it('every chunk carries the canonical object discriminator', () => {
+		const chunks = forwardTranslateToSseChunks(mockResponse({ content: 'hi' }), 'gpt-4o');
+
+		// Checked chunk by chunk so a failure points at the offending one.
+		chunks.forEach((chunk) => {
+			expect(chunk.object).toBe('chat.completion.chunk');
+		});
+	});
+
+	it('every chunk shares the same id and created timestamp', () => {
+		const chunks = forwardTranslateToSseChunks(mockResponse({ content: 'hi' }), 'gpt-4o');
+
+		// Collapsing each field into a Set leaves one entry iff all chunks agree.
+		const distinctIds = new Set(chunks.map((chunk) => chunk.id));
+		const distinctCreated = new Set(chunks.map((chunk) => chunk.created));
+
+		expect(distinctIds.size).toBe(1);
+		expect(distinctCreated.size).toBe(1);
+	});
+
+	it('emits tool_calls with first-chunk id+name then arg-stream chunks then a tool_calls terminal', () => {
+		// One streamed tool-call entry; only `index` is present on every chunk.
+		type ToolCallDelta = {
+			index: number;
+			id?: string;
+			type?: string;
+			function?: { name?: string; arguments?: string };
+		};
+		type ToolCallChoice = { delta: { tool_calls?: ToolCallDelta[] } };
+
+		const scripted = mockResponse({
+			tool_calls: [
+				{
+					id: 'call_xyz',
+					function: { name: 'get_weather', arguments: '{"city":"Paris"}' },
+				},
+			],
+		});
+		const chunks = forwardTranslateToSseChunks(scripted, 'gpt-4o');
+
+		// Opening role chunk + first-chunk (id+name) + args-chunk + terminal = 4.
+		expect(chunks).toHaveLength(4);
+		const [openerChunk, announceChunk, argsChunk, terminalChunk] = chunks;
+
+		const openerDelta = (openerChunk.choices as Array<{ delta: Record<string, unknown> }>)[0]
+			.delta;
+		expect(openerDelta.role).toBe('assistant');
+		// SDK reducers expect content: null when the turn will emit tool_calls.
+		expect(openerDelta.content).toBeNull();
+
+		const announceDelta = (announceChunk.choices as ToolCallChoice[])[0].delta;
+		expect(announceDelta.tool_calls?.[0]).toMatchObject({
+			index: 0,
+			id: 'call_xyz',
+			type: 'function',
+			function: { name: 'get_weather', arguments: '' },
+		});
+
+		const argsDelta = (argsChunk.choices as ToolCallChoice[])[0].delta;
+		// Arg-stream chunk MUST set `index` (SDKs use it to identify the slot)
+		// but MUST NOT repeat `id` or `function.name` (only the first chunk owns those).
+		expect(argsDelta.tool_calls?.[0].index).toBe(0);
+		expect(argsDelta.tool_calls?.[0].function?.arguments).toBe('{"city":"Paris"}');
+		const argsEntry = argsDelta.tool_calls?.[0] as ToolCallDelta;
+		expect(argsEntry.id).toBeUndefined();
+		expect(argsEntry.function?.name).toBeUndefined();
+
+		const [terminalChoice] = terminalChunk.choices as Array<{ finish_reason: string }>;
+		expect(terminalChoice.finish_reason).toBe('tool_calls');
+	});
+
+	it('emits the empty-arguments tool call without an arg-stream chunk', () => {
+		const scripted = mockResponse({
+			tool_calls: [{ id: 'call_1', function: { name: 'noop', arguments: '' } }],
+		});
+
+		const chunks = forwardTranslateToSseChunks(scripted, 'gpt-4o');
+
+		// opener + first-chunk(id+name) + terminal = 3 — no args slice.
+		expect(chunks).toHaveLength(3);
+		const announceChunk = chunks[1];
+		const terminalChunk = chunks[2];
+
+		const [announceChoice] = announceChunk.choices as Array<{
+			delta: { tool_calls?: unknown[] };
+		}>;
+		expect(announceChoice.delta.tool_calls).toBeDefined();
+
+		const [terminalChoice] = terminalChunk.choices as Array<{ finish_reason: string }>;
+		expect(terminalChoice.finish_reason).toBe('tool_calls');
+	});
+
+	it('emits two first-chunks (one per tool) for multi-tool responses', () => {
+		const scripted = mockResponse({
+			tool_calls: [
+				{ id: 'a', function: { name: 'one', arguments: '{"a":1}' } },
+				{ id: 'b', function: { name: 'two', arguments: '{"b":2}' } },
+			],
+		});
+		const chunks = forwardTranslateToSseChunks(scripted, 'gpt-4o');
+
+		// Walk the stream in order and keep the id of every tool-call entry
+		// that carries one; entries without a string id are ignored.
+		const announcedIds: string[] = [];
+		for (const chunk of chunks) {
+			const [choice] = chunk.choices as Array<{
+				delta: { tool_calls?: Array<{ id?: string }> };
+			}>;
+			for (const entry of choice.delta.tool_calls ?? []) {
+				if (typeof entry.id === 'string') announcedIds.push(entry.id);
+			}
+		}
+		expect(announcedIds).toEqual(['a', 'b']);
+
+		const lastChunk = chunks[chunks.length - 1];
+		const [lastChoice] = lastChunk.choices as Array<{ finish_reason: string }>;
+		expect(lastChoice.finish_reason).toBe('tool_calls');
+	});
+
+	it('streams empty content as the terminal finish_reason chunk only (no content chunk)', () => {
+		const chunks = forwardTranslateToSseChunks(mockResponse({ content: '' }), 'gpt-4o');
+
+		// opener + terminal = 2.
+		expect(chunks).toHaveLength(2);
+		const lastChunk = chunks[chunks.length - 1];
+		const [lastChoice] = lastChunk.choices as Array<{ finish_reason: string }>;
+		expect(lastChoice.finish_reason).toBe('stop');
+	});
+
+	it('uses the provided model verbatim across all chunks', () => {
+		const chunks = forwardTranslateToSseChunks(mockResponse({ content: 'hi' }), 'gpt-5');
+		// No chunk may report a model other than the one requested.
+		const mismatched = chunks.filter((chunk) => chunk.model !== 'gpt-5');
+		expect(mismatched).toHaveLength(0);
+	});
 });

 describe('buildOpenAiErrorEnvelope', () => {
--- a/packages/cli/src/modules/instance-ai/eval/tests/openai-responses-envelope.test.ts
+++ b/packages/cli/src/modules/instance-ai/eval/tests/openai-responses-envelope.test.ts
@ -0,0 +1,367 @@
+import type { EvalMockHttpResponse } from 'n8n-core';
+
+import {
+	buildResponsesErrorEnvelope,
+	extractResponsesRequestModel,
+	forwardTranslateToResponsesEnvelope,
+	forwardTranslateToResponsesSseEvents,
+	isResponsesStreamRequested,
+	reverseTranslateOpenAiResponsesRequest,
+} from '../openai-responses-envelope';
+
+describe('reverseTranslateOpenAiResponsesRequest', () => {
+	it('emits the synthetic /v1/responses URL and POST method', () => {
+		const { url, method } = reverseTranslateOpenAiResponsesRequest({
+			model: 'gpt-4o-mini',
+			input: [],
+		});
+
+		expect(url).toBe('https://api.openai.com/v1/responses');
+		expect(method).toBe('POST');
+	});
+
+	it('passes the inbound body through unchanged', () => {
+		const inbound = {
+			model: 'gpt-4o',
+			input: [{ role: 'user', content: 'hi' }],
+			tools: [{ type: 'function', name: 'foo' }],
+			stream: true,
+		};
+
+		const { body } = reverseTranslateOpenAiResponsesRequest(inbound);
+
+		// `toBe` compares by reference: the very same object must come back, not a copy.
+		expect(body).toBe(inbound);
+	});
+
+	it('substitutes an empty object when body is null or undefined', () => {
+		for (const missingBody of [undefined, null]) {
+			expect(reverseTranslateOpenAiResponsesRequest(missingBody).body).toEqual({});
+		}
+	});
+});
+
+describe('extractResponsesRequestModel', () => {
+	it('returns the model string from a well-formed body', () => {
+		const model = extractResponsesRequestModel({ model: 'gpt-5' });
+		expect(model).toBe('gpt-5');
+	});
+
+	it('falls back to gpt-4o-mini for missing, empty, or non-string values', () => {
+		// Bodies with no usable `model`: absent, empty string, wrong type, or no body at all.
+		const unusableBodies = [{}, { model: '' }, { model: 42 }, undefined, null];
+
+		for (const body of unusableBodies) {
+			expect(extractResponsesRequestModel(body)).toBe('gpt-4o-mini');
+		}
+	});
+});
+
+describe('isResponsesStreamRequested', () => {
+	it('returns true only when stream === true', () => {
+		const requested = isResponsesStreamRequested({ stream: true });
+		expect(requested).toBe(true);
+	});
+
+	it('returns false for missing, false, or truthy-non-true values', () => {
+		// Anything other than the literal boolean `true` must not enable streaming.
+		const nonStreamingBodies = [
+			{},
+			{ stream: false },
+			{ stream: 1 },
+			{ stream: 'true' },
+			undefined,
+			null,
+		];
+
+		for (const body of nonStreamingBodies) {
+			expect(isResponsesStreamRequested(body)).toBe(false);
+		}
+	});
+});
+
+describe('forwardTranslateToResponsesEnvelope', () => {
+	/** Builds a 200 JSON mock HTTP response whose payload is `body`. */
+	const mockResponse = (body: unknown): EvalMockHttpResponse => ({
+		body,
+		headers: { 'content-type': 'application/json' },
+		statusCode: 200,
+	});
+
+	type TextPart = { type: string; text: string; annotations: unknown[] };
+	type MessageItem = { type: string; role: string; content: TextPart[] };
+
+	it('produces a `response` envelope with all required top-level fields', () => {
+		const mocked = mockResponse({ output_text: 'hello there' });
+		const envelope = forwardTranslateToResponsesEnvelope(mocked, 'gpt-4o');
+		const { id, created_at: createdAt } = envelope;
+
+		expect(envelope).toMatchObject({
+			object: 'response',
+			status: 'completed',
+			model: 'gpt-4o',
+			usage: { input_tokens: 0, output_tokens: 0, total_tokens: 0 },
+		});
+		expect(typeof id).toBe('string');
+		expect((id as string).startsWith('resp_')).toBe(true);
+		expect(typeof createdAt).toBe('number');
+	});
+
+	it('emits a single assistant message with `annotations: []` on output_text', () => {
+		const mocked = mockResponse({ output_text: 'a reply' });
+		const output = forwardTranslateToResponsesEnvelope(mocked, 'gpt-4o').output as MessageItem[];
+
+		expect(output).toHaveLength(1);
+		const [message] = output;
+		expect(message.type).toBe('message');
+		expect(message.role).toBe('assistant');
+
+		const [part] = message.content;
+		expect(part.type).toBe('output_text');
+		expect(part.text).toBe('a reply');
+		// The OpenAI SDK needs `annotations: []` to be present: LangChain's
+		// extractor calls `.annotations.map(...)` and crashes on undefined.
+		expect(part.annotations).toEqual([]);
+	});
+
+	it('extracts content from `output_text`, `content`, and `message` shorthand bodies', () => {
+		const shorthandBodies = [
+			{ body: { output_text: 'first' }, expected: 'first' },
+			{ body: { content: 'second' }, expected: 'second' },
+			{ body: { message: 'third' }, expected: 'third' },
+		];
+
+		shorthandBodies.forEach(({ body, expected }) => {
+			const envelope = forwardTranslateToResponsesEnvelope(mockResponse(body), 'gpt-4o');
+			const [message] = envelope.output as Array<{ content: Array<{ text: string }> }>;
+			expect(message.content[0].text).toBe(expected);
+		});
+	});
+
+	it('extracts content from an already-shaped responses envelope', () => {
+		const innerMessage = {
+			id: 'msg_inner',
+			type: 'message',
+			role: 'assistant',
+			content: [{ type: 'output_text', text: 'unwrap me', annotations: [] }],
+			status: 'completed',
+		};
+		const inner = { id: 'resp_inner', object: 'response', output: [innerMessage] };
+
+		const envelope = forwardTranslateToResponsesEnvelope(mockResponse(inner), 'gpt-4o');
+		const [message] = envelope.output as Array<{ content: Array<{ text: string }> }>;
+		expect(message.content[0].text).toBe('unwrap me');
+	});
+
+	it('replaces the message with a function_call item when the body has tool_calls', () => {
+		const toolCall = {
+			id: 'call_1',
+			function: { name: 'lookup_order', arguments: '{"id":"42"}' },
+		};
+		const envelope = forwardTranslateToResponsesEnvelope(
+			mockResponse({ tool_calls: [toolCall] }),
+			'gpt-4o',
+		);
+		const output = envelope.output as Array<Record<string, unknown>>;
+
+		expect(output).toHaveLength(1);
+		const [item] = output;
+		expect(item.type).toBe('function_call');
+		expect(item.name).toBe('lookup_order');
+		expect(item.call_id).toBe('call_1');
+		expect(item.arguments).toBe('{"id":"42"}');
+		// Responses API mode is exclusive: no message item may sit next to the tool call.
+		expect(output.some((entry) => entry.type === 'message')).toBe(false);
+	});
+
+	it('emits multiple function_call items when several tool_calls are present', () => {
+		const toolCalls = [
+			{ id: 'a', function: { name: 'one', arguments: '{}' } },
+			{ id: 'b', function: { name: 'two', arguments: '{}' } },
+		];
+		const envelope = forwardTranslateToResponsesEnvelope(
+			mockResponse({ tool_calls: toolCalls }),
+			'gpt-4o',
+		);
+
+		const output = envelope.output as Array<{ type: string; name: string }>;
+		const types = output.map((item) => item.type);
+		const names = output.map((item) => item.name);
+		expect(types).toEqual(['function_call', 'function_call']);
+		expect(names).toEqual(['one', 'two']);
+	});
+});
+
+describe('forwardTranslateToResponsesSseEvents', () => {
+	/** Wraps `body` in a 200 JSON mock HTTP response, the input the translator is given. */
+	function mockResponse(body: unknown): EvalMockHttpResponse {
+		return {
+			body,
+			headers: { 'content-type': 'application/json' },
+			statusCode: 200,
+		};
+	}
+
+	it('emits the canonical event sequence for a plain text response', () => {
+		const events = forwardTranslateToResponsesSseEvents(
+			mockResponse({ output_text: 'hello' }),
+			'gpt-4o',
+		);
+
+		// Order matters: `toEqual` on the array pins both the set and the sequence.
+		const eventNames = events.map((e) => e.event);
+		expect(eventNames).toEqual([
+			'response.created',
+			'response.in_progress',
+			'response.output_item.added',
+			'response.content_part.added',
+			'response.output_text.delta',
+			'response.output_text.done',
+			'response.content_part.done',
+			'response.output_item.done',
+			'response.completed',
+		]);
+	});
+
+	it('skips the output_text.delta event when content is empty', () => {
+		const events = forwardTranslateToResponsesSseEvents(
+			mockResponse({ output_text: '' }),
+			'gpt-4o',
+		);
+		const eventNames = events.map((e) => e.event);
+		expect(eventNames).not.toContain('response.output_text.delta');
+		expect(eventNames[eventNames.length - 1]).toBe('response.completed');
+	});
+
+	it('every event carries `annotations: []` on output_text parts', () => {
+		const events = forwardTranslateToResponsesSseEvents(
+			mockResponse({ output_text: 'hi' }),
+			'gpt-4o',
+		);
+
+		const partEvents = events.filter(
+			(e) => e.event === 'response.content_part.added' || e.event === 'response.content_part.done',
+		);
+		// Guard against a vacuous pass: with no content_part events the loop
+		// below would assert nothing and the test would stay green.
+		expect(partEvents.length).toBeGreaterThan(0);
+		for (const e of partEvents) {
+			const part = (e.data as { part?: { annotations?: unknown } }).part;
+			expect(part?.annotations).toEqual([]);
+		}
+	});
+
+	it('terminal message item (`output_item.done`, `response.completed`) carries `annotations: []`', () => {
+		// Regression: earlier the terminal `messageItem` set `content:
+		// [{ type: 'output_text', text }]` without `annotations: []`. SDK
+		// consumers iterating the completed response would crash on
+		// `.annotations.map(...)` exactly like the non-streaming bug we
+		// already fixed.
+		const events = forwardTranslateToResponsesSseEvents(
+			mockResponse({ output_text: 'hello' }),
+			'gpt-4o',
+		);
+
+		type MsgItem = { content?: Array<{ type?: string; annotations?: unknown }> };
+		type EventData = { item?: MsgItem; response?: { output?: MsgItem[] } };
+		// Returns the message item carried by the named event, or undefined
+		// when the event (or the item) is missing. Every hop is optional so a
+		// missing event surfaces as a failed `expect` rather than a TypeError.
+		const findItem = (eventName: string): MsgItem | undefined => {
+			const data = events.find((ev) => ev.event === eventName)?.data as EventData | undefined;
+			if (eventName === 'response.completed') {
+				// The completed event nests the item under `response.output[]`.
+				return (data?.response?.output ?? [])[0];
+			}
+			return data?.item;
+		};
+
+		for (const name of [
+			'response.output_item.added',
+			'response.output_item.done',
+			'response.completed',
+		]) {
+			const item = findItem(name);
+			expect(item?.content?.[0]?.type).toBe('output_text');
+			expect(item?.content?.[0]?.annotations).toEqual([]);
+		}
+	});
+
+	it('keeps `id` stable across output_item / arguments / completed events for the same tool call', () => {
+		// Regression: earlier the SSE path generated the tool-call `id` once
+		// for `output_item.added/done` and then re-ran the synthesizer for
+		// `response.completed.output[]`, producing two different `fc_<uuid>`
+		// values for the same `output_index`. SDK consumers that reconcile
+		// state by `id` (e.g. tracing UIs) would fail to match.
+		const events = forwardTranslateToResponsesSseEvents(
+			mockResponse({
+				tool_calls: [
+					{ id: 'call_x', function: { name: 'fn', arguments: '{}' } },
+					{ id: 'call_y', function: { name: 'fn2', arguments: '{}' } },
+				],
+			}),
+			'gpt-4o',
+		);
+
+		const addedItems = events.filter((e) => e.event === 'response.output_item.added');
+		const doneItems = events.filter((e) => e.event === 'response.output_item.done');
+		const completed = events.find((e) => e.event === 'response.completed');
+		const completedOutput = (
+			completed?.data as { response?: { output?: Array<{ id?: string }> } } | undefined
+		)?.response?.output;
+
+		// The index-wise comparison below is only meaningful when all three
+		// views describe the same, non-empty set of items. Without these
+		// guards an empty `addedItems` passes vacuously and a shorter
+		// `doneItems` throws a TypeError instead of failing an assertion.
+		expect(addedItems.length).toBeGreaterThan(0);
+		expect(doneItems).toHaveLength(addedItems.length);
+		expect(completedOutput).toHaveLength(addedItems.length);
+
+		for (let i = 0; i < addedItems.length; i++) {
+			const addedId = (addedItems[i].data as { item?: { id?: string } }).item?.id;
+			const doneId = (doneItems[i].data as { item?: { id?: string } }).item?.id;
+			const completedId = completedOutput?.[i]?.id;
+			expect(addedId).toBe(doneId);
+			expect(addedId).toBe(completedId);
+			// Rules out the degenerate case where all three are `undefined`.
+			expect(typeof addedId).toBe('string');
+		}
+	});
+
+	it('emits function_call event sequence with delta + done arguments for tool calls', () => {
+		const events = forwardTranslateToResponsesSseEvents(
+			mockResponse({
+				tool_calls: [{ id: 'call_xyz', function: { name: 'lookup', arguments: '{"q":"hi"}' } }],
+			}),
+			'gpt-4o',
+		);
+
+		const eventNames = events.map((e) => e.event);
+		expect(eventNames).toContain('response.output_item.added');
+		expect(eventNames).toContain('response.function_call_arguments.delta');
+		expect(eventNames).toContain('response.function_call_arguments.done');
+		expect(eventNames).toContain('response.output_item.done');
+		expect(eventNames[eventNames.length - 1]).toBe('response.completed');
+
+		// Both the delta and the done event must carry the full arguments string.
+		const deltaEvent = events.find((e) => e.event === 'response.function_call_arguments.delta');
+		expect((deltaEvent?.data as { delta?: string })?.delta).toBe('{"q":"hi"}');
+
+		const doneEvent = events.find((e) => e.event === 'response.function_call_arguments.done');
+		expect((doneEvent?.data as { arguments?: string })?.arguments).toBe('{"q":"hi"}');
+	});
+
+	it('skips the function_call_arguments.delta event when arguments are empty', () => {
+		const events = forwardTranslateToResponsesSseEvents(
+			mockResponse({
+				tool_calls: [{ id: 'call_1', function: { name: 'noop', arguments: '' } }],
+			}),
+			'gpt-4o',
+		);
+
+		const deltaEvent = events.find((e) => e.event === 'response.function_call_arguments.delta');
+		expect(deltaEvent).toBeUndefined();
+		expect(events.find((e) => e.event === 'response.function_call_arguments.done')).toBeDefined();
+	});
+
+	it('uses a single response id across the entire event sequence', () => {
+		const events = forwardTranslateToResponsesSseEvents(
+			mockResponse({ output_text: 'hi' }),
+			'gpt-4o',
+		);
+		// Collect every distinct `response.id` seen; events without one are ignored.
+		const ids = new Set<string>();
+		for (const e of events) {
+			const data = e.data as { response?: { id?: string } };
+			if (data.response?.id) ids.add(data.response.id);
+		}
+		// Size 1 also proves at least one event carried an id, so this cannot pass vacuously.
+		expect(ids.size).toBe(1);
+		const id = Array.from(ids)[0];
+		expect(id?.startsWith('resp_')).toBe(true);
+	});
+});
+
+describe('buildResponsesErrorEnvelope', () => {
+	it('produces the standard error shape with the supplied message', () => {
+		const message = 'mock failed: rate-limited';
+		// Only `message` varies with the input; the remaining fields are fixed.
+		const expectedError = {
+			message,
+			type: 'eval_wire_server_error',
+			code: 'eval_mock_generation_failed',
+			param: null,
+		};
+
+		expect(buildResponsesErrorEnvelope(message)).toEqual({ error: expectedError });
+	});
+});
--- a/packages/cli/src/modules/instance-ai/eval/tests/workflow-analysis.test.ts
+++ b/packages/cli/src/modules/instance-ai/eval/tests/workflow-analysis.test.ts
@ -11,11 +11,11 @@ import { createEvalAgent, extractText } from '@n8n/instance-ai';
 import type { IConnections, INode, INodeParameters, IWorkflowBase } from 'n8n-workflow';

 import {
-	assertUnpinCompatibility,
 	buildVendorLlmRouting,
 	generateMockHints,
 	identifyNodesForHints,
 	identifyNodesForPinData,
+	partitionAiRoots,
 } from '../workflow-analysis';
 import { UserError } from 'n8n-workflow';

@ -205,7 +205,7 @@ describe('identifyNodesForPinData', () => {
 	});
 });

-describe('assertUnpinCompatibility', () => {
+describe('partitionAiRoots', () => {
 	function agentWithMemory(memoryType: string) {
 		const nodes = [
 			makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
@ -219,164 +219,166 @@ describe('assertUnpinCompatibility', () => {
 		return makeWorkflow(nodes, connections);
 	}

-	it('is a no-op when unpinNodes is empty', () => {
-		const workflow = agentWithMemory('@n8n/n8n-nodes-langchain.memoryPostgresChat');
-		expect(() => assertUnpinCompatibility(workflow, [])).not.toThrow();
+	describe('explicit pin validation (typo guard)', () => {
+		it('throws when an explicit pin name does not exist in the workflow', () => {
+			const workflow = agentWithMemory('@n8n/n8n-nodes-langchain.memoryBufferWindow');
+			let thrown: unknown;
+			try {
+				partitionAiRoots(workflow, ['Ghost']);
+			} catch (e) {
+				thrown = e;
+			}
+			expect(thrown).toBeInstanceOf(UserError);
+			expect((thrown as UserError).message).toContain('not found in workflow');
+			expect((thrown as UserError).message).toContain('"Ghost"');
+		});
+
+		it('throws when an explicit pin name refers to a disabled root', () => {
+			const nodes = [
+				makeNode({ name: 'PgMem', type: '@n8n/n8n-nodes-langchain.memoryPostgresChat' }),
+				makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent', disabled: true }),
+			];
+			const connections: IConnections = {
+				PgMem: { ai_memory: [[{ node: 'Agent', type: 'ai_memory', index: 0 }]] },
+			};
+			let thrown: unknown;
+			try {
+				partitionAiRoots(makeWorkflow(nodes, connections), ['Agent']);
+			} catch (e) {
+				thrown = e;
+			}
+			expect(thrown).toBeInstanceOf(UserError);
+			expect((thrown as UserError).message).toContain('disabled');
+			expect((thrown as UserError).message).toContain('"Agent"');
+		});
+
+		it('throws when an explicit pin name refers to a non-AI-root node', () => {
+			const nodes = [
+				makeNode({ name: 'Set', type: 'n8n-nodes-base.set' }),
+				makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }),
+			];
+			let thrown: unknown;
+			try {
+				partitionAiRoots(makeWorkflow(nodes), ['Set']);
+			} catch (e) {
+				thrown = e;
+			}
+			expect(thrown).toBeInstanceOf(UserError);
+			expect((thrown as UserError).message).toContain('not AI root nodes');
+			expect((thrown as UserError).message).toContain('"Set"');
+		});
 	});

-	it('allows unpinning an Agent backed by MemoryBufferWindow', () => {
-		const workflow = agentWithMemory('@n8n/n8n-nodes-langchain.memoryBufferWindow');
-		expect(() => assertUnpinCompatibility(workflow, ['Agent'])).not.toThrow();
+	describe('default partition (no explicit pin)', () => {
+		it('intercepts an Agent backed by a non-protocol-binary memory', () => {
+			const workflow = agentWithMemory('@n8n/n8n-nodes-langchain.memoryBufferWindow');
+			const result = partitionAiRoots(workflow);
+			expect(result.unpinNodes).toEqual(['Agent']);
+			expect(result.pinNodes).toEqual([]);
+			expect(result.autoPinned).toEqual([]);
+		});
+
+		it('returns an empty partition when the workflow has no AI roots', () => {
+			const nodes = [makeNode({ name: 'Set', type: 'n8n-nodes-base.set' })];
+			const result = partitionAiRoots(makeWorkflow(nodes));
+			expect(result.unpinNodes).toEqual([]);
+			expect(result.pinNodes).toEqual([]);
+			expect(result.autoPinned).toEqual([]);
+		});
+
+		it('ignores disabled sub-nodes when partitioning', () => {
+			const nodes = [
+				makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
+				makeNode({
+					name: 'PgMem',
+					type: '@n8n/n8n-nodes-langchain.memoryPostgresChat',
+					disabled: true,
+				}),
+				makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }),
+			];
+			const connections: IConnections = {
+				OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] },
+				PgMem: { ai_memory: [[{ node: 'Agent', type: 'ai_memory', index: 0 }]] },
+			};
+			const result = partitionAiRoots(makeWorkflow(nodes, connections));
+			expect(result.unpinNodes).toEqual(['Agent']);
+			expect(result.autoPinned).toEqual([]);
+		});
 	});

-	it('allows unpinning an Agent with no sub-nodes attached', () => {
-		const nodes = [makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' })];
-		expect(() => assertUnpinCompatibility(makeWorkflow(nodes), ['Agent'])).not.toThrow();
+	describe('explicit pin opt-out', () => {
+		it('moves explicitly pinned roots to pinNodes', () => {
+			const nodes = [
+				makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
+				makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }),
+			];
+			const connections: IConnections = {
+				OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] },
+			};
+			const result = partitionAiRoots(makeWorkflow(nodes, connections), ['Agent']);
+			expect(result.unpinNodes).toEqual([]);
+			expect(result.pinNodes).toEqual(['Agent']);
+			expect(result.autoPinned).toEqual([]);
+		});
 	});

-	it('ignores disabled sub-nodes when checking compatibility', () => {
-		const nodes = [
-			makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
-			makeNode({
-				name: 'PgMem',
-				type: '@n8n/n8n-nodes-langchain.memoryPostgresChat',
-				disabled: true,
-			}),
-			makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }),
-		];
-		const connections: IConnections = {
-			OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] },
-			PgMem: { ai_memory: [[{ node: 'Agent', type: 'ai_memory', index: 0 }]] },
-		};
-		expect(() =>
-			assertUnpinCompatibility(makeWorkflow(nodes, connections), ['Agent']),
-		).not.toThrow();
-	});
+	describe('auto-pin on incompatible sub-nodes', () => {
+		it.each([
+			['Postgres memory', '@n8n/n8n-nodes-langchain.memoryPostgresChat'],
+			['Redis memory', '@n8n/n8n-nodes-langchain.memoryRedisChat'],
+			['MongoDB memory', '@n8n/n8n-nodes-langchain.memoryMongoDbChat'],
+		])('auto-pins an Agent backed by %s', (_label, memoryType) => {
+			const workflow = agentWithMemory(memoryType);
+			const result = partitionAiRoots(workflow);
+			expect(result.unpinNodes).toEqual([]);
+			expect(result.pinNodes).toEqual(['Agent']);
+			expect(result.autoPinned).toContainEqual({
+				root: 'Agent',
+				subNode: 'Memory',
+				subNodeType: memoryType,
+				reason: 'protocol_binary',
+			});
+		});

-	it('refuses unknown root names rather than silently skipping (typo guard)', () => {
-		const workflow = agentWithMemory('@n8n/n8n-nodes-langchain.memoryBufferWindow');
+		it.each([
+			'@n8n/n8n-nodes-langchain.vectorStorePGVector',
+			'@n8n/n8n-nodes-langchain.vectorStoreMongoDBAtlas',
+			'@n8n/n8n-nodes-langchain.vectorStoreRedis',
+			'@n8n/n8n-nodes-langchain.vectorStoreMilvus',
+			'@n8n/n8n-nodes-langchain.chatHubVectorStorePGVector',
+		])('auto-pins an Agent backed by protocol-binary vector store %s', (vectorStoreType) => {
+			const nodes = [
+				makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
+				makeNode({ name: 'Store', type: vectorStoreType }),
+				makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }),
+			];
+			const connections: IConnections = {
+				OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] },
+				Store: { ai_vectorStore: [[{ node: 'Agent', type: 'ai_vectorStore', index: 0 }]] },
+			};
+			const result = partitionAiRoots(makeWorkflow(nodes, connections));
+			expect(result.pinNodes).toEqual(['Agent']);
+			expect(result.autoPinned.some((e) => e.reason === 'protocol_binary')).toBe(true);
+		});

-		let thrown: unknown;
-		try {
-			assertUnpinCompatibility(workflow, ['Ghost']);
-		} catch (e) {
-			thrown = e;
-		}
-
-		expect(thrown).toBeInstanceOf(UserError);
-		expect((thrown as UserError).message).toContain('not found in workflow');
-		expect((thrown as UserError).message).toContain('"Ghost"');
-	});
-
-	it('refuses disabled roots rather than silently skipping (typo guard)', () => {
-		const nodes = [
-			makeNode({ name: 'PgMem', type: '@n8n/n8n-nodes-langchain.memoryPostgresChat' }),
-			makeNode({
-				name: 'Agent',
-				type: '@n8n/n8n-nodes-langchain.agent',
-				disabled: true,
-			}),
-		];
-		const connections: IConnections = {
-			PgMem: { ai_memory: [[{ node: 'Agent', type: 'ai_memory', index: 0 }]] },
-		};
-
-		let thrown: unknown;
-		try {
-			assertUnpinCompatibility(makeWorkflow(nodes, connections), ['Agent']);
-		} catch (e) {
-			thrown = e;
-		}
-
-		expect(thrown).toBeInstanceOf(UserError);
-		expect((thrown as UserError).message).toContain('disabled');
-		expect((thrown as UserError).message).toContain('"Agent"');
-	});
-
-	it('refuses non-AI-root nodes (e.g. a regular Set node in unpinNodes is a caller mistake)', () => {
-		const nodes = [
-			makeNode({ name: 'Set', type: 'n8n-nodes-base.set' }),
-			makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }),
-		];
-
-		let thrown: unknown;
-		try {
-			assertUnpinCompatibility(makeWorkflow(nodes), ['Set']);
-		} catch (e) {
-			thrown = e;
-		}
-
-		expect(thrown).toBeInstanceOf(UserError);
-		expect((thrown as UserError).message).toContain('not AI root nodes');
-		expect((thrown as UserError).message).toContain('"Set"');
-	});
-
-	it.each([
-		'@n8n/n8n-nodes-langchain.chainLlm',
-		'@n8n/n8n-nodes-langchain.chainRetrievalQa',
-		'@n8n/n8n-nodes-langchain.chainSummarization',
-	])('recognises %s by type even when it has no inbound ai_* connections', (chainType) => {
-		const nodes = [makeNode({ name: 'Chain', type: chainType })];
-		expect(() => assertUnpinCompatibility(makeWorkflow(nodes), ['Chain'])).not.toThrow();
-	});
-
-	it.each([
-		['Postgres memory', '@n8n/n8n-nodes-langchain.memoryPostgresChat'],
-		['Redis memory', '@n8n/n8n-nodes-langchain.memoryRedisChat'],
-		['MongoDB memory', '@n8n/n8n-nodes-langchain.memoryMongoDbChat'],
-	])('refuses unpinning an Agent backed by %s', (_label, memoryType) => {
-		const workflow = agentWithMemory(memoryType);
-		expect(() => assertUnpinCompatibility(workflow, ['Agent'])).toThrow(UserError);
-	});
-
-	it.each([
-		'@n8n/n8n-nodes-langchain.vectorStorePGVector',
-		'@n8n/n8n-nodes-langchain.vectorStoreMongoDBAtlas',
-		'@n8n/n8n-nodes-langchain.vectorStoreRedis',
-		'@n8n/n8n-nodes-langchain.vectorStoreMilvus',
-		'@n8n/n8n-nodes-langchain.chatHubVectorStorePGVector',
-	])('refuses unpinning an Agent backed by protocol-binary vector store %s', (vectorStoreType) => {
-		const nodes = [
-			makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
-			makeNode({ name: 'Store', type: vectorStoreType }),
-			makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }),
-		];
-		const connections: IConnections = {
-			OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] },
-			Store: { ai_vectorStore: [[{ node: 'Agent', type: 'ai_vectorStore', index: 0 }]] },
-		};
-		expect(() => assertUnpinCompatibility(makeWorkflow(nodes, connections), ['Agent'])).toThrow(
-			UserError,
-		);
-	});
-
-	it('reports all offending roots when multiple unpin targets are mixed', () => {
-		const nodes = [
-			makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
-			makeNode({ name: 'PgMem', type: '@n8n/n8n-nodes-langchain.memoryPostgresChat' }),
-			makeNode({ name: 'BufMem', type: '@n8n/n8n-nodes-langchain.memoryBufferWindow' }),
-			makeNode({ name: 'AgentA', type: '@n8n/n8n-nodes-langchain.agent' }),
-			makeNode({ name: 'AgentB', type: '@n8n/n8n-nodes-langchain.agent' }),
-		];
-		const connections: IConnections = {
-			OpenAI: { ai_languageModel: [[{ node: 'AgentB', type: 'ai_languageModel', index: 0 }]] },
-			PgMem: { ai_memory: [[{ node: 'AgentA', type: 'ai_memory', index: 0 }]] },
-			BufMem: { ai_memory: [[{ node: 'AgentB', type: 'ai_memory', index: 0 }]] },
-		};
-
-		let thrown: unknown;
-		try {
-			assertUnpinCompatibility(makeWorkflow(nodes, connections), ['AgentA', 'AgentB']);
-		} catch (e) {
-			thrown = e;
-		}
-
-		expect(thrown).toBeInstanceOf(UserError);
-		const message = (thrown as UserError).message;
-		expect(message).toContain('AgentA');
-		expect(message).toContain('PgMem');
-		expect(message).not.toContain('AgentB');
-		expect(message).not.toContain('BufMem');
+		it('partitions independently across multiple roots — pin one, intercept the other', () => {
+			const nodes = [
+				makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
+				makeNode({ name: 'PgMem', type: '@n8n/n8n-nodes-langchain.memoryPostgresChat' }),
+				makeNode({ name: 'BufMem', type: '@n8n/n8n-nodes-langchain.memoryBufferWindow' }),
+				makeNode({ name: 'AgentA', type: '@n8n/n8n-nodes-langchain.agent' }),
+				makeNode({ name: 'AgentB', type: '@n8n/n8n-nodes-langchain.agent' }),
+			];
+			const connections: IConnections = {
+				OpenAI: { ai_languageModel: [[{ node: 'AgentB', type: 'ai_languageModel', index: 0 }]] },
+				PgMem: { ai_memory: [[{ node: 'AgentA', type: 'ai_memory', index: 0 }]] },
+				BufMem: { ai_memory: [[{ node: 'AgentB', type: 'ai_memory', index: 0 }]] },
+			};
+			const result = partitionAiRoots(makeWorkflow(nodes, connections));
+			expect(result.unpinNodes).toEqual(['AgentB']);
+			expect(result.pinNodes).toEqual(['AgentA']);
+			expect(result.autoPinned.map((e) => e.root)).toEqual(['AgentA']);
+		});
 	});

 	describe('vendor LLM mapping', () => {
@ -391,9 +393,10 @@ describe('assertUnpinCompatibility', () => {
 			return makeWorkflow(nodes, connections);
 		}

-		it('allows unpinning an Agent backed by lmChatOpenAi (the only mapped vendor for M1)', () => {
-			const workflow = agentWithLlm('@n8n/n8n-nodes-langchain.lmChatOpenAi');
-			expect(() => assertUnpinCompatibility(workflow, ['Agent'])).not.toThrow();
+		it('intercepts an Agent backed by lmChatOpenAi (the only mapped vendor for M1)', () => {
+			const result = partitionAiRoots(agentWithLlm('@n8n/n8n-nodes-langchain.lmChatOpenAi'));
+			expect(result.unpinNodes).toEqual(['Agent']);
+			expect(result.autoPinned).toEqual([]);
 		});

 		it.each([
@ -408,51 +411,17 @@ describe('assertUnpinCompatibility', () => {
 			'@n8n/n8n-nodes-langchain.lmChatDeepSeek',
 			'@n8n/n8n-nodes-langchain.lmChatOllama',
 			'@n8n/n8n-nodes-langchain.lmOpenAi',
-		])('refuses unpinning an Agent backed by unmapped vendor LLM %s', (llmType) => {
-			const workflow = agentWithLlm(llmType);
-
-			let thrown: unknown;
-			try {
-				assertUnpinCompatibility(workflow, ['Agent']);
-			} catch (e) {
-				thrown = e;
-			}
-
-			expect(thrown).toBeInstanceOf(UserError);
-			const message = (thrown as UserError).message;
-			expect(message).toContain('unsupported vendor LLM');
-			expect(message).toContain(llmType);
+		])('auto-pins an Agent backed by unmapped vendor LLM %s', (llmType) => {
+			const result = partitionAiRoots(agentWithLlm(llmType));
+			expect(result.pinNodes).toEqual(['Agent']);
+			expect(result.autoPinned[0]).toMatchObject({
+				root: 'Agent',
+				subNodeType: llmType,
+				reason: 'unsupported_vendor_llm',
+			});
 		});

-		it('groups protocol-binary and unsupported-vendor refusals into the same error', () => {
-			const nodes = [
-				makeNode({ name: 'Anthropic', type: '@n8n/n8n-nodes-langchain.lmChatAnthropic' }),
-				makeNode({ name: 'PgMem', type: '@n8n/n8n-nodes-langchain.memoryPostgresChat' }),
-				makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }),
-			];
-			const connections: IConnections = {
-				Anthropic: {
-					ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]],
-				},
-				PgMem: { ai_memory: [[{ node: 'Agent', type: 'ai_memory', index: 0 }]] },
-			};
-
-			let thrown: unknown;
-			try {
-				assertUnpinCompatibility(makeWorkflow(nodes, connections), ['Agent']);
-			} catch (e) {
-				thrown = e;
-			}
-
-			expect(thrown).toBeInstanceOf(UserError);
-			const message = (thrown as UserError).message;
-			expect(message).toContain('protocol-binary');
-			expect(message).toContain('PgMem');
-			expect(message).toContain('unsupported vendor LLM');
-			expect(message).toContain('Anthropic');
-		});
-
-		it('ignores disabled vendor LLM sub-nodes when checking compatibility', () => {
+		it('ignores disabled vendor LLM sub-nodes when partitioning', () => {
 			const nodes = [
 				makeNode({
 					name: 'Anthropic',
@ -466,10 +435,8 @@ describe('assertUnpinCompatibility', () => {
 					ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]],
 				},
 			};
-
-			expect(() =>
-				assertUnpinCompatibility(makeWorkflow(nodes, connections), ['Agent']),
-			).not.toThrow();
+			const result = partitionAiRoots(makeWorkflow(nodes, connections));
+			expect(result.unpinNodes).toEqual(['Agent']);
 		});

 		describe('lmChatOpenAi options.baseURL override', () => {
@ -488,71 +455,26 @@ describe('assertUnpinCompatibility', () => {
 				return makeWorkflow(nodes, connections);
 			}

-			it('allows lmChatOpenAi with no options', () => {
-				const workflow = agentWithOpenAi({});
-				expect(() => assertUnpinCompatibility(workflow, ['Agent'])).not.toThrow();
+			it.each([
+				['no options', {}],
+				['empty baseURL', { options: { baseURL: '' } }],
+				['whitespace-only baseURL', { options: { baseURL: '   ' } }],
+			])('intercepts lmChatOpenAi with %s', (_label, parameters) => {
+				const result = partitionAiRoots(agentWithOpenAi(parameters));
+				expect(result.unpinNodes).toEqual(['Agent']);
 			});

-			it('allows lmChatOpenAi with empty options.baseURL', () => {
-				const workflow = agentWithOpenAi({ options: { baseURL: '' } });
-				expect(() => assertUnpinCompatibility(workflow, ['Agent'])).not.toThrow();
-			});
-
-			it('allows lmChatOpenAi when options.baseURL is whitespace-only', () => {
-				const workflow = agentWithOpenAi({ options: { baseURL: '   ' } });
-				expect(() => assertUnpinCompatibility(workflow, ['Agent'])).not.toThrow();
-			});
-
-			it('refuses lmChatOpenAi when options.baseURL is set — credential rewrite would be bypassed', () => {
+			it('auto-pins lmChatOpenAi when options.baseURL would bypass the credential rewrite', () => {
 				const workflow = agentWithOpenAi({
 					options: { baseURL: 'https://my-proxy.example.com/v1' },
 				});
-
-				let thrown: unknown;
-				try {
-					assertUnpinCompatibility(workflow, ['Agent']);
-				} catch (e) {
-					thrown = e;
-				}
-
-				expect(thrown).toBeInstanceOf(UserError);
-				const message = (thrown as UserError).message;
-				expect(message).toContain('options.baseURL');
-				expect(message).toContain('"OpenAI"');
-				expect(message).not.toContain('unsupported vendor LLM');
-			});
-
-			it('groups baseURL-override refusals alongside protocol-binary refusals', () => {
-				const nodes = [
-					makeNode({
-						name: 'OpenAI',
-						type: '@n8n/n8n-nodes-langchain.lmChatOpenAi',
-						parameters: { options: { baseURL: 'https://my-proxy.example.com/v1' } },
-					}),
-					makeNode({
-						name: 'PgMem',
-						type: '@n8n/n8n-nodes-langchain.memoryPostgresChat',
-					}),
-					makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }),
-				];
-				const connections: IConnections = {
-					OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] },
-					PgMem: { ai_memory: [[{ node: 'Agent', type: 'ai_memory', index: 0 }]] },
-				};
-
-				let thrown: unknown;
-				try {
-					assertUnpinCompatibility(makeWorkflow(nodes, connections), ['Agent']);
-				} catch (e) {
-					thrown = e;
-				}
-
-				expect(thrown).toBeInstanceOf(UserError);
-				const message = (thrown as UserError).message;
-				expect(message).toContain('protocol-binary');
-				expect(message).toContain('PgMem');
-				expect(message).toContain('options.baseURL');
-				expect(message).toContain('OpenAI');
+				const result = partitionAiRoots(workflow);
+				expect(result.pinNodes).toEqual(['Agent']);
+				expect(result.autoPinned[0]).toMatchObject({
+					root: 'Agent',
+					subNode: 'OpenAI',
+					reason: 'unsafe_baseurl_override',
+				});
 			});

 			it('skips the baseURL check when the OpenAI sub-node is disabled', () => {
@ -568,15 +490,13 @@ describe('assertUnpinCompatibility', () => {
 				const connections: IConnections = {
 					OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] },
 				};
-
-				expect(() =>
-					assertUnpinCompatibility(makeWorkflow(nodes, connections), ['Agent']),
-				).not.toThrow();
+				const result = partitionAiRoots(makeWorkflow(nodes, connections));
+				expect(result.unpinNodes).toEqual(['Agent']);
 			});
 		});

-		describe('shared vendor LLM sub-node across multiple unpinned roots', () => {
-			it('refuses unpinning both roots when one OpenAI sub-node feeds both', () => {
+		describe('shared vendor LLM sub-node across multiple roots', () => {
+			function workflowWithSharedSubNode(): IWorkflowBase {
 				const nodes = [
 					makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
 					makeNode({ name: 'AgentA', type: '@n8n/n8n-nodes-langchain.agent' }),
@ -592,49 +512,25 @@ describe('assertUnpinCompatibility', () => {
 						],
 					},
 				};
+				return makeWorkflow(nodes, connections);
+			}

-				let thrown: unknown;
-				try {
-					assertUnpinCompatibility(makeWorkflow(nodes, connections), ['AgentA', 'AgentB']);
-				} catch (e) {
-					thrown = e;
-				}
-
-				expect(thrown).toBeInstanceOf(UserError);
-				const message = (thrown as UserError).message;
-				expect(message).toContain('shared by multiple unpinned roots');
-				expect(message).toContain('"OpenAI"');
-				// Both root attributions listed in the error so the user can see
-				// exactly which conflict to resolve.
-				expect(message).toContain('AgentA');
-				expect(message).toContain('AgentB');
+			it('auto-pins both roots when one OpenAI sub-node feeds both', () => {
+				const result = partitionAiRoots(workflowWithSharedSubNode());
+				expect(result.unpinNodes).toEqual([]);
+				expect(result.pinNodes).toEqual(['AgentA', 'AgentB']);
+				const reasons = result.autoPinned.map((e) => e.reason);
+				expect(reasons).toContain('shared_vendor_llm_subnode');
 			});

-			it('allows unpinning when only one root references the shared OpenAI sub-node', () => {
-				const nodes = [
-					makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
-					makeNode({ name: 'AgentA', type: '@n8n/n8n-nodes-langchain.agent' }),
-					makeNode({ name: 'AgentB', type: '@n8n/n8n-nodes-langchain.agent' }),
-				];
-				const connections: IConnections = {
-					OpenAI: {
-						ai_languageModel: [
-							[
-								{ node: 'AgentA', type: 'ai_languageModel', index: 0 },
-								{ node: 'AgentB', type: 'ai_languageModel', index: 0 },
-							],
-						],
-					},
-				};
-
-				// Only AgentA is being unpinned — AgentB stays pinned so there's
-				// no attribution conflict at the wire-server layer.
-				expect(() =>
-					assertUnpinCompatibility(makeWorkflow(nodes, connections), ['AgentA']),
-				).not.toThrow();
+			it('intercepts the remaining root when the other one is explicitly pinned', () => {
+				// AgentA is opted out → AgentB no longer shares the sub-node ambiguously.
+				const result = partitionAiRoots(workflowWithSharedSubNode(), ['AgentA']);
+				expect(result.unpinNodes).toEqual(['AgentB']);
+				expect(result.pinNodes).toEqual(['AgentA']);
 			});

-			it('ignores a disabled sub-node when counting shared references', () => {
+			it('ignores a disabled shared sub-node when partitioning', () => {
 				const nodes = [
 					makeNode({
 						name: 'OpenAI',
@ -654,10 +550,8 @@ describe('assertUnpinCompatibility', () => {
 						],
 					},
 				};
-
-				expect(() =>
-					assertUnpinCompatibility(makeWorkflow(nodes, connections), ['AgentA', 'AgentB']),
-				).not.toThrow();
+				const result = partitionAiRoots(makeWorkflow(nodes, connections));
+				expect(result.unpinNodes.sort()).toEqual(['AgentA', 'AgentB']);
 			});
 		});
 	});
@ -694,6 +588,25 @@ describe('buildVendorLlmRouting', () => {
 		expect(routing.rootToSubNode.get('Agent')?.name).toBe('OpenAI');
 	});

+	it('also self-maps the root in subNodeToRoot so agent-context credential lookups resolve', () => {
+		// LangChain's Agent invokes the LLM sub-node's `supplyData` with a
+		// context whose `executeData.node` is the Agent itself (observed
+		// empirically). The credential helper looks up `subNodeToRoot` by
+		// that name — without the self-map, the lookup would miss and the
+		// SDK would post to the wire server's loud-fail no-root route.
+		const nodes = [
+			makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
+			makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }),
+		];
+		const connections: IConnections = {
+			OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] },
+		};
+
+		const routing = buildVendorLlmRouting(makeWorkflow(nodes, connections), ['Agent']);
+
+		expect(routing.subNodeToRoot.get('Agent')).toBe('Agent');
+	});
+
 	it('does not include sub-nodes feeding roots that are still pinned', () => {
 		const nodes = [
 			makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
@ -747,7 +660,12 @@ describe('buildVendorLlmRouting', () => {

 		const routing = buildVendorLlmRouting(makeWorkflow(nodes, connections), ['Agent']);

-		expect(Array.from(routing.subNodeToRoot.keys())).toEqual(['OpenAI']);
+		// `Agent` is also present in subNodeToRoot via the agent-context
+		// self-map (see test above) — assert by lookup so the test isn't
+		// sensitive to insertion order.
+		expect(routing.subNodeToRoot.get('OpenAI')).toBe('Agent');
+		expect(routing.subNodeToRoot.get('Agent')).toBe('Agent');
+		expect(routing.subNodeToRoot.size).toBe(2);
 		expect(Array.from(routing.rootToSubNode.keys())).toEqual(['Agent']);
 	});

--- a/packages/cli/src/modules/instance-ai/eval/eval-mocked-credentials-helper.ts
+++ b/packages/cli/src/modules/instance-ai/eval/eval-mocked-credentials-helper.ts
@ -3,6 +3,7 @@ import type {
 	InstanceAiEvalRewrittenCredential,
 } from '@n8n/api-types';
 import type { Logger } from '@n8n/backend-common';
+import { buildEvalMockCredentials } from 'n8n-core';
 import type {
 	ICredentialDataDecryptedObject,
 	ICredentials,
@ -123,7 +124,22 @@ export class EvalMockedCredentialsHelper extends ICredentialsHelper {
 				credentialId: nodeCredentials.id ?? undefined,
 			});

-			credentials = { [MOCK_MARKER]: true };
+			// When called with no credential id (eval-mode bypass for nodes
+			// with no credentials of any type configured), schema-synthesize
+			// so the wire-server URL rewrite below has a real `url` field to
+			// augment. Otherwise vendor SDK traffic would escape to the real
+			// provider with placeholder values and 401 at the wire layer.
+			// `buildEvalMockCredentials` is typed `Record<string, unknown>` —
+			// schema defaults can be richer than `CredentialInformation`, but
+			// at runtime it emits only JSON-shaped values, which is what the
+			// rewrite path consumes (hence the cast below).
+			credentials =
+				nodeCredentials.id === null
+					? ({
+							...buildEvalMockCredentials(this.inner.getCredentialsProperties(type)),
+							[MOCK_MARKER]: true,
+						} as ICredentialDataDecryptedObject)
+					: { [MOCK_MARKER]: true };
 		}

 		return this.applyServerUrlRewrite(credentials, type, nodeCredentials, executeData);
--- a/packages/cli/src/modules/instance-ai/eval/execution.service.ts
+++ b/packages/cli/src/modules/instance-ai/eval/execution.service.ts
@ -42,12 +42,12 @@ import { createLlmMockHandler } from './mock-handler';
 import { generatePinData } from './pin-data-generator';
 import { patchNoProxyForLoopback } from './proxy-loopback';
 import {
-	assertUnpinCompatibility,
 	buildVendorLlmRouting,
 	generateMockHints,
 	identifyNodesForHints,
 	identifyNodesForPinData,
 	type MockHints,
+	partitionAiRoots,
 	type VendorLlmRouting,
 } from './workflow-analysis';

@ -89,11 +89,13 @@ export class EvalExecutionService {
 			return this.errorResult(executionId, `Workflow ${workflowId} not found or not accessible`);
 		}

-		const unpinNodes = options.unpinNodes ?? [];
-
-		// Compatibility guard runs before the kill-switch so actionable errors aren't shadowed.
+		// Partition AI roots into "intercept via wire server" vs "leave pinned".
+		// Default-on: every root with compatible sub-nodes gets intercepted;
+		// callers can opt specific roots out via `pinNodes` (e.g. for A/B
+		// comparison). Roots whose sub-nodes are incompatible auto-pin.
+		let partitioned: ReturnType<typeof partitionAiRoots>;
 		try {
-			assertUnpinCompatibility(workflowEntity, unpinNodes);
+			partitioned = partitionAiRoots(workflowEntity, options.pinNodes ?? []);
 		} catch (error) {
 			if (error instanceof UserError) {
 				return this.errorResult(executionId, error.message);
@ -101,15 +103,23 @@ export class EvalExecutionService {
 			throw error;
 		}

+		for (const entry of partitioned.autoPinned) {
+			this.logger.debug(
+				`[EvalMock] Auto-pinning AI root "${entry.root}" — sub-node "${entry.subNode}" (${entry.subNodeType}) is ${entry.reason}`,
+			);
+		}
+
+		// Kill-switch: when interception is disabled, every root falls back to
+		// the pinned path regardless of partition or explicit `pinNodes`.
 		let interceptionEnabled = false;
+		let unpinNodes = partitioned.unpinNodes;
 		if (unpinNodes.length > 0) {
 			interceptionEnabled = await this.isInterceptionEnabled(user);
 			if (!interceptionEnabled) {
-				return this.errorResult(
-					executionId,
-					'`unpinNodes` is reserved — vendor SDK interception is currently disabled. ' +
-						'Submit the request without `unpinNodes` to use the existing pinned path.',
+				this.logger.warn(
+					'[EvalMock] Vendor SDK interception disabled by kill-switch — pinning all AI roots',
 				);
+				unpinNodes = [];
 			}
 		}

--- a/packages/cli/src/modules/instance-ai/eval/llm-wire-server.ts
+++ b/packages/cli/src/modules/instance-ai/eval/llm-wire-server.ts
@ -1,15 +1,25 @@
 import type { Logger } from '@n8n/backend-common';
 import express, { type Express, type Request, type Response } from 'express';
-import type { EvalLlmMockHandler } from 'n8n-core';
-import type { INode } from 'n8n-workflow';
+import type { EvalLlmMockHandler, EvalMockHttpResponse } from 'n8n-core';
+import type { IHttpRequestOptions, INode } from 'n8n-workflow';
 import { type Server } from 'node:http';

 import {
 	buildOpenAiErrorEnvelope,
 	extractRequestModel,
 	forwardTranslateToChatCompletion,
+	forwardTranslateToSseChunks,
+	isStreamRequested,
 	reverseTranslateOpenAiRequest,
 } from './openai-envelope';
+import {
+	buildResponsesErrorEnvelope,
+	extractResponsesRequestModel,
+	forwardTranslateToResponsesEnvelope,
+	forwardTranslateToResponsesSseEvents,
+	isResponsesStreamRequested,
+	reverseTranslateOpenAiResponsesRequest,
+} from './openai-responses-envelope';

 /** Loopback HTTP server that intercepts vendor SDK calls during eval. Binds to an OS-assigned port. */
 export interface InterceptedTurn {
@ -31,9 +41,67 @@ export interface LlmWireServerOptions {
 	logger?: Logger;
 }

+/**
+ * Per-protocol translator + formatter — adding a new vendor envelope is a new adapter, not a new handler.
+ * The wire server's route handler calls only these members, so one handler serves every protocol.
+ */
+interface ProtocolAdapter {
+	/** Short protocol label, e.g. `'chat-completions'` or `'responses'`. */
+	name: string;
+	/** Reads the model identifier out of the inbound request body. */
+	extractModel(body: unknown): string;
+	/** Whether the inbound request body asked for a streamed reply. */
+	isStreamRequested(body: unknown): boolean;
+	/** Converts the inbound body into the synthetic request options handed to the mock handler. */
+	reverseTranslate(body: unknown): IHttpRequestOptions;
+	/** Builds the JSON body sent for a non-streaming reply. */
+	forwardObject(response: EvalMockHttpResponse | undefined, model: string): Record<string, unknown>;
+	/** Pre-formatted SSE frames (`data: ...\n\n` or `event: ...\ndata: ...\n\n`), incl. any terminator. */
+	buildSseFrames(response: EvalMockHttpResponse | undefined, model: string): string[];
+	/** Builds the JSON error body in this protocol's envelope shape. */
+	buildErrorEnvelope(message: string): Record<string, unknown>;
+	/** Canned mock response used when no mock handler is attached. */
+	stubResponse(): EvalMockHttpResponse;
+}
+
+/** Adapter for the chat-completions route: JSON `chat.completion` bodies, or `data:`-framed SSE chunks. */
+const chatCompletionsAdapter: ProtocolAdapter = {
+	name: 'chat-completions',
+	extractModel: extractRequestModel,
+	isStreamRequested,
+	reverseTranslate: reverseTranslateOpenAiRequest,
+	forwardObject: forwardTranslateToChatCompletion,
+	buildErrorEnvelope: buildOpenAiErrorEnvelope,
+	buildSseFrames(response, model) {
+		const serialized: string[] = [];
+		for (const chunk of forwardTranslateToSseChunks(response, model)) {
+			serialized.push(`data: ${JSON.stringify(chunk)}\n\n`);
+		}
+		// Closing sentinel per OpenAI SSE spec — SDKs stop reading once they see it.
+		serialized.push('data: [DONE]\n\n');
+		return serialized;
+	},
+	stubResponse() {
+		return {
+			body: { content: '[eval wire server stub] — no mock handler attached' },
+			headers: { 'content-type': 'application/json' },
+			statusCode: 200,
+		};
+	},
+};
+
+/** Adapter for the Responses route: JSON envelopes, or `event:` + `data:` framed SSE events. */
+const responsesAdapter: ProtocolAdapter = {
+	name: 'responses',
+	extractModel: extractResponsesRequestModel,
+	isStreamRequested: isResponsesStreamRequested,
+	reverseTranslate: reverseTranslateOpenAiResponsesRequest,
+	forwardObject: forwardTranslateToResponsesEnvelope,
+	buildErrorEnvelope: buildResponsesErrorEnvelope,
+	buildSseFrames(response, model) {
+		// Each frame is `event: <name>\ndata: <JSON>\n\n`. No `[DONE]` line is
+		// appended here — the terminal `response.completed` event comes from
+		// the translator's own event list.
+		const serialized: string[] = [];
+		for (const { event, data } of forwardTranslateToResponsesSseEvents(response, model)) {
+			serialized.push(`event: ${event}\ndata: ${JSON.stringify(data)}\n\n`);
+		}
+		return serialized;
+	},
+	stubResponse() {
+		return {
+			body: { output_text: '[eval wire server stub] — no mock handler attached' },
+			headers: { 'content-type': 'application/json' },
+			statusCode: 200,
+		};
+	},
+};
+
 export class LlmWireServer {
 	private server: Server | undefined;
 	private resolvedUrl: string | undefined;
+	/** In-flight handler promises — `stop()` awaits these before resolving. */
+	private readonly inFlight = new Set<Promise<void>>();
+	/** Set by `stop()` so any request that beats the close-callback gets a 503 instead of starting a fresh handler that would race the teardown. */
+	private stopping = false;

 	constructor(private readonly options: LlmWireServerOptions = {}) {}

@ -47,6 +115,9 @@ export class LlmWireServer {
 	async start(): Promise<string> {
 		if (this.server) return this.url;

+		// Reset the shutdown latch in case this instance is restarted after stop().
+		this.stopping = false;
+
 		const app = this.buildApp();

 		this.server = await new Promise<Server>((resolve, reject) => {
@ -65,9 +136,15 @@ export class LlmWireServer {
 	async stop(): Promise<void> {
 		const server = this.server;
 		if (!server) return;
+		// Flip stopping FIRST so new requests 503 instead of racing the teardown.
+		this.stopping = true;
 		this.server = undefined;
 		this.resolvedUrl = undefined;

+		// Drain in-flight handlers so the mock-handler resolve can't write to a
+		// torn-down socket and `onIntercept` can't fire after stop().
+		await Promise.allSettled(Array.from(this.inFlight));
+
 		server.closeAllConnections();

 		await new Promise<void>((resolve, reject) => {
@ -78,54 +155,71 @@ export class LlmWireServer {
 	private buildApp(): Express {
 		const app = express();
 		app.use(express.json({ limit: '4mb' }));
-		app.post('/eval/:root/v1/chat/completions', this.handleChatCompletion);
+		app.post('/eval/:root/v1/chat/completions', this.routeFor(chatCompletionsAdapter));
+		// `@langchain/openai` v1.3+ auto-routes Agent v3.1+ calls to /v1/responses.
+		app.post('/eval/:root/v1/responses', this.routeFor(responsesAdapter));
 		// Surfaces credential-rewrite misconfiguration loudly instead of 404'ing.
-		app.post('/v1/chat/completions', this.handleUnroutedChatCompletion);
+		app.post('/v1/chat/completions', this.handleUnrouted);
+		app.post('/v1/responses', this.handleUnrouted);
 		return app;
 	}

-	private handleChatCompletion = async (req: Request, res: Response): Promise<void> => {
+	/**
+	 * Wraps each route in the in-flight tracker so `stop()` can drain.
+	 *
+	 * @param adapter Protocol adapter passed to `handleProtocol` for every request on the route.
+	 * @returns An async Express `(req, res)` handler bound to that adapter.
+	 */
+	private routeFor(adapter: ProtocolAdapter) {
+		return async (req: Request, res: Response): Promise<void> => {
+			// Once `stop()` has set the latch, refuse with a 503 instead of starting new work.
+			if (this.stopping) {
+				res.status(503).json(adapter.buildErrorEnvelope('Wire server is shutting down'));
+				return;
+			}
+			// Register the promise before awaiting it so `stop()` sees it in `inFlight`.
+			const promise = this.handleProtocol(adapter, req, res);
+			this.inFlight.add(promise);
+			try {
+				await promise;
+			} finally {
+				// Untrack whether the handler resolved or rejected.
+				this.inFlight.delete(promise);
+			}
+		};
+	}
+
+	private async handleProtocol(
+		adapter: ProtocolAdapter,
+		req: Request,
+		res: Response,
+	): Promise<void> {
 		// Express decodes route params; a second decode would mangle literal `%`.
 		const rootName = req.params.root;
-		const model = extractRequestModel(req.body);
+		const model = adapter.extractModel(req.body);
+		const stream = adapter.isStreamRequested(req.body);
 		const subNode = this.resolveSubNode(rootName);

 		if (!this.options.mockHandler) {
-			const envelope = forwardTranslateToChatCompletion(
-				{
-					body: { content: '[eval wire server stub] — no mock handler attached' },
-					headers: { 'content-type': 'application/json' },
-					statusCode: 200,
-				},
-				model,
-			);
-			res.status(200).json(envelope);
+			this.respondWithStub(adapter, req, res, model, stream);
 			return;
 		}

-		let synthetic: ReturnType<typeof reverseTranslateOpenAiRequest>;
-		let mockResponse: Awaited<ReturnType<typeof this.options.mockHandler>>;
-		let envelope: Record<string, unknown>;
+		let synthetic: IHttpRequestOptions;
+		let mockResponse: Awaited<ReturnType<EvalLlmMockHandler>>;
 		try {
-			synthetic = reverseTranslateOpenAiRequest(req.body);
+			synthetic = adapter.reverseTranslate(req.body);
 			mockResponse = await this.options.mockHandler(synthetic, subNode);
-			envelope = forwardTranslateToChatCompletion(mockResponse, model);
 		} catch (error) {
 			const message = error instanceof Error ? error.message : String(error);
 			this.options.logger?.error(`[EvalMock] Wire-server mock generation failed: ${message}`);
-			res.status(500).json(buildOpenAiErrorEnvelope(`Mock generation failed: ${message}`));
+			this.respondWithError(adapter, res, message);
 			return;
 		}

-		// Best-effort ledger write — never let it taint the 200 the SDK sees.
+		// Ledger write BEFORE the response so consumers see the entry deterministically
+		// after `await fetch(...)`. `requestBody` is stored by reference (express.json
+		// never re-touches it); callers must not mutate. A thrown `onIntercept` never
+		// blocks the response the SDK gets.
 		try {
 			this.options.onIntercept?.({
 				rootName,
 				url: synthetic.url,
 				method: synthetic.method ?? 'POST',
 				nodeType: subNode.type,
-				// Deep-clone so the ledger entry can't be mutated by later code.
-				requestBody: this.cloneRequestBody(req.body),
+				requestBody: req.body,
 				mockResponse: mockResponse?.body,
 			});
 		} catch (error) {
@ -133,10 +227,85 @@ export class LlmWireServer {
 			this.options.logger?.warn(`[EvalMock] Wire-server ledger write failed: ${message}`);
 		}

-		res.status(200).json(envelope);
-	};
+		try {
+			if (stream) {
+				this.writeSseResponse(adapter, req, res, mockResponse, model);
+			} else {
+				res.status(200).json(adapter.forwardObject(mockResponse, model));
+			}
+		} catch (error) {
+			const message = error instanceof Error ? error.message : String(error);
+			this.options.logger?.error(`[EvalMock] Wire-server response write failed: ${message}`);
+			// Headers not yet flushed → send a typed error envelope; otherwise close.
+			if (!res.headersSent) {
+				this.respondWithError(adapter, res, message);
+			} else if (!res.writableEnded) {
+				res.end();
+			}
+		}
+	}

-	private handleUnroutedChatCompletion = (_req: Request, res: Response): void => {
+	/**
+	 * Stream the mock response as SSE frames, short-circuiting if the client disconnects.
+	 *
+	 * Frames arrive pre-formatted from `adapter.buildSseFrames`; this method only sets
+	 * the event-stream headers, writes the frames in order, and ends the response.
+	 *
+	 * @param adapter Protocol adapter that formats the frames.
+	 * @param req Inbound request; its `close` event is observed while writing.
+	 * @param res Response the frames are written to; ended after the write loop, even if a write throws.
+	 * @param mockResponse Mock handler (or stub) output to translate.
+	 * @param model Model name forwarded to the frame builder.
+	 */
+	private writeSseResponse(
+		adapter: ProtocolAdapter,
+		req: Request,
+		res: Response,
+		mockResponse: Awaited<ReturnType<EvalLlmMockHandler>>,
+		model: string,
+	): void {
+		// Build frames BEFORE setting headers so a translator throw surfaces as a
+		// 500 envelope via `handleProtocol`'s outer catch, not a 200 + empty body.
+		const frames = adapter.buildSseFrames(mockResponse, model);
+
+		res.status(200);
+		res.setHeader('Content-Type', 'text/event-stream');
+		res.setHeader('Cache-Control', 'no-cache, no-transform');
+		res.setHeader('Connection', 'keep-alive');
+		// Forces immediate flush in proxied setups (Nginx etc.).
+		res.setHeader('X-Accel-Buffering', 'no');
+
+		// Short-circuit on SDK abort (timeout / AbortController) — otherwise the
+		// loop keeps writing to a destroyed socket.
+		// NOTE(review): the write loop below is synchronous, so `close` presumably
+		// cannot be delivered mid-loop; the `res.destroyed` check looks like the
+		// effective guard — confirm before relying on `aborted`.
+		let aborted = false;
+		const onClose = () => {
+			aborted = true;
+		};
+		req.once('close', onClose);
+
+		try {
+			for (const frame of frames) {
+				if (aborted || res.writableEnded || res.destroyed) break;
+				res.write(frame);
+			}
+		} finally {
+			// Always detach the listener and close the stream, including when a write throws.
+			req.off('close', onClose);
+			if (!res.writableEnded) res.end();
+		}
+	}
+
+	/** Replies with the adapter's canned stub: SSE frames when `stream` is set, a JSON body otherwise. */
+	private respondWithStub(
+		adapter: ProtocolAdapter,
+		req: Request,
+		res: Response,
+		model: string,
+		stream: boolean,
+	): void {
+		const canned = adapter.stubResponse();
+		if (!stream) {
+			res.status(200).json(adapter.forwardObject(canned, model));
+		} else {
+			this.writeSseResponse(adapter, req, res, canned, model);
+		}
+	}
+
+	/**
+	 * Sends a 500 carrying the adapter's JSON error envelope. The same reply
+	 * serves streaming and non-streaming requests — there is no SSE error branch.
+	 */
+	private respondWithError(adapter: ProtocolAdapter, res: Response, message: string): void {
+		const failed = res.status(500);
+		const detail = `Mock generation failed: ${message}`;
+		failed.json(adapter.buildErrorEnvelope(detail));
+	}
+
+	private handleUnrouted = (_req: Request, res: Response): void => {
 		res
 			.status(500)
 			.json(
@ -147,19 +316,6 @@ export class LlmWireServer {
 			);
 	};

-	/** Deep-clone via `structuredClone`; logs and falls back to the original ref if it throws. */
-	private cloneRequestBody(body: unknown): unknown {
-		try {
-			return structuredClone(body);
-		} catch (error) {
-			const message = error instanceof Error ? error.message : String(error);
-			this.options.logger?.warn(
-				`[EvalMock] Wire-server ledger entry not isolated — clone failed: ${message}`,
-			);
-			return body;
-		}
-	}
-
 	private resolveSubNode(rootName: string): INode {
 		const subNode = this.options.rootToSubNode?.get(rootName);
 		if (subNode) return subNode;
--- a/packages/cli/src/modules/instance-ai/eval/openai-envelope.ts
+++ b/packages/cli/src/modules/instance-ai/eval/openai-envelope.ts
@ -3,7 +3,9 @@ import type { IHttpRequestOptions } from 'n8n-workflow';
 import { randomUUID } from 'node:crypto';

 // Translation between the OpenAI chat-completions wire format and the shape
-// `createLlmMockHandler` consumes/emits. Non-streaming, no-tools subset only.
+// `createLlmMockHandler` consumes/emits. Covers non-streaming, streaming,
+// and tool-call emission. The OpenAI SDK is strict about envelope shape —
+// keep this in sync with `ChatCompletion` and `ChatCompletionChunk` schemas.

 // Kept identical to OpenAI's real URL so mock-handler's service/endpoint
 // extraction derives the right prompt-builder context.
@ -11,6 +13,13 @@ const OPENAI_SYNTHETIC_URL = 'https://api.openai.com/v1/chat/completions';

 const DEFAULT_MODEL = 'gpt-4o-mini';

+/** Tool call extracted from the mock handler's response body. */
+export interface NormalizedToolCall {
+	/** Call identifier, emitted as the tool call's `id`. */
+	id: string;
+	/** Function name, emitted as `function.name`. */
+	name: string;
+	/** Call arguments as a string, emitted as `function.arguments`. */
+	arguments: string;
+}
+
 /** Synthesize an `IHttpRequestOptions` from the inbound body so vendor-SDK traffic looks identical to HTTP-helper traffic. */
 export function reverseTranslateOpenAiRequest(body: unknown): IHttpRequestOptions {
 	return {
@ -27,13 +36,34 @@ export function extractRequestModel(body: unknown): string {
 	return typeof model === 'string' && model.length > 0 ? model : DEFAULT_MODEL;
 }

+/** Reports whether the request body is an object carrying the literal flag `stream: true`. */
+export function isStreamRequested(body: unknown): boolean {
+	const isRecord = typeof body === 'object' && body !== null;
+	return isRecord && (body as { stream?: unknown }).stream === true;
+}
+
 /** Wrap the mock handler's response in a canonical chat.completion envelope. */
 export function forwardTranslateToChatCompletion(
 	mockResponse: EvalMockHttpResponse | undefined,
 	model: string,
 ): Record<string, unknown> {
-	const content = extractAssistantContent(mockResponse?.body);
-	const finishReason = extractFinishReason(mockResponse?.body);
+	const toolCalls = extractToolCalls(mockResponse?.body);
+	const content = toolCalls.length > 0 ? null : extractAssistantContent(mockResponse?.body);
+	// When tool_calls present, finish_reason MUST be 'tool_calls' — SDKs branch on this.
+	const finishReason =
+		toolCalls.length > 0 ? 'tool_calls' : extractFinishReason(mockResponse?.body);
+
+	const message: Record<string, unknown> = {
+		role: 'assistant',
+		content,
+	};
+	if (toolCalls.length > 0) {
+		message.tool_calls = toolCalls.map((tc) => ({
+			id: tc.id,
+			type: 'function' as const,
+			function: { name: tc.name, arguments: tc.arguments },
+		}));
+	}

 	return {
 		id: `chatcmpl-${randomUUID()}`,
@ -43,21 +73,84 @@ export function forwardTranslateToChatCompletion(
 		choices: [
 			{
 				index: 0,
-				message: { role: 'assistant', content },
+				message,
 				finish_reason: finishReason,
 			},
 		],
-		// Zero counts = "no real metering" — stubbed non-zero would compute
-		// as plausible-but-fictional cost in downstream cost trackers.
-		usage: {
-			prompt_tokens: 0,
-			completion_tokens: 0,
-			total_tokens: 0,
-		},
+		// Zero counts = "no real metering" — stubbed non-zero would fake plausible cost.
+		usage: { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 },
+		// Non-conforming fingerprint so telemetry can tag eval traffic at a glance.
 		system_fingerprint: 'eval-wire-server',
 	};
 }

+/**
+ * Renders the mock handler's response as the ordered list of
+ * `chat.completion.chunk` payloads that a streaming client accumulates.
+ *
+ * Sequence: an opening chunk carrying the assistant role; then, per tool call,
+ * one chunk with `index` + `id` + `function.name` (empty arguments), followed
+ * by one chunk holding the full argument string when it is non-empty; or, when
+ * there are no tool calls, a single content chunk if the text is non-empty.
+ * The last chunk has an empty delta and the finish reason (`tool_calls`
+ * whenever a call was emitted). Returned as an array so tests can snapshot it.
+ */
+export function forwardTranslateToSseChunks(
+	mockResponse: EvalMockHttpResponse | undefined,
+	model: string,
+): Array<Record<string, unknown>> {
+	const streamId = `chatcmpl-${randomUUID()}`;
+	const createdAt = Math.floor(Date.now() / 1000);
+	const calls = extractToolCalls(mockResponse?.body);
+	const emitsToolCalls = calls.length > 0;
+
+	// Every chunk of one stream shares id / created / model; only the delta and
+	// finish_reason vary.
+	function frame(
+		delta: Record<string, unknown>,
+		finishReason: string | null = null,
+	): Record<string, unknown> {
+		return {
+			id: streamId,
+			object: 'chat.completion.chunk',
+			created: createdAt,
+			model,
+			choices: [{ index: 0, delta, finish_reason: finishReason }],
+			system_fingerprint: 'eval-wire-server',
+		};
+	}
+
+	// The opening chunk announces the assistant role; its content is `null` for
+	// a tool-call stream and an empty string for a text stream.
+	const opening = frame({ role: 'assistant', content: emitsToolCalls ? null : '' });
+
+	if (emitsToolCalls) {
+		const perCall = calls.flatMap((call, position) => {
+			// First chunk per tool call carries id + name; arguments start empty.
+			const announce = frame({
+				tool_calls: [
+					{
+						index: position,
+						id: call.id,
+						type: 'function',
+						function: { name: call.name, arguments: '' },
+					},
+				],
+			});
+			if (call.arguments.length === 0) return [announce];
+			// The whole argument string goes out as a single slice.
+			const payload = frame({
+				tool_calls: [{ index: position, function: { arguments: call.arguments } }],
+			});
+			return [announce, payload];
+		});
+		return [opening, ...perCall, frame({}, 'tool_calls')];
+	}
+
+	const text = extractAssistantContent(mockResponse?.body);
+	const textFrames = text.length > 0 ? [frame({ content: text })] : [];
+	const closing = frame({}, extractFinishReason(mockResponse?.body));
+	return [opening, ...textFrames, closing];
+}
+
 /** OpenAI-style error envelope — makes the SDK throw a typed APIError instead of choking on a malformed body. */
 export function buildOpenAiErrorEnvelope(message: string): Record<string, unknown> {
 	return {
@ -70,6 +163,71 @@ export function buildOpenAiErrorEnvelope(message: string): Record<string, unknow
 	};
 }

+/**
+ * Pull tool calls out of whatever shape the mock handler produced and return
+ * them in normalized `{ id, name, arguments }` form. Sources are probed in
+ * priority order, and the first one that yields at least one call wins:
+ *   1. `{ choices: [{ message: { tool_calls: [...] } }] }` — already-shaped envelope.
+ *   2. `{ tool_calls: [...] }` — top-level list, holding either OpenAI-native
+ *      entries (`{ id, function: { name, arguments } }`) or the
+ *      `{ name, arguments }` shorthand the LLM often writes.
+ *   3. `{ tool: { name, arguments } }` — single-tool shorthand.
+ *
+ * Non-object bodies, and bodies without any usable tool call, produce `[]`.
+ * Argument payloads are coerced to JSON strings during normalization.
+ */
+export function extractToolCalls(body: unknown): NormalizedToolCall[] {
+	if (body === null || typeof body !== 'object') return [];
+	const record = body as Record<string, unknown>;
+
+	// Thunks keep the probing lazy: a later source is only normalized when
+	// every earlier one came back empty.
+	const sources: Array<() => NormalizedToolCall[]> = [
+		() => pickToolCallsFromChoices(record),
+		() => normalizeToolCallList(record.tool_calls),
+		() =>
+			typeof record.tool === 'object' && record.tool !== null
+				? normalizeToolCallList([record.tool])
+				: [],
+	];
+
+	for (const probe of sources) {
+		const calls = probe();
+		if (calls.length > 0) return calls;
+	}
+	return [];
+}
+
+/**
+ * Read `choices[0].message.tool_calls` from an already-shaped chat-completion
+ * envelope and normalize it. Any missing or non-object link along that path
+ * yields an empty list. Only the first choice is inspected.
+ */
+function pickToolCallsFromChoices(obj: Record<string, unknown>): NormalizedToolCall[] {
+	const { choices } = obj;
+	// An empty or non-array `choices` leaves `firstChoice` undefined, which the
+	// object guard below rejects.
+	const firstChoice: unknown = Array.isArray(choices) ? choices[0] : undefined;
+	if (firstChoice === null || typeof firstChoice !== 'object') return [];
+
+	const { message } = firstChoice as { message?: unknown };
+	if (message === null || typeof message !== 'object') return [];
+
+	const { tool_calls: rawToolCalls } = message as { tool_calls?: unknown };
+	return normalizeToolCallList(rawToolCalls);
+}
+
+/**
+ * Convert a raw list of tool-call entries into `NormalizedToolCall`s.
+ *
+ * Each entry may carry its fields under `function` (OpenAI native) or directly
+ * on the entry (shorthand). Entries that are not objects, or that lack a
+ * non-empty string `name`, are dropped. A string `id` on the entry is kept;
+ * otherwise a `call_<16 hex chars>` id is generated. A non-array input yields `[]`.
+ */
+function normalizeToolCallList(raw: unknown): NormalizedToolCall[] {
+	if (!Array.isArray(raw)) return [];
+
+	const normalized: NormalizedToolCall[] = [];
+	for (const candidate of raw as unknown[]) {
+		if (candidate === null || typeof candidate !== 'object') continue;
+		const entry = candidate as Record<string, unknown>;
+		// Native entries nest name/arguments under `function`; shorthand keeps them flat.
+		const source = (entry.function ?? entry) as Record<string, unknown>;
+
+		const { name } = source;
+		if (typeof name !== 'string' || name.length === 0) continue;
+
+		const serializedArgs = coerceArgumentsToString(source.arguments);
+		const id =
+			typeof entry.id === 'string'
+				? entry.id
+				: `call_${randomUUID().replace(/-/g, '').slice(0, 16)}`;
+		normalized.push({ id, name, arguments: serializedArgs });
+	}
+	return normalized;
+}
+
+/**
+ * Coerce a tool call's `arguments` payload to the string form SDKs require.
+ *
+ * - Strings pass through untouched.
+ * - `undefined` / `null` become `'{}'`.
+ * - Any other value is serialized with `JSON.stringify`; values it cannot
+ *   represent at all (functions, symbols) also become `'{}'`.
+ *
+ * @param args - raw `arguments` value taken from a tool-call entry.
+ * @returns a string, always — never `undefined`.
+ * @throws TypeError when `JSON.stringify` rejects the value (e.g. a circular
+ *   structure). This is left to propagate to the wire server's 500-envelope
+ *   catch so broken mock-handler output surfaces loudly rather than as a
+ *   confusing tool-arg mismatch downstream.
+ */
+function coerceArgumentsToString(args: unknown): string {
+	if (typeof args === 'string') return args;
+	if (args === undefined || args === null) return '{}';
+	// Object/array → JSON string. SDKs choke on non-string arguments.
+	// `JSON.stringify` is typed as returning `string`, but at runtime it yields
+	// `undefined` for functions and symbols. Widen the type so the fallback
+	// below is checked, and treat those values like missing arguments.
+	const serialized: string | undefined = JSON.stringify(args);
+	return serialized ?? '{}';
+}
+
 function extractAssistantContent(body: unknown): string {
 	if (body === null || body === undefined) return '';
 	if (typeof body === 'string') return body;
--- a/packages/cli/src/modules/instance-ai/eval/openai-responses-envelope.ts
+++ b/packages/cli/src/modules/instance-ai/eval/openai-responses-envelope.ts
@ -0,0 +1,287 @@
+import type { EvalMockHttpResponse } from 'n8n-core';
+import type { IHttpRequestOptions } from 'n8n-workflow';
+import { randomUUID } from 'node:crypto';
+
+import { extractToolCalls, type NormalizedToolCall } from './openai-envelope';
+
+// Translation between the OpenAI Responses API (`/v1/responses`) wire format
+// and the shape `createLlmMockHandler` consumes/emits. The Responses API is
+// what `@langchain/openai` v1.3+ auto-routes to for newer chat models — the
+// chat-completions path covered by `openai-envelope.ts` is no longer the
+// default for v1.3+ Agent workflows.
+
+const OPENAI_RESPONSES_SYNTHETIC_URL = 'https://api.openai.com/v1/responses';
+
+const DEFAULT_MODEL = 'gpt-4o-mini';
+
+/**
+ * Responses API counterpart of `reverseTranslateOpenAiRequest`: wraps the
+ * inbound request body as a `POST` against the synthetic `/v1/responses` URL.
+ * A nullish body is replaced with an empty object.
+ */
+export function reverseTranslateOpenAiResponsesRequest(body: unknown): IHttpRequestOptions {
+	const request: IHttpRequestOptions = {
+		url: OPENAI_RESPONSES_SYNTHETIC_URL,
+		method: 'POST',
+		body: body ?? {},
+	};
+	return request;
+}
+
+/**
+ * Read the requested model name from a Responses API request body.
+ * Falls back to `DEFAULT_MODEL` when the body is not an object or when
+ * `.model` is missing, not a string, or empty.
+ */
+export function extractResponsesRequestModel(body: unknown): string {
+	const requested =
+		body !== null && typeof body === 'object' ? (body as { model?: unknown }).model : undefined;
+	if (typeof requested !== 'string' || requested === '') return DEFAULT_MODEL;
+	return requested;
+}
+
+/**
+ * Report whether the inbound Responses API request asked for streaming.
+ * Only the literal boolean `stream: true` on an object body counts; truthy
+ * non-boolean values and non-object bodies return `false`.
+ */
+export function isResponsesStreamRequested(body: unknown): boolean {
+	const isObject = body !== null && typeof body === 'object';
+	return isObject && (body as { stream?: unknown }).stream === true;
+}
+
+/**
+ * Build the non-streaming Responses API envelope for a mock handler response.
+ *
+ * The envelope's `output` array holds either one `function_call` item per
+ * extracted tool call, or — when there are no tool calls — a single assistant
+ * `message` carrying the extracted text. Mixing both in one response is legal
+ * but rare; here tool-call mode replaces the message.
+ *
+ * @param mockResponse - mock handler result; only `.body` is read.
+ * @param model - model name echoed back in the envelope.
+ * @returns a `response` object with `status: 'completed'` and zeroed usage.
+ */
+export function forwardTranslateToResponsesEnvelope(
+	mockResponse: EvalMockHttpResponse | undefined,
+	model: string,
+): Record<string, unknown> {
+	const body = mockResponse?.body;
+	const toolCalls = extractToolCalls(body);
+	const id = `resp_${randomUUID().replace(/-/g, '').slice(0, 32)}`;
+	const createdAt = Math.floor(Date.now() / 1000);
+
+	let output: Array<Record<string, unknown>>;
+	if (toolCalls.length > 0) {
+		output = toolCallsToResponsesOutput(toolCalls);
+	} else {
+		output = [buildAssistantMessage(extractResponsesContent(body))];
+	}
+
+	// `previous_response_id`, `instructions`, `metadata` are intentionally
+	// left out — the SDK tolerates missing optional fields, and a stub
+	// fingerprint isn't part of the Responses API envelope.
+	return {
+		id,
+		object: 'response',
+		created_at: createdAt,
+		status: 'completed',
+		model,
+		output,
+		// Mirror chat-completions: zero counts make eval cost trackers happy.
+		usage: { input_tokens: 0, output_tokens: 0, total_tokens: 0 },
+	};
+}
+
+/**
+ * Translate the mock response into the ordered list of Responses API SSE
+ * events.
+ *
+ * Every stream opens with `response.created` and `response.in_progress`, and
+ * closes with `response.completed`. In between:
+ *   - Plain message: `output_item.added` → `content_part.added` →
+ *     `output_text.delta` (skipped when the text is empty) → `output_text.done`
+ *     → `content_part.done` → `output_item.done`.
+ *   - Tool calls, once per call: `output_item.added` (arguments blanked) →
+ *     `function_call_arguments.delta` (skipped when arguments are empty) →
+ *     `function_call_arguments.done` → `output_item.done`.
+ *
+ * @param mockResponse - mock handler result; only `.body` is read.
+ * @param model - model name echoed in every embedded response snapshot.
+ * @returns `{ event, data }` pairs in emission order.
+ */
+export function forwardTranslateToResponsesSseEvents(
+	mockResponse: EvalMockHttpResponse | undefined,
+	model: string,
+): Array<{ event: string; data: Record<string, unknown> }> {
+	const responseId = `resp_${randomUUID().replace(/-/g, '').slice(0, 32)}`;
+	const createdAt = Math.floor(Date.now() / 1000);
+	const body = mockResponse?.body;
+	const toolCalls = extractToolCalls(body);
+
+	const events: Array<{ event: string; data: Record<string, unknown> }> = [];
+	const emit = (event: string, data: Record<string, unknown>): void => {
+		events.push({ event, data });
+	};
+
+	// Response object embedded in the lifecycle events (created / in_progress / completed).
+	const snapshot = (status: string, output: unknown[]) => ({
+		id: responseId,
+		object: 'response',
+		created_at: createdAt,
+		status,
+		model,
+		output,
+		usage: { input_tokens: 0, output_tokens: 0, total_tokens: 0 },
+	});
+
+	for (const lifecycleEvent of ['response.created', 'response.in_progress']) {
+		emit(lifecycleEvent, { response: snapshot('in_progress', []) });
+	}
+
+	if (toolCalls.length > 0) {
+		// Items are built once up front so each `id` stays stable across every
+		// event the SDK reconciles (added / delta / done / completed.output[i]).
+		const finalItems = toolCallsToResponsesOutput(toolCalls);
+		for (const [outputIndex, call] of toolCalls.entries()) {
+			const completedItem = finalItems[outputIndex];
+			const position = { item_id: completedItem.id as string, output_index: outputIndex };
+
+			emit('response.output_item.added', {
+				output_index: outputIndex,
+				item: { ...completedItem, arguments: '' },
+			});
+			if (call.arguments.length > 0) {
+				emit('response.function_call_arguments.delta', { ...position, delta: call.arguments });
+			}
+			emit('response.function_call_arguments.done', { ...position, arguments: call.arguments });
+			emit('response.output_item.done', { output_index: outputIndex, item: completedItem });
+		}
+		emit('response.completed', { response: snapshot('completed', finalItems) });
+		return events;
+	}
+
+	// Plain message mode.
+	const text = extractResponsesContent(body);
+	const messageId = `msg_${randomUUID().replace(/-/g, '').slice(0, 16)}`;
+	// `annotations: []` is required — LangChain's extractor calls `.annotations.map(...)`.
+	const textPart = (value: string) => ({
+		type: 'output_text' as const,
+		text: value,
+		annotations: [],
+	});
+	const messageShell = {
+		id: messageId,
+		type: 'message' as const,
+		role: 'assistant' as const,
+	};
+	const completedMessage = {
+		...messageShell,
+		content: [textPart(text)],
+		status: 'completed' as const,
+	};
+	const position = { item_id: messageId, output_index: 0, content_index: 0 };
+
+	emit('response.output_item.added', {
+		output_index: 0,
+		item: { ...messageShell, content: [textPart('')], status: 'in_progress' },
+	});
+	emit('response.content_part.added', { ...position, part: textPart('') });
+	if (text.length > 0) {
+		emit('response.output_text.delta', { ...position, delta: text });
+	}
+	emit('response.output_text.done', { ...position, text });
+	emit('response.content_part.done', { ...position, part: textPart(text) });
+	emit('response.output_item.done', { output_index: 0, item: completedMessage });
+	emit('response.completed', { response: snapshot('completed', [completedMessage]) });
+
+	return events;
+}
+
+/**
+ * Build the error envelope returned for Responses API failures. It has the
+ * same `{ error: { message, type, code, param } }` shape as the
+ * chat-completions envelope, with `error.type` describing the failure.
+ */
+export function buildResponsesErrorEnvelope(message: string): Record<string, unknown> {
+	const error = {
+		message,
+		type: 'eval_wire_server_error',
+		code: 'eval_mock_generation_failed',
+		param: null,
+	};
+	return { error };
+}
+
+/**
+ * Map normalized tool calls to Responses API `function_call` output items.
+ * Each item gets a freshly generated `fc_<16 hex chars>` item id; the tool
+ * call's own id is carried as `call_id`. Input order is preserved.
+ */
+function toolCallsToResponsesOutput(
+	toolCalls: NormalizedToolCall[],
+): Array<Record<string, unknown>> {
+	const items: Array<Record<string, unknown>> = [];
+	for (const { id: callId, name, arguments: serializedArgs } of toolCalls) {
+		const itemId = `fc_${randomUUID().replace(/-/g, '').slice(0, 16)}`;
+		items.push({
+			id: itemId,
+			type: 'function_call',
+			call_id: callId,
+			name,
+			arguments: serializedArgs,
+		});
+	}
+	return items;
+}
+
+/**
+ * Build a completed assistant `message` output item wrapping `text` in a
+ * single `output_text` content part, under a generated `msg_<16 hex chars>` id.
+ */
+function buildAssistantMessage(text: string): Record<string, unknown> {
+	const messageId = `msg_${randomUUID().replace(/-/g, '').slice(0, 16)}`;
+	// `annotations: []` is required — LangChain's extractor calls `.annotations.map(...)`.
+	const textPart = { type: 'output_text', text, annotations: [] };
+	return {
+		id: messageId,
+		type: 'message',
+		role: 'assistant',
+		status: 'completed',
+		content: [textPart],
+	};
+}
+
+/**
+ * Tolerant extractor for the assistant text in a mock handler body.
+ *
+ * Resolution order:
+ *   1. `null` / `undefined` → `''`; a string body is returned as-is; any other
+ *      primitive is stringified.
+ *   2. `output_text` when it is a string.
+ *   3. The first string `text` found while scanning `output[].content[]` in
+ *      order. Every content part is inspected, so a text part that follows a
+ *      non-text part (e.g. a refusal) is still found.
+ *   4. `content`, then `message`, when either is a string.
+ *   5. Otherwise the whole body serialized with `JSON.stringify`.
+ *
+ * @param body - mock handler response body of unknown shape.
+ * @returns the extracted text, or the JSON dump of `body` as a last resort.
+ */
+function extractResponsesContent(body: unknown): string {
+	if (body === null || body === undefined) return '';
+	if (typeof body === 'string') return body;
+	if (typeof body !== 'object') return String(body as number | boolean | bigint);
+
+	const obj = body as Record<string, unknown>;
+
+	if (typeof obj.output_text === 'string') return obj.output_text;
+
+	if (Array.isArray(obj.output)) {
+		for (const item of obj.output as unknown[]) {
+			if (typeof item !== 'object' || item === null) continue;
+			const parts = (item as { content?: unknown }).content;
+			if (!Array.isArray(parts)) continue;
+			// Scan every part, not just the first: the text part is not
+			// guaranteed to sit at index 0.
+			for (const part of parts as unknown[]) {
+				if (typeof part !== 'object' || part === null) continue;
+				const text = (part as { text?: unknown }).text;
+				if (typeof text === 'string') return text;
+			}
+		}
+	}
+
+	if (typeof obj.content === 'string') return obj.content;
+	if (typeof obj.message === 'string') return obj.message;
+
+	return JSON.stringify(body);
+}
--- a/packages/cli/src/modules/instance-ai/eval/workflow-analysis.ts
+++ b/packages/cli/src/modules/instance-ai/eval/workflow-analysis.ts
@ -2,6 +2,7 @@ import { Logger } from '@n8n/backend-common';
 import { Container } from '@n8n/di';
 import { createEvalAgent, extractText } from '@n8n/instance-ai';
 import {
+	findAiRootNodeNames,
 	type INode,
 	type IPinData,
 	type IWorkflowBase,
@ -12,25 +13,6 @@ import {

 import { extractNodeConfig } from './node-config';

-/** Targets of `ai_*` connections — Agent/Chain root nodes. Pinning these short-circuits sub-node SDK calls. */
-function findAiRootNodeNames(workflow: IWorkflowBase): Set<string> {
-	const roots = new Set<string>();
-	for (const nodeConns of Object.values(workflow.connections)) {
-		for (const [connType, outputs] of Object.entries(nodeConns)) {
-			if (!connType.startsWith('ai_') || !Array.isArray(outputs)) continue;
-			for (const group of outputs) {
-				if (!Array.isArray(group)) continue;
-				for (const conn of group) {
-					if (typeof conn === 'object' && conn !== null && 'node' in conn) {
-						roots.add((conn as { node: string }).node);
-					}
-				}
-			}
-		}
-	}
-	return roots;
-}
-
 /**
 * AI root node types — lets the typo guard accept a no-sub-node Agent.
 * Keep in sync with new agent/chain types in `@n8n/n8n-nodes-langchain`.
@ -116,7 +98,7 @@ export function identifyNodesForPinData(
 	workflow: IWorkflowBase,
 	exclusionSet?: Set<string>,
 ): INode[] {
-	const aiRootNodes = findAiRootNodeNames(workflow);
+	const aiRootNodes = findAiRootNodeNames(workflow.connections);

 	return workflow.nodes.filter((node) => {
 		if (node.disabled) return false;
@ -126,19 +108,21 @@ export function identifyNodesForPinData(
 	});
 }

-type UnpinRefusal = {
+export type AutoPinReason =
+	| 'protocol_binary'
+	| 'unsupported_vendor_llm'
+	| 'unsafe_baseurl_override'
+	| 'shared_vendor_llm_subnode';
+
+export interface AutoPinEntry {
 	root: string;
 	subNode: string;
 	subNodeType: string;
-	reason:
-		| 'protocol_binary'
-		| 'unsupported_vendor_llm'
-		| 'unsafe_baseurl_override'
-		| 'shared_vendor_llm_subnode';
-};
+	reason: AutoPinReason;
+}

-// Routing maps for vendor SDK interception. `assertUnpinCompatibility`
-// refuses shared sub-node topologies, so each sub-node maps to one root.
+// Routing maps for vendor SDK interception. `partitionAiRoots` auto-pins
+// shared-sub-node topologies, so each remaining sub-node maps to one root.
 export interface VendorLlmRouting {
 	subNodeToRoot: Map<string, string>;
 	rootToSubNode: Map<string, INode>;
@ -175,6 +159,17 @@ export function buildVendorLlmRouting(
 					}
 					if (!rootToSubNode.has(rootName)) {
 						rootToSubNode.set(rootName, subNode);
+						// Self-map the root: `LmChatOpenAi.supplyData()` reads
+						// `getCredentials('openAiApi')` from a context whose
+						// `executeData.node` is sometimes the parent Agent rather
+						// than the LLM sub-node — observed empirically against a
+						// real LangChain Agent. Without this entry the credential
+						// helper's lookup misses, falls back to the no-root URL,
+						// and the wire server's loud-fail handler rejects the
+						// SDK call. Self-mapping the root keeps the lookup honest
+						// regardless of which side of the supplyData boundary
+						// asked for the credential.
+						subNodeToRoot.set(rootName, rootName);
 					}
 				}
 			}
@ -184,20 +179,102 @@ export function buildVendorLlmRouting(
 	return { subNodeToRoot, rootToSubNode };
 }

-/** Throws if any unpinned AI root has a sub-node we can't intercept: protocol-binary, unmapped vendor LLM, or unsafe baseURL override. Also refuses entries that don't resolve to an enabled AI root (typo guard). */
-export function assertUnpinCompatibility(workflow: IWorkflowBase, unpinNodes: string[]): void {
-	if (unpinNodes.length === 0) return;
+export interface PartitionedAiRoots {
+	/** Names of AI roots that will run through the wire-server interception path. */
+	unpinNodes: string[];
+	/** Names of AI roots that will remain pinned — explicit `pinNodes` + auto-pinned roots. */
+	pinNodes: string[];
+	/** Per-(root, sub-node) reasons a root was auto-pinned, for diagnostic logging. */
+	autoPinned: AutoPinEntry[];
+}

+/**
+ * Default-on partition: every AI root in the workflow runs through the wire
+ * server unless one of these applies:
+ *   - It's in the caller-supplied `explicitPinNodes` list (opt-out for nodes
+ *     the caller wants to keep pinned, e.g. for an A/B comparison).
+ *   - One of its inbound `ai_*` sub-nodes is incompatible (protocol-binary
+ *     memory/vector store, unsupported vendor LLM, configured
+ *     `options.baseURL` that bypasses the credential rewrite).
+ *   - It shares a supported vendor LLM sub-node with another root — wire-
+ *     server attribution is path-based and first-wins, so multiple roots
+ *     fanning into the same sub-node would mis-attribute later turns. Both
+ *     sides get auto-pinned.
+ *
+ * `explicitPinNodes` is validated up front: unknown / disabled / non-AI-root
+ * entries throw a `UserError` to surface typos as actionable errors instead
+ * of being silently ignored.
+ */
+export function partitionAiRoots(
+	workflow: IWorkflowBase,
+	explicitPinNodes: string[] = [],
+): PartitionedAiRoots {
 	const nodesByName = new Map(workflow.nodes.map((n) => [n.name, n]));
 	const connectionsByDestination = mapConnectionsByDestination(workflow.connections);
-	const aiRootNodes = findAiRootNodeNames(workflow);
+	const allRoots = findAiRootNodeNames(workflow.connections);

-	// Refuse typos / disabled / non-AI-root entries up front. A root counts
-	// if it has inbound ai_* connections OR its type is on AI_ROOT_NODE_TYPES.
+	validateExplicitPinNodes(nodesByName, allRoots, explicitPinNodes);
+
+	const explicitPinSet = new Set(explicitPinNodes);
+	const sharedSupportedSubNodes = trackSharedSupportedSubNodes(
+		connectionsByDestination,
+		nodesByName,
+		allRoots,
+		explicitPinSet,
+	);
+
+	const autoPinned: AutoPinEntry[] = [];
+	const pinSet = new Set<string>(explicitPinNodes);
+
+	for (const rootName of allRoots) {
+		if (explicitPinSet.has(rootName)) continue;
+
+		const inbound = connectionsByDestination[rootName];
+		if (!inbound) continue;
+
+		for (const [connType, groups] of Object.entries(inbound)) {
+			if (!connType.startsWith('ai_') || !Array.isArray(groups)) continue;
+			for (const group of groups) {
+				if (!Array.isArray(group)) continue;
+				for (const conn of group) {
+					const sourceNode = nodesByName.get(conn.node);
+					if (!sourceNode || sourceNode.disabled) continue;
+
+					const reason = categorizeSubNodeIncompatibility(sourceNode, sharedSupportedSubNodes);
+					if (reason === null) continue;
+
+					autoPinned.push({
+						root: rootName,
+						subNode: sourceNode.name,
+						subNodeType: sourceNode.type,
+						reason,
+					});
+					pinSet.add(rootName);
+				}
+			}
+		}
+	}
+
+	const unpinNodes: string[] = [];
+	const pinNodes: string[] = [];
+	for (const rootName of allRoots) {
+		if (pinSet.has(rootName)) pinNodes.push(rootName);
+		else unpinNodes.push(rootName);
+	}
+
+	return { unpinNodes, pinNodes, autoPinned };
+}
+
+/** Throw `UserError` if any explicit pin entry isn't a real, enabled AI root in the workflow. */
+function validateExplicitPinNodes(
+	nodesByName: Map<string, INode>,
+	aiRootNodes: Set<string>,
+	explicitPinNodes: string[],
+): void {
 	const unknownRoots: string[] = [];
 	const disabledRoots: string[] = [];
 	const nonAiRoots: string[] = [];
-	for (const rootName of unpinNodes) {
+	for (const rootName of explicitPinNodes) {
 		const node = nodesByName.get(rootName);
 		if (!node) unknownRoots.push(rootName);
 		else if (node.disabled) disabledRoots.push(rootName);
@ -211,21 +288,28 @@ export function assertUnpinCompatibility(workflow: IWorkflowBase, unpinNodes: st
 		if (unknownRoots.length) parts.push(`not found in workflow: ${formatNames(unknownRoots)}`);
 		if (disabledRoots.length) parts.push(`disabled: ${formatNames(disabledRoots)}`);
 		if (nonAiRoots.length) parts.push(`not AI root nodes: ${formatNames(nonAiRoots)}`);
-		throw new UserError(`Cannot unpin — ${parts.join('; ')}.`);
+		throw new UserError(`Cannot pin — ${parts.join('; ')}.`);
 	}
+}

-	const refusals: UnpinRefusal[] = [];
-	// Track which unpinned roots each supported vendor LLM sub-node feeds.
-	// A sub-node feeding ≥2 unpinned roots can't be attributed correctly —
-	// the wire server's path-based root token is baked into the credential
-	// URL at resolution time (first-wins), so later turns from the same
-	// sub-node would mis-attribute to the first root.
-	const sharedSupportedSubNodes = new Map<string, { type: string; roots: Set<string> }>();
-
-	for (const rootName of unpinNodes) {
+/**
+ * Walk every AI root in the workflow and record which supported vendor LLM
+ * sub-nodes feed more than one root. Used by `categorizeSubNodeIncompatibility`
+ * so both sides of a shared sub-node get auto-pinned (attribution would be
+ * ambiguous otherwise). Roots in `explicitPinSet` don't contribute — pinning
+ * them removes the ambiguity.
+ */
+function trackSharedSupportedSubNodes(
+	connectionsByDestination: ReturnType<typeof mapConnectionsByDestination>,
+	nodesByName: Map<string, INode>,
+	allRoots: Set<string>,
+	explicitPinSet: Set<string>,
+): Set<string> {
+	const usage = new Map<string, Set<string>>();
+	for (const rootName of allRoots) {
+		if (explicitPinSet.has(rootName)) continue;
 		const inbound = connectionsByDestination[rootName];
 		if (!inbound) continue;
-
 		for (const [connType, groups] of Object.entries(inbound)) {
 			if (!connType.startsWith('ai_') || !Array.isArray(groups)) continue;
 			for (const group of groups) {
@ -233,101 +317,44 @@ export function assertUnpinCompatibility(workflow: IWorkflowBase, unpinNodes: st
 				for (const conn of group) {
 					const sourceNode = nodesByName.get(conn.node);
 					if (!sourceNode || sourceNode.disabled) continue;
-
-					if (SUPPORTED_VENDOR_LLM_SUB_NODE_TYPES.has(sourceNode.type)) {
-						const tracked = sharedSupportedSubNodes.get(sourceNode.name) ?? {
-							type: sourceNode.type,
-							roots: new Set<string>(),
-						};
-						tracked.roots.add(rootName);
-						sharedSupportedSubNodes.set(sourceNode.name, tracked);
-					}
-
-					const reason = categorizeSubNodeRefusal(sourceNode);
-					if (reason === null) continue;
-					refusals.push({
-						root: rootName,
-						subNode: sourceNode.name,
-						subNodeType: sourceNode.type,
-						reason,
-					});
+					if (!SUPPORTED_VENDOR_LLM_SUB_NODE_TYPES.has(sourceNode.type)) continue;
+					const tracked = usage.get(sourceNode.name) ?? new Set<string>();
+					tracked.add(rootName);
+					usage.set(sourceNode.name, tracked);
 				}
 			}
 		}
 	}
-
-	// Emit a `shared_vendor_llm_subnode` refusal for every sub-node feeding
-	// more than one unpinned root. One entry per offending (root, sub-node)
-	// pair so the error message lists every conflict.
-	for (const [subNodeName, { type, roots }] of sharedSupportedSubNodes) {
-		if (roots.size < 2) continue;
-		for (const rootName of roots) {
-			refusals.push({
-				root: rootName,
-				subNode: subNodeName,
-				subNodeType: type,
-				reason: 'shared_vendor_llm_subnode',
-			});
-		}
+	const shared = new Set<string>();
+	for (const [subNodeName, roots] of usage) {
+		if (roots.size >= 2) shared.add(subNodeName);
 	}
-
-	if (refusals.length === 0) return;
-
-	const segments = [
-		formatRefusalSegment(
-			refusals,
-			'protocol_binary',
-			'protocol-binary sub-nodes (cannot be intercepted via HTTP)',
-		),
-		formatRefusalSegment(
-			refusals,
-			'unsupported_vendor_llm',
-			'unsupported vendor LLM sub-nodes (no eval URL-rewrite mapping yet)',
-		),
-		formatRefusalSegment(
-			refusals,
-			'unsafe_baseurl_override',
-			'vendor LLM sub-nodes with a configured options.baseURL that bypasses the credential rewrite',
-		),
-		formatRefusalSegment(
-			refusals,
-			'shared_vendor_llm_subnode',
-			'vendor LLM sub-nodes shared by multiple unpinned roots (attribution would be ambiguous)',
-		),
-	].filter((s): s is string => s !== undefined);
-
-	throw new UserError(
-		`Cannot unpin AI root nodes — ${segments.join('; ')}. ` +
-			'Leave these roots pinned, remove the parameter override, or replace the sub-node with one that has interception support.',
-	);
+	return shared;
 }

-/** Classify a sub-node into one of the three refusal reasons, or null if acceptable. Order matters: protocol-binary, then baseURL-override on a supported vendor, then unsupported `lm*`. */
-function categorizeSubNodeRefusal(sourceNode: INode): UnpinRefusal['reason'] | null {
+/**
+ * Return the auto-pin reason for a sub-node, or null if it's safe to intercept.
+ * Order: protocol-binary (HTTP can't reach it) → shared (attribution ambiguous) →
+ * supported-vendor-with-baseURL-override (SDK bypasses the rewrite) → unsupported
+ * vendor LLM (no URL-rewrite mapping yet).
+ */
+function categorizeSubNodeIncompatibility(
+	sourceNode: INode,
+	sharedSupportedSubNodes: Set<string>,
+): AutoPinReason | null {
 	if (PROTOCOL_BINARY_SUB_NODE_TYPES.has(sourceNode.type)) return 'protocol_binary';
 	if (SUPPORTED_VENDOR_LLM_SUB_NODE_TYPES.has(sourceNode.type)) {
+		if (sharedSupportedSubNodes.has(sourceNode.name)) return 'shared_vendor_llm_subnode';
 		return hasUnsafeBaseUrlOverride(sourceNode) ? 'unsafe_baseurl_override' : null;
 	}
 	if (isVendorLlmSubNode(sourceNode.type)) return 'unsupported_vendor_llm';
 	return null;
 }

-/** One segment of the `assertUnpinCompatibility` error message, or undefined when no refusals match. */
-function formatRefusalSegment(
-	refusals: UnpinRefusal[],
-	reason: UnpinRefusal['reason'],
-	label: string,
-): string | undefined {
-	const matching = refusals.filter((r) => r.reason === reason);
-	if (matching.length === 0) return undefined;
-	const pairs = matching.map((r) => `"${r.subNode}" (${r.subNodeType}) → "${r.root}"`).join(', ');
-	return `${label}: ${pairs}`;
-}
-
 /** Nodes that should receive mock hints — excludes AI sub-nodes (handled via root) and pinned nodes. */
 export function identifyNodesForHints(workflow: IWorkflowBase): INode[] {
 	const aiSubNodes = findAiSubNodeNames(workflow);
-	const aiRootNodes = findAiRootNodeNames(workflow);
+	const aiRootNodes = findAiRootNodeNames(workflow.connections);
 	const pinnedNodeNames = new Set(identifyNodesForPinData(workflow).map((n) => n.name));

 	return workflow.nodes.filter((node) => {
--- a/packages/core/src/execution-engine/index.ts
+++ b/packages/core/src/execution-engine/index.ts
@ -97,3 +97,7 @@ export { ExternalSecretsProxy, type IExternalSecretsManager } from './external-s
 export { ExecutionContextService } from './execution-context.service';
 export { establishExecutionContext } from './execution-context';
 export { isEngineRequest } from './requests-response';
+// Exposed so eval-mode credential helpers (e.g. `EvalMockedCredentialsHelper`)
+// can reuse the same schema-driven cred synthesizer the wire-server URL
+// rewrite expects. See its `getDecrypted` catch path for the consumer.
+export { buildEvalMockCredentials } from './eval-mock-helpers';
--- a/packages/core/src/execution-engine/node-execution-context/node-execution-context.ts
+++ b/packages/core/src/execution-engine/node-execution-context/node-execution-context.ts
@ -314,13 +314,21 @@ export abstract class NodeExecutionContext implements Omit<FunctionsBase, 'getCr

 		// Eval-mode bypass: only mock when the node is fully unconfigured, so
 		// nodes that probe multiple auth types still get production's throw.
+		// Delegates to the credentials helper with a null-id `INodeCredentialsDetails`;
+		// `EvalMockedCredentialsHelper` catches the resulting `CredentialNotFoundError`
+		// and schema-synthesizes (and applies the wire-server URL rewrite). Production
+		// helpers don't catch — but production never reaches this branch because
+		// `evalLlmMockHandler` is only set in eval mode.
 		if (mode === 'evaluation' && additionalData.evalLlmMockHandler && !node.credentials?.[type]) {
 			const hasOtherCreds = !!node.credentials && Object.keys(node.credentials).length > 0;
 			if (!hasOtherCreds) {
-				const { buildEvalMockCredentials } = await import('../eval-mock-helpers');
-				return buildEvalMockCredentials(
-					additionalData.credentialsHelper.getCredentialsProperties(type),
-				) as T;
+				return (await additionalData.credentialsHelper.getDecrypted(
+					additionalData,
+					{ id: null, name: type },
+					type,
+					mode,
+					executeData,
+				)) as T;
 			}
 		}

--- a/packages/workflow/src/common/find-ai-root-node-names.ts
+++ b/packages/workflow/src/common/find-ai-root-node-names.ts
@ -0,0 +1,38 @@
+/** Type guard: `value` is a non-null object (arrays included), read as a string-keyed record. */
+function isObjectRecord(value: unknown): value is Record<string, unknown> {
+	return typeof value === 'object' && value !== null;
+}
+
+// `Array.isArray` narrows to `any[]` in lib.es5.d.ts; wrap it so the elements
+// stay typed as `unknown` and downstream checks have to narrow explicitly.
+function isUnknownArray(value: unknown): value is readonly unknown[] {
+	return Array.isArray(value);
+}
+
+/**
+ * AI root nodes are the target of any `ai_*` connection — Agent/Chain nodes
+ * to which language model, memory, tool, etc. sub-nodes attach. Pinning these
+ * during eval short-circuits sub-node SDK calls.
+ *
+ * Accepts `unknown` so callers reading workflow JSON from the wire (which
+ * arrives as `Record<string, unknown>`) can use it without an `as` cast.
+ * Typed-`IConnections` callers assign in without widening.
+ *
+ * @param connections - workflow connections map, keyed by source node name,
+ *   then by connection type, holding an array of output groups of
+ *   `{ node, ... }` targets. Anything not matching that shape is skipped.
+ * @returns the names of every node targeted by a connection whose type starts
+ *   with `ai_`; empty when `connections` is not an object.
+ */
+export function findAiRootNodeNames(connections: unknown): Set<string> {
+	const roots = new Set<string>();
+	if (!isObjectRecord(connections)) return roots;
+	for (const nodeConns of Object.values(connections)) {
+		if (!isObjectRecord(nodeConns)) continue;
+		for (const [connType, outputs] of Object.entries(nodeConns)) {
+			if (!connType.startsWith('ai_') || !isUnknownArray(outputs)) continue;
+			for (const group of outputs) {
+				// Output groups may be `null` placeholders for unconnected outputs.
+				if (!isUnknownArray(group)) continue;
+				for (const conn of group) {
+					if (isObjectRecord(conn) && typeof conn.node === 'string') {
+						roots.add(conn.node);
+					}
+				}
+			}
+		}
+	}
+	return roots;
+}
--- a/packages/workflow/src/common/index.ts
+++ b/packages/workflow/src/common/index.ts
@ -1,3 +1,4 @@
+export * from './find-ai-root-node-names';
 export * from './get-child-nodes';
 export * from './get-connected-nodes';
 export * from './get-node-by-name';
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@ -240,6 +240,9 @@ catalogs:
    nanoid:
      specifier: 3.3.8
      version: 3.3.8
+    openai:
+      specifier: 6.19.0
+      version: 6.19.0
    oxlint:
      specifier: ^1.61.0
      version: 1.61.0
@ -2955,7 +2958,7 @@ importers:
        version: 9.0.3
      langsmith:
        specifier: 0.6.0
-        version: 0.6.0(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))
+        version: 0.6.0(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.19.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))
      ldapts:
        specifier: 4.2.6
        version: 4.2.6
@ -3194,6 +3197,9 @@ importers:
      n8n-containers:
        specifier: workspace:*
        version: link:../testing/containers
+      openai:
+        specifier: 'catalog:'
+        version: 6.19.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67)
      openapi-types:
        specifier: ^12.1.3
        version: 12.1.3
@ -17383,6 +17389,18 @@ packages:
    resolution: {integrity: sha512-MVHddDVweXZF3awtlAS+6pgKLlm/JgxZ90+/NBurBoQctVOOB/zDdVjcyPzQ+0laDGbsWgrRkflI65sQeOgT9Q==}
    engines: {node: '>=8'}

+  openai@6.19.0:
+    resolution: {integrity: sha512-5uGrF82Ql7TKgIWUnuxh+OyzYbPRPwYDSgGc05JowbXRFsOkuj0dJuCdPCTBZT4mcmp2NEvj/URwDzW+lYgmVw==}
+    hasBin: true
+    peerDependencies:
+      ws: '>=8.20.1'
+      zod: 3.25.67
+    peerDependenciesMeta:
+      ws:
+        optional: true
+      zod:
+        optional: true
+
  openai@6.34.0:
    resolution: {integrity: sha512-yEr2jdGf4tVFYG6ohmr3pF6VJuveP0EA/sS8TBx+4Eq5NT10alu5zg2dmxMXMgqpihRDQlFGpRt2XwsGj+Fyxw==}
    hasBin: true
@ -35048,6 +35066,16 @@ snapshots:
      - ws
      - zod-to-json-schema

+  langsmith@0.6.0(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.19.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)):
+    dependencies:
+      p-queue: 6.6.2
+    optionalDependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/exporter-trace-otlp-proto': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 2.7.1(@opentelemetry/api@1.9.0)
+      openai: 6.19.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67)
+      ws: 8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)
+
  langsmith@0.6.0(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)):
    dependencies:
      p-queue: 6.6.2
@ -37083,6 +37111,11 @@ snapshots:
      is-docker: 2.2.1
      is-wsl: 2.2.0

+  openai@6.19.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67):
+    optionalDependencies:
+      ws: 8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)
+      zod: 3.25.67
+
  openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67):
    optionalDependencies:
      ws: 8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)