From 55d8b59a4825f7702ae615fbd947e01f7bb0b7d5 Mon Sep 17 00:00:00 2001 From: Arvin A <51036481+DeveloperTheExplorer@users.noreply.github.com> Date: Wed, 27 May 2026 15:53:43 +0200 Subject: [PATCH] feat(core): Stream tool calls and ship M3 fixtures from LLM eval wire server (no-changelog) (#30983) --- .../evaluations/support/pin-data-generator.ts | 26 +- .../src/schemas/instance-ai.schema.ts | 37 +- .../@n8n/instance-ai/evaluations/cli/args.ts | 18 + .../@n8n/instance-ai/evaluations/cli/index.ts | 2 + .../evaluations/clients/n8n-client.ts | 13 +- .../instance-ai/evaluations/harness/runner.ts | 30 +- packages/cli/package.json | 1 + .../eval-mocked-credentials-helper.test.ts | 155 +++++ .../eval/__tests__/execution.service.test.ts | 304 ++++++---- .../eval/__tests__/llm-wire-server.test.ts | 564 +++++++++++++++++- .../eval/__tests__/m3-fixtures.test.ts | 496 +++++++++++++++ .../eval/__tests__/openai-envelope.test.ts | 344 +++++++++++ .../openai-responses-envelope.test.ts | 367 ++++++++++++ .../eval/__tests__/workflow-analysis.test.ts | 540 +++++++---------- .../eval/eval-mocked-credentials-helper.ts | 18 +- .../instance-ai/eval/execution.service.ts | 28 +- .../instance-ai/eval/llm-wire-server.ts | 236 ++++++-- .../instance-ai/eval/openai-envelope.ts | 180 +++++- .../eval/openai-responses-envelope.ts | 287 +++++++++ .../instance-ai/eval/workflow-analysis.ts | 275 +++++---- packages/core/src/execution-engine/index.ts | 4 + .../node-execution-context.ts | 16 +- .../src/common/find-ai-root-node-names.ts | 38 ++ packages/workflow/src/common/index.ts | 1 + pnpm-lock.yaml | 35 +- 25 files changed, 3346 insertions(+), 669 deletions(-) create mode 100644 packages/cli/src/modules/instance-ai/eval/__tests__/m3-fixtures.test.ts create mode 100644 packages/cli/src/modules/instance-ai/eval/__tests__/openai-responses-envelope.test.ts create mode 100644 packages/cli/src/modules/instance-ai/eval/openai-responses-envelope.ts create mode 100644 packages/workflow/src/common/find-ai-root-node-names.ts diff --git a/packages/@n8n/ai-workflow-builder.ee/evaluations/support/pin-data-generator.ts b/packages/@n8n/ai-workflow-builder.ee/evaluations/support/pin-data-generator.ts index 25f88385f33..cc0da856321 100644 --- a/packages/@n8n/ai-workflow-builder.ee/evaluations/support/pin-data-generator.ts +++ b/packages/@n8n/ai-workflow-builder.ee/evaluations/support/pin-data-generator.ts @@ -10,6 +10,7 @@ import type { BaseChatModel } from '@langchain/core/language_models/chat_models' import { HumanMessage, SystemMessage } from '@langchain/core/messages'; import { existsSync, readFileSync, readdirSync } from 'fs'; import { + findAiRootNodeNames, jsonParse, type IDataObject, type INode, @@ -63,29 +64,6 @@ const NON_SERVICE_NODES_WITH_CREDENTIALS = new Set([ // Node identification // --------------------------------------------------------------------------- -/** - * Build a set of node names that are targets of AI-type connections - * (ai_languageModel, ai_tool, ai_memory, etc.). These are root AI nodes - * (e.g. Agent, Chain) whose sub-nodes can't be individually pinned. - * Pinning the root prevents sub-node execution entirely. - */ -function findAiRootNodeNames(workflow: SimpleWorkflow): Set { - const roots = new Set(); - for (const nodeConns of Object.values(workflow.connections)) { - for (const [connType, outputs] of Object.entries(nodeConns)) { - if (!connType.startsWith('ai_')) continue; - if (!Array.isArray(outputs)) continue; - for (const group of outputs) { - if (!Array.isArray(group)) continue; - for (const conn of group) { - if (conn?.node) roots.add(conn.node); - } - } - } - } - return roots; -} - /** * Identify which nodes in a workflow need pin data. * In eval context, we pin all service/API nodes since none have real credentials. @@ -95,7 +73,7 @@ export function identifyPinDataNodes( nodeTypes: INodeTypeDescription[], ): INode[] { const nodeTypeMap = new Map(nodeTypes.map((nt) => [nt.name, nt])); - const aiRootNodes = findAiRootNodeNames(workflow); + const aiRootNodes = findAiRootNodeNames(workflow.connections); return workflow.nodes.filter((node) => { // Skip disabled nodes diff --git a/packages/@n8n/api-types/src/schemas/instance-ai.schema.ts b/packages/@n8n/api-types/src/schemas/instance-ai.schema.ts index 43e4dea933f..2345b73c434 100644 --- a/packages/@n8n/api-types/src/schemas/instance-ai.schema.ts +++ b/packages/@n8n/api-types/src/schemas/instance-ai.schema.ts @@ -1127,10 +1127,8 @@ export const EVAL_VENDOR_SDK_INTERCEPTION_FLAG = '085_eval_vendor_sdk_intercepti /** * Records a credential field that was rewritten (e.g. routed to the eval wire - * server) during evaluation. Populated when the caller opts into the unpin - * path via `InstanceAiEvalExecutionRequest.unpinNodes`. Field added in the - * foundation PR; the rewrite path itself is wired up in a later PR and stays - * empty until then. + * server) during evaluation. Populated for every AI root the server intercepts; + * empty when the kill-switch is off or every root was auto-/explicit-pinned. */ export interface InstanceAiEvalRewrittenCredential { nodeName: string; @@ -1152,29 +1150,20 @@ export interface InstanceAiEvalExecutionResult { export class InstanceAiEvalExecutionRequest extends Z.class({ scenarioHints: z.string().max(2000).optional(), /** - * AI root node names (Agent, Chain, etc.) whose sub-nodes should run their - * real vendor SDK code instead of being pinned. The eval pipeline rewrites - * matching credentials so vendor traffic lands on the eval wire server. + * AI root nodes (Agent, Chain) that should stay pinned — opt-out from the + * default-on wire-server interception path. Useful when the caller wants + * to keep a specific root on the pinned baseline (e.g. for A/B comparison) + * even though its sub-nodes are interceptable. * - * The compatibility guard refuses the request up front (no execution - * attempted) when any inbound `ai_*` sub-node of a requested root falls - * into one of these categories: - * - **Protocol-binary client**: Postgres/Redis/MongoDB memory, native - * vector stores (PGVector / Mongo / Redis / Milvus). These don't - * speak HTTP and can't be intercepted by the wire server. - * - **Unsupported vendor LLM**: any `@n8n/n8n-nodes-langchain.lm*` node - * not yet on the supported list (currently `lmChatOpenAi` only). - * These would call the real provider with real credentials because - * there's no eval URL-rewrite mapping for them. - * - **Unsafe `options.baseURL` override**: a supported vendor LLM - * configured with a non-empty `options.baseURL` parameter. The SDK - * prefers that over the rewritten credential URL, so the override - * would bypass the wire server. + * The server auto-pins AI roots whose inbound `ai_*` sub-nodes are + * incompatible (protocol-binary memory/vector store, unsupported vendor + * LLM, configured `options.baseURL` override, shared with another root) + * — callers do not need to list those here. * - * Refused requests come back as an error-shaped `InstanceAiEvalExecutionResult` - * with the offending root → sub-node pairs listed in `errors`. + * Validated up front: unknown / disabled / non-AI-root names come back + * as an error-shaped `InstanceAiEvalExecutionResult`. */ - unpinNodes: z.array(z.string().min(1)).max(50).optional(), + pinNodes: z.array(z.string().min(1)).max(50).optional(), }) {} // --------------------------------------------------------------------------- diff --git a/packages/@n8n/instance-ai/evaluations/cli/args.ts b/packages/@n8n/instance-ai/evaluations/cli/args.ts index 6412d4277bd..8805c23c9aa 100644 --- a/packages/@n8n/instance-ai/evaluations/cli/args.ts +++ b/packages/@n8n/instance-ai/evaluations/cli/args.ts @@ -47,6 +47,10 @@ export interface CliArgs { /** Number of iterations to run each test case (default: 1). Each iteration * gets a fresh build so pass@k / pass^k capture real builder variance. */ iterations: number; + /** AI root nodes (Agent, Chain) to keep pinned — opt-out from the default-on + * wire-server interception path. Useful for A/B comparison or when a + * specific root needs to stay on the pinned baseline. CSV of node names. */ + pinAiRoots?: string[]; } // --------------------------------------------------------------------------- @@ -68,6 +72,7 @@ const cliArgsSchema = z.object({ concurrency: z.number().int().positive().default(16), experimentName: z.string().optional(), iterations: z.number().int().positive().default(1), + pinAiRoots: z.array(z.string().min(1)).optional(), }); // --------------------------------------------------------------------------- @@ -93,6 +98,7 @@ export function parseCliArgs(argv: string[]): CliArgs { concurrency: validated.concurrency, experimentName: validated.experimentName, iterations: validated.iterations, + pinAiRoots: validated.pinAiRoots, }; } @@ -115,6 +121,7 @@ interface RawArgs { concurrency: number; experimentName?: string; iterations: number; + pinAiRoots?: string[]; } function parseRawArgs(argv: string[]): RawArgs { @@ -128,6 +135,7 @@ function parseRawArgs(argv: string[]): RawArgs { concurrency: 16, experimentName: undefined, iterations: 1, + pinAiRoots: undefined, }; for (let i = 0; i < argv.length; i++) { @@ -207,6 +215,16 @@ function parseRawArgs(argv: string[]): RawArgs { i++; break; + case '--pin-ai-roots': { + const raw = nextArg(argv, i, '--pin-ai-roots'); + result.pinAiRoots = raw + .split(',') + .map((s) => s.trim()) + .filter((s) => s.length > 0); + i++; + break; + } + default: // Fail loudly on unknown flags. Strip any =value payload before // echoing and drop positional values entirely — raw CLI input diff --git a/packages/@n8n/instance-ai/evaluations/cli/index.ts b/packages/@n8n/instance-ai/evaluations/cli/index.ts index 09a1dad32c6..5d129c3aaad 100644 --- a/packages/@n8n/instance-ai/evaluations/cli/index.ts +++ b/packages/@n8n/instance-ai/evaluations/cli/index.ts @@ -360,6 +360,7 @@ async function runWithLangSmith(config: RunConfig): Promise<{ execArgs.workflowJsons, logger, args.timeoutMs, + args.pinAiRoots, ), { name: 'scenario_execution', @@ -940,6 +941,7 @@ async function runDirectLoop(config: RunConfig): Promise { keepWorkflows: args.keepWorkflows, laneTag, prebuiltWorkflowId: pickPrebuiltWorkflowId(prebuiltManifest, tc.fileSlug, iter), + pinAiRoots: args.pinAiRoots, }), MAX_CONCURRENT_BUILDS, ); diff --git a/packages/@n8n/instance-ai/evaluations/clients/n8n-client.ts b/packages/@n8n/instance-ai/evaluations/clients/n8n-client.ts index 7ce432a0d63..2616cdf43c3 100644 --- a/packages/@n8n/instance-ai/evaluations/clients/n8n-client.ts +++ b/packages/@n8n/instance-ai/evaluations/clients/n8n-client.ts @@ -495,15 +495,26 @@ export class N8nClient { /** * Execute a workflow with LLM-based HTTP mocking. * The server handles hint generation and mock execution in a single synchronous call. + * + * AI root nodes (Agent, Chain) default to wire-server interception so their + * sub-nodes actually run instead of being short-circuited by pin data; + * pass `pinNodes` to keep specific roots on the pinned baseline (e.g. for + * A/B comparison). Gated server-side behind the + * `085_eval_vendor_sdk_interception` PostHog flag. */ async executeWithLlmMock( workflowId: string, scenarioHints?: string, timeoutMs: number = 120_000, + pinNodes?: string[], ): Promise { + const body: { scenarioHints?: string; pinNodes?: string[] } = {}; + if (scenarioHints) body.scenarioHints = scenarioHints; + if (pinNodes && pinNodes.length > 0) body.pinNodes = pinNodes; + const result = (await this.fetch(`/rest/instance-ai/eval/execute-with-llm-mock/${workflowId}`, { method: 'POST', - body: scenarioHints ? { scenarioHints } : {}, + body, timeoutMs, })) as { data: InstanceAiEvalExecutionResult }; return result.data; diff --git a/packages/@n8n/instance-ai/evaluations/harness/runner.ts b/packages/@n8n/instance-ai/evaluations/harness/runner.ts index ecd73c50396..f819037593a 100644 --- a/packages/@n8n/instance-ai/evaluations/harness/runner.ts +++ b/packages/@n8n/instance-ai/evaluations/harness/runner.ts @@ -68,6 +68,11 @@ interface WorkflowTestCaseConfig { /** When set, skip the orchestrator build and verify this existing workflow * instead. The harness leaves it in place — caller owns its lifecycle. */ prebuiltWorkflowId?: string; + /** AI root nodes (Agent, Chain) to keep pinned — opt-out from the default-on + * wire-server interception path. Omit (or pass empty) to intercept every + * interceptable AI root the workflow contains. Server-side gated by the + * `085_eval_vendor_sdk_interception` PostHog flag. */ + pinAiRoots?: string[]; } /** @@ -144,6 +149,7 @@ export async function runWorkflowTestCase( build.workflowJsons, logger, timeoutMs, + config.pinAiRoots, ); } catch (error: unknown) { const errorMessage = error instanceof Error ? error.message : String(error); @@ -478,8 +484,17 @@ export async function executeScenario( workflowJsons: WorkflowResponse[], logger: EvalLogger, timeoutMs?: number, + pinAiRoots?: string[], ): Promise { - return await runScenario(client, scenario, workflowId, workflowJsons, logger, timeoutMs); + return await runScenario( + client, + scenario, + workflowId, + workflowJsons, + logger, + timeoutMs, + pinAiRoots, + ); } /** @@ -526,13 +541,22 @@ async function runScenario( workflowJsons: WorkflowResponse[], logger: EvalLogger, timeoutMs?: number, + pinAiRoots?: string[], ): Promise { + const pinNodes = pinAiRoots && pinAiRoots.length > 0 ? pinAiRoots : undefined; + const execStart = Date.now(); - const evalResult = await client.executeWithLlmMock(workflowId, scenario.dataSetup, timeoutMs); + const evalResult = await client.executeWithLlmMock( + workflowId, + scenario.dataSetup, + timeoutMs, + pinNodes, + ); const execMs = Date.now() - execStart; + const pinTag = pinNodes ? ` pinned=${pinNodes.join(',')}` : ''; logger.info( - ` [${scenario.name}] exec=${String(Math.round(execMs / 1000))}s (${Object.keys(evalResult.nodeResults).length} nodes)`, + ` [${scenario.name}] exec=${String(Math.round(execMs / 1000))}s (${Object.keys(evalResult.nodeResults).length} nodes)${pinTag}`, ); const verifyStart = Date.now(); diff --git a/packages/cli/package.json b/packages/cli/package.json index cbfdfd3a7f4..92cfeff5ba7 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -90,6 +90,7 @@ "ioredis-mock": "^8.8.1", "mjml": "^4.15.3", "n8n-containers": "workspace:*", + "openai": "catalog:", "openapi-types": "^12.1.3", "ts-essentials": "^7.0.3" }, diff --git a/packages/cli/src/modules/instance-ai/eval/__tests__/eval-mocked-credentials-helper.test.ts b/packages/cli/src/modules/instance-ai/eval/__tests__/eval-mocked-credentials-helper.test.ts index 711730fde37..356d13b6fa7 100644 --- a/packages/cli/src/modules/instance-ai/eval/__tests__/eval-mocked-credentials-helper.test.ts +++ b/packages/cli/src/modules/instance-ai/eval/__tests__/eval-mocked-credentials-helper.test.ts @@ -414,6 +414,161 @@ describe('EvalMockedCredentialsHelper', () => { }); }); + describe('getDecrypted — schema synthesis when id is null', () => { + // Core's eval-mode bypass passes `{ id: null, name: type }` when a node + // has no credentials configured at all. The inner helper throws + // CredentialNotFoundError on a null id; the catch below schema-synthesizes + // (and applies the URL rewrite) so vendor SDK traffic stays inside the + // wire server instead of escaping to the real provider with 401. + const propsSchema = [ + { + name: 'apiKey', + displayName: 'API Key', + type: 'string' as const, + default: '', + typeOptions: { password: true }, + }, + { + name: 'url', + displayName: 'Base URL', + type: 'string' as const, + default: 'https://api.openai.com/v1', + }, + ]; + + const nullNodeCreds: INodeCredentialsDetails = { id: null, name: 'openAiApi' }; + + function makeSynthesizingInner(): ICredentialsHelper { + return makeInner({ + getCredentialsProperties: jest.fn().mockReturnValue(propsSchema), + // Inner throws on a null-id lookup → catch fires → schema synthesis. + getDecrypted: jest.fn().mockRejectedValue(new CredentialNotFoundError('null', 'openAiApi')), + }); + } + + it('synthesizes a credential from the schema and applies the URL rewrite', async () => { + const subNodeToRoot = new Map([['OpenAI', 'Agent']]); + const helper = new EvalMockedCredentialsHelper( + makeSynthesizingInner(), + 'http://127.0.0.1:54321', + undefined, + subNodeToRoot, + ); + + const result = await helper.getDecrypted( + fakeAdditionalData, + nullNodeCreds, + 'openAiApi', + 'manual', + { node: { name: 'OpenAI' } as INode } as IExecuteData, + ); + + // Schema default for `url` is rewritten to the wire-server path. + expect(result.url).toBe('http://127.0.0.1:54321/eval/Agent/v1'); + // Secret field (apiKey) is filled by `buildEvalMockCredentials` — + // the placeholder doesn't matter, only that it's not undefined. + expect(typeof result.apiKey).toBe('string'); + }); + + it('records the synthesized credential on `mockedCredentials`', async () => { + const helper = new EvalMockedCredentialsHelper( + makeSynthesizingInner(), + 'http://127.0.0.1:1', + undefined, + ); + + await helper.getDecrypted(fakeAdditionalData, nullNodeCreds, 'openAiApi', 'manual', { + node: { name: 'OpenAI GPT-4' } as INode, + } as IExecuteData); + + expect(helper.mockedCredentials).toEqual([ + { + nodeName: 'OpenAI GPT-4', + credentialType: 'openAiApi', + credentialId: undefined, + }, + ]); + }); + + it('records the rewrite on `rewrittenCredentials`', async () => { + const subNodeToRoot = new Map([['OpenAI', 'Agent']]); + const helper = new EvalMockedCredentialsHelper( + makeSynthesizingInner(), + 'http://127.0.0.1:1', + undefined, + subNodeToRoot, + ); + + await helper.getDecrypted(fakeAdditionalData, nullNodeCreds, 'openAiApi', 'manual', { + node: { name: 'OpenAI' } as INode, + } as IExecuteData); + + expect(helper.rewrittenCredentials).toEqual([ + { + nodeName: 'OpenAI', + credentialType: 'openAiApi', + credentialId: undefined, + field: 'url', + }, + ]); + }); + + it('brands the synthetic credential with __evalMockedCredential so authenticate short-circuits', async () => { + // Regression: without the marker, `authenticate` / `preAuthentication` + // / `runPreAuthentication` would delegate the synthetic credential + // through the inner helper's real-auth flow (OAuth refresh, PreSend + // hooks). Those flows would either crash on placeholder values or + // leak real-auth side effects from a fake credential. + const inner = makeInner({ + getCredentialsProperties: jest.fn().mockReturnValue(propsSchema), + getDecrypted: jest.fn().mockRejectedValue(new CredentialNotFoundError('null', 'openAiApi')), + authenticate: jest.fn().mockResolvedValue({ url: 'http://should-not-be-called' }), + }); + const helper = new EvalMockedCredentialsHelper(inner); + + const synthetic = await helper.getDecrypted( + fakeAdditionalData, + nullNodeCreds, + 'openAiApi', + 'manual', + { node: { name: 'OpenAI' } as INode } as IExecuteData, + ); + + expect(synthetic.__evalMockedCredential).toBe(true); + + // Round-trip through `authenticate` confirms the marker actually + // short-circuits — the inner helper must not be invoked. + const requestOptions: IHttpRequestOptions = { url: 'http://example.com' }; + const result = await helper.authenticate( + synthetic, + 'openAiApi', + requestOptions, + fakeWorkflow, + fakeNode, + ); + expect(result).toBe(requestOptions); + expect(inner.authenticate).not.toHaveBeenCalled(); + }); + + it('still returns the synthetic credential when no serverUrl is configured', async () => { + // The helper may be used in eval mode without the wire server + // (e.g. HTTP-helper-only workflows). Without `serverUrl` we just + // pass the synthetic through — matches the pre-hook behaviour. + const helper = new EvalMockedCredentialsHelper(makeSynthesizingInner()); + + const result = await helper.getDecrypted( + fakeAdditionalData, + nullNodeCreds, + 'openAiApi', + 'manual', + { node: { name: 'OpenAI' } as INode } as IExecuteData, + ); + + expect(result.url).toBe('https://api.openai.com/v1'); + expect(helper.rewrittenCredentials).toEqual([]); + }); + }); + describe('authenticate', () => { it('passes the request through unchanged for marker payloads', async () => { const inner = makeInner(); diff --git a/packages/cli/src/modules/instance-ai/eval/__tests__/execution.service.test.ts b/packages/cli/src/modules/instance-ai/eval/__tests__/execution.service.test.ts index a67b91d79d4..0ac4f1286ea 100644 --- a/packages/cli/src/modules/instance-ai/eval/__tests__/execution.service.test.ts +++ b/packages/cli/src/modules/instance-ai/eval/__tests__/execution.service.test.ts @@ -1,6 +1,6 @@ -import { mock } from 'jest-mock-extended'; -import type { User } from '@n8n/db'; import type { Logger } from '@n8n/backend-common'; +import type { User } from '@n8n/db'; +import { mock } from 'jest-mock-extended'; import type { INode, IRunExecutionData, @@ -8,10 +8,11 @@ import type { IWorkflowBase, INodeTypeDescription, } from 'n8n-workflow'; +import { UserError } from 'n8n-workflow'; -import type { WorkflowFinderService } from '@/workflows/workflow-finder.service'; import type { NodeTypes } from '@/node-types'; import type { PostHogClient } from '@/posthog'; +import type { WorkflowFinderService } from '@/workflows/workflow-finder.service'; // --------------------------------------------------------------------------- // Mocks — must be before the import of the class under test @@ -28,7 +29,7 @@ jest.mock('../mock-handler', () => ({ createLlmMockHandler: jest.fn(), })); jest.mock('../workflow-analysis', () => ({ - assertUnpinCompatibility: jest.fn(), + partitionAiRoots: jest.fn(), buildVendorLlmRouting: jest.fn().mockReturnValue({ subNodeToRoot: new Map(), rootToSubNode: new Map(), @@ -96,15 +97,14 @@ jest.mock('n8n-workflow', () => { // --------------------------------------------------------------------------- import { EvalExecutionService } from '../execution.service'; +import { createLlmMockHandler } from '../mock-handler'; import { - assertUnpinCompatibility, generateMockHints, identifyNodesForHints, identifyNodesForPinData, + partitionAiRoots, } from '../workflow-analysis'; -import { createLlmMockHandler } from '../mock-handler'; import type { MockHints } from '../workflow-analysis'; -import { UserError } from 'n8n-workflow'; // --------------------------------------------------------------------------- // Helpers @@ -113,7 +113,7 @@ import { UserError } from 'n8n-workflow'; const generateMockHintsMock = jest.mocked(generateMockHints); const identifyNodesForHintsMock = jest.mocked(identifyNodesForHints); const identifyNodesForPinDataMock = jest.mocked(identifyNodesForPinData); -const assertUnpinCompatibilityMock = jest.mocked(assertUnpinCompatibility); +const partitionAiRootsMock = jest.mocked(partitionAiRoots); const createLlmMockHandlerMock = jest.mocked(createLlmMockHandler); function makeWorkflowEntity(overrides: Partial = {}) { @@ -201,10 +201,12 @@ describe('EvalExecutionService', () => { service = new EvalExecutionService(workflowFinderService, nodeTypes, logger, postHogClient); - // Default mock returns — happy path + // Default mock returns — happy path. partitionAiRoots returns an empty + // partition (no AI roots in the test workflow) so the kill-switch + // short-circuits and the wire server stays off unless a test overrides. identifyNodesForHintsMock.mockReturnValue([]); identifyNodesForPinDataMock.mockReturnValue([]); - assertUnpinCompatibilityMock.mockImplementation(() => undefined); + partitionAiRootsMock.mockReturnValue({ unpinNodes: [], pinNodes: [], autoPinned: [] }); generateMockHintsMock.mockResolvedValue(makeEmptyHints()); createLlmMockHandlerMock.mockReturnValue(jest.fn()); mockGetStartNode.mockReturnValue(makeStartNode()); @@ -311,21 +313,30 @@ describe('EvalExecutionService', () => { }); }); - // ── unpinNodes handling ────────────────────────────────────────── + // ── pinNodes / interception partition ──────────────────────────── - describe('unpinNodes', () => { + describe('interception partition', () => { beforeEach(() => { workflowFinderService.findWorkflowForUser.mockResolvedValue(makeWorkflowEntity() as never); }); - it('calls assertUnpinCompatibility with an empty list when unpinNodes is omitted', async () => { + it('calls partitionAiRoots with an empty explicit pin list when pinNodes is omitted', async () => { await service.executeWithLlmMock('wf-1', makeUser()); - expect(assertUnpinCompatibilityMock).toHaveBeenCalledWith(expect.anything(), []); + expect(partitionAiRootsMock).toHaveBeenCalledWith(expect.anything(), []); }); - it('omits the exclusion set when unpinNodes is empty', async () => { - await service.executeWithLlmMock('wf-1', makeUser(), { unpinNodes: [] }); + it('forwards explicit pinNodes from the request to partitionAiRoots', async () => { + await service.executeWithLlmMock('wf-1', makeUser(), { pinNodes: ['Agent'] }); + + expect(partitionAiRootsMock).toHaveBeenCalledWith(expect.objectContaining({ id: 'wf-1' }), [ + 'Agent', + ]); + }); + + it('omits the exclusion set when the partition returns no unpinNodes', async () => { + // Default mock returns empty unpinNodes → no AI roots intercepted. + await service.executeWithLlmMock('wf-1', makeUser()); expect(identifyNodesForPinDataMock).toHaveBeenCalledWith( expect.objectContaining({ id: 'wf-1' }), @@ -333,78 +344,82 @@ describe('EvalExecutionService', () => { ); }); - // PostHog kill-switch: non-empty unpinNodes only runs when the flag - // resolves to ON. Flag OFF refuses the request before any other work - // so vendor traffic can never reach the real provider. + it("surfaces the partition's typo-guard error when an explicit pin name is invalid", async () => { + partitionAiRootsMock.mockImplementation(() => { + throw new UserError('Cannot pin — not found in workflow: "Ghost".'); + }); + + const result = await service.executeWithLlmMock('wf-1', makeUser(), { + pinNodes: ['Ghost'], + }); + + expect(result.success).toBe(false); + expect(result.errors).toEqual([expect.stringContaining('not found in workflow')]); + expect(mockProcessRunExecutionData).not.toHaveBeenCalled(); + expect(mockWireServerStart).not.toHaveBeenCalled(); + }); + + // PostHog kill-switch: when partitionAiRoots wants to intercept any + // roots, the flag is consulted. Flag OFF silently degrades to the + // pinned baseline so the eval still produces a result — no error, + // just the today-baseline behaviour. This is the right default once + // interception is the default-on path. describe('PostHog kill-switch (flag off)', () => { beforeEach(() => { + partitionAiRootsMock.mockReturnValue({ + unpinNodes: ['Agent'], + pinNodes: [], + autoPinned: [], + }); postHogClient.getFeatureFlags.mockResolvedValue({ '085_eval_vendor_sdk_interception': false, }); }); - it('runs the compatibility guard first, then refuses with the gate error when the guard passes', async () => { - const result = await service.executeWithLlmMock('wf-1', makeUser(), { - unpinNodes: ['Agent'], - }); + it('silently degrades to the pinned baseline (no wire server, no error)', async () => { + const result = await service.executeWithLlmMock('wf-1', makeUser()); - expect(result.success).toBe(false); - expect(result.errors).toEqual([expect.stringContaining('currently disabled')]); - // Guard runs first so the user gets actionable diagnostics when their - // workflow has a permanent compatibility issue. When the guard passes, - // the gate fires with the generic "currently disabled" message. - expect(assertUnpinCompatibilityMock).toHaveBeenCalledWith( - expect.objectContaining({ id: 'wf-1' }), - ['Agent'], - ); - expect(generateMockHintsMock).not.toHaveBeenCalled(); - expect(mockProcessRunExecutionData).not.toHaveBeenCalled(); + // No refusal — the eval still completes through the pinned path. + expect(result.errors).toEqual([]); + expect(mockWireServerStart).not.toHaveBeenCalled(); + expect(mockProcessRunExecutionData).toHaveBeenCalledTimes(1); }); - it("surfaces the guard's error when the workflow has a permanent compatibility issue", async () => { - assertUnpinCompatibilityMock.mockImplementation(() => { - throw new UserError( - 'Cannot unpin AI root nodes — protocol-binary sub-nodes ' + - '(cannot be intercepted via HTTP): "Mem" (memoryPostgresChat) → "Agent"', - ); + it('does not consult PostHog when the partition has nothing to intercept', async () => { + partitionAiRootsMock.mockReturnValue({ + unpinNodes: [], + pinNodes: [], + autoPinned: [], }); - const result = await service.executeWithLlmMock('wf-1', makeUser(), { - unpinNodes: ['Agent'], - }); - - expect(result.success).toBe(false); - // Guard's protocol-binary message wins over the generic gate message — - // the user needs to fix the workflow regardless of when the feature ships. - expect(result.errors).toEqual([expect.stringContaining('memoryPostgresChat')]); - expect(result.errors[0]).not.toContain('currently disabled'); - // Guard refused before the PostHog check fires. - expect(postHogClient.getFeatureFlags).not.toHaveBeenCalled(); - }); - - it('still runs the normal pinned path when unpinNodes is omitted (no flag check)', async () => { await service.executeWithLlmMock('wf-1', makeUser()); expect(postHogClient.getFeatureFlags).not.toHaveBeenCalled(); - expect(generateMockHintsMock).toHaveBeenCalled(); - expect(mockProcessRunExecutionData).toHaveBeenCalled(); + }); + + it('also degrades silently when PostHog itself rejects (fail-closed)', async () => { + postHogClient.getFeatureFlags.mockRejectedValue(new Error('PostHog down')); + + const result = await service.executeWithLlmMock('wf-1', makeUser()); + + expect(result.errors).toEqual([]); + expect(mockWireServerStart).not.toHaveBeenCalled(); }); }); - // Flag ON (or unset — fail-open default): non-empty unpinNodes proceeds - // into the rewrite path and boots the wire server. + // Flag ON (or unset — fail-open default): the partition's unpinNodes + // drive the rewrite path and boot the wire server. describe('PostHog kill-switch (flag on)', () => { - it('forwards unpinNodes to assertUnpinCompatibility', async () => { - await service.executeWithLlmMock('wf-1', makeUser(), { unpinNodes: ['Agent'] }); - - expect(assertUnpinCompatibilityMock).toHaveBeenCalledWith( - expect.objectContaining({ id: 'wf-1' }), - ['Agent'], - ); + beforeEach(() => { + partitionAiRootsMock.mockReturnValue({ + unpinNodes: ['Agent'], + pinNodes: [], + autoPinned: [], + }); }); - it('forwards the exclusion set to identifyNodesForPinData', async () => { - await service.executeWithLlmMock('wf-1', makeUser(), { unpinNodes: ['Agent'] }); + it('forwards the exclusion set to identifyNodesForPinData when interception is enabled', async () => { + await service.executeWithLlmMock('wf-1', makeUser()); expect(identifyNodesForPinDataMock).toHaveBeenCalledWith( expect.objectContaining({ id: 'wf-1' }), @@ -413,7 +428,7 @@ describe('EvalExecutionService', () => { }); it('boots and tears down the wire server around the workflow run', async () => { - await service.executeWithLlmMock('wf-1', makeUser(), { unpinNodes: ['Agent'] }); + await service.executeWithLlmMock('wf-1', makeUser()); expect(mockWireServerStart).toHaveBeenCalledTimes(1); expect(mockProcessRunExecutionData).toHaveBeenCalledTimes(1); @@ -424,43 +439,33 @@ describe('EvalExecutionService', () => { it('tears down the wire server even if the workflow run throws', async () => { mockProcessRunExecutionData.mockRejectedValue(new Error('explode')); - const result = await service.executeWithLlmMock('wf-1', makeUser(), { - unpinNodes: ['Agent'], - }); + const result = await service.executeWithLlmMock('wf-1', makeUser()); expect(result.success).toBe(false); expect(mockWireServerStop).toHaveBeenCalledTimes(1); expect(mockRestoreNoProxy).toHaveBeenCalledTimes(1); }); - it('does not boot the wire server when unpinNodes is empty', async () => { - await service.executeWithLlmMock('wf-1', makeUser(), { unpinNodes: [] }); + it('does not boot the wire server when the partition has no unpinNodes', async () => { + partitionAiRootsMock.mockReturnValue({ + unpinNodes: [], + pinNodes: [], + autoPinned: [], + }); + + await service.executeWithLlmMock('wf-1', makeUser()); expect(mockWireServerStart).not.toHaveBeenCalled(); expect(mockWireServerStop).not.toHaveBeenCalled(); }); - it('fails closed when PostHog rejects (treats flag as off and refuses the request)', async () => { - postHogClient.getFeatureFlags.mockRejectedValue(new Error('PostHog down')); - - const result = await service.executeWithLlmMock('wf-1', makeUser(), { - unpinNodes: ['Agent'], - }); - - expect(result.success).toBe(false); - expect(result.errors).toEqual([expect.stringContaining('currently disabled')]); - expect(mockWireServerStart).not.toHaveBeenCalled(); - }); - it('tears down the wire server when NO_PROXY patching throws after boot', async () => { const proxyLoopback = require('../proxy-loopback'); proxyLoopback.patchNoProxyForLoopback.mockImplementationOnce(() => { throw new Error('env mutation blocked'); }); - const result = await service.executeWithLlmMock('wf-1', makeUser(), { - unpinNodes: ['Agent'], - }); + const result = await service.executeWithLlmMock('wf-1', makeUser()); expect(result.success).toBe(false); expect(result.errors).toEqual([expect.stringContaining('env mutation blocked')]); @@ -468,24 +473,6 @@ describe('EvalExecutionService', () => { expect(mockWireServerStop).toHaveBeenCalledTimes(1); }); - it('returns an error result and skips workflow execution when the compatibility guard refuses', async () => { - assertUnpinCompatibilityMock.mockImplementation(() => { - throw new (require('n8n-workflow').UserError)( - 'Cannot unpin "Agent" — incompatible memory backend', - ); - }); - - const result = await service.executeWithLlmMock('wf-1', makeUser(), { - unpinNodes: ['Agent'], - }); - - expect(result.success).toBe(false); - expect(result.errors).toEqual([expect.stringContaining('Cannot unpin "Agent"')]); - expect(mockProcessRunExecutionData).not.toHaveBeenCalled(); - // Server was never started — guard runs before boot. - expect(mockWireServerStart).not.toHaveBeenCalled(); - }); - it('records a wire-server turn against the AI root in nodeResults via onIntercept', async () => { // Simulate the wire server firing onIntercept mid-execution by // invoking the captured callback before processRunExecutionData @@ -506,9 +493,7 @@ describe('EvalExecutionService', () => { return makeIRun(); }); - const result = await service.executeWithLlmMock('wf-1', makeUser(), { - unpinNodes: ['Agent'], - }); + const result = await service.executeWithLlmMock('wf-1', makeUser()); expect(result.nodeResults['Agent']).toBeDefined(); expect(result.nodeResults['Agent'].executionMode).toBe('mocked'); @@ -552,9 +537,7 @@ describe('EvalExecutionService', () => { return makeIRun(); }); - const result = await service.executeWithLlmMock('wf-1', makeUser(), { - unpinNodes: ['Agent'], - }); + const result = await service.executeWithLlmMock('wf-1', makeUser()); // 'pinned' from the bypass pass survives — preservation rule. expect(result.nodeResults['Agent'].executionMode).toBe('pinned'); @@ -562,6 +545,99 @@ describe('EvalExecutionService', () => { expect(result.nodeResults['Agent'].interceptedRequests).toHaveLength(1); }); + // Headline ledger-attribution rule for M3: a single eval run produces + // two kinds of traffic — vendor-SDK model turns (attributed to the AI + // root via the wire server's URL path) and tool HTTP traffic + // (attributed to the tool node via the existing helpers.httpRequest + // interceptor in `request-helper-functions.ts:1147`). The two must + // land in separate `nodeResults` entries; tools whose HTTP traffic + // gets folded into the Agent's ledger would mask real bugs. + it('splits the ledger: model turns to the Agent root, tool HTTP to the tool node', async () => { + const innerMockHandler = jest.fn().mockResolvedValue({ + body: { content: 'tool result' }, + headers: { 'content-type': 'application/json' }, + statusCode: 200, + }); + createLlmMockHandlerMock.mockReturnValue(innerMockHandler); + + mockProcessRunExecutionData.mockImplementation(async () => { + const opts = capturedWireServerOptions.last as { + onIntercept?: (turn: unknown) => void; + }; + // Model turn — wire server's onIntercept fires with the root name. + opts.onIntercept?.({ + rootName: 'Agent', + url: 'https://api.openai.com/v1/chat/completions', + method: 'POST', + nodeType: '@n8n/n8n-nodes-langchain.lmChatOpenAi', + requestBody: { model: 'gpt-4o', messages: [] }, + mockResponse: { + tool_calls: [{ id: 'c1', function: { name: 'getOrder', arguments: '{}' } }], + }, + }); + + // Tool HTTP — `evalLlmMockHandler` is invoked from + // `request-helper-functions.ts` with the tool node's + // identity. The SUT passes `additionalData` as the first + // positional argument to the `WorkflowExecute` constructor + // (see `runWorkflow()` in `execution.service.ts`). If that + // contract ever changes, the explicit guard below fails + // loudly with an actionable message instead of silently + // reading the wrong argument slot. + const wfExecuteCtor = jest.mocked( + (await import('n8n-core')).WorkflowExecute, + ) as unknown as jest.Mock; + const additionalData = wfExecuteCtor.mock.calls[0][0] as { + evalLlmMockHandler?: (req: unknown, node: unknown) => Promise; + }; + if (!additionalData?.evalLlmMockHandler) { + throw new Error( + 'WorkflowExecute(additionalData, ...) contract changed — ' + + 'arg 0 no longer carries evalLlmMockHandler. Update the ledger-split test.', + ); + } + await additionalData.evalLlmMockHandler( + { url: 'https://orders.example.com/v1/orders/42', method: 'GET' }, + { + id: 'tool-node', + name: 'Get Order Tool', + type: 'n8n-nodes-base.httpRequestTool', + typeVersion: 1, + position: [0, 0], + parameters: {}, + }, + ); + + return makeIRun(); + }); + + const result = await service.executeWithLlmMock('wf-1', makeUser()); + + // Model turn attributed to Agent only. + expect(result.nodeResults['Agent']).toBeDefined(); + expect(result.nodeResults['Agent'].interceptedRequests).toHaveLength(1); + expect(result.nodeResults['Agent'].interceptedRequests[0].nodeType).toBe( + '@n8n/n8n-nodes-langchain.lmChatOpenAi', + ); + + // Tool HTTP attributed to the tool node, NOT to the Agent. + expect(result.nodeResults['Get Order Tool']).toBeDefined(); + expect(result.nodeResults['Get Order Tool'].interceptedRequests).toHaveLength(1); + expect(result.nodeResults['Get Order Tool'].interceptedRequests[0].url).toBe( + 'https://orders.example.com/v1/orders/42', + ); + expect(result.nodeResults['Get Order Tool'].interceptedRequests[0].nodeType).toBe( + 'n8n-nodes-base.httpRequestTool', + ); + expect(result.nodeResults['Get Order Tool'].executionMode).toBe('mocked'); + + // Cross-check: neither side's ledger contains the other side's URL. + const agentUrls = result.nodeResults['Agent'].interceptedRequests.map((r) => r.url); + const toolUrls = result.nodeResults['Get Order Tool'].interceptedRequests.map((r) => r.url); + expect(agentUrls).not.toContain('https://orders.example.com/v1/orders/42'); + expect(toolUrls).not.toContain('https://api.openai.com/v1/chat/completions'); + }); + it('upgrades a pre-marked "real" entry to "mocked" when a wire-server turn fires', async () => { // checkNodeConfig() pre-marks any node with a config-issue as // `executionMode: 'real'` BEFORE runWorkflow runs. If a wire- @@ -597,9 +673,7 @@ describe('EvalExecutionService', () => { return makeIRun(); }); - const result = await service.executeWithLlmMock('wf-1', makeUser(), { - unpinNodes: ['Agent'], - }); + const result = await service.executeWithLlmMock('wf-1', makeUser()); // 'real' (from config-issue pre-marking) gets upgraded to 'mocked'. expect(result.nodeResults['HTTP Request']).toBeDefined(); diff --git a/packages/cli/src/modules/instance-ai/eval/__tests__/llm-wire-server.test.ts b/packages/cli/src/modules/instance-ai/eval/__tests__/llm-wire-server.test.ts index 2c231dd21e2..65e074c2408 100644 --- a/packages/cli/src/modules/instance-ai/eval/__tests__/llm-wire-server.test.ts +++ b/packages/cli/src/modules/instance-ai/eval/__tests__/llm-wire-server.test.ts @@ -1,6 +1,7 @@ import type { Logger } from '@n8n/backend-common'; import type { EvalLlmMockHandler } from 'n8n-core'; import type { INode } from 'n8n-workflow'; +import OpenAI from 'openai'; import { type InterceptedTurn, LlmWireServer } from '../llm-wire-server'; @@ -65,6 +66,19 @@ describe('LlmWireServer', () => { await second.stop(); } }); + + it('accepts requests after start() → stop() → start() — shutdown latch resets', async () => { + await server.start(); + await server.stop(); + const url = await server.start(); + const response = await postChatCompletion(url, '/eval/Agent/v1/chat/completions', { + model: 'gpt-4o-mini', + messages: [], + }); + // Post-restart the route must hand back a 200 envelope, NOT the + // 503 the in-flight shutdown latch would emit if it weren't reset. + expect(response.status).toBe(200); + }); }); describe('POST /eval/:root/v1/chat/completions — stub fallback', () => { @@ -223,7 +237,7 @@ describe('LlmWireServer', () => { expect(warn.mock.calls[0][0]).toContain('ledger disk full'); }); - it('records an isolated deep copy of the request body in the ledger', async () => { + it('records a per-request body in the ledger that does not bleed across requests', async () => { const intercepts: InterceptedTurn[] = []; const mockHandler = jest.fn().mockResolvedValue({ body: { content: 'reply' }, @@ -388,4 +402,552 @@ describe('LlmWireServer', () => { expect(body.error.message).toContain('/eval//'); }); }); + + // SSE branch — switches when the inbound body has `stream: true`. The spec + // is strict on chunk semantics; the openai SDK throws opaque `BadStream` + // errors when the envelope is malformed, so the assertions here mirror + // what the SDK validates internally. + describe('POST /eval/:root/v1/chat/completions — SSE branch (stream: true)', () => { + const subNode = makeSubNode({ name: 'OpenAI Chat Model' }); + + async function readSseChunks(url: string, path: string, body: unknown) { + const response = await fetch(`${url}${path}`, { + method: 'POST', + headers: { 'Content-Type': 'application/json', Accept: 'text/event-stream' }, + body: JSON.stringify(body), + }); + const text = await response.text(); + const frames = text + .split('\n\n') + .map((f) => f.trim()) + .filter((f) => f.startsWith('data: ')) + .map((f) => f.slice('data: '.length)); + return { response, frames }; + } + + it('returns Content-Type: text/event-stream and a [DONE] terminator', async () => { + const mockHandler = jest.fn().mockResolvedValue({ + body: { content: 'streamed reply' }, + headers: {}, + statusCode: 200, + }) as unknown as EvalLlmMockHandler; + server = new LlmWireServer({ + mockHandler, + rootToSubNode: new Map([['Agent', subNode]]), + }); + const url = await server.start(); + + const { response, frames } = await readSseChunks(url, '/eval/Agent/v1/chat/completions', { + model: 'gpt-4o', + stream: true, + messages: [{ role: 'user', content: 'hi' }], + }); + + expect(response.status).toBe(200); + expect(response.headers.get('content-type')).toMatch(/text\/event-stream/); + expect(frames[frames.length - 1]).toBe('[DONE]'); + }); + + it('emits chat.completion.chunk frames terminated with a stop finish_reason', async () => { + const mockHandler = jest.fn().mockResolvedValue({ + body: { content: 'hello via SSE' }, + headers: {}, + statusCode: 200, + }) as unknown as EvalLlmMockHandler; + server = new LlmWireServer({ + mockHandler, + rootToSubNode: new Map([['Agent', subNode]]), + }); + const url = await server.start(); + + const { frames } = await readSseChunks(url, '/eval/Agent/v1/chat/completions', { + model: 'gpt-4o', + stream: true, + messages: [{ role: 'user', content: 'hi' }], + }); + + const dataFrames = frames.filter((f) => f !== '[DONE]').map((f) => JSON.parse(f)); + expect(dataFrames.every((f) => f.object === 'chat.completion.chunk')).toBe(true); + + const ids = new Set(dataFrames.map((f) => f.id)); + expect(ids.size).toBe(1); + + const contentChunk = dataFrames.find((f) => f.choices[0].delta.content === 'hello via SSE'); + expect(contentChunk).toBeDefined(); + + const terminal = dataFrames[dataFrames.length - 1]; + expect(terminal.choices[0].finish_reason).toBe('stop'); + }); + + it('streams tool_calls with first-chunk id+name and a terminal tool_calls finish_reason', async () => { + const mockHandler = jest.fn().mockResolvedValue({ + body: { + tool_calls: [ + { id: 'call_1', function: { name: 'get_weather', arguments: '{"city":"NYC"}' } }, + ], + }, + headers: {}, + statusCode: 200, + }) as unknown as EvalLlmMockHandler; + server = new LlmWireServer({ + mockHandler, + rootToSubNode: new Map([['Agent', subNode]]), + }); + const url = await server.start(); + + const { frames } = await readSseChunks(url, '/eval/Agent/v1/chat/completions', { + model: 'gpt-4o', + stream: true, + messages: [{ role: 'user', content: 'weather in NYC?' }], + tools: [ + { + type: 'function', + function: { name: 'get_weather', parameters: { type: 'object' } }, + }, + ], + }); + + const dataFrames = frames.filter((f) => f !== '[DONE]').map((f) => JSON.parse(f)); + + const firstToolFrame = dataFrames.find( + (f) => f.choices[0].delta.tool_calls?.[0]?.id === 'call_1', + ); + expect(firstToolFrame).toBeDefined(); + expect(firstToolFrame.choices[0].delta.tool_calls[0].function.name).toBe('get_weather'); + + const argsFrame = dataFrames.find( + (f) => f.choices[0].delta.tool_calls?.[0]?.function?.arguments === '{"city":"NYC"}', + ); + expect(argsFrame).toBeDefined(); + // Args frame MUST NOT repeat id or name. + expect(argsFrame.choices[0].delta.tool_calls[0].id).toBeUndefined(); + expect(argsFrame.choices[0].delta.tool_calls[0].function.name).toBeUndefined(); + + const terminal = dataFrames[dataFrames.length - 1]; + expect(terminal.choices[0].finish_reason).toBe('tool_calls'); + }); + + it('attributes the streamed turn against the requested root in onIntercept', async () => { + const intercepts: InterceptedTurn[] = []; + const mockHandler = jest.fn().mockResolvedValue({ + body: { content: 'streamed' }, + headers: {}, + statusCode: 200, + }) as unknown as EvalLlmMockHandler; + server = new LlmWireServer({ + mockHandler, + rootToSubNode: new Map([['Agent', subNode]]), + onIntercept: (t) => intercepts.push(t), + }); + const url = await server.start(); + + await readSseChunks(url, '/eval/Agent/v1/chat/completions', { + model: 'gpt-4o', + stream: true, + messages: [], + }); + + expect(intercepts).toHaveLength(1); + expect(intercepts[0].rootName).toBe('Agent'); + }); + + it('uses the no-handler stub for streaming when no mock handler is attached', async () => { + server = new LlmWireServer(); + const url = await server.start(); + + const { response, frames } = await readSseChunks(url, '/eval/Agent/v1/chat/completions', { + model: 'gpt-4o', + stream: true, + messages: [], + }); + + expect(response.headers.get('content-type')).toMatch(/text\/event-stream/); + const dataFrames = frames.filter((f) => f !== '[DONE]').map((f) => JSON.parse(f)); + const stubFrame = dataFrames.find( + (f) => + typeof f.choices[0].delta.content === 'string' && + f.choices[0].delta.content.includes('eval wire server stub'), + ); + expect(stubFrame).toBeDefined(); + }); + + // Live SDK round-trip — the master spec mandates this: "Test against + // the live `openai` v5 SDK — do not hand-roll envelope shape against + // documentation alone." The hand-rolled `readSseChunks` frame splitter + // above proves our wire shape against the spec; this test proves it + // against the *actual SDK parser*. If our `delta.tool_calls` chunks + // drift from what `openai`'s reducer expects, this test will throw a + // typed BadStream error before any of the per-frame asserts above + // would notice. + describe('live `openai` SDK round-trip (catches SDK-strict envelope drift)', () => { + function makeClient(serverUrl: string, rootName: string) { + return new OpenAI({ + apiKey: 'sk-eval-test', + baseURL: `${serverUrl}/eval/${encodeURIComponent(rootName)}/v1`, + // Disable retries — a failed parse should surface immediately, + // not loop the test through the default 2x retry budget. + maxRetries: 0, + }); + } + + it('non-streaming chat completion parses through the SDK reducer', async () => { + const mockHandler = jest.fn().mockResolvedValue({ + body: { content: 'hello via SDK' }, + headers: {}, + statusCode: 200, + }) as unknown as EvalLlmMockHandler; + server = new LlmWireServer({ + mockHandler, + rootToSubNode: new Map([['Agent', subNode]]), + }); + const url = await server.start(); + const client = makeClient(url, 'Agent'); + + const completion = await client.chat.completions.create({ + model: 'gpt-4o', + messages: [{ role: 'user', content: 'hi' }], + }); + + expect(completion.object).toBe('chat.completion'); + expect(completion.choices[0].message.content).toBe('hello via SDK'); + expect(completion.choices[0].finish_reason).toBe('stop'); + }); + + it('streaming content yields chunks through the SDK async iterator', async () => { + const mockHandler = jest.fn().mockResolvedValue({ + body: { content: 'streamed via SDK' }, + headers: {}, + statusCode: 200, + }) as unknown as EvalLlmMockHandler; + server = new LlmWireServer({ + mockHandler, + rootToSubNode: new Map([['Agent', subNode]]), + }); + const url = await server.start(); + const client = makeClient(url, 'Agent'); + + const stream = await client.chat.completions.create({ + model: 'gpt-4o', + stream: true, + messages: [{ role: 'user', content: 'hi' }], + }); + + let assembled = ''; + let lastFinishReason: string | null | undefined; + for await (const chunk of stream) { + expect(chunk.object).toBe('chat.completion.chunk'); + const delta = chunk.choices[0]?.delta; + if (typeof delta?.content === 'string') { + assembled += delta.content; + } + if (chunk.choices[0]?.finish_reason !== undefined) { + lastFinishReason = chunk.choices[0].finish_reason; + } + } + + expect(assembled).toBe('streamed via SDK'); + expect(lastFinishReason).toBe('stop'); + }); + + it('streaming tool_calls accumulate through the SDK reducer with the correct final shape', async () => { + // The strictest test of the wire format. The SDK accumulates + // `delta.tool_calls` slices into a single tool call — first chunk + // owns `id` + `function.name`, later chunks contribute + // `function.arguments`. A drift here (e.g. repeating `id` on + // later chunks) throws a `BadStream` error, not a soft skip. + const mockHandler = jest.fn().mockResolvedValue({ + body: { + tool_calls: [ + { + id: 'call_live', + function: { name: 'get_weather', arguments: '{"city":"NYC"}' }, + }, + ], + }, + headers: {}, + statusCode: 200, + }) as unknown as EvalLlmMockHandler; + server = new LlmWireServer({ + mockHandler, + rootToSubNode: new Map([['Agent', subNode]]), + }); + const url = await server.start(); + const client = makeClient(url, 'Agent'); + + const stream = await client.chat.completions.create({ + model: 'gpt-4o', + stream: true, + messages: [{ role: 'user', content: 'weather' }], + tools: [ + { + type: 'function', + function: { name: 'get_weather', parameters: { type: 'object' } }, + }, + ], + }); + + const accumulated: Record = {}; + let lastFinishReason: string | null | undefined; + for await (const chunk of stream) { + const toolDeltas = chunk.choices[0]?.delta?.tool_calls ?? []; + for (const td of toolDeltas) { + const slot = (accumulated[td.index] ??= { args: '' }); + if (td.id) slot.id = td.id; + if (td.function?.name) slot.name = td.function.name; + if (typeof td.function?.arguments === 'string') { + slot.args += td.function.arguments; + } + } + if (chunk.choices[0]?.finish_reason !== undefined) { + lastFinishReason = chunk.choices[0].finish_reason; + } + } + + // SDK reducer reassembled the full call. + expect(accumulated[0]).toEqual({ + id: 'call_live', + name: 'get_weather', + args: '{"city":"NYC"}', + }); + expect(lastFinishReason).toBe('tool_calls'); + }); + }); + + it('returns a JSON error envelope (not SSE) when the mock handler throws on a streaming request', async () => { + const mockHandler = jest + .fn() + .mockRejectedValue(new Error('LLM offline')) as unknown as EvalLlmMockHandler; + server = new LlmWireServer({ + mockHandler, + rootToSubNode: new Map([['Agent', subNode]]), + }); + const url = await server.start(); + + const response = await fetch(`${url}/eval/Agent/v1/chat/completions`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ model: 'gpt-4o', stream: true, messages: [] }), + }); + // SDK clients on a 500 short-circuit before iterating the stream, so + // returning a JSON error envelope here keeps both streaming and + // non-streaming code paths happy. + expect(response.status).toBe(500); + const body = (await response.json()) as { error: { message: string } }; + expect(body.error.message).toContain('LLM offline'); + }); + }); + + // Non-streaming tool_calls: the same envelope shape the agent-side SDK + // expects when stream:false. SDKs use `finish_reason: 'tool_calls'` to + // branch into tool-execution; we must set it whenever tool_calls is present. + describe('POST /eval/:root/v1/chat/completions — tool_calls (non-streaming)', () => { + const subNode = makeSubNode({ name: 'OpenAI Chat Model' }); + + it('emits tool_calls + content:null + finish_reason: tool_calls on the message', async () => { + const mockHandler = jest.fn().mockResolvedValue({ + body: { + tool_calls: [{ id: 'call_1', function: { name: 'lookup', arguments: '{"q":"hi"}' } }], + }, + headers: {}, + statusCode: 200, + }) as unknown as EvalLlmMockHandler; + server = new LlmWireServer({ + mockHandler, + rootToSubNode: new Map([['Agent', subNode]]), + }); + const url = await server.start(); + + const response = await postChatCompletion(url, '/eval/Agent/v1/chat/completions', { + model: 'gpt-4o', + messages: [{ role: 'user', content: 'lookup hi' }], + tools: [{ type: 'function', function: { name: 'lookup', parameters: { type: 'object' } } }], + }); + + expect(response.status).toBe(200); + const body = (await response.json()) as { + choices: Array<{ + message: { + role: string; + content: string | null; + tool_calls: Array<{ + id: string; + type: string; + function: { name: string; arguments: string }; + }>; + }; + finish_reason: string; + }>; + }; + const choice = body.choices[0]; + expect(choice.message.role).toBe('assistant'); + expect(choice.message.content).toBeNull(); + expect(choice.message.tool_calls[0]).toMatchObject({ + id: 'call_1', + type: 'function', + function: { name: 'lookup', arguments: '{"q":"hi"}' }, + }); + expect(choice.finish_reason).toBe('tool_calls'); + }); + }); + + // `@langchain/openai` v1.3+ auto-routes Agent v3.1+ calls to /v1/responses + // instead of /v1/chat/completions. Verified empirically against a real + // LangChain Agent — without this route the SDK 404s. + describe('POST /eval/:root/v1/responses — Responses API', () => { + const subNode = makeSubNode({ name: 'OpenAI Chat Model' }); + + it('returns a `response` envelope with annotations:[] on output_text content', async () => { + const mockHandler = jest.fn().mockResolvedValue({ + body: { output_text: 'hello via responses' }, + headers: {}, + statusCode: 200, + }) as unknown as EvalLlmMockHandler; + server = new LlmWireServer({ + mockHandler, + rootToSubNode: new Map([['Agent', subNode]]), + }); + const url = await server.start(); + + const response = await postChatCompletion(url, '/eval/Agent/v1/responses', { + model: 'gpt-4o', + input: [{ role: 'user', content: 'hi' }], + }); + + expect(response.status).toBe(200); + const body = (await response.json()) as { + object: string; + status: string; + output: Array<{ + type: string; + content: Array<{ type: string; text: string; annotations: unknown[] }>; + }>; + }; + expect(body.object).toBe('response'); + expect(body.status).toBe('completed'); + expect(body.output[0].type).toBe('message'); + expect(body.output[0].content[0].text).toBe('hello via responses'); + // Without `annotations: []`, the LangChain extractor throws + // "Cannot read properties of undefined (reading 'map')". + expect(body.output[0].content[0].annotations).toEqual([]); + }); + + it('emits a function_call output item when the mock handler returns tool_calls', async () => { + const mockHandler = jest.fn().mockResolvedValue({ + body: { + tool_calls: [{ id: 'call_1', function: { name: 'lookup', arguments: '{"q":"x"}' } }], + }, + headers: {}, + statusCode: 200, + }) as unknown as EvalLlmMockHandler; + server = new LlmWireServer({ + mockHandler, + rootToSubNode: new Map([['Agent', subNode]]), + }); + const url = await server.start(); + + const response = await postChatCompletion(url, '/eval/Agent/v1/responses', { + model: 'gpt-4o', + input: [{ role: 'user', content: 'x' }], + tools: [{ type: 'function', name: 'lookup' }], + }); + + const body = (await response.json()) as { + output: Array<{ type: string; name?: string; call_id?: string; arguments?: string }>; + }; + expect(body.output[0].type).toBe('function_call'); + expect(body.output[0].name).toBe('lookup'); + expect(body.output[0].call_id).toBe('call_1'); + expect(body.output[0].arguments).toBe('{"q":"x"}'); + }); + + it('streams response.* SSE events when stream:true', async () => { + const mockHandler = jest.fn().mockResolvedValue({ + body: { output_text: 'streamed reply' }, + headers: {}, + statusCode: 200, + }) as unknown as EvalLlmMockHandler; + server = new LlmWireServer({ + mockHandler, + rootToSubNode: new Map([['Agent', subNode]]), + }); + const url = await server.start(); + + const response = await fetch(`${url}/eval/Agent/v1/responses`, { + method: 'POST', + headers: { 'Content-Type': 'application/json', Accept: 'text/event-stream' }, + body: JSON.stringify({ + model: 'gpt-4o', + stream: true, + input: [{ role: 'user', content: 'hi' }], + }), + }); + + expect(response.headers.get('content-type')).toMatch(/text\/event-stream/); + const text = await response.text(); + + // Responses API doesn't use `data: [DONE]` — the terminal is + // `response.completed`. Parse the event frames and assert ordering. + const events: string[] = []; + for (const block of text.split('\n\n')) { + const eventLine = block.split('\n').find((l) => l.startsWith('event: ')); + if (eventLine) events.push(eventLine.slice('event: '.length)); + } + expect(events[0]).toBe('response.created'); + expect(events[events.length - 1]).toBe('response.completed'); + expect(events).toContain('response.output_text.delta'); + }); + + it('attributes the turn via onIntercept with the parsed root', async () => { + const intercepts: InterceptedTurn[] = []; + const mockHandler = jest.fn().mockResolvedValue({ + body: { output_text: 'ok' }, + headers: {}, + statusCode: 200, + }) as unknown as EvalLlmMockHandler; + server = new LlmWireServer({ + mockHandler, + rootToSubNode: new Map([['My Agent', subNode]]), + onIntercept: (t) => intercepts.push(t), + }); + const url = await server.start(); + + await postChatCompletion(url, '/eval/My%20Agent/v1/responses', { + model: 'gpt-4o', + input: [], + }); + + expect(intercepts).toHaveLength(1); + expect(intercepts[0].rootName).toBe('My Agent'); + // Reverse translator uses the canonical OpenAI URL so mock-handler's + // service/endpoint extraction derives `/v1/responses` correctly. + expect(intercepts[0].url).toBe('https://api.openai.com/v1/responses'); + }); + + it('returns the loud-fail error envelope when no /eval// prefix is used', async () => { + server = new LlmWireServer(); + const url = await server.start(); + + const response = await postChatCompletion(url, '/v1/responses', { + model: 'gpt-4o', + input: [], + }); + const body = (await response.json()) as { error: { message: string } }; + expect(response.status).toBe(500); + expect(body.error.message).toContain('/eval//'); + }); + + it('uses the stub envelope when no mock handler is attached', async () => { + server = new LlmWireServer(); + const url = await server.start(); + + const response = await postChatCompletion(url, '/eval/Agent/v1/responses', { + model: 'gpt-4o', + input: [], + }); + const body = (await response.json()) as { + output: Array<{ content: Array<{ text: string }> }>; + }; + expect(body.output[0].content[0].text).toContain('eval wire server stub'); + }); + }); }); diff --git a/packages/cli/src/modules/instance-ai/eval/__tests__/m3-fixtures.test.ts b/packages/cli/src/modules/instance-ai/eval/__tests__/m3-fixtures.test.ts new file mode 100644 index 00000000000..191ec72d271 --- /dev/null +++ b/packages/cli/src/modules/instance-ai/eval/__tests__/m3-fixtures.test.ts @@ -0,0 +1,496 @@ +import type { EvalLlmMockHandler, EvalMockHttpResponse } from 'n8n-core'; +import type { + ICredentialDataDecryptedObject, + ICredentialsHelper, + IExecuteData, + IHttpRequestOptions, + INode, + INodeCredentialsDetails, + IWorkflowExecuteAdditionalData, +} from 'n8n-workflow'; + +import { EvalMockedCredentialsHelper } from '../eval-mocked-credentials-helper'; +import { type InterceptedTurn, LlmWireServer } from '../llm-wire-server'; + +/** + * Integration-shaped unit test exercising credential rewrite + path-based + * root attribution + envelope correctness end-to-end. Boots a real + * `LlmWireServer` on a loopback port, instantiates a real + * `EvalMockedCredentialsHelper`, scripts mock-handler responses turn-by-turn, + * and drives the Agent loop with raw `fetch`. Envelope shape is locked down + * separately in `llm-wire-server.test.ts` and `openai-envelope.test.ts`. + * + * - **Mechanism** — tool IS connected. Asserts the ledger ends with model + * turns attributed to the Agent root and tool HTTP attributed to the tool + * node, with no cross-contamination. + * - **Regression-catch** — tool is disconnected. With un-pinning the eval + * must fail because the Agent's mocked output can't produce the tool- + * shaped result the grader expects. A counterfactual passes when the + * tool IS connected, proving the check is meaningful. + */ +describe('M3 fixtures — Agent + Chat Model + HTTP tool + MemoryBufferWindow', () => { + const llmSubNode: INode = { + id: 'sub-1', + name: 'OpenAI Chat Model', + type: '@n8n/n8n-nodes-langchain.lmChatOpenAi', + typeVersion: 1, + position: [0, 0], + parameters: { model: 'gpt-4o-mini' }, + }; + const toolNode: INode = { + id: 'tool-1', + name: 'Get Order Status Tool', + type: 'n8n-nodes-base.httpRequestTool', + typeVersion: 1, + position: [200, 0], + parameters: { url: 'https://orders.example.com/v1/orders/{{ $fromAI("orderId") }}' }, + }; + const rootName = 'Agent'; + + function makeInnerHelper(credentials: ICredentialDataDecryptedObject): ICredentialsHelper { + return { + getParentTypes: jest.fn().mockReturnValue([]), + authenticate: jest.fn(), + preAuthentication: jest.fn(), + runPreAuthentication: jest.fn(), + getCredentials: jest.fn(), + getDecrypted: jest.fn().mockResolvedValue(credentials), + updateCredentials: jest.fn(), + updateCredentialsOauthTokenData: jest.fn(), + getCredentialsProperties: jest.fn().mockReturnValue([]), + } as ICredentialsHelper; + } + + async function postViaRewrittenCredentials( + helper: EvalMockedCredentialsHelper, + serverBaseUrl: string, + requestBody: unknown, + callingSubNodeName: string, + ): Promise<{ rewrittenUrl: string; response: Response; body: Record }> { + const cred = await helper.getDecrypted( + {} as IWorkflowExecuteAdditionalData, + { id: 'cred-1', name: 'OpenAI' } as INodeCredentialsDetails, + 'openAiApi', + 'manual', + { node: { name: callingSubNodeName, id: 'n' } as INode } as IExecuteData, + ); + + const baseUrl = String(cred.url); + const response = await fetch(`${baseUrl}/chat/completions`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(requestBody), + }); + const body = (await response.json()) as Record; + expect(baseUrl.startsWith(serverBaseUrl)).toBe(true); + return { rewrittenUrl: baseUrl, response, body }; + } + + /** + * Build the eval-side glue that the M3 fixture exercises: + * - real LlmWireServer with a programmable mockHandler + * - real EvalMockedCredentialsHelper wired to the rewrite map + * - ledger accumulators for both model turns and tool HTTP + * + * The model-turn ledger mirrors what `execution.service.ts`'s + * `recordWireServerTurn` writes; the tool-HTTP ledger mirrors what its + * `createInterceptingHandler` writes. The split between the two is + * what the M3 mechanism fixture proves. + */ + async function bootM3Harness() { + const modelTurns: InterceptedTurn[] = []; + const toolHttpCalls: Array<{ nodeName: string; url: string; mockResponse: unknown }> = []; + + // Programmable mock handler — the M3 mechanism case feeds it a + // scripted sequence of returns, one per call. The value/regression + // case feeds it a single "plain content" return that lacks the + // tool-shaped output the grader looks for. + const scriptedResponses: EvalMockHttpResponse[] = []; + const mockHandler = jest + .fn, Parameters>() + .mockImplementation(async () => { + const next = scriptedResponses.shift(); + if (!next) { + throw new Error( + 'M3 fixture mock handler ran out of scripted responses — fixture script is wrong', + ); + } + return next; + }); + + const wireServer = new LlmWireServer({ + mockHandler, + rootToSubNode: new Map([[rootName, llmSubNode]]), + onIntercept: (t) => modelTurns.push(t), + }); + await wireServer.start(); + + const helper = new EvalMockedCredentialsHelper( + makeInnerHelper({ apiKey: 'sk-real', url: 'https://api.openai.com/v1' }), + wireServer.url, + undefined, + new Map([[llmSubNode.name, rootName]]), + ); + + // Mirror of `execution.service.ts:createInterceptingHandler` for the + // tool side — captures HTTP attributed to the tool's node identity. + const toolHttpInterceptor = async ( + request: IHttpRequestOptions, + node: INode, + ): Promise => { + const mockResponse: EvalMockHttpResponse = { + body: { + orderId: 'ORD-42', + status: 'shipped', + eta: '2026-05-25T00:00:00Z', + }, + headers: { 'content-type': 'application/json' }, + statusCode: 200, + }; + toolHttpCalls.push({ + nodeName: node.name, + url: request.url, + mockResponse: mockResponse.body, + }); + return mockResponse; + }; + + return { + wireServer, + helper, + scriptedResponses, + modelTurns, + toolHttpCalls, + toolHttpInterceptor, + mockHandler, + }; + } + + // ── M3 mechanism ──────────────────────────────────────────────────── + + describe('mechanism (tool connected to Agent)', () => { + it('drives a full Agent loop: tool_calls turn → tool HTTP → follow-up turn → final answer', async () => { + const harness = await bootM3Harness(); + try { + // Turn 1: Agent posts with tools array; wire server's mock handler + // returns a tool_calls envelope. + harness.scriptedResponses.push({ + body: { + tool_calls: [ + { + id: 'call_1', + function: { name: 'get_order_status', arguments: '{"orderId":"ORD-42"}' }, + }, + ], + }, + headers: { 'content-type': 'application/json' }, + statusCode: 200, + }); + // Turn 2: Agent re-posts with the tool result; mock returns the + // final natural-language answer. + harness.scriptedResponses.push({ + body: { + content: 'Your order ORD-42 has shipped and arrives 2026-05-25.', + }, + headers: { 'content-type': 'application/json' }, + statusCode: 200, + }); + + const turn1 = await postViaRewrittenCredentials( + harness.helper, + harness.wireServer.url, + { + model: 'gpt-4o-mini', + messages: [{ role: 'user', content: 'Where is my order ORD-42?' }], + tools: [ + { + type: 'function', + function: { + name: 'get_order_status', + description: 'Look up an order by id', + parameters: { type: 'object' }, + }, + }, + ], + }, + llmSubNode.name, + ); + + const choice1 = ( + turn1.body.choices as Array<{ + message: { + content: string | null; + tool_calls?: Array<{ + id: string; + function: { name: string; arguments: string }; + }>; + }; + finish_reason: string; + }> + )[0]; + expect(choice1.finish_reason).toBe('tool_calls'); + expect(choice1.message.tool_calls?.[0].function.name).toBe('get_order_status'); + const toolCallArgs = JSON.parse(choice1.message.tool_calls?.[0].function.arguments ?? '{}'); + expect(toolCallArgs).toEqual({ orderId: 'ORD-42' }); + + // Tool runs — `helpers.httpRequest` interception fires. The + // nodeType is the tool's `httpRequestTool`, not the Agent. + const toolResult = await harness.toolHttpInterceptor( + { + url: `https://orders.example.com/v1/orders/${toolCallArgs.orderId}`, + method: 'GET', + }, + toolNode, + ); + + // Turn 2: Agent threads the tool result back into messages and + // asks the model for a final answer. This mirrors what + // `AgentExecutor` does between tool calls and final response. + const turn2 = await postViaRewrittenCredentials( + harness.helper, + harness.wireServer.url, + { + model: 'gpt-4o-mini', + messages: [ + { role: 'user', content: 'Where is my order ORD-42?' }, + { + role: 'assistant', + content: null, + tool_calls: choice1.message.tool_calls, + }, + { + role: 'tool', + tool_call_id: 'call_1', + content: JSON.stringify(toolResult.body), + }, + ], + }, + llmSubNode.name, + ); + + const choice2 = ( + turn2.body.choices as Array<{ + message: { content: string | null }; + finish_reason: string; + }> + )[0]; + expect(choice2.finish_reason).toBe('stop'); + expect(choice2.message.content).toContain('ORD-42'); + expect(choice2.message.content).toContain('shipped'); + + // Ledger assertions — the headline M3 split. + expect(harness.modelTurns).toHaveLength(2); + expect(harness.modelTurns.every((t) => t.rootName === rootName)).toBe(true); + expect(harness.modelTurns.every((t) => t.nodeType === llmSubNode.type)).toBe(true); + + expect(harness.toolHttpCalls).toHaveLength(1); + expect(harness.toolHttpCalls[0].nodeName).toBe(toolNode.name); + expect(harness.toolHttpCalls[0].url).toContain('orders.example.com'); + + // Cross-check: tool HTTP didn't leak into model-turn attribution. + const modelUrls = harness.modelTurns.map((t) => t.url); + expect(modelUrls.every((u) => u.includes('api.openai.com'))).toBe(true); + } finally { + await harness.wireServer.stop(); + } + }); + + it('passes the connected tools array through to the mock handler', async () => { + // Tool-list awareness: the mock handler must see the request `tools` + // array so it can emit a realistic tool_calls block. This is the + // "hard-coded tool-list awareness in the wire-server prompt" + // behaviour from the spec — the wire server just passes the inbound + // body through, and the handler reads it from `req.body.tools`. + const harness = await bootM3Harness(); + try { + harness.scriptedResponses.push({ + body: { content: 'ok' }, + headers: {}, + statusCode: 200, + }); + + await postViaRewrittenCredentials( + harness.helper, + harness.wireServer.url, + { + model: 'gpt-4o-mini', + messages: [{ role: 'user', content: 'hi' }], + tools: [ + { + type: 'function', + function: { name: 'get_order_status', parameters: { type: 'object' } }, + }, + ], + }, + llmSubNode.name, + ); + + expect(harness.mockHandler).toHaveBeenCalledTimes(1); + const [requestOptions] = harness.mockHandler.mock.calls[0]; + const body = requestOptions.body as { + tools?: Array<{ function: { name: string } }>; + }; + expect(body.tools).toBeDefined(); + expect(body.tools?.[0].function.name).toBe('get_order_status'); + } finally { + await harness.wireServer.stop(); + } + }); + }); + + // ── M3 value (regression-catch fixture) ───────────────────────────── + + describe('value / regression-catch (tool disconnected from Agent)', () => { + // Substring grader — a deliberately lightweight stand-in for whatever + // the real eval grader does downstream. It looks for `ORD-42` AND + // `shipped` in the final answer; both substrings together can only + // appear when the Agent (a) saw the user's order id AND (b) saw the + // tool's HTTP response (`{ status: 'shipped' }`). Plain-text content + // without the tool result fails. The substring shape is intentionally + // simple — a more structural schema check would be a Tier 5 follow-up + // (`MockHints.toolHints` quality work); the contract this fixture + // proves is "the spike makes the grader fail when pinning would have + // hidden the regression", not "this is a production-grade grader". + function graderCheck(finalAnswer: unknown): { passed: boolean; reason?: string } { + if (typeof finalAnswer !== 'string') { + return { passed: false, reason: 'final answer was not a string' }; + } + const hasOrderId = finalAnswer.includes('ORD-42'); + const hasShipped = finalAnswer.toLowerCase().includes('shipped'); + if (hasOrderId && hasShipped) return { passed: true }; + return { + passed: false, + reason: `grader expected order id + status substrings; got: ${JSON.stringify(finalAnswer)}`, + }; + } + + it('the grader fails when the Agent has no tool connection — only the spike catches this', async () => { + const harness = await bootM3Harness(); + try { + // Mock handler returns plain content WITHOUT a tool_calls block + // (because the disconnected workflow has no tools to call). + // The Agent gives up and emits an apology — the grader sees + // none of the tool-derived fields and reports failure. + harness.scriptedResponses.push({ + body: { + content: "I'd love to help, but I don't have an order-lookup tool available right now.", + }, + headers: { 'content-type': 'application/json' }, + statusCode: 200, + }); + + const turn = await postViaRewrittenCredentials( + harness.helper, + harness.wireServer.url, + { + model: 'gpt-4o-mini', + messages: [{ role: 'user', content: 'Where is my order ORD-42?' }], + // IMPORTANT: no `tools` array — the tool is disconnected. + }, + llmSubNode.name, + ); + + const choice = ( + turn.body.choices as Array<{ message: { content: string }; finish_reason: string }> + )[0]; + expect(choice.finish_reason).toBe('stop'); + + const verdict = graderCheck(choice.message.content); + // This is the M3 value assertion — pinning today would pass; + // the spike must fail because the Agent's mocked output can't + // produce the substrings the grader expects (which only + // appear once the tool's HTTP response threads back through + // turn 2 — see the counterfactual test below). + expect(verdict.passed).toBe(false); + expect(verdict.reason).toContain('order id + status'); + + // No tool HTTP fired — confirms the tool was actually disconnected. + expect(harness.toolHttpCalls).toHaveLength(0); + + // Model turn ran (this is the headline behavioural delta vs. + // today's pinned path, where no model turn would fire at all). + expect(harness.modelTurns).toHaveLength(1); + } finally { + await harness.wireServer.stop(); + } + }); + + // Counterfactual: the same grader passes for the connected fixture. + // Without this assertion, the regression-catch could be a false + // negative (a perpetually-failing grader proves nothing). + it('the grader passes when the tool IS connected — confirms the check is meaningful', async () => { + const harness = await bootM3Harness(); + try { + harness.scriptedResponses.push({ + body: { + tool_calls: [ + { + id: 'call_1', + function: { name: 'get_order_status', arguments: '{"orderId":"ORD-42"}' }, + }, + ], + }, + headers: { 'content-type': 'application/json' }, + statusCode: 200, + }); + harness.scriptedResponses.push({ + body: { content: 'Your order ORD-42 has shipped — eta 2026-05-25.' }, + headers: { 'content-type': 'application/json' }, + statusCode: 200, + }); + + // Turn 1. + const turn1 = await postViaRewrittenCredentials( + harness.helper, + harness.wireServer.url, + { + model: 'gpt-4o-mini', + messages: [{ role: 'user', content: 'Where is my order ORD-42?' }], + tools: [ + { + type: 'function', + function: { name: 'get_order_status', parameters: { type: 'object' } }, + }, + ], + }, + llmSubNode.name, + ); + + const choice1 = ( + turn1.body.choices as Array<{ + message: { tool_calls?: Array<{ id: string }> }; + }> + )[0]; + await harness.toolHttpInterceptor( + { url: 'https://orders.example.com/v1/orders/ORD-42', method: 'GET' }, + toolNode, + ); + + // Turn 2. + const turn2 = await postViaRewrittenCredentials( + harness.helper, + harness.wireServer.url, + { + model: 'gpt-4o-mini', + messages: [ + { role: 'user', content: 'Where is my order ORD-42?' }, + { + role: 'assistant', + content: null, + tool_calls: choice1.message.tool_calls, + }, + { role: 'tool', tool_call_id: 'call_1', content: '{"status":"shipped"}' }, + ], + }, + llmSubNode.name, + ); + + const choice2 = (turn2.body.choices as Array<{ message: { content: string } }>)[0]; + + expect(graderCheck(choice2.message.content).passed).toBe(true); + } finally { + await harness.wireServer.stop(); + } + }); + }); +}); diff --git a/packages/cli/src/modules/instance-ai/eval/__tests__/openai-envelope.test.ts b/packages/cli/src/modules/instance-ai/eval/__tests__/openai-envelope.test.ts index 44b04f55a2e..5a1333e6393 100644 --- a/packages/cli/src/modules/instance-ai/eval/__tests__/openai-envelope.test.ts +++ b/packages/cli/src/modules/instance-ai/eval/__tests__/openai-envelope.test.ts @@ -3,7 +3,10 @@ import type { EvalMockHttpResponse } from 'n8n-core'; import { buildOpenAiErrorEnvelope, extractRequestModel, + extractToolCalls, forwardTranslateToChatCompletion, + forwardTranslateToSseChunks, + isStreamRequested, reverseTranslateOpenAiRequest, } from '../openai-envelope'; @@ -63,6 +66,122 @@ describe('extractRequestModel', () => { }); }); +describe('isStreamRequested', () => { + it('returns true only when stream === true', () => { + expect(isStreamRequested({ stream: true })).toBe(true); + }); + + it('returns false for missing, false, or truthy-non-true values', () => { + expect(isStreamRequested({})).toBe(false); + expect(isStreamRequested({ stream: false })).toBe(false); + expect(isStreamRequested({ stream: 1 })).toBe(false); + expect(isStreamRequested({ stream: 'true' })).toBe(false); + expect(isStreamRequested(undefined)).toBe(false); + expect(isStreamRequested(null)).toBe(false); + }); +}); + +describe('extractToolCalls', () => { + it('returns an empty list when no tool calls are present', () => { + expect(extractToolCalls(undefined)).toEqual([]); + expect(extractToolCalls(null)).toEqual([]); + expect(extractToolCalls({})).toEqual([]); + expect(extractToolCalls({ content: 'just text' })).toEqual([]); + }); + + it('normalizes the OpenAI-native tool_calls shape', () => { + const result = extractToolCalls({ + tool_calls: [ + { id: 'call_1', function: { name: 'get_weather', arguments: '{"city":"Paris"}' } }, + ], + }); + + expect(result).toEqual([{ id: 'call_1', name: 'get_weather', arguments: '{"city":"Paris"}' }]); + }); + + it('generates a synthetic id when none is provided', () => { + const result = extractToolCalls({ + tool_calls: [{ function: { name: 'foo', arguments: '{}' } }], + }); + + expect(result).toHaveLength(1); + expect(result[0].id).toMatch(/^call_[a-f0-9]+$/); + expect(result[0].name).toBe('foo'); + }); + + it('coerces object arguments to JSON strings (SDKs require strings)', () => { + const result = extractToolCalls({ + tool_calls: [{ function: { name: 'foo', arguments: { city: 'Paris' } } }], + }); + + expect(result[0].arguments).toBe('{"city":"Paris"}'); + }); + + it('defaults arguments to "{}" when missing or null', () => { + const result = extractToolCalls({ + tool_calls: [{ function: { name: 'foo' } }, { function: { name: 'bar', arguments: null } }], + }); + + expect(result[0].arguments).toBe('{}'); + expect(result[1].arguments).toBe('{}'); + }); + + it('accepts the `{ name, arguments }` shorthand', () => { + const result = extractToolCalls({ + tool_calls: [{ name: 'shorthand', arguments: '{"a":1}' }], + }); + + expect(result).toEqual([expect.objectContaining({ name: 'shorthand', arguments: '{"a":1}' })]); + }); + + it('unwraps tool calls nested under a choices envelope', () => { + const result = extractToolCalls({ + choices: [ + { + message: { + tool_calls: [{ id: 'call_2', function: { name: 'lookup', arguments: '{}' } }], + }, + }, + ], + }); + + expect(result).toHaveLength(1); + expect(result[0].name).toBe('lookup'); + }); + + it('extracts a single-tool shorthand under `tool`', () => { + const result = extractToolCalls({ + tool: { name: 'single', arguments: '{"x":1}' }, + }); + + expect(result).toEqual([expect.objectContaining({ name: 'single', arguments: '{"x":1}' })]); + }); + + it('handles multiple tool calls', () => { + const result = extractToolCalls({ + tool_calls: [ + { id: 'a', function: { name: 'one', arguments: '{}' } }, + { id: 'b', function: { name: 'two', arguments: '{}' } }, + ], + }); + + expect(result.map((t) => t.name)).toEqual(['one', 'two']); + expect(result.map((t) => t.id)).toEqual(['a', 'b']); + }); + + it('skips entries without a function name', () => { + const result = extractToolCalls({ + tool_calls: [ + { id: 'a', function: { arguments: '{}' } }, + { id: 'b', function: { name: 'kept', arguments: '{}' } }, + ], + }); + + expect(result).toHaveLength(1); + expect(result[0].name).toBe('kept'); + }); +}); + describe('forwardTranslateToChatCompletion', () => { function mockResponse(body: unknown): EvalMockHttpResponse { return { @@ -180,6 +299,231 @@ describe('forwardTranslateToChatCompletion', () => { expect(envelope.model).toBe('gpt-5'); }); + + it('emits tool_calls on the assistant message when the body contains them', () => { + const envelope = forwardTranslateToChatCompletion( + mockResponse({ + tool_calls: [ + { id: 'call_1', function: { name: 'get_weather', arguments: '{"city":"Paris"}' } }, + ], + }), + 'gpt-4o', + ); + + const choice = ( + envelope.choices as Array<{ + message: { + role: string; + content: string | null; + tool_calls?: Array<{ + id: string; + type: string; + function: { name: string; arguments: string }; + }>; + }; + finish_reason: string; + }> + )[0]; + expect(choice.message.role).toBe('assistant'); + // Tool-call envelopes require content === null — SDKs reject content + tool_calls. + expect(choice.message.content).toBeNull(); + expect(choice.message.tool_calls).toEqual([ + { + id: 'call_1', + type: 'function', + function: { name: 'get_weather', arguments: '{"city":"Paris"}' }, + }, + ]); + expect(choice.finish_reason).toBe('tool_calls'); + }); + + it('emits multiple tool_calls when several are present', () => { + const envelope = forwardTranslateToChatCompletion( + mockResponse({ + tool_calls: [ + { id: 'a', function: { name: 'one', arguments: '{}' } }, + { id: 'b', function: { name: 'two', arguments: '{}' } }, + ], + }), + 'gpt-4o', + ); + + const choice = ( + envelope.choices as Array<{ + message: { tool_calls?: Array<{ id: string }> }; + finish_reason: string; + }> + )[0]; + expect(choice.message.tool_calls).toHaveLength(2); + expect(choice.finish_reason).toBe('tool_calls'); + }); +}); + +describe('forwardTranslateToSseChunks', () => { + function mockResponse(body: unknown): EvalMockHttpResponse { + return { + body, + headers: { 'content-type': 'application/json' }, + statusCode: 200, + }; + } + + it('emits an opening role chunk, a content chunk, and a finish_reason chunk', () => { + const chunks = forwardTranslateToSseChunks(mockResponse({ content: 'hello' }), 'gpt-4o'); + + expect(chunks.length).toBeGreaterThanOrEqual(3); + const firstDelta = (chunks[0].choices as Array<{ delta: { role?: string } }>)[0].delta; + expect(firstDelta.role).toBe('assistant'); + + const contentChunk = chunks.find( + (c) => (c.choices as Array<{ delta: { content?: string } }>)[0].delta.content === 'hello', + ); + expect(contentChunk).toBeDefined(); + + const terminal = chunks[chunks.length - 1]; + const terminalChoice = (terminal.choices as Array<{ finish_reason: string }>)[0]; + expect(terminalChoice.finish_reason).toBe('stop'); + }); + + it('every chunk carries the canonical object discriminator', () => { + const chunks = forwardTranslateToSseChunks(mockResponse({ content: 'hi' }), 'gpt-4o'); + + for (const chunk of chunks) { + expect(chunk.object).toBe('chat.completion.chunk'); + } + }); + + it('every chunk shares the same id and created timestamp', () => { + const chunks = forwardTranslateToSseChunks(mockResponse({ content: 'hi' }), 'gpt-4o'); + + const ids = new Set(chunks.map((c) => c.id)); + const createdSet = new Set(chunks.map((c) => c.created)); + expect(ids.size).toBe(1); + expect(createdSet.size).toBe(1); + }); + + it('emits tool_calls with first-chunk id+name then arg-stream chunks then a tool_calls terminal', () => { + const chunks = forwardTranslateToSseChunks( + mockResponse({ + tool_calls: [ + { + id: 'call_xyz', + function: { name: 'get_weather', arguments: '{"city":"Paris"}' }, + }, + ], + }), + 'gpt-4o', + ); + + // Opening role chunk + first-chunk (id+name) + args-chunk + terminal = 4. + expect(chunks).toHaveLength(4); + + const opener = (chunks[0].choices as Array<{ delta: Record }>)[0].delta; + expect(opener.role).toBe('assistant'); + // SDK reducers expect content: null when the turn will emit tool_calls. + expect(opener.content).toBeNull(); + + const firstToolChunk = ( + chunks[1].choices as Array<{ + delta: { + tool_calls?: Array<{ + index: number; + id?: string; + type?: string; + function?: { name?: string; arguments?: string }; + }>; + }; + }> + )[0].delta; + expect(firstToolChunk.tool_calls?.[0]).toMatchObject({ + index: 0, + id: 'call_xyz', + type: 'function', + function: { name: 'get_weather', arguments: '' }, + }); + + const argsChunk = ( + chunks[2].choices as Array<{ + delta: { + tool_calls?: Array<{ index: number; function?: { arguments?: string } }>; + }; + }> + )[0].delta; + // Arg-stream chunk MUST set `index` (SDKs use it to identify the slot) + // but MUST NOT repeat `id` or `function.name` (only the first chunk owns those). + expect(argsChunk.tool_calls?.[0].index).toBe(0); + expect(argsChunk.tool_calls?.[0].function?.arguments).toBe('{"city":"Paris"}'); + const argEntry = argsChunk.tool_calls?.[0] as { + index: number; + id?: string; + function?: { name?: string; arguments?: string }; + }; + expect(argEntry.id).toBeUndefined(); + expect(argEntry.function?.name).toBeUndefined(); + + const terminal = chunks[3]; + expect((terminal.choices as Array<{ finish_reason: string }>)[0].finish_reason).toBe( + 'tool_calls', + ); + }); + + it('emits the empty-arguments tool call without an arg-stream chunk', () => { + const chunks = forwardTranslateToSseChunks( + mockResponse({ + tool_calls: [{ id: 'call_1', function: { name: 'noop', arguments: '' } }], + }), + 'gpt-4o', + ); + + // opener + first-chunk(id+name) + terminal = 3 — no args slice. + expect(chunks).toHaveLength(3); + const firstToolChunk = (chunks[1].choices as Array<{ delta: { tool_calls?: unknown[] } }>)[0] + .delta; + expect(firstToolChunk.tool_calls).toBeDefined(); + expect((chunks[2].choices as Array<{ finish_reason: string }>)[0].finish_reason).toBe( + 'tool_calls', + ); + }); + + it('emits two first-chunks (one per tool) for multi-tool responses', () => { + const chunks = forwardTranslateToSseChunks( + mockResponse({ + tool_calls: [ + { id: 'a', function: { name: 'one', arguments: '{"a":1}' } }, + { id: 'b', function: { name: 'two', arguments: '{"b":2}' } }, + ], + }), + 'gpt-4o', + ); + + const firstChunks = chunks + .flatMap( + (c) => + (c.choices as Array<{ delta: { tool_calls?: Array<{ id?: string }> } }>)[0].delta + .tool_calls ?? [], + ) + .filter((tc) => typeof tc.id === 'string'); + expect(firstChunks.map((tc) => tc.id)).toEqual(['a', 'b']); + + const terminal = chunks[chunks.length - 1]; + expect((terminal.choices as Array<{ finish_reason: string }>)[0].finish_reason).toBe( + 'tool_calls', + ); + }); + + it('streams empty content as the terminal finish_reason chunk only (no content chunk)', () => { + const chunks = forwardTranslateToSseChunks(mockResponse({ content: '' }), 'gpt-4o'); + + // opener + terminal = 2. + expect(chunks).toHaveLength(2); + const terminal = chunks[chunks.length - 1]; + expect((terminal.choices as Array<{ finish_reason: string }>)[0].finish_reason).toBe('stop'); + }); + + it('uses the provided model verbatim across all chunks', () => { + const chunks = forwardTranslateToSseChunks(mockResponse({ content: 'hi' }), 'gpt-5'); + expect(chunks.every((c) => c.model === 'gpt-5')).toBe(true); + }); }); describe('buildOpenAiErrorEnvelope', () => { diff --git a/packages/cli/src/modules/instance-ai/eval/__tests__/openai-responses-envelope.test.ts b/packages/cli/src/modules/instance-ai/eval/__tests__/openai-responses-envelope.test.ts new file mode 100644 index 00000000000..0e7567800ab --- /dev/null +++ b/packages/cli/src/modules/instance-ai/eval/__tests__/openai-responses-envelope.test.ts @@ -0,0 +1,367 @@ +import type { EvalMockHttpResponse } from 'n8n-core'; + +import { + buildResponsesErrorEnvelope, + extractResponsesRequestModel, + forwardTranslateToResponsesEnvelope, + forwardTranslateToResponsesSseEvents, + isResponsesStreamRequested, + reverseTranslateOpenAiResponsesRequest, +} from '../openai-responses-envelope'; + +describe('reverseTranslateOpenAiResponsesRequest', () => { + it('emits the synthetic /v1/responses URL and POST method', () => { + const result = reverseTranslateOpenAiResponsesRequest({ model: 'gpt-4o-mini', input: [] }); + + expect(result.url).toBe('https://api.openai.com/v1/responses'); + expect(result.method).toBe('POST'); + }); + + it('passes the inbound body through unchanged', () => { + const body = { + model: 'gpt-4o', + input: [{ role: 'user', content: 'hi' }], + tools: [{ type: 'function', name: 'foo' }], + stream: true, + }; + + const result = reverseTranslateOpenAiResponsesRequest(body); + + expect(result.body).toBe(body); + }); + + it('substitutes an empty object when body is null or undefined', () => { + expect(reverseTranslateOpenAiResponsesRequest(undefined).body).toEqual({}); + expect(reverseTranslateOpenAiResponsesRequest(null).body).toEqual({}); + }); +}); + +describe('extractResponsesRequestModel', () => { + it('returns the model string from a well-formed body', () => { + expect(extractResponsesRequestModel({ model: 'gpt-5' })).toBe('gpt-5'); + }); + + it('falls back to gpt-4o-mini for missing, empty, or non-string values', () => { + expect(extractResponsesRequestModel({})).toBe('gpt-4o-mini'); + expect(extractResponsesRequestModel({ model: '' })).toBe('gpt-4o-mini'); + expect(extractResponsesRequestModel({ model: 42 })).toBe('gpt-4o-mini'); + expect(extractResponsesRequestModel(undefined)).toBe('gpt-4o-mini'); + expect(extractResponsesRequestModel(null)).toBe('gpt-4o-mini'); + }); +}); + +describe('isResponsesStreamRequested', () => { + it('returns true only when stream === true', () => { + expect(isResponsesStreamRequested({ stream: true })).toBe(true); + }); + + it('returns false for missing, false, or truthy-non-true values', () => { + expect(isResponsesStreamRequested({})).toBe(false); + expect(isResponsesStreamRequested({ stream: false })).toBe(false); + expect(isResponsesStreamRequested({ stream: 1 })).toBe(false); + expect(isResponsesStreamRequested({ stream: 'true' })).toBe(false); + expect(isResponsesStreamRequested(undefined)).toBe(false); + expect(isResponsesStreamRequested(null)).toBe(false); + }); +}); + +describe('forwardTranslateToResponsesEnvelope', () => { + function mockResponse(body: unknown): EvalMockHttpResponse { + return { + body, + headers: { 'content-type': 'application/json' }, + statusCode: 200, + }; + } + + it('produces a `response` envelope with all required top-level fields', () => { + const envelope = forwardTranslateToResponsesEnvelope( + mockResponse({ output_text: 'hello there' }), + 'gpt-4o', + ); + + expect(envelope).toMatchObject({ + object: 'response', + status: 'completed', + model: 'gpt-4o', + usage: { input_tokens: 0, output_tokens: 0, total_tokens: 0 }, + }); + expect(typeof envelope.id).toBe('string'); + expect((envelope.id as string).startsWith('resp_')).toBe(true); + expect(typeof envelope.created_at).toBe('number'); + }); + + it('emits a single assistant message with `annotations: []` on output_text', () => { + const envelope = forwardTranslateToResponsesEnvelope( + mockResponse({ output_text: 'a reply' }), + 'gpt-4o', + ); + + const output = envelope.output as Array<{ + type: string; + role: string; + content: Array<{ type: string; text: string; annotations: unknown[] }>; + }>; + expect(output).toHaveLength(1); + expect(output[0].type).toBe('message'); + expect(output[0].role).toBe('assistant'); + expect(output[0].content[0].type).toBe('output_text'); + expect(output[0].content[0].text).toBe('a reply'); + // `annotations: []` is required by the OpenAI SDK — LangChain's + // extractor calls `.annotations.map(...)` and crashes on undefined. + expect(output[0].content[0].annotations).toEqual([]); + }); + + it('extracts content from `output_text`, `content`, and `message` shorthand bodies', () => { + const cases: Array<[unknown, string]> = [ + [{ output_text: 'first' }, 'first'], + [{ content: 'second' }, 'second'], + [{ message: 'third' }, 'third'], + ]; + + for (const [body, expected] of cases) { + const env = forwardTranslateToResponsesEnvelope(mockResponse(body), 'gpt-4o'); + const output = env.output as Array<{ + content: Array<{ text: string }>; + }>; + expect(output[0].content[0].text).toBe(expected); + } + }); + + it('extracts content from an already-shaped responses envelope', () => { + const inner = { + id: 'resp_inner', + object: 'response', + output: [ + { + id: 'msg_inner', + type: 'message', + role: 'assistant', + content: [{ type: 'output_text', text: 'unwrap me', annotations: [] }], + status: 'completed', + }, + ], + }; + const env = forwardTranslateToResponsesEnvelope(mockResponse(inner), 'gpt-4o'); + const output = env.output as Array<{ content: Array<{ text: string }> }>; + expect(output[0].content[0].text).toBe('unwrap me'); + }); + + it('replaces the message with a function_call item when the body has tool_calls', () => { + const envelope = forwardTranslateToResponsesEnvelope( + mockResponse({ + tool_calls: [ + { id: 'call_1', function: { name: 'lookup_order', arguments: '{"id":"42"}' } }, + ], + }), + 'gpt-4o', + ); + + const output = envelope.output as Array>; + expect(output).toHaveLength(1); + expect(output[0].type).toBe('function_call'); + expect(output[0].name).toBe('lookup_order'); + expect(output[0].call_id).toBe('call_1'); + expect(output[0].arguments).toBe('{"id":"42"}'); + // No message item alongside the tool call — Responses API mode is exclusive. + expect(output.find((item) => item.type === 'message')).toBeUndefined(); + }); + + it('emits multiple function_call items when several tool_calls are present', () => { + const envelope = forwardTranslateToResponsesEnvelope( + mockResponse({ + tool_calls: [ + { id: 'a', function: { name: 'one', arguments: '{}' } }, + { id: 'b', function: { name: 'two', arguments: '{}' } }, + ], + }), + 'gpt-4o', + ); + const output = envelope.output as Array<{ type: string; name: string }>; + expect(output.map((o) => o.type)).toEqual(['function_call', 'function_call']); + expect(output.map((o) => o.name)).toEqual(['one', 'two']); + }); +}); + +describe('forwardTranslateToResponsesSseEvents', () => { + function mockResponse(body: unknown): EvalMockHttpResponse { + return { + body, + headers: { 'content-type': 'application/json' }, + statusCode: 200, + }; + } + + it('emits the canonical event sequence for a plain text response', () => { + const events = forwardTranslateToResponsesSseEvents( + mockResponse({ output_text: 'hello' }), + 'gpt-4o', + ); + + const eventNames = events.map((e) => e.event); + expect(eventNames).toEqual([ + 'response.created', + 'response.in_progress', + 'response.output_item.added', + 'response.content_part.added', + 'response.output_text.delta', + 'response.output_text.done', + 'response.content_part.done', + 'response.output_item.done', + 'response.completed', + ]); + }); + + it('skips the output_text.delta event when content is empty', () => { + const events = forwardTranslateToResponsesSseEvents( + mockResponse({ output_text: '' }), + 'gpt-4o', + ); + const eventNames = events.map((e) => e.event); + expect(eventNames).not.toContain('response.output_text.delta'); + expect(eventNames[eventNames.length - 1]).toBe('response.completed'); + }); + + it('every event carries `annotations: []` on output_text parts', () => { + const events = forwardTranslateToResponsesSseEvents( + mockResponse({ output_text: 'hi' }), + 'gpt-4o', + ); + + const partEvents = events.filter( + (e) => e.event === 'response.content_part.added' || e.event === 'response.content_part.done', + ); + for (const e of partEvents) { + const part = (e.data as { part?: { annotations?: unknown } }).part; + expect(part?.annotations).toEqual([]); + } + }); + + it('terminal message item (`output_item.done`, `response.completed`) carries `annotations: []`', () => { + // Regression: earlier the terminal `messageItem` set `content: + // [{ type: 'output_text', text }]` without `annotations: []`. SDK + // consumers iterating the completed response would crash on + // `.annotations.map(...)` exactly like the non-streaming bug we + // already fixed. + const events = forwardTranslateToResponsesSseEvents( + mockResponse({ output_text: 'hello' }), + 'gpt-4o', + ); + + type MsgItem = { content?: Array<{ type?: string; annotations?: unknown }> }; + const findItem = (eventName: string): MsgItem | undefined => { + const e = events.find((ev) => ev.event === eventName); + if (eventName === 'response.completed') { + return ((e?.data as { response?: { output?: MsgItem[] } }).response?.output ?? [])[0]; + } + return (e?.data as { item?: MsgItem }).item; + }; + + for (const name of [ + 'response.output_item.added', + 'response.output_item.done', + 'response.completed', + ]) { + const item = findItem(name); + expect(item?.content?.[0].type).toBe('output_text'); + expect(item?.content?.[0].annotations).toEqual([]); + } + }); + + it('keeps `id` stable across output_item / arguments / completed events for the same tool call', () => { + // Regression: earlier the SSE path generated the tool-call `id` once + // for `output_item.added/done` and then re-ran the synthesizer for + // `response.completed.output[]`, producing two different `fc_` + // values for the same `output_index`. SDK consumers that reconcile + // state by `id` (e.g. tracing UIs) would fail to match. + const events = forwardTranslateToResponsesSseEvents( + mockResponse({ + tool_calls: [ + { id: 'call_x', function: { name: 'fn', arguments: '{}' } }, + { id: 'call_y', function: { name: 'fn2', arguments: '{}' } }, + ], + }), + 'gpt-4o', + ); + + const addedItems = events.filter((e) => e.event === 'response.output_item.added'); + const doneItems = events.filter((e) => e.event === 'response.output_item.done'); + const completed = events.find((e) => e.event === 'response.completed'); + const completedOutput = (completed?.data as { response?: { output?: Array<{ id?: string }> } }) + .response?.output; + + for (let i = 0; i < addedItems.length; i++) { + const addedId = (addedItems[i].data as { item?: { id?: string } }).item?.id; + const doneId = (doneItems[i].data as { item?: { id?: string } }).item?.id; + const completedId = completedOutput?.[i].id; + expect(addedId).toBe(doneId); + expect(addedId).toBe(completedId); + expect(typeof addedId).toBe('string'); + } + }); + + it('emits function_call event sequence with delta + done arguments for tool calls', () => { + const events = forwardTranslateToResponsesSseEvents( + mockResponse({ + tool_calls: [{ id: 'call_xyz', function: { name: 'lookup', arguments: '{"q":"hi"}' } }], + }), + 'gpt-4o', + ); + + const eventNames = events.map((e) => e.event); + expect(eventNames).toContain('response.output_item.added'); + expect(eventNames).toContain('response.function_call_arguments.delta'); + expect(eventNames).toContain('response.function_call_arguments.done'); + expect(eventNames).toContain('response.output_item.done'); + expect(eventNames[eventNames.length - 1]).toBe('response.completed'); + + const deltaEvent = events.find((e) => e.event === 'response.function_call_arguments.delta'); + expect((deltaEvent?.data as { delta?: string })?.delta).toBe('{"q":"hi"}'); + + const doneEvent = events.find((e) => e.event === 'response.function_call_arguments.done'); + expect((doneEvent?.data as { arguments?: string })?.arguments).toBe('{"q":"hi"}'); + }); + + it('skips the function_call_arguments.delta event when arguments are empty', () => { + const events = forwardTranslateToResponsesSseEvents( + mockResponse({ + tool_calls: [{ id: 'call_1', function: { name: 'noop', arguments: '' } }], + }), + 'gpt-4o', + ); + + const deltaEvent = events.find((e) => e.event === 'response.function_call_arguments.delta'); + expect(deltaEvent).toBeUndefined(); + expect(events.find((e) => e.event === 'response.function_call_arguments.done')).toBeDefined(); + }); + + it('uses a single response id across the entire event sequence', () => { + const events = forwardTranslateToResponsesSseEvents( + mockResponse({ output_text: 'hi' }), + 'gpt-4o', + ); + const ids = new Set(); + for (const e of events) { + const data = e.data as { response?: { id?: string } }; + if (data.response?.id) ids.add(data.response.id); + } + expect(ids.size).toBe(1); + const id = Array.from(ids)[0]; + expect(id?.startsWith('resp_')).toBe(true); + }); +}); + +describe('buildResponsesErrorEnvelope', () => { + it('produces the standard error shape with the supplied message', () => { + const envelope = buildResponsesErrorEnvelope('mock failed: rate-limited'); + + expect(envelope).toEqual({ + error: { + message: 'mock failed: rate-limited', + type: 'eval_wire_server_error', + code: 'eval_mock_generation_failed', + param: null, + }, + }); + }); +}); diff --git a/packages/cli/src/modules/instance-ai/eval/__tests__/workflow-analysis.test.ts b/packages/cli/src/modules/instance-ai/eval/__tests__/workflow-analysis.test.ts index ce0ddf9cb61..299ee60644d 100644 --- a/packages/cli/src/modules/instance-ai/eval/__tests__/workflow-analysis.test.ts +++ b/packages/cli/src/modules/instance-ai/eval/__tests__/workflow-analysis.test.ts @@ -11,11 +11,11 @@ import { createEvalAgent, extractText } from '@n8n/instance-ai'; import type { IConnections, INode, INodeParameters, IWorkflowBase } from 'n8n-workflow'; import { - assertUnpinCompatibility, buildVendorLlmRouting, generateMockHints, identifyNodesForHints, identifyNodesForPinData, + partitionAiRoots, } from '../workflow-analysis'; import { UserError } from 'n8n-workflow'; @@ -205,7 +205,7 @@ describe('identifyNodesForPinData', () => { }); }); -describe('assertUnpinCompatibility', () => { +describe('partitionAiRoots', () => { function agentWithMemory(memoryType: string) { const nodes = [ makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }), @@ -219,164 +219,166 @@ describe('assertUnpinCompatibility', () => { return makeWorkflow(nodes, connections); } - it('is a no-op when unpinNodes is empty', () => { - const workflow = agentWithMemory('@n8n/n8n-nodes-langchain.memoryPostgresChat'); - expect(() => assertUnpinCompatibility(workflow, [])).not.toThrow(); + describe('explicit pin validation (typo guard)', () => { + it('throws when an explicit pin name does not exist in the workflow', () => { + const workflow = agentWithMemory('@n8n/n8n-nodes-langchain.memoryBufferWindow'); + let thrown: unknown; + try { + partitionAiRoots(workflow, ['Ghost']); + } catch (e) { + thrown = e; + } + expect(thrown).toBeInstanceOf(UserError); + expect((thrown as UserError).message).toContain('not found in workflow'); + expect((thrown as UserError).message).toContain('"Ghost"'); + }); + + it('throws when an explicit pin name refers to a disabled root', () => { + const nodes = [ + makeNode({ name: 'PgMem', type: '@n8n/n8n-nodes-langchain.memoryPostgresChat' }), + makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent', disabled: true }), + ]; + const connections: IConnections = { + PgMem: { ai_memory: [[{ node: 'Agent', type: 'ai_memory', index: 0 }]] }, + }; + let thrown: unknown; + try { + partitionAiRoots(makeWorkflow(nodes, connections), ['Agent']); + } catch (e) { + thrown = e; + } + expect(thrown).toBeInstanceOf(UserError); + expect((thrown as UserError).message).toContain('disabled'); + expect((thrown as UserError).message).toContain('"Agent"'); + }); + + it('throws when an explicit pin name refers to a non-AI-root node', () => { + const nodes = [ + makeNode({ name: 'Set', type: 'n8n-nodes-base.set' }), + makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }), + ]; + let thrown: unknown; + try { + partitionAiRoots(makeWorkflow(nodes), ['Set']); + } catch (e) { + thrown = e; + } + expect(thrown).toBeInstanceOf(UserError); + expect((thrown as UserError).message).toContain('not AI root nodes'); + expect((thrown as UserError).message).toContain('"Set"'); + }); }); - it('allows unpinning an Agent backed by MemoryBufferWindow', () => { - const workflow = agentWithMemory('@n8n/n8n-nodes-langchain.memoryBufferWindow'); - expect(() => assertUnpinCompatibility(workflow, ['Agent'])).not.toThrow(); + describe('default partition (no explicit pin)', () => { + it('intercepts an Agent backed by a non-protocol-binary memory', () => { + const workflow = agentWithMemory('@n8n/n8n-nodes-langchain.memoryBufferWindow'); + const result = partitionAiRoots(workflow); + expect(result.unpinNodes).toEqual(['Agent']); + expect(result.pinNodes).toEqual([]); + expect(result.autoPinned).toEqual([]); + }); + + it('returns an empty partition when the workflow has no AI roots', () => { + const nodes = [makeNode({ name: 'Set', type: 'n8n-nodes-base.set' })]; + const result = partitionAiRoots(makeWorkflow(nodes)); + expect(result.unpinNodes).toEqual([]); + expect(result.pinNodes).toEqual([]); + expect(result.autoPinned).toEqual([]); + }); + + it('ignores disabled sub-nodes when partitioning', () => { + const nodes = [ + makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }), + makeNode({ + name: 'PgMem', + type: '@n8n/n8n-nodes-langchain.memoryPostgresChat', + disabled: true, + }), + makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }), + ]; + const connections: IConnections = { + OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] }, + PgMem: { ai_memory: [[{ node: 'Agent', type: 'ai_memory', index: 0 }]] }, + }; + const result = partitionAiRoots(makeWorkflow(nodes, connections)); + expect(result.unpinNodes).toEqual(['Agent']); + expect(result.autoPinned).toEqual([]); + }); }); - it('allows unpinning an Agent with no sub-nodes attached', () => { - const nodes = [makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' })]; - expect(() => assertUnpinCompatibility(makeWorkflow(nodes), ['Agent'])).not.toThrow(); + describe('explicit pin opt-out', () => { + it('moves explicitly pinned roots to pinNodes', () => { + const nodes = [ + makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }), + makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }), + ]; + const connections: IConnections = { + OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] }, + }; + const result = partitionAiRoots(makeWorkflow(nodes, connections), ['Agent']); + expect(result.unpinNodes).toEqual([]); + expect(result.pinNodes).toEqual(['Agent']); + expect(result.autoPinned).toEqual([]); + }); }); - it('ignores disabled sub-nodes when checking compatibility', () => { - const nodes = [ - makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }), - makeNode({ - name: 'PgMem', - type: '@n8n/n8n-nodes-langchain.memoryPostgresChat', - disabled: true, - }), - makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }), - ]; - const connections: IConnections = { - OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] }, - PgMem: { ai_memory: [[{ node: 'Agent', type: 'ai_memory', index: 0 }]] }, - }; - expect(() => - assertUnpinCompatibility(makeWorkflow(nodes, connections), ['Agent']), - ).not.toThrow(); - }); + describe('auto-pin on incompatible sub-nodes', () => { + it.each([ + ['Postgres memory', '@n8n/n8n-nodes-langchain.memoryPostgresChat'], + ['Redis memory', '@n8n/n8n-nodes-langchain.memoryRedisChat'], + ['MongoDB memory', '@n8n/n8n-nodes-langchain.memoryMongoDbChat'], + ])('auto-pins an Agent backed by %s', (_label, memoryType) => { + const workflow = agentWithMemory(memoryType); + const result = partitionAiRoots(workflow); + expect(result.unpinNodes).toEqual([]); + expect(result.pinNodes).toEqual(['Agent']); + expect(result.autoPinned).toContainEqual({ + root: 'Agent', + subNode: 'Memory', + subNodeType: memoryType, + reason: 'protocol_binary', + }); + }); - it('refuses unknown root names rather than silently skipping (typo guard)', () => { - const workflow = agentWithMemory('@n8n/n8n-nodes-langchain.memoryBufferWindow'); + it.each([ + '@n8n/n8n-nodes-langchain.vectorStorePGVector', + '@n8n/n8n-nodes-langchain.vectorStoreMongoDBAtlas', + '@n8n/n8n-nodes-langchain.vectorStoreRedis', + '@n8n/n8n-nodes-langchain.vectorStoreMilvus', + '@n8n/n8n-nodes-langchain.chatHubVectorStorePGVector', + ])('auto-pins an Agent backed by protocol-binary vector store %s', (vectorStoreType) => { + const nodes = [ + makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }), + makeNode({ name: 'Store', type: vectorStoreType }), + makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }), + ]; + const connections: IConnections = { + OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] }, + Store: { ai_vectorStore: [[{ node: 'Agent', type: 'ai_vectorStore', index: 0 }]] }, + }; + const result = partitionAiRoots(makeWorkflow(nodes, connections)); + expect(result.pinNodes).toEqual(['Agent']); + expect(result.autoPinned.some((e) => e.reason === 'protocol_binary')).toBe(true); + }); - let thrown: unknown; - try { - assertUnpinCompatibility(workflow, ['Ghost']); - } catch (e) { - thrown = e; - } - - expect(thrown).toBeInstanceOf(UserError); - expect((thrown as UserError).message).toContain('not found in workflow'); - expect((thrown as UserError).message).toContain('"Ghost"'); - }); - - it('refuses disabled roots rather than silently skipping (typo guard)', () => { - const nodes = [ - makeNode({ name: 'PgMem', type: '@n8n/n8n-nodes-langchain.memoryPostgresChat' }), - makeNode({ - name: 'Agent', - type: '@n8n/n8n-nodes-langchain.agent', - disabled: true, - }), - ]; - const connections: IConnections = { - PgMem: { ai_memory: [[{ node: 'Agent', type: 'ai_memory', index: 0 }]] }, - }; - - let thrown: unknown; - try { - assertUnpinCompatibility(makeWorkflow(nodes, connections), ['Agent']); - } catch (e) { - thrown = e; - } - - expect(thrown).toBeInstanceOf(UserError); - expect((thrown as UserError).message).toContain('disabled'); - expect((thrown as UserError).message).toContain('"Agent"'); - }); - - it('refuses non-AI-root nodes (e.g. a regular Set node in unpinNodes is a caller mistake)', () => { - const nodes = [ - makeNode({ name: 'Set', type: 'n8n-nodes-base.set' }), - makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }), - ]; - - let thrown: unknown; - try { - assertUnpinCompatibility(makeWorkflow(nodes), ['Set']); - } catch (e) { - thrown = e; - } - - expect(thrown).toBeInstanceOf(UserError); - expect((thrown as UserError).message).toContain('not AI root nodes'); - expect((thrown as UserError).message).toContain('"Set"'); - }); - - it.each([ - '@n8n/n8n-nodes-langchain.chainLlm', - '@n8n/n8n-nodes-langchain.chainRetrievalQa', - '@n8n/n8n-nodes-langchain.chainSummarization', - ])('recognises %s by type even when it has no inbound ai_* connections', (chainType) => { - const nodes = [makeNode({ name: 'Chain', type: chainType })]; - expect(() => assertUnpinCompatibility(makeWorkflow(nodes), ['Chain'])).not.toThrow(); - }); - - it.each([ - ['Postgres memory', '@n8n/n8n-nodes-langchain.memoryPostgresChat'], - ['Redis memory', '@n8n/n8n-nodes-langchain.memoryRedisChat'], - ['MongoDB memory', '@n8n/n8n-nodes-langchain.memoryMongoDbChat'], - ])('refuses unpinning an Agent backed by %s', (_label, memoryType) => { - const workflow = agentWithMemory(memoryType); - expect(() => assertUnpinCompatibility(workflow, ['Agent'])).toThrow(UserError); - }); - - it.each([ - '@n8n/n8n-nodes-langchain.vectorStorePGVector', - '@n8n/n8n-nodes-langchain.vectorStoreMongoDBAtlas', - '@n8n/n8n-nodes-langchain.vectorStoreRedis', - '@n8n/n8n-nodes-langchain.vectorStoreMilvus', - '@n8n/n8n-nodes-langchain.chatHubVectorStorePGVector', - ])('refuses unpinning an Agent backed by protocol-binary vector store %s', (vectorStoreType) => { - const nodes = [ - makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }), - makeNode({ name: 'Store', type: vectorStoreType }), - makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }), - ]; - const connections: IConnections = { - OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] }, - Store: { ai_vectorStore: [[{ node: 'Agent', type: 'ai_vectorStore', index: 0 }]] }, - }; - expect(() => assertUnpinCompatibility(makeWorkflow(nodes, connections), ['Agent'])).toThrow( - UserError, - ); - }); - - it('reports all offending roots when multiple unpin targets are mixed', () => { - const nodes = [ - makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }), - makeNode({ name: 'PgMem', type: '@n8n/n8n-nodes-langchain.memoryPostgresChat' }), - makeNode({ name: 'BufMem', type: '@n8n/n8n-nodes-langchain.memoryBufferWindow' }), - makeNode({ name: 'AgentA', type: '@n8n/n8n-nodes-langchain.agent' }), - makeNode({ name: 'AgentB', type: '@n8n/n8n-nodes-langchain.agent' }), - ]; - const connections: IConnections = { - OpenAI: { ai_languageModel: [[{ node: 'AgentB', type: 'ai_languageModel', index: 0 }]] }, - PgMem: { ai_memory: [[{ node: 'AgentA', type: 'ai_memory', index: 0 }]] }, - BufMem: { ai_memory: [[{ node: 'AgentB', type: 'ai_memory', index: 0 }]] }, - }; - - let thrown: unknown; - try { - assertUnpinCompatibility(makeWorkflow(nodes, connections), ['AgentA', 'AgentB']); - } catch (e) { - thrown = e; - } - - expect(thrown).toBeInstanceOf(UserError); - const message = (thrown as UserError).message; - expect(message).toContain('AgentA'); - expect(message).toContain('PgMem'); - expect(message).not.toContain('AgentB'); - expect(message).not.toContain('BufMem'); + it('partitions independently across multiple roots — pin one, intercept the other', () => { + const nodes = [ + makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }), + makeNode({ name: 'PgMem', type: '@n8n/n8n-nodes-langchain.memoryPostgresChat' }), + makeNode({ name: 'BufMem', type: '@n8n/n8n-nodes-langchain.memoryBufferWindow' }), + makeNode({ name: 'AgentA', type: '@n8n/n8n-nodes-langchain.agent' }), + makeNode({ name: 'AgentB', type: '@n8n/n8n-nodes-langchain.agent' }), + ]; + const connections: IConnections = { + OpenAI: { ai_languageModel: [[{ node: 'AgentB', type: 'ai_languageModel', index: 0 }]] }, + PgMem: { ai_memory: [[{ node: 'AgentA', type: 'ai_memory', index: 0 }]] }, + BufMem: { ai_memory: [[{ node: 'AgentB', type: 'ai_memory', index: 0 }]] }, + }; + const result = partitionAiRoots(makeWorkflow(nodes, connections)); + expect(result.unpinNodes).toEqual(['AgentB']); + expect(result.pinNodes).toEqual(['AgentA']); + expect(result.autoPinned.map((e) => e.root)).toEqual(['AgentA']); + }); }); describe('vendor LLM mapping', () => { @@ -391,9 +393,10 @@ describe('assertUnpinCompatibility', () => { return makeWorkflow(nodes, connections); } - it('allows unpinning an Agent backed by lmChatOpenAi (the only mapped vendor for M1)', () => { - const workflow = agentWithLlm('@n8n/n8n-nodes-langchain.lmChatOpenAi'); - expect(() => assertUnpinCompatibility(workflow, ['Agent'])).not.toThrow(); + it('intercepts an Agent backed by lmChatOpenAi (the only mapped vendor for M1)', () => { + const result = partitionAiRoots(agentWithLlm('@n8n/n8n-nodes-langchain.lmChatOpenAi')); + expect(result.unpinNodes).toEqual(['Agent']); + expect(result.autoPinned).toEqual([]); }); it.each([ @@ -408,51 +411,17 @@ describe('assertUnpinCompatibility', () => { '@n8n/n8n-nodes-langchain.lmChatDeepSeek', '@n8n/n8n-nodes-langchain.lmChatOllama', '@n8n/n8n-nodes-langchain.lmOpenAi', - ])('refuses unpinning an Agent backed by unmapped vendor LLM %s', (llmType) => { - const workflow = agentWithLlm(llmType); - - let thrown: unknown; - try { - assertUnpinCompatibility(workflow, ['Agent']); - } catch (e) { - thrown = e; - } - - expect(thrown).toBeInstanceOf(UserError); - const message = (thrown as UserError).message; - expect(message).toContain('unsupported vendor LLM'); - expect(message).toContain(llmType); + ])('auto-pins an Agent backed by unmapped vendor LLM %s', (llmType) => { + const result = partitionAiRoots(agentWithLlm(llmType)); + expect(result.pinNodes).toEqual(['Agent']); + expect(result.autoPinned[0]).toMatchObject({ + root: 'Agent', + subNodeType: llmType, + reason: 'unsupported_vendor_llm', + }); }); - it('groups protocol-binary and unsupported-vendor refusals into the same error', () => { - const nodes = [ - makeNode({ name: 'Anthropic', type: '@n8n/n8n-nodes-langchain.lmChatAnthropic' }), - makeNode({ name: 'PgMem', type: '@n8n/n8n-nodes-langchain.memoryPostgresChat' }), - makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }), - ]; - const connections: IConnections = { - Anthropic: { - ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]], - }, - PgMem: { ai_memory: [[{ node: 'Agent', type: 'ai_memory', index: 0 }]] }, - }; - - let thrown: unknown; - try { - assertUnpinCompatibility(makeWorkflow(nodes, connections), ['Agent']); - } catch (e) { - thrown = e; - } - - expect(thrown).toBeInstanceOf(UserError); - const message = (thrown as UserError).message; - expect(message).toContain('protocol-binary'); - expect(message).toContain('PgMem'); - expect(message).toContain('unsupported vendor LLM'); - expect(message).toContain('Anthropic'); - }); - - it('ignores disabled vendor LLM sub-nodes when checking compatibility', () => { + it('ignores disabled vendor LLM sub-nodes when partitioning', () => { const nodes = [ makeNode({ name: 'Anthropic', @@ -466,10 +435,8 @@ describe('assertUnpinCompatibility', () => { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]], }, }; - - expect(() => - assertUnpinCompatibility(makeWorkflow(nodes, connections), ['Agent']), - ).not.toThrow(); + const result = partitionAiRoots(makeWorkflow(nodes, connections)); + expect(result.unpinNodes).toEqual(['Agent']); }); describe('lmChatOpenAi options.baseURL override', () => { @@ -488,71 +455,26 @@ describe('assertUnpinCompatibility', () => { return makeWorkflow(nodes, connections); } - it('allows lmChatOpenAi with no options', () => { - const workflow = agentWithOpenAi({}); - expect(() => assertUnpinCompatibility(workflow, ['Agent'])).not.toThrow(); + it.each([ + ['no options', {}], + ['empty baseURL', { options: { baseURL: '' } }], + ['whitespace-only baseURL', { options: { baseURL: ' ' } }], + ])('intercepts lmChatOpenAi with %s', (_label, parameters) => { + const result = partitionAiRoots(agentWithOpenAi(parameters)); + expect(result.unpinNodes).toEqual(['Agent']); }); - it('allows lmChatOpenAi with empty options.baseURL', () => { - const workflow = agentWithOpenAi({ options: { baseURL: '' } }); - expect(() => assertUnpinCompatibility(workflow, ['Agent'])).not.toThrow(); - }); - - it('allows lmChatOpenAi when options.baseURL is whitespace-only', () => { - const workflow = agentWithOpenAi({ options: { baseURL: ' ' } }); - expect(() => assertUnpinCompatibility(workflow, ['Agent'])).not.toThrow(); - }); - - it('refuses lmChatOpenAi when options.baseURL is set — credential rewrite would be bypassed', () => { + it('auto-pins lmChatOpenAi when options.baseURL would bypass the credential rewrite', () => { const workflow = agentWithOpenAi({ options: { baseURL: 'https://my-proxy.example.com/v1' }, }); - - let thrown: unknown; - try { - assertUnpinCompatibility(workflow, ['Agent']); - } catch (e) { - thrown = e; - } - - expect(thrown).toBeInstanceOf(UserError); - const message = (thrown as UserError).message; - expect(message).toContain('options.baseURL'); - expect(message).toContain('"OpenAI"'); - expect(message).not.toContain('unsupported vendor LLM'); - }); - - it('groups baseURL-override refusals alongside protocol-binary refusals', () => { - const nodes = [ - makeNode({ - name: 'OpenAI', - type: '@n8n/n8n-nodes-langchain.lmChatOpenAi', - parameters: { options: { baseURL: 'https://my-proxy.example.com/v1' } }, - }), - makeNode({ - name: 'PgMem', - type: '@n8n/n8n-nodes-langchain.memoryPostgresChat', - }), - makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }), - ]; - const connections: IConnections = { - OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] }, - PgMem: { ai_memory: [[{ node: 'Agent', type: 'ai_memory', index: 0 }]] }, - }; - - let thrown: unknown; - try { - assertUnpinCompatibility(makeWorkflow(nodes, connections), ['Agent']); - } catch (e) { - thrown = e; - } - - expect(thrown).toBeInstanceOf(UserError); - const message = (thrown as UserError).message; - expect(message).toContain('protocol-binary'); - expect(message).toContain('PgMem'); - expect(message).toContain('options.baseURL'); - expect(message).toContain('OpenAI'); + const result = partitionAiRoots(workflow); + expect(result.pinNodes).toEqual(['Agent']); + expect(result.autoPinned[0]).toMatchObject({ + root: 'Agent', + subNode: 'OpenAI', + reason: 'unsafe_baseurl_override', + }); }); it('skips the baseURL check when the OpenAI sub-node is disabled', () => { @@ -568,15 +490,13 @@ describe('assertUnpinCompatibility', () => { const connections: IConnections = { OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] }, }; - - expect(() => - assertUnpinCompatibility(makeWorkflow(nodes, connections), ['Agent']), - ).not.toThrow(); + const result = partitionAiRoots(makeWorkflow(nodes, connections)); + expect(result.unpinNodes).toEqual(['Agent']); }); }); - describe('shared vendor LLM sub-node across multiple unpinned roots', () => { - it('refuses unpinning both roots when one OpenAI sub-node feeds both', () => { + describe('shared vendor LLM sub-node across multiple roots', () => { + function workflowWithSharedSubNode(): IWorkflowBase { const nodes = [ makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }), makeNode({ name: 'AgentA', type: '@n8n/n8n-nodes-langchain.agent' }), @@ -592,49 +512,25 @@ describe('assertUnpinCompatibility', () => { ], }, }; + return makeWorkflow(nodes, connections); + } - let thrown: unknown; - try { - assertUnpinCompatibility(makeWorkflow(nodes, connections), ['AgentA', 'AgentB']); - } catch (e) { - thrown = e; - } - - expect(thrown).toBeInstanceOf(UserError); - const message = (thrown as UserError).message; - expect(message).toContain('shared by multiple unpinned roots'); - expect(message).toContain('"OpenAI"'); - // Both root attributions listed in the error so the user can see - // exactly which conflict to resolve. - expect(message).toContain('AgentA'); - expect(message).toContain('AgentB'); + it('auto-pins both roots when one OpenAI sub-node feeds both', () => { + const result = partitionAiRoots(workflowWithSharedSubNode()); + expect(result.unpinNodes).toEqual([]); + expect(result.pinNodes).toEqual(['AgentA', 'AgentB']); + const reasons = result.autoPinned.map((e) => e.reason); + expect(reasons).toContain('shared_vendor_llm_subnode'); }); - it('allows unpinning when only one root references the shared OpenAI sub-node', () => { - const nodes = [ - makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }), - makeNode({ name: 'AgentA', type: '@n8n/n8n-nodes-langchain.agent' }), - makeNode({ name: 'AgentB', type: '@n8n/n8n-nodes-langchain.agent' }), - ]; - const connections: IConnections = { - OpenAI: { - ai_languageModel: [ - [ - { node: 'AgentA', type: 'ai_languageModel', index: 0 }, - { node: 'AgentB', type: 'ai_languageModel', index: 0 }, - ], - ], - }, - }; - - // Only AgentA is being unpinned — AgentB stays pinned so there's - // no attribution conflict at the wire-server layer. - expect(() => - assertUnpinCompatibility(makeWorkflow(nodes, connections), ['AgentA']), - ).not.toThrow(); + it('intercepts the remaining root when the other one is explicitly pinned', () => { + // AgentA is opted out → AgentB no longer shares the sub-node ambiguously. + const result = partitionAiRoots(workflowWithSharedSubNode(), ['AgentA']); + expect(result.unpinNodes).toEqual(['AgentB']); + expect(result.pinNodes).toEqual(['AgentA']); }); - it('ignores a disabled sub-node when counting shared references', () => { + it('ignores a disabled shared sub-node when partitioning', () => { const nodes = [ makeNode({ name: 'OpenAI', @@ -654,10 +550,8 @@ describe('assertUnpinCompatibility', () => { ], }, }; - - expect(() => - assertUnpinCompatibility(makeWorkflow(nodes, connections), ['AgentA', 'AgentB']), - ).not.toThrow(); + const result = partitionAiRoots(makeWorkflow(nodes, connections)); + expect(result.unpinNodes.sort()).toEqual(['AgentA', 'AgentB']); }); }); }); @@ -694,6 +588,25 @@ describe('buildVendorLlmRouting', () => { expect(routing.rootToSubNode.get('Agent')?.name).toBe('OpenAI'); }); + it('also self-maps the root in subNodeToRoot so agent-context credential lookups resolve', () => { + // LangChain's Agent invokes the LLM sub-node's `supplyData` with a + // context whose `executeData.node` is the Agent itself (observed + // empirically). The credential helper looks up `subNodeToRoot` by + // that name — without the self-map, the lookup would miss and the + // SDK would post to the wire server's loud-fail no-root route. + const nodes = [ + makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }), + makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }), + ]; + const connections: IConnections = { + OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] }, + }; + + const routing = buildVendorLlmRouting(makeWorkflow(nodes, connections), ['Agent']); + + expect(routing.subNodeToRoot.get('Agent')).toBe('Agent'); + }); + it('does not include sub-nodes feeding roots that are still pinned', () => { const nodes = [ makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }), @@ -747,7 +660,12 @@ describe('buildVendorLlmRouting', () => { const routing = buildVendorLlmRouting(makeWorkflow(nodes, connections), ['Agent']); - expect(Array.from(routing.subNodeToRoot.keys())).toEqual(['OpenAI']); + // `Agent` is also present in subNodeToRoot via the agent-context + // self-map (see test above) — assert by lookup so the test isn't + // sensitive to insertion order. + expect(routing.subNodeToRoot.get('OpenAI')).toBe('Agent'); + expect(routing.subNodeToRoot.get('Agent')).toBe('Agent'); + expect(routing.subNodeToRoot.size).toBe(2); expect(Array.from(routing.rootToSubNode.keys())).toEqual(['Agent']); }); diff --git a/packages/cli/src/modules/instance-ai/eval/eval-mocked-credentials-helper.ts b/packages/cli/src/modules/instance-ai/eval/eval-mocked-credentials-helper.ts index 450b27d034c..0c0e2e918b9 100644 --- a/packages/cli/src/modules/instance-ai/eval/eval-mocked-credentials-helper.ts +++ b/packages/cli/src/modules/instance-ai/eval/eval-mocked-credentials-helper.ts @@ -3,6 +3,7 @@ import type { InstanceAiEvalRewrittenCredential, } from '@n8n/api-types'; import type { Logger } from '@n8n/backend-common'; +import { buildEvalMockCredentials } from 'n8n-core'; import type { ICredentialDataDecryptedObject, ICredentials, @@ -123,7 +124,22 @@ export class EvalMockedCredentialsHelper extends ICredentialsHelper { credentialId: nodeCredentials.id ?? undefined, }); - credentials = { [MOCK_MARKER]: true }; + // When called with no credential id (eval-mode bypass for nodes + // with no credentials of any type configured), schema-synthesize + // so the wire-server URL rewrite below has a real `url` field to + // augment. Otherwise vendor SDK traffic would escape to the real + // provider with placeholder values and 401 at the wire layer. + // `buildEvalMockCredentials` is typed `Record` — + // schema defaults can be richer than `CredentialInformation`, but + // at runtime emits only JSON-shaped values, which is what the + // rewrite path consumes. + credentials = + nodeCredentials.id === null + ? ({ + ...buildEvalMockCredentials(this.inner.getCredentialsProperties(type)), + [MOCK_MARKER]: true, + } as ICredentialDataDecryptedObject) + : { [MOCK_MARKER]: true }; } return this.applyServerUrlRewrite(credentials, type, nodeCredentials, executeData); diff --git a/packages/cli/src/modules/instance-ai/eval/execution.service.ts b/packages/cli/src/modules/instance-ai/eval/execution.service.ts index 9c9179c55e9..cc247357381 100644 --- a/packages/cli/src/modules/instance-ai/eval/execution.service.ts +++ b/packages/cli/src/modules/instance-ai/eval/execution.service.ts @@ -42,12 +42,12 @@ import { createLlmMockHandler } from './mock-handler'; import { generatePinData } from './pin-data-generator'; import { patchNoProxyForLoopback } from './proxy-loopback'; import { - assertUnpinCompatibility, buildVendorLlmRouting, generateMockHints, identifyNodesForHints, identifyNodesForPinData, type MockHints, + partitionAiRoots, type VendorLlmRouting, } from './workflow-analysis'; @@ -89,11 +89,13 @@ export class EvalExecutionService { return this.errorResult(executionId, `Workflow ${workflowId} not found or not accessible`); } - const unpinNodes = options.unpinNodes ?? []; - - // Compatibility guard runs before the kill-switch so actionable errors aren't shadowed. + // Partition AI roots into "intercept via wire server" vs "leave pinned". + // Default-on: every root with compatible sub-nodes gets intercepted; + // callers can opt specific roots out via `pinNodes` (e.g. for A/B + // comparison). Roots whose sub-nodes are incompatible auto-pin. + let partitioned: ReturnType; try { - assertUnpinCompatibility(workflowEntity, unpinNodes); + partitioned = partitionAiRoots(workflowEntity, options.pinNodes ?? []); } catch (error) { if (error instanceof UserError) { return this.errorResult(executionId, error.message); @@ -101,15 +103,23 @@ export class EvalExecutionService { throw error; } + for (const entry of partitioned.autoPinned) { + this.logger.debug( + `[EvalMock] Auto-pinning AI root "${entry.root}" — sub-node "${entry.subNode}" (${entry.subNodeType}) is ${entry.reason}`, + ); + } + + // Kill-switch: when interception is disabled, every root falls back to + // the pinned path regardless of partition or explicit `pinNodes`. let interceptionEnabled = false; + let unpinNodes = partitioned.unpinNodes; if (unpinNodes.length > 0) { interceptionEnabled = await this.isInterceptionEnabled(user); if (!interceptionEnabled) { - return this.errorResult( - executionId, - '`unpinNodes` is reserved — vendor SDK interception is currently disabled. ' + - 'Submit the request without `unpinNodes` to use the existing pinned path.', + this.logger.warn( + '[EvalMock] Vendor SDK interception disabled by kill-switch — pinning all AI roots', ); + unpinNodes = []; } } diff --git a/packages/cli/src/modules/instance-ai/eval/llm-wire-server.ts b/packages/cli/src/modules/instance-ai/eval/llm-wire-server.ts index b7fdfa3d836..d4fc43e9471 100644 --- a/packages/cli/src/modules/instance-ai/eval/llm-wire-server.ts +++ b/packages/cli/src/modules/instance-ai/eval/llm-wire-server.ts @@ -1,15 +1,25 @@ import type { Logger } from '@n8n/backend-common'; import express, { type Express, type Request, type Response } from 'express'; -import type { EvalLlmMockHandler } from 'n8n-core'; -import type { INode } from 'n8n-workflow'; +import type { EvalLlmMockHandler, EvalMockHttpResponse } from 'n8n-core'; +import type { IHttpRequestOptions, INode } from 'n8n-workflow'; import { type Server } from 'node:http'; import { buildOpenAiErrorEnvelope, extractRequestModel, forwardTranslateToChatCompletion, + forwardTranslateToSseChunks, + isStreamRequested, reverseTranslateOpenAiRequest, } from './openai-envelope'; +import { + buildResponsesErrorEnvelope, + extractResponsesRequestModel, + forwardTranslateToResponsesEnvelope, + forwardTranslateToResponsesSseEvents, + isResponsesStreamRequested, + reverseTranslateOpenAiResponsesRequest, +} from './openai-responses-envelope'; /** Loopback HTTP server that intercepts vendor SDK calls during eval. Binds to an OS-assigned port. */ export interface InterceptedTurn { @@ -31,9 +41,67 @@ export interface LlmWireServerOptions { logger?: Logger; } +/** Per-protocol translator + formatter — adding a new vendor envelope is a new adapter, not a new handler. */ +interface ProtocolAdapter { + name: string; + extractModel(body: unknown): string; + isStreamRequested(body: unknown): boolean; + reverseTranslate(body: unknown): IHttpRequestOptions; + forwardObject(response: EvalMockHttpResponse | undefined, model: string): Record; + /** Pre-formatted SSE frames (`data: ...\n\n` or `event: ...\ndata: ...\n\n`), incl. any terminator. */ + buildSseFrames(response: EvalMockHttpResponse | undefined, model: string): string[]; + buildErrorEnvelope(message: string): Record; + stubResponse(): EvalMockHttpResponse; +} + +const chatCompletionsAdapter: ProtocolAdapter = { + name: 'chat-completions', + extractModel: extractRequestModel, + isStreamRequested, + reverseTranslate: reverseTranslateOpenAiRequest, + forwardObject: forwardTranslateToChatCompletion, + buildSseFrames: (response, model) => { + const chunks = forwardTranslateToSseChunks(response, model); + const frames = chunks.map((chunk) => `data: ${JSON.stringify(chunk)}\n\n`); + // Terminator per OpenAI SSE spec — SDKs stop reading on this sentinel. + frames.push('data: [DONE]\n\n'); + return frames; + }, + buildErrorEnvelope: buildOpenAiErrorEnvelope, + stubResponse: () => ({ + body: { content: '[eval wire server stub] — no mock handler attached' }, + headers: { 'content-type': 'application/json' }, + statusCode: 200, + }), +}; + +const responsesAdapter: ProtocolAdapter = { + name: 'responses', + extractModel: extractResponsesRequestModel, + isStreamRequested: isResponsesStreamRequested, + reverseTranslate: reverseTranslateOpenAiResponsesRequest, + forwardObject: forwardTranslateToResponsesEnvelope, + buildSseFrames: (response, model) => { + // Responses API uses `event: \ndata: \n\n` frames and emits + // `response.completed` as its terminal sentinel (no `[DONE]` line). + const events = forwardTranslateToResponsesSseEvents(response, model); + return events.map(({ event, data }) => `event: ${event}\ndata: ${JSON.stringify(data)}\n\n`); + }, + buildErrorEnvelope: buildResponsesErrorEnvelope, + stubResponse: () => ({ + body: { output_text: '[eval wire server stub] — no mock handler attached' }, + headers: { 'content-type': 'application/json' }, + statusCode: 200, + }), +}; + export class LlmWireServer { private server: Server | undefined; private resolvedUrl: string | undefined; + /** In-flight handler promises — `stop()` awaits these before resolving. */ + private readonly inFlight = new Set>(); + /** Set by `stop()` so any request that beats the close-callback gets a 503 instead of starting a fresh handler that would race the teardown. */ + private stopping = false; constructor(private readonly options: LlmWireServerOptions = {}) {} @@ -47,6 +115,9 @@ export class LlmWireServer { async start(): Promise { if (this.server) return this.url; + // Reset the shutdown latch in case this instance is restarted after stop(). + this.stopping = false; + const app = this.buildApp(); this.server = await new Promise((resolve, reject) => { @@ -65,9 +136,15 @@ export class LlmWireServer { async stop(): Promise { const server = this.server; if (!server) return; + // Flip stopping FIRST so new requests 503 instead of racing the teardown. + this.stopping = true; this.server = undefined; this.resolvedUrl = undefined; + // Drain in-flight handlers so the mock-handler resolve can't write to a + // torn-down socket and `onIntercept` can't fire after stop(). + await Promise.allSettled(Array.from(this.inFlight)); + server.closeAllConnections(); await new Promise((resolve, reject) => { @@ -78,54 +155,71 @@ export class LlmWireServer { private buildApp(): Express { const app = express(); app.use(express.json({ limit: '4mb' })); - app.post('/eval/:root/v1/chat/completions', this.handleChatCompletion); + app.post('/eval/:root/v1/chat/completions', this.routeFor(chatCompletionsAdapter)); + // `@langchain/openai` v1.3+ auto-routes Agent v3.1+ calls to /v1/responses. + app.post('/eval/:root/v1/responses', this.routeFor(responsesAdapter)); // Surfaces credential-rewrite misconfiguration loudly instead of 404'ing. - app.post('/v1/chat/completions', this.handleUnroutedChatCompletion); + app.post('/v1/chat/completions', this.handleUnrouted); + app.post('/v1/responses', this.handleUnrouted); return app; } - private handleChatCompletion = async (req: Request, res: Response): Promise => { + /** Wraps each route in the in-flight tracker so `stop()` can drain. */ + private routeFor(adapter: ProtocolAdapter) { + return async (req: Request, res: Response): Promise => { + if (this.stopping) { + res.status(503).json(adapter.buildErrorEnvelope('Wire server is shutting down')); + return; + } + const promise = this.handleProtocol(adapter, req, res); + this.inFlight.add(promise); + try { + await promise; + } finally { + this.inFlight.delete(promise); + } + }; + } + + private async handleProtocol( + adapter: ProtocolAdapter, + req: Request, + res: Response, + ): Promise { // Express decodes route params; a second decode would mangle literal `%`. const rootName = req.params.root; - const model = extractRequestModel(req.body); + const model = adapter.extractModel(req.body); + const stream = adapter.isStreamRequested(req.body); const subNode = this.resolveSubNode(rootName); if (!this.options.mockHandler) { - const envelope = forwardTranslateToChatCompletion( - { - body: { content: '[eval wire server stub] — no mock handler attached' }, - headers: { 'content-type': 'application/json' }, - statusCode: 200, - }, - model, - ); - res.status(200).json(envelope); + this.respondWithStub(adapter, req, res, model, stream); return; } - let synthetic: ReturnType; - let mockResponse: Awaited>; - let envelope: Record; + let synthetic: IHttpRequestOptions; + let mockResponse: Awaited>; try { - synthetic = reverseTranslateOpenAiRequest(req.body); + synthetic = adapter.reverseTranslate(req.body); mockResponse = await this.options.mockHandler(synthetic, subNode); - envelope = forwardTranslateToChatCompletion(mockResponse, model); } catch (error) { const message = error instanceof Error ? error.message : String(error); this.options.logger?.error(`[EvalMock] Wire-server mock generation failed: ${message}`); - res.status(500).json(buildOpenAiErrorEnvelope(`Mock generation failed: ${message}`)); + this.respondWithError(adapter, res, message); return; } - // Best-effort ledger write — never let it taint the 200 the SDK sees. + // Ledger write BEFORE the response so consumers see the entry deterministically + // after `await fetch(...)`. `requestBody` is stored by reference (express.json + // never re-touches it); callers must not mutate. A thrown `onIntercept` never + // blocks the response the SDK gets. try { this.options.onIntercept?.({ rootName, url: synthetic.url, method: synthetic.method ?? 'POST', nodeType: subNode.type, - // Deep-clone so the ledger entry can't be mutated by later code. - requestBody: this.cloneRequestBody(req.body), + requestBody: req.body, mockResponse: mockResponse?.body, }); } catch (error) { @@ -133,10 +227,85 @@ export class LlmWireServer { this.options.logger?.warn(`[EvalMock] Wire-server ledger write failed: ${message}`); } - res.status(200).json(envelope); - }; + try { + if (stream) { + this.writeSseResponse(adapter, req, res, mockResponse, model); + } else { + res.status(200).json(adapter.forwardObject(mockResponse, model)); + } + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + this.options.logger?.error(`[EvalMock] Wire-server response write failed: ${message}`); + // Headers not yet flushed → send a typed error envelope; otherwise close. + if (!res.headersSent) { + this.respondWithError(adapter, res, message); + } else if (!res.writableEnded) { + res.end(); + } + } + } - private handleUnroutedChatCompletion = (_req: Request, res: Response): void => { + /** Stream the mock response as SSE frames, short-circuiting if the client disconnects. */ + private writeSseResponse( + adapter: ProtocolAdapter, + req: Request, + res: Response, + mockResponse: Awaited>, + model: string, + ): void { + // Build frames BEFORE setting headers so a translator throw surfaces as a + // 500 envelope via `handleProtocol`'s outer catch, not a 200 + empty body. + const frames = adapter.buildSseFrames(mockResponse, model); + + res.status(200); + res.setHeader('Content-Type', 'text/event-stream'); + res.setHeader('Cache-Control', 'no-cache, no-transform'); + res.setHeader('Connection', 'keep-alive'); + // Forces immediate flush in proxied setups (Nginx etc.). + res.setHeader('X-Accel-Buffering', 'no'); + + // Short-circuit on SDK abort (timeout / AbortController) — otherwise the + // loop keeps writing to a destroyed socket. + let aborted = false; + const onClose = () => { + aborted = true; + }; + req.once('close', onClose); + + try { + for (const frame of frames) { + if (aborted || res.writableEnded || res.destroyed) break; + res.write(frame); + } + } finally { + req.off('close', onClose); + if (!res.writableEnded) res.end(); + } + } + + private respondWithStub( + adapter: ProtocolAdapter, + req: Request, + res: Response, + model: string, + stream: boolean, + ): void { + const stubBody = adapter.stubResponse(); + if (stream) { + this.writeSseResponse(adapter, req, res, stubBody, model); + return; + } + res.status(200).json(adapter.forwardObject(stubBody, model)); + } + + private respondWithError(adapter: ProtocolAdapter, res: Response, message: string): void { + // Streaming clients still parse a JSON error envelope (the SDK throws an + // APIError before iterating chunks). Sending a 500 + JSON keeps both + // streaming and non-streaming SDK paths happy — no SSE branch needed. + res.status(500).json(adapter.buildErrorEnvelope(`Mock generation failed: ${message}`)); + } + + private handleUnrouted = (_req: Request, res: Response): void => { res .status(500) .json( @@ -147,19 +316,6 @@ export class LlmWireServer { ); }; - /** Deep-clone via `structuredClone`; logs and falls back to the original ref if it throws. */ - private cloneRequestBody(body: unknown): unknown { - try { - return structuredClone(body); - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - this.options.logger?.warn( - `[EvalMock] Wire-server ledger entry not isolated — clone failed: ${message}`, - ); - return body; - } - } - private resolveSubNode(rootName: string): INode { const subNode = this.options.rootToSubNode?.get(rootName); if (subNode) return subNode; diff --git a/packages/cli/src/modules/instance-ai/eval/openai-envelope.ts b/packages/cli/src/modules/instance-ai/eval/openai-envelope.ts index d8f834c170a..6a193746db4 100644 --- a/packages/cli/src/modules/instance-ai/eval/openai-envelope.ts +++ b/packages/cli/src/modules/instance-ai/eval/openai-envelope.ts @@ -3,7 +3,9 @@ import type { IHttpRequestOptions } from 'n8n-workflow'; import { randomUUID } from 'node:crypto'; // Translation between the OpenAI chat-completions wire format and the shape -// `createLlmMockHandler` consumes/emits. Non-streaming, no-tools subset only. +// `createLlmMockHandler` consumes/emits. Covers non-streaming, streaming, +// and tool-call emission. The OpenAI SDK is strict about envelope shape — +// keep this in sync with `ChatCompletion` and `ChatCompletionChunk` schemas. // Kept identical to OpenAI's real URL so mock-handler's service/endpoint // extraction derives the right prompt-builder context. @@ -11,6 +13,13 @@ const OPENAI_SYNTHETIC_URL = 'https://api.openai.com/v1/chat/completions'; const DEFAULT_MODEL = 'gpt-4o-mini'; +/** Tool call extracted from the mock handler's response body. */ +export interface NormalizedToolCall { + id: string; + name: string; + arguments: string; +} + /** Synthesize an `IHttpRequestOptions` from the inbound body so vendor-SDK traffic looks identical to HTTP-helper traffic. */ export function reverseTranslateOpenAiRequest(body: unknown): IHttpRequestOptions { return { @@ -27,13 +36,34 @@ export function extractRequestModel(body: unknown): string { return typeof model === 'string' && model.length > 0 ? model : DEFAULT_MODEL; } +/** True when the inbound request opted into streaming via `stream: true`. */ +export function isStreamRequested(body: unknown): boolean { + if (typeof body !== 'object' || body === null) return false; + return (body as { stream?: unknown }).stream === true; +} + /** Wrap the mock handler's response in a canonical chat.completion envelope. */ export function forwardTranslateToChatCompletion( mockResponse: EvalMockHttpResponse | undefined, model: string, ): Record { - const content = extractAssistantContent(mockResponse?.body); - const finishReason = extractFinishReason(mockResponse?.body); + const toolCalls = extractToolCalls(mockResponse?.body); + const content = toolCalls.length > 0 ? null : extractAssistantContent(mockResponse?.body); + // When tool_calls present, finish_reason MUST be 'tool_calls' — SDKs branch on this. + const finishReason = + toolCalls.length > 0 ? 'tool_calls' : extractFinishReason(mockResponse?.body); + + const message: Record = { + role: 'assistant', + content, + }; + if (toolCalls.length > 0) { + message.tool_calls = toolCalls.map((tc) => ({ + id: tc.id, + type: 'function' as const, + function: { name: tc.name, arguments: tc.arguments }, + })); + } return { id: `chatcmpl-${randomUUID()}`, @@ -43,21 +73,84 @@ export function forwardTranslateToChatCompletion( choices: [ { index: 0, - message: { role: 'assistant', content }, + message, finish_reason: finishReason, }, ], - // Zero counts = "no real metering" — stubbed non-zero would compute - // as plausible-but-fictional cost in downstream cost trackers. - usage: { - prompt_tokens: 0, - completion_tokens: 0, - total_tokens: 0, - }, + // Zero counts = "no real metering" — stubbed non-zero would fake plausible cost. + usage: { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }, + // Non-conforming fingerprint so telemetry can tag eval traffic at a glance. system_fingerprint: 'eval-wire-server', }; } +/** + * Stream the mock handler's response as `chat.completion.chunk` frames per + * OpenAI's SSE accumulation contract: `index` on every tool-call delta; + * `id`/`function.name` only on the FIRST chunk per call; `function.arguments` + * streamed; terminal chunk's `finish_reason` is `tool_calls` when any call + * was emitted, otherwise `stop`. Returned as an array so tests can snapshot. + */ +export function forwardTranslateToSseChunks( + mockResponse: EvalMockHttpResponse | undefined, + model: string, +): Array> { + const id = `chatcmpl-${randomUUID()}`; + const created = Math.floor(Date.now() / 1000); + const toolCalls = extractToolCalls(mockResponse?.body); + + const chunks: Array> = []; + + const baseChunk = (delta: Record, finishReason: string | null = null) => ({ + id, + object: 'chat.completion.chunk' as const, + created, + model, + choices: [{ index: 0, delta, finish_reason: finishReason }], + system_fingerprint: 'eval-wire-server', + }); + + // Opening chunk announces the assistant role with no content payload yet — + // matches what the real API sends so SDK reducers initialize correctly. + chunks.push(baseChunk({ role: 'assistant', content: toolCalls.length > 0 ? null : '' })); + + if (toolCalls.length > 0) { + toolCalls.forEach((tc, callIndex) => { + // First chunk per tool call carries id + name; arguments start empty. + chunks.push( + baseChunk({ + tool_calls: [ + { + index: callIndex, + id: tc.id, + type: 'function', + function: { name: tc.name, arguments: '' }, + }, + ], + }), + ); + // One arg-slice is enough — the SDK accumulates regardless of chunk size. + if (tc.arguments.length > 0) { + chunks.push( + baseChunk({ + tool_calls: [{ index: callIndex, function: { arguments: tc.arguments } }], + }), + ); + } + }); + chunks.push(baseChunk({}, 'tool_calls')); + return chunks; + } + + const content = extractAssistantContent(mockResponse?.body); + if (content.length > 0) { + chunks.push(baseChunk({ content })); + } + const finishReason = extractFinishReason(mockResponse?.body); + chunks.push(baseChunk({}, finishReason)); + return chunks; +} + /** OpenAI-style error envelope — makes the SDK throw a typed APIError instead of choking on a malformed body. */ export function buildOpenAiErrorEnvelope(message: string): Record { return { @@ -70,6 +163,71 @@ export function buildOpenAiErrorEnvelope(message: string): Record; + + const fromChoices = pickToolCallsFromChoices(obj); + if (fromChoices.length > 0) return fromChoices; + + const fromTopLevel = normalizeToolCallList(obj.tool_calls); + if (fromTopLevel.length > 0) return fromTopLevel; + + if (typeof obj.tool === 'object' && obj.tool !== null) { + const single = normalizeToolCallList([obj.tool]); + if (single.length > 0) return single; + } + + return []; +} + +function pickToolCallsFromChoices(obj: Record): NormalizedToolCall[] { + const choices = obj.choices; + if (!Array.isArray(choices) || choices.length === 0) return []; + const first: unknown = choices[0]; + if (typeof first !== 'object' || first === null) return []; + const message = (first as { message?: unknown }).message; + if (typeof message !== 'object' || message === null) return []; + return normalizeToolCallList((message as { tool_calls?: unknown }).tool_calls); +} + +function normalizeToolCallList(raw: unknown): NormalizedToolCall[] { + if (!Array.isArray(raw)) return []; + const out: NormalizedToolCall[] = []; + for (const entry of raw) { + if (typeof entry !== 'object' || entry === null) continue; + const e = entry as Record; + const fn = (e.function ?? e) as Record; + const name = typeof fn.name === 'string' ? fn.name : undefined; + if (!name) continue; + const args = coerceArgumentsToString(fn.arguments); + const id = + typeof e.id === 'string' ? e.id : `call_${randomUUID().replace(/-/g, '').slice(0, 16)}`; + out.push({ id, name, arguments: args }); + } + return out; +} + +function coerceArgumentsToString(args: unknown): string { + if (typeof args === 'string') return args; + if (args === undefined || args === null) return '{}'; + // Object/array → JSON string. SDKs choke on non-string arguments. + // A circular structure throws here; let it propagate to the wire server's + // 500-envelope catch so the broken mock-handler output surfaces loudly + // rather than as a confusing tool-arg mismatch downstream. + return JSON.stringify(args); +} + function extractAssistantContent(body: unknown): string { if (body === null || body === undefined) return ''; if (typeof body === 'string') return body; diff --git a/packages/cli/src/modules/instance-ai/eval/openai-responses-envelope.ts b/packages/cli/src/modules/instance-ai/eval/openai-responses-envelope.ts new file mode 100644 index 00000000000..2e7e9d7e88b --- /dev/null +++ b/packages/cli/src/modules/instance-ai/eval/openai-responses-envelope.ts @@ -0,0 +1,287 @@ +import type { EvalMockHttpResponse } from 'n8n-core'; +import type { IHttpRequestOptions } from 'n8n-workflow'; +import { randomUUID } from 'node:crypto'; + +import { extractToolCalls, type NormalizedToolCall } from './openai-envelope'; + +// Translation between the OpenAI Responses API (`/v1/responses`) wire format +// and the shape `createLlmMockHandler` consumes/emits. The Responses API is +// what `@langchain/openai` v1.3+ auto-routes to for newer chat models — the +// chat-completions path covered by `openai-envelope.ts` is no longer the +// default for v1.3+ Agent workflows. + +const OPENAI_RESPONSES_SYNTHETIC_URL = 'https://api.openai.com/v1/responses'; + +const DEFAULT_MODEL = 'gpt-4o-mini'; + +/** Same as `reverseTranslateOpenAiRequest` but for the Responses API endpoint. */ +export function reverseTranslateOpenAiResponsesRequest(body: unknown): IHttpRequestOptions { + return { + url: OPENAI_RESPONSES_SYNTHETIC_URL, + method: 'POST', + body: body ?? {}, + }; +} + +/** Pull `.model` from the body; identical fallback to the chat-completions translator. */ +export function extractResponsesRequestModel(body: unknown): string { + if (typeof body !== 'object' || body === null) return DEFAULT_MODEL; + const model = (body as { model?: unknown }).model; + return typeof model === 'string' && model.length > 0 ? model : DEFAULT_MODEL; +} + +/** True when the inbound Responses API request opted into streaming via `stream: true`. */ +export function isResponsesStreamRequested(body: unknown): boolean { + if (typeof body !== 'object' || body === null) return false; + return (body as { stream?: unknown }).stream === true; +} + +/** + * Wrap the mock handler's response in a canonical `response` envelope. + * The Responses API uses a single `output` array — each entry is either a + * `message` (assistant text) or a `function_call` (tool call). Mixing both + * in one response is legal but rare; tool-call mode replaces the message. + */ +export function forwardTranslateToResponsesEnvelope( + mockResponse: EvalMockHttpResponse | undefined, + model: string, +): Record { + const toolCalls = extractToolCalls(mockResponse?.body); + const responseId = `resp_${randomUUID().replace(/-/g, '').slice(0, 32)}`; + const now = Math.floor(Date.now() / 1000); + + const output = + toolCalls.length > 0 + ? toolCallsToResponsesOutput(toolCalls) + : [buildAssistantMessage(extractResponsesContent(mockResponse?.body))]; + + return { + id: responseId, + object: 'response', + created_at: now, + status: 'completed', + model, + output, + // Mirror chat-completions: zero counts make eval cost trackers happy. + usage: { + input_tokens: 0, + output_tokens: 0, + total_tokens: 0, + }, + // `previous_response_id`, `instructions`, `metadata` are intentionally + // omitted — the SDK tolerates missing optional fields, and a stub + // fingerprint isn't part of the Responses API envelope. + }; +} + +/** + * Stream the mock response as Responses API SSE events. Non-tool-call turn: + * created → in_progress → output_item.added → content_part.added → + * output_text.delta → output_text.done → content_part.done → + * output_item.done → completed. Tool calls swap the message item for a + * `function_call` item with `function_call_arguments.delta`/`.done`. + */ +export function forwardTranslateToResponsesSseEvents( + mockResponse: EvalMockHttpResponse | undefined, + model: string, +): Array<{ event: string; data: Record }> { + const responseId = `resp_${randomUUID().replace(/-/g, '').slice(0, 32)}`; + const createdAt = Math.floor(Date.now() / 1000); + const toolCalls = extractToolCalls(mockResponse?.body); + + const baseResponse = (status: string, output: unknown[]) => ({ + id: responseId, + object: 'response', + created_at: createdAt, + status, + model, + output, + usage: { input_tokens: 0, output_tokens: 0, total_tokens: 0 }, + }); + + const events: Array<{ event: string; data: Record }> = []; + + events.push({ event: 'response.created', data: { response: baseResponse('in_progress', []) } }); + events.push({ + event: 'response.in_progress', + data: { response: baseResponse('in_progress', []) }, + }); + + if (toolCalls.length > 0) { + // Pre-build final items so `id` stays stable across every event the SDK + // reconciles (added / delta / done / terminal completed.output[i]). + const finalItems = toolCallsToResponsesOutput(toolCalls); + toolCalls.forEach((tc, callIndex) => { + const finalItem = finalItems[callIndex]; + const itemId = finalItem.id as string; + const initialItem = { ...finalItem, arguments: '' }; + events.push({ + event: 'response.output_item.added', + data: { output_index: callIndex, item: initialItem }, + }); + if (tc.arguments.length > 0) { + events.push({ + event: 'response.function_call_arguments.delta', + data: { + item_id: itemId, + output_index: callIndex, + delta: tc.arguments, + }, + }); + } + events.push({ + event: 'response.function_call_arguments.done', + data: { + item_id: itemId, + output_index: callIndex, + arguments: tc.arguments, + }, + }); + events.push({ + event: 'response.output_item.done', + data: { output_index: callIndex, item: finalItem }, + }); + }); + events.push({ + event: 'response.completed', + data: { response: baseResponse('completed', finalItems) }, + }); + return events; + } + + // Plain message mode. + const content = extractResponsesContent(mockResponse?.body); + const messageId = `msg_${randomUUID().replace(/-/g, '').slice(0, 16)}`; + // `annotations: []` is required — LangChain's extractor calls `.annotations.map(...)`. + const messageItem = { + id: messageId, + type: 'message' as const, + role: 'assistant' as const, + content: [{ type: 'output_text' as const, text: content, annotations: [] }], + status: 'completed' as const, + }; + events.push({ + event: 'response.output_item.added', + data: { + output_index: 0, + item: { + ...messageItem, + content: [{ type: 'output_text', text: '', annotations: [] }], + status: 'in_progress', + }, + }, + }); + events.push({ + event: 'response.content_part.added', + data: { + item_id: messageId, + output_index: 0, + content_index: 0, + part: { type: 'output_text', text: '', annotations: [] }, + }, + }); + if (content.length > 0) { + events.push({ + event: 'response.output_text.delta', + data: { + item_id: messageId, + output_index: 0, + content_index: 0, + delta: content, + }, + }); + } + events.push({ + event: 'response.output_text.done', + data: { + item_id: messageId, + output_index: 0, + content_index: 0, + text: content, + }, + }); + events.push({ + event: 'response.content_part.done', + data: { + item_id: messageId, + output_index: 0, + content_index: 0, + part: { type: 'output_text', text: content, annotations: [] }, + }, + }); + events.push({ + event: 'response.output_item.done', + data: { output_index: 0, item: messageItem }, + }); + events.push({ + event: 'response.completed', + data: { response: baseResponse('completed', [messageItem]) }, + }); + + return events; +} + +/** Responses API uses the same error envelope as chat-completions, with `error.type` describing the failure. */ +export function buildResponsesErrorEnvelope(message: string): Record { + return { + error: { + message, + type: 'eval_wire_server_error', + code: 'eval_mock_generation_failed', + param: null, + }, + }; +} + +function toolCallsToResponsesOutput( + toolCalls: NormalizedToolCall[], +): Array> { + return toolCalls.map((tc) => ({ + id: `fc_${randomUUID().replace(/-/g, '').slice(0, 16)}`, + type: 'function_call', + call_id: tc.id, + name: tc.name, + arguments: tc.arguments, + })); +} + +function buildAssistantMessage(text: string): Record { + return { + id: `msg_${randomUUID().replace(/-/g, '').slice(0, 16)}`, + type: 'message', + role: 'assistant', + status: 'completed', + // `annotations: []` is required — LangChain's extractor calls `.annotations.map(...)`. + content: [{ type: 'output_text', text, annotations: [] }], + }; +} + +/** Tolerant content extractor: handles `output[].content[].text`, `output_text`, `{ content }`, `{ message }`, bare strings. */ +function extractResponsesContent(body: unknown): string { + if (body === null || body === undefined) return ''; + if (typeof body === 'string') return body; + if (typeof body !== 'object') return String(body as number | boolean | bigint); + + const obj = body as Record; + + if (typeof obj.output_text === 'string') return obj.output_text; + + const output = obj.output; + if (Array.isArray(output) && output.length > 0) { + for (const item of output) { + if (typeof item !== 'object' || item === null) continue; + const content = (item as { content?: unknown }).content; + if (!Array.isArray(content) || content.length === 0) continue; + const first: unknown = content[0]; + if (typeof first === 'object' && first !== null) { + const text = (first as { text?: unknown }).text; + if (typeof text === 'string') return text; + } + } + } + + if (typeof obj.content === 'string') return obj.content; + if (typeof obj.message === 'string') return obj.message; + + return JSON.stringify(body); +} diff --git a/packages/cli/src/modules/instance-ai/eval/workflow-analysis.ts b/packages/cli/src/modules/instance-ai/eval/workflow-analysis.ts index 32aa3df1a7c..d57ae115576 100644 --- a/packages/cli/src/modules/instance-ai/eval/workflow-analysis.ts +++ b/packages/cli/src/modules/instance-ai/eval/workflow-analysis.ts @@ -2,6 +2,7 @@ import { Logger } from '@n8n/backend-common'; import { Container } from '@n8n/di'; import { createEvalAgent, extractText } from '@n8n/instance-ai'; import { + findAiRootNodeNames, type INode, type IPinData, type IWorkflowBase, @@ -12,25 +13,6 @@ import { import { extractNodeConfig } from './node-config'; -/** Targets of `ai_*` connections — Agent/Chain root nodes. Pinning these short-circuits sub-node SDK calls. */ -function findAiRootNodeNames(workflow: IWorkflowBase): Set { - const roots = new Set(); - for (const nodeConns of Object.values(workflow.connections)) { - for (const [connType, outputs] of Object.entries(nodeConns)) { - if (!connType.startsWith('ai_') || !Array.isArray(outputs)) continue; - for (const group of outputs) { - if (!Array.isArray(group)) continue; - for (const conn of group) { - if (typeof conn === 'object' && conn !== null && 'node' in conn) { - roots.add((conn as { node: string }).node); - } - } - } - } - } - return roots; -} - /** * AI root node types — lets the typo guard accept a no-sub-node Agent. * Keep in sync with new agent/chain types in `@n8n/n8n-nodes-langchain`. @@ -116,7 +98,7 @@ export function identifyNodesForPinData( workflow: IWorkflowBase, exclusionSet?: Set, ): INode[] { - const aiRootNodes = findAiRootNodeNames(workflow); + const aiRootNodes = findAiRootNodeNames(workflow.connections); return workflow.nodes.filter((node) => { if (node.disabled) return false; @@ -126,19 +108,21 @@ export function identifyNodesForPinData( }); } -type UnpinRefusal = { +export type AutoPinReason = + | 'protocol_binary' + | 'unsupported_vendor_llm' + | 'unsafe_baseurl_override' + | 'shared_vendor_llm_subnode'; + +export interface AutoPinEntry { root: string; subNode: string; subNodeType: string; - reason: - | 'protocol_binary' - | 'unsupported_vendor_llm' - | 'unsafe_baseurl_override' - | 'shared_vendor_llm_subnode'; -}; + reason: AutoPinReason; +} -// Routing maps for vendor SDK interception. `assertUnpinCompatibility` -// refuses shared sub-node topologies, so each sub-node maps to one root. +// Routing maps for vendor SDK interception. `partitionAiRoots` auto-pins +// shared-sub-node topologies, so each remaining sub-node maps to one root. export interface VendorLlmRouting { subNodeToRoot: Map; rootToSubNode: Map; @@ -175,6 +159,17 @@ export function buildVendorLlmRouting( } if (!rootToSubNode.has(rootName)) { rootToSubNode.set(rootName, subNode); + // Self-map the root: `LmChatOpenAi.supplyData()` reads + // `getCredentials('openAiApi')` from a context whose + // `executeData.node` is sometimes the parent Agent rather + // than the LLM sub-node — observed empirically against a + // real LangChain Agent. Without this entry the credential + // helper's lookup misses, falls back to the no-root URL, + // and the wire server's loud-fail handler rejects the + // SDK call. Self-mapping the root keeps the lookup honest + // regardless of which side of the supplyData boundary + // asked for the credential. + subNodeToRoot.set(rootName, rootName); } } } @@ -184,20 +179,102 @@ export function buildVendorLlmRouting( return { subNodeToRoot, rootToSubNode }; } -/** Throws if any unpinned AI root has a sub-node we can't intercept: protocol-binary, unmapped vendor LLM, or unsafe baseURL override. Also refuses entries that don't resolve to an enabled AI root (typo guard). */ -export function assertUnpinCompatibility(workflow: IWorkflowBase, unpinNodes: string[]): void { - if (unpinNodes.length === 0) return; +export interface PartitionedAiRoots { + /** Names of AI roots that will run through the wire-server interception path. */ + unpinNodes: string[]; + /** Names of AI roots that will remain pinned — explicit `pinNodes` + auto-pinned roots. */ + pinNodes: string[]; + /** Per-(root, sub-node) reasons a root was auto-pinned, for diagnostic logging. */ + autoPinned: AutoPinEntry[]; +} +/** + * Default-on partition: every AI root in the workflow runs through the wire + * server unless one of these applies: + * - It's in the caller-supplied `explicitPinNodes` list (opt-out for nodes + * the caller wants to keep pinned, e.g. for an A/B comparison). + * - One of its inbound `ai_*` sub-nodes is incompatible (protocol-binary + * memory/vector store, unsupported vendor LLM, configured + * `options.baseURL` that bypasses the credential rewrite). + * - It shares a supported vendor LLM sub-node with another root — wire- + * server attribution is path-based and first-wins, so multiple roots + * fanning into the same sub-node would mis-attribute later turns. Both + * sides get auto-pinned. + * + * `explicitPinNodes` is validated up front: unknown / disabled / non-AI-root + * entries throw a `UserError` to surface typos as actionable errors instead + * of being silently ignored. + */ +export function partitionAiRoots( + workflow: IWorkflowBase, + explicitPinNodes: string[] = [], +): PartitionedAiRoots { const nodesByName = new Map(workflow.nodes.map((n) => [n.name, n])); const connectionsByDestination = mapConnectionsByDestination(workflow.connections); - const aiRootNodes = findAiRootNodeNames(workflow); + const allRoots = findAiRootNodeNames(workflow.connections); - // Refuse typos / disabled / non-AI-root entries up front. A root counts - // if it has inbound ai_* connections OR its type is on AI_ROOT_NODE_TYPES. + validateExplicitPinNodes(nodesByName, allRoots, explicitPinNodes); + + const explicitPinSet = new Set(explicitPinNodes); + const sharedSupportedSubNodes = trackSharedSupportedSubNodes( + connectionsByDestination, + nodesByName, + allRoots, + explicitPinSet, + ); + + const autoPinned: AutoPinEntry[] = []; + const pinSet = new Set(explicitPinNodes); + + for (const rootName of allRoots) { + if (explicitPinSet.has(rootName)) continue; + + const inbound = connectionsByDestination[rootName]; + if (!inbound) continue; + + for (const [connType, groups] of Object.entries(inbound)) { + if (!connType.startsWith('ai_') || !Array.isArray(groups)) continue; + for (const group of groups) { + if (!Array.isArray(group)) continue; + for (const conn of group) { + const sourceNode = nodesByName.get(conn.node); + if (!sourceNode || sourceNode.disabled) continue; + + const reason = categorizeSubNodeIncompatibility(sourceNode, sharedSupportedSubNodes); + if (reason === null) continue; + + autoPinned.push({ + root: rootName, + subNode: sourceNode.name, + subNodeType: sourceNode.type, + reason, + }); + pinSet.add(rootName); + } + } + } + } + + const unpinNodes: string[] = []; + const pinNodes: string[] = []; + for (const rootName of allRoots) { + if (pinSet.has(rootName)) pinNodes.push(rootName); + else unpinNodes.push(rootName); + } + + return { unpinNodes, pinNodes, autoPinned }; +} + +/** Throw `UserError` if any explicit pin entry isn't a real, enabled AI root in the workflow. */ +function validateExplicitPinNodes( + nodesByName: Map, + aiRootNodes: Set, + explicitPinNodes: string[], +): void { const unknownRoots: string[] = []; const disabledRoots: string[] = []; const nonAiRoots: string[] = []; - for (const rootName of unpinNodes) { + for (const rootName of explicitPinNodes) { const node = nodesByName.get(rootName); if (!node) unknownRoots.push(rootName); else if (node.disabled) disabledRoots.push(rootName); @@ -211,21 +288,28 @@ export function assertUnpinCompatibility(workflow: IWorkflowBase, unpinNodes: st if (unknownRoots.length) parts.push(`not found in workflow: ${formatNames(unknownRoots)}`); if (disabledRoots.length) parts.push(`disabled: ${formatNames(disabledRoots)}`); if (nonAiRoots.length) parts.push(`not AI root nodes: ${formatNames(nonAiRoots)}`); - throw new UserError(`Cannot unpin — ${parts.join('; ')}.`); + throw new UserError(`Cannot pin — ${parts.join('; ')}.`); } +} - const refusals: UnpinRefusal[] = []; - // Track which unpinned roots each supported vendor LLM sub-node feeds. - // A sub-node feeding ≥2 unpinned roots can't be attributed correctly — - // the wire server's path-based root token is baked into the credential - // URL at resolution time (first-wins), so later turns from the same - // sub-node would mis-attribute to the first root. - const sharedSupportedSubNodes = new Map }>(); - - for (const rootName of unpinNodes) { +/** + * Walk every AI root in the workflow and record which supported vendor LLM + * sub-nodes feed more than one root. Used by `categorizeSubNodeIncompatibility` + * so both sides of a shared sub-node get auto-pinned (attribution would be + * ambiguous otherwise). Roots in `explicitPinSet` don't contribute — pinning + * them removes the ambiguity. + */ +function trackSharedSupportedSubNodes( + connectionsByDestination: ReturnType, + nodesByName: Map, + allRoots: Set, + explicitPinSet: Set, +): Set { + const usage = new Map>(); + for (const rootName of allRoots) { + if (explicitPinSet.has(rootName)) continue; const inbound = connectionsByDestination[rootName]; if (!inbound) continue; - for (const [connType, groups] of Object.entries(inbound)) { if (!connType.startsWith('ai_') || !Array.isArray(groups)) continue; for (const group of groups) { @@ -233,101 +317,44 @@ export function assertUnpinCompatibility(workflow: IWorkflowBase, unpinNodes: st for (const conn of group) { const sourceNode = nodesByName.get(conn.node); if (!sourceNode || sourceNode.disabled) continue; - - if (SUPPORTED_VENDOR_LLM_SUB_NODE_TYPES.has(sourceNode.type)) { - const tracked = sharedSupportedSubNodes.get(sourceNode.name) ?? { - type: sourceNode.type, - roots: new Set(), - }; - tracked.roots.add(rootName); - sharedSupportedSubNodes.set(sourceNode.name, tracked); - } - - const reason = categorizeSubNodeRefusal(sourceNode); - if (reason === null) continue; - refusals.push({ - root: rootName, - subNode: sourceNode.name, - subNodeType: sourceNode.type, - reason, - }); + if (!SUPPORTED_VENDOR_LLM_SUB_NODE_TYPES.has(sourceNode.type)) continue; + const tracked = usage.get(sourceNode.name) ?? new Set(); + tracked.add(rootName); + usage.set(sourceNode.name, tracked); } } } } - - // Emit a `shared_vendor_llm_subnode` refusal for every sub-node feeding - // more than one unpinned root. One entry per offending (root, sub-node) - // pair so the error message lists every conflict. - for (const [subNodeName, { type, roots }] of sharedSupportedSubNodes) { - if (roots.size < 2) continue; - for (const rootName of roots) { - refusals.push({ - root: rootName, - subNode: subNodeName, - subNodeType: type, - reason: 'shared_vendor_llm_subnode', - }); - } + const shared = new Set(); + for (const [subNodeName, roots] of usage) { + if (roots.size >= 2) shared.add(subNodeName); } - - if (refusals.length === 0) return; - - const segments = [ - formatRefusalSegment( - refusals, - 'protocol_binary', - 'protocol-binary sub-nodes (cannot be intercepted via HTTP)', - ), - formatRefusalSegment( - refusals, - 'unsupported_vendor_llm', - 'unsupported vendor LLM sub-nodes (no eval URL-rewrite mapping yet)', - ), - formatRefusalSegment( - refusals, - 'unsafe_baseurl_override', - 'vendor LLM sub-nodes with a configured options.baseURL that bypasses the credential rewrite', - ), - formatRefusalSegment( - refusals, - 'shared_vendor_llm_subnode', - 'vendor LLM sub-nodes shared by multiple unpinned roots (attribution would be ambiguous)', - ), - ].filter((s): s is string => s !== undefined); - - throw new UserError( - `Cannot unpin AI root nodes — ${segments.join('; ')}. ` + - 'Leave these roots pinned, remove the parameter override, or replace the sub-node with one that has interception support.', - ); + return shared; } -/** Classify a sub-node into one of the three refusal reasons, or null if acceptable. Order matters: protocol-binary, then baseURL-override on a supported vendor, then unsupported `lm*`. */ -function categorizeSubNodeRefusal(sourceNode: INode): UnpinRefusal['reason'] | null { +/** + * Return the auto-pin reason for a sub-node, or null if it's safe to intercept. + * Order: protocol-binary (HTTP can't reach it) → shared (attribution ambiguous) → + * supported-vendor-with-baseURL-override (SDK bypasses the rewrite) → unsupported + * vendor LLM (no URL-rewrite mapping yet). + */ +function categorizeSubNodeIncompatibility( + sourceNode: INode, + sharedSupportedSubNodes: Set, +): AutoPinReason | null { if (PROTOCOL_BINARY_SUB_NODE_TYPES.has(sourceNode.type)) return 'protocol_binary'; if (SUPPORTED_VENDOR_LLM_SUB_NODE_TYPES.has(sourceNode.type)) { + if (sharedSupportedSubNodes.has(sourceNode.name)) return 'shared_vendor_llm_subnode'; return hasUnsafeBaseUrlOverride(sourceNode) ? 'unsafe_baseurl_override' : null; } if (isVendorLlmSubNode(sourceNode.type)) return 'unsupported_vendor_llm'; return null; } -/** One segment of the `assertUnpinCompatibility` error message, or undefined when no refusals match. */ -function formatRefusalSegment( - refusals: UnpinRefusal[], - reason: UnpinRefusal['reason'], - label: string, -): string | undefined { - const matching = refusals.filter((r) => r.reason === reason); - if (matching.length === 0) return undefined; - const pairs = matching.map((r) => `"${r.subNode}" (${r.subNodeType}) → "${r.root}"`).join(', '); - return `${label}: ${pairs}`; -} - /** Nodes that should receive mock hints — excludes AI sub-nodes (handled via root) and pinned nodes. */ export function identifyNodesForHints(workflow: IWorkflowBase): INode[] { const aiSubNodes = findAiSubNodeNames(workflow); - const aiRootNodes = findAiRootNodeNames(workflow); + const aiRootNodes = findAiRootNodeNames(workflow.connections); const pinnedNodeNames = new Set(identifyNodesForPinData(workflow).map((n) => n.name)); return workflow.nodes.filter((node) => { diff --git a/packages/core/src/execution-engine/index.ts b/packages/core/src/execution-engine/index.ts index 430c9b462d8..47c027839bd 100644 --- a/packages/core/src/execution-engine/index.ts +++ b/packages/core/src/execution-engine/index.ts @@ -97,3 +97,7 @@ export { ExternalSecretsProxy, type IExternalSecretsManager } from './external-s export { ExecutionContextService } from './execution-context.service'; export { establishExecutionContext } from './execution-context'; export { isEngineRequest } from './requests-response'; +// Exposed so eval-mode credential helpers (e.g. `EvalMockedCredentialsHelper`) +// can reuse the same schema-driven cred synthesizer the wire-server URL +// rewrite expects. See its `getDecrypted` catch path for the consumer. +export { buildEvalMockCredentials } from './eval-mock-helpers'; diff --git a/packages/core/src/execution-engine/node-execution-context/node-execution-context.ts b/packages/core/src/execution-engine/node-execution-context/node-execution-context.ts index bf4196482bd..a163e9aead4 100644 --- a/packages/core/src/execution-engine/node-execution-context/node-execution-context.ts +++ b/packages/core/src/execution-engine/node-execution-context/node-execution-context.ts @@ -314,13 +314,21 @@ export abstract class NodeExecutionContext implements Omit 0; if (!hasOtherCreds) { - const { buildEvalMockCredentials } = await import('../eval-mock-helpers'); - return buildEvalMockCredentials( - additionalData.credentialsHelper.getCredentialsProperties(type), - ) as T; + return (await additionalData.credentialsHelper.getDecrypted( + additionalData, + { id: null, name: type }, + type, + mode, + executeData, + )) as T; } } diff --git a/packages/workflow/src/common/find-ai-root-node-names.ts b/packages/workflow/src/common/find-ai-root-node-names.ts new file mode 100644 index 00000000000..0fa31dc80f2 --- /dev/null +++ b/packages/workflow/src/common/find-ai-root-node-names.ts @@ -0,0 +1,38 @@ +/** + * AI root nodes are the target of any `ai_*` connection — Agent/Chain nodes + * to which language model, memory, tool, etc. sub-nodes attach. Pinning these + * during eval short-circuits sub-node SDK calls. + * + * Accepts `unknown` so callers reading workflow JSON from the wire (which + * arrives as `Record`) can use it without an `as` cast. + * Typed-`IConnections` callers assign in without widening. + */ +function isObjectRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null; +} + +// `Array.isArray` narrows to `any[]` in lib.es5.d.ts; wrap it so the elements +// stay typed as `unknown` and downstream checks have to narrow explicitly. +function isUnknownArray(value: unknown): value is readonly unknown[] { + return Array.isArray(value); +} + +export function findAiRootNodeNames(connections: unknown): Set { + const roots = new Set(); + if (!isObjectRecord(connections)) return roots; + for (const nodeConns of Object.values(connections)) { + if (!isObjectRecord(nodeConns)) continue; + for (const [connType, outputs] of Object.entries(nodeConns)) { + if (!connType.startsWith('ai_') || !isUnknownArray(outputs)) continue; + for (const group of outputs) { + if (!isUnknownArray(group)) continue; + for (const conn of group) { + if (isObjectRecord(conn) && typeof conn.node === 'string') { + roots.add(conn.node); + } + } + } + } + } + return roots; +} diff --git a/packages/workflow/src/common/index.ts b/packages/workflow/src/common/index.ts index 196ec09a377..5657c48572b 100644 --- a/packages/workflow/src/common/index.ts +++ b/packages/workflow/src/common/index.ts @@ -1,3 +1,4 @@ +export * from './find-ai-root-node-names'; export * from './get-child-nodes'; export * from './get-connected-nodes'; export * from './get-node-by-name'; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index f553d488600..159eee83a5d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -240,6 +240,9 @@ catalogs: nanoid: specifier: 3.3.8 version: 3.3.8 + openai: + specifier: 6.19.0 + version: 6.19.0 oxlint: specifier: ^1.61.0 version: 1.61.0 @@ -2955,7 +2958,7 @@ importers: version: 9.0.3 langsmith: specifier: 0.6.0 - version: 0.6.0(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)) + version: 0.6.0(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.19.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)) ldapts: specifier: 4.2.6 version: 4.2.6 @@ -3194,6 +3197,9 @@ importers: n8n-containers: specifier: workspace:* version: link:../testing/containers + openai: + specifier: 'catalog:' + version: 6.19.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67) openapi-types: specifier: ^12.1.3 version: 12.1.3 @@ -17383,6 +17389,18 @@ packages: resolution: {integrity: sha512-MVHddDVweXZF3awtlAS+6pgKLlm/JgxZ90+/NBurBoQctVOOB/zDdVjcyPzQ+0laDGbsWgrRkflI65sQeOgT9Q==} engines: {node: '>=8'} + openai@6.19.0: + resolution: {integrity: sha512-5uGrF82Ql7TKgIWUnuxh+OyzYbPRPwYDSgGc05JowbXRFsOkuj0dJuCdPCTBZT4mcmp2NEvj/URwDzW+lYgmVw==} + hasBin: true + peerDependencies: + ws: '>=8.20.1' + zod: 3.25.67 + peerDependenciesMeta: + ws: + optional: true + zod: + optional: true + openai@6.34.0: resolution: {integrity: sha512-yEr2jdGf4tVFYG6ohmr3pF6VJuveP0EA/sS8TBx+4Eq5NT10alu5zg2dmxMXMgqpihRDQlFGpRt2XwsGj+Fyxw==} hasBin: true @@ -35048,6 +35066,16 @@ snapshots: - ws - zod-to-json-schema + langsmith@0.6.0(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.19.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)): + dependencies: + p-queue: 6.6.2 + optionalDependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/exporter-trace-otlp-proto': 0.217.0(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-trace-base': 2.7.1(@opentelemetry/api@1.9.0) + openai: 6.19.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67) + ws: 8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10) + langsmith@0.6.0(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)): dependencies: p-queue: 6.6.2 @@ -37083,6 +37111,11 @@ snapshots: is-docker: 2.2.1 is-wsl: 2.2.0 + openai@6.19.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67): + optionalDependencies: + ws: 8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10) + zod: 3.25.67 + openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67): optionalDependencies: ws: 8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)