From e13d4e0465ddece8ccd7905e56d0ea1ecb2b1ee1 Mon Sep 17 00:00:00 2001 From: Luca Mattiazzi Date: Wed, 20 May 2026 18:18:06 +0200 Subject: [PATCH] feat(core): Add eval-data populator tool (no-changelog) (#30680) --- .../src/tools/__tests__/index.test.ts | 4 + .../generate-sample-rows.service.test.ts | 104 ++++ .../evals/generate-sample-rows.service.ts | 87 ++- packages/@n8n/instance-ai/src/tools/index.ts | 2 + .../__tests__/eval-data-agent.tool.test.ts | 519 ++++++++++++++++++ .../orchestration/eval-data-agent.tool.ts | 244 ++++++++ .../@n8n/instance-ai/src/tools/tool-ids.ts | 1 + 7 files changed, 950 insertions(+), 11 deletions(-) create mode 100644 packages/@n8n/instance-ai/src/tools/orchestration/__tests__/eval-data-agent.tool.test.ts create mode 100644 packages/@n8n/instance-ai/src/tools/orchestration/eval-data-agent.tool.ts diff --git a/packages/@n8n/instance-ai/src/tools/__tests__/index.test.ts b/packages/@n8n/instance-ai/src/tools/__tests__/index.test.ts index e3da9db43c3..3c2660bea21 100644 --- a/packages/@n8n/instance-ai/src/tools/__tests__/index.test.ts +++ b/packages/@n8n/instance-ai/src/tools/__tests__/index.test.ts @@ -56,6 +56,10 @@ jest.mock('../orchestration/eval-setup-agent.tool', () => ({ createEvalSetupAgentTool: jest.fn(() => ({ id: 'eval-setup-with-agent' })), })); +jest.mock('../orchestration/eval-data-agent.tool', () => ({ + createEvalDataAgentTool: jest.fn(() => ({ id: 'eval-data' })), +})); + jest.mock('../orchestration/plan-with-agent.tool', () => ({ createPlanWithAgentTool: jest.fn(() => ({ id: 'plan' })), })); diff --git a/packages/@n8n/instance-ai/src/tools/evals/__tests__/generate-sample-rows.service.test.ts b/packages/@n8n/instance-ai/src/tools/evals/__tests__/generate-sample-rows.service.test.ts index 3b16f519e82..2d51c1f4bd5 100644 --- a/packages/@n8n/instance-ai/src/tools/evals/__tests__/generate-sample-rows.service.test.ts +++ b/packages/@n8n/instance-ai/src/tools/evals/__tests__/generate-sample-rows.service.test.ts @@ -275,6 
+275,110 @@ describe('runBatch', () => { expect(rows).toEqual([]); expect(generate).not.toHaveBeenCalled(); }); + + describe('realExamples few-shot block', () => { + function captureBatchPrompt(generate: GenerateMock): string { + return getPromptText(generate); + } + + it('injects a reference-not-seed block when examples are provided', async () => { + const generate = createGenerateMock(); + mockCreateEvalAgent.mockReturnValue({ generate } as unknown as ReturnType< + typeof createEvalAgent + >); + mockExtractText.mockReturnValue(JSON.stringify([])); + await runBatch({ + facet: BATCH_FACET, + rowCount: 1, + context: BATCH_CONTEXT, + columns: ['user_query'], + realExamples: [{ user_query: 'how do I refund an order?' }], + }); + const promptText = captureBatchPrompt(generate); + expect(promptText).toContain('Recent real inputs the agent has received in production'); + expect(promptText).toContain('REFERENCE, not seeds'); + expect(promptText).toContain('how do I refund an order?'); + expect(promptText).toMatch(/Do NOT copy or paraphrase them/); + }); + + it('omits the block entirely when realExamples is undefined', async () => { + const generate = createGenerateMock(); + mockCreateEvalAgent.mockReturnValue({ generate } as unknown as ReturnType< + typeof createEvalAgent + >); + mockExtractText.mockReturnValue(JSON.stringify([])); + await runBatch({ + facet: BATCH_FACET, + rowCount: 1, + context: BATCH_CONTEXT, + columns: ['user_query'], + }); + expect(captureBatchPrompt(generate)).not.toContain('Recent real inputs'); + }); + + it('filters examples to the requested columns and drops rows that lack all of them', async () => { + const generate = createGenerateMock(); + mockCreateEvalAgent.mockReturnValue({ generate } as unknown as ReturnType< + typeof createEvalAgent + >); + mockExtractText.mockReturnValue(JSON.stringify([])); + await runBatch({ + facet: BATCH_FACET, + rowCount: 1, + context: BATCH_CONTEXT, + columns: ['user_query'], + realExamples: [ + { user_query: 'real 
one', expected_response: 'should not leak' }, + { unrelated: 'dropped' }, + ], + }); + const promptText = captureBatchPrompt(generate); + expect(promptText).toContain('real one'); + expect(promptText).not.toContain('should not leak'); + expect(promptText).not.toContain('dropped'); + expect(promptText).not.toContain('unrelated'); + }); + + it('caps the example list at 10 entries', async () => { + const generate = createGenerateMock(); + mockCreateEvalAgent.mockReturnValue({ generate } as unknown as ReturnType< + typeof createEvalAgent + >); + mockExtractText.mockReturnValue(JSON.stringify([])); + const examples = Array.from({ length: 13 }, (_, i) => ({ user_query: `q${i}` })); + await runBatch({ + facet: BATCH_FACET, + rowCount: 1, + context: BATCH_CONTEXT, + columns: ['user_query'], + realExamples: examples, + }); + const promptText = captureBatchPrompt(generate); + expect(promptText).toContain('q0'); + expect(promptText).toContain('q9'); + expect(promptText).not.toContain('q10'); + expect(promptText).not.toContain('q12'); + }); + + it('truncates values longer than 300 characters with an ellipsis', async () => { + const generate = createGenerateMock(); + mockCreateEvalAgent.mockReturnValue({ generate } as unknown as ReturnType< + typeof createEvalAgent + >); + mockExtractText.mockReturnValue(JSON.stringify([])); + const longValue = 'x'.repeat(500); + await runBatch({ + facet: BATCH_FACET, + rowCount: 1, + context: BATCH_CONTEXT, + columns: ['user_query'], + realExamples: [{ user_query: longValue }], + }); + const promptText = captureBatchPrompt(generate); + expect(promptText).toMatch(/x{300}…/); + expect(promptText).not.toMatch(/x{301}/); + }); + }); }); describe('extractAgentContext', () => { diff --git a/packages/@n8n/instance-ai/src/tools/evals/generate-sample-rows.service.ts b/packages/@n8n/instance-ai/src/tools/evals/generate-sample-rows.service.ts index ab93ac437ef..defd195bf5c 100644 --- 
a/packages/@n8n/instance-ai/src/tools/evals/generate-sample-rows.service.ts +++ b/packages/@n8n/instance-ai/src/tools/evals/generate-sample-rows.service.ts @@ -9,6 +9,8 @@ import { createEvalAgent, extractText, HAIKU_MODEL } from '../../utils/eval-agen const FACET_COUNT = 5; const DEFAULT_ROW_COUNT = 25; const SYSTEM_PROMPT_MAX_CHARS = 2000; +const REAL_EXAMPLES_MAX_COUNT = 10; +const REAL_EXAMPLE_VALUE_MAX_CHARS = 300; export interface SampleRowFacet { length: string; @@ -145,6 +147,52 @@ function buildAgentContextBlock(context: AgentContext | undefined): string { const FORMAT_INFERENCE = "Inspect the agent's system prompt, prompt template, and connected tools to infer what kind of text this agent receives at runtime. It may be a user chat message, output from another tool, scraped web content, structured records (JSON/key-value), document chunks, log lines, code, etc. Generate inputs that look like what would arrive at the agent in production. Do not assume a human user when the agent suggests otherwise."; +function truncateExampleValue(value: string): string { + return value.length > REAL_EXAMPLE_VALUE_MAX_CHARS + ? `${value.slice(0, REAL_EXAMPLE_VALUE_MAX_CHARS)}…` + : value; +} + +/** + * Render a small block of recent real inputs (filtered to the requested + * `columns`) as a reference for the LLM. Returns an empty string when no + * usable examples exist — the caller injects this block only when + * non-empty, so the generator keeps producing rows from agent context + * alone when history is missing. + * + * The directive is explicit that these are flavour reference, not seed + * data to copy: the generator must produce NEW inputs in the same domain + * and tone, not paraphrase the examples. 
+ */ +function buildRealExamplesBlock( + examples: ReadonlyArray> | undefined, + columns: string[], +): string { + if (!examples || examples.length === 0 || columns.length === 0) return ''; + const filtered: Array> = []; + for (const example of examples.slice(0, REAL_EXAMPLES_MAX_COUNT)) { + const row: Record = {}; + let hasValue = false; + for (const col of columns) { + const raw = example[col]; + if (raw === undefined || raw === null) continue; + const str = typeof raw === 'string' ? raw : JSON.stringify(raw); + if (str.length === 0) continue; + row[col] = truncateExampleValue(str); + hasValue = true; + } + if (hasValue) filtered.push(row); + } + if (filtered.length === 0) return ''; + const numbered = filtered.map((row, i) => `${i + 1}. ${JSON.stringify(row)}`).join('\n'); + return [ + '', + 'Recent real inputs the agent has received in production (REFERENCE, not seeds):', + numbered, + 'Use these as a hint about the actual domain, tone and shape of inputs the agent sees. Do NOT copy or paraphrase them — produce NEW inputs that fit the same setting.', + ].join('\n'); +} + const BATCH_SYSTEM_INSTRUCTIONS = `You generate realistic test inputs for an n8n workflow evaluation dataset. Output: JSON array of objects. Keys = exactly the provided column names. Values = short strings. No prose outside the JSON. 
@@ -158,6 +206,7 @@ export interface RunBatchInput { rowCount: number; context: AgentContext | undefined; columns: string[]; + realExamples?: ReadonlyArray>; logger?: Pick; } @@ -194,17 +243,23 @@ export async function runBatch(input: RunBatchInput): Promise>; logger?: Pick; } @@ -270,6 +334,7 @@ export async function generateSampleRows( rowCount: counts[i], context, columns: input.columns, + realExamples: input.realExamples, logger: input.logger, }); }), diff --git a/packages/@n8n/instance-ai/src/tools/index.ts b/packages/@n8n/instance-ai/src/tools/index.ts index 74a762a51ed..a3c9c0ecc2e 100644 --- a/packages/@n8n/instance-ai/src/tools/index.ts +++ b/packages/@n8n/instance-ai/src/tools/index.ts @@ -13,6 +13,7 @@ import { createBrowserCredentialSetupTool } from './orchestration/browser-creden import { createBuildWorkflowAgentTool } from './orchestration/build-workflow-agent.tool'; import { createCompleteCheckpointTool } from './orchestration/complete-checkpoint.tool'; import { createDelegateTool } from './orchestration/delegate.tool'; +import { createEvalDataAgentTool } from './orchestration/eval-data-agent.tool'; import { createEvalSetupAgentTool } from './orchestration/eval-setup-agent.tool'; import { createPlanWithAgentTool } from './orchestration/plan-with-agent.tool'; import { createPlanTool } from './orchestration/plan.tool'; @@ -89,6 +90,7 @@ export function createOrchestrationTools(context: OrchestrationContext): Instanc [ORCHESTRATION_TOOL_IDS.BUILD_WORKFLOW_WITH_AGENT, createBuildWorkflowAgentTool(context)], [ORCHESTRATION_TOOL_IDS.COMPLETE_CHECKPOINT, createCompleteCheckpointTool(context)], [ORCHESTRATION_TOOL_IDS.EVAL_SETUP_WITH_AGENT, createEvalSetupAgentTool(context)], + [ORCHESTRATION_TOOL_IDS.EVAL_DATA, createEvalDataAgentTool(context)], ]; if (context.browserMcpConfig || hasGatewayBrowserTools(context)) { diff --git a/packages/@n8n/instance-ai/src/tools/orchestration/__tests__/eval-data-agent.tool.test.ts 
b/packages/@n8n/instance-ai/src/tools/orchestration/__tests__/eval-data-agent.tool.test.ts new file mode 100644 index 00000000000..3fd9d0e570a --- /dev/null +++ b/packages/@n8n/instance-ai/src/tools/orchestration/__tests__/eval-data-agent.tool.test.ts @@ -0,0 +1,519 @@ +import type { WorkflowJSON } from '@n8n/workflow-sdk'; + +import * as sampleRowsService from '../../evals/generate-sample-rows.service'; +import { createEvalDataAgentTool } from '../eval-data-agent.tool'; + +type EvalDataToolResult = { + status: 'imported' | 'generated' | 'skipped'; + source?: 'history' | 'synthetic'; + rowCount?: number; + expectedOutputsNeedUserReview?: boolean; + expectedOutputColumns?: string[]; + table?: { + id: string; + name: string; + projectId?: string; + rowCount: number; + inputColumns: string[]; + previewRows: Array>; + }; +}; + +async function runEvalDataTool( + ctx: ReturnType, + input: { workflowId: string; projectId?: string }, +): Promise { + const tool = createEvalDataAgentTool(ctx as never); + return (await tool.handler!(input, {} as never)) as EvalDataToolResult; +} + +const evalWf = (): WorkflowJSON => + ({ + name: 't', + nodes: [ + { + name: 'EvalTrig', + type: 'n8n-nodes-base.evaluationTrigger', + typeVersion: 1, + parameters: { dataTableId: { value: 'dt-1' } }, + position: [0, 0], + id: 't', + }, + { + name: 'Agent', + type: '@n8n/n8n-nodes-langchain.agent', + typeVersion: 1, + parameters: { text: '={{ $json.user_query }}' }, + position: [200, 0], + id: 'a', + }, + ], + connections: { + EvalTrig: { main: [[{ node: 'Agent', type: 'main', index: 0 }]] }, + }, + pinData: {}, + settings: {}, + }) as unknown as WorkflowJSON; + +const evalWfWithMetrics = (): WorkflowJSON => + ({ + name: 't', + nodes: [ + { + name: 'EvalTrig', + type: 'n8n-nodes-base.evaluationTrigger', + typeVersion: 1, + parameters: { dataTableId: { value: 'dt-1' } }, + position: [0, 0], + id: 't', + }, + { + name: 'Agent', + type: '@n8n/n8n-nodes-langchain.agent', + typeVersion: 1, + parameters: 
{ text: '={{ $json.user_query }}' }, + position: [200, 0], + id: 'a', + }, + { + name: 'MetricN', + type: 'n8n-nodes-base.evaluation', + typeVersion: 1, + parameters: { + operation: 'setMetrics', + expectedAnswer: "={{ $('EvalTrig').item.json.expected_response }}", + actualAnswer: '={{ $json.output }}', + }, + position: [400, 0], + id: 'm', + }, + ], + connections: { + EvalTrig: { main: [[{ node: 'Agent', type: 'main', index: 0 }]] }, + Agent: { main: [[{ node: 'MetricN', type: 'main', index: 0 }]] }, + }, + pinData: {}, + settings: {}, + }) as unknown as WorkflowJSON; + +const defaultInsertResult = { + insertedCount: 0, + dataTableId: 'dt-1', + tableName: 'eval_dataset', + projectId: 'proj-1', +}; + +const silentLogger = () => ({ info: jest.fn(), warn: jest.fn(), error: jest.fn() }); + +/** Default DataTable service stub. Override individual mocks per test as needed. */ +function defaultDataTableService( + overrides: Partial<{ + insertRows: jest.Mock; + getSchema: jest.Mock; + addColumn: jest.Mock; + queryRows: jest.Mock; + }> = {}, +) { + return { + insertRows: jest.fn().mockResolvedValue(defaultInsertResult), + getSchema: jest.fn().mockResolvedValue([]), + addColumn: jest.fn().mockResolvedValue(undefined), + queryRows: jest.fn().mockResolvedValue({ count: 0, data: [] }), + ...overrides, + }; +} + +/** Execution service stub with no successful executions to read from. */ +function emptyExecutionService() { + return { + list: jest.fn().mockResolvedValueOnce([]).mockResolvedValueOnce([]), + getNodeOutput: jest.fn(), + }; +} + +/** + * Execution service stub returning `count` successful executions whose + * EvalTrig output exposes `user_query: "real-eN"` per execution. 
+ */ +function trigInputHistoryExecutionService(count: number) { + const summaries = Array.from({ length: count }, (_, i) => ({ + id: `e${i}`, + status: 'success', + })); + return { + list: jest.fn().mockResolvedValueOnce(summaries).mockResolvedValueOnce([]), + getNodeOutput: jest.fn( + async (id: string) => + await Promise.resolve({ + nodeName: 'EvalTrig', + items: [{ json: { user_query: `real-${id}` } }], + totalItems: 1, + returned: { from: 0, to: 0 }, + }), + ), + }; +} + +/** + * Execution service stub returning `count` successful executions with both + * an EvalTrig input (`user_query: "q-eN"`) and an Agent output (`output: "a-eN"`). + */ +function trigInputAgentOutputExecutionService(count: number) { + const summaries = Array.from({ length: count }, (_, i) => ({ + id: `e${i}`, + status: 'success', + })); + return { + list: jest.fn().mockResolvedValueOnce(summaries).mockResolvedValueOnce([]), + getNodeOutput: jest.fn(async (id: string, nodeName: string) => + nodeName === 'EvalTrig' + ? await Promise.resolve({ + nodeName, + items: [{ json: { user_query: `q-${id}` } }], + totalItems: 1, + returned: { from: 0, to: 0 }, + }) + : await Promise.resolve({ + nodeName, + items: [{ json: { output: `a-${id}` } }], + totalItems: 1, + returned: { from: 0, to: 0 }, + }), + ), + }; +} + +interface BuildCtxOptions { + workflow?: WorkflowJSON; + dataTableService?: ReturnType; + executionService?: ReturnType; +} + +const buildOrchestrationCtx = (opts: BuildCtxOptions = {}) => ({ + domainContext: { + workflowService: { + getAsWorkflowJSON: jest.fn().mockResolvedValue(opts.workflow ?? evalWf()), + }, + dataTableService: opts.dataTableService ?? defaultDataTableService(), + executionService: opts.executionService ?? 
emptyExecutionService(), + logger: silentLogger(), + }, +}); + +describe('eval-data tool', () => { + beforeEach(() => { + jest.restoreAllMocks(); + }); + + it('imports rows from execution history when >= 10 valid rows are available', async () => { + const dataTableService = defaultDataTableService({ + getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]), + }); + const ctx = buildOrchestrationCtx({ + dataTableService, + executionService: trigInputHistoryExecutionService(12), + }); + + const result = await runEvalDataTool(ctx, { workflowId: 'w1' }); + + expect(result.status).toBe('imported'); + expect(result.source).toBe('history'); + expect(result.rowCount).toBe(12); + expect(dataTableService.insertRows).toHaveBeenCalledWith('dt-1', expect.any(Array), undefined); + }); + + it('falls back to synthetic generation when fewer than 10 valid history rows are available', async () => { + const dataTableService = defaultDataTableService({ + getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]), + }); + const ctx = buildOrchestrationCtx({ dataTableService }); + jest + .spyOn(sampleRowsService, 'generateSampleRows') + .mockResolvedValue(Array.from({ length: 10 }, (_, i) => ({ user_query: `gen-${i}` }))); + + const result = await runEvalDataTool(ctx, { workflowId: 'w1' }); + + expect(result.status).toBe('generated'); + expect(result.source).toBe('synthetic'); + expect(result.rowCount).toBe(10); + expect(dataTableService.insertRows).toHaveBeenCalled(); + }); + + it('returns skipped when no eval target exists', async () => { + const wf: WorkflowJSON = { + name: 't', + nodes: [], + connections: {}, + pinData: {}, + settings: {}, + } as never; + const ctx = buildOrchestrationCtx({ workflow: wf }); + + const result = await runEvalDataTool(ctx, { workflowId: 'w1' }); + + expect(result.status).toBe('skipped'); + }); + + it('populates with the fallback "input" column when agent has no $json refs', async () => { + const wf = { + name: 't', + nodes: [ + { + name: 
'EvalTrig', + type: 'n8n-nodes-base.evaluationTrigger', + typeVersion: 1, + parameters: { dataTableId: { value: 'dt-1' } }, + position: [0, 0], + id: 't', + }, + // Agent with no $json refs in its parameters + { + name: 'Agent', + type: '@n8n/n8n-nodes-langchain.agent', + typeVersion: 1, + parameters: { text: 'literal prompt' }, + position: [200, 0], + id: 'a', + }, + ], + connections: { + EvalTrig: { main: [[{ node: 'Agent', type: 'main', index: 0 }]] }, + }, + pinData: {}, + settings: {}, + } as unknown as WorkflowJSON; + const dataTableService = defaultDataTableService({ + getSchema: jest.fn().mockResolvedValue([{ name: 'input' }]), + }); + const ctx = buildOrchestrationCtx({ workflow: wf, dataTableService }); + jest.spyOn(sampleRowsService, 'generateSampleRows').mockResolvedValue([{ input: 'sample' }]); + + const result = await runEvalDataTool(ctx, { workflowId: 'w1' }); + + expect(result.status).toBe('generated'); + expect(dataTableService.insertRows).toHaveBeenCalledWith( + 'dt-1', + [{ input: 'sample' }], + undefined, + ); + }); + + it('populates expected_* columns from agent output in the history path', async () => { + const dataTableService = defaultDataTableService({ + getSchema: jest + .fn() + .mockResolvedValue([{ name: 'user_query' }, { name: 'expected_response' }]), + }); + const ctx = buildOrchestrationCtx({ + workflow: evalWfWithMetrics(), + dataTableService, + executionService: trigInputAgentOutputExecutionService(12), + }); + + const result = await runEvalDataTool(ctx, { workflowId: 'w1' }); + + expect(result.status).toBe('imported'); + expect(result.rowCount).toBe(12); + expect(dataTableService.insertRows).toHaveBeenCalledWith('dt-1', expect.any(Array), undefined); + expect(dataTableService.insertRows).toHaveBeenCalledWith( + 'dt-1', + expect.arrayContaining([{ user_query: 'q-e0', expected_response: 'a-e0' }]), + undefined, + ); + }); + + it('synthetic path generates ONLY input columns and flags expected outputs for user review', async () => { + 
const dataTableService = defaultDataTableService({ + getSchema: jest + .fn() + .mockResolvedValue([{ name: 'user_query' }, { name: 'expected_response' }]), + }); + const ctx = buildOrchestrationCtx({ workflow: evalWfWithMetrics(), dataTableService }); + const generateSpy = jest + .spyOn(sampleRowsService, 'generateSampleRows') + .mockResolvedValue([{ user_query: 'q' }]); + + const result = await runEvalDataTool(ctx, { workflowId: 'w1' }); + + expect(generateSpy).toHaveBeenCalledWith( + expect.objectContaining({ + columns: ['user_query'], + rowCount: 10, + }), + ); + expect(result).toMatchObject({ + status: 'generated', + source: 'synthetic', + expectedOutputsNeedUserReview: true, + expectedOutputColumns: ['expected_response'], + }); + }); + + it('does not flag user review on the history path (real outputs are ground truth)', async () => { + const dataTableService = defaultDataTableService({ + getSchema: jest + .fn() + .mockResolvedValue([{ name: 'user_query' }, { name: 'expected_response' }]), + }); + const ctx = buildOrchestrationCtx({ + workflow: evalWfWithMetrics(), + dataTableService, + executionService: trigInputAgentOutputExecutionService(12), + }); + + const result = await runEvalDataTool(ctx, { workflowId: 'w1' }); + + expect(result.source).toBe('history'); + expect(result.expectedOutputsNeedUserReview).toBeUndefined(); + }); + + it('does not flag user review when there are no expected-output columns', async () => { + const dataTableService = defaultDataTableService({ + getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]), + }); + const ctx = buildOrchestrationCtx({ dataTableService }); + jest.spyOn(sampleRowsService, 'generateSampleRows').mockResolvedValue([{ user_query: 'q' }]); + + const result = await runEvalDataTool(ctx, { workflowId: 'w1' }); + + expect(result.source).toBe('synthetic'); + expect(result.expectedOutputsNeedUserReview).toBeUndefined(); + }); + + it('adds missing columns to the DataTable before inserting rows', async () => { + 
// Schema has only the input column; expected_response is missing. + const dataTableService = defaultDataTableService({ + getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]), + }); + const ctx = buildOrchestrationCtx({ + workflow: evalWfWithMetrics(), + dataTableService, + }); + jest + .spyOn(sampleRowsService, 'generateSampleRows') + .mockResolvedValue([{ user_query: 'q', expected_response: 'r' }]); + + await runEvalDataTool(ctx, { workflowId: 'w1' }); + + expect(dataTableService.getSchema).toHaveBeenCalledWith('dt-1', undefined); + expect(dataTableService.addColumn).toHaveBeenCalledTimes(1); + expect(dataTableService.addColumn).toHaveBeenCalledWith( + 'dt-1', + { name: 'expected_response', type: 'string' }, + undefined, + ); + expect(dataTableService.insertRows).toHaveBeenCalled(); + expect(dataTableService.addColumn.mock.invocationCallOrder[0]).toBeLessThan( + dataTableService.insertRows.mock.invocationCallOrder[0], + ); + }); + + it('does not add columns that already exist in the DataTable schema', async () => { + const dataTableService = defaultDataTableService({ + getSchema: jest + .fn() + .mockResolvedValue([{ name: 'user_query' }, { name: 'expected_response' }]), + }); + const ctx = buildOrchestrationCtx({ + workflow: evalWfWithMetrics(), + dataTableService, + }); + jest + .spyOn(sampleRowsService, 'generateSampleRows') + .mockResolvedValue([{ user_query: 'q', expected_response: 'r' }]); + + await runEvalDataTool(ctx, { workflowId: 'w1' }); + + expect(dataTableService.addColumn).not.toHaveBeenCalled(); + expect(dataTableService.insertRows).toHaveBeenCalled(); + }); + + it('forwards projectId to insertRows when present', async () => { + const dataTableService = defaultDataTableService({ + getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]), + }); + const ctx = buildOrchestrationCtx({ dataTableService }); + jest.spyOn(sampleRowsService, 'generateSampleRows').mockResolvedValue([{ user_query: 'q' }]); + + await runEvalDataTool(ctx, { 
workflowId: 'w1', projectId: 'proj-1' }); + + expect(dataTableService.insertRows).toHaveBeenCalledWith('dt-1', expect.any(Array), { + projectId: 'proj-1', + }); + }); + + it('returns a `table` summary so the agent can recap the populated dataset to the user', async () => { + const dataTableService = defaultDataTableService({ + getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]), + queryRows: jest.fn().mockResolvedValue({ + count: 2, + data: [ + { + user_query: + 'first question with a really long body that should be truncated past eighty characters of content easily', + }, + { user_query: 'second' }, + ], + }), + }); + const ctx = buildOrchestrationCtx({ dataTableService }); + jest.spyOn(sampleRowsService, 'generateSampleRows').mockResolvedValue([{ user_query: 'q' }]); + + const result = await runEvalDataTool(ctx, { workflowId: 'w1' }); + + expect(result.table).toMatchObject({ + id: 'dt-1', + name: 'eval_dataset', + projectId: 'proj-1', + rowCount: 1, + inputColumns: ['user_query'], + }); + expect(result.table?.previewRows).toHaveLength(2); + // First row's long string should be truncated. + expect(String(result.table?.previewRows[0]?.user_query)).toMatch(/…$/); + }); + + describe('few-shot seeding from history', () => { + it('passes residual history rows to generateSampleRows when below threshold', async () => { + // 3 valid history rows — below the 10-row threshold, so the tool + // goes synthetic but should still hand the rows to the generator. 
+ const dataTableService = defaultDataTableService({ + getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]), + }); + const ctx = buildOrchestrationCtx({ + dataTableService, + executionService: trigInputHistoryExecutionService(3), + }); + const generateSpy = jest + .spyOn(sampleRowsService, 'generateSampleRows') + .mockResolvedValue([{ user_query: 'synth' }]); + + const result = await runEvalDataTool(ctx, { workflowId: 'w1' }); + + expect(result.source).toBe('synthetic'); + const callArg = generateSpy.mock.calls[0]?.[0]; + expect(callArg?.realExamples).toEqual( + expect.arrayContaining([ + expect.objectContaining({ user_query: 'real-e0' }), + expect.objectContaining({ user_query: 'real-e1' }), + expect.objectContaining({ user_query: 'real-e2' }), + ]), + ); + }); + + it('does not pass realExamples when no history rows are available', async () => { + const dataTableService = defaultDataTableService({ + getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]), + }); + const ctx = buildOrchestrationCtx({ dataTableService }); + const generateSpy = jest + .spyOn(sampleRowsService, 'generateSampleRows') + .mockResolvedValue([{ user_query: 'synth' }]); + + await runEvalDataTool(ctx, { workflowId: 'w1' }); + + const callArg = generateSpy.mock.calls[0]?.[0]; + expect(callArg).not.toHaveProperty('realExamples'); + }); + }); +}); diff --git a/packages/@n8n/instance-ai/src/tools/orchestration/eval-data-agent.tool.ts b/packages/@n8n/instance-ai/src/tools/orchestration/eval-data-agent.tool.ts new file mode 100644 index 00000000000..a4848ca0553 --- /dev/null +++ b/packages/@n8n/instance-ai/src/tools/orchestration/eval-data-agent.tool.ts @@ -0,0 +1,244 @@ +import { Tool } from '@n8n/agents'; +import { z } from 'zod'; + +import type { InstanceAiDataTableService, OrchestrationContext } from '../../types'; +import { analyzeEvalDataRequirements } from '../evals/eval-data-requirements.service'; +import { extractRowsFromExecutionHistory } from 
'../evals/extract-rows-from-history.service'; +import { generateSampleRows } from '../evals/generate-sample-rows.service'; + +const HISTORY_THRESHOLD = 10; +const GENERATE_ROW_COUNT = 10; + +async function ensureColumnsExist( + dataTableService: InstanceAiDataTableService, + dataTableId: string, + rows: Array>, + extraColumns: readonly string[], + options: { projectId?: string } | undefined, +): Promise { + const referencedColumns = new Set(extraColumns); + for (const row of rows) { + for (const key of Object.keys(row)) referencedColumns.add(key); + } + if (referencedColumns.size === 0) return; + + const schema = await dataTableService.getSchema(dataTableId, options); + const existing = new Set(schema.map((c) => c.name)); + const missing = [...referencedColumns].filter((name) => !existing.has(name)); + + for (const name of missing) { + await dataTableService.addColumn(dataTableId, { name, type: 'string' }, options); + } +} + +const evalDataInputSchema = z.object({ + workflowId: z.string().describe('ID of the workflow whose eval DataTable should be populated'), + projectId: z.string().optional(), +}); + +const PREVIEW_ROW_COUNT = 3; +const PREVIEW_VALUE_MAX_LEN = 80; + +const tableSummarySchema = z.object({ + id: z.string(), + name: z.string(), + projectId: z.string().optional(), + rowCount: z.number(), + inputColumns: z.array(z.string()), + expectedOutputColumns: z.array(z.string()), + previewRows: z.array(z.record(z.string(), z.unknown())), +}); + +const outputSchema = z.object({ + status: z.enum(['imported', 'generated', 'skipped']), + rowCount: z.number().optional(), + source: z.enum(['history', 'synthetic']).optional(), + reason: z.string().optional(), + /** + * True when synthetic rows were inserted with empty expected-output columns. + * The agent must tell the user to fill those columns in before running the + * evaluation, so the eval measures correctness instead of self-consistency + * with the generator's own guess at the right answer. 
+ */ + expectedOutputsNeedUserReview: z.boolean().optional(), + expectedOutputColumns: z.array(z.string()).optional(), + /** + * Snapshot of the populated DataTable so the agent can show the user what + * was generated alongside the metric setup, without making them dig through + * the data-tables UI to verify. Includes the table id (for deep-linking) and + * a short row preview. Only present on success paths. + */ + table: tableSummarySchema.optional(), +}); +/** + * Shortens a single cell value for the row preview. Non-string values are + * returned unchanged; a string longer than PREVIEW_VALUE_MAX_LEN is cut to + * that length and an ellipsis character is appended. + */ +function truncateForPreview(value: unknown): unknown { + if (typeof value !== 'string') return value; + return value.length > PREVIEW_VALUE_MAX_LEN ? `${value.slice(0, PREVIEW_VALUE_MAX_LEN)}…` : value; +} +/** + * Builds the row preview: takes at most PREVIEW_ROW_COUNT rows from the start + * of rows and returns a new object per row whose values have each been passed + * through truncateForPreview. The input rows are not modified. + */ +function buildPreviewRows(rows: Array>): Array> { + return rows.slice(0, PREVIEW_ROW_COUNT).map((row) => { + const truncated: Record = {}; + for (const [key, value] of Object.entries(row)) { + truncated[key] = truncateForPreview(value); + } + return truncated; + }); +} +/** + * Creates the eval-data tool. Its handler returns status skipped when the + * domain context, an eval target (first entry of the analyzeEvalDataRequirements + * result) or the target agent node name is missing. Otherwise it inserts rows + * into the target DataTable: the rows from extractRowsFromExecutionHistory when + * there are at least HISTORY_THRESHOLD of them (status imported), else rows + * from generateSampleRows requested for the input columns only (status + * generated). Errors from ensureColumnsExist and insertRows are logged and + * rethrown; a failing preview query is only logged and leaves previewRows empty. + * NOTE(review): the description promises at most 25 rows, but the history path + * inserts every row returned by extractRowsFromExecutionHistory; confirm that + * helper caps its result. + * + * @param context Orchestration context; the handler reads context.domainContext. + * @returns The result of calling build() on the Tool builder. + */ +export function createEvalDataAgentTool(context: OrchestrationContext) { + return new Tool('eval-data') + .description( + 'Populate an eval DataTable for a workflow that already has its eval setup wired. ' + + 'First scans the workflow execution history for real rows (these include real expected ' + + 'outputs); if fewer than 10 valid rows are available, generates synthetic rows with INPUT ' + + 'columns only — expected-output columns are left empty so the user can fill them in with ' + + 'the correct answers. We never auto-fill expected outputs with model-generated guesses, ' + + 'because that would measure self-consistency rather than correctness. ' + + 'Inserts at most 25 rows total. Synchronous — no sub-agent, no HITL.', + ) + .input(evalDataInputSchema) + .output(outputSchema) + .handler(async (input: z.infer) => { + const domain = context.domainContext; + if (!domain) { + return { status: 'skipped' as const, reason: 'Domain context unavailable.' 
}; + } + + const log = (level: 'info' | 'warn' | 'error', msg: string) => { + domain.logger?.[level]?.(`[eval-data] ${msg}`); + }; + const j = (v: unknown) => JSON.stringify(v); + + log('info', `start workflowId=${input.workflowId} projectId=${j(input.projectId)}`); + + const workflow = await domain.workflowService.getAsWorkflowJSON(input.workflowId); + const reqs = analyzeEvalDataRequirements(workflow); + const target = reqs.targets[0]; + if (!target) { + log('warn', `skip:no-target reason=${j(reqs.reason)}`); + return { status: 'skipped' as const, reason: reqs.reason ?? 'No eval target.' }; + } + log( + 'info', + `target dataTableId=${target.dataTableId} agent=${j(target.targetAgentNodeName)} inputColumns=${j(target.inputColumns)} expectedOutputColumns=${j(target.expectedOutputColumns)} pairs=${j(target.expectedToActualPairs)}`, + ); + if (!target.targetAgentNodeName) { + log('warn', 'skip:no-agent'); + return { + status: 'skipped' as const, + reason: 'No agent node reachable from EvaluationTrigger.', + }; + } + + const { rows: historyRows } = await extractRowsFromExecutionHistory(domain, { + workflow, + workflowId: input.workflowId, + agentNodeName: target.targetAgentNodeName, + inputColumns: target.inputColumns, + expectedToActualPairs: target.expectedToActualPairs, + }); + log('info', `history-extracted count=${historyRows.length}`); + + let rowsToInsert: Array>; + let source: 'history' | 'synthetic'; + + if (historyRows.length >= HISTORY_THRESHOLD) { + rowsToInsert = historyRows; + source = 'history'; + } else { + // This will only generate the input part: expected output columns + // will stay empty so that the user has to supply the ground truth. + // If the threshold for using history rows has not been reached, however + // many rows exist get passed as `realExamples` — a domain reference, + // not seeds to paraphrase. 
+ rowsToInsert = await generateSampleRows({ + workflow, + columns: target.inputColumns, + rowCount: GENERATE_ROW_COUNT, + targetAgentNodeName: target.targetAgentNodeName, + ...(historyRows.length > 0 ? { realExamples: historyRows } : {}), + }); + source = 'synthetic'; + } + log( + 'info', + `rows-prepared source=${source} count=${rowsToInsert.length} firstRowKeys=${j(rowsToInsert[0] ? Object.keys(rowsToInsert[0]) : [])}`, + ); + + const dataTableOptions = input.projectId ? { projectId: input.projectId } : undefined; + + // On the synthetic path we leave expected-output columns empty, so the + // rows never reference them. Still make sure those columns exist in + // the table so the user has somewhere to type the correct answer. + const extraColumns = source === 'synthetic' ? target.expectedOutputColumns : []; + + try { + await ensureColumnsExist( + domain.dataTableService, + target.dataTableId, + rowsToInsert, + extraColumns, + dataTableOptions, + ); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + log('error', `ensureColumnsExist-failed error=${j(message)}`); + throw error; + } + + let insertResult: Awaited>; + try { + insertResult = await domain.dataTableService.insertRows( + target.dataTableId, + rowsToInsert, + dataTableOptions, + ); + log('info', `insertRows-ok result=${j(insertResult)}`); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + log('error', `insertRows-failed error=${j(message)}`); + throw error; + } + + // Fetch a tiny preview so the agent can recap WHAT was generated, not + // just that something was. Treat failures here as non-fatal — the + // insert already succeeded; a missing preview is a UX gap, not a bug. + let previewRows: Array> = []; + try { + const preview = await domain.dataTableService.queryRows(target.dataTableId, { + limit: PREVIEW_ROW_COUNT, + ...(insertResult.projectId ? 
{ projectId: insertResult.projectId } : {}), + }); + previewRows = buildPreviewRows(preview.data); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + log('warn', `preview-query-failed error=${j(message)}`); + } + + log('info', `done source=${source} rowCount=${rowsToInsert.length}`); + const needsReview = source === 'synthetic' && target.expectedOutputColumns.length > 0; + const table = { + id: target.dataTableId, + name: insertResult.tableName, + ...(insertResult.projectId ? { projectId: insertResult.projectId } : {}), + rowCount: rowsToInsert.length, + inputColumns: target.inputColumns, + expectedOutputColumns: target.expectedOutputColumns, + previewRows, + }; + return { + status: source === 'history' ? ('imported' as const) : ('generated' as const), + rowCount: rowsToInsert.length, + source, + ...(needsReview + ? { + expectedOutputsNeedUserReview: true as const, + expectedOutputColumns: target.expectedOutputColumns, + } + : {}), + table, + }; + }) + .build(); +} diff --git a/packages/@n8n/instance-ai/src/tools/tool-ids.ts b/packages/@n8n/instance-ai/src/tools/tool-ids.ts index a70f1484fc9..c2160748cf4 100644 --- a/packages/@n8n/instance-ai/src/tools/tool-ids.ts +++ b/packages/@n8n/instance-ai/src/tools/tool-ids.ts @@ -22,6 +22,7 @@ export const ORCHESTRATION_TOOL_IDS = { DELEGATE: 'delegate', BUILD_WORKFLOW_WITH_AGENT: 'build-workflow-with-agent', EVAL_SETUP_WITH_AGENT: 'eval-setup-with-agent', + EVAL_DATA: 'eval-data', MANAGE_DATA_TABLES_WITH_AGENT: 'manage-data-tables-with-agent', RESEARCH_WITH_AGENT: 'research-with-agent', BROWSER_CREDENTIAL_SETUP: 'browser-credential-setup',