feat(core): Add eval-data populator tool (no-changelog) (#30680)

2026-05-30 00:07:02 +02:00 · 2026-05-20 18:18:06 +02:00 · 2026-05-20 18:18:06 +02:00 · e13d4e0465
commit e13d4e0465
parent 73eae7f1d4
7 changed files with 950 additions and 11 deletions
--- a/packages/@n8n/instance-ai/src/tools/tests/index.test.ts
+++ b/packages/@n8n/instance-ai/src/tools/tests/index.test.ts
@ -56,6 +56,10 @@ jest.mock('../orchestration/eval-setup-agent.tool', () => ({
 	createEvalSetupAgentTool: jest.fn(() => ({ id: 'eval-setup-with-agent' })),
 }));

+jest.mock('../orchestration/eval-data-agent.tool', () => ({
+	createEvalDataAgentTool: jest.fn(() => ({ id: 'eval-data' })), // stub tool that exposes only its id
+}));
+
 jest.mock('../orchestration/plan-with-agent.tool', () => ({
 	createPlanWithAgentTool: jest.fn(() => ({ id: 'plan' })),
 }));
--- a/packages/@n8n/instance-ai/src/tools/evals/tests/generate-sample-rows.service.test.ts
+++ b/packages/@n8n/instance-ai/src/tools/evals/tests/generate-sample-rows.service.test.ts
@ -275,6 +275,110 @@ describe('runBatch', () => {
 		expect(rows).toEqual([]);
 		expect(generate).not.toHaveBeenCalled();
 	});
+
+	describe('realExamples few-shot block', () => {
+		/**
+		 * Installs a fresh `generate` mock on the eval agent, stubs `extractText`
+		 * to yield an empty JSON array, runs a one-row batch over the
+		 * `user_query` column and returns the prompt text captured from
+		 * `generate`. The `realExamples` key is only added to the batch input
+		 * when an argument is supplied.
+		 */
+		async function promptForExamples(
+			realExamples?: ReadonlyArray<Record<string, unknown>>,
+		): Promise<string> {
+			const generate: GenerateMock = createGenerateMock();
+			mockCreateEvalAgent.mockReturnValue({ generate } as unknown as ReturnType<
+				typeof createEvalAgent
+			>);
+			mockExtractText.mockReturnValue(JSON.stringify([]));
+			await runBatch({
+				facet: BATCH_FACET,
+				rowCount: 1,
+				context: BATCH_CONTEXT,
+				columns: ['user_query'],
+				...(realExamples === undefined ? {} : { realExamples }),
+			});
+			return getPromptText(generate);
+		}
+
+		it('injects a reference-not-seed block when examples are provided', async () => {
+			const prompt = await promptForExamples([{ user_query: 'how do I refund an order?' }]);
+
+			expect(prompt).toContain('Recent real inputs the agent has received in production');
+			expect(prompt).toContain('REFERENCE, not seeds');
+			expect(prompt).toContain('how do I refund an order?');
+			expect(prompt).toMatch(/Do NOT copy or paraphrase them/);
+		});
+
+		it('omits the block entirely when realExamples is undefined', async () => {
+			const prompt = await promptForExamples();
+
+			expect(prompt).not.toContain('Recent real inputs');
+		});
+
+		it('filters examples to the requested columns and drops rows that lack all of them', async () => {
+			const prompt = await promptForExamples([
+				{ user_query: 'real one', expected_response: 'should not leak' },
+				{ unrelated: 'dropped' },
+			]);
+
+			expect(prompt).toContain('real one');
+			expect(prompt).not.toContain('should not leak');
+			expect(prompt).not.toContain('dropped');
+			expect(prompt).not.toContain('unrelated');
+		});
+
+		it('caps the example list at 10 entries', async () => {
+			const thirteenExamples = Array.from({ length: 13 }, (_, i) => ({ user_query: `q${i}` }));
+
+			const prompt = await promptForExamples(thirteenExamples);
+
+			expect(prompt).toContain('q0');
+			expect(prompt).toContain('q9');
+			expect(prompt).not.toContain('q10');
+			expect(prompt).not.toContain('q12');
+		});
+
+		it('truncates values longer than 300 characters with an ellipsis', async () => {
+			const prompt = await promptForExamples([{ user_query: 'x'.repeat(500) }]);
+
+			expect(prompt).toMatch(/x{300}…/);
+			expect(prompt).not.toMatch(/x{301}/);
+		});
+	});
 });

 describe('extractAgentContext', () => {
--- a/packages/@n8n/instance-ai/src/tools/evals/generate-sample-rows.service.ts
+++ b/packages/@n8n/instance-ai/src/tools/evals/generate-sample-rows.service.ts
@ -9,6 +9,8 @@ import { createEvalAgent, extractText, HAIKU_MODEL } from '../../utils/eval-agen
 const FACET_COUNT = 5;
 const DEFAULT_ROW_COUNT = 25;
 const SYSTEM_PROMPT_MAX_CHARS = 2000;
+const REAL_EXAMPLES_MAX_COUNT = 10; // upper bound on real examples rendered into one prompt
+const REAL_EXAMPLE_VALUE_MAX_CHARS = 300; // longest example value kept as-is; longer ones are cut and suffixed with "…"

 export interface SampleRowFacet {
 	length: string;
@ -145,6 +147,52 @@ function buildAgentContextBlock(context: AgentContext | undefined): string {
 const FORMAT_INFERENCE =
 	"Inspect the agent's system prompt, prompt template, and connected tools to infer what kind of text this agent receives at runtime. It may be a user chat message, output from another tool, scraped web content, structured records (JSON/key-value), document chunks, log lines, code, etc. Generate inputs that look like what would arrive at the agent in production. Do not assume a human user when the agent suggests otherwise.";

+/**
+ * Shorten an example value to at most `REAL_EXAMPLE_VALUE_MAX_CHARS` UTF-16
+ * code units, appending `…` when anything was cut. Values within the limit
+ * are returned unchanged.
+ *
+ * The cut never lands inside a surrogate pair: if the last kept unit is a
+ * high surrogate, it is dropped as well so the result stays well-formed
+ * instead of carrying a lone surrogate into the prompt.
+ */
+function truncateExampleValue(value: string): string {
+	if (value.length <= REAL_EXAMPLE_VALUE_MAX_CHARS) return value;
+	let end = REAL_EXAMPLE_VALUE_MAX_CHARS;
+	const lastKeptUnit = value.charCodeAt(end - 1);
+	// 0xD800–0xDBFF is the high-surrogate range; its low half would be cut off.
+	if (lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff) end -= 1;
+	return `${value.slice(0, end)}…`;
+}
+
+/**
+ * Render a small block of recent real inputs (filtered to the requested
+ * `columns`) as a reference for the LLM. Returns an empty string when no
+ * usable examples exist — the caller injects this block only when
+ * non-empty, so the generator keeps producing rows from agent context
+ * alone when history is missing.
+ *
+ * At most `REAL_EXAMPLES_MAX_COUNT` rows are rendered. The cap counts rows
+ * that actually contribute a value, so examples lacking every requested
+ * column do not use up the budget. Non-string values are JSON-encoded;
+ * values that cannot be encoded are skipped rather than failing the batch.
+ *
+ * The directive is explicit that these are flavour reference, not seed
+ * data to copy: the generator must produce NEW inputs in the same domain
+ * and tone, not paraphrase the examples.
+ *
+ * @param examples Candidate rows; `undefined` or empty yields `''`.
+ * @param columns Column names to keep from each row; empty yields `''`.
+ * @returns The prompt section (with a leading blank line), or `''`.
+ */
+function buildRealExamplesBlock(
+	examples: ReadonlyArray<Record<string, unknown>> | undefined,
+	columns: string[],
+): string {
+	if (!examples || examples.length === 0 || columns.length === 0) return '';
+	const filtered: Array<Record<string, string>> = [];
+	for (const example of examples) {
+		// Stop once enough usable rows were collected, not after a fixed prefix.
+		if (filtered.length >= REAL_EXAMPLES_MAX_COUNT) break;
+		const row: Record<string, string> = {};
+		let hasValue = false;
+		for (const col of columns) {
+			const raw = example[col];
+			if (raw === undefined || raw === null) continue;
+			let str: string | undefined;
+			if (typeof raw === 'string') {
+				str = raw;
+			} else {
+				try {
+					// Yields `undefined` at runtime for functions and symbols.
+					str = JSON.stringify(raw) as string | undefined;
+				} catch {
+					// BigInt and circular values throw; one bad cell must not
+					// abort prompt construction for the whole batch.
+					str = undefined;
+				}
+			}
+			if (!str) continue;
+			row[col] = truncateExampleValue(str);
+			hasValue = true;
+		}
+		if (hasValue) filtered.push(row);
+	}
+	if (filtered.length === 0) return '';
+	const numbered = filtered.map((row, i) => `${i + 1}. ${JSON.stringify(row)}`).join('\n');
+	return [
+		'',
+		'Recent real inputs the agent has received in production (REFERENCE, not seeds):',
+		numbered,
+		'Use these as a hint about the actual domain, tone and shape of inputs the agent sees. Do NOT copy or paraphrase them — produce NEW inputs that fit the same setting.',
+	].join('\n');
+}
+
 const BATCH_SYSTEM_INSTRUCTIONS = `You generate realistic test inputs for an n8n workflow evaluation dataset.

 Output: JSON array of objects. Keys = exactly the provided column names. Values = short strings. No prose outside the JSON.
@ -158,6 +206,7 @@ export interface RunBatchInput {
 	rowCount: number;
 	context: AgentContext | undefined;
 	columns: string[];
+	realExamples?: ReadonlyArray<Record<string, unknown>>;
 	logger?: Pick<Logger, 'warn'>;
 }

@ -194,17 +243,23 @@ export async function runBatch(input: RunBatchInput): Promise<Array<Record<strin
 			model: HAIKU_MODEL,
 			instructions: BATCH_SYSTEM_INSTRUCTIONS,
 		});
-		const userText = [
-			buildAgentContextBlock(input.context),
-			'',
-			FORMAT_INFERENCE,
-			'',
-			`Variation focus for this batch: length = ${input.facet.length}; mode = ${input.facet.edgeMode}.`,
-			input.facet.instructions,
-			'',
-			`Columns: ${generatedColumns.join(', ')}`,
-			`Generate exactly ${requestedRowCount} rows.`,
-		].join('\n');
+		const realExamplesBlock = buildRealExamplesBlock(input.realExamples, generatedColumns);
+		const sections = [buildAgentContextBlock(input.context)];
+		if (realExamplesBlock) sections.push(realExamplesBlock);
+		sections.push(FORMAT_INFERENCE);
+		sections.push(
+			[
+				`Variation focus for this batch: length = ${input.facet.length}; mode = ${input.facet.edgeMode}.`,
+				input.facet.instructions,
+			].join('\n'),
+		);
+		sections.push(
+			[
+				`Columns: ${generatedColumns.join(', ')}`,
+				`Generate exactly ${requestedRowCount} rows.`,
+			].join('\n'),
+		);
+		const userText = sections.join('\n\n');
 		const result = await agent.generate(userText);
 		const text = extractText(result);
 		const parsed: unknown = JSON.parse(stripMarkdownFences(text));
@ -235,6 +290,15 @@ export interface GenerateSampleRowsInput {
 	columns: string[];
 	rowCount?: number;
 	targetAgentNodeName?: string;
+	/**
+	 * Recent real input rows extracted from the workflow's execution history.
+	 * When present (typically below the history threshold that would otherwise
+	 * have been used directly), they are passed to the LLM as a flavour
+	 * reference — rows are filtered to the requested `columns`, truncated, and
+	 * accompanied by an explicit "reference, not seed" directive so the
+	 * generator produces new in-domain inputs instead of paraphrasing them.
+	 */
+	realExamples?: ReadonlyArray<Record<string, unknown>>;
 	logger?: Pick<Logger, 'warn'>;
 }

@ -270,6 +334,7 @@ export async function generateSampleRows(
 				rowCount: counts[i],
 				context,
 				columns: input.columns,
+				realExamples: input.realExamples,
 				logger: input.logger,
 			});
 		}),
--- a/packages/@n8n/instance-ai/src/tools/index.ts
+++ b/packages/@n8n/instance-ai/src/tools/index.ts
@ -13,6 +13,7 @@ import { createBrowserCredentialSetupTool } from './orchestration/browser-creden
 import { createBuildWorkflowAgentTool } from './orchestration/build-workflow-agent.tool';
 import { createCompleteCheckpointTool } from './orchestration/complete-checkpoint.tool';
 import { createDelegateTool } from './orchestration/delegate.tool';
+import { createEvalDataAgentTool } from './orchestration/eval-data-agent.tool';
 import { createEvalSetupAgentTool } from './orchestration/eval-setup-agent.tool';
 import { createPlanWithAgentTool } from './orchestration/plan-with-agent.tool';
 import { createPlanTool } from './orchestration/plan.tool';
@ -89,6 +90,7 @@ export function createOrchestrationTools(context: OrchestrationContext): Instanc
 		[ORCHESTRATION_TOOL_IDS.BUILD_WORKFLOW_WITH_AGENT, createBuildWorkflowAgentTool(context)],
 		[ORCHESTRATION_TOOL_IDS.COMPLETE_CHECKPOINT, createCompleteCheckpointTool(context)],
 		[ORCHESTRATION_TOOL_IDS.EVAL_SETUP_WITH_AGENT, createEvalSetupAgentTool(context)],
+		[ORCHESTRATION_TOOL_IDS.EVAL_DATA, createEvalDataAgentTool(context)],
 	];

 	if (context.browserMcpConfig || hasGatewayBrowserTools(context)) {
--- a/packages/@n8n/instance-ai/src/tools/orchestration/tests/eval-data-agent.tool.test.ts
+++ b/packages/@n8n/instance-ai/src/tools/orchestration/tests/eval-data-agent.tool.test.ts
@ -0,0 +1,519 @@
+import type { WorkflowJSON } from '@n8n/workflow-sdk';
+
+import * as sampleRowsService from '../../evals/generate-sample-rows.service';
+import { createEvalDataAgentTool } from '../eval-data-agent.tool';
+
+/** Table recap fields the assertions read from the tool result. */
+interface EvalDataTableSummary {
+	id: string;
+	name: string;
+	projectId?: string;
+	rowCount: number;
+	inputColumns: string[];
+	previewRows: Array<Record<string, unknown>>;
+}
+
+/** Shape the tests cast the eval-data tool handler's return value to. */
+type EvalDataToolResult = {
+	status: 'imported' | 'generated' | 'skipped';
+	source?: 'history' | 'synthetic';
+	rowCount?: number;
+	expectedOutputsNeedUserReview?: boolean;
+	expectedOutputColumns?: string[];
+	table?: EvalDataTableSummary;
+};
+
+/**
+ * Creates the eval-data tool from the stubbed orchestration context, invokes
+ * its handler with `input` and an empty execution context, and returns the
+ * outcome cast to `EvalDataToolResult`.
+ */
+async function runEvalDataTool(
+	ctx: ReturnType<typeof buildOrchestrationCtx>,
+	input: { workflowId: string; projectId?: string },
+): Promise<EvalDataToolResult> {
+	const tool = createEvalDataAgentTool(ctx as never);
+	// Called through `tool` so the handler keeps its receiver.
+	const outcome: unknown = await tool.handler!(input, {} as never);
+	return outcome as EvalDataToolResult;
+}
+
+/**
+ * Minimal eval workflow fixture: an evaluation trigger bound to DataTable
+ * `dt-1`, wired into an agent whose prompt reads `$json.user_query`.
+ * A fresh object graph is built on every call.
+ */
+const evalWf = (): WorkflowJSON => {
+	const triggerNode = {
+		name: 'EvalTrig',
+		type: 'n8n-nodes-base.evaluationTrigger',
+		typeVersion: 1,
+		parameters: { dataTableId: { value: 'dt-1' } },
+		position: [0, 0],
+		id: 't',
+	};
+	const agentNode = {
+		name: 'Agent',
+		type: '@n8n/n8n-nodes-langchain.agent',
+		typeVersion: 1,
+		parameters: { text: '={{ $json.user_query }}' },
+		position: [200, 0],
+		id: 'a',
+	};
+	const workflow = {
+		name: 't',
+		nodes: [triggerNode, agentNode],
+		connections: {
+			EvalTrig: { main: [[{ node: 'Agent', type: 'main', index: 0 }]] },
+		},
+		pinData: {},
+		settings: {},
+	};
+	return workflow as unknown as WorkflowJSON;
+};
+
+/**
+ * Eval workflow fixture with a metrics stage: evaluation trigger (DataTable
+ * `dt-1`) → agent reading `$json.user_query` → `setMetrics` evaluation node
+ * that compares the agent's `output` with the trigger's `expected_response`.
+ * A fresh object graph is built on every call.
+ */
+const evalWfWithMetrics = (): WorkflowJSON => {
+	const triggerNode = {
+		name: 'EvalTrig',
+		type: 'n8n-nodes-base.evaluationTrigger',
+		typeVersion: 1,
+		parameters: { dataTableId: { value: 'dt-1' } },
+		position: [0, 0],
+		id: 't',
+	};
+	const agentNode = {
+		name: 'Agent',
+		type: '@n8n/n8n-nodes-langchain.agent',
+		typeVersion: 1,
+		parameters: { text: '={{ $json.user_query }}' },
+		position: [200, 0],
+		id: 'a',
+	};
+	const metricNode = {
+		name: 'MetricN',
+		type: 'n8n-nodes-base.evaluation',
+		typeVersion: 1,
+		parameters: {
+			operation: 'setMetrics',
+			expectedAnswer: "={{ $('EvalTrig').item.json.expected_response }}",
+			actualAnswer: '={{ $json.output }}',
+		},
+		position: [400, 0],
+		id: 'm',
+	};
+	const workflow = {
+		name: 't',
+		nodes: [triggerNode, agentNode, metricNode],
+		connections: {
+			EvalTrig: { main: [[{ node: 'Agent', type: 'main', index: 0 }]] },
+			Agent: { main: [[{ node: 'MetricN', type: 'main', index: 0 }]] },
+		},
+		pinData: {},
+		settings: {},
+	};
+	return workflow as unknown as WorkflowJSON;
+};
+
+/** Value the default `insertRows` stub resolves with. */
+const defaultInsertResult = {
+	insertedCount: 0,
+	dataTableId: 'dt-1',
+	tableName: 'eval_dataset',
+	projectId: 'proj-1',
+};
+
+/** Builds a logger whose `info`, `warn` and `error` are fresh jest mocks. */
+const silentLogger = () => {
+	return { info: jest.fn(), warn: jest.fn(), error: jest.fn() };
+};
+
+/** The DataTable service methods these tests stub. */
+type DataTableServiceMocks = {
+	insertRows: jest.Mock;
+	getSchema: jest.Mock;
+	addColumn: jest.Mock;
+	queryRows: jest.Mock;
+};
+
+/**
+ * DataTable service stub with benign defaults: inserts resolve with
+ * `defaultInsertResult`, the schema is empty, `addColumn` resolves with
+ * nothing and queries return no rows. Any mock passed in `overrides`
+ * replaces the default of the same name.
+ */
+function defaultDataTableService(overrides: Partial<DataTableServiceMocks> = {}) {
+	const defaults = {
+		insertRows: jest.fn().mockResolvedValue(defaultInsertResult),
+		getSchema: jest.fn().mockResolvedValue([]),
+		addColumn: jest.fn().mockResolvedValue(undefined),
+		queryRows: jest.fn().mockResolvedValue({ count: 0, data: [] }),
+	};
+	return { ...defaults, ...overrides };
+}
+
+/**
+ * Execution service stub with no executions to read from: the first two
+ * `list` calls each resolve with an empty array, and `getNodeOutput` is a
+ * bare mock.
+ */
+function emptyExecutionService() {
+	const list = jest.fn();
+	list.mockResolvedValueOnce([]);
+	list.mockResolvedValueOnce([]);
+	return { list, getNodeOutput: jest.fn() };
+}
+
+/**
+ * Execution service stub whose first `list` call resolves with `count`
+ * successful execution summaries (`e0`, `e1`, …) and whose second resolves
+ * with none. `getNodeOutput` reports a single EvalTrig item per execution
+ * carrying `user_query: "real-<executionId>"`.
+ */
+function trigInputHistoryExecutionService(count: number) {
+	const summaries = Array.from({ length: count }).map((_, i) => ({
+		id: `e${i}`,
+		status: 'success',
+	}));
+	const getNodeOutput = jest.fn(async (id: string) => {
+		const output = {
+			nodeName: 'EvalTrig',
+			items: [{ json: { user_query: `real-${id}` } }],
+			totalItems: 1,
+			returned: { from: 0, to: 0 },
+		};
+		return await Promise.resolve(output);
+	});
+	return {
+		list: jest.fn().mockResolvedValueOnce(summaries).mockResolvedValueOnce([]),
+		getNodeOutput,
+	};
+}
+
+/**
+ * Execution service stub whose first `list` call resolves with `count`
+ * successful execution summaries and whose second resolves with none.
+ * `getNodeOutput` answers for the requested node: `EvalTrig` yields
+ * `user_query: "q-<executionId>"`, any other node yields
+ * `output: "a-<executionId>"`.
+ */
+function trigInputAgentOutputExecutionService(count: number) {
+	const summaries = Array.from({ length: count }).map((_, i) => ({
+		id: `e${i}`,
+		status: 'success',
+	}));
+	const getNodeOutput = jest.fn(async (id: string, nodeName: string) => {
+		const json = nodeName === 'EvalTrig' ? { user_query: `q-${id}` } : { output: `a-${id}` };
+		return await Promise.resolve({
+			nodeName,
+			items: [{ json }],
+			totalItems: 1,
+			returned: { from: 0, to: 0 },
+		});
+	});
+	return {
+		list: jest.fn().mockResolvedValueOnce(summaries).mockResolvedValueOnce([]),
+		getNodeOutput,
+	};
+}
+
+/** Optional stub overrides accepted by `buildOrchestrationCtx`. */
+interface BuildCtxOptions {
+	workflow?: WorkflowJSON;
+	dataTableService?: ReturnType<typeof defaultDataTableService>;
+	executionService?: ReturnType<typeof emptyExecutionService>;
+}
+
+/**
+ * Assembles the `domainContext` slice handed to the tool under test. Anything
+ * not supplied in `opts` falls back to the plain eval workflow, the default
+ * DataTable stub and the empty execution stub; the logger is always silent.
+ */
+const buildOrchestrationCtx = (opts: BuildCtxOptions = {}) => {
+	const workflow = opts.workflow ?? evalWf();
+	const dataTableService = opts.dataTableService ?? defaultDataTableService();
+	const executionService = opts.executionService ?? emptyExecutionService();
+	return {
+		domainContext: {
+			workflowService: {
+				getAsWorkflowJSON: jest.fn().mockResolvedValue(workflow),
+			},
+			dataTableService,
+			executionService,
+			logger: silentLogger(),
+		},
+	};
+};
+
+describe('eval-data tool', () => {
+	beforeEach(() => {
+		jest.restoreAllMocks(); // undo the generateSampleRows spies installed by individual tests
+	});
+
+	it('imports rows from execution history when >= 10 valid rows are available', async () => {
+		const dataTableService = defaultDataTableService({
+			getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]),
+		});
+		const ctx = buildOrchestrationCtx({
+			dataTableService,
+			executionService: trigInputHistoryExecutionService(12),
+		});
+
+		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });
+
+		expect(result.status).toBe('imported');
+		expect(result.source).toBe('history');
+		expect(result.rowCount).toBe(12); // matches the 12 stubbed executions, one item each
+		expect(dataTableService.insertRows).toHaveBeenCalledWith('dt-1', expect.any(Array), undefined);
+	});
+
+	it('falls back to synthetic generation when fewer than 10 valid history rows are available', async () => {
+		const dataTableService = defaultDataTableService({
+			getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]),
+		});
+		const ctx = buildOrchestrationCtx({ dataTableService }); // default execution stub lists no executions
+		jest
+			.spyOn(sampleRowsService, 'generateSampleRows')
+			.mockResolvedValue(Array.from({ length: 10 }, (_, i) => ({ user_query: `gen-${i}` })));
+
+		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });
+
+		expect(result.status).toBe('generated');
+		expect(result.source).toBe('synthetic');
+		expect(result.rowCount).toBe(10);
+		expect(dataTableService.insertRows).toHaveBeenCalled();
+	});
+
+	it('returns skipped when no eval target exists', async () => {
+		const wf: WorkflowJSON = {
+			name: 't',
+			nodes: [],
+			connections: {},
+			pinData: {},
+			settings: {},
+		} as never;
+		const ctx = buildOrchestrationCtx({ workflow: wf });
+
+		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });
+
+		expect(result.status).toBe('skipped');
+	});
+
+	it('populates with the fallback "input" column when agent has no $json refs', async () => {
+		const wf = {
+			name: 't',
+			nodes: [
+				{
+					name: 'EvalTrig',
+					type: 'n8n-nodes-base.evaluationTrigger',
+					typeVersion: 1,
+					parameters: { dataTableId: { value: 'dt-1' } },
+					position: [0, 0],
+					id: 't',
+				},
+				// Agent with no $json refs in its parameters
+				{
+					name: 'Agent',
+					type: '@n8n/n8n-nodes-langchain.agent',
+					typeVersion: 1,
+					parameters: { text: 'literal prompt' },
+					position: [200, 0],
+					id: 'a',
+				},
+			],
+			connections: {
+				EvalTrig: { main: [[{ node: 'Agent', type: 'main', index: 0 }]] },
+			},
+			pinData: {},
+			settings: {},
+		} as unknown as WorkflowJSON;
+		const dataTableService = defaultDataTableService({
+			getSchema: jest.fn().mockResolvedValue([{ name: 'input' }]),
+		});
+		const ctx = buildOrchestrationCtx({ workflow: wf, dataTableService });
+		jest.spyOn(sampleRowsService, 'generateSampleRows').mockResolvedValue([{ input: 'sample' }]);
+
+		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });
+
+		expect(result.status).toBe('generated');
+		expect(dataTableService.insertRows).toHaveBeenCalledWith(
+			'dt-1',
+			[{ input: 'sample' }],
+			undefined,
+		);
+	});
+
+	it('populates expected_* columns from agent output in the history path', async () => {
+		const dataTableService = defaultDataTableService({
+			getSchema: jest
+				.fn()
+				.mockResolvedValue([{ name: 'user_query' }, { name: 'expected_response' }]),
+		});
+		const ctx = buildOrchestrationCtx({
+			workflow: evalWfWithMetrics(),
+			dataTableService,
+			executionService: trigInputAgentOutputExecutionService(12),
+		});
+
+		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });
+
+		expect(result.status).toBe('imported');
+		expect(result.rowCount).toBe(12);
+		expect(dataTableService.insertRows).toHaveBeenCalledWith('dt-1', expect.any(Array), undefined);
+		expect(dataTableService.insertRows).toHaveBeenCalledWith(
+			'dt-1',
+			expect.arrayContaining([{ user_query: 'q-e0', expected_response: 'a-e0' }]), // trigger input paired with the agent output of the same execution
+			undefined,
+		);
+	});
+
+	it('synthetic path generates ONLY input columns and flags expected outputs for user review', async () => {
+		const dataTableService = defaultDataTableService({
+			getSchema: jest
+				.fn()
+				.mockResolvedValue([{ name: 'user_query' }, { name: 'expected_response' }]),
+		});
+		const ctx = buildOrchestrationCtx({ workflow: evalWfWithMetrics(), dataTableService });
+		const generateSpy = jest
+			.spyOn(sampleRowsService, 'generateSampleRows')
+			.mockResolvedValue([{ user_query: 'q' }]);
+
+		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });
+
+		expect(generateSpy).toHaveBeenCalledWith(
+			expect.objectContaining({
+				columns: ['user_query'], // expected_response must not be requested from the generator
+				rowCount: 10,
+			}),
+		);
+		expect(result).toMatchObject({
+			status: 'generated',
+			source: 'synthetic',
+			expectedOutputsNeedUserReview: true,
+			expectedOutputColumns: ['expected_response'],
+		});
+	});
+
+	it('does not flag user review on the history path (real outputs are ground truth)', async () => {
+		const dataTableService = defaultDataTableService({
+			getSchema: jest
+				.fn()
+				.mockResolvedValue([{ name: 'user_query' }, { name: 'expected_response' }]),
+		});
+		const ctx = buildOrchestrationCtx({
+			workflow: evalWfWithMetrics(),
+			dataTableService,
+			executionService: trigInputAgentOutputExecutionService(12),
+		});
+
+		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });
+
+		expect(result.source).toBe('history');
+		expect(result.expectedOutputsNeedUserReview).toBeUndefined();
+	});
+
+	it('does not flag user review when there are no expected-output columns', async () => {
+		const dataTableService = defaultDataTableService({
+			getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]),
+		});
+		const ctx = buildOrchestrationCtx({ dataTableService }); // plain evalWf has no metrics node
+		jest.spyOn(sampleRowsService, 'generateSampleRows').mockResolvedValue([{ user_query: 'q' }]);
+
+		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });
+
+		expect(result.source).toBe('synthetic');
+		expect(result.expectedOutputsNeedUserReview).toBeUndefined();
+	});
+
+	it('adds missing columns to the DataTable before inserting rows', async () => {
+		// Schema has only the input column; expected_response is missing.
+		const dataTableService = defaultDataTableService({
+			getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]),
+		});
+		const ctx = buildOrchestrationCtx({
+			workflow: evalWfWithMetrics(),
+			dataTableService,
+		});
+		jest
+			.spyOn(sampleRowsService, 'generateSampleRows')
+			.mockResolvedValue([{ user_query: 'q', expected_response: 'r' }]);
+
+		await runEvalDataTool(ctx, { workflowId: 'w1' });
+
+		expect(dataTableService.getSchema).toHaveBeenCalledWith('dt-1', undefined);
+		expect(dataTableService.addColumn).toHaveBeenCalledTimes(1);
+		expect(dataTableService.addColumn).toHaveBeenCalledWith(
+			'dt-1',
+			{ name: 'expected_response', type: 'string' },
+			undefined,
+		);
+		expect(dataTableService.insertRows).toHaveBeenCalled();
+		expect(dataTableService.addColumn.mock.invocationCallOrder[0]).toBeLessThan(
+			dataTableService.insertRows.mock.invocationCallOrder[0],
+		); // the column has to exist before any row referencing it is inserted
+	});
+
+	it('does not add columns that already exist in the DataTable schema', async () => {
+		const dataTableService = defaultDataTableService({
+			getSchema: jest
+				.fn()
+				.mockResolvedValue([{ name: 'user_query' }, { name: 'expected_response' }]),
+		});
+		const ctx = buildOrchestrationCtx({
+			workflow: evalWfWithMetrics(),
+			dataTableService,
+		});
+		jest
+			.spyOn(sampleRowsService, 'generateSampleRows')
+			.mockResolvedValue([{ user_query: 'q', expected_response: 'r' }]);
+
+		await runEvalDataTool(ctx, { workflowId: 'w1' });
+
+		expect(dataTableService.addColumn).not.toHaveBeenCalled();
+		expect(dataTableService.insertRows).toHaveBeenCalled();
+	});
+
+	it('forwards projectId to insertRows when present', async () => {
+		const dataTableService = defaultDataTableService({
+			getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]),
+		});
+		const ctx = buildOrchestrationCtx({ dataTableService });
+		jest.spyOn(sampleRowsService, 'generateSampleRows').mockResolvedValue([{ user_query: 'q' }]);
+
+		await runEvalDataTool(ctx, { workflowId: 'w1', projectId: 'proj-1' });
+
+		expect(dataTableService.insertRows).toHaveBeenCalledWith('dt-1', expect.any(Array), {
+			projectId: 'proj-1',
+		});
+	});
+
+	it('returns a `table` summary so the agent can recap the populated dataset to the user', async () => {
+		const dataTableService = defaultDataTableService({
+			getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]),
+			queryRows: jest.fn().mockResolvedValue({
+				count: 2,
+				data: [
+					{
+						user_query:
+							'first question with a really long body that should be truncated past eighty characters of content easily',
+					},
+					{ user_query: 'second' },
+				],
+			}),
+		});
+		const ctx = buildOrchestrationCtx({ dataTableService });
+		jest.spyOn(sampleRowsService, 'generateSampleRows').mockResolvedValue([{ user_query: 'q' }]);
+
+		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });
+
+		expect(result.table).toMatchObject({
+			id: 'dt-1',
+			name: 'eval_dataset',
+			projectId: 'proj-1',
+			rowCount: 1, // NOTE(review): presumably the one generated row, not the two queried preview rows — confirm against the tool
+			inputColumns: ['user_query'],
+		});
+		expect(result.table?.previewRows).toHaveLength(2);
+		// First row's long string should be truncated.
+		expect(String(result.table?.previewRows[0]?.user_query)).toMatch(/…$/);
+	});
+
+	describe('few-shot seeding from history', () => {
+		it('passes residual history rows to generateSampleRows when below threshold', async () => {
+			// 3 valid history rows — below the 10-row threshold, so the tool
+			// goes synthetic but should still hand the rows to the generator.
+			const dataTableService = defaultDataTableService({
+				getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]),
+			});
+			const ctx = buildOrchestrationCtx({
+				dataTableService,
+				executionService: trigInputHistoryExecutionService(3),
+			});
+			const generateSpy = jest
+				.spyOn(sampleRowsService, 'generateSampleRows')
+				.mockResolvedValue([{ user_query: 'synth' }]);
+
+			const result = await runEvalDataTool(ctx, { workflowId: 'w1' });
+
+			expect(result.source).toBe('synthetic');
+			const callArg = generateSpy.mock.calls[0]?.[0]; // input object of the first generateSampleRows call
+			expect(callArg?.realExamples).toEqual(
+				expect.arrayContaining([
+					expect.objectContaining({ user_query: 'real-e0' }),
+					expect.objectContaining({ user_query: 'real-e1' }),
+					expect.objectContaining({ user_query: 'real-e2' }),
+				]),
+			);
+		});
+
+		it('does not pass realExamples when no history rows are available', async () => {
+			const dataTableService = defaultDataTableService({
+				getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]),
+			});
+			const ctx = buildOrchestrationCtx({ dataTableService });
+			const generateSpy = jest
+				.spyOn(sampleRowsService, 'generateSampleRows')
+				.mockResolvedValue([{ user_query: 'synth' }]);
+
+			await runEvalDataTool(ctx, { workflowId: 'w1' });
+
+			const callArg = generateSpy.mock.calls[0]?.[0];
+			expect(callArg).not.toHaveProperty('realExamples'); // the key must be absent, not merely undefined
+		});
+	});
+});
--- a/packages/@n8n/instance-ai/src/tools/orchestration/eval-data-agent.tool.ts
+++ b/packages/@n8n/instance-ai/src/tools/orchestration/eval-data-agent.tool.ts
@ -0,0 +1,244 @@
+import { Tool } from '@n8n/agents';
+import { z } from 'zod';
+
+import type { InstanceAiDataTableService, OrchestrationContext } from '../../types';
+import { analyzeEvalDataRequirements } from '../evals/eval-data-requirements.service';
+import { extractRowsFromExecutionHistory } from '../evals/extract-rows-from-history.service';
+import { generateSampleRows } from '../evals/generate-sample-rows.service';
+
+// Minimum number of rows extracted from execution history for them to be
+// inserted as-is; below this the tool generates synthetic rows instead.
+const HISTORY_THRESHOLD = 10;
+// Number of rows requested from the generator on the synthetic path.
+const GENERATE_ROW_COUNT = 10;
+
+/**
+ * Adds to the data table every column that the rows (or `extraColumns`)
+ * mention but the table's current schema lacks. New columns are created with
+ * type `string`, one awaited `addColumn` call at a time.
+ *
+ * Returns without fetching the schema when no column is referenced at all.
+ */
+async function ensureColumnsExist(
+	dataTableService: InstanceAiDataTableService,
+	dataTableId: string,
+	rows: Array<Record<string, unknown>>,
+	extraColumns: readonly string[],
+	options: { projectId?: string } | undefined,
+): Promise<void> {
+	// Set insertion order: extra columns first, then row keys as encountered.
+	const wanted = new Set<string>([...extraColumns, ...rows.flatMap((row) => Object.keys(row))]);
+	if (wanted.size === 0) return;
+
+	const currentSchema = await dataTableService.getSchema(dataTableId, options);
+	const present = new Set(currentSchema.map((column) => column.name));
+
+	for (const columnName of wanted) {
+		if (present.has(columnName)) continue;
+		await dataTableService.addColumn(dataTableId, { name: columnName, type: 'string' }, options);
+	}
+}
+
+// Input accepted by the `eval-data` tool.
+const evalDataInputSchema = z.object({
+	workflowId: z.string().describe('ID of the workflow whose eval DataTable should be populated'),
+	// When set, forwarded as `{ projectId }` to the column-creation and insert calls.
+	projectId: z.string().optional(),
+});
+
+// Maximum number of rows included in the preview returned to the agent.
+const PREVIEW_ROW_COUNT = 3;
+// String values longer than this are cut and suffixed with `…` in the preview.
+const PREVIEW_VALUE_MAX_LEN = 80;
+
+// Shape of the `table` snapshot attached to successful tool results.
+const tableSummarySchema = z.object({
+	id: z.string(),
+	name: z.string(),
+	projectId: z.string().optional(),
+	// Number of rows this tool run passed to the insert call.
+	rowCount: z.number(),
+	inputColumns: z.array(z.string()),
+	expectedOutputColumns: z.array(z.string()),
+	// At most PREVIEW_ROW_COUNT rows, with long string values truncated.
+	previewRows: z.array(z.record(z.string(), z.unknown())),
+});
+
+// Result returned by the `eval-data` tool handler.
+const outputSchema = z.object({
+	// 'imported' when the rows came from execution history, 'generated' when
+	// they are synthetic, 'skipped' when the tool returned without inserting.
+	status: z.enum(['imported', 'generated', 'skipped']),
+	// Number of rows passed to the insert call.
+	rowCount: z.number().optional(),
+	source: z.enum(['history', 'synthetic']).optional(),
+	// Explanation attached to 'skipped' results.
+	reason: z.string().optional(),
+	/**
+	 * True when synthetic rows were inserted with empty expected-output columns.
+	 * The agent must tell the user to fill those columns in before running the
+	 * evaluation, so the eval measures correctness instead of self-consistency
+	 * with the generator's own guess at the right answer.
+	 */
+	expectedOutputsNeedUserReview: z.boolean().optional(),
+	// Names of the columns the user has to fill in; sent together with the flag above.
+	expectedOutputColumns: z.array(z.string()).optional(),
+	/**
+	 * Snapshot of the populated DataTable so the agent can show the user what
+	 * was generated alongside the metric setup, without making them dig through
+	 * the data-tables UI to verify. Includes the table id (for deep-linking) and
+	 * a short row preview. Only present on success paths.
+	 */
+	table: tableSummarySchema.optional(),
+});
+
+/**
+ * Shortens a single cell value for the preview returned to the agent.
+ *
+ * Non-string values are returned untouched. Strings longer than
+ * `PREVIEW_VALUE_MAX_LEN` UTF-16 code units are cut and suffixed with `…`.
+ * The cut never lands between the two halves of a surrogate pair, so the
+ * preview cannot end in a lone high surrogate (a broken character) when the
+ * boundary falls inside an emoji or other astral-plane character.
+ *
+ * @param value Cell value of any type.
+ * @returns The original value, or a shortened copy when it is a long string.
+ */
+function truncateForPreview(value: unknown): unknown {
+	if (typeof value !== 'string' || value.length <= PREVIEW_VALUE_MAX_LEN) return value;
+
+	let cut = PREVIEW_VALUE_MAX_LEN;
+	const lastKept = value.charCodeAt(cut - 1);
+	const firstDropped = value.charCodeAt(cut);
+	const lastKeptIsHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff;
+	const firstDroppedIsLowSurrogate = firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
+	// Back off by one code unit rather than emitting half a character.
+	if (lastKeptIsHighSurrogate && firstDroppedIsLowSurrogate) cut -= 1;
+
+	return `${value.slice(0, cut)}…`;
+}
+
+/**
+ * Produces the preview payload: the first `PREVIEW_ROW_COUNT` rows, each copied
+ * into a fresh object whose values have gone through `truncateForPreview`.
+ * The input rows themselves are left unmodified.
+ */
+function buildPreviewRows(rows: Array<Record<string, unknown>>): Array<Record<string, unknown>> {
+	return rows.slice(0, PREVIEW_ROW_COUNT).map((row) =>
+		Object.entries(row).reduce<Record<string, unknown>>((shortened, [column, cell]) => {
+			shortened[column] = truncateForPreview(cell);
+			return shortened;
+		}, {}),
+	);
+}
+
+/**
+ * Builds the `eval-data` orchestration tool, which fills the eval DataTable
+ * of a workflow whose evaluation setup is already wired.
+ *
+ * Handler flow:
+ * 1. Resolve the first eval target via `analyzeEvalDataRequirements`; return
+ *    `skipped` when there is no domain context, no target, or no agent node.
+ * 2. Extract rows from execution history. With at least `HISTORY_THRESHOLD`
+ *    rows they are inserted as-is (`imported`); otherwise input-only rows are
+ *    generated (`generated`) and any history rows are handed to the generator
+ *    as `realExamples`.
+ * 3. Return `skipped` when there is nothing to insert, so the caller is never
+ *    told that rows were generated (and need review) when none exist.
+ * 4. Make sure every referenced column exists, insert the rows, and attach a
+ *    best-effort preview of the table.
+ *
+ * Failures from column creation and row insertion are logged and rethrown;
+ * a failing preview query is only logged.
+ *
+ * @param context Orchestration context; only `domainContext` is read here.
+ * @returns The built tool.
+ */
+export function createEvalDataAgentTool(context: OrchestrationContext) {
+	// NOTE(review): the "at most 25 rows" cap advertised in the description is
+	// not enforced in this function — presumably the history extractor applies
+	// it; confirm against `extractRowsFromExecutionHistory`.
+	return new Tool('eval-data')
+		.description(
+			'Populate an eval DataTable for a workflow that already has its eval setup wired. ' +
+				'First scans the workflow execution history for real rows (these include real expected ' +
+				`outputs); if fewer than ${HISTORY_THRESHOLD} valid rows are available, generates synthetic rows with INPUT ` +
+				'columns only — expected-output columns are left empty so the user can fill them in with ' +
+				'the correct answers. We never auto-fill expected outputs with model-generated guesses, ' +
+				'because that would measure self-consistency rather than correctness. ' +
+				'Inserts at most 25 rows total. Synchronous — no sub-agent, no HITL.',
+		)
+		.input(evalDataInputSchema)
+		.output(outputSchema)
+		.handler(async (input: z.infer<typeof evalDataInputSchema>) => {
+			const domain = context.domainContext;
+			if (!domain) {
+				return { status: 'skipped' as const, reason: 'Domain context unavailable.' };
+			}
+
+			// Logging is optional: both the logger and the level method may be absent.
+			const log = (level: 'info' | 'warn' | 'error', msg: string) => {
+				domain.logger?.[level]?.(`[eval-data] ${msg}`);
+			};
+			const j = (v: unknown) => JSON.stringify(v);
+			const errorMessage = (error: unknown) =>
+				error instanceof Error ? error.message : String(error);
+
+			log('info', `start workflowId=${input.workflowId} projectId=${j(input.projectId)}`);
+
+			const workflow = await domain.workflowService.getAsWorkflowJSON(input.workflowId);
+			const reqs = analyzeEvalDataRequirements(workflow);
+			// Only the first target is populated by this tool.
+			const target = reqs.targets[0];
+			if (!target) {
+				log('warn', `skip:no-target reason=${j(reqs.reason)}`);
+				return { status: 'skipped' as const, reason: reqs.reason ?? 'No eval target.' };
+			}
+			log(
+				'info',
+				`target dataTableId=${target.dataTableId} agent=${j(target.targetAgentNodeName)} inputColumns=${j(target.inputColumns)} expectedOutputColumns=${j(target.expectedOutputColumns)} pairs=${j(target.expectedToActualPairs)}`,
+			);
+			if (!target.targetAgentNodeName) {
+				log('warn', 'skip:no-agent');
+				return {
+					status: 'skipped' as const,
+					reason: 'No agent node reachable from EvaluationTrigger.',
+				};
+			}
+
+			const { rows: historyRows } = await extractRowsFromExecutionHistory(domain, {
+				workflow,
+				workflowId: input.workflowId,
+				agentNodeName: target.targetAgentNodeName,
+				inputColumns: target.inputColumns,
+				expectedToActualPairs: target.expectedToActualPairs,
+			});
+			log('info', `history-extracted count=${historyRows.length}`);
+
+			let rowsToInsert: Array<Record<string, unknown>>;
+			let source: 'history' | 'synthetic';
+
+			if (historyRows.length >= HISTORY_THRESHOLD) {
+				rowsToInsert = historyRows;
+				source = 'history';
+			} else {
+				// This will only generate the input part: expected output columns
+				// will stay empty so that the user has to supply the ground truth.
+				// If the threshold for using history rows has not been reached, however
+				// many rows exist get passed as `realExamples` — a domain reference,
+				// not seeds to paraphrase.
+				rowsToInsert = await generateSampleRows({
+					workflow,
+					columns: target.inputColumns,
+					rowCount: GENERATE_ROW_COUNT,
+					targetAgentNodeName: target.targetAgentNodeName,
+					...(historyRows.length > 0 ? { realExamples: historyRows } : {}),
+				});
+				source = 'synthetic';
+			}
+			log(
+				'info',
+				`rows-prepared source=${source} count=${rowsToInsert.length} firstRowKeys=${j(rowsToInsert[0] ? Object.keys(rowsToInsert[0]) : [])}`,
+			);
+
+			// Nothing to insert (the generator returned no rows): leave the table
+			// untouched and say so, instead of reporting a zero-row "generated"
+			// result that asks the user to review rows that do not exist.
+			if (rowsToInsert.length === 0) {
+				log('warn', `skip:no-rows source=${source}`);
+				return { status: 'skipped' as const, reason: 'No rows were produced to insert.' };
+			}
+
+			const dataTableOptions = input.projectId ? { projectId: input.projectId } : undefined;
+
+			// On the synthetic path we leave expected-output columns empty, so the
+			// rows never reference them. Still make sure those columns exist in
+			// the table so the user has somewhere to type the correct answer.
+			const extraColumns = source === 'synthetic' ? target.expectedOutputColumns : [];
+
+			try {
+				await ensureColumnsExist(
+					domain.dataTableService,
+					target.dataTableId,
+					rowsToInsert,
+					extraColumns,
+					dataTableOptions,
+				);
+			} catch (error) {
+				log('error', `ensureColumnsExist-failed error=${j(errorMessage(error))}`);
+				throw error;
+			}
+
+			let insertResult: Awaited<ReturnType<typeof domain.dataTableService.insertRows>>;
+			try {
+				insertResult = await domain.dataTableService.insertRows(
+					target.dataTableId,
+					rowsToInsert,
+					dataTableOptions,
+				);
+				log('info', `insertRows-ok result=${j(insertResult)}`);
+			} catch (error) {
+				log('error', `insertRows-failed error=${j(errorMessage(error))}`);
+				throw error;
+			}
+
+			// Fetch a tiny preview so the agent can recap WHAT was generated, not
+			// just that something was. Treat failures here as non-fatal — the
+			// insert already succeeded; a missing preview is a UX gap, not a bug.
+			let previewRows: Array<Record<string, unknown>> = [];
+			try {
+				const preview = await domain.dataTableService.queryRows(target.dataTableId, {
+					limit: PREVIEW_ROW_COUNT,
+					...(insertResult.projectId ? { projectId: insertResult.projectId } : {}),
+				});
+				previewRows = buildPreviewRows(preview.data);
+			} catch (error) {
+				log('warn', `preview-query-failed error=${j(errorMessage(error))}`);
+			}
+
+			log('info', `done source=${source} rowCount=${rowsToInsert.length}`);
+			// Review is only requested when there are expected-output columns that
+			// the synthetic path left empty.
+			const needsReview = source === 'synthetic' && target.expectedOutputColumns.length > 0;
+			const table = {
+				id: target.dataTableId,
+				name: insertResult.tableName,
+				...(insertResult.projectId ? { projectId: insertResult.projectId } : {}),
+				rowCount: rowsToInsert.length,
+				inputColumns: target.inputColumns,
+				expectedOutputColumns: target.expectedOutputColumns,
+				previewRows,
+			};
+			return {
+				status: source === 'history' ? ('imported' as const) : ('generated' as const),
+				rowCount: rowsToInsert.length,
+				source,
+				...(needsReview
+					? {
+							expectedOutputsNeedUserReview: true as const,
+							expectedOutputColumns: target.expectedOutputColumns,
+						}
+					: {}),
+				table,
+			};
+		})
+		.build();
+}
--- a/packages/@n8n/instance-ai/src/tools/tool-ids.ts
+++ b/packages/@n8n/instance-ai/src/tools/tool-ids.ts
@ -22,6 +22,7 @@ export const ORCHESTRATION_TOOL_IDS = {
 	DELEGATE: 'delegate',
 	BUILD_WORKFLOW_WITH_AGENT: 'build-workflow-with-agent',
 	EVAL_SETUP_WITH_AGENT: 'eval-setup-with-agent',
+	EVAL_DATA: 'eval-data',
 	MANAGE_DATA_TABLES_WITH_AGENT: 'manage-data-tables-with-agent',
 	RESEARCH_WITH_AGENT: 'research-with-agent',
 	BROWSER_CREDENTIAL_SETUP: 'browser-credential-setup',