feat(core): Add eval-data populator tool (no-changelog) (#30680)

This commit is contained in:
Luca Mattiazzi 2026-05-20 18:18:06 +02:00 committed by GitHub
parent 73eae7f1d4
commit e13d4e0465
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 950 additions and 11 deletions

View File

@ -56,6 +56,10 @@ jest.mock('../orchestration/eval-setup-agent.tool', () => ({
createEvalSetupAgentTool: jest.fn(() => ({ id: 'eval-setup-with-agent' })),
}));
// Replace the eval-data tool module with a stub factory that returns a
// minimal `{ id: 'eval-data' }` object instead of a real tool.
jest.mock('../orchestration/eval-data-agent.tool', () => ({
createEvalDataAgentTool: jest.fn(() => ({ id: 'eval-data' })),
}));
jest.mock('../orchestration/plan-with-agent.tool', () => ({
createPlanWithAgentTool: jest.fn(() => ({ id: 'plan' })),
}));

View File

@ -275,6 +275,110 @@ describe('runBatch', () => {
expect(rows).toEqual([]);
expect(generate).not.toHaveBeenCalled();
});
describe('realExamples few-shot block', () => {
	type RealExamples = ReadonlyArray<Record<string, unknown>>;

	/**
	 * Run one single-row batch for the `user_query` column against a stubbed
	 * eval agent whose extracted text is an empty JSON array, and return the
	 * prompt text that was handed to the agent. The `realExamples` key is only
	 * set on the batch input when a value is supplied.
	 */
	async function renderBatchPrompt(realExamples?: RealExamples): Promise<string> {
		const generate = createGenerateMock();
		mockCreateEvalAgent.mockReturnValue({ generate } as unknown as ReturnType<
			typeof createEvalAgent
		>);
		mockExtractText.mockReturnValue(JSON.stringify([]));

		await runBatch({
			facet: BATCH_FACET,
			rowCount: 1,
			context: BATCH_CONTEXT,
			columns: ['user_query'],
			...(realExamples === undefined ? {} : { realExamples }),
		});

		return getPromptText(generate);
	}

	it('injects a reference-not-seed block when examples are provided', async () => {
		const prompt = await renderBatchPrompt([{ user_query: 'how do I refund an order?' }]);

		expect(prompt).toContain('Recent real inputs the agent has received in production');
		expect(prompt).toContain('REFERENCE, not seeds');
		expect(prompt).toContain('how do I refund an order?');
		expect(prompt).toMatch(/Do NOT copy or paraphrase them/);
	});

	it('omits the block entirely when realExamples is undefined', async () => {
		const prompt = await renderBatchPrompt();

		expect(prompt).not.toContain('Recent real inputs');
	});

	it('filters examples to the requested columns and drops rows that lack all of them', async () => {
		const prompt = await renderBatchPrompt([
			{ user_query: 'real one', expected_response: 'should not leak' },
			{ unrelated: 'dropped' },
		]);

		expect(prompt).toContain('real one');
		for (const absent of ['should not leak', 'dropped', 'unrelated']) {
			expect(prompt).not.toContain(absent);
		}
	});

	it('caps the example list at 10 entries', async () => {
		const thirteen = Array.from({ length: 13 }, (_, i) => ({ user_query: `q${i}` }));

		const prompt = await renderBatchPrompt(thirteen);

		expect(prompt).toContain('q0');
		expect(prompt).toContain('q9');
		expect(prompt).not.toContain('q10');
		expect(prompt).not.toContain('q12');
	});

	it('truncates values longer than 300 characters with an ellipsis', async () => {
		const prompt = await renderBatchPrompt([{ user_query: 'x'.repeat(500) }]);

		expect(prompt).toMatch(/x{300}…/);
		expect(prompt).not.toMatch(/x{301}/);
	});
});
});
describe('extractAgentContext', () => {

View File

@ -9,6 +9,8 @@ import { createEvalAgent, extractText, HAIKU_MODEL } from '../../utils/eval-agen
// NOTE(review): the use sites of the first three constants are outside this
// excerpt; their names suggest the number of variation facets, the default
// total row count, and a cap on embedded system-prompt text — confirm.
const FACET_COUNT = 5;
const DEFAULT_ROW_COUNT = 25;
const SYSTEM_PROMPT_MAX_CHARS = 2000;
// Upper bound on how many real examples are considered for the reference block.
const REAL_EXAMPLES_MAX_COUNT = 10;
// Longest example value (in characters) rendered before it is truncated.
const REAL_EXAMPLE_VALUE_MAX_CHARS = 300;
export interface SampleRowFacet {
length: string;
@ -145,6 +147,52 @@ function buildAgentContextBlock(context: AgentContext | undefined): string {
// Prompt paragraph added to every batch request: tells the generator to infer
// the agent's real input format from its context rather than assume a human
// chat message.
const FORMAT_INFERENCE =
"Inspect the agent's system prompt, prompt template, and connected tools to infer what kind of text this agent receives at runtime. It may be a user chat message, output from another tool, scraped web content, structured records (JSON/key-value), document chunks, log lines, code, etc. Generate inputs that look like what would arrive at the agent in production. Do not assume a human user when the agent suggests otherwise.";
/**
 * Clamp a single example value to `REAL_EXAMPLE_VALUE_MAX_CHARS` characters.
 *
 * Values within the limit are returned unchanged. Longer values are cut at the
 * limit and suffixed with `…` so it is visible that the text was shortened.
 *
 * @param value - Already-stringified example value.
 * @returns The original value, or its first `REAL_EXAMPLE_VALUE_MAX_CHARS`
 *   characters followed by an ellipsis.
 */
function truncateExampleValue(value: string): string {
	if (value.length <= REAL_EXAMPLE_VALUE_MAX_CHARS) return value;
	return `${value.slice(0, REAL_EXAMPLE_VALUE_MAX_CHARS)}…`;
}
/**
 * Convert one raw example value into the string shown to the LLM.
 *
 * Strings pass through untouched; every other value is JSON-encoded. Returns
 * `undefined` when the value has no JSON representation (functions, symbols)
 * or cannot be serialised at all (circular structures, BigInt).
 */
function stringifyExampleValue(raw: unknown): string | undefined {
	if (typeof raw === 'string') return raw;
	try {
		// Typed as `string`, but `JSON.stringify` yields `undefined` at runtime
		// for functions and symbols — widen the type so the caller must check.
		const encoded: string | undefined = JSON.stringify(raw);
		return encoded;
	} catch {
		return undefined;
	}
}

/**
 * Render a small block of recent real inputs (filtered to the requested
 * `columns`) as a reference for the LLM. Returns an empty string when no
 * usable examples exist — the caller injects this block only when it is
 * non-empty, so the generator keeps producing rows from agent context
 * alone when history is missing.
 *
 * Only the first `REAL_EXAMPLES_MAX_COUNT` examples are considered. Within
 * each, `null`/`undefined`/empty values and values that cannot be serialised
 * are skipped; an example that contributes no value for any column is dropped.
 *
 * The directive is explicit that these are flavour reference, not seed
 * data to copy: the generator must produce NEW inputs in the same domain
 * and tone, not paraphrase the examples.
 *
 * @param examples - Raw example rows, or `undefined` when none are available.
 * @param columns - Column names to keep from each example.
 * @returns The prompt block (starting with an empty line), or `''`.
 */
function buildRealExamplesBlock(
	examples: ReadonlyArray<Record<string, unknown>> | undefined,
	columns: string[],
): string {
	if (!examples || examples.length === 0 || columns.length === 0) return '';

	const filtered: Array<Record<string, string>> = [];
	for (const example of examples.slice(0, REAL_EXAMPLES_MAX_COUNT)) {
		const row: Record<string, string> = {};
		let hasValue = false;
		for (const col of columns) {
			const raw = example[col];
			if (raw === undefined || raw === null) continue;
			const str = stringifyExampleValue(raw);
			// Skip unserialisable values instead of crashing the whole batch.
			if (str === undefined || str.length === 0) continue;
			row[col] = truncateExampleValue(str);
			hasValue = true;
		}
		if (hasValue) filtered.push(row);
	}
	if (filtered.length === 0) return '';

	const numbered = filtered.map((row, i) => `${i + 1}. ${JSON.stringify(row)}`).join('\n');
	return [
		'',
		'Recent real inputs the agent has received in production (REFERENCE, not seeds):',
		numbered,
		'Use these as a hint about the actual domain, tone and shape of inputs the agent sees. Do NOT copy or paraphrase them — produce NEW inputs that fit the same setting.',
	].join('\n');
}
const BATCH_SYSTEM_INSTRUCTIONS = `You generate realistic test inputs for an n8n workflow evaluation dataset.
Output: JSON array of objects. Keys = exactly the provided column names. Values = short strings. No prose outside the JSON.
@ -158,6 +206,7 @@ export interface RunBatchInput {
rowCount: number;
context: AgentContext | undefined;
columns: string[];
realExamples?: ReadonlyArray<Record<string, unknown>>;
logger?: Pick<Logger, 'warn'>;
}
@ -194,17 +243,23 @@ export async function runBatch(input: RunBatchInput): Promise<Array<Record<strin
model: HAIKU_MODEL,
instructions: BATCH_SYSTEM_INSTRUCTIONS,
});
const userText = [
buildAgentContextBlock(input.context),
'',
FORMAT_INFERENCE,
'',
`Variation focus for this batch: length = ${input.facet.length}; mode = ${input.facet.edgeMode}.`,
input.facet.instructions,
'',
`Columns: ${generatedColumns.join(', ')}`,
`Generate exactly ${requestedRowCount} rows.`,
].join('\n');
const realExamplesBlock = buildRealExamplesBlock(input.realExamples, generatedColumns);
const sections = [buildAgentContextBlock(input.context)];
if (realExamplesBlock) sections.push(realExamplesBlock);
sections.push(FORMAT_INFERENCE);
sections.push(
[
`Variation focus for this batch: length = ${input.facet.length}; mode = ${input.facet.edgeMode}.`,
input.facet.instructions,
].join('\n'),
);
sections.push(
[
`Columns: ${generatedColumns.join(', ')}`,
`Generate exactly ${requestedRowCount} rows.`,
].join('\n'),
);
const userText = sections.join('\n\n');
const result = await agent.generate(userText);
const text = extractText(result);
const parsed: unknown = JSON.parse(stripMarkdownFences(text));
@ -235,6 +290,15 @@ export interface GenerateSampleRowsInput {
columns: string[];
rowCount?: number;
targetAgentNodeName?: string;
/**
* Recent real input rows extracted from the workflow's execution history.
* When present (typically below the history threshold that would otherwise
* have been used directly), they are passed to the LLM as a flavour
* reference — rows are filtered to the requested `columns`, truncated, and
* accompanied by an explicit "reference, not seed" directive so the
* generator produces new in-domain inputs instead of paraphrasing them.
*/
realExamples?: ReadonlyArray<Record<string, unknown>>;
logger?: Pick<Logger, 'warn'>;
}
@ -270,6 +334,7 @@ export async function generateSampleRows(
rowCount: counts[i],
context,
columns: input.columns,
realExamples: input.realExamples,
logger: input.logger,
});
}),

View File

@ -13,6 +13,7 @@ import { createBrowserCredentialSetupTool } from './orchestration/browser-creden
import { createBuildWorkflowAgentTool } from './orchestration/build-workflow-agent.tool';
import { createCompleteCheckpointTool } from './orchestration/complete-checkpoint.tool';
import { createDelegateTool } from './orchestration/delegate.tool';
import { createEvalDataAgentTool } from './orchestration/eval-data-agent.tool';
import { createEvalSetupAgentTool } from './orchestration/eval-setup-agent.tool';
import { createPlanWithAgentTool } from './orchestration/plan-with-agent.tool';
import { createPlanTool } from './orchestration/plan.tool';
@ -89,6 +90,7 @@ export function createOrchestrationTools(context: OrchestrationContext): Instanc
[ORCHESTRATION_TOOL_IDS.BUILD_WORKFLOW_WITH_AGENT, createBuildWorkflowAgentTool(context)],
[ORCHESTRATION_TOOL_IDS.COMPLETE_CHECKPOINT, createCompleteCheckpointTool(context)],
[ORCHESTRATION_TOOL_IDS.EVAL_SETUP_WITH_AGENT, createEvalSetupAgentTool(context)],
[ORCHESTRATION_TOOL_IDS.EVAL_DATA, createEvalDataAgentTool(context)],
];
if (context.browserMcpConfig || hasGatewayBrowserTools(context)) {

View File

@ -0,0 +1,519 @@
import type { WorkflowJSON } from '@n8n/workflow-sdk';
import * as sampleRowsService from '../../evals/generate-sample-rows.service';
import { createEvalDataAgentTool } from '../eval-data-agent.tool';
/** Summary of the populated DataTable as read by the assertions in this file. */
interface EvalDataTableSummary {
	id: string;
	name: string;
	projectId?: string;
	rowCount: number;
	inputColumns: string[];
	previewRows: Array<Record<string, unknown>>;
}

/** The parts of the eval-data tool's handler result that these tests inspect. */
interface EvalDataToolResult {
	status: 'imported' | 'generated' | 'skipped';
	source?: 'history' | 'synthetic';
	rowCount?: number;
	expectedOutputsNeedUserReview?: boolean;
	expectedOutputColumns?: string[];
	table?: EvalDataTableSummary;
}
/**
 * Build the eval-data tool around the given stub context and invoke its
 * handler once with `input`, returning the handler result typed for assertions.
 */
async function runEvalDataTool(
	ctx: ReturnType<typeof buildOrchestrationCtx>,
	input: { workflowId: string; projectId?: string },
): Promise<EvalDataToolResult> {
	const evalDataTool = createEvalDataAgentTool(ctx as never);
	// The second handler argument is not populated by these tests.
	const handlerOutput = await evalDataTool.handler!(input, {} as never);
	return handlerOutput as EvalDataToolResult;
}
/**
 * Fresh minimal eval workflow: an evaluation trigger pointing at DataTable
 * `dt-1`, connected to one agent node whose prompt reads `$json.user_query`.
 */
const evalWf = (): WorkflowJSON => {
	const trigger = {
		name: 'EvalTrig',
		type: 'n8n-nodes-base.evaluationTrigger',
		typeVersion: 1,
		parameters: { dataTableId: { value: 'dt-1' } },
		position: [0, 0],
		id: 't',
	};
	const agent = {
		name: 'Agent',
		type: '@n8n/n8n-nodes-langchain.agent',
		typeVersion: 1,
		parameters: { text: '={{ $json.user_query }}' },
		position: [200, 0],
		id: 'a',
	};
	const workflow = {
		name: 't',
		nodes: [trigger, agent],
		connections: {
			EvalTrig: { main: [[{ node: 'Agent', type: 'main', index: 0 }]] },
		},
		pinData: {},
		settings: {},
	};
	return workflow as unknown as WorkflowJSON;
};
/**
 * Fresh eval workflow with a metrics stage: trigger -> agent -> evaluation
 * node in `setMetrics` mode that compares the trigger's `expected_response`
 * against the agent's `output`.
 */
const evalWfWithMetrics = (): WorkflowJSON => {
	const trigger = {
		name: 'EvalTrig',
		type: 'n8n-nodes-base.evaluationTrigger',
		typeVersion: 1,
		parameters: { dataTableId: { value: 'dt-1' } },
		position: [0, 0],
		id: 't',
	};
	const agent = {
		name: 'Agent',
		type: '@n8n/n8n-nodes-langchain.agent',
		typeVersion: 1,
		parameters: { text: '={{ $json.user_query }}' },
		position: [200, 0],
		id: 'a',
	};
	const metrics = {
		name: 'MetricN',
		type: 'n8n-nodes-base.evaluation',
		typeVersion: 1,
		parameters: {
			operation: 'setMetrics',
			expectedAnswer: "={{ $('EvalTrig').item.json.expected_response }}",
			actualAnswer: '={{ $json.output }}',
		},
		position: [400, 0],
		id: 'm',
	};
	const workflow = {
		name: 't',
		nodes: [trigger, agent, metrics],
		connections: {
			EvalTrig: { main: [[{ node: 'Agent', type: 'main', index: 0 }]] },
			Agent: { main: [[{ node: 'MetricN', type: 'main', index: 0 }]] },
		},
		pinData: {},
		settings: {},
	};
	return workflow as unknown as WorkflowJSON;
};
// Value the stubbed `insertRows` resolves with by default. The table name and
// project id here are what the `table` summary assertions expect back.
const defaultInsertResult = {
insertedCount: 0,
dataTableId: 'dt-1',
tableName: 'eval_dataset',
projectId: 'proj-1',
};
/** Logger stub whose `info`, `warn` and `error` methods are fresh no-op mocks. */
const silentLogger = () => {
	const info = jest.fn();
	const warn = jest.fn();
	const error = jest.fn();
	return { info, warn, error };
};
/**
 * DataTable service stub with benign defaults: inserts resolve with
 * `defaultInsertResult`, the schema is empty, `addColumn` resolves, and
 * `queryRows` returns no rows. Any mock passed in `overrides` replaces the
 * default of the same name.
 */
function defaultDataTableService(
	overrides: Partial<{
		insertRows: jest.Mock;
		getSchema: jest.Mock;
		addColumn: jest.Mock;
		queryRows: jest.Mock;
	}> = {},
) {
	const defaults = {
		insertRows: jest.fn().mockResolvedValue(defaultInsertResult),
		getSchema: jest.fn().mockResolvedValue([]),
		addColumn: jest.fn().mockResolvedValue(undefined),
		queryRows: jest.fn().mockResolvedValue({ count: 0, data: [] }),
	};
	return { ...defaults, ...overrides };
}
/**
 * Execution service stub with nothing to read: the first two `list` calls
 * each resolve to an empty array and `getNodeOutput` is a bare mock.
 */
function emptyExecutionService() {
	const list = jest.fn();
	list.mockResolvedValueOnce([]);
	list.mockResolvedValueOnce([]);

	return { list, getNodeOutput: jest.fn() };
}
/**
 * Execution service stub exposing `count` successful executions (`e0`, `e1`,
 * …). The first `list` call resolves to those summaries and the second to an
 * empty array. `getNodeOutput` always answers as node `EvalTrig` with a single
 * item carrying `user_query: "real-<executionId>"`.
 */
function trigInputHistoryExecutionService(count: number) {
	const summaries: Array<{ id: string; status: string }> = [];
	for (let i = 0; i < count; i++) {
		summaries.push({ id: `e${i}`, status: 'success' });
	}

	const list = jest.fn();
	list.mockResolvedValueOnce(summaries);
	list.mockResolvedValueOnce([]);

	const getNodeOutput = jest.fn(async (id: string) => {
		const output = {
			nodeName: 'EvalTrig',
			items: [{ json: { user_query: `real-${id}` } }],
			totalItems: 1,
			returned: { from: 0, to: 0 },
		};
		return await Promise.resolve(output);
	});

	return { list, getNodeOutput };
}
/**
 * Execution service stub exposing `count` successful executions (`e0`, `e1`,
 * …). The first `list` call resolves to those summaries and the second to an
 * empty array. `getNodeOutput` answers per node: `EvalTrig` yields one item
 * with `user_query: "q-<executionId>"`; any other node name yields one item
 * with `output: "a-<executionId>"`.
 */
function trigInputAgentOutputExecutionService(count: number) {
	const summaries: Array<{ id: string; status: string }> = [];
	for (let i = 0; i < count; i++) {
		summaries.push({ id: `e${i}`, status: 'success' });
	}

	const list = jest.fn();
	list.mockResolvedValueOnce(summaries);
	list.mockResolvedValueOnce([]);

	const getNodeOutput = jest.fn(async (id: string, nodeName: string) => {
		if (nodeName === 'EvalTrig') {
			return await Promise.resolve({
				nodeName,
				items: [{ json: { user_query: `q-${id}` } }],
				totalItems: 1,
				returned: { from: 0, to: 0 },
			});
		}
		return await Promise.resolve({
			nodeName,
			items: [{ json: { output: `a-${id}` } }],
			totalItems: 1,
			returned: { from: 0, to: 0 },
		});
	});

	return { list, getNodeOutput };
}
/**
 * Optional overrides for `buildOrchestrationCtx`. Any field left out falls
 * back to a default there: `evalWf()`, `defaultDataTableService()` and
 * `emptyExecutionService()` respectively.
 */
interface BuildCtxOptions {
workflow?: WorkflowJSON;
dataTableService?: ReturnType<typeof defaultDataTableService>;
executionService?: ReturnType<typeof emptyExecutionService>;
}
/**
 * Assemble the stub orchestration context handed to the tool. It contains
 * only a `domainContext` with a workflow service (resolving to the chosen
 * workflow), the DataTable and execution service stubs, and a silent logger.
 */
const buildOrchestrationCtx = (opts: BuildCtxOptions = {}) => {
	const workflowService = {
		getAsWorkflowJSON: jest.fn().mockResolvedValue(opts.workflow ?? evalWf()),
	};
	const dataTableService = opts.dataTableService ?? defaultDataTableService();
	const executionService = opts.executionService ?? emptyExecutionService();

	return {
		domainContext: {
			workflowService,
			dataTableService,
			executionService,
			logger: silentLogger(),
		},
	};
};
describe('eval-data tool', () => {
	type GeneratedRows = Awaited<ReturnType<typeof sampleRowsService.generateSampleRows>>;

	/** DataTable stub whose schema already holds exactly the named columns. */
	const tableWithColumns = (...names: string[]) =>
		defaultDataTableService({
			getSchema: jest.fn().mockResolvedValue(names.map((name) => ({ name }))),
		});

	/** Swap the sample-row generator for a spy that resolves to `rows`. */
	const stubGeneratedRows = (rows: GeneratedRows) =>
		jest.spyOn(sampleRowsService, 'generateSampleRows').mockResolvedValue(rows);

	beforeEach(() => {
		jest.restoreAllMocks();
	});

	it('imports rows from execution history when >= 10 valid rows are available', async () => {
		const dataTableService = tableWithColumns('user_query');
		const ctx = buildOrchestrationCtx({
			dataTableService,
			executionService: trigInputHistoryExecutionService(12),
		});

		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });

		expect(result.status).toBe('imported');
		expect(result.source).toBe('history');
		expect(result.rowCount).toBe(12);
		expect(dataTableService.insertRows).toHaveBeenCalledWith('dt-1', expect.any(Array), undefined);
	});

	it('falls back to synthetic generation when fewer than 10 valid history rows are available', async () => {
		const dataTableService = tableWithColumns('user_query');
		const ctx = buildOrchestrationCtx({ dataTableService });
		stubGeneratedRows(Array.from({ length: 10 }, (_, i) => ({ user_query: `gen-${i}` })));

		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });

		expect(result.status).toBe('generated');
		expect(result.source).toBe('synthetic');
		expect(result.rowCount).toBe(10);
		expect(dataTableService.insertRows).toHaveBeenCalled();
	});

	it('returns skipped when no eval target exists', async () => {
		const emptyWorkflow: WorkflowJSON = {
			name: 't',
			nodes: [],
			connections: {},
			pinData: {},
			settings: {},
		} as never;
		const ctx = buildOrchestrationCtx({ workflow: emptyWorkflow });

		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });

		expect(result.status).toBe('skipped');
	});

	it('populates with the fallback "input" column when agent has no $json refs', async () => {
		const trigger = {
			name: 'EvalTrig',
			type: 'n8n-nodes-base.evaluationTrigger',
			typeVersion: 1,
			parameters: { dataTableId: { value: 'dt-1' } },
			position: [0, 0],
			id: 't',
		};
		// The agent prompt is a literal: no $json reference to derive a column from.
		const literalAgent = {
			name: 'Agent',
			type: '@n8n/n8n-nodes-langchain.agent',
			typeVersion: 1,
			parameters: { text: 'literal prompt' },
			position: [200, 0],
			id: 'a',
		};
		const workflow = {
			name: 't',
			nodes: [trigger, literalAgent],
			connections: {
				EvalTrig: { main: [[{ node: 'Agent', type: 'main', index: 0 }]] },
			},
			pinData: {},
			settings: {},
		} as unknown as WorkflowJSON;
		const dataTableService = tableWithColumns('input');
		const ctx = buildOrchestrationCtx({ workflow, dataTableService });
		stubGeneratedRows([{ input: 'sample' }]);

		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });

		expect(result.status).toBe('generated');
		expect(dataTableService.insertRows).toHaveBeenCalledWith(
			'dt-1',
			[{ input: 'sample' }],
			undefined,
		);
	});

	it('populates expected_* columns from agent output in the history path', async () => {
		const dataTableService = tableWithColumns('user_query', 'expected_response');
		const ctx = buildOrchestrationCtx({
			workflow: evalWfWithMetrics(),
			dataTableService,
			executionService: trigInputAgentOutputExecutionService(12),
		});

		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });

		expect(result.status).toBe('imported');
		expect(result.rowCount).toBe(12);
		expect(dataTableService.insertRows).toHaveBeenCalledWith('dt-1', expect.any(Array), undefined);
		expect(dataTableService.insertRows).toHaveBeenCalledWith(
			'dt-1',
			expect.arrayContaining([{ user_query: 'q-e0', expected_response: 'a-e0' }]),
			undefined,
		);
	});

	it('synthetic path generates ONLY input columns and flags expected outputs for user review', async () => {
		const dataTableService = tableWithColumns('user_query', 'expected_response');
		const ctx = buildOrchestrationCtx({ workflow: evalWfWithMetrics(), dataTableService });
		const generateSpy = stubGeneratedRows([{ user_query: 'q' }]);

		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });

		expect(generateSpy).toHaveBeenCalledWith(
			expect.objectContaining({
				columns: ['user_query'],
				rowCount: 10,
			}),
		);
		expect(result).toMatchObject({
			status: 'generated',
			source: 'synthetic',
			expectedOutputsNeedUserReview: true,
			expectedOutputColumns: ['expected_response'],
		});
	});

	it('does not flag user review on the history path (real outputs are ground truth)', async () => {
		const dataTableService = tableWithColumns('user_query', 'expected_response');
		const ctx = buildOrchestrationCtx({
			workflow: evalWfWithMetrics(),
			dataTableService,
			executionService: trigInputAgentOutputExecutionService(12),
		});

		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });

		expect(result.source).toBe('history');
		expect(result.expectedOutputsNeedUserReview).toBeUndefined();
	});

	it('does not flag user review when there are no expected-output columns', async () => {
		const dataTableService = tableWithColumns('user_query');
		const ctx = buildOrchestrationCtx({ dataTableService });
		stubGeneratedRows([{ user_query: 'q' }]);

		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });

		expect(result.source).toBe('synthetic');
		expect(result.expectedOutputsNeedUserReview).toBeUndefined();
	});

	it('adds missing columns to the DataTable before inserting rows', async () => {
		// Only the input column exists up front; expected_response has to be added.
		const dataTableService = tableWithColumns('user_query');
		const ctx = buildOrchestrationCtx({
			workflow: evalWfWithMetrics(),
			dataTableService,
		});
		stubGeneratedRows([{ user_query: 'q', expected_response: 'r' }]);

		await runEvalDataTool(ctx, { workflowId: 'w1' });

		expect(dataTableService.getSchema).toHaveBeenCalledWith('dt-1', undefined);
		expect(dataTableService.addColumn).toHaveBeenCalledTimes(1);
		expect(dataTableService.addColumn).toHaveBeenCalledWith(
			'dt-1',
			{ name: 'expected_response', type: 'string' },
			undefined,
		);
		expect(dataTableService.insertRows).toHaveBeenCalled();
		const [addColumnOrder] = dataTableService.addColumn.mock.invocationCallOrder;
		const [insertRowsOrder] = dataTableService.insertRows.mock.invocationCallOrder;
		expect(addColumnOrder).toBeLessThan(insertRowsOrder);
	});

	it('does not add columns that already exist in the DataTable schema', async () => {
		const dataTableService = tableWithColumns('user_query', 'expected_response');
		const ctx = buildOrchestrationCtx({
			workflow: evalWfWithMetrics(),
			dataTableService,
		});
		stubGeneratedRows([{ user_query: 'q', expected_response: 'r' }]);

		await runEvalDataTool(ctx, { workflowId: 'w1' });

		expect(dataTableService.addColumn).not.toHaveBeenCalled();
		expect(dataTableService.insertRows).toHaveBeenCalled();
	});

	it('forwards projectId to insertRows when present', async () => {
		const dataTableService = tableWithColumns('user_query');
		const ctx = buildOrchestrationCtx({ dataTableService });
		stubGeneratedRows([{ user_query: 'q' }]);

		await runEvalDataTool(ctx, { workflowId: 'w1', projectId: 'proj-1' });

		expect(dataTableService.insertRows).toHaveBeenCalledWith('dt-1', expect.any(Array), {
			projectId: 'proj-1',
		});
	});

	it('returns a `table` summary so the agent can recap the populated dataset to the user', async () => {
		const storedRows = [
			{
				user_query:
					'first question with a really long body that should be truncated past eighty characters of content easily',
			},
			{ user_query: 'second' },
		];
		const dataTableService = defaultDataTableService({
			getSchema: jest.fn().mockResolvedValue([{ name: 'user_query' }]),
			queryRows: jest.fn().mockResolvedValue({ count: 2, data: storedRows }),
		});
		const ctx = buildOrchestrationCtx({ dataTableService });
		stubGeneratedRows([{ user_query: 'q' }]);

		const result = await runEvalDataTool(ctx, { workflowId: 'w1' });

		expect(result.table).toMatchObject({
			id: 'dt-1',
			name: 'eval_dataset',
			projectId: 'proj-1',
			rowCount: 1,
			inputColumns: ['user_query'],
		});
		expect(result.table?.previewRows).toHaveLength(2);
		// The long first value must come back shortened with a trailing ellipsis.
		expect(String(result.table?.previewRows[0]?.user_query)).toMatch(/…$/);
	});

	describe('few-shot seeding from history', () => {
		it('passes residual history rows to generateSampleRows when below threshold', async () => {
			// Three usable history rows is under the 10-row threshold: the tool
			// generates synthetically but must still forward those rows.
			const dataTableService = tableWithColumns('user_query');
			const ctx = buildOrchestrationCtx({
				dataTableService,
				executionService: trigInputHistoryExecutionService(3),
			});
			const generateSpy = stubGeneratedRows([{ user_query: 'synth' }]);

			const result = await runEvalDataTool(ctx, { workflowId: 'w1' });

			expect(result.source).toBe('synthetic');
			const generatorInput = generateSpy.mock.calls[0]?.[0];
			expect(generatorInput?.realExamples).toEqual(
				expect.arrayContaining([
					expect.objectContaining({ user_query: 'real-e0' }),
					expect.objectContaining({ user_query: 'real-e1' }),
					expect.objectContaining({ user_query: 'real-e2' }),
				]),
			);
		});

		it('does not pass realExamples when no history rows are available', async () => {
			const dataTableService = tableWithColumns('user_query');
			const ctx = buildOrchestrationCtx({ dataTableService });
			const generateSpy = stubGeneratedRows([{ user_query: 'synth' }]);

			await runEvalDataTool(ctx, { workflowId: 'w1' });

			const generatorInput = generateSpy.mock.calls[0]?.[0];
			expect(generatorInput).not.toHaveProperty('realExamples');
		});
	});
});

View File

@ -0,0 +1,244 @@
import { Tool } from '@n8n/agents';
import { z } from 'zod';
import type { InstanceAiDataTableService, OrchestrationContext } from '../../types';
import { analyzeEvalDataRequirements } from '../evals/eval-data-requirements.service';
import { extractRowsFromExecutionHistory } from '../evals/extract-rows-from-history.service';
import { generateSampleRows } from '../evals/generate-sample-rows.service';
// Minimum number of rows extracted from execution history for them to be
// inserted directly; below this the tool generates synthetic rows instead.
const HISTORY_THRESHOLD = 10;
// Row count requested from the synthetic generator.
const GENERATE_ROW_COUNT = 10;
/**
 * Make sure the DataTable has a column for every name in `extraColumns` and
 * for every key used by any row. Missing columns are created one at a time as
 * `string` columns; the schema is not fetched when there is nothing to check.
 *
 * @param dataTableService - Service used to read the schema and add columns.
 * @param dataTableId - Target DataTable.
 * @param rows - Rows about to be inserted; their keys are treated as column names.
 * @param extraColumns - Columns that must exist even if no row references them.
 * @param options - Forwarded unchanged to every service call.
 */
async function ensureColumnsExist(
	dataTableService: InstanceAiDataTableService,
	dataTableId: string,
	rows: Array<Record<string, unknown>>,
	extraColumns: readonly string[],
	options: { projectId?: string } | undefined,
): Promise<void> {
	// Set order is insertion order: extras first, then row keys as encountered.
	const wanted = new Set<string>([...extraColumns, ...rows.flatMap((row) => Object.keys(row))]);
	if (wanted.size === 0) return;

	const schema = await dataTableService.getSchema(dataTableId, options);
	const present = new Set(schema.map((column) => column.name));

	for (const name of wanted) {
		if (present.has(name)) continue;
		await dataTableService.addColumn(dataTableId, { name, type: 'string' }, options);
	}
}
// Tool input: the workflow to populate, plus an optional project id that the
// handler forwards to the DataTable schema/insert calls when it is set.
const evalDataInputSchema = z.object({
workflowId: z.string().describe('ID of the workflow whose eval DataTable should be populated'),
projectId: z.string().optional(),
});
// Number of rows fetched and returned in the result's `table.previewRows`.
const PREVIEW_ROW_COUNT = 3;
// String values in preview rows are cut to this many characters.
const PREVIEW_VALUE_MAX_LEN = 80;
// Shape of the `table` summary in the tool result: table identity, the number
// of rows handled in this run, the column split, and a short row preview.
const tableSummarySchema = z.object({
id: z.string(),
name: z.string(),
projectId: z.string().optional(),
rowCount: z.number(),
inputColumns: z.array(z.string()),
expectedOutputColumns: z.array(z.string()),
previewRows: z.array(z.record(z.string(), z.unknown())),
});
// Tool result. Only `status` is always present: skipped results carry a
// `reason`, while imported/generated results carry `rowCount`, `source` and
// the `table` summary.
const outputSchema = z.object({
status: z.enum(['imported', 'generated', 'skipped']),
rowCount: z.number().optional(),
source: z.enum(['history', 'synthetic']).optional(),
reason: z.string().optional(),
/**
* True when synthetic rows were inserted with empty expected-output columns.
* The agent must tell the user to fill those columns in before running the
* evaluation, so the eval measures correctness instead of self-consistency
* with the generator's own guess at the right answer.
*/
expectedOutputsNeedUserReview: z.boolean().optional(),
expectedOutputColumns: z.array(z.string()).optional(),
/**
* Snapshot of the populated DataTable so the agent can show the user what
* was generated alongside the metric setup, without making them dig through
* the data-tables UI to verify. Includes the table id (for deep-linking) and
* a short row preview. Only present on success paths.
*/
table: tableSummarySchema.optional(),
});
/**
 * Shorten a single preview cell.
 *
 * Non-string values and strings of at most `PREVIEW_VALUE_MAX_LEN` characters
 * are returned unchanged. Longer strings are cut at the limit and suffixed
 * with `…` so the reader can tell the value was shortened.
 *
 * @param value - Raw cell value from a DataTable row.
 * @returns The value, possibly truncated with a trailing ellipsis.
 */
function truncateForPreview(value: unknown): unknown {
	if (typeof value !== 'string') return value;
	if (value.length <= PREVIEW_VALUE_MAX_LEN) return value;
	return `${value.slice(0, PREVIEW_VALUE_MAX_LEN)}…`;
}
/**
 * Build the row preview for the tool result: the first `PREVIEW_ROW_COUNT`
 * rows, each copied into a new object with every value passed through
 * `truncateForPreview`. The input rows are not modified.
 */
function buildPreviewRows(rows: Array<Record<string, unknown>>): Array<Record<string, unknown>> {
	const preview: Array<Record<string, unknown>> = [];
	for (const row of rows.slice(0, PREVIEW_ROW_COUNT)) {
		const clipped: Record<string, unknown> = {};
		for (const key of Object.keys(row)) {
			clipped[key] = truncateForPreview(row[key]);
		}
		preview.push(clipped);
	}
	return preview;
}
/** Turn an arbitrary thrown value into a message suitable for logging. */
function describeError(error: unknown): string {
	return error instanceof Error ? error.message : String(error);
}

/**
 * Create the `eval-data` orchestration tool.
 *
 * The handler:
 * 1. loads the workflow and analyses it for an eval target (DataTable + agent
 *    node); returns `skipped` with a reason when the domain context, target,
 *    or agent node is missing;
 * 2. extracts rows from execution history and uses them as-is when at least
 *    `HISTORY_THRESHOLD` are available, otherwise generates
 *    `GENERATE_ROW_COUNT` synthetic rows for the input columns only, passing
 *    any history rows along as reference examples;
 * 3. returns `skipped` when generation produced no rows, so an empty insert
 *    is never reported as a populated dataset;
 * 4. makes sure all referenced columns exist, inserts the rows, and fetches a
 *    short preview (a preview failure is logged and tolerated).
 *
 * Column-creation and insert failures are logged and rethrown.
 *
 * @param context - Orchestration context; `context.domainContext` supplies the
 *   workflow, execution and DataTable services plus an optional logger.
 * @returns The built tool.
 */
export function createEvalDataAgentTool(context: OrchestrationContext) {
	return new Tool('eval-data')
		.description(
			'Populate an eval DataTable for a workflow that already has its eval setup wired. ' +
				'First scans the workflow execution history for real rows (these include real expected ' +
				'outputs); if fewer than 10 valid rows are available, generates synthetic rows with INPUT ' +
				'columns only — expected-output columns are left empty so the user can fill them in with ' +
				'the correct answers. We never auto-fill expected outputs with model-generated guesses, ' +
				'because that would measure self-consistency rather than correctness. ' +
				'Inserts at most 25 rows total. Synchronous — no sub-agent, no HITL.',
		)
		.input(evalDataInputSchema)
		.output(outputSchema)
		.handler(async (input: z.infer<typeof evalDataInputSchema>) => {
			const domain = context.domainContext;
			if (!domain) {
				return { status: 'skipped' as const, reason: 'Domain context unavailable.' };
			}

			// Every log line is prefixed so tool output is easy to filter.
			const log = (level: 'info' | 'warn' | 'error', msg: string) => {
				domain.logger?.[level]?.(`[eval-data] ${msg}`);
			};
			const j = (v: unknown) => JSON.stringify(v);

			log('info', `start workflowId=${input.workflowId} projectId=${j(input.projectId)}`);

			const workflow = await domain.workflowService.getAsWorkflowJSON(input.workflowId);
			const reqs = analyzeEvalDataRequirements(workflow);
			const target = reqs.targets[0];
			if (!target) {
				log('warn', `skip:no-target reason=${j(reqs.reason)}`);
				return { status: 'skipped' as const, reason: reqs.reason ?? 'No eval target.' };
			}
			log(
				'info',
				`target dataTableId=${target.dataTableId} agent=${j(target.targetAgentNodeName)} inputColumns=${j(target.inputColumns)} expectedOutputColumns=${j(target.expectedOutputColumns)} pairs=${j(target.expectedToActualPairs)}`,
			);
			if (!target.targetAgentNodeName) {
				log('warn', 'skip:no-agent');
				return {
					status: 'skipped' as const,
					reason: 'No agent node reachable from EvaluationTrigger.',
				};
			}

			const { rows: historyRows } = await extractRowsFromExecutionHistory(domain, {
				workflow,
				workflowId: input.workflowId,
				agentNodeName: target.targetAgentNodeName,
				inputColumns: target.inputColumns,
				expectedToActualPairs: target.expectedToActualPairs,
			});
			log('info', `history-extracted count=${historyRows.length}`);

			let rowsToInsert: Array<Record<string, unknown>>;
			let source: 'history' | 'synthetic';
			if (historyRows.length >= HISTORY_THRESHOLD) {
				rowsToInsert = historyRows;
				source = 'history';
			} else {
				// This will only generate the input part: expected output columns
				// will stay empty so that the user has to supply the ground truth.
				// If the threshold for using history rows has not been reached, however
				// many rows exist get passed as `realExamples` — a domain reference,
				// not seeds to paraphrase.
				rowsToInsert = await generateSampleRows({
					workflow,
					columns: target.inputColumns,
					rowCount: GENERATE_ROW_COUNT,
					targetAgentNodeName: target.targetAgentNodeName,
					...(historyRows.length > 0 ? { realExamples: historyRows } : {}),
				});
				source = 'synthetic';
			}
			log(
				'info',
				`rows-prepared source=${source} count=${rowsToInsert.length} firstRowKeys=${j(rowsToInsert[0] ? Object.keys(rowsToInsert[0]) : [])}`,
			);

			// Nothing to insert (the generator came back empty): report that
			// honestly instead of claiming an empty table was "generated".
			if (rowsToInsert.length === 0) {
				log('warn', `skip:no-rows source=${source}`);
				return {
					status: 'skipped' as const,
					reason: 'No rows could be generated for the eval DataTable.',
				};
			}

			const dataTableOptions = input.projectId ? { projectId: input.projectId } : undefined;

			// On the synthetic path we leave expected-output columns empty, so the
			// rows never reference them. Still make sure those columns exist in
			// the table so the user has somewhere to type the correct answer.
			const extraColumns = source === 'synthetic' ? target.expectedOutputColumns : [];
			try {
				await ensureColumnsExist(
					domain.dataTableService,
					target.dataTableId,
					rowsToInsert,
					extraColumns,
					dataTableOptions,
				);
			} catch (error) {
				log('error', `ensureColumnsExist-failed error=${j(describeError(error))}`);
				throw error;
			}

			let insertResult: Awaited<ReturnType<typeof domain.dataTableService.insertRows>>;
			try {
				insertResult = await domain.dataTableService.insertRows(
					target.dataTableId,
					rowsToInsert,
					dataTableOptions,
				);
				log('info', `insertRows-ok result=${j(insertResult)}`);
			} catch (error) {
				log('error', `insertRows-failed error=${j(describeError(error))}`);
				throw error;
			}

			// Fetch a tiny preview so the agent can recap WHAT was generated, not
			// just that something was. Treat failures here as non-fatal — the
			// insert already succeeded; a missing preview is a UX gap, not a bug.
			let previewRows: Array<Record<string, unknown>> = [];
			try {
				const preview = await domain.dataTableService.queryRows(target.dataTableId, {
					limit: PREVIEW_ROW_COUNT,
					...(insertResult.projectId ? { projectId: insertResult.projectId } : {}),
				});
				previewRows = buildPreviewRows(preview.data);
			} catch (error) {
				log('warn', `preview-query-failed error=${j(describeError(error))}`);
			}

			log('info', `done source=${source} rowCount=${rowsToInsert.length}`);

			const needsReview = source === 'synthetic' && target.expectedOutputColumns.length > 0;
			const table = {
				id: target.dataTableId,
				name: insertResult.tableName,
				...(insertResult.projectId ? { projectId: insertResult.projectId } : {}),
				rowCount: rowsToInsert.length,
				inputColumns: target.inputColumns,
				expectedOutputColumns: target.expectedOutputColumns,
				previewRows,
			};
			return {
				status: source === 'history' ? ('imported' as const) : ('generated' as const),
				rowCount: rowsToInsert.length,
				source,
				...(needsReview
					? {
							expectedOutputsNeedUserReview: true as const,
							expectedOutputColumns: target.expectedOutputColumns,
						}
					: {}),
				table,
			};
		})
		.build();
}

View File

@ -22,6 +22,7 @@ export const ORCHESTRATION_TOOL_IDS = {
DELEGATE: 'delegate',
BUILD_WORKFLOW_WITH_AGENT: 'build-workflow-with-agent',
EVAL_SETUP_WITH_AGENT: 'eval-setup-with-agent',
EVAL_DATA: 'eval-data',
MANAGE_DATA_TABLES_WITH_AGENT: 'manage-data-tables-with-agent',
RESEARCH_WITH_AGENT: 'research-with-agent',
BROWSER_CREDENTIAL_SETUP: 'browser-credential-setup',