n8n/packages/@n8n/instance-ai/evaluations/__tests__/event-parser.test.ts
José Braulio González Valido 81ea56fa6b
test(ai-builder): Add multi-turn capability for IAI evals (no-changelog) (#30586)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 13:03:35 +00:00

581 lines
17 KiB
TypeScript

import {
buildConversationMetrics,
buildMetrics,
extractOutcomeFromEvents,
} from '../outcome/event-parser';
import type { CapturedEvent } from '../types';
// ---------------------------------------------------------------------------
// extractOutcomeFromEvents
// ---------------------------------------------------------------------------
describe('extractOutcomeFromEvents', () => {
	/**
	 * Builds a captured event whose `data.type` mirrors the outer `type`.
	 * Any `fields` are merged into `data` after the `type` key.
	 */
	const captured = (
		timestamp: number,
		type: CapturedEvent['type'],
		fields: Record<string, unknown> = {},
	): CapturedEvent => ({ timestamp, type, data: { type, ...fields } });

	/** `tool-call` event carrying a `{ toolCallId, toolName, args }` payload. */
	const toolCall = (
		timestamp: number,
		toolCallId: string,
		toolName: string,
		args: Record<string, unknown> = {},
	): CapturedEvent => captured(timestamp, 'tool-call', { payload: { toolCallId, toolName, args } });

	/**
	 * `tool-result` event. The payload is passed through untouched so each
	 * test decides whether `toolName` is present on the result.
	 */
	const toolResult = (timestamp: number, payload: Record<string, unknown>): CapturedEvent =>
		captured(timestamp, 'tool-result', { payload });

	it('returns empty outcome for no events', () => {
		const { workflowIds, executionIds, dataTableIds, finalText, toolCalls, agentActivities } =
			extractOutcomeFromEvents([]);

		expect(workflowIds).toEqual([]);
		expect(executionIds).toEqual([]);
		expect(dataTableIds).toEqual([]);
		expect(finalText).toBe('');
		expect(toolCalls).toEqual([]);
		expect(agentActivities).toEqual([]);
	});

	it('collects text from text-delta events', () => {
		// Text sits directly on `data` here (no payload wrapper).
		const { finalText } = extractOutcomeFromEvents([
			captured(1000, 'text-delta', { text: 'Hello ' }),
			captured(1001, 'text-delta', { text: 'World' }),
		]);

		expect(finalText).toBe('Hello World');
	});

	it('extracts text from payload field', () => {
		// Same event type, but the text is nested under `data.payload`.
		const { finalText } = extractOutcomeFromEvents([
			captured(1000, 'text-delta', { payload: { text: 'nested text' } }),
		]);

		expect(finalText).toBe('nested text');
	});

	it('tracks tool calls with duration', () => {
		const { toolCalls, workflowIds } = extractOutcomeFromEvents([
			toolCall(1000, 'tc-1', 'build-workflow', { name: 'Test' }),
			toolResult(1500, {
				toolCallId: 'tc-1',
				toolName: 'build-workflow',
				result: { workflowId: 'wf-123' },
			}),
		]);

		expect(toolCalls).toHaveLength(1);
		expect(toolCalls[0].toolName).toBe('build-workflow');
		expect(toolCalls[0].durationMs).toBe(500); // 1500 - 1000
		expect(workflowIds).toContain('wf-123');
	});

	it('extracts workflow IDs from known tool results', () => {
		// The result payload carries no toolName and reports the workflow under `id`.
		const { workflowIds } = extractOutcomeFromEvents([
			toolCall(1000, 'tc-1', 'submit-workflow'),
			toolResult(1100, { toolCallId: 'tc-1', result: { id: 'wf-456' } }),
		]);

		expect(workflowIds).toContain('wf-456');
	});

	it('extracts execution IDs from run-workflow results', () => {
		const { executionIds } = extractOutcomeFromEvents([
			toolCall(1000, 'tc-1', 'run-workflow'),
			toolResult(1100, {
				toolCallId: 'tc-1',
				toolName: 'run-workflow',
				result: { executionId: 'exec-789' },
			}),
		]);

		expect(executionIds).toContain('exec-789');
	});

	it('extracts data table IDs from create-data-table results', () => {
		const { dataTableIds } = extractOutcomeFromEvents([
			toolCall(1000, 'tc-1', 'create-data-table'),
			toolResult(1100, {
				toolCallId: 'tc-1',
				toolName: 'create-data-table',
				result: { dataTableId: 'dt-001' },
			}),
		]);

		expect(dataTableIds).toContain('dt-001');
	});

	it('captures tool errors', () => {
		const { toolCalls } = extractOutcomeFromEvents([
			toolCall(1000, 'tc-err', 'build-workflow'),
			captured(1200, 'tool-error', {
				payload: { toolCallId: 'tc-err', error: 'Something went wrong' },
			}),
		]);

		expect(toolCalls).toHaveLength(1);
		expect(toolCalls[0].error).toBe('Something went wrong');
		expect(toolCalls[0].durationMs).toBe(200); // 1200 - 1000
	});

	it('tracks agent activities', () => {
		// Spawn and completion for the same agent id.
		const { agentActivities } = extractOutcomeFromEvents([
			captured(1000, 'agent-spawned', {
				agentId: 'agent-1',
				payload: { agentId: 'agent-1', role: 'builder', parentId: 'root' },
			}),
			captured(2000, 'agent-completed', {
				agentId: 'agent-1',
				payload: { agentId: 'agent-1', status: 'completed', result: 'Done' },
			}),
		]);

		expect(agentActivities).toHaveLength(1);
		expect(agentActivities[0].role).toBe('builder');
		expect(agentActivities[0].status).toBe('completed');
	});

	it('deduplicates resource IDs', () => {
		// Two different tool calls both report the same workflow id.
		const { workflowIds } = extractOutcomeFromEvents([
			toolCall(1000, 'tc-1', 'build-workflow'),
			toolResult(1100, { toolCallId: 'tc-1', result: { workflowId: 'wf-1' } }),
			toolCall(1200, 'tc-2', 'patch-workflow'),
			toolResult(1300, { toolCallId: 'tc-2', result: { workflowId: 'wf-1' } }),
		]);

		expect(workflowIds).toEqual(['wf-1']);
	});
});
// ---------------------------------------------------------------------------
// buildMetrics
// ---------------------------------------------------------------------------
describe('buildMetrics', () => {
	// Reference start timestamp handed to buildMetrics in every test below.
	const runStartedAt = 1000;

	/**
	 * Builds a captured event whose `data.type` mirrors the outer `type`.
	 * Any `fields` are merged into `data` after the `type` key.
	 */
	const captured = (
		timestamp: number,
		type: CapturedEvent['type'],
		fields: Record<string, unknown> = {},
	): CapturedEvent => ({ timestamp, type, data: { type, ...fields } });

	it('returns zero metrics for no events', () => {
		const {
			totalTimeMs,
			timeToFirstTextMs,
			timeToRunFinishMs,
			totalToolCalls,
			subAgentsSpawned,
			confirmationRequests,
		} = buildMetrics([], runStartedAt);

		expect(totalTimeMs).toBe(0);
		expect(timeToFirstTextMs).toBe(0);
		expect(timeToRunFinishMs).toBe(0);
		expect(totalToolCalls).toBe(0);
		expect(subAgentsSpawned).toBe(0);
		expect(confirmationRequests).toBe(0);
	});

	it('computes time to first text', () => {
		// The first text-delta arrives at 2000; the later one is not the expected value.
		const { timeToFirstTextMs } = buildMetrics(
			[
				captured(1500, 'tool-call'),
				captured(2000, 'text-delta', { text: 'hi' }),
				captured(2500, 'text-delta', { text: ' there' }),
			],
			runStartedAt,
		);

		expect(timeToFirstTextMs).toBe(1000); // 2000 - 1000
	});

	it('counts tool calls', () => {
		const toolCallEvents = [1100, 1200, 1300].map((timestamp) =>
			captured(timestamp, 'tool-call'),
		);

		const { totalToolCalls } = buildMetrics(toolCallEvents, runStartedAt);

		expect(totalToolCalls).toBe(3);
	});

	it('counts sub-agents spawned', () => {
		const { subAgentsSpawned } = buildMetrics(
			[
				captured(1100, 'agent-spawned', {
					agentId: 'a1',
					payload: { agentId: 'a1', role: 'builder' },
				}),
				captured(1200, 'agent-spawned', {
					agentId: 'a2',
					payload: { agentId: 'a2', role: 'researcher' },
				}),
			],
			runStartedAt,
		);

		expect(subAgentsSpawned).toBe(2);
	});

	it('counts confirmation requests', () => {
		const { confirmationRequests } = buildMetrics(
			[captured(1100, 'confirmation-request')],
			runStartedAt,
		);

		expect(confirmationRequests).toBe(1);
	});

	it('captures time to run finish', () => {
		const { timeToRunFinishMs } = buildMetrics(
			[captured(1100, 'tool-call'), captured(3000, 'run-finish')],
			runStartedAt,
		);

		expect(timeToRunFinishMs).toBe(2000); // 3000 - 1000
	});

	it('computes total time from last event', () => {
		const { totalTimeMs } = buildMetrics(
			[captured(1100, 'tool-call'), captured(5000, 'run-finish')],
			runStartedAt,
		);

		expect(totalTimeMs).toBe(4000); // 5000 - 1000
	});
});
// ---------------------------------------------------------------------------
// buildConversationMetrics — per-turn counters
// ---------------------------------------------------------------------------
describe('buildConversationMetrics', () => {
	/**
	 * Builds a captured event whose `data.type` mirrors the outer `type`.
	 * Any `fields` are merged into `data` after the `type` key.
	 */
	const captured = (
		timestamp: number,
		type: CapturedEvent['type'],
		fields: Record<string, unknown> = {},
	): CapturedEvent => ({ timestamp, type, data: { type, ...fields } });

	/** Bare `run-start` event. */
	const runStart = (timestamp: number): CapturedEvent => captured(timestamp, 'run-start');

	/**
	 * `run-finish` event. When no status is given the event carries no
	 * payload at all, rather than a payload with an undefined status.
	 */
	const runFinish = (timestamp: number, status?: string): CapturedEvent =>
		status === undefined
			? captured(timestamp, 'run-finish')
			: captured(timestamp, 'run-finish', { payload: { status } });

	/** `tool-call` event whose payload holds only the tool name. */
	const toolCall = (timestamp: number, toolName: string): CapturedEvent =>
		captured(timestamp, 'tool-call', { payload: { toolName } });

	/** Bare `tool-error` event (no payload). */
	const toolError = (timestamp: number): CapturedEvent => captured(timestamp, 'tool-error');

	/** Bare `tasks-update` event (no payload). */
	const tasksUpdate = (timestamp: number): CapturedEvent => captured(timestamp, 'tasks-update');

	/**
	 * `confirmation-request` event. The `inputType` key is left out of the
	 * payload entirely when it is not supplied.
	 */
	const confirmation = (timestamp: number, requestId: string, inputType?: string): CapturedEvent =>
		captured(timestamp, 'confirmation-request', {
			payload: inputType === undefined ? { requestId } : { requestId, inputType },
		});

	it('returns empty metrics for no events', () => {
		const {
			turnCount,
			perTurn,
			confirmationAskedTotal,
			confirmationAskedByKind,
			reachedRunFinishCleanly,
		} = buildConversationMetrics([]);

		expect(turnCount).toBe(0);
		expect(perTurn).toEqual([]);
		expect(confirmationAskedTotal).toBe(0);
		expect(confirmationAskedByKind).toEqual({});
		expect(reachedRunFinishCleanly).toBe(false);
	});

	it('segments a single turn and counts tool calls + errors', () => {
		const { turnCount, perTurn, reachedRunFinishCleanly } = buildConversationMetrics([
			runStart(1),
			toolCall(2, 'foo'),
			toolError(3),
			toolCall(4, 'bar'),
			runFinish(5, 'completed'),
		]);

		expect(turnCount).toBe(1);
		expect(perTurn).toHaveLength(1);
		expect(perTurn[0].turn).toBe(1);
		expect(perTurn[0].toolCallCount).toBe(2);
		expect(perTurn[0].toolErrorCount).toBe(1);
		expect(perTurn[0].runFinishStatus).toBe('completed');
		expect(reachedRunFinishCleanly).toBe(true);
	});

	it('segments multiple turns by run-start boundaries', () => {
		const { turnCount, perTurn } = buildConversationMetrics([
			// Turn 1: one tool call.
			runStart(1),
			toolCall(2, 'a'),
			runFinish(3, 'completed'),
			// Turn 2: two tool calls.
			runStart(4),
			toolCall(5, 'b'),
			toolCall(6, 'c'),
			runFinish(7, 'completed'),
		]);

		expect(turnCount).toBe(2);
		expect(perTurn).toHaveLength(2);
		expect(perTurn[0].toolCallCount).toBe(1);
		expect(perTurn[1].toolCallCount).toBe(2);
	});

	it('groups confirmations by inputType', () => {
		const { confirmationAskedTotal, confirmationAskedByKind, perTurn } = buildConversationMetrics(
			[
				runStart(1),
				confirmation(2, 'r1', 'questions'),
				confirmation(3, 'r2', 'plan-review'),
				confirmation(4, 'r3', 'questions'),
				runFinish(5, 'completed'),
			],
		);

		// Conversation-level totals.
		expect(confirmationAskedTotal).toBe(3);
		expect(confirmationAskedByKind).toEqual({ questions: 2, 'plan-review': 1 });
		// The single turn reports the same breakdown.
		expect(perTurn[0].confirmationAskedTotal).toBe(3);
		expect(perTurn[0].confirmationAskedByKind).toEqual({
			questions: 2,
			'plan-review': 1,
		});
	});

	it('defaults inputType to "approval" when omitted', () => {
		const { confirmationAskedByKind } = buildConversationMetrics([
			runStart(1),
			confirmation(2, 'r1'),
			runFinish(3),
		]);

		expect(confirmationAskedByKind).toEqual({ approval: 1 });
	});

	it('detects repeat questions by requestId across turns', () => {
		// The same requestId is asked once in each of two turns.
		const { perTurn } = buildConversationMetrics([
			runStart(1),
			confirmation(2, 'shared', 'questions'),
			runFinish(3),
			runStart(4),
			confirmation(5, 'shared', 'questions'),
			runFinish(6),
		]);

		expect(perTurn[0].repeatQuestionCount).toBe(0);
		expect(perTurn[1].repeatQuestionCount).toBe(1);
	});

	it('counts replan_after_error when a tool-error is followed by tasks-update in the same turn', () => {
		const { perTurn } = buildConversationMetrics([
			runStart(1),
			toolError(2),
			tasksUpdate(3),
			runFinish(4),
		]);

		expect(perTurn[0].replanAfterErrorCount).toBe(1);
	});

	it('counts replan_after_error when a tool-error is followed by a plan-typed tool-call', () => {
		const { perTurn } = buildConversationMetrics([
			runStart(1),
			toolError(2),
			toolCall(3, 'plan'),
			runFinish(4),
		]);

		expect(perTurn[0].replanAfterErrorCount).toBe(1);
	});

	it('does NOT count replan_after_error when the recovery is in a previous turn', () => {
		// tasks-update happens in turn 1; the tool-error only occurs in turn 2.
		const { perTurn } = buildConversationMetrics([
			runStart(1),
			tasksUpdate(2),
			runFinish(3),
			runStart(4),
			toolError(5),
			runFinish(6),
		]);

		expect(perTurn[1].replanAfterErrorCount).toBe(0);
	});

	it('marks reachedRunFinishCleanly false when the last run-finish is not completed', () => {
		const { reachedRunFinishCleanly, perTurn } = buildConversationMetrics([
			runStart(1),
			runFinish(2, 'completed'),
			runStart(3),
			runFinish(4, 'cancelled'),
		]);

		expect(reachedRunFinishCleanly).toBe(false);
		expect(perTurn[1].runFinishStatus).toBe('cancelled');
	});
});