Pairwise Eval Comparison — ${escapeHtml(labels.ee)} vs ${escapeHtml(labels.ia)}

// ---------------------------------------------------------------------------
// Side-by-side comparison report for two pairwise eval runs
// (typically: ai-workflow-builder.ee vs instance-ai).
//
// Usage:
//   pnpm tsx evaluations/cli/compare-pairwise.ts \
//     --ee-dir ../ai-workflow-builder.ee/evaluations/.output/pairwise/ \
//     --ia-dir .output/pairwise/ \
//     --out .output/pairwise/comparison.html
//
// The instance-ai directory must contain `summary.json` and `results.jsonl`.
// The EE directory holds `example-*` sub-directories; its `summary.json` is
// optional. Per-example data layouts differ between the builders, so the
// loaders below normalize into a shared `BuilderRecord` shape, joined by
// prompt text.
// ---------------------------------------------------------------------------

import { scrubSecretsInText } from '@n8n/utils';
import { jsonParse } from 'n8n-workflow';
import { promises as fs } from 'node:fs';
import path from 'node:path';

import { redactSecrets } from '../harness/redact';

// ---------------------------------------------------------------------------
// Shared shape after normalization
// ---------------------------------------------------------------------------

/** One evaluator score attached to a build (looked up by `metric`). */
export interface FeedbackEntry {
  metric: string;
  score: number;
  kind?: string;
  comment?: string;
}

/** A single build attempt, normalized so both builders can be compared. */
export interface BuilderRecord {
  prompt: string;
  /** Stable id for the example. For IA, the LangSmith dataset example id;
   *  for EE, the example directory name (e.g. `example-000-ab12cd`). */
  exampleId?: string;
  dos?: string;
  donts?: string;
  workflow: unknown;
  durationMs: number;
  success: boolean;
  errorClass?: string;
  errorMessage?: string;
  feedback: FeedbackEntry[];
  tokenInput?: number;
  tokenOutput?: number;
  /** Number of `submit-workflow` calls during the build. IA-only — EE
   *  doesn't capture a tool-call timeline in the comparable shape. */
  submitCalls?: number;
  /** Number of tool calls that errored or returned a failed result. */
  toolCallErrors?: number;
  /** Total tool calls observed, used as the error-rate denominator. */
  toolCallsTotal?: number;
  /** Raw tool-call trace. IA-only — populated by `loadInstanceAiRun`. */
  toolCalls?: IAToolCallTrace[];
}

/** Run-level metadata and aggregate totals for one builder. */
interface BuilderSummary {
  label: string;
  dataset?: string;
  judgeModel?: string;
  numJudges?: number;
  startedAt?: string;
  finishedAt?: string;
  totals: {
    examples: number;
    buildSuccess: number;
    /** Failure count keyed by error class. */
    buildFailures: Record<string, number>;
    primaryPassRate: number;
    avgDiagnostic: number;
    avgDurationMs: number;
    /** Total `submit-workflow` calls aggregated across IA records. Undefined
     *  for EE (which doesn't capture a comparable tool-call timeline). */
    submitCallsTotal?: number;
    /** Mean `submit-workflow` calls per record (IA only). */
    avgSubmitCalls?: number;
    /** Total tool calls observed across IA records. */
    toolCallsTotal?: number;
    /** Total errored tool calls observed across IA records. */
    toolCallErrors?: number;
    /** `toolCallErrors / toolCallsTotal` micro-averaged. IA-only. */
    toolCallErrorRate?: number;
  };
}

/** A loaded run: its summary plus the normalized per-example records. */
interface BuilderRun {
  summary: BuilderSummary;
  records: BuilderRecord[];
}

// ---------------------------------------------------------------------------
// Instance AI loader (writes results.jsonl + per-example workflow JSON files
// + summary.json)
// ---------------------------------------------------------------------------

/** One entry of the tool-call timeline stored on an IA result record. */
interface IAToolCallTrace {
  step: number;
  toolCallId: string;
  toolName: string;
  args?: unknown;
  result?: unknown;
  error?: string;
  elapsedMs?: number;
}

/** Shape of one line of the instance-ai `results.jsonl` file. */
interface IAResultRecord {
  exampleId: string;
  iteration: number;
  prompt: string;
  dos?: string;
  donts?: string;
  workflow: unknown;
  build: {
    success: boolean;
    errorClass?: string;
    errorMessage?: string;
    durationMs: number;
    tokenUsage?: { input?: number; output?: number };
  };
  feedback: Array<{ metric: string; score: number; kind?: string; comment?: string }>;
  toolCalls?: IAToolCallTrace[];
}

/**
 * Whether a tool call should count toward the "tool error rate" metric.
 * Mirrors `isErroredToolCall` in `pairwise.ts`.
 *
 * A call counts as errored when any of the following holds:
 *  - the trace carries an `error` field;
 *  - its result is a plain object with `success === false`, a non-empty
 *    string `error`, or a non-empty `errors` array;
 *  - its result is a string reporting a non-zero `Exit code:`.
 */
function isErroredIAToolCall(trace: IAToolCallTrace): boolean {
  if (trace.error !== undefined) return true;
  const r = trace.result;
  if (r === null || r === undefined) return false;
  if (typeof r === 'object' && !Array.isArray(r)) {
    const obj = r as Record<string, unknown>;
    if (obj.success === false) return true;
    if (typeof obj.error === 'string' && obj.error.length > 0) return true;
    if (Array.isArray(obj.errors) && obj.errors.length > 0) return true;
  }
  // Command output embeds the exit status in its text; only 1+ is a failure.
  if (typeof r === 'string' && /\bExit code:\s*[1-9]\d*\b/.test(r)) return true;
  return false;
}

/** Shape of the instance-ai `summary.json` file. */
interface IASummary {
  builder: string;
  dataset: string;
  judgeModel: string;
  numJudges: number;
  startedAt: string;
  finishedAt: string;
  totals: {
    examples: number;
    buildSuccess: number;
    buildFailures: Record<string, number>;
    primaryPassRate: number;
    avgDiagnostic: number;
  };
}

/**
 * Loads an instance-ai run from `dir`.
 *
 * Reads `summary.json` and `results.jsonl`, keeps only iteration-1 records,
 * normalizes them into `BuilderRecord`s and recomputes every total from the
 * kept records (the totals stored in `summary.json` are not used).
 *
 * @param dir Directory containing `summary.json` and `results.jsonl`.
 * @returns The run summary and its normalized records.
 * @throws Rejects when either file cannot be read. Malformed JSON is handed
 *   to `jsonParse` with a descriptive `errorMessage` — presumably that
 *   throws; confirm against the `n8n-workflow` implementation.
 */
async function loadInstanceAiRun(dir: string): Promise<BuilderRun> {
  const summaryPath = path.join(dir, 'summary.json');
  const resultsPath = path.join(dir, 'results.jsonl');
  const [summaryRaw, resultsRaw] = await Promise.all([
    fs.readFile(summaryPath, 'utf8'),
    fs.readFile(resultsPath, 'utf8'),
  ]);
  const summary = jsonParse<IASummary>(summaryRaw, {
    errorMessage: `Failed to parse ${summaryPath}`,
  });
  const records = resultsRaw
    .split('\n')
    .filter((line) => line.trim().length > 0)
    .map((line) =>
      jsonParse<IAResultRecord>(line, {
        errorMessage: `Failed to parse a line in ${resultsPath}`,
      }),
    )
    // Use only iteration 1 for a fair 1:1 comparison.
    .filter((r) => r.iteration === 1);

  const normalized: BuilderRecord[] = records.map((r) => {
    const tcs = r.toolCalls ?? [];
    return {
      prompt: r.prompt,
      exampleId: r.exampleId,
      dos: r.dos,
      donts: r.donts,
      workflow: r.workflow,
      durationMs: r.build.durationMs,
      success: r.build.success,
      errorClass: r.build.errorClass,
      errorMessage: r.build.errorMessage,
      feedback: r.feedback,
      tokenInput: r.build.tokenUsage?.input,
      tokenOutput: r.build.tokenUsage?.output,
      submitCalls: tcs.filter((tc) => tc.toolName === 'submit-workflow').length,
      toolCallErrors: tcs.filter(isErroredIAToolCall).length,
      toolCallsTotal: tcs.length,
      toolCalls: tcs,
    };
  });

  const avgDuration =
    normalized.length === 0
      ? 0
      : normalized.reduce((sum, r) => sum + r.durationMs, 0) / normalized.length;

  // Recompute totals from the filtered set so the comparison summary stays
  // consistent with the rendered records (1:1 across builders, iter 1 only).
  const buildSuccess = normalized.filter((r) => r.success).length;
  const buildFailures: Record<string, number> = {};
  for (const r of normalized) {
    if (r.success) continue;
    const key = r.errorClass ?? 'error';
    buildFailures[key] = (buildFailures[key] ?? 0) + 1;
  }
  const primaryPasses = normalized.filter(
    (r) => findScore(r.feedback, 'pairwise_primary') === 1,
  ).length;
  const primaryPassRate = normalized.length === 0 ? 0 : primaryPasses / normalized.length;
  // Records without a finite diagnostic score are left out of the mean.
  const diagnosticScores = normalized
    .map((r) => findScore(r.feedback, 'pairwise_diagnostic'))
    .filter((v): v is number => v !== undefined && Number.isFinite(v));
  const avgDiagnostic =
    diagnosticScores.length === 0
      ? 0
      : diagnosticScores.reduce((a, b) => a + b, 0) / diagnosticScores.length;
  const submitCallsTotal = normalized.reduce((s, r) => s + (r.submitCalls ?? 0), 0);
  const toolCallsTotal = normalized.reduce((s, r) => s + (r.toolCallsTotal ?? 0), 0);
  const toolCallErrors = normalized.reduce((s, r) => s + (r.toolCallErrors ?? 0), 0);

  return {
    summary: {
      label: `${summary.builder} (instance-ai)`,
      dataset: summary.dataset,
      judgeModel: summary.judgeModel,
      numJudges: summary.numJudges,
      startedAt: summary.startedAt,
      finishedAt: summary.finishedAt,
      totals: {
        examples: normalized.length,
        buildSuccess,
        buildFailures,
        primaryPassRate,
        avgDiagnostic,
        avgDurationMs: avgDuration,
        submitCallsTotal,
        avgSubmitCalls: normalized.length ? submitCallsTotal / normalized.length : 0,
        toolCallsTotal,
        toolCallErrors,
        toolCallErrorRate: toolCallsTotal ? toolCallErrors / toolCallsTotal : 0,
      },
    },
    records: normalized,
  };
}

// ---------------------------------------------------------------------------
// EE loader (writes example-NNN-HASH/{prompt.txt, workflow.json, feedback.json}
// + summary.json with an aggregate `evaluatorAverages`).
// ---------------------------------------------------------------------------

/** Shape of an EE per-example `feedback.json` file. */
interface EEFeedbackJson {
  index: number;
  status: string;
  durationMs: number;
  generationDurationMs?: number;
  generationInputTokens?: number;
  generationOutputTokens?: number;
  score?: number;
  evaluators?: Array<{
    name: string;
    feedback: Array<{
      key: string;
      metric: string;
      score: number;
      kind?: string;
      comment?: string;
    }>;
    averageScore?: number;
  }>;
  allFeedback?: Array<{
    evaluator: string;
    metric: string;
    score: number;
    kind?: string;
    comment?: string;
  }>;
}

/** Shape of the EE `summary.json` file. */
interface EESummaryJson {
  timestamp?: string;
  totalExamples: number;
  passed: number;
  failed: number;
  errors: number;
  passRate: number;
  averageScore?: number;
  // NOTE(review): value type assumed to be a numeric average per evaluator
  // name — this file never reads the field; confirm against the EE writer.
  evaluatorAverages?: Record<string, number>;
  totalDurationMs?: number;
}

/**
 * Loads an EE run from `dir`.
 *
 * Every `example-*` sub-directory that has a non-empty `prompt.txt` becomes
 * one `BuilderRecord`; `workflow.json`, `feedback.json` and `error.txt` are
 * each optional. When `summary.json` exists, `examples` and `buildSuccess`
 * are taken from it; all other totals are computed from the loaded records.
 *
 * @param dir Directory containing the `example-*` sub-directories.
 * @returns The run summary (labelled `Code Builder`) and its records.
 * @throws Rejects when `dir` cannot be listed.
 */
async function loadEERun(dir: string): Promise<BuilderRun> {
  const summaryPath = path.join(dir, 'summary.json');
  const summaryRaw = await readOptional(summaryPath);
  const summary = summaryRaw
    ? jsonParse<EESummaryJson>(summaryRaw, { errorMessage: `Failed to parse ${summaryPath}` })
    : null;

  const entries = await fs.readdir(dir, { withFileTypes: true });
  const exampleDirs = entries
    .filter((e) => e.isDirectory() && e.name.startsWith('example-'))
    .map((e) => path.join(dir, e.name));

  const records: BuilderRecord[] = [];
  for (const exampleDir of exampleDirs) {
    const promptPath = path.join(exampleDir, 'prompt.txt');
    const workflowPath = path.join(exampleDir, 'workflow.json');
    const feedbackPath = path.join(exampleDir, 'feedback.json');
    const errorPath = path.join(exampleDir, 'error.txt');

    // The prompt is the join key, so an example without one cannot be paired.
    const prompt = await readOptional(promptPath);
    if (!prompt) continue;

    const [workflowRaw, feedbackRaw, errorRaw] = await Promise.all([
      readOptional(workflowPath),
      readOptional(feedbackPath),
      readOptional(errorPath),
    ]);
    const workflow = workflowRaw
      ? jsonParse<unknown>(workflowRaw, { errorMessage: `Failed to parse ${workflowPath}` })
      : null;
    const feedbackJson = feedbackRaw
      ? jsonParse<EEFeedbackJson>(feedbackRaw, {
          errorMessage: `Failed to parse ${feedbackPath}`,
        })
      : null;
    const exampleId = path.basename(exampleDir);

    const feedback: FeedbackEntry[] = [];
    // Prefer `allFeedback` (flat list, matches IA shape), fall back to nested evaluators.
    if (feedbackJson?.allFeedback) {
      for (const f of feedbackJson.allFeedback) {
        feedback.push({ metric: f.metric, score: f.score, kind: f.kind, comment: f.comment });
      }
    } else if (feedbackJson?.evaluators) {
      for (const ev of feedbackJson.evaluators) {
        for (const f of ev.feedback) {
          feedback.push({ metric: f.metric, score: f.score, kind: f.kind, comment: f.comment });
        }
      }
    }

    // EE status: 'pass' | 'fail' | 'error'. Only 'error' means the workflow
    // was never built — 'fail' means it was built but the eval marked it
    // non-passing. We separate those: `success` = workflow exists.
    const status = feedbackJson?.status ?? 'unknown';
    const success = status !== 'error' && workflow !== null;
    const errorClass = status === 'error' ? 'error' : success ? undefined : status;

    records.push({
      prompt,
      exampleId,
      dos: extractDosFromPrompt(prompt) ?? undefined,
      donts: extractDontsFromPrompt(prompt) ?? undefined,
      workflow,
      durationMs: feedbackJson?.durationMs ?? 0,
      success,
      errorClass,
      errorMessage: errorRaw ?? undefined,
      feedback,
      tokenInput: feedbackJson?.generationInputTokens,
      tokenOutput: feedbackJson?.generationOutputTokens,
    });
  }

  const avgDuration =
    records.length === 0 ? 0 : records.reduce((sum, r) => sum + r.durationMs, 0) / records.length;
  const primaryPassCount = records.filter(
    (r) => findScore(r.feedback, 'pairwise_primary') === 1,
  ).length;
  // Records without a finite diagnostic score are left out of the mean.
  const diagnosticScores = records
    .map((r) => findScore(r.feedback, 'pairwise_diagnostic'))
    .filter((v): v is number => v !== undefined && Number.isFinite(v));
  const avgDiagnostic =
    diagnosticScores.length === 0
      ? 0
      : diagnosticScores.reduce((a, b) => a + b, 0) / diagnosticScores.length;

  const buildFailures: Record<string, number> = {};
  for (const r of records) {
    if (!r.success) {
      const key = r.errorClass ?? 'error';
      buildFailures[key] = (buildFailures[key] ?? 0) + 1;
    }
  }
  const errorCount = records.filter((r) => !r.success).length;
  const buildSuccessCount = records.length - errorCount;

  return {
    summary: {
      label: 'Code Builder',
      startedAt: summary?.timestamp,
      totals: {
        // When summary.json is present its counts win over the record-derived
        // ones; they can differ because prompt-less examples are skipped above.
        examples: summary?.totalExamples ?? records.length,
        buildSuccess: summary ? summary.totalExamples - summary.errors : buildSuccessCount,
        buildFailures,
        primaryPassRate: records.length === 0 ? 0 : primaryPassCount / records.length,
        avgDiagnostic,
        avgDurationMs: avgDuration,
      },
    },
    records,
  };
}

/**
 * Reads a UTF-8 text file, returning `null` instead of rejecting when the
 * read fails for any reason (missing file, permissions, …).
 */
async function readOptional(filePath: string): Promise<string | null> {
  try {
    return await fs.readFile(filePath, 'utf8');
  } catch {
    return null;
  }
}

// EE prompts in `notion-pairwise-workflows` don't carry dos/donts text — those
// are LangSmith inputs, not in prompt.txt. The stubs below return null (which
// `loadEERun` maps to undefined) so the IA criteria (which we have) drive the
// rendering. They are placeholders in case we later hand-encode criteria into
// prompt.txt.
/** Placeholder: no "do" criteria are extracted from the prompt, so this always returns null. */
function extractDosFromPrompt(_prompt: string): string | null {
  return null;
}

/** Placeholder: no "don't" criteria are extracted from the prompt, so this always returns null. */
function extractDontsFromPrompt(_prompt: string): string | null {
  return null;
}

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/** Returns the score of the first feedback entry for `metric`, or undefined when there is none. */
function findScore(feedback: FeedbackEntry[], metric: string): number | undefined {
  return feedback.find((f) => f.metric === metric)?.score;
}

/**
 * Escapes text for use as HTML element content or a quoted attribute value.
 * `&` is replaced first so the entities produced by the later replacements
 * are not escaped a second time.
 */
function escapeHtml(input: string): string {
  return input
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

/**
 * Escapes text for use inside a quoted HTML attribute value: only `&` and
 * both quote characters are replaced, angle brackets are left as-is.
 */
function escapeAttr(input: string): string {
  return input.replace(/&/g, '&amp;').replace(/'/g, '&#39;').replace(/"/g, '&quot;');
}

/**
 * Formats a millisecond duration as `123ms`, `4.5s` or `2m05s`.
 * Sub-second values are rounded to whole milliseconds so fractional inputs
 * (for example averages) do not print long decimal tails.
 */
function formatDuration(ms: number): string {
  const wholeMs = Math.round(ms);
  if (wholeMs < 1000) return `${wholeMs}ms`;
  if (ms < 60_000) return `${(ms / 1000).toFixed(1)}s`;
  const minutes = Math.floor(ms / 60_000);
  const seconds = Math.floor((ms % 60_000) / 1000);
  return `${minutes}m${seconds.toString().padStart(2, '0')}s`;
}

/** Formats a 0..1 ratio as a percentage with one decimal place. */
function pct(n: number): string {
  return `${(n * 100).toFixed(1)}%`;
}

// ---------------------------------------------------------------------------
// Pairing
// ---------------------------------------------------------------------------

/** One prompt with the record each builder produced for it, if any. */
export interface ComparisonRow {
  prompt: string;
  dos?: string;
  donts?: string;
  ee?: BuilderRecord;
  ia?: BuilderRecord;
  verdict: 'both-pass' | 'both-fail' | 'ee-only' | 'ia-only' | 'neither';
}

/**
 * Normalize prompt text used as the join key. EE and IA generate dirs/IDs
 * via different schemes, so we have to match by prompt. Trim + collapse
 * whitespace so trivial drift (CRLF, trailing space, indented blocks)
 * doesn't silently un-pair otherwise-identical examples.
*/ export function promptJoinKey(prompt: string): string { return prompt.replace(/\s+/g, ' ').trim(); } export function pairRecords(ee: BuilderRecord[], ia: BuilderRecord[]): ComparisonRow[] { const byKey = new Map(); const ensure = (prompt: string): ComparisonRow => { const key = promptJoinKey(prompt); const existing = byKey.get(key); if (existing) return existing; const created: ComparisonRow = { prompt, verdict: 'neither' }; byKey.set(key, created); return created; }; for (const r of ee) { const row = ensure(r.prompt); row.ee = r; } for (const r of ia) { const row = ensure(r.prompt); row.ia = r; // IA carries the dos/donts text, prefer it as the source of truth. if (r.dos) row.dos = r.dos; if (r.donts) row.donts = r.donts; } // Compute verdict for each row. for (const row of byKey.values()) { const eePass = row.ee && row.ee.success && findScore(row.ee.feedback, 'pairwise_primary') === 1; const iaPass = row.ia && row.ia.success && findScore(row.ia.feedback, 'pairwise_primary') === 1; row.verdict = eePass && iaPass ? 'both-pass' : eePass ? 'ee-only' : iaPass ? 'ia-only' : 'both-fail'; } const order: Record = { 'ee-only': 0, 'ia-only': 1, 'both-fail': 2, 'both-pass': 3, neither: 4, }; return [...byKey.values()].sort((a, b) => { const ord = order[a.verdict] - order[b.verdict]; if (ord !== 0) return ord; return a.prompt.localeCompare(b.prompt); }); } // --------------------------------------------------------------------------- // Rendering // --------------------------------------------------------------------------- function renderCriteriaList(raw: string | undefined, kind: 'do' | 'dont'): string { if (!raw) return ''; const lines = raw .split('\n') .map((line) => line.trim()) .filter((line) => line.length > 0); if (lines.length === 0) return ''; const items = lines.map((line) => `

${escapeHtml(line)}

`).join(''); const label = kind === 'do' ? 'Do' : "Don't"; return `

${label}

${items}

`; } function renderWorkflow(workflow: unknown): string { if (!workflow) { return '

No workflow built.

'; } const json = JSON.stringify(workflow); return ``; } function renderJudgeRows(feedback: FeedbackEntry[]): string { const judges = feedback.filter((f) => /^judge\d+$/.test(f.metric)); if (judges.length === 0) return ''; const rows = judges .map((j) => { const cls = j.score === 1 ? 'judge-pass' : 'judge-fail'; const comment = j.comment ? escapeHtml(j.comment) : 'no violations'; return `${escapeHtml(j.metric)}${j.score}${comment}`; }) .join(''); return `${rows}

Judge	Pass	Notes

`; } interface BuilderHeadline { statusBadge: string; statusKind: 'pass' | 'fail' | 'missing'; metaText: string; // duration · diagnostic · token info } function buildHeadline(record: BuilderRecord | undefined): BuilderHeadline { if (!record) { return { statusBadge: 'N/A', statusKind: 'missing', metaText: '—', }; } const primary = findScore(record.feedback, 'pairwise_primary'); const diagnostic = findScore(record.feedback, 'pairwise_diagnostic'); const statusBadge = !record.success ? `BUILD ${escapeHtml(record.errorClass ?? 'error').toUpperCase()}` : primary === 1 ? 'PASS' : 'FAIL'; const statusKind: BuilderHeadline['statusKind'] = !record.success ? 'fail' : primary === 1 ? 'pass' : 'fail'; const metaParts: string[] = [formatDuration(record.durationMs)]; if (diagnostic !== undefined) metaParts.push(`diag ${diagnostic.toFixed(2)}`); return { statusBadge, statusKind, metaText: metaParts.join(' · ') }; } function renderBuilderColumn(label: string, record: BuilderRecord | undefined): string { if (!record) { return `

${escapeHtml(label)}

No record for this prompt.

`; } const primary = findScore(record.feedback, 'pairwise_primary'); const diagnostic = findScore(record.feedback, 'pairwise_diagnostic'); const totalPasses = findScore(record.feedback, 'pairwise_total_passes'); const totalViolations = findScore(record.feedback, 'pairwise_total_violations'); const statusBadge = !record.success ? `BUILD ${escapeHtml(record.errorClass ?? 'error').toUpperCase()}` : primary === 1 ? 'PASS' : 'FAIL'; const metaParts: string[] = [`${formatDuration(record.durationMs)}`]; if (diagnostic !== undefined) { metaParts.push(`diag ${diagnostic.toFixed(2)}`); } if (totalPasses !== undefined && totalViolations !== undefined) { metaParts.push(`${totalPasses}p / ${totalViolations}v`); } if (record.tokenInput !== undefined && record.tokenOutput !== undefined) { metaParts.push(`${record.tokenInput}+${record.tokenOutput} tok`); } if (record.submitCalls !== undefined && record.submitCalls > 0) { metaParts.push(`submit ×${record.submitCalls}`); } if (record.toolCallErrors !== undefined && record.toolCallErrors > 0) { metaParts.push(`err ×${record.toolCallErrors}`); } const errorBlock = record.errorMessage ? `

${escapeHtml(scrubSecretsInText(record.errorMessage))}

` : ''; const idLine = record.exampleId ? `

${escapeHtml(record.exampleId)}

` : ''; return `

${escapeHtml(label)}

${statusBadge}

${idLine}

${metaParts.join(' · ')}

${errorBlock}

${renderWorkflow(record.workflow)}

${renderJudgeRows(record.feedback)}

`; } interface SideLabels { ee: string; ia: string; } function summarizeToolCallArgs(toolName: string, args: unknown): string { if (!args || typeof args !== 'object') return ''; // Redact secret-shaped keys (password / token / api_key / …) and scrub the // extracted free-form strings for known credential patterns. The same // summarized text is used as a data-attribute on the row, so any leaked // secret would persist in the HTML and be searchable via filter input. const a = redactSecrets(args) as Record; const str = (v: unknown): string => (typeof v === 'string' ? scrubSecretsInText(v) : ''); const trunc = (s: string, n = 160): string => (s.length > n ? s.slice(0, n) + '…' : s); switch (toolName) { case 'workspace_execute_command': return trunc(str(a.command)); case 'workspace_str_replace_file': return trunc(str(a.path)); case 'workspace_write_file': { const p = str(a.path); const len = typeof a.content === 'string' ? a.content.length : 0; return `${p}${len ? ` (${len} chars)` : ''}`; } case 'workspace_read_file': { const p = str(a.path); const off = typeof a.offset === 'number' ? a.offset : undefined; const lim = typeof a.limit === 'number' ? a.limit : undefined; const range = off !== undefined || lim !== undefined ? ` @${off ?? 0}+${lim ?? '∞'}` : ''; return trunc(`${p}${range}`); } case 'workspace_grep': return trunc(`${str(a.pattern)}${a.path ? 
` in ${str(a.path)}` : ''}`); case 'workspace_file_stat': { return trunc(str(a.path)); } case 'workspace_mkdir': return trunc(str(a.path)); case 'submit-workflow': return trunc(`${str(a.name)} ${str(a.filePath)}`); case 'verify-built-workflow': return trunc(str(a.workflowId) || str(a.workItemId)); case 'credentials': case 'data-tables': case 'nodes': case 'workflows': return trunc(str(a.action)); default: return trunc(scrubSecretsInText(JSON.stringify(a)), 120); } } function renderToolCallRows(traces: IAToolCallTrace[]): string { return traces .map((tc) => { const errored = isErroredIAToolCall(tc); const elapsed = tc.elapsedMs !== undefined ? formatDuration(tc.elapsedMs) : '—'; const cls = errored ? 'tc-err' : ''; const argSummary = summarizeToolCallArgs(tc.toolName, tc.args); const argCell = argSummary ? `${escapeHtml(argSummary)}` : ''; const detail = errored && tc.error ? escapeHtml(scrubSecretsInText(tc.error)) : ''; return `${tc.step}${escapeHtml(tc.toolName)}${argCell}${elapsed}${detail}`; }) .join(''); } function renderToolCallsColumn(label: string, record: BuilderRecord | undefined): string { if (!record) { return `

${escapeHtml(label)}

No record.

`; } const traces = record.toolCalls ?? []; if (traces.length === 0) { return `

${escapeHtml(label)} (0 calls)

No tool calls captured.

`; } const errCount = traces.filter(isErroredIAToolCall).length; const errBadge = errCount > 0 ? ` · ${errCount} err` : ''; return `

${escapeHtml(label)} (${traces.length} call${traces.length === 1 ? '' : 's'}${errBadge})

${renderToolCallRows(traces)}

#	Tool	Args	Time	Error

`; } function renderToolCallsSection(row: ComparisonRow, labels: SideLabels): string { const anyTraces = (row.ee?.toolCalls?.length ?? 0) > 0 || (row.ia?.toolCalls?.length ?? 0) > 0; if (!anyTraces) return ''; return `

Tool calls

${renderToolCallsColumn(labels.ee, row.ee)} ${renderToolCallsColumn(labels.ia, row.ia)}

`; } function renderRow(row: ComparisonRow, index: number, labels: SideLabels): string { const verdictLabel: Record = { 'both-pass': 'BOTH PASS', 'both-fail': 'BOTH FAIL', 'ee-only': `${labels.ee.toUpperCase()} ONLY`, 'ia-only': `${labels.ia.toUpperCase()} ONLY`, neither: '—', }; const verdictCls: Record = { 'both-pass': 'verdict-both-pass', 'both-fail': 'verdict-both-fail', 'ee-only': 'verdict-ee-only', 'ia-only': 'verdict-ia-only', neither: 'verdict-neither', }; const eeHead = buildHeadline(row.ee); const iaHead = buildHeadline(row.ia); const promptPreview = row.prompt.slice(0, 110) + (row.prompt.length > 110 ? '…' : ''); const builderChip = (label: string, head: BuilderHeadline): string => ` ${escapeHtml(label)} ${head.statusBadge} ${escapeHtml(head.metaText)} `; const ids: string[] = []; if (row.ia?.exampleId) ids.push(row.ia.exampleId); if (row.ee?.exampleId && row.ee.exampleId !== row.ia?.exampleId) ids.push(row.ee.exampleId); const idText = ids.join(' / '); const idChip = `${escapeHtml(idText)}`; // Heavy content (workflow previews + judge tables) is wrapped in a