// ---------------------------------------------------------------------------
// Generate an HTML report from all saved pairwise eval runs.
//
// Walks every run directory under the output root (`--output-root`, default
// `.output/pairwise/`), reads each run's `summary.json` + `results.jsonl`,
// and produces one HTML file with a run picker and per-example details.
// Each built workflow is embedded as an `<n8n-demo>` web component so
// reviewers can poke at the canvas inline.
//
// https://github.com/n8n-io/n8n-demo-webcomponent
//
// NOTE(review): the copy of this file that was reviewed had every `<...>`
// span removed by an HTML-stripping extraction step. Generic type arguments
// and the entity strings in the escape helpers were restored because they are
// unambiguous from usage. The markup in the render* functions was
// reconstructed around the surviving text, class names and comments; element
// names, CSS and the inline script are a best-effort rebuild and must be
// diffed against the pre-extraction file before merging.
// ---------------------------------------------------------------------------

import { jsonParse } from 'n8n-workflow';
import { promises as fs } from 'node:fs';
import path from 'node:path';

// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

/** Shape of a run's `summary.json`. */
interface SummaryJson {
  builder: string;
  dataset: string;
  judgeModel: string;
  numJudges: number;
  iterations: number;
  experimentName: string;
  startedAt: string;
  finishedAt: string;
  totals: {
    examples: number;
    runs: number;
    buildSuccess: number;
    /** Failure count keyed by error class; values are summed in `renderRun`. */
    buildFailures: Record<string, number>;
    primaryPassRate: number;
    avgDiagnostic: number;
    submitCallsTotal?: number;
    avgSubmitCalls?: number;
    toolCallsTotal?: number;
    toolCallErrors?: number;
    toolCallErrorRate?: number;
  };
  interactivity: {
    askUserCount: number;
    planToolCount: number;
    autoApprovedSuspensions: number;
    mockedCredentialTypes: string[];
  };
}

interface FeedbackEntry {
  evaluator: string;
  metric: string;
  score: number;
  kind?: string;
  comment?: string;
}

interface ToolCallSuspension {
  message?: string;
  questions?: unknown;
  severity?: string;
  autoApproved: boolean;
}

interface ToolCallTrace {
  step: number;
  toolCallId: string;
  toolName: string;
  args?: unknown;
  result?: unknown;
  error?: string;
  elapsedMs?: number;
  suspension?: ToolCallSuspension;
}

/** One line of a run's `results.jsonl`. */
interface ResultRecord {
  exampleId: string;
  iteration: number;
  prompt: string;
  dos?: string;
  donts?: string;
  workflow: unknown;
  build: {
    success: boolean;
    errorClass?: string;
    errorMessage?: string;
    durationMs: number;
    extraWorkflowCount: number;
    interactivity: {
      askUserCount: number;
      planToolCount: number;
      autoApprovedSuspensions: number;
      mockedCredentialTypes: string[];
    };
  };
  /** Optional — older runs predate the field. */
  toolCalls?: ToolCallTrace[];
  feedback: FeedbackEntry[];
}

interface Run {
  dirName: string;
  summary: SummaryJson;
  results: ResultRecord[];
}

// ---------------------------------------------------------------------------
// Discovery
// ---------------------------------------------------------------------------

/**
 * Loads every complete run found directly under `rootDir`.
 *
 * A run is a sub-directory containing both `summary.json` and
 * `results.jsonl`. Directories missing either file are skipped.
 *
 * @param rootDir Directory whose sub-directories are individual runs.
 * @returns Runs sorted by `summary.startedAt`, newest first.
 * @throws Re-throws any read error other than `ENOENT`, plus whatever
 *   `jsonParse` throws for malformed JSON.
 */
export async function loadRuns(rootDir: string): Promise<Run[]> {
  const entries = await fs.readdir(rootDir, { withFileTypes: true });
  const runs: Run[] = [];

  for (const entry of entries) {
    if (!entry.isDirectory()) continue;

    const dir = path.join(rootDir, entry.name);
    const summaryPath = path.join(dir, 'summary.json');
    const resultsPath = path.join(dir, 'results.jsonl');

    let summaryRaw: string;
    let resultsRaw: string;
    try {
      [summaryRaw, resultsRaw] = await Promise.all([
        fs.readFile(summaryPath, 'utf8'),
        fs.readFile(resultsPath, 'utf8'),
      ]);
    } catch (error) {
      // Incomplete/aborted runs lack one of the two files — skip those
      // silently. Any other read failure (permissions, I/O) should surface.
      if (isMissingFileError(error)) continue;
      throw error;
    }

    const summary = jsonParse<SummaryJson>(summaryRaw, {
      errorMessage: `Failed to parse ${summaryPath}`,
    });
    // JSON Lines: one record per non-blank line.
    const results = resultsRaw
      .split('\n')
      .filter((line) => line.trim().length > 0)
      .map((line) =>
        jsonParse<ResultRecord>(line, {
          errorMessage: `Failed to parse a line in ${resultsPath}`,
        }),
      );

    runs.push({ dirName: entry.name, summary, results });
  }

  runs.sort((a, b) => b.summary.startedAt.localeCompare(a.summary.startedAt));
  return runs;
}

/** True when `error` is an object whose `code` property is `'ENOENT'`. */
function isMissingFileError(error: unknown): boolean {
  return (
    typeof error === 'object' &&
    error !== null &&
    'code' in error &&
    (error as { code: unknown }).code === 'ENOENT'
  );
}

// ---------------------------------------------------------------------------
// Rendering
// ---------------------------------------------------------------------------

/**
 * Escapes text for use as HTML element content.
 * `&` is replaced first so the entities produced by the later replacements
 * are not escaped a second time.
 */
function escapeHtml(input: string): string {
  return input
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

/** Escapes text for use inside a quoted HTML attribute value. */
function escapeAttr(input: string): string {
  return input.replace(/&/g, '&amp;').replace(/'/g, '&#39;').replace(/"/g, '&quot;');
}

/**
 * Whether a tool call should count toward the "tool error rate" metric.
 * Mirrors `isErroredToolCall` in `pairwise.ts` — kept in sync by hand
 * because the report walks pre-saved `results.jsonl` files written by
 * older runs of the eval too.
 */
function isErroredToolCall(trace: ToolCallTrace): boolean {
  if (trace.error !== undefined) return true;

  const r = trace.result;
  if (r === null || r === undefined) return false;

  if (typeof r === 'object' && !Array.isArray(r)) {
    const obj = r as Record<string, unknown>;
    if (obj.success === false) return true;
    if (typeof obj.error === 'string' && obj.error.length > 0) return true;
    if (Array.isArray(obj.errors) && obj.errors.length > 0) return true;
  }

  // String results reporting a non-zero "Exit code: N" count as errors.
  if (typeof r === 'string' && /\bExit code:\s*[1-9]\d*\b/.test(r)) return true;
  return false;
}

/** Number of traces whose tool name is `submit-workflow`. */
function countSubmitCalls(traces: ToolCallTrace[] | undefined): number {
  if (!traces) return 0;
  return traces.filter((t) => t.toolName === 'submit-workflow').length;
}

/** Number of traces that `isErroredToolCall` classifies as errored. */
function countToolCallErrors(traces: ToolCallTrace[] | undefined): number {
  if (!traces) return 0;
  return traces.filter(isErroredToolCall).length;
}

/** Score of the first feedback entry with the given metric, if any. */
function findScore(feedback: FeedbackEntry[], metric: string): number | undefined {
  return feedback.find((f) => f.metric === metric)?.score;
}

/**
 * Renders a newline-separated criteria string as a labelled list.
 * Returns `''` when `raw` is missing or contains only blank lines.
 */
function renderCriteriaList(raw: string | undefined, kind: 'do' | 'dont'): string {
  if (!raw) return '';
  const lines = raw
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
  if (lines.length === 0) return '';

  const items = lines.map((line) => `<li>${escapeHtml(line)}</li>`).join('');
  const label = kind === 'do' ? 'Do' : "Don't";
  return `<div class="criteria criteria-${kind}"><h4>${label}</h4><ul>${items}</ul></div>`;
}

/** Renders one badge per `pairwise_*` aggregate metric present in `feedback`. */
function renderFeedbackBadges(feedback: FeedbackEntry[]): string {
  const primary = findScore(feedback, 'pairwise_primary');
  const diagnostic = findScore(feedback, 'pairwise_diagnostic');
  const judgesPassed = findScore(feedback, 'pairwise_judges_passed');
  const totalPasses = findScore(feedback, 'pairwise_total_passes');
  const totalViolations = findScore(feedback, 'pairwise_total_violations');

  const badges: string[] = [];
  if (primary !== undefined) {
    const cls = primary === 1 ? 'badge-pass' : 'badge-fail';
    badges.push(`<span class="badge ${cls}">primary ${primary}</span>`);
  }
  if (diagnostic !== undefined) {
    badges.push(`<span class="badge">diagnostic ${diagnostic.toFixed(2)}</span>`);
  }
  if (judgesPassed !== undefined) {
    badges.push(`<span class="badge">${judgesPassed} judges pass</span>`);
  }
  if (totalPasses !== undefined && totalViolations !== undefined) {
    badges.push(
      `<span class="badge">${totalPasses} passes / ${totalViolations} violations</span>`,
    );
  }
  return badges.join('');
}

/**
 * Renders a table with one row per judge entry (metrics named `judge<N>`).
 * Returns `''` when there are no such entries.
 */
function renderJudgeComments(feedback: FeedbackEntry[]): string {
  const judges = feedback.filter((f) => /^judge\d+$/.test(f.metric));
  if (judges.length === 0) return '';

  const rows = judges
    .map((j) => {
      const cls = j.score === 1 ? 'judge-pass' : 'judge-fail';
      const comment = j.comment ? escapeHtml(j.comment) : 'no violations';
      return `<tr class="${cls}"><td>${escapeHtml(j.metric)}</td><td>${j.score}</td><td>${comment}</td></tr>`;
    })
    .join('');

  return `<table class="judges"><thead><tr><th>Judge</th><th>Pass</th><th>Notes</th></tr></thead><tbody>${rows}</tbody></table>`;
}

/**
 * Formats a value for display: strings pass through, `undefined` becomes
 * `''`, everything else is pretty-printed JSON.
 */
function formatJson(value: unknown): string {
  if (value === undefined) return '';
  if (typeof value === 'string') return value;
  try {
    return JSON.stringify(value, null, 2);
  } catch {
    // Fallback when `value` is non-serialisable (e.g. has a circular ref).
    // `String(value)` may produce '[object Object]' but it's the only way
    // to surface *something* in the report instead of throwing.
    // eslint-disable-next-line @typescript-eslint/no-base-to-string
    return String(value);
  }
}

/** Renders one collapsible entry of the tool-call timeline. */
function renderToolCallEntry(trace: ToolCallTrace): string {
  const state = (label: string): string =>
    `<span class="tc-state tc-state-${label}">${label}</span>`;

  const elapsed = typeof trace.elapsedMs === 'number' ? `${trace.elapsedMs}ms` : 'pending';

  const stateBits: string[] = [];
  if (trace.error) stateBits.push(state('error'));
  else if (trace.result !== undefined) stateBits.push(state('ok'));
  else stateBits.push(state('pending'));
  if (trace.suspension) {
    stateBits.push(state(trace.suspension.autoApproved ? 'auto-approved' : 'suspended'));
  }

  const blocks: string[] = [];
  if (trace.suspension) {
    const suspParts: string[] = [];
    if (trace.suspension.message) {
      suspParts.push(`<p>${escapeHtml(trace.suspension.message)}</p>`);
    }
    if (trace.suspension.questions) {
      suspParts.push(
        `<h5>Questions asked</h5><pre>${escapeHtml(formatJson(trace.suspension.questions))}</pre>`,
      );
    }
    blocks.push(`<div class="tc-block tc-suspension">${suspParts.join('')}</div>`);
  }
  if (trace.args !== undefined) {
    blocks.push(
      `<div class="tc-block"><h5>Input</h5><pre>${escapeHtml(formatJson(trace.args))}</pre></div>`,
    );
  }
  // An error takes precedence over a result when both are present.
  if (trace.error) {
    blocks.push(
      `<div class="tc-block tc-error"><h5>Error</h5><pre>${escapeHtml(trace.error)}</pre></div>`,
    );
  } else if (trace.result !== undefined) {
    blocks.push(
      `<div class="tc-block"><h5>Output</h5><pre>${escapeHtml(formatJson(trace.result))}</pre></div>`,
    );
  }

  return `<details class="tool-call"><summary>#${trace.step} <code>${escapeHtml(trace.toolName)}</code> ${elapsed} ${stateBits.join('')}</summary>${blocks.join('')}</details>`;
}

/** Renders the ordered list of tool calls, or a placeholder when there are none. */
function renderToolCallTimeline(toolCalls: ToolCallTrace[] | undefined): string {
  if (!toolCalls || toolCalls.length === 0) {
    return '<p class="muted">No tool calls recorded.</p>';
  }
  const items = toolCalls.map(renderToolCallEntry).join('');
  return `<div class="tool-call-timeline">${items}</div>`;
}

/**
 * Renders the placeholder for a built workflow, or a note when the build
 * produced none.
 */
function renderWorkflow(workflow: unknown): string {
  if (!workflow) {
    return '<p class="muted">No workflow built.</p>';
  }
  const json = JSON.stringify(workflow);
  // Lazy mount: store the workflow on a placeholder and let the inline
  // script inject the <n8n-demo> element when the parent <details> is
  // expanded. Rendering every demo upfront kills first-paint performance.
  return `<div class="workflow-slot" data-workflow="${escapeAttr(json)}"></div>`;
}

/**
 * Renders one result record as a collapsible example row.
 *
 * @param record Record to render.
 * @param idPrefix Prefix that keeps element ids unique across runs.
 */
function renderExample(record: ResultRecord, idPrefix: string): string {
  const primary = findScore(record.feedback, 'pairwise_primary');
  const statusCls =
    record.build.success && primary === 1
      ? 'ex-pass'
      : record.build.success
        ? 'ex-partial'
        : 'ex-fail';
  const statusLabel = !record.build.success
    ? `BUILD ${record.build.errorClass ?? 'FAILED'}`
    : primary === 1
      ? 'PASS'
      : 'FAIL';
  const exampleId = `${idPrefix}-${record.exampleId}-${record.iteration}`;

  const interact = record.build.interactivity;
  const interactBits: string[] = [];
  if (interact.askUserCount > 0) interactBits.push(`ask-user ×${interact.askUserCount}`);
  if (interact.planToolCount > 0) interactBits.push(`plan ×${interact.planToolCount}`);
  if (interact.autoApprovedSuspensions > 0)
    interactBits.push(`suspend ×${interact.autoApprovedSuspensions}`);
  if (interact.mockedCredentialTypes.length > 0)
    interactBits.push(`mocked creds: ${interact.mockedCredentialTypes.join(', ')}`);

  // Per-record build-path stats. Surfaced inline in the summary line so a
  // reviewer can scan retries / errors without expanding each row. Numbers
  // match the columns added to `results.csv`.
  const submitCalls = countSubmitCalls(record.toolCalls);
  const toolErrors = countToolCallErrors(record.toolCalls);
  const buildStatBits: string[] = [];
  if (submitCalls > 0) buildStatBits.push(`submit ×${submitCalls}`);
  if (toolErrors > 0) buildStatBits.push(`err ×${toolErrors}`);

  const buildStats =
    buildStatBits.length > 0
      ? `<span class="build-stats">${buildStatBits.map(escapeHtml).join(' · ')}</span>`
      : '';
  const errorBlock = record.build.errorMessage
    ? `<pre class="build-error">${escapeHtml(record.build.errorMessage)}</pre>`
    : '';
  const interactBlock =
    interactBits.length > 0
      ? `<p class="interactivity">${interactBits.map(escapeHtml).join(' · ')}</p>`
      : '';
  const toolCallCount =
    record.toolCalls && record.toolCalls.length > 0 ? ` (${record.toolCalls.length})` : '';
  const promptPreview = record.prompt.replace(/\s+/g, ' ').trim();

  // `statusLabel` can embed `errorClass` read from disk, so it is escaped
  // like every other field.
  return `<details class="example ${statusCls}" id="${escapeAttr(exampleId)}">
<summary>
<span class="status">${escapeHtml(statusLabel)}</span>
<span class="prompt-preview">${escapeHtml(promptPreview)}</span> <code class="example-id">${escapeHtml(record.exampleId)}</code>
<span class="meta">#${record.iteration} ${record.build.durationMs}ms ${buildStats} ${renderFeedbackBadges(record.feedback)}</span>
</summary>
<div class="example-body">
<h4>Prompt</h4>
<pre class="prompt">${escapeHtml(record.prompt)}</pre>
<div class="criteria-row">${renderCriteriaList(record.dos, 'do')} ${renderCriteriaList(record.donts, 'dont')}</div>
${errorBlock} ${interactBlock}
<h4>Built workflow</h4>
${renderWorkflow(record.workflow)}
<details class="tool-calls"><summary>Tool calls${toolCallCount}</summary>
${renderToolCallTimeline(record.toolCalls)}
</details>
${renderJudgeComments(record.feedback)}
</div>
</details>`;
}

/**
 * Renders one run: metadata, totals, optional interactivity line, and all
 * examples ordered by example id, then iteration.
 *
 * @param run Run to render; not modified.
 * @param index Position of the run in the report, used for element ids.
 */
function renderRun(run: Run, index: number): string {
  const s = run.summary;
  const pct = (n: number): string => `${(n * 100).toFixed(1)}%`;
  const stat = (label: string, valueHtml: string | number): string =>
    `<span class="stat"><b>${label}:</b> ${valueHtml}</span>`;

  const totalFailures = Object.values(s.totals.buildFailures).reduce((a, b) => a + b, 0);
  const failureDetail = Object.entries(s.totals.buildFailures)
    .map(([k, v]) => `${k}: ${v}`)
    .join(', ');

  // Sort a copy: `Array.prototype.sort` is in-place and would otherwise
  // reorder the caller's `run.results`.
  const examples = [...run.results]
    .sort((a, b) =>
      a.exampleId === b.exampleId
        ? a.iteration - b.iteration
        : a.exampleId.localeCompare(b.exampleId),
    )
    .map((r) => renderExample(r, `run-${index}`))
    .join('\n');

  const meta = [
    stat('Builder', escapeHtml(s.builder)),
    stat('Dataset', escapeHtml(s.dataset)),
    stat('Judges', s.numJudges),
    stat('Judge model', escapeHtml(s.judgeModel)),
    stat('Iterations', s.iterations),
    stat('Started', escapeHtml(s.startedAt)),
    stat('Dir', escapeHtml(run.dirName)),
  ].join(' ');

  const totals: string[] = [
    stat('Examples', s.totals.examples),
    stat('Runs', s.totals.runs),
    stat('Build ok', s.totals.buildSuccess),
    stat(
      'Build fail',
      `${totalFailures}${failureDetail ? ` (${escapeHtml(failureDetail)})` : ''}`,
    ),
    stat('Primary pass rate', pct(s.totals.primaryPassRate)),
    stat('Avg diagnostic', s.totals.avgDiagnostic.toFixed(2)),
  ];
  // The tool-call and submit-call totals are optional in the summary.
  if (s.totals.toolCallErrorRate !== undefined) {
    const counts =
      s.totals.toolCallErrors !== undefined && s.totals.toolCallsTotal !== undefined
        ? ` (${s.totals.toolCallErrors}/${s.totals.toolCallsTotal})`
        : '';
    totals.push(stat('Tool error rate', `${pct(s.totals.toolCallErrorRate)}${counts}`));
  }
  if (s.totals.avgSubmitCalls !== undefined) {
    totals.push(
      stat(
        'Submit calls',
        `${s.totals.submitCallsTotal ?? 0} total, ${s.totals.avgSubmitCalls.toFixed(2)} avg/build`,
      ),
    );
  }

  const hasInteractivity =
    s.interactivity.askUserCount > 0 ||
    s.interactivity.planToolCount > 0 ||
    s.interactivity.autoApprovedSuspensions > 0 ||
    s.interactivity.mockedCredentialTypes.length > 0;
  const interactivity = hasInteractivity
    ? `<p class="run-interactivity"><b>Interactivity:</b> ask-user ×${s.interactivity.askUserCount} · plan ×${s.interactivity.planToolCount} · suspend ×${s.interactivity.autoApprovedSuspensions} · mocked creds: ${s.interactivity.mockedCredentialTypes.map(escapeHtml).join(', ') || 'none'}</p>`
    : '';

  return `<section class="run" id="run-${index}" data-run-index="${index}">
<h2>${escapeHtml(s.experimentName)}</h2>
<p class="run-meta">${meta}</p>
<p class="run-totals">${totals.join(' ')}</p>
${interactivity}
<div class="examples">
${examples}
</div>
</section>`;
}

/**
 * Renders the complete report document for the given runs.
 *
 * @param runs Runs to include, in display order.
 * @returns A full HTML document as a string.
 */
export function renderDocument(runs: Run[]): string {
  const body = runs.map((run, i) => renderRun(run, i)).join('\n');

  // The picker is only useful with more than one run; without it the inline
  // script leaves every section visible.
  const options = runs
    .map(
      (run, i) =>
        `<option value="${i}">${escapeHtml(run.summary.experimentName)} — ${escapeHtml(run.summary.startedAt)}</option>`,
    )
    .join('');
  const picker =
    runs.length > 1
      ? `<label class="run-picker">Run: <select id="run-picker">${options}</select></label>`
      : '';

  // NOTE(review): the three script URLs below follow the n8n-demo component's
  // published install snippet — confirm them against the original file.
  return `<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>Instance AI — Pairwise Eval Report</title>
<style>
body { font-family: system-ui, sans-serif; margin: 1.5rem; color: #1f2328; }
h1 { font-size: 1.4rem; }
section.run { margin-top: 1.5rem; }
.run-meta, .run-totals { display: flex; flex-wrap: wrap; gap: 0.25rem 1rem; font-size: 0.9rem; }
details.example { border: 1px solid #d0d7de; border-left-width: 4px; border-radius: 4px; margin: 0.5rem 0; padding: 0.4rem 0.6rem; }
details.ex-pass { border-left-color: #1a7f37; }
details.ex-partial { border-left-color: #bf8700; }
details.ex-fail { border-left-color: #cf222e; }
summary { cursor: pointer; }
.badge, .tc-state { display: inline-block; margin-left: 0.3rem; padding: 0 0.4rem; border-radius: 999px; background: #eaeef2; font-size: 0.8rem; }
.badge-pass, .tc-state-ok { background: #dafbe1; }
.badge-fail, .tc-state-error { background: #ffebe9; }
pre { white-space: pre-wrap; word-break: break-word; background: #f6f8fa; padding: 0.5rem; border-radius: 4px; }
table.judges { border-collapse: collapse; margin-top: 0.5rem; }
table.judges th, table.judges td { border: 1px solid #d0d7de; padding: 0.2rem 0.5rem; text-align: left; vertical-align: top; }
tr.judge-fail td { background: #ffebe9; }
.muted { color: #6e7781; }
n8n-demo { display: block; min-height: 320px; }
</style>
<script src="https://cdn.jsdelivr.net/npm/@webcomponents/webcomponentsjs@2.0.0/webcomponents-loader.js"></script>
<script src="https://www.unpkg.com/lit@2.0.0-rc.2/polyfill-support.js"></script>
<script type="module" src="https://cdn.jsdelivr.net/npm/@n8n_io/n8n-demo-component/n8n-demo.bundled.js"></script>
</head>
<body>
<h1>Instance AI — Pairwise Eval Report (${runs.length} run${runs.length === 1 ? '' : 's'})</h1>
${picker}
${body}
<script>
(function () {
  var picker = document.getElementById('run-picker');
  var sections = document.querySelectorAll('section.run');
  function showRun(index) {
    sections.forEach(function (section) {
      section.hidden = section.getAttribute('data-run-index') !== index;
    });
  }
  if (picker) {
    picker.addEventListener('change', function () { showRun(picker.value); });
    showRun(picker.value);
  }
  // 'toggle' does not bubble, so listen in the capture phase.
  document.addEventListener('toggle', function (event) {
    var details = event.target;
    if (!details || !details.open || !details.querySelectorAll) return;
    details.querySelectorAll('.workflow-slot[data-workflow]').forEach(function (slot) {
      // Mount only slots whose nearest <details> is the one just opened.
      if (slot.closest('details') !== details) return;
      var demo = document.createElement('n8n-demo');
      demo.setAttribute('workflow', slot.getAttribute('data-workflow'));
      slot.removeAttribute('data-workflow');
      slot.appendChild(demo);
    });
  }, true);
})();
</script>
</body>
</html>`;
}

// ---------------------------------------------------------------------------
// CLI
// ---------------------------------------------------------------------------

interface ReportArgs {
  outputRoot: string;
  reportFile: string;
}

/**
 * Parses `--output-root <dir>` and `--report-file <path>` from `argv`.
 * A flag whose following token is missing or starts with `--` is treated as
 * absent. Defaults: `<cwd>/.output/pairwise` and `<outputRoot>/report.html`.
 */
function parseArgs(argv: string[]): ReportArgs {
  const get = (flag: string): string | undefined => {
    const idx = argv.indexOf(flag);
    if (idx === -1) return undefined;
    const value = argv[idx + 1];
    return value && !value.startsWith('--') ? value : undefined;
  };

  const defaultRoot = path.resolve(process.cwd(), '.output', 'pairwise');
  const outputRoot = path.resolve(get('--output-root') ?? defaultRoot);
  const reportFile = path.resolve(get('--report-file') ?? path.join(outputRoot, 'report.html'));
  return { outputRoot, reportFile };
}

/** CLI entry point: load runs, render the report, write it to disk. */
async function main(): Promise<void> {
  const args = parseArgs(process.argv.slice(2));
  const runs = await loadRuns(args.outputRoot);
  if (runs.length === 0) {
    console.error(`No runs found under ${args.outputRoot}`);
    process.exit(1);
  }

  const html = renderDocument(runs);
  await fs.writeFile(args.reportFile, html, 'utf8');
  console.log(`Wrote ${runs.length} run(s) to ${args.reportFile}`);
}

// Run only when executed directly, not when imported for its exports.
if (require.main === module) {
  main().catch((error: unknown) => {
    console.error(error instanceof Error ? (error.stack ?? error.message) : String(error));
    process.exit(1);
  });
}