n8n/packages/@n8n/instance-ai/evaluations/cli/compare-pairwise.ts
Mutasem Aldmour fdceec21b9
feat: Add pairwise workflow eval pipeline (#29123)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: Jaakko Husso <jaakko@n8n.io>
2026-05-04 13:26:27 +00:00

961 lines
36 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// ---------------------------------------------------------------------------
// Side-by-side comparison report for two pairwise eval runs
// (typically: ai-workflow-builder.ee vs instance-ai).
//
// Usage:
// pnpm tsx evaluations/cli/compare-pairwise.ts \
// --ee-dir ../ai-workflow-builder.ee/evaluations/.output/pairwise/<ts> \
// --ia-dir .output/pairwise/<ts> \
// --out .output/pairwise/comparison.html
//
// Both directories must contain a `summary.json`. Per-example data layouts
// differ between the builders, so the loaders below normalize into a shared
// `BuilderRecord` shape, joined by prompt text.
// ---------------------------------------------------------------------------
import { jsonParse } from 'n8n-workflow';
import { promises as fs } from 'node:fs';
import path from 'node:path';
// ---------------------------------------------------------------------------
// Shared shape after normalization
// ---------------------------------------------------------------------------
/** One evaluator/judge score attached to a built workflow, in the shape shared by both builders. */
export interface FeedbackEntry {
// Metric name; looked up by exact string match (e.g. `pairwise_primary`, `judge1`).
metric: string;
score: number;
// NOTE(review): the meaning of `kind` is not visible in this file — confirm against the producers.
kind?: string;
// Free-text note; shown in the judge table's "Notes" column when present.
comment?: string;
}
/** Per-example result, normalized from either builder's on-disk layout. */
export interface BuilderRecord {
// Prompt text; also the cross-builder join key (after whitespace normalization).
prompt: string;
/** Stable id for the example. For IA, the LangSmith dataset example id;
* for EE, the example directory name (e.g. `example-000-ab12cd`). */
exampleId?: string;
// Criteria text, rendered one list item per non-blank line. Only the IA loader
// populates these today; the EE extractors are stubs.
dos?: string;
donts?: string;
// Parsed workflow JSON handed to the preview as-is; a falsy value renders "No workflow built."
workflow: unknown;
durationMs: number;
// Whether the build produced a workflow — separate from whether the eval passed.
success: boolean;
// Bucket key for the failure breakdown when `success` is false (defaults to 'error' when absent).
errorClass?: string;
errorMessage?: string;
feedback: FeedbackEntry[];
tokenInput?: number;
tokenOutput?: number;
}
/** Run-level metadata and aggregate totals for one builder, as shown in its summary card. */
interface BuilderSummary {
label: string;
dataset?: string;
judgeModel?: string;
numJudges?: number;
startedAt?: string;
finishedAt?: string;
totals: {
examples: number;
// Number of examples for which a workflow was built.
buildSuccess: number;
// Failure counts keyed by error class.
buildFailures: Record<string, number>;
// Fraction (0..1) of records whose `pairwise_primary` score is exactly 1.
primaryPassRate: number;
// Mean of the finite `pairwise_diagnostic` scores; 0 when there are none.
avgDiagnostic: number;
// Mean build duration over all records, failures included; may be fractional.
avgDurationMs: number;
};
}
/** Everything loaded for one builder: its summary plus the normalized per-example records. */
interface BuilderRun {
summary: BuilderSummary;
records: BuilderRecord[];
}
// ---------------------------------------------------------------------------
// Instance AI loader (writes results.jsonl + workflows/<id>.json + summary.json)
// ---------------------------------------------------------------------------
/** Shape of one JSON line in the instance-ai `results.jsonl` file, as read by the loader. */
interface IAResultRecord {
exampleId: string;
// The loader keeps only records whose iteration is exactly 1.
iteration: number;
prompt: string;
dos?: string;
donts?: string;
workflow: unknown;
// Build outcome; flattened into the top-level BuilderRecord fields during normalization.
build: {
success: boolean;
errorClass?: string;
errorMessage?: string;
durationMs: number;
tokenUsage?: { input?: number; output?: number };
};
feedback: Array<{ metric: string; score: number; kind?: string; comment?: string }>;
}
/**
 * Shape of the instance-ai `summary.json`. The loader copies the metadata fields
 * (builder, dataset, judge info, timestamps) and recomputes the totals itself
 * from the filtered records rather than trusting `totals` here.
 */
interface IASummary {
builder: string;
dataset: string;
judgeModel: string;
numJudges: number;
startedAt: string;
finishedAt: string;
totals: {
examples: number;
buildSuccess: number;
buildFailures: Record<string, number>;
primaryPassRate: number;
avgDiagnostic: number;
};
}
/**
 * Loads an instance-ai pairwise run from `dir`.
 *
 * Reads `summary.json` (metadata only) and `results.jsonl` (one record per
 * non-blank line), keeps iteration-1 records, flattens them into
 * `BuilderRecord`s and recomputes every total from that filtered set so the
 * summary agrees with the records that get rendered.
 *
 * @param dir Directory containing `summary.json` and `results.jsonl`.
 * @returns The normalized records plus a summary derived from them.
 * @throws If either file cannot be read or any JSON fails to parse.
 */
async function loadInstanceAiRun(dir: string): Promise<BuilderRun> {
	const summaryPath = path.join(dir, 'summary.json');
	const resultsPath = path.join(dir, 'results.jsonl');
	const [summaryText, resultsText] = await Promise.all([
		fs.readFile(summaryPath, 'utf8'),
		fs.readFile(resultsPath, 'utf8'),
	]);
	const summary = jsonParse<IASummary>(summaryText, {
		errorMessage: `Failed to parse ${summaryPath}`,
	});

	// Phase 1: parse every non-blank line, so a malformed line fails the load
	// before any normalization happens.
	const parsedLines: IAResultRecord[] = [];
	for (const line of resultsText.split('\n')) {
		if (line.trim().length === 0) continue;
		parsedLines.push(
			jsonParse<IAResultRecord>(line, {
				errorMessage: `Failed to parse a line in ${resultsPath}`,
			}),
		);
	}

	// Phase 2: keep iteration 1 only (fair 1:1 comparison) and flatten `build`.
	const normalized: BuilderRecord[] = [];
	for (const entry of parsedLines) {
		if (entry.iteration !== 1) continue;
		normalized.push({
			prompt: entry.prompt,
			exampleId: entry.exampleId,
			dos: entry.dos,
			donts: entry.donts,
			workflow: entry.workflow,
			durationMs: entry.build.durationMs,
			success: entry.build.success,
			errorClass: entry.build.errorClass,
			errorMessage: entry.build.errorMessage,
			feedback: entry.feedback,
			tokenInput: entry.build.tokenUsage?.input,
			tokenOutput: entry.build.tokenUsage?.output,
		});
	}

	// Phase 3: one pass over the filtered records to accumulate every total.
	let durationSum = 0;
	let buildSuccess = 0;
	let primaryPasses = 0;
	let diagnosticSum = 0;
	let diagnosticCount = 0;
	const buildFailures: Record<string, number> = {};
	for (const rec of normalized) {
		durationSum += rec.durationMs;
		if (rec.success) {
			buildSuccess += 1;
		} else {
			const bucket = rec.errorClass ?? 'error';
			buildFailures[bucket] = (buildFailures[bucket] ?? 0) + 1;
		}
		if (findScore(rec.feedback, 'pairwise_primary') === 1) primaryPasses += 1;
		const diagnostic = findScore(rec.feedback, 'pairwise_diagnostic');
		if (diagnostic !== undefined && Number.isFinite(diagnostic)) {
			diagnosticSum += diagnostic;
			diagnosticCount += 1;
		}
	}

	const total = normalized.length;
	const avgDurationMs = total === 0 ? 0 : durationSum / total;
	const primaryPassRate = total === 0 ? 0 : primaryPasses / total;
	// Averaged over records that actually carry a diagnostic score.
	const avgDiagnostic = diagnosticCount === 0 ? 0 : diagnosticSum / diagnosticCount;

	return {
		summary: {
			label: `${summary.builder} (instance-ai)`,
			dataset: summary.dataset,
			judgeModel: summary.judgeModel,
			numJudges: summary.numJudges,
			startedAt: summary.startedAt,
			finishedAt: summary.finishedAt,
			totals: {
				examples: total,
				buildSuccess,
				buildFailures,
				primaryPassRate,
				avgDiagnostic,
				avgDurationMs,
			},
		},
		records: normalized,
	};
}
// ---------------------------------------------------------------------------
// EE loader (writes example-NNN-HASH/{prompt.txt, workflow.json, feedback.json}
// + summary.json with an aggregate `evaluatorAverages`).
// ---------------------------------------------------------------------------
/** Shape of an EE example's `feedback.json`, as read by the EE loader. */
interface EEFeedbackJson {
index: number;
// The loader treats 'error' as "workflow never built"; other values count as built
// provided a workflow file exists.
status: string;
durationMs: number;
generationDurationMs?: number;
// Copied to the record's tokenInput / tokenOutput.
generationInputTokens?: number;
generationOutputTokens?: number;
score?: number;
// Nested per-evaluator feedback; only consulted when `allFeedback` is absent.
evaluators?: Array<{
name: string;
feedback: Array<{
key: string;
metric: string;
score: number;
kind?: string;
comment?: string;
}>;
averageScore?: number;
}>;
// Flat feedback list; preferred by the loader when present.
allFeedback?: Array<{
evaluator: string;
metric: string;
score: number;
kind?: string;
comment?: string;
}>;
}
/** Shape of the EE run's `summary.json`. The file is optional for the loader. */
interface EESummaryJson {
// Reported as the run's start time in the comparison summary.
timestamp?: string;
totalExamples: number;
passed: number;
failed: number;
errors: number;
passRate: number;
averageScore?: number;
evaluatorAverages?: Record<string, number>;
totalDurationMs?: number;
}
/**
 * Loads a Code Builder (EE) pairwise run from `dir`.
 *
 * Each `example-*` sub-directory contributes one record, built from its
 * `prompt.txt`, `workflow.json`, `feedback.json` and optional `error.txt`.
 * Directories without a readable, non-empty prompt are skipped.
 *
 * All totals are derived from the records that were actually loaded, the same
 * way the instance-ai loader does it. `summary.json` is optional and only
 * supplies the run timestamp. Mixing its counts with record-derived numbers
 * could render inconsistent cards (e.g. more "built ok" than records shown).
 *
 * @param dir Output directory of an EE pairwise run.
 * @returns The normalized records plus a summary derived from them.
 * @throws If `dir` cannot be listed or any present JSON file fails to parse.
 */
async function loadEERun(dir: string): Promise<BuilderRun> {
	const summaryPath = path.join(dir, 'summary.json');
	const summaryRaw = await readOptional(summaryPath);
	const summary = summaryRaw
		? jsonParse<EESummaryJson>(summaryRaw, { errorMessage: `Failed to parse ${summaryPath}` })
		: null;

	const entries = await fs.readdir(dir, { withFileTypes: true });
	const exampleDirs = entries
		.filter((e) => e.isDirectory() && e.name.startsWith('example-'))
		.map((e) => path.join(dir, e.name));

	const records: BuilderRecord[] = [];
	for (const exampleDir of exampleDirs) {
		const promptPath = path.join(exampleDir, 'prompt.txt');
		const workflowPath = path.join(exampleDir, 'workflow.json');
		const feedbackPath = path.join(exampleDir, 'feedback.json');
		const errorPath = path.join(exampleDir, 'error.txt');

		// The prompt is the join key, so an example without one cannot be paired.
		const prompt = await readOptional(promptPath);
		if (!prompt) continue;

		const [workflowRaw, feedbackRaw, errorRaw] = await Promise.all([
			readOptional(workflowPath),
			readOptional(feedbackPath),
			readOptional(errorPath),
		]);
		const workflow = workflowRaw
			? jsonParse<unknown>(workflowRaw, { errorMessage: `Failed to parse ${workflowPath}` })
			: null;
		const feedbackJson = feedbackRaw
			? jsonParse<EEFeedbackJson>(feedbackRaw, {
					errorMessage: `Failed to parse ${feedbackPath}`,
				})
			: null;
		const exampleId = path.basename(exampleDir);

		const feedback: FeedbackEntry[] = [];
		// Prefer `allFeedback` (flat list, matches IA shape), fall back to nested evaluators.
		if (feedbackJson?.allFeedback) {
			for (const f of feedbackJson.allFeedback) {
				feedback.push({ metric: f.metric, score: f.score, kind: f.kind, comment: f.comment });
			}
		} else if (feedbackJson?.evaluators) {
			for (const ev of feedbackJson.evaluators) {
				for (const f of ev.feedback) {
					feedback.push({ metric: f.metric, score: f.score, kind: f.kind, comment: f.comment });
				}
			}
		}

		// EE status: 'pass' | 'fail' | 'error'. Only 'error' means the workflow
		// was never built — 'fail' means it was built but the eval marked it
		// non-passing. We separate those: `success` = workflow exists.
		const status = feedbackJson?.status ?? 'unknown';
		const success = status !== 'error' && workflow !== null;
		const errorClass = status === 'error' ? 'error' : success ? undefined : status;

		records.push({
			prompt,
			exampleId,
			dos: extractDosFromPrompt(prompt) ?? undefined,
			donts: extractDontsFromPrompt(prompt) ?? undefined,
			workflow,
			durationMs: feedbackJson?.durationMs ?? 0,
			success,
			errorClass,
			errorMessage: errorRaw ?? undefined,
			feedback,
			tokenInput: feedbackJson?.generationInputTokens,
			tokenOutput: feedbackJson?.generationOutputTokens,
		});
	}

	const avgDuration =
		records.length === 0 ? 0 : records.reduce((sum, r) => sum + r.durationMs, 0) / records.length;
	const primaryPassCount = records.filter(
		(r) => findScore(r.feedback, 'pairwise_primary') === 1,
	).length;
	const diagnosticScores = records
		.map((r) => findScore(r.feedback, 'pairwise_diagnostic'))
		.filter((v): v is number => v !== undefined && Number.isFinite(v));
	const avgDiagnostic =
		diagnosticScores.length === 0
			? 0
			: diagnosticScores.reduce((a, b) => a + b, 0) / diagnosticScores.length;

	// Failure breakdown and success count come from the same pass, so they
	// always add up to `records.length`.
	const buildFailures: Record<string, number> = {};
	let buildSuccessCount = 0;
	for (const r of records) {
		if (r.success) {
			buildSuccessCount += 1;
			continue;
		}
		const key = r.errorClass ?? 'error';
		buildFailures[key] = (buildFailures[key] ?? 0) + 1;
	}

	return {
		summary: {
			label: 'Code Builder',
			startedAt: summary?.timestamp,
			totals: {
				examples: records.length,
				buildSuccess: buildSuccessCount,
				buildFailures,
				primaryPassRate: records.length === 0 ? 0 : primaryPassCount / records.length,
				avgDiagnostic,
				avgDurationMs: avgDuration,
			},
		},
		records,
	};
}
/**
 * Best-effort UTF-8 file read.
 *
 * @param filePath File to read.
 * @returns The file contents, or `null` when the read fails for any reason.
 */
async function readOptional(filePath: string): Promise<string | null> {
	let contents: string | null = null;
	try {
		contents = await fs.readFile(filePath, 'utf8');
	} catch {
		// Any read failure is reported as "no content"; `contents` stays null.
	}
	return contents;
}
// EE prompts in `notion-pairwise-workflows` don't carry dos/donts text — those
// are LangSmith inputs, not in prompt.txt. The stubs below return null (which
// the EE loader maps to undefined) so that the IA criteria, which we do have,
// drive the rendering. They are placeholders in case we later hand-encode
// criteria into prompt.txt.
/**
 * Placeholder extractor for "do" criteria: ignores its argument and always
 * returns `null`.
 */
function extractDosFromPrompt(_prompt: string): string | null {
return null;
}
/**
 * Placeholder extractor for "don't" criteria: ignores its argument and always
 * returns `null`.
 */
function extractDontsFromPrompt(_prompt: string): string | null {
return null;
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Looks up the score of the first feedback entry whose metric matches exactly.
 *
 * @param feedback Entries to search, in order.
 * @param metric Metric name to match.
 * @returns The matching entry's score, or `undefined` when no entry matches.
 */
function findScore(feedback: FeedbackEntry[], metric: string): number | undefined {
	for (const entry of feedback) {
		if (entry.metric === metric) return entry.score;
	}
	return undefined;
}
/**
 * Escapes text for use as HTML element content.
 *
 * Replaces `&`, `<`, `>`, `"` and `'` with their entity forms in a single pass.
 *
 * @param input Raw text.
 * @returns The escaped text.
 */
function escapeHtml(input: string): string {
	return input.replace(/[&<>"']/g, (ch) => {
		switch (ch) {
			case '&':
				return '&amp;';
			case '<':
				return '&lt;';
			case '>':
				return '&gt;';
			case '"':
				return '&quot;';
			default:
				// The only remaining character the pattern can match is the apostrophe.
				return '&#39;';
		}
	});
}
/**
 * Escapes text for use inside a quoted HTML attribute value.
 *
 * Replaces `&`, `'` and `"` with entities; every other character is left as-is.
 *
 * @param input Raw attribute text.
 * @returns The escaped text.
 */
function escapeAttr(input: string): string {
	let escaped = '';
	for (const ch of input) {
		if (ch === '&') escaped += '&amp;';
		else if (ch === "'") escaped += '&apos;';
		else if (ch === '"') escaped += '&quot;';
		else escaped += ch;
	}
	return escaped;
}
/**
 * Formats a millisecond duration for display.
 *
 * - under one second: whole milliseconds, e.g. `333ms`
 * - under one minute: seconds with one decimal, e.g. `1.5s`
 * - otherwise: minutes and zero-padded whole seconds, e.g. `2m05s`
 *
 * The input is rounded to the nearest millisecond first. Callers pass averages,
 * which are usually fractional and would otherwise print as
 * `333.3333333333333ms`.
 *
 * @param ms Duration in milliseconds; may be fractional.
 * @returns The formatted duration.
 */
function formatDuration(ms: number): string {
	const wholeMs = Math.round(ms);
	if (wholeMs < 1000) return `${wholeMs}ms`;
	if (wholeMs < 60_000) return `${(wholeMs / 1000).toFixed(1)}s`;
	const minutes = Math.floor(wholeMs / 60_000);
	const seconds = Math.floor((wholeMs % 60_000) / 1000);
	return `${minutes}m${seconds.toString().padStart(2, '0')}s`;
}
/**
 * Formats a fraction as a percentage with one decimal place.
 *
 * @param n Fraction, where 1 means 100%.
 * @returns e.g. `pct(0.5)` gives `"50.0%"`.
 */
function pct(n: number): string {
	const percentage = n * 100;
	return percentage.toFixed(1) + '%';
}
// ---------------------------------------------------------------------------
// Pairing
// ---------------------------------------------------------------------------
/** One prompt with the record each builder produced for it (either side may be absent). */
export interface ComparisonRow {
// Prompt text of the first record that created the row (not whitespace-normalized).
prompt: string;
// Criteria text; filled from the IA record when it carries any.
dos?: string;
donts?: string;
ee?: BuilderRecord;
ia?: BuilderRecord;
// 'neither' is only the initial placeholder; `pairRecords` overwrites it with
// one of the other four values for every row it returns.
verdict: 'both-pass' | 'both-fail' | 'ee-only' | 'ia-only' | 'neither';
}
/**
 * Builds the join key for a prompt.
 *
 * EE and IA identify examples with different schemes, so records are matched
 * on prompt text. Leading/trailing whitespace is dropped and every internal
 * whitespace run becomes a single space, so CRLF vs LF, trailing spaces or
 * indentation differences do not un-pair otherwise identical prompts.
 *
 * @param prompt Raw prompt text.
 * @returns The whitespace-normalized key.
 */
export function promptJoinKey(prompt: string): string {
	const words = prompt.trim().split(/\s+/);
	return words.join(' ');
}
/**
 * Joins EE and IA records by normalized prompt and classifies each pair.
 *
 * A row is created per distinct join key. When several records on the same
 * side share a key, the last one wins. Criteria text (`dos`/`donts`) is taken
 * from the IA record whenever it has any. A side "passes" when its record
 * exists, built successfully and has a `pairwise_primary` score of exactly 1;
 * a missing side therefore counts as not passing.
 *
 * @param ee Records from the Code Builder (EE) run.
 * @param ia Records from the instance-ai run.
 * @returns Rows ordered ee-only, ia-only, both-fail, both-pass, then by prompt.
 */
export function pairRecords(ee: BuilderRecord[], ia: BuilderRecord[]): ComparisonRow[] {
	const rowsByKey = new Map<string, ComparisonRow>();

	// Fetch the row for a prompt, creating it on first sight.
	const rowFor = (prompt: string): ComparisonRow => {
		const key = promptJoinKey(prompt);
		let row = rowsByKey.get(key);
		if (!row) {
			row = { prompt, verdict: 'neither' };
			rowsByKey.set(key, row);
		}
		return row;
	};

	for (const record of ee) {
		rowFor(record.prompt).ee = record;
	}
	for (const record of ia) {
		const row = rowFor(record.prompt);
		row.ia = record;
		// IA carries the dos/donts text, so it is the source of truth for criteria.
		if (record.dos) row.dos = record.dos;
		if (record.donts) row.donts = record.donts;
	}

	const passed = (record: BuilderRecord | undefined): boolean =>
		Boolean(record && record.success && findScore(record.feedback, 'pairwise_primary') === 1);

	const rows = Array.from(rowsByKey.values());
	for (const row of rows) {
		const eePass = passed(row.ee);
		const iaPass = passed(row.ia);
		if (eePass && iaPass) {
			row.verdict = 'both-pass';
		} else if (eePass) {
			row.verdict = 'ee-only';
		} else if (iaPass) {
			row.verdict = 'ia-only';
		} else {
			row.verdict = 'both-fail';
		}
	}

	// Disagreements first, then shared failures, then shared passes.
	const rank: Record<ComparisonRow['verdict'], number> = {
		'ee-only': 0,
		'ia-only': 1,
		'both-fail': 2,
		'both-pass': 3,
		neither: 4,
	};
	return rows.sort((left, right) => {
		const byVerdict = rank[left.verdict] - rank[right.verdict];
		return byVerdict !== 0 ? byVerdict : left.prompt.localeCompare(right.prompt);
	});
}
// ---------------------------------------------------------------------------
// Rendering
// ---------------------------------------------------------------------------
/**
 * Renders newline-separated criteria text as a titled HTML list.
 *
 * @param raw Criteria text, one criterion per line; blank lines are ignored.
 * @param kind `'do'` or `'dont'`; selects the heading and the CSS class.
 * @returns The criteria block, or `''` when there is nothing to show.
 */
function renderCriteriaList(raw: string | undefined, kind: 'do' | 'dont'): string {
	if (!raw) return '';

	let items = '';
	for (const rawLine of raw.split('\n')) {
		const line = rawLine.trim();
		if (line.length === 0) continue;
		items += `<li>${escapeHtml(line)}</li>`;
	}
	// No non-blank line produced an item.
	if (items === '') return '';

	let label: string;
	if (kind === 'do') {
		label = 'Do';
	} else {
		label = "Don't";
	}
	return `<div class="criteria ${kind}"><h4>${label}</h4><ul>${items}</ul></div>`;
}
/**
 * Renders the workflow preview element.
 *
 * @param workflow Parsed workflow JSON; any falsy value means "nothing built".
 * @returns An `<n8n-demo>` element carrying the serialized workflow in its
 *   `workflow` attribute, or a placeholder `<div>` when there is no workflow.
 */
function renderWorkflow(workflow: unknown): string {
	if (workflow) {
		const serialized = JSON.stringify(workflow);
		const attrValue = escapeAttr(serialized);
		return `<n8n-demo workflow="${attrValue}" frame="true" clicktointeract="true" collapseformobile="true"></n8n-demo>`;
	}
	return '<div class="no-workflow">No workflow built.</div>';
}
/**
 * Renders the per-judge results table.
 *
 * Only feedback entries whose metric is `judge` followed by digits
 * (e.g. `judge1`) are included.
 *
 * @param feedback All feedback entries for one record.
 * @returns The table HTML, or `''` when there are no judge entries.
 */
function renderJudgeRows(feedback: FeedbackEntry[]): string {
	const judgeMetric = /^judge\d+$/;
	let body = '';
	let judgeCount = 0;
	for (const entry of feedback) {
		if (!judgeMetric.test(entry.metric)) continue;
		judgeCount += 1;
		const cellClass = entry.score === 1 ? 'judge-pass' : 'judge-fail';
		// An empty or missing comment is shown as "no violations".
		const notes = entry.comment ? escapeHtml(entry.comment) : '<em>no violations</em>';
		body += `<tr><td class="${cellClass}">${escapeHtml(entry.metric)}</td><td>${entry.score}</td><td>${notes}</td></tr>`;
	}
	if (judgeCount === 0) return '';
	return `<table class="judges"><thead><tr><th>Judge</th><th>Pass</th><th>Notes</th></tr></thead><tbody>${body}</tbody></table>`;
}
/** Compact per-builder status shown as a chip in a row's collapsed summary line. */
interface BuilderHeadline {
// Ready-to-insert HTML for the status badge (already escaped where needed).
statusBadge: string;
// Drives the chip's CSS class (`chip-pass` / `chip-fail` / `chip-missing`).
statusKind: 'pass' | 'fail' | 'missing';
// Plain text (escaped by the caller): the build duration and, when present,
// the diagnostic score, joined with ' · '. '—' when there is no record.
metaText: string;
}
/**
 * Summarizes one builder's record for the collapsed row header.
 *
 * @param record The builder's record for the prompt, if it produced one.
 * @returns Badge HTML, a status kind for styling, and a short meta string.
 *   A missing record yields an "N/A" badge; a failed build yields a
 *   "BUILD <ERRORCLASS>" badge; otherwise PASS/FAIL follows `pairwise_primary`.
 */
function buildHeadline(record: BuilderRecord | undefined): BuilderHeadline {
	if (!record) {
		return {
			statusBadge: '<span class="status status-missing">N/A</span>',
			statusKind: 'missing',
			metaText: '—',
		};
	}

	let statusBadge: string;
	let statusKind: BuilderHeadline['statusKind'];
	if (!record.success) {
		// Build failures are reported with their error class rather than FAIL.
		statusBadge = `<span class="status status-fail">BUILD ${escapeHtml(record.errorClass ?? 'error').toUpperCase()}</span>`;
		statusKind = 'fail';
	} else if (findScore(record.feedback, 'pairwise_primary') === 1) {
		statusBadge = '<span class="status status-pass">PASS</span>';
		statusKind = 'pass';
	} else {
		statusBadge = '<span class="status status-fail">FAIL</span>';
		statusKind = 'fail';
	}

	let metaText = formatDuration(record.durationMs);
	const diagnosticScore = findScore(record.feedback, 'pairwise_diagnostic');
	if (diagnosticScore !== undefined) {
		metaText += ` · diag ${diagnosticScore.toFixed(2)}`;
	}
	return { statusBadge, statusKind, metaText };
}
/**
 * Renders one builder's detail column for an expanded row: header with status
 * badge, example id, meta line, error text, workflow preview and judge table.
 *
 * @param label Column heading (builder name).
 * @param record The builder's record for the prompt, if it produced one.
 * @returns The column HTML; a "No record for this prompt." stub when `record`
 *   is missing.
 */
function renderBuilderColumn(label: string, record: BuilderRecord | undefined): string {
	const safeLabel = escapeHtml(label);
	if (!record) {
		return `<div class="builder-col missing"><div class="builder-label">${safeLabel}</div><div class="missing-msg">No record for this prompt.</div></div>`;
	}

	const scoreOf = (metric: string): number | undefined => findScore(record.feedback, metric);

	let statusBadge: string;
	if (!record.success) {
		statusBadge = `<span class="status status-fail">BUILD ${escapeHtml(record.errorClass ?? 'error').toUpperCase()}</span>`;
	} else if (scoreOf('pairwise_primary') === 1) {
		statusBadge = '<span class="status status-pass">PASS</span>';
	} else {
		statusBadge = '<span class="status status-fail">FAIL</span>';
	}

	// Meta line: duration is always shown; the rest only when the data exists.
	const meta: string[] = [`<span>${formatDuration(record.durationMs)}</span>`];
	const diagnosticScore = scoreOf('pairwise_diagnostic');
	if (diagnosticScore !== undefined) {
		meta.push(`<span>diag ${diagnosticScore.toFixed(2)}</span>`);
	}
	const passCount = scoreOf('pairwise_total_passes');
	const violationCount = scoreOf('pairwise_total_violations');
	if (passCount !== undefined && violationCount !== undefined) {
		meta.push(`<span>${passCount}p / ${violationCount}v</span>`);
	}
	if (record.tokenInput !== undefined && record.tokenOutput !== undefined) {
		meta.push(`<span>${record.tokenInput}+${record.tokenOutput} tok</span>`);
	}

	let errorBlock = '';
	if (record.errorMessage) {
		errorBlock = `<div class="error">${escapeHtml(record.errorMessage)}</div>`;
	}
	let idLine = '';
	if (record.exampleId) {
		idLine = `<div class="builder-id" title="${escapeAttr(record.exampleId)}">${escapeHtml(record.exampleId)}</div>`;
	}

	// One array element per output line; empty optional parts still occupy a line.
	return [
		'<div class="builder-col">',
		'<div class="builder-header">',
		`<div class="builder-label">${safeLabel}</div>`,
		statusBadge,
		'</div>',
		idLine,
		`<div class="builder-meta">${meta.join(' · ')}</div>`,
		errorBlock,
		`<div class="workflow-wrap">${renderWorkflow(record.workflow)}</div>`,
		renderJudgeRows(record.feedback),
		'</div>',
	].join('\n');
}
/**
 * Renders one comparison row as a collapsible `<details>` element.
 *
 * The summary line shows the verdict, the example id(s), a prompt preview
 * (first 110 characters) and one status chip per builder. The body shows the
 * full prompt and criteria; the two builder columns are emitted inside a
 * `<template>` within a `.lazy-slot` so they are not instantiated up front.
 *
 * @param row The paired records and verdict for one prompt.
 * @param index Row position, used for the element id (`row-<index>`).
 * @returns The row HTML.
 */
function renderRow(row: ComparisonRow, index: number): string {
	// Per-verdict display label and CSS class.
	const verdictDisplay: Record<ComparisonRow['verdict'], { label: string; cls: string }> = {
		'both-pass': { label: 'BOTH PASS', cls: 'verdict-both-pass' },
		'both-fail': { label: 'BOTH FAIL', cls: 'verdict-both-fail' },
		'ee-only': { label: 'CODE ONLY', cls: 'verdict-ee-only' },
		'ia-only': { label: 'IA ONLY', cls: 'verdict-ia-only' },
		neither: { label: '—', cls: 'verdict-neither' },
	};
	const display = verdictDisplay[row.verdict];

	const previewLimit = 110;
	let promptPreview = row.prompt.slice(0, previewLimit);
	if (row.prompt.length > previewLimit) promptPreview += '…';

	const renderChip = (label: string, head: BuilderHeadline): string =>
		[
			`<span class="builder-chip chip-${head.statusKind}">`,
			`<span class="chip-label">${escapeHtml(label)}</span>`,
			head.statusBadge,
			`<span class="chip-meta">${escapeHtml(head.metaText)}</span>`,
			'</span>',
		].join('\n');

	// IA id first; the EE id is appended only when it differs.
	const iaId = row.ia?.exampleId;
	const eeId = row.ee?.exampleId;
	const exampleIds: string[] = [];
	if (iaId) exampleIds.push(iaId);
	if (eeId && eeId !== iaId) exampleIds.push(eeId);
	const idText = exampleIds.join(' / ');
	const idChip = `<span class="example-id" title="${escapeAttr(idText)}">${escapeHtml(idText)}</span>`;

	// Heavy content (workflow previews + judge tables) sits inside a <template>
	// so the n8n-demo web component is not instantiated until the row is
	// expanded and the template content is swapped into the live DOM.
	return [
		`<details class="row ${display.cls}" id="row-${index}">`,
		'<summary>',
		`<span class="verdict">${display.label}</span>`,
		idChip,
		`<span class="prompt-preview">${escapeHtml(promptPreview)}</span>`,
		'<span class="builder-chips">',
		renderChip('Code', buildHeadline(row.ee)),
		renderChip('IA', buildHeadline(row.ia)),
		'</span>',
		'</summary>',
		'<div class="body">',
		'<section class="prompt-block">',
		'<h3>Prompt</h3>',
		`<pre>${escapeHtml(row.prompt)}</pre>`,
		'</section>',
		'<section class="criteria-row">',
		renderCriteriaList(row.dos, 'do'),
		renderCriteriaList(row.donts, 'dont'),
		'</section>',
		'<div class="lazy-slot" data-loaded="false">',
		'<template>',
		'<div class="builder-grid">',
		renderBuilderColumn('Code Builder', row.ee),
		renderBuilderColumn('instance-ai', row.ia),
		'</div>',
		'</template>',
		'<div class="lazy-placeholder">Click to load workflow previews and judge details…</div>',
		'</div>',
		'</div>',
		'</details>',
	].join('\n');
}
/**
 * Renders a builder's summary card.
 *
 * The primary pass rate is recomputed here from `records` over `totalRecords`;
 * the remaining metrics come from `summary.totals`. Optional metadata lines
 * (dataset, judge, start time, failures) are emitted only when present.
 *
 * @param label Card heading.
 * @param summary Run metadata and totals.
 * @param totalRecords Denominator for the pass rate and the "built ok" ratio.
 * @param records Records used to count primary passes.
 * @returns The card HTML.
 */
function renderSummaryCard(
	label: string,
	summary: BuilderSummary,
	totalRecords: number,
	records: BuilderRecord[],
): string {
	const { totals } = summary;

	const failureParts: string[] = [];
	for (const [errorClass, count] of Object.entries(totals.buildFailures)) {
		failureParts.push(`${errorClass}: ${count}`);
	}
	const failureBits = failureParts.join(', ');

	let primaryPasses = 0;
	for (const record of records) {
		if (findScore(record.feedback, 'pairwise_primary') === 1) primaryPasses += 1;
	}
	const overallPassRate = totalRecords === 0 ? 0 : primaryPasses / totalRecords;

	// Optional lines collapse to an empty string but keep their line in the output.
	const datasetRow = summary.dataset
		? `<div class="meta-row">Dataset: <code>${escapeHtml(summary.dataset)}</code></div>`
		: '';
	const judgeRow = summary.judgeModel
		? `<div class="meta-row">Judge: ${escapeHtml(summary.judgeModel)} × ${summary.numJudges ?? 1}</div>`
		: '';
	const startedRow = summary.startedAt
		? `<div class="meta-row">Started: ${escapeHtml(summary.startedAt)}</div>`
		: '';
	const failuresRow = failureBits
		? `<div class="meta-row failures">Failures: ${escapeHtml(failureBits)}</div>`
		: '';

	return [
		'<div class="summary-card">',
		`<h2>${escapeHtml(label)}</h2>`,
		datasetRow,
		judgeRow,
		startedRow,
		`<div class="metric"><strong>${pct(overallPassRate)}</strong><span>primary pass</span></div>`,
		`<div class="metric"><strong>${totals.avgDiagnostic.toFixed(2)}</strong><span>avg diagnostic</span></div>`,
		`<div class="metric"><strong>${formatDuration(totals.avgDurationMs)}</strong><span>avg build time</span></div>`,
		`<div class="metric"><strong>${totals.buildSuccess}/${totalRecords}</strong><span>built ok</span></div>`,
		failuresRow,
		'</div>',
	].join('\n');
}
/**
 * Renders the static "Metric definitions" note shown under the verdict totals.
 *
 * The diagnostic range reads "0–1": the score is described as a fraction of
 * criteria satisfied, and the previous text ("Range 01") had lost its dash.
 *
 * @returns The note HTML.
 */
function renderMetricsNote(): string {
	return `<aside class="metrics-note">
<strong>Metric definitions:</strong>
<span><b>Primary pass</b> — workflow passes only if a majority of LLM judges (2 of 3) find zero "don't" violations. Computed over all prompt attempts; build failures count as fail.</span>
<span><b>Average diagnostic</b> — mean fraction of criteria (dos + don'ts) satisfied across the dataset, averaged across judges. Range 0–1; gives partial credit.</span>
<span><b>Average build time</b> — averaged across all attempts including failures, so build timeouts (20-min cap) inflate this number.</span>
<span><b>Verdicts</b> compare per-prompt primary pass between the two builders.</span>
</aside>`;
}
/**
 * Renders the four verdict counters (both pass, Code Builder only, IA only,
 * both fail), each with its share of all rows.
 *
 * @param rows All comparison rows.
 * @returns The verdict grid HTML.
 */
function renderVerdictTotals(rows: ComparisonRow[]): string {
	const tally: Record<ComparisonRow['verdict'], number> = {
		'both-pass': 0,
		'both-fail': 0,
		'ee-only': 0,
		'ia-only': 0,
		neither: 0,
	};
	rows.forEach((row) => {
		tally[row.verdict] += 1;
	});

	const rowCount = rows.length;
	const renderCard = (label: string, n: number, cls: string): string => {
		// Avoid dividing by zero when there are no rows at all.
		const share = rowCount === 0 ? '0%' : pct(n / rowCount);
		return `<div class="verdict-card ${cls}"><strong>${n}</strong><span>${escapeHtml(label)}</span><em>${share}</em></div>`;
	};

	return [
		'<div class="verdict-grid">',
		renderCard('Both pass', tally['both-pass'], 'verdict-both-pass'),
		renderCard('Code Builder only passes', tally['ee-only'], 'verdict-ee-only'),
		renderCard('IA only passes', tally['ia-only'], 'verdict-ia-only'),
		renderCard('Both fail', tally['both-fail'], 'verdict-both-fail'),
		'</div>',
	].join('\n');
}
/**
 * Assembles the complete standalone HTML report.
 *
 * The document consists of:
 * - a head that loads the web-components polyfill scripts and the `n8n-demo`
 *   component from CDNs, plus the inline stylesheet;
 * - a header with the number of compared prompts;
 * - one summary card per builder, the verdict totals and the metrics note;
 * - one collapsible row per comparison;
 * - an inline script that, the first time a row is opened, moves the row's
 *   `<template>` content into its `.lazy-slot` and removes the placeholder.
 *
 * Everything below is a single template literal, so whitespace and the `//`
 * lines inside the `<script>` block are part of the emitted page.
 *
 * @param ee Loaded Code Builder run (summary + records).
 * @param ia Loaded instance-ai run (summary + records).
 * @param rows Paired comparison rows, in display order.
 * @returns The full HTML document as a string.
 */
function renderDocument(ee: BuilderRun, ia: BuilderRun, rows: ComparisonRow[]): string {
return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>Pairwise Eval Comparison — Code Builder vs Instance AI</title>
<script defer src="https://cdn.jsdelivr.net/npm/@webcomponents/webcomponentsjs@2.0.0/webcomponents-loader.js"></script>
<script defer src="https://www.unpkg.com/lit@2.0.0-rc.2/polyfill-support.js"></script>
<script type="module" src="https://cdn.jsdelivr.net/npm/@n8n_io/n8n-demo-component/n8n-demo.bundled.js"></script>
<style>
:root {
font-family: ui-sans-serif, system-ui, -apple-system, sans-serif;
color-scheme: dark;
--bg: #0d1117;
--fg: #e6edf3;
--muted: #8b949e;
--border: #30363d;
--card: #161b22;
--subtle: #1c2129;
--pass: #3fb950;
--fail: #f85149;
--partial: #d29922;
--accent: #7c8cff;
--ee: #818cf8;
--ia: #2dd4bf;
}
body { margin: 0; background: var(--bg); color: var(--fg); }
header.top { padding: 16px 20px; background: var(--card); border-bottom: 1px solid var(--border); }
header.top h1 { margin: 0 0 6px 0; font-size: 18px; }
header.top .subhead { color: var(--muted); font-size: 13px; }
main { padding: 20px; max-width: 1600px; margin: 0 auto; display: flex; flex-direction: column; gap: 24px; }
.summary-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
.summary-card { background: var(--card); border: 1px solid var(--border); border-radius: 8px; padding: 16px; display: flex; flex-direction: column; gap: 6px; }
.summary-card h2 { margin: 0 0 4px 0; font-size: 15px; }
.summary-card .meta-row { font-size: 12px; color: var(--muted); }
.summary-card .meta-row code { font-family: ui-monospace, monospace; font-size: 11px; background: var(--subtle); padding: 1px 4px; border-radius: 3px; }
.summary-card .metric { display: flex; justify-content: space-between; align-items: baseline; margin-top: 4px; font-size: 13px; }
.summary-card .metric strong { font-size: 18px; color: var(--accent); }
.summary-card .metric span { color: var(--muted); }
.summary-card .meta-row.failures { color: var(--fail); margin-top: 6px; }
.verdict-grid { display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; }
.metrics-note {
background: var(--card);
border: 1px solid var(--border);
border-radius: 8px;
padding: 12px 16px;
font-size: 12px;
color: var(--muted);
display: flex;
flex-direction: column;
gap: 4px;
}
.metrics-note strong { color: var(--fg); font-size: 12px; }
.metrics-note b { color: var(--fg); font-weight: 600; }
.verdict-card { background: var(--card); border: 1px solid var(--border); border-radius: 8px; padding: 14px 16px; display: flex; flex-direction: column; gap: 4px; align-items: flex-start; }
.verdict-card strong { font-size: 26px; font-weight: 700; }
.verdict-card span { color: var(--muted); font-size: 12px; }
.verdict-card em { color: var(--muted); font-size: 11px; font-style: normal; }
.verdict-both-pass strong { color: var(--pass); }
.verdict-both-fail strong { color: var(--fail); }
.verdict-ee-only strong { color: var(--ee); }
.verdict-ia-only strong { color: var(--ia); }
.rows { background: var(--card); border: 1px solid var(--border); border-radius: 8px; overflow: hidden; }
details.row { border-bottom: 1px solid var(--border); }
details.row:last-child { border-bottom: none; }
details.row > summary {
list-style: none;
cursor: pointer;
padding: 10px 16px;
display: grid;
grid-template-columns: 110px minmax(0, auto) minmax(0, 1fr) auto;
gap: 16px;
align-items: center;
font-size: 13px;
}
details.row > summary > .example-id {
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
font-size: 11px;
color: var(--muted);
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
max-width: 220px;
}
details.row > summary:hover { background: var(--subtle); }
details.row[open] > summary { background: var(--subtle); border-bottom: 1px solid var(--border); }
details.row > summary::-webkit-details-marker { display: none; }
details.row > summary .verdict {
font-size: 11px;
font-weight: 700;
letter-spacing: 0.04em;
padding: 3px 8px;
border-radius: 3px;
text-align: center;
}
details.row.verdict-both-pass > summary .verdict { background: rgba(63,185,80,0.18); color: var(--pass); }
details.row.verdict-both-fail > summary .verdict { background: rgba(248,81,73,0.18); color: var(--fail); }
details.row.verdict-ee-only > summary .verdict { background: rgba(129,140,248,0.2); color: var(--ee); }
details.row.verdict-ia-only > summary .verdict { background: rgba(45,212,191,0.18); color: var(--ia); }
details.row > summary .prompt-preview { color: var(--fg); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
details.row > summary .builder-chips { display: flex; gap: 8px; white-space: nowrap; }
.builder-chip {
display: inline-flex;
align-items: center;
gap: 6px;
padding: 3px 8px;
border-radius: 4px;
font-size: 11px;
border: 1px solid var(--border);
background: var(--card);
}
.builder-chip.chip-pass { border-color: rgba(63,185,80,0.4); background: rgba(63,185,80,0.08); }
.builder-chip.chip-fail { border-color: rgba(248,81,73,0.35); background: rgba(248,81,73,0.08); }
.builder-chip.chip-missing { border-color: var(--border); background: var(--subtle); }
.builder-chip .chip-label { font-weight: 700; color: var(--muted); letter-spacing: 0.04em; }
.builder-chip .chip-meta { color: var(--muted); }
.lazy-slot { margin-top: 14px; }
.lazy-placeholder { padding: 18px; text-align: center; color: var(--muted); font-size: 12px; border: 1px dashed var(--border); border-radius: 4px; background: var(--subtle); }
details.row > .body { padding: 16px; background: var(--subtle); border-top: 1px solid var(--border); }
details.row > .body h3 { margin: 0 0 6px 0; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em; color: var(--muted); }
details.row pre { background: var(--bg); border: 1px solid var(--border); border-radius: 4px; padding: 8px 12px; font-size: 12px; white-space: pre-wrap; max-height: 200px; overflow-y: auto; color: var(--fg); }
.criteria-row { display: grid; grid-template-columns: 1fr 1fr; gap: 12px; margin-top: 12px; }
.criteria { border: 1px solid var(--border); border-radius: 4px; padding: 8px 12px; background: var(--card); }
.criteria h4 { margin: 0 0 4px 0; font-size: 11px; text-transform: uppercase; letter-spacing: 0.05em; }
.criteria.do h4 { color: var(--pass); }
.criteria.dont h4 { color: var(--fail); }
.criteria ul { margin: 0; padding-left: 18px; font-size: 12px; }
.builder-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 12px; margin-top: 14px; }
.builder-col { background: var(--card); border: 1px solid var(--border); border-radius: 6px; padding: 12px; display: flex; flex-direction: column; gap: 8px; }
.builder-col.missing { background: var(--subtle); }
.builder-col .missing-msg { color: var(--muted); font-style: italic; font-size: 12px; }
.builder-header { display: flex; justify-content: space-between; align-items: center; }
.builder-label { font-weight: 600; font-size: 13px; }
.status { font-size: 11px; font-weight: 700; padding: 3px 8px; border-radius: 3px; letter-spacing: 0.04em; }
.status-pass { background: rgba(63,185,80,0.2); color: var(--pass); }
.status-fail { background: rgba(248,81,73,0.2); color: var(--fail); }
.builder-meta { font-size: 11px; color: var(--muted); display: flex; gap: 8px; flex-wrap: wrap; }
.builder-id { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; font-size: 11px; color: var(--muted); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
.error { padding: 8px 10px; background: rgba(248,81,73,0.12); color: var(--fail); border-radius: 4px; font-size: 11px; white-space: pre-wrap; max-height: 120px; overflow-y: auto; }
.workflow-wrap { display: flex; }
n8n-demo { display: block; width: 100%; height: 320px; border: 1px solid var(--border); border-radius: 4px; background: #fff; color-scheme: light; }
.no-workflow { padding: 30px; text-align: center; color: var(--muted); font-size: 12px; border: 1px dashed var(--border); border-radius: 4px; flex: 1; }
table.judges { width: 100%; border-collapse: collapse; font-size: 11px; background: var(--card); border: 1px solid var(--border); border-radius: 4px; overflow: hidden; }
table.judges th, table.judges td { padding: 5px 8px; text-align: left; border-bottom: 1px solid var(--border); vertical-align: top; }
table.judges tr:last-child td { border-bottom: none; }
table.judges td.judge-pass { color: var(--pass); font-weight: 600; }
table.judges td.judge-fail { color: var(--fail); font-weight: 600; }
</style>
</head>
<body>
<header class="top">
<h1>Pairwise Eval Comparison — Code Builder vs Instance AI</h1>
<div class="subhead">${rows.length} prompt${rows.length === 1 ? '' : 's'} compared. Rows are ordered: Code-only wins, IA-only wins, both fail, both pass.</div>
</header>
<main>
<section class="summary-row">
${renderSummaryCard('Code Builder', ee.summary, ee.records.length, ee.records)}
${renderSummaryCard('instance-ai', ia.summary, ia.records.length, ia.records)}
</section>
${renderVerdictTotals(rows)}
${renderMetricsNote()}
<section class="rows">
${rows.map((r, i) => renderRow(r, i)).join('\n')}
</section>
</main>
<script>
// Lazy-load heavy preview content (n8n-demo + judge tables) on first expand.
// Each row contains <template> with the workflow previews inside a
// .lazy-slot[data-loaded="false"] div. On the first toggle-open we move the
// template's content into the live DOM so the n8n-demo web component is
// only constructed for rows the user actually reads.
document.querySelectorAll('details.row').forEach((details) => {
details.addEventListener('toggle', () => {
if (!details.open) return;
const slot = details.querySelector('.lazy-slot[data-loaded="false"]');
if (!slot) return;
const template = slot.querySelector('template');
const placeholder = slot.querySelector('.lazy-placeholder');
if (template) {
slot.appendChild(template.content.cloneNode(true));
template.remove();
}
if (placeholder) placeholder.remove();
slot.dataset.loaded = 'true';
}, { once: true });
});
</script>
</body>
</html>`;
}
// ---------------------------------------------------------------------------
// CLI
// ---------------------------------------------------------------------------
/** Parsed command-line options; `parseArgs` resolves every path to an absolute one. */
interface CliArgs {
// Directory of the Code Builder (EE) run (`--ee-dir`).
eeDir: string;
// Directory of the instance-ai run (`--ia-dir`).
iaDir: string;
// Destination of the HTML report (`--out`, or a default next to the IA run directory).
out: string;
}
/**
 * Parses `--ee-dir`, `--ia-dir` and the optional `--out` from `argv`.
 *
 * A flag's value is the token right after it; a missing or empty token, or one
 * that starts with `--`, counts as "no value". When `--out` is absent the
 * report goes to `comparison.html` in the parent directory of the IA run.
 *
 * @param argv Arguments without the node/script prefix.
 * @returns The options with all paths resolved to absolute paths.
 * @throws Error with a usage message when `--ee-dir` or `--ia-dir` has no value.
 */
function parseArgs(argv: string[]): CliArgs {
	const valueOf = (flag: string): string | undefined => {
		const position = argv.indexOf(flag);
		if (position === -1) return undefined;
		const candidate = argv[position + 1];
		if (!candidate || candidate.startsWith('--')) return undefined;
		return candidate;
	};

	const eeDir = valueOf('--ee-dir');
	const iaDir = valueOf('--ia-dir');
	if (!eeDir || !iaDir) {
		throw new Error(
			'Usage: tsx evaluations/cli/compare-pairwise.ts --ee-dir <path> --ia-dir <path> [--out <path>]',
		);
	}

	const resolvedIaDir = path.resolve(iaDir);
	const requestedOut = valueOf('--out');
	const out = path.resolve(
		requestedOut ?? path.join(path.dirname(resolvedIaDir), 'comparison.html'),
	);
	return { eeDir: path.resolve(eeDir), iaDir: resolvedIaDir, out };
}
/**
 * CLI entry point: loads both runs, pairs their records by prompt, renders the
 * HTML report and writes it to the requested location.
 *
 * The output file's parent directory is created when missing, so a `--out`
 * path in a not-yet-existing folder does not fail after all the loading and
 * rendering work has already been done.
 *
 * @throws If arguments are invalid, a run cannot be loaded, or the report
 *   cannot be written.
 */
async function main(): Promise<void> {
	const args = parseArgs(process.argv.slice(2));
	const [ee, ia] = await Promise.all([loadEERun(args.eeDir), loadInstanceAiRun(args.iaDir)]);
	console.log(
		`EE records: ${ee.records.length} (pass rate ${pct(ee.summary.totals.primaryPassRate)})`,
	);
	console.log(
		`IA records: ${ia.records.length} (pass rate ${pct(ia.summary.totals.primaryPassRate)})`,
	);

	const rows = pairRecords(ee.records, ia.records);
	const matched = rows.filter((r) => r.ee && r.ia).length;
	console.log(`Joined ${rows.length} prompts (${matched} matched on both sides)`);

	const html = renderDocument(ee, ia, rows);
	// `recursive: true` makes this a no-op when the directory already exists.
	await fs.mkdir(path.dirname(args.out), { recursive: true });
	await fs.writeFile(args.out, html, 'utf8');
	console.log(`Wrote comparison report to ${args.out}`);
}
// Run the CLI only when this file is executed directly, not when it is
// imported (e.g. for `pairRecords` / `promptJoinKey`). Any failure is printed
// with its stack when available and the process exits with code 1.
if (require.main === module) {
	main().catch((error) => {
		let report: string;
		if (error instanceof Error) {
			report = error.stack ?? error.message;
		} else {
			report = String(error);
		}
		console.error(report);
		process.exit(1);
	});
}