n8n/packages/@n8n/instance-ai/evaluations/comparison/format.ts
José Braulio González Valido 2164afc5df
Some checks are pending
CI: Master (Build, Test, Lint) / Build for Github Cache (push) Waiting to run
CI: Master (Build, Test, Lint) / Unit tests (22.x) (push) Waiting to run
CI: Master (Build, Test, Lint) / Unit tests (24.14.1) (push) Waiting to run
CI: Master (Build, Test, Lint) / Unit tests (25.x) (push) Waiting to run
CI: Master (Build, Test, Lint) / Lint (push) Waiting to run
CI: Master (Build, Test, Lint) / Performance (push) Waiting to run
CI: Master (Build, Test, Lint) / Notify Slack on failure (push) Blocked by required conditions
chore(ai-builder): Improve eval comparison alert clarity (no-changelog) (#29929)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-06 21:20:49 +00:00

978 lines
33 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// ---------------------------------------------------------------------------
// Render the eval run as a PR comment (markdown) or a console summary
// (aligned plain text). Both formats are driven by:
//
// - MultiRunEvaluation — pass rates, build counts, per-trial reasoning
// - ComparisonOutcome (optional) — tagged result of the baseline
// comparison: `ok` (ran, has scenarios), `no_baseline` (skipped), or
// `fetch_failed` / `self_baseline` (skipped for cause). Each kind
// drives a distinct top-of-comment alert so a LangSmith outage doesn't
// get dressed up as "no baseline configured".
//
// When no comparison is available (no baseline yet, LangSmith offline)
// the renderers still produce a useful per-test-case summary. When a
// comparison is available, sections render in priority order:
// regressions, likely regressions, worth watching, improvements,
// failure-category drift. Only sections with content are emitted.
// ---------------------------------------------------------------------------
import {
hardRegressions,
improvements,
softRegressions,
watchList,
type ComparisonOutcome,
type ComparisonResult,
type FailureCategoryComparison,
type ScenarioComparison,
} from './compare';
import type {
MultiRunEvaluation,
TestCaseAggregation,
WorkflowTestCase,
WorkflowTestCaseResult,
} from '../types';
interface FormatOptions {
/** Optional commit SHA to include in the heading. Truncated to 8 chars. */
commitSha?: string;
/** Maps each test-case reference to its file slug. When provided, the
* per-scenario failure breakdown looks up failed runs by
* `${fileSlug}/${scenarioName}` — deterministic across collisions like
* multiple `happy-path` scenarios. When omitted, the breakdown is
* skipped (no name-only fallback — that lookup was wrong on real data). */
slugByTestCase?: Map<WorkflowTestCase, string>;
}
// ---------------------------------------------------------------------------
// Markdown PR comment
// ---------------------------------------------------------------------------
export function formatComparisonMarkdown(
evaluation: MultiRunEvaluation,
outcome?: ComparisonOutcome,
options: FormatOptions = {},
): string {
const lines: string[] = [];
const comparison = outcome?.kind === 'ok' ? outcome.result : undefined;
lines.push(formatHeading(options.commitSha));
lines.push('');
lines.push(formatTopAlert(outcome));
lines.push('');
lines.push(formatAggregateBlock(evaluation, comparison));
lines.push('');
if (comparison) {
const hard = hardRegressions(comparison);
const soft = softRegressions(comparison);
const watch = watchList(comparison);
const imps = improvements(comparison);
const renderedAnyTable = hard.length > 0 || soft.length > 0 || imps.length > 0;
// Built once and reused across the regression-tier sections so each
// scenario row can carry a collapsible breakdown of its failed PR runs.
// Improvements skip the breakdown — they passed. Skipped entirely when
// the caller didn't pass a slug map (lookup would be ambiguous).
const failedIndex = options.slugByTestCase
? buildFailedRunsIndex(evaluation, options.slugByTestCase)
: undefined;
if (hard.length > 0) {
lines.push(
...renderScenarioSection('Regressions', '— high-confidence', hard, true, failedIndex),
);
}
if (soft.length > 0) {
lines.push(
...renderScenarioSection(
'Likely regressions',
'— looser statistical flag, investigate if related to your changes',
soft,
true,
failedIndex,
),
);
}
if (watch.length > 0) {
lines.push(
...renderScenarioSection(
'Worth watching',
'— large change, not flagged as a regression',
watch,
false,
failedIndex,
),
);
}
if (imps.length > 0) {
lines.push(...renderScenarioSection('Improvements', '', imps, true));
}
if (renderedAnyTable) {
lines.push(
"_p = Fisher's exact one-sided p-value. Lower = stronger evidence of a real change._",
);
lines.push('');
}
// Always render the breakdown when comparison data is available — the
// renderer drops 0/0 rows itself, so empty categories don't pollute
// the output but the reader still sees the full taxonomy of what's
// tracked.
lines.push(...renderFailureCategorySection(comparison.failureCategories));
}
lines.push(...renderPerTestCaseDetails(evaluation, options.slugByTestCase));
if (comparison) {
const otherFindings = renderOtherFindings(comparison);
if (otherFindings.length > 0) lines.push(...otherFindings);
}
const failureDetails = renderFailureDetails(evaluation, options.slugByTestCase);
if (failureDetails.length > 0) lines.push(...failureDetails);
return lines.join('\n');
}
function formatHeading(commitSha?: string): string {
const sha = commitSha ? `\`${commitSha.slice(0, 8)}\`` : '';
return `### Instance AI Workflow Eval${sha}`;
}
function formatTopAlert(outcome?: ComparisonOutcome): string {
if (!outcome) {
return ['> [!NOTE]', '> No baseline comparison ran (LangSmith disabled for this run).'].join(
'\n',
);
}
if (outcome.kind === 'no_baseline') {
return [
'> [!NOTE]',
'> No baseline configured — comparison skipped. Run the eval with `--experiment-name instance-ai-baseline` on master to create one.',
].join('\n');
}
if (outcome.kind === 'self_baseline') {
return [
'> [!NOTE]',
`> This run is the baseline (\`${outcome.experimentName}\`) — nothing to compare against.`,
].join('\n');
}
if (outcome.kind === 'fetch_failed') {
return [
'> [!WARNING]',
`> Regression detection did not run — baseline fetch failed: ${outcome.error}`,
].join('\n');
}
const comparison = outcome.result;
const hard = hardRegressions(comparison).length;
const soft = softRegressions(comparison).length;
const watch = watchList(comparison).length;
const imps = improvements(comparison).length;
const stable = countByVerdict(comparison, 'stable');
const aggDelta = comparison.aggregate.delta * 100;
const aggDeltaText = `${aggDelta >= 0 ? '+' : ''}${aggDelta.toFixed(1)}pp`;
const passRateText = `pass rate ${aggDeltaText} vs master`;
// Two-line summary: regression-tier counts on top, positives/neutrals on the
// bottom. The pass-rate delta tails whichever line matches its sign so the
// per-line story stays coherent (negative delta lives with the concerns).
const concernsParts = [
hard > 0 ? `**${hard} regression${hard === 1 ? '' : 's'}**` : '0 regressions',
`${soft} likely regression${soft === 1 ? '' : 's'}`,
`${watch} worth watching`,
];
const winsParts = [`${imps} improvement${imps === 1 ? '' : 's'}`, `${stable} stable`];
if (aggDelta < 0) {
concernsParts.push(passRateText);
} else {
winsParts.push(passRateText);
}
const concerns = concernsParts.join(' · ');
const wins = winsParts.join(' · ');
let icon: string;
let alertKind: 'CAUTION' | 'WARNING' | 'NOTE' | 'TIP';
if (hard > 0) {
icon = '🔴';
alertKind = 'CAUTION';
} else if (soft > 0) {
icon = '🟡';
alertKind = 'WARNING';
} else if (watch > 0) {
icon = '🔵';
alertKind = 'NOTE';
} else {
icon = '🟢';
alertKind = 'TIP';
}
return `> [!${alertKind}]\n> ${icon} ${concerns}\n> ${wins}`;
}
function formatAggregateBlock(
evaluation: MultiRunEvaluation,
comparison?: ComparisonResult,
): string {
if (!comparison) {
const allScenarios = evaluation.testCases.flatMap((tc) => tc.scenarios);
const passed = allScenarios.reduce((sum, sa) => sum + sa.passCount, 0);
const total = allScenarios.reduce((sum, sa) => sum + sa.runs.length, 0);
const rate = total > 0 ? (passed / total) * 100 : 0;
return `**Aggregate**: ${rate.toFixed(1)}% pass (${passed}/${total} trials, ${allScenarios.length} scenarios × N=${evaluation.totalRuns})`;
}
const { aggregate } = comparison;
const delta = aggregate.delta * 100;
const sign = delta >= 0 ? '+' : '';
const arrow = delta > 0 ? ' ↑' : delta < 0 ? ' ↓' : '';
const baselineN = inferBaselineN(comparison);
const sampleLine = baselineN
? `_${aggregate.intersectionSize} scenarios · N=${evaluation.totalRuns} (PR) vs N=${baselineN} (baseline) · baseline: \`${comparison.baseline.experimentName}\`_`
: `_${aggregate.intersectionSize} scenarios · N=${evaluation.totalRuns} (PR) · baseline: \`${comparison.baseline.experimentName}\`_`;
const partial = comparison.baselineOnly.length + comparison.prOnly.length;
const partialNote =
partial > 0
? `\n_Partial: ${[
comparison.baselineOnly.length > 0
? `${comparison.baselineOnly.length} baseline scenarios not run by PR`
: null,
comparison.prOnly.length > 0
? `${comparison.prOnly.length} PR scenarios have no baseline data (added since baseline captured)`
: null,
]
.filter((s) => s !== null)
.join(', ')}._`
: '';
return [
`**Aggregate**: ${pct(aggregate.prAggregatePassRate)}% PR vs ${pct(aggregate.baselineAggregatePassRate)}% baseline — **${sign}${delta.toFixed(1)}pp${arrow}**`,
sampleLine + partialNote,
].join('\n');
}
function renderScenarioSection(
heading: string,
subtitle: string,
scenarios: ScenarioComparison[],
withPValue: boolean,
failedIndex?: FailedRunsBySlug,
): string[] {
const lines: string[] = [];
const headingLine = subtitle
? `#### ${heading} (${scenarios.length}) ${subtitle}`
: `#### ${heading} (${scenarios.length})`;
lines.push(headingLine);
lines.push('');
if (withPValue) {
lines.push('| Scenario | PR | Baseline | Δ | p |');
lines.push('|---|---|---|---|---|');
} else {
lines.push('| Scenario | PR | Baseline | Δ |');
lines.push('|---|---|---|---|');
}
for (const s of scenarios) {
const cells = [
`\`${s.testCaseFile}/${s.scenarioName}\``,
formatRateCell(s.prPasses, s.prTotal),
formatRateCell(s.baselinePasses, s.baselineTotal),
formatDeltaCell(s.delta),
];
if (withPValue) {
const p = s.verdict === 'improvement' ? s.pValueRight : s.pValueLeft;
cells.push(p.toFixed(3));
}
lines.push(`| ${cells.join(' | ')} |`);
}
lines.push('');
// Per-scenario failure breakdown — one collapsible per row that had failed
// PR runs. Lets the reader drill into each flagged scenario without
// hunting through a separate "Failure details" section.
if (failedIndex) {
for (const s of scenarios) {
const failedRuns = failedIndex.get(`${s.testCaseFile}/${s.scenarioName}`) ?? [];
if (failedRuns.length === 0) continue;
lines.push(...renderScenarioFailureBreakdown(s, failedRuns));
}
}
return lines;
}
function renderScenarioFailureBreakdown(
s: ScenarioComparison,
failedRuns: FailedRunDetail[],
): string[] {
const slug = `${s.testCaseFile}/${s.scenarioName}`;
const categoryMix = summarizeCategories(failedRuns);
const summaryParts = [`${failedRuns.length} of ${s.prTotal} failed`];
if (categoryMix) summaryParts.push(categoryMix);
const lines: string[] = [];
lines.push(`<details><summary><code>${slug}</code> — ${summaryParts.join(' · ')}</summary>`);
lines.push('');
for (const fr of failedRuns) {
const tag = fr.category ? ` [${fr.category}]` : '';
lines.push(`> Run ${fr.runIndex}${tag}: ${fr.reasoning.slice(0, 300)}`);
lines.push('>');
}
// Drop the trailing empty quote line.
if (lines[lines.length - 1] === '>') lines.pop();
lines.push('');
lines.push('</details>');
lines.push('');
return lines;
}
function renderFailureCategorySection(categories: FailureCategoryComparison[]): string[] {
// Drop rows that are 0/0 on both sides — they carry no signal for the
// reader. Categories with non-zero count on either side are kept so the
// reader sees the full picture even if not "notable".
const rows = categories.filter((c) => c.prCount > 0 || c.baselineCount > 0);
if (rows.length === 0) return [];
const lines: string[] = [];
lines.push('#### Failure breakdown');
lines.push('');
lines.push('| Category | PR | Baseline | Δ | |');
lines.push('|---|---|---|---|---|');
for (const c of rows) {
const isNew = c.baselineCount === 0 && c.prCount > 0;
const label = isNew ? `\`${c.category}\` 🆕` : `\`${c.category}\``;
const delta = c.delta * 100;
const sign = delta >= 0 ? '+' : '';
const arrow = delta > 0 ? ' ↑' : delta < 0 ? ' ↓' : '';
const notableMarker = c.notable ? '**notable**' : '';
lines.push(
`| ${label} | ${c.prCount} (${pct(c.prRate)}%) | ${c.baselineCount} (${pct(c.baselineRate)}%) | ${sign}${delta.toFixed(1)}pp${arrow} | ${notableMarker} |`,
);
}
lines.push('');
return lines;
}
function renderPerTestCaseDetails(
evaluation: MultiRunEvaluation,
slugByTestCase?: Map<WorkflowTestCase, string>,
): string[] {
const { totalRuns, testCases } = evaluation;
if (testCases.length === 0) return [];
const lines: string[] = [];
lines.push(`<details><summary>Per-test-case results (${testCases.length})</summary>`);
lines.push('');
const renderName = (tc: TestCaseAggregation): string => {
const slug = slugByTestCase?.get(tc.testCase);
return slug ? `\`${slug}\`` : `\`${tc.testCase.prompt.slice(0, 70)}\``;
};
if (totalRuns > 1) {
lines.push(`| Workflow | Built | pass@${totalRuns} | pass^${totalRuns} |`);
lines.push('|---|---|---|---|');
for (const tc of testCases) {
const meanPassAtK = tc.scenarios.length
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passAtK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
const meanPassHatK = tc.scenarios.length
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passHatK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
lines.push(
`| ${renderName(tc)} | ${tc.buildSuccessCount}/${totalRuns} | ${meanPassAtK}% | ${meanPassHatK}% |`,
);
}
} else {
lines.push('| Workflow | Built | Pass rate |');
lines.push('|---|---|---|');
for (const tc of testCases) {
const built = tc.runs[0]?.workflowBuildSuccess ? '✓' : '✗';
const passed = tc.scenarios.filter((sa) => sa.runs[0]?.success).length;
const total = tc.scenarios.length;
lines.push(`| ${renderName(tc)} | ${built} | ${passed}/${total} |`);
}
}
lines.push('');
lines.push('</details>');
lines.push('');
return lines;
}
function renderOtherFindings(comparison: ComparisonResult): string[] {
const stable = countByVerdict(comparison, 'stable');
const flaky = countByVerdict(comparison, 'unreliable_baseline');
const noData = countByVerdict(comparison, 'insufficient_data');
if (stable === 0 && flaky === 0 && noData === 0) return [];
const summaryParts: string[] = [];
if (flaky > 0) summaryParts.push(`${flaky} on flaky baseline`);
if (noData > 0) summaryParts.push(`${noData} no data`);
if (stable > 0) summaryParts.push(`${stable} stable`);
const summary = summaryParts.join(' · ');
const lines: string[] = [];
lines.push(`<details><summary>Other findings: ${summary}</summary>`);
lines.push('');
const stableScenarios = comparison.scenarios.filter((s) => s.verdict === 'stable');
const flakyScenarios = comparison.scenarios.filter((s) => s.verdict === 'unreliable_baseline');
const noDataScenarios = comparison.scenarios.filter((s) => s.verdict === 'insufficient_data');
if (flakyScenarios.length > 0) {
lines.push('**Confident drop on a flaky baseline (surfaced for visibility, not flagged):**');
lines.push('');
lines.push('| Scenario | PR | Baseline | Δ |');
lines.push('|---|---|---|---|');
for (const s of flakyScenarios) {
lines.push(
`| \`${s.testCaseFile}/${s.scenarioName}\` | ${formatRateCell(s.prPasses, s.prTotal)} | ${formatRateCell(s.baselinePasses, s.baselineTotal)} | ${formatDeltaCell(s.delta)} |`,
);
}
lines.push('');
}
if (noDataScenarios.length > 0) {
lines.push(
`**No data:** ${noDataScenarios.map((s) => `\`${s.testCaseFile}/${s.scenarioName}\``).join(', ')}`,
);
lines.push('');
}
if (stableScenarios.length > 0) {
lines.push(`**Stable (${stableScenarios.length}):**`);
lines.push(
stableScenarios.map((s) => `\`${s.testCaseFile}/${s.scenarioName}\``).join(', ') + '.',
);
lines.push('');
}
lines.push('</details>');
lines.push('');
return lines;
}
function renderFailureDetails(
evaluation: MultiRunEvaluation,
slugByTestCase?: Map<WorkflowTestCase, string>,
): string[] {
const failed: Array<{
tc: WorkflowTestCaseResult;
fileSlug: string | undefined;
scenarioName: string;
failedRuns: Array<{ category?: string; reasoning: string }>;
}> = [];
for (const tc of evaluation.testCases) {
const fileSlug = slugByTestCase?.get(tc.testCase);
for (const sa of tc.scenarios) {
const failedRuns = sa.runs
.filter((r) => !r.success)
.map((r) => ({ category: r.failureCategory, reasoning: r.reasoning }));
if (failedRuns.length > 0) {
failed.push({ tc: tc.runs[0], fileSlug, scenarioName: sa.scenario.name, failedRuns });
}
}
}
if (failed.length === 0) return [];
const lines: string[] = [];
lines.push('<details><summary>Failure details</summary>');
lines.push('');
for (const { tc, fileSlug, scenarioName, failedRuns } of failed) {
const slug = fileSlug
? `${fileSlug}/${scenarioName}`
: `${tc.testCase.prompt.slice(0, 50).trim()} / ${scenarioName}`;
lines.push(`**\`${slug}\`** — ${failedRuns.length} failed`);
for (const fr of failedRuns) {
const tag = fr.category ? ` [${fr.category}]` : '';
lines.push(`> Run${tag}: ${fr.reasoning.slice(0, 200)}`);
}
lines.push('');
}
lines.push('</details>');
lines.push('');
return lines;
}
// ---------------------------------------------------------------------------
// Per-scenario failure lookup
// ---------------------------------------------------------------------------
//
// The comparison carries per-scenario counts (passed / total) but not the
// underlying reasoning text. The evaluation has the reasoning, but keys
// testCases by reference identity — not by the `testCaseFile` slug used in
// the comparison. The slug map (built in cli/index.ts where the file slugs
// are first known) bridges the two so the lookup is deterministic. Without
// it we'd have to disambiguate by scenarioName alone, which collides on
// reused names (`happy-path` shows up across most workflows).
interface FailedRunDetail {
category?: string;
reasoning: string;
runIndex: number; // 1-based for display
}
type FailedRunsBySlug = Map<string, FailedRunDetail[]>;
function buildFailedRunsIndex(
evaluation: MultiRunEvaluation,
slugByTestCase: Map<WorkflowTestCase, string>,
): FailedRunsBySlug {
const map: FailedRunsBySlug = new Map();
for (const tc of evaluation.testCases) {
const fileSlug = slugByTestCase.get(tc.testCase);
if (!fileSlug) continue; // testCase not in the slug map — skip rather than misattribute
for (const sa of tc.scenarios) {
const failedRuns: FailedRunDetail[] = [];
sa.runs.forEach((r, i) => {
if (!r.success) {
failedRuns.push({
category: r.failureCategory,
reasoning: r.reasoning,
runIndex: i + 1,
});
}
});
if (failedRuns.length > 0) {
map.set(`${fileSlug}/${sa.scenario.name}`, failedRuns);
}
}
}
return map;
}
function summarizeCategories(failedRuns: FailedRunDetail[]): string | undefined {
const counts = new Map<string, number>();
for (const fr of failedRuns) {
if (fr.category) counts.set(fr.category, (counts.get(fr.category) ?? 0) + 1);
}
if (counts.size === 0) return undefined;
return [...counts.entries()]
.sort((a, b) => b[1] - a[1])
.map(([cat, n]) => `${n}× ${cat}`)
.join(', ');
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
function pct(rate: number): string {
return (rate * 100).toFixed(1);
}
function formatRateCell(passes: number, total: number): string {
const rate = total > 0 ? Math.round((passes / total) * 100) : 0;
return `${passes}/${total} (${rate}%)`;
}
function formatDeltaCell(delta: number): string {
const pp = delta * 100;
const sign = pp >= 0 ? '+' : '';
const arrow = pp > 0 ? ' ↑' : pp < 0 ? ' ↓' : '';
return `${sign}${pp.toFixed(0)}pp${arrow}`;
}
function countByVerdict(
comparison: ComparisonResult,
verdict: ScenarioComparison['verdict'],
): number {
return comparison.scenarios.filter((s) => s.verdict === verdict).length;
}
/** Best-effort N=baseline iteration count. The comparison only carries trial
* totals per scenario; we infer N from the most-common scenario total since
* the baseline runs every scenario the same number of times. */
function inferBaselineN(comparison: ComparisonResult): number | undefined {
const totals = comparison.scenarios
.filter((s) => s.baselineTotal > 0)
.map((s) => s.baselineTotal);
if (totals.length === 0) return undefined;
const counts = new Map<number, number>();
for (const t of totals) counts.set(t, (counts.get(t) ?? 0) + 1);
let best = totals[0];
let bestCount = 0;
for (const [n, c] of counts) {
if (c > bestCount) {
best = n;
bestCount = c;
}
}
return best;
}
// ---------------------------------------------------------------------------
// Terminal renderer: aligned plain text for the eval CLI's end-of-run print.
// ---------------------------------------------------------------------------
const TERMINAL_INDENT = ' ';
const TERMINAL_TABLE_INDENT = ' ';
export function formatComparisonTerminal(
evaluation: MultiRunEvaluation,
outcome?: ComparisonOutcome,
options: FormatOptions = {},
): string {
const lines: string[] = [];
const comparison = outcome?.kind === 'ok' ? outcome.result : undefined;
const titleSuffix = options.commitSha ? `${options.commitSha.slice(0, 8)}` : '';
const title = `Instance AI Workflow Eval${titleSuffix}`;
lines.push(title);
lines.push('═'.repeat(title.length));
lines.push(TERMINAL_INDENT + formatTerminalVerdictLine(outcome));
lines.push('');
lines.push(...formatTerminalAggregate(evaluation, comparison));
lines.push('');
lines.push(...formatTerminalPerTestCase(evaluation, options.slugByTestCase));
if (comparison) {
const hard = hardRegressions(comparison);
const soft = softRegressions(comparison);
const watch = watchList(comparison);
const imps = improvements(comparison);
if (hard.length > 0) {
lines.push(
TERMINAL_INDENT +
'REGRESSIONS (high-confidence: large drop on a reliable scenario, unlikely noise)',
);
lines.push(formatTerminalScenarioTable(hard, true));
lines.push('');
}
if (soft.length > 0) {
lines.push(
TERMINAL_INDENT +
'LIKELY REGRESSIONS (looser statistical flag — investigate if related to your changes)',
);
lines.push(formatTerminalScenarioTable(soft, true));
lines.push('');
}
if (watch.length > 0) {
lines.push(TERMINAL_INDENT + 'WORTH WATCHING (large change, not flagged as a regression)');
lines.push(formatTerminalScenarioTable(watch, false));
lines.push('');
}
if (imps.length > 0) {
lines.push(TERMINAL_INDENT + 'IMPROVEMENTS');
lines.push(formatTerminalScenarioTable(imps, true));
lines.push('');
}
// Always render the breakdown when comparison data is available — same
// rationale as the markdown side. The terminal table drops 0/0 rows
// itself.
const breakdownRows = comparison.failureCategories.filter(
(c) => c.prCount > 0 || c.baselineCount > 0,
);
if (breakdownRows.length > 0) {
lines.push(TERMINAL_INDENT + 'failure breakdown');
lines.push(formatTerminalCategoryTable(breakdownRows));
lines.push('');
}
// Stable count is already in the verdict line; surface only the rarer
// outcomes here.
const flaky = countByVerdict(comparison, 'unreliable_baseline');
const noData = countByVerdict(comparison, 'insufficient_data');
const otherParts: string[] = [];
if (flaky > 0) otherParts.push(`${flaky} on flaky baseline`);
if (noData > 0) otherParts.push(`${noData} no data`);
if (otherParts.length > 0) {
lines.push(TERMINAL_INDENT + 'other: ' + otherParts.join(' · '));
}
}
return lines.join('\n');
}
function formatTerminalVerdictLine(outcome?: ComparisonOutcome): string {
if (!outcome) return '▶ No baseline comparison ran (LangSmith disabled).';
if (outcome.kind === 'no_baseline') {
return '▶ No baseline configured — comparison skipped.';
}
if (outcome.kind === 'self_baseline') {
return `▶ This run is the baseline (${outcome.experimentName}) — nothing to compare.`;
}
if (outcome.kind === 'fetch_failed') {
return `▶ Regression detection did not run — baseline fetch failed: ${outcome.error}`;
}
const comparison = outcome.result;
const hard = hardRegressions(comparison).length;
const soft = softRegressions(comparison).length;
const watch = watchList(comparison).length;
const imps = improvements(comparison).length;
const stable = countByVerdict(comparison, 'stable');
const aggDelta = comparison.aggregate.delta * 100;
const aggDeltaText = `${aggDelta >= 0 ? '+' : ''}${aggDelta.toFixed(1)}pp`;
const passRateText = `pass rate ${aggDeltaText} vs master`;
const concernsParts = [
`${hard} regression${hard === 1 ? '' : 's'}`,
`${soft} likely regression${soft === 1 ? '' : 's'}`,
`${watch} worth watching`,
];
const winsParts = [`${imps} improvement${imps === 1 ? '' : 's'}`, `${stable} stable`];
if (aggDelta < 0) {
concernsParts.push(passRateText);
} else {
winsParts.push(passRateText);
}
// The caller prepends TERMINAL_INDENT to the start of this string. Embed an
// extra TERMINAL_INDENT after the line break so the wins line aligns under
// the concerns text (past the `▶ ` arrow).
return `${concernsParts.join(' · ')}\n${TERMINAL_INDENT} ${winsParts.join(' · ')}`;
}
function formatTerminalAggregate(
evaluation: MultiRunEvaluation,
comparison?: ComparisonResult,
): string[] {
const lines: string[] = [];
if (!comparison) {
const allScenarios = evaluation.testCases.flatMap((tc) => tc.scenarios);
const passed = allScenarios.reduce((sum, sa) => sum + sa.passCount, 0);
const total = allScenarios.reduce((sum, sa) => sum + sa.runs.length, 0);
const rate = total > 0 ? (passed / total) * 100 : 0;
lines.push(
TERMINAL_INDENT +
`Aggregate: ${rate.toFixed(1)}% pass (${passed}/${total} trials, ${allScenarios.length} scenarios × N=${evaluation.totalRuns})`,
);
return lines;
}
const { aggregate } = comparison;
const baselineN = inferBaselineN(comparison);
const aggDelta = aggregate.delta * 100;
const sign = aggDelta >= 0 ? '+' : '';
const arrow = aggDelta > 0 ? ' ↑' : aggDelta < 0 ? ' ↓' : '';
lines.push(TERMINAL_INDENT + `Aggregate (${aggregate.intersectionSize} scenarios)`);
lines.push(
TERMINAL_INDENT +
` PR ${pct(aggregate.prAggregatePassRate)}% (N=${evaluation.totalRuns})`,
);
if (baselineN !== undefined) {
lines.push(
TERMINAL_INDENT +
` baseline ${pct(aggregate.baselineAggregatePassRate)}% (N=${baselineN})`,
);
} else {
lines.push(TERMINAL_INDENT + ` baseline ${pct(aggregate.baselineAggregatePassRate)}%`);
}
lines.push(TERMINAL_INDENT + ` Δ ${sign}${aggDelta.toFixed(1)}pp${arrow}`);
if (comparison.baselineOnly.length > 0 || comparison.prOnly.length > 0) {
const partialParts: string[] = [];
if (comparison.baselineOnly.length > 0)
partialParts.push(`${comparison.baselineOnly.length} baseline scenarios not run by PR`);
if (comparison.prOnly.length > 0)
partialParts.push(`${comparison.prOnly.length} PR scenarios have no baseline data`);
lines.push(TERMINAL_INDENT + ` partial: ${partialParts.join(', ')}`);
}
return lines;
}
function formatTerminalPerTestCase(
evaluation: MultiRunEvaluation,
slugByTestCase?: Map<WorkflowTestCase, string>,
): string[] {
const { totalRuns, testCases } = evaluation;
if (testCases.length === 0) return [];
const lines: string[] = [];
const heading = `Per-test-case results (${testCases.length})`;
lines.push(TERMINAL_INDENT + heading);
const nameOf = (tc: TestCaseAggregation, max: number): string => {
const slug = slugByTestCase?.get(tc.testCase);
return slug ?? tc.testCase.prompt.slice(0, max);
};
if (totalRuns > 1) {
const rows = testCases.map((tc) => {
const meanPassAtK =
tc.scenarios.length > 0
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passAtK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
const meanPassHatK =
tc.scenarios.length > 0
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passHatK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
return {
name: nameOf(tc, 60),
builds: `${tc.buildSuccessCount}/${totalRuns}`,
passAtK: `${meanPassAtK}%`,
passHatK: `${meanPassHatK}%`,
};
});
const nameW = maxWidth(
rows.map((r) => r.name),
'workflow',
);
const buildsW = maxWidth(
rows.map((r) => r.builds),
'builds',
);
const atKHeader = `pass@${totalRuns}`;
const hatKHeader = `pass^${totalRuns}`;
const atKW = maxWidth(
rows.map((r) => r.passAtK),
atKHeader,
);
const hatKW = maxWidth(
rows.map((r) => r.passHatK),
hatKHeader,
);
lines.push(
TERMINAL_TABLE_INDENT +
`${'workflow'.padEnd(nameW)} ${'builds'.padEnd(buildsW)} ${atKHeader.padStart(atKW)} ${hatKHeader.padStart(hatKW)}`,
);
lines.push(
TERMINAL_TABLE_INDENT +
`${'─'.repeat(nameW)} ${'─'.repeat(buildsW)} ${'─'.repeat(atKW)} ${'─'.repeat(hatKW)}`,
);
for (const r of rows) {
lines.push(
TERMINAL_TABLE_INDENT +
`${r.name.padEnd(nameW)} ${r.builds.padEnd(buildsW)} ${r.passAtK.padStart(atKW)} ${r.passHatK.padStart(hatKW)}`,
);
}
} else {
for (const tc of testCases) {
const r = tc.runs[0];
const buildStatus = r.workflowBuildSuccess ? 'BUILT' : 'BUILD FAILED';
lines.push('');
lines.push(TERMINAL_INDENT + `${nameOf(tc, 70)}`);
lines.push(TERMINAL_INDENT + ` ${buildStatus}${r.workflowId ? ` (${r.workflowId})` : ''}`);
if (r.buildError) lines.push(TERMINAL_INDENT + ` error: ${r.buildError.slice(0, 200)}`);
for (const sa of tc.scenarios) {
const sr = sa.runs[0];
const status = sr.success ? 'PASS' : 'FAIL';
const category = sr.failureCategory ? ` [${sr.failureCategory}]` : '';
lines.push(TERMINAL_INDENT + ` ${status} ${sr.scenario.name}${category}`);
if (!sr.success) {
const errs = sr.evalResult?.errors ?? [];
if (errs.length > 0) {
lines.push(TERMINAL_INDENT + ` error: ${errs.join('; ').slice(0, 200)}`);
}
lines.push(TERMINAL_INDENT + ` diagnosis: ${sr.reasoning.slice(0, 200)}`);
}
}
}
}
lines.push('');
return lines;
}
function formatTerminalScenarioTable(scenarios: ScenarioComparison[], withPValue: boolean): string {
const names = scenarios.map((s) => `${s.testCaseFile}/${s.scenarioName}`);
const prCells = scenarios.map((s) => `${s.prPasses}/${s.prTotal}`);
const baseCells = scenarios.map((s) => `${s.baselinePasses}/${s.baselineTotal}`);
const deltaCells = scenarios.map((s) => {
const d = s.delta * 100;
const sign = d >= 0 ? '+' : '';
const arrow = d > 0 ? ' ↑' : d < 0 ? ' ↓' : '';
return `${sign}${d.toFixed(0)}pp${arrow}`;
});
const pCells = withPValue
? scenarios.map((s) => (s.verdict === 'improvement' ? s.pValueRight : s.pValueLeft).toFixed(3))
: [];
const nameW = maxWidth(names, 'scenario');
const prW = maxWidth(prCells, 'PR');
const baseW = maxWidth(baseCells, 'baseline');
const deltaW = maxWidth(deltaCells, 'Δ');
const pW = withPValue ? maxWidth(pCells, 'p') : 0;
const headers = [
'scenario'.padEnd(nameW),
'PR'.padEnd(prW),
'baseline'.padEnd(baseW),
'Δ'.padEnd(deltaW),
];
if (withPValue) headers.push('p'.padEnd(pW));
const widths = withPValue ? [nameW, prW, baseW, deltaW, pW] : [nameW, prW, baseW, deltaW];
const sep = widths.map((w) => '─'.repeat(w)).join(' ');
const rows = scenarios.map((_, i) => {
const cells = [
names[i].padEnd(nameW),
prCells[i].padEnd(prW),
baseCells[i].padEnd(baseW),
deltaCells[i].padEnd(deltaW),
];
if (withPValue) cells.push(pCells[i].padEnd(pW));
return TERMINAL_TABLE_INDENT + cells.join(' ');
});
return [TERMINAL_TABLE_INDENT + headers.join(' '), TERMINAL_TABLE_INDENT + sep, ...rows].join(
'\n',
);
}
function formatTerminalCategoryTable(cats: FailureCategoryComparison[]): string {
const names = cats.map((c) => {
const isNew = c.baselineCount === 0 && c.prCount > 0;
return c.category + (isNew ? ' 🆕' : '');
});
const prCells = cats.map((c) => `${c.prCount} (${pct(c.prRate)}%)`);
const baseCells = cats.map((c) => `${c.baselineCount} (${pct(c.baselineRate)}%)`);
const deltaCells = cats.map((c) => {
const d = c.delta * 100;
const sign = d >= 0 ? '+' : '';
return `${sign}${d.toFixed(1)}pp`;
});
const nameW = maxWidth(names, 'category');
const prW = maxWidth(prCells, 'PR');
const baseW = maxWidth(baseCells, 'baseline');
const headers = ['category'.padEnd(nameW), 'PR'.padEnd(prW), 'baseline'.padEnd(baseW), 'Δ'];
const sep = [nameW, prW, baseW, maxWidth(deltaCells, 'Δ')].map((w) => '─'.repeat(w)).join(' ');
const rows = cats.map(
(_, i) =>
TERMINAL_TABLE_INDENT +
[
names[i].padEnd(nameW),
prCells[i].padEnd(prW),
baseCells[i].padEnd(baseW),
deltaCells[i],
].join(' '),
);
return [TERMINAL_TABLE_INDENT + headers.join(' '), TERMINAL_TABLE_INDENT + sep, ...rows].join(
'\n',
);
}
function maxWidth(values: string[], header: string): number {
return values.reduce((m, v) => Math.max(m, v.length), header.length);
}