feat(ai-builder): Add per-PR eval regression detection vs LangSmith baseline (#29456)

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
José Braulio González Valido 2026-05-06 09:15:08 +01:00 committed by GitHub
parent 5b01cba8b2
commit bbe3e2d148
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 2818 additions and 83 deletions

View File

@ -143,7 +143,7 @@ jobs:
--base-url "$BASE_URLS" \
--concurrency 32 \
--verbose \
--iterations 3 \
--iterations 5 \
${{ inputs.filter && format('--filter "{0}"', inputs.filter) || '' }}
- name: Stop n8n containers
@ -160,22 +160,16 @@ jobs:
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
RESULTS_FILE="packages/@n8n/instance-ai/eval-results.json"
if [ ! -f "$RESULTS_FILE" ]; then
echo "No eval results file found"
# The eval CLI writes the full PR comment as eval-pr-comment.md
# (see comparison/format.ts:formatComparisonMarkdown). It includes
# the alert, aggregate, comparison sections, per-test-case results
# collapsed, and failure details collapsed. CI just relays it.
COMMENT_FILE="packages/@n8n/instance-ai/eval-pr-comment.md"
if [ ! -f "$COMMENT_FILE" ]; then
echo "No PR comment file found (eval likely cancelled before writing results)"
exit 0
fi
# Build the full comment body with jq
jq -r '
"### Instance AI Workflow Eval Results\n\n" +
"**\(.summary.built)/\(.summary.testCases) built | \(.totalRuns) run(s) | pass@\(.totalRuns): \(.summary.passAtK * 100 | floor)% | pass^\(.totalRuns): \(.summary.passHatK * 100 | floor)% | iterations: \(.summary.passRatePerIter)**\n\n" +
"| Workflow | Build | pass@\(.totalRuns) | pass^\(.totalRuns) |\n|---|---|---|---|\n" +
([.testCases[] as $tc | "| \($tc.name) | \($tc.buildSuccessCount)/\($tc.totalRuns) | \(([$tc.scenarios[] | .passAtK] | add) / ($tc.scenarios | length) * 100 | floor)% | \(([$tc.scenarios[] | .passHatK] | add) / ($tc.scenarios | length) * 100 | floor)% |"] | join("\n")) +
"\n\n<details><summary>Failure details</summary>\n\n" +
([.testCases[] as $tc | $tc.scenarios[] | select(.passHatK < 1) | "**\($tc.name) / \(.name)** — \(.passCount)/\(.totalRuns) passed" + "\n" + ([.runs[] | select(.passed == false) | "> Run\(if .failureCategory then " [\(.failureCategory)]" else "" end): \(.reasoning | .[0:200])"] | join("\n"))] | join("\n\n")) +
"\n</details>"
' "$RESULTS_FILE" > /tmp/eval-comment.md
cp "$COMMENT_FILE" /tmp/eval-comment.md
# Find and update existing eval comment, or create new one
COMMENT_ID=$(gh api "repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \

1
.gitignore vendored
View File

@ -36,6 +36,7 @@ packages/testing/playwright/playwright-report
packages/testing/playwright/test-results
packages/testing/playwright/eval-results.json
packages/@n8n/instance-ai/eval-results.json
packages/@n8n/instance-ai/eval-pr-comment.md
packages/testing/playwright/.playwright-browsers
packages/testing/playwright/.playwright-cli
test-results/

View File

@ -121,7 +121,7 @@ dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai --iterations 3
| `--base-url` | `http://localhost:5678` | n8n instance URL |
| `--email` | E2E test owner | Override login email (or `N8N_EVAL_EMAIL`) |
| `--password` | E2E test owner | Override login password (or `N8N_EVAL_PASSWORD`) |
| `--timeout-ms` | `600000` | Per-test-case timeout |
| `--timeout-ms` | `900000` | Per-test-case timeout |
| `--output-dir` | cwd | Where to write `eval-results.json` |
| `--dataset` | `instance-ai-workflow-evals` | LangSmith dataset name |
| `--concurrency` | `16` | Max concurrent scenarios (builds are separately capped at 4) |
@ -155,6 +155,47 @@ Every run produces:
**LangSmith caveat:** if `LANGSMITH_API_KEY` is set in `.env.local`, local runs also land in the shared `instance-ai-workflow-evals` dataset. Unset it (or run without `dotenvx`) to keep exploratory runs out of team results.
## Regression detection
When `LANGSMITH_API_KEY` is set, every eval run automatically compares its results against the most recent pinned baseline (any experiment whose name starts with `instance-ai-baseline-`). Two output files are written:
- `eval-results.json` — structured data only, including `comparison.result` when a baseline was found.
- `eval-pr-comment.md` — the full PR comment rendered as markdown, including the alert, aggregate, comparison sections, per-test-case results, and failure details. Always written; falls back to a no-baseline summary when no comparison ran.
The CI PR-comment step uses `eval-pr-comment.md` as the entire comment body (no jq assembly in the workflow). The console output uses a separate aligned-text formatter — same data, no markdown noise in the terminal.
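For tooling that consumes the JSON instead of the markdown, `comparisonStatus` is the field to branch on. A minimal sketch (field names match what `writeEvalResults` emits in this PR; the consumer logic itself is illustrative):
```ts
import { readFileSync } from 'node:fs';

// Subset of the report shape written by writeEvalResults; unused fields omitted.
interface EvalResults {
	experimentName?: string;
	comparisonStatus: 'ok' | 'no_baseline' | 'self_baseline' | 'fetch_failed' | 'not_attempted';
	comparisonError?: string;
	comparison?: { baseline: string; result: { scenarios: Array<{ verdict: string }> } };
}

const report = JSON.parse(
	readFileSync('packages/@n8n/instance-ai/eval-results.json', 'utf8'),
) as EvalResults;

if (report.comparisonStatus === 'ok') {
	const hard = report.comparison?.result.scenarios.filter((s) => s.verdict === 'hard_regression');
	console.log(`${hard?.length ?? 0} hard regression(s) vs ${report.comparison?.baseline}`);
} else if (report.comparisonStatus === 'fetch_failed') {
	// Real outage, not a benign skip.
	console.warn(`Regression detection did not run: ${report.comparisonError}`);
} else {
	console.log(`Comparison skipped: ${report.comparisonStatus}`);
}
```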
### Refreshing the baseline
There is no auto-refresh — refresh explicitly when you want a new reference point, ideally with high N for low noise:
```bash
# From packages/@n8n/instance-ai/, on master at the version you want to pin
LANGSMITH_API_KEY=... dotenvx run -f ../../../.env.local -- \
pnpm eval:instance-ai --experiment-name instance-ai-baseline --iterations 10
```
LangSmith appends a random suffix (e.g. `instance-ai-baseline-7abc1234`); the most recently started one becomes the comparison target on the next eval run. The comparison is silently skipped on the baseline-creation run itself.
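The selection rule is simply "newest start time among the prefixed experiments wins". Independent of the LangSmith client (the real lookup is `findLatestBaseline` in `comparison/fetch-baseline.ts`), it amounts to roughly:
```ts
interface ExperimentInfo {
	name: string;
	startTime: Date;
}

// Hypothetical helper over already-fetched experiment metadata; the real
// findLatestBaseline lists the LangSmith projects itself and applies this rule.
function pickBaseline(experiments: ExperimentInfo[]): string | undefined {
	return experiments
		.filter((e) => e.name.startsWith('instance-ai-baseline-'))
		.sort((a, b) => b.startTime.getTime() - a.startTime.getTime())[0]?.name;
}
```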
### How scenarios are tiered
Each scenario lands in one of three regression tiers, evaluated in order of strictness:
- **Regression** — high-confidence flag, gating-grade. The drop must be statistically significant (chance of seeing it by noise < 5%), at least 30 percentage points in size, and the baseline must have been reliable (≥ 70% pass rate).
- **Soft regression** — a looser bar for visibility on borderline cases: relaxed confidence threshold (chance by noise < 20%), drop of at least 15 percentage points, baseline at least 50%. Often just natural variance; worth a glance only if your changes touch related code paths.
- **Notable movement** — any scenario whose pass rate moved by ≥ 35 percentage points without reaching either flag tier. Pure visibility, no implication of cause.
Other verdicts: `improvement` (PR significantly better, skips the reliability gate), `unreliable_baseline` (confident drop but baseline was too flaky to call a regression — surfaced but not flagged), `stable`, `insufficient_data`.
Why these tiers and not a flat percentage threshold? At the small N that PR runs use (5 iterations in CI), a flat threshold can't tell a real regression from coin-flip noise. The confidence cutoff filters out gaps that could plausibly happen by chance, and the reliability gate avoids chasing noise on already-flaky scenarios. The implementation lives in `comparison/statistics.ts` (Fisher's exact test for the confidence check, Wilson interval for the headline aggregate band). Tune the soft tier first if the false-positive rate looks off — keep the hard tier strict.
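Condensed into code, the per-scenario decision looks roughly like the sketch below. Thresholds mirror the prose above; the real `classifyScenario` in `comparison/statistics.ts` computes the p-value itself and also emits `improvement`, `unreliable_baseline`, and `insufficient_data`:
```ts
type Tier = { maxPValue: number; minDelta: number; minBaselinePassRate: number };

// Default thresholds from the prose above (hard: p < 5%, drop ≥ 30pp, baseline ≥ 70%;
// soft: p < 20%, drop ≥ 15pp, baseline ≥ 50%; notable movement at ≥ 35pp).
const HARD: Tier = { maxPValue: 0.05, minDelta: 0.3, minBaselinePassRate: 0.7 };
const SOFT: Tier = { maxPValue: 0.2, minDelta: 0.15, minBaselinePassRate: 0.5 };
const WATCH_DELTA = 0.35;

function tierOf(
	prPassRate: number,
	baselinePassRate: number,
	pValue: number, // Fisher's exact one-sided p for "PR is worse"
): 'hard_regression' | 'soft_regression' | 'watch' | 'stable' {
	const drop = baselinePassRate - prPassRate;
	const meets = (t: Tier) =>
		pValue <= t.maxPValue && drop >= t.minDelta && baselinePassRate >= t.minBaselinePassRate;
	if (meets(HARD)) return 'hard_regression';
	if (meets(SOFT)) return 'soft_regression';
	if (Math.abs(drop) >= WATCH_DELTA) return 'watch';
	return 'stable';
}

// From the test suite: PR 5/10 vs baseline 8/10 with Fisher p ≈ 0.18 lands in the soft tier.
tierOf(0.5, 0.8, 0.18); // 'soft_regression'
```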
### Failure-category drift
When both sides captured per-trial `failureCategory` values, the comparison also surfaces a run-level table of category rates (PR vs baseline). A category is marked **notable** when its absolute rate delta is ≥ 5 percentage points _and_ the count change beyond what scenario-count scaling would predict is ≥ 3 trials. This catches cross-scenario shifts (e.g. mock-generation breaking, or a model getting weaker overall) that per-scenario flags can miss.
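The check itself is two guards, mirroring `isCategoryNotable` in `comparison/compare.ts` (added in this PR):
```ts
// A 5pp rate gap alone is not enough; the raw count must also move by ≥ 3 trials
// beyond what scaling the baseline to the PR's trial total would predict.
function isCategoryNotable(
	prCount: number,
	prTotal: number,
	baselineCount: number,
	baselineTotal: number,
): boolean {
	const rateGap = Math.abs(prCount / prTotal - baselineCount / baselineTotal);
	if (rateGap < 0.05) return false;
	const expectedPrCount = baselineCount * (prTotal / baselineTotal);
	return Math.abs(prCount - expectedPrCount) >= 3;
}

// Worked example from the tests: framework_issue at 0/290 on the baseline vs 9/145 on
// the PR clears both bars (rate gap 6.2pp, count gap 9). A single stray failure on a
// tiny run (1/3 vs 0/270) clears the rate bar but not the count bar.
isCategoryNotable(9, 145, 0, 290); // true
isCategoryNotable(1, 3, 0, 270); // false
```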
### Best-effort
Comparison is logged and skipped on any LangSmith failure — it never fails the eval. It is also skipped when no baseline experiment exists yet.
## Pairwise evals
Pairwise evals score a built workflow against the dataset's `dos` / `donts`

View File

@ -0,0 +1,190 @@
import { compareBuckets, type ExperimentBucket, type ScenarioCounts } from '../comparison/compare';
function bucket(
name: string,
scenarios: ScenarioCounts[],
categories?: { totals: Record<string, number>; trialTotal: number },
): ExperimentBucket {
return {
experimentName: name,
scenarios: new Map(scenarios.map((s) => [`${s.testCaseFile}/${s.scenarioName}`, s])),
failureCategoryTotals: categories?.totals,
trialTotal: categories?.trialTotal,
};
}
function s(file: string, scenario: string, passed: number, total: number): ScenarioCounts {
return { testCaseFile: file, scenarioName: scenario, passed, total };
}
describe('compareBuckets', () => {
it('produces a clean intersection when both sides have the same scenarios', () => {
const pr = bucket('pr', [s('contact', 'happy', 8, 10), s('weather', 'happy', 1, 10)]);
const base = bucket('master', [s('contact', 'happy', 9, 10), s('weather', 'happy', 0, 10)]);
const result = compareBuckets(pr, base);
expect(result.scenarios).toHaveLength(2);
expect(result.prOnly).toEqual([]);
expect(result.baselineOnly).toEqual([]);
expect(result.aggregate.intersectionSize).toBe(2);
});
it('flags scenarios only present on one side', () => {
const pr = bucket('pr', [s('contact', 'happy', 5, 10)]);
const base = bucket('master', [s('contact', 'happy', 8, 10), s('weather', 'happy', 5, 10)]);
const result = compareBuckets(pr, base);
expect(result.scenarios).toHaveLength(1);
expect(result.scenarios[0].testCaseFile).toBe('contact');
expect(result.baselineOnly).toEqual([{ testCaseFile: 'weather', scenarioName: 'happy' }]);
expect(result.prOnly).toEqual([]);
});
it('aggregates only over the intersection, not over baseline-only or pr-only', () => {
const pr = bucket('pr', [s('contact', 'happy', 10, 10)]);
const base = bucket('master', [s('contact', 'happy', 5, 10), s('other', 'happy', 0, 10)]);
const result = compareBuckets(pr, base);
expect(result.aggregate.prAggregatePassRate).toBe(1);
expect(result.aggregate.baselineAggregatePassRate).toBe(0.5);
expect(result.aggregate.intersectionSize).toBe(1);
});
it('sorts scenarios with regressions first, then improvements, then stable', () => {
const pr = bucket('pr', [
s('a', 'stable', 10, 10),
s('b', 'regression', 0, 10),
s('c', 'improvement', 10, 10),
]);
const base = bucket('master', [
s('a', 'stable', 10, 10),
s('b', 'regression', 10, 10),
s('c', 'improvement', 0, 10),
]);
const result = compareBuckets(pr, base);
expect(result.scenarios.map((sc) => sc.scenarioName)).toEqual([
'regression',
'improvement',
'stable',
]);
});
it('returns insufficient_data when one side has zero trials for a scenario', () => {
const pr = bucket('pr', [s('contact', 'happy', 0, 0)]);
const base = bucket('master', [s('contact', 'happy', 10, 10)]);
const result = compareBuckets(pr, base);
expect(result.scenarios[0].verdict).toBe('insufficient_data');
});
it('returns no failure-category drift when either side lacks category totals', () => {
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10)]);
expect(compareBuckets(pr, base).failureCategories).toEqual([]);
});
it('flags a category as notable when both rate and trial-count gaps clear the bars', () => {
// Haiku-style shift: framework_issue 0/290 → 9/145.
// Rate gap: 6.2pp ≥ 5pp ✓. Expected PR count given baseline = 0 × (145/290) = 0; |9 - 0| = 9 ≥ 3 ✓.
const pr = bucket('pr', [s('a', 'happy', 50, 145)], {
totals: { framework_issue: 9 },
trialTotal: 145,
});
const base = bucket('master', [s('a', 'happy', 200, 290)], {
totals: { framework_issue: 0 },
trialTotal: 290,
});
const cats = compareBuckets(pr, base).failureCategories;
const fw = cats.find((c) => c.category === 'framework_issue');
expect(fw?.notable).toBe(true);
});
it('does not flag when the rate gap is below the 5pp bar', () => {
// 3/100 vs 2/100 = 1pp gap, count gap = 1 — neither bar cleared.
const pr = bucket('pr', [s('a', 'happy', 50, 100)], {
totals: { mock_issue: 3 },
trialTotal: 100,
});
const base = bucket('master', [s('a', 'happy', 50, 100)], {
totals: { mock_issue: 2 },
trialTotal: 100,
});
const cats = compareBuckets(pr, base).failureCategories;
expect(cats.find((c) => c.category === 'mock_issue')?.notable).toBe(false);
});
it('does not flag when the rate gap is large but the count gap is tiny (small N guard)', () => {
// PR 1/3 vs baseline 0/270 — rate gap = 33pp ≥ 5pp, but expected PR count = 0
// and observed = 1, count gap = 1 < 3. Should NOT flag — single trial on small N.
const pr = bucket('pr', [s('a', 'happy', 0, 3)], {
totals: { builder_issue: 1 },
trialTotal: 3,
});
const base = bucket('master', [s('a', 'happy', 270, 270)], {
totals: { builder_issue: 0 },
trialTotal: 270,
});
const cats = compareBuckets(pr, base).failureCategories;
expect(cats.find((c) => c.category === 'builder_issue')?.notable).toBe(false);
});
it('drops unknown categories with a console warning, keeps all known categories', () => {
const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
const pr = bucket('pr', [s('a', 'happy', 8, 10)], {
totals: { '-': 5, builder_issue: 2 },
trialTotal: 10,
});
const base = bucket('master', [s('a', 'happy', 8, 10)], {
totals: { builder_issue: 1 },
trialTotal: 10,
});
const cats = compareBuckets(pr, base).failureCategories;
// All five known categories are always present (some at 0/0 — renderer
// drops those). The unknown `-` category is dropped here with a warning.
expect(cats.map((c) => c.category).sort()).toEqual([
'build_failure',
'builder_issue',
'framework_issue',
'mock_issue',
'verification_failure',
]);
expect(warn).toHaveBeenCalledWith(expect.stringContaining('"-"'));
warn.mockRestore();
});
it('sorts notable categories before non-notable, then by absolute delta', () => {
const pr = bucket('pr', [s('a', 'happy', 50, 100)], {
totals: { framework_issue: 10, mock_issue: 4, builder_issue: 25 },
trialTotal: 100,
});
const base = bucket('master', [s('a', 'happy', 50, 100)], {
totals: { framework_issue: 0, mock_issue: 3, builder_issue: 22 },
trialTotal: 100,
});
const cats = compareBuckets(pr, base).failureCategories;
// framework_issue is the only notable one (rate gap 10pp, count gap 10).
expect(cats[0].category).toBe('framework_issue');
expect(cats[0].notable).toBe(true);
expect(cats.slice(1).every((c) => !c.notable)).toBe(true);
});
it('accepts custom tiered thresholds for tests', () => {
const pr = bucket('pr', [s('a', 'happy', 5, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10)]);
// Defaults: 5/10 vs 8/10 = -30pp drop, p ≈ 0.18 → soft_regression
// (passes soft maxPValue=0.20, soft minDelta=0.15, baseline 80% above soft 50%).
const defaults = compareBuckets(pr, base);
expect(defaults.scenarios[0].verdict).toBe('soft_regression');
// Stricter soft p-value cutoff excludes this case.
const stricter = compareBuckets(pr, base, {
soft: { maxPValue: 0.1, minDelta: 0.15, minBaselinePassRate: 0.5 },
});
expect(['stable', 'watch']).toContain(stricter.scenarios[0].verdict);
});
});

View File

@ -0,0 +1,458 @@
import {
compareBuckets,
type ComparisonOutcome,
type ComparisonResult,
type ExperimentBucket,
type ScenarioCounts,
} from '../comparison/compare';
import { formatComparisonMarkdown, formatComparisonTerminal } from '../comparison/format';
import type { MultiRunEvaluation, WorkflowTestCase, ScenarioResult } from '../types';
function ok(result: ComparisonResult): ComparisonOutcome {
return { kind: 'ok', result };
}
function slugMap(evaluation: MultiRunEvaluation, slugs: string[]): Map<WorkflowTestCase, string> {
return new Map(evaluation.testCases.map((tc, i) => [tc.testCase, slugs[i] ?? 'unknown']));
}
function bucket(name: string, scenarios: ScenarioCounts[]): ExperimentBucket {
return {
experimentName: name,
scenarios: new Map(scenarios.map((s) => [`${s.testCaseFile}/${s.scenarioName}`, s])),
};
}
function s(file: string, scenario: string, passed: number, total: number): ScenarioCounts {
return { testCaseFile: file, scenarioName: scenario, passed, total };
}
/** Minimal evaluation fixture matching the shape format.ts reads. */
function evaluation(
opts: {
totalRuns?: number;
testCases?: Array<{
prompt?: string;
buildSuccessCount?: number;
scenarios?: Array<{
name: string;
passCount: number;
passes: boolean[]; // per-iteration pass/fail
reasoning?: string;
failureCategory?: string;
}>;
}>;
} = {},
): MultiRunEvaluation {
const totalRuns = opts.totalRuns ?? 3;
return {
totalRuns,
testCases: (opts.testCases ?? []).map((tc) => {
const testCase = {
prompt: tc.prompt ?? 'Test workflow prompt',
complexity: 'medium' as const,
tags: [],
scenarios: (tc.scenarios ?? []).map((sa) => ({
name: sa.name,
description: '',
dataSetup: '',
successCriteria: '',
})),
} as WorkflowTestCase;
const buildSuccessCount = tc.buildSuccessCount ?? totalRuns;
const scenarios = (tc.scenarios ?? []).map((sa) => ({
scenario: testCase.scenarios.find((sc) => sc.name === sa.name)!,
passCount: sa.passCount,
passRate: totalRuns > 0 ? sa.passCount / totalRuns : 0,
passAtK: new Array(totalRuns).fill(sa.passCount > 0 ? 1 : 0) as number[],
passHatK: new Array(totalRuns).fill(sa.passCount === totalRuns ? 1 : 0) as number[],
runs: sa.passes.map(
(passed): ScenarioResult => ({
scenario: testCase.scenarios.find((sc) => sc.name === sa.name)!,
success: passed,
score: passed ? 1 : 0,
reasoning: sa.reasoning ?? '',
failureCategory: !passed ? sa.failureCategory : undefined,
}),
),
}));
return {
testCase,
workflowBuildSuccess: buildSuccessCount > 0,
scenarioResults: [],
scenarios,
runs: new Array(totalRuns).fill(null).map(() => ({
testCase,
workflowBuildSuccess: buildSuccessCount > 0,
scenarioResults: [],
})),
buildSuccessCount,
};
}),
};
}
describe('formatComparisonMarkdown', () => {
const evalFixture = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'a',
scenarios: [{ name: 'happy', passCount: 0, passes: [false, false, false] }],
},
],
});
it('renders heading, alert, aggregate, and a regression table', () => {
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
const base = bucket('master-abc', [s('a', 'happy', 10, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/### Instance AI Workflow Eval/);
expect(md).toMatch(/> \[!CAUTION\]/);
expect(md).toMatch(/1 regression/);
expect(md).toMatch(/\*\*Aggregate\*\*: 0\.0% PR vs 100\.0% baseline/);
expect(md).toMatch(/#### Regressions \(1\)/);
expect(md).toMatch(/`a\/happy`/);
expect(md).toMatch(/0\/3 \(0%\)/);
expect(md).toMatch(/-100pp ↓/);
});
it('uses TIP alert when there are only improvements', () => {
const pr = bucket('pr', [s('a', 'happy', 3, 3)]);
const base = bucket('master', [s('a', 'happy', 0, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/> \[!TIP\]/);
expect(md).toMatch(/1 improvement/);
expect(md).toMatch(/#### Improvements \(1\)/);
expect(md).toMatch(/\+100pp ↑/);
});
it('uses TIP alert with "0 regressions" when everything is stable', () => {
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/> \[!TIP\]/);
expect(md).toMatch(/0 regressions/);
expect(md).toMatch(/1 stable/);
expect(md).not.toMatch(/#### Regressions/);
});
it('renders LangSmith-disabled NOTE when outcome is undefined', () => {
const md = formatComparisonMarkdown(evalFixture);
expect(md).toMatch(/> \[!NOTE\]/);
expect(md).toMatch(/LangSmith disabled/);
expect(md).not.toMatch(/#### Regressions/);
});
it('renders distinct alerts per skip reason', () => {
const noBase = formatComparisonMarkdown(evalFixture, { kind: 'no_baseline' });
expect(noBase).toMatch(/> \[!NOTE\]/);
expect(noBase).toMatch(/No baseline configured/);
const selfBase = formatComparisonMarkdown(evalFixture, {
kind: 'self_baseline',
experimentName: 'instance-ai-baseline-abc',
});
expect(selfBase).toMatch(/> \[!NOTE\]/);
expect(selfBase).toMatch(/This run is the baseline/);
expect(selfBase).toMatch(/instance-ai-baseline-abc/);
const fetchFail = formatComparisonMarkdown(evalFixture, {
kind: 'fetch_failed',
error: 'LangSmith 503',
});
// fetch_failed is a real outage, not a benign skip — must be a WARNING.
expect(fetchFail).toMatch(/> \[!WARNING\]/);
expect(fetchFail).toMatch(/Regression detection did not run/);
expect(fetchFail).toMatch(/LangSmith 503/);
});
it('shows mixed-case alert when both regressions and improvements exist', () => {
const pr = bucket('pr', [s('a', 'happy', 0, 3), s('b', 'happy', 3, 3)]);
const base = bucket('master', [s('a', 'happy', 10, 10), s('b', 'happy', 0, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/> \[!CAUTION\]/);
expect(md).toMatch(/1 regression/);
expect(md).toMatch(/1 improvement/);
expect(md).toMatch(/#### Regressions/);
expect(md).toMatch(/#### Improvements/);
});
it('embeds commit SHA in heading when provided', () => {
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)), {
commitSha: 'abc1234567890def',
});
expect(md).toMatch(/### Instance AI Workflow Eval — `abc12345`/);
});
it('marks new failure categories with 🆕', () => {
const pr: ExperimentBucket = {
experimentName: 'pr',
scenarios: new Map([['a/happy', { ...s('a', 'happy', 0, 3) }]]),
failureCategoryTotals: { framework_issue: 9 },
trialTotal: 145,
};
const base: ExperimentBucket = {
experimentName: 'master',
scenarios: new Map([['a/happy', { ...s('a', 'happy', 5, 10) }]]),
failureCategoryTotals: { framework_issue: 0 },
trialTotal: 290,
};
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/#### Failure breakdown/);
expect(md).toMatch(/`framework_issue` 🆕/);
expect(md).toMatch(/\*\*notable\*\*/);
});
it('always includes all five tier counts in the alert line', () => {
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/0 regressions, 0 soft, 0 notable, 0 improvements, 1 stable/);
});
it('renders a per-scenario breakdown collapsible inside the regression section', () => {
const evalWithFailures = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'a',
scenarios: [
{
name: 'happy',
passCount: 0,
passes: [false, false, false],
reasoning: 'Builder produced an unsupported node configuration',
failureCategory: 'builder_issue',
},
],
},
],
});
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
const base = bucket('master', [s('a', 'happy', 10, 10)]);
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)), {
slugByTestCase: slugMap(evalWithFailures, ['a']),
});
expect(md).toMatch(/#### Regressions \(1\)/);
// The regression row's collapsible should appear inside the Regressions
// section, before the per-test-case section, and carry the same slug.
const regressionsIdx = md.indexOf('#### Regressions');
const perTcIdx = md.indexOf('Per-test-case results');
const breakdownIdx = md.indexOf('<code>a/happy</code>');
expect(breakdownIdx).toBeGreaterThan(regressionsIdx);
expect(breakdownIdx).toBeLessThan(perTcIdx);
expect(md).toMatch(/3 of 3 failed · 3× builder_issue/);
expect(md).toMatch(/Run 1 \[builder_issue\]: Builder produced/);
});
it('uses `file/scenario` slug headers in the bottom Failure details section', () => {
const evalWithFailures = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'Build a cross-team Linear report digest',
scenarios: [
{
name: 'no-cross-team-issues',
passCount: 0,
passes: [false, false, false],
reasoning: 'reason',
failureCategory: 'builder_issue',
},
],
},
],
});
const pr = bucket('pr', [s('cross-team-linear-report', 'no-cross-team-issues', 0, 3)]);
const base = bucket('master', [s('cross-team-linear-report', 'no-cross-team-issues', 10, 10)]);
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)), {
slugByTestCase: slugMap(evalWithFailures, ['cross-team-linear-report']),
});
expect(md).toMatch(/<summary>Failure details<\/summary>/);
expect(md).toMatch(/\*\*`cross-team-linear-report\/no-cross-team-issues`\*\* — 3 failed/);
});
it('attaches per-scenario failures to the right file slug when names collide', () => {
// Two test cases each defining `happy-path`. Without the slug map,
// the renderer would conflate them — Albert's review flagged this
// exact bug. With the map, each row's collapsible carries only that
// row's failures.
const evalWithFailures = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'cross-team prompt',
scenarios: [
{
name: 'happy-path',
passCount: 0,
passes: [false, false, false],
reasoning: 'Linear node misconfigured',
failureCategory: 'builder_issue',
},
],
},
{
prompt: 'weather prompt',
scenarios: [
{
name: 'happy-path',
passCount: 0,
passes: [false, false, false],
reasoning: 'Weather mock returned empty',
failureCategory: 'mock_issue',
},
],
},
],
});
const pr = bucket('pr', [
s('cross-team-linear-report', 'happy-path', 0, 3),
s('weather-monitoring', 'happy-path', 0, 3),
]);
const base = bucket('master', [
s('cross-team-linear-report', 'happy-path', 10, 10),
s('weather-monitoring', 'happy-path', 10, 10),
]);
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)), {
slugByTestCase: slugMap(evalWithFailures, ['cross-team-linear-report', 'weather-monitoring']),
});
// Each per-scenario collapsible (under the regression table) must show
// ONLY its own failures. Slice each block at its closing </details>.
function collapsibleFor(slug: string): string {
const open = md.indexOf(`<code>${slug}</code>`);
expect(open).toBeGreaterThan(-1);
const close = md.indexOf('</details>', open);
return md.slice(open, close);
}
const crossTeamBlock = collapsibleFor('cross-team-linear-report/happy-path');
const weatherBlock = collapsibleFor('weather-monitoring/happy-path');
expect(crossTeamBlock).toMatch(/Linear node misconfigured/);
expect(crossTeamBlock).not.toMatch(/Weather mock returned empty/);
expect(weatherBlock).toMatch(/Weather mock returned empty/);
expect(weatherBlock).not.toMatch(/Linear node misconfigured/);
});
it('uses the slug instead of the prompt in the per-test-case table', () => {
const evalFx = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'Build a cross-team Linear report digest from open issues',
scenarios: [{ name: 'happy', passCount: 0, passes: [false, false, false] }],
},
],
});
const pr = bucket('pr', [s('cross-team-linear-report', 'happy', 0, 3)]);
const base = bucket('master', [s('cross-team-linear-report', 'happy', 10, 10)]);
const md = formatComparisonMarkdown(evalFx, ok(compareBuckets(pr, base)), {
slugByTestCase: slugMap(evalFx, ['cross-team-linear-report']),
});
// Per-test-case table cell should be the slug, not the prompt.
const perTcSection = md.slice(md.indexOf('Per-test-case results'));
expect(perTcSection).toMatch(/`cross-team-linear-report`/);
expect(perTcSection).not.toMatch(/Build a cross-team Linear report digest/);
});
it('skips per-scenario breakdown when slugByTestCase is omitted', () => {
// Without the slug map, the renderer can't disambiguate. We'd rather
// drop the breakdown than show a wrong one.
const evalWithFailures = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'a',
scenarios: [
{
name: 'happy',
passCount: 0,
passes: [false, false, false],
reasoning: 'Some failure',
failureCategory: 'builder_issue',
},
],
},
],
});
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
const base = bucket('master', [s('a', 'happy', 10, 10)]);
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)));
// Regression table still rendered.
expect(md).toMatch(/#### Regressions \(1\)/);
// But no per-scenario collapsible (which would have used <code>a/happy</code>
// with the breakdown summary text).
expect(md).not.toMatch(/3 of 3 failed · 3× builder_issue/);
});
it('renders the failure breakdown for non-notable categories with non-zero counts', () => {
// 50/100 vs 50/100 — no scenario regression, but still has builder_issue
// counts on both sides (non-notable but non-zero).
const pr: ExperimentBucket = {
experimentName: 'pr',
scenarios: new Map([['a/happy', { ...s('a', 'happy', 50, 100) }]]),
failureCategoryTotals: { builder_issue: 25 },
trialTotal: 100,
};
const base: ExperimentBucket = {
experimentName: 'master',
scenarios: new Map([['a/happy', { ...s('a', 'happy', 50, 100) }]]),
failureCategoryTotals: { builder_issue: 22 },
trialTotal: 100,
};
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/#### Failure breakdown/);
expect(md).toMatch(/`builder_issue`/);
// builder_issue isn't notable here, so no "notable" marker.
expect(md).not.toMatch(/builder_issue.*notable/);
});
});
describe('formatComparisonTerminal', () => {
const evalFixture = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'a',
scenarios: [{ name: 'happy', passCount: 0, passes: [false, false, false] }],
},
],
});
it('renders title, verdict, aggregate, and regression table without markdown syntax', () => {
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
const base = bucket('master-abc', [s('a', 'happy', 10, 10)]);
const out = formatComparisonTerminal(evalFixture, ok(compareBuckets(pr, base)));
expect(out).toMatch(/^Instance AI Workflow Eval/);
expect(out).toMatch(/▶ 1 regression/);
expect(out).toMatch(/PR\s{8}0\.0%/);
expect(out).toMatch(/baseline\s{2}100\.0%/);
expect(out).toMatch(/REGRESSIONS/);
expect(out).toMatch(/a\/happy/);
expect(out).not.toMatch(/^###/m);
expect(out).not.toMatch(/\| /);
});
it('renders LangSmith-disabled message when outcome is undefined', () => {
const out = formatComparisonTerminal(evalFixture);
expect(out).toMatch(/LangSmith disabled/);
expect(out).not.toMatch(/REGRESSIONS/);
});
it('shows partial banner when scenarios differ on each side', () => {
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10), s('b', 'happy', 5, 10)]);
const out = formatComparisonTerminal(evalFixture, ok(compareBuckets(pr, base)));
expect(out).toMatch(/partial: 1 baseline scenarios not run by PR/);
});
});

View File

@ -0,0 +1,161 @@
import {
classifyScenario,
fishersExactOneSidedLeft,
wilsonInterval,
} from '../comparison/statistics';
describe('fishersExactOneSidedLeft', () => {
it('returns 1 when either row is empty (no information)', () => {
expect(fishersExactOneSidedLeft(0, 0, 5, 5)).toBe(1);
expect(fishersExactOneSidedLeft(5, 5, 0, 0)).toBe(1);
});
it('returns 1 when no failures or no passes are observed (no test possible)', () => {
expect(fishersExactOneSidedLeft(3, 0, 5, 0)).toBe(1);
expect(fishersExactOneSidedLeft(0, 3, 0, 5)).toBe(1);
});
it('matches a known textbook case', () => {
// 2x2 table where PR (1/3) is much worse than baseline (10/10).
// Hypergeometric: P(X = 0) + P(X = 1) | drawn=3 from passes=11, fails=2
// = C(11,0)C(2,3)/C(13,3) + C(11,1)C(2,2)/C(13,3)
// = 0 + 11/286 ≈ 0.03846
const p = fishersExactOneSidedLeft(1, 2, 10, 0);
expect(p).toBeCloseTo(0.03846, 4);
});
it('returns p = 1 when PR pass rate equals baseline at maximum', () => {
// PR all pass, baseline all pass — under H0 the observed PR is the most likely outcome,
// so the left-tail (X ≤ a) p-value is exactly 1.
const p = fishersExactOneSidedLeft(5, 0, 5, 0);
expect(p).toBe(1);
});
it('detects a strong regression with high N', () => {
// PR 0/10, baseline 10/10 — extremely strong evidence PR is worse.
const p = fishersExactOneSidedLeft(0, 10, 10, 0);
expect(p).toBeLessThan(0.001);
});
it('returns a p-value above 0.5 when PR matches baseline rates exactly', () => {
// PR 5/10, baseline 5/10 — by symmetry the left tail is 0.5 plus half the mass at
// the observed value, so comfortably above 0.5 (we're at the center of the distribution).
const p = fishersExactOneSidedLeft(5, 5, 5, 5);
expect(p).toBeGreaterThan(0.5);
});
});
describe('wilsonInterval', () => {
it('returns [0, 1] for total=0', () => {
expect(wilsonInterval(0, 0)).toEqual({ lower: 0, upper: 1 });
});
it('produces reasonable bounds for 5/10', () => {
const ci = wilsonInterval(5, 10);
// Known Wilson 95% CI for 5/10: roughly [0.237, 0.763]
expect(ci.lower).toBeCloseTo(0.237, 2);
expect(ci.upper).toBeCloseTo(0.763, 2);
});
it('produces tight bounds for 0/100', () => {
const ci = wilsonInterval(0, 100);
expect(ci.lower).toBe(0);
expect(ci.upper).toBeLessThan(0.05);
});
it('produces tight bounds for 100/100', () => {
const ci = wilsonInterval(100, 100);
// upper analytically equals 1 but lands slightly under it after FP rounding —
// any reasonable CI for 100/100 should still be tight to the top of the range.
expect(ci.upper).toBeGreaterThanOrEqual(0.99);
expect(ci.lower).toBeGreaterThan(0.95);
});
it('throws when passes > total', () => {
expect(() => wilsonInterval(5, 3)).toThrow();
});
});
describe('classifyScenario', () => {
it('flags a clear regression on a reliable scenario as hard_regression', () => {
const result = classifyScenario(0, 10, 10, 10);
expect(result.verdict).toBe('hard_regression');
expect(result.delta).toBe(-1);
});
it('marks a hard-significant drop on an unreliable baseline as unreliable_baseline', () => {
// Baseline 4/10 (40%) — below hard reliable (70%). PR 0/10 is a 40pp drop with
// Fisher p < 0.05. We surface it as `unreliable_baseline` rather than flagging.
const result = classifyScenario(0, 10, 4, 10);
expect(result.verdict).toBe('unreliable_baseline');
});
it('reports stable when the drop is sub-MDE on a flaky baseline', () => {
// Baseline 1/10 (flaky), PR 0/10 — only a 10pp drop, below MDE.
const result = classifyScenario(0, 10, 1, 10);
expect(result.verdict).toBe('stable');
});
it('does not flag a small drop below the soft MDE threshold', () => {
// 9/10 vs 10/10 = 10pp drop, below soft MDE (15pp).
const result = classifyScenario(9, 10, 10, 10);
expect(result.verdict).toBe('stable');
});
it('flags an improvement when PR is significantly better', () => {
const result = classifyScenario(10, 10, 0, 10);
expect(result.verdict).toBe('improvement');
});
it('flags improvement even on a never-passing baseline', () => {
// "Never passes" baseline (0/10) — fix is worth surfacing without the reliability gate.
const result = classifyScenario(8, 10, 0, 10);
expect(result.verdict).toBe('improvement');
});
it('returns insufficient_data when either side has no trials', () => {
expect(classifyScenario(0, 0, 5, 10).verdict).toBe('insufficient_data');
expect(classifyScenario(5, 10, 0, 0).verdict).toBe('insufficient_data');
});
it('flags the most extreme outcome at minimum N as hard_regression', () => {
// PR 0/3 vs baseline 3/3 — Fisher one-sided p ≈ 0.05, delta = -100pp.
const result = classifyScenario(0, 3, 3, 3);
expect(result.verdict).toBe('hard_regression');
});
it('reports stable when N is small enough that even a full flip is sub-significant for soft tier', () => {
// PR 1/2 vs baseline 2/2 — delta -50pp but Fisher p ≈ 0.5 (way above soft α=0.20).
// Soft MDE met, but significance fails on both tiers.
const result = classifyScenario(1, 2, 2, 2);
expect(['stable', 'watch']).toContain(result.verdict);
});
it('marks soft regression when hard delta is missed but soft thresholds met', () => {
// 6/10 vs 10/10 = 40pp drop, p ≈ 0.043, baseline 100% reliable.
// Hard defaults would flag this; force a stricter hard delta to push it to soft.
const result = classifyScenario(6, 10, 10, 10, {
hard: { maxPValue: 0.05, minDelta: 0.5, minBaselinePassRate: 0.7 },
soft: { maxPValue: 0.2, minDelta: 0.15, minBaselinePassRate: 0.5 },
});
expect(result.verdict).toBe('soft_regression');
});
it('marks watch when delta crosses the watch threshold without significance', () => {
// 5/10 vs 7/10 = -20pp drop, p ≈ 0.32 — not significant for hard or soft.
// Default watchDelta is 0.35, so this should not be `watch`. Force it via
// a smaller threshold to validate the path.
const result = classifyScenario(5, 10, 7, 10, { watchDelta: 0.15 });
expect(result.verdict).toBe('watch');
});
it('respects custom hard-tier delta override', () => {
// 7/10 vs 10/10 = 30pp delta. Default hard minDelta is 0.3, so this barely qualifies.
// With hard.minDelta 0.4, it drops into `soft_regression` (still passes soft 0.15 minDelta).
// p ≈ 0.105 < soft maxPValue (0.2), so soft fires.
const result = classifyScenario(7, 10, 10, 10, {
hard: { minDelta: 0.4 },
});
expect(result.verdict).toBe('soft_regression');
});
});

View File

@ -45,7 +45,7 @@ export interface CliArgs {
// ---------------------------------------------------------------------------
const cliArgsSchema = z.object({
timeoutMs: z.number().int().positive().default(600_000),
timeoutMs: z.number().int().positive().default(900_000),
baseUrls: z.array(z.string().url()).min(1).default(['http://localhost:5678']),
email: z.string().optional(),
password: z.string().optional(),
@ -104,7 +104,7 @@ interface RawArgs {
function parseRawArgs(argv: string[]): RawArgs {
const result: RawArgs = {
timeoutMs: 600_000,
timeoutMs: 900_000,
baseUrls: ['http://localhost:5678'],
verbose: false,
keepWorkflows: false,

View File

@ -23,6 +23,15 @@ import { buildCIMetadata, computeExperimentPrefix } from './ci-metadata';
import { LaneAllocator } from './lane-allocator';
import { expandWithIterations, partitionRoundRobin } from './lanes';
import { N8nClient } from '../clients/n8n-client';
import {
compareBuckets,
type ComparisonOutcome,
type ComparisonResult,
type ExperimentBucket,
type ScenarioCounts,
} from '../comparison/compare';
import { fetchBaselineBucket, findLatestBaseline } from '../comparison/fetch-baseline';
import { formatComparisonMarkdown, formatComparisonTerminal } from '../comparison/format';
import { seedCredentials, cleanupCredentials } from '../credentials/seeder';
import { loadWorkflowTestCasesWithFiles } from '../data/workflows';
import type { WorkflowTestCaseWithFile } from '../data/workflows';
@ -43,6 +52,7 @@ import type {
MultiRunEvaluation,
ScenarioResult,
TestScenario,
WorkflowTestCase,
WorkflowTestCaseResult,
} from '../types';
@ -160,21 +170,40 @@ async function main(): Promise<void> {
const hasLangSmith = Boolean(process.env.LANGSMITH_API_KEY);
let evaluation: MultiRunEvaluation;
let experimentName: string | undefined;
let outcome: ComparisonOutcome | undefined;
let slugByTestCase: Map<WorkflowTestCase, string> | undefined;
if (hasLangSmith) {
logger.info('LangSmith API key detected, using evaluate() with experiment tracking');
evaluation = await runWithLangSmith({ args, lanes, logger });
const langsmithRun = await runWithLangSmith({ args, lanes, logger });
evaluation = langsmithRun.evaluation;
experimentName = langsmithRun.experimentName;
outcome = langsmithRun.outcome;
slugByTestCase = langsmithRun.slugByTestCase;
} else {
logger.info('No LANGSMITH_API_KEY, running direct loop (results in eval-results.json only)');
evaluation = await runDirectLoop({ args, lanes, logger });
}
const totalDuration = Date.now() - startTime;
const outputPath = writeEvalResults(evaluation, totalDuration, args.outputDir);
console.log(`Results: ${outputPath}`);
const commitSha = process.env.LANGSMITH_REVISION_ID ?? process.env.GITHUB_SHA;
const { jsonPath, prCommentPath } = writeEvalResults(
evaluation,
totalDuration,
args.outputDir,
experimentName,
outcome,
commitSha,
slugByTestCase,
);
console.log(`Results: ${jsonPath}`);
console.log(`PR comment: ${prCommentPath}`);
const htmlPath = writeWorkflowReport(flattenRunsForReport(evaluation));
console.log(`Report: ${htmlPath}`);
printSummary(evaluation);
console.log(`Report: ${htmlPath}`);
console.log(
'\n' + formatComparisonTerminal(evaluation, outcome, { commitSha, slugByTestCase }),
);
} finally {
await Promise.all(
lanes.map(async (lane) => {
@ -188,7 +217,12 @@ async function main(): Promise<void> {
// LangSmith mode: evaluate() with dataset sync, tracing, experiments
// ---------------------------------------------------------------------------
async function runWithLangSmith(config: RunConfig): Promise<MultiRunEvaluation> {
async function runWithLangSmith(config: RunConfig): Promise<{
evaluation: MultiRunEvaluation;
experimentName: string;
outcome: ComparisonOutcome;
slugByTestCase: Map<WorkflowTestCase, string>;
}> {
const { args, lanes, logger } = config;
const lsClient = new Client();
@ -466,7 +500,24 @@ async function runWithLangSmith(config: RunConfig): Promise<MultiRunEvaluation>
logger,
});
return evaluation;
const outcome = await tryRunComparison({
lsClient,
prExperimentName: experimentResults.experimentName,
evaluation,
testCasesWithFiles,
logger,
});
const slugByTestCase = new Map<WorkflowTestCase, string>(
testCasesWithFiles.map(({ testCase, fileSlug }) => [testCase, fileSlug]),
);
return {
evaluation,
experimentName: experimentResults.experimentName,
outcome,
slugByTestCase,
};
} finally {
if (!args.keepWorkflows) {
await Promise.all(
@ -826,15 +877,22 @@ function computePassRatePerIter(evaluation: MultiRunEvaluation): string {
function writeEvalResults(
evaluation: MultiRunEvaluation,
duration: number,
outputDir?: string,
): string {
outputDir: string | undefined,
experimentName: string | undefined,
outcome: ComparisonOutcome | undefined,
commitSha: string | undefined,
slugByTestCase: Map<WorkflowTestCase, string> | undefined,
): { jsonPath: string; prCommentPath: string } {
const { totalRuns, testCases } = evaluation;
const metrics = computeAggregateMetrics(evaluation);
const result = outcome?.kind === 'ok' ? outcome.result : undefined;
const report = {
timestamp: new Date().toISOString(),
duration,
totalRuns,
experimentName,
summary: {
testCases: testCases.length,
built: metrics.built,
@ -843,6 +901,19 @@ function writeEvalResults(
passHatK: metrics.passHatK,
passRatePerIter: metrics.passRatePerIter,
},
// Structured comparison payload only — the rendered markdown lives in
// the sibling `eval-pr-comment.md` file so consumers can pick the format
// they want without re-running the eval. `comparisonStatus` records why
// the comparison was skipped when applicable, so JSON consumers can
// distinguish "no baseline yet" from "regression detection broke".
comparison: result
? {
baseline: result.baseline.experimentName,
result: serializeComparison(result),
}
: undefined,
comparisonStatus: outcome?.kind ?? 'not_attempted',
comparisonError: outcome?.kind === 'fetch_failed' ? outcome.error : undefined,
testCases: testCases.map((tc) => ({
name: tc.testCase.prompt.slice(0, 70),
buildSuccessCount: tc.buildSuccessCount,
@ -868,74 +939,137 @@ function writeEvalResults(
const targetDir = outputDir ?? process.cwd();
mkdirSync(targetDir, { recursive: true });
const outputPath = join(targetDir, 'eval-results.json');
writeFileSync(outputPath, JSON.stringify(report, null, 2));
return outputPath;
const jsonPath = join(targetDir, 'eval-results.json');
writeFileSync(jsonPath, JSON.stringify(report, null, 2));
// Always write the rendered PR comment — the markdown formatter handles
// both with-comparison and no-baseline cases. CI consumes this file
// directly; local users get a copy-pasteable artifact.
const prCommentPath = join(targetDir, 'eval-pr-comment.md');
writeFileSync(
prCommentPath,
formatComparisonMarkdown(evaluation, outcome, { commitSha, slugByTestCase }),
);
return { jsonPath, prCommentPath };
}
/**
* Convert ComparisonResult into a JSON-serializable shape (Maps don't survive
* JSON.stringify by default).
*/
function serializeComparison(result: ComparisonResult): {
pr: { experimentName: string };
baseline: { experimentName: string };
aggregate: ComparisonResult['aggregate'];
scenarios: ComparisonResult['scenarios'];
prOnly: ComparisonResult['prOnly'];
baselineOnly: ComparisonResult['baselineOnly'];
failureCategories: ComparisonResult['failureCategories'];
} {
return {
pr: result.pr,
baseline: result.baseline,
aggregate: result.aggregate,
scenarios: result.scenarios,
prOnly: result.prOnly,
baselineOnly: result.baselineOnly,
failureCategories: result.failureCategories,
};
}
// ---------------------------------------------------------------------------
// Console summary
// Comparison vs the pinned baseline experiment
// ---------------------------------------------------------------------------
function printSummary(evaluation: MultiRunEvaluation): void {
const { totalRuns, testCases } = evaluation;
const multiRun = totalRuns > 1;
const metrics = computeAggregateMetrics(evaluation);
/**
* Best-effort comparison. Returns a tagged outcome so the PR comment can
* distinguish "no baseline yet" / "this run IS the baseline" from a real
* regression-detection outage (LangSmith down, fetch failure). Never throws;
* the eval run is not gated on the comparison.
*/
async function tryRunComparison(config: {
lsClient: Client;
prExperimentName: string;
evaluation: MultiRunEvaluation;
testCasesWithFiles: WorkflowTestCaseWithFile[];
logger: EvalLogger;
}): Promise<ComparisonOutcome> {
const { lsClient, prExperimentName, evaluation, testCasesWithFiles, logger } = config;
console.log('\n=== Workflow Eval Results ===\n');
for (const tc of testCases) {
console.log(`${tc.testCase.prompt.slice(0, 70)}...`);
if (multiRun) {
console.log(` Build: ${String(tc.buildSuccessCount)}/${String(totalRuns)} runs`);
} else {
const r = tc.runs[0];
const buildStatus = r.workflowBuildSuccess ? 'BUILT' : 'BUILD FAILED';
console.log(` Workflow: ${buildStatus}${r.workflowId ? ` (${r.workflowId})` : ''}`);
if (r.buildError) {
console.log(` Error: ${r.buildError.slice(0, 200)}`);
}
try {
const baselineName = await findLatestBaseline(lsClient);
if (!baselineName) {
logger.verbose(
'No baseline experiment found — skipping comparison. ' +
'Run with --experiment-name instance-ai-baseline to create one.',
);
return { kind: 'no_baseline' };
}
if (baselineName === prExperimentName) {
logger.verbose('Current run is the baseline — skipping comparison.');
return { kind: 'self_baseline', experimentName: baselineName };
}
logger.info(`Comparing against baseline: ${baselineName}`);
const baseline = await fetchBaselineBucket(lsClient, baselineName);
const pr = bucketFromEvaluation(evaluation, testCasesWithFiles, prExperimentName);
return { kind: 'ok', result: compareBuckets(pr, baseline) };
} catch (error: unknown) {
const msg = error instanceof Error ? error.message : String(error);
logger.warn(`Comparison vs baseline failed: ${msg}`);
return { kind: 'fetch_failed', error: msg };
}
}
/**
* Project the in-memory MultiRunEvaluation onto the bucket shape used by
* fetchBaselineBucket, keyed by `${fileSlug}/${scenarioName}`.
*
* Looks up `fileSlug` by test case reference rather than array index: the
* comparison key depends on getting the right slug, and zipping by index
* silently miscompares if anything ever reorders the aggregate.
*/
function bucketFromEvaluation(
evaluation: MultiRunEvaluation,
testCasesWithFiles: WorkflowTestCaseWithFile[],
experimentName: string,
): ExperimentBucket {
const slugByTestCase = new Map(
testCasesWithFiles.map(({ testCase, fileSlug }) => [testCase, fileSlug]),
);
const scenarios = new Map<string, ScenarioCounts>();
const failureCategoryTotals: Record<string, number> = {};
let trialTotal = 0;
for (const tc of evaluation.testCases) {
const fileSlug = slugByTestCase.get(tc.testCase);
if (!fileSlug) {
throw new Error(
`bucketFromEvaluation: no fileSlug for test case "${tc.testCase.prompt.slice(0, 60)}"`,
);
}
const total = tc.runs.length;
for (const sa of tc.scenarios) {
if (multiRun) {
const passAtK = Math.round((sa.passAtK[metrics.kIndex] ?? 0) * 100);
const passHatK = Math.round((sa.passHatK[metrics.kIndex] ?? 0) * 100);
console.log(
` ${sa.scenario.name}: ${String(sa.passCount)}/${String(totalRuns)} passed` +
` | pass@${String(totalRuns)}: ${String(passAtK)}% | pass^${String(totalRuns)}: ${String(passHatK)}%`,
);
} else {
const sr = sa.runs[0];
const icon = sr.success ? '✓' : '✗';
const category = sr.failureCategory ? ` [${sr.failureCategory}]` : '';
console.log(
` ${icon} ${sr.scenario.name}: ${sr.success ? 'PASS' : 'FAIL'}${category} (${String(sr.score * 100)}%)`,
);
if (!sr.success) {
const execErrors = sr.evalResult?.errors ?? [];
if (execErrors.length > 0) {
console.log(` Error: ${execErrors.join('; ').slice(0, 200)}`);
}
console.log(` Diagnosis: ${sr.reasoning.slice(0, 200)}`);
const key = `${fileSlug}/${sa.scenario.name}`;
const failureCategories: Record<string, number> = {};
for (const sr of sa.runs) {
trialTotal++;
if (!sr.success && sr.failureCategory) {
failureCategories[sr.failureCategory] = (failureCategories[sr.failureCategory] ?? 0) + 1;
failureCategoryTotals[sr.failureCategory] =
(failureCategoryTotals[sr.failureCategory] ?? 0) + 1;
}
}
scenarios.set(key, {
testCaseFile: fileSlug,
scenarioName: sa.scenario.name,
passed: sa.passCount,
total,
failureCategories,
});
}
console.log('');
}
if (multiRun) {
console.log(
`${String(metrics.built)}/${String(testCases.length)} built | pass@${String(totalRuns)}: ${String(Math.round(metrics.passAtK * 100))}% | pass^${String(totalRuns)}: ${String(Math.round(metrics.passHatK * 100))}% | iterations: ${metrics.passRatePerIter}`,
);
} else {
const allScenarios = testCases.flatMap((tc) => tc.scenarios);
const passed = allScenarios.filter((s) => s.runs[0]?.success).length;
const total = metrics.scenariosTotal;
console.log(
`${String(metrics.built)}/${String(testCases.length)} built | ${String(passed)}/${String(total)} passed (${String(total > 0 ? Math.round((passed / total) * 100) : 0)}%)`,
);
}
return { experimentName, scenarios, failureCategoryTotals, trialTotal };
}
main().catch((error) => {

View File

@ -0,0 +1,333 @@
// ---------------------------------------------------------------------------
// Comparison core: take two experiment buckets, return a ComparisonResult.
//
// Pure function, no I/O. The tier thresholds (p-value cutoff, minimum delta,
// minimum baseline pass rate) live in statistics.ts — there's no CLI knob.
// Tune them there if the false-positive rate drifts.
// ---------------------------------------------------------------------------
import {
classifyScenario,
wilsonInterval,
type ClassifyOptions,
type ScenarioClassification,
type ScenarioVerdict,
} from './statistics';
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
export interface ScenarioCounts {
testCaseFile: string;
scenarioName: string;
passed: number;
total: number;
failureCategories?: Record<string, number>;
}
export interface ExperimentBucket {
experimentName: string;
scenarios: Map<string, ScenarioCounts>;
/**
* Aggregated failure-category counts across all trials in all scenarios.
* Used for the run-level failure-category drift table, orthogonal to
* per-scenario verdicts.
*/
failureCategoryTotals?: Record<string, number>;
trialTotal?: number;
}
export interface ScenarioComparison extends ScenarioClassification {
testCaseFile: string;
scenarioName: string;
prPasses: number;
prTotal: number;
baselinePasses: number;
baselineTotal: number;
}
export interface AggregateComparison {
intersectionSize: number;
prAggregatePassRate: number;
baselineAggregatePassRate: number;
prAggregateCI: { lower: number; upper: number };
baselineAggregateCI: { lower: number; upper: number };
delta: number;
}
export interface FailureCategoryComparison {
category: string;
prCount: number;
prRate: number; // count / trialTotal
baselineCount: number;
baselineRate: number;
delta: number; // prRate - baselineRate
notable: boolean;
}
export interface ComparisonResult {
pr: { experimentName: string };
baseline: { experimentName: string };
aggregate: AggregateComparison;
scenarios: ScenarioComparison[];
prOnly: Array<{ testCaseFile: string; scenarioName: string }>;
baselineOnly: Array<{ testCaseFile: string; scenarioName: string }>;
failureCategories: FailureCategoryComparison[];
}
/**
* Result of a comparison attempt. The `kind` field distinguishes between
* "ran successfully", "skipped intentionally" (no baseline yet, current run
* IS the baseline), and "failed unexpectedly" (LangSmith API error, fetch
* timeout, etc.). The PR comment renders a different alert per kind so
* readers can tell a missing baseline from a regression-detection outage.
*/
export type ComparisonOutcome =
| { kind: 'ok'; result: ComparisonResult }
| { kind: 'no_baseline' }
| { kind: 'self_baseline'; experimentName: string }
| { kind: 'fetch_failed'; error: string };
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/** Hard regressions only — high-confidence, gating-grade flags. */
export function hardRegressions(result: ComparisonResult): ScenarioComparison[] {
return result.scenarios.filter((s) => s.verdict === 'hard_regression');
}
/** Soft regressions — looser thresholds, worth investigating but not gating. */
export function softRegressions(result: ComparisonResult): ScenarioComparison[] {
return result.scenarios.filter((s) => s.verdict === 'soft_regression');
}
/** Movement ≥ watchDelta without reaching a flag tier. Visibility only. */
export function watchList(result: ComparisonResult): ScenarioComparison[] {
return result.scenarios.filter((s) => s.verdict === 'watch');
}
export function improvements(result: ComparisonResult): ScenarioComparison[] {
return result.scenarios.filter((s) => s.verdict === 'improvement');
}
export function byVerdict(result: ComparisonResult): Record<ScenarioVerdict, number> {
const counts: Record<ScenarioVerdict, number> = {
hard_regression: 0,
soft_regression: 0,
watch: 0,
improvement: 0,
stable: 0,
unreliable_baseline: 0,
insufficient_data: 0,
};
for (const s of result.scenarios) counts[s.verdict]++;
return counts;
}
// ---------------------------------------------------------------------------
// Compare
// ---------------------------------------------------------------------------
/**
* Compare two experiment buckets and produce a structured comparison result.
*
* Aggregate is computed over the *intersection* of scenarios, the only
* scenarios for which the rates are directly comparable. PR-only and
* baseline-only scenarios are surfaced separately, not folded into the
* aggregate.
*
* Aggregate pass rate is the *micro* average: total passes / total trials
* across the intersection.
*
* `options` exists for tests; production callers pass nothing.
*/
export function compareBuckets(
pr: ExperimentBucket,
baseline: ExperimentBucket,
options: ClassifyOptions = {},
): ComparisonResult {
const scenarios: ScenarioComparison[] = [];
const prOnly: Array<{ testCaseFile: string; scenarioName: string }> = [];
const baselineOnly: Array<{ testCaseFile: string; scenarioName: string }> = [];
let prIPasses = 0;
let prITotal = 0;
let baseIPasses = 0;
let baseITotal = 0;
for (const [key, prCounts] of pr.scenarios) {
const baseCounts = baseline.scenarios.get(key);
if (!baseCounts) {
prOnly.push({
testCaseFile: prCounts.testCaseFile,
scenarioName: prCounts.scenarioName,
});
continue;
}
prIPasses += prCounts.passed;
prITotal += prCounts.total;
baseIPasses += baseCounts.passed;
baseITotal += baseCounts.total;
const classification = classifyScenario(
prCounts.passed,
prCounts.total,
baseCounts.passed,
baseCounts.total,
options,
);
scenarios.push({
testCaseFile: prCounts.testCaseFile,
scenarioName: prCounts.scenarioName,
prPasses: prCounts.passed,
prTotal: prCounts.total,
baselinePasses: baseCounts.passed,
baselineTotal: baseCounts.total,
...classification,
});
}
for (const [key, baseCounts] of baseline.scenarios) {
if (!pr.scenarios.has(key)) {
baselineOnly.push({
testCaseFile: baseCounts.testCaseFile,
scenarioName: baseCounts.scenarioName,
});
}
}
const aggregate: AggregateComparison = {
intersectionSize: scenarios.length,
prAggregatePassRate: rate(prIPasses, prITotal),
baselineAggregatePassRate: rate(baseIPasses, baseITotal),
prAggregateCI: wilsonInterval(prIPasses, prITotal),
baselineAggregateCI: wilsonInterval(baseIPasses, baseITotal),
delta: rate(prIPasses, prITotal) - rate(baseIPasses, baseITotal),
};
scenarios.sort(scenarioComparator);
const failureCategories = compareFailureCategories(pr, baseline);
return {
pr: { experimentName: pr.experimentName },
baseline: { experimentName: baseline.experimentName },
aggregate,
scenarios,
prOnly,
baselineOnly,
failureCategories,
};
}
// ---------------------------------------------------------------------------
// Failure-category drift
// ---------------------------------------------------------------------------
/** Min absolute rate gap to consider a category notable (5 percentage points). */
const CATEGORY_NOTABLE_RATE_DELTA = 0.05;
/** Min absolute trial-count gap (over scaling) required alongside the rate gap. */
const CATEGORY_NOTABLE_COUNT_DELTA = 3;
/**
* Categories the verifier is supposed to emit. Anything else (malformed
* strings like `-`, `>builder_issue`, empty, etc.) is dropped from the
* comparison so the PR comment doesn't display verifier noise. Keep in sync
* with the verifier's category enum; unknown values are logged via
* console.warn (see compareFailureCategories).
*/
const KNOWN_FAILURE_CATEGORIES = new Set([
'builder_issue',
'mock_issue',
'framework_issue',
'verification_failure',
'build_failure',
]);
function isCategoryNotable(
prCount: number,
prTotal: number,
baselineCount: number,
baselineTotal: number,
): boolean {
const rateGap = Math.abs(prCount / prTotal - baselineCount / baselineTotal);
if (rateGap < CATEGORY_NOTABLE_RATE_DELTA) return false;
const expectedPrCount = baselineCount * (prTotal / baselineTotal);
const countGap = Math.abs(prCount - expectedPrCount);
return countGap >= CATEGORY_NOTABLE_COUNT_DELTA;
}
function compareFailureCategories(
pr: ExperimentBucket,
baseline: ExperimentBucket,
): FailureCategoryComparison[] {
if (!pr.failureCategoryTotals || !baseline.failureCategoryTotals) return [];
const prTotal = pr.trialTotal ?? 0;
const baseTotal = baseline.trialTotal ?? 0;
if (prTotal === 0 || baseTotal === 0) return [];
// Surface unrecognised values so we notice when the verifier adds a new
// category (or starts emitting noise we should clean up). Unknown values
// don't enter the comparison output; the emit loop below only produces rows
// for KNOWN_FAILURE_CATEGORIES.
for (const category of Object.keys(pr.failureCategoryTotals)) {
if (!KNOWN_FAILURE_CATEGORIES.has(category)) {
console.warn(`[comparison] dropping unknown failureCategory "${category}"`);
}
}
for (const category of Object.keys(baseline.failureCategoryTotals)) {
if (!KNOWN_FAILURE_CATEGORIES.has(category)) {
console.warn(`[comparison] dropping unknown failureCategory "${category}"`);
}
}
// Always emit a row for every known category, even if both sides are 0.
// The renderer can decide whether to suppress 0/0 rows; this gives readers
// a complete picture of the failure-type taxonomy by default.
const out: FailureCategoryComparison[] = [];
for (const category of KNOWN_FAILURE_CATEGORIES) {
const prCount = pr.failureCategoryTotals[category] ?? 0;
const baselineCount = baseline.failureCategoryTotals[category] ?? 0;
out.push({
category,
prCount,
prRate: prCount / prTotal,
baselineCount,
baselineRate: baselineCount / baseTotal,
delta: prCount / prTotal - baselineCount / baseTotal,
notable: isCategoryNotable(prCount, prTotal, baselineCount, baseTotal),
});
}
// Sort: notable first, then by absolute delta descending.
out.sort((a, b) => {
if (a.notable !== b.notable) return a.notable ? -1 : 1;
return Math.abs(b.delta) - Math.abs(a.delta);
});
return out;
}
function rate(passes: number, total: number): number {
return total > 0 ? passes / total : 0;
}
const VERDICT_ORDER: Record<ScenarioComparison['verdict'], number> = {
hard_regression: 0,
soft_regression: 1,
improvement: 2,
watch: 3,
unreliable_baseline: 4,
stable: 5,
insufficient_data: 6,
};
function scenarioComparator(a: ScenarioComparison, b: ScenarioComparison): number {
const av = VERDICT_ORDER[a.verdict];
const bv = VERDICT_ORDER[b.verdict];
if (av !== bv) return av - bv;
const fileCmp = a.testCaseFile.localeCompare(b.testCaseFile);
if (fileCmp !== 0) return fileCmp;
return a.scenarioName.localeCompare(b.scenarioName);
}

View File

@ -0,0 +1,123 @@
// ---------------------------------------------------------------------------
// Find and fetch the pinned baseline experiment from LangSmith.
//
// The baseline is whichever experiment most recently used the
// `instance-ai-baseline` prefix. To refresh, run the eval with that prefix:
//
// pnpm eval:instance-ai --experiment-name instance-ai-baseline --iterations 10
//
// LangSmith appends a random suffix, so successive baseline runs become
// `instance-ai-baseline-7abc1234`, `instance-ai-baseline-9def5678`, etc.
// We pick the most recently started one.
//
// Two functions, both small:
//
// findLatestBaseline — list baseline-prefixed projects, pick newest.
// fetchBaselineBucket — read its root runs, bucket per scenario.
//
// Both throw on transport errors. Callers are expected to swallow with a log:
// the comparison is advisory and shouldn't fail the eval run.
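//
// Illustrative caller sketch (variable names are assumptions, not the real
// CLI wiring):
//
//   let baseline: ExperimentBucket | undefined;
//   try {
//     const name = await findLatestBaseline(client);
//     if (name) baseline = await fetchBaselineBucket(client, name);
//   } catch (error) {
//     console.warn(`Skipping baseline comparison: ${String(error)}`);
//   }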
// ---------------------------------------------------------------------------
import type { Client } from 'langsmith';
import { z } from 'zod';
import type { ExperimentBucket, ScenarioCounts } from './compare';
/**
* Prefix the latest-baseline lookup matches against. The CLI flag
* `--experiment-name instance-ai-baseline` produces project names like
* `instance-ai-baseline-7abc1234` (LangSmith appends a hyphen + suffix), so
* the constant must end in `-` so the lookup doesn't match unrelated
* experiments whose names merely start with `instance-ai-baseline`.
*/
export const BASELINE_EXPERIMENT_PREFIX = 'instance-ai-baseline-';
const inputsSchema = z
.object({
testCaseFile: z.string().default(''),
scenarioName: z.string().default(''),
})
.passthrough();
const outputsSchema = z
.object({
passed: z.boolean().default(false),
failureCategory: z.string().optional(),
})
.passthrough();
/**
* Return the most recently created baseline experiment, or `undefined` if
* none exist. We pick by `start_time` so a re-run of an older snapshot
* doesn't displace the latest one.
*/
export async function findLatestBaseline(client: Client): Promise<string | undefined> {
let latest: { name: string; ts: number } | undefined;
for await (const project of client.listProjects({ nameContains: BASELINE_EXPERIMENT_PREFIX })) {
const name = project.name;
if (!name?.startsWith(BASELINE_EXPERIMENT_PREFIX)) continue;
const ts = project.start_time ? new Date(project.start_time).getTime() : 0;
if (!latest || ts > latest.ts) latest = { name, ts };
}
return latest?.name;
}
/**
* Fetch a baseline experiment's per-scenario pass/fail counts. Each root run
* corresponds to one (testCaseFile, scenarioName, iteration) triple; we
* bucket by `${testCaseFile}/${scenarioName}` and accumulate.
*
* Throws if the project does not exist.
*/
export async function fetchBaselineBucket(
client: Client,
experimentName: string,
): Promise<ExperimentBucket> {
const project = await client.readProject({ projectName: experimentName });
const scenarios = new Map<string, ScenarioCounts>();
const failureCategoryTotals: Record<string, number> = {};
let trialTotal = 0;
for await (const run of client.listRuns({ projectId: project.id, isRoot: true })) {
const inputs = inputsSchema.safeParse(run.inputs ?? {});
if (!inputs.success || !inputs.data.testCaseFile || !inputs.data.scenarioName) continue;
// Skip runs that never produced outputs (still running, crashed before
// completion, infra error). Without this guard, the schema defaults
// (passed → false) would coerce them into "failed" trials and inflate
// the baseline failure count. Mirrors `parseTargetOutput` in cli/index.ts.
const rawOutputs = run.outputs;
if (
rawOutputs === null ||
rawOutputs === undefined ||
typeof rawOutputs !== 'object' ||
Object.keys(rawOutputs).length === 0
) {
continue;
}
const outputs = outputsSchema.safeParse(rawOutputs);
if (!outputs.success) continue;
const key = `${inputs.data.testCaseFile}/${inputs.data.scenarioName}`;
const existing: ScenarioCounts = scenarios.get(key) ?? {
testCaseFile: inputs.data.testCaseFile,
scenarioName: inputs.data.scenarioName,
passed: 0,
total: 0,
failureCategories: {},
};
existing.total++;
trialTotal++;
if (outputs.data.passed) {
existing.passed++;
} else if (outputs.data.failureCategory) {
const cat = outputs.data.failureCategory;
existing.failureCategories = existing.failureCategories ?? {};
existing.failureCategories[cat] = (existing.failureCategories[cat] ?? 0) + 1;
failureCategoryTotals[cat] = (failureCategoryTotals[cat] ?? 0) + 1;
}
scenarios.set(key, existing);
}
return { experimentName, scenarios, failureCategoryTotals, trialTotal };
}
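// Hypothetical shape of the returned bucket (values invented for illustration):
//
//   {
//     experimentName: 'instance-ai-baseline-7abc1234',
//     trialTotal: 60,
//     failureCategoryTotals: { builder_issue: 4, mock_issue: 1 },
//     scenarios: Map {
//       'some-workflow/happy-path' => {
//         testCaseFile: 'some-workflow', scenarioName: 'happy-path',
//         passed: 9, total: 10, failureCategories: { builder_issue: 1 },
//       },
//       // …one entry per (testCaseFile, scenarioName) pair
//     },
//   }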

View File

@ -0,0 +1,961 @@
// ---------------------------------------------------------------------------
// Render the eval run as a PR comment (markdown) or a console summary
// (aligned plain text). Both formats are driven by:
//
// - MultiRunEvaluation — pass rates, build counts, per-trial reasoning
// - ComparisonOutcome (optional) — tagged result of the baseline
// comparison: `ok` (ran, has scenarios), `no_baseline` (skipped), or
// `fetch_failed` / `self_baseline` (skipped for cause). Each kind
// drives a distinct top-of-comment alert so a LangSmith outage doesn't
// get dressed up as "no baseline configured".
//
// When no comparison is available (no baseline yet, LangSmith offline)
// the renderers still produce a useful per-test-case summary. When a
// comparison is available, sections render in priority order:
// regressions, soft regressions, notable movement, improvements,
// failure-category drift. Only sections with content are emitted.
// ---------------------------------------------------------------------------
import {
hardRegressions,
improvements,
softRegressions,
watchList,
type ComparisonOutcome,
type ComparisonResult,
type FailureCategoryComparison,
type ScenarioComparison,
} from './compare';
import type {
MultiRunEvaluation,
TestCaseAggregation,
WorkflowTestCase,
WorkflowTestCaseResult,
} from '../types';
interface FormatOptions {
/** Optional commit SHA to include in the heading. Truncated to 8 chars. */
commitSha?: string;
/** Maps each test-case reference to its file slug. When provided, the
* per-scenario failure breakdown looks up failed runs by
* `${fileSlug}/${scenarioName}`, which is deterministic across collisions like
* multiple `happy-path` scenarios. When omitted, the breakdown is
* skipped (no name-only fallback; that lookup was wrong on real data). */
slugByTestCase?: Map<WorkflowTestCase, string>;
}
// ---------------------------------------------------------------------------
// Markdown PR comment
// ---------------------------------------------------------------------------
export function formatComparisonMarkdown(
evaluation: MultiRunEvaluation,
outcome?: ComparisonOutcome,
options: FormatOptions = {},
): string {
const lines: string[] = [];
const comparison = outcome?.kind === 'ok' ? outcome.result : undefined;
lines.push(formatHeading(options.commitSha));
lines.push('');
lines.push(formatTopAlert(outcome));
lines.push('');
lines.push(formatAggregateBlock(evaluation, comparison));
lines.push('');
if (comparison) {
const hard = hardRegressions(comparison);
const soft = softRegressions(comparison);
const watch = watchList(comparison);
const imps = improvements(comparison);
const renderedAnyTable = hard.length > 0 || soft.length > 0 || imps.length > 0;
// Built once and reused across the regression-tier sections so each
// scenario row can carry a collapsible breakdown of its failed PR runs.
// Improvements skip the breakdown — they passed. Skipped entirely when
// the caller didn't pass a slug map (lookup would be ambiguous).
const failedIndex = options.slugByTestCase
? buildFailedRunsIndex(evaluation, options.slugByTestCase)
: undefined;
if (hard.length > 0) {
lines.push(
...renderScenarioSection('Regressions', '— high-confidence', hard, true, failedIndex),
);
}
if (soft.length > 0) {
lines.push(
...renderScenarioSection(
'Soft regressions',
'— investigate if related to your changes',
soft,
true,
failedIndex,
),
);
}
if (watch.length > 0) {
lines.push(
...renderScenarioSection(
'Notable movement',
'— large gap, no statistical flag',
watch,
false,
failedIndex,
),
);
}
if (imps.length > 0) {
lines.push(...renderScenarioSection('Improvements', '', imps, true));
}
if (renderedAnyTable) {
lines.push(
"_p = Fisher's exact one-sided p-value. Lower = stronger evidence of a real change._",
);
lines.push('');
}
// Always render the breakdown when comparison data is available — the
// renderer drops 0/0 rows itself, so empty categories don't pollute
// the output but the reader still sees the full taxonomy of what's
// tracked.
lines.push(...renderFailureCategorySection(comparison.failureCategories));
}
lines.push(...renderPerTestCaseDetails(evaluation, options.slugByTestCase));
if (comparison) {
const otherFindings = renderOtherFindings(comparison);
if (otherFindings.length > 0) lines.push(...otherFindings);
}
const failureDetails = renderFailureDetails(evaluation, options.slugByTestCase);
if (failureDetails.length > 0) lines.push(...failureDetails);
return lines.join('\n');
}
function formatHeading(commitSha?: string): string {
const sha = commitSha ? ` \`${commitSha.slice(0, 8)}\`` : '';
return `### Instance AI Workflow Eval${sha}`;
}
function formatTopAlert(outcome?: ComparisonOutcome): string {
if (!outcome) {
return ['> [!NOTE]', '> No baseline comparison ran (LangSmith disabled for this run).'].join(
'\n',
);
}
if (outcome.kind === 'no_baseline') {
return [
'> [!NOTE]',
'> No baseline configured — comparison skipped. Run the eval with `--experiment-name instance-ai-baseline` on master to create one.',
].join('\n');
}
if (outcome.kind === 'self_baseline') {
return [
'> [!NOTE]',
`> This run is the baseline (\`${outcome.experimentName}\`) — nothing to compare against.`,
].join('\n');
}
if (outcome.kind === 'fetch_failed') {
return [
'> [!WARNING]',
`> Regression detection did not run — baseline fetch failed: ${outcome.error}`,
].join('\n');
}
const comparison = outcome.result;
const hard = hardRegressions(comparison).length;
const soft = softRegressions(comparison).length;
const watch = watchList(comparison).length;
const imps = improvements(comparison).length;
const stable = countByVerdict(comparison, 'stable');
const aggDelta = comparison.aggregate.delta * 100;
const aggDeltaText = `${aggDelta >= 0 ? '+' : ''}${aggDelta.toFixed(1)}pp`;
// Always include all five tier counts so readers see what's being tracked,
// not just what's > 0. The hard count is bolded when nonzero for emphasis.
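// e.g. "**2 regressions**, 1 soft, 0 notable, 1 improvement, 14 stable"
// (hypothetical counts).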
const summary = [
hard > 0 ? `**${hard} regression${hard === 1 ? '' : 's'}**` : '0 regressions',
`${soft} soft`,
`${watch} notable`,
`${imps} improvement${imps === 1 ? '' : 's'}`,
`${stable} stable`,
].join(', ');
let icon: string;
let alertKind: 'CAUTION' | 'WARNING' | 'NOTE' | 'TIP';
if (hard > 0) {
icon = '🔴';
alertKind = 'CAUTION';
} else if (soft > 0) {
icon = '🟡';
alertKind = 'WARNING';
} else if (watch > 0) {
icon = '🔵';
alertKind = 'NOTE';
} else {
icon = '🟢';
alertKind = 'TIP';
}
return `> [!${alertKind}]\n> ${icon} ${summary}. Pass rate ${aggDeltaText} vs master.`;
}
function formatAggregateBlock(
evaluation: MultiRunEvaluation,
comparison?: ComparisonResult,
): string {
if (!comparison) {
const allScenarios = evaluation.testCases.flatMap((tc) => tc.scenarios);
const passed = allScenarios.reduce((sum, sa) => sum + sa.passCount, 0);
const total = allScenarios.reduce((sum, sa) => sum + sa.runs.length, 0);
const rate = total > 0 ? (passed / total) * 100 : 0;
return `**Aggregate**: ${rate.toFixed(1)}% pass (${passed}/${total} trials, ${allScenarios.length} scenarios × N=${evaluation.totalRuns})`;
}
const { aggregate } = comparison;
const delta = aggregate.delta * 100;
const sign = delta >= 0 ? '+' : '';
const arrow = delta > 0 ? ' ↑' : delta < 0 ? ' ↓' : '';
const baselineN = inferBaselineN(comparison);
const sampleLine = baselineN
? `_${aggregate.intersectionSize} scenarios · N=${evaluation.totalRuns} (PR) vs N=${baselineN} (baseline) · baseline: \`${comparison.baseline.experimentName}\`_`
: `_${aggregate.intersectionSize} scenarios · N=${evaluation.totalRuns} (PR) · baseline: \`${comparison.baseline.experimentName}\`_`;
const partial = comparison.baselineOnly.length + comparison.prOnly.length;
const partialNote =
partial > 0
? `\n_Partial: ${[
comparison.baselineOnly.length > 0
? `${comparison.baselineOnly.length} baseline scenarios not run by PR`
: null,
comparison.prOnly.length > 0
? `${comparison.prOnly.length} PR scenarios have no baseline data (added since baseline captured)`
: null,
]
.filter((s) => s !== null)
.join(', ')}._`
: '';
return [
`**Aggregate**: ${pct(aggregate.prAggregatePassRate)}% PR vs ${pct(aggregate.baselineAggregatePassRate)}% baseline — **${sign}${delta.toFixed(1)}pp${arrow}**`,
sampleLine + partialNote,
].join('\n');
}
function renderScenarioSection(
heading: string,
subtitle: string,
scenarios: ScenarioComparison[],
withPValue: boolean,
failedIndex?: FailedRunsBySlug,
): string[] {
const lines: string[] = [];
const headingLine = subtitle
? `#### ${heading} (${scenarios.length}) ${subtitle}`
: `#### ${heading} (${scenarios.length})`;
lines.push(headingLine);
lines.push('');
if (withPValue) {
lines.push('| Scenario | PR | Baseline | Δ | p |');
lines.push('|---|---|---|---|---|');
} else {
lines.push('| Scenario | PR | Baseline | Δ |');
lines.push('|---|---|---|---|');
}
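// A rendered row looks like this (hypothetical numbers; the p column only
// appears when withPValue is true):
// | `some-workflow/happy-path` | 2/5 (40%) | 9/10 (90%) | -50pp ↓ | 0.077 |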
for (const s of scenarios) {
const cells = [
`\`${s.testCaseFile}/${s.scenarioName}\``,
formatRateCell(s.prPasses, s.prTotal),
formatRateCell(s.baselinePasses, s.baselineTotal),
formatDeltaCell(s.delta),
];
if (withPValue) {
const p = s.verdict === 'improvement' ? s.pValueRight : s.pValueLeft;
cells.push(p.toFixed(3));
}
lines.push(`| ${cells.join(' | ')} |`);
}
lines.push('');
// Per-scenario failure breakdown — one collapsible per row that had failed
// PR runs. Lets the reader drill into each flagged scenario without
// hunting through a separate "Failure details" section.
if (failedIndex) {
for (const s of scenarios) {
const failedRuns = failedIndex.get(`${s.testCaseFile}/${s.scenarioName}`) ?? [];
if (failedRuns.length === 0) continue;
lines.push(...renderScenarioFailureBreakdown(s, failedRuns));
}
}
return lines;
}
function renderScenarioFailureBreakdown(
s: ScenarioComparison,
failedRuns: FailedRunDetail[],
): string[] {
const slug = `${s.testCaseFile}/${s.scenarioName}`;
const categoryMix = summarizeCategories(failedRuns);
const summaryParts = [`${failedRuns.length} of ${s.prTotal} failed`];
if (categoryMix) summaryParts.push(categoryMix);
const lines: string[] = [];
lines.push(`<details><summary><code>${slug}</code> — ${summaryParts.join(' · ')}</summary>`);
lines.push('');
for (const fr of failedRuns) {
const tag = fr.category ? ` [${fr.category}]` : '';
lines.push(`> Run ${fr.runIndex}${tag}: ${fr.reasoning.slice(0, 300)}`);
lines.push('>');
}
// Drop the trailing empty quote line.
if (lines[lines.length - 1] === '>') lines.pop();
lines.push('');
lines.push('</details>');
lines.push('');
return lines;
}
function renderFailureCategorySection(categories: FailureCategoryComparison[]): string[] {
// Drop rows that are 0/0 on both sides — they carry no signal for the
// reader. Categories with non-zero count on either side are kept so the
// reader sees the full picture even if not "notable".
const rows = categories.filter((c) => c.prCount > 0 || c.baselineCount > 0);
if (rows.length === 0) return [];
const lines: string[] = [];
lines.push('#### Failure breakdown');
lines.push('');
lines.push('| Category | PR | Baseline | Δ | |');
lines.push('|---|---|---|---|---|');
for (const c of rows) {
const isNew = c.baselineCount === 0 && c.prCount > 0;
const label = isNew ? `\`${c.category}\` 🆕` : `\`${c.category}\``;
const delta = c.delta * 100;
const sign = delta >= 0 ? '+' : '';
const arrow = delta > 0 ? ' ↑' : delta < 0 ? ' ↓' : '';
const notableMarker = c.notable ? '**notable**' : '';
lines.push(
`| ${label} | ${c.prCount} (${pct(c.prRate)}%) | ${c.baselineCount} (${pct(c.baselineRate)}%) | ${sign}${delta.toFixed(1)}pp${arrow} | ${notableMarker} |`,
);
}
lines.push('');
return lines;
}
function renderPerTestCaseDetails(
evaluation: MultiRunEvaluation,
slugByTestCase?: Map<WorkflowTestCase, string>,
): string[] {
const { totalRuns, testCases } = evaluation;
if (testCases.length === 0) return [];
const lines: string[] = [];
lines.push(`<details><summary>Per-test-case results (${testCases.length})</summary>`);
lines.push('');
const renderName = (tc: TestCaseAggregation): string => {
const slug = slugByTestCase?.get(tc.testCase);
return slug ? `\`${slug}\`` : `\`${tc.testCase.prompt.slice(0, 70)}\``;
};
if (totalRuns > 1) {
lines.push(`| Workflow | Built | pass@${totalRuns} | pass^${totalRuns} |`);
lines.push('|---|---|---|---|');
for (const tc of testCases) {
const meanPassAtK = tc.scenarios.length
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passAtK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
const meanPassHatK = tc.scenarios.length
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passHatK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
lines.push(
`| ${renderName(tc)} | ${tc.buildSuccessCount}/${totalRuns} | ${meanPassAtK}% | ${meanPassHatK}% |`,
);
}
} else {
lines.push('| Workflow | Built | Pass rate |');
lines.push('|---|---|---|');
for (const tc of testCases) {
const built = tc.runs[0]?.workflowBuildSuccess ? '✓' : '✗';
const passed = tc.scenarios.filter((sa) => sa.runs[0]?.success).length;
const total = tc.scenarios.length;
lines.push(`| ${renderName(tc)} | ${built} | ${passed}/${total} |`);
}
}
lines.push('');
lines.push('</details>');
lines.push('');
return lines;
}
function renderOtherFindings(comparison: ComparisonResult): string[] {
const stable = countByVerdict(comparison, 'stable');
const flaky = countByVerdict(comparison, 'unreliable_baseline');
const noData = countByVerdict(comparison, 'insufficient_data');
if (stable === 0 && flaky === 0 && noData === 0) return [];
const summaryParts: string[] = [];
if (flaky > 0) summaryParts.push(`${flaky} on flaky baseline`);
if (noData > 0) summaryParts.push(`${noData} no data`);
if (stable > 0) summaryParts.push(`${stable} stable`);
const summary = summaryParts.join(' · ');
const lines: string[] = [];
lines.push(`<details><summary>Other findings: ${summary}</summary>`);
lines.push('');
const stableScenarios = comparison.scenarios.filter((s) => s.verdict === 'stable');
const flakyScenarios = comparison.scenarios.filter((s) => s.verdict === 'unreliable_baseline');
const noDataScenarios = comparison.scenarios.filter((s) => s.verdict === 'insufficient_data');
if (flakyScenarios.length > 0) {
lines.push('**Confident drop on a flaky baseline (surfaced for visibility, not flagged):**');
lines.push('');
lines.push('| Scenario | PR | Baseline | Δ |');
lines.push('|---|---|---|---|');
for (const s of flakyScenarios) {
lines.push(
`| \`${s.testCaseFile}/${s.scenarioName}\` | ${formatRateCell(s.prPasses, s.prTotal)} | ${formatRateCell(s.baselinePasses, s.baselineTotal)} | ${formatDeltaCell(s.delta)} |`,
);
}
lines.push('');
}
if (noDataScenarios.length > 0) {
lines.push(
`**No data:** ${noDataScenarios.map((s) => `\`${s.testCaseFile}/${s.scenarioName}\``).join(', ')}`,
);
lines.push('');
}
if (stableScenarios.length > 0) {
lines.push(`**Stable (${stableScenarios.length}):**`);
lines.push(
stableScenarios.map((s) => `\`${s.testCaseFile}/${s.scenarioName}\``).join(', ') + '.',
);
lines.push('');
}
lines.push('</details>');
lines.push('');
return lines;
}
function renderFailureDetails(
evaluation: MultiRunEvaluation,
slugByTestCase?: Map<WorkflowTestCase, string>,
): string[] {
const failed: Array<{
tc: WorkflowTestCaseResult;
fileSlug: string | undefined;
scenarioName: string;
failedRuns: Array<{ category?: string; reasoning: string }>;
}> = [];
for (const tc of evaluation.testCases) {
const fileSlug = slugByTestCase?.get(tc.testCase);
for (const sa of tc.scenarios) {
const failedRuns = sa.runs
.filter((r) => !r.success)
.map((r) => ({ category: r.failureCategory, reasoning: r.reasoning }));
if (failedRuns.length > 0) {
failed.push({ tc: tc.runs[0], fileSlug, scenarioName: sa.scenario.name, failedRuns });
}
}
}
if (failed.length === 0) return [];
const lines: string[] = [];
lines.push('<details><summary>Failure details</summary>');
lines.push('');
for (const { tc, fileSlug, scenarioName, failedRuns } of failed) {
const slug = fileSlug
? `${fileSlug}/${scenarioName}`
: `${tc.testCase.prompt.slice(0, 50).trim()} / ${scenarioName}`;
lines.push(`**\`${slug}\`** — ${failedRuns.length} failed`);
for (const fr of failedRuns) {
const tag = fr.category ? ` [${fr.category}]` : '';
lines.push(`> Run${tag}: ${fr.reasoning.slice(0, 200)}`);
}
lines.push('');
}
lines.push('</details>');
lines.push('');
return lines;
}
// ---------------------------------------------------------------------------
// Per-scenario failure lookup
// ---------------------------------------------------------------------------
//
// The comparison carries per-scenario counts (passed / total) but not the
// underlying reasoning text. The evaluation has the reasoning, but keys
// testCases by reference identity — not by the `testCaseFile` slug used in
// the comparison. The slug map (built in cli/index.ts where the file slugs
// are first known) bridges the two so the lookup is deterministic. Without
// it we'd have to disambiguate by scenarioName alone, which collides on
// reused names (`happy-path` shows up across most workflows).
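//
// Hypothetical shape of one index entry:
//
//   'some-workflow/happy-path' → [
//     { runIndex: 2, category: 'builder_issue', reasoning: '…' },
//   ]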
interface FailedRunDetail {
category?: string;
reasoning: string;
runIndex: number; // 1-based for display
}
type FailedRunsBySlug = Map<string, FailedRunDetail[]>;
function buildFailedRunsIndex(
evaluation: MultiRunEvaluation,
slugByTestCase: Map<WorkflowTestCase, string>,
): FailedRunsBySlug {
const map: FailedRunsBySlug = new Map();
for (const tc of evaluation.testCases) {
const fileSlug = slugByTestCase.get(tc.testCase);
if (!fileSlug) continue; // testCase not in the slug map — skip rather than misattribute
for (const sa of tc.scenarios) {
const failedRuns: FailedRunDetail[] = [];
sa.runs.forEach((r, i) => {
if (!r.success) {
failedRuns.push({
category: r.failureCategory,
reasoning: r.reasoning,
runIndex: i + 1,
});
}
});
if (failedRuns.length > 0) {
map.set(`${fileSlug}/${sa.scenario.name}`, failedRuns);
}
}
}
return map;
}
function summarizeCategories(failedRuns: FailedRunDetail[]): string | undefined {
const counts = new Map<string, number>();
for (const fr of failedRuns) {
if (fr.category) counts.set(fr.category, (counts.get(fr.category) ?? 0) + 1);
}
if (counts.size === 0) return undefined;
return [...counts.entries()]
.sort((a, b) => b[1] - a[1])
.map(([cat, n]) => `${n}× ${cat}`)
.join(', ');
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
function pct(rate: number): string {
return (rate * 100).toFixed(1);
}
function formatRateCell(passes: number, total: number): string {
const rate = total > 0 ? Math.round((passes / total) * 100) : 0;
return `${passes}/${total} (${rate}%)`;
}
function formatDeltaCell(delta: number): string {
const pp = delta * 100;
const sign = pp >= 0 ? '+' : '';
const arrow = pp > 0 ? ' ↑' : pp < 0 ? ' ↓' : '';
return `${sign}${pp.toFixed(0)}pp${arrow}`;
}
function countByVerdict(
comparison: ComparisonResult,
verdict: ScenarioComparison['verdict'],
): number {
return comparison.scenarios.filter((s) => s.verdict === verdict).length;
}
/** Best-effort N=baseline iteration count. The comparison only carries trial
* totals per scenario; we infer N from the most-common scenario total since
* the baseline runs every scenario the same number of times. */
function inferBaselineN(comparison: ComparisonResult): number | undefined {
const totals = comparison.scenarios
.filter((s) => s.baselineTotal > 0)
.map((s) => s.baselineTotal);
if (totals.length === 0) return undefined;
const counts = new Map<number, number>();
for (const t of totals) counts.set(t, (counts.get(t) ?? 0) + 1);
let best = totals[0];
let bestCount = 0;
for (const [n, c] of counts) {
if (c > bestCount) {
best = n;
bestCount = c;
}
}
return best;
}
// ---------------------------------------------------------------------------
// Terminal renderer: aligned plain text for the eval CLI's end-of-run print.
// ---------------------------------------------------------------------------
const TERMINAL_INDENT = ' ';
const TERMINAL_TABLE_INDENT = ' ';
export function formatComparisonTerminal(
evaluation: MultiRunEvaluation,
outcome?: ComparisonOutcome,
options: FormatOptions = {},
): string {
const lines: string[] = [];
const comparison = outcome?.kind === 'ok' ? outcome.result : undefined;
const titleSuffix = options.commitSha ? ` ${options.commitSha.slice(0, 8)}` : '';
const title = `Instance AI Workflow Eval${titleSuffix}`;
lines.push(title);
lines.push('═'.repeat(title.length));
lines.push(TERMINAL_INDENT + formatTerminalVerdictLine(outcome));
lines.push('');
lines.push(...formatTerminalAggregate(evaluation, comparison));
lines.push('');
lines.push(...formatTerminalPerTestCase(evaluation, options.slugByTestCase));
if (comparison) {
const hard = hardRegressions(comparison);
const soft = softRegressions(comparison);
const watch = watchList(comparison);
const imps = improvements(comparison);
if (hard.length > 0) {
lines.push(
TERMINAL_INDENT +
'REGRESSIONS (high-confidence: large drop on a reliable scenario, unlikely noise)',
);
lines.push(formatTerminalScenarioTable(hard, true));
lines.push('');
}
if (soft.length > 0) {
lines.push(
TERMINAL_INDENT +
'SOFT REGRESSIONS (likely natural variance — investigate if related to your changes)',
);
lines.push(formatTerminalScenarioTable(soft, true));
lines.push('');
}
if (watch.length > 0) {
lines.push(TERMINAL_INDENT + 'NOTABLE MOVEMENT (large gap, no statistical flag)');
lines.push(formatTerminalScenarioTable(watch, false));
lines.push('');
}
if (imps.length > 0) {
lines.push(TERMINAL_INDENT + 'IMPROVEMENTS');
lines.push(formatTerminalScenarioTable(imps, true));
lines.push('');
}
// Always render the breakdown when comparison data is available — same
// rationale as the markdown side. The terminal table drops 0/0 rows
// itself.
const breakdownRows = comparison.failureCategories.filter(
(c) => c.prCount > 0 || c.baselineCount > 0,
);
if (breakdownRows.length > 0) {
lines.push(TERMINAL_INDENT + 'failure breakdown');
lines.push(formatTerminalCategoryTable(breakdownRows));
lines.push('');
}
// Stable count is already in the verdict line; surface only the rarer
// outcomes here.
const flaky = countByVerdict(comparison, 'unreliable_baseline');
const noData = countByVerdict(comparison, 'insufficient_data');
const otherParts: string[] = [];
if (flaky > 0) otherParts.push(`${flaky} on flaky baseline`);
if (noData > 0) otherParts.push(`${noData} no data`);
if (otherParts.length > 0) {
lines.push(TERMINAL_INDENT + 'other: ' + otherParts.join(' · '));
}
}
return lines.join('\n');
}
function formatTerminalVerdictLine(outcome?: ComparisonOutcome): string {
if (!outcome) return '▶ No baseline comparison ran (LangSmith disabled).';
if (outcome.kind === 'no_baseline') {
return '▶ No baseline configured — comparison skipped.';
}
if (outcome.kind === 'self_baseline') {
return `▶ This run is the baseline (${outcome.experimentName}) — nothing to compare.`;
}
if (outcome.kind === 'fetch_failed') {
return `▶ Regression detection did not run — baseline fetch failed: ${outcome.error}`;
}
const comparison = outcome.result;
const hard = hardRegressions(comparison).length;
const soft = softRegressions(comparison).length;
const watch = watchList(comparison).length;
const imps = improvements(comparison).length;
const stable = countByVerdict(comparison, 'stable');
const aggDelta = comparison.aggregate.delta * 100;
const aggDeltaText = `${aggDelta >= 0 ? '+' : ''}${aggDelta.toFixed(1)}pp`;
const summary = [
`${hard} regression${hard === 1 ? '' : 's'}`,
`${soft} soft`,
`${watch} notable`,
`${imps} improvement${imps === 1 ? '' : 's'}`,
`${stable} stable`,
].join(', ');
return `${summary}. Pass rate ${aggDeltaText} vs master.`;
}
function formatTerminalAggregate(
evaluation: MultiRunEvaluation,
comparison?: ComparisonResult,
): string[] {
const lines: string[] = [];
if (!comparison) {
const allScenarios = evaluation.testCases.flatMap((tc) => tc.scenarios);
const passed = allScenarios.reduce((sum, sa) => sum + sa.passCount, 0);
const total = allScenarios.reduce((sum, sa) => sum + sa.runs.length, 0);
const rate = total > 0 ? (passed / total) * 100 : 0;
lines.push(
TERMINAL_INDENT +
`Aggregate: ${rate.toFixed(1)}% pass (${passed}/${total} trials, ${allScenarios.length} scenarios × N=${evaluation.totalRuns})`,
);
return lines;
}
const { aggregate } = comparison;
const baselineN = inferBaselineN(comparison);
const aggDelta = aggregate.delta * 100;
const sign = aggDelta >= 0 ? '+' : '';
const arrow = aggDelta > 0 ? ' ↑' : aggDelta < 0 ? ' ↓' : '';
lines.push(TERMINAL_INDENT + `Aggregate (${aggregate.intersectionSize} scenarios)`);
lines.push(
TERMINAL_INDENT +
` PR ${pct(aggregate.prAggregatePassRate)}% (N=${evaluation.totalRuns})`,
);
if (baselineN !== undefined) {
lines.push(
TERMINAL_INDENT +
` baseline ${pct(aggregate.baselineAggregatePassRate)}% (N=${baselineN})`,
);
} else {
lines.push(TERMINAL_INDENT + ` baseline ${pct(aggregate.baselineAggregatePassRate)}%`);
}
lines.push(TERMINAL_INDENT + ` Δ ${sign}${aggDelta.toFixed(1)}pp${arrow}`);
if (comparison.baselineOnly.length > 0 || comparison.prOnly.length > 0) {
const partialParts: string[] = [];
if (comparison.baselineOnly.length > 0)
partialParts.push(`${comparison.baselineOnly.length} baseline scenarios not run by PR`);
if (comparison.prOnly.length > 0)
partialParts.push(`${comparison.prOnly.length} PR scenarios have no baseline data`);
lines.push(TERMINAL_INDENT + ` partial: ${partialParts.join(', ')}`);
}
return lines;
}
function formatTerminalPerTestCase(
evaluation: MultiRunEvaluation,
slugByTestCase?: Map<WorkflowTestCase, string>,
): string[] {
const { totalRuns, testCases } = evaluation;
if (testCases.length === 0) return [];
const lines: string[] = [];
const heading = `Per-test-case results (${testCases.length})`;
lines.push(TERMINAL_INDENT + heading);
const nameOf = (tc: TestCaseAggregation, max: number): string => {
const slug = slugByTestCase?.get(tc.testCase);
return slug ?? tc.testCase.prompt.slice(0, max);
};
if (totalRuns > 1) {
const rows = testCases.map((tc) => {
const meanPassAtK =
tc.scenarios.length > 0
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passAtK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
const meanPassHatK =
tc.scenarios.length > 0
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passHatK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
return {
name: nameOf(tc, 60),
builds: `${tc.buildSuccessCount}/${totalRuns}`,
passAtK: `${meanPassAtK}%`,
passHatK: `${meanPassHatK}%`,
};
});
const nameW = maxWidth(
rows.map((r) => r.name),
'workflow',
);
const buildsW = maxWidth(
rows.map((r) => r.builds),
'builds',
);
const atKHeader = `pass@${totalRuns}`;
const hatKHeader = `pass^${totalRuns}`;
const atKW = maxWidth(
rows.map((r) => r.passAtK),
atKHeader,
);
const hatKW = maxWidth(
rows.map((r) => r.passHatK),
hatKHeader,
);
lines.push(
TERMINAL_TABLE_INDENT +
`${'workflow'.padEnd(nameW)} ${'builds'.padEnd(buildsW)} ${atKHeader.padStart(atKW)} ${hatKHeader.padStart(hatKW)}`,
);
lines.push(
TERMINAL_TABLE_INDENT +
`${'─'.repeat(nameW)} ${'─'.repeat(buildsW)} ${'─'.repeat(atKW)} ${'─'.repeat(hatKW)}`,
);
for (const r of rows) {
lines.push(
TERMINAL_TABLE_INDENT +
`${r.name.padEnd(nameW)} ${r.builds.padEnd(buildsW)} ${r.passAtK.padStart(atKW)} ${r.passHatK.padStart(hatKW)}`,
);
}
} else {
for (const tc of testCases) {
const r = tc.runs[0];
const buildStatus = r.workflowBuildSuccess ? 'BUILT' : 'BUILD FAILED';
lines.push('');
lines.push(TERMINAL_INDENT + `${nameOf(tc, 70)}`);
lines.push(TERMINAL_INDENT + ` ${buildStatus}${r.workflowId ? ` (${r.workflowId})` : ''}`);
if (r.buildError) lines.push(TERMINAL_INDENT + ` error: ${r.buildError.slice(0, 200)}`);
for (const sa of tc.scenarios) {
const sr = sa.runs[0];
const status = sr.success ? 'PASS' : 'FAIL';
const category = sr.failureCategory ? ` [${sr.failureCategory}]` : '';
lines.push(TERMINAL_INDENT + ` ${status} ${sr.scenario.name}${category}`);
if (!sr.success) {
const errs = sr.evalResult?.errors ?? [];
if (errs.length > 0) {
lines.push(TERMINAL_INDENT + ` error: ${errs.join('; ').slice(0, 200)}`);
}
lines.push(TERMINAL_INDENT + ` diagnosis: ${sr.reasoning.slice(0, 200)}`);
}
}
}
}
lines.push('');
return lines;
}
function formatTerminalScenarioTable(scenarios: ScenarioComparison[], withPValue: boolean): string {
const names = scenarios.map((s) => `${s.testCaseFile}/${s.scenarioName}`);
const prCells = scenarios.map((s) => `${s.prPasses}/${s.prTotal}`);
const baseCells = scenarios.map((s) => `${s.baselinePasses}/${s.baselineTotal}`);
const deltaCells = scenarios.map((s) => {
const d = s.delta * 100;
const sign = d >= 0 ? '+' : '';
const arrow = d > 0 ? ' ↑' : d < 0 ? ' ↓' : '';
return `${sign}${d.toFixed(0)}pp${arrow}`;
});
const pCells = withPValue
? scenarios.map((s) => (s.verdict === 'improvement' ? s.pValueRight : s.pValueLeft).toFixed(3))
: [];
const nameW = maxWidth(names, 'scenario');
const prW = maxWidth(prCells, 'PR');
const baseW = maxWidth(baseCells, 'baseline');
const deltaW = maxWidth(deltaCells, 'Δ');
const pW = withPValue ? maxWidth(pCells, 'p') : 0;
const headers = [
'scenario'.padEnd(nameW),
'PR'.padEnd(prW),
'baseline'.padEnd(baseW),
'Δ'.padEnd(deltaW),
];
if (withPValue) headers.push('p'.padEnd(pW));
const widths = withPValue ? [nameW, prW, baseW, deltaW, pW] : [nameW, prW, baseW, deltaW];
const sep = widths.map((w) => '─'.repeat(w)).join(' ');
const rows = scenarios.map((_, i) => {
const cells = [
names[i].padEnd(nameW),
prCells[i].padEnd(prW),
baseCells[i].padEnd(baseW),
deltaCells[i].padEnd(deltaW),
];
if (withPValue) cells.push(pCells[i].padEnd(pW));
return TERMINAL_TABLE_INDENT + cells.join(' ');
});
return [TERMINAL_TABLE_INDENT + headers.join(' '), TERMINAL_TABLE_INDENT + sep, ...rows].join(
'\n',
);
}
function formatTerminalCategoryTable(cats: FailureCategoryComparison[]): string {
const names = cats.map((c) => {
const isNew = c.baselineCount === 0 && c.prCount > 0;
return c.category + (isNew ? ' 🆕' : '');
});
const prCells = cats.map((c) => `${c.prCount} (${pct(c.prRate)}%)`);
const baseCells = cats.map((c) => `${c.baselineCount} (${pct(c.baselineRate)}%)`);
const deltaCells = cats.map((c) => {
const d = c.delta * 100;
const sign = d >= 0 ? '+' : '';
return `${sign}${d.toFixed(1)}pp`;
});
const nameW = maxWidth(names, 'category');
const prW = maxWidth(prCells, 'PR');
const baseW = maxWidth(baseCells, 'baseline');
const headers = ['category'.padEnd(nameW), 'PR'.padEnd(prW), 'baseline'.padEnd(baseW), 'Δ'];
const sep = [nameW, prW, baseW, maxWidth(deltaCells, 'Δ')].map((w) => '─'.repeat(w)).join(' ');
const rows = cats.map(
(_, i) =>
TERMINAL_TABLE_INDENT +
[
names[i].padEnd(nameW),
prCells[i].padEnd(prW),
baseCells[i].padEnd(baseW),
deltaCells[i],
].join(' '),
);
return [TERMINAL_TABLE_INDENT + headers.join(' '), TERMINAL_TABLE_INDENT + sep, ...rows].join(
'\n',
);
}
function maxWidth(values: string[], header: string): number {
return values.reduce((m, v) => Math.max(m, v.length), header.length);
}

View File

@ -0,0 +1,304 @@
// ---------------------------------------------------------------------------
// Decides whether one scenario's pass rate is meaningfully worse than
// another, at the small sample sizes evals run at (N=3 typically).
//
// Public surface:
// - classifyScenario(prPasses, prTotal, basePasses, baseTotal) — the verdict
// - wilsonInterval(passes, total) — confidence band for a pass rate, used
// for the headline aggregate
//
// The implementation uses Fisher's exact test and the Wilson score interval
// under the hood; both are standard small-sample statistics. You don't need
// to know either to use the public API.
// ---------------------------------------------------------------------------
import { strict as assert } from 'node:assert';
// ---------------------------------------------------------------------------
// Fisher's exact test (one-sided)
//
// Given a 2×2 table of pass/fail counts for PR vs baseline, returns the
// probability of seeing a gap at least as bad as the observed one if the two
// groups actually had the same pass rate. Small return value ⇒ strong
// evidence the PR is worse.
// ---------------------------------------------------------------------------
const logFactorialCache: number[] = [0, 0];
function logFactorial(n: number): number {
for (let i = logFactorialCache.length; i <= n; i++) {
logFactorialCache.push(logFactorialCache[i - 1] + Math.log(i));
}
return logFactorialCache[n];
}
function logBinomial(n: number, k: number): number {
if (k < 0 || k > n) return -Infinity;
return logFactorial(n) - logFactorial(k) - logFactorial(n - k);
}
function hypergeomPmf(nPasses: number, nFails: number, nDrawn: number, k: number): number {
const total = nPasses + nFails;
if (k < Math.max(0, nDrawn - nFails) || k > Math.min(nDrawn, nPasses)) return 0;
return Math.exp(
logBinomial(nPasses, k) + logBinomial(nFails, nDrawn - k) - logBinomial(total, nDrawn),
);
}
/**
* One-sided Fisher's exact test (left tail). Returns the probability that
* PR's pass count would be at most `a` if PR and baseline shared the same
* underlying pass rate. Small value ⇒ PR is significantly worse.
*
* 2×2 table:
*
* passed failed
* PR | a | b |
* Baseline | c | d |
*
* Returns 1 (no information) when either side has no trials, or when all
* trials passed or all failed.
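*
* @example
* fishersExactOneSidedLeft(0, 3, 3, 0); // PR 0/3 vs baseline 3/3 → 0.05
* fishersExactOneSidedLeft(0, 5, 5, 0); // PR 0/5 vs baseline 5/5 → ≈ 0.004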
*/
export function fishersExactOneSidedLeft(a: number, b: number, c: number, d: number): number {
const inputs = [a, b, c, d];
for (const v of inputs) {
assert(
Number.isInteger(v) && v >= 0,
'fishersExactOneSidedLeft requires non-negative integers',
);
}
const nPr = a + b;
const nBase = c + d;
const nPasses = a + c;
const nFails = b + d;
if (nPr === 0 || nBase === 0) return 1;
if (nPasses === 0 || nFails === 0) return 1;
let pValue = 0;
const kMax = Math.min(a, nPasses);
for (let k = 0; k <= kMax; k++) {
pValue += hypergeomPmf(nPasses, nFails, nPr, k);
}
// Clamp to [0, 1] — accumulated FP error can push the sum slightly past 1.
return Math.min(1, Math.max(0, pValue));
}
// ---------------------------------------------------------------------------
// Wilson score interval (95% confidence)
//
// Returns a confidence band for a pass rate that behaves well at small N and
// at extreme rates (close to 0 or 1) — both common in our evals. Used for
// the headline aggregate band only; classification doesn't need it.
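//
// For instance, 3/3 passes gives roughly [0.44, 1.00]: three clean trials are
// still compatible with a true pass rate anywhere above ~44%.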
// ---------------------------------------------------------------------------
// Standard z-score for a 95% confidence interval. We only ever use 95%, so
// the value is inlined rather than parameterised.
const Z_95 = 1.96;
export function wilsonInterval(passes: number, total: number): { lower: number; upper: number } {
assert(
Number.isInteger(passes) && passes >= 0,
'wilsonInterval: passes must be a non-negative integer',
);
assert(
Number.isInteger(total) && total >= 0,
'wilsonInterval: total must be a non-negative integer',
);
assert(passes <= total, 'wilsonInterval: passes cannot exceed total');
if (total === 0) return { lower: 0, upper: 1 };
const p = passes / total;
const z2 = Z_95 * Z_95;
const denom = 1 + z2 / total;
const center = (p + z2 / (2 * total)) / denom;
const halfWidth = (Z_95 * Math.sqrt((p * (1 - p)) / total + z2 / (4 * total * total))) / denom;
return {
lower: Math.max(0, center - halfWidth),
upper: Math.min(1, center + halfWidth),
};
}
// ---------------------------------------------------------------------------
// Per-scenario classification
//
// Three flag tiers, evaluated in order of strictness:
//
// hard_regression — high-confidence drop on a reliable baseline.
// Gating-grade.
// soft_regression — looser bar; investigate, not gating.
// watch — moved noticeably but didn't pass either flag tier.
// Pure visibility.
//
// Improvements use the hard tier (we don't surface borderline improvements;
// they tend to be noise in the positive direction).
// ---------------------------------------------------------------------------
export type ScenarioVerdict =
| 'hard_regression' // PR is confidently worse, baseline was reliable
| 'soft_regression' // looser bar — worth investigating, not high-confidence
| 'watch' // moved enough to surface but no flag tier triggered
| 'improvement' // PR is significantly better
| 'stable' // no meaningful change
| 'unreliable_baseline' // confident drop but baseline was too flaky to trust
| 'insufficient_data'; // either side had zero trials
export interface ScenarioClassification {
verdict: ScenarioVerdict;
/** PR pass rate (0..1) */
prPassRate: number;
/** Baseline pass rate (0..1) */
baselinePassRate: number;
/** PR rate minus baseline rate, signed. Negative = PR worse. */
delta: number;
/** Probability the PR is at least this much worse by chance. Lower ⇒ stronger regression evidence. */
pValueLeft: number;
/** Probability the PR is at least this much better by chance. */
pValueRight: number;
}
export interface TierThresholds {
/** Flag only when the chance the gap happened by noise is below this. */
maxPValue: number;
/** Flag only when the absolute pass-rate gap is at least this large (0..1). */
minDelta: number;
/** Flag only when the baseline pass rate was at least this high (0..1). */
minBaselinePassRate: number;
}
export interface ClassifyOptions {
/** Hard-flag thresholds (most strict). Defaults: maxPValue=0.05, minDelta=0.30, minBaselinePassRate=0.70. */
hard?: Partial<TierThresholds>;
/** Soft-flag thresholds (looser). Defaults: maxPValue=0.20, minDelta=0.15, minBaselinePassRate=0.50. */
soft?: Partial<TierThresholds>;
/** Absolute pass-rate change required for a "watch" verdict regardless of significance. Default 0.35. */
watchDelta?: number;
}
const DEFAULT_HARD: TierThresholds = {
maxPValue: 0.05,
minDelta: 0.3,
minBaselinePassRate: 0.7,
};
const DEFAULT_SOFT: TierThresholds = {
maxPValue: 0.2,
minDelta: 0.15,
minBaselinePassRate: 0.5,
};
// Watch threshold: surface scenarios whose pass rate changed by at least 35pp
// without reaching a flag tier. High enough that natural noise on rock-solid
// scenarios (e.g. 2/3 vs 10/10 = 33pp) doesn't crowd the comment.
const DEFAULT_WATCH_DELTA = 0.35;
function meetsThreshold(
pValue: number,
delta: number,
baselineRate: number,
tier: TierThresholds,
direction: 'worse' | 'better',
): boolean {
if (pValue >= tier.maxPValue) return false;
if (direction === 'worse') {
if (delta > -tier.minDelta) return false;
if (baselineRate < tier.minBaselinePassRate) return false;
} else {
if (delta < tier.minDelta) return false;
// Improvements skip the reliability gate — fixing flaky scenarios is a real win.
}
return true;
}
/**
* Classify a single scenario into one of seven verdicts. See ScenarioVerdict
* for the tier semantics.
*
* `options` exists for tests; production callers leave thresholds at defaults.
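*
* @example
* // Hypothetical counts: PR 1/5 vs a solid 9/10 baseline.
* // delta = -0.70, Fisher p ≈ 0.017, baseline rate 0.90, so all three hard
* // gates pass.
* classifyScenario(1, 5, 9, 10).verdict; // 'hard_regression'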
*/
export function classifyScenario(
prPasses: number,
prTotal: number,
baselinePasses: number,
baselineTotal: number,
options: ClassifyOptions = {},
): ScenarioClassification {
const hard: TierThresholds = { ...DEFAULT_HARD, ...options.hard };
const soft: TierThresholds = { ...DEFAULT_SOFT, ...options.soft };
const watchDelta = options.watchDelta ?? DEFAULT_WATCH_DELTA;
const prPassRate = prTotal > 0 ? prPasses / prTotal : 0;
const baselinePassRate = baselineTotal > 0 ? baselinePasses / baselineTotal : 0;
if (prTotal === 0 || baselineTotal === 0) {
return {
verdict: 'insufficient_data',
prPassRate,
baselinePassRate,
delta: prPassRate - baselinePassRate,
pValueLeft: 1,
pValueRight: 1,
};
}
const a = prPasses;
const b = prTotal - prPasses;
const c = baselinePasses;
const d = baselineTotal - baselinePasses;
const pValueLeft = fishersExactOneSidedLeft(a, b, c, d);
const pValueRight = fishersExactOneSidedLeft(c, d, a, b);
const delta = prPassRate - baselinePassRate;
// Improvement (right tail) — single tier, hard thresholds only
if (meetsThreshold(pValueRight, delta, baselinePassRate, hard, 'better')) {
return { verdict: 'improvement', prPassRate, baselinePassRate, delta, pValueLeft, pValueRight };
}
// Hard regression — passes all three hard gates
if (meetsThreshold(pValueLeft, delta, baselinePassRate, hard, 'worse')) {
return {
verdict: 'hard_regression',
prPassRate,
baselinePassRate,
delta,
pValueLeft,
pValueRight,
};
}
// Confident drop, but on a baseline too flaky to call a regression.
// Surface as `unreliable_baseline` so it's visible without being a flag.
if (
pValueLeft < hard.maxPValue &&
delta <= -hard.minDelta &&
baselinePassRate < hard.minBaselinePassRate
) {
return {
verdict: 'unreliable_baseline',
prPassRate,
baselinePassRate,
delta,
pValueLeft,
pValueRight,
};
}
// Soft regression — passes the looser gates
if (meetsThreshold(pValueLeft, delta, baselinePassRate, soft, 'worse')) {
return {
verdict: 'soft_regression',
prPassRate,
baselinePassRate,
delta,
pValueLeft,
pValueRight,
};
}
// Watch — meaningful movement but no flag fired. Pure visibility.
if (Math.abs(delta) >= watchDelta) {
return { verdict: 'watch', prPassRate, baselinePassRate, delta, pValueLeft, pValueRight };
}
return { verdict: 'stable', prPassRate, baselinePassRate, delta, pValueLeft, pValueRight };
}

View File

@ -28,7 +28,7 @@ import type {
// Constants
// ---------------------------------------------------------------------------
const DEFAULT_TIMEOUT_MS = 600_000;
const DEFAULT_TIMEOUT_MS = 900_000;
const SSE_SETTLE_DELAY_MS = 200;
const POLL_INTERVAL_MS = 500;
const BACKGROUND_TASK_POLL_INTERVAL_MS = 2_000;

View File

@ -39,3 +39,38 @@ export type {
ChecklistItem,
ChecklistResult,
} from './types';
// -- Comparison (regression detection) --
export {
compareBuckets,
byVerdict,
improvements,
hardRegressions,
softRegressions,
watchList,
} from './comparison/compare';
export type {
ComparisonResult,
ScenarioComparison,
ScenarioCounts,
ExperimentBucket,
AggregateComparison,
FailureCategoryComparison,
} from './comparison/compare';
export {
classifyScenario,
fishersExactOneSidedLeft,
wilsonInterval,
} from './comparison/statistics';
export type {
ScenarioVerdict,
ScenarioClassification,
ClassifyOptions,
TierThresholds,
} from './comparison/statistics';
export { formatComparisonMarkdown, formatComparisonTerminal } from './comparison/format';
export {
fetchBaselineBucket,
findLatestBaseline,
BASELINE_EXPERIMENT_PREFIX,
} from './comparison/fetch-baseline';