From bbe3e2d1487e06df1e58057ec8c47edb5ad19aa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Braulio=20Gonz=C3=A1lez=20Valido?= Date: Wed, 6 May 2026 09:15:08 +0100 Subject: [PATCH] feat(ai-builder): Add per-PR eval regression detection vs LangSmith baseline (#29456) Co-authored-by: Claude Opus 4.7 (1M context) --- .github/workflows/test-evals-instance-ai.yml | 24 +- .gitignore | 1 + .../@n8n/instance-ai/evaluations/README.md | 43 +- .../__tests__/comparison-compare.test.ts | 190 ++++ .../__tests__/comparison-format.test.ts | 458 +++++++++ .../__tests__/comparison-statistics.test.ts | 161 +++ .../@n8n/instance-ai/evaluations/cli/args.ts | 4 +- .../@n8n/instance-ai/evaluations/cli/index.ts | 262 +++-- .../evaluations/comparison/compare.ts | 333 ++++++ .../evaluations/comparison/fetch-baseline.ts | 123 +++ .../evaluations/comparison/format.ts | 961 ++++++++++++++++++ .../evaluations/comparison/statistics.ts | 304 ++++++ .../instance-ai/evaluations/harness/runner.ts | 2 +- .../@n8n/instance-ai/evaluations/index.ts | 35 + 14 files changed, 2818 insertions(+), 83 deletions(-) create mode 100644 packages/@n8n/instance-ai/evaluations/__tests__/comparison-compare.test.ts create mode 100644 packages/@n8n/instance-ai/evaluations/__tests__/comparison-format.test.ts create mode 100644 packages/@n8n/instance-ai/evaluations/__tests__/comparison-statistics.test.ts create mode 100644 packages/@n8n/instance-ai/evaluations/comparison/compare.ts create mode 100644 packages/@n8n/instance-ai/evaluations/comparison/fetch-baseline.ts create mode 100644 packages/@n8n/instance-ai/evaluations/comparison/format.ts create mode 100644 packages/@n8n/instance-ai/evaluations/comparison/statistics.ts diff --git a/.github/workflows/test-evals-instance-ai.yml b/.github/workflows/test-evals-instance-ai.yml index 474707712cd..edae81fac4c 100644 --- a/.github/workflows/test-evals-instance-ai.yml +++ b/.github/workflows/test-evals-instance-ai.yml @@ -143,7 +143,7 @@ jobs: --base-url "$BASE_URLS" \ --concurrency 32 \ --verbose \ - --iterations 3 \ + --iterations 5 \ ${{ inputs.filter && format('--filter "{0}"', inputs.filter) || '' }} - name: Stop n8n containers @@ -160,22 +160,16 @@ jobs: env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - RESULTS_FILE="packages/@n8n/instance-ai/eval-results.json" - if [ ! -f "$RESULTS_FILE" ]; then - echo "No eval results file found" + # The eval CLI writes the full PR comment as eval-pr-comment.md + # (see comparison/format.ts:formatComparisonMarkdown). It includes + # the alert, aggregate, comparison sections, per-test-case results + # collapsed, and failure details collapsed. CI just relays it. + COMMENT_FILE="packages/@n8n/instance-ai/eval-pr-comment.md" + if [ ! -f "$COMMENT_FILE" ]; then + echo "No PR comment file found (eval likely cancelled before writing results)" exit 0 fi - - # Build the full comment body with jq - jq -r ' - "### Instance AI Workflow Eval Results\n\n" + - "**\(.summary.built)/\(.summary.testCases) built | \(.totalRuns) run(s) | pass@\(.totalRuns): \(.summary.passAtK * 100 | floor)% | pass^\(.totalRuns): \(.summary.passHatK * 100 | floor)% | iterations: \(.summary.passRatePerIter)**\n\n" + - "| Workflow | Build | pass@\(.totalRuns) | pass^\(.totalRuns) |\n|---|---|---|---|\n" + - ([.testCases[] as $tc | "| \($tc.name) | \($tc.buildSuccessCount)/\($tc.totalRuns) | \(([$tc.scenarios[] | .passAtK] | add) / ($tc.scenarios | length) * 100 | floor)% | \(([$tc.scenarios[] | .passHatK] | add) / ($tc.scenarios | length) * 100 | floor)% |"] | join("\n")) + - "\n\n
Failure details\n\n" + - ([.testCases[] as $tc | $tc.scenarios[] | select(.passHatK < 1) | "**\($tc.name) / \(.name)** — \(.passCount)/\(.totalRuns) passed" + "\n" + ([.runs[] | select(.passed == false) | "> Run\(if .failureCategory then " [\(.failureCategory)]" else "" end): \(.reasoning | .[0:200])"] | join("\n"))] | join("\n\n")) + - "\n
" - ' "$RESULTS_FILE" > /tmp/eval-comment.md + cp "$COMMENT_FILE" /tmp/eval-comment.md # Find and update existing eval comment, or create new one COMMENT_ID=$(gh api "repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \ diff --git a/.gitignore b/.gitignore index 771d6734e6e..0b3ab9669f3 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,7 @@ packages/testing/playwright/playwright-report packages/testing/playwright/test-results packages/testing/playwright/eval-results.json packages/@n8n/instance-ai/eval-results.json +packages/@n8n/instance-ai/eval-pr-comment.md packages/testing/playwright/.playwright-browsers packages/testing/playwright/.playwright-cli test-results/ diff --git a/packages/@n8n/instance-ai/evaluations/README.md b/packages/@n8n/instance-ai/evaluations/README.md index 734e022c45b..fec6826e401 100644 --- a/packages/@n8n/instance-ai/evaluations/README.md +++ b/packages/@n8n/instance-ai/evaluations/README.md @@ -121,7 +121,7 @@ dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai --iterations 3 | `--base-url` | `http://localhost:5678` | n8n instance URL | | `--email` | E2E test owner | Override login email (or `N8N_EVAL_EMAIL`) | | `--password` | E2E test owner | Override login password (or `N8N_EVAL_PASSWORD`) | -| `--timeout-ms` | `600000` | Per-test-case timeout | +| `--timeout-ms` | `900000` | Per-test-case timeout | | `--output-dir` | cwd | Where to write `eval-results.json` | | `--dataset` | `instance-ai-workflow-evals` | LangSmith dataset name | | `--concurrency` | `16` | Max concurrent scenarios (builds are separately capped at 4) | @@ -155,6 +155,47 @@ Every run produces: **LangSmith caveat:** if `LANGSMITH_API_KEY` is set in `.env.local`, local runs also land in the shared `instance-ai-workflow-evals` dataset. Unset it (or run without `dotenvx`) to keep exploratory runs out of team results. +## Regression detection + +When `LANGSMITH_API_KEY` is set, every eval run automatically compares its results against the most recent pinned baseline (any experiment whose name starts with `instance-ai-baseline-`). Two output files are written: + +- `eval-results.json` — structured data only, including `comparison.result` when a baseline was found. +- `eval-pr-comment.md` — the full PR comment rendered as markdown, including the alert, aggregate, comparison sections, per-test-case results, and failure details. Always written; falls back to a no-baseline summary when no comparison ran. + +The CI PR-comment step uses `eval-pr-comment.md` as the entire comment body (no jq assembly in the workflow). The console output uses a separate aligned-text formatter — same data, no markdown noise in the terminal. + +### Refreshing the baseline + +There is no auto-refresh — refresh explicitly when you want a new reference point, ideally with high N for low noise: + +```bash +# From packages/@n8n/instance-ai/, on master at the version you want to pin +LANGSMITH_API_KEY=... dotenvx run -f ../../../.env.local -- \ + pnpm eval:instance-ai --experiment-name instance-ai-baseline --iterations 10 +``` + +LangSmith appends a random suffix (e.g. `instance-ai-baseline-7abc1234`); the most recently started one becomes the comparison target on the next eval run. The comparison is silently skipped on the baseline-creation run itself. + +### How scenarios are tiered + +Each scenario lands in one of three regression tiers, evaluated in order of strictness: + +- **Regression** — high-confidence flag, gating-grade. 
The drop must be statistically significant (chance of seeing it by noise < 5%), at least 30 percentage points in size, and the baseline must have been reliable (≥ 70% pass rate). +- **Soft regression** — looser bar for visibility on borderline cases. Looser confidence threshold (chance by noise < 20%), drop ≥ 15 percentage points, baseline ≥ 50%. Frequently natural variance — worth a glance only if your changes touch related code paths. +- **Notable movement** — any scenario whose pass rate moved by ≥ 35 percentage points without reaching either flag tier. Pure visibility, no implication of cause. + +Other verdicts: `improvement` (PR significantly better, skips the reliability gate), `unreliable_baseline` (confident drop but baseline was too flaky to call a regression — surfaced but not flagged), `stable`, `insufficient_data`. + +Why these tiers and not a flat percentage threshold? At the small N PR runs use (typically 3 iterations), a flat threshold can't tell a real regression from coin-flip noise. The confidence cutoff filters out gaps that could plausibly happen by chance, and the reliability gate avoids chasing noise on already-flaky scenarios. Implementation lives in `comparison/statistics.ts` (Fisher's exact test for the confidence check, Wilson interval for the headline aggregate band). Tune the soft tier first if the false-positive rate looks off — keep the hard tier strict. + +### Failure-category drift + +When both sides captured per-trial `failureCategory` values, the comparison also surfaces a run-level table of category rates (PR vs baseline). A category is marked **notable** when its absolute rate delta is ≥ 5 percentage points _and_ the count change beyond what scenario-count scaling would predict is ≥ 3 trials. This catches cross-scenario shifts (e.g. mock-generation breaking, or a model getting weaker overall) that per-scenario flags can miss. + +### Best-effort + +Comparison is logged and skipped on any LangSmith failure — it never fails the eval. It is also skipped when no baseline experiment exists yet. 
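+
+### Verdict examples
+
+As a concrete illustration of the tiering above, `classifyScenario` — the helper exposed by `comparison/statistics.ts` and used internally by the comparison — can be called directly with pass/total counts. This is a usage sketch only (import path shown relative to `evaluations/`); the expected verdicts below mirror the cases covered in `__tests__/comparison-statistics.test.ts`:
+
+```ts
+import { classifyScenario } from './comparison/statistics';
+
+// 0/3 on the PR vs 3/3 on the baseline: -100pp drop, Fisher one-sided p ≈ 0.05,
+// baseline fully reliable → flagged as a hard regression even at PR-scale N.
+classifyScenario(0, 3, 3, 3).verdict; // 'hard_regression'
+
+// 9/10 vs 10/10: only a 10pp drop, below the 15pp soft bar → stable.
+classifyScenario(9, 10, 10, 10).verdict; // 'stable'
+
+// 0/10 vs 4/10: confident drop, but the 40% baseline is too flaky to flag.
+classifyScenario(0, 10, 4, 10).verdict; // 'unreliable_baseline'
+```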
+ ## Pairwise evals Pairwise evals score a built workflow against the dataset's `dos` / `donts` diff --git a/packages/@n8n/instance-ai/evaluations/__tests__/comparison-compare.test.ts b/packages/@n8n/instance-ai/evaluations/__tests__/comparison-compare.test.ts new file mode 100644 index 00000000000..a94a26dbc7c --- /dev/null +++ b/packages/@n8n/instance-ai/evaluations/__tests__/comparison-compare.test.ts @@ -0,0 +1,190 @@ +import { compareBuckets, type ExperimentBucket, type ScenarioCounts } from '../comparison/compare'; + +function bucket( + name: string, + scenarios: ScenarioCounts[], + categories?: { totals: Record; trialTotal: number }, +): ExperimentBucket { + return { + experimentName: name, + scenarios: new Map(scenarios.map((s) => [`${s.testCaseFile}/${s.scenarioName}`, s])), + failureCategoryTotals: categories?.totals, + trialTotal: categories?.trialTotal, + }; +} + +function s(file: string, scenario: string, passed: number, total: number): ScenarioCounts { + return { testCaseFile: file, scenarioName: scenario, passed, total }; +} + +describe('compareBuckets', () => { + it('produces a clean intersection when both sides have the same scenarios', () => { + const pr = bucket('pr', [s('contact', 'happy', 8, 10), s('weather', 'happy', 1, 10)]); + const base = bucket('master', [s('contact', 'happy', 9, 10), s('weather', 'happy', 0, 10)]); + + const result = compareBuckets(pr, base); + + expect(result.scenarios).toHaveLength(2); + expect(result.prOnly).toEqual([]); + expect(result.baselineOnly).toEqual([]); + expect(result.aggregate.intersectionSize).toBe(2); + }); + + it('flags scenarios only present on one side', () => { + const pr = bucket('pr', [s('contact', 'happy', 5, 10)]); + const base = bucket('master', [s('contact', 'happy', 8, 10), s('weather', 'happy', 5, 10)]); + + const result = compareBuckets(pr, base); + + expect(result.scenarios).toHaveLength(1); + expect(result.scenarios[0].testCaseFile).toBe('contact'); + expect(result.baselineOnly).toEqual([{ testCaseFile: 'weather', scenarioName: 'happy' }]); + expect(result.prOnly).toEqual([]); + }); + + it('aggregates only over the intersection, not over baseline-only or pr-only', () => { + const pr = bucket('pr', [s('contact', 'happy', 10, 10)]); + const base = bucket('master', [s('contact', 'happy', 5, 10), s('other', 'happy', 0, 10)]); + + const result = compareBuckets(pr, base); + + expect(result.aggregate.prAggregatePassRate).toBe(1); + expect(result.aggregate.baselineAggregatePassRate).toBe(0.5); + expect(result.aggregate.intersectionSize).toBe(1); + }); + + it('sorts scenarios with regressions first, then improvements, then stable', () => { + const pr = bucket('pr', [ + s('a', 'stable', 10, 10), + s('b', 'regression', 0, 10), + s('c', 'improvement', 10, 10), + ]); + const base = bucket('master', [ + s('a', 'stable', 10, 10), + s('b', 'regression', 10, 10), + s('c', 'improvement', 0, 10), + ]); + + const result = compareBuckets(pr, base); + expect(result.scenarios.map((sc) => sc.scenarioName)).toEqual([ + 'regression', + 'improvement', + 'stable', + ]); + }); + + it('returns insufficient_data when one side has zero trials for a scenario', () => { + const pr = bucket('pr', [s('contact', 'happy', 0, 0)]); + const base = bucket('master', [s('contact', 'happy', 10, 10)]); + + const result = compareBuckets(pr, base); + expect(result.scenarios[0].verdict).toBe('insufficient_data'); + }); + + it('returns no failure-category drift when either side lacks category totals', () => { + const pr = bucket('pr', [s('a', 'happy', 8, 10)]); + 
const base = bucket('master', [s('a', 'happy', 8, 10)]); + expect(compareBuckets(pr, base).failureCategories).toEqual([]); + }); + + it('flags a category as notable when both rate and trial-count gaps clear the bars', () => { + // Haiku-style shift: framework_issue 0/290 → 9/145. + // Rate gap: 6.2pp ≥ 5pp ✓. Expected PR count given baseline = 0 × (145/290) = 0; |9 − 0| = 9 ≥ 3 ✓. + const pr = bucket('pr', [s('a', 'happy', 50, 145)], { + totals: { framework_issue: 9 }, + trialTotal: 145, + }); + const base = bucket('master', [s('a', 'happy', 200, 290)], { + totals: { framework_issue: 0 }, + trialTotal: 290, + }); + const cats = compareBuckets(pr, base).failureCategories; + const fw = cats.find((c) => c.category === 'framework_issue'); + expect(fw?.notable).toBe(true); + }); + + it('does not flag when the rate gap is below the 5pp bar', () => { + // 3/100 vs 2/100 = 1pp gap, count gap = 1 — neither bar cleared. + const pr = bucket('pr', [s('a', 'happy', 50, 100)], { + totals: { mock_issue: 3 }, + trialTotal: 100, + }); + const base = bucket('master', [s('a', 'happy', 50, 100)], { + totals: { mock_issue: 2 }, + trialTotal: 100, + }); + const cats = compareBuckets(pr, base).failureCategories; + expect(cats.find((c) => c.category === 'mock_issue')?.notable).toBe(false); + }); + + it('does not flag when the rate gap is large but the count gap is tiny (small N guard)', () => { + // PR 1/3 vs baseline 0/270 — rate gap = 33pp ≥ 5pp, but expected PR count = 0 + // and observed = 1, count gap = 1 < 3. Should NOT flag — single trial on small N. + const pr = bucket('pr', [s('a', 'happy', 0, 3)], { + totals: { builder_issue: 1 }, + trialTotal: 3, + }); + const base = bucket('master', [s('a', 'happy', 270, 270)], { + totals: { builder_issue: 0 }, + trialTotal: 270, + }); + const cats = compareBuckets(pr, base).failureCategories; + expect(cats.find((c) => c.category === 'builder_issue')?.notable).toBe(false); + }); + + it('drops unknown categories with a console warning, keeps all known categories', () => { + const warn = jest.spyOn(console, 'warn').mockImplementation(() => {}); + const pr = bucket('pr', [s('a', 'happy', 8, 10)], { + totals: { '-': 5, builder_issue: 2 }, + trialTotal: 10, + }); + const base = bucket('master', [s('a', 'happy', 8, 10)], { + totals: { builder_issue: 1 }, + trialTotal: 10, + }); + const cats = compareBuckets(pr, base).failureCategories; + // All five known categories are always present (some at 0/0 — renderer + // drops those). The unknown `-` category is dropped here with a warning. + expect(cats.map((c) => c.category).sort()).toEqual([ + 'build_failure', + 'builder_issue', + 'framework_issue', + 'mock_issue', + 'verification_failure', + ]); + expect(warn).toHaveBeenCalledWith(expect.stringContaining('"-"')); + warn.mockRestore(); + }); + + it('sorts notable categories before non-notable, then by absolute delta', () => { + const pr = bucket('pr', [s('a', 'happy', 50, 100)], { + totals: { framework_issue: 10, mock_issue: 4, builder_issue: 25 }, + trialTotal: 100, + }); + const base = bucket('master', [s('a', 'happy', 50, 100)], { + totals: { framework_issue: 0, mock_issue: 3, builder_issue: 22 }, + trialTotal: 100, + }); + const cats = compareBuckets(pr, base).failureCategories; + // framework_issue is the only notable one (rate gap 10pp, count gap 10). 
+ expect(cats[0].category).toBe('framework_issue'); + expect(cats[0].notable).toBe(true); + expect(cats.slice(1).every((c) => !c.notable)).toBe(true); + }); + + it('accepts custom tiered thresholds for tests', () => { + const pr = bucket('pr', [s('a', 'happy', 5, 10)]); + const base = bucket('master', [s('a', 'happy', 8, 10)]); + + // Defaults: 5/10 vs 8/10 = -30pp drop, p ≈ 0.18 → soft_regression + // (passes soft maxPValue=0.20, soft minDelta=0.15, baseline 80% above soft 50%). + const defaults = compareBuckets(pr, base); + expect(defaults.scenarios[0].verdict).toBe('soft_regression'); + + // Stricter soft p-value cutoff excludes this case. + const stricter = compareBuckets(pr, base, { + soft: { maxPValue: 0.1, minDelta: 0.15, minBaselinePassRate: 0.5 }, + }); + expect(['stable', 'watch']).toContain(stricter.scenarios[0].verdict); + }); +}); diff --git a/packages/@n8n/instance-ai/evaluations/__tests__/comparison-format.test.ts b/packages/@n8n/instance-ai/evaluations/__tests__/comparison-format.test.ts new file mode 100644 index 00000000000..7995663861f --- /dev/null +++ b/packages/@n8n/instance-ai/evaluations/__tests__/comparison-format.test.ts @@ -0,0 +1,458 @@ +import { + compareBuckets, + type ComparisonOutcome, + type ComparisonResult, + type ExperimentBucket, + type ScenarioCounts, +} from '../comparison/compare'; +import { formatComparisonMarkdown, formatComparisonTerminal } from '../comparison/format'; +import type { MultiRunEvaluation, WorkflowTestCase, ScenarioResult } from '../types'; + +function ok(result: ComparisonResult): ComparisonOutcome { + return { kind: 'ok', result }; +} + +function slugMap(evaluation: MultiRunEvaluation, slugs: string[]): Map { + return new Map(evaluation.testCases.map((tc, i) => [tc.testCase, slugs[i] ?? 'unknown'])); +} + +function bucket(name: string, scenarios: ScenarioCounts[]): ExperimentBucket { + return { + experimentName: name, + scenarios: new Map(scenarios.map((s) => [`${s.testCaseFile}/${s.scenarioName}`, s])), + }; +} + +function s(file: string, scenario: string, passed: number, total: number): ScenarioCounts { + return { testCaseFile: file, scenarioName: scenario, passed, total }; +} + +/** Minimal evaluation fixture matching the shape format.ts reads. */ +function evaluation( + opts: { + totalRuns?: number; + testCases?: Array<{ + prompt?: string; + buildSuccessCount?: number; + scenarios?: Array<{ + name: string; + passCount: number; + passes: boolean[]; // per-iteration pass/fail + reasoning?: string; + failureCategory?: string; + }>; + }>; + } = {}, +): MultiRunEvaluation { + const totalRuns = opts.totalRuns ?? 3; + return { + totalRuns, + testCases: (opts.testCases ?? []).map((tc) => { + const testCase = { + prompt: tc.prompt ?? 'Test workflow prompt', + complexity: 'medium' as const, + tags: [], + scenarios: (tc.scenarios ?? []).map((sa) => ({ + name: sa.name, + description: '', + dataSetup: '', + successCriteria: '', + })), + } as WorkflowTestCase; + const buildSuccessCount = tc.buildSuccessCount ?? totalRuns; + const scenarios = (tc.scenarios ?? []).map((sa) => ({ + scenario: testCase.scenarios.find((sc) => sc.name === sa.name)!, + passCount: sa.passCount, + passRate: totalRuns > 0 ? sa.passCount / totalRuns : 0, + passAtK: new Array(totalRuns).fill(sa.passCount > 0 ? 1 : 0) as number[], + passHatK: new Array(totalRuns).fill(sa.passCount === totalRuns ? 
1 : 0) as number[], + runs: sa.passes.map( + (passed): ScenarioResult => ({ + scenario: testCase.scenarios.find((sc) => sc.name === sa.name)!, + success: passed, + score: passed ? 1 : 0, + reasoning: sa.reasoning ?? '', + failureCategory: !passed ? sa.failureCategory : undefined, + }), + ), + })); + return { + testCase, + workflowBuildSuccess: buildSuccessCount > 0, + scenarioResults: [], + scenarios, + runs: new Array(totalRuns).fill(null).map(() => ({ + testCase, + workflowBuildSuccess: buildSuccessCount > 0, + scenarioResults: [], + })), + buildSuccessCount, + }; + }), + }; +} + +describe('formatComparisonMarkdown', () => { + const evalFixture = evaluation({ + totalRuns: 3, + testCases: [ + { + prompt: 'a', + scenarios: [{ name: 'happy', passCount: 0, passes: [false, false, false] }], + }, + ], + }); + + it('renders heading, alert, aggregate, and a regression table', () => { + const pr = bucket('pr', [s('a', 'happy', 0, 3)]); + const base = bucket('master-abc', [s('a', 'happy', 10, 10)]); + const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base))); + + expect(md).toMatch(/### Instance AI Workflow Eval/); + expect(md).toMatch(/> \[!CAUTION\]/); + expect(md).toMatch(/1 regression/); + expect(md).toMatch(/\*\*Aggregate\*\*: 0\.0% PR vs 100\.0% baseline/); + expect(md).toMatch(/#### Regressions \(1\)/); + expect(md).toMatch(/`a\/happy`/); + expect(md).toMatch(/0\/3 \(0%\)/); + expect(md).toMatch(/-100pp ↓/); + }); + + it('uses TIP alert when there are only improvements', () => { + const pr = bucket('pr', [s('a', 'happy', 3, 3)]); + const base = bucket('master', [s('a', 'happy', 0, 10)]); + const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base))); + + expect(md).toMatch(/> \[!TIP\]/); + expect(md).toMatch(/1 improvement/); + expect(md).toMatch(/#### Improvements \(1\)/); + expect(md).toMatch(/\+100pp ↑/); + }); + + it('uses TIP alert with "0 regressions" when everything is stable', () => { + const pr = bucket('pr', [s('a', 'happy', 8, 10)]); + const base = bucket('master', [s('a', 'happy', 8, 10)]); + const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base))); + + expect(md).toMatch(/> \[!TIP\]/); + expect(md).toMatch(/0 regressions/); + expect(md).toMatch(/1 stable/); + expect(md).not.toMatch(/#### Regressions/); + }); + + it('renders LangSmith-disabled NOTE when outcome is undefined', () => { + const md = formatComparisonMarkdown(evalFixture); + expect(md).toMatch(/> \[!NOTE\]/); + expect(md).toMatch(/LangSmith disabled/); + expect(md).not.toMatch(/#### Regressions/); + }); + + it('renders distinct alerts per skip reason', () => { + const noBase = formatComparisonMarkdown(evalFixture, { kind: 'no_baseline' }); + expect(noBase).toMatch(/> \[!NOTE\]/); + expect(noBase).toMatch(/No baseline configured/); + + const selfBase = formatComparisonMarkdown(evalFixture, { + kind: 'self_baseline', + experimentName: 'instance-ai-baseline-abc', + }); + expect(selfBase).toMatch(/> \[!NOTE\]/); + expect(selfBase).toMatch(/This run is the baseline/); + expect(selfBase).toMatch(/instance-ai-baseline-abc/); + + const fetchFail = formatComparisonMarkdown(evalFixture, { + kind: 'fetch_failed', + error: 'LangSmith 503', + }); + // fetch_failed is a real outage, not a benign skip — must be a WARNING. 
+ expect(fetchFail).toMatch(/> \[!WARNING\]/); + expect(fetchFail).toMatch(/Regression detection did not run/); + expect(fetchFail).toMatch(/LangSmith 503/); + }); + + it('shows mixed-case alert when both regressions and improvements exist', () => { + const pr = bucket('pr', [s('a', 'happy', 0, 3), s('b', 'happy', 3, 3)]); + const base = bucket('master', [s('a', 'happy', 10, 10), s('b', 'happy', 0, 10)]); + const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base))); + expect(md).toMatch(/> \[!CAUTION\]/); + expect(md).toMatch(/1 regression/); + expect(md).toMatch(/1 improvement/); + expect(md).toMatch(/#### Regressions/); + expect(md).toMatch(/#### Improvements/); + }); + + it('embeds commit SHA in heading when provided', () => { + const pr = bucket('pr', [s('a', 'happy', 8, 10)]); + const base = bucket('master', [s('a', 'happy', 8, 10)]); + const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)), { + commitSha: 'abc1234567890def', + }); + expect(md).toMatch(/### Instance AI Workflow Eval — `abc12345`/); + }); + + it('marks new failure categories with 🆕', () => { + const pr: ExperimentBucket = { + experimentName: 'pr', + scenarios: new Map([['a/happy', { ...s('a', 'happy', 0, 3) }]]), + failureCategoryTotals: { framework_issue: 9 }, + trialTotal: 145, + }; + const base: ExperimentBucket = { + experimentName: 'master', + scenarios: new Map([['a/happy', { ...s('a', 'happy', 5, 10) }]]), + failureCategoryTotals: { framework_issue: 0 }, + trialTotal: 290, + }; + const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base))); + expect(md).toMatch(/#### Failure breakdown/); + expect(md).toMatch(/`framework_issue` 🆕/); + expect(md).toMatch(/\*\*notable\*\*/); + }); + + it('always includes all five tier counts in the alert line', () => { + const pr = bucket('pr', [s('a', 'happy', 8, 10)]); + const base = bucket('master', [s('a', 'happy', 8, 10)]); + const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base))); + expect(md).toMatch(/0 regressions, 0 soft, 0 notable, 0 improvements, 1 stable/); + }); + + it('renders a per-scenario breakdown collapsible inside the regression section', () => { + const evalWithFailures = evaluation({ + totalRuns: 3, + testCases: [ + { + prompt: 'a', + scenarios: [ + { + name: 'happy', + passCount: 0, + passes: [false, false, false], + reasoning: 'Builder produced an unsupported node configuration', + failureCategory: 'builder_issue', + }, + ], + }, + ], + }); + const pr = bucket('pr', [s('a', 'happy', 0, 3)]); + const base = bucket('master', [s('a', 'happy', 10, 10)]); + const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)), { + slugByTestCase: slugMap(evalWithFailures, ['a']), + }); + + expect(md).toMatch(/#### Regressions \(1\)/); + // The regression row's collapsible should appear inside the Regressions + // section, before the per-test-case section, and carry the same slug. 
+ const regressionsIdx = md.indexOf('#### Regressions'); + const perTcIdx = md.indexOf('Per-test-case results'); + const breakdownIdx = md.indexOf('a/happy'); + expect(breakdownIdx).toBeGreaterThan(regressionsIdx); + expect(breakdownIdx).toBeLessThan(perTcIdx); + expect(md).toMatch(/3 of 3 failed · 3× builder_issue/); + expect(md).toMatch(/Run 1 \[builder_issue\]: Builder produced/); + }); + + it('uses `file/scenario` slug headers in the bottom Failure details section', () => { + const evalWithFailures = evaluation({ + totalRuns: 3, + testCases: [ + { + prompt: 'Build a cross-team Linear report digest', + scenarios: [ + { + name: 'no-cross-team-issues', + passCount: 0, + passes: [false, false, false], + reasoning: 'reason', + failureCategory: 'builder_issue', + }, + ], + }, + ], + }); + const pr = bucket('pr', [s('cross-team-linear-report', 'no-cross-team-issues', 0, 3)]); + const base = bucket('master', [s('cross-team-linear-report', 'no-cross-team-issues', 10, 10)]); + const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)), { + slugByTestCase: slugMap(evalWithFailures, ['cross-team-linear-report']), + }); + + expect(md).toMatch(/Failure details<\/summary>/); + expect(md).toMatch(/\*\*`cross-team-linear-report\/no-cross-team-issues`\*\* — 3 failed/); + }); + + it('attaches per-scenario failures to the right file slug when names collide', () => { + // Two test cases each defining `happy-path`. Without the slug map, + // the renderer would conflate them — Albert's review flagged this + // exact bug. With the map, each row's collapsible carries only that + // row's failures. + const evalWithFailures = evaluation({ + totalRuns: 3, + testCases: [ + { + prompt: 'cross-team prompt', + scenarios: [ + { + name: 'happy-path', + passCount: 0, + passes: [false, false, false], + reasoning: 'Linear node misconfigured', + failureCategory: 'builder_issue', + }, + ], + }, + { + prompt: 'weather prompt', + scenarios: [ + { + name: 'happy-path', + passCount: 0, + passes: [false, false, false], + reasoning: 'Weather mock returned empty', + failureCategory: 'mock_issue', + }, + ], + }, + ], + }); + const pr = bucket('pr', [ + s('cross-team-linear-report', 'happy-path', 0, 3), + s('weather-monitoring', 'happy-path', 0, 3), + ]); + const base = bucket('master', [ + s('cross-team-linear-report', 'happy-path', 10, 10), + s('weather-monitoring', 'happy-path', 10, 10), + ]); + const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)), { + slugByTestCase: slugMap(evalWithFailures, ['cross-team-linear-report', 'weather-monitoring']), + }); + + // Each per-scenario collapsible (under the regression table) must show + // ONLY its own failures. Slice each block at its closing . 
+ function collapsibleFor(slug: string): string { + const open = md.indexOf(`${slug}`); + expect(open).toBeGreaterThan(-1); + const close = md.indexOf('', open); + return md.slice(open, close); + } + const crossTeamBlock = collapsibleFor('cross-team-linear-report/happy-path'); + const weatherBlock = collapsibleFor('weather-monitoring/happy-path'); + expect(crossTeamBlock).toMatch(/Linear node misconfigured/); + expect(crossTeamBlock).not.toMatch(/Weather mock returned empty/); + expect(weatherBlock).toMatch(/Weather mock returned empty/); + expect(weatherBlock).not.toMatch(/Linear node misconfigured/); + }); + + it('uses the slug instead of the prompt in the per-test-case table', () => { + const evalFx = evaluation({ + totalRuns: 3, + testCases: [ + { + prompt: 'Build a cross-team Linear report digest from open issues', + scenarios: [{ name: 'happy', passCount: 0, passes: [false, false, false] }], + }, + ], + }); + const pr = bucket('pr', [s('cross-team-linear-report', 'happy', 0, 3)]); + const base = bucket('master', [s('cross-team-linear-report', 'happy', 10, 10)]); + const md = formatComparisonMarkdown(evalFx, ok(compareBuckets(pr, base)), { + slugByTestCase: slugMap(evalFx, ['cross-team-linear-report']), + }); + + // Per-test-case table cell should be the slug, not the prompt. + const perTcSection = md.slice(md.indexOf('Per-test-case results')); + expect(perTcSection).toMatch(/`cross-team-linear-report`/); + expect(perTcSection).not.toMatch(/Build a cross-team Linear report digest/); + }); + + it('skips per-scenario breakdown when slugByTestCase is omitted', () => { + // Without the slug map, the renderer can't disambiguate. We'd rather + // drop the breakdown than show a wrong one. + const evalWithFailures = evaluation({ + totalRuns: 3, + testCases: [ + { + prompt: 'a', + scenarios: [ + { + name: 'happy', + passCount: 0, + passes: [false, false, false], + reasoning: 'Some failure', + failureCategory: 'builder_issue', + }, + ], + }, + ], + }); + const pr = bucket('pr', [s('a', 'happy', 0, 3)]); + const base = bucket('master', [s('a', 'happy', 10, 10)]); + const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base))); + + // Regression table still rendered. + expect(md).toMatch(/#### Regressions \(1\)/); + // But no per-scenario collapsible (which would have used a/happy + // with the breakdown summary text). + expect(md).not.toMatch(/3 of 3 failed · 3× builder_issue/); + }); + + it('renders the failure breakdown for non-notable categories with non-zero counts', () => { + // 50/100 vs 50/100 — no scenario regression, but still has builder_issue + // counts on both sides (non-notable but non-zero). + const pr: ExperimentBucket = { + experimentName: 'pr', + scenarios: new Map([['a/happy', { ...s('a', 'happy', 50, 100) }]]), + failureCategoryTotals: { builder_issue: 25 }, + trialTotal: 100, + }; + const base: ExperimentBucket = { + experimentName: 'master', + scenarios: new Map([['a/happy', { ...s('a', 'happy', 50, 100) }]]), + failureCategoryTotals: { builder_issue: 22 }, + trialTotal: 100, + }; + const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base))); + expect(md).toMatch(/#### Failure breakdown/); + expect(md).toMatch(/`builder_issue`/); + // builder_issue isn't notable here, so no "notable" marker. 
+ expect(md).not.toMatch(/builder_issue.*notable/); + }); +}); + +describe('formatComparisonTerminal', () => { + const evalFixture = evaluation({ + totalRuns: 3, + testCases: [ + { + prompt: 'a', + scenarios: [{ name: 'happy', passCount: 0, passes: [false, false, false] }], + }, + ], + }); + + it('renders title, verdict, aggregate, and regression table without markdown syntax', () => { + const pr = bucket('pr', [s('a', 'happy', 0, 3)]); + const base = bucket('master-abc', [s('a', 'happy', 10, 10)]); + const out = formatComparisonTerminal(evalFixture, ok(compareBuckets(pr, base))); + expect(out).toMatch(/^Instance AI Workflow Eval/); + expect(out).toMatch(/▶ 1 regression/); + expect(out).toMatch(/PR\s{8}0\.0%/); + expect(out).toMatch(/baseline\s{2}100\.0%/); + expect(out).toMatch(/REGRESSIONS/); + expect(out).toMatch(/a\/happy/); + expect(out).not.toMatch(/^###/m); + expect(out).not.toMatch(/\| /); + }); + + it('renders LangSmith-disabled message when outcome is undefined', () => { + const out = formatComparisonTerminal(evalFixture); + expect(out).toMatch(/LangSmith disabled/); + expect(out).not.toMatch(/REGRESSIONS/); + }); + + it('shows partial banner when scenarios differ on each side', () => { + const pr = bucket('pr', [s('a', 'happy', 8, 10)]); + const base = bucket('master', [s('a', 'happy', 8, 10), s('b', 'happy', 5, 10)]); + const out = formatComparisonTerminal(evalFixture, ok(compareBuckets(pr, base))); + expect(out).toMatch(/partial: 1 baseline scenarios not run by PR/); + }); +}); diff --git a/packages/@n8n/instance-ai/evaluations/__tests__/comparison-statistics.test.ts b/packages/@n8n/instance-ai/evaluations/__tests__/comparison-statistics.test.ts new file mode 100644 index 00000000000..68028c7182f --- /dev/null +++ b/packages/@n8n/instance-ai/evaluations/__tests__/comparison-statistics.test.ts @@ -0,0 +1,161 @@ +import { + classifyScenario, + fishersExactOneSidedLeft, + wilsonInterval, +} from '../comparison/statistics'; + +describe('fishersExactOneSidedLeft', () => { + it('returns 1 when either row is empty (no information)', () => { + expect(fishersExactOneSidedLeft(0, 0, 5, 5)).toBe(1); + expect(fishersExactOneSidedLeft(5, 5, 0, 0)).toBe(1); + }); + + it('returns 1 when no failures or no passes are observed (no test possible)', () => { + expect(fishersExactOneSidedLeft(3, 0, 5, 0)).toBe(1); + expect(fishersExactOneSidedLeft(0, 3, 0, 5)).toBe(1); + }); + + it('matches a known textbook case', () => { + // 2x2 table where PR (1/3) is much worse than baseline (10/10). + // Hypergeometric: P(X = 0) + P(X = 1) | drawn=3 from passes=11, fails=2 + // = C(11,0)C(2,3)/C(13,3) + C(11,1)C(2,2)/C(13,3) + // = 0 + 11/286 ≈ 0.03846 + const p = fishersExactOneSidedLeft(1, 2, 10, 0); + expect(p).toBeCloseTo(0.03846, 4); + }); + + it('returns p = 1 when PR pass rate equals baseline at maximum', () => { + // PR all pass, baseline all pass — under H0 the observed PR is the most likely outcome, + // so the left-tail (X ≤ a) p-value is exactly 1. + const p = fishersExactOneSidedLeft(5, 0, 5, 0); + expect(p).toBe(1); + }); + + it('detects a strong regression with high N', () => { + // PR 0/10, baseline 10/10 — extremely strong evidence PR is worse. + const p = fishersExactOneSidedLeft(0, 10, 10, 0); + expect(p).toBeLessThan(0.001); + }); + + it('returns 1 when PR matches baseline rates exactly', () => { + // PR 5/10, baseline 5/10 — left tail at the median is around 0.5 + symmetric mass + // at the observed value, but should be > 0.5 (we're at the center of the distribution). 
+ const p = fishersExactOneSidedLeft(5, 5, 5, 5); + expect(p).toBeGreaterThan(0.5); + }); +}); + +describe('wilsonInterval', () => { + it('returns [0, 1] for total=0', () => { + expect(wilsonInterval(0, 0)).toEqual({ lower: 0, upper: 1 }); + }); + + it('produces reasonable bounds for 5/10', () => { + const ci = wilsonInterval(5, 10); + // Known Wilson 95% CI for 5/10: roughly [0.237, 0.763] + expect(ci.lower).toBeCloseTo(0.237, 2); + expect(ci.upper).toBeCloseTo(0.763, 2); + }); + + it('produces tight bounds for 0/100', () => { + const ci = wilsonInterval(0, 100); + expect(ci.lower).toBe(0); + expect(ci.upper).toBeLessThan(0.05); + }); + + it('produces tight bounds for 100/100', () => { + const ci = wilsonInterval(100, 100); + // upper analytically equals 1 but lands slightly under it after FP rounding — + // any reasonable CI for 100/100 should still be tight to the top of the range. + expect(ci.upper).toBeGreaterThanOrEqual(0.99); + expect(ci.lower).toBeGreaterThan(0.95); + }); + + it('throws when passes > total', () => { + expect(() => wilsonInterval(5, 3)).toThrow(); + }); +}); + +describe('classifyScenario', () => { + it('flags a clear regression on a reliable scenario as hard_regression', () => { + const result = classifyScenario(0, 10, 10, 10); + expect(result.verdict).toBe('hard_regression'); + expect(result.delta).toBe(-1); + }); + + it('marks a hard-significant drop on an unreliable baseline as unreliable_baseline', () => { + // Baseline 4/10 (40%) — below hard reliable (70%). PR 0/10 is a 40pp drop with + // Fisher p < 0.05. We surface it as `unreliable_baseline` rather than flagging. + const result = classifyScenario(0, 10, 4, 10); + expect(result.verdict).toBe('unreliable_baseline'); + }); + + it('reports stable when the drop is sub-MDE on a flaky baseline', () => { + // Baseline 1/10 (flaky), PR 0/10 — only a 10pp drop, below MDE. + const result = classifyScenario(0, 10, 1, 10); + expect(result.verdict).toBe('stable'); + }); + + it('does not flag a small drop below the soft MDE threshold', () => { + // 9/10 vs 10/10 = 10pp drop, below soft MDE (15pp). + const result = classifyScenario(9, 10, 10, 10); + expect(result.verdict).toBe('stable'); + }); + + it('flags an improvement when PR is significantly better', () => { + const result = classifyScenario(10, 10, 0, 10); + expect(result.verdict).toBe('improvement'); + }); + + it('flags improvement even on a never-passing baseline', () => { + // "Never passes" baseline (0/10) — fix is worth surfacing without the reliability gate. + const result = classifyScenario(8, 10, 0, 10); + expect(result.verdict).toBe('improvement'); + }); + + it('returns insufficient_data when either side has no trials', () => { + expect(classifyScenario(0, 0, 5, 10).verdict).toBe('insufficient_data'); + expect(classifyScenario(5, 10, 0, 0).verdict).toBe('insufficient_data'); + }); + + it('flags the most extreme outcome at minimum N as hard_regression', () => { + // PR 0/3 vs baseline 3/3 — Fisher one-sided p ≈ 0.05, delta = -100pp. + const result = classifyScenario(0, 3, 3, 3); + expect(result.verdict).toBe('hard_regression'); + }); + + it('reports stable when N is small enough that even a full flip is sub-significant for soft tier', () => { + // PR 1/2 vs baseline 2/2 — delta -50pp but Fisher p ≈ 0.5 (way above soft α=0.20). + // Soft MDE met, but significance fails on both tiers. 
+ const result = classifyScenario(1, 2, 2, 2); + expect(['stable', 'watch']).toContain(result.verdict); + }); + + it('marks soft regression when hard delta is missed but soft thresholds met', () => { + // 6/10 vs 10/10 = 40pp drop, p ≈ 0.043, baseline 100% reliable. + // Hard defaults would flag this; force a stricter hard delta to push it to soft. + const result = classifyScenario(6, 10, 10, 10, { + hard: { maxPValue: 0.05, minDelta: 0.5, minBaselinePassRate: 0.7 }, + soft: { maxPValue: 0.2, minDelta: 0.15, minBaselinePassRate: 0.5 }, + }); + expect(result.verdict).toBe('soft_regression'); + }); + + it('marks watch when delta crosses the watch threshold without significance', () => { + // 5/10 vs 7/10 = -20pp drop, p ≈ 0.32 — not significant for hard or soft. + // Default watchDelta is 0.35, so this should not be `watch`. Force it via + // a smaller threshold to validate the path. + const result = classifyScenario(5, 10, 7, 10, { watchDelta: 0.15 }); + expect(result.verdict).toBe('watch'); + }); + + it('respects custom hard-tier delta override', () => { + // 7/10 vs 10/10 = 30pp delta. Default hard minDelta is 0.3, so this barely qualifies. + // With hard.minDelta 0.4, it drops into `soft_regression` (still passes soft 0.15 minDelta). + // p ≈ 0.105 < soft maxPValue (0.2), so soft fires. + const result = classifyScenario(7, 10, 10, 10, { + hard: { minDelta: 0.4 }, + }); + expect(result.verdict).toBe('soft_regression'); + }); +}); diff --git a/packages/@n8n/instance-ai/evaluations/cli/args.ts b/packages/@n8n/instance-ai/evaluations/cli/args.ts index 85ec5d66f39..830ec247d34 100644 --- a/packages/@n8n/instance-ai/evaluations/cli/args.ts +++ b/packages/@n8n/instance-ai/evaluations/cli/args.ts @@ -45,7 +45,7 @@ export interface CliArgs { // --------------------------------------------------------------------------- const cliArgsSchema = z.object({ - timeoutMs: z.number().int().positive().default(600_000), + timeoutMs: z.number().int().positive().default(900_000), baseUrls: z.array(z.string().url()).min(1).default(['http://localhost:5678']), email: z.string().optional(), password: z.string().optional(), @@ -104,7 +104,7 @@ interface RawArgs { function parseRawArgs(argv: string[]): RawArgs { const result: RawArgs = { - timeoutMs: 600_000, + timeoutMs: 900_000, baseUrls: ['http://localhost:5678'], verbose: false, keepWorkflows: false, diff --git a/packages/@n8n/instance-ai/evaluations/cli/index.ts b/packages/@n8n/instance-ai/evaluations/cli/index.ts index ad1d86ba5a5..5796ab048ca 100644 --- a/packages/@n8n/instance-ai/evaluations/cli/index.ts +++ b/packages/@n8n/instance-ai/evaluations/cli/index.ts @@ -23,6 +23,15 @@ import { buildCIMetadata, computeExperimentPrefix } from './ci-metadata'; import { LaneAllocator } from './lane-allocator'; import { expandWithIterations, partitionRoundRobin } from './lanes'; import { N8nClient } from '../clients/n8n-client'; +import { + compareBuckets, + type ComparisonOutcome, + type ComparisonResult, + type ExperimentBucket, + type ScenarioCounts, +} from '../comparison/compare'; +import { fetchBaselineBucket, findLatestBaseline } from '../comparison/fetch-baseline'; +import { formatComparisonMarkdown, formatComparisonTerminal } from '../comparison/format'; import { seedCredentials, cleanupCredentials } from '../credentials/seeder'; import { loadWorkflowTestCasesWithFiles } from '../data/workflows'; import type { WorkflowTestCaseWithFile } from '../data/workflows'; @@ -43,6 +52,7 @@ import type { MultiRunEvaluation, ScenarioResult, TestScenario, + 
WorkflowTestCase, WorkflowTestCaseResult, } from '../types'; @@ -160,21 +170,40 @@ async function main(): Promise { const hasLangSmith = Boolean(process.env.LANGSMITH_API_KEY); let evaluation: MultiRunEvaluation; + let experimentName: string | undefined; + let outcome: ComparisonOutcome | undefined; + let slugByTestCase: Map | undefined; if (hasLangSmith) { logger.info('LangSmith API key detected, using evaluate() with experiment tracking'); - evaluation = await runWithLangSmith({ args, lanes, logger }); + const langsmithRun = await runWithLangSmith({ args, lanes, logger }); + evaluation = langsmithRun.evaluation; + experimentName = langsmithRun.experimentName; + outcome = langsmithRun.outcome; + slugByTestCase = langsmithRun.slugByTestCase; } else { logger.info('No LANGSMITH_API_KEY, running direct loop (results in eval-results.json only)'); evaluation = await runDirectLoop({ args, lanes, logger }); } const totalDuration = Date.now() - startTime; - const outputPath = writeEvalResults(evaluation, totalDuration, args.outputDir); - console.log(`Results: ${outputPath}`); + const commitSha = process.env.LANGSMITH_REVISION_ID ?? process.env.GITHUB_SHA; + const { jsonPath, prCommentPath } = writeEvalResults( + evaluation, + totalDuration, + args.outputDir, + experimentName, + outcome, + commitSha, + slugByTestCase, + ); + console.log(`Results: ${jsonPath}`); + console.log(`PR comment: ${prCommentPath}`); const htmlPath = writeWorkflowReport(flattenRunsForReport(evaluation)); - console.log(`Report: ${htmlPath}`); - printSummary(evaluation); + console.log(`Report: ${htmlPath}`); + console.log( + '\n' + formatComparisonTerminal(evaluation, outcome, { commitSha, slugByTestCase }), + ); } finally { await Promise.all( lanes.map(async (lane) => { @@ -188,7 +217,12 @@ async function main(): Promise { // LangSmith mode: evaluate() with dataset sync, tracing, experiments // --------------------------------------------------------------------------- -async function runWithLangSmith(config: RunConfig): Promise { +async function runWithLangSmith(config: RunConfig): Promise<{ + evaluation: MultiRunEvaluation; + experimentName: string; + outcome: ComparisonOutcome; + slugByTestCase: Map; +}> { const { args, lanes, logger } = config; const lsClient = new Client(); @@ -466,7 +500,24 @@ async function runWithLangSmith(config: RunConfig): Promise logger, }); - return evaluation; + const outcome = await tryRunComparison({ + lsClient, + prExperimentName: experimentResults.experimentName, + evaluation, + testCasesWithFiles, + logger, + }); + + const slugByTestCase = new Map( + testCasesWithFiles.map(({ testCase, fileSlug }) => [testCase, fileSlug]), + ); + + return { + evaluation, + experimentName: experimentResults.experimentName, + outcome, + slugByTestCase, + }; } finally { if (!args.keepWorkflows) { await Promise.all( @@ -826,15 +877,22 @@ function computePassRatePerIter(evaluation: MultiRunEvaluation): string { function writeEvalResults( evaluation: MultiRunEvaluation, duration: number, - outputDir?: string, -): string { + outputDir: string | undefined, + experimentName: string | undefined, + outcome: ComparisonOutcome | undefined, + commitSha: string | undefined, + slugByTestCase: Map | undefined, +): { jsonPath: string; prCommentPath: string } { const { totalRuns, testCases } = evaluation; const metrics = computeAggregateMetrics(evaluation); + const result = outcome?.kind === 'ok' ? 
outcome.result : undefined; + const report = { timestamp: new Date().toISOString(), duration, totalRuns, + experimentName, summary: { testCases: testCases.length, built: metrics.built, @@ -843,6 +901,19 @@ function writeEvalResults( passHatK: metrics.passHatK, passRatePerIter: metrics.passRatePerIter, }, + // Structured comparison payload only — the rendered markdown lives in + // the sibling `eval-pr-comment.md` file so consumers can pick the format + // they want without re-running the eval. `comparisonStatus` records why + // the comparison was skipped when applicable, so JSON consumers can + // distinguish "no baseline yet" from "regression detection broke". + comparison: result + ? { + baseline: result.baseline.experimentName, + result: serializeComparison(result), + } + : undefined, + comparisonStatus: outcome?.kind ?? 'not_attempted', + comparisonError: outcome?.kind === 'fetch_failed' ? outcome.error : undefined, testCases: testCases.map((tc) => ({ name: tc.testCase.prompt.slice(0, 70), buildSuccessCount: tc.buildSuccessCount, @@ -868,74 +939,137 @@ function writeEvalResults( const targetDir = outputDir ?? process.cwd(); mkdirSync(targetDir, { recursive: true }); - const outputPath = join(targetDir, 'eval-results.json'); - writeFileSync(outputPath, JSON.stringify(report, null, 2)); - return outputPath; + const jsonPath = join(targetDir, 'eval-results.json'); + writeFileSync(jsonPath, JSON.stringify(report, null, 2)); + + // Always write the rendered PR comment — the markdown formatter handles + // both with-comparison and no-baseline cases. CI consumes this file + // directly; local users get a copy-pasteable artifact. + const prCommentPath = join(targetDir, 'eval-pr-comment.md'); + writeFileSync( + prCommentPath, + formatComparisonMarkdown(evaluation, outcome, { commitSha, slugByTestCase }), + ); + + return { jsonPath, prCommentPath }; +} + +/** + * Convert ComparisonResult into a JSON-serializable shape (Maps don't survive + * JSON.stringify by default). + */ +function serializeComparison(result: ComparisonResult): { + pr: { experimentName: string }; + baseline: { experimentName: string }; + aggregate: ComparisonResult['aggregate']; + scenarios: ComparisonResult['scenarios']; + prOnly: ComparisonResult['prOnly']; + baselineOnly: ComparisonResult['baselineOnly']; + failureCategories: ComparisonResult['failureCategories']; +} { + return { + pr: result.pr, + baseline: result.baseline, + aggregate: result.aggregate, + scenarios: result.scenarios, + prOnly: result.prOnly, + baselineOnly: result.baselineOnly, + failureCategories: result.failureCategories, + }; } // --------------------------------------------------------------------------- -// Console summary +// Comparison vs the pinned baseline experiment // --------------------------------------------------------------------------- -function printSummary(evaluation: MultiRunEvaluation): void { - const { totalRuns, testCases } = evaluation; - const multiRun = totalRuns > 1; - const metrics = computeAggregateMetrics(evaluation); +/** + * Best-effort comparison. Returns a tagged outcome so the PR comment can + * distinguish "no baseline yet" / "this run IS the baseline" from a real + * regression-detection outage (LangSmith down, fetch failure). Never throws + * — the eval run is not gated on the comparison. 
+ */ +async function tryRunComparison(config: { + lsClient: Client; + prExperimentName: string; + evaluation: MultiRunEvaluation; + testCasesWithFiles: WorkflowTestCaseWithFile[]; + logger: EvalLogger; +}): Promise { + const { lsClient, prExperimentName, evaluation, testCasesWithFiles, logger } = config; - console.log('\n=== Workflow Eval Results ===\n'); - for (const tc of testCases) { - console.log(`${tc.testCase.prompt.slice(0, 70)}...`); - - if (multiRun) { - console.log(` Build: ${String(tc.buildSuccessCount)}/${String(totalRuns)} runs`); - } else { - const r = tc.runs[0]; - const buildStatus = r.workflowBuildSuccess ? 'BUILT' : 'BUILD FAILED'; - console.log(` Workflow: ${buildStatus}${r.workflowId ? ` (${r.workflowId})` : ''}`); - if (r.buildError) { - console.log(` Error: ${r.buildError.slice(0, 200)}`); - } + try { + const baselineName = await findLatestBaseline(lsClient); + if (!baselineName) { + logger.verbose( + 'No baseline experiment found — skipping comparison. ' + + 'Run with --experiment-name instance-ai-baseline to create one.', + ); + return { kind: 'no_baseline' }; + } + if (baselineName === prExperimentName) { + logger.verbose('Current run is the baseline — skipping comparison.'); + return { kind: 'self_baseline', experimentName: baselineName }; } + logger.info(`Comparing against baseline: ${baselineName}`); + const baseline = await fetchBaselineBucket(lsClient, baselineName); + const pr = bucketFromEvaluation(evaluation, testCasesWithFiles, prExperimentName); + return { kind: 'ok', result: compareBuckets(pr, baseline) }; + } catch (error: unknown) { + const msg = error instanceof Error ? error.message : String(error); + logger.warn(`Comparison vs baseline failed: ${msg}`); + return { kind: 'fetch_failed', error: msg }; + } +} + +/** + * Project the in-memory MultiRunEvaluation onto the bucket shape used by + * fetchBaselineBucket, keyed by `${fileSlug}/${scenarioName}`. + * + * Looks up `fileSlug` by test case reference rather than array index — the + * comparison key depends on getting the right slug, and zipping by index + * silently miscompares if anything ever reorders the aggregate. + */ +function bucketFromEvaluation( + evaluation: MultiRunEvaluation, + testCasesWithFiles: WorkflowTestCaseWithFile[], + experimentName: string, +): ExperimentBucket { + const slugByTestCase = new Map( + testCasesWithFiles.map(({ testCase, fileSlug }) => [testCase, fileSlug]), + ); + const scenarios = new Map(); + const failureCategoryTotals: Record = {}; + let trialTotal = 0; + for (const tc of evaluation.testCases) { + const fileSlug = slugByTestCase.get(tc.testCase); + if (!fileSlug) { + throw new Error( + `bucketFromEvaluation: no fileSlug for test case "${tc.testCase.prompt.slice(0, 60)}"`, + ); + } + const total = tc.runs.length; for (const sa of tc.scenarios) { - if (multiRun) { - const passAtK = Math.round((sa.passAtK[metrics.kIndex] ?? 0) * 100); - const passHatK = Math.round((sa.passHatK[metrics.kIndex] ?? 0) * 100); - console.log( - ` ${sa.scenario.name}: ${String(sa.passCount)}/${String(totalRuns)} passed` + - ` | pass@${String(totalRuns)}: ${String(passAtK)}% | pass^${String(totalRuns)}: ${String(passHatK)}%`, - ); - } else { - const sr = sa.runs[0]; - const icon = sr.success ? '✓' : '✗'; - const category = sr.failureCategory ? ` [${sr.failureCategory}]` : ''; - console.log( - ` ${icon} ${sr.scenario.name}: ${sr.success ? 'PASS' : 'FAIL'}${category} (${String(sr.score * 100)}%)`, - ); - if (!sr.success) { - const execErrors = sr.evalResult?.errors ?? 
[]; - if (execErrors.length > 0) { - console.log(` Error: ${execErrors.join('; ').slice(0, 200)}`); - } - console.log(` Diagnosis: ${sr.reasoning.slice(0, 200)}`); + const key = `${fileSlug}/${sa.scenario.name}`; + const failureCategories: Record = {}; + for (const sr of sa.runs) { + trialTotal++; + if (!sr.success && sr.failureCategory) { + failureCategories[sr.failureCategory] = (failureCategories[sr.failureCategory] ?? 0) + 1; + failureCategoryTotals[sr.failureCategory] = + (failureCategoryTotals[sr.failureCategory] ?? 0) + 1; } } + scenarios.set(key, { + testCaseFile: fileSlug, + scenarioName: sa.scenario.name, + passed: sa.passCount, + total, + failureCategories, + }); } - console.log(''); - } - - if (multiRun) { - console.log( - `${String(metrics.built)}/${String(testCases.length)} built | pass@${String(totalRuns)}: ${String(Math.round(metrics.passAtK * 100))}% | pass^${String(totalRuns)}: ${String(Math.round(metrics.passHatK * 100))}% | iterations: ${metrics.passRatePerIter}`, - ); - } else { - const allScenarios = testCases.flatMap((tc) => tc.scenarios); - const passed = allScenarios.filter((s) => s.runs[0]?.success).length; - const total = metrics.scenariosTotal; - console.log( - `${String(metrics.built)}/${String(testCases.length)} built | ${String(passed)}/${String(total)} passed (${String(total > 0 ? Math.round((passed / total) * 100) : 0)}%)`, - ); } + return { experimentName, scenarios, failureCategoryTotals, trialTotal }; } main().catch((error) => { diff --git a/packages/@n8n/instance-ai/evaluations/comparison/compare.ts b/packages/@n8n/instance-ai/evaluations/comparison/compare.ts new file mode 100644 index 00000000000..12bda63913a --- /dev/null +++ b/packages/@n8n/instance-ai/evaluations/comparison/compare.ts @@ -0,0 +1,333 @@ +// --------------------------------------------------------------------------- +// Comparison core: take two experiment buckets, return a ComparisonResult. +// +// Pure function, no I/O. The tier thresholds (p-value cutoff, minimum delta, +// minimum baseline pass rate) live in statistics.ts — there's no CLI knob. +// Tune them there if the false-positive rate drifts. +// --------------------------------------------------------------------------- + +import { + classifyScenario, + wilsonInterval, + type ClassifyOptions, + type ScenarioClassification, + type ScenarioVerdict, +} from './statistics'; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface ScenarioCounts { + testCaseFile: string; + scenarioName: string; + passed: number; + total: number; + failureCategories?: Record; +} + +export interface ExperimentBucket { + experimentName: string; + scenarios: Map; + /** + * Aggregated failure-category counts across all trials in all scenarios. + * Used for the run-level failure-category drift table — orthogonal to + * per-scenario verdicts. 
+ */ + failureCategoryTotals?: Record; + trialTotal?: number; +} + +export interface ScenarioComparison extends ScenarioClassification { + testCaseFile: string; + scenarioName: string; + prPasses: number; + prTotal: number; + baselinePasses: number; + baselineTotal: number; +} + +export interface AggregateComparison { + intersectionSize: number; + prAggregatePassRate: number; + baselineAggregatePassRate: number; + prAggregateCI: { lower: number; upper: number }; + baselineAggregateCI: { lower: number; upper: number }; + delta: number; +} + +export interface FailureCategoryComparison { + category: string; + prCount: number; + prRate: number; // count / trialTotal + baselineCount: number; + baselineRate: number; + delta: number; // prRate − baselineRate + notable: boolean; +} + +export interface ComparisonResult { + pr: { experimentName: string }; + baseline: { experimentName: string }; + aggregate: AggregateComparison; + scenarios: ScenarioComparison[]; + prOnly: Array<{ testCaseFile: string; scenarioName: string }>; + baselineOnly: Array<{ testCaseFile: string; scenarioName: string }>; + failureCategories: FailureCategoryComparison[]; +} + +/** + * Result of a comparison attempt. The `kind` field distinguishes between + * "ran successfully", "skipped intentionally" (no baseline yet, current run + * IS the baseline), and "failed unexpectedly" (LangSmith API error, fetch + * timeout, etc.). The PR comment renders a different alert per kind so + * readers can tell a missing baseline from a regression-detection outage. + */ +export type ComparisonOutcome = + | { kind: 'ok'; result: ComparisonResult } + | { kind: 'no_baseline' } + | { kind: 'self_baseline'; experimentName: string } + | { kind: 'fetch_failed'; error: string }; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/** Hard regressions only — high-confidence, gating-grade flags. */ +export function hardRegressions(result: ComparisonResult): ScenarioComparison[] { + return result.scenarios.filter((s) => s.verdict === 'hard_regression'); +} + +/** Soft regressions — looser thresholds, worth investigating but not gating. */ +export function softRegressions(result: ComparisonResult): ScenarioComparison[] { + return result.scenarios.filter((s) => s.verdict === 'soft_regression'); +} + +/** Movement ≥ watchDelta without reaching a flag tier. Visibility only. */ +export function watchList(result: ComparisonResult): ScenarioComparison[] { + return result.scenarios.filter((s) => s.verdict === 'watch'); +} + +export function improvements(result: ComparisonResult): ScenarioComparison[] { + return result.scenarios.filter((s) => s.verdict === 'improvement'); +} + +export function byVerdict(result: ComparisonResult): Record { + const counts: Record = { + hard_regression: 0, + soft_regression: 0, + watch: 0, + improvement: 0, + stable: 0, + unreliable_baseline: 0, + insufficient_data: 0, + }; + for (const s of result.scenarios) counts[s.verdict]++; + return counts; +} + +// --------------------------------------------------------------------------- +// Compare +// --------------------------------------------------------------------------- + +/** + * Compare two experiment buckets and produce a structured comparison result. + * + * Aggregate is computed over the *intersection* of scenarios — the only + * scenarios for which the rates are directly comparable. 
PR-only and + * baseline-only scenarios are surfaced separately, not folded into the + * aggregate. + * + * Aggregate pass rate is the *micro* average — total passes / total trials + * across the intersection. + * + * `options` exists for tests; production callers pass nothing. + */ +export function compareBuckets( + pr: ExperimentBucket, + baseline: ExperimentBucket, + options: ClassifyOptions = {}, +): ComparisonResult { + const scenarios: ScenarioComparison[] = []; + const prOnly: Array<{ testCaseFile: string; scenarioName: string }> = []; + const baselineOnly: Array<{ testCaseFile: string; scenarioName: string }> = []; + + let prIPasses = 0; + let prITotal = 0; + let baseIPasses = 0; + let baseITotal = 0; + + for (const [key, prCounts] of pr.scenarios) { + const baseCounts = baseline.scenarios.get(key); + if (!baseCounts) { + prOnly.push({ + testCaseFile: prCounts.testCaseFile, + scenarioName: prCounts.scenarioName, + }); + continue; + } + + prIPasses += prCounts.passed; + prITotal += prCounts.total; + baseIPasses += baseCounts.passed; + baseITotal += baseCounts.total; + + const classification = classifyScenario( + prCounts.passed, + prCounts.total, + baseCounts.passed, + baseCounts.total, + options, + ); + scenarios.push({ + testCaseFile: prCounts.testCaseFile, + scenarioName: prCounts.scenarioName, + prPasses: prCounts.passed, + prTotal: prCounts.total, + baselinePasses: baseCounts.passed, + baselineTotal: baseCounts.total, + ...classification, + }); + } + + for (const [key, baseCounts] of baseline.scenarios) { + if (!pr.scenarios.has(key)) { + baselineOnly.push({ + testCaseFile: baseCounts.testCaseFile, + scenarioName: baseCounts.scenarioName, + }); + } + } + + const aggregate: AggregateComparison = { + intersectionSize: scenarios.length, + prAggregatePassRate: rate(prIPasses, prITotal), + baselineAggregatePassRate: rate(baseIPasses, baseITotal), + prAggregateCI: wilsonInterval(prIPasses, prITotal), + baselineAggregateCI: wilsonInterval(baseIPasses, baseITotal), + delta: rate(prIPasses, prITotal) - rate(baseIPasses, baseITotal), + }; + + scenarios.sort(scenarioComparator); + + const failureCategories = compareFailureCategories(pr, baseline); + + return { + pr: { experimentName: pr.experimentName }, + baseline: { experimentName: baseline.experimentName }, + aggregate, + scenarios, + prOnly, + baselineOnly, + failureCategories, + }; +} + +// --------------------------------------------------------------------------- +// Failure-category drift +// --------------------------------------------------------------------------- + +/** Min absolute rate gap to consider a category notable (5 percentage points). */ +const CATEGORY_NOTABLE_RATE_DELTA = 0.05; +/** Min absolute trial-count gap (over scaling) required alongside the rate gap. */ +const CATEGORY_NOTABLE_COUNT_DELTA = 3; + +/** + * Categories the verifier is supposed to emit. Anything else (malformed + * strings like `-`, `>builder_issue`, empty, etc.) is dropped from the + * comparison so the PR comment doesn't display verifier noise. Keep in sync + * with the verifier's category enum; unknown values are logged at verbose + * level via the console (see compareFailureCategories). 
+ */ +const KNOWN_FAILURE_CATEGORIES = new Set([ + 'builder_issue', + 'mock_issue', + 'framework_issue', + 'verification_failure', + 'build_failure', +]); + +function isCategoryNotable( + prCount: number, + prTotal: number, + baselineCount: number, + baselineTotal: number, +): boolean { + const rateGap = Math.abs(prCount / prTotal - baselineCount / baselineTotal); + if (rateGap < CATEGORY_NOTABLE_RATE_DELTA) return false; + const expectedPrCount = baselineCount * (prTotal / baselineTotal); + const countGap = Math.abs(prCount - expectedPrCount); + return countGap >= CATEGORY_NOTABLE_COUNT_DELTA; +} + +function compareFailureCategories( + pr: ExperimentBucket, + baseline: ExperimentBucket, +): FailureCategoryComparison[] { + if (!pr.failureCategoryTotals || !baseline.failureCategoryTotals) return []; + const prTotal = pr.trialTotal ?? 0; + const baseTotal = baseline.trialTotal ?? 0; + if (prTotal === 0 || baseTotal === 0) return []; + + // Surface unrecognised values so we notice when the verifier adds a new + // category (or starts emitting noise we should clean up). Doesn't enter + // the comparison output; the renderer only knows about KNOWN_FAILURE_CATEGORIES. + for (const category of Object.keys(pr.failureCategoryTotals)) { + if (!KNOWN_FAILURE_CATEGORIES.has(category)) { + console.warn(`[comparison] dropping unknown failureCategory "${category}"`); + } + } + for (const category of Object.keys(baseline.failureCategoryTotals)) { + if (!KNOWN_FAILURE_CATEGORIES.has(category)) { + console.warn(`[comparison] dropping unknown failureCategory "${category}"`); + } + } + + // Always emit a row for every known category, even if both sides are 0. + // The renderer can decide whether to suppress 0/0 rows; this gives readers + // a complete picture of the failure-type taxonomy by default. + const out: FailureCategoryComparison[] = []; + for (const category of KNOWN_FAILURE_CATEGORIES) { + const prCount = pr.failureCategoryTotals[category] ?? 0; + const baselineCount = baseline.failureCategoryTotals[category] ?? 0; + out.push({ + category, + prCount, + prRate: prCount / prTotal, + baselineCount, + baselineRate: baselineCount / baseTotal, + delta: prCount / prTotal - baselineCount / baseTotal, + notable: isCategoryNotable(prCount, prTotal, baselineCount, baseTotal), + }); + } + + // Sort: notable first, then by absolute delta descending. + out.sort((a, b) => { + if (a.notable !== b.notable) return a.notable ? -1 : 1; + return Math.abs(b.delta) - Math.abs(a.delta); + }); + return out; +} + +function rate(passes: number, total: number): number { + return total > 0 ? 
passes / total : 0; +} + +const VERDICT_ORDER: Record = { + hard_regression: 0, + soft_regression: 1, + improvement: 2, + watch: 3, + unreliable_baseline: 4, + stable: 5, + insufficient_data: 6, +}; + +function scenarioComparator(a: ScenarioComparison, b: ScenarioComparison): number { + const av = VERDICT_ORDER[a.verdict]; + const bv = VERDICT_ORDER[b.verdict]; + if (av !== bv) return av - bv; + const fileCmp = a.testCaseFile.localeCompare(b.testCaseFile); + if (fileCmp !== 0) return fileCmp; + return a.scenarioName.localeCompare(b.scenarioName); +} diff --git a/packages/@n8n/instance-ai/evaluations/comparison/fetch-baseline.ts b/packages/@n8n/instance-ai/evaluations/comparison/fetch-baseline.ts new file mode 100644 index 00000000000..411d6cf267e --- /dev/null +++ b/packages/@n8n/instance-ai/evaluations/comparison/fetch-baseline.ts @@ -0,0 +1,123 @@ +// --------------------------------------------------------------------------- +// Find and fetch the pinned baseline experiment from LangSmith. +// +// The baseline is whichever experiment most recently used the +// `instance-ai-baseline` prefix. To refresh, run the eval with that prefix: +// +// pnpm eval:instance-ai --experiment-name instance-ai-baseline --iterations 10 +// +// LangSmith appends a random suffix, so successive baseline runs become +// `instance-ai-baseline-7abc1234`, `instance-ai-baseline-9def5678`, etc. +// We pick the most recently started one. +// +// Two functions, both small: +// +// findLatestBaseline — list baseline-prefixed projects, pick newest. +// fetchBaselineBucket — read its root runs, bucket per scenario. +// +// Both throw on transport errors. Callers are expected to swallow with a log: +// the comparison is advisory and shouldn't fail the eval run. +// --------------------------------------------------------------------------- + +import type { Client } from 'langsmith'; +import { z } from 'zod'; + +import type { ExperimentBucket, ScenarioCounts } from './compare'; + +/** + * Prefix the latest-baseline lookup matches against. The CLI flag + * `--experiment-name instance-ai-baseline` produces project names like + * `instance-ai-baseline-7abc1234` (LangSmith appends a hyphen + suffix), so + * the constant must end in `-` to avoid matching unrelated names that + * happen to start with `instance-ai-baseline...`. + */ +export const BASELINE_EXPERIMENT_PREFIX = 'instance-ai-baseline-'; + +const inputsSchema = z + .object({ + testCaseFile: z.string().default(''), + scenarioName: z.string().default(''), + }) + .passthrough(); + +const outputsSchema = z + .object({ + passed: z.boolean().default(false), + failureCategory: z.string().optional(), + }) + .passthrough(); + +/** + * Return the most recently created baseline experiment, or `undefined` if + * none exist. We pick by `start_time` so a re-run of an older snapshot + * doesn't displace the latest one. + */ +export async function findLatestBaseline(client: Client): Promise { + let latest: { name: string; ts: number } | undefined; + for await (const project of client.listProjects({ nameContains: BASELINE_EXPERIMENT_PREFIX })) { + const name = project.name; + if (!name?.startsWith(BASELINE_EXPERIMENT_PREFIX)) continue; + const ts = project.start_time ? new Date(project.start_time).getTime() : 0; + if (!latest || ts > latest.ts) latest = { name, ts }; + } + return latest?.name; +} + +/** + * Fetch a baseline experiment's per-scenario pass/fail counts. 
Each root run + * corresponds to one (testCaseFile, scenarioName, iteration) triple — we + * bucket by `${testCaseFile}/${scenarioName}` and accumulate. + * + * Throws if the project does not exist. + */ +export async function fetchBaselineBucket( + client: Client, + experimentName: string, +): Promise { + const project = await client.readProject({ projectName: experimentName }); + const scenarios = new Map(); + const failureCategoryTotals: Record = {}; + let trialTotal = 0; + + for await (const run of client.listRuns({ projectId: project.id, isRoot: true })) { + const inputs = inputsSchema.safeParse(run.inputs ?? {}); + if (!inputs.success || !inputs.data.testCaseFile || !inputs.data.scenarioName) continue; + // Skip runs that never produced outputs (still running, crashed before + // completion, infra error). Without this guard, every field defaults + // (passed → false) would coerce them into "failed" trials and inflate + // the baseline failure count. Mirrors `parseTargetOutput` in cli/index.ts. + const rawOutputs = run.outputs; + if ( + rawOutputs === null || + rawOutputs === undefined || + typeof rawOutputs !== 'object' || + Object.keys(rawOutputs).length === 0 + ) { + continue; + } + const outputs = outputsSchema.safeParse(rawOutputs); + if (!outputs.success) continue; + + const key = `${inputs.data.testCaseFile}/${inputs.data.scenarioName}`; + const existing: ScenarioCounts = scenarios.get(key) ?? { + testCaseFile: inputs.data.testCaseFile, + scenarioName: inputs.data.scenarioName, + passed: 0, + total: 0, + failureCategories: {}, + }; + existing.total++; + trialTotal++; + if (outputs.data.passed) { + existing.passed++; + } else if (outputs.data.failureCategory) { + const cat = outputs.data.failureCategory; + existing.failureCategories = existing.failureCategories ?? {}; + existing.failureCategories[cat] = (existing.failureCategories[cat] ?? 0) + 1; + failureCategoryTotals[cat] = (failureCategoryTotals[cat] ?? 0) + 1; + } + scenarios.set(key, existing); + } + + return { experimentName, scenarios, failureCategoryTotals, trialTotal }; +} diff --git a/packages/@n8n/instance-ai/evaluations/comparison/format.ts b/packages/@n8n/instance-ai/evaluations/comparison/format.ts new file mode 100644 index 00000000000..1d4c9402038 --- /dev/null +++ b/packages/@n8n/instance-ai/evaluations/comparison/format.ts @@ -0,0 +1,961 @@ +// --------------------------------------------------------------------------- +// Render the eval run as a PR comment (markdown) or a console summary +// (aligned plain text). Both formats are driven by: +// +// - MultiRunEvaluation — pass rates, build counts, per-trial reasoning +// - ComparisonOutcome (optional) — tagged result of the baseline +// comparison: `ok` (ran, has scenarios), `no_baseline` (skipped), or +// `fetch_failed` / `self_baseline` (skipped for cause). Each kind +// drives a distinct top-of-comment alert so a LangSmith outage doesn't +// get dressed up as "no baseline configured". +// +// When no comparison is available (no baseline yet, LangSmith offline) +// the renderers still produce a useful per-test-case summary. When a +// comparison is available, sections render in priority order: +// regressions, soft regressions, notable movement, improvements, +// failure-category drift. Only sections with content are emitted. 
+// --------------------------------------------------------------------------- + +import { + hardRegressions, + improvements, + softRegressions, + watchList, + type ComparisonOutcome, + type ComparisonResult, + type FailureCategoryComparison, + type ScenarioComparison, +} from './compare'; +import type { + MultiRunEvaluation, + TestCaseAggregation, + WorkflowTestCase, + WorkflowTestCaseResult, +} from '../types'; + +interface FormatOptions { + /** Optional commit SHA to include in the heading. Truncated to 8 chars. */ + commitSha?: string; + /** Maps each test-case reference to its file slug. When provided, the + * per-scenario failure breakdown looks up failed runs by + * `${fileSlug}/${scenarioName}` — deterministic across collisions like + * multiple `happy-path` scenarios. When omitted, the breakdown is + * skipped (no name-only fallback — that lookup was wrong on real data). */ + slugByTestCase?: Map; +} + +// --------------------------------------------------------------------------- +// Markdown PR comment +// --------------------------------------------------------------------------- + +export function formatComparisonMarkdown( + evaluation: MultiRunEvaluation, + outcome?: ComparisonOutcome, + options: FormatOptions = {}, +): string { + const lines: string[] = []; + const comparison = outcome?.kind === 'ok' ? outcome.result : undefined; + + lines.push(formatHeading(options.commitSha)); + lines.push(''); + lines.push(formatTopAlert(outcome)); + lines.push(''); + lines.push(formatAggregateBlock(evaluation, comparison)); + lines.push(''); + + if (comparison) { + const hard = hardRegressions(comparison); + const soft = softRegressions(comparison); + const watch = watchList(comparison); + const imps = improvements(comparison); + + const renderedAnyTable = hard.length > 0 || soft.length > 0 || imps.length > 0; + + // Built once and reused across the regression-tier sections so each + // scenario row can carry a collapsible breakdown of its failed PR runs. + // Improvements skip the breakdown — they passed. Skipped entirely when + // the caller didn't pass a slug map (lookup would be ambiguous). + const failedIndex = options.slugByTestCase + ? buildFailedRunsIndex(evaluation, options.slugByTestCase) + : undefined; + + if (hard.length > 0) { + lines.push( + ...renderScenarioSection('Regressions', '— high-confidence', hard, true, failedIndex), + ); + } + if (soft.length > 0) { + lines.push( + ...renderScenarioSection( + 'Soft regressions', + '— investigate if related to your changes', + soft, + true, + failedIndex, + ), + ); + } + if (watch.length > 0) { + lines.push( + ...renderScenarioSection( + 'Notable movement', + '— large gap, no statistical flag', + watch, + false, + failedIndex, + ), + ); + } + if (imps.length > 0) { + lines.push(...renderScenarioSection('Improvements', '', imps, true)); + } + + if (renderedAnyTable) { + lines.push( + "_p = Fisher's exact one-sided p-value. Lower = stronger evidence of a real change._", + ); + lines.push(''); + } + + // Always render the breakdown when comparison data is available — the + // renderer drops 0/0 rows itself, so empty categories don't pollute + // the output but the reader still sees the full taxonomy of what's + // tracked. 
+ lines.push(...renderFailureCategorySection(comparison.failureCategories)); + } + + lines.push(...renderPerTestCaseDetails(evaluation, options.slugByTestCase)); + + if (comparison) { + const otherFindings = renderOtherFindings(comparison); + if (otherFindings.length > 0) lines.push(...otherFindings); + } + + const failureDetails = renderFailureDetails(evaluation, options.slugByTestCase); + if (failureDetails.length > 0) lines.push(...failureDetails); + + return lines.join('\n'); +} + +function formatHeading(commitSha?: string): string { + const sha = commitSha ? ` — \`${commitSha.slice(0, 8)}\`` : ''; + return `### Instance AI Workflow Eval${sha}`; +} + +function formatTopAlert(outcome?: ComparisonOutcome): string { + if (!outcome) { + return ['> [!NOTE]', '> No baseline comparison ran (LangSmith disabled for this run).'].join( + '\n', + ); + } + + if (outcome.kind === 'no_baseline') { + return [ + '> [!NOTE]', + '> No baseline configured — comparison skipped. Run the eval with `--experiment-name instance-ai-baseline` on master to create one.', + ].join('\n'); + } + if (outcome.kind === 'self_baseline') { + return [ + '> [!NOTE]', + `> This run is the baseline (\`${outcome.experimentName}\`) — nothing to compare against.`, + ].join('\n'); + } + if (outcome.kind === 'fetch_failed') { + return [ + '> [!WARNING]', + `> Regression detection did not run — baseline fetch failed: ${outcome.error}`, + ].join('\n'); + } + + const comparison = outcome.result; + const hard = hardRegressions(comparison).length; + const soft = softRegressions(comparison).length; + const watch = watchList(comparison).length; + const imps = improvements(comparison).length; + const stable = countByVerdict(comparison, 'stable'); + + const aggDelta = comparison.aggregate.delta * 100; + const aggDeltaText = `${aggDelta >= 0 ? '+' : ''}${aggDelta.toFixed(1)}pp`; + + // Always include all five tier counts so readers see what's being tracked, + // not just what's > 0. The hard count is bolded when nonzero for emphasis. + const summary = [ + hard > 0 ? `**${hard} regression${hard === 1 ? '' : 's'}**` : '0 regressions', + `${soft} soft`, + `${watch} notable`, + `${imps} improvement${imps === 1 ? '' : 's'}`, + `${stable} stable`, + ].join(', '); + + let icon: string; + let alertKind: 'CAUTION' | 'WARNING' | 'NOTE' | 'TIP'; + + if (hard > 0) { + icon = '🔴'; + alertKind = 'CAUTION'; + } else if (soft > 0) { + icon = '🟡'; + alertKind = 'WARNING'; + } else if (watch > 0) { + icon = '🔵'; + alertKind = 'NOTE'; + } else { + icon = '🟢'; + alertKind = 'TIP'; + } + + return `> [!${alertKind}]\n> ${icon} ${summary}. Pass rate ${aggDeltaText} vs master.`; +} + +function formatAggregateBlock( + evaluation: MultiRunEvaluation, + comparison?: ComparisonResult, +): string { + if (!comparison) { + const allScenarios = evaluation.testCases.flatMap((tc) => tc.scenarios); + const passed = allScenarios.reduce((sum, sa) => sum + sa.passCount, 0); + const total = allScenarios.reduce((sum, sa) => sum + sa.runs.length, 0); + const rate = total > 0 ? (passed / total) * 100 : 0; + return `**Aggregate**: ${rate.toFixed(1)}% pass (${passed}/${total} trials, ${allScenarios.length} scenarios × N=${evaluation.totalRuns})`; + } + + const { aggregate } = comparison; + const delta = aggregate.delta * 100; + const sign = delta >= 0 ? '+' : ''; + const arrow = delta > 0 ? ' ↑' : delta < 0 ? ' ↓' : ''; + + const baselineN = inferBaselineN(comparison); + const sampleLine = baselineN + ? 
`_${aggregate.intersectionSize} scenarios · N=${evaluation.totalRuns} (PR) vs N=${baselineN} (baseline) · baseline: \`${comparison.baseline.experimentName}\`_` + : `_${aggregate.intersectionSize} scenarios · N=${evaluation.totalRuns} (PR) · baseline: \`${comparison.baseline.experimentName}\`_`; + + const partial = comparison.baselineOnly.length + comparison.prOnly.length; + const partialNote = + partial > 0 + ? `\n_Partial: ${[ + comparison.baselineOnly.length > 0 + ? `${comparison.baselineOnly.length} baseline scenarios not run by PR` + : null, + comparison.prOnly.length > 0 + ? `${comparison.prOnly.length} PR scenarios have no baseline data (added since baseline captured)` + : null, + ] + .filter((s) => s !== null) + .join(', ')}._` + : ''; + + return [ + `**Aggregate**: ${pct(aggregate.prAggregatePassRate)}% PR vs ${pct(aggregate.baselineAggregatePassRate)}% baseline — **${sign}${delta.toFixed(1)}pp${arrow}**`, + sampleLine + partialNote, + ].join('\n'); +} + +function renderScenarioSection( + heading: string, + subtitle: string, + scenarios: ScenarioComparison[], + withPValue: boolean, + failedIndex?: FailedRunsBySlug, +): string[] { + const lines: string[] = []; + const headingLine = subtitle + ? `#### ${heading} (${scenarios.length}) ${subtitle}` + : `#### ${heading} (${scenarios.length})`; + lines.push(headingLine); + lines.push(''); + if (withPValue) { + lines.push('| Scenario | PR | Baseline | Δ | p |'); + lines.push('|---|---|---|---|---|'); + } else { + lines.push('| Scenario | PR | Baseline | Δ |'); + lines.push('|---|---|---|---|'); + } + for (const s of scenarios) { + const cells = [ + `\`${s.testCaseFile}/${s.scenarioName}\``, + formatRateCell(s.prPasses, s.prTotal), + formatRateCell(s.baselinePasses, s.baselineTotal), + formatDeltaCell(s.delta), + ]; + if (withPValue) { + const p = s.verdict === 'improvement' ? s.pValueRight : s.pValueLeft; + cells.push(p.toFixed(3)); + } + lines.push(`| ${cells.join(' | ')} |`); + } + lines.push(''); + + // Per-scenario failure breakdown — one collapsible per row that had failed + // PR runs. Lets the reader drill into each flagged scenario without + // hunting through a separate "Failure details" section. + if (failedIndex) { + for (const s of scenarios) { + const failedRuns = failedIndex.get(`${s.testCaseFile}/${s.scenarioName}`) ?? []; + if (failedRuns.length === 0) continue; + lines.push(...renderScenarioFailureBreakdown(s, failedRuns)); + } + } + + return lines; +} + +function renderScenarioFailureBreakdown( + s: ScenarioComparison, + failedRuns: FailedRunDetail[], +): string[] { + const slug = `${s.testCaseFile}/${s.scenarioName}`; + const categoryMix = summarizeCategories(failedRuns); + const summaryParts = [`${failedRuns.length} of ${s.prTotal} failed`]; + if (categoryMix) summaryParts.push(categoryMix); + + const lines: string[] = []; + lines.push(`
<details><summary>${slug} — ${summaryParts.join(' · ')}</summary>`);
+ lines.push('');
+ for (const fr of failedRuns) {
+ const tag = fr.category ? ` [${fr.category}]` : '';
+ lines.push(`> Run ${fr.runIndex}${tag}: ${fr.reasoning.slice(0, 300)}`);
+ lines.push('>');
+ }
+ // Drop the trailing empty quote line.
+ if (lines[lines.length - 1] === '>') lines.pop();
+ lines.push('');
+ lines.push('</details>
'); + lines.push(''); + return lines; +} + +function renderFailureCategorySection(categories: FailureCategoryComparison[]): string[] { + // Drop rows that are 0/0 on both sides — they carry no signal for the + // reader. Categories with non-zero count on either side are kept so the + // reader sees the full picture even if not "notable". + const rows = categories.filter((c) => c.prCount > 0 || c.baselineCount > 0); + if (rows.length === 0) return []; + + const lines: string[] = []; + lines.push('#### Failure breakdown'); + lines.push(''); + lines.push('| Category | PR | Baseline | Δ | |'); + lines.push('|---|---|---|---|---|'); + for (const c of rows) { + const isNew = c.baselineCount === 0 && c.prCount > 0; + const label = isNew ? `\`${c.category}\` 🆕` : `\`${c.category}\``; + const delta = c.delta * 100; + const sign = delta >= 0 ? '+' : ''; + const arrow = delta > 0 ? ' ↑' : delta < 0 ? ' ↓' : ''; + const notableMarker = c.notable ? '**notable**' : ''; + lines.push( + `| ${label} | ${c.prCount} (${pct(c.prRate)}%) | ${c.baselineCount} (${pct(c.baselineRate)}%) | ${sign}${delta.toFixed(1)}pp${arrow} | ${notableMarker} |`, + ); + } + lines.push(''); + return lines; +} + +function renderPerTestCaseDetails( + evaluation: MultiRunEvaluation, + slugByTestCase?: Map, +): string[] { + const { totalRuns, testCases } = evaluation; + if (testCases.length === 0) return []; + const lines: string[] = []; + lines.push(`
<details><summary>
Per-test-case results (${testCases.length})`); + lines.push(''); + const renderName = (tc: TestCaseAggregation): string => { + const slug = slugByTestCase?.get(tc.testCase); + return slug ? `\`${slug}\`` : `\`${tc.testCase.prompt.slice(0, 70)}\``; + }; + if (totalRuns > 1) { + lines.push(`| Workflow | Built | pass@${totalRuns} | pass^${totalRuns} |`); + lines.push('|---|---|---|---|'); + for (const tc of testCases) { + const meanPassAtK = tc.scenarios.length + ? Math.round( + (tc.scenarios.reduce((sum, sa) => sum + (sa.passAtK[totalRuns - 1] ?? 0), 0) / + tc.scenarios.length) * + 100, + ) + : 0; + const meanPassHatK = tc.scenarios.length + ? Math.round( + (tc.scenarios.reduce((sum, sa) => sum + (sa.passHatK[totalRuns - 1] ?? 0), 0) / + tc.scenarios.length) * + 100, + ) + : 0; + lines.push( + `| ${renderName(tc)} | ${tc.buildSuccessCount}/${totalRuns} | ${meanPassAtK}% | ${meanPassHatK}% |`, + ); + } + } else { + lines.push('| Workflow | Built | Pass rate |'); + lines.push('|---|---|---|'); + for (const tc of testCases) { + const built = tc.runs[0]?.workflowBuildSuccess ? '✓' : '✗'; + const passed = tc.scenarios.filter((sa) => sa.runs[0]?.success).length; + const total = tc.scenarios.length; + lines.push(`| ${renderName(tc)} | ${built} | ${passed}/${total} |`); + } + } + lines.push(''); + lines.push('
</details>
'); + lines.push(''); + return lines; +} + +function renderOtherFindings(comparison: ComparisonResult): string[] { + const stable = countByVerdict(comparison, 'stable'); + const flaky = countByVerdict(comparison, 'unreliable_baseline'); + const noData = countByVerdict(comparison, 'insufficient_data'); + if (stable === 0 && flaky === 0 && noData === 0) return []; + + const summaryParts: string[] = []; + if (flaky > 0) summaryParts.push(`${flaky} on flaky baseline`); + if (noData > 0) summaryParts.push(`${noData} no data`); + if (stable > 0) summaryParts.push(`${stable} stable`); + const summary = summaryParts.join(' · '); + + const lines: string[] = []; + lines.push(`
<details><summary>
Other findings: ${summary}`); + lines.push(''); + + const stableScenarios = comparison.scenarios.filter((s) => s.verdict === 'stable'); + const flakyScenarios = comparison.scenarios.filter((s) => s.verdict === 'unreliable_baseline'); + const noDataScenarios = comparison.scenarios.filter((s) => s.verdict === 'insufficient_data'); + + if (flakyScenarios.length > 0) { + lines.push('**Confident drop on a flaky baseline (surfaced for visibility, not flagged):**'); + lines.push(''); + lines.push('| Scenario | PR | Baseline | Δ |'); + lines.push('|---|---|---|---|'); + for (const s of flakyScenarios) { + lines.push( + `| \`${s.testCaseFile}/${s.scenarioName}\` | ${formatRateCell(s.prPasses, s.prTotal)} | ${formatRateCell(s.baselinePasses, s.baselineTotal)} | ${formatDeltaCell(s.delta)} |`, + ); + } + lines.push(''); + } + + if (noDataScenarios.length > 0) { + lines.push( + `**No data:** ${noDataScenarios.map((s) => `\`${s.testCaseFile}/${s.scenarioName}\``).join(', ')}`, + ); + lines.push(''); + } + + if (stableScenarios.length > 0) { + lines.push(`**Stable (${stableScenarios.length}):**`); + lines.push( + stableScenarios.map((s) => `\`${s.testCaseFile}/${s.scenarioName}\``).join(', ') + '.', + ); + lines.push(''); + } + + lines.push('
</details>
'); + lines.push(''); + return lines; +} + +function renderFailureDetails( + evaluation: MultiRunEvaluation, + slugByTestCase?: Map, +): string[] { + const failed: Array<{ + tc: WorkflowTestCaseResult; + fileSlug: string | undefined; + scenarioName: string; + failedRuns: Array<{ category?: string; reasoning: string }>; + }> = []; + for (const tc of evaluation.testCases) { + const fileSlug = slugByTestCase?.get(tc.testCase); + for (const sa of tc.scenarios) { + const failedRuns = sa.runs + .filter((r) => !r.success) + .map((r) => ({ category: r.failureCategory, reasoning: r.reasoning })); + if (failedRuns.length > 0) { + failed.push({ tc: tc.runs[0], fileSlug, scenarioName: sa.scenario.name, failedRuns }); + } + } + } + if (failed.length === 0) return []; + + const lines: string[] = []; + lines.push('
<details><summary>Failure details</summary>');
+ lines.push('');
+ for (const { tc, fileSlug, scenarioName, failedRuns } of failed) {
+ const slug = fileSlug
+ ? `${fileSlug}/${scenarioName}`
+ : `${tc.testCase.prompt.slice(0, 50).trim()} / ${scenarioName}`;
+ lines.push(`**\`${slug}\`** — ${failedRuns.length} failed`);
+ for (const fr of failedRuns) {
+ const tag = fr.category ? ` [${fr.category}]` : '';
+ lines.push(`> Run${tag}: ${fr.reasoning.slice(0, 200)}`);
+ }
+ lines.push('');
+ }
+ lines.push('</details>
'); + lines.push(''); + return lines; +} + +// --------------------------------------------------------------------------- +// Per-scenario failure lookup +// --------------------------------------------------------------------------- +// +// The comparison carries per-scenario counts (passed / total) but not the +// underlying reasoning text. The evaluation has the reasoning, but keys +// testCases by reference identity — not by the `testCaseFile` slug used in +// the comparison. The slug map (built in cli/index.ts where the file slugs +// are first known) bridges the two so the lookup is deterministic. Without +// it we'd have to disambiguate by scenarioName alone, which collides on +// reused names (`happy-path` shows up across most workflows). + +interface FailedRunDetail { + category?: string; + reasoning: string; + runIndex: number; // 1-based for display +} + +type FailedRunsBySlug = Map; + +function buildFailedRunsIndex( + evaluation: MultiRunEvaluation, + slugByTestCase: Map, +): FailedRunsBySlug { + const map: FailedRunsBySlug = new Map(); + for (const tc of evaluation.testCases) { + const fileSlug = slugByTestCase.get(tc.testCase); + if (!fileSlug) continue; // testCase not in the slug map — skip rather than misattribute + for (const sa of tc.scenarios) { + const failedRuns: FailedRunDetail[] = []; + sa.runs.forEach((r, i) => { + if (!r.success) { + failedRuns.push({ + category: r.failureCategory, + reasoning: r.reasoning, + runIndex: i + 1, + }); + } + }); + if (failedRuns.length > 0) { + map.set(`${fileSlug}/${sa.scenario.name}`, failedRuns); + } + } + } + return map; +} + +function summarizeCategories(failedRuns: FailedRunDetail[]): string | undefined { + const counts = new Map(); + for (const fr of failedRuns) { + if (fr.category) counts.set(fr.category, (counts.get(fr.category) ?? 0) + 1); + } + if (counts.size === 0) return undefined; + return [...counts.entries()] + .sort((a, b) => b[1] - a[1]) + .map(([cat, n]) => `${n}× ${cat}`) + .join(', '); +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function pct(rate: number): string { + return (rate * 100).toFixed(1); +} + +function formatRateCell(passes: number, total: number): string { + const rate = total > 0 ? Math.round((passes / total) * 100) : 0; + return `${passes}/${total} (${rate}%)`; +} + +function formatDeltaCell(delta: number): string { + const pp = delta * 100; + const sign = pp >= 0 ? '+' : ''; + const arrow = pp > 0 ? ' ↑' : pp < 0 ? ' ↓' : ''; + return `${sign}${pp.toFixed(0)}pp${arrow}`; +} + +function countByVerdict( + comparison: ComparisonResult, + verdict: ScenarioComparison['verdict'], +): number { + return comparison.scenarios.filter((s) => s.verdict === verdict).length; +} + +/** Best-effort N=baseline iteration count. The comparison only carries trial + * totals per scenario; we infer N from the most-common scenario total since + * the baseline runs every scenario the same number of times. */ +function inferBaselineN(comparison: ComparisonResult): number | undefined { + const totals = comparison.scenarios + .filter((s) => s.baselineTotal > 0) + .map((s) => s.baselineTotal); + if (totals.length === 0) return undefined; + const counts = new Map(); + for (const t of totals) counts.set(t, (counts.get(t) ?? 
0) + 1); + let best = totals[0]; + let bestCount = 0; + for (const [n, c] of counts) { + if (c > bestCount) { + best = n; + bestCount = c; + } + } + return best; +} + +// --------------------------------------------------------------------------- +// Terminal renderer: aligned plain text for the eval CLI's end-of-run print. +// --------------------------------------------------------------------------- + +const TERMINAL_INDENT = ' '; +const TERMINAL_TABLE_INDENT = ' '; + +export function formatComparisonTerminal( + evaluation: MultiRunEvaluation, + outcome?: ComparisonOutcome, + options: FormatOptions = {}, +): string { + const lines: string[] = []; + const comparison = outcome?.kind === 'ok' ? outcome.result : undefined; + + const titleSuffix = options.commitSha ? ` — ${options.commitSha.slice(0, 8)}` : ''; + const title = `Instance AI Workflow Eval${titleSuffix}`; + lines.push(title); + lines.push('═'.repeat(title.length)); + + lines.push(TERMINAL_INDENT + formatTerminalVerdictLine(outcome)); + lines.push(''); + + lines.push(...formatTerminalAggregate(evaluation, comparison)); + lines.push(''); + + lines.push(...formatTerminalPerTestCase(evaluation, options.slugByTestCase)); + + if (comparison) { + const hard = hardRegressions(comparison); + const soft = softRegressions(comparison); + const watch = watchList(comparison); + const imps = improvements(comparison); + + if (hard.length > 0) { + lines.push( + TERMINAL_INDENT + + 'REGRESSIONS (high-confidence: large drop on a reliable scenario, unlikely noise)', + ); + lines.push(formatTerminalScenarioTable(hard, true)); + lines.push(''); + } + if (soft.length > 0) { + lines.push( + TERMINAL_INDENT + + 'SOFT REGRESSIONS (likely natural variance — investigate if related to your changes)', + ); + lines.push(formatTerminalScenarioTable(soft, true)); + lines.push(''); + } + if (watch.length > 0) { + lines.push(TERMINAL_INDENT + 'NOTABLE MOVEMENT (large gap, no statistical flag)'); + lines.push(formatTerminalScenarioTable(watch, false)); + lines.push(''); + } + if (imps.length > 0) { + lines.push(TERMINAL_INDENT + 'IMPROVEMENTS'); + lines.push(formatTerminalScenarioTable(imps, true)); + lines.push(''); + } + + // Always render the breakdown when comparison data is available — same + // rationale as the markdown side. The terminal table drops 0/0 rows + // itself. + const breakdownRows = comparison.failureCategories.filter( + (c) => c.prCount > 0 || c.baselineCount > 0, + ); + if (breakdownRows.length > 0) { + lines.push(TERMINAL_INDENT + 'failure breakdown'); + lines.push(formatTerminalCategoryTable(breakdownRows)); + lines.push(''); + } + + // Stable count is already in the verdict line; surface only the rarer + // outcomes here. 
+ const flaky = countByVerdict(comparison, 'unreliable_baseline'); + const noData = countByVerdict(comparison, 'insufficient_data'); + const otherParts: string[] = []; + if (flaky > 0) otherParts.push(`${flaky} on flaky baseline`); + if (noData > 0) otherParts.push(`${noData} no data`); + if (otherParts.length > 0) { + lines.push(TERMINAL_INDENT + 'other: ' + otherParts.join(' · ')); + } + } + + return lines.join('\n'); +} + +function formatTerminalVerdictLine(outcome?: ComparisonOutcome): string { + if (!outcome) return '▶ No baseline comparison ran (LangSmith disabled).'; + if (outcome.kind === 'no_baseline') { + return '▶ No baseline configured — comparison skipped.'; + } + if (outcome.kind === 'self_baseline') { + return `▶ This run is the baseline (${outcome.experimentName}) — nothing to compare.`; + } + if (outcome.kind === 'fetch_failed') { + return `▶ Regression detection did not run — baseline fetch failed: ${outcome.error}`; + } + + const comparison = outcome.result; + const hard = hardRegressions(comparison).length; + const soft = softRegressions(comparison).length; + const watch = watchList(comparison).length; + const imps = improvements(comparison).length; + const stable = countByVerdict(comparison, 'stable'); + + const aggDelta = comparison.aggregate.delta * 100; + const aggDeltaText = `${aggDelta >= 0 ? '+' : ''}${aggDelta.toFixed(1)}pp`; + + const summary = [ + `${hard} regression${hard === 1 ? '' : 's'}`, + `${soft} soft`, + `${watch} notable`, + `${imps} improvement${imps === 1 ? '' : 's'}`, + `${stable} stable`, + ].join(', '); + + return `▶ ${summary}. Pass rate ${aggDeltaText} vs master.`; +} + +function formatTerminalAggregate( + evaluation: MultiRunEvaluation, + comparison?: ComparisonResult, +): string[] { + const lines: string[] = []; + if (!comparison) { + const allScenarios = evaluation.testCases.flatMap((tc) => tc.scenarios); + const passed = allScenarios.reduce((sum, sa) => sum + sa.passCount, 0); + const total = allScenarios.reduce((sum, sa) => sum + sa.runs.length, 0); + const rate = total > 0 ? (passed / total) * 100 : 0; + lines.push( + TERMINAL_INDENT + + `Aggregate: ${rate.toFixed(1)}% pass (${passed}/${total} trials, ${allScenarios.length} scenarios × N=${evaluation.totalRuns})`, + ); + return lines; + } + + const { aggregate } = comparison; + const baselineN = inferBaselineN(comparison); + const aggDelta = aggregate.delta * 100; + const sign = aggDelta >= 0 ? '+' : ''; + const arrow = aggDelta > 0 ? ' ↑' : aggDelta < 0 ? 
' ↓' : ''; + lines.push(TERMINAL_INDENT + `Aggregate (${aggregate.intersectionSize} scenarios)`); + lines.push( + TERMINAL_INDENT + + ` PR ${pct(aggregate.prAggregatePassRate)}% (N=${evaluation.totalRuns})`, + ); + if (baselineN !== undefined) { + lines.push( + TERMINAL_INDENT + + ` baseline ${pct(aggregate.baselineAggregatePassRate)}% (N=${baselineN})`, + ); + } else { + lines.push(TERMINAL_INDENT + ` baseline ${pct(aggregate.baselineAggregatePassRate)}%`); + } + lines.push(TERMINAL_INDENT + ` Δ ${sign}${aggDelta.toFixed(1)}pp${arrow}`); + + if (comparison.baselineOnly.length > 0 || comparison.prOnly.length > 0) { + const partialParts: string[] = []; + if (comparison.baselineOnly.length > 0) + partialParts.push(`${comparison.baselineOnly.length} baseline scenarios not run by PR`); + if (comparison.prOnly.length > 0) + partialParts.push(`${comparison.prOnly.length} PR scenarios have no baseline data`); + lines.push(TERMINAL_INDENT + ` partial: ${partialParts.join(', ')}`); + } + + return lines; +} + +function formatTerminalPerTestCase( + evaluation: MultiRunEvaluation, + slugByTestCase?: Map, +): string[] { + const { totalRuns, testCases } = evaluation; + if (testCases.length === 0) return []; + const lines: string[] = []; + const heading = `Per-test-case results (${testCases.length})`; + lines.push(TERMINAL_INDENT + heading); + + const nameOf = (tc: TestCaseAggregation, max: number): string => { + const slug = slugByTestCase?.get(tc.testCase); + return slug ?? tc.testCase.prompt.slice(0, max); + }; + + if (totalRuns > 1) { + const rows = testCases.map((tc) => { + const meanPassAtK = + tc.scenarios.length > 0 + ? Math.round( + (tc.scenarios.reduce((sum, sa) => sum + (sa.passAtK[totalRuns - 1] ?? 0), 0) / + tc.scenarios.length) * + 100, + ) + : 0; + const meanPassHatK = + tc.scenarios.length > 0 + ? Math.round( + (tc.scenarios.reduce((sum, sa) => sum + (sa.passHatK[totalRuns - 1] ?? 0), 0) / + tc.scenarios.length) * + 100, + ) + : 0; + return { + name: nameOf(tc, 60), + builds: `${tc.buildSuccessCount}/${totalRuns}`, + passAtK: `${meanPassAtK}%`, + passHatK: `${meanPassHatK}%`, + }; + }); + const nameW = maxWidth( + rows.map((r) => r.name), + 'workflow', + ); + const buildsW = maxWidth( + rows.map((r) => r.builds), + 'builds', + ); + const atKHeader = `pass@${totalRuns}`; + const hatKHeader = `pass^${totalRuns}`; + const atKW = maxWidth( + rows.map((r) => r.passAtK), + atKHeader, + ); + const hatKW = maxWidth( + rows.map((r) => r.passHatK), + hatKHeader, + ); + lines.push( + TERMINAL_TABLE_INDENT + + `${'workflow'.padEnd(nameW)} ${'builds'.padEnd(buildsW)} ${atKHeader.padStart(atKW)} ${hatKHeader.padStart(hatKW)}`, + ); + lines.push( + TERMINAL_TABLE_INDENT + + `${'─'.repeat(nameW)} ${'─'.repeat(buildsW)} ${'─'.repeat(atKW)} ${'─'.repeat(hatKW)}`, + ); + for (const r of rows) { + lines.push( + TERMINAL_TABLE_INDENT + + `${r.name.padEnd(nameW)} ${r.builds.padEnd(buildsW)} ${r.passAtK.padStart(atKW)} ${r.passHatK.padStart(hatKW)}`, + ); + } + } else { + for (const tc of testCases) { + const r = tc.runs[0]; + const buildStatus = r.workflowBuildSuccess ? 'BUILT' : 'BUILD FAILED'; + lines.push(''); + lines.push(TERMINAL_INDENT + `${nameOf(tc, 70)}…`); + lines.push(TERMINAL_INDENT + ` ${buildStatus}${r.workflowId ? ` (${r.workflowId})` : ''}`); + if (r.buildError) lines.push(TERMINAL_INDENT + ` error: ${r.buildError.slice(0, 200)}`); + for (const sa of tc.scenarios) { + const sr = sa.runs[0]; + const status = sr.success ? 'PASS' : 'FAIL'; + const category = sr.failureCategory ? 
` [${sr.failureCategory}]` : ''; + lines.push(TERMINAL_INDENT + ` ${status} ${sr.scenario.name}${category}`); + if (!sr.success) { + const errs = sr.evalResult?.errors ?? []; + if (errs.length > 0) { + lines.push(TERMINAL_INDENT + ` error: ${errs.join('; ').slice(0, 200)}`); + } + lines.push(TERMINAL_INDENT + ` diagnosis: ${sr.reasoning.slice(0, 200)}`); + } + } + } + } + lines.push(''); + return lines; +} + +function formatTerminalScenarioTable(scenarios: ScenarioComparison[], withPValue: boolean): string { + const names = scenarios.map((s) => `${s.testCaseFile}/${s.scenarioName}`); + const prCells = scenarios.map((s) => `${s.prPasses}/${s.prTotal}`); + const baseCells = scenarios.map((s) => `${s.baselinePasses}/${s.baselineTotal}`); + const deltaCells = scenarios.map((s) => { + const d = s.delta * 100; + const sign = d >= 0 ? '+' : ''; + const arrow = d > 0 ? ' ↑' : d < 0 ? ' ↓' : ''; + return `${sign}${d.toFixed(0)}pp${arrow}`; + }); + const pCells = withPValue + ? scenarios.map((s) => (s.verdict === 'improvement' ? s.pValueRight : s.pValueLeft).toFixed(3)) + : []; + + const nameW = maxWidth(names, 'scenario'); + const prW = maxWidth(prCells, 'PR'); + const baseW = maxWidth(baseCells, 'baseline'); + const deltaW = maxWidth(deltaCells, 'Δ'); + const pW = withPValue ? maxWidth(pCells, 'p') : 0; + + const headers = [ + 'scenario'.padEnd(nameW), + 'PR'.padEnd(prW), + 'baseline'.padEnd(baseW), + 'Δ'.padEnd(deltaW), + ]; + if (withPValue) headers.push('p'.padEnd(pW)); + const widths = withPValue ? [nameW, prW, baseW, deltaW, pW] : [nameW, prW, baseW, deltaW]; + const sep = widths.map((w) => '─'.repeat(w)).join(' '); + + const rows = scenarios.map((_, i) => { + const cells = [ + names[i].padEnd(nameW), + prCells[i].padEnd(prW), + baseCells[i].padEnd(baseW), + deltaCells[i].padEnd(deltaW), + ]; + if (withPValue) cells.push(pCells[i].padEnd(pW)); + return TERMINAL_TABLE_INDENT + cells.join(' '); + }); + + return [TERMINAL_TABLE_INDENT + headers.join(' '), TERMINAL_TABLE_INDENT + sep, ...rows].join( + '\n', + ); +} + +function formatTerminalCategoryTable(cats: FailureCategoryComparison[]): string { + const names = cats.map((c) => { + const isNew = c.baselineCount === 0 && c.prCount > 0; + return c.category + (isNew ? ' 🆕' : ''); + }); + const prCells = cats.map((c) => `${c.prCount} (${pct(c.prRate)}%)`); + const baseCells = cats.map((c) => `${c.baselineCount} (${pct(c.baselineRate)}%)`); + const deltaCells = cats.map((c) => { + const d = c.delta * 100; + const sign = d >= 0 ? 
'+' : ''; + return `${sign}${d.toFixed(1)}pp`; + }); + + const nameW = maxWidth(names, 'category'); + const prW = maxWidth(prCells, 'PR'); + const baseW = maxWidth(baseCells, 'baseline'); + + const headers = ['category'.padEnd(nameW), 'PR'.padEnd(prW), 'baseline'.padEnd(baseW), 'Δ']; + const sep = [nameW, prW, baseW, maxWidth(deltaCells, 'Δ')].map((w) => '─'.repeat(w)).join(' '); + + const rows = cats.map( + (_, i) => + TERMINAL_TABLE_INDENT + + [ + names[i].padEnd(nameW), + prCells[i].padEnd(prW), + baseCells[i].padEnd(baseW), + deltaCells[i], + ].join(' '), + ); + + return [TERMINAL_TABLE_INDENT + headers.join(' '), TERMINAL_TABLE_INDENT + sep, ...rows].join( + '\n', + ); +} + +function maxWidth(values: string[], header: string): number { + return values.reduce((m, v) => Math.max(m, v.length), header.length); +} diff --git a/packages/@n8n/instance-ai/evaluations/comparison/statistics.ts b/packages/@n8n/instance-ai/evaluations/comparison/statistics.ts new file mode 100644 index 00000000000..1cd888eeb13 --- /dev/null +++ b/packages/@n8n/instance-ai/evaluations/comparison/statistics.ts @@ -0,0 +1,304 @@ +// --------------------------------------------------------------------------- +// Decides whether one scenario's pass rate is meaningfully worse than +// another, at the small sample sizes evals run at (N=3 typically). +// +// Public surface: +// - classifyScenario(prPasses, prTotal, basePasses, baseTotal) — the verdict +// - wilsonInterval(passes, total) — confidence band for a pass rate, used +// for the headline aggregate +// +// The implementation uses Fisher's exact test and the Wilson score interval +// under the hood; both are standard small-sample statistics. You don't need +// to know either to use the public API. +// --------------------------------------------------------------------------- +import { strict as assert } from 'node:assert'; + +// --------------------------------------------------------------------------- +// Fisher's exact test (one-sided) +// +// Given a 2×2 table of pass/fail counts for PR vs baseline, returns the +// probability of seeing a gap at least as bad as the observed one if the two +// groups actually had the same pass rate. Small return value ⇒ strong +// evidence the PR is worse. +// --------------------------------------------------------------------------- + +const logFactorialCache: number[] = [0, 0]; + +function logFactorial(n: number): number { + for (let i = logFactorialCache.length; i <= n; i++) { + logFactorialCache.push(logFactorialCache[i - 1] + Math.log(i)); + } + return logFactorialCache[n]; +} + +function logBinomial(n: number, k: number): number { + if (k < 0 || k > n) return -Infinity; + return logFactorial(n) - logFactorial(k) - logFactorial(n - k); +} + +function hypergeomPmf(nPasses: number, nFails: number, nDrawn: number, k: number): number { + const total = nPasses + nFails; + if (k < Math.max(0, nDrawn - nFails) || k > Math.min(nDrawn, nPasses)) return 0; + return Math.exp( + logBinomial(nPasses, k) + logBinomial(nFails, nDrawn - k) - logBinomial(total, nDrawn), + ); +} + +/** + * One-sided Fisher's exact test (left tail). Returns the probability that + * PR's pass count would be at most `a` if PR and baseline shared the same + * underlying pass rate. Small value ⇒ PR is significantly worse. + * + * 2×2 table: + * + * passed failed + * PR | a | b | + * Baseline | c | d | + * + * Returns 1 (no information) when either side has no trials, or when all + * trials passed or all failed. 
+ */ +export function fishersExactOneSidedLeft(a: number, b: number, c: number, d: number): number { + const inputs = [a, b, c, d]; + for (const v of inputs) { + assert( + Number.isInteger(v) && v >= 0, + 'fishersExactOneSidedLeft requires non-negative integers', + ); + } + + const nPr = a + b; + const nBase = c + d; + const nPasses = a + c; + const nFails = b + d; + + if (nPr === 0 || nBase === 0) return 1; + if (nPasses === 0 || nFails === 0) return 1; + + let pValue = 0; + const kMax = Math.min(a, nPasses); + for (let k = 0; k <= kMax; k++) { + pValue += hypergeomPmf(nPasses, nFails, nPr, k); + } + // Clamp to [0, 1] — accumulated FP error can push the sum slightly past 1. + return Math.min(1, Math.max(0, pValue)); +} + +// --------------------------------------------------------------------------- +// Wilson score interval (95% confidence) +// +// Returns a confidence band for a pass rate that behaves well at small N and +// at extreme rates (close to 0 or 1) — both common in our evals. Used for +// the headline aggregate band only; classification doesn't need it. +// --------------------------------------------------------------------------- + +// Standard z-score for a 95% confidence interval. We only ever use 95%, so +// the value is inlined rather than parameterised. +const Z_95 = 1.96; + +export function wilsonInterval(passes: number, total: number): { lower: number; upper: number } { + assert( + Number.isInteger(passes) && passes >= 0, + 'wilsonInterval: passes must be a non-negative integer', + ); + assert( + Number.isInteger(total) && total >= 0, + 'wilsonInterval: total must be a non-negative integer', + ); + assert(passes <= total, 'wilsonInterval: passes cannot exceed total'); + + if (total === 0) return { lower: 0, upper: 1 }; + + const p = passes / total; + const z2 = Z_95 * Z_95; + const denom = 1 + z2 / total; + const center = (p + z2 / (2 * total)) / denom; + const halfWidth = (Z_95 * Math.sqrt((p * (1 - p)) / total + z2 / (4 * total * total))) / denom; + return { + lower: Math.max(0, center - halfWidth), + upper: Math.min(1, center + halfWidth), + }; +} + +// --------------------------------------------------------------------------- +// Per-scenario classification +// +// Three flag tiers, evaluated in order of strictness: +// +// hard_regression — high-confidence drop on a reliable baseline. +// Gating-grade. +// soft_regression — looser bar; investigate, not gating. +// watch — moved noticeably but didn't pass either flag tier. +// Pure visibility. +// +// Improvements use the hard tier (we don't surface borderline improvements; +// they tend to be noise in the positive direction). +// --------------------------------------------------------------------------- + +export type ScenarioVerdict = + | 'hard_regression' // PR is confidently worse, baseline was reliable + | 'soft_regression' // looser bar — worth investigating, not high-confidence + | 'watch' // moved enough to surface but no flag tier triggered + | 'improvement' // PR is significantly better + | 'stable' // no meaningful change + | 'unreliable_baseline' // confident drop but baseline was too flaky to trust + | 'insufficient_data'; // either side had zero trials + +export interface ScenarioClassification { + verdict: ScenarioVerdict; + /** PR pass rate (0..1) */ + prPassRate: number; + /** Baseline pass rate (0..1) */ + baselinePassRate: number; + /** PR rate − baseline rate, signed. Negative = PR worse. */ + delta: number; + /** Probability the PR is at least this much worse by chance. 
Lower ⇒ stronger regression evidence. */ + pValueLeft: number; + /** Probability the PR is at least this much better by chance. */ + pValueRight: number; +} + +export interface TierThresholds { + /** Flag only when the chance the gap happened by noise is below this. */ + maxPValue: number; + /** Flag only when the absolute pass-rate gap is at least this large (0..1). */ + minDelta: number; + /** Flag only when the baseline pass rate was at least this high (0..1). */ + minBaselinePassRate: number; +} + +export interface ClassifyOptions { + /** Hard-flag thresholds (most strict). Defaults: maxPValue=0.05, minDelta=0.30, minBaselinePassRate=0.70. */ + hard?: Partial; + /** Soft-flag thresholds (looser). Defaults: maxPValue=0.20, minDelta=0.15, minBaselinePassRate=0.50. */ + soft?: Partial; + /** Absolute pass-rate change required for a "watch" verdict regardless of significance. Default 0.35. */ + watchDelta?: number; +} + +const DEFAULT_HARD: TierThresholds = { + maxPValue: 0.05, + minDelta: 0.3, + minBaselinePassRate: 0.7, +}; +const DEFAULT_SOFT: TierThresholds = { + maxPValue: 0.2, + minDelta: 0.15, + minBaselinePassRate: 0.5, +}; +// Watch threshold: surface scenarios whose pass rate changed by at least 35pp +// without reaching a flag tier. High enough that natural noise on rock-solid +// scenarios (e.g. 2/3 vs 10/10 = −33pp) doesn't crowd the comment. +const DEFAULT_WATCH_DELTA = 0.35; + +function meetsThreshold( + pValue: number, + delta: number, + baselineRate: number, + tier: TierThresholds, + direction: 'worse' | 'better', +): boolean { + if (pValue >= tier.maxPValue) return false; + if (direction === 'worse') { + if (delta > -tier.minDelta) return false; + if (baselineRate < tier.minBaselinePassRate) return false; + } else { + if (delta < tier.minDelta) return false; + // Improvements skip the reliability gate — fixing flaky scenarios is a real win. + } + return true; +} + +/** + * Classify a single scenario into one of seven verdicts. See ScenarioVerdict + * for the tier semantics. + * + * `options` exists for tests; production callers leave thresholds at defaults. + */ +export function classifyScenario( + prPasses: number, + prTotal: number, + baselinePasses: number, + baselineTotal: number, + options: ClassifyOptions = {}, +): ScenarioClassification { + const hard: TierThresholds = { ...DEFAULT_HARD, ...options.hard }; + const soft: TierThresholds = { ...DEFAULT_SOFT, ...options.soft }; + const watchDelta = options.watchDelta ?? DEFAULT_WATCH_DELTA; + + const prPassRate = prTotal > 0 ? prPasses / prTotal : 0; + const baselinePassRate = baselineTotal > 0 ? 
baselinePasses / baselineTotal : 0; + + if (prTotal === 0 || baselineTotal === 0) { + return { + verdict: 'insufficient_data', + prPassRate, + baselinePassRate, + delta: prPassRate - baselinePassRate, + pValueLeft: 1, + pValueRight: 1, + }; + } + + const a = prPasses; + const b = prTotal - prPasses; + const c = baselinePasses; + const d = baselineTotal - baselinePasses; + + const pValueLeft = fishersExactOneSidedLeft(a, b, c, d); + const pValueRight = fishersExactOneSidedLeft(c, d, a, b); + const delta = prPassRate - baselinePassRate; + + // Improvement (right tail) — single tier, hard thresholds only + if (meetsThreshold(pValueRight, delta, baselinePassRate, hard, 'better')) { + return { verdict: 'improvement', prPassRate, baselinePassRate, delta, pValueLeft, pValueRight }; + } + + // Hard regression — passes all three hard gates + if (meetsThreshold(pValueLeft, delta, baselinePassRate, hard, 'worse')) { + return { + verdict: 'hard_regression', + prPassRate, + baselinePassRate, + delta, + pValueLeft, + pValueRight, + }; + } + + // Confident drop, but on a baseline too flaky to call a regression. + // Surface as `unreliable_baseline` so it's visible without being a flag. + if ( + pValueLeft < hard.maxPValue && + delta <= -hard.minDelta && + baselinePassRate < hard.minBaselinePassRate + ) { + return { + verdict: 'unreliable_baseline', + prPassRate, + baselinePassRate, + delta, + pValueLeft, + pValueRight, + }; + } + + // Soft regression — passes the looser gates + if (meetsThreshold(pValueLeft, delta, baselinePassRate, soft, 'worse')) { + return { + verdict: 'soft_regression', + prPassRate, + baselinePassRate, + delta, + pValueLeft, + pValueRight, + }; + } + + // Watch — meaningful movement but no flag fired. Pure visibility. + if (Math.abs(delta) >= watchDelta) { + return { verdict: 'watch', prPassRate, baselinePassRate, delta, pValueLeft, pValueRight }; + } + + return { verdict: 'stable', prPassRate, baselinePassRate, delta, pValueLeft, pValueRight }; +} diff --git a/packages/@n8n/instance-ai/evaluations/harness/runner.ts b/packages/@n8n/instance-ai/evaluations/harness/runner.ts index 4a0387b53e3..db1ed7c6a80 100644 --- a/packages/@n8n/instance-ai/evaluations/harness/runner.ts +++ b/packages/@n8n/instance-ai/evaluations/harness/runner.ts @@ -28,7 +28,7 @@ import type { // Constants // --------------------------------------------------------------------------- -const DEFAULT_TIMEOUT_MS = 600_000; +const DEFAULT_TIMEOUT_MS = 900_000; const SSE_SETTLE_DELAY_MS = 200; const POLL_INTERVAL_MS = 500; const BACKGROUND_TASK_POLL_INTERVAL_MS = 2_000; diff --git a/packages/@n8n/instance-ai/evaluations/index.ts b/packages/@n8n/instance-ai/evaluations/index.ts index 26b28eda926..908291afdb7 100644 --- a/packages/@n8n/instance-ai/evaluations/index.ts +++ b/packages/@n8n/instance-ai/evaluations/index.ts @@ -39,3 +39,38 @@ export type { ChecklistItem, ChecklistResult, } from './types'; + +// -- Comparison (regression detection) -- +export { + compareBuckets, + byVerdict, + improvements, + hardRegressions, + softRegressions, + watchList, +} from './comparison/compare'; +export type { + ComparisonResult, + ScenarioComparison, + ScenarioCounts, + ExperimentBucket, + AggregateComparison, + FailureCategoryComparison, +} from './comparison/compare'; +export { + classifyScenario, + fishersExactOneSidedLeft, + wilsonInterval, +} from './comparison/statistics'; +export type { + ScenarioVerdict, + ScenarioClassification, + ClassifyOptions, + TierThresholds, +} from './comparison/statistics'; +export { 
formatComparisonMarkdown, formatComparisonTerminal } from './comparison/format'; +export { + fetchBaselineBucket, + findLatestBaseline, + BASELINE_EXPERIMENT_PREFIX, +} from './comparison/fetch-baseline';
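
A minimal usage sketch of how the exported comparison pieces are intended to compose outside the eval CLI. Everything below is illustrative rather than the CLI's actual wiring: the bucket literal, experiment name and scenario counts are made up, the relative import paths assume the snippet sits next to the evaluations entry point, and the zero-argument langsmith Client is assumed to pick its API key up from the environment.

import { Client } from 'langsmith';

import {
  compareBuckets,
  hardRegressions,
  type ComparisonOutcome,
  type ExperimentBucket,
} from './comparison/compare';
import { fetchBaselineBucket, findLatestBaseline } from './comparison/fetch-baseline';
import { classifyScenario } from './comparison/statistics';

// Illustrative PR-side bucket — the real CLI builds this from its own run results.
const prBucket: ExperimentBucket = {
  experimentName: 'instance-ai-pr-check', // hypothetical experiment name
  scenarios: new Map([
    [
      'invoice-reminder/happy-path',
      { testCaseFile: 'invoice-reminder', scenarioName: 'happy-path', passed: 2, total: 5 },
    ],
  ]),
};

async function compareAgainstBaseline(client: Client): Promise<ComparisonOutcome> {
  try {
    const baselineName = await findLatestBaseline(client);
    if (!baselineName) return { kind: 'no_baseline' };
    const baseline = await fetchBaselineBucket(client, baselineName);
    return { kind: 'ok', result: compareBuckets(prBucket, baseline) };
  } catch (error) {
    // The comparison is advisory: report the failure in the PR comment
    // instead of failing the eval run.
    return {
      kind: 'fetch_failed',
      error: error instanceof Error ? error.message : String(error),
    };
  }
}

// Tier behaviour at small N: 2/5 on the PR vs 10/10 on the baseline gives
// Fisher's one-sided p ≈ 0.022, delta −60pp on a 100% baseline ⇒ hard_regression.
console.log(classifyScenario(2, 5, 10, 10).verdict);

void (async () => {
  const outcome = await compareAgainstBaseline(new Client());
  if (outcome.kind === 'ok') {
    console.log(`hard regressions: ${hardRegressions(outcome.result).length}`);
  }
})();

The CLI's real wiring additionally renders the outcome through formatComparisonMarkdown and formatComparisonTerminal so the same ComparisonOutcome drives both the PR comment and the end-of-run console summary.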